├── .github
│   └── workflows
│       └── python-publish.yml
├── .gitignore
├── .idx
│   └── dev.nix
├── LICENSE
├── README.md
├── __init__.py
├── base
│   ├── __init__.py
│   ├── operations.py
│   └── operations_test.py
├── benchmarks
│   ├── import_benchmarks.ipynb
│   ├── main.py
│   └── requirements.txt
├── databases
│   ├── __init__.py
│   ├── firestore_kg.py
│   ├── mdb.py
│   └── n4j.py
├── datamodel
│   ├── __init__.py
│   └── data_model.py
├── requirements.txt
└── setup.py

/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 | 
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 | 
9 | name: Upload Python Package
10 | 
11 | on:
12 |   release:
13 |     types: [published]
14 | 
15 | permissions:
16 |   contents: read
17 | 
18 | jobs:
19 |   deploy:
20 | 
21 |     runs-on: ubuntu-latest
22 | 
23 |     steps:
24 |     - uses: actions/checkout@v4
25 |     - name: Set up Python
26 |       uses: actions/setup-python@v3
27 |       with:
28 |         python-version: '3.x'
29 |     - name: Install dependencies
30 |       run: |
31 |         python -m pip install --upgrade pip
32 |         pip install build
33 |     - name: Build package
34 |       run: python -m build
35 |     - name: Publish package
36 |       uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37 |       with:
38 |         user: __token__
39 |         password: ${{ secrets.PYPI_API_TOKEN }}
40 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.json
2 | .DS_Store
3 | *.venv*
4 | .env
5 | .graph2nosql_venv
6 | *.pyc
7 | */test_graph.png
8 | 
9 | *.egg-info
10 | Neo4j-39cb28f0-Created-2024-09-23.txt
11 | 
--------------------------------------------------------------------------------
/.idx/dev.nix:
--------------------------------------------------------------------------------
1 | # To learn more about how to use Nix to configure your environment
2 | # see: https://developers.google.com/idx/guides/customize-idx-env
3 | { pkgs, ... }: {
4 |   # Which nixpkgs channel to use.
5 | channel = "stable-23.11"; # or "unstable" 6 | 7 | # Use https://search.nixos.org/packages to find packages 8 | packages = [ 9 | pkgs.python311 10 | pkgs.python311Packages.pip 11 | pkgs.streamlit 12 | pkgs.gnumake 13 | ]; 14 | idx = { 15 | # Search for the extensions you want on https://open-vsx.org/ and use "publisher.id" 16 | extensions = [ 17 | "ms-python.autopep8" 18 | "ms-python.debugpy" 19 | "ms-python.pythonv2024.12.3" 20 | "googlecloudtools.cloudcode" 21 | "ms-toolsai.jupyter" 22 | "ms-toolsai.jupyter-keymap" 23 | "ms-toolsai.jupyter-renderers" 24 | "ms-toolsai.vscode-jupyter-cell-tags" 25 | "ms-toolsai.vscode-jupyter-slideshow" 26 | ]; 27 | # Enable previews 28 | previews = { 29 | enable = true; 30 | previews = { 31 | # web = { 32 | # # Example: run "npm run dev" with PORT set to IDX's defined port for previews, 33 | # # and show it in IDX's web preview panel 34 | # command = ["npm" "run" "dev"]; 35 | # manager = "web"; 36 | # env = { 37 | # # Environment variables to set for your server 38 | # PORT = "$PORT"; 39 | # }; 40 | # }; 41 | }; 42 | }; 43 | 44 | # Workspace lifecycle hooks 45 | workspace = { 46 | # Runs when a workspace is first created 47 | onCreate = { 48 | install = 49 | "python3 -m venv .graph2nosql_venv && source .graph2nosql_venv/bin/activate && pip install --upgrade pip && pip install -r requirements.txt"; 50 | }; 51 | # Runs when the workspace is (re)started 52 | onStart = { 53 | # Example: start a background task to watch and re-build backend code 54 | # watch-backend = "npm run watch-backend"; 55 | }; 56 | }; 57 | }; 58 | } 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 | 
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 | 
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | Copyright 2024 Jakob Pörschmann
179 | 
180 | Licensed under the Apache License, Version 2.0 (the "License");
181 | you may not use this file except in compliance with the License.
182 | You may obtain a copy of the License at
183 | 
184 | http://www.apache.org/licenses/LICENSE-2.0
185 | 
186 | Unless required by applicable law or agreed to in writing, software
187 | distributed under the License is distributed on an "AS IS" BASIS,
188 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
189 | See the License for the specific language governing permissions and
190 | limitations under the License.
191 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# graph2nosql
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![Python](https://img.shields.io/badge/python-3.x-blue.svg)](https://www.python.org/)

Open in IDX

A simple Python interface to store and interact with knowledge graphs in your favourite NoSQL DB.

Knowledge Graphs are the up-and-coming tool to index knowledge and make it understandable to your LLM applications. Working with Graph Databases is a pain though.

This Python interface aims to solve this by offering a set of basic functions to store and manage your (knowledge) graph in your existing NoSQL DB. From experience, document-based databases offer an extremely attractive performance and price position compared to some existing specialized databases. I found this to be attractive for simple graph storage use cases in which no fully structured query language is required.

This repository mostly caters to my own use and is not regularly updated or maintained.

## Implemented Databases for graph storage:
* [Firestore](https://firebase.google.com/docs/firestore)
* [MongoDB](https://www.mongodb.com/docs/)
* [Neo4J for latency & cost benchmark](https://neo4j.com/docs/)

## Performance Benchmark
Approximate latency benchmark comparing the implemented tools and technologies. The benchmarking framework [can be found in `./benchmarks`](https://github.com/jakobap/graph2nosql/tree/main/benchmarks).

Values are processing time in seconds -> lower = better
| Feature | Firestore | MongoDB | Neo4j |
|---|---|---|---|
| Adding 100 nodes | 3.03 | 2.51 | 1.91 |
| Query 100 individual nodes | 0.94 | 1.10 | 7.12 |
| Count 2nd degree connections of given node | 0.8 | tbd | 10.5 |
| Count 3rd degree connections of given node | 10.9 | tbd | 13.3 |

## Getting Started
`base/operations.py` defines the abstract base class with the available operations.

1. Create an `.env` file that stores your secrets & env vars.
2. Use a database object to interact with your NoSQL DB.

### Initialize knowledge graph object
Every knowledge graph store object is a child of `NoSQLKnowledgeGraph` in `./base/operations.py`.

The graph contains three data objects: `NodeData`, `EdgeData` and `CommunityData`. Their respective attributes are defined in `./datamodel/data_model.py`.

```
import google.auth
from dotenv import dotenv_values

from databases.firestore_kg import FirestoreKG
from datamodel.data_model import NodeData, EdgeData

secrets = dotenv_values("../.env")
credentials, _ = google.auth.load_credentials_from_file(secrets["GCP_CREDENTIAL_FILE"])

fskg = FirestoreKG(gcp_credential_file=secrets["GCP_CREDENTIAL_FILE"],
                   gcp_project_id=str(secrets["GCP_PROJECT_ID"]),
                   firestore_db_id=str(secrets["WIKIDATA_FS_DB"]),
                   node_collection_id=str(secrets["NODE_COLL_ID"]),
                   edges_collection_id=str(secrets["EDGES_COLL_ID"]),
                   community_collection_id=str(secrets["COMM_COLL_ID"])
                   )
```
### Add nodes
```
node_data_1 = NodeData(
    node_uid="test_egde_node_1",
    node_title="Test Node 1",
    node_type="Person",
    node_description="This is a test node",
    node_degree=0,
    document_id="doc_1",
    edges_to=[],
    edges_from=[],
    embedding=[0.1, 0.2, 0.3],
)

node_data_2 = NodeData(
    node_uid="test_egde_node_2",
    node_title="Test Node 2",
    node_type="Person",
    node_description="This is another test node",
    node_degree=0,
    document_id="doc_2",
    edges_to=[],
    edges_from=[],
    embedding=[0.4, 0.5, 0.6],
)

fskg.add_node(node_uid="test_egde_node_1", node_data=node_data_1)
fskg.add_node(node_uid="test_egde_node_2", node_data=node_data_2)
```

### Add directed and undirected edges
```
edge_data1 = EdgeData(
    source_uid="test_egde_node_1",
    target_uid="test_egde_node_2",
    description="This is a test edge description",
    directed=True
)

# "test_egde_node_3" is assumed to have been added as a node beforehand,
# analogous to the nodes above
edge_data2 = EdgeData(
    source_uid="test_egde_node_3",
    target_uid="test_egde_node_2",
    description="This is a test edge description",
    directed=False
)

fskg.add_edge(edge_data=edge_data1)
fskg.add_edge(edge_data=edge_data2)
```


## Contributing
* If you decide to add new DB operations, please add corresponding tests to `graph2nosql_tests.py`
* If you decide to write an implementation
for another NoSQL db please make sure all tests in `graph2nosql_tests.py` succeed. 123 | 124 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .graph2nosql import graph2nosql 2 | from graph2nosql import databases 3 | from graph2nosql import datamodel -------------------------------------------------------------------------------- /base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jakobap/graph2nosql/77df8ecba857c61381a37b878d57c20d52ff9834/base/__init__.py -------------------------------------------------------------------------------- /base/operations.py: -------------------------------------------------------------------------------- 1 | """graph2nosql base class for required database operations""" 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | from typing import List 6 | import datetime 7 | 8 | import networkx as nx # type: ignore 9 | import matplotlib.pyplot as plt 10 | from matplotlib.lines import Line2D 11 | import graspologic as gc 12 | 13 | from datamodel.data_model import NodeData, EdgeData, CommunityData, NodeEmbeddings 14 | 15 | 16 | class NoSQLKnowledgeGraph(ABC): 17 | """ 18 | Base Class for storing and interacting with the KG and manages data model. 19 | """ 20 | networkx: nx.Graph | nx.DiGraph = nx.Graph( 21 | ) # networkx representation of graph in nosqldb 22 | 23 | @abstractmethod 24 | def add_node(self, node_uid: str, node_data: NodeData) -> None: 25 | """Adds an node to the knowledge graph.""" 26 | 27 | @abstractmethod 28 | def get_node(self, node_uid: str) -> NodeData: 29 | """Retrieves an node from the knowledge graph.""" 30 | 31 | @abstractmethod 32 | def update_node(self, node_uid: str, node_data: NodeData) -> None: 33 | """Updates an existing node in the knowledge graph.""" 34 | 35 | @abstractmethod 36 | def remove_node(self, node_uid: str) -> None: 37 | """Removes an node from the knowledge graph.""" 38 | 39 | @abstractmethod 40 | def add_edge(self, edge_data: EdgeData) -> None: 41 | """Adds an edge (relationship) between two entities in the knowledge graph.""" 42 | 43 | @abstractmethod 44 | def get_edge(self, source_uid: str, target_uid: str) -> EdgeData: 45 | """Retrieves an edge between two entities.""" 46 | 47 | @abstractmethod 48 | def update_edge(self, edge_data: EdgeData) -> None: 49 | """Updates an existing edge in the knowledge graph.""" 50 | 51 | @abstractmethod 52 | def _delete_from_edge_coll(self, egde_uid: str) -> None: 53 | """Method to delete record from edge collection of given kg store""" 54 | 55 | @abstractmethod 56 | def remove_edge(self, source_uid: str, target_uid: str) -> None: 57 | """Removes an edge between two entities.""" 58 | 59 | @abstractmethod 60 | def build_networkx(self) -> None: 61 | """Builds the NetworkX representation of the full graph. 62 | https://networkx.org/documentation/stable/index.html 63 | """ 64 | 65 | @abstractmethod 66 | def store_community(self, community: CommunityData) -> None: 67 | """Takes valid graph community data and upserts the database with it. 
68 | https://www.nature.com/articles/s41598-019-41695-z 69 | """ 70 | 71 | @abstractmethod 72 | def _generate_edge_uid(self, source_uid: str, target_uid: str) -> str: 73 | """Generates Edge uid for the network based on source and target nod uid""" 74 | return "" 75 | 76 | @abstractmethod 77 | def get_nearest_neighbors(self, query_vec) -> List[str]: 78 | """Implements nearest neighbor search based on nosql db index.""" 79 | 80 | @abstractmethod 81 | def get_community(self, community_id: str) -> CommunityData: 82 | """Retrieves the community report for a given community id.""" 83 | 84 | @abstractmethod 85 | def list_communities(self) -> List[CommunityData]: 86 | """Lists all stored communities for the given network.""" 87 | 88 | @abstractmethod 89 | def clean_zerodegree_nodes(self) -> None: 90 | """Removes all nodes with degree 0.""" 91 | 92 | @abstractmethod 93 | def edge_exist(self, source_uid: str, target_uid: str) -> bool: 94 | """Checks for edge existence and returns boolean""" 95 | 96 | @abstractmethod 97 | def node_exist(self, node_uid: str) -> bool: 98 | """Checks for node existence and returns boolean""" 99 | 100 | @abstractmethod 101 | def flush_kg(self) -> None: 102 | """Method to wipe the complete datastore of the knowledge graph""" 103 | 104 | def visualize_graph(self, filename: str = f"graph_{datetime.datetime.now()}.png") -> None: 105 | """Visualizes the provided networkx graph using matplotlib. 106 | 107 | Args: 108 | graph (nx.Graph): The graph to visualize. 109 | """ 110 | self.build_networkx() 111 | 112 | if self.networkx is not None: 113 | # Create a larger figure for better visualization 114 | plt.figure(figsize=(12, 12)) 115 | 116 | # Use a spring layout for a more visually appealing arrangement 117 | pos = nx.spring_layout(self.networkx, k=0.3, iterations=50) 118 | 119 | # Draw nodes with different colors based on entity type 120 | entity_types = set(data["node_type"] 121 | for _, data in self.networkx.nodes(data=True)) 122 | color_map = plt.cm.get_cmap("tab10", len(entity_types)) 123 | for i, entity_type in enumerate(entity_types): 124 | nodes = [n for n, d in self.networkx.nodes( 125 | data=True) if d["node_type"] == entity_type] 126 | nx.draw_networkx_nodes( 127 | self.networkx, 128 | pos, 129 | nodelist=nodes, 130 | node_color=[color_map(i)], # type: ignore 131 | label=entity_type, 132 | # type: ignore 133 | node_size=[ 134 | 10 + 50 * self.networkx.degree(n) for n in nodes] # type: ignore 135 | ) 136 | 137 | # Draw edges with labels 138 | nx.draw_networkx_edges(self.networkx, pos, width=0.5, alpha=0.5) 139 | # edge_labels = nx.get_edge_attributes(graph, "description") 140 | # nx.draw_networkx_edge_labels(graph, pos, edge_labels=edge_labels, font_size=6) 141 | 142 | # Add node labels with descriptions 143 | node_labels = { 144 | node: node 145 | for node, data in self.networkx.nodes(data=True) 146 | } 147 | nx.draw_networkx_labels( 148 | self.networkx, pos, labels=node_labels, font_size=8) 149 | 150 | plt.title("Extracted Knowledge Graph") 151 | plt.axis("off") # Turn off the axis 152 | 153 | # Add a legend for node colors 154 | plt.legend(handles=[Line2D([0], [0], marker='o', color='w', label=entity_type, 155 | markersize=10, markerfacecolor=color_map(i)) for i, entity_type in enumerate(entity_types)]) 156 | 157 | plt.savefig(filename) 158 | 159 | else: 160 | raise ValueError( 161 | "Error: NetworkX graph is not initialized. 
Call build_networkx() first.") 162 | 163 | def get_louvain_communities(self) -> list: 164 | """Computes and returns all Louvain communities for the given network. 165 | https://www.nature.com/articles/s41598-019-41695-z 166 | 167 | Sample Output: 168 | [{'"2023 NOBEL PEACE PRIZE"'}, {'"ANDREI SAKHAROV PRIZE"'}, 169 | {'"ANDREI SAKHAROV"'}] 170 | """ 171 | # 1. Build (or update) the NetworkX graph 172 | self.build_networkx() 173 | 174 | # 2. Apply Louvain algorithm 175 | if self.networkx is not None: 176 | louvain_comm_list = nx.algorithms.community.louvain_communities( 177 | self.networkx) 178 | return louvain_comm_list # type: ignore 179 | raise ValueError( 180 | "Error: NetworkX graph is not initialized. Call build_networkx() first.") 181 | 182 | def get_node2vec_embeddings( 183 | self, 184 | dimensions: int = 768, 185 | num_walks: int = 10, 186 | walk_length: int = 40, 187 | window_size: int = 2, 188 | iterations: int = 3, 189 | random_seed: int = 69 190 | ) -> NodeEmbeddings: 191 | """Generate node embeddings using Node2Vec.""" 192 | 193 | # update networkx representation of graph 194 | self.build_networkx() 195 | 196 | # generate embedding 197 | lcc_tensors = gc.embed.node2vec_embed( # type: ignore 198 | graph=self.networkx, 199 | dimensions=dimensions, 200 | window_size=window_size, 201 | iterations=iterations, 202 | num_walks=num_walks, 203 | walk_length=walk_length, 204 | random_seed=random_seed, 205 | ) 206 | return NodeEmbeddings(embeddings=lcc_tensors[0], nodes=lcc_tensors[1]) 207 | 208 | 209 | if __name__ == "__main__": 210 | print("Hello World!") 211 | -------------------------------------------------------------------------------- /base/operations_test.py: -------------------------------------------------------------------------------- 1 | """graph2nosql database operations unittests""" 2 | 3 | import unittest 4 | from abc import ABC, abstractmethod 5 | 6 | import os 7 | import dotenv 8 | from dotenv import dotenv_values 9 | 10 | import networkx as nx # type: ignore 11 | 12 | from base.operations import NoSQLKnowledgeGraph 13 | from databases.firestore_kg import FirestoreKG 14 | from databases.n4j import AuraKG 15 | from databases.mdb import MongoKG 16 | from datamodel.data_model import NodeData, EdgeData 17 | 18 | 19 | class _NoSQLKnowledgeGraphTests(ABC): 20 | """ 21 | Abstract base class to define test cases for NoSQLKnowledgeGraph implementations. 22 | 23 | Concrete test classes for specific NoSQL databases should inherit from this class 24 | and implement the required abstract methods. 
25 | """ 26 | @abstractmethod 27 | def create_kg_instance(self) -> NoSQLKnowledgeGraph: 28 | """Create and return an instance of the NoSQLKnowledgeGraph implementation.""" 29 | 30 | def setUp(self): 31 | """Set up for test methods.""" 32 | self.kg = self.create_kg_instance() 33 | # Add any setup logic specific to your NoSQL database here 34 | 35 | def test_add_and_remove_node(self): 36 | """ Test adding a node""" 37 | node_data = NodeData( 38 | node_uid="added_test_node_1", 39 | node_title="Test Node 1", 40 | node_type="Person", 41 | node_description="This is a test node", 42 | node_degree=0, 43 | document_id="doc_1", 44 | edges_to=[], 45 | edges_from=[], 46 | embedding=[0.1, 0.2, 0.3], 47 | ) 48 | self.kg.add_node(node_uid="added_test_node_1", node_data=node_data) 49 | 50 | # Retrieve the node and verify its data 51 | retrieved_node_data = self.kg.get_node(node_uid="added_test_node_1") 52 | print(retrieved_node_data) 53 | self.assertEqual(retrieved_node_data, node_data) # type: ignore 54 | 55 | # Remove the node 56 | self.kg.remove_node(node_uid="added_test_node_1") 57 | 58 | # Try to retrieve the node again (should raise KeyError) 59 | with self.assertRaises(KeyError): # type: ignore 60 | self.kg.get_node(node_uid="added_test_node_1") 61 | 62 | def test_update_node(self): 63 | """Add a node""" 64 | node_data = NodeData( 65 | node_uid="test_update_node_1", 66 | node_title="Test Node 1", 67 | node_type="Person", 68 | node_description="This is a test node", 69 | node_degree=0, 70 | document_id="doc_1", 71 | edges_to=[], 72 | edges_from=[], 73 | embedding=[0.1, 0.2, 0.3], 74 | ) 75 | self.kg.add_node(node_uid="test_update_node_1", node_data=node_data) 76 | 77 | # Retrieve the node and verify its data 78 | retrieved_node_data = self.kg.get_node(node_uid="test_update_node_1") 79 | self.assertEqual(retrieved_node_data, node_data) # type: ignore 80 | 81 | # Update the node 82 | updated_node_data = NodeData( 83 | node_uid="test_update_node_1", 84 | node_title="Updated Test Node 1", # updated title 85 | node_type="Person", 86 | node_description="This is an updated test node", # updated description 87 | node_degree=1, # Updated degree 88 | document_id="doc_1", 89 | edges_to=[], 90 | edges_from=[], 91 | embedding=[0.1, 0.2, 0.3], 92 | ) 93 | self.kg.update_node(node_uid="test_update_node_1", 94 | node_data=updated_node_data) 95 | 96 | # Retrieve the node again and verify the update 97 | retrieved_updated_node_data = self.kg.get_node( 98 | node_uid="test_update_node_1") 99 | self.assertEqual(retrieved_updated_node_data, 100 | updated_node_data) # type: ignore 101 | 102 | # Remove the node 103 | self.kg.remove_node(node_uid="test_update_node_1") 104 | 105 | def test_add_node_with_edge(self): 106 | """Add a node with edge to other node that doesn't exist""" 107 | node_data = NodeData( 108 | node_uid="test_egde_node_1", 109 | node_title="Test Node 1", 110 | node_type="Person", 111 | node_description="This is a test node", 112 | node_degree=0, 113 | document_id="doc_1", 114 | edges_to=["fake node",], 115 | edges_from=[], 116 | embedding=[0.1, 0.2, 0.3], 117 | ) 118 | 119 | # Assert that adding the node raises a KeyError (or a more specific exception you handle) 120 | # type: ignore # Adjust exception type if needed 121 | with self.assertRaises(ValueError): 122 | self.kg.add_node(node_uid="test_egde_node_1", node_data=node_data) 123 | 124 | # Add valid nodes (required for edges) 125 | node_data_1 = NodeData( 126 | node_uid="test_egde_node_1", 127 | node_title="Test Node 1", 128 | node_type="Person", 129 | 
node_description="This is a test node", 130 | node_degree=0, 131 | document_id="doc_1", 132 | edges_to=[], 133 | edges_from=[], 134 | embedding=[0.1, 0.2, 0.3], 135 | ) 136 | node_data_2 = NodeData( 137 | node_uid="test_egde_node_2", 138 | node_title="Test Node 2", 139 | node_type="Person", 140 | node_description="This is another test node", 141 | node_degree=0, 142 | document_id="doc_2", 143 | edges_to=[], 144 | edges_from=[], 145 | embedding=[0.4, 0.5, 0.6], 146 | ) 147 | 148 | node_data_3 = NodeData( 149 | node_uid="test_egde_node_3", 150 | node_title="Test Node 2", 151 | node_type="Person", 152 | node_description="This is another test node", 153 | node_degree=0, 154 | document_id="doc_2", 155 | edges_to=[], 156 | edges_from=[], 157 | embedding=[0.4, 0.5, 0.6], 158 | ) 159 | 160 | self.kg.add_node(node_uid="test_egde_node_1", node_data=node_data_1) 161 | self.kg.add_node(node_uid="test_egde_node_2", node_data=node_data_2) 162 | self.kg.add_node(node_uid="test_egde_node_3", node_data=node_data_3) 163 | 164 | edge_data1 = EdgeData( 165 | source_uid="test_egde_node_1", 166 | target_uid="test_egde_node_2", 167 | description="This is a test egde description", 168 | directed=True 169 | ) 170 | 171 | edge_data2 = EdgeData( 172 | source_uid="test_egde_node_3", 173 | target_uid="test_egde_node_2", 174 | description="This is a test egde description", 175 | directed=False 176 | ) 177 | 178 | self.kg.add_edge(edge_data=edge_data1) 179 | self.kg.add_edge(edge_data=edge_data2) 180 | 181 | # Assert that the edges are reflected in the nodes' edge lists 182 | node1 = self.kg.get_node("test_egde_node_1") 183 | node2 = self.kg.get_node("test_egde_node_2") 184 | node3 = self.kg.get_node("test_egde_node_3") 185 | 186 | self.assertIn("test_egde_node_3", node2.edges_from) # type: ignore 187 | self.assertIn("test_egde_node_3", node2.edges_to) # type: ignore 188 | self.assertIn("test_egde_node_1", node2.edges_from) # type: ignore 189 | self.assertIn("test_egde_node_2", node1.edges_to) # type: ignore 190 | 191 | # Clean up 192 | self.kg.remove_node(node_uid="test_egde_node_1") 193 | self.kg.remove_node(node_uid="test_egde_node_2") 194 | self.kg.remove_node(node_uid="test_egde_node_3") 195 | 196 | def test_add_direcred_edge(self): 197 | """Test adding an edge between nodes.""" 198 | 199 | # Add valid nodes (required for edges) 200 | node_data_1 = NodeData( 201 | node_uid="test_directed_node_1", 202 | node_title="Test Node 1", 203 | node_type="Person", 204 | node_description="This is a test node", 205 | node_degree=0, 206 | document_id="doc_1", 207 | edges_to=[], 208 | edges_from=[], 209 | embedding=[0.1, 0.2, 0.3], 210 | ) 211 | node_data_2 = NodeData( 212 | node_uid="test_directed_node_2", 213 | node_title="Test Node 2", 214 | node_type="Person", 215 | node_description="This is another test node", 216 | node_degree=0, 217 | document_id="doc_2", 218 | edges_to=[], 219 | edges_from=[], 220 | embedding=[0.4, 0.5, 0.6], 221 | ) 222 | 223 | self.kg.add_node(node_uid="test_directed_node_1", 224 | node_data=node_data_1) 225 | self.kg.add_node(node_uid="test_directed_node_2", 226 | node_data=node_data_2) 227 | 228 | # add edges between nodes 229 | edge_data = EdgeData( 230 | source_uid="test_directed_node_1", 231 | target_uid="test_directed_node_2", 232 | description="This is a test egde description", 233 | directed=True 234 | ) 235 | 236 | self.kg.add_edge(edge_data=edge_data) 237 | 238 | # Assert that the edge is reflected in the nodes' edge lists 239 | node1 = self.kg.get_node("test_directed_node_1") 240 | node2 = 
self.kg.get_node("test_directed_node_2") 241 | self.assertIn("test_directed_node_2", node1.edges_to) # type: ignore 242 | self.assertIn("test_directed_node_1", node2.edges_from) # type: ignore 243 | 244 | # Clean Up egdes 245 | self.kg.remove_edge(source_uid="test_directed_node_1", 246 | target_uid="test_directed_node_2") 247 | 248 | # Clean Up nodes 249 | self.kg.remove_node(node_uid="test_directed_node_1") 250 | self.kg.remove_node(node_uid="test_directed_node_2") 251 | 252 | def test_add_undirecred_edge(self): 253 | """Test adding an edge between nodes.""" 254 | 255 | # Add valid nodes (required for edges) 256 | node_data_1 = NodeData( 257 | node_uid="test_undirected_node_1", 258 | node_title="Test Node 1", 259 | node_type="Person", 260 | node_description="This is a test node", 261 | node_degree=0, 262 | document_id="doc_1", 263 | edges_to=[], 264 | edges_from=[], 265 | embedding=[0.1, 0.2, 0.3], 266 | ) 267 | node_data_2 = NodeData( 268 | node_uid="test_undirected_node_2", 269 | node_title="Test Node 2", 270 | node_type="Person", 271 | node_description="This is another test node", 272 | node_degree=0, 273 | document_id="doc_2", 274 | edges_to=[], 275 | edges_from=[], 276 | embedding=[0.4, 0.5, 0.6], 277 | ) 278 | 279 | self.kg.add_node(node_uid="test_undirected_node_1", 280 | node_data=node_data_1) 281 | self.kg.add_node(node_uid="test_undirected_node_2", 282 | node_data=node_data_2) 283 | 284 | # add edges between nodes 285 | edge_data = EdgeData( 286 | source_uid="test_undirected_node_1", 287 | target_uid="test_undirected_node_2", 288 | description="This is a test egde description", 289 | directed=False 290 | ) 291 | 292 | self.kg.add_edge(edge_data=edge_data) 293 | 294 | # Assert that the edge is reflected in the nodes' edge lists 295 | node1 = self.kg.get_node("test_undirected_node_1") 296 | node2 = self.kg.get_node("test_undirected_node_2") 297 | self.assertIn("test_undirected_node_2", node1.edges_to) # type: ignore 298 | self.assertIn("test_undirected_node_1", node2.edges_to) # type: ignore 299 | self.assertIn("test_undirected_node_1", 300 | node2.edges_from) # type: ignore 301 | self.assertIn("test_undirected_node_2", 302 | node1.edges_from) # type: ignore 303 | 304 | # Clean Up egdes 305 | self.kg.remove_edge(source_uid="test_undirected_node_1", 306 | target_uid="test_undirected_node_2") 307 | 308 | # Clean Up nodes 309 | self.kg.remove_node(node_uid="test_undirected_node_1") 310 | self.kg.remove_node(node_uid="test_undirected_node_2") 311 | 312 | def test_get_edge(self): 313 | """Test retrieving an existing edge.""" 314 | # 1. Add nodes (required for edges) 315 | node_data_1 = NodeData( 316 | node_uid="test_getedge_node_1", 317 | node_title="Test Node 1", 318 | node_type="Person", 319 | node_description="This is a test node", 320 | node_degree=0, 321 | document_id="doc_1", 322 | edges_to=[], 323 | edges_from=[], 324 | embedding=[0.1, 0.2, 0.3], 325 | ) 326 | node_data_2 = NodeData( 327 | node_uid="test_getedge_node_2", 328 | node_title="Test Node 2", 329 | node_type="Person", 330 | node_description="This is another test node", 331 | node_degree=0, 332 | document_id="doc_2", 333 | edges_to=[], 334 | edges_from=[], 335 | embedding=[0.4, 0.5, 0.6], 336 | ) 337 | self.kg.add_node(node_uid="test_getedge_node_1", node_data=node_data_1) 338 | self.kg.add_node(node_uid="test_getedge_node_2", node_data=node_data_2) 339 | 340 | # 2. 
Add an edge 341 | edge_data = EdgeData( 342 | source_uid="test_getedge_node_1", 343 | target_uid="test_getedge_node_2", 344 | description="This might be a description of the relationship of these two nodes", 345 | directed=False 346 | ) 347 | self.kg.add_edge(edge_data=edge_data) 348 | 349 | # Assuming you have a way to retrieve edge data in your implementation 350 | retrieved_edge_data = self.kg.get_edge(source_uid="test_getedge_node_1", 351 | target_uid="test_getedge_node_2") 352 | 353 | new_edge_uid = self.kg._generate_edge_uid(source_uid=edge_data.source_uid, 354 | target_uid=edge_data.target_uid) 355 | target_edge_data = EdgeData( 356 | source_uid="test_getedge_node_1", 357 | target_uid="test_getedge_node_2", 358 | description="This might be a description of the relationship of these two nodes", 359 | directed=False, 360 | edge_uid=new_edge_uid 361 | ) 362 | 363 | self.assertEqual(retrieved_edge_data, target_edge_data) # type: ignore 364 | 365 | # Clean up edges 366 | self.kg.remove_edge(source_uid="test_getedge_node_1", 367 | target_uid="test_getedge_node_2") 368 | 369 | # Clean up nodes 370 | self.kg.remove_node(node_uid="test_getedge_node_1") 371 | self.kg.remove_node(node_uid="test_getedge_node_2") 372 | 373 | def test_update_edge(self): 374 | """Test updating the data of an existing edge.""" 375 | # Add nodes (required for edges) 376 | node_data_1 = NodeData( 377 | node_uid="test_edgeupdate_node_1", 378 | node_title="Test Node 1", 379 | node_type="Person", 380 | node_description="This is a test node", 381 | node_degree=0, 382 | document_id="doc_1", 383 | edges_to=[], 384 | edges_from=[], 385 | embedding=[0.1, 0.2, 0.3], 386 | ) 387 | node_data_2 = NodeData( 388 | node_uid="test_edgeupdate_node_2", 389 | node_title="Test Node 2", 390 | node_type="Person", 391 | node_description="This is another test node", 392 | node_degree=0, 393 | document_id="doc_2", 394 | edges_to=[], 395 | edges_from=[], 396 | embedding=[0.4, 0.5, 0.6], 397 | ) 398 | self.kg.add_node(node_uid="test_edgeupdate_node_1", 399 | node_data=node_data_1) 400 | self.kg.add_node(node_uid="test_edgeupdate_node_2", 401 | node_data=node_data_2) 402 | 403 | # Add an edge between 404 | edge_data = EdgeData( 405 | source_uid="test_edgeupdate_node_1", 406 | target_uid="test_edgeupdate_node_2", 407 | description="This is a boring egde description" 408 | ) 409 | 410 | self.kg.add_edge( 411 | edge_data=edge_data 412 | ) 413 | 414 | # Update edge with new data 415 | updated_edge_data = EdgeData( 416 | source_uid="test_edgeupdate_node_1", 417 | target_uid="test_edgeupdate_node_2", 418 | description="Updated much better description" 419 | ) 420 | self.kg.update_edge(edge_data=updated_edge_data) 421 | 422 | # Verify that the edge data is updated 423 | retrieved_updated_edge_data = self.kg.get_edge( 424 | source_uid="test_edgeupdate_node_1", target_uid="test_edgeupdate_node_2" 425 | ) 426 | 427 | validate_edge_data = EdgeData( 428 | source_uid="test_edgeupdate_node_1", 429 | target_uid="test_edgeupdate_node_2", 430 | description="Updated much better description", 431 | edge_uid=self.kg._generate_edge_uid( 432 | edge_data.source_uid, edge_data.target_uid 433 | ) 434 | ) 435 | 436 | self.assertEqual( # type: ignore 437 | retrieved_updated_edge_data, validate_edge_data 438 | ) 439 | 440 | # Cleanup edges 441 | self.kg.remove_edge(source_uid="test_edgeupdate_node_1", 442 | target_uid="test_edgeupdate_node_2") 443 | 444 | # Cleanup nodes 445 | self.kg.remove_node(node_uid="test_edgeupdate_node_1") 446 | 
self.kg.remove_node(node_uid="test_edgeupdate_node_2") 447 | 448 | def test_remove_edge(self): 449 | """Test removing an edge between nodes.""" 450 | # Add nodes (required for edges) 451 | node_data_1 = NodeData( 452 | node_uid="test_removeegde_node_1", 453 | node_title="Test Node 1", 454 | node_type="Person", 455 | node_description="This is a test node", 456 | node_degree=0, 457 | document_id="doc_1", 458 | edges_to=[], 459 | edges_from=[], 460 | embedding=[0.1, 0.2, 0.3], 461 | ) 462 | node_data_2 = NodeData( 463 | node_uid="test_removeegde_node_2", 464 | node_title="Test Node 2", 465 | node_type="Person", 466 | node_description="This is another test node", 467 | node_degree=0, 468 | document_id="doc_2", 469 | edges_to=[], 470 | edges_from=[], 471 | embedding=[0.4, 0.5, 0.6], 472 | ) 473 | self.kg.add_node(node_uid="test_removeegde_node_1", 474 | node_data=node_data_1) 475 | self.kg.add_node(node_uid="test_removeegde_node_2", 476 | node_data=node_data_2) 477 | 478 | # add edges between nodes 479 | edge_data = EdgeData( 480 | source_uid="test_removeegde_node_2", 481 | target_uid="test_removeegde_node_1", 482 | description="This is a test egde description" 483 | ) 484 | self.kg.add_edge(edge_data=edge_data) 485 | 486 | # Assert that the edge is reflected in the nodes' edge lists 487 | node1 = self.kg.get_node("test_removeegde_node_1") 488 | node2 = self.kg.get_node("test_removeegde_node_2") 489 | self.assertIn("test_removeegde_node_1", node2.edges_to) # type: ignore 490 | self.assertIn("test_removeegde_node_2", 491 | node1.edges_from) # type: ignore 492 | 493 | # Remove the edge 494 | self.kg.remove_edge(source_uid="test_removeegde_node_2", 495 | target_uid="test_removeegde_node_1") 496 | 497 | # Assert that the edge is no longer in the nodes' edge lists 498 | node1 = self.kg.get_node("test_removeegde_node_1") 499 | node2 = self.kg.get_node("test_removeegde_node_2") 500 | self.assertNotIn("test_removeegde_node_2", 501 | node1.edges_to) # type: ignore 502 | self.assertNotIn("test_removeegde_node_2", 503 | node1.edges_from) # type: ignore 504 | self.assertNotIn("test_removeegde_node_1", 505 | node2.edges_from) # type: ignore 506 | self.assertNotIn("test_removeegde_node_1", 507 | node2.edges_to) # type: ignore 508 | 509 | # Clean up nodes 510 | self.kg.remove_node(node_uid="test_removeegde_node_1") 511 | self.kg.remove_node(node_uid="test_removeegde_node_2") 512 | 513 | def test_get_networkx(self): 514 | """Test getting the networkx graph.""" 515 | # 1. Add nodes 516 | node_data_1 = NodeData( 517 | node_uid="test_getnx_node_1", 518 | node_title="Test Node 1", 519 | node_type="Person", 520 | node_description="This is a test node", 521 | node_degree=0, 522 | document_id="doc_1", 523 | edges_to=[], 524 | edges_from=[], 525 | embedding=[0.1, 0.2, 0.3], 526 | ) 527 | node_data_2 = NodeData( 528 | node_uid="test_getnx_node_2", 529 | node_title="Test Node 2", 530 | node_type="Person", 531 | node_description="This is another test node", 532 | node_degree=0, 533 | document_id="doc_2", 534 | edges_to=[], 535 | edges_from=[], 536 | embedding=[0.4, 0.5, 0.6], 537 | ) 538 | self.kg.add_node(node_uid="test_getnx_node_1", node_data=node_data_1) 539 | self.kg.add_node(node_uid="test_getnx_node_2", node_data=node_data_2) 540 | 541 | # 2. Add an edge 542 | edge_data = EdgeData( 543 | source_uid="test_getnx_node_1", 544 | target_uid="test_getnx_node_2", 545 | description="Test Edge Description" 546 | ) 547 | self.kg.add_edge(edge_data=edge_data) 548 | 549 | # 3. 
Get the NetworkX graph 550 | self.kg.build_networkx() 551 | 552 | # 4. Assertions 553 | # Check if the graph is the correct type 554 | self.assertIsInstance(self.kg.networkx, nx.Graph) # type: ignore 555 | # Check if the number of nodes is correct 556 | self.assertEqual(self.kg.networkx.number_of_nodes(), 2) # type: ignore 557 | # Check if the number of edges is correct 558 | self.assertEqual(self.kg.networkx.number_of_edges(), 1) # type: ignore 559 | # Check if specific nodes exist in the graph 560 | self.assertTrue(self.kg.networkx.has_node( 561 | "test_getnx_node_1")) # type: ignore 562 | self.assertTrue(self.kg.networkx.has_node( 563 | "test_getnx_node_2")) # type: ignore 564 | # Check if a specific edge exists in the graph 565 | self.assertTrue(self.kg.networkx.has_edge( 566 | "test_getnx_node_1", "test_getnx_node_2")) # type: ignore 567 | 568 | # 5. Clean up (optional, depending on your test setup) 569 | self.kg.remove_edge(source_uid="test_getnx_node_1", 570 | target_uid="test_getnx_node_2") 571 | self.kg.remove_node(node_uid="test_getnx_node_1") 572 | self.kg.remove_node(node_uid="test_getnx_node_2") 573 | 574 | def test_get_louvain_communities(self): 575 | """Test getting Louvain communities.""" 576 | # 1. Add nodes 577 | node_data_1 = NodeData( 578 | node_uid="test_louvain_node_1", 579 | node_title="Test Node 1", 580 | node_type="Person", 581 | node_description="This is a test node", 582 | node_degree=0, 583 | document_id="doc_1", 584 | edges_to=[], 585 | edges_from=[], 586 | embedding=[0.1, 0.2, 0.3], 587 | ) 588 | node_data_2 = NodeData( 589 | node_uid="test_louvain_node_2", 590 | node_title="Test Node 2", 591 | node_type="Person", 592 | node_description="This is another test node", 593 | node_degree=0, 594 | document_id="doc_2", 595 | edges_to=[], 596 | edges_from=[], 597 | embedding=[0.4, 0.5, 0.6], 598 | ) 599 | node_data_3 = NodeData( 600 | node_uid="test_louvain_node_3", 601 | node_title="Test Node 3", 602 | node_type="Person", 603 | node_description="This is another test node", 604 | node_degree=0, 605 | document_id="doc_3", 606 | edges_to=[], 607 | edges_from=[], 608 | embedding=[0.4, 0.5, 0.6], 609 | ) 610 | 611 | node_data_4 = NodeData( 612 | node_uid="test_louvain_node_4", 613 | node_title="Test Node 4", 614 | node_type="Person", 615 | node_description="This is another test node", 616 | node_degree=0, 617 | document_id="doc_3", 618 | edges_to=[], 619 | edges_from=[], 620 | embedding=[0.4, 0.5, 0.6], 621 | ) 622 | 623 | self.kg.add_node(node_uid="test_louvain_node_1", node_data=node_data_1) 624 | self.kg.add_node(node_uid="test_louvain_node_2", node_data=node_data_2) 625 | self.kg.add_node(node_uid="test_louvain_node_3", node_data=node_data_3) 626 | self.kg.add_node(node_uid="test_louvain_node_4", node_data=node_data_4) 627 | 628 | # 2. Add edges to create a connected structure for community detection 629 | edge_data_1 = EdgeData( 630 | source_uid="test_louvain_node_1", 631 | target_uid="test_louvain_node_2", 632 | description="Test Edge Description 1" 633 | ) 634 | edge_data_2 = EdgeData( 635 | source_uid="test_louvain_node_2", 636 | target_uid="test_louvain_node_3", 637 | description="Test Edge Description 2" 638 | ) 639 | self.kg.add_edge(edge_data=edge_data_1) 640 | self.kg.add_edge(edge_data=edge_data_2) 641 | 642 | # 3. Get the Louvain communities 643 | communities = self.kg.get_louvain_communities() 644 | 645 | # 4. 
Assertions 646 | # Ensure communities is a list 647 | self.assertIsInstance(communities, list) # type: ignore 648 | # We are expecting exactly two communities since one node has no edges 649 | self.assertTrue(len(communities) == 2) # type: ignore 650 | # Check if each community is a set (or your expected data structure) 651 | for community in communities: 652 | self.assertIsInstance(community, set) # type: ignore 653 | 654 | # 5. Clean up (optional, depending on your test setup) 655 | self.kg.remove_edge(source_uid="test_louvain_node_1", 656 | target_uid="test_louvain_node_2") 657 | self.kg.remove_edge(source_uid="test_louvain_node_2", 658 | target_uid="test_louvain_node_3") 659 | self.kg.remove_node(node_uid="test_louvain_node_1") 660 | self.kg.remove_node(node_uid="test_louvain_node_2") 661 | self.kg.remove_node(node_uid="test_louvain_node_3") 662 | self.kg.remove_node(node_uid="test_louvain_node_4") 663 | 664 | def test_visualize_graph(self): 665 | """Test visualizing the graph. This test is not asserting anything. 666 | It's only creating a visualization for manual inspection.""" 667 | 668 | # 1. Add nodes 669 | node_data_1 = NodeData( 670 | node_uid="test_vis_node_1", 671 | node_title="Test Node 1", 672 | node_type="Person", 673 | node_description="This is a test node", 674 | node_degree=0, 675 | document_id="doc_1", 676 | edges_to=[], 677 | edges_from=[], 678 | embedding=[0.1, 0.2, 0.3], 679 | ) 680 | node_data_2 = NodeData( 681 | node_uid="test_vis_node_2", 682 | node_title="Test Node 2", 683 | node_type="Person", 684 | node_description="This is another test node", 685 | node_degree=0, 686 | document_id="doc_2", 687 | edges_to=[], 688 | edges_from=[], 689 | embedding=[0.4, 0.5, 0.6], 690 | ) 691 | node_data_3 = NodeData( 692 | node_uid="test_vis_node_3", 693 | node_title="Test Node 3", 694 | node_type="Organization", 695 | node_description="This is another test node", 696 | node_degree=0, 697 | document_id="doc_3", 698 | edges_to=[], 699 | edges_from=[], 700 | embedding=[0.4, 0.5, 0.6], 701 | ) 702 | self.kg.add_node(node_uid="test_vis_node_1", node_data=node_data_1) 703 | self.kg.add_node(node_uid="test_vis_node_2", node_data=node_data_2) 704 | self.kg.add_node(node_uid="test_vis_node_3", node_data=node_data_3) 705 | 706 | # 2. Add edges to create connections for visualization 707 | edge_data_1 = EdgeData( 708 | source_uid="test_vis_node_1", 709 | target_uid="test_vis_node_2", 710 | description="Test Edge Description 1" 711 | ) 712 | edge_data_2 = EdgeData( 713 | source_uid="test_vis_node_2", 714 | target_uid="test_vis_node_3", 715 | description="Test Edge Description 2" 716 | ) 717 | self.kg.add_edge(edge_data=edge_data_1) 718 | self.kg.add_edge(edge_data=edge_data_2) 719 | 720 | # 3. Visualize the graph 721 | try: 722 | self.kg.visualize_graph(filename="test_graph.png") 723 | except Exception as e: 724 | raise ValueError(f"An error occurred during visualization: {e}") 725 | 726 | # 4. Clean up (optional, depending on your test setup) 727 | self.kg.remove_edge(source_uid="test_vis_node_1", 728 | target_uid="test_vis_node_2") 729 | self.kg.remove_edge(source_uid="test_vis_node_2", 730 | target_uid="test_vis_node_3") 731 | self.kg.remove_node(node_uid="test_vis_node_1") 732 | self.kg.remove_node(node_uid="test_vis_node_2") 733 | self.kg.remove_node(node_uid="test_vis_node_3") 734 | 735 | 736 | class FirestoreKGTests(_NoSQLKnowledgeGraphTests, unittest.TestCase): 737 | """ 738 | Test cases for the FirestoreKG implementation of NoSQLKnowledgeGraph. 
739 | 740 | This test suite inherits from _NoSQLKnowledgeGraphTests to reuse common test cases. 741 | It specifically tests the FirestoreKG class by creating an instance connected to a Firestore 742 | database and then running various operations on it. 743 | 744 | Before running the tests, it attempts to clear the Firestore collections to ensure a clean slate. 745 | However, this clearing operation is currently commented out. 746 | """ 747 | 748 | def create_kg_instance(self) -> NoSQLKnowledgeGraph: 749 | os.chdir(os.path.dirname(os.path.abspath(__file__))) 750 | 751 | secrets = dotenv_values("../.env") 752 | 753 | gcp_credential_file = str(secrets["GCP_CREDENTIAL_FILE"]) 754 | project_id = str(secrets["GCP_PROJECT_ID"]) 755 | database_id = str(secrets["FIRESTORE_DB_ID"]) 756 | node_coll_id = str(secrets["NODE_COLL_ID"]) 757 | edges_coll_id = str(secrets["EDGES_COLL_ID"]) 758 | community_coll_id = str(secrets["COMM_COLL_ID"]) 759 | 760 | fskg = FirestoreKG( 761 | gcp_project_id=project_id, 762 | gcp_credential_file=gcp_credential_file, 763 | firestore_db_id=database_id, 764 | node_collection_id=node_coll_id, 765 | edges_collection_id=edges_coll_id, 766 | community_collection_id=community_coll_id 767 | ) 768 | 769 | # Clear the collections before running tests 770 | fskg.flush_kg() 771 | return fskg 772 | 773 | 774 | class AuraKGTest(_NoSQLKnowledgeGraphTests, unittest.TestCase): 775 | """ 776 | Test cases for the Neo4j Aura implementation of NoSQLKnowledgeGraph. 777 | 778 | This test suite inherits from _NoSQLKnowledgeGraphTests to reuse common test cases. 779 | It specifically tests the Neo4j Aura class by creating an instance connected to a Neo4j Aura 780 | database and then running various operations on it. 781 | 782 | Before running the tests, it attempts to clear the Neo4j Aura collections to ensure a clean slate. 783 | However, this clearing operation is currently commented out. 784 | """ 785 | 786 | def create_kg_instance(self) -> NoSQLKnowledgeGraph: 787 | os.chdir(os.path.dirname(os.path.abspath(__file__))) 788 | 789 | dotenv.load_dotenv("../Neo4j-39cb28f0-Created-2024-09-23.txt") 790 | 791 | uri = os.getenv("NEO4J_URI") 792 | auth = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD")) 793 | 794 | aura = AuraKG(uri=uri, auth=auth) 795 | 796 | # empty graph store before running tests 797 | # aura.flush_kg() 798 | return aura 799 | 800 | 801 | class MongoKGTest(_NoSQLKnowledgeGraphTests, unittest.TestCase): 802 | """ 803 | Test cases for the MongoDB implementation of NoSQLKnowledgeGraph. 804 | 805 | This test suite inherits from _NoSQLKnowledgeGraphTests to reuse common test cases. 806 | It specifically tests the MongoDB class by creating an instance connected to a MongoDB 807 | database and then running various operations on it. 808 | 809 | Before running the tests, it attempts to clear the MongoDB collections to ensure a clean slate. 810 | However, this clearing operation is currently commented out. 
811 | """ 812 | 813 | def create_kg_instance(self) -> NoSQLKnowledgeGraph: 814 | os.chdir(os.path.dirname(os.path.abspath(__file__))) 815 | 816 | secrets = dotenv_values("../.env") 817 | 818 | mdb_username = str(secrets["MDB_USERNAME"]) 819 | mdb_passowrd = str(secrets["MDB_PASSWORD"]) 820 | mdb_cluster = str(secrets["MDB_CLUSTER"]) 821 | uri = f"mongodb+srv://{mdb_username}:{mdb_passowrd}@cluster0.pjx3w.mongodb.net/?retryWrites=true&w=majority&appName={mdb_cluster}" 822 | 823 | mkg = MongoKG( 824 | mdb_uri=uri, 825 | mdb_db_id=str(secrets["MDB_DB_ID"]), 826 | node_coll_id=str(secrets["NODE_COLL_ID"]), 827 | edges_coll_id=str(secrets["EDGES_COLL_ID"]), 828 | community_collection_id=str(secrets["COMM_COLL_ID"]) 829 | ) 830 | 831 | # flush full kg before running tests 832 | mkg.flush_kg() 833 | return mkg 834 | 835 | 836 | def suite(): 837 | """testing suite def""" 838 | suite = unittest.TestSuite() 839 | suite.addTest(unittest.makeSuite(FirestoreKGTests)) 840 | suite.addTest(unittest.makeSuite(AuraKGTest)) 841 | suite.addTest(unittest.makeSuite(MongoKGTest)) 842 | # Add tests for other database classes as needed 843 | return suite 844 | 845 | 846 | if __name__ == "__main__": 847 | runner = unittest.TextTestRunner() 848 | runner.run(suite()) 849 | -------------------------------------------------------------------------------- /benchmarks/import_benchmarks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/user/graph2nosql/.venv-g2nsql/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 13 | " from .autonotebook import tqdm as notebook_tqdm\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from google.cloud import bigquery\n", 19 | "import google.auth\n", 20 | "\n", 21 | "from neo4j import GraphDatabase\n", 22 | "\n", 23 | "import os\n", 24 | "import json\n", 25 | "from dotenv import dotenv_values\n", 26 | "import time\n", 27 | "\n", 28 | "from main import NodeImportBenchmark, NodeQueryBenchmark\n", 29 | "\n", 30 | "from base.operations import NoSQLKnowledgeGraph\n", 31 | "from databases.firestore_kg import FirestoreKG\n", 32 | "from databases.n4j import AuraKG\n", 33 | "from databases.mdb import MongoKG\n", 34 | "\n", 35 | "from datamodel.data_model import NodeData, EdgeData, CommunityData" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "#### Setting env and global variables" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "secrets = dotenv_values(\"../.env\")\n", 52 | "credentials, _ = google.auth.load_credentials_from_file(secrets[\"GCP_CREDENTIAL_FILE\"])" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Firestore Knowledge Graph vs. AuraDB (Neo4J) latency comparison" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "#### Define Knowledge Graph DB Interface Options" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "Pinged your deployment. 
You successfully connected to MongoDB!\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "fskg = FirestoreKG(gcp_credential_file=secrets[\"GCP_CREDENTIAL_FILE\"],\n", 84 | " gcp_project_id=str(secrets[\"GCP_PROJECT_ID\"]),\n", 85 | " firestore_db_id=str(secrets[\"WIKIDATA_FS_DB\"]),\n", 86 | " node_collection_id=str(secrets[\"NODE_COLL_ID\"]),\n", 87 | " edges_collection_id=str(secrets[\"EDGES_COLL_ID\"]),\n", 88 | " community_collection_id=str(\n", 89 | " secrets[\"COMM_COLL_ID\"])\n", 90 | " )\n", 91 | "\n", 92 | "aura_kg = AuraKG(uri=str(secrets[\"NEO4J_URI\"]),\n", 93 | " auth=(str(secrets[\"NEO4J_USERNAME\"]),\n", 94 | " str(secrets[\"NEO4J_PASSWORD\"]))\n", 95 | " )\n", 96 | "\n", 97 | "mdb_username = str(secrets[\"MDB_USERNAME\"])\n", 98 | "mdb_passowrd = str(secrets[\"MDB_PASSWORD\"])\n", 99 | "mdb_cluster = str(secrets[\"MDB_CLUSTER\"])\n", 100 | "mdb_uri = f\"mongodb+srv://{mdb_username}:{mdb_passowrd}@cluster0.pjx3w.mongodb.net/?retryWrites=true&w=majority&appName={mdb_cluster}\"\n", 101 | "\n", 102 | "mkg = MongoKG(\n", 103 | " mdb_uri=mdb_uri,\n", 104 | " mdb_db_id=str(secrets[\"MDB_DB_ID\"]),\n", 105 | " node_coll_id=str(secrets[\"NODE_COLL_ID\"]),\n", 106 | " edges_coll_id=str(secrets[\"EDGES_COLL_ID\"]),\n", 107 | " community_collection_id=str(secrets[\"COMM_COLL_ID\"])\n", 108 | ")" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Importing Nodes Comparison" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "#### Fetch graph data from BigQuery" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 4, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "$$$$ Task Index 0, Task Count 1, Offset 302400\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "import_lim = 100\n", 140 | "task_index = int(os.getenv('CLOUD_RUN_TASK_INDEX', '0'))\n", 141 | "task_count = int(os.getenv('CLOUD_RUN_TASK_COUNT', '1'))\n", 142 | "rows_per_task = import_lim // task_count\n", 143 | "offset = task_index * rows_per_task + 302400\n", 144 | "\n", 145 | "print(\n", 146 | " f'$$$$ Task Index {task_index}, Task Count {task_count}, Offset {offset}')\n", 147 | "\n", 148 | "# Fetch Node data from BigQuery\n", 149 | "client = bigquery.Client(project=str(\n", 150 | " secrets[\"GCP_PROJECT_ID\"]), credentials=credentials)\n", 151 | "\n", 152 | "query_job = client.query(\n", 153 | " f\"SELECT * FROM poerschmann-sem-search.wikidata_kg.entity_doc_alias_joined LIMIT {rows_per_task} OFFSET {offset}\")" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "#### Run Node Import Benchmark" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 5, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "$$$$ Starting Benchmark Node Import with options: ['Firestore', 'Mongo', 'Aura'] $$$$\n", 173 | "Firestore time for 100 Node Import: 3.035799026489258\n", 174 | "Mongo time for 100 Node Import: 2.513396739959717\n", 175 | "Aura time for 100 Node Import: 1.9137544631958008\n", 176 | "hEllO wOrlD!\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "add_nodes_testing = NodeImportBenchmark(\n", 182 | " benchmark_name=\"Node Import\", import_lim=import_lim, options_dict={\"Firestore\": fskg, \"Mongo\": mkg, \"Aura\": aura_kg})\n", 183 | "add_nodes_testing(records=query_job)" 184 | 
] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "### Querying Nodes Comparison" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 6, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "$$$$ Starting Benchmark Node Query with options: ['Firestore', 'Mongo', 'Aura'] $$$$\n", 203 | "Firestore time for 100 Node Query: 0.9364566802978516\n", 204 | "Mongo time for 100 Node Query: 1.0969457626342773\n", 205 | "Aura time for 100 Node Query: 7.117481708526611\n", 206 | "hEllO wOrlD!\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "query_nodes_testing = NodeQueryBenchmark(\n", 212 | " benchmark_name=\"Node Query\", import_lim=import_lim, options_dict={\"Firestore\": fskg, \"Mongo\": mkg, \"Aura\": aura_kg})\n", 213 | "query_nodes_testing(records=query_job)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "### Querying deeply nested structures comparison" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "Challenge: Finding friends of friends of \"Q901\" (2nd degree directed)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 7, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "113" 239 | ] 240 | }, 241 | "execution_count": 7, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "f0f_list = []\n", 248 | "\n", 249 | "node_data = fskg.get_node(node_uid='Q901')\n", 250 | "\n", 251 | "for e in node_data.edges_from:\n", 252 | " neigh_node = fskg.get_node(node_uid=e)\n", 253 | " f0f_list.append(neigh_node.edges_from)\n", 254 | "\n", 255 | "len(sum(f0f_list, []))" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 8, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# f0f_list = []\n", 265 | "\n", 266 | "# node_data = mkg.get_node(node_uid='Q901')\n", 267 | "\n", 268 | "# for e in node_data.edges_from:\n", 269 | "# neigh_node = mkg.get_node(node_uid=e)\n", 270 | "# f0f_list.append(neigh_node.edges_from)\n", 271 | "\n", 272 | "# len(sum(f0f_list, []))" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "f0f_list" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 9, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "53" 291 | ] 292 | }, 293 | "execution_count": 9, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "records, summary, keys = aura_kg.driver.execute_query(\n", 300 | " \"MATCH (n)-[]-()-[]-(result) WHERE n.node_uid = 'Q901' RETURN result\")\n", 301 | "\n", 302 | "len(records)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 10, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "53" 314 | ] 315 | }, 316 | "execution_count": 10, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "records, summary, keys = aura_kg.driver.execute_query(\n", 323 | " \"\"\"MATCH (n)-[:DIRECTED]-()-[:DIRECTED]-(result)\n", 324 | " WHERE n.node_uid = 'Q901'\n", 325 | " RETURN result\"\"\")\n", 326 | "\n", 327 | "len(records)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 11, 333 | 
"metadata": {}, 334 | "outputs": [ 335 | { 336 | "data": { 337 | "text/plain": [ 338 | "53" 339 | ] 340 | }, 341 | "execution_count": 11, 342 | "metadata": {}, 343 | "output_type": "execute_result" 344 | } 345 | ], 346 | "source": [ 347 | "records, summary, keys = aura_kg.driver.execute_query(\n", 348 | " \"\"\"MATCH (n)-[:DIRECTED*2]-(result)\n", 349 | " WHERE n.node_uid = 'Q901'\n", 350 | " RETURN result\"\"\")\n", 351 | "\n", 352 | "len(records)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "Challenge 2: Finding friends of friends of friends \"Q901\" (3rd degree undirected)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 12, 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "data": { 369 | "text/plain": [ 370 | "51947" 371 | ] 372 | }, 373 | "execution_count": 12, 374 | "metadata": {}, 375 | "output_type": "execute_result" 376 | } 377 | ], 378 | "source": [ 379 | "f0fof_list = []\n", 380 | "\n", 381 | "node_data = fskg.get_node(node_uid='Q901')\n", 382 | "\n", 383 | "for e in node_data.edges_from + node_data.edges_to:\n", 384 | " neigh_node = fskg.get_node(node_uid=e)\n", 385 | " for e2 in neigh_node.edges_from + neigh_node.edges_to:\n", 386 | " neigh_node2 = fskg.get_node(node_uid=e2)\n", 387 | " f0fof_list.append(neigh_node2.edges_from)\n", 388 | " f0fof_list.append(neigh_node2.edges_to)\n", 389 | "\n", 390 | "len(sum(f0fof_list, []))" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 13, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "data": { 400 | "text/plain": [ 401 | "10078" 402 | ] 403 | }, 404 | "execution_count": 13, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "with GraphDatabase.driver(uri=aura_kg.uri, auth=aura_kg.auth) as driver:\n", 411 | " driver.verify_connectivity()\n", 412 | "\n", 413 | " # Use a parameter for node_uid in the Cypher query\n", 414 | " records, summary, keys = driver.execute_query(\n", 415 | " \"MATCH (n)-[]-()-[]-()-[]-(result) WHERE n.node_uid = 'Q901' RETURN result\")\n", 416 | "\n", 417 | "len(records)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "### Running Community Identification Comparison" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "tbd in comparison" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [] 440 | } 441 | ], 442 | "metadata": { 443 | "kernelspec": { 444 | "display_name": ".venv", 445 | "language": "python", 446 | "name": "python3" 447 | }, 448 | "language_info": { 449 | "codemirror_mode": { 450 | "name": "ipython", 451 | "version": 3 452 | }, 453 | "file_extension": ".py", 454 | "mimetype": "text/x-python", 455 | "name": "python", 456 | "nbconvert_exporter": "python", 457 | "pygments_lexer": "ipython3", 458 | "version": "3.11.8" 459 | } 460 | }, 461 | "nbformat": 4, 462 | "nbformat_minor": 2 463 | } 464 | -------------------------------------------------------------------------------- /benchmarks/main.py: -------------------------------------------------------------------------------- 1 | """graph2nosql latency benchmark library across graph storage implementations""" 2 | 3 | import os 4 | import json 5 | import time 6 | from abc import ABC, abstractmethod 7 | from typing import Any, Dict 8 | 9 | from dotenv import dotenv_values 10 | 11 | from 
google.cloud import bigquery 12 | import google.auth 13 | 14 | 15 | from base.operations import NoSQLKnowledgeGraph 16 | from databases.firestore_kg import FirestoreKG 17 | from databases.n4j import AuraKG 18 | 19 | from datamodel.data_model import NodeData, EdgeData 20 | 21 | 22 | class KGDBBenchmark(ABC): 23 | """ 24 | Abstract base class for defining latency benchmark experiments 25 | for different Knowledge Graph Databases (KGDBs). 26 | 27 | This class provides a framework for comparing the performance 28 | of different KGDB implementations on specific database operations. 29 | Concrete benchmark classes should inherit from this class and implement 30 | the `_construct_data` and `_db_transaction` methods. 31 | 32 | Attributes: 33 | benchmark_name (str): The name of the benchmark experiment. 34 | options_dict (Dict[str, NoSQLKnowledgeGraph]): Dictionary of 35 | KGDB implementations being compared. 36 | 37 | import_lim (int): The number of records to import/process in the benchmark. 38 | 39 | Example Usage: 40 | ```python 41 | class MyBenchmark(KGDBBenchmark): 42 | def __init__(self, options_dict, import_lim): 43 | super().__init__("My Benchmark", options_dict, import_lim) 44 | 45 | def _construct_data(self, row): 46 | # Implement logic to construct data for the benchmark from a row of input data. 47 | pass 48 | 49 | def _db_transaction(self, kgdb, option_name, data): 50 | # Implement the specific database operation to benchmark 51 | # using the provided kgdb and data. 52 | pass 53 | 54 | # Create instances of your KGDB implementations (e.g., FirestoreKG, AuraKG) 55 | option_1 = ... 56 | option_2 = ... 57 | 58 | options_dict = {"option_1_name": option_1, "option_2_name": option_2} 59 | 60 | # Create an instance of your benchmark class 61 | benchmark = MyBenchmark(options_dict, 1000) 62 | 63 | # Execute the benchmark 64 | benchmark(records) # 'records' would be your input data 65 | ``` 66 | """ 67 | 68 | 69 | def __init__(self, 70 | benchmark_name: str, 71 | options_dict: Dict[str, NoSQLKnowledgeGraph], 72 | import_lim: int, 73 | ): 74 | self.benchmark_name = benchmark_name 75 | self.import_lim = import_lim 76 | self.options_dict = options_dict 77 | self.option_names = list(options_dict.keys()) 78 | self.option_times = {} 79 | 80 | def __call__(self, records): 81 | 82 | print( 83 | f'$$$$ Starting Benchmark {self.benchmark_name} with options: {self.option_names} $$$$') 84 | 85 | for option_name in self.option_names: 86 | start_time = time.time() 87 | 88 | for row in records: 89 | data = self._construct_data(row) 90 | try: 91 | self._db_transaction(kgdb=self.options_dict[option_name], 92 | data=data, option_name=option_name) 93 | except Exception: 94 | pass 95 | 96 | end_time = time.time() 97 | self.option_times[option_name] = end_time - start_time 98 | 99 | self._benchmark_reporting() 100 | print("hEllO wOrlD!") 101 | 102 | def _benchmark_reporting(self) -> None: 103 | for option_name in self.option_names: 104 | print(f'{option_name} time for {self.import_lim} {self.benchmark_name}: {self.option_times[option_name]}') 105 | return None 106 | 107 | @abstractmethod 108 | def _construct_data(self, row: Any): 109 | """constructs the data to be used for the benchmark db transaction given the data records""" 110 | 111 | @abstractmethod 112 | def _db_transaction(self, kgdb: NoSQLKnowledgeGraph, option_name: str, data) -> None: 113 | """defines the db transaction that this benchmark run should compare""" 114 | 115 | 116 | class NodeImportBenchmark(KGDBBenchmark): 117 | """ 118 | Define Latency 
Benchmark for Node Import. Inherits from KGDBBenchmark.
119 | Implements _construct_data and _db_transaction methods for node import.
120 | """
121 | def _construct_data(self, row: Any):
122 | # constructs NodeData given a tuple[str, str, str] record
123 | record_values = row.values()
124 |
125 | body_str = json.loads(record_values[1])[0]
126 | # alias_list = json.loads(record_values[2])
127 | node_uid = record_values[0]
128 |
129 | node_data = NodeData(node_uid=node_uid,
130 | node_title=node_uid,
131 | node_description=body_str,
132 | node_degree=0,
133 | node_type="na",
134 | document_id="na")
135 | return node_data
136 |
137 | def _db_transaction(self, kgdb: NoSQLKnowledgeGraph, option_name, data: NodeData) -> None:
138 | # defines the db transaction that this benchmark run should compare
139 | try:
140 | kgdb.add_node(node_uid=data.node_uid, node_data=data)
141 | # print(f'Success adding node {data.node_uid} with {option_name}')
142 | except Exception as e:
143 | print(f"Error adding node {data.node_uid} with {option_name}: {e}")
144 |
145 |
146 | class EdgeImportBenchmark(KGDBBenchmark):
147 | """
148 | Define Latency Benchmark for edge import. Inherits from KGDBBenchmark.
149 | Implements _construct_data and _db_transaction methods for edge import.
150 | """
151 | def _construct_data(self, row: Any):
152 | # constructs EdgeData given a tuple[str, str, str, str] record
153 | # record_values = row.values()
154 |
155 | source_uid = row[0]
156 | edge_uid = row[1]
157 | target_uid = row[2]
158 | # description_body = json.loads(row.values()[3])
159 | edge_description = json.loads(row.values()[3])[0]
160 |
161 | edge_data = EdgeData(source_uid=source_uid,
162 | target_uid=target_uid,
163 | description=edge_description,
164 | edge_uid=edge_uid
165 | )
166 | return edge_data
167 |
168 | def _db_transaction(self, kgdb: NoSQLKnowledgeGraph, option_name: str, data: EdgeData) -> None:
169 | # defines the db transaction that this benchmark run should compare
170 | try:
171 | kgdb.add_edge(edge_data=data, directed=True)
172 | # print(f'Success adding edge {data.edge_uid} with {option_name}')
173 | except Exception as e:
174 | print(f"Error adding edge {data.edge_uid} with {option_name}: {e}")
175 | return None
176 |
177 |
178 | class NodeQueryBenchmark(KGDBBenchmark):
179 | """
180 | Define Latency Benchmark for node query. Inherits from KGDBBenchmark.
181 | Implements _construct_data and _db_transaction methods for node query.
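    Example (a minimal sketch mirroring benchmarks/import_benchmarks.ipynb; assumes
    configured FirestoreKG, MongoKG and AuraKG instances plus a BigQuery row
    iterator `query_job`):

        query_nodes_testing = NodeQueryBenchmark(
            benchmark_name="Node Query",
            import_lim=100,
            options_dict={"Firestore": fskg, "Mongo": mkg, "Aura": aura_kg})
        query_nodes_testing(records=query_job)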
182 | """ 183 | def _construct_data(self, row: Any): 184 | record_values = row.values() 185 | node_uid = record_values[0] 186 | return node_uid 187 | 188 | def _db_transaction(self, kgdb: NoSQLKnowledgeGraph, option_name, data: str): 189 | # defines the db transaction that this benchmark run should compare 190 | try: 191 | kgdb.get_node(node_uid=data) 192 | # print(f'Success fetching node data {data} with {option_name}') 193 | except Exception as e: 194 | print(f"Error fetching node data {data} with {option_name}: {e}") 195 | return None 196 | 197 | 198 | if __name__ == "__main__": 199 | os.chdir('../') 200 | current_directory = os.getcwd() 201 | print(f"Current directory: {current_directory}") 202 | 203 | secrets = dotenv_values(".env") 204 | credentials, _ = google.auth.load_credentials_from_file( 205 | secrets["GCP_CREDENTIAL_FILE"]) 206 | 207 | IMPORT_LIMIT = 100 208 | 209 | # Fetch Node data from BigQuery 210 | client = bigquery.Client(project=str( 211 | secrets["GCP_PROJECT_ID"]), credentials=credentials) 212 | 213 | fskg = FirestoreKG(gcp_credential_file=str(secrets["GCP_CREDENTIAL_FILE"]), 214 | gcp_project_id=str(secrets["GCP_PROJECT_ID"]), 215 | firestore_db_id=str(secrets["FIRESTORE_DB_ID"]), 216 | node_collection_id=str(secrets["NODE_COLL_ID"]), 217 | edges_collection_id=str(secrets["EDGES_COLL_ID"]), 218 | community_collection_id=str( 219 | secrets["COMM_COLL_ID"]) 220 | ) 221 | 222 | aura_kg = AuraKG(uri=str(secrets["NEO4J_URI"]), 223 | auth=(str(secrets["NEO4J_USERNAME"]), 224 | str(secrets["NEO4J_PASSWORD"])) 225 | ) 226 | 227 | # clean kg storages before starting test run 228 | # fskg.flush_kg() 229 | # aura_kg.flush_kg() 230 | 231 | # # # add nodes testing 232 | # query_job = client.query( 233 | # f"SELECT * FROM poerschmann-sem-search.wikidata_kg.entity_doc_alias_joined LIMIT {import_lim}") 234 | # add_nodes_testing = NodeImportBenchmark(benchmark_name="Node Import", option_1=fskg, option_2=aura_kg, import_lim=100) 235 | # add_nodes_testing(records=query_job) 236 | 237 | # # add egdes testing 238 | # query_job = client.query( 239 | # f"SELECT * FROM poerschmann-sem-search.wikidata_kg.triplets_relations_joined") 240 | # add_edges_testing = EdgeImportBenchmark(benchmark_name="Edge Import", option_1=fskg, option_2=aura_kg, import_lim=100) 241 | # add_edges_testing(records=query_job) 242 | 243 | print('hello base!') 244 | -------------------------------------------------------------------------------- /benchmarks/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery==3.26.0 2 | -------------------------------------------------------------------------------- /databases/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jakobap/graph2nosql/77df8ecba857c61381a37b878d57c20d52ff9834/databases/__init__.py -------------------------------------------------------------------------------- /databases/firestore_kg.py: -------------------------------------------------------------------------------- 1 | """Firestore database operations implementation""" 2 | 3 | from typing import List 4 | 5 | import firebase_admin # type: ignore 6 | from firebase_admin import firestore 7 | from google.cloud.firestore_v1.base_vector_query import DistanceMeasure 8 | from google.cloud.firestore_v1.vector import Vector 9 | import google.auth 10 | 11 | import networkx as nx # type: ignore 12 | 13 | from datamodel.data_model import NodeData, EdgeData, CommunityData 
14 | from base.operations import NoSQLKnowledgeGraph 15 | 16 | 17 | class FirestoreKG(NoSQLKnowledgeGraph): 18 | """Firestore database operations implementation class""" 19 | 20 | def __init__(self, 21 | gcp_project_id: str, 22 | gcp_credential_file: str, 23 | firestore_db_id: str, 24 | node_collection_id: str, 25 | edges_collection_id: str, 26 | community_collection_id: str 27 | ) -> None: 28 | """ 29 | Initializes the FirestoreKG object. 30 | 31 | Args: 32 | project_id (str): The Google Cloud project ID. 33 | database_id (str): The ID of the Firestore database. 34 | collection_name (str): The name of the collection to store the KG. 35 | """ 36 | super().__init__() 37 | 38 | if not firebase_admin._apps: 39 | credentials = firebase_admin.credentials.Certificate( 40 | gcp_credential_file 41 | ) 42 | app = firebase_admin.initialize_app(credentials) 43 | 44 | self.credentials, self.project_id = google.auth.load_credentials_from_file( 45 | gcp_credential_file) 46 | 47 | self.db = firestore.Client(project=gcp_project_id, # type: ignore 48 | credentials=self.credentials, 49 | database=firestore_db_id) 50 | 51 | self.gcp_project_id = gcp_project_id 52 | self.database_id = firestore_db_id 53 | self.node_coll_id = node_collection_id 54 | self.edges_coll_id = edges_collection_id 55 | self.community_coll_id = community_collection_id 56 | 57 | def add_node(self, node_uid: str, node_data: NodeData) -> None: 58 | """Adds an node to the knowledge graph.""" 59 | doc_ref = self.db.collection(self.node_coll_id).document(node_uid) 60 | 61 | # Check if a node with the same node_uid already exists 62 | if doc_ref.get().exists: 63 | raise ValueError( 64 | f"Error: Node with node_uid '{node_uid}' already exists.") 65 | 66 | # block NodeData if edge info is included 67 | if node_data.edges_to or node_data.edges_from: 68 | raise ValueError( 69 | f"""Error: NodeData cannot be initiated with edges_to or edges_from. Please add edges separately.""") 70 | 71 | # Convert NodeData to a dictionary for Firestore storage 72 | try: 73 | node_data_dict = node_data.__dict__ 74 | except TypeError as e: 75 | raise ValueError( 76 | f"Error: Provided node_data for node_uid '{node_uid}' cannot be converted to a dictionary. Details: {e}" 77 | ) from e 78 | 79 | # Set the document ID to match the node_uid 80 | try: 81 | doc_ref.set(node_data_dict) 82 | except ValueError as e: 83 | raise ValueError( 84 | f"Error: Could not add node with node_uid '{node_uid}' to Firestore. 
Details: {e}" 85 | ) from e 86 | 87 | # Update references in other nodes 88 | for other_node_uid in node_data.edges_to: 89 | try: 90 | other_node_data = self.get_node(other_node_uid) 91 | other_node_data.edges_from = list(set(other_node_data.edges_from) | { 92 | node_uid}) # Add to edges_from 93 | self.update_node(other_node_uid, other_node_data) 94 | except KeyError: 95 | # If the other node doesn't exist, just continue 96 | continue 97 | 98 | for other_node_uid in node_data.edges_from: 99 | try: 100 | other_node_data = self.get_node(other_node_uid) 101 | other_node_data.edges_to = list(set(other_node_data.edges_from) | { 102 | node_uid}) # Add to edges_to 103 | self.update_node(other_node_uid, other_node_data) 104 | except KeyError: 105 | # If the other node doesn't exist, just continue 106 | continue 107 | 108 | def get_node(self, node_uid: str) -> NodeData: 109 | """Retrieves an node from the knowledge graph.""" 110 | doc_ref = self.db.collection(self.node_coll_id).document(node_uid) 111 | doc_snapshot = doc_ref.get() 112 | 113 | if doc_snapshot.exists: 114 | try: 115 | node_data = NodeData(**doc_snapshot.to_dict()) 116 | return node_data 117 | except TypeError as e: 118 | raise ValueError( 119 | f"Error: Data fetched for node_uid '{node_uid}' does not match the NodeData format. Details: {e}" 120 | ) from e 121 | else: 122 | raise KeyError(f"Error: No node found with node_uid: {node_uid}") 123 | 124 | def update_node(self, node_uid: str, node_data: NodeData) -> None: 125 | """Updates an existing node in the knowledge graph.""" 126 | doc_ref = self.db.collection(self.node_coll_id).document(node_uid) 127 | 128 | # Check if the node exists 129 | if not doc_ref.get().exists: 130 | raise KeyError( 131 | f"Error: Node with node_uid '{node_uid}' does not exist.") 132 | 133 | # Convert NodeData to a dictionary for Firestore storage 134 | try: 135 | node_data_dict = node_data.__dict__ 136 | except TypeError as e: 137 | raise ValueError( 138 | f"Error: Provided node_data for node_uid '{node_uid}' cannot be converted to a dictionary. Details: {e}" 139 | ) from e 140 | 141 | # Update the document 142 | try: 143 | doc_ref.update(node_data_dict) 144 | except ValueError as e: 145 | raise ValueError( 146 | f"Error: Could not update node with node_uid '{node_uid}' in Firestore. Details: {e}" 147 | ) from e 148 | 149 | def remove_node(self, node_uid: str) -> None: 150 | """ 151 | Removes an node from the knowledge graph. 152 | Also removed all edges to and from the node to be removed from all other nodes. 153 | """ 154 | doc_ref = self.db.collection(self.node_coll_id).document(node_uid) 155 | 156 | # TODO: Update edge collection on edge removal. 157 | 158 | # Check if the node exists 159 | if not doc_ref.get().exists: 160 | raise KeyError( 161 | f"Error: Node with node_uid '{node_uid}' does not exist.") 162 | 163 | # 1. Get the node data to find its connections 164 | node_data = self.get_node(node_uid) 165 | 166 | # 2. Remove connections TO this node from other nodes 167 | for other_node_uid in node_data.edges_from: 168 | try: 169 | other_node_data = self.get_node(other_node_uid) 170 | other_node_data.edges_to = list( 171 | edge for edge in other_node_data.edges_to if edge != node_uid 172 | ) 173 | self.update_node(other_node_uid, other_node_data) 174 | except KeyError: 175 | # If the other node doesn't exist, just continue 176 | continue 177 | 178 | # 3. 
Remove connections FROM this node to other nodes 179 | for other_node_uid in node_data.edges_to: 180 | try: 181 | other_node_data = self.get_node(other_node_uid) 182 | other_node_data.edges_from = list( 183 | edge for edge in other_node_data.edges_from if edge != node_uid 184 | ) 185 | self.update_node(other_node_uid, other_node_data) 186 | except KeyError: 187 | # If the other node doesn't exist, just continue 188 | continue 189 | 190 | # 4. Finally, remove the node itself 191 | doc_ref.delete() 192 | 193 | def add_edge(self, edge_data: EdgeData) -> None: 194 | """ 195 | Adds an edge (relationship) between two entities in the knowledge graph. 196 | 197 | Args: 198 | source_uid (str): The UID of the source node. 199 | target_uid (str): The UID of the target node. 200 | edge_data (EdgeData): The edge data to be added. 201 | directed (bool, optional): Whether the edge is directed. Defaults to True. 202 | """ 203 | 204 | # Check if source and target nodes exist 205 | if not self.get_node(edge_data.source_uid): 206 | raise KeyError( 207 | f"Error: Source node with node_uid '{edge_data.source_uid}' does not exist.") 208 | if not self.get_node(edge_data.target_uid): 209 | raise KeyError( 210 | f"Error: Target node with node_uid '{edge_data.target_uid}' does not exist.") 211 | 212 | # Type checking for edge_data 213 | if not isinstance(edge_data, EdgeData): 214 | raise TypeError( 215 | f"Error: edge_data must be of type EdgeData, not {type(edge_data)}") 216 | 217 | edge_uid = self._generate_edge_uid( 218 | source_uid=edge_data.source_uid, target_uid=edge_data.target_uid) 219 | 220 | try: 221 | source_node_data = self.get_node(edge_data.source_uid) 222 | target_node_data = self.get_node(edge_data.target_uid) 223 | 224 | source_node_data.edges_to = list( 225 | set(source_node_data.edges_to) | {edge_data.target_uid}) 226 | self.update_node(edge_data.source_uid, source_node_data) 227 | 228 | # Add the edge to the target node's edges_from 229 | target_node_data.edges_from = list( 230 | set(target_node_data.edges_from) | {edge_data.source_uid}) 231 | self.update_node(edge_data.target_uid, target_node_data) 232 | 233 | # Add the edge to the edges collection 234 | self._update_egde_coll(edge_uid=edge_uid, 235 | target_uid=edge_data.target_uid, 236 | source_uid=edge_data.source_uid, 237 | description=edge_data.description, 238 | directed=edge_data.directed) 239 | 240 | if not edge_data.directed: # If undirected, add the reverse edge as well 241 | target_node_data.edges_to = list( 242 | set(target_node_data.edges_to) | {edge_data.source_uid}) 243 | self.update_node(edge_data.target_uid, target_node_data) 244 | 245 | # Since it's undirected, also add source_uid to target_node_data.edges_from 246 | source_node_data.edges_from = list( 247 | set(source_node_data.edges_from) | {edge_data.target_uid}) 248 | self.update_node(edge_data.source_uid, source_node_data) 249 | 250 | # Add the reverse edge to the edges collection 251 | reverse_edge_uid = self._generate_edge_uid(source_uid=edge_data.target_uid, 252 | target_uid=edge_data.source_uid) 253 | self._update_egde_coll(edge_uid=reverse_edge_uid, 254 | target_uid=edge_data.source_uid, 255 | source_uid=edge_data.target_uid, 256 | description=edge_data.description, 257 | directed=edge_data.directed) 258 | 259 | except ValueError as e: 260 | raise ValueError( 261 | f"Error: Could not add edge from '{edge_data.source_uid}' to '{edge_data.target_uid}'. 
Details: {e}" 262 | ) from e 263 | 264 | def get_edge(self, source_uid: str, target_uid: str) -> EdgeData: 265 | """Retrieves an edge between two entities from the edges collection.""" 266 | edge_uid = self._generate_edge_uid(source_uid, target_uid) 267 | edge_doc_ref = self.db.collection( 268 | self.edges_coll_id).document(edge_uid) 269 | doc_snapshot = edge_doc_ref.get() 270 | 271 | if doc_snapshot.exists: 272 | try: 273 | edge_data = EdgeData(**doc_snapshot.to_dict()) 274 | return edge_data 275 | except TypeError as e: 276 | raise ValueError( 277 | f"Error: Data fetched for edge_uid '{edge_uid}' does not match the EdgeData format. Details: {e}" 278 | ) from e 279 | else: 280 | raise KeyError(f"Error: No edge found with edge_uid: {edge_uid}") 281 | 282 | def update_edge(self, edge_data: EdgeData) -> None: 283 | """Updates an existing edge in the knowledge graph.""" 284 | 285 | # 1. Validate input and check if the edge exists 286 | if not isinstance(edge_data, EdgeData): 287 | raise TypeError( 288 | f"Error: edge_data must be of type EdgeData, not {type(edge_data)}") 289 | 290 | edge_uid = self._generate_edge_uid( 291 | edge_data.source_uid, edge_data.target_uid) 292 | 293 | if not self.db.collection(self.edges_coll_id).document(edge_uid).get().exists: 294 | raise KeyError( 295 | f"Error: Edge with edge_uid '{edge_uid}' does not exist.") 296 | 297 | # 2. Update the edge document in the EDGES collection 298 | try: 299 | self._update_egde_coll( 300 | edge_uid=edge_uid, 301 | target_uid=edge_data.target_uid, 302 | source_uid=edge_data.source_uid, 303 | description=edge_data.description, 304 | directed=edge_data.directed 305 | ) 306 | except Exception as e: 307 | raise Exception( 308 | f"Error updating edge in edges collection: {e}") from e 309 | 310 | # 3. Update edge references in the NODES collection 311 | try: 312 | # 3a. Update source node 313 | source_node_data = self.get_node(edge_data.source_uid) 314 | # Ensure the target_uid is present in edges_to 315 | if edge_data.target_uid not in source_node_data.edges_to: 316 | source_node_data.edges_to = list( 317 | set(source_node_data.edges_to) | {edge_data.target_uid}) 318 | self.update_node(edge_data.source_uid, source_node_data) 319 | 320 | # 3b. 
Update target node 321 | target_node_data = self.get_node(edge_data.target_uid) 322 | # Ensure the source_uid is present in edges_from 323 | if edge_data.source_uid not in target_node_data.edges_from: 324 | target_node_data.edges_from = list( 325 | set(target_node_data.edges_from) | {edge_data.source_uid}) 326 | self.update_node(edge_data.target_uid, target_node_data) 327 | 328 | except Exception as e: 329 | raise Exception( 330 | f"Error updating edge references in nodes: {e}") from e 331 | 332 | def _delete_from_edge_coll(self, edge_uid: str) -> None: 333 | """Method to delete record from edge collection of given kg store""" 334 | edge_doc_ref = self.db.collection( 335 | self.edges_coll_id).document(edge_uid) 336 | edge_doc_ref.delete() 337 | 338 | def remove_edge(self, source_uid: str, target_uid: str) -> None: 339 | """Removes an edge between two entities.""" 340 | 341 | # Get involved edge and node data 342 | try: 343 | edge_data = self.get_edge( 344 | source_uid=source_uid, target_uid=target_uid) 345 | except Exception as e: 346 | raise Exception(f"Error getting edge: {e}") from e 347 | 348 | try: 349 | source_node_data = self.get_node(node_uid=source_uid) 350 | except Exception as e: 351 | raise Exception(f"Error getting source node: {e}") from e 352 | 353 | try: 354 | target_node_data = self.get_node(node_uid=target_uid) 355 | except Exception as e: 356 | raise Exception(f"Error getting target node: {e}") from e 357 | 358 | # remove target_uid from from source -> target 359 | try: 360 | source_node_data.edges_to.remove(target_uid) 361 | self.update_node(source_uid, source_node_data) 362 | except ValueError as e: 363 | raise ValueError( 364 | f"Error: Target node not in source's edges_to: {e}") 365 | 366 | # remove source_uid from target <- source 367 | try: 368 | target_node_data.edges_from.remove(source_uid) 369 | self.update_node(target_uid, target_node_data) 370 | except ValueError as e: 371 | raise ValueError( 372 | f"Error: Source node not in target's edges_to: {e}") 373 | 374 | # Remove the edge from the edges collection 375 | edge_uid = self._generate_edge_uid(source_uid, target_uid) 376 | self._delete_from_edge_coll(edge_uid=edge_uid) 377 | 378 | # remove the opposite direction if edge undirected 379 | if not edge_data.directed: 380 | # remove target_uid from source <- target 381 | try: 382 | source_node_data.edges_from.remove(target_uid) 383 | self.update_node(source_uid, source_node_data) 384 | except ValueError as e: 385 | raise ValueError( 386 | f"Error: Target node not in source's edges_to: {e}") 387 | 388 | # remove source_uid from target -> source 389 | try: 390 | target_node_data.edges_to.remove(source_uid) 391 | self.update_node(target_uid, target_node_data) 392 | except ValueError as e: 393 | raise ValueError( 394 | f"Error: Source node not in target's edges_to: {e}") 395 | 396 | # Remove the edge from the edges collection 397 | reverse_edge_uid = self._generate_edge_uid(target_uid, source_uid) 398 | self._delete_from_edge_coll(edge_uid=reverse_edge_uid) 399 | else: 400 | pass 401 | 402 | def build_networkx(self): 403 | """Get the NetworkX representation of the full graph.""" 404 | 405 | graph = nx.Graph() # Initialize an undirected NetworkX graph 406 | 407 | # 1. Add Nodes to the NetworkX Graph 408 | nodes_ref = self.db.collection(self.node_coll_id).stream() 409 | for doc in nodes_ref: 410 | node_data = doc.to_dict() 411 | graph.add_node(doc.id, **node_data) 412 | 413 | # 2. 
Add Edges to the NetworkX Graph 414 | edges_ref = self.db.collection(self.edges_coll_id).stream() 415 | for doc in edges_ref: 416 | edge_data = doc.to_dict() 417 | source_uid = edge_data['source_uid'] 418 | target_uid = edge_data['target_uid'] 419 | # Consider adding edge attributes if needed (e.g., 'description') 420 | graph.add_edge(source_uid, target_uid) 421 | 422 | self.networkx = graph 423 | 424 | def get_community(self, community_id: str) -> CommunityData: 425 | """Retrieves the community report for a given community id.""" 426 | doc_ref = self.db.collection( 427 | self.community_coll_id).document(community_id) 428 | doc_snapshot = doc_ref.get() 429 | 430 | if doc_snapshot.exists: 431 | try: 432 | community_data = CommunityData(**doc_snapshot.to_dict()) 433 | return community_data 434 | except TypeError as e: 435 | raise ValueError( 436 | f"Error: Data fetched for community_id '{community_id}' does not match the CommunityData format. Details: {e}" 437 | ) from e 438 | else: 439 | raise KeyError( 440 | f"Error: No community found with community_id: {community_id}") 441 | 442 | def list_communities(self) -> List[CommunityData]: 443 | """Lists all communities for the given network.""" 444 | docs = self.db.collection(self.community_coll_id).stream() 445 | return [CommunityData.__from_dict__(doc.to_dict()) for doc in docs] 446 | 447 | def _update_egde_coll(self, edge_uid: str, source_uid: str, target_uid: str, description: str, directed: bool) -> None: 448 | """Update edge record in the edges collection.""" 449 | edge_doc_ref = self.db.collection( 450 | self.edges_coll_id).document(edge_uid) 451 | edge_data_dict = { 452 | "edge_uid": edge_uid, 453 | "source_uid": source_uid, 454 | "target_uid": target_uid, 455 | "description": description, 456 | "directed": directed 457 | } 458 | edge_doc_ref.set(edge_data_dict) 459 | 460 | def store_community(self, community: CommunityData) -> None: 461 | """Takes valid graph community data and upserts the database with it. 462 | https://www.nature.com/articles/s41598-019-41695-z 463 | """ 464 | # Convert CommunityData to a dictionary for Firestore storage 465 | try: 466 | community_data_dict = community.__dict__ 467 | except TypeError as e: 468 | raise ValueError( 469 | f"Error: Provided community data for community '{community.title}' cannot be converted to a dictionary. 
Details: {e}" 470 | ) from e 471 | 472 | # Get a reference to the document 473 | doc_ref = self.db.collection( 474 | self.community_coll_id).document(community.title) 475 | 476 | # Use set with merge=True to upsert the document 477 | try: 478 | doc_ref.set(community_data_dict, merge=True) 479 | except Exception as e: 480 | raise Exception(f"Error storing community data: {e}") from e 481 | 482 | def _generate_edge_uid(self, source_uid: str, target_uid: str): 483 | return f"{source_uid}_to_{target_uid}" 484 | 485 | def node_exist(self, node_uid: str) -> bool: 486 | """Checks for node existence and returns boolean""" 487 | doc_ref = self.db.collection(self.node_coll_id).document(node_uid) 488 | doc_snapshot = doc_ref.get() 489 | 490 | if doc_snapshot.exists: 491 | return True 492 | else: 493 | return False 494 | 495 | def edge_exist(self, source_uid: str, target_uid: str) -> bool: 496 | """Checks for edge existence and returns boolean""" 497 | edge_uid = self._generate_edge_uid( 498 | source_uid=source_uid, target_uid=target_uid) 499 | doc_ref = self.db.collection(self.edges_coll_id).document(edge_uid) 500 | doc_snapshot = doc_ref.get() 501 | 502 | if doc_snapshot.exists: 503 | return True 504 | else: 505 | return False 506 | 507 | def get_nearest_neighbors(self, query_vec: list[float]) -> list: 508 | """ 509 | Implements nearest neighbor search based on Firestore embedding index: 510 | https://firebase.google.com/docs/firestore/vector-search 511 | """ 512 | 513 | collection = self.db.collection(self.node_coll_id) 514 | 515 | # Requires vector index 516 | nn = collection.find_nearest( 517 | vector_field="embedding", 518 | query_vector=Vector(query_vec), 519 | distance_measure=DistanceMeasure.EUCLIDEAN, 520 | limit=10).get() 521 | return [n.to_dict() for n in nn] 522 | 523 | def clean_zerodegree_nodes(self) -> None: 524 | """Removes all nodes with degree 0.""" 525 | nodes_to_remove = [] 526 | 527 | # 1. Iterate through all nodes to find those with degree 0 528 | nodes_ref = self.db.collection(self.node_coll_id).stream() 529 | for doc in nodes_ref: 530 | node_data = doc.to_dict() 531 | if len(node_data.get('edges_to', [])) + len(node_data.get('edges_from', [])) == 0: 532 | nodes_to_remove.append(doc.id) 533 | 534 | # 2. 
Remove the identified nodes 535 | for node_uid in nodes_to_remove: 536 | self.remove_node(node_uid) 537 | return None 538 | 539 | def flush_kg(self) -> None: 540 | """Method to wipe the complete datastore of the knowledge graph""" 541 | for collection_id in [self.node_coll_id, self.edges_coll_id, self.community_coll_id]: 542 | docs = self.db.collection(collection_id).stream() 543 | for doc in docs: 544 | doc.reference.delete() 545 | return None 546 | 547 | 548 | if __name__ == "__main__": 549 | import os 550 | from dotenv import dotenv_values 551 | 552 | os.chdir(os.path.dirname(os.path.abspath(__file__))) 553 | 554 | secrets = dotenv_values(".env") 555 | 556 | firestore_credential_file = str(secrets["GCP_CREDENTIAL_FILE"]) 557 | project_id = str(secrets["GCP_PROJECT_ID"]) 558 | database_id = str(secrets["FIRESTORE_DB_ID"]) 559 | node_coll_id = str(secrets["NODE_COLL_ID"]) 560 | edges_coll_id = str(secrets["EDGES_COLL_ID"]) 561 | community_coll_id = str(secrets["COMM_COLL_ID"]) 562 | 563 | fskg = FirestoreKG( 564 | gcp_project_id=project_id, 565 | gcp_credential_file=firestore_credential_file, 566 | firestore_db_id=database_id, 567 | node_collection_id=node_coll_id, 568 | edges_collection_id=edges_coll_id, 569 | community_collection_id=community_coll_id 570 | ) 571 | 572 | node = fskg.get_node(node_uid="2022 IRANIAN PROTESTS") 573 | 574 | nn = fskg.get_nearest_neighbors(node.embedding) 575 | 576 | for n in nn: 577 | print(n["node_uid"]) 578 | 579 | print("Hello World!") 580 | print("") 581 | -------------------------------------------------------------------------------- /databases/mdb.py: -------------------------------------------------------------------------------- 1 | """MongoDB Database Operations""" 2 | 3 | from typing import List 4 | 5 | from pymongo.mongo_client import MongoClient 6 | from pymongo.server_api import ServerApi 7 | 8 | from datamodel.data_model import NodeData, EdgeData, CommunityData 9 | from base.operations import NoSQLKnowledgeGraph 10 | 11 | import networkx as nx # type: ignore 12 | 13 | 14 | class MongoKG(NoSQLKnowledgeGraph): 15 | """MongoDB Database Operations Class""" 16 | 17 | def __init__(self, 18 | mdb_uri: str, 19 | mdb_db_id: str, 20 | node_coll_id: str, 21 | edges_coll_id: str, 22 | community_collection_id: str 23 | ): 24 | super().__init__() 25 | 26 | # Connect and send a ping to confirm a successful mongo db connection 27 | self.mdb_client = MongoClient(str(mdb_uri), server_api=ServerApi('1')) 28 | 29 | self.db = self.mdb_client[mdb_db_id] 30 | self.mdb_node_coll = self.db[node_coll_id] 31 | self.mdbe_edges_coll = self.db[edges_coll_id] 32 | self.mdb_comm_coll = self.db[community_collection_id] 33 | 34 | try: 35 | # client.admin.command('ping') 36 | self.mdb_client.admin.command('ping') 37 | print("Pinged your deployment. You successfully connected to MongoDB!") 38 | except Exception as e: 39 | print(e) 40 | raise Exception(f"Error connecting to MongoDB: {e}") 41 | 42 | def add_node(self, node_uid: str, node_data: NodeData) -> None: 43 | """Adds an node to the knowledge graph.""" 44 | # Check if a node with the same node_uid already exists 45 | if self.mdb_node_coll.find_one({"node_uid": node_uid}): 46 | raise KeyError( 47 | f"Error: Node with node_uid '{node_uid}' already exists.") 48 | 49 | if node_data.edges_to or node_data.edges_from: 50 | raise ValueError( 51 | f"""Error: NodeData cannot be initiated with edges_to or edges_from. 
Please add edges separately.""") 52 | 53 | try: 54 | # Convert NodeData to a dictionary for MongoDB storage 55 | node_data_dict = node_data.__dict__ 56 | 57 | # Insert the node data into the collection 58 | self.mdb_node_coll.insert_one(node_data_dict) 59 | 60 | except Exception as e: 61 | raise Exception( 62 | f"Error adding node with node_uid '{node_uid}': {e}") from e 63 | 64 | def get_node(self, node_uid: str) -> NodeData: 65 | """Retrieves an node from the knowledge graph.""" 66 | # Find the node data based on node_uid 67 | node_data_dict = self.mdb_node_coll.find_one({"node_uid": node_uid}) 68 | 69 | if node_data_dict: 70 | # Convert the dictionary back to a NodeData object 71 | return NodeData( 72 | node_uid=node_data_dict['node_uid'], 73 | node_title=node_data_dict['node_title'], 74 | node_type=node_data_dict['node_type'], 75 | node_description=node_data_dict['node_description'], 76 | node_degree=node_data_dict.get('node_degree', 0), 77 | document_id=node_data_dict.get('document_id', ''), 78 | community_id=node_data_dict.get('community_id', ''), 79 | edges_to=node_data_dict.get('edges_to', []), 80 | edges_from=node_data_dict.get('edges_from', []), 81 | embedding=node_data_dict.get('embedding', []) 82 | ) 83 | else: 84 | raise KeyError(f"Error: No node found with node_uid: {node_uid}") 85 | 86 | def update_node(self, node_uid: str, node_data: NodeData) -> None: 87 | """Updates an existing node in the knowledge graph.""" 88 | try: 89 | # Check if the node exists 90 | if not self.mdb_node_coll.find_one({"node_uid": node_uid}): 91 | raise KeyError( 92 | f"Error: Node with node_uid '{node_uid}' does not exist.") 93 | 94 | # Convert NodeData to a dictionary for MongoDB storage 95 | node_data_dict = node_data.__dict__ 96 | 97 | # Update the node data in the collection 98 | self.mdb_node_coll.update_one( 99 | {"node_uid": node_uid}, {"$set": node_data_dict} 100 | ) 101 | 102 | except Exception as e: 103 | raise Exception( 104 | f"Error updating node with node_uid '{node_uid}': {e}") from e 105 | 106 | def remove_node(self, node_uid: str) -> None: 107 | """Removes a node from the knowledge graph.""" 108 | 109 | # Check if the node exists 110 | if not self.node_exist(node_uid=node_uid): 111 | raise KeyError( 112 | f"Error: Node with node_uid '{node_uid}' does not exist.") 113 | 114 | # 1. Get the node data to find its connections 115 | node_data = self.get_node(node_uid) 116 | 117 | # TODO: Update edge collection on edge removal. 118 | 119 | # 2. Remove connections TO this node from other nodes 120 | for other_node_uid in node_data.edges_from: 121 | try: 122 | other_node_data = self.get_node(other_node_uid) 123 | other_node_data.edges_to = list( 124 | edge for edge in other_node_data.edges_to if edge != node_uid 125 | ) 126 | self.update_node(other_node_uid, other_node_data) 127 | except KeyError: 128 | # If the other node doesn't exist, just continue 129 | continue 130 | 131 | # 3. Remove connections FROM this node to other nodes 132 | for other_node_uid in node_data.edges_to: 133 | try: 134 | other_node_data = self.get_node(other_node_uid) 135 | other_node_data.edges_from = list( 136 | edge for edge in other_node_data.edges_from if edge != node_uid 137 | ) 138 | self.update_node(other_node_uid, other_node_data) 139 | except KeyError: 140 | # If the other node doesn't exist, just continue 141 | continue 142 | 143 | # 4. 
Finally, remove the node itself 144 | delete_result = self.mdb_node_coll.delete_one({"node_uid": node_uid}) 145 | if delete_result.deleted_count == 1: 146 | return None 147 | else: 148 | raise KeyError(f"Error: No node found with node_uid: {node_uid}") 149 | 150 | def add_edge(self, edge_data: EdgeData) -> None: 151 | """Adds an edge (relationship) between two entities in the knowledge graph.""" 152 | 153 | # TODO: consider moving to base class. 154 | 155 | # Check if source and target nodes exist 156 | if not self.node_exist(edge_data.source_uid): 157 | raise KeyError( 158 | f"Error: Source node with node_uid '{edge_data.source_uid}' does not exist.") 159 | if not self.node_exist(edge_data.target_uid): 160 | raise KeyError( 161 | f"Error: Target node with node_uid '{edge_data.target_uid}' does not exist.") 162 | 163 | # Type checking for edge_data 164 | if not isinstance(edge_data, EdgeData): 165 | raise TypeError( 166 | f"Error: edge_data must be of type EdgeData, not {type(edge_data)}") 167 | 168 | edge_uid = self._generate_edge_uid( 169 | edge_data.source_uid, edge_data.target_uid) 170 | 171 | try: 172 | source_node_data = self.get_node(edge_data.source_uid) 173 | target_node_data = self.get_node(edge_data.target_uid) 174 | 175 | source_node_data.edges_to = list( 176 | set(source_node_data.edges_to) | {edge_data.target_uid}) 177 | self.update_node(edge_data.source_uid, source_node_data) 178 | 179 | # Add the edge to the target node's edges_from 180 | target_node_data.edges_from = list( 181 | set(target_node_data.edges_from) | {edge_data.source_uid}) 182 | self.update_node(edge_data.target_uid, target_node_data) 183 | 184 | # Add the edge to the edges collection 185 | self._update_egde_coll(edge_uid=edge_uid, 186 | target_uid=edge_data.target_uid, 187 | source_uid=edge_data.source_uid, 188 | description=edge_data.description, 189 | directed=edge_data.directed) 190 | 191 | if not edge_data.directed: # If undirected, add the reverse edge as well 192 | reverse_edge_uid = self._generate_edge_uid( 193 | edge_data.target_uid, edge_data.source_uid) 194 | 195 | target_node_data.edges_to = list( 196 | set(target_node_data.edges_to) | {edge_data.source_uid}) 197 | self.update_node(edge_data.target_uid, target_node_data) 198 | 199 | # Since it's undirected, also add source_uid to target_node_data.edges_from 200 | source_node_data.edges_from = list( 201 | set(source_node_data.edges_from) | {edge_data.target_uid}) 202 | self.update_node(edge_data.source_uid, source_node_data) 203 | 204 | # Add the reverse edge to the edges collection 205 | self._update_egde_coll(edge_uid=reverse_edge_uid, 206 | target_uid=edge_data.source_uid, 207 | source_uid=edge_data.target_uid, 208 | description=edge_data.description, 209 | directed=edge_data.directed) 210 | 211 | except ValueError as e: 212 | raise ValueError( 213 | f"Error: Could not add edge from '{edge_data.source_uid}' to '{edge_data.target_uid}'. 
Details: {e}" 214 | ) from e 215 | 216 | def get_edge(self, source_uid: str, target_uid: str) -> EdgeData: 217 | """Retrieves an edge between two entities.""" 218 | edge_uid = self._generate_edge_uid(source_uid, target_uid) 219 | edge_data_dict = self.mdbe_edges_coll.find_one({"edge_uid": edge_uid}) 220 | 221 | if edge_data_dict: 222 | return EdgeData( 223 | edge_uid=edge_data_dict.get('edge_uid', ''), 224 | source_uid=edge_data_dict.get('source_uid', ''), 225 | target_uid=edge_data_dict.get('target_uid', ''), 226 | description=edge_data_dict.get('description', ''), 227 | directed=edge_data_dict.get('directed', True) 228 | ) 229 | else: 230 | raise KeyError(f"Error: No edge found with edge_uid: {edge_uid}") 231 | 232 | def update_edge(self, edge_data: EdgeData) -> None: 233 | """Updates an existing edge in the knowledge graph.""" 234 | 235 | # TODO: Consider moving to base 236 | 237 | # 1. Validate input and check if the edge exists 238 | if not isinstance(edge_data, EdgeData): 239 | raise TypeError( 240 | f"Error: edge_data must be of type EdgeData, not {type(edge_data)}") 241 | 242 | edge_uid = self._generate_edge_uid( 243 | edge_data.source_uid, edge_data.target_uid) 244 | 245 | if not self.edge_exist(source_uid=edge_data.source_uid, target_uid=edge_data.target_uid): 246 | raise KeyError( 247 | f"Error: Edge with edge_uid '{edge_uid}' does not exist.") 248 | 249 | # 2. Update the edge document in the EDGES collection 250 | try: 251 | self._update_egde_coll( 252 | edge_uid=edge_uid, 253 | target_uid=edge_data.target_uid, 254 | source_uid=edge_data.source_uid, 255 | description=edge_data.description, 256 | directed=edge_data.directed 257 | ) 258 | except Exception as e: 259 | raise Exception( 260 | f"Error updating edge in edges collection: {e}") from e 261 | 262 | # 3. Update edge references in the NODES collection 263 | try: 264 | # 3a. Update source node 265 | source_node_data = self.get_node(edge_data.source_uid) 266 | # Ensure the target_uid is present in edges_to 267 | if edge_data.target_uid not in source_node_data.edges_to: 268 | source_node_data.edges_to = list( 269 | set(source_node_data.edges_to) | {edge_data.target_uid}) 270 | self.update_node(edge_data.source_uid, source_node_data) 271 | 272 | # 3b. 
Update target node
273 | target_node_data = self.get_node(edge_data.target_uid)
274 | # Ensure the source_uid is present in edges_from
275 | if edge_data.source_uid not in target_node_data.edges_from:
276 | target_node_data.edges_from = list(
277 | set(target_node_data.edges_from) | {edge_data.source_uid})
278 | self.update_node(edge_data.target_uid, target_node_data)
279 |
280 | except Exception as e:
281 | raise Exception(
282 | f"Error updating edge references in nodes: {e}") from e
283 |
284 | def _delete_from_edge_coll(self, edge_uid: str) -> None:
285 | """Method to delete record from edge collection of given kg store"""
286 | delete_result = self.mdbe_edges_coll.delete_one({"edge_uid": edge_uid})
287 | if delete_result.deleted_count == 0:
288 | raise KeyError(
289 | f"Error: No edge found with edge_uid '{edge_uid}'")
290 |
291 | def remove_edge(self, source_uid: str, target_uid: str) -> None:
292 | """Removes an edge between two entities."""
293 |
294 | # Get involved edge and node data
295 | try:
296 | edge_data = self.get_edge(
297 | source_uid=source_uid, target_uid=target_uid)
298 | except Exception as e:
299 | raise KeyError(f"Error getting edge: {e}") from e
300 |
301 | try:
302 | source_node_data = self.get_node(node_uid=source_uid)
303 | except Exception as e:
304 | raise KeyError(f"Error getting source node: {e}") from e
305 |
306 | try:
307 | target_node_data = self.get_node(node_uid=target_uid)
308 | except Exception as e:
309 | raise KeyError(f"Error getting target node: {e}") from e
310 |
311 | # remove target_uid from source -> target
312 | try:
313 | source_node_data.edges_to.remove(target_uid)
314 | self.update_node(source_uid, source_node_data)
315 | except ValueError as e:
316 | raise ValueError(
317 | f"Error: Target node not in source's edges_to: {e}") from e
318 |
319 | # remove source_uid from target <- source
320 | try:
321 | target_node_data.edges_from.remove(source_uid)
322 | self.update_node(target_uid, target_node_data)
323 | except ValueError as e:
324 | raise ValueError(
325 | f"Error: Source node not in target's edges_from: {e}") from e
326 |
327 | # Remove the edge from the edges collection
328 | edge_uid = self._generate_edge_uid(source_uid, target_uid)
329 | self._delete_from_edge_coll(edge_uid=edge_uid)
330 |
331 | # remove the opposite direction if edge undirected
332 | if not edge_data.directed:
333 | # remove target_uid from source <- target
334 | try:
335 | source_node_data.edges_from.remove(target_uid)
336 | self.update_node(source_uid, source_node_data)
337 | except ValueError as e:
338 | raise ValueError(
339 | f"Error: Target node not in source's edges_from: {e}") from e
340 |
341 | # remove source_uid from target -> source
342 | try:
343 | target_node_data.edges_to.remove(source_uid)
344 | self.update_node(target_uid, target_node_data)
345 | except ValueError as e:
346 | raise ValueError(
347 | f"Error: Source node not in target's edges_to: {e}") from e
348 |
349 | # Remove the edge from the edges collection
350 | reverse_edge_uid = self._generate_edge_uid(source_uid=target_uid,
351 | target_uid=source_uid)
352 | self._delete_from_edge_coll(edge_uid=reverse_edge_uid)
353 | else:
354 | pass
355 |
356 | def build_networkx(self) -> None:
357 | """Builds the NetworkX representation of the full graph.
358 | https://networkx.org/documentation/stable/index.html
359 | """
360 | graph = nx.Graph() # Initialize an undirected NetworkX graph
361 |
362 | # 1.
Add Nodes to the NetworkX Graph 363 | for node in self.mdb_node_coll.find(): 364 | graph.add_node(node['node_uid'], **node) 365 | 366 | # 2. Add Edges to the NetworkX Graph 367 | for edge in self.mdbe_edges_coll.find(): 368 | source_uid = edge['source_uid'] 369 | target_uid = edge['target_uid'] 370 | graph.add_edge(source_uid, target_uid) 371 | 372 | self.networkx = graph 373 | 374 | def store_community(self, community: CommunityData) -> None: 375 | """Takes valid graph community data and upserts the database with it. 376 | https://www.nature.com/articles/s41598-019-41695-z 377 | """ 378 | pass 379 | 380 | def _generate_edge_uid(self, source_uid: str, target_uid: str): 381 | return f"{source_uid}_to_{target_uid}" 382 | 383 | def _update_egde_coll(self, edge_uid: str, source_uid: str, 384 | target_uid: str, description: str, directed: bool) -> None: 385 | """Update edge record in the edges collection.""" 386 | edge_data_dict = { 387 | "edge_uid": edge_uid, 388 | "source_uid": source_uid, 389 | "target_uid": target_uid, 390 | "description": description, 391 | "directed": directed 392 | } 393 | self.mdbe_edges_coll.update_one( 394 | {"edge_uid": edge_uid}, {"$set": edge_data_dict}, upsert=True 395 | ) 396 | return None 397 | 398 | def get_nearest_neighbors(self, query_vec) -> List[str]: 399 | """Implements nearest neighbor search based on nosql db index.""" 400 | pass 401 | 402 | def get_community(self, community_id: str) -> CommunityData: 403 | """Retrieves the community report for a given community id.""" 404 | return 405 | 406 | def list_communities(self) -> List[CommunityData]: 407 | """Lists all stored communities for the given network.""" 408 | return 409 | 410 | def clean_zerodegree_nodes(self) -> None: 411 | """Removes all nodes with degree 0.""" 412 | return 413 | 414 | def edge_exist(self, source_uid: str, target_uid: str) -> bool: 415 | """Checks for edge existence and returns boolean""" 416 | edge_uid = self._generate_edge_uid(source_uid, target_uid) 417 | if self.mdbe_edges_coll.find_one({"edge_uid": edge_uid}) is not None: 418 | return True 419 | return False 420 | 421 | def node_exist(self, node_uid: str) -> bool: 422 | """Checks for node existence and returns boolean""" 423 | if self.mdb_node_coll.find_one({"node_uid": node_uid}) is not None: 424 | return True 425 | else: 426 | return False 427 | 428 | def flush_kg(self) -> None: 429 | """Method to wipe the complete datastore of the knowledge graph""" 430 | try: 431 | # Drop the node collection 432 | self.mdb_node_coll.drop() 433 | 434 | # Drop the edges collection 435 | self.mdbe_edges_coll.drop() 436 | 437 | # Drop the community collection 438 | self.mdb_comm_coll.drop() 439 | 440 | except Exception as e: 441 | raise Exception(f"Error flushing MongoDB collections: {e}") from e 442 | 443 | 444 | if __name__ == "__main__": 445 | import os 446 | from dotenv import dotenv_values 447 | 448 | os.chdir(os.path.dirname(os.path.abspath(__file__))) 449 | 450 | secrets = dotenv_values("../.env") 451 | 452 | mdb_username = str(secrets["MDB_USERNAME"]) 453 | mdb_passowrd = str(secrets["MDB_PASSWORD"]) 454 | mdb_cluster = str(secrets["MDB_CLUSTER"]) 455 | 456 | uri = f"mongodb+srv://{mdb_username}:{mdb_passowrd}@cluster0.pjx3w.mongodb.net/?retryWrites=true&w=majority&appName={mdb_cluster}" 457 | 458 | mkg = MongoKG( 459 | mdb_uri=uri, 460 | mdb_db_id=str(secrets["MDB_DB_ID"]), 461 | node_coll_id=str(secrets["NODE_COLL_ID"]), 462 | edges_coll_id=str(secrets["EDGES_COLL_ID"]), 463 | community_collection_id=str(secrets["COMM_COLL_ID"]) 464 | 
) 465 | 466 | # node = mkg.get_node(node_uid="2022 IRANIAN PROTESTS") 467 | 468 | print("Hello World!") 469 | -------------------------------------------------------------------------------- /databases/n4j.py: -------------------------------------------------------------------------------- 1 | """Neo4j database operations""" 2 | 3 | import os 4 | from typing import List 5 | 6 | import dotenv 7 | 8 | from neo4j import GraphDatabase 9 | import networkx as nx # type: ignore 10 | 11 | from base.operations import NoSQLKnowledgeGraph 12 | from datamodel.data_model import NodeData, EdgeData, CommunityData 13 | 14 | 15 | class AuraKG(NoSQLKnowledgeGraph): 16 | """ 17 | Neo4j Aura implementation for storing and interacting with the KG and managing the data model. 18 | """ 19 | 20 | def __init__(self, 21 | uri: str, 22 | auth: tuple[str, str] 23 | ): 24 | super().__init__() 25 | self.uri = uri 26 | self.auth = auth 27 | 28 | self.driver = GraphDatabase.driver(uri, auth=auth) 29 | 30 | def add_node(self, node_uid: str, node_data: NodeData) -> None: 31 | """Adds a node to the knowledge graph.""" 32 | 33 | # with GraphDatabase.driver(self.uri, auth=self.auth) as driver: 34 | # self.driver.verify_connectivity() 35 | # print("Connection established.") 36 | 37 | summary = self.driver.execute_query( 38 | "CREATE (:" + node_data.node_type + " { " 39 | "node_uid: $node_uid, " 40 | "node_title: $node_title, " 41 | "node_type: $node_type, " 42 | "node_description: $node_description, " 43 | "node_degree: $node_degree, " 44 | "document_id: $document_id, " 45 | "community_id: $community_id, " 46 | "edges_to: $edges_to, " 47 | "edges_from: $edges_from, " 48 | "embedding: $embedding " 49 | "})", 50 | node_uid=node_data.node_uid, 51 | node_title=node_data.node_title, 52 | node_type=node_data.node_type, 53 | node_description=node_data.node_description, 54 | node_degree=node_data.node_degree, 55 | document_id=node_data.document_id, 56 | community_id=node_data.community_id, 57 | edges_to=node_data.edges_to, 58 | edges_from=node_data.edges_from, 59 | embedding=node_data.embedding 60 | ).summary 61 | 62 | # print("Created {nodes_created} nodes with uid {node_uid} in {time} ms.".format( 63 | # nodes_created=summary.counters.nodes_created, 64 | # node_uid=node_uid, 65 | # time=summary.result_available_after 66 | # )) 67 | return None 68 | 69 | def get_node(self, node_uid: str) -> NodeData: 70 | """Retrieves a node from the knowledge graph.""" 71 | 72 | self.driver.verify_connectivity() 73 | 74 | # Use a parameter for node_uid in the Cypher query 75 | records, summary, keys = self.driver.execute_query( 76 | "MATCH (n {node_uid: $node_uid}) RETURN n", 77 | node_uid=node_uid # Pass node_uid as a parameter 78 | ) 79 | 80 | if records: # Check if any records were returned 81 | record = records[0] # Get the first record 82 | node_data = record['n'] 83 | # Convert Neo4j node properties to NodeData object 84 | return NodeData( 85 | # Assuming node_uid is a property 86 | node_uid=node_data.get('node_uid'), 87 | node_title=node_data.get('node_title'), 88 | node_type=node_data.get('node_type'), 89 | node_description=node_data.get('node_description'), 90 | node_degree=node_data.get('node_degree'), 91 | document_id=node_data.get('document_id'), 92 | edges_to=node_data.get('edges_to', []), 93 | edges_from=node_data.get('edges_from', []), 94 | embedding=node_data.get('embedding', []) 95 | ) 96 | else: 97 | raise KeyError( 98 | f"Error: No node found with node_uid: {node_uid}") 99 | 100 | def update_node(self, node_uid: str, node_data: NodeData) -> None: 101
| """Updates an existing node in the knowledge graph.""" 102 | 103 | self.driver.verify_connectivity() 104 | 105 | # Use parameters for all properties in the Cypher query 106 | summary = self.driver.execute_query( 107 | """ 108 | MATCH (n { node_uid: $node_uid }) 109 | SET n.node_title = $node_title, 110 | n.node_type = $node_type, 111 | n.node_description = $node_description, 112 | n.node_degree = $node_degree, 113 | n.document_id = $document_id, 114 | n.community_id = $community_id, 115 | n.edges_to = $edges_to, 116 | n.edges_from = $edges_from, 117 | n.embedding = $embedding 118 | RETURN n 119 | """, 120 | node_uid=node_uid, 121 | node_title=node_data.node_title, 122 | node_type=node_data.node_type, 123 | node_description=node_data.node_description, 124 | node_degree=node_data.node_degree, 125 | document_id=node_data.document_id, 126 | community_id=node_data.community_id, 127 | edges_to=node_data.edges_to, 128 | edges_from=node_data.edges_from, 129 | embedding=node_data.embedding 130 | ).summary 131 | 132 | def _delete_from_edge_coll(self, edge_uid: str) -> None: 133 | """Method to delete record from edge collection of given kg store""" 134 | raise NotImplementedError("Not implemented for n4j because no collections used.") 135 | 136 | def remove_node(self, node_uid: str) -> None: 137 | """Removes a node from the knowledge graph.""" 138 | 139 | self.driver.verify_connectivity() 140 | 141 | summary = self.driver.execute_query( 142 | "MATCH (n {node_uid: $node_uid}) DETACH DELETE n", 143 | node_uid=node_uid 144 | ).summary 145 | 146 | if summary.counters.nodes_deleted == 0: 147 | raise KeyError( 148 | f"Error: No node found with node_uid: {node_uid}") 149 | return None 150 | 151 | def add_edge(self, edge_data: EdgeData) -> None: 152 | """Adds an edge (relationship) between two entities in the knowledge graph.""" 153 | 154 | # get source and target node data 155 | source_node_data = self.get_node(edge_data.source_uid) 156 | target_node_data = self.get_node(edge_data.target_uid) 157 | 158 | # update source and target node data 159 | source_node_data.edges_to = list( 160 | set(source_node_data.edges_to) | {edge_data.target_uid}) 161 | self.update_node(edge_data.source_uid, source_node_data) 162 | target_node_data.edges_from = list( 163 | set(target_node_data.edges_from) | {edge_data.source_uid}) 164 | self.update_node(edge_data.target_uid, target_node_data) 165 | 166 | self.driver.verify_connectivity() 167 | 168 | if edge_data.directed: 169 | query = """ 170 | MATCH (source:""" + source_node_data.node_type + """ {node_uid: $source_uid}), (target:""" + target_node_data.node_type + """ {node_uid: $target_uid}) 171 | CREATE (source)-[:DIRECTED {description: $description}]->(target) 172 | """ 173 | 174 | elif not edge_data.directed: 175 | query = """ 176 | MATCH (source:""" + source_node_data.node_type + """ {node_uid: $source_uid}), (target:""" + target_node_data.node_type + """ {node_uid: $target_uid}) 177 | CREATE (source)-[:UNDIRECTED {description: $description}]->(target), (target)-[:UNDIRECTED {description: $description}]->(source) 178 | """ 179 | 180 | # Since it's undirected, also add source_uid to target_node_data.edges_from and vice versa 181 | target_node_data.edges_to = list( 182 | set(target_node_data.edges_to) | {edge_data.source_uid}) 183 | self.update_node(edge_data.target_uid, target_node_data) 184 | source_node_data.edges_from = list( 185 | set(source_node_data.edges_from) | {edge_data.target_uid}) 186 | self.update_node(edge_data.source_uid, source_node_data) 187 | 188 | 
summary = self.driver.execute_query( 189 | query, 190 | source_uid=edge_data.source_uid, 191 | target_uid=edge_data.target_uid, 192 | description=edge_data.description 193 | ).summary 194 | 195 | print("#### Created {count} edges {origin} -> {target} in {time} ms.".format( 196 | count=str(summary.counters.relationships_created), 197 | origin=str(edge_data.source_uid), 198 | target=str(edge_data.target_uid), 199 | time=str(summary.result_available_after) 200 | )) 201 | 202 | return None 203 | 204 | def get_edge(self, source_uid: str, target_uid: str) -> EdgeData: 205 | """Retrieves an edge between two entities.""" 206 | 207 | # get source and target node data 208 | source_node_data = self.get_node(source_uid) 209 | target_node_data = self.get_node(target_uid) 210 | 211 | self.driver.verify_connectivity() 212 | 213 | # Use parameters for source_uid and target_uid 214 | records, summary, keys = self.driver.execute_query( 215 | """ 216 | MATCH (source:""" + source_node_data.node_type + """ {node_uid: $source_uid})-[r]->(target:""" + target_node_data.node_type + """ {node_uid: $target_uid}) 217 | RETURN r 218 | """, 219 | source_uid=source_uid, 220 | target_uid=target_uid 221 | ) 222 | 223 | if records: 224 | record = records[0][0] 225 | edge_type = record.type 226 | description = record.get('description') 227 | return EdgeData(source_uid=source_uid, target_uid=target_uid, description=description, edge_uid=self._generate_edge_uid(source_uid, target_uid)) 228 | else: 229 | raise KeyError( 230 | f"Error: No edge found between source_uid: '{source_uid}' and target_uid: '{target_uid}'") 231 | 232 | def update_edge(self, edge_data: EdgeData) -> None: 233 | """Updates an existing edge in the knowledge graph.""" 234 | 235 | # get source and target node data 236 | source_node_data = self.get_node(edge_data.source_uid) 237 | target_node_data = self.get_node(edge_data.target_uid) 238 | 239 | self.driver.verify_connectivity() 240 | 241 | # Use parameters for all properties in the Cypher query 242 | summary = self.driver.execute_query( 243 | """ 244 | MATCH (source:""" + source_node_data.node_type + """ {node_uid: $source_uid})-[r]->(target:""" + target_node_data.node_type + """ {node_uid: $target_uid}) 245 | SET r.description = $description 246 | RETURN r 247 | """, 248 | source_uid=edge_data.source_uid, 249 | target_uid=edge_data.target_uid, 250 | description=edge_data.description 251 | ).summary 252 | return None 253 | 254 | def remove_edge(self, source_uid: str, target_uid: str) -> None: 255 | """Removes an edge between two entities.""" 256 | 257 | try: 258 | # Get source and target node data (this will raise KeyError if not found) 259 | source_node_data = self.get_node(source_uid) 260 | target_node_data = self.get_node(target_uid) 261 | 262 | self.driver.verify_connectivity() 263 | 264 | # Remove edge from source to target 265 | summary = self.driver.execute_query( 266 | """ 267 | MATCH (source:""" + source_node_data.node_type + """ {node_uid: $source_uid})-[r]->(target:""" + target_node_data.node_type + """ {node_uid: $target_uid}) 268 | DELETE r 269 | """, 270 | source_uid=source_uid, 271 | target_uid=target_uid 272 | ).summary 273 | 274 | # Optionally, you might want to check if the edge was actually deleted 275 | if summary.counters.relationships_deleted == 0: 276 | raise KeyError( 277 | f"Error: No edge found between source_uid: '{source_uid}' and target_uid: '{target_uid}'") 278 | except KeyError: 279 | empty_node = NodeData( 280 | node_uid="", 281 | node_title="", 282 | node_type="", 283
| node_description="", 284 | node_degree=0, 285 | document_id="", 286 | ) 287 | source_node_data = empty_node 288 | target_node_data = empty_node 289 | 290 | # Update the node data to reflect the removed edge 291 | try: 292 | source_node_data.edges_to.remove(target_uid) 293 | self.update_node(source_uid, source_node_data) 294 | except ValueError: 295 | pass # Target node not in source's edges_to, likely due to a directed edge 296 | 297 | try: 298 | target_node_data.edges_from.remove(source_uid) 299 | self.update_node(target_uid, target_node_data) 300 | except ValueError: 301 | pass # Source node not in target's edges_from, likely due to a directed edge 302 | return None 303 | 304 | def build_networkx(self) -> nx.Graph: 305 | """Builds the NetworkX representation of the full graph. 306 | https://networkx.org/documentation/stable/index.html 307 | """ 308 | graph = nx.Graph() # Initialize an undirected NetworkX graph 309 | 310 | self.driver.verify_connectivity() 311 | 312 | # 1. Fetch all nodes and their properties 313 | records, summary, keys = self.driver.execute_query("MATCH (n) RETURN n") 314 | 315 | # Check if any records were returned 316 | if records: 317 | for record in records: 318 | node = record["n"] 319 | node_data = { 320 | "node_uid": node.get("node_uid"), 321 | "node_title": node.get("node_title"), 322 | "node_type": node.get("node_type"), 323 | "node_description": node.get("node_description"), 324 | "node_degree": node.get("node_degree"), 325 | "document_id": node.get("document_id"), 326 | "edges_to": node.get("edges_to", []), 327 | "edges_from": node.get("edges_from", []), 328 | "embedding": node.get("embedding", []) 329 | } 330 | graph.add_node(node.get("node_uid"), **node_data) 331 | 332 | # 2. Fetch all relationships and add edges to the graph 333 | records, summary, keys = self.driver.execute_query( 334 | "MATCH (source)-[r]->(target) RETURN source, r, target") 335 | for record in records: 336 | source_uid = record["source"]["node_uid"] 337 | target_uid = record["target"]["node_uid"] 338 | # Add edge attributes if needed (e.g., 'description' from 'r') 339 | graph.add_edge(source_uid, target_uid) 340 | else: 341 | print( 342 | "Warning: No nodes found in the database. Returning an empty NetworkX graph.") 343 | 344 | self.networkx = graph 345 | return graph 346 | 347 | def store_community(self, community: CommunityData) -> None: 348 | """Takes valid graph community data and upserts the database with it. 
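Currently a no-op stub in this Neo4j implementation; community reports are not yet persisted here.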
349 | https://www.nature.com/articles/s41598-019-41695-z 350 | """ 351 | pass 352 | 353 | def _generate_edge_uid(self, source_uid: str, target_uid: str) -> str: 354 | """Generates the edge uid for the network based on source and target node uids.""" 355 | return f"{source_uid}_to_{target_uid}" 356 | 357 | def edge_exist(self, source_uid: str, target_uid: str) -> bool: 358 | """Checks for edge existence and returns a boolean.""" 359 | try: 360 | # Try to retrieve the edge 361 | self.get_edge(source_uid, target_uid) 362 | return True # Edge exists 363 | except KeyError: 364 | return False # Edge does not exist 365 | 366 | def node_exist(self, node_uid: str) -> bool: 367 | """Checks for node existence and returns a boolean.""" 368 | try: 369 | # Try to retrieve the node 370 | self.get_node(node_uid) 371 | return True # Node exists 372 | except KeyError: 373 | return False # Node does not exist 374 | 375 | def get_nearest_neighbors(self, query_vec) -> List[str]: 376 | """Implements nearest neighbor search based on nosql db index.""" 377 | pass 378 | 379 | def get_community(self, community_id: str) -> CommunityData: 380 | """Retrieves the community report for a given community id.""" 381 | pass 382 | 383 | def list_communities(self) -> List[CommunityData]: 384 | """Lists all stored communities for the given network.""" 385 | pass 386 | 387 | def clean_zerodegree_nodes(self) -> None: 388 | """Removes all nodes with degree 0.""" 389 | pass 390 | 391 | def flush_kg(self) -> None: 392 | """Method to wipe the complete datastore of the knowledge graph""" 393 | self.driver.verify_connectivity() 394 | summary = self.driver.execute_query( 395 | """ 396 | MATCH (n) 397 | DETACH DELETE n 398 | """ 399 | ).summary 400 | return None 401 | 402 | 403 | if __name__ == "__main__": 404 | 405 | load_status = dotenv.load_dotenv("Neo4j-39cb28f0-Created-2024-09-23.txt") 406 | if load_status is False: 407 | raise RuntimeError('Environment variables not loaded.') 408 | 409 | URI = str(os.getenv("NEO4J_URI")) 410 | AUTH = (str(os.getenv("NEO4J_USERNAME")), str(os.getenv("NEO4J_PASSWORD"))) 411 | 412 | aura = AuraKG(uri=URI, auth=AUTH) 413 | 414 | # aura.add_node(NodeData(node_uid="test_uid_2", node_title="test2", node_type="test", node_description="test", node_degree=0, document_id="doc test")) 415 | 416 | print(aura.get_node("test_uid")) 417 | print(aura.get_node("test_uid_2")) 418 | 419 | print("Hello World!") 420 | -------------------------------------------------------------------------------- /datamodel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jakobap/graph2nosql/77df8ecba857c61381a37b878d57c20d52ff9834/datamodel/__init__.py -------------------------------------------------------------------------------- /datamodel/data_model.py: -------------------------------------------------------------------------------- 1 | """Module providing Data Model definitions for storing and processing Graph Data.
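Nodes, edges, communities and node embeddings are each modelled as a dataclass below (NodeData, EdgeData, CommunityData, NodeEmbeddings).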
""" 2 | 3 | from dataclasses import dataclass, field 4 | from typing import Tuple 5 | import numpy as np 6 | 7 | 8 | @dataclass 9 | class EdgeData: 10 | """EdgeData data model definition""" 11 | source_uid: str 12 | target_uid: str 13 | description: str 14 | directed: bool = True 15 | edge_uid: str | None = None 16 | document_id: str | None = None 17 | 18 | 19 | @dataclass 20 | class NodeData: 21 | """NodeData data model definition""" 22 | node_uid: str 23 | node_title: str 24 | node_type: str 25 | node_description: str 26 | node_degree: int 27 | document_id: str # identifier for source knowlede base document for this entity 28 | community_id: int | None = None # community id based on source document 29 | edges_to: list[str] = field(default_factory=list) 30 | edges_from: list[str] = field(default_factory=list) # in case of directed graph 31 | embedding: list[float] = field(default_factory=list) # text embedding for node 32 | 33 | 34 | @dataclass 35 | class CommunityData: 36 | """CommunityData data model definition""" 37 | title: str # title of comm, None if not yet computed 38 | community_nodes: set[str] = field(default_factory=set) # list of node_uid belonging to community 39 | summary: str | None = None # description of comm, None if not yet computed 40 | document_id: str | None = None # identifier for source knowlede base document for this entity 41 | community_uid: str | None = None # community identifier 42 | community_embedding: Tuple[float, ...] = field(default_factory=tuple) 43 | rating: int | None = None 44 | rating_explanation: str | None = None 45 | findings: list[dict] | None = None 46 | 47 | def __to_dict__(self): 48 | """Converts the CommunityData instance to a dictionary.""" 49 | return { 50 | "title": self.title, 51 | "community_nodes": list(self.community_nodes), # Convert set to list 52 | "summary": self.summary, 53 | "document_id": self.document_id, 54 | "community_uid": self.community_uid, 55 | "community_embedding": list(self.community_embedding), # Convert tuple to list 56 | "rating": self.rating, 57 | "rating_explanation": self.rating_explanation, 58 | "findings": self.findings 59 | } 60 | 61 | @classmethod 62 | def __from_dict__(cls, data: dict): 63 | """Creates a CommunityData instance from a dictionary.""" 64 | return cls( 65 | title=data.get("title") or "", 66 | community_nodes=set(data.get("community_nodes", [])), # Convert list to set 67 | summary=data.get("summary"), 68 | document_id=data.get("document_id"), 69 | community_uid=data.get("community_uid"), 70 | community_embedding=tuple(data.get("community_embedding", [])), # Convert list to tuple 71 | rating=data.get("rating"), 72 | rating_explanation=data.get("rating_explanation"), 73 | findings=data.get("findings") 74 | ) 75 | 76 | 77 | @dataclass 78 | class NodeEmbeddings: 79 | """Node embeddings class definition.""" 80 | nodes: list[str] 81 | embeddings: np.ndarray 82 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # general dependencies 2 | networkx==3.3 3 | matplotlib==3.9.1 4 | graspologic 5 | numpy 6 | future==1.0.0 7 | python-dotenv==1.0.1 8 | 9 | # db specific dependencies 10 | firebase-admin==6.5.0 11 | neo4j==5.24.0 12 | pymongo==4.10.1 13 | 14 | -e . 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='graph2nosql', 5 | version='0.1', 6 | packages=find_packages(), 7 | install_requires=[ 8 | 'networkx==3.3', 9 | 'matplotlib==3.9.1', 10 | 'graspologic', 11 | 'numpy', 12 | 'firebase-admin==6.5.0', 13 | 'python-dotenv==1.0.1', 14 | 'future==1.0.0', 15 | 'neo4j==5.24.0', 16 | 'pymongo==4.10.1' # MongoDB backend (databases/mdb.py); keeps setup.py in sync with requirements.txt 17 | ] 18 | ) --------------------------------------------------------------------------------