├── .github
│   └── workflows
│       └── python-publish.yml
├── .gitignore
├── .idx
│   └── dev.nix
├── LICENSE
├── README.md
├── __init__.py
├── base
│   ├── __init__.py
│   ├── operations.py
│   └── operations_test.py
├── benchmarks
│   ├── import_benchmarks.ipynb
│   ├── main.py
│   └── requirements.txt
├── databases
│   ├── __init__.py
│   ├── firestore_kg.py
│   ├── mdb.py
│   └── n4j.py
├── datamodel
│   ├── __init__.py
│   └── data_model.py
├── requirements.txt
└── setup.py

/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 | 
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 | 
9 | name: Upload Python Package
10 | 
11 | on:
12 |   release:
13 |     types: [published]
14 | 
15 | permissions:
16 |   contents: read
17 | 
18 | jobs:
19 |   deploy:
20 | 
21 |     runs-on: ubuntu-latest
22 | 
23 |     steps:
24 |     - uses: actions/checkout@v4
25 |     - name: Set up Python
26 |       uses: actions/setup-python@v3
27 |       with:
28 |         python-version: '3.x'
29 |     - name: Install dependencies
30 |       run: |
31 |         python -m pip install --upgrade pip
32 |         pip install build
33 |     - name: Build package
34 |       run: python -m build
35 |     - name: Publish package
36 |       uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37 |       with:
38 |         user: __token__
39 |         password: ${{ secrets.PYPI_API_TOKEN }}
40 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.json
2 | .DS_Store
3 | *.venv*
4 | .env
5 | .graph2nosql_venv
6 | *.pyc
7 | */test_graph.png
8 | 
9 | *.egg-info
10 | Neo4j-39cb28f0-Created-2024-09-23.txt
11 | 
--------------------------------------------------------------------------------
/.idx/dev.nix:
--------------------------------------------------------------------------------
1 | # To learn more about how to use Nix to configure your environment
2 | # see: https://developers.google.com/idx/guides/customize-idx-env
3 | { pkgs, ... }: {
4 |   # Which nixpkgs channel to use.
5 | channel = "stable-23.11"; # or "unstable" 6 | 7 | # Use https://search.nixos.org/packages to find packages 8 | packages = [ 9 | pkgs.python311 10 | pkgs.python311Packages.pip 11 | pkgs.streamlit 12 | pkgs.gnumake 13 | ]; 14 | idx = { 15 | # Search for the extensions you want on https://open-vsx.org/ and use "publisher.id" 16 | extensions = [ 17 | "ms-python.autopep8" 18 | "ms-python.debugpy" 19 | "ms-python.pythonv2024.12.3" 20 | "googlecloudtools.cloudcode" 21 | "ms-toolsai.jupyter" 22 | "ms-toolsai.jupyter-keymap" 23 | "ms-toolsai.jupyter-renderers" 24 | "ms-toolsai.vscode-jupyter-cell-tags" 25 | "ms-toolsai.vscode-jupyter-slideshow" 26 | ]; 27 | # Enable previews 28 | previews = { 29 | enable = true; 30 | previews = { 31 | # web = { 32 | # # Example: run "npm run dev" with PORT set to IDX's defined port for previews, 33 | # # and show it in IDX's web preview panel 34 | # command = ["npm" "run" "dev"]; 35 | # manager = "web"; 36 | # env = { 37 | # # Environment variables to set for your server 38 | # PORT = "$PORT"; 39 | # }; 40 | # }; 41 | }; 42 | }; 43 | 44 | # Workspace lifecycle hooks 45 | workspace = { 46 | # Runs when a workspace is first created 47 | onCreate = { 48 | install = 49 | "python3 -m venv .graph2nosql_venv && source .graph2nosql_venv/bin/activate && pip install --upgrade pip && pip install -r requirements.txt"; 50 | }; 51 | # Runs when the workspace is (re)started 52 | onStart = { 53 | # Example: start a background task to watch and re-build backend code 54 | # watch-backend = "npm run watch-backend"; 55 | }; 56 | }; 57 | }; 58 | } 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 | 
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 | 
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | Copyright 2024 Jakob Pörschmann
179 | 
180 | Licensed under the Apache License, Version 2.0 (the "License");
181 | you may not use this file except in compliance with the License.
182 | You may obtain a copy of the License at
183 | 
184 | http://www.apache.org/licenses/LICENSE-2.0
185 | 
186 | Unless required by applicable law or agreed to in writing, software
187 | distributed under the License is distributed on an "AS IS" BASIS,
188 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
189 | See the License for the specific language governing permissions and
190 | limitations under the License.
191 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# graph2nosql
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![Python](https://img.shields.io/badge/python-3.x-blue.svg)](https://www.python.org/)

Open in IDX

A simple Python interface to store and interact with knowledge graphs in your favourite NoSQL DB.

Knowledge Graphs are the up-and-coming tool to index knowledge and make it understandable to your LLM applications. Working with Graph Databases is a pain though.

This Python interface aims to solve this by offering a set of basic functions to store and manage your (knowledge) graph in your existing NoSQL DB. From experience, document-based databases offer an extremely attractive performance and price position compared to some existing specialized databases. I found this to be attractive for simple graph storage use cases in which no fully structured query language is required.

This repository mostly caters to my own use and is not regularly updated or maintained.

## Implemented Databases for graph storage:
* [Firestore](https://firebase.google.com/docs/firestore)
* [MongoDB](https://www.mongodb.com/docs/)
* [Neo4J for latency & cost benchmark](https://neo4j.com/docs/)

## Performance Benchmark
Approximate latency benchmark comparing the implemented tools and technologies. The benchmarking framework [can be found in `./benchmarks`](https://github.com/jakobap/graph2nosql/tree/main/benchmarks).

Values are processing time in seconds -> lower = better
| Feature | Firestore | MongoDB | Neo4j |
|---|---|---|---|
| Adding 100 nodes | 3.03 | 2.51 | 1.91 |
| Query 100 individual nodes | 0.94 | 1.10 | 7.12 |
| Count 2nd degree connections of given node | 0.8 | tbd | 10.5 |
| Count 3rd degree connections of given node | 10.9 | tbd | 13.3 |

## Getting Started
`base/operations.py` defines the abstract base class with the available operations.

1. Create an `.env` file that stores your secrets & env vars.
2. Use a database object to interact with your NoSQL DB.

### Initialize knowledge graph object
Every knowledge graph store object is a child of `NoSQLKnowledgeGraph` in `./base/operations.py`.

The graph contains three data objects: `NodeData`, `EdgeData` and `CommunityData`. Their respective attributes are defined in `./datamodel/data_model.py`.

```
import google.auth
from dotenv import dotenv_values

from databases.firestore_kg import FirestoreKG
from datamodel.data_model import NodeData, EdgeData

secrets = dotenv_values("../.env")
credentials, _ = google.auth.load_credentials_from_file(secrets["GCP_CREDENTIAL_FILE"])

fskg = FirestoreKG(gcp_credential_file=secrets["GCP_CREDENTIAL_FILE"],
                   gcp_project_id=str(secrets["GCP_PROJECT_ID"]),
                   firestore_db_id=str(secrets["WIKIDATA_FS_DB"]),
                   node_collection_id=str(secrets["NODE_COLL_ID"]),
                   edges_collection_id=str(secrets["EDGES_COLL_ID"]),
                   community_collection_id=str(secrets["COMM_COLL_ID"])
                   )
```
### Add nodes
```
node_data_1 = NodeData(
    node_uid="test_egde_node_1",
    node_title="Test Node 1",
    node_type="Person",
    node_description="This is a test node",
    node_degree=0,
    document_id="doc_1",
    edges_to=[],
    edges_from=[],
    embedding=[0.1, 0.2, 0.3],
)

node_data_2 = NodeData(
    node_uid="test_egde_node_2",
    node_title="Test Node 2",
    node_type="Person",
    node_description="This is another test node",
    node_degree=0,
    document_id="doc_2",
    edges_to=[],
    edges_from=[],
    embedding=[0.4, 0.5, 0.6],
)

fskg.add_node(node_uid="test_egde_node_1", node_data=node_data_1)
fskg.add_node(node_uid="test_egde_node_2", node_data=node_data_2)
```

### Add directed and undirected edges
```
edge_data1 = EdgeData(
    source_uid="test_egde_node_1",
    target_uid="test_egde_node_2",
    description="This is a test edge description",
    directed=True
)

# "test_egde_node_3" is assumed to have been added as a node beforehand,
# analogous to the nodes above
edge_data2 = EdgeData(
    source_uid="test_egde_node_3",
    target_uid="test_egde_node_2",
    description="This is a test edge description",
    directed=False
)

fskg.add_edge(edge_data=edge_data1)
fskg.add_edge(edge_data=edge_data2)
```


## Contributing
* If you decide to add new DB operations, please add corresponding tests to `graph2nosql_tests.py`
* If you decide to write an implementation
for another NoSQL db please make sure all tests in `graph2nosql_tests.py` succeed. 123 | 124 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from .graph2nosql import graph2nosql 2 | from graph2nosql import databases 3 | from graph2nosql import datamodel -------------------------------------------------------------------------------- /base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jakobap/graph2nosql/77df8ecba857c61381a37b878d57c20d52ff9834/base/__init__.py -------------------------------------------------------------------------------- /base/operations.py: -------------------------------------------------------------------------------- 1 | """graph2nosql base class for required database operations""" 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | from typing import List 6 | import datetime 7 | 8 | import networkx as nx # type: ignore 9 | import matplotlib.pyplot as plt 10 | from matplotlib.lines import Line2D 11 | import graspologic as gc 12 | 13 | from datamodel.data_model import NodeData, EdgeData, CommunityData, NodeEmbeddings 14 | 15 | 16 | class NoSQLKnowledgeGraph(ABC): 17 | """ 18 | Base Class for storing and interacting with the KG and manages data model. 19 | """ 20 | networkx: nx.Graph | nx.DiGraph = nx.Graph( 21 | ) # networkx representation of graph in nosqldb 22 | 23 | @abstractmethod 24 | def add_node(self, node_uid: str, node_data: NodeData) -> None: 25 | """Adds an node to the knowledge graph.""" 26 | 27 | @abstractmethod 28 | def get_node(self, node_uid: str) -> NodeData: 29 | """Retrieves an node from the knowledge graph.""" 30 | 31 | @abstractmethod 32 | def update_node(self, node_uid: str, node_data: NodeData) -> None: 33 | """Updates an existing node in the knowledge graph.""" 34 | 35 | @abstractmethod 36 | def remove_node(self, node_uid: str) -> None: 37 | """Removes an node from the knowledge graph.""" 38 | 39 | @abstractmethod 40 | def add_edge(self, edge_data: EdgeData) -> None: 41 | """Adds an edge (relationship) between two entities in the knowledge graph.""" 42 | 43 | @abstractmethod 44 | def get_edge(self, source_uid: str, target_uid: str) -> EdgeData: 45 | """Retrieves an edge between two entities.""" 46 | 47 | @abstractmethod 48 | def update_edge(self, edge_data: EdgeData) -> None: 49 | """Updates an existing edge in the knowledge graph.""" 50 | 51 | @abstractmethod 52 | def _delete_from_edge_coll(self, egde_uid: str) -> None: 53 | """Method to delete record from edge collection of given kg store""" 54 | 55 | @abstractmethod 56 | def remove_edge(self, source_uid: str, target_uid: str) -> None: 57 | """Removes an edge between two entities.""" 58 | 59 | @abstractmethod 60 | def build_networkx(self) -> None: 61 | """Builds the NetworkX representation of the full graph. 62 | https://networkx.org/documentation/stable/index.html 63 | """ 64 | 65 | @abstractmethod 66 | def store_community(self, community: CommunityData) -> None: 67 | """Takes valid graph community data and upserts the database with it. 
68 | https://www.nature.com/articles/s41598-019-41695-z 69 | """ 70 | 71 | @abstractmethod 72 | def _generate_edge_uid(self, source_uid: str, target_uid: str) -> str: 73 | """Generates Edge uid for the network based on source and target nod uid""" 74 | return "" 75 | 76 | @abstractmethod 77 | def get_nearest_neighbors(self, query_vec) -> List[str]: 78 | """Implements nearest neighbor search based on nosql db index.""" 79 | 80 | @abstractmethod 81 | def get_community(self, community_id: str) -> CommunityData: 82 | """Retrieves the community report for a given community id.""" 83 | 84 | @abstractmethod 85 | def list_communities(self) -> List[CommunityData]: 86 | """Lists all stored communities for the given network.""" 87 | 88 | @abstractmethod 89 | def clean_zerodegree_nodes(self) -> None: 90 | """Removes all nodes with degree 0.""" 91 | 92 | @abstractmethod 93 | def edge_exist(self, source_uid: str, target_uid: str) -> bool: 94 | """Checks for edge existence and returns boolean""" 95 | 96 | @abstractmethod 97 | def node_exist(self, node_uid: str) -> bool: 98 | """Checks for node existence and returns boolean""" 99 | 100 | @abstractmethod 101 | def flush_kg(self) -> None: 102 | """Method to wipe the complete datastore of the knowledge graph""" 103 | 104 | def visualize_graph(self, filename: str = f"graph_{datetime.datetime.now()}.png") -> None: 105 | """Visualizes the provided networkx graph using matplotlib. 106 | 107 | Args: 108 | graph (nx.Graph): The graph to visualize. 109 | """ 110 | self.build_networkx() 111 | 112 | if self.networkx is not None: 113 | # Create a larger figure for better visualization 114 | plt.figure(figsize=(12, 12)) 115 | 116 | # Use a spring layout for a more visually appealing arrangement 117 | pos = nx.spring_layout(self.networkx, k=0.3, iterations=50) 118 | 119 | # Draw nodes with different colors based on entity type 120 | entity_types = set(data["node_type"] 121 | for _, data in self.networkx.nodes(data=True)) 122 | color_map = plt.cm.get_cmap("tab10", len(entity_types)) 123 | for i, entity_type in enumerate(entity_types): 124 | nodes = [n for n, d in self.networkx.nodes( 125 | data=True) if d["node_type"] == entity_type] 126 | nx.draw_networkx_nodes( 127 | self.networkx, 128 | pos, 129 | nodelist=nodes, 130 | node_color=[color_map(i)], # type: ignore 131 | label=entity_type, 132 | # type: ignore 133 | node_size=[ 134 | 10 + 50 * self.networkx.degree(n) for n in nodes] # type: ignore 135 | ) 136 | 137 | # Draw edges with labels 138 | nx.draw_networkx_edges(self.networkx, pos, width=0.5, alpha=0.5) 139 | # edge_labels = nx.get_edge_attributes(graph, "description") 140 | # nx.draw_networkx_edge_labels(graph, pos, edge_labels=edge_labels, font_size=6) 141 | 142 | # Add node labels with descriptions 143 | node_labels = { 144 | node: node 145 | for node, data in self.networkx.nodes(data=True) 146 | } 147 | nx.draw_networkx_labels( 148 | self.networkx, pos, labels=node_labels, font_size=8) 149 | 150 | plt.title("Extracted Knowledge Graph") 151 | plt.axis("off") # Turn off the axis 152 | 153 | # Add a legend for node colors 154 | plt.legend(handles=[Line2D([0], [0], marker='o', color='w', label=entity_type, 155 | markersize=10, markerfacecolor=color_map(i)) for i, entity_type in enumerate(entity_types)]) 156 | 157 | plt.savefig(filename) 158 | 159 | else: 160 | raise ValueError( 161 | "Error: NetworkX graph is not initialized. 
Call build_networkx() first.") 162 | 163 | def get_louvain_communities(self) -> list: 164 | """Computes and returns all Louvain communities for the given network. 165 | https://www.nature.com/articles/s41598-019-41695-z 166 | 167 | Sample Output: 168 | [{'"2023 NOBEL PEACE PRIZE"'}, {'"ANDREI SAKHAROV PRIZE"'}, 169 | {'"ANDREI SAKHAROV"'}] 170 | """ 171 | # 1. Build (or update) the NetworkX graph 172 | self.build_networkx() 173 | 174 | # 2. Apply Louvain algorithm 175 | if self.networkx is not None: 176 | louvain_comm_list = nx.algorithms.community.louvain_communities( 177 | self.networkx) 178 | return louvain_comm_list # type: ignore 179 | raise ValueError( 180 | "Error: NetworkX graph is not initialized. Call build_networkx() first.") 181 | 182 | def get_node2vec_embeddings( 183 | self, 184 | dimensions: int = 768, 185 | num_walks: int = 10, 186 | walk_length: int = 40, 187 | window_size: int = 2, 188 | iterations: int = 3, 189 | random_seed: int = 69 190 | ) -> NodeEmbeddings: 191 | """Generate node embeddings using Node2Vec.""" 192 | 193 | # update networkx representation of graph 194 | self.build_networkx() 195 | 196 | # generate embedding 197 | lcc_tensors = gc.embed.node2vec_embed( # type: ignore 198 | graph=self.networkx, 199 | dimensions=dimensions, 200 | window_size=window_size, 201 | iterations=iterations, 202 | num_walks=num_walks, 203 | walk_length=walk_length, 204 | random_seed=random_seed, 205 | ) 206 | return NodeEmbeddings(embeddings=lcc_tensors[0], nodes=lcc_tensors[1]) 207 | 208 | 209 | if __name__ == "__main__": 210 | print("Hello World!") 211 | -------------------------------------------------------------------------------- /base/operations_test.py: -------------------------------------------------------------------------------- 1 | """graph2nosql database operations unittests""" 2 | 3 | import unittest 4 | from abc import ABC, abstractmethod 5 | 6 | import os 7 | import dotenv 8 | from dotenv import dotenv_values 9 | 10 | import networkx as nx # type: ignore 11 | 12 | from base.operations import NoSQLKnowledgeGraph 13 | from databases.firestore_kg import FirestoreKG 14 | from databases.n4j import AuraKG 15 | from databases.mdb import MongoKG 16 | from datamodel.data_model import NodeData, EdgeData 17 | 18 | 19 | class _NoSQLKnowledgeGraphTests(ABC): 20 | """ 21 | Abstract base class to define test cases for NoSQLKnowledgeGraph implementations. 22 | 23 | Concrete test classes for specific NoSQL databases should inherit from this class 24 | and implement the required abstract methods. 
25 | """ 26 | @abstractmethod 27 | def create_kg_instance(self) -> NoSQLKnowledgeGraph: 28 | """Create and return an instance of the NoSQLKnowledgeGraph implementation.""" 29 | 30 | def setUp(self): 31 | """Set up for test methods.""" 32 | self.kg = self.create_kg_instance() 33 | # Add any setup logic specific to your NoSQL database here 34 | 35 | def test_add_and_remove_node(self): 36 | """ Test adding a node""" 37 | node_data = NodeData( 38 | node_uid="added_test_node_1", 39 | node_title="Test Node 1", 40 | node_type="Person", 41 | node_description="This is a test node", 42 | node_degree=0, 43 | document_id="doc_1", 44 | edges_to=[], 45 | edges_from=[], 46 | embedding=[0.1, 0.2, 0.3], 47 | ) 48 | self.kg.add_node(node_uid="added_test_node_1", node_data=node_data) 49 | 50 | # Retrieve the node and verify its data 51 | retrieved_node_data = self.kg.get_node(node_uid="added_test_node_1") 52 | print(retrieved_node_data) 53 | self.assertEqual(retrieved_node_data, node_data) # type: ignore 54 | 55 | # Remove the node 56 | self.kg.remove_node(node_uid="added_test_node_1") 57 | 58 | # Try to retrieve the node again (should raise KeyError) 59 | with self.assertRaises(KeyError): # type: ignore 60 | self.kg.get_node(node_uid="added_test_node_1") 61 | 62 | def test_update_node(self): 63 | """Add a node""" 64 | node_data = NodeData( 65 | node_uid="test_update_node_1", 66 | node_title="Test Node 1", 67 | node_type="Person", 68 | node_description="This is a test node", 69 | node_degree=0, 70 | document_id="doc_1", 71 | edges_to=[], 72 | edges_from=[], 73 | embedding=[0.1, 0.2, 0.3], 74 | ) 75 | self.kg.add_node(node_uid="test_update_node_1", node_data=node_data) 76 | 77 | # Retrieve the node and verify its data 78 | retrieved_node_data = self.kg.get_node(node_uid="test_update_node_1") 79 | self.assertEqual(retrieved_node_data, node_data) # type: ignore 80 | 81 | # Update the node 82 | updated_node_data = NodeData( 83 | node_uid="test_update_node_1", 84 | node_title="Updated Test Node 1", # updated title 85 | node_type="Person", 86 | node_description="This is an updated test node", # updated description 87 | node_degree=1, # Updated degree 88 | document_id="doc_1", 89 | edges_to=[], 90 | edges_from=[], 91 | embedding=[0.1, 0.2, 0.3], 92 | ) 93 | self.kg.update_node(node_uid="test_update_node_1", 94 | node_data=updated_node_data) 95 | 96 | # Retrieve the node again and verify the update 97 | retrieved_updated_node_data = self.kg.get_node( 98 | node_uid="test_update_node_1") 99 | self.assertEqual(retrieved_updated_node_data, 100 | updated_node_data) # type: ignore 101 | 102 | # Remove the node 103 | self.kg.remove_node(node_uid="test_update_node_1") 104 | 105 | def test_add_node_with_edge(self): 106 | """Add a node with edge to other node that doesn't exist""" 107 | node_data = NodeData( 108 | node_uid="test_egde_node_1", 109 | node_title="Test Node 1", 110 | node_type="Person", 111 | node_description="This is a test node", 112 | node_degree=0, 113 | document_id="doc_1", 114 | edges_to=["fake node",], 115 | edges_from=[], 116 | embedding=[0.1, 0.2, 0.3], 117 | ) 118 | 119 | # Assert that adding the node raises a KeyError (or a more specific exception you handle) 120 | # type: ignore # Adjust exception type if needed 121 | with self.assertRaises(ValueError): 122 | self.kg.add_node(node_uid="test_egde_node_1", node_data=node_data) 123 | 124 | # Add valid nodes (required for edges) 125 | node_data_1 = NodeData( 126 | node_uid="test_egde_node_1", 127 | node_title="Test Node 1", 128 | node_type="Person", 129 | 
node_description="This is a test node", 130 | node_degree=0, 131 | document_id="doc_1", 132 | edges_to=[], 133 | edges_from=[], 134 | embedding=[0.1, 0.2, 0.3], 135 | ) 136 | node_data_2 = NodeData( 137 | node_uid="test_egde_node_2", 138 | node_title="Test Node 2", 139 | node_type="Person", 140 | node_description="This is another test node", 141 | node_degree=0, 142 | document_id="doc_2", 143 | edges_to=[], 144 | edges_from=[], 145 | embedding=[0.4, 0.5, 0.6], 146 | ) 147 | 148 | node_data_3 = NodeData( 149 | node_uid="test_egde_node_3", 150 | node_title="Test Node 2", 151 | node_type="Person", 152 | node_description="This is another test node", 153 | node_degree=0, 154 | document_id="doc_2", 155 | edges_to=[], 156 | edges_from=[], 157 | embedding=[0.4, 0.5, 0.6], 158 | ) 159 | 160 | self.kg.add_node(node_uid="test_egde_node_1", node_data=node_data_1) 161 | self.kg.add_node(node_uid="test_egde_node_2", node_data=node_data_2) 162 | self.kg.add_node(node_uid="test_egde_node_3", node_data=node_data_3) 163 | 164 | edge_data1 = EdgeData( 165 | source_uid="test_egde_node_1", 166 | target_uid="test_egde_node_2", 167 | description="This is a test egde description", 168 | directed=True 169 | ) 170 | 171 | edge_data2 = EdgeData( 172 | source_uid="test_egde_node_3", 173 | target_uid="test_egde_node_2", 174 | description="This is a test egde description", 175 | directed=False 176 | ) 177 | 178 | self.kg.add_edge(edge_data=edge_data1) 179 | self.kg.add_edge(edge_data=edge_data2) 180 | 181 | # Assert that the edges are reflected in the nodes' edge lists 182 | node1 = self.kg.get_node("test_egde_node_1") 183 | node2 = self.kg.get_node("test_egde_node_2") 184 | node3 = self.kg.get_node("test_egde_node_3") 185 | 186 | self.assertIn("test_egde_node_3", node2.edges_from) # type: ignore 187 | self.assertIn("test_egde_node_3", node2.edges_to) # type: ignore 188 | self.assertIn("test_egde_node_1", node2.edges_from) # type: ignore 189 | self.assertIn("test_egde_node_2", node1.edges_to) # type: ignore 190 | 191 | # Clean up 192 | self.kg.remove_node(node_uid="test_egde_node_1") 193 | self.kg.remove_node(node_uid="test_egde_node_2") 194 | self.kg.remove_node(node_uid="test_egde_node_3") 195 | 196 | def test_add_direcred_edge(self): 197 | """Test adding an edge between nodes.""" 198 | 199 | # Add valid nodes (required for edges) 200 | node_data_1 = NodeData( 201 | node_uid="test_directed_node_1", 202 | node_title="Test Node 1", 203 | node_type="Person", 204 | node_description="This is a test node", 205 | node_degree=0, 206 | document_id="doc_1", 207 | edges_to=[], 208 | edges_from=[], 209 | embedding=[0.1, 0.2, 0.3], 210 | ) 211 | node_data_2 = NodeData( 212 | node_uid="test_directed_node_2", 213 | node_title="Test Node 2", 214 | node_type="Person", 215 | node_description="This is another test node", 216 | node_degree=0, 217 | document_id="doc_2", 218 | edges_to=[], 219 | edges_from=[], 220 | embedding=[0.4, 0.5, 0.6], 221 | ) 222 | 223 | self.kg.add_node(node_uid="test_directed_node_1", 224 | node_data=node_data_1) 225 | self.kg.add_node(node_uid="test_directed_node_2", 226 | node_data=node_data_2) 227 | 228 | # add edges between nodes 229 | edge_data = EdgeData( 230 | source_uid="test_directed_node_1", 231 | target_uid="test_directed_node_2", 232 | description="This is a test egde description", 233 | directed=True 234 | ) 235 | 236 | self.kg.add_edge(edge_data=edge_data) 237 | 238 | # Assert that the edge is reflected in the nodes' edge lists 239 | node1 = self.kg.get_node("test_directed_node_1") 240 | node2 = 
self.kg.get_node("test_directed_node_2") 241 | self.assertIn("test_directed_node_2", node1.edges_to) # type: ignore 242 | self.assertIn("test_directed_node_1", node2.edges_from) # type: ignore 243 | 244 | # Clean Up egdes 245 | self.kg.remove_edge(source_uid="test_directed_node_1", 246 | target_uid="test_directed_node_2") 247 | 248 | # Clean Up nodes 249 | self.kg.remove_node(node_uid="test_directed_node_1") 250 | self.kg.remove_node(node_uid="test_directed_node_2") 251 | 252 | def test_add_undirecred_edge(self): 253 | """Test adding an edge between nodes.""" 254 | 255 | # Add valid nodes (required for edges) 256 | node_data_1 = NodeData( 257 | node_uid="test_undirected_node_1", 258 | node_title="Test Node 1", 259 | node_type="Person", 260 | node_description="This is a test node", 261 | node_degree=0, 262 | document_id="doc_1", 263 | edges_to=[], 264 | edges_from=[], 265 | embedding=[0.1, 0.2, 0.3], 266 | ) 267 | node_data_2 = NodeData( 268 | node_uid="test_undirected_node_2", 269 | node_title="Test Node 2", 270 | node_type="Person", 271 | node_description="This is another test node", 272 | node_degree=0, 273 | document_id="doc_2", 274 | edges_to=[], 275 | edges_from=[], 276 | embedding=[0.4, 0.5, 0.6], 277 | ) 278 | 279 | self.kg.add_node(node_uid="test_undirected_node_1", 280 | node_data=node_data_1) 281 | self.kg.add_node(node_uid="test_undirected_node_2", 282 | node_data=node_data_2) 283 | 284 | # add edges between nodes 285 | edge_data = EdgeData( 286 | source_uid="test_undirected_node_1", 287 | target_uid="test_undirected_node_2", 288 | description="This is a test egde description", 289 | directed=False 290 | ) 291 | 292 | self.kg.add_edge(edge_data=edge_data) 293 | 294 | # Assert that the edge is reflected in the nodes' edge lists 295 | node1 = self.kg.get_node("test_undirected_node_1") 296 | node2 = self.kg.get_node("test_undirected_node_2") 297 | self.assertIn("test_undirected_node_2", node1.edges_to) # type: ignore 298 | self.assertIn("test_undirected_node_1", node2.edges_to) # type: ignore 299 | self.assertIn("test_undirected_node_1", 300 | node2.edges_from) # type: ignore 301 | self.assertIn("test_undirected_node_2", 302 | node1.edges_from) # type: ignore 303 | 304 | # Clean Up egdes 305 | self.kg.remove_edge(source_uid="test_undirected_node_1", 306 | target_uid="test_undirected_node_2") 307 | 308 | # Clean Up nodes 309 | self.kg.remove_node(node_uid="test_undirected_node_1") 310 | self.kg.remove_node(node_uid="test_undirected_node_2") 311 | 312 | def test_get_edge(self): 313 | """Test retrieving an existing edge.""" 314 | # 1. Add nodes (required for edges) 315 | node_data_1 = NodeData( 316 | node_uid="test_getedge_node_1", 317 | node_title="Test Node 1", 318 | node_type="Person", 319 | node_description="This is a test node", 320 | node_degree=0, 321 | document_id="doc_1", 322 | edges_to=[], 323 | edges_from=[], 324 | embedding=[0.1, 0.2, 0.3], 325 | ) 326 | node_data_2 = NodeData( 327 | node_uid="test_getedge_node_2", 328 | node_title="Test Node 2", 329 | node_type="Person", 330 | node_description="This is another test node", 331 | node_degree=0, 332 | document_id="doc_2", 333 | edges_to=[], 334 | edges_from=[], 335 | embedding=[0.4, 0.5, 0.6], 336 | ) 337 | self.kg.add_node(node_uid="test_getedge_node_1", node_data=node_data_1) 338 | self.kg.add_node(node_uid="test_getedge_node_2", node_data=node_data_2) 339 | 340 | # 2. 
Add an edge 341 | edge_data = EdgeData( 342 | source_uid="test_getedge_node_1", 343 | target_uid="test_getedge_node_2", 344 | description="This might be a description of the relationship of these two nodes", 345 | directed=False 346 | ) 347 | self.kg.add_edge(edge_data=edge_data) 348 | 349 | # Assuming you have a way to retrieve edge data in your implementation 350 | retrieved_edge_data = self.kg.get_edge(source_uid="test_getedge_node_1", 351 | target_uid="test_getedge_node_2") 352 | 353 | new_edge_uid = self.kg._generate_edge_uid(source_uid=edge_data.source_uid, 354 | target_uid=edge_data.target_uid) 355 | target_edge_data = EdgeData( 356 | source_uid="test_getedge_node_1", 357 | target_uid="test_getedge_node_2", 358 | description="This might be a description of the relationship of these two nodes", 359 | directed=False, 360 | edge_uid=new_edge_uid 361 | ) 362 | 363 | self.assertEqual(retrieved_edge_data, target_edge_data) # type: ignore 364 | 365 | # Clean up edges 366 | self.kg.remove_edge(source_uid="test_getedge_node_1", 367 | target_uid="test_getedge_node_2") 368 | 369 | # Clean up nodes 370 | self.kg.remove_node(node_uid="test_getedge_node_1") 371 | self.kg.remove_node(node_uid="test_getedge_node_2") 372 | 373 | def test_update_edge(self): 374 | """Test updating the data of an existing edge.""" 375 | # Add nodes (required for edges) 376 | node_data_1 = NodeData( 377 | node_uid="test_edgeupdate_node_1", 378 | node_title="Test Node 1", 379 | node_type="Person", 380 | node_description="This is a test node", 381 | node_degree=0, 382 | document_id="doc_1", 383 | edges_to=[], 384 | edges_from=[], 385 | embedding=[0.1, 0.2, 0.3], 386 | ) 387 | node_data_2 = NodeData( 388 | node_uid="test_edgeupdate_node_2", 389 | node_title="Test Node 2", 390 | node_type="Person", 391 | node_description="This is another test node", 392 | node_degree=0, 393 | document_id="doc_2", 394 | edges_to=[], 395 | edges_from=[], 396 | embedding=[0.4, 0.5, 0.6], 397 | ) 398 | self.kg.add_node(node_uid="test_edgeupdate_node_1", 399 | node_data=node_data_1) 400 | self.kg.add_node(node_uid="test_edgeupdate_node_2", 401 | node_data=node_data_2) 402 | 403 | # Add an edge between 404 | edge_data = EdgeData( 405 | source_uid="test_edgeupdate_node_1", 406 | target_uid="test_edgeupdate_node_2", 407 | description="This is a boring egde description" 408 | ) 409 | 410 | self.kg.add_edge( 411 | edge_data=edge_data 412 | ) 413 | 414 | # Update edge with new data 415 | updated_edge_data = EdgeData( 416 | source_uid="test_edgeupdate_node_1", 417 | target_uid="test_edgeupdate_node_2", 418 | description="Updated much better description" 419 | ) 420 | self.kg.update_edge(edge_data=updated_edge_data) 421 | 422 | # Verify that the edge data is updated 423 | retrieved_updated_edge_data = self.kg.get_edge( 424 | source_uid="test_edgeupdate_node_1", target_uid="test_edgeupdate_node_2" 425 | ) 426 | 427 | validate_edge_data = EdgeData( 428 | source_uid="test_edgeupdate_node_1", 429 | target_uid="test_edgeupdate_node_2", 430 | description="Updated much better description", 431 | edge_uid=self.kg._generate_edge_uid( 432 | edge_data.source_uid, edge_data.target_uid 433 | ) 434 | ) 435 | 436 | self.assertEqual( # type: ignore 437 | retrieved_updated_edge_data, validate_edge_data 438 | ) 439 | 440 | # Cleanup edges 441 | self.kg.remove_edge(source_uid="test_edgeupdate_node_1", 442 | target_uid="test_edgeupdate_node_2") 443 | 444 | # Cleanup nodes 445 | self.kg.remove_node(node_uid="test_edgeupdate_node_1") 446 | 
self.kg.remove_node(node_uid="test_edgeupdate_node_2") 447 | 448 | def test_remove_edge(self): 449 | """Test removing an edge between nodes.""" 450 | # Add nodes (required for edges) 451 | node_data_1 = NodeData( 452 | node_uid="test_removeegde_node_1", 453 | node_title="Test Node 1", 454 | node_type="Person", 455 | node_description="This is a test node", 456 | node_degree=0, 457 | document_id="doc_1", 458 | edges_to=[], 459 | edges_from=[], 460 | embedding=[0.1, 0.2, 0.3], 461 | ) 462 | node_data_2 = NodeData( 463 | node_uid="test_removeegde_node_2", 464 | node_title="Test Node 2", 465 | node_type="Person", 466 | node_description="This is another test node", 467 | node_degree=0, 468 | document_id="doc_2", 469 | edges_to=[], 470 | edges_from=[], 471 | embedding=[0.4, 0.5, 0.6], 472 | ) 473 | self.kg.add_node(node_uid="test_removeegde_node_1", 474 | node_data=node_data_1) 475 | self.kg.add_node(node_uid="test_removeegde_node_2", 476 | node_data=node_data_2) 477 | 478 | # add edges between nodes 479 | edge_data = EdgeData( 480 | source_uid="test_removeegde_node_2", 481 | target_uid="test_removeegde_node_1", 482 | description="This is a test egde description" 483 | ) 484 | self.kg.add_edge(edge_data=edge_data) 485 | 486 | # Assert that the edge is reflected in the nodes' edge lists 487 | node1 = self.kg.get_node("test_removeegde_node_1") 488 | node2 = self.kg.get_node("test_removeegde_node_2") 489 | self.assertIn("test_removeegde_node_1", node2.edges_to) # type: ignore 490 | self.assertIn("test_removeegde_node_2", 491 | node1.edges_from) # type: ignore 492 | 493 | # Remove the edge 494 | self.kg.remove_edge(source_uid="test_removeegde_node_2", 495 | target_uid="test_removeegde_node_1") 496 | 497 | # Assert that the edge is no longer in the nodes' edge lists 498 | node1 = self.kg.get_node("test_removeegde_node_1") 499 | node2 = self.kg.get_node("test_removeegde_node_2") 500 | self.assertNotIn("test_removeegde_node_2", 501 | node1.edges_to) # type: ignore 502 | self.assertNotIn("test_removeegde_node_2", 503 | node1.edges_from) # type: ignore 504 | self.assertNotIn("test_removeegde_node_1", 505 | node2.edges_from) # type: ignore 506 | self.assertNotIn("test_removeegde_node_1", 507 | node2.edges_to) # type: ignore 508 | 509 | # Clean up nodes 510 | self.kg.remove_node(node_uid="test_removeegde_node_1") 511 | self.kg.remove_node(node_uid="test_removeegde_node_2") 512 | 513 | def test_get_networkx(self): 514 | """Test getting the networkx graph.""" 515 | # 1. Add nodes 516 | node_data_1 = NodeData( 517 | node_uid="test_getnx_node_1", 518 | node_title="Test Node 1", 519 | node_type="Person", 520 | node_description="This is a test node", 521 | node_degree=0, 522 | document_id="doc_1", 523 | edges_to=[], 524 | edges_from=[], 525 | embedding=[0.1, 0.2, 0.3], 526 | ) 527 | node_data_2 = NodeData( 528 | node_uid="test_getnx_node_2", 529 | node_title="Test Node 2", 530 | node_type="Person", 531 | node_description="This is another test node", 532 | node_degree=0, 533 | document_id="doc_2", 534 | edges_to=[], 535 | edges_from=[], 536 | embedding=[0.4, 0.5, 0.6], 537 | ) 538 | self.kg.add_node(node_uid="test_getnx_node_1", node_data=node_data_1) 539 | self.kg.add_node(node_uid="test_getnx_node_2", node_data=node_data_2) 540 | 541 | # 2. Add an edge 542 | edge_data = EdgeData( 543 | source_uid="test_getnx_node_1", 544 | target_uid="test_getnx_node_2", 545 | description="Test Edge Description" 546 | ) 547 | self.kg.add_edge(edge_data=edge_data) 548 | 549 | # 3. 
Get the NetworkX graph 550 | self.kg.build_networkx() 551 | 552 | # 4. Assertions 553 | # Check if the graph is the correct type 554 | self.assertIsInstance(self.kg.networkx, nx.Graph) # type: ignore 555 | # Check if the number of nodes is correct 556 | self.assertEqual(self.kg.networkx.number_of_nodes(), 2) # type: ignore 557 | # Check if the number of edges is correct 558 | self.assertEqual(self.kg.networkx.number_of_edges(), 1) # type: ignore 559 | # Check if specific nodes exist in the graph 560 | self.assertTrue(self.kg.networkx.has_node( 561 | "test_getnx_node_1")) # type: ignore 562 | self.assertTrue(self.kg.networkx.has_node( 563 | "test_getnx_node_2")) # type: ignore 564 | # Check if a specific edge exists in the graph 565 | self.assertTrue(self.kg.networkx.has_edge( 566 | "test_getnx_node_1", "test_getnx_node_2")) # type: ignore 567 | 568 | # 5. Clean up (optional, depending on your test setup) 569 | self.kg.remove_edge(source_uid="test_getnx_node_1", 570 | target_uid="test_getnx_node_2") 571 | self.kg.remove_node(node_uid="test_getnx_node_1") 572 | self.kg.remove_node(node_uid="test_getnx_node_2") 573 | 574 | def test_get_louvain_communities(self): 575 | """Test getting Louvain communities.""" 576 | # 1. Add nodes 577 | node_data_1 = NodeData( 578 | node_uid="test_louvain_node_1", 579 | node_title="Test Node 1", 580 | node_type="Person", 581 | node_description="This is a test node", 582 | node_degree=0, 583 | document_id="doc_1", 584 | edges_to=[], 585 | edges_from=[], 586 | embedding=[0.1, 0.2, 0.3], 587 | ) 588 | node_data_2 = NodeData( 589 | node_uid="test_louvain_node_2", 590 | node_title="Test Node 2", 591 | node_type="Person", 592 | node_description="This is another test node", 593 | node_degree=0, 594 | document_id="doc_2", 595 | edges_to=[], 596 | edges_from=[], 597 | embedding=[0.4, 0.5, 0.6], 598 | ) 599 | node_data_3 = NodeData( 600 | node_uid="test_louvain_node_3", 601 | node_title="Test Node 3", 602 | node_type="Person", 603 | node_description="This is another test node", 604 | node_degree=0, 605 | document_id="doc_3", 606 | edges_to=[], 607 | edges_from=[], 608 | embedding=[0.4, 0.5, 0.6], 609 | ) 610 | 611 | node_data_4 = NodeData( 612 | node_uid="test_louvain_node_4", 613 | node_title="Test Node 4", 614 | node_type="Person", 615 | node_description="This is another test node", 616 | node_degree=0, 617 | document_id="doc_3", 618 | edges_to=[], 619 | edges_from=[], 620 | embedding=[0.4, 0.5, 0.6], 621 | ) 622 | 623 | self.kg.add_node(node_uid="test_louvain_node_1", node_data=node_data_1) 624 | self.kg.add_node(node_uid="test_louvain_node_2", node_data=node_data_2) 625 | self.kg.add_node(node_uid="test_louvain_node_3", node_data=node_data_3) 626 | self.kg.add_node(node_uid="test_louvain_node_4", node_data=node_data_4) 627 | 628 | # 2. Add edges to create a connected structure for community detection 629 | edge_data_1 = EdgeData( 630 | source_uid="test_louvain_node_1", 631 | target_uid="test_louvain_node_2", 632 | description="Test Edge Description 1" 633 | ) 634 | edge_data_2 = EdgeData( 635 | source_uid="test_louvain_node_2", 636 | target_uid="test_louvain_node_3", 637 | description="Test Edge Description 2" 638 | ) 639 | self.kg.add_edge(edge_data=edge_data_1) 640 | self.kg.add_edge(edge_data=edge_data_2) 641 | 642 | # 3. Get the Louvain communities 643 | communities = self.kg.get_louvain_communities() 644 | 645 | # 4. 
Assertions 646 | # Ensure communities is a list 647 | self.assertIsInstance(communities, list) # type: ignore 648 | # We are expecting exactly two communities since one node has no edges 649 | self.assertTrue(len(communities) == 2) # type: ignore 650 | # Check if each community is a set (or your expected data structure) 651 | for community in communities: 652 | self.assertIsInstance(community, set) # type: ignore 653 | 654 | # 5. Clean up (optional, depending on your test setup) 655 | self.kg.remove_edge(source_uid="test_louvain_node_1", 656 | target_uid="test_louvain_node_2") 657 | self.kg.remove_edge(source_uid="test_louvain_node_2", 658 | target_uid="test_louvain_node_3") 659 | self.kg.remove_node(node_uid="test_louvain_node_1") 660 | self.kg.remove_node(node_uid="test_louvain_node_2") 661 | self.kg.remove_node(node_uid="test_louvain_node_3") 662 | self.kg.remove_node(node_uid="test_louvain_node_4") 663 | 664 | def test_visualize_graph(self): 665 | """Test visualizing the graph. This test is not asserting anything. 666 | It's only creating a visualization for manual inspection.""" 667 | 668 | # 1. Add nodes 669 | node_data_1 = NodeData( 670 | node_uid="test_vis_node_1", 671 | node_title="Test Node 1", 672 | node_type="Person", 673 | node_description="This is a test node", 674 | node_degree=0, 675 | document_id="doc_1", 676 | edges_to=[], 677 | edges_from=[], 678 | embedding=[0.1, 0.2, 0.3], 679 | ) 680 | node_data_2 = NodeData( 681 | node_uid="test_vis_node_2", 682 | node_title="Test Node 2", 683 | node_type="Person", 684 | node_description="This is another test node", 685 | node_degree=0, 686 | document_id="doc_2", 687 | edges_to=[], 688 | edges_from=[], 689 | embedding=[0.4, 0.5, 0.6], 690 | ) 691 | node_data_3 = NodeData( 692 | node_uid="test_vis_node_3", 693 | node_title="Test Node 3", 694 | node_type="Organization", 695 | node_description="This is another test node", 696 | node_degree=0, 697 | document_id="doc_3", 698 | edges_to=[], 699 | edges_from=[], 700 | embedding=[0.4, 0.5, 0.6], 701 | ) 702 | self.kg.add_node(node_uid="test_vis_node_1", node_data=node_data_1) 703 | self.kg.add_node(node_uid="test_vis_node_2", node_data=node_data_2) 704 | self.kg.add_node(node_uid="test_vis_node_3", node_data=node_data_3) 705 | 706 | # 2. Add edges to create connections for visualization 707 | edge_data_1 = EdgeData( 708 | source_uid="test_vis_node_1", 709 | target_uid="test_vis_node_2", 710 | description="Test Edge Description 1" 711 | ) 712 | edge_data_2 = EdgeData( 713 | source_uid="test_vis_node_2", 714 | target_uid="test_vis_node_3", 715 | description="Test Edge Description 2" 716 | ) 717 | self.kg.add_edge(edge_data=edge_data_1) 718 | self.kg.add_edge(edge_data=edge_data_2) 719 | 720 | # 3. Visualize the graph 721 | try: 722 | self.kg.visualize_graph(filename="test_graph.png") 723 | except Exception as e: 724 | raise ValueError(f"An error occurred during visualization: {e}") 725 | 726 | # 4. Clean up (optional, depending on your test setup) 727 | self.kg.remove_edge(source_uid="test_vis_node_1", 728 | target_uid="test_vis_node_2") 729 | self.kg.remove_edge(source_uid="test_vis_node_2", 730 | target_uid="test_vis_node_3") 731 | self.kg.remove_node(node_uid="test_vis_node_1") 732 | self.kg.remove_node(node_uid="test_vis_node_2") 733 | self.kg.remove_node(node_uid="test_vis_node_3") 734 | 735 | 736 | class FirestoreKGTests(_NoSQLKnowledgeGraphTests, unittest.TestCase): 737 | """ 738 | Test cases for the FirestoreKG implementation of NoSQLKnowledgeGraph. 
739 | 740 | This test suite inherits from _NoSQLKnowledgeGraphTests to reuse common test cases. 741 | It specifically tests the FirestoreKG class by creating an instance connected to a Firestore 742 | database and then running various operations on it. 743 | 744 | Before running the tests, it attempts to clear the Firestore collections to ensure a clean slate. 745 | However, this clearing operation is currently commented out. 746 | """ 747 | 748 | def create_kg_instance(self) -> NoSQLKnowledgeGraph: 749 | os.chdir(os.path.dirname(os.path.abspath(__file__))) 750 | 751 | secrets = dotenv_values("../.env") 752 | 753 | gcp_credential_file = str(secrets["GCP_CREDENTIAL_FILE"]) 754 | project_id = str(secrets["GCP_PROJECT_ID"]) 755 | database_id = str(secrets["FIRESTORE_DB_ID"]) 756 | node_coll_id = str(secrets["NODE_COLL_ID"]) 757 | edges_coll_id = str(secrets["EDGES_COLL_ID"]) 758 | community_coll_id = str(secrets["COMM_COLL_ID"]) 759 | 760 | fskg = FirestoreKG( 761 | gcp_project_id=project_id, 762 | gcp_credential_file=gcp_credential_file, 763 | firestore_db_id=database_id, 764 | node_collection_id=node_coll_id, 765 | edges_collection_id=edges_coll_id, 766 | community_collection_id=community_coll_id 767 | ) 768 | 769 | # Clear the collections before running tests 770 | fskg.flush_kg() 771 | return fskg 772 | 773 | 774 | class AuraKGTest(_NoSQLKnowledgeGraphTests, unittest.TestCase): 775 | """ 776 | Test cases for the Neo4j Aura implementation of NoSQLKnowledgeGraph. 777 | 778 | This test suite inherits from _NoSQLKnowledgeGraphTests to reuse common test cases. 779 | It specifically tests the Neo4j Aura class by creating an instance connected to a Neo4j Aura 780 | database and then running various operations on it. 781 | 782 | Before running the tests, it attempts to clear the Neo4j Aura collections to ensure a clean slate. 783 | However, this clearing operation is currently commented out. 784 | """ 785 | 786 | def create_kg_instance(self) -> NoSQLKnowledgeGraph: 787 | os.chdir(os.path.dirname(os.path.abspath(__file__))) 788 | 789 | dotenv.load_dotenv("../Neo4j-39cb28f0-Created-2024-09-23.txt") 790 | 791 | uri = os.getenv("NEO4J_URI") 792 | auth = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD")) 793 | 794 | aura = AuraKG(uri=uri, auth=auth) 795 | 796 | # empty graph store before running tests 797 | # aura.flush_kg() 798 | return aura 799 | 800 | 801 | class MongoKGTest(_NoSQLKnowledgeGraphTests, unittest.TestCase): 802 | """ 803 | Test cases for the MongoDB implementation of NoSQLKnowledgeGraph. 804 | 805 | This test suite inherits from _NoSQLKnowledgeGraphTests to reuse common test cases. 806 | It specifically tests the MongoDB class by creating an instance connected to a MongoDB 807 | database and then running various operations on it. 808 | 809 | Before running the tests, it attempts to clear the MongoDB collections to ensure a clean slate. 810 | However, this clearing operation is currently commented out. 
811 | """ 812 | 813 | def create_kg_instance(self) -> NoSQLKnowledgeGraph: 814 | os.chdir(os.path.dirname(os.path.abspath(__file__))) 815 | 816 | secrets = dotenv_values("../.env") 817 | 818 | mdb_username = str(secrets["MDB_USERNAME"]) 819 | mdb_passowrd = str(secrets["MDB_PASSWORD"]) 820 | mdb_cluster = str(secrets["MDB_CLUSTER"]) 821 | uri = f"mongodb+srv://{mdb_username}:{mdb_passowrd}@cluster0.pjx3w.mongodb.net/?retryWrites=true&w=majority&appName={mdb_cluster}" 822 | 823 | mkg = MongoKG( 824 | mdb_uri=uri, 825 | mdb_db_id=str(secrets["MDB_DB_ID"]), 826 | node_coll_id=str(secrets["NODE_COLL_ID"]), 827 | edges_coll_id=str(secrets["EDGES_COLL_ID"]), 828 | community_collection_id=str(secrets["COMM_COLL_ID"]) 829 | ) 830 | 831 | # flush full kg before running tests 832 | mkg.flush_kg() 833 | return mkg 834 | 835 | 836 | def suite(): 837 | """testing suite def""" 838 | suite = unittest.TestSuite() 839 | suite.addTest(unittest.makeSuite(FirestoreKGTests)) 840 | suite.addTest(unittest.makeSuite(AuraKGTest)) 841 | suite.addTest(unittest.makeSuite(MongoKGTest)) 842 | # Add tests for other database classes as needed 843 | return suite 844 | 845 | 846 | if __name__ == "__main__": 847 | runner = unittest.TextTestRunner() 848 | runner.run(suite()) 849 | -------------------------------------------------------------------------------- /benchmarks/import_benchmarks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/user/graph2nosql/.venv-g2nsql/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 13 | " from .autonotebook import tqdm as notebook_tqdm\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from google.cloud import bigquery\n", 19 | "import google.auth\n", 20 | "\n", 21 | "from neo4j import GraphDatabase\n", 22 | "\n", 23 | "import os\n", 24 | "import json\n", 25 | "from dotenv import dotenv_values\n", 26 | "import time\n", 27 | "\n", 28 | "from main import NodeImportBenchmark, NodeQueryBenchmark\n", 29 | "\n", 30 | "from base.operations import NoSQLKnowledgeGraph\n", 31 | "from databases.firestore_kg import FirestoreKG\n", 32 | "from databases.n4j import AuraKG\n", 33 | "from databases.mdb import MongoKG\n", 34 | "\n", 35 | "from datamodel.data_model import NodeData, EdgeData, CommunityData" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "#### Setting env and global variables" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "secrets = dotenv_values(\"../.env\")\n", 52 | "credentials, _ = google.auth.load_credentials_from_file(secrets[\"GCP_CREDENTIAL_FILE\"])" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Firestore Knowledge Graph vs. AuraDB (Neo4J) latency comparison" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "#### Define Knowledge Graph DB Interface Options" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "Pinged your deployment. 
You successfully connected to MongoDB!\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "fskg = FirestoreKG(gcp_credential_file=secrets[\"GCP_CREDENTIAL_FILE\"],\n", 84 | " gcp_project_id=str(secrets[\"GCP_PROJECT_ID\"]),\n", 85 | " firestore_db_id=str(secrets[\"WIKIDATA_FS_DB\"]),\n", 86 | " node_collection_id=str(secrets[\"NODE_COLL_ID\"]),\n", 87 | " edges_collection_id=str(secrets[\"EDGES_COLL_ID\"]),\n", 88 | " community_collection_id=str(\n", 89 | " secrets[\"COMM_COLL_ID\"])\n", 90 | " )\n", 91 | "\n", 92 | "aura_kg = AuraKG(uri=str(secrets[\"NEO4J_URI\"]),\n", 93 | " auth=(str(secrets[\"NEO4J_USERNAME\"]),\n", 94 | " str(secrets[\"NEO4J_PASSWORD\"]))\n", 95 | " )\n", 96 | "\n", 97 | "mdb_username = str(secrets[\"MDB_USERNAME\"])\n", 98 | "mdb_passowrd = str(secrets[\"MDB_PASSWORD\"])\n", 99 | "mdb_cluster = str(secrets[\"MDB_CLUSTER\"])\n", 100 | "mdb_uri = f\"mongodb+srv://{mdb_username}:{mdb_passowrd}@cluster0.pjx3w.mongodb.net/?retryWrites=true&w=majority&appName={mdb_cluster}\"\n", 101 | "\n", 102 | "mkg = MongoKG(\n", 103 | " mdb_uri=mdb_uri,\n", 104 | " mdb_db_id=str(secrets[\"MDB_DB_ID\"]),\n", 105 | " node_coll_id=str(secrets[\"NODE_COLL_ID\"]),\n", 106 | " edges_coll_id=str(secrets[\"EDGES_COLL_ID\"]),\n", 107 | " community_collection_id=str(secrets[\"COMM_COLL_ID\"])\n", 108 | ")" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Importing Nodes Comparison" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "#### Fetch graph data from BigQuery" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 4, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "$$$$ Task Index 0, Task Count 1, Offset 302400\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "import_lim = 100\n", 140 | "task_index = int(os.getenv('CLOUD_RUN_TASK_INDEX', '0'))\n", 141 | "task_count = int(os.getenv('CLOUD_RUN_TASK_COUNT', '1'))\n", 142 | "rows_per_task = import_lim // task_count\n", 143 | "offset = task_index * rows_per_task + 302400\n", 144 | "\n", 145 | "print(\n", 146 | " f'$$$$ Task Index {task_index}, Task Count {task_count}, Offset {offset}')\n", 147 | "\n", 148 | "# Fetch Node data from BigQuery\n", 149 | "client = bigquery.Client(project=str(\n", 150 | " secrets[\"GCP_PROJECT_ID\"]), credentials=credentials)\n", 151 | "\n", 152 | "query_job = client.query(\n", 153 | " f\"SELECT * FROM poerschmann-sem-search.wikidata_kg.entity_doc_alias_joined LIMIT {rows_per_task} OFFSET {offset}\")" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "#### Run Node Import Benchmark" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 5, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "$$$$ Starting Benchmark Node Import with options: ['Firestore', 'Mongo', 'Aura'] $$$$\n", 173 | "Firestore time for 100 Node Import: 3.035799026489258\n", 174 | "Mongo time for 100 Node Import: 2.513396739959717\n", 175 | "Aura time for 100 Node Import: 1.9137544631958008\n", 176 | "hEllO wOrlD!\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "add_nodes_testing = NodeImportBenchmark(\n", 182 | " benchmark_name=\"Node Import\", import_lim=import_lim, options_dict={\"Firestore\": fskg, \"Mongo\": mkg, \"Aura\": aura_kg})\n", 183 | "add_nodes_testing(records=query_job)" 184 | 
] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "### Querying Nodes Comparison" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 6, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "$$$$ Starting Benchmark Node Query with options: ['Firestore', 'Mongo', 'Aura'] $$$$\n", 203 | "Firestore time for 100 Node Query: 0.9364566802978516\n", 204 | "Mongo time for 100 Node Query: 1.0969457626342773\n", 205 | "Aura time for 100 Node Query: 7.117481708526611\n", 206 | "hEllO wOrlD!\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "query_nodes_testing = NodeQueryBenchmark(\n", 212 | " benchmark_name=\"Node Query\", import_lim=import_lim, options_dict={\"Firestore\": fskg, \"Mongo\": mkg, \"Aura\": aura_kg})\n", 213 | "query_nodes_testing(records=query_job)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "### Querying deeply nested structures comparison" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "Challenge: Finding friends of friends of \"Q901\" (2nd degree directed)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 7, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "113" 239 | ] 240 | }, 241 | "execution_count": 7, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "f0f_list = []\n", 248 | "\n", 249 | "node_data = fskg.get_node(node_uid='Q901')\n", 250 | "\n", 251 | "for e in node_data.edges_from:\n", 252 | " neigh_node = fskg.get_node(node_uid=e)\n", 253 | " f0f_list.append(neigh_node.edges_from)\n", 254 | "\n", 255 | "len(sum(f0f_list, []))" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 8, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# f0f_list = []\n", 265 | "\n", 266 | "# node_data = mkg.get_node(node_uid='Q901')\n", 267 | "\n", 268 | "# for e in node_data.edges_from:\n", 269 | "# neigh_node = mkg.get_node(node_uid=e)\n", 270 | "# f0f_list.append(neigh_node.edges_from)\n", 271 | "\n", 272 | "# len(sum(f0f_list, []))" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "f0f_list" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 9, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "53" 291 | ] 292 | }, 293 | "execution_count": 9, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "records, summary, keys = aura_kg.driver.execute_query(\n", 300 | " \"MATCH (n)-[]-()-[]-(result) WHERE n.node_uid = 'Q901' RETURN result\")\n", 301 | "\n", 302 | "len(records)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 10, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "53" 314 | ] 315 | }, 316 | "execution_count": 10, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "records, summary, keys = aura_kg.driver.execute_query(\n", 323 | " \"\"\"MATCH (n)-[:DIRECTED]-()-[:DIRECTED]-(result)\n", 324 | " WHERE n.node_uid = 'Q901'\n", 325 | " RETURN result\"\"\")\n", 326 | "\n", 327 | "len(records)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 11, 333 | 
"metadata": {}, 334 | "outputs": [ 335 | { 336 | "data": { 337 | "text/plain": [ 338 | "53" 339 | ] 340 | }, 341 | "execution_count": 11, 342 | "metadata": {}, 343 | "output_type": "execute_result" 344 | } 345 | ], 346 | "source": [ 347 | "records, summary, keys = aura_kg.driver.execute_query(\n", 348 | " \"\"\"MATCH (n)-[:DIRECTED*2]-(result)\n", 349 | " WHERE n.node_uid = 'Q901'\n", 350 | " RETURN result\"\"\")\n", 351 | "\n", 352 | "len(records)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "Challenge 2: Finding friends of friends of friends \"Q901\" (3rd degree undirected)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 12, 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "data": { 369 | "text/plain": [ 370 | "51947" 371 | ] 372 | }, 373 | "execution_count": 12, 374 | "metadata": {}, 375 | "output_type": "execute_result" 376 | } 377 | ], 378 | "source": [ 379 | "f0fof_list = []\n", 380 | "\n", 381 | "node_data = fskg.get_node(node_uid='Q901')\n", 382 | "\n", 383 | "for e in node_data.edges_from + node_data.edges_to:\n", 384 | " neigh_node = fskg.get_node(node_uid=e)\n", 385 | " for e2 in neigh_node.edges_from + neigh_node.edges_to:\n", 386 | " neigh_node2 = fskg.get_node(node_uid=e2)\n", 387 | " f0fof_list.append(neigh_node2.edges_from)\n", 388 | " f0fof_list.append(neigh_node2.edges_to)\n", 389 | "\n", 390 | "len(sum(f0fof_list, []))" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 13, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "data": { 400 | "text/plain": [ 401 | "10078" 402 | ] 403 | }, 404 | "execution_count": 13, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "with GraphDatabase.driver(uri=aura_kg.uri, auth=aura_kg.auth) as driver:\n", 411 | " driver.verify_connectivity()\n", 412 | "\n", 413 | " # Use a parameter for node_uid in the Cypher query\n", 414 | " records, summary, keys = driver.execute_query(\n", 415 | " \"MATCH (n)-[]-()-[]-()-[]-(result) WHERE n.node_uid = 'Q901' RETURN result\")\n", 416 | "\n", 417 | "len(records)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "### Running Community Identification Comparison" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "tbd in comparison" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [] 440 | } 441 | ], 442 | "metadata": { 443 | "kernelspec": { 444 | "display_name": ".venv", 445 | "language": "python", 446 | "name": "python3" 447 | }, 448 | "language_info": { 449 | "codemirror_mode": { 450 | "name": "ipython", 451 | "version": 3 452 | }, 453 | "file_extension": ".py", 454 | "mimetype": "text/x-python", 455 | "name": "python", 456 | "nbconvert_exporter": "python", 457 | "pygments_lexer": "ipython3", 458 | "version": "3.11.8" 459 | } 460 | }, 461 | "nbformat": 4, 462 | "nbformat_minor": 2 463 | } 464 | -------------------------------------------------------------------------------- /benchmarks/main.py: -------------------------------------------------------------------------------- 1 | """graph2nosql latency benchmark library across graph storage implementations""" 2 | 3 | import os 4 | import json 5 | import time 6 | from abc import ABC, abstractmethod 7 | from typing import Any, Dict 8 | 9 | from dotenv import dotenv_values 10 | 11 | from 
google.cloud import bigquery 12 | import google.auth 13 | 14 | 15 | from base.operations import NoSQLKnowledgeGraph 16 | from databases.firestore_kg import FirestoreKG 17 | from databases.n4j import AuraKG 18 | 19 | from datamodel.data_model import NodeData, EdgeData 20 | 21 | 22 | class KGDBBenchmark(ABC): 23 | """ 24 | Abstract base class for defining latency benchmark experiments 25 | for different Knowledge Graph Databases (KGDBs). 26 | 27 | This class provides a framework for comparing the performance 28 | of different KGDB implementations on specific database operations. 29 | Concrete benchmark classes should inherit from this class and implement 30 | the `_construct_data` and `_db_transaction` methods. 31 | 32 | Attributes: 33 | benchmark_name (str): The name of the benchmark experiment. 34 | options_dict (Dict[str, NoSQLKnowledgeGraph]): Dictionary of 35 | KGDB implementations being compared. 36 | 37 | import_lim (int): The number of records to import/process in the benchmark. 38 | 39 | Example Usage: 40 | ```python 41 | class MyBenchmark(KGDBBenchmark): 42 | def __init__(self, options_dict, import_lim): 43 | super().__init__("My Benchmark", options_dict, import_lim) 44 | 45 | def _construct_data(self, row): 46 | # Implement logic to construct data for the benchmark from a row of input data. 47 | pass 48 | 49 | def _db_transaction(self, kgdb, option_name, data): 50 | # Implement the specific database operation to benchmark 51 | # using the provided kgdb and data. 52 | pass 53 | 54 | # Create instances of your KGDB implementations (e.g., FirestoreKG, AuraKG) 55 | option_1 = ... 56 | option_2 = ... 57 | 58 | options_dict = {"option_1_name": option_1, "option_2_name": option_2} 59 | 60 | # Create an instance of your benchmark class 61 | benchmark = MyBenchmark(options_dict, 1000) 62 | 63 | # Execute the benchmark 64 | benchmark(records) # 'records' would be your input data 65 | ``` 66 | """ 67 | 68 | 69 | def __init__(self, 70 | benchmark_name: str, 71 | options_dict: Dict[str, NoSQLKnowledgeGraph], 72 | import_lim: int, 73 | ): 74 | self.benchmark_name = benchmark_name 75 | self.import_lim = import_lim 76 | self.options_dict = options_dict 77 | self.option_names = list(options_dict.keys()) 78 | self.option_times = {} 79 | 80 | def __call__(self, records): 81 | 82 | print( 83 | f'$$$$ Starting Benchmark {self.benchmark_name} with options: {self.option_names} $$$$') 84 | 85 | for option_name in self.option_names: 86 | start_time = time.time() 87 | 88 | for row in records: 89 | data = self._construct_data(row) 90 | try: 91 | self._db_transaction(kgdb=self.options_dict[option_name], 92 | data=data, option_name=option_name) 93 | except Exception: 94 | pass 95 | 96 | end_time = time.time() 97 | self.option_times[option_name] = end_time - start_time 98 | 99 | self._benchmark_reporting() 100 | print("hEllO wOrlD!") 101 | 102 | def _benchmark_reporting(self) -> None: 103 | for option_name in self.option_names: 104 | print(f'{option_name} time for {self.import_lim} {self.benchmark_name}: {self.option_times[option_name]}') 105 | return None 106 | 107 | @abstractmethod 108 | def _construct_data(self, row: Any): 109 | """constructs the data to be used for the benchmark db transaction given the data records""" 110 | 111 | @abstractmethod 112 | def _db_transaction(self, kgdb: NoSQLKnowledgeGraph, option_name: str, data) -> None: 113 | """defines the db transaction that this benchmark run should compare""" 114 | 115 | 116 | class NodeImportBenchmark(KGDBBenchmark): 117 | """ 118 | Define Latency 
Benchmark for Node Import. Inherits from KGDBBenchmark.
119 | Implements _construct_data and _db_transaction methods for node import.
120 | """
121 | def _construct_data(self, row: Any):
122 | # constructs NodeData given a tuple[str, str, str] record
123 | record_values = row.values()
124 |
125 | body_str = json.loads(record_values[1])[0]
126 | # alias_list = json.loads(record_values[2])
127 | node_uid = record_values[0]
128 |
129 | node_data = NodeData(node_uid=node_uid,
130 | node_title=node_uid,
131 | node_description=body_str,
132 | node_degree=0,
133 | node_type="na",
134 | document_id="na")
135 | return node_data
136 |
137 | def _db_transaction(self, kgdb: NoSQLKnowledgeGraph, option_name, data: NodeData) -> None:
138 | # defines the db transaction that this benchmark run should compare
139 | try:
140 | kgdb.add_node(node_uid=data.node_uid, node_data=data)
141 | # print(f'Success adding node {data.node_uid} with {option_name}')
142 | except Exception as e:
143 | print(f"Error adding node {data.node_uid} with {option_name}: {e}")
144 |
145 |
146 | class EdgeImportBenchmark(KGDBBenchmark):
147 | """
148 | Define Latency Benchmark for edge import. Inherits from KGDBBenchmark.
149 | Implements _construct_data and _db_transaction methods for edge import.
150 | """
151 | def _construct_data(self, row: Any):
152 | # constructs EdgeData given a tuple[str, str, str, str] record
153 | # record_values = row.values()
154 |
155 | source_uid = row[0]
156 | edge_uid = row[1]
157 | target_uid = row[2]
158 | # description_body = json.loads(row.values()[3])
159 | edge_description = json.loads(row.values()[3])[0]
160 |
161 | edge_data = EdgeData(source_uid=source_uid,
162 | target_uid=target_uid,
163 | description=edge_description,
164 | edge_uid=edge_uid
165 | )
166 | return edge_data
167 |
168 | def _db_transaction(self, kgdb: NoSQLKnowledgeGraph, option_name: str, data: EdgeData) -> None:
169 | # defines the db transaction that this benchmark run should compare
170 | try:
171 | kgdb.add_edge(edge_data=data, directed=True)
172 | # print(f'Success adding edge {data.edge_uid} with {option_name}')
173 | except Exception as e:
174 | print(f"Error adding edge {data.edge_uid} with {option_name}: {e}")
175 | return None
176 |
177 |
178 | class NodeQueryBenchmark(KGDBBenchmark):
179 | """
180 | Define Latency Benchmark for node query. Inherits from KGDBBenchmark.
181 | Implements _construct_data and _db_transaction methods for node query.
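    Example (a minimal sketch mirroring benchmarks/import_benchmarks.ipynb; assumes
    configured FirestoreKG, MongoKG and AuraKG instances plus a BigQuery row
    iterator `query_job`):

        query_nodes_testing = NodeQueryBenchmark(
            benchmark_name="Node Query",
            import_lim=100,
            options_dict={"Firestore": fskg, "Mongo": mkg, "Aura": aura_kg})
        query_nodes_testing(records=query_job)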
182 | """ 183 | def _construct_data(self, row: Any): 184 | record_values = row.values() 185 | node_uid = record_values[0] 186 | return node_uid 187 | 188 | def _db_transaction(self, kgdb: NoSQLKnowledgeGraph, option_name, data: str): 189 | # defines the db transaction that this benchmark run should compare 190 | try: 191 | kgdb.get_node(node_uid=data) 192 | # print(f'Success fetching node data {data} with {option_name}') 193 | except Exception as e: 194 | print(f"Error fetching node data {data} with {option_name}: {e}") 195 | return None 196 | 197 | 198 | if __name__ == "__main__": 199 | os.chdir('../') 200 | current_directory = os.getcwd() 201 | print(f"Current directory: {current_directory}") 202 | 203 | secrets = dotenv_values(".env") 204 | credentials, _ = google.auth.load_credentials_from_file( 205 | secrets["GCP_CREDENTIAL_FILE"]) 206 | 207 | IMPORT_LIMIT = 100 208 | 209 | # Fetch Node data from BigQuery 210 | client = bigquery.Client(project=str( 211 | secrets["GCP_PROJECT_ID"]), credentials=credentials) 212 | 213 | fskg = FirestoreKG(gcp_credential_file=str(secrets["GCP_CREDENTIAL_FILE"]), 214 | gcp_project_id=str(secrets["GCP_PROJECT_ID"]), 215 | firestore_db_id=str(secrets["FIRESTORE_DB_ID"]), 216 | node_collection_id=str(secrets["NODE_COLL_ID"]), 217 | edges_collection_id=str(secrets["EDGES_COLL_ID"]), 218 | community_collection_id=str( 219 | secrets["COMM_COLL_ID"]) 220 | ) 221 | 222 | aura_kg = AuraKG(uri=str(secrets["NEO4J_URI"]), 223 | auth=(str(secrets["NEO4J_USERNAME"]), 224 | str(secrets["NEO4J_PASSWORD"])) 225 | ) 226 | 227 | # clean kg storages before starting test run 228 | # fskg.flush_kg() 229 | # aura_kg.flush_kg() 230 | 231 | # # # add nodes testing 232 | # query_job = client.query( 233 | # f"SELECT * FROM poerschmann-sem-search.wikidata_kg.entity_doc_alias_joined LIMIT {import_lim}") 234 | # add_nodes_testing = NodeImportBenchmark(benchmark_name="Node Import", option_1=fskg, option_2=aura_kg, import_lim=100) 235 | # add_nodes_testing(records=query_job) 236 | 237 | # # add egdes testing 238 | # query_job = client.query( 239 | # f"SELECT * FROM poerschmann-sem-search.wikidata_kg.triplets_relations_joined") 240 | # add_edges_testing = EdgeImportBenchmark(benchmark_name="Edge Import", option_1=fskg, option_2=aura_kg, import_lim=100) 241 | # add_edges_testing(records=query_job) 242 | 243 | print('hello base!') 244 | -------------------------------------------------------------------------------- /benchmarks/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-bigquery==3.26.0 2 | -------------------------------------------------------------------------------- /databases/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jakobap/graph2nosql/77df8ecba857c61381a37b878d57c20d52ff9834/databases/__init__.py -------------------------------------------------------------------------------- /databases/firestore_kg.py: -------------------------------------------------------------------------------- 1 | """Firestore database operations implementation""" 2 | 3 | from typing import List 4 | 5 | import firebase_admin # type: ignore 6 | from firebase_admin import firestore 7 | from google.cloud.firestore_v1.base_vector_query import DistanceMeasure 8 | from google.cloud.firestore_v1.vector import Vector 9 | import google.auth 10 | 11 | import networkx as nx # type: ignore 12 | 13 | from datamodel.data_model import NodeData, EdgeData, CommunityData 
14 | from base.operations import NoSQLKnowledgeGraph 15 | 16 | 17 | class FirestoreKG(NoSQLKnowledgeGraph): 18 | """Firestore database operations implementation class""" 19 | 20 | def __init__(self, 21 | gcp_project_id: str, 22 | gcp_credential_file: str, 23 | firestore_db_id: str, 24 | node_collection_id: str, 25 | edges_collection_id: str, 26 | community_collection_id: str 27 | ) -> None: 28 | """ 29 | Initializes the FirestoreKG object. 30 | 31 | Args: 32 | project_id (str): The Google Cloud project ID. 33 | database_id (str): The ID of the Firestore database. 34 | collection_name (str): The name of the collection to store the KG. 35 | """ 36 | super().__init__() 37 | 38 | if not firebase_admin._apps: 39 | credentials = firebase_admin.credentials.Certificate( 40 | gcp_credential_file 41 | ) 42 | app = firebase_admin.initialize_app(credentials) 43 | 44 | self.credentials, self.project_id = google.auth.load_credentials_from_file( 45 | gcp_credential_file) 46 | 47 | self.db = firestore.Client(project=gcp_project_id, # type: ignore 48 | credentials=self.credentials, 49 | database=firestore_db_id) 50 | 51 | self.gcp_project_id = gcp_project_id 52 | self.database_id = firestore_db_id 53 | self.node_coll_id = node_collection_id 54 | self.edges_coll_id = edges_collection_id 55 | self.community_coll_id = community_collection_id 56 | 57 | def add_node(self, node_uid: str, node_data: NodeData) -> None: 58 | """Adds an node to the knowledge graph.""" 59 | doc_ref = self.db.collection(self.node_coll_id).document(node_uid) 60 | 61 | # Check if a node with the same node_uid already exists 62 | if doc_ref.get().exists: 63 | raise ValueError( 64 | f"Error: Node with node_uid '{node_uid}' already exists.") 65 | 66 | # block NodeData if edge info is included 67 | if node_data.edges_to or node_data.edges_from: 68 | raise ValueError( 69 | f"""Error: NodeData cannot be initiated with edges_to or edges_from. Please add edges separately.""") 70 | 71 | # Convert NodeData to a dictionary for Firestore storage 72 | try: 73 | node_data_dict = node_data.__dict__ 74 | except TypeError as e: 75 | raise ValueError( 76 | f"Error: Provided node_data for node_uid '{node_uid}' cannot be converted to a dictionary. Details: {e}" 77 | ) from e 78 | 79 | # Set the document ID to match the node_uid 80 | try: 81 | doc_ref.set(node_data_dict) 82 | except ValueError as e: 83 | raise ValueError( 84 | f"Error: Could not add node with node_uid '{node_uid}' to Firestore. 
Details: {e}" 85 | ) from e 86 | 87 | # Update references in other nodes 88 | for other_node_uid in node_data.edges_to: 89 | try: 90 | other_node_data = self.get_node(other_node_uid) 91 | other_node_data.edges_from = list(set(other_node_data.edges_from) | { 92 | node_uid}) # Add to edges_from 93 | self.update_node(other_node_uid, other_node_data) 94 | except KeyError: 95 | # If the other node doesn't exist, just continue 96 | continue 97 | 98 | for other_node_uid in node_data.edges_from: 99 | try: 100 | other_node_data = self.get_node(other_node_uid) 101 | other_node_data.edges_to = list(set(other_node_data.edges_from) | { 102 | node_uid}) # Add to edges_to 103 | self.update_node(other_node_uid, other_node_data) 104 | except KeyError: 105 | # If the other node doesn't exist, just continue 106 | continue 107 | 108 | def get_node(self, node_uid: str) -> NodeData: 109 | """Retrieves an node from the knowledge graph.""" 110 | doc_ref = self.db.collection(self.node_coll_id).document(node_uid) 111 | doc_snapshot = doc_ref.get() 112 | 113 | if doc_snapshot.exists: 114 | try: 115 | node_data = NodeData(**doc_snapshot.to_dict()) 116 | return node_data 117 | except TypeError as e: 118 | raise ValueError( 119 | f"Error: Data fetched for node_uid '{node_uid}' does not match the NodeData format. Details: {e}" 120 | ) from e 121 | else: 122 | raise KeyError(f"Error: No node found with node_uid: {node_uid}") 123 | 124 | def update_node(self, node_uid: str, node_data: NodeData) -> None: 125 | """Updates an existing node in the knowledge graph.""" 126 | doc_ref = self.db.collection(self.node_coll_id).document(node_uid) 127 | 128 | # Check if the node exists 129 | if not doc_ref.get().exists: 130 | raise KeyError( 131 | f"Error: Node with node_uid '{node_uid}' does not exist.") 132 | 133 | # Convert NodeData to a dictionary for Firestore storage 134 | try: 135 | node_data_dict = node_data.__dict__ 136 | except TypeError as e: 137 | raise ValueError( 138 | f"Error: Provided node_data for node_uid '{node_uid}' cannot be converted to a dictionary. Details: {e}" 139 | ) from e 140 | 141 | # Update the document 142 | try: 143 | doc_ref.update(node_data_dict) 144 | except ValueError as e: 145 | raise ValueError( 146 | f"Error: Could not update node with node_uid '{node_uid}' in Firestore. Details: {e}" 147 | ) from e 148 | 149 | def remove_node(self, node_uid: str) -> None: 150 | """ 151 | Removes an node from the knowledge graph. 152 | Also removed all edges to and from the node to be removed from all other nodes. 153 | """ 154 | doc_ref = self.db.collection(self.node_coll_id).document(node_uid) 155 | 156 | # TODO: Update edge collection on edge removal. 157 | 158 | # Check if the node exists 159 | if not doc_ref.get().exists: 160 | raise KeyError( 161 | f"Error: Node with node_uid '{node_uid}' does not exist.") 162 | 163 | # 1. Get the node data to find its connections 164 | node_data = self.get_node(node_uid) 165 | 166 | # 2. Remove connections TO this node from other nodes 167 | for other_node_uid in node_data.edges_from: 168 | try: 169 | other_node_data = self.get_node(other_node_uid) 170 | other_node_data.edges_to = list( 171 | edge for edge in other_node_data.edges_to if edge != node_uid 172 | ) 173 | self.update_node(other_node_uid, other_node_data) 174 | except KeyError: 175 | # If the other node doesn't exist, just continue 176 | continue 177 | 178 | # 3. 
Remove connections FROM this node to other nodes 179 | for other_node_uid in node_data.edges_to: 180 | try: 181 | other_node_data = self.get_node(other_node_uid) 182 | other_node_data.edges_from = list( 183 | edge for edge in other_node_data.edges_from if edge != node_uid 184 | ) 185 | self.update_node(other_node_uid, other_node_data) 186 | except KeyError: 187 | # If the other node doesn't exist, just continue 188 | continue 189 | 190 | # 4. Finally, remove the node itself 191 | doc_ref.delete() 192 | 193 | def add_edge(self, edge_data: EdgeData) -> None: 194 | """ 195 | Adds an edge (relationship) between two entities in the knowledge graph. 196 | 197 | Args: 198 | source_uid (str): The UID of the source node. 199 | target_uid (str): The UID of the target node. 200 | edge_data (EdgeData): The edge data to be added. 201 | directed (bool, optional): Whether the edge is directed. Defaults to True. 202 | """ 203 | 204 | # Check if source and target nodes exist 205 | if not self.get_node(edge_data.source_uid): 206 | raise KeyError( 207 | f"Error: Source node with node_uid '{edge_data.source_uid}' does not exist.") 208 | if not self.get_node(edge_data.target_uid): 209 | raise KeyError( 210 | f"Error: Target node with node_uid '{edge_data.target_uid}' does not exist.") 211 | 212 | # Type checking for edge_data 213 | if not isinstance(edge_data, EdgeData): 214 | raise TypeError( 215 | f"Error: edge_data must be of type EdgeData, not {type(edge_data)}") 216 | 217 | edge_uid = self._generate_edge_uid( 218 | source_uid=edge_data.source_uid, target_uid=edge_data.target_uid) 219 | 220 | try: 221 | source_node_data = self.get_node(edge_data.source_uid) 222 | target_node_data = self.get_node(edge_data.target_uid) 223 | 224 | source_node_data.edges_to = list( 225 | set(source_node_data.edges_to) | {edge_data.target_uid}) 226 | self.update_node(edge_data.source_uid, source_node_data) 227 | 228 | # Add the edge to the target node's edges_from 229 | target_node_data.edges_from = list( 230 | set(target_node_data.edges_from) | {edge_data.source_uid}) 231 | self.update_node(edge_data.target_uid, target_node_data) 232 | 233 | # Add the edge to the edges collection 234 | self._update_egde_coll(edge_uid=edge_uid, 235 | target_uid=edge_data.target_uid, 236 | source_uid=edge_data.source_uid, 237 | description=edge_data.description, 238 | directed=edge_data.directed) 239 | 240 | if not edge_data.directed: # If undirected, add the reverse edge as well 241 | target_node_data.edges_to = list( 242 | set(target_node_data.edges_to) | {edge_data.source_uid}) 243 | self.update_node(edge_data.target_uid, target_node_data) 244 | 245 | # Since it's undirected, also add source_uid to target_node_data.edges_from 246 | source_node_data.edges_from = list( 247 | set(source_node_data.edges_from) | {edge_data.target_uid}) 248 | self.update_node(edge_data.source_uid, source_node_data) 249 | 250 | # Add the reverse edge to the edges collection 251 | reverse_edge_uid = self._generate_edge_uid(source_uid=edge_data.target_uid, 252 | target_uid=edge_data.source_uid) 253 | self._update_egde_coll(edge_uid=reverse_edge_uid, 254 | target_uid=edge_data.source_uid, 255 | source_uid=edge_data.target_uid, 256 | description=edge_data.description, 257 | directed=edge_data.directed) 258 | 259 | except ValueError as e: 260 | raise ValueError( 261 | f"Error: Could not add edge from '{edge_data.source_uid}' to '{edge_data.target_uid}'. 
Details: {e}" 262 | ) from e 263 | 264 | def get_edge(self, source_uid: str, target_uid: str) -> EdgeData: 265 | """Retrieves an edge between two entities from the edges collection.""" 266 | edge_uid = self._generate_edge_uid(source_uid, target_uid) 267 | edge_doc_ref = self.db.collection( 268 | self.edges_coll_id).document(edge_uid) 269 | doc_snapshot = edge_doc_ref.get() 270 | 271 | if doc_snapshot.exists: 272 | try: 273 | edge_data = EdgeData(**doc_snapshot.to_dict()) 274 | return edge_data 275 | except TypeError as e: 276 | raise ValueError( 277 | f"Error: Data fetched for edge_uid '{edge_uid}' does not match the EdgeData format. Details: {e}" 278 | ) from e 279 | else: 280 | raise KeyError(f"Error: No edge found with edge_uid: {edge_uid}") 281 | 282 | def update_edge(self, edge_data: EdgeData) -> None: 283 | """Updates an existing edge in the knowledge graph.""" 284 | 285 | # 1. Validate input and check if the edge exists 286 | if not isinstance(edge_data, EdgeData): 287 | raise TypeError( 288 | f"Error: edge_data must be of type EdgeData, not {type(edge_data)}") 289 | 290 | edge_uid = self._generate_edge_uid( 291 | edge_data.source_uid, edge_data.target_uid) 292 | 293 | if not self.db.collection(self.edges_coll_id).document(edge_uid).get().exists: 294 | raise KeyError( 295 | f"Error: Edge with edge_uid '{edge_uid}' does not exist.") 296 | 297 | # 2. Update the edge document in the EDGES collection 298 | try: 299 | self._update_egde_coll( 300 | edge_uid=edge_uid, 301 | target_uid=edge_data.target_uid, 302 | source_uid=edge_data.source_uid, 303 | description=edge_data.description, 304 | directed=edge_data.directed 305 | ) 306 | except Exception as e: 307 | raise Exception( 308 | f"Error updating edge in edges collection: {e}") from e 309 | 310 | # 3. Update edge references in the NODES collection 311 | try: 312 | # 3a. Update source node 313 | source_node_data = self.get_node(edge_data.source_uid) 314 | # Ensure the target_uid is present in edges_to 315 | if edge_data.target_uid not in source_node_data.edges_to: 316 | source_node_data.edges_to = list( 317 | set(source_node_data.edges_to) | {edge_data.target_uid}) 318 | self.update_node(edge_data.source_uid, source_node_data) 319 | 320 | # 3b. 
Update target node 321 | target_node_data = self.get_node(edge_data.target_uid) 322 | # Ensure the source_uid is present in edges_from 323 | if edge_data.source_uid not in target_node_data.edges_from: 324 | target_node_data.edges_from = list( 325 | set(target_node_data.edges_from) | {edge_data.source_uid}) 326 | self.update_node(edge_data.target_uid, target_node_data) 327 | 328 | except Exception as e: 329 | raise Exception( 330 | f"Error updating edge references in nodes: {e}") from e 331 | 332 | def _delete_from_edge_coll(self, edge_uid: str) -> None: 333 | """Method to delete record from edge collection of given kg store""" 334 | edge_doc_ref = self.db.collection( 335 | self.edges_coll_id).document(edge_uid) 336 | edge_doc_ref.delete() 337 | 338 | def remove_edge(self, source_uid: str, target_uid: str) -> None: 339 | """Removes an edge between two entities.""" 340 | 341 | # Get involved edge and node data 342 | try: 343 | edge_data = self.get_edge( 344 | source_uid=source_uid, target_uid=target_uid) 345 | except Exception as e: 346 | raise Exception(f"Error getting edge: {e}") from e 347 | 348 | try: 349 | source_node_data = self.get_node(node_uid=source_uid) 350 | except Exception as e: 351 | raise Exception(f"Error getting source node: {e}") from e 352 | 353 | try: 354 | target_node_data = self.get_node(node_uid=target_uid) 355 | except Exception as e: 356 | raise Exception(f"Error getting target node: {e}") from e 357 | 358 | # remove target_uid from from source -> target 359 | try: 360 | source_node_data.edges_to.remove(target_uid) 361 | self.update_node(source_uid, source_node_data) 362 | except ValueError as e: 363 | raise ValueError( 364 | f"Error: Target node not in source's edges_to: {e}") 365 | 366 | # remove source_uid from target <- source 367 | try: 368 | target_node_data.edges_from.remove(source_uid) 369 | self.update_node(target_uid, target_node_data) 370 | except ValueError as e: 371 | raise ValueError( 372 | f"Error: Source node not in target's edges_to: {e}") 373 | 374 | # Remove the edge from the edges collection 375 | edge_uid = self._generate_edge_uid(source_uid, target_uid) 376 | self._delete_from_edge_coll(edge_uid=edge_uid) 377 | 378 | # remove the opposite direction if edge undirected 379 | if not edge_data.directed: 380 | # remove target_uid from source <- target 381 | try: 382 | source_node_data.edges_from.remove(target_uid) 383 | self.update_node(source_uid, source_node_data) 384 | except ValueError as e: 385 | raise ValueError( 386 | f"Error: Target node not in source's edges_to: {e}") 387 | 388 | # remove source_uid from target -> source 389 | try: 390 | target_node_data.edges_to.remove(source_uid) 391 | self.update_node(target_uid, target_node_data) 392 | except ValueError as e: 393 | raise ValueError( 394 | f"Error: Source node not in target's edges_to: {e}") 395 | 396 | # Remove the edge from the edges collection 397 | reverse_edge_uid = self._generate_edge_uid(target_uid, source_uid) 398 | self._delete_from_edge_coll(edge_uid=reverse_edge_uid) 399 | else: 400 | pass 401 | 402 | def build_networkx(self): 403 | """Get the NetworkX representation of the full graph.""" 404 | 405 | graph = nx.Graph() # Initialize an undirected NetworkX graph 406 | 407 | # 1. Add Nodes to the NetworkX Graph 408 | nodes_ref = self.db.collection(self.node_coll_id).stream() 409 | for doc in nodes_ref: 410 | node_data = doc.to_dict() 411 | graph.add_node(doc.id, **node_data) 412 | 413 | # 2. 
Add Edges to the NetworkX Graph 414 | edges_ref = self.db.collection(self.edges_coll_id).stream() 415 | for doc in edges_ref: 416 | edge_data = doc.to_dict() 417 | source_uid = edge_data['source_uid'] 418 | target_uid = edge_data['target_uid'] 419 | # Consider adding edge attributes if needed (e.g., 'description') 420 | graph.add_edge(source_uid, target_uid) 421 | 422 | self.networkx = graph 423 | 424 | def get_community(self, community_id: str) -> CommunityData: 425 | """Retrieves the community report for a given community id.""" 426 | doc_ref = self.db.collection( 427 | self.community_coll_id).document(community_id) 428 | doc_snapshot = doc_ref.get() 429 | 430 | if doc_snapshot.exists: 431 | try: 432 | community_data = CommunityData(**doc_snapshot.to_dict()) 433 | return community_data 434 | except TypeError as e: 435 | raise ValueError( 436 | f"Error: Data fetched for community_id '{community_id}' does not match the CommunityData format. Details: {e}" 437 | ) from e 438 | else: 439 | raise KeyError( 440 | f"Error: No community found with community_id: {community_id}") 441 | 442 | def list_communities(self) -> List[CommunityData]: 443 | """Lists all communities for the given network.""" 444 | docs = self.db.collection(self.community_coll_id).stream() 445 | return [CommunityData.__from_dict__(doc.to_dict()) for doc in docs] 446 | 447 | def _update_egde_coll(self, edge_uid: str, source_uid: str, target_uid: str, description: str, directed: bool) -> None: 448 | """Update edge record in the edges collection.""" 449 | edge_doc_ref = self.db.collection( 450 | self.edges_coll_id).document(edge_uid) 451 | edge_data_dict = { 452 | "edge_uid": edge_uid, 453 | "source_uid": source_uid, 454 | "target_uid": target_uid, 455 | "description": description, 456 | "directed": directed 457 | } 458 | edge_doc_ref.set(edge_data_dict) 459 | 460 | def store_community(self, community: CommunityData) -> None: 461 | """Takes valid graph community data and upserts the database with it. 462 | https://www.nature.com/articles/s41598-019-41695-z 463 | """ 464 | # Convert CommunityData to a dictionary for Firestore storage 465 | try: 466 | community_data_dict = community.__dict__ 467 | except TypeError as e: 468 | raise ValueError( 469 | f"Error: Provided community data for community '{community.title}' cannot be converted to a dictionary. 
Details: {e}" 470 | ) from e 471 | 472 | # Get a reference to the document 473 | doc_ref = self.db.collection( 474 | self.community_coll_id).document(community.title) 475 | 476 | # Use set with merge=True to upsert the document 477 | try: 478 | doc_ref.set(community_data_dict, merge=True) 479 | except Exception as e: 480 | raise Exception(f"Error storing community data: {e}") from e 481 | 482 | def _generate_edge_uid(self, source_uid: str, target_uid: str): 483 | return f"{source_uid}_to_{target_uid}" 484 | 485 | def node_exist(self, node_uid: str) -> bool: 486 | """Checks for node existence and returns boolean""" 487 | doc_ref = self.db.collection(self.node_coll_id).document(node_uid) 488 | doc_snapshot = doc_ref.get() 489 | 490 | if doc_snapshot.exists: 491 | return True 492 | else: 493 | return False 494 | 495 | def edge_exist(self, source_uid: str, target_uid: str) -> bool: 496 | """Checks for edge existence and returns boolean""" 497 | edge_uid = self._generate_edge_uid( 498 | source_uid=source_uid, target_uid=target_uid) 499 | doc_ref = self.db.collection(self.edges_coll_id).document(edge_uid) 500 | doc_snapshot = doc_ref.get() 501 | 502 | if doc_snapshot.exists: 503 | return True 504 | else: 505 | return False 506 | 507 | def get_nearest_neighbors(self, query_vec: list[float]) -> list: 508 | """ 509 | Implements nearest neighbor search based on Firestore embedding index: 510 | https://firebase.google.com/docs/firestore/vector-search 511 | """ 512 | 513 | collection = self.db.collection(self.node_coll_id) 514 | 515 | # Requires vector index 516 | nn = collection.find_nearest( 517 | vector_field="embedding", 518 | query_vector=Vector(query_vec), 519 | distance_measure=DistanceMeasure.EUCLIDEAN, 520 | limit=10).get() 521 | return [n.to_dict() for n in nn] 522 | 523 | def clean_zerodegree_nodes(self) -> None: 524 | """Removes all nodes with degree 0.""" 525 | nodes_to_remove = [] 526 | 527 | # 1. Iterate through all nodes to find those with degree 0 528 | nodes_ref = self.db.collection(self.node_coll_id).stream() 529 | for doc in nodes_ref: 530 | node_data = doc.to_dict() 531 | if len(node_data.get('edges_to', [])) + len(node_data.get('edges_from', [])) == 0: 532 | nodes_to_remove.append(doc.id) 533 | 534 | # 2. 
Remove the identified nodes 535 | for node_uid in nodes_to_remove: 536 | self.remove_node(node_uid) 537 | return None 538 | 539 | def flush_kg(self) -> None: 540 | """Method to wipe the complete datastore of the knowledge graph""" 541 | for collection_id in [self.node_coll_id, self.edges_coll_id, self.community_coll_id]: 542 | docs = self.db.collection(collection_id).stream() 543 | for doc in docs: 544 | doc.reference.delete() 545 | return None 546 | 547 | 548 | if __name__ == "__main__": 549 | import os 550 | from dotenv import dotenv_values 551 | 552 | os.chdir(os.path.dirname(os.path.abspath(__file__))) 553 | 554 | secrets = dotenv_values(".env") 555 | 556 | firestore_credential_file = str(secrets["GCP_CREDENTIAL_FILE"]) 557 | project_id = str(secrets["GCP_PROJECT_ID"]) 558 | database_id = str(secrets["FIRESTORE_DB_ID"]) 559 | node_coll_id = str(secrets["NODE_COLL_ID"]) 560 | edges_coll_id = str(secrets["EDGES_COLL_ID"]) 561 | community_coll_id = str(secrets["COMM_COLL_ID"]) 562 | 563 | fskg = FirestoreKG( 564 | gcp_project_id=project_id, 565 | gcp_credential_file=firestore_credential_file, 566 | firestore_db_id=database_id, 567 | node_collection_id=node_coll_id, 568 | edges_collection_id=edges_coll_id, 569 | community_collection_id=community_coll_id 570 | ) 571 | 572 | node = fskg.get_node(node_uid="2022 IRANIAN PROTESTS") 573 | 574 | nn = fskg.get_nearest_neighbors(node.embedding) 575 | 576 | for n in nn: 577 | print(n["node_uid"]) 578 | 579 | print("Hello World!") 580 | print("") 581 | -------------------------------------------------------------------------------- /databases/mdb.py: -------------------------------------------------------------------------------- 1 | """MongoDB Database Operations""" 2 | 3 | from typing import List 4 | 5 | from pymongo.mongo_client import MongoClient 6 | from pymongo.server_api import ServerApi 7 | 8 | from datamodel.data_model import NodeData, EdgeData, CommunityData 9 | from base.operations import NoSQLKnowledgeGraph 10 | 11 | import networkx as nx # type: ignore 12 | 13 | 14 | class MongoKG(NoSQLKnowledgeGraph): 15 | """MongoDB Database Operations Class""" 16 | 17 | def __init__(self, 18 | mdb_uri: str, 19 | mdb_db_id: str, 20 | node_coll_id: str, 21 | edges_coll_id: str, 22 | community_collection_id: str 23 | ): 24 | super().__init__() 25 | 26 | # Connect and send a ping to confirm a successful mongo db connection 27 | self.mdb_client = MongoClient(str(mdb_uri), server_api=ServerApi('1')) 28 | 29 | self.db = self.mdb_client[mdb_db_id] 30 | self.mdb_node_coll = self.db[node_coll_id] 31 | self.mdbe_edges_coll = self.db[edges_coll_id] 32 | self.mdb_comm_coll = self.db[community_collection_id] 33 | 34 | try: 35 | # client.admin.command('ping') 36 | self.mdb_client.admin.command('ping') 37 | print("Pinged your deployment. You successfully connected to MongoDB!") 38 | except Exception as e: 39 | print(e) 40 | raise Exception(f"Error connecting to MongoDB: {e}") 41 | 42 | def add_node(self, node_uid: str, node_data: NodeData) -> None: 43 | """Adds an node to the knowledge graph.""" 44 | # Check if a node with the same node_uid already exists 45 | if self.mdb_node_coll.find_one({"node_uid": node_uid}): 46 | raise KeyError( 47 | f"Error: Node with node_uid '{node_uid}' already exists.") 48 | 49 | if node_data.edges_to or node_data.edges_from: 50 | raise ValueError( 51 | f"""Error: NodeData cannot be initiated with edges_to or edges_from. 
Please add edges separately.""") 52 | 53 | try: 54 | # Convert NodeData to a dictionary for MongoDB storage 55 | node_data_dict = node_data.__dict__ 56 | 57 | # Insert the node data into the collection 58 | self.mdb_node_coll.insert_one(node_data_dict) 59 | 60 | except Exception as e: 61 | raise Exception( 62 | f"Error adding node with node_uid '{node_uid}': {e}") from e 63 | 64 | def get_node(self, node_uid: str) -> NodeData: 65 | """Retrieves an node from the knowledge graph.""" 66 | # Find the node data based on node_uid 67 | node_data_dict = self.mdb_node_coll.find_one({"node_uid": node_uid}) 68 | 69 | if node_data_dict: 70 | # Convert the dictionary back to a NodeData object 71 | return NodeData( 72 | node_uid=node_data_dict['node_uid'], 73 | node_title=node_data_dict['node_title'], 74 | node_type=node_data_dict['node_type'], 75 | node_description=node_data_dict['node_description'], 76 | node_degree=node_data_dict.get('node_degree', 0), 77 | document_id=node_data_dict.get('document_id', ''), 78 | community_id=node_data_dict.get('community_id', ''), 79 | edges_to=node_data_dict.get('edges_to', []), 80 | edges_from=node_data_dict.get('edges_from', []), 81 | embedding=node_data_dict.get('embedding', []) 82 | ) 83 | else: 84 | raise KeyError(f"Error: No node found with node_uid: {node_uid}") 85 | 86 | def update_node(self, node_uid: str, node_data: NodeData) -> None: 87 | """Updates an existing node in the knowledge graph.""" 88 | try: 89 | # Check if the node exists 90 | if not self.mdb_node_coll.find_one({"node_uid": node_uid}): 91 | raise KeyError( 92 | f"Error: Node with node_uid '{node_uid}' does not exist.") 93 | 94 | # Convert NodeData to a dictionary for MongoDB storage 95 | node_data_dict = node_data.__dict__ 96 | 97 | # Update the node data in the collection 98 | self.mdb_node_coll.update_one( 99 | {"node_uid": node_uid}, {"$set": node_data_dict} 100 | ) 101 | 102 | except Exception as e: 103 | raise Exception( 104 | f"Error updating node with node_uid '{node_uid}': {e}") from e 105 | 106 | def remove_node(self, node_uid: str) -> None: 107 | """Removes a node from the knowledge graph.""" 108 | 109 | # Check if the node exists 110 | if not self.node_exist(node_uid=node_uid): 111 | raise KeyError( 112 | f"Error: Node with node_uid '{node_uid}' does not exist.") 113 | 114 | # 1. Get the node data to find its connections 115 | node_data = self.get_node(node_uid) 116 | 117 | # TODO: Update edge collection on edge removal. 118 | 119 | # 2. Remove connections TO this node from other nodes 120 | for other_node_uid in node_data.edges_from: 121 | try: 122 | other_node_data = self.get_node(other_node_uid) 123 | other_node_data.edges_to = list( 124 | edge for edge in other_node_data.edges_to if edge != node_uid 125 | ) 126 | self.update_node(other_node_uid, other_node_data) 127 | except KeyError: 128 | # If the other node doesn't exist, just continue 129 | continue 130 | 131 | # 3. Remove connections FROM this node to other nodes 132 | for other_node_uid in node_data.edges_to: 133 | try: 134 | other_node_data = self.get_node(other_node_uid) 135 | other_node_data.edges_from = list( 136 | edge for edge in other_node_data.edges_from if edge != node_uid 137 | ) 138 | self.update_node(other_node_uid, other_node_data) 139 | except KeyError: 140 | # If the other node doesn't exist, just continue 141 | continue 142 | 143 | # 4. 
Finally, remove the node itself 144 | delete_result = self.mdb_node_coll.delete_one({"node_uid": node_uid}) 145 | if delete_result.deleted_count == 1: 146 | return None 147 | else: 148 | raise KeyError(f"Error: No node found with node_uid: {node_uid}") 149 | 150 | def add_edge(self, edge_data: EdgeData) -> None: 151 | """Adds an edge (relationship) between two entities in the knowledge graph.""" 152 | 153 | # TODO: consider moving to base class. 154 | 155 | # Check if source and target nodes exist 156 | if not self.node_exist(edge_data.source_uid): 157 | raise KeyError( 158 | f"Error: Source node with node_uid '{edge_data.source_uid}' does not exist.") 159 | if not self.node_exist(edge_data.target_uid): 160 | raise KeyError( 161 | f"Error: Target node with node_uid '{edge_data.target_uid}' does not exist.") 162 | 163 | # Type checking for edge_data 164 | if not isinstance(edge_data, EdgeData): 165 | raise TypeError( 166 | f"Error: edge_data must be of type EdgeData, not {type(edge_data)}") 167 | 168 | edge_uid = self._generate_edge_uid( 169 | edge_data.source_uid, edge_data.target_uid) 170 | 171 | try: 172 | source_node_data = self.get_node(edge_data.source_uid) 173 | target_node_data = self.get_node(edge_data.target_uid) 174 | 175 | source_node_data.edges_to = list( 176 | set(source_node_data.edges_to) | {edge_data.target_uid}) 177 | self.update_node(edge_data.source_uid, source_node_data) 178 | 179 | # Add the edge to the target node's edges_from 180 | target_node_data.edges_from = list( 181 | set(target_node_data.edges_from) | {edge_data.source_uid}) 182 | self.update_node(edge_data.target_uid, target_node_data) 183 | 184 | # Add the edge to the edges collection 185 | self._update_egde_coll(edge_uid=edge_uid, 186 | target_uid=edge_data.target_uid, 187 | source_uid=edge_data.source_uid, 188 | description=edge_data.description, 189 | directed=edge_data.directed) 190 | 191 | if not edge_data.directed: # If undirected, add the reverse edge as well 192 | reverse_edge_uid = self._generate_edge_uid( 193 | edge_data.target_uid, edge_data.source_uid) 194 | 195 | target_node_data.edges_to = list( 196 | set(target_node_data.edges_to) | {edge_data.source_uid}) 197 | self.update_node(edge_data.target_uid, target_node_data) 198 | 199 | # Since it's undirected, also add source_uid to target_node_data.edges_from 200 | source_node_data.edges_from = list( 201 | set(source_node_data.edges_from) | {edge_data.target_uid}) 202 | self.update_node(edge_data.source_uid, source_node_data) 203 | 204 | # Add the reverse edge to the edges collection 205 | self._update_egde_coll(edge_uid=reverse_edge_uid, 206 | target_uid=edge_data.source_uid, 207 | source_uid=edge_data.target_uid, 208 | description=edge_data.description, 209 | directed=edge_data.directed) 210 | 211 | except ValueError as e: 212 | raise ValueError( 213 | f"Error: Could not add edge from '{edge_data.source_uid}' to '{edge_data.target_uid}'. 
Details: {e}" 214 | ) from e 215 | 216 | def get_edge(self, source_uid: str, target_uid: str) -> EdgeData: 217 | """Retrieves an edge between two entities.""" 218 | edge_uid = self._generate_edge_uid(source_uid, target_uid) 219 | edge_data_dict = self.mdbe_edges_coll.find_one({"edge_uid": edge_uid}) 220 | 221 | if edge_data_dict: 222 | return EdgeData( 223 | edge_uid=edge_data_dict.get('edge_uid', ''), 224 | source_uid=edge_data_dict.get('source_uid', ''), 225 | target_uid=edge_data_dict.get('target_uid', ''), 226 | description=edge_data_dict.get('description', ''), 227 | directed=edge_data_dict.get('directed', True) 228 | ) 229 | else: 230 | raise KeyError(f"Error: No edge found with edge_uid: {edge_uid}") 231 | 232 | def update_edge(self, edge_data: EdgeData) -> None: 233 | """Updates an existing edge in the knowledge graph.""" 234 | 235 | # TODO: Consider moving to base 236 | 237 | # 1. Validate input and check if the edge exists 238 | if not isinstance(edge_data, EdgeData): 239 | raise TypeError( 240 | f"Error: edge_data must be of type EdgeData, not {type(edge_data)}") 241 | 242 | edge_uid = self._generate_edge_uid( 243 | edge_data.source_uid, edge_data.target_uid) 244 | 245 | if not self.edge_exist(source_uid=edge_data.source_uid, target_uid=edge_data.target_uid): 246 | raise KeyError( 247 | f"Error: Edge with edge_uid '{edge_uid}' does not exist.") 248 | 249 | # 2. Update the edge document in the EDGES collection 250 | try: 251 | self._update_egde_coll( 252 | edge_uid=edge_uid, 253 | target_uid=edge_data.target_uid, 254 | source_uid=edge_data.source_uid, 255 | description=edge_data.description, 256 | directed=edge_data.directed 257 | ) 258 | except Exception as e: 259 | raise Exception( 260 | f"Error updating edge in edges collection: {e}") from e 261 | 262 | # 3. Update edge references in the NODES collection 263 | try: 264 | # 3a. Update source node 265 | source_node_data = self.get_node(edge_data.source_uid) 266 | # Ensure the target_uid is present in edges_to 267 | if edge_data.target_uid not in source_node_data.edges_to: 268 | source_node_data.edges_to = list( 269 | set(source_node_data.edges_to) | {edge_data.target_uid}) 270 | self.update_node(edge_data.source_uid, source_node_data) 271 | 272 | # 3b. 
Update target node
273 | target_node_data = self.get_node(edge_data.target_uid)
274 | # Ensure the source_uid is present in edges_from
275 | if edge_data.source_uid not in target_node_data.edges_from:
276 | target_node_data.edges_from = list(
277 | set(target_node_data.edges_from) | {edge_data.source_uid})
278 | self.update_node(edge_data.target_uid, target_node_data)
279 |
280 | except Exception as e:
281 | raise Exception(
282 | f"Error updating edge references in nodes: {e}") from e
283 |
284 | def _delete_from_edge_coll(self, edge_uid: str) -> None:
285 | """Method to delete record from edge collection of given kg store"""
286 | delete_result = self.mdbe_edges_coll.delete_one({"edge_uid": edge_uid})
287 | if delete_result.deleted_count == 0:
288 | raise KeyError(
289 | f"Error: No edge found with edge_uid '{edge_uid}'")
290 |
291 | def remove_edge(self, source_uid: str, target_uid: str) -> None:
292 | """Removes an edge between two entities."""
293 |
294 | # Get involved edge and node data
295 | try:
296 | edge_data = self.get_edge(
297 | source_uid=source_uid, target_uid=target_uid)
298 | except Exception as e:
299 | raise KeyError(f"Error getting edge: {e}") from e
300 |
301 | try:
302 | source_node_data = self.get_node(node_uid=source_uid)
303 | except Exception as e:
304 | raise KeyError(f"Error getting source node: {e}") from e
305 |
306 | try:
307 | target_node_data = self.get_node(node_uid=target_uid)
308 | except Exception as e:
309 | raise KeyError(f"Error getting target node: {e}") from e
310 |
311 | # remove target_uid from source -> target
312 | try:
313 | source_node_data.edges_to.remove(target_uid)
314 | self.update_node(source_uid, source_node_data)
315 | except ValueError as e:
316 | raise ValueError(
317 | f"Error: Target node not in source's edges_to: {e}") from e
318 |
319 | # remove source_uid from target <- source
320 | try:
321 | target_node_data.edges_from.remove(source_uid)
322 | self.update_node(target_uid, target_node_data)
323 | except ValueError as e:
324 | raise ValueError(
325 | f"Error: Source node not in target's edges_from: {e}") from e
326 |
327 | # Remove the edge from the edges collection
328 | edge_uid = self._generate_edge_uid(source_uid, target_uid)
329 | self._delete_from_edge_coll(edge_uid=edge_uid)
330 |
331 | # remove the opposite direction if edge undirected
332 | if not edge_data.directed:
333 | # remove target_uid from source <- target
334 | try:
335 | source_node_data.edges_from.remove(target_uid)
336 | self.update_node(source_uid, source_node_data)
337 | except ValueError as e:
338 | raise ValueError(
339 | f"Error: Target node not in source's edges_from: {e}") from e
340 |
341 | # remove source_uid from target -> source
342 | try:
343 | target_node_data.edges_to.remove(source_uid)
344 | self.update_node(target_uid, target_node_data)
345 | except ValueError as e:
346 | raise ValueError(
347 | f"Error: Source node not in target's edges_to: {e}") from e
348 |
349 | # Remove the edge from the edges collection
350 | reverse_edge_uid = self._generate_edge_uid(source_uid=target_uid,
351 | target_uid=source_uid)
352 | self._delete_from_edge_coll(edge_uid=reverse_edge_uid)
353 | else:
354 | pass
355 |
356 | def build_networkx(self) -> None:
357 | """Builds the NetworkX representation of the full graph.
358 | https://networkx.org/documentation/stable/index.html
359 | """
360 | graph = nx.Graph() # Initialize an undirected NetworkX graph
361 |
362 | # 1.
Add Nodes to the NetworkX Graph 363 | for node in self.mdb_node_coll.find(): 364 | graph.add_node(node['node_uid'], **node) 365 | 366 | # 2. Add Edges to the NetworkX Graph 367 | for edge in self.mdbe_edges_coll.find(): 368 | source_uid = edge['source_uid'] 369 | target_uid = edge['target_uid'] 370 | graph.add_edge(source_uid, target_uid) 371 | 372 | self.networkx = graph 373 | 374 | def store_community(self, community: CommunityData) -> None: 375 | """Takes valid graph community data and upserts the database with it. 376 | https://www.nature.com/articles/s41598-019-41695-z 377 | """ 378 | pass 379 | 380 | def _generate_edge_uid(self, source_uid: str, target_uid: str): 381 | return f"{source_uid}_to_{target_uid}" 382 | 383 | def _update_egde_coll(self, edge_uid: str, source_uid: str, 384 | target_uid: str, description: str, directed: bool) -> None: 385 | """Update edge record in the edges collection.""" 386 | edge_data_dict = { 387 | "edge_uid": edge_uid, 388 | "source_uid": source_uid, 389 | "target_uid": target_uid, 390 | "description": description, 391 | "directed": directed 392 | } 393 | self.mdbe_edges_coll.update_one( 394 | {"edge_uid": edge_uid}, {"$set": edge_data_dict}, upsert=True 395 | ) 396 | return None 397 | 398 | def get_nearest_neighbors(self, query_vec) -> List[str]: 399 | """Implements nearest neighbor search based on nosql db index.""" 400 | pass 401 | 402 | def get_community(self, community_id: str) -> CommunityData: 403 | """Retrieves the community report for a given community id.""" 404 | return 405 | 406 | def list_communities(self) -> List[CommunityData]: 407 | """Lists all stored communities for the given network.""" 408 | return 409 | 410 | def clean_zerodegree_nodes(self) -> None: 411 | """Removes all nodes with degree 0.""" 412 | return 413 | 414 | def edge_exist(self, source_uid: str, target_uid: str) -> bool: 415 | """Checks for edge existence and returns boolean""" 416 | edge_uid = self._generate_edge_uid(source_uid, target_uid) 417 | if self.mdbe_edges_coll.find_one({"edge_uid": edge_uid}) is not None: 418 | return True 419 | return False 420 | 421 | def node_exist(self, node_uid: str) -> bool: 422 | """Checks for node existence and returns boolean""" 423 | if self.mdb_node_coll.find_one({"node_uid": node_uid}) is not None: 424 | return True 425 | else: 426 | return False 427 | 428 | def flush_kg(self) -> None: 429 | """Method to wipe the complete datastore of the knowledge graph""" 430 | try: 431 | # Drop the node collection 432 | self.mdb_node_coll.drop() 433 | 434 | # Drop the edges collection 435 | self.mdbe_edges_coll.drop() 436 | 437 | # Drop the community collection 438 | self.mdb_comm_coll.drop() 439 | 440 | except Exception as e: 441 | raise Exception(f"Error flushing MongoDB collections: {e}") from e 442 | 443 | 444 | if __name__ == "__main__": 445 | import os 446 | from dotenv import dotenv_values 447 | 448 | os.chdir(os.path.dirname(os.path.abspath(__file__))) 449 | 450 | secrets = dotenv_values("../.env") 451 | 452 | mdb_username = str(secrets["MDB_USERNAME"]) 453 | mdb_passowrd = str(secrets["MDB_PASSWORD"]) 454 | mdb_cluster = str(secrets["MDB_CLUSTER"]) 455 | 456 | uri = f"mongodb+srv://{mdb_username}:{mdb_passowrd}@cluster0.pjx3w.mongodb.net/?retryWrites=true&w=majority&appName={mdb_cluster}" 457 | 458 | mkg = MongoKG( 459 | mdb_uri=uri, 460 | mdb_db_id=str(secrets["MDB_DB_ID"]), 461 | node_coll_id=str(secrets["NODE_COLL_ID"]), 462 | edges_coll_id=str(secrets["EDGES_COLL_ID"]), 463 | community_collection_id=str(secrets["COMM_COLL_ID"]) 464 | 
) 465 | 466 | # node = mkg.get_node(node_uid="2022 IRANIAN PROTESTS") 467 | 468 | print("Hello World!") 469 | -------------------------------------------------------------------------------- /databases/n4j.py: -------------------------------------------------------------------------------- 1 | """Neo4j database operations""" 2 | 3 | import os 4 | from typing import List 5 | 6 | import dotenv 7 | 8 | from neo4j import GraphDatabase 9 | import networkx as nx # type: ignore 10 | 11 | from base.operations import NoSQLKnowledgeGraph 12 | from datamodel.data_model import NodeData, EdgeData, CommunityData 13 | 14 | 15 | class AuraKG(NoSQLKnowledgeGraph): 16 | """ 17 | Neo4j Aura implementation for storing and interacting with the KG and managing the data model. 18 | """ 19 | 20 | def __init__(self, 21 | uri: str, 22 | auth: tuple[str, str] 23 | ): 24 | super().__init__() 25 | self.uri = uri 26 | self.auth = auth 27 | 28 | self.driver = GraphDatabase.driver(uri, auth=auth) 29 | 30 | def add_node(self, node_uid: str, node_data: NodeData) -> None: 31 | """Adds a node to the knowledge graph.""" 32 | 33 | # with GraphDatabase.driver(self.uri, auth=self.auth) as driver: 34 | # self.driver.verify_connectivity() 35 | # print("Connection established.") 36 | 37 | summary = self.driver.execute_query( 38 | "CREATE (:" + node_data.node_type + " { " 39 | "node_uid: $node_uid, " 40 | "node_title: $node_title, " 41 | "node_type: $node_type, " 42 | "node_description: $node_description, " 43 | "node_degree: $node_degree, " 44 | "document_id: $document_id, " 45 | "community_id: $community_id, " 46 | "edges_to: $edges_to, " 47 | "edges_from: $edges_from, " 48 | "embedding: $embedding " 49 | "})", 50 | node_uid=node_data.node_uid, 51 | node_title=node_data.node_title, 52 | node_type=node_data.node_type, 53 | node_description=node_data.node_description, 54 | node_degree=node_data.node_degree, 55 | document_id=node_data.document_id, 56 | community_id=node_data.community_id, 57 | edges_to=node_data.edges_to, 58 | edges_from=node_data.edges_from, 59 | embedding=node_data.embedding 60 | ).summary 61 | 62 | # print("Created {nodes_created} nodes with uid {node_uid} in {time} ms.".format( 63 | # nodes_created=summary.counters.nodes_created, 64 | # node_uid=node_uid, 65 | # time=summary.result_available_after 66 | # )) 67 | return None 68 | 69 | def get_node(self, node_uid: str) -> NodeData: 70 | """Retrieves a node from the knowledge graph.""" 71 | 72 | self.driver.verify_connectivity() 73 | 74 | # Use a parameter for node_uid in the Cypher query 75 | records, summary, keys = self.driver.execute_query( 76 | "MATCH (n {node_uid: $node_uid}) RETURN n", 77 | node_uid=node_uid # Pass node_uid as a parameter 78 | ) 79 | 80 | if records: # Check if any records were returned 81 | record = records[0] # Get the first record 82 | node_data = record['n'] 83 | # Convert Neo4j node properties to NodeData object 84 | return NodeData( 85 | # Assuming node_uid is a property 86 | node_uid=node_data.get('node_uid'), 87 | node_title=node_data.get('node_title'), 88 | node_type=node_data.get('node_type'), 89 | node_description=node_data.get('node_description'), 90 | node_degree=node_data.get('node_degree'), 91 | document_id=node_data.get('document_id'), 92 | edges_to=node_data.get('edges_to', []), 93 | edges_from=node_data.get('edges_from', []), 94 | embedding=node_data.get('embedding', []) 95 | ) 96 | else: 97 | raise KeyError( 98 | f"Error: No node found with node_uid: {node_uid}") 99 | 100 | def update_node(self, node_uid: str, node_data: NodeData) -> None: 101
| """Updates an existing node in the knowledge graph.""" 102 | 103 | self.driver.verify_connectivity() 104 | 105 | # Use parameters for all properties in the Cypher query 106 | summary = self.driver.execute_query( 107 | """ 108 | MATCH (n { node_uid: $node_uid }) 109 | SET n.node_title = $node_title, 110 | n.node_type = $node_type, 111 | n.node_description = $node_description, 112 | n.node_degree = $node_degree, 113 | n.document_id = $document_id, 114 | n.community_id = $community_id, 115 | n.edges_to = $edges_to, 116 | n.edges_from = $edges_from, 117 | n.embedding = $embedding 118 | RETURN n 119 | """, 120 | node_uid=node_uid, 121 | node_title=node_data.node_title, 122 | node_type=node_data.node_type, 123 | node_description=node_data.node_description, 124 | node_degree=node_data.node_degree, 125 | document_id=node_data.document_id, 126 | community_id=node_data.community_id, 127 | edges_to=node_data.edges_to, 128 | edges_from=node_data.edges_from, 129 | embedding=node_data.embedding 130 | ).summary 131 | 132 | def _delete_from_edge_coll(self, edge_uid: str) -> None: 133 | """Method to delete record from edge collection of given kg store""" 134 | raise NotImplementedError("Not implemented for n4j because no collections used.") 135 | 136 | def remove_node(self, node_uid: str) -> None: 137 | """Removes a node from the knowledge graph.""" 138 | 139 | self.driver.verify_connectivity() 140 | 141 | summary = self.driver.execute_query( 142 | "MATCH (n {node_uid: $node_uid}) DETACH DELETE n", 143 | node_uid=node_uid 144 | ).summary 145 | 146 | if summary.counters.nodes_deleted == 0: 147 | raise KeyError( 148 | f"Error: No node found with node_uid: {node_uid}") 149 | return None 150 | 151 | def add_edge(self, edge_data: EdgeData) -> None: 152 | """Adds an edge (relationship) between two entities in the knowledge graph.""" 153 | 154 | # get source and target node data 155 | source_node_data = self.get_node(edge_data.source_uid) 156 | target_node_data = self.get_node(edge_data.target_uid) 157 | 158 | # update source and target node data 159 | source_node_data.edges_to = list( 160 | set(source_node_data.edges_to) | {edge_data.target_uid}) 161 | self.update_node(edge_data.source_uid, source_node_data) 162 | target_node_data.edges_from = list( 163 | set(target_node_data.edges_from) | {edge_data.source_uid}) 164 | self.update_node(edge_data.target_uid, target_node_data) 165 | 166 | self.driver.verify_connectivity() 167 | 168 | if edge_data.directed: 169 | query = """ 170 | MATCH (source:""" + source_node_data.node_type + """ {node_uid: $source_uid}), (target:""" + target_node_data.node_type + """ {node_uid: $target_uid}) 171 | CREATE (source)-[:DIRECTED {description: $description}]->(target) 172 | """ 173 | 174 | elif not edge_data.directed: 175 | query = """ 176 | MATCH (source:""" + source_node_data.node_type + """ {node_uid: $source_uid}), (target:""" + target_node_data.node_type + """ {node_uid: $target_uid}) 177 | CREATE (source)-[:UNDIRECTED {description: $description}]->(target), (target)-[:UNDIRECTED {description: $description}]->(source) 178 | """ 179 | 180 | # Since it's undirected, also add source_uid to target_node_data.edges_from and vice versa 181 | target_node_data.edges_to = list( 182 | set(target_node_data.edges_to) | {edge_data.source_uid}) 183 | self.update_node(edge_data.target_uid, target_node_data) 184 | source_node_data.edges_from = list( 185 | set(source_node_data.edges_from) | {edge_data.target_uid}) 186 | self.update_node(edge_data.source_uid, source_node_data) 187 | 188 | 
summary = self.driver.execute_query( 189 | query, 190 | source_uid=edge_data.source_uid, 191 | target_uid=edge_data.target_uid, 192 | description=edge_data.description 193 | ).summary 194 | 195 | print("#### Created {count} edges {origin} -> {target} in {time} ms.".format( 196 | count=str(summary.counters.relationships_created), 197 | origin=str(edge_data.source_uid), 198 | target=str(edge_data.target_uid), 199 | time=str(summary.result_available_after) 200 | )) 201 | 202 | return None 203 | 204 | def get_edge(self, source_uid: str, target_uid: str) -> EdgeData: 205 | """Retrieves an edge between two entities.""" 206 | 207 | # get source and target node data 208 | source_node_data = self.get_node(source_uid) 209 | target_node_data = self.get_node(target_uid) 210 | 211 | self.driver.verify_connectivity() 212 | 213 | # Use parameters for source_uid and target_uid 214 | records, summary, keys = self.driver.execute_query( 215 | """ 216 | MATCH (source:""" + source_node_data.node_type + """ {node_uid: $source_uid})-[r]->(target:""" + target_node_data.node_type + """ {node_uid: $target_uid}) 217 | RETURN r 218 | """, 219 | source_uid=source_uid, 220 | target_uid=target_uid 221 | ) 222 | 223 | if records: 224 | record = records[0][0] 225 | edge_type = record.type 226 | description = record.get('description') 227 | return EdgeData(source_uid=source_uid, target_uid=target_uid, description=description, edge_uid=self._generate_edge_uid(source_uid, target_uid)) 228 | else: 229 | raise KeyError( 230 | f"Error: No edge found between source_uid: '{source_uid}' and target_uid: '{target_uid}'") 231 | 232 | def update_edge(self, edge_data: EdgeData) -> None: 233 | """Updates an existing edge in the knowledge graph.""" 234 | 235 | # get source and target node data 236 | source_node_data = self.get_node(edge_data.source_uid) 237 | target_node_data = self.get_node(edge_data.target_uid) 238 | 239 | self.driver.verify_connectivity() 240 | 241 | # Use parameters for all properties in the Cypher query 242 | summary = self.driver.execute_query( 243 | """ 244 | MATCH (source:""" + source_node_data.node_type + """ {node_uid: $source_uid})-[r]->(target:""" + target_node_data.node_type + """ {node_uid: $target_uid}) 245 | SET r.description = $description 246 | RETURN r 247 | """, 248 | source_uid=edge_data.source_uid, 249 | target_uid=edge_data.target_uid, 250 | description=edge_data.description 251 | ).summary 252 | return None 253 | 254 | def remove_edge(self, source_uid: str, target_uid: str) -> None: 255 | """Removes an edge between two entities.""" 256 | 257 | try: 258 | # Get source and target node data (this will raise KeyError if not found) 259 | source_node_data = self.get_node(source_uid) 260 | target_node_data = self.get_node(target_uid) 261 | 262 | self.driver.verify_connectivity() 263 | 264 | # Remove edge from source to target 265 | summary = self.driver.execute_query( 266 | """ 267 | MATCH (source:""" + source_node_data.node_type + """ {node_uid: $source_uid})-[r]->(target:""" + target_node_data.node_type + """ {node_uid: $target_uid}) 268 | DELETE r 269 | """, 270 | source_uid=source_uid, 271 | target_uid=target_uid 272 | ).summary 273 | 274 | # Optionally, you might want to check if the edge was actually deleted 275 | if summary.counters.relationships_deleted == 0: 276 | raise KeyError( 277 | f"Error: No edge found between source_uid: '{source_uid}' and target_uid: '{target_uid}'") 278 | except KeyError: 279 | empty_node = NodeData( 280 | node_uid="", 281 | node_title="", 282 | node_type="", 283
| node_description="", 284 | node_degree=0, 285 | document_id="", 286 | ) 287 | source_node_data = empty_node 288 | target_node_data = empty_node 289 | 290 | # Update the node data to reflect the removed edge 291 | try: 292 | source_node_data.edges_to.remove(target_uid) 293 | self.update_node(source_uid, source_node_data) 294 | except ValueError: 295 | pass # Target node not in source's edges_to, likely due to a directed edge 296 | 297 | try: 298 | target_node_data.edges_from.remove(source_uid) 299 | self.update_node(target_uid, target_node_data) 300 | except ValueError: 301 | pass # Source node not in target's edges_from, likely due to a directed edge 302 | return None 303 | 304 | def build_networkx(self) -> nx.Graph: 305 | """Builds the NetworkX representation of the full graph. 306 | https://networkx.org/documentation/stable/index.html 307 | """ 308 | graph = nx.Graph() # Initialize an undirected NetworkX graph 309 | 310 | self.driver.verify_connectivity() 311 | 312 | # 1. Fetch all nodes and their properties 313 | records, summary, keys = self.driver.execute_query("MATCH (n) RETURN n") 314 | 315 | # Check if any records were returned 316 | if records: 317 | for record in records: 318 | node = record["n"] 319 | node_data = { 320 | "node_uid": node.get("node_uid"), 321 | "node_title": node.get("node_title"), 322 | "node_type": node.get("node_type"), 323 | "node_description": node.get("node_description"), 324 | "node_degree": node.get("node_degree"), 325 | "document_id": node.get("document_id"), 326 | "edges_to": node.get("edges_to", []), 327 | "edges_from": node.get("edges_from", []), 328 | "embedding": node.get("embedding", []) 329 | } 330 | graph.add_node(node.get("node_uid"), **node_data) 331 | 332 | # 2. Fetch all relationships and add edges to the graph 333 | records, summary, keys = self.driver.execute_query( 334 | "MATCH (source)-[r]->(target) RETURN source, r, target") 335 | for record in records: 336 | source_uid = record["source"]["node_uid"] 337 | target_uid = record["target"]["node_uid"] 338 | # Add edge attributes if needed (e.g., 'description' from 'r') 339 | graph.add_edge(source_uid, target_uid) 340 | else: 341 | print( 342 | "Warning: No nodes found in the database. Returning an empty NetworkX graph.") 343 | 344 | self.networkx = graph 345 | return graph 346 | 347 | def store_community(self, community: CommunityData) -> None: 348 | """Takes valid graph community data and upserts the database with it. 
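Currently a no-op stub in this Neo4j implementation; community reports are not yet persisted here.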
349 | https://www.nature.com/articles/s41598-019-41695-z 350 | """ 351 | pass 352 | 353 | def _generate_edge_uid(self, source_uid: str, target_uid: str) -> str: 354 | """Generates the edge uid for the network based on source and target node uids.""" 355 | return f"{source_uid}_to_{target_uid}" 356 | 357 | def edge_exist(self, source_uid: str, target_uid: str) -> bool: 358 | """Checks for edge existence and returns a boolean.""" 359 | try: 360 | # Try to retrieve the edge 361 | self.get_edge(source_uid, target_uid) 362 | return True # Edge exists 363 | except KeyError: 364 | return False # Edge does not exist 365 | 366 | def node_exist(self, node_uid: str) -> bool: 367 | """Checks for node existence and returns a boolean.""" 368 | try: 369 | # Try to retrieve the node 370 | self.get_node(node_uid) 371 | return True # Node exists 372 | except KeyError: 373 | return False # Node does not exist 374 | 375 | def get_nearest_neighbors(self, query_vec) -> List[str]: 376 | """Implements nearest neighbor search based on nosql db index.""" 377 | pass 378 | 379 | def get_community(self, community_id: str) -> CommunityData: 380 | """Retrieves the community report for a given community id.""" 381 | pass 382 | 383 | def list_communities(self) -> List[CommunityData]: 384 | """Lists all stored communities for the given network.""" 385 | pass 386 | 387 | def clean_zerodegree_nodes(self) -> None: 388 | """Removes all nodes with degree 0.""" 389 | pass 390 | 391 | def flush_kg(self) -> None: 392 | """Method to wipe the complete datastore of the knowledge graph""" 393 | self.driver.verify_connectivity() 394 | summary = self.driver.execute_query( 395 | """ 396 | MATCH (n) 397 | DETACH DELETE n 398 | """ 399 | ).summary 400 | return None 401 | 402 | 403 | if __name__ == "__main__": 404 | 405 | load_status = dotenv.load_dotenv("Neo4j-39cb28f0-Created-2024-09-23.txt") 406 | if load_status is False: 407 | raise RuntimeError('Environment variables not loaded.') 408 | 409 | URI = str(os.getenv("NEO4J_URI")) 410 | AUTH = (str(os.getenv("NEO4J_USERNAME")), str(os.getenv("NEO4J_PASSWORD"))) 411 | 412 | aura = AuraKG(uri=URI, auth=AUTH) 413 | 414 | # aura.add_node(NodeData(node_uid="test_uid_2", node_title="test2", node_type="test", node_description="test", node_degree=0, document_id="doc test")) 415 | 416 | print(aura.get_node("test_uid")) 417 | print(aura.get_node("test_uid_2")) 418 | 419 | print("Hello World!") 420 | -------------------------------------------------------------------------------- /datamodel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jakobap/graph2nosql/77df8ecba857c61381a37b878d57c20d52ff9834/datamodel/__init__.py -------------------------------------------------------------------------------- /datamodel/data_model.py: -------------------------------------------------------------------------------- 1 | """Module providing Data Model definitions for storing and processing Graph Data.
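Nodes, edges, communities and node embeddings are each modelled as a dataclass below (NodeData, EdgeData, CommunityData, NodeEmbeddings).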
""" 2 | 3 | from dataclasses import dataclass, field 4 | from typing import Tuple 5 | import numpy as np 6 | 7 | 8 | @dataclass 9 | class EdgeData: 10 | """EdgeData data model definition""" 11 | source_uid: str 12 | target_uid: str 13 | description: str 14 | directed: bool = True 15 | edge_uid: str | None = None 16 | document_id: str | None = None 17 | 18 | 19 | @dataclass 20 | class NodeData: 21 | """NodeData data model definition""" 22 | node_uid: str 23 | node_title: str 24 | node_type: str 25 | node_description: str 26 | node_degree: int 27 | document_id: str # identifier for source knowlede base document for this entity 28 | community_id: int | None = None # community id based on source document 29 | edges_to: list[str] = field(default_factory=list) 30 | edges_from: list[str] = field(default_factory=list) # in case of directed graph 31 | embedding: list[float] = field(default_factory=list) # text embedding for node 32 | 33 | 34 | @dataclass 35 | class CommunityData: 36 | """CommunityData data model definition""" 37 | title: str # title of comm, None if not yet computed 38 | community_nodes: set[str] = field(default_factory=set) # list of node_uid belonging to community 39 | summary: str | None = None # description of comm, None if not yet computed 40 | document_id: str | None = None # identifier for source knowlede base document for this entity 41 | community_uid: str | None = None # community identifier 42 | community_embedding: Tuple[float, ...] = field(default_factory=tuple) 43 | rating: int | None = None 44 | rating_explanation: str | None = None 45 | findings: list[dict] | None = None 46 | 47 | def __to_dict__(self): 48 | """Converts the CommunityData instance to a dictionary.""" 49 | return { 50 | "title": self.title, 51 | "community_nodes": list(self.community_nodes), # Convert set to list 52 | "summary": self.summary, 53 | "document_id": self.document_id, 54 | "community_uid": self.community_uid, 55 | "community_embedding": list(self.community_embedding), # Convert tuple to list 56 | "rating": self.rating, 57 | "rating_explanation": self.rating_explanation, 58 | "findings": self.findings 59 | } 60 | 61 | @classmethod 62 | def __from_dict__(cls, data: dict): 63 | """Creates a CommunityData instance from a dictionary.""" 64 | return cls( 65 | title=data.get("title") or "", 66 | community_nodes=set(data.get("community_nodes", [])), # Convert list to set 67 | summary=data.get("summary"), 68 | document_id=data.get("document_id"), 69 | community_uid=data.get("community_uid"), 70 | community_embedding=tuple(data.get("community_embedding", [])), # Convert list to tuple 71 | rating=data.get("rating"), 72 | rating_explanation=data.get("rating_explanation"), 73 | findings=data.get("findings") 74 | ) 75 | 76 | 77 | @dataclass 78 | class NodeEmbeddings: 79 | """Node embeddings class definition.""" 80 | nodes: list[str] 81 | embeddings: np.ndarray 82 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # general dependencies 2 | networkx==3.3 3 | matplotlib==3.9.1 4 | graspologic 5 | numpy 6 | future==1.0.0 7 | python-dotenv==1.0.1 8 | 9 | # db specific dependencies 10 | firebase-admin==6.5.0 11 | neo4j==5.24.0 12 | pymongo==4.10.1 13 | 14 | -e . 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='graph2nosql', 5 | version='0.1', 6 | packages=find_packages(), 7 | install_requires=[ 8 | 'networkx==3.3', 9 | 'matplotlib==3.9.1', 10 | 'graspologic', 11 | 'numpy', 12 | 'firebase-admin==6.5.0', 13 | 'python-dotenv==1.0.1', 14 | 'future==1.0.0', 15 | 'neo4j==5.24.0', 16 | 'pymongo==4.10.1' # MongoDB backend (databases/mdb.py); keeps setup.py in sync with requirements.txt 17 | ] 18 | ) --------------------------------------------------------------------------------