├── .env.example
├── .gitignore
├── .python-version
├── LICENSE
├── README.md
├── assets
    └── hybrid-rag.png
├── crud.py
├── data
    └── blackrock
    │   ├── fink_1.txt
    │   ├── kapito_1.txt
    │   └── wagner_1.txt
├── docker-compose.yml
├── graph_rag.py
├── hybrid_rag.py
├── prompts.py
├── pyproject.toml
├── requirements.txt
├── uv.lock
└── vector_rag.py


/.env.example:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY =
2 | COHERE_API_KEY =


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 | 
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 | 
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 | 
121 | # SageMath parsed files
122 | *.sage.py
123 | 
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 | 
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 | 
137 | # Rope project settings
138 | .ropeproject
139 | 
140 | # mkdocs documentation
141 | /site
142 | 
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 | 
148 | # Pyre type checker
149 | .pyre/
150 | 
151 | # pytype static type analyzer
152 | .pytype/
153 | 
154 | # Cython debug symbols
155 | cython_debug/
156 | 
157 | # PyCharm
158 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
161 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 | .DS_Store
164 | 
165 | # Custom
166 | test_kuzudb
167 | test_lancedb


--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.12


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Kùzu Inc.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Graph RAG and Hybrid RAG Workshop
  2 | 
  3 | Workshop on showing the benefits of Graph RAG and its combination with Vector RAG (Hybrid RAG).
  4 | 
  5 | The following stack is used:
  6 | 
  7 | - Graph database: [Kùzu](https://kuzudb.com/)
  8 | - Vector database: [LanceDB](https://lancedb.com/)
  9 | - LLM prompting: [ell](https://docs.ell.so/), a language model programming framework
 10 | - Embedding model: OpenAI `text-embedding-3-small`
 11 | - Entity & relationship extraction: [LlamaIndex](https://docs.llamaindex.ai/) + OpenAI `gpt-4o-mini`
 12 | - Generation model: OpenAI `gpt-4o-mini`
 13 | - Reranking: Cohere [reranker](https://docs.cohere.com/v2/reference/rerank)
 14 | 
 15 | The system we'll be building has the following high-level architecture:
 16 | 
 17 | ![](./assets/hybrid-rag.png)
 18 | 
 19 | ## Dataset
 20 | 
 21 | The dataset used in this workshop is the [BlackRock founders dataset](./data/blackrock), which
 22 | are three small text files containing information about the founders of the asset management firm
 23 | BlackRock.
 24 | 
 25 | The aim of the workshop is to show how we can build a hybrid RAG system that utilizes a graph
 26 | database and a vector database to answer questions about the dataset.
 27 | 
 28 | ## Setup environment
 29 | 
 30 | We will be using the Python API of Kùzu and a combination of scripts that utilize the required
 31 | dependencies.
 32 | 
 33 | ### `uv` package manager
 34 | 
 35 | It's recommended to use Astral's [`uv` package manager](https://docs.astral.sh/uv/) to manage both
 36 | Python and its dependencies. You can install the required version of Python (3.12) using `uv` with
 37 | the following command:
 38 | 
 39 | ```bash
 40 | uv python install 3.12
 41 | ```
 42 | 
 43 | All the dependencies are indicated in the `pyproject.toml` file and the associated `uv.lock` file
 44 | provided in this repo. Simply sync the dependencies to your local virtual environment with the
 45 | following command:
 46 | 
 47 | ```bash
 48 | # Sync dependencies and allow uv to create a local .venv
 49 | uv sync
 50 | 
 51 | # Run scripts
 52 | uv run crud.py
 53 | uv run graph_rag.py
 54 | uv run vector_rag.py
 55 | uv run hybrid_rag.py
 56 | ```
 57 | 
 58 | 
 59 | ### If using system Python
 60 | 
 61 | > [!NOTE]
 62 | > Alternatively you can use your system's Python installation and pip to install the dependencies
 63 | > via `requirements.txt`.
 64 | 
 65 | ```bash
 66 | # Activate virtual environment
 67 | python -m venv .venv
 68 | source .venv/bin/activate
 69 | python -m pip install -r requirements.txt
 70 | 
 71 | # Run scripts
 72 | python crud.py
 73 | python graph_rag.py
 74 | python vector_rag.py
 75 | python hybrid_rag.py
 76 | ```
 77 | 
 78 | ## Description of steps
 79 | 
 80 | ### 1. Construct the graph
 81 | 
 82 | The script `crud.py` extracts entities and relationships from the provided
 83 | [BlackRock founders dataset](./data/blackrock) and constructs a graph that is stored in Kùzu.
 84 | 
 85 | ```bash
 86 | uv run crud.py
 87 | ```
 88 | 
 89 | The script `crud.py` does the following:
 90 | - Chunk the text, generate embeddings, and stores the embeddings in a [LanceDB](https://lancedb.com/),
 91 | an embedded vector database
 92 | - Use the LlamaIndex framework and its
 93 | [property graph index](https://docs.llamaindex.ai/en/stable/module_guides/indexing/lpg_index_guide/)
 94 | to extract entities and relationships from the unstructured text.
 95 | - Store the extracted entities and relationships in Kùzu, an embedded graph database
 96 | - Augment the graph with additional entities and relationships obtained from external sources
 97 | 
 98 | ### 2. Traditional RAG (via vector search)
 99 | 
100 | The script `vector_rag.py` runs retrieval-augmented generation (RAG) that leverages semantic
101 | (vector) search. To retrieve from the vector database, the script first embeds the question and then
102 | searches for the nearest neighbors using cosine similarity. It then retrieves the context (chunks of
103 | text) that are most similar to the question. The script finally uses the LLM to generate a response
104 | using the retrieved context.
105 | 
106 | ```bash
107 | uv run vector_rag.py
108 | ```
109 | 
110 | ### 3. Graph RAG
111 | 
112 | The script `graph_rag.py` runs retrieval-augmented generation (RAG) that leverages the graph
113 | database to answer questions. To retrieve from the graph database, the script first translates
114 | the question into a Cypher query, which is then executed against the graph database. The retrieved
115 | entities and relationships are then used as context to generate a response using the LLM.
116 | 
117 | ```bash
118 | uv run graph_rag.py
119 | ```
120 | 
121 | ### 4. Hybrid RAG
122 | 
123 | The script `hybrid_rag.py` runs retrieval-augmented generation (RAG) that leverages *both* the
124 | vector database and the graph database. The vector and graph retrieval contexts are concatenated
125 | together and passed to the LLM to generate a response.
126 | 
127 | ```bash
128 | uv run hybrid_rag.py
129 | ```
130 | 
131 | ## Workshop exercises
132 | 
133 | In this section, we'll go through the workshop exercises.
134 | 
135 | ### 1. Traditional RAG
136 | 
137 | You can use the script `vector_rag.py`, that performs naive chunking of the text, creates vector
138 | embeddings of the chunks, and stores them in a vector database.
139 | 
140 | We'll answer the following questions using traditional RAG.
141 | 
142 | #### Q1: Who are the founders of BlackRock? Return the names as a numbered list.
143 | ```
144 | The founders of BlackRock are:
145 | 
146 | 1. Larry Fink
147 | 2. Robert Kapito
148 | 3. Susan Wagner
149 | ```
150 | 
151 | > [!NOTE]
152 | > The above list of the BlackRock founders is not exhaustive. Five more cofounders exist, as is
153 | > revealed in a simple Google search. To address this, you can augment the graph (in the next step) with the additional information to improve the relevance and factual accuracy of the response.
154 | 
155 | #### Q2: Where did Larry Fink graduate from?
156 | ```
157 | Larry Fink graduated from UCLA, where he earned a BA in political science in 1974 and an MBA in 1976.
158 | ```
159 | 
160 | #### Q3: When was Susan Wagner born?
161 | ```
162 | The relevant context does not provide information about Susan Wagner's birth date. Therefore, I cannot answer the question about when Susan Wagner was born.
163 | ```
164 | 
165 | > [!NOTE]
166 | > In the given text data, Susan Wagner's birth date is not mentioned. You can use a graph
167 | > with the additional information added in it to improve the relevance and factual accuracy of the response.
168 | 
169 | #### Q4: How did Larry Fink and Rob Kapito meet?
170 | ```
171 | Larry Fink and Rob Kapito first met while working at First Boston in 1979, where Kapito served in the Public Finance department. This initial meeting laid the foundation for their future partnership when they later co-founded BlackRock in 1988.
172 | ```
173 | 
174 | ### 2. Graph RAG
175 | 
176 | You can use the script `graph_rag.py` to answer the following questions using graph RAG. During
177 | graph construction, we added additional cofounders of BlackRock to the graph, as well as some of
178 | their birth dates.
179 | 
180 | Just like earlier, we'll answer the following questions using Graph RAG.
181 | 
182 | #### Q1: Who are the founders of BlackRock? Return the names as a numbered list.
183 | ```
184 | 1. Barbara Novick  
185 | 2. Hugh Frater  
186 | 3. Keith Anderson  
187 | 4. Ralph Schlosstein  
188 | 5. Robert Kapito  
189 | 6. Larry Fink  
190 | 7. Ben Golub  
191 | 8. Susan Wagner  
192 | ```
193 | 
194 | > [!NOTE]
195 | > Because we augmented the graph with external knowledge, the list of founders is now exhaustive.
196 | 
197 | #### Q2: Where did Larry Fink graduate from?
198 | 
199 | ```
200 | Larry Fink graduated from UCLA.
201 | ```
202 | 
203 | > [!NOTE]
204 | > Unlike vector retrieval-based RAG, the graph only stored the name of the university that Larry Fink
205 | > graduated from, so the LLM only has this context to answer the question as it did.
206 | 
207 | #### Q3: When was Susan Wagner born?
208 | 
209 | ```
210 | Susan Wagner was born on May 26, 1961.
211 | ```
212 | 
213 | > [!NOTE]
214 | > The graph stored the birth dates of Larry Fink and Susan Wagner, so the LLM was able to answer the
215 | > question correctly where the vector search did not.
216 | 
217 | #### Q4: How did Larry Fink and Rob Kapito meet?
218 | 
219 | ```
220 | The relevant context does not provide any information about how Larry Fink and Rob Kapito met. Therefore, I cannot answer the question.
221 | ```
222 | 
223 | > [!NOTE]
224 | > The graph did not store any information about how Larry Fink and Rob Kapito met, so the LLM
225 | > was unable to answer the question in this case.
226 | 
227 | 
228 | ### 3. Hybrid RAG
229 | 
230 | You can use the script `hybrid_rag.py` to answer the following questions using hybrid RAG. The two
231 | retrieval contexts (vector and graph) are concatenated together and passed to the LLM to generate
232 | a response.
233 | 
234 | #### Q1: Who are the founders of BlackRock? Return the names as a numbered list.
235 | ```
236 | The founders of BlackRock are:
237 | 
238 | 1. Barbara Novick  
239 | 2. Hugh Frater  
240 | 3. Keith Anderson  
241 | 4. Ralph Schlosstein  
242 | 5. Robert Kapito  
243 | 6. Larry Fink  
244 | 7. Ben Golub  
245 | 8. Susan Wagner  
246 | ```
247 | 
248 | #### Q2: Where did Larry Fink graduate from?
249 | 
250 | ```
251 | Larry Fink graduated from UCLA, where he earned both a BA in political science in 1974 and an MBA in 1976.
252 | ```
253 | 
254 | #### Q3: When was Susan Wagner born?
255 | 
256 | ```
257 | Susan Wagner was born on May 26, 1961.
258 | ```
259 | 
260 | #### Q4: How did Larry Fink and Rob Kapito meet?
261 | 
262 | ```
263 | Larry Fink and Rob Kapito first met while working at First Boston in 1979, where Kapito served in the Public Finance department. This meeting marked the beginning of their professional relationship, which later led them to become partners in founding BlackRock in 1988.
264 | ```
265 | 
266 | ## Additional exercises
267 | 
268 | - Try to answer questions about data that is not present in the original text. For example, you
269 | can try to ask "When was Barbara Novick born?"
270 | - Try to answer questions that require some commonsense reasoning based on the text provided.
271 | For example, "Does Susan Wagner still work at BlackRock?"
272 | - Try to answer questions that require reasoning over multiple sentences. For example, "Which of
273 | BlackRock's cofounders also worked at First Boston, and where were they born?"
274 | 
275 | 
276 | ## Conclusions
277 | 
278 | The hybrid RAG methodology (with reranking) provides factually accurate
279 | responses in all four cases. In cases where the graph didn't contain the answer, the vector search
280 | provided relevant context that allowed the LLM to generate a response. In cases where the graph
281 | contained the answer but the raw text didn't, hybrid RAG was able to rerank the results from the
282 | graph and vector search in way that on average, provided relevant responses.
283 | 
284 | Note that the hybrid RAG system is not perfect. If the information is not present (either explicitly
285 | or implicitly in the text), it cannot provide an answer to the question because the LLM cannot
286 | reason over the required information to formulate a response.
287 | 
288 | The key takeaways are:
289 | - Graphs can be a helpful tool for factual (extractive) question-answering tasks in RAG
290 | - Traditional RAG (vector-based) is useful for abstractive question-answering tasks, where the
291 | information is not explicitly stated in the exact words of the question
292 | - Just like data quality is of paramount importance in any retrieval system, for hybrid or Graph
293 | RAG, the quality of the graph (entities and relationships) is crucial to the quality of the responses
294 | generated
295 | 
296 | Feel free to clone/fork this repo and try out the workflow on your own datasets!


--------------------------------------------------------------------------------
/assets/hybrid-rag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kuzudb/graph-rag-workshop/cf29d2d5b97d340df7295f656fa6813b0ba9ded6/assets/hybrid-rag.png


--------------------------------------------------------------------------------
/crud.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import shutil
  3 | import warnings
  4 | from typing import Literal
  5 | 
  6 | import kuzu
  7 | import nest_asyncio
  8 | import openai
  9 | from dotenv import load_dotenv
 10 | from llama_index.core import PropertyGraphIndex, SimpleDirectoryReader, VectorStoreIndex
 11 | from llama_index.core.indices.property_graph import SchemaLLMPathExtractor
 12 | from llama_index.core.ingestion import IngestionPipeline
 13 | from llama_index.core.node_parser import SentenceSplitter
 14 | from llama_index.embeddings.openai import OpenAIEmbedding
 15 | from llama_index.graph_stores.kuzu import KuzuPropertyGraphStore
 16 | from llama_index.llms.openai import OpenAI
 17 | from llama_index.vector_stores.lancedb import LanceDBVectorStore
 18 | 
 19 | # Load environment variables
 20 | load_dotenv()
 21 | SEED = 42
 22 | nest_asyncio.apply()
 23 | 
 24 | OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 25 | COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
 26 | 
 27 | assert OPENAI_API_KEY is not None, "OPENAI_API_KEY is not set"
 28 | assert COHERE_API_KEY is not None, "COHERE_API_KEY is not set"
 29 | 
 30 | # Set up the embedding model and LLM
 31 | embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")
 32 | extraction_llm = OpenAI(model="gpt-4o-mini", temperature=0.0, seed=SEED)
 33 | generation_llm = OpenAI(model="gpt-4o-mini", temperature=0.3, seed=SEED)
 34 | 
 35 | # Load the dataset on Larry Fink
 36 | original_documents = SimpleDirectoryReader("./data/blackrock").load_data()
 37 | # print(len(original_documents))
 38 | 
 39 | # --- Step 1: Chunk and store the vector embeddings in LanceDB ---
 40 | shutil.rmtree("./test_lancedb", ignore_errors=True)
 41 | 
 42 | openai.api_key = OPENAI_API_KEY
 43 | 
 44 | vector_store = LanceDBVectorStore(
 45 |     uri="./test_lancedb",
 46 |     mode="overwrite",
 47 | )
 48 | 
 49 | pipeline = IngestionPipeline(
 50 |     transformations=[
 51 |         SentenceSplitter(chunk_size=1024, chunk_overlap=32),
 52 |         OpenAIEmbedding(),
 53 |     ],
 54 |     vector_store=vector_store,
 55 | )
 56 | pipeline.run(documents=original_documents)
 57 | 
 58 | # Create the vector index
 59 | vector_index = VectorStoreIndex.from_vector_store(
 60 |     vector_store,
 61 |     embed_model=OpenAIEmbedding(model_name="text-embedding-3-small"),
 62 |     llm=OpenAI(model="gpt-4o-mini", temperature=0.3, seed=SEED),
 63 | )
 64 | 
 65 | # --- Step 2: Construct the graph in KùzuDB ---
 66 | 
 67 | shutil.rmtree("test_kuzudb", ignore_errors=True)
 68 | db = kuzu.Database("test_kuzudb")
 69 | 
 70 | warnings.filterwarnings("ignore")
 71 | 
 72 | # Define the allowed entities and relationships
 73 | entities = Literal["PERSON", "CITY", "STATE", "UNIVERSITY", "ORGANIZATION"]
 74 | relations = Literal[
 75 |     "STUDIED_AT",
 76 |     "IS_FOUNDER_OF",
 77 |     "IS_CEO_OF",
 78 |     "BORN_IN",
 79 |     "IS_CITY_IN",
 80 | ]
 81 | 
 82 | validation_schema = [
 83 |     ("PERSON", "STUDIED_AT", "UNIVERSITY"),
 84 |     ("PERSON", "IS_CEO_OF", "ORGANIZATION"),
 85 |     ("PERSON", "IS_FOUNDER_OF", "ORGANIZATION"),
 86 |     ("PERSON", "BORN_IN", "CITY"),
 87 |     ("CITY", "IS_CITY_IN", "STATE"),
 88 | ]
 89 | 
 90 | graph_store = KuzuPropertyGraphStore(
 91 |     db,
 92 |     has_structured_schema=True,
 93 |     relationship_schema=validation_schema,
 94 | )
 95 | 
 96 | schema_path_extractor = SchemaLLMPathExtractor(
 97 |     llm=extraction_llm,
 98 |     possible_entities=entities,
 99 |     possible_relations=relations,
100 |     kg_validation_schema=validation_schema,
101 |     strict=True,
102 | )
103 | 
104 | kg_index = PropertyGraphIndex.from_documents(
105 |     original_documents,
106 |     embed_model=embed_model,
107 |     kg_extractors=[schema_path_extractor],
108 |     property_graph_store=graph_store,
109 |     show_progress=True,
110 | )
111 | 
112 | # Step 3: Augment the graph with external knowledge and fix erroneous relationships
113 | 
114 | # Say we have this knowledge obtained from other sources about additional founders of BlackRock
115 | additional_founders = [
116 |     "Ben Golub",
117 |     "Barbara Novick",
118 |     "Ralph Schlosstein",
119 |     "Keith Anderson",
120 |     "Hugh Frater",
121 | ]
122 | 
123 | # Open a connection to the database to modify the graph
124 | conn = kuzu.Connection(db)
125 | 
126 | # Add additional founder nodes of type PERSON to the graph store
127 | for founder in additional_founders:
128 |     conn.execute(
129 |         """
130 |         MATCH (o:ORGANIZATION {id: "BlackRock"})
131 |         MERGE (p:PERSON {id: $name, name: $name})
132 |         MERGE (p)-[r:LINKS]->(o)
133 |         SET r.label = "IS_FOUNDER_OF"
134 |         """,
135 |         parameters={"name": founder},
136 |     )
137 | 
138 | # Alter PERSON schema and add a birth_date property
139 | try:
140 |     conn.execute("ALTER TABLE PERSON ADD birth_date STRING")
141 | except RuntimeError:
142 |     pass
143 | 
144 | names = ["Larry Fink", "Susan Wagner", "Robert Kapito"]
145 | dates = ["1952-11-02", "1961-05-26", "1957-02-08"]
146 | 
147 | for name, date in zip(names, dates):
148 |     conn.execute(
149 |         """
150 |     MERGE (p:PERSON {id: $name})
151 |     ON MATCH SET p.birth_date = $date
152 |     """,
153 |         parameters={"name": name, "date": date},
154 |     )
155 | 
156 | conn.close()
157 | 


--------------------------------------------------------------------------------
/data/blackrock/fink_1.txt:
--------------------------------------------------------------------------------
1 | Larry Fink is Founder, Chairman and CEO of BlackRock. He and seven partners founded BlackRock in 1988, and under his leadership, the firm has grown into a global leader in investment and technology solutions to help investors build better financial futures. Today, BlackRock is trusted to manage $10 trillion in assets, more than any other investment firm in the world.
2 | 
3 | Mr. Fink was was born on November 2, 1952, in Van Nuys, California. He earned an MBA from UCLA in 1976 and a BA in political science, also from UCLA, in 1974.
4 | 


--------------------------------------------------------------------------------
/data/blackrock/kapito_1.txt:
--------------------------------------------------------------------------------
1 | Robert Kapito is one of the most successful persons in the financial industry globally and well recognized as the President and cofounder of BlackRock. His career, full of innovation and top-class leadership, has changed and shaped the outlook of the world’s financial scene.
2 | 
3 | Robert Kapito was born on February 8, 1957, in Monticello, New York.
4 | 
5 | Mr. Kapito started his professional career in the First Boston in 1979 where he served in the Public Finance department. This work was the onset of his career, which was long and proved to be significant in finance. During his work at First Boston he first met Larry Fink, who would later be his partner at BlackRock.


--------------------------------------------------------------------------------
/data/blackrock/wagner_1.txt:
--------------------------------------------------------------------------------
1 | Susan Wagner is a cofounder and director of asset manager BlackRock, which she started with Larry Fink and others in 1988. She cofounded the company at age 26 and went on to serve as chief operating officer and vice chairman. Wagner oversaw BlackRock's 2009 merger with Barclay's Global Investors, which transformed the firm into the world's largest asset manager.
2 | 
3 | Prior to founding BlackRock, Wagner worked as a vice president of the mortgage finance group at Lehman Brothers. She retired in 2012 but remains on BlackRock's board and also serves as a director of Apple and startups Color Health and Samsara.


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   explorer:
 3 |     image: kuzudb/explorer:0.6.1
 4 |     environment:
 5 |       - MODE=READ_ONLY
 6 |     ports:
 7 |       - 8000:8000
 8 |     volumes:
 9 |       - ./test_kuzudb:/database
10 | 


--------------------------------------------------------------------------------
/graph_rag.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | import kuzu
  4 | from dotenv import load_dotenv
  5 | from ell import ell
  6 | from openai import OpenAI
  7 | 
  8 | import prompts
  9 | 
 10 | load_dotenv()
 11 | OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 12 | MODEL_NAME = "gpt-4o-mini"
 13 | SEED = 42
 14 | 
 15 | 
 16 | class GraphRAG:
 17 |     """Graph Retrieval Augmented Generation from a Kùzu database."""
 18 | 
 19 |     def __init__(self, db_path="./test_kuzudb"):
 20 |         self.db = kuzu.Database(db_path)
 21 |         self.conn = kuzu.Connection(self.db)
 22 | 
 23 |     def get_schema(self) -> str:
 24 |         """Provides the graph schema information for the purposes of Cypher generation via an LLM."""
 25 |         node_properties = []
 26 |         node_table_names = self.conn._get_node_table_names()
 27 |         for table_name in node_table_names:
 28 |             current_table_schema = {"properties": [], "label": table_name}
 29 |             properties = self.conn._get_node_property_names(table_name)
 30 |             for property_name in properties:
 31 |                 property_type = properties[property_name]["type"]
 32 |                 list_type_flag = ""
 33 |                 if properties[property_name]["dimension"] > 0:
 34 |                     if "shape" in properties[property_name]:
 35 |                         for s in properties[property_name]["shape"]:
 36 |                             list_type_flag += "[%s]" % s
 37 |                     else:
 38 |                         for i in range(properties[property_name]["dimension"]):
 39 |                             list_type_flag += "[]"
 40 |                 property_type += list_type_flag
 41 |                 current_table_schema["properties"].append((property_name, property_type))
 42 |             node_properties.append(current_table_schema)
 43 | 
 44 |         relationships = []
 45 |         rel_tables = self.conn._get_rel_table_names()
 46 |         for table in rel_tables:
 47 |             relationships.append("(:%s)-[:%s]->(:%s)" % (table["src"], table["name"], table["dst"]))
 48 | 
 49 |         rel_properties = []
 50 |         for table in rel_tables:
 51 |             table_name = table["name"]
 52 |             current_table_schema = {"properties": [], "label": table_name}
 53 |             query_result = self.conn.execute(f"CALL table_info('{table_name}') RETURN *;")
 54 |             while query_result.has_next():
 55 |                 row = query_result.get_next()
 56 |                 prop_name = row[1]
 57 |                 prop_type = row[2]
 58 |                 current_table_schema["properties"].append((prop_name, prop_type))
 59 |             rel_properties.append(current_table_schema)
 60 | 
 61 |         schema = (
 62 |             f"Node properties: {node_properties}\n"
 63 |             f"Relationships properties: {rel_properties}\n"
 64 |             f"Relationships: {relationships}\n"
 65 |         )
 66 |         return schema
 67 | 
 68 |     def query(self, question: str, cypher: str) -> str:
 69 |         """Use the generated Cypher statement to query the graph database."""
 70 |         response = self.conn.execute(cypher)
 71 |         result = []
 72 |         while response.has_next():
 73 |             item = response.get_next()
 74 |             if item not in result:
 75 |                 result.extend(item)
 76 | 
 77 |         # Handle both hashable and non-hashable types
 78 |         if all(isinstance(x, (str, int, float, bool, tuple)) for x in result):
 79 |             final_result = {question: list(set(result))}
 80 |         else:
 81 |             # For non-hashable types, we can't use set() directly
 82 |             # Instead, we'll use a list comprehension to remove duplicates
 83 |             final_result = {question: [x for i, x in enumerate(result) if x not in result[:i]]}
 84 | 
 85 |         return final_result
 86 | 
 87 |     @ell.simple(model=MODEL_NAME, temperature=0.1, client=OpenAI(api_key=OPENAI_API_KEY), seed=SEED)
 88 |     def generate_cypher(self, question: str) -> str:
 89 |         return [
 90 |             ell.system(prompts.CYPHER_SYSTEM_PROMPT),
 91 |             ell.user(
 92 |                 prompts.CYPHER_USER_PROMPT.format(schema=self.get_schema(), question=question)
 93 |             ),
 94 |         ]
 95 | 
 96 |     @ell.simple(model=MODEL_NAME, temperature=0.3, client=OpenAI(api_key=OPENAI_API_KEY), seed=SEED)
 97 |     def retrieve(self, question: str, context: str) -> str:
 98 |         return [
 99 |             ell.system(prompts.RAG_SYSTEM_PROMPT),
100 |             ell.user(prompts.RAG_USER_PROMPT.format(question=question, context=context)),
101 |         ]
102 | 
103 |     def run(self, question: str) -> str:
104 |         cypher = self.generate_cypher(question)
105 |         print(f"\n{cypher}\n")
106 |         context = self.query(question, cypher)
107 |         return self.retrieve(question, context)
108 | 
109 | 
110 | if __name__ == "__main__":
111 |     graph_rag = GraphRAG("./test_kuzudb")
112 |     question = "Who are the founders of BlackRock? Return the names as a numbered list."
113 |     response = graph_rag.run(question)
114 |     print(f"Q1: {question}\n\n{response}\n---\n")
115 | 
116 |     question = "Where did Larry Fink graduate from?"
117 |     response = graph_rag.run(question)
118 |     print(f"Q2: {question}\n\n{response}\n---\n")
119 | 
120 |     question = "When was Susan Wagner born?"
121 |     response = graph_rag.run(question)
122 |     print(f"Q3: {question}\n\n{response}\n---\n")
123 | 
124 |     question = "How did Larry Fink and Rob Kapito meet?"
125 |     response = graph_rag.run(question)
126 |     print(f"---\nQ4: {question}\n\n{response}")
127 | 


--------------------------------------------------------------------------------
/hybrid_rag.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import cohere
 4 | from dotenv import load_dotenv
 5 | from ell import ell
 6 | from openai import OpenAI
 7 | 
 8 | import prompts
 9 | from graph_rag import GraphRAG
10 | from vector_rag import VectorRAG
11 | 
12 | load_dotenv()
13 | MODEL_NAME = "gpt-4o-mini"
14 | COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
15 | OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
16 | SEED = 42
17 | 
18 | 
19 | class HybridRAG:
20 |     def __init__(
21 |         self,
22 |         graph_db_path="./test_kuzudb",
23 |         vector_db_path="./test_lancedb",
24 |     ):
25 |         self.graph_rag = GraphRAG(graph_db_path)
26 |         self.vector_rag = VectorRAG(vector_db_path)
27 |         self.co = cohere.ClientV2(COHERE_API_KEY)
28 | 
29 |     @ell.simple(model=MODEL_NAME, temperature=0.3, client=OpenAI(api_key=OPENAI_API_KEY), seed=SEED)
30 |     def hybrid_rag(self, question: str, context: str) -> str:
31 |         return [
32 |             ell.system(prompts.RAG_SYSTEM_PROMPT),
33 |             ell.user(prompts.RAG_USER_PROMPT.format(question=question, context=context)),
34 |         ]
35 | 
36 |     def run(self, question: str) -> str:
37 |         question_embedding = self.vector_rag.embed(question)
38 |         vector_docs = self.vector_rag.query(question_embedding)
39 |         vector_docs = [doc["text"] for doc in vector_docs]
40 | 
41 |         cypher = self.graph_rag.generate_cypher(question)
42 |         graph_docs = self.graph_rag.query(question, cypher)
43 | 
44 |         docs = [graph_docs] + vector_docs
45 |         # Ensure the doc contents are strings
46 |         docs = [str(doc) for doc in docs]
47 | 
48 |         combined_context = self.co.rerank(
49 |             model="rerank-english-v3.0",
50 |             query=question,
51 |             documents=docs,
52 |             top_n=20,
53 |             return_documents=True,
54 |         )
55 |         return self.hybrid_rag(question, combined_context)
56 | 
57 | 
58 | if __name__ == "__main__":
59 |     hybrid_rag = HybridRAG(
60 |         graph_db_path="./test_kuzudb",
61 |         vector_db_path="./test_lancedb"
62 |     )
63 |     question = "Who are the founders of BlackRock? Return the names as a numbered list."
64 |     response = hybrid_rag.run(question)
65 |     print(f"Q1: {question}\n\n{response}")
66 | 
67 |     question = "Where did Larry Fink graduate from?"
68 |     response = hybrid_rag.run(question)
69 |     print(f"---\nQ2: {question}\n\n{response}")
70 | 
71 |     question = "When was Susan Wagner born?"
72 |     response = hybrid_rag.run(question)
73 |     print(f"---\nQ3: {question}\n\n{response}")
74 | 
75 |     question = "How did Larry Fink and Rob Kapito meet?"
76 |     response = hybrid_rag.run(question)
77 |     print(f"---\nQ4: {question}\n\n{response}")
78 | 


--------------------------------------------------------------------------------
/prompts.py:
--------------------------------------------------------------------------------
 1 | RAG_SYSTEM_PROMPT = """
 2 | You are an AI assistant using Retrieval-Augmented Generation (RAG).
 3 | RAG enhances your responses by retrieving relevant information from a knowledge base.
 4 | You will be provided with a question and relevant context. Use only this context to answer the question.
 5 | Do not make up an answer. If you don't know the answer, say so clearly.
 6 | Always strive to provide concise, helpful, and context-aware answers.
 7 | """
 8 | 
 9 | CYPHER_SYSTEM_PROMPT = """
10 | You are an expert in translating natural language questions into Cypher statements.
11 | You will be provided with a question and a graph schema.
12 | Use only the provided relationship types and properties in the schema to generate a Cypher statement.
13 | The Cypher statement could retrieve nodes, relationships, or both.
14 | Do not include any explanations or apologies in your responses.
15 | Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
16 | """
17 | 
18 | RAG_USER_PROMPT = """
19 | Given the following question and relevant context, please provide a comprehensive and accurate response:
20 | 
21 | Question: {question}
22 | 
23 | Relevant context:
24 | {context}
25 | 
26 | Response:
27 | """
28 | 
29 | CYPHER_USER_PROMPT = """
30 | Task: Generate Cypher statement to query a graph database.
31 | Instructions:
32 | Schema:
33 | {schema}
34 | 
35 | The question is:
36 | {question}
37 | 
38 | Instructions:
39 | Generate the Kùzu dialect of Cypher with the following rules in mind:
40 | 1. Do not include triple backticks ``` in your response. Return only Cypher.
41 | 2. Only use the nodes and relationships provided in the schema.
42 | 3. Use only the provided node and relationship types and properties in the schema.
43 | """
44 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "graph-rag-workshop"
 3 | version = "0.1.0"
 4 | description = "Add your description here"
 5 | readme = "README.md"
 6 | requires-python = ">=3.12"
 7 | dependencies = [
 8 |     "cohere==5.11.1",
 9 |     "ell-ai==0.0.14",
10 |     "kuzu==0.6.1",
11 |     "lancedb==0.14.0",
12 |     "llama-index==0.11.19",
13 |     "llama-index-embeddings-openai==0.2.5",
14 |     "llama-index-graph-stores-kuzu==0.3.2",
15 |     "llama-index-vector-stores-lancedb==0.2.4",
16 |     "python-dotenv==1.0.1",
17 | ]
18 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | kuzu==0.6.1
2 | lancedb==0.14.0
3 | python-dotenv==1.0.1
4 | ell-ai==0.0.14
5 | llama-index==0.11.19
6 | llama-index-embeddings-openai==0.2.5
7 | llama-index-vector-stores-lancedb==0.2.4
8 | llama-index-graph-stores-kuzu==0.3.2
9 | cohere==5.11.1


--------------------------------------------------------------------------------
/vector_rag.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import lancedb
 4 | from dotenv import load_dotenv
 5 | from ell import ell
 6 | from openai import OpenAI
 7 | 
 8 | import prompts
 9 | 
10 | load_dotenv()
11 | OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
12 | MODEL_NAME = "gpt-4o-mini"
13 | SEED = 42
14 | 
15 | 
16 | class VectorRAG:
17 |     def __init__(self, db_path: str, table_name: str = "vectors"):
18 |         load_dotenv()
19 |         self.openai_client = OpenAI(api_key=OPENAI_API_KEY)
20 |         self.db = lancedb.connect(db_path)
21 |         self.table = self.db.open_table(table_name)
22 | 
23 |     def query(self, query_vector: list, limit: int = 10) -> list:
24 |         search_result = (
25 |             self.table.search(query_vector).metric("cosine").select(["text"]).limit(limit)
26 |         ).to_list()
27 |         return search_result if search_result else None
28 | 
29 |     def embed(self, query: str) -> list:
30 |         # For now just using an OpenAI embedding model
31 |         response = self.openai_client.embeddings.create(model="text-embedding-3-small", input=query)
32 |         return response.data[0].embedding
33 | 
34 |     @ell.simple(model=MODEL_NAME, temperature=0.3, client=OpenAI(api_key=OPENAI_API_KEY), seed=SEED)
35 |     def retrieve(self, question: str, context: str) -> str:
36 |         return [
37 |             ell.system(prompts.RAG_SYSTEM_PROMPT),
38 |             ell.user(prompts.RAG_USER_PROMPT.format(question=question, context=context)),
39 |         ]
40 | 
41 |     def run(self, question: str) -> str:
42 |         question_embedding = self.embed(question)
43 |         context = self.query(question_embedding)
44 |         return self.retrieve(question, context)
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     vector_rag = VectorRAG("./test_lancedb")
49 |     question = "Who are the founders of BlackRock? Return the names as a numbered list."
50 |     response = vector_rag.run(question)
51 |     print(f"Q1: {question}\n\n{response}")
52 | 
53 |     question = "Where did Larry Fink graduate from?"
54 |     response = vector_rag.run(question)
55 |     print(f"---\nQ2: {question}\n\n{response}")
56 | 
57 |     question = "When was Susan Wagner born?"
58 |     response = vector_rag.run(question)
59 |     print(f"---\nQ3: {question}\n\n{response}")
60 | 
61 |     question = "How did Larry Fink and Rob Kapito meet?"
62 |     response = vector_rag.run(question)
63 |     print(f"---\nQ4: {question}\n\n{response}")
64 | 


--------------------------------------------------------------------------------