├── .env.example
├── .gitignore
├── .python-version
├── LICENSE
├── README.md
├── assets
│   └── hybrid-rag.png
├── crud.py
├── data
│   └── blackrock
│       ├── fink_1.txt
│       ├── kapito_1.txt
│       └── wagner_1.txt
├── docker-compose.yml
├── graph_rag.py
├── hybrid_rag.py
├── prompts.py
├── pyproject.toml
├── requirements.txt
├── uv.lock
└── vector_rag.py

--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
OPENAI_API_KEY =
COHERE_API_KEY =

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.DS_Store

# Custom
test_kuzudb
test_lancedb

--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
3.12

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Kùzu Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Graph RAG and Hybrid RAG Workshop

A workshop demonstrating the benefits of Graph RAG and how it can be combined with Vector RAG
(Hybrid RAG).
The following stack is used:

- Graph database: [Kùzu](https://kuzudb.com/)
- Vector database: [LanceDB](https://lancedb.com/)
- LLM prompting: [ell](https://docs.ell.so/), a language model programming framework
- Embedding model: OpenAI `text-embedding-3-small`
- Entity & relationship extraction: [LlamaIndex](https://docs.llamaindex.ai/) + OpenAI `gpt-4o-mini`
- Generation model: OpenAI `gpt-4o-mini`
- Reranking: Cohere [reranker](https://docs.cohere.com/v2/reference/rerank)

The system we'll be building has the following high-level architecture:

![](./assets/hybrid-rag.png)

## Dataset

The dataset used in this workshop is the [BlackRock founders dataset](./data/blackrock), which
consists of three small text files containing information about the founders of the asset
management firm BlackRock.

The aim of the workshop is to show how we can build a hybrid RAG system that utilizes a graph
database and a vector database to answer questions about the dataset.

## Setup environment

We will be using Kùzu's Python API and a set of scripts that build on the dependencies listed
below.

### `uv` package manager

It's recommended to use Astral's [`uv` package manager](https://docs.astral.sh/uv/) to manage both
Python and its dependencies. You can install the required version of Python (3.12) using `uv` with
the following command:

```bash
uv python install 3.12
```

All the dependencies are declared in the `pyproject.toml` file and the associated `uv.lock` file
provided in this repo. Simply sync the dependencies to your local virtual environment with the
following command:

```bash
# Sync dependencies and allow uv to create a local .venv
uv sync

# Run scripts
uv run crud.py
uv run graph_rag.py
uv run vector_rag.py
uv run hybrid_rag.py
```

### If using system Python

> [!NOTE]
> Alternatively, you can use your system's Python installation and pip to install the dependencies
> via `requirements.txt`.

```bash
# Activate virtual environment
python -m venv .venv
source .venv/bin/activate
python -m pip install -r requirements.txt

# Run scripts
python crud.py
python graph_rag.py
python vector_rag.py
python hybrid_rag.py
```

## Description of steps

### 1. Construct the graph

The script `crud.py` extracts entities and relationships from the provided
[BlackRock founders dataset](./data/blackrock) and constructs a graph that is stored in Kùzu.

```bash
uv run crud.py
```

The script `crud.py` does the following (a condensed sketch follows this list):
- Chunks the text, generates embeddings, and stores them in [LanceDB](https://lancedb.com/),
  an embedded vector database
- Uses the LlamaIndex framework and its
  [property graph index](https://docs.llamaindex.ai/en/stable/module_guides/indexing/lpg_index_guide/)
  to extract entities and relationships from the unstructured text
- Stores the extracted entities and relationships in Kùzu, an embedded graph database
- Augments the graph with additional entities and relationships obtained from external sources
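The sketch below condenses the graph-construction core of `crud.py` down to its key calls; the
full script additionally restricts the allowed entity labels and builds the LanceDB vector index:

```python
import kuzu
from llama_index.core import PropertyGraphIndex, SimpleDirectoryReader
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.kuzu import KuzuPropertyGraphStore
from llama_index.llms.openai import OpenAI

documents = SimpleDirectoryReader("./data/blackrock").load_data()

# Only triples that match this validation schema are accepted during extraction
validation_schema = [
    ("PERSON", "STUDIED_AT", "UNIVERSITY"),
    ("PERSON", "IS_CEO_OF", "ORGANIZATION"),
    ("PERSON", "IS_FOUNDER_OF", "ORGANIZATION"),
    ("PERSON", "BORN_IN", "CITY"),
    ("CITY", "IS_CITY_IN", "STATE"),
]

graph_store = KuzuPropertyGraphStore(
    kuzu.Database("test_kuzudb"),
    has_structured_schema=True,
    relationship_schema=validation_schema,
)

# An LLM extracts (subject, predicate, object) triples from the raw text;
# strict=True drops any triple that does not conform to the schema above
kg_index = PropertyGraphIndex.from_documents(
    documents,
    embed_model=OpenAIEmbedding(model_name="text-embedding-3-small"),
    kg_extractors=[
        SchemaLLMPathExtractor(
            llm=OpenAI(model="gpt-4o-mini", temperature=0.0),
            kg_validation_schema=validation_schema,
            strict=True,
        )
    ],
    property_graph_store=graph_store,
)
```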
### 2. Traditional RAG (via vector search)

The script `vector_rag.py` runs retrieval-augmented generation (RAG) that leverages semantic
(vector) search. To retrieve from the vector database, the script first embeds the question and then
searches for the nearest neighbors using cosine similarity. It then retrieves the context (chunks of
text) that are most similar to the question. The script finally uses the LLM to generate a response
using the retrieved context. A minimal sketch of this retrieval loop is shown below.

```bash
uv run vector_rag.py
```
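For reference, the core retrieval loop of `vector_rag.py` boils down to the following (a minimal
sketch; the full script wraps this logic in a `VectorRAG` class and passes the retrieved chunks to
the LLM):

```python
import lancedb
from openai import OpenAI

openai_client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
table = lancedb.connect("./test_lancedb").open_table("vectors")

def retrieve_context(question: str, limit: int = 10) -> list:
    # Embed the question with the same model that was used at indexing time
    embedding = (
        openai_client.embeddings.create(model="text-embedding-3-small", input=question)
        .data[0]
        .embedding
    )
    # Nearest-neighbor search over the stored chunks using cosine similarity
    return table.search(embedding).metric("cosine").select(["text"]).limit(limit).to_list()
```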
### 3. Graph RAG

The script `graph_rag.py` runs retrieval-augmented generation (RAG) that leverages the graph
database to answer questions. To retrieve from the graph database, the script first translates
the question into a Cypher query, which is then executed against the graph database. The retrieved
entities and relationships are then used as context to generate a response using the LLM.

```bash
uv run graph_rag.py
```

### 4. Hybrid RAG

The script `hybrid_rag.py` runs retrieval-augmented generation (RAG) that leverages *both* the
vector database and the graph database. The vector and graph retrieval contexts are concatenated
together and passed to the LLM to generate a response.

```bash
uv run hybrid_rag.py
```

## Workshop exercises

In this section, we'll go through the workshop exercises.

### 1. Traditional RAG

You can use the script `vector_rag.py`, which performs naive chunking of the text, creates vector
embeddings of the chunks, and stores them in a vector database.

We'll answer the following questions using traditional RAG.

#### Q1: Who are the founders of BlackRock? Return the names as a numbered list.
```
The founders of BlackRock are:

1. Larry Fink
2. Robert Kapito
3. Susan Wagner
```

> [!NOTE]
> The above list of the BlackRock founders is not exhaustive. Five more cofounders exist, as a
> simple Google search reveals. To address this, you can augment the graph (in the next step) with
> the additional information to improve the relevance and factual accuracy of the response.

#### Q2: Where did Larry Fink graduate from?
```
Larry Fink graduated from UCLA, where he earned a BA in political science in 1974 and an MBA in 1976.
```

#### Q3: When was Susan Wagner born?
```
The relevant context does not provide information about Susan Wagner's birth date. Therefore, I cannot answer the question about when Susan Wagner was born.
```

> [!NOTE]
> Susan Wagner's birth date is not mentioned in the given text data. Augmenting the graph with this
> information improves the relevance and factual accuracy of the response.

#### Q4: How did Larry Fink and Rob Kapito meet?
```
Larry Fink and Rob Kapito first met while working at First Boston in 1979, where Kapito served in the Public Finance department. This initial meeting laid the foundation for their future partnership when they later co-founded BlackRock in 1988.
```

### 2. Graph RAG

You can use the script `graph_rag.py` to answer the following questions using graph RAG. During
graph construction, we added additional cofounders of BlackRock to the graph, as well as some of
their birth dates.

As before, we'll answer the same four questions, this time using Graph RAG.

#### Q1: Who are the founders of BlackRock? Return the names as a numbered list.
```
1. Barbara Novick
2. Hugh Frater
3. Keith Anderson
4. Ralph Schlosstein
5. Robert Kapito
6. Larry Fink
7. Ben Golub
8. Susan Wagner
```

> [!NOTE]
> Because we augmented the graph with external knowledge, the list of founders is now exhaustive.

#### Q2: Where did Larry Fink graduate from?

```
Larry Fink graduated from UCLA.
```

> [!NOTE]
> Unlike vector retrieval-based RAG, the graph stores only the name of the university that Larry
> Fink graduated from, so that is the only context the LLM had available when answering.

#### Q3: When was Susan Wagner born?

```
Susan Wagner was born on May 26, 1961.
```

> [!NOTE]
> The graph stored the birth dates of Larry Fink and Susan Wagner, so the LLM was able to answer the
> question correctly where the vector search did not.

#### Q4: How did Larry Fink and Rob Kapito meet?

```
The relevant context does not provide any information about how Larry Fink and Rob Kapito met. Therefore, I cannot answer the question.
```

> [!NOTE]
> The graph did not store any information about how Larry Fink and Rob Kapito met, so the LLM
> was unable to answer the question in this case.


### 3. Hybrid RAG

You can use the script `hybrid_rag.py` to answer the following questions using hybrid RAG. The two
retrieval contexts (vector and graph) are concatenated together and passed to the LLM to generate
a response.

#### Q1: Who are the founders of BlackRock? Return the names as a numbered list.
```
The founders of BlackRock are:

1. Barbara Novick
2. Hugh Frater
3. Keith Anderson
4. Ralph Schlosstein
5. Robert Kapito
6. Larry Fink
7. Ben Golub
8. Susan Wagner
```

#### Q2: Where did Larry Fink graduate from?

```
Larry Fink graduated from UCLA, where he earned both a BA in political science in 1974 and an MBA in 1976.
```

#### Q3: When was Susan Wagner born?

```
Susan Wagner was born on May 26, 1961.
```

#### Q4: How did Larry Fink and Rob Kapito meet?

```
Larry Fink and Rob Kapito first met while working at First Boston in 1979, where Kapito served in the Public Finance department. This meeting marked the beginning of their professional relationship, which later led them to become partners in founding BlackRock in 1988.
```

## Additional exercises

- Try to answer questions about data that is not present in the original text. For example, you
  can try to ask "When was Barbara Novick born?"
- Try to answer questions that require some commonsense reasoning based on the text provided.
  For example, "Does Susan Wagner still work at BlackRock?"
- Try to answer questions that require reasoning over multiple sentences. For example, "Which of
  BlackRock's cofounders also worked at First Boston, and where were they born?" A sketch of the
  kind of Cypher query the founder-lookup part of this question calls for is shown below.
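As an illustration (not output from the repo), here is a hedged sketch of the Cypher an LLM might
generate for a founders-plus-birthplace question, assuming the `LINKS`-relationship-with-`label`
storage convention that `crud.py` uses when augmenting the graph:

```cypher
// Cofounders of BlackRock and, where the graph knows it, their birth city
MATCH (p:PERSON)-[f:LINKS]->(o:ORGANIZATION {id: "BlackRock"})
WHERE f.label = "IS_FOUNDER_OF"
OPTIONAL MATCH (p)-[b:LINKS]->(c:CITY)
WHERE b.label = "BORN_IN"
RETURN p.name, c.id
```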
## Conclusions

The hybrid RAG methodology (with reranking) provides factually accurate
responses in all four cases. In cases where the graph didn't contain the answer, the vector search
provided relevant context that allowed the LLM to generate a response. In cases where the graph
contained the answer but the raw text didn't, hybrid RAG was able to rerank the results from the
graph and vector search in a way that, on average, provided relevant responses.

Note that the hybrid RAG system is not perfect. If the information is not present (either explicitly
or implicitly) in the text, it cannot answer the question, because the LLM has no relevant
information over which to reason when formulating a response.

The key takeaways are:
- Graphs can be a helpful tool for factual (extractive) question-answering tasks in RAG
- Traditional RAG (vector-based) is useful for abstractive question-answering tasks, where the
  information is not explicitly stated in the exact words of the question
- Just as data quality is of paramount importance in any retrieval system, the quality of the graph
  (its entities and relationships) is crucial to the quality of the responses generated by hybrid or
  Graph RAG

Feel free to clone/fork this repo and try out the workflow on your own datasets!

--------------------------------------------------------------------------------
/assets/hybrid-rag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kuzudb/graph-rag-workshop/cf29d2d5b97d340df7295f656fa6813b0ba9ded6/assets/hybrid-rag.png

--------------------------------------------------------------------------------
/crud.py:
--------------------------------------------------------------------------------
import os
import shutil
import warnings
from typing import Literal

import kuzu
import nest_asyncio
import openai
from dotenv import load_dotenv
from llama_index.core import PropertyGraphIndex, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.kuzu import KuzuPropertyGraphStore
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.lancedb import LanceDBVectorStore

# Load environment variables
load_dotenv()
SEED = 42
nest_asyncio.apply()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

assert OPENAI_API_KEY is not None, "OPENAI_API_KEY is not set"
assert COHERE_API_KEY is not None, "COHERE_API_KEY is not set"

# Set up the embedding model and LLMs
embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")
extraction_llm = OpenAI(model="gpt-4o-mini", temperature=0.0, seed=SEED)
generation_llm = OpenAI(model="gpt-4o-mini", temperature=0.3, seed=SEED)

# Load the dataset on the BlackRock founders
original_documents = SimpleDirectoryReader("./data/blackrock").load_data()
# print(len(original_documents))

# --- Step 1: Chunk and store the vector embeddings in LanceDB ---
shutil.rmtree("./test_lancedb", ignore_errors=True)
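# The ingestion pipeline below splits the documents into 1024-token chunks
# (with a 32-token overlap), embeds each chunk, and writes the vectors to
# LanceDB. The embedding model used here must match the one used at query
# time in vector_rag.py, otherwise nearest-neighbor search would compare
# vectors from incompatible embedding spaces.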
openai.api_key = OPENAI_API_KEY

vector_store = LanceDBVectorStore(
    uri="./test_lancedb",
    mode="overwrite",
)

pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=1024, chunk_overlap=32),
        # Reuse text-embedding-3-small so that indexing and querying share
        # the same embedding space (vector_rag.py embeds queries with it)
        embed_model,
    ],
    vector_store=vector_store,
)
pipeline.run(documents=original_documents)

# Create the vector index
vector_index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=embed_model,
    llm=generation_llm,
)

# --- Step 2: Construct the graph in KùzuDB ---

shutil.rmtree("test_kuzudb", ignore_errors=True)
db = kuzu.Database("test_kuzudb")

warnings.filterwarnings("ignore")

# Define the allowed entities and relationships
entities = Literal["PERSON", "CITY", "STATE", "UNIVERSITY", "ORGANIZATION"]
relations = Literal[
    "STUDIED_AT",
    "IS_FOUNDER_OF",
    "IS_CEO_OF",
    "BORN_IN",
    "IS_CITY_IN",
]

validation_schema = [
    ("PERSON", "STUDIED_AT", "UNIVERSITY"),
    ("PERSON", "IS_CEO_OF", "ORGANIZATION"),
    ("PERSON", "IS_FOUNDER_OF", "ORGANIZATION"),
    ("PERSON", "BORN_IN", "CITY"),
    ("CITY", "IS_CITY_IN", "STATE"),
]

graph_store = KuzuPropertyGraphStore(
    db,
    has_structured_schema=True,
    relationship_schema=validation_schema,
)

schema_path_extractor = SchemaLLMPathExtractor(
    llm=extraction_llm,
    possible_entities=entities,
    possible_relations=relations,
    kg_validation_schema=validation_schema,
    strict=True,
)

kg_index = PropertyGraphIndex.from_documents(
    original_documents,
    embed_model=embed_model,
    kg_extractors=[schema_path_extractor],
    property_graph_store=graph_store,
    show_progress=True,
)

# --- Step 3: Augment the graph with external knowledge ---

# Say we have this knowledge obtained from other sources about additional founders of BlackRock
additional_founders = [
    "Ben Golub",
    "Barbara Novick",
    "Ralph Schlosstein",
    "Keith Anderson",
    "Hugh Frater",
]

# Open a connection to the database to modify the graph
conn = kuzu.Connection(db)

# Add additional founder nodes of type PERSON to the graph store
for founder in additional_founders:
    conn.execute(
        """
        MATCH (o:ORGANIZATION {id: "BlackRock"})
        MERGE (p:PERSON {id: $name, name: $name})
        MERGE (p)-[r:LINKS]->(o)
        SET r.label = "IS_FOUNDER_OF"
        """,
        parameters={"name": founder},
    )

# Alter the PERSON schema to add a birth_date property
try:
    conn.execute("ALTER TABLE PERSON ADD birth_date STRING")
except RuntimeError:
    pass

names = ["Larry Fink", "Susan Wagner", "Robert Kapito"]
dates = ["1952-11-02", "1961-05-26", "1957-02-08"]

for name, date in zip(names, dates):
    conn.execute(
        """
        MERGE (p:PERSON {id: $name})
        ON MATCH SET p.birth_date = $date
        """,
        parameters={"name": name, "date": date},
    )
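# Optional sanity check: list the founder edges we just created or extracted
# (assumes the LINKS/label convention used in the MERGE statements above)
founders_result = conn.execute(
    """
    MATCH (p:PERSON)-[r:LINKS]->(o:ORGANIZATION {id: "BlackRock"})
    WHERE r.label = "IS_FOUNDER_OF"
    RETURN p.id
    """
)
while founders_result.has_next():
    print(founders_result.get_next())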
conn.close()

--------------------------------------------------------------------------------
/data/blackrock/fink_1.txt:
--------------------------------------------------------------------------------
Larry Fink is Founder, Chairman and CEO of BlackRock. He and seven partners founded BlackRock in 1988, and under his leadership, the firm has grown into a global leader in investment and technology solutions to help investors build better financial futures. Today, BlackRock is trusted to manage $10 trillion in assets, more than any other investment firm in the world.

Mr. Fink was born on November 2, 1952, in Van Nuys, California. He earned an MBA from UCLA in 1976 and a BA in political science, also from UCLA, in 1974.

--------------------------------------------------------------------------------
/data/blackrock/kapito_1.txt:
--------------------------------------------------------------------------------
Robert Kapito is one of the most successful people in the financial industry globally and is well recognized as the President and cofounder of BlackRock. His career, full of innovation and top-class leadership, has changed and shaped the outlook of the world’s financial scene.

Robert Kapito was born on February 8, 1957, in Monticello, New York.

Mr. Kapito started his professional career at First Boston in 1979, where he served in the Public Finance department. This work was the onset of his career, which was long and proved to be significant in finance. During his work at First Boston he first met Larry Fink, who would later be his partner at BlackRock.

--------------------------------------------------------------------------------
/data/blackrock/wagner_1.txt:
--------------------------------------------------------------------------------
Susan Wagner is a cofounder and director of asset manager BlackRock, which she started with Larry Fink and others in 1988. She cofounded the company at age 26 and went on to serve as chief operating officer and vice chairman. Wagner oversaw BlackRock's 2009 merger with Barclays Global Investors, which transformed the firm into the world's largest asset manager.

Prior to founding BlackRock, Wagner worked as a vice president of the mortgage finance group at Lehman Brothers. She retired in 2012 but remains on BlackRock's board and also serves as a director of Apple and startups Color Health and Samsara.
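Aside: once `crud.py` has built the graph, you can browse it visually with Kùzu Explorer using the
`docker-compose.yml` that follows. This is a standard Docker Compose invocation; the compose file
mounts `./test_kuzudb` into the container in read-only mode, so run `crud.py` first:

```bash
# Start Kùzu Explorer, then open http://localhost:8000 in your browser
docker compose up
```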
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
services:
  explorer:
    image: kuzudb/explorer:0.6.1
    environment:
      - MODE=READ_ONLY
    ports:
      - 8000:8000
    volumes:
      - ./test_kuzudb:/database

--------------------------------------------------------------------------------
/graph_rag.py:
--------------------------------------------------------------------------------
import os

import kuzu
from dotenv import load_dotenv
from ell import ell
from openai import OpenAI

import prompts

load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
MODEL_NAME = "gpt-4o-mini"
SEED = 42


class GraphRAG:
    """Graph Retrieval Augmented Generation from a Kùzu database."""

    def __init__(self, db_path="./test_kuzudb"):
        self.db = kuzu.Database(db_path)
        self.conn = kuzu.Connection(self.db)

    def get_schema(self) -> str:
        """Provides the graph schema information for the purposes of Cypher generation via an LLM."""
        node_properties = []
        node_table_names = self.conn._get_node_table_names()
        for table_name in node_table_names:
            current_table_schema = {"properties": [], "label": table_name}
            properties = self.conn._get_node_property_names(table_name)
            for property_name in properties:
                property_type = properties[property_name]["type"]
                list_type_flag = ""
                if properties[property_name]["dimension"] > 0:
                    if "shape" in properties[property_name]:
                        for s in properties[property_name]["shape"]:
                            list_type_flag += "[%s]" % s
                    else:
                        for i in range(properties[property_name]["dimension"]):
                            list_type_flag += "[]"
                property_type += list_type_flag
                current_table_schema["properties"].append((property_name, property_type))
            node_properties.append(current_table_schema)

        relationships = []
        rel_tables = self.conn._get_rel_table_names()
        for table in rel_tables:
            relationships.append("(:%s)-[:%s]->(:%s)" % (table["src"], table["name"], table["dst"]))

        rel_properties = []
        for table in rel_tables:
            table_name = table["name"]
            current_table_schema = {"properties": [], "label": table_name}
            query_result = self.conn.execute(f"CALL table_info('{table_name}') RETURN *;")
            while query_result.has_next():
                row = query_result.get_next()
                prop_name = row[1]
                prop_type = row[2]
                current_table_schema["properties"].append((prop_name, prop_type))
            rel_properties.append(current_table_schema)

        schema = (
            f"Node properties: {node_properties}\n"
            f"Relationships properties: {rel_properties}\n"
            f"Relationships: {relationships}\n"
        )
        return schema

    def query(self, question: str, cypher: str) -> dict:
        """Use the generated Cypher statement to query the graph database."""
        response = self.conn.execute(cypher)
        result = []
        while response.has_next():
            item = response.get_next()
            if item not in result:
                result.extend(item)

        # Handle both hashable and non-hashable types
        if all(isinstance(x, (str, int, float, bool, tuple)) for x in result):
            final_result = {question: list(set(result))}
        else:
            # For non-hashable types, we can't use set() directly
            # Instead, we'll use a list comprehension to remove duplicates
            final_result = {question: [x for i, x in enumerate(result) if x not in result[:i]]}

        return final_result
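    # A note on the two ell-decorated methods below: a function decorated with
    # @ell.simple returns a list of system/user messages, and calling it sends
    # that prompt to the configured model and returns the model's reply as a
    # string. Cypher generation uses a low temperature (0.1) to keep queries
    # deterministic; answer generation uses a slightly higher one (0.3).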
    @ell.simple(model=MODEL_NAME, temperature=0.1, client=OpenAI(api_key=OPENAI_API_KEY), seed=SEED)
    def generate_cypher(self, question: str) -> str:
        return [
            ell.system(prompts.CYPHER_SYSTEM_PROMPT),
            ell.user(
                prompts.CYPHER_USER_PROMPT.format(schema=self.get_schema(), question=question)
            ),
        ]

    @ell.simple(model=MODEL_NAME, temperature=0.3, client=OpenAI(api_key=OPENAI_API_KEY), seed=SEED)
    def retrieve(self, question: str, context: str) -> str:
        return [
            ell.system(prompts.RAG_SYSTEM_PROMPT),
            ell.user(prompts.RAG_USER_PROMPT.format(question=question, context=context)),
        ]

    def run(self, question: str) -> str:
        cypher = self.generate_cypher(question)
        print(f"\n{cypher}\n")
        context = self.query(question, cypher)
        return self.retrieve(question, context)


if __name__ == "__main__":
    graph_rag = GraphRAG("./test_kuzudb")
    question = "Who are the founders of BlackRock? Return the names as a numbered list."
    response = graph_rag.run(question)
    print(f"Q1: {question}\n\n{response}\n---\n")

    question = "Where did Larry Fink graduate from?"
    response = graph_rag.run(question)
    print(f"Q2: {question}\n\n{response}\n---\n")

    question = "When was Susan Wagner born?"
    response = graph_rag.run(question)
    print(f"Q3: {question}\n\n{response}\n---\n")

    question = "How did Larry Fink and Rob Kapito meet?"
    response = graph_rag.run(question)
    print(f"Q4: {question}\n\n{response}")

--------------------------------------------------------------------------------
/hybrid_rag.py:
--------------------------------------------------------------------------------
import os

import cohere
from dotenv import load_dotenv
from ell import ell
from openai import OpenAI

import prompts
from graph_rag import GraphRAG
from vector_rag import VectorRAG

load_dotenv()
MODEL_NAME = "gpt-4o-mini"
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
SEED = 42


class HybridRAG:
    def __init__(
        self,
        graph_db_path="./test_kuzudb",
        vector_db_path="./test_lancedb",
    ):
        self.graph_rag = GraphRAG(graph_db_path)
        self.vector_rag = VectorRAG(vector_db_path)
        self.co = cohere.ClientV2(COHERE_API_KEY)

    @ell.simple(model=MODEL_NAME, temperature=0.3, client=OpenAI(api_key=OPENAI_API_KEY), seed=SEED)
    def hybrid_rag(self, question: str, context: str) -> str:
        return [
            ell.system(prompts.RAG_SYSTEM_PROMPT),
            ell.user(prompts.RAG_USER_PROMPT.format(question=question, context=context)),
        ]

    def run(self, question: str) -> str:
        question_embedding = self.vector_rag.embed(question)
        vector_docs = self.vector_rag.query(question_embedding)
        vector_docs = [doc["text"] for doc in vector_docs]

        cypher = self.graph_rag.generate_cypher(question)
        graph_docs = self.graph_rag.query(question, cypher)

        docs = [graph_docs] + vector_docs
        # Ensure the doc contents are strings
        docs = [str(doc) for doc in docs]
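        # Rerank the pooled graph + vector candidates against the question so
        # that the most relevant context rises to the top before generation.
        # Note: rerank() returns a response object, which is passed into the
        # prompt via string formatting; top_n=20 requests up to 20 ranked
        # results.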
        combined_context = self.co.rerank(
            model="rerank-english-v3.0",
            query=question,
            documents=docs,
            top_n=20,
            return_documents=True,
        )
        return self.hybrid_rag(question, combined_context)


if __name__ == "__main__":
    hybrid_rag = HybridRAG(
        graph_db_path="./test_kuzudb",
        vector_db_path="./test_lancedb"
    )
    question = "Who are the founders of BlackRock? Return the names as a numbered list."
    response = hybrid_rag.run(question)
    print(f"Q1: {question}\n\n{response}")

    question = "Where did Larry Fink graduate from?"
    response = hybrid_rag.run(question)
    print(f"---\nQ2: {question}\n\n{response}")

    question = "When was Susan Wagner born?"
    response = hybrid_rag.run(question)
    print(f"---\nQ3: {question}\n\n{response}")

    question = "How did Larry Fink and Rob Kapito meet?"
    response = hybrid_rag.run(question)
    print(f"---\nQ4: {question}\n\n{response}")

--------------------------------------------------------------------------------
/prompts.py:
--------------------------------------------------------------------------------
RAG_SYSTEM_PROMPT = """
You are an AI assistant using Retrieval-Augmented Generation (RAG).
RAG enhances your responses by retrieving relevant information from a knowledge base.
You will be provided with a question and relevant context. Use only this context to answer the question.
Do not make up an answer. If you don't know the answer, say so clearly.
Always strive to provide concise, helpful, and context-aware answers.
"""

CYPHER_SYSTEM_PROMPT = """
You are an expert in translating natural language questions into Cypher statements.
You will be provided with a question and a graph schema.
Use only the provided relationship types and properties in the schema to generate a Cypher statement.
The Cypher statement could retrieve nodes, relationships, or both.
Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
"""

RAG_USER_PROMPT = """
Given the following question and relevant context, please provide a comprehensive and accurate response:

Question: {question}

Relevant context:
{context}

Response:
"""

CYPHER_USER_PROMPT = """
Task: Generate a Cypher statement to query a graph database.

Schema:
{schema}

The question is:
{question}

Instructions:
Generate the Kùzu dialect of Cypher with the following rules in mind:
1. Do not include triple backticks ``` in your response. Return only Cypher.
2. Only use the nodes and relationships provided in the schema.
3. Use only the provided node and relationship types and properties in the schema.
43 | """ 44 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "graph-rag-workshop" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "cohere==5.11.1", 9 | "ell-ai==0.0.14", 10 | "kuzu==0.6.1", 11 | "lancedb==0.14.0", 12 | "llama-index==0.11.19", 13 | "llama-index-embeddings-openai==0.2.5", 14 | "llama-index-graph-stores-kuzu==0.3.2", 15 | "llama-index-vector-stores-lancedb==0.2.4", 16 | "python-dotenv==1.0.1", 17 | ] 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | kuzu==0.6.1 2 | lancedb==0.14.0 3 | python-dotenv==1.0.1 4 | ell-ai==0.0.14 5 | llama-index==0.11.19 6 | llama-index-embeddings-openai==0.2.5 7 | llama-index-vector-stores-lancedb==0.2.4 8 | llama-index-graph-stores-kuzu==0.3.2 9 | cohere==5.11.1 -------------------------------------------------------------------------------- /vector_rag.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import lancedb 4 | from dotenv import load_dotenv 5 | from ell import ell 6 | from openai import OpenAI 7 | 8 | import prompts 9 | 10 | load_dotenv() 11 | OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") 12 | MODEL_NAME = "gpt-4o-mini" 13 | SEED = 42 14 | 15 | 16 | class VectorRAG: 17 | def __init__(self, db_path: str, table_name: str = "vectors"): 18 | load_dotenv() 19 | self.openai_client = OpenAI(api_key=OPENAI_API_KEY) 20 | self.db = lancedb.connect(db_path) 21 | self.table = self.db.open_table(table_name) 22 | 23 | def query(self, query_vector: list, limit: int = 10) -> list: 24 | search_result = ( 25 | self.table.search(query_vector).metric("cosine").select(["text"]).limit(limit) 26 | ).to_list() 27 | return search_result if search_result else None 28 | 29 | def embed(self, query: str) -> list: 30 | # For now just using an OpenAI embedding model 31 | response = self.openai_client.embeddings.create(model="text-embedding-3-small", input=query) 32 | return response.data[0].embedding 33 | 34 | @ell.simple(model=MODEL_NAME, temperature=0.3, client=OpenAI(api_key=OPENAI_API_KEY), seed=SEED) 35 | def retrieve(self, question: str, context: str) -> str: 36 | return [ 37 | ell.system(prompts.RAG_SYSTEM_PROMPT), 38 | ell.user(prompts.RAG_USER_PROMPT.format(question=question, context=context)), 39 | ] 40 | 41 | def run(self, question: str) -> str: 42 | question_embedding = self.embed(question) 43 | context = self.query(question_embedding) 44 | return self.retrieve(question, context) 45 | 46 | 47 | if __name__ == "__main__": 48 | vector_rag = VectorRAG("./test_lancedb") 49 | question = "Who are the founders of BlackRock? Return the names as a numbered list." 50 | response = vector_rag.run(question) 51 | print(f"Q1: {question}\n\n{response}") 52 | 53 | question = "Where did Larry Fink graduate from?" 54 | response = vector_rag.run(question) 55 | print(f"---\nQ2: {question}\n\n{response}") 56 | 57 | question = "When was Susan Wagner born?" 58 | response = vector_rag.run(question) 59 | print(f"---\nQ3: {question}\n\n{response}") 60 | 61 | question = "How did Larry Fink and Rob Kapito meet?" 
    response = vector_rag.run(question)
    print(f"---\nQ4: {question}\n\n{response}")

--------------------------------------------------------------------------------