├── files
│   ├── IR-001.pdf
│   ├── IR-002.pdf
│   ├── IR-003.pdf
│   ├── graph_rag_result.png
│   └── pipeline_result.png
├── requirements.txt
├── docker-compose.yml
├── graph_rag.py
├── README.md
├── .gitignore
└── pipeline.py

/files/IR-001.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rathcoding/knowledge-graph-rag/HEAD/files/IR-001.pdf
--------------------------------------------------------------------------------
/files/IR-002.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rathcoding/knowledge-graph-rag/HEAD/files/IR-002.pdf
--------------------------------------------------------------------------------
/files/IR-003.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rathcoding/knowledge-graph-rag/HEAD/files/IR-003.pdf
--------------------------------------------------------------------------------
/files/graph_rag_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rathcoding/knowledge-graph-rag/HEAD/files/graph_rag_result.png
--------------------------------------------------------------------------------
/files/pipeline_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rathcoding/knowledge-graph-rag/HEAD/files/pipeline_result.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # pip install --upgrade pip
2 | python-dotenv
3 | neo4j
4 | pypdf
5 | langchain
6 | langchain-community
7 | langchain-experimental
8 | # langchain-huggingface
9 | transformers
10 | sentence-transformers
11 | json-repair
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 |   neo4j:
3 |     container_name: neo4j
4 |     image: neo4j:latest
5 |     ports:
6 |       - 7474:7474
7 |       - 7687:7687
8 |     environment:
9 |       - NEO4J_AUTH=none
10 |       - NEO4J_apoc_export_file_enabled=true
11 |       - NEO4J_apoc_import_file_enabled=true
12 |       - NEO4J_apoc_import_file_use__neo4j__config=true
13 |       - NEO4J_PLUGINS=["apoc", "graph-data-science"]
14 |       - NEO4J_server_memory_heap_initial__size=1G # Adjust as needed (check your Docker memory limits)
15 |       - NEO4J_server_memory_heap_max__size=1G # Adjust as needed (check your Docker memory limits)
16 |     volumes:
17 |       - ./neo4j_db/data:/data
18 |       - ./neo4j_db/logs:/logs
19 |       - ./neo4j_db/import:/var/lib/neo4j/import
20 |       - ./neo4j_db/plugins:/plugins
21 | 
--------------------------------------------------------------------------------
/graph_rag.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import dotenv
4 | dotenv.load_dotenv()
5 | logging.basicConfig(level=logging.INFO)
6 | logging.info('Starting up the Knowledge Graph RAG...')
7 | 
8 | # Instantiate the Neo4J connector
9 | logging.info(f'Instantiating the Neo4J connector for: { os.getenv("NEO4J_URI") }')
10 | from langchain_community.graphs import Neo4jGraph
11 | graph = Neo4jGraph()
12 | 
13 | # Instantiate LLM to use with the Graph RAG
14 | logging.info('Instantiating LLM to use with the Graph RAG')
15 | from langchain_community.llms import Ollama
16 | llm=Ollama(model='llama3', temperature=0.0)
17 | 
18 | # Instantiate the langchain Graph RAG with the Neo4J connector and the LLM
19 | from langchain.chains import GraphCypherQAChain
20 | chain = GraphCypherQAChain.from_llm(graph=graph, llm=llm, verbose=True)
21 | 
22 | logging.info('Knowledge Graph RAG is ready to go!')
23 | logging.info('='*50)
24 | 
25 | def main():
26 |     logging.info('Type "exit" to quit the program.')
27 |     while True:
28 |         question = input('\nAsk me a question: ')
29 |         if question == 'exit':
30 |             break
31 |         result = chain.invoke({"query": question})
32 |         if result['result']:
33 |             print(result['result'])
34 |         else:
35 |             print(result)
36 | 
37 | 
38 | if __name__ == '__main__':
39 |     main()
40 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Knowledge Graph RAG with Local LLM
2 | 
3 | This is an **<*ongoing*>** personal project aimed at practicing building a pipeline to feed a Neo4J database with unstructured data from PDFs containing (fictional) crime reports, and then using a Graph RAG to query the database in natural language.
4 | 
5 | The pipeline is based on the [Neo4J - Enhancing the Accuracy of RAG Applications With Knowledge Graphs](https://neo4j.com/developer-blog/enhance-rag-knowledge-graph/?mkt_tok=NzEwLVJSQy0zMzUAAAGTBn-WDr1KcupEPExYL6rh_DaP3R0h5gWQFxWGRm6dXiew5-oAnYBbvXvedknjyhyojNebyUa0ywWZwIkZQRtiJ-9x6k22vY3ru2Ztp7PjlgN5Bbs) article.
6 | 
7 | The GraphRAG is based on the YouTube tutorial [Langchain & Neo4j: Query Your Graph Database in Natural Language](https://www.youtube.com/watch?v=Wg445gThtcE).
8 | 
9 | Both parts of the project were adapted to use a locally hosted Neo4J database (Docker) and a locally hosted LLM (Ollama).
10 | 
11 | 
12 | > *Stack:* Python, LangChain, Ollama, Neo4J, Docker
13 | 
14 | To run this project you'll need:
15 | 1) [Docker](https://www.docker.com/) installed and running on your machine (docker-compose.yml file included in the repository).
16 | 2) [Ollama](https://ollama.com/) installed and running on your machine, and a [model](https://ollama.com/library) downloaded.
17 | 3) A Python environment with the required packages installed. You can install them with `pip install -r requirements.txt`.
18 | 4) A .env file with the following variables:
19 | ```
20 | NEO4J_URI=bolt://localhost:7687
21 | NEO4J_USERNAME=neo4j
22 | NEO4J_PASSWORD=neo4j
23 | ```
24 | 
25 | # The pipeline
26 | 
27 | pipeline.py -> main script to run the pipeline.
28 | 
29 | 1) It extracts text from PDFs in the `files` folder.
30 | 2) Sends the text to the local LLM to extract entities and relationships.
31 |    * To use a local LLM I needed to build a custom chat_prompt, as pointed out in this [StackOverflow topic](https://stackoverflow.com/questions/78521181/llmgraphtransformer-convert-to-graph-documentsdocuments-attributeerror-str).
32 |    * I chose to also build my own Pydantic class and examples, instead of using the library's default, to align the model to the crime-related theme.
33 | 3) Inserts the extracted entities and relationships into the Neo4J database.
34 | 
35 | After running the pipeline script, check out the Neo4J database at `http://localhost:7474/browser/`:
36 | ```
37 | MATCH (n)-[r]->(m)
38 | RETURN n, r, m
39 | ```
40 | 
41 | You should see all the entities and relationships extracted from the PDFs.
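
If you'd rather sanity-check the load from Python than from the browser, a minimal sketch along these lines should work (it reuses the same `.env` connection settings; the `sanity_check.py` name and the query are illustrative, not part of the repo):
```
# sanity_check.py (illustrative only)
import dotenv
dotenv.load_dotenv()  # picks up NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD

from langchain_community.graphs import Neo4jGraph

graph = Neo4jGraph()  # same connector used by pipeline.py and graph_rag.py

# Count nodes per label to confirm the pipeline wrote something
for row in graph.query("MATCH (n) RETURN labels(n) AS labels, count(*) AS total ORDER BY total DESC"):
    print(row["labels"], row["total"])
```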
42 | 43 | Results using Llama3-8B model: 44 | 45 | ![result](./files/pipeline_result.png) 46 | 47 | 48 | # The Graph RAG 49 | 50 | graph_rag.py -> main script to run the Graph RAG Q&A. 51 | 52 | 1) It queries the Neo4J database with a natural language question. 53 | 2) It returns the answer in natural language based on the result of the query. 54 | 55 | > Right now you need to write the questions using the same words as the entities and relationships in the database. I'm working on a way to make the questions more flexible... 56 | 57 | Results using Llama3-8B model: 58 | 59 | ![result](./files/graph_rag_result.png) 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | **/.DS_Store 163 | neo4j_db/ -------------------------------------------------------------------------------- /pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import dotenv 4 | dotenv.load_dotenv() 5 | logging.basicConfig(level=logging.INFO) 6 | logging.info('Starting the data pipeline...') 7 | 8 | # Get list of PDF files from 'files' folder 9 | logging.info('Getting list of PDF files from "files" folder') 10 | files_path = 'files' 11 | files = [files_path+'/'+file for file in os.listdir(files_path) if file.endswith('.pdf')] 12 | logging.info(f'List of PDF files: {files}') 13 | 14 | # Instantiate the token text splitter 15 | logging.info('Instantiating the token text splitter') 16 | from langchain.text_splitter import TokenTextSplitter 17 | splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24) 18 | 19 | # Split the PDFs into chunks as Documents 20 | logging.info('Splitting the PDFs into chunks as Documents') 21 | from langchain_community.document_loaders import PyPDFLoader 22 | documents = [] 23 | 24 | for file in files: 25 | # Load the PDF file 26 | pdf_loader = PyPDFLoader(file_path=file, extract_images=False) 27 | # Split the PDF into Documents 28 | files_documents = pdf_loader.load_and_split(text_splitter=splitter) 29 | # Add the Documents to the list 30 | documents.extend(files_documents) 31 | logging.info(f'Loaded and split {file} into {len(files_documents)} Documents') 32 | 33 | # Instantiate LLM to use with the LLMGraphTransformer 34 | logging.info('Instantiating LLM to use with the LLMGraphTransformer') 35 | from langchain_community.llms import Ollama 36 | llm=Ollama(model='llama3', temperature=0.0) 37 | 38 | # To use a local LLM we need to create a chat_prompt to solve the AttributeError from the LLMGraphTransformer 39 | 40 | # Create a system message to provide the LLM with the instructions 41 | logging.info('Creating a chat_prompt to provide the LLM with the instructions and examples') 42 | from langchain_experimental.graph_transformers.llm import SystemMessage 43 | system_prompt = """ 44 | You are a data scientist working for the police and you are building a knowledge graph database. 45 | Your task is to extract information from data and convert it into a knowledge graph database. 
46 | Provide a set of Nodes in the form [head, head_type, relation, tail, tail_type].
47 | It is important that the head and tail exist as nodes that are related by the relation.
48 | If you can't pair a relationship with a pair of nodes, don't add it.
49 | When you find a node or relationship you want to add, try to create a generic TYPE for it that describes the entity; you can also think of it as a label.
50 | You must generate the output in a JSON format containing a list with JSON objects. Each object should have the keys: "head", "head_type", "relation", "tail", and "tail_type".
51 | """
52 | 
53 | system_message = SystemMessage(content=system_prompt)
54 | 
55 | # Create a human message to combine with the SystemMessage
56 | # We need:
57 | # 1) A parser to provide the LLM with the format instructions
58 | # 2) Examples to provide the LLM with the context we want to extract
59 | # 3) A list of nodes and relationships to provide the LLM with the context we want to extract
60 | 
61 | # 1) The parser
62 | # To instantiate a parser we need a Pydantic class.
63 | # Instead of using the default langchain_experimental.graph_transformers.llm.UnstructuredRelation,
64 | # we'll build our own class to provide more crime-related context instructions
65 | from langchain_core.pydantic_v1 import BaseModel, Field
66 | 
67 | class UnstructuredRelation(BaseModel):
68 |     head: str = Field(
69 |         description=(
70 |             "extracted head entity like Person, Crime, Object, Vehicle, Location, etc. "
71 |             "Must use human-readable unique identifier."
72 |         )
73 |     )
74 |     head_type: str = Field(
75 |         description="type of the extracted head entity like Person, Crime, Object, Vehicle, etc"
76 |     )
77 |     relation: str = Field(description="relation between the head and the tail entities")
78 |     tail: str = Field(
79 |         description=(
80 |             "extracted tail entity like Person, Crime, Object, Vehicle, Location, etc. "
81 |             "Must use human-readable unique identifier."
82 |         )
83 |     )
84 |     tail_type: str = Field(
85 |         description="type of the extracted tail entity like Person, Crime, Object, Vehicle, etc"
86 |     )
87 | 
88 | # Instantiate the parser with our Pydantic class to provide the LLM with the format instructions
89 | from langchain_experimental.graph_transformers.llm import JsonOutputParser
90 | parser = JsonOutputParser(pydantic_object=UnstructuredRelation)
91 | 
92 | # 2) The examples
93 | examples = [
94 |     {
95 |         "text": (
96 |             "Michael Johnson was mugged at knife-point by two assailants on 5th Avenue. "
97 |             "They took his wallet and phone."
98 |         ),
99 |         "head": "Michael Johnson",
100 |         "head_type": "Person",
101 |         "relation": "VICTIM_OF",
102 |         "tail": "Mugging",
103 |         "tail_type": "Crime",
104 |     },
105 |     {
106 |         "text": (
107 |             "Michael Johnson was mugged at knife-point by two assailants on 5th Avenue. "
108 |             "They took his wallet and phone."
109 |         ),
110 |         "head": "5th Avenue",
111 |         "head_type": "Location",
112 |         "relation": "SCENE_OF",
113 |         "tail": "Mugging",
114 |         "tail_type": "Crime",
115 |     },
116 |     {
117 |         "text": (
118 |             "Sarah Connor witnessed a mugging on 5th Avenue where Michael Johnson was attacked. "
119 |             "She saw the assailants flee in a black car."
120 |         ),
121 |         "head": "Sarah Connor",
122 |         "head_type": "Person",
123 |         "relation": "WITNESS_OF",
124 |         "tail": "Mugging",
125 |         "tail_type": "Crime",
126 |     },
127 |     {
128 |         "text": (
129 |             "John Doe was caught selling illegal drugs in Central Park. "
130 |             "He was arrested by undercover officers."
131 | ), 132 | "head": "John Doe", 133 | "head_type": "Person", 134 | "relation": "SUSPECT_IN", 135 | "tail": "Drug Trafficking", 136 | "tail_type": "Crime", 137 | }, 138 | { 139 | "text": ( 140 | "John Doe was caught selling illegal drugs in Central Park. " 141 | "He was arrested by undercover officers." 142 | ), 143 | "head": "Central Park", 144 | "head_type": "Location", 145 | "relation": "SCENE_OF", 146 | "tail": "Drug Trafficking", 147 | "tail_type": "Crime", 148 | }, 149 | { 150 | "text": ( 151 | "Emily Clark was assaulted in a parking lot near her office on Elm Street. " 152 | "The assailant attempted to steal her car but fled when she screamed." 153 | ), 154 | "head": "Emily Clark", 155 | "head_type": "Person", 156 | "relation": "VICTIM_OF", 157 | "tail": "Assault", 158 | "tail_type": "Crime", 159 | }, 160 | { 161 | "text": ( 162 | "Emily Clark was assaulted in a parking lot near her office on Elm Street. " 163 | "The assailant attempted to steal her car but fled when she screamed." 164 | ), 165 | "head": "Elm Street", 166 | "head_type": "Location", 167 | "relation": "SCENE_OF", 168 | "tail": "Assault", 169 | "tail_type": "Crime", 170 | }, 171 | { 172 | "text": ( 173 | "James Smith was identified as the suspect in the assault on Emily Clark. " 174 | "He was later arrested by the police." 175 | ), 176 | "head": "James Smith", 177 | "head_type": "Person", 178 | "relation": "SUSPECT_IN", 179 | "tail": "Assault", 180 | "tail_type": "Crime", 181 | }, 182 | { 183 | "text": ( 184 | "Laura Adams witnessed the assault on Emily Clark and provided a description of the assailant to the police." 185 | ), 186 | "head": "Laura Adams", 187 | "head_type": "Person", 188 | "relation": "WITNESS_OF", 189 | "tail": "Assault", 190 | "tail_type": "Crime", 191 | }, 192 | { 193 | "text": ( 194 | "David Brown attempted to murder Lisa White by poisoning her drink at a party on Pine Street. " 195 | "She was hospitalized but survived the attack." 196 | ), 197 | "head": "David Brown", 198 | "head_type": "Person", 199 | "relation": "SUSPECT_IN", 200 | "tail": "Attempted Murder", 201 | "tail_type": "Crime", 202 | }, 203 | { 204 | "text": ( 205 | "David Brown attempted to murder Lisa White by poisoning her drink at a party on Pine Street. " 206 | "She was hospitalized but survived the attack." 207 | ), 208 | "head": "Lisa White", 209 | "head_type": "Person", 210 | "relation": "VICTIM_OF", 211 | "tail": "Attempted Murder", 212 | "tail_type": "Crime", 213 | }, 214 | { 215 | "text": ( 216 | "David Brown attempted to murder Lisa White by poisoning her drink at a party on Pine Street. " 217 | "She was hospitalized but survived the attack." 218 | ), 219 | "head": "Pine Street", 220 | "head_type": "Location", 221 | "relation": "SCENE_OF", 222 | "tail": "Attempted Murder", 223 | "tail_type": "Crime", 224 | }, 225 | { 226 | "text": ( 227 | "Mark Thompson witnessed David Brown putting something in Lisa White's drink at the party. " 228 | "He reported this to the police." 229 | ), 230 | "head": "Mark Thompson", 231 | "head_type": "Person", 232 | "relation": "WITNESS_OF", 233 | "tail": "Attempted Murder", 234 | "tail_type": "Crime", 235 | } 236 | ] 237 | 238 | # 3) The list of nodes and relationships 239 | # to be coded and experimented with... 
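# A hedged sketch of what such a list could look like (illustrative only, not wired in yet):
# these whitelists mirror the node and relation types used in the examples above, and could
# be passed to the LLMGraphTransformer below as allowed_nodes / allowed_relationships to
# constrain the extraction (check the langchain_experimental version you have installed).
# allowed_nodes = ["Person", "Crime", "Location", "Object", "Vehicle"]
# allowed_relationships = ["VICTIM_OF", "SUSPECT_IN", "WITNESS_OF", "SCENE_OF"]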
240 | 
241 | # Instantiate the human prompt template using the examples and Pydantic class with format instructions
242 | from langchain_experimental.graph_transformers.llm import PromptTemplate
243 | human_prompt = PromptTemplate(
244 |     template="""
245 | Examples:
246 | {examples}
247 | 
248 | For the following text, extract entities and relations as in the provided example.
249 | {format_instructions}\nText: {input}""",
250 |     input_variables=["input"],
251 |     partial_variables={
252 |         "format_instructions": parser.get_format_instructions(),
253 |         "node_labels": None,
254 |         "rel_types": None,
255 |         "examples": examples,
256 |     },
257 | )
258 | 
259 | # Instantiate the human message prompt to provide the LLM with the instructions and examples
260 | from langchain_experimental.graph_transformers.llm import HumanMessagePromptTemplate
261 | human_message_prompt = HumanMessagePromptTemplate(prompt=human_prompt)
262 | 
263 | # Create a chat_prompt combining the system and human messages to use with the LLMGraphTransformer
264 | from langchain_experimental.graph_transformers.llm import ChatPromptTemplate
265 | chat_prompt = ChatPromptTemplate.from_messages(
266 |     [system_message, human_message_prompt]
267 | )
268 | 
269 | # Instantiate the LLMGraphTransformer that will extract the entities and relationships from the Documents
270 | logging.info('Instantiating the LLMGraphTransformer that will extract the entities and relationships from the Documents')
271 | from langchain_experimental.graph_transformers import LLMGraphTransformer
272 | llm_transformer = LLMGraphTransformer(llm=llm, prompt=chat_prompt)
273 | 
274 | # Convert the Documents into Graph Documents
275 | # This is the heavy computation part...
276 | logging.info('Converting the Documents into Graph Documents...')
277 | graph_documents = llm_transformer.convert_to_graph_documents(documents)
278 | 
279 | # Instantiate the Neo4JGraph to persist the data
280 | logging.info('Instantiating the Neo4JGraph to persist the data')
281 | from langchain_community.graphs import Neo4jGraph
282 | graph = Neo4jGraph()
283 | 
284 | # Persist the Graph Documents into the Neo4JGraph
285 | logging.info('Persisting the Graph Documents into the Neo4JGraph')
286 | graph.add_graph_documents(
287 |     graph_documents,
288 |     baseEntityLabel=True,
289 |     include_source=True
290 | )
291 | 
292 | logging.info('Data pipeline completed successfully!')
293 | 
--------------------------------------------------------------------------------