├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README.md ├── byokg-rag ├── .gitignore ├── README.md ├── pyproject.toml └── src │ └── graphrag_toolkit │ ├── __init__.py │ └── byokg_rag │ ├── __init__.py │ ├── byokg_query_engine.py │ ├── graph_connectors │ ├── __init__.py │ ├── kg_linker.py │ └── prompts │ │ ├── kg_linker_prompt.yaml │ │ └── task_prompts.yaml │ ├── graph_retrievers │ ├── __init__.py │ ├── entity_linker.py │ ├── graph_reranker.py │ ├── graph_retrievers.py │ ├── graph_traversal.py │ ├── graph_verbalizer.py │ └── prompts │ │ └── agent_prompts.yaml │ ├── graphstore │ ├── __init__.py │ └── graphstore.py │ ├── indexing │ ├── __init__.py │ ├── fuzzy_string.py │ └── index.py │ ├── llm │ ├── __init__.py │ └── bedrock_llms.py │ ├── prompts │ └── generation_prompts.yaml │ ├── requirements.txt │ └── utils.py ├── docs ├── README.md └── lexical-graph │ ├── README.md │ ├── aws-profile.md │ ├── batch-extraction.md │ ├── configuration.md │ ├── faq.md │ ├── graph-model.md │ ├── graph-store-falkor-db.md │ ├── graph-store-neptune-analytics.md │ ├── graph-store-neptune-db.md │ ├── hybrid-deployment.md │ ├── indexing.md │ ├── metadata-filtering.md │ ├── multi-tenancy.md │ ├── overview.md │ ├── prompts.md │ ├── querying.md │ ├── security.md │ ├── storage-model.md │ ├── vector-store-neptune-analytics.md │ ├── vector-store-opensearch-serverless.md │ └── vector-store-postgres.md ├── examples ├── README.md ├── byokg-rag │ ├── byokg_rag_demo.ipynb │ └── data │ │ └── freebase_tiny_kg.csv ├── lexical-graph-hybrid-dev │ ├── aws │ │ ├── setup-bedrock-batch-doc.md │ │ └── setup-bedrock-batch.sh │ ├── docker │ │ ├── .env │ │ ├── build.sh │ │ ├── docker-compose.yml │ │ ├── postgres │ │ │ └── schema.sql │ │ └── reset.sh │ ├── docs │ │ ├── docker_build_shell_script.md │ │ ├── docker_compose_services.md │ │ └── docker_reset_shell_script.md │ └── notebooks │ │ ├── .env │ │ ├── 00-Setup.ipynb │ │ ├── 01-Local-Extract-Batch.ipynb │ │ ├── 02-Cloud-Setup.ipynb │ │ ├── 03-Cloud-Build.ipynb │ │ ├── 04-Cloud-Querying.ipynb │ │ └── best-practices │ │ └── Retrieval-Augmented-Generation-Options.pdf ├── lexical-graph-local-dev │ ├── README.md │ ├── docker │ │ ├── .env │ │ ├── build.sh │ │ ├── docker-compose.yml │ │ ├── postgres │ │ │ └── schema.sql │ │ └── reset.sh │ ├── docs │ │ ├── docker_build.md │ │ ├── docker_reset_script.md │ │ └── docker_services.md │ └── notebooks │ │ ├── .env │ │ ├── 00-Setup.ipynb │ │ ├── 01-Combined-Extract-and-Build.ipynb │ │ ├── 02-Querying.ipynb │ │ ├── 03-Querying with prompting.ipynb │ │ └── prompts │ │ ├── system_prompt.txt │ │ └── user_prompt.txt └── lexical-graph │ ├── README.md │ ├── cloudformation-templates │ ├── graphrag-toolkit-neptune-analytics-aurora-postgres.json │ ├── graphrag-toolkit-neptune-analytics-opensearch-serverless.json │ ├── graphrag-toolkit-neptune-analytics.json │ ├── graphrag-toolkit-neptune-db-aurora-postgres-existing-vpc.json │ ├── graphrag-toolkit-neptune-db-aurora-postgres.json │ ├── graphrag-toolkit-neptune-db-opensearch-serverless.json │ └── update-stack.sh │ └── notebooks │ ├── 00-Setup.ipynb │ ├── 01-Combined-Extract-and-Build.ipynb │ ├── 02-Separate-Extract-and-Build.ipynb │ ├── 03-Traversal-Based-Querying.ipynb │ ├── 04-Semantic-Guided-Querying.ipynb │ ├── 05-Multi-Tenancy.ipynb │ └── 06-Agentic-GraphRAG.ipynb ├── images ├── byokg_rag.png ├── extract-and-build.png ├── hybrid-extract-and-build.png ├── lexical-graph.png ├── local-extract-and-build.png └── question-answering.png ├── lexical-graph-contrib └── falkordb │ ├── pyproject.toml │ └── src │ ├── graphrag_toolkit │ └── lexical_graph │ │ └── storage │ │ └── graph │ │ └── falkordb │ │ ├── __init__.py │ │ ├── falkordb_graph_store.py │ │ └── falkordb_graph_store_factory.py │ ├── requirements.txt │ ├── setup.cfg │ └── setup.py ├── lexical-graph ├── README.md ├── pyproject.toml └── src │ └── graphrag_toolkit │ ├── __init__.py │ └── lexical_graph │ ├── __init__.py │ ├── config.py │ ├── errors.py │ ├── indexing │ ├── __init__.py │ ├── build │ │ ├── __init__.py │ │ ├── build_filter.py │ │ ├── build_filters.py │ │ ├── build_pipeline.py │ │ ├── checkpoint.py │ │ ├── chunk_graph_builder.py │ │ ├── chunk_node_builder.py │ │ ├── entity_graph_builder.py │ │ ├── entity_relation_graph_builder.py │ │ ├── fact_graph_builder.py │ │ ├── graph_batch_client.py │ │ ├── graph_builder.py │ │ ├── graph_construction.py │ │ ├── graph_summary_builder.py │ │ ├── node_builder.py │ │ ├── node_builders.py │ │ ├── null_builder.py │ │ ├── source_graph_builder.py │ │ ├── source_node_builder.py │ │ ├── statement_graph_builder.py │ │ ├── statement_node_builder.py │ │ ├── topic_graph_builder.py │ │ ├── topic_node_builder.py │ │ ├── vector_batch_client.py │ │ └── vector_indexing.py │ ├── constants.py │ ├── extract │ │ ├── __init__.py │ │ ├── batch_config.py │ │ ├── batch_llm_proposition_extractor.py │ │ ├── batch_topic_extractor.py │ │ ├── docs_to_nodes.py │ │ ├── extraction_pipeline.py │ │ ├── file_system_tap.py │ │ ├── graph_scoped_value_store.py │ │ ├── id_rewriter.py │ │ ├── infer_classifications.py │ │ ├── infer_config.py │ │ ├── llm_proposition_extractor.py │ │ ├── pipeline_decorator.py │ │ ├── proposition_extractor.py │ │ ├── scoped_value_provider.py │ │ ├── source_doc_parser.py │ │ └── topic_extractor.py │ ├── id_generator.py │ ├── load │ │ ├── __init__.py │ │ ├── bedrock_knowledge_base.py │ │ ├── file_based_chunks.py │ │ ├── file_based_docs.py │ │ ├── json_array_reader.py │ │ ├── s3_based_chunks.py │ │ ├── s3_based_docs.py │ │ └── source_documents.py │ ├── model.py │ ├── node_handler.py │ ├── prompts.py │ └── utils │ │ ├── __init__.py │ │ ├── batch_inference_utils.py │ │ ├── metadata_utils.py │ │ ├── pipeline_utils.py │ │ └── topic_utils.py │ ├── lexical_graph_index.py │ ├── lexical_graph_query_engine.py │ ├── logging.py │ ├── metadata.py │ ├── prompts │ ├── __init__.py │ ├── bedrock_prompt_provider.py │ ├── file_prompt_provider.py │ ├── prompt_provider_base.py │ ├── prompt_provider_config.py │ ├── prompt_provider_config_base.py │ ├── prompt_provider_factory.py │ ├── prompt_provider_registry.py │ ├── s3_prompt_provider.py │ └── static_prompt_provider.py │ ├── protocols │ ├── __init__.py │ └── mcp_server.py │ ├── requirements.txt │ ├── retrieval │ ├── __init__.py │ ├── model.py │ ├── post_processors │ │ ├── __init__.py │ │ ├── bedrock_context_format.py │ │ ├── bge_reranker.py │ │ ├── enrich_source_details.py │ │ ├── reranker_mixin.py │ │ ├── sentence_reranker.py │ │ ├── statement_diversity.py │ │ └── statement_enhancement.py │ ├── processors │ │ ├── __init__.py │ │ ├── clear_chunks.py │ │ ├── clear_scores.py │ │ ├── dedup_results.py │ │ ├── disaggregate_results.py │ │ ├── filter_by_metadata.py │ │ ├── format_sources.py │ │ ├── populate_statement_strs.py │ │ ├── processor_args.py │ │ ├── processor_base.py │ │ ├── prune_results.py │ │ ├── prune_statements.py │ │ ├── rerank_statements.py │ │ ├── rescore_results.py │ │ ├── simplify_single_topic_results.py │ │ ├── sort_results.py │ │ ├── statements_to_strings.py │ │ ├── truncate_results.py │ │ ├── truncate_statements.py │ │ └── zero_scores.py │ ├── prompts.py │ ├── retrievers │ │ ├── __init__.py │ │ ├── chunk_based_search.py │ │ ├── composite_traversal_based_retriever.py │ │ ├── entity_based_search.py │ │ ├── entity_context_search.py │ │ ├── keyword_entity_search.py │ │ ├── keyword_ranking_search.py │ │ ├── rerank_beam_search.py │ │ ├── semantic_beam_search.py │ │ ├── semantic_guided_base_retriever.py │ │ ├── semantic_guided_retriever.py │ │ ├── statement_cosine_seach.py │ │ ├── topic_based_search.py │ │ └── traversal_based_base_retriever.py │ ├── summary │ │ ├── __init__.py │ │ └── graph_summary.py │ └── utils │ │ ├── __init__.py │ │ ├── query_decomposition.py │ │ ├── statement_utils.py │ │ └── vector_utils.py │ ├── storage │ ├── __init__.py │ ├── constants.py │ ├── graph │ │ ├── __init__.py │ │ ├── dummy_graph_store.py │ │ ├── graph_store.py │ │ ├── graph_store_factory_method.py │ │ ├── graph_utils.py │ │ ├── multi_tenant_graph_store.py │ │ └── neptune_graph_stores.py │ ├── graph_store_factory.py │ ├── vector │ │ ├── __init__.py │ │ ├── dummy_vector_index.py │ │ ├── multi_tenant_vector_store.py │ │ ├── neptune_vector_indexes.py │ │ ├── opensearch_vector_index_factory.py │ │ ├── opensearch_vector_indexes.py │ │ ├── pg_vector_index_factory.py │ │ ├── pg_vector_indexes.py │ │ ├── read_only_vector_store.py │ │ ├── vector_index.py │ │ ├── vector_index_factory_method.py │ │ └── vector_store.py │ └── vector_store_factory.py │ ├── tenant_id.py │ └── utils │ ├── __init__.py │ ├── bedrock_utils.py │ ├── fm_observability.py │ ├── io_utils.py │ └── llm_cache.py └── security.md /.gitignore: -------------------------------------------------------------------------------- 1 | /examples/lexical-graph/notebooks/extracted/ 2 | /examples/lexical-graph/notebooks/output/ 3 | /.venv/ 4 | /examples/lexical-graph-local-dev/notebooks/output/ 5 | /docs/lexical-graph/.idea/ 6 | /examples/lexical-graph-hybrid-dev/notebooks/output/ 7 | /examples/lexical-graph-hybrid-dev/notebooks/extracted/ 8 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## GraphRAG Toolkit 2 | 3 | The graphrag-toolkit is a collection of Python tools for building graph-enhanced Generative AI applications. 4 | 5 | > **4 June 2025** Release 3.8.0 includes a separate BYOKG-RAG package, which allows users to bring their own knowledge graph and perform complex question answering over it. 6 | 7 | > **28 May 2025** Release 3.7.0 includes an MCP server that dynamically generates tools and tool descriptions (one per tenant in a multi-tenant graph). 8 | 9 | Installation instructions and requirements are detailed separately with each tool. 10 | 11 | ### Lexical Graph 12 | 13 | The [lexical-graph](./lexical-graph/) provides a framework for automating the construction of a [hierarchical lexical graph](./docs/lexical-graph/graph-model.md) from unstructured data, and composing question-answering strategies that query this graph when answering user questions. 14 | 15 | ### BYOKG-RAG 16 | 17 | [BYOKG-RAG](./byokg-rag/) is a novel approach to Knowledge Graph Question Answering (KGQA) that combines the power of Large Language Models (LLMs) with structured knowledge graphs. The system allows users to bring their own knowledge graph and perform complex question answering over it. 18 | 19 | ## Security 20 | 21 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 22 | 23 | ## License 24 | 25 | This project is licensed under the Apache-2.0 License. 26 | 27 | -------------------------------------------------------------------------------- /byokg-rag/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | dist/ 3 | build/ 4 | *.egg-info/ 5 | __pycache__/ 6 | *.DS_Store -------------------------------------------------------------------------------- /byokg-rag/README.md: -------------------------------------------------------------------------------- 1 | # BYOKG-RAG: Bring Your Own Knowledge Graph for Retrieval Augmented Generation 2 | 3 | ![BYOKG-RAG Architecture](../images/byokg_rag.png) 4 | 5 | BYOKG-RAG is a novel approach to Knowledge Graph Question Answering (KGQA) that combines the power of Large Language Models (LLMs) with structured knowledge graphs. The system allows users to bring their own knowledge graph and perform complex question answering over it. 6 | 7 | ## Key Features 🔑 8 | 9 | - **Multi-strategy Retrieval**: Combines multiple retrieval strategies: 10 | - Agentic retrieval for dynamic graph exploration 11 | - Scoring-based retrieval for relevance ranking 12 | - Path-based retrieval for multi-hop reasoning 13 | - Query-based retrieval for direct graph queries 14 | - **LLM-powered Reasoning**: Leverages state-of-the-art LLMs for question understanding and answer generation 15 | 16 | ## System Components ⚙️ 17 | 18 | 1. **Graph Store** ([src/graphrag_toolkit/byokg_rag/graphstore](src/graphrag_toolkit/byokg_rag/graphstore)) 19 | - Manages the knowledge graph data structure 20 | - Provides interfaces for graph traversal and querying 21 | 22 | 2. **KG Linker** ([src/graphrag_toolkit/byokg_rag/graph_connectors](src/graphrag_toolkit/byokg_rag/graph_connectors)) 23 | - Links natural language queries to graph entities and paths 24 | - Uses LLMs to understand question intent 25 | - Extracts relevant entities and relationship patterns 26 | 27 | 3. **Entity Linker** ([src/graphrag_toolkit/byokg_rag/graph_retrievers](src/graphrag_toolkit/byokg_rag/graph_retrievers)) 28 | - Matches entities from text to graph nodes 29 | - Handles variations in entity names 30 | - Uses fuzzy string matching for robust entity resolution 31 | 32 | 4. **Triplet Retriever** ([src/graphrag_toolkit/byokg_rag/graph_retrievers](src/graphrag_toolkit/byokg_rag/graph_retrievers)) 33 | - Retrieves relevant triplets from the graph 34 | - Navigates the graph starting from linked entities 35 | - Verbalizes triplets in natural language format 36 | 37 | 5. **Path Retriever** ([src/graphrag_toolkit/byokg_rag/graph_retrievers](src/graphrag_toolkit/byokg_rag/graph_retrievers)) 38 | - Finds paths between entities in the graph 39 | - Follows metapath patterns for structured traversal 40 | - Connects entities through intermediate nodes 41 | 42 | 6. **Query Engine** ([src/graphrag_toolkit/byokg_rag/byokg_query_engine.py](src/graphrag_toolkit/byokg_rag/byokg_query_engine.py)) 43 | - Orchestrates all components 44 | - Processes natural language questions 45 | - Generates answers based on retrieved information 46 | 47 | ## Performance 📈 48 | 49 | Our results show that BYOKG-RAG outperforms existing approaches across multiple knowledge graph benchmarks: 50 | 51 | | KGQA Hit (%) | Wiki-KG | Temp-KG | Med-KG | 52 | |--------------|---------|---------|--------| 53 | | Agent | 77.8 | 57.3 | 59.2 | 54 | | BYOKG-RAG | 80.1 | 65.5 | 65.0 | 55 | 56 | *Note: Full paper with detailed methodology and results coming soon!* 📄 57 | 58 | ## Getting Started 🚀 59 | 60 | The byokg-rag toolkit requires Python and [pip](http://www.pip-installer.org/en/latest/) to install. You can install the byokg-rag using pip: 61 | 62 | 1. Install dependencies: 63 | ```bash 64 | pip install . 65 | ``` 66 | or 67 | ``` 68 | pip install https://github.com/awslabs/graphrag-toolkit/archive/refs/tags/v3.8.1.zip#subdirectory=byokg-rag 69 | ``` 70 | (The version number will vary based on the latest GitHub release) 71 | 72 | 2. Run the demo notebook ([byokg_rag_demo.ipynb](../examples/byokg-rag/byokg_rag_demo.ipynb)): 73 | ``` 74 | graphrag-toolkit/examples/byokg-rag/byokg_rag_demo.ipynb 75 | ``` 76 | 77 | ## Citation 📚 78 | 79 | *Arxiv paper and citation coming soon!* 80 | 81 | ``` 82 | @misc{byokg-rag-2025, 83 | author = {Mavromatis, Costas and Adeshina, Soji and Ioannidis, Vassilis N. and Han, Zhen and Zhu, Qi and Robinson, Ian and Thompson, Bryan and Rangwala, Huzefa and Karypis, George}, 84 | title = {{BYOKG-RAG}: Multi-Strategy Graph Retrieval for Knowledge Graph Question Answering}, 85 | url = {https://github.com/awslabs/graphrag-toolkit}, 86 | year = {2025} 87 | } 88 | ``` 89 | 90 | ## License ⚖️ 91 | 92 | This project is licensed under the Apache-2.0 License. -------------------------------------------------------------------------------- /byokg-rag/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling", "hatch-requirements-txt"] 3 | build-backend = "hatchling.build" 4 | 5 | [tool.hatch.build.targets.wheel] 6 | packages = ["src/graphrag_toolkit"] 7 | 8 | [project] 9 | name = "graphrag-toolkit-byokg-rag" 10 | version = "0.0.1" 11 | description = "AWS GraphRAG Toolkit, BYOKG RAG" 12 | readme = "README.md" 13 | requires-python = ">=3.10" 14 | dynamic = ["dependencies"] 15 | license = "Apache-2.0" 16 | 17 | [tool.hatch.metadata.hooks.requirements_txt] 18 | files = ["src/graphrag_toolkit/byokg_rag/requirements.txt"] -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/byokg_rag/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/byokg_rag/graph_connectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .kg_linker import * 2 | -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/byokg_rag/graph_connectors/prompts/kg_linker_prompt.yaml: -------------------------------------------------------------------------------- 1 | kg-linker-prompt: 2 | system-prompt: '''You are a highly skilled Graph Query Language interpreter specializing in translating natural language questions into precise graph queries. With expertise in graph database structures, your role is to respond to user tasks, which may involve the following: 3 | - Analyze natural language questions thoroughly 4 | - Identify relevant entities within the question context 5 | - Determine appropriate relationship paths within the graph schema 6 | - Generate executable graph queries that accurately represent the user intent 7 | 8 | Your responses must be technically accurate, follow the exact format requested, and only use relationship types and properties that exist in the provided schema. 9 | You may also be provided with additional graph context, which you should utilize to tackle any related tasks. 10 | ''' 11 | user-prompt: '''Given a question, schema, and optional graph context, your role is to perform the following tasks: 12 | 13 | {{task_prompts}} 14 | 15 | ### Important Instructions: 16 | - Respond ONLY to the requested tasks with proper tags 17 | - Do not provide explanations outside of the tagged sections 18 | - Use the exact relationship types and property names from the schema 19 | - Ensure any generated graph query (if any) is syntactically correct and executable 20 | - **When graph context is provided, thoroughly leverage it to improve all aspects of your response** 21 | 22 | Now, please analyze the following: 23 | 24 | Question: {question} 25 | Schema: {schema} 26 | Graph Context: {graph_context} 27 | ''' 28 | -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/byokg_rag/graph_retrievers/__init__.py: -------------------------------------------------------------------------------- 1 | from .graph_retrievers import * 2 | from .graph_reranker import * 3 | from .graph_traversal import * 4 | from .graph_verbalizer import * 5 | from .entity_linker import * -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/byokg_rag/graph_retrievers/graph_reranker.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import numpy as np 3 | import torch 4 | 5 | class GReranker(ABC): 6 | """ 7 | Abstract base class for GraphRAG reranker. 8 | """ 9 | 10 | def __init__(self): 11 | """ 12 | Initialize the graph reranker. 13 | """ 14 | 15 | @abstractmethod 16 | def rerank_input_with_query(self, query, input, topk=None): 17 | """ 18 | Rerank the given input based on the query. 19 | 20 | Args: 21 | query (str): The query string. 22 | node text (list): List of node text to be reranked. 23 | 24 | Returns: 25 | NotImplementedError: If not implemented by child class. 26 | """ 27 | raise NotImplementedError("Method rerank_input_with_query must be implemented") 28 | 29 | class LocalGReranker(GReranker): 30 | """ 31 | Local reranker on single machine with BGE-reranker-base models. 32 | """ 33 | def __init__(self, model_name="BAAI/bge-reranker-base", topk=10, device="cuda"): 34 | assert model_name in ["BAAI/bge-reranker-base", "BAAI/bge-reranker-large", "BAAI/bge-reranker-v2-m3"], "Model name not supported" 35 | self.model_name = model_name 36 | from transformers import AutoModelForSequenceClassification, AutoTokenizer 37 | 38 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) 39 | self.reranker = AutoModelForSequenceClassification.from_pretrained(model_name) 40 | self.reranker = self.reranker.to(device) 41 | self.reranker.eval() 42 | 43 | self.topk = topk 44 | 45 | 46 | def calculate_score(self, pairs): 47 | """ 48 | Calculate the score for the given pairs (query, text) 49 | """ 50 | if self.model_name in ["BAAI/bge-reranker-base", "BAAI/bge-reranker-large", "BAAI/bge-reranker-v2-m3"]: 51 | with torch.no_grad(): 52 | inputs = self.tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512) 53 | inputs = inputs.to(self.reranker.device) 54 | scores = self.reranker(**inputs, return_dict=True).logits.view(-1, ).float() 55 | return scores 56 | else: 57 | raise NotImplementedError 58 | 59 | def filter_topk(self, query, input, topk=10, return_scores=False): 60 | """ 61 | Filter the top-k input based on the reranker score. 62 | """ 63 | if isinstance(query, str): 64 | pairs = [[query, x] for x in input] 65 | else: 66 | pairs = [[x,y] for x,y in zip(query, input)] 67 | score = self.calculate_score(pairs) 68 | # convert to CPU 69 | score = score.cpu() 70 | np_score = -np.array(score) 71 | ids = np.argsort(np_score, kind="stable") 72 | 73 | if return_scores: 74 | return [input[x] for x in ids[:topk]], [score[x] for x in ids[:topk]], ids[:topk] 75 | else: 76 | return [input[x] for x in ids[:topk]], ids[:topk] 77 | 78 | def rerank_input_with_query(self, query, input, topk=None, return_scores=False): 79 | """ 80 | Rerank the given input based on the query. 81 | 82 | Args: 83 | query (str): The query string. 84 | input (list): List of input to be reranked. 85 | 86 | Returns: 87 | list: Reranked list of input. 88 | """ 89 | if not topk: 90 | topk = self.topk 91 | return self.filter_topk(query, input, topk=topk, return_scores=return_scores) 92 | -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/byokg_rag/graph_retrievers/prompts/agent_prompts.yaml: -------------------------------------------------------------------------------- 1 | relation_selection_prompt: ''' 2 | Your task is to select the most appropriate relations based on their relevance to a given question. 3 | 4 | ### Formatting Requirements: 5 | 1. Each selected relation and score MUST be on a new line 6 | 2. Do not use commas, semicolons, or any other delimiters between relations or scores 7 | 8 | 9 | Follow these steps: 10 | 1. Read the provided question carefully. 11 | 2. Analyze each relation in the list and determine its relevance to the question and relation. 12 | 3. Respond by selecting the most relevant relations within tags. Be both frugal on your selection and consider completeness. 13 | 4. The selected relations should be provided line-by-line. 14 | 15 | 16 | Example format: 17 | 18 | Name the president of the country whose main spoken language was English in 1980? 19 | 20 | 21 | 22 | English Language 23 | English 24 | 25 | 26 | 27 | language.human_language.main_country 28 | language.human_language.language_family 29 | language.human_language.iso_639_3_code 30 | base.rosetta.languoid.parent 31 | language.human_language.countries_spoken_in 32 | 33 | 34 | 35 | language.human_language.main_country 36 | base.rosetta.languoid.parent 37 | language.human_language.countries_spoken_in 38 | 39 | 40 | Explanation: language.human_language.main_country relation is highly relevant as it directly relates to the country whose president is being asked for, and the main country where English language is spoken in 1980. 41 | language.human_language.countries_spoken_in relation is also relevant as it provides information on the countries where English language is spoken, which could help narrow down the search for the president. 42 | base.rosetta.languoid.parent relation is less relevant but still provides some context on the language family 43 | 44 | Important Instructions: Always return at least one relation. 45 | Now it is your turn. 46 | 47 | 48 | {question} 49 | 50 | 51 | 52 | {entity} 53 | 54 | 55 | 56 | {relations} 57 | 58 | 59 | Remember to parse your response in tags: 60 | ''' 61 | 62 | 63 | 64 | entity_selection_prompt: ''' 65 | Given a question and the associated retrieved knowledge graph context (entity, relation, entity), you are asked to select the most important entities to explore in order to answer the question. 66 | Consider important entities only that are necessary for answering the question. Do not select entities, for which we already have all necessary information. 67 | 68 | - Format your response exactly as follows: 69 | 70 | relevant_entity1 71 | relevant_entity2 72 | ... 73 | 74 | 75 | The selected entities must be provided line-by-line (\n). 76 | 77 | Example format: 78 | Question: Name the president of the country whose main spoken language was English in 1980? 79 | Graph Context: English Language -> language.human_language.countries_spoken_in -> England | USA 80 | 81 | 82 | England 83 | USA 84 | 85 | 86 | 87 | The entites should be sorted from the most important to the least important. 88 | Important Instruction: If we can answer the question directly based on the provided graph context, respond with: 89 | 90 | FINISH 91 | 92 | 93 | - Now Respond ONLY to the requested tasks with proper tags 94 | 95 | Question: {question} 96 | Graph Context: {graph_context} 97 | ''' 98 | -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/byokg_rag/graphstore/__init__.py: -------------------------------------------------------------------------------- 1 | from .graphstore import * -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/byokg_rag/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | from .index import * 2 | from .fuzzy_string import * 3 | -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/byokg_rag/indexing/fuzzy_string.py: -------------------------------------------------------------------------------- 1 | from thefuzz import fuzz, process 2 | from abc import ABC, abstractmethod 3 | from typing import List 4 | from .index import Index 5 | 6 | 7 | class FuzzyStringIndex(Index): 8 | """ 9 | A class for fuzzy string matching and indexing. 10 | """ 11 | 12 | def __init__(self): 13 | super().__init__() # Ensure proper initialization of the base class. 14 | self.vocab = [] 15 | 16 | def reset(self): 17 | self.vocab = [] 18 | 19 | def query(self, input, topk=1, id_selector=None): 20 | """ 21 | match a query to items in the index and return the topk results 22 | 23 | :param input: str the query to match 24 | :param topk: number of items to return 25 | :param id_selector: a list of ids to retrieve the topk from i.e an allowlist 26 | :return: 27 | """ 28 | 29 | if id_selector is not None: 30 | raise NotImplementedError(f"id_selector not implemented for FuzzyString") 31 | 32 | # string matching process from thefuzz library https://pypi.org/project/thefuzz/ 33 | results = process.extract(input, self.vocab, limit=topk) 34 | 35 | return {'hits': [{'document_id': match_string, 36 | 'document': match_string, 37 | 'match_score': match_score} 38 | for match_string, match_score in results]} 39 | 40 | 41 | def match(self, inputs, topk=1, id_selector=None, max_len_difference=4): 42 | """ 43 | match entity inputs to vocab 44 | 45 | :param input: list(str) of entities per query to match 46 | :param topk: number of items to return 47 | :param id_selector: a list of ids to retrieve the topk from i.e an allowlist 48 | :return: 49 | """ 50 | 51 | if id_selector is not None: 52 | raise NotImplementedError(f"id_selector not implemented for {self.__class__.__name__}") 53 | 54 | results = [] 55 | for input in inputs: 56 | # string matching process from thefuzz library https://pypi.org/project/thefuzz/ 57 | intermediate_results = process.extract(input, self.vocab, limit=topk) 58 | #skip much shorter strings 59 | for interintermediate_result in intermediate_results: 60 | if len(interintermediate_result[0]) + max_len_difference < len(input): 61 | continue 62 | results.append(interintermediate_result) 63 | 64 | results = sorted(results, key=lambda x: x[1], reverse=True) 65 | 66 | return {'hits': [{'document_id': match_string, 67 | 'document': match_string, 68 | 'match_score': match_score} 69 | for match_string, match_score in results]} 70 | 71 | def add(self, vocab_list): 72 | """ 73 | add vocab instances to the index 74 | 75 | :param vocab_list: list of vocab instances to add 76 | 77 | """ 78 | self.vocab = list(set(self.vocab) | set(vocab_list)) 79 | 80 | def add_with_ids(self, ids, vocab_list): 81 | raise NotImplementedError(f"add_with_ids not implemented for {self.__class__.__name__}") -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/byokg_rag/indexing/index.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | class Index(ABC): 5 | """ 6 | Abstract base class for indexes 7 | """ 8 | def __init__(self): 9 | pass 10 | 11 | @abstractmethod 12 | def reset(self): 13 | """ 14 | reset the index to empty it contents without needed to create a new index object 15 | """ 16 | pass 17 | 18 | @abstractmethod 19 | def query(self, input, topk=1): 20 | """ 21 | match a query to items in the index and return the topk results 22 | 23 | :param query: str the query to match 24 | :param topk: number of items to return 25 | :return: 26 | """ 27 | pass 28 | 29 | @abstractmethod 30 | def add(self, documents): 31 | """ 32 | add documents to the index 33 | 34 | :param documents: list of documents to add 35 | 36 | """ 37 | pass 38 | 39 | def add_with_ids(self, ids, documents): 40 | """ 41 | add documents with their given ids to the index 42 | 43 | :param ids: list of documents to add 44 | :param documents: list of doument ids in same order as documents 45 | :return: 46 | """ 47 | pass 48 | 49 | def as_retriever(self): 50 | retriever = Retriever(index=self) 51 | return retriever 52 | 53 | def as_entity_matcher(self): 54 | entity_matcher = EntityMatcher(index=self) 55 | return entity_matcher 56 | 57 | class Retriever: 58 | """ 59 | Base class for Retriever. Given a set of queries, the retriever can process the input, query the index and potentially 60 | post process the output. 61 | """ 62 | 63 | def __init__(self, index): 64 | self.index = index 65 | 66 | @abstractmethod 67 | def retrieve(self, queries:List[str], topk=1, id_selectors = None, **kwargs): 68 | items = [] 69 | if isinstance(id_selectors, list): 70 | if all(isinstance(item, list) for item in id_selectors): 71 | # id selector only allows one query per time 72 | for query, id_selector in zip(queries, id_selectors): 73 | if len(id_selector) == 0: 74 | # if no id is selected skip retrieval 75 | items.append({"hits": []}) 76 | else: 77 | items.append(self.index.query(query, topk, id_selector, **kwargs)) 78 | else: 79 | raise ValueError("id_selectors must be a list of lists") 80 | else: 81 | for query in queries: 82 | items.append(self.index.query(query, topk, **kwargs)) 83 | return items 84 | 85 | class EntityMatcher(Retriever): 86 | """ 87 | Base class for entity matching. Given a set of extracted entities, the matcher returns the matched entities from vocab. 88 | """ 89 | @abstractmethod 90 | def retrieve(self, queries:List[str], **kwargs): 91 | return self.index.match(queries, **kwargs) -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/byokg_rag/llm/__init__.py: -------------------------------------------------------------------------------- 1 | from .bedrock_llms import * -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/byokg_rag/prompts/generation_prompts.yaml: -------------------------------------------------------------------------------- 1 | generate-response-qa: ''' 2 | ### Task: Question Answering 3 | Answer the question using your existing knowledge base or the external information provided in the graph context (if provided). 4 | 5 | You are allowed to perform chain-of-thought or thinking but the final answers shoud be in tags with the following instructions: 6 | - Provide only direct entity answers that specifically address the question 7 | - Each answer should be a distinct, well-defined entity (person, place, organization, concept, etc.) 8 | - List multiple answers if appropriate, with each answer on a separate line 9 | - Do not include explanations, reasoning, context, or commentary of any kind 10 | - Do not preface or conclude your answer with statements like "Based on my knowledge..." or "The answers are..." 11 | - **If graph context is provided, prioritize answers that can be derived from the context over general knowledge** 12 | - Format your response exactly as follows, where answers are separated by newlines: 13 | 14 | 15 | answer1 16 | answer2 17 | ... 18 | 19 | 20 | If the answer cannot be directly determined by the provided graph context, use your own knowldge. 21 | Try to always output an answer. 22 | 23 | Now, please answer the following: 24 | 25 | Question: {question} 26 | Graph Context: {graph_context} 27 | ''' -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/byokg_rag/requirements.txt: -------------------------------------------------------------------------------- 1 | pydantic>=2.8.2 2 | boto3 3 | xmltodict 4 | colorama 5 | pyyaml 6 | pytest 7 | faiss-cpu==1.9.0 8 | thefuzz 9 | langchain_huggingface 10 | ipykernel 11 | torch 12 | transformers>=4.44.2 13 | numpy>=1.26.4 14 | scipy>=1.15.3 15 | -------------------------------------------------------------------------------- /byokg-rag/src/graphrag_toolkit/byokg_rag/utils.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os.path as osp 3 | from colorama import Fore, Style 4 | import re 5 | import string 6 | 7 | 8 | def load_yaml(file_path): 9 | file_path = file_path if file_path.startswith('/') else osp.join(osp.dirname(osp.abspath(__file__)), file_path) 10 | with open(file_path, 'r') as file: 11 | content = yaml.safe_load(file) 12 | return content 13 | 14 | def color_print(text, color): 15 | print(getattr(Fore, color.upper()) + Style.BRIGHT + text + Style.RESET_ALL) 16 | 17 | def parse_response(response, pattern): 18 | 19 | if not isinstance(response, str): 20 | return [] 21 | 22 | match = re.search(pattern, response, flags=re.DOTALL) 23 | matched = [] 24 | if match: 25 | graph_text = match.group(1) 26 | for to_match in graph_text.strip().split('\n'): 27 | if to_match != "": 28 | matched.append(to_match) 29 | 30 | return matched 31 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ## Documentation 2 | 3 | - [Lexical Graph](./lexical-graph/) 4 | 5 | -------------------------------------------------------------------------------- /docs/lexical-graph/README.md: -------------------------------------------------------------------------------- 1 | ## Lexical Graph 2 | 3 | - [Overview](./overview.md) 4 | - [Storage Model](./storage-model.md) 5 | - [Indexing](./indexing.md) 6 | - [Batch Extraction](./batch-extraction.md) 7 | - [Querying](./querying.md) 8 | - [Multi-Tenancy](./multi-tenancy.md) 9 | - [Configuration](./configuration.md) 10 | - [Graph Model](./graph-model.md) 11 | - [Security](./security.md) 12 | - [Hybrid Deployment](./hybrid-deployment.md) 13 | - [FAQ](./faq.md) 14 | 15 | #### Code examples 16 | 17 | The code examples throughout the documentation are formatted to run in a Jupyter notebook. If you’re building an application with a main entry point, put your application logic inside a method, and add an [`if __name__ == '__main__'` block](./faq.md#runtimeerror-please-use-nest_asyncioapply-to-allow-nested-event-loops). 18 | -------------------------------------------------------------------------------- /docs/lexical-graph/aws-profile.md: -------------------------------------------------------------------------------- 1 | # Using AWS Profiles in `GraphRAGConfig` 2 | 3 | This guide explains how to configure and use **AWS named profiles** in the lexical-graph by leveraging the `GraphRAGConfig` class. 4 | 5 | ## What is an AWS Profile? 6 | 7 | AWS CLI and SDKs allow the use of named profiles to manage different sets of credentials. Each profile typically contains: 8 | - Access key ID 9 | - Secret access key 10 | - (Optional) Session token 11 | - (Optional) Default region 12 | 13 | These profiles are stored in: 14 | - `~/.aws/credentials` 15 | - `~/.aws/config` 16 | 17 | --- 18 | 19 | ## How `GraphRAGConfig` Uses AWS Profiles 20 | 21 | ### 1. **Automatic Detection** 22 | If no profile is explicitly provided, `GraphRAGConfig` attempts to use: 23 | ```python 24 | os.environ.get("AWS_PROFILE") 25 | ``` 26 | 27 | If that’s not set, it will fall back to the default AWS behavior. 28 | 29 | --- 30 | 31 | ### 2. **Explicit Profile Setting** 32 | 33 | You can programmatically set a profile: 34 | 35 | ```python 36 | from graphrag_toolkit.config import GraphRAGConfig 37 | 38 | GraphRAGConfig.aws_profile = "padmin" 39 | ``` 40 | 41 | This automatically resets any previously cached clients or sessions to ensure all AWS service interactions use the new credentials. 42 | 43 | --- 44 | 45 | ### 3. **Where Profiles are Used** 46 | 47 | When you call: 48 | 49 | ```python 50 | GraphRAGConfig.session 51 | ``` 52 | 53 | or use properties like: 54 | 55 | ```python 56 | GraphRAGConfig.bedrock 57 | GraphRAGConfig.s3 58 | GraphRAGConfig.rds 59 | ``` 60 | 61 | the SDK creates the respective clients using the active profile and region. 62 | 63 | --- 64 | 65 | ## Example with Environment Variables 66 | 67 | You can export the profile and region before running your app: 68 | 69 | ```bash 70 | export AWS_PROFILE=padmin 71 | export AWS_REGION=us-east-1 72 | python my_app.py 73 | ``` 74 | 75 | Or set them inline: 76 | 77 | ```bash 78 | AWS_PROFILE=padmin AWS_REGION=us-east-1 python my_app.py 79 | ``` 80 | 81 | --- 82 | 83 | ## Profile-Based Multi-Account Testing 84 | 85 | To test across AWS accounts: 86 | ```python 87 | GraphRAGConfig.aws_profile = "dev-profile" 88 | GraphRAGConfig.aws_region = "us-west-2" 89 | 90 | bedrock = GraphRAGConfig.bedrock # Will use dev-profile in us-west-2 91 | ``` 92 | 93 | --- 94 | 95 | ## Common Pitfalls 96 | 97 | - **Missing Profile**: Ensure the profile exists in `~/.aws/credentials` and is not misspelled. 98 | - **Access Denied**: Check IAM permissions for the services you're trying to access. 99 | - **Region mismatch**: Bedrock may only be available in specific regions (e.g., `us-east-1`). 100 | 101 | --- 102 | 103 | ## Summary 104 | 105 | | Use Case | How to Do It | 106 | |-----------------------------|------------------------------------------------------------| 107 | | Default profile | Rely on environment variables or default config | 108 | | Programmatic override | `GraphRAGConfig.aws_profile = "my-profile"` | 109 | | Switch regions | `GraphRAGConfig.aws_region = "us-east-2"` | 110 | | Full override | Set both profile and region before invoking `.session` | 111 | | Create boto3 clients | Use `.bedrock`, `.s3`, or `.rds` properties | -------------------------------------------------------------------------------- /docs/lexical-graph/graph-store-falkor-db.md: -------------------------------------------------------------------------------- 1 | [[Home](./)] 2 | 3 | ## FalkorDB as a Graph Store 4 | 5 | ### Topics 6 | 7 | - [Overview](#overview) 8 | - [Install package](#install-package) 9 | - [Registering FalkorDB as a graph store](#registering-falkordb-as-a-graph-store) 10 | - [Creating a FalkorDB graph store](#creating-a-falkordb-graph-store) 11 | 12 | ### Overview 13 | 14 | You can use FalkorDB as a graph store. 15 | 16 | ### Install package 17 | 18 | The FalkorDB graph store is contained in a separate contributor package. To install it: 19 | 20 | ``` 21 | !pip install https://github.com/awslabs/graphrag-toolkit/archive/refs/tags/v3.8.1.zip#subdirectory=lexical-graph-contrib/falkordb 22 | ``` 23 | 24 | ### Registering FalkorDB as a graph store 25 | 26 | Before creating a FalkorDB graph store, you must register the `FalkorDBGraphStoreFactory` with the `GraphStoreFactory`: 27 | 28 | ```python 29 | from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory 30 | from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory 31 | 32 | GraphStoreFactory.register(FalkorDBGraphStoreFactory) 33 | 34 | ``` 35 | 36 | ### Creating a FalkorDB graph store 37 | 38 | You can use the `GraphStoreFactory.for_graph_store()` static factory method to create an instance of a FalkorDB graph store. 39 | 40 | The FalkorDB graph store currently supports the [SemanticGuidedRetriever](./querying.md#semanticguidedretriever). It does not support the [TraversalBasedRetriever](./querying.md#traversalbasedretriever). 41 | 42 | To create a [FalkorDB Cloud](https://app.falkordb.cloud/) graph store, supply a connection string that begins `falkordb://`, followed by the FalkorDB endpoint: 43 | 44 | ```python 45 | from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory 46 | from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory 47 | 48 | falkordb_connection_info = 'falkordb://your-falkordb-endpoint' 49 | 50 | GraphStoreFactory.register(FalkorDBGraphStoreFactory) 51 | 52 | graph_store = GraphStoreFactory.for_graph_store(falkordb_connection_info) 53 | ``` 54 | 55 | You may also need to pass a username and password, and specify whether or not to use SSL: 56 | 57 | ```python 58 | from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory 59 | 60 | falkordb_connection_info = 'falkordb://' 61 | 62 | graph_store = GraphStoreFactory.for_graph_store( 63 | falkordb_connection_info, 64 | username='', 65 | password='', 66 | ssl=True 67 | ) 68 | ``` 69 | 70 | To create a local FalkorDB graph store, supply a connection string that has only `falkordb://`; 71 | 72 | ```python 73 | from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory 74 | 75 | falkordb_connection_info = 'falkordb://' 76 | 77 | graph_store = GraphStoreFactory.for_graph_store(falkordb_connection_info) 78 | ``` 79 | 80 | -------------------------------------------------------------------------------- /docs/lexical-graph/graph-store-neptune-analytics.md: -------------------------------------------------------------------------------- 1 | [[Home](./)] 2 | 3 | ## Neptune Analytics as a Graph Store 4 | 5 | ### Topics 6 | 7 | - [Overview](#overview) 8 | - [Creating a Neptune Analytics graph store](#creating-a-neptune-analytics-graph-store) 9 | 10 | ### Overview 11 | 12 | You can use Amazon Neptune Analytics as a graph store. 13 | 14 | ### Creating a Neptune Analytics graph store 15 | 16 | Use the `GraphStoreFactory.for_graph_store()` static factory method to create an instance of a Neptune Analytics graph store. 17 | 18 | To create a Neptune Analytics graph store, supply a connection string that begins `neptune-graph://`, followed by the graph's identifier: 19 | 20 | ``` 21 | from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory 22 | 23 | neptune_connection_info = 'neptune-graph://g-jbzzaqb209' 24 | 25 | graph_store = GraphStoreFactory.for_graph_store(neptune_connection_info) 26 | ``` 27 | 28 | -------------------------------------------------------------------------------- /docs/lexical-graph/graph-store-neptune-db.md: -------------------------------------------------------------------------------- 1 | [[Home](./)] 2 | 3 | ## Neptune Database as a Graph Store 4 | 5 | ### Topics 6 | 7 | - [Overview](#overview) 8 | - [Creating a Neptune Database graph store](#creating-a-neptune-database-graph-store) 9 | - [Connecting to Neptune via a proxy](#connecting-to-neptune-via-a-proxy) 10 | 11 | ### Overview 12 | 13 | You can use Amazon Neptune Database as a graph store. The lexical-graph requires [Neptune engine version](https://docs.aws.amazon.com/neptune/latest/userguide/engine-releases.html) 1.4.1.0 or later. 14 | 15 | ### Creating a Neptune Database graph store 16 | 17 | Use the `GraphStoreFactory.for_graph_store()` static factory method to create an instance of a Neptune Database graph store. 18 | 19 | To create a Neptune Database graph store (engine version 1.4.1.0 or later), supply a connection string that begins `neptune-db://`, followed by an [endpoint](https://docs.aws.amazon.com/neptune/latest/userguide/feature-overview-endpoints.html): 20 | 21 | ```python 22 | from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory 23 | 24 | neptune_connection_info = 'neptune-db://mydbcluster.cluster-123456789012.us-east-1.neptune.amazonaws.com:8182' 25 | 26 | graph_store = GraphStoreFactory.for_graph_store(neptune_connection_info) 27 | ``` 28 | 29 | #### Connecting to Neptune via a proxy 30 | 31 | To connect to Neptune via a proxy (e.g. a load balancer), you must supply a config dictionary to the `GraphStoreFactory.for_graph_store()` factory method, with a `proxies` dictionary of proxy servers to use by protocol or endpoint: 32 | 33 | ```python 34 | from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory 35 | 36 | neptune_connection_info = 'neptune-db://mydbcluster.cluster-123456789012.us-east-1.neptune.amazonaws.com:8182' 37 | 38 | config = { 39 | 'proxies': { 40 | 'http': 'http://proxy-hostname:80' 41 | } 42 | } 43 | 44 | graph_store = GraphStoreFactory.for_graph_store( 45 | neptune_connection_info, 46 | config=config 47 | ) 48 | ``` 49 | -------------------------------------------------------------------------------- /docs/lexical-graph/hybrid-deployment.md: -------------------------------------------------------------------------------- 1 | 2 | [[Home](./)] 3 | 4 | ## Hybrid Deployment 5 | 6 | ### Topics 7 | 8 | - [Overview](#overview) 9 | - [Stores and model providers](#stores-and-model-providers) 10 | - [Indexing and querying](#indexing-and-querying) 11 | - [Indexing](#indexing) 12 | 13 | ### Overview 14 | 15 | Hybrid deployment enables flexible deployment: high-throughput LLM inference via SageMaker and Bedrock, and cost-effective local development using containerized graph/vector stores. 16 | 17 | ### Stores and model providers 18 | 19 | The `lexical-graph` library depends on three backend systems: a [*graph store*](./storage-model.md#graph-store), a [*vector store*](./storage-model.md#vector-store), and a *foundation model provider*. The graph store enables storage and querying of a lexical graph built from unstructured, text-based sources. The vector store contains one or more indexes with embeddings for selected graph elements, which help identify starting points for graph queries. The foundation model provider hosts the Large Language Models (LLMs) used for extraction and embedding. 20 | 21 | The library provides built-in support for: 22 | 23 | * Graph stores: [Amazon Neptune Database](https://docs.aws.amazon.com/neptune/latest/userguide/intro.html), [Amazon Neptune Analytics](https://docs.aws.amazon.com/neptune-analytics/latest/userguide/what-is-neptune-analytics.html), and local [FalkorDB](https://falkordb.com/) (via Docker) 24 | * Vector stores: [Amazon OpenSearch Serverless](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless.html), [PostgreSQL with `pgvector`](https://github.com/pgvector/pgvector), Neptune Analytics, and local [PostgreSQL with `pgvector`](https://github.com/pgvector/pgvector) 25 | * Foundation model provider: [Amazon Bedrock](https://aws.amazon.com/bedrock/) 26 | 27 | This hybrid configuration enables flexible deployment: high-throughput LLM inference via SageMaker and Bedrock, and cost-effective local development using containerized graph/vector stores. 28 | 29 | ### Indexing and querying 30 | 31 | The lexical-graph library implements two high-level processes: [_indexing_](./indexing.md) and [_querying_](./querying.md). The indexing process ingests and extracts information from unstuctured, text-based source documents and then builds a graph and accompanying vector indexes. The query process retrieves content from the graph and vector indexes, and then supplies this content as context to an LLM to answer a user question. 32 | 33 | #### Indexing 34 | 35 | Indexing is split into two pipeline stages: **Extract** and **Build**. 36 | 37 | The **Extract** stage runs **locally using Docker**: 38 | 39 | * Loads and chunks documents 40 | * Performs two LLM-based extraction steps: 41 | 42 | * *Proposition extraction*: Converts chunked text into well-formed statements 43 | * *Topic/entity/fact extraction*: Identifies relations and concepts 44 | * Stores the extracted results in an **AWS S3 bucket**, serving as the transport medium between stages 45 | 46 | The **Build** stage remains unchanged. 47 | 48 | ![Indexing](../../images/hybrid-extract-and-build.png) -------------------------------------------------------------------------------- /docs/lexical-graph/multi-tenancy.md: -------------------------------------------------------------------------------- 1 | [[Home](./)] 2 | 3 | ## Multi-Tenancy 4 | 5 | ### Topics 6 | 7 | - [Overview](#overview) 8 | - [Tenant Id](#tenant-id) 9 | - [Indexing and multi-tenancy](#indexing-and-multi-tenancy) 10 | - [Querying and multi-tenancy](#querying-and-multi-tenancy) 11 | - [Implementation details](#implementation-details) 12 | 13 | ### Overview 14 | 15 | Multi-tenancy allows you to host multiple separate lexical graphs in the same underlying graph and vector stores. 16 | 17 | ### Tenant Id 18 | 19 | To use the multi-tenancy feature, you must supply a tenant id when creating a `LexicalGraphIndex` or `LexicalGraphQueryEngine`. A tenant id is a string containing 1-10 lower case characters and numbers. If you don't supply a tenant id, the index and query engine will use the _default tenant_ (i.e. a tenant id value of `None`). 20 | 21 | ### Indexing and multi-tenancy 22 | 23 | The following example creates a `LexicalGraphIndex` for tenant 'user123': 24 | 25 | ```python 26 | from graphrag_toolkit.lexical_graph import LexicalGraphIndex 27 | 28 | graph_store = ... 29 | vector_store = ... 30 | 31 | graph_index = LexicalGraphIndex( 32 | graph_store, 33 | vector_store, 34 | tenant_id='user123' 35 | ) 36 | ``` 37 | 38 | The `LexicalGraphIndex` always uses the _default tenant_ for the [extract stage](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/indexing.md#extract), even if you supply a different tenant id. The [build stage](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/indexing.md#build), however, will use the tenant id. The reason for this is so that you can extract once, and then build many times, potentially for different tenants. 39 | 40 | ### Querying and multi-tenancy 41 | 42 | The following example creates a `LexicalGraphQueryEngine` for tenant 'user123': 43 | 44 | ```python 45 | from graphrag_toolkit.lexical_graph import LexicalGraphQueryEngine 46 | 47 | graph_store = ... 48 | vector_store = ... 49 | 50 | query_engine = LexicalGraphQueryEngine.for_traversal_based_search( 51 | graph_store, 52 | vector_store, 53 | tenant_id='user123' 54 | ) 55 | ``` 56 | 57 | If a lexical graph does not exist for the specified tenant id, the underlying retrievers will return an empty set of results. 58 | 59 | ### Implementation details 60 | 61 | Multi-tenancy works by using tenant-specific node labels for nodes in the graph, and tenant-specific indexes in the vector store. For example, chunk nodes in a graph belonging to tenant 'user123' will be labelled `__Chunk__user123__`, while the chunk vector index will be named `chunk_user123`. 62 | 63 | Not every graph and vector store necessarily supports multi-tenancy. Neptune Analytics, when used as a vector store, for example, does not currently support multi-tenancy. -------------------------------------------------------------------------------- /docs/lexical-graph/prompts.md: -------------------------------------------------------------------------------- 1 | 2 | ## Using Custom Prompt Providers 3 | 4 | The GraphRAG Toolkit supports pluggable prompt providers to allow dynamic loading of prompt templates from various sources. There are four built-in providers: 5 | 6 | ### 1. StaticPromptProvider 7 | 8 | Use this when your system and user prompts are defined as constants in your codebase. 9 | 10 | ```python 11 | from graphrag_toolkit.lexical_graph.prompts.static_prompt_provider import StaticPromptProvider 12 | 13 | prompt_provider = StaticPromptProvider() 14 | ``` 15 | 16 | This provider uses the predefined constants `ANSWER_QUESTION_SYSTEM_PROMPT` and `ANSWER_QUESTION_USER_PROMPT`. 17 | 18 | --- 19 | 20 | ### 2. FilePromptProvider 21 | 22 | Use this when your prompts are stored locally on disk. 23 | 24 | ```python 25 | from graphrag_toolkit.lexical_graph.prompts.file_prompt_provider import FilePromptProvider 26 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import FilePromptProviderConfig 27 | 28 | prompt_provider = FilePromptProvider( 29 | FilePromptProviderConfig(base_path="./prompts"), 30 | system_prompt_file="system.txt", 31 | user_prompt_file="user.txt" 32 | ) 33 | ``` 34 | 35 | The prompt files are read from a directory (`base_path`), and you can override the file names if needed. 36 | 37 | --- 38 | 39 | ### 3. S3PromptProvider 40 | 41 | Use this when your prompts are stored in an Amazon S3 bucket. 42 | 43 | ```python 44 | from graphrag_toolkit.lexical_graph.prompts.s3_prompt_provider import S3PromptProvider 45 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import S3PromptProviderConfig 46 | 47 | prompt_provider = S3PromptProvider( 48 | S3PromptProviderConfig( 49 | bucket="ccms-prompts", 50 | prefix="prompts", 51 | aws_region="us-east-1", # optional if set via env 52 | aws_profile="my-profile", # optional if using default profile 53 | system_prompt_file="my_system.txt", # optional override 54 | user_prompt_file="my_user.txt" # optional override 55 | ) 56 | ) 57 | ``` 58 | 59 | Prompts are loaded using `boto3` and AWS credentials. Ensure your environment or `~/.aws/config` is configured for SSO, roles, or keys. 60 | 61 | --- 62 | 63 | ### 4. BedrockPromptProvider 64 | 65 | Use this when your prompts are stored and versioned using Amazon Bedrock prompt ARNs. 66 | 67 | ```python 68 | from graphrag_toolkit.lexical_graph.prompts.bedrock_prompt_provider import BedrockPromptProvider 69 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import BedrockPromptProviderConfig 70 | 71 | prompt_provider = BedrockPromptProvider( 72 | config=BedrockPromptProviderConfig( 73 | system_prompt_arn="arn:aws:bedrock:us-east-1:123456789012:prompt/my-system", 74 | user_prompt_arn="arn:aws:bedrock:us-east-1:123456789012:prompt/my-user", 75 | system_prompt_version="DRAFT", 76 | user_prompt_version="DRAFT" 77 | ) 78 | ) 79 | ``` 80 | 81 | This provider resolves prompt ARNs dynamically using STS and can fall back to environment variables if needed. 82 | 83 | -------------------------------------------------------------------------------- /docs/lexical-graph/vector-store-neptune-analytics.md: -------------------------------------------------------------------------------- 1 | [[Home](./)] 2 | 3 | ## Neptune Analytics as a Vector Store 4 | 5 | ### Topics 6 | 7 | - [Overview](#overview) 8 | - [Creating a Neptune Analytics vector store](#creating-a-neptune-analytics-vector-store) 9 | 10 | ### Overview 11 | 12 | You can use Amazon Neptune Analytics as a vector store. 13 | 14 | ### Creating a Neptune Analytics vector store 15 | 16 | Use the `VectorStoreFactory.for_vector_store()` static factory method to create an instance of an Amazon Neptune Analytics vector store. 17 | 18 | To create a Neptune Analytics vector store, supply a connection string that begins `neptune-graph://`, followed by the graph's identifier: 19 | 20 | ```python 21 | from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory 22 | 23 | neptune_connection_info = 'neptune-graph://g-jbzzaqb209' 24 | 25 | vector_store = VectorStoreFactory.for_vector_store(neptune_connection_info) 26 | ``` 27 | 28 | -------------------------------------------------------------------------------- /docs/lexical-graph/vector-store-opensearch-serverless.md: -------------------------------------------------------------------------------- 1 | [[Home](./)] 2 | 3 | ## Amazon OpenSearch Serverless as a Vector Store 4 | 5 | ### Topics 6 | 7 | - [Overview](#overview) 8 | - [Install dependencies](#install-dependencies) 9 | - [Creating an OpenSearch Serverless vector store](#creating-a-neptune-analytics-vector-store) 10 | 11 | ### Overview 12 | 13 | You can use an Amazon OpenSearch Serverless collection as a vector store. 14 | 15 | ### Install dependencies 16 | 17 | The OpenSeacrh vector store requires both the `opensearch-py` and `llama-index-vector-stores-opensearch` packages: 18 | 19 | ``` 20 | pip install opensearch-py llama-index-vector-stores-opensearch 21 | ``` 22 | 23 | ### Creating an OpenSearch Serverless vector store 24 | 25 | Use the `VectorStoreFactory.for_vector_store()` static factory method to create an instance of an Amazon OpenSearch Serverless vector store. 26 | 27 | To create an Amazon OpenSearch Serverless vector store, supply a connection string that begins `aoss://`, followed by the https endpoint of the OpenSearch Serverless collection: 28 | 29 | ```python 30 | from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory 31 | 32 | opensearch_connection_info = 'aoss://https://123456789012.us-east-1.aoss.amazonaws.com' 33 | 34 | vector_store = VectorStoreFactory.for_vector_store(opensearch_connection_info) 35 | ``` 36 | -------------------------------------------------------------------------------- /docs/lexical-graph/vector-store-postgres.md: -------------------------------------------------------------------------------- 1 | [[Home](./)] 2 | 3 | ## Postgres as a Vector Store 4 | 5 | ### Topics 6 | 7 | - [Overview](#overview) 8 | - [Install dependencies](#install-dependencies) 9 | - [Creating Postgres vector store](#creating-a-postgres-vector-store) 10 | - [Connecting to an IAM auth-enabled Postgres vector store](#connecting-to-an-iam-auth-enabled-postgres-vector-store) 11 | 12 | ### Overview 13 | 14 | You can use a Postgres database with the [pgvector](https://github.com/pgvector/pgvector) extension as a vector store. 15 | 16 | ### Install dependencies 17 | 18 | The Postgres vector store requires both the `psycopg2` and `pgvector` packages: 19 | 20 | ``` 21 | pip install psycopg2-binary pgvector 22 | ``` 23 | 24 | ### Creating a Postgres vector store 25 | 26 | Use the `VectorStoreFactory.for_vector_store()` static factory method to create an instance of a Postgres vector store. 27 | 28 | To create a Postgres vector store, supply a connection string in the following format: 29 | 30 | ``` 31 | postgresql://[user[:password]@][netloc][:port][/dbname][?param1=value1&...] 32 | ``` 33 | 34 | For example: 35 | 36 | ``` 37 | postgresql://graphrag:!zfg%dGGh@mydbcluster.cluster-123456789012.us-west-2.rds.amazonaws.com:5432/postgres 38 | ``` 39 | 40 | #### Connecting to an IAM auth-enabled Postgres vector store 41 | 42 | If your Postgres database supports [AWS Identity and Access Management (IAM) database authentication](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/UsingWithRDS.IAMDBAuth.html), omit the password, and add `enable_iam_db_auth=True` to the connection string query parameters: 43 | 44 | ``` 45 | postgresql://graphrag@mydbcluster.cluster-123456789012.us-west-2.rds.amazonaws.com:5432/postgres?enable_iam_db_auth=True 46 | ``` 47 | 48 | You will need to create a database user, and [grant the `rds_iam` role](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/UsingWithRDS.IAMDBAuth.DBAccounts.html#UsingWithRDS.IAMDBAuth.DBAccounts.PostgreSQL) to use IAM authentication. 49 | 50 | 51 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ## Examples 2 | 3 | - [BYOKG-RAG](./byokg-rag/) Example notebook and dataset demonstrating a RAG (Retrieval Augmented Generation) system built on top of a Knowledge Graph. 4 | - [Lexical Graph](./lexical-graph/) Examples of deploying and running the lexical-graph indexing and querying processes on AWS services 5 | - [Lexical Graph hybrid development](./lexical-graph-hybrid-dev/) Examples of running the indexing extract stage locally and the indexing build stage and querying on AWS services 6 | - [Lexical Graph local development](./lexical-graph-local-dev/) Examples of running the lexical-graph indexing and querying processes locally -------------------------------------------------------------------------------- /examples/lexical-graph-hybrid-dev/aws/setup-bedrock-batch-doc.md: -------------------------------------------------------------------------------- 1 | 2 | # Bedrock Batch Inference Setup Script Documentation 3 | 4 | This script automates the provisioning of the necessary AWS resources to perform **Batch Model Invocation** jobs with Amazon Bedrock. 5 | 6 | --- 7 | 8 | ## What the Script Does 9 | 10 | 1. **Checks AWS Credentials** 11 | Validates that the AWS CLI is authenticated using either: 12 | - SSO (e.g., `aws sso login --profile padmin`) 13 | - or static credentials (via `aws configure`) 14 | 15 | 2. **Retrieves AWS Account and Region Info** 16 | Using the AWS profile, the script resolves: 17 | - `ACCOUNT_ID` 18 | - `REGION` 19 | - (Optional) Current SSO role being used 20 | 21 | 3. **Creates an S3 Bucket** 22 | Creates a bucket named `ccms-rag-extract-` for uploading input/output files used in batch jobs. 23 | 24 | 4. **Creates an IAM Role for Bedrock (Execution Role)** 25 | - Name: `bedrock-batch-inference-role` 26 | - Trusts the `bedrock.amazonaws.com` service 27 | - Permissions: 28 | Allows access to the newly created S3 bucket. 29 | 30 | 5. **Creates an IAM Identity Policy** 31 | - Name: `bedrock-batch-identity-policy` 32 | - Grants permission to: 33 | - Create, List, Get, and Stop Bedrock model invocation jobs 34 | - Pass the execution role to Bedrock 35 | 36 | 6. **Attaches Policies to Role/User** 37 | - Attaches the role permissions to the `bedrock-batch-inference-role` 38 | - Prints instructions to attach the identity policy manually depending on credential type 39 | 40 | 7. **Cleanup** 41 | Temporary policy files are deleted from the local directory. 42 | 43 | --- 44 | 45 | ## Output Resources 46 | 47 | | Resource | Description | 48 | |---------|-------------| 49 | | S3 Bucket | `ccms-rag-extract-` | 50 | | IAM Role | `bedrock-batch-inference-role` | 51 | | IAM Role Policy | Grants S3 access for batch inference | 52 | | IAM Identity Policy | Grants permission to submit and manage Bedrock batch jobs | 53 | 54 | --- 55 | 56 | ## Usage 57 | 58 | ```bash 59 | bash setup-bedrock-batch.sh padmin 60 | ``` 61 | 62 | If no profile is specified, it defaults to `padmin`. 63 | 64 | --- 65 | 66 | ## Manual IAM Setup Required (SSO Users) 67 | 68 | If you're using AWS SSO, the script will print: 69 | ``` 70 | NOTE: You are using AWS SSO with role: 71 | To complete setup, you need to: 72 | 1. Go to AWS IAM Identity Center 73 | 2. Find your Permission Set 74 | 3. Add the identity policy (arn:aws:iam:::policy/bedrock-batch-identity-policy) to your Permission Set 75 | ``` 76 | 77 | If you're using static credentials, you must manually attach the identity policy to the user/role. 78 | 79 | --- 80 | 81 | ## Related Policies 82 | 83 | ### Trust Policy (Role) 84 | ```json 85 | { 86 | "Principal": { 87 | "Service": "bedrock.amazonaws.com" 88 | }, 89 | "Condition": { 90 | "StringEquals": { 91 | "aws:SourceAccount": "" 92 | }, 93 | "ArnEquals": { 94 | "aws:SourceArn": "arn:aws:bedrock:::model-invocation-job/*" 95 | } 96 | } 97 | } 98 | ``` 99 | 100 | ### Role Policy (S3 Access) 101 | ```json 102 | { 103 | "Action": ["s3:GetObject", "s3:ListBucket", "s3:PutObject"], 104 | "Resource": [ 105 | "arn:aws:s3:::ccms-rag-extract-", 106 | "arn:aws:s3:::ccms-rag-extract-/*" 107 | ] 108 | } 109 | ``` 110 | 111 | ### Identity Policy (Bedrock Access) 112 | ```json 113 | { 114 | "Action": [ 115 | "bedrock:CreateModelInvocationJob", 116 | "bedrock:GetModelInvocationJob", 117 | "bedrock:ListModelInvocationJobs", 118 | "bedrock:StopModelInvocationJob", 119 | "iam:PassRole" 120 | ] 121 | } 122 | ``` 123 | 124 | --- 125 | 126 | ## Prerequisites 127 | 128 | - AWS CLI installed 129 | - AWS credentials configured for the profile (via SSO or `aws configure`) 130 | - Sufficient permissions to: 131 | - Create IAM roles and policies 132 | - Create S3 buckets 133 | -------------------------------------------------------------------------------- /examples/lexical-graph-hybrid-dev/docker/.env: -------------------------------------------------------------------------------- 1 | # PostgreSQL settings 2 | POSTGRES_USER=graphrag 3 | POSTGRES_PASSWORD=graphragpass 4 | POSTGRES_DB=graphrag_db 5 | POSTGRES_HOST=postgres 6 | POSTGRES_PORT=5432 7 | 8 | # FalkorDB settings (adjust as needed) 9 | FALKORDB_HOST=falkordb 10 | FALKORDB_PORT=6379 11 | 12 | # Other potential settings for graphrag_toolkit 13 | EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2 14 | EMBEDDING_DIMENSIONS=1536 -------------------------------------------------------------------------------- /examples/lexical-graph-hybrid-dev/docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Building and starting containers..." 4 | docker compose up -d --build 5 | 6 | echo "Build and startup complete." 7 | -------------------------------------------------------------------------------- /examples/lexical-graph-hybrid-dev/docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | falkordb: 3 | image: falkordb/falkordb:latest 4 | container_name: falkordb 5 | ports: 6 | - "6379:6379" # Redis/FalkorDB default 7 | - "3000:3000" # Optional for FalkorDB REST if exposed 8 | volumes: 9 | - falkor_data:/data 10 | networks: 11 | - graphrag_network 12 | 13 | falkordb-browser: 14 | image: falkordb/falkordb-browser:latest 15 | container_name: falkordb-browser 16 | ports: 17 | - "8092:8080" # Browser UI exposed on localhost:8080 18 | environment: 19 | FALKORDB_BROWSER_REDIS_HOST: falkordb 20 | FALKORDB_BROWSER_REDIS_PORT: 6379 21 | FALKORDB_BROWSER_REDIS_USE_TLS: "false" 22 | depends_on: 23 | - falkordb 24 | networks: 25 | - graphrag_network 26 | 27 | postgres: 28 | image: pgvector/pgvector:0.6.2-pg16 29 | container_name: pgvector-db 30 | ports: 31 | - "5432:5432" 32 | environment: 33 | - POSTGRES_USER=${POSTGRES_USER} 34 | - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} 35 | - POSTGRES_DB=${POSTGRES_DB} 36 | volumes: 37 | - pgvector_data:/var/lib/postgresql/data 38 | - ./postgres/schema.sql:/docker-entrypoint-initdb.d/schema.sql 39 | networks: 40 | - graphrag_network 41 | 42 | networks: 43 | graphrag_network: 44 | driver: bridge 45 | 46 | volumes: 47 | falkor_data: 48 | pgvector_data: 49 | -------------------------------------------------------------------------------- /examples/lexical-graph-hybrid-dev/docker/postgres/schema.sql: -------------------------------------------------------------------------------- 1 | -- Enable pgvector extension in public schema 2 | CREATE EXTENSION IF NOT EXISTS vector SCHEMA public; 3 | 4 | -- Enable pg_trgm extension in public schema 5 | CREATE EXTENSION IF NOT EXISTS pg_trgm SCHEMA public; 6 | 7 | -- Create schema for GraphRAG 8 | CREATE SCHEMA IF NOT EXISTS graphrag; -------------------------------------------------------------------------------- /examples/lexical-graph-hybrid-dev/docker/reset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Stopping and removing containers, volumes, and networks..." 4 | docker compose down -v --remove-orphans 5 | 6 | echo "Ensuring containers are removed..." 7 | docker rm -f falkordb falkordb-browser pgvector-db 2>/dev/null 8 | 9 | echo "Removing named volumes..." 10 | docker volume rm -f pgvector_data falkor_data 2>/dev/null 11 | 12 | echo "Pruning dangling volumes (if any)..." 13 | docker volume prune -f 14 | 15 | echo "Clearing extracted directory..." 16 | rm -rf extracted 17 | 18 | echo "Rebuilding containers..." 19 | docker compose up -d --force-recreate 20 | 21 | echo "Reset complete." -------------------------------------------------------------------------------- /examples/lexical-graph-hybrid-dev/docs/docker_build_shell_script.md: -------------------------------------------------------------------------------- 1 | # build.sh 2 | 3 | This script is used to build and start the containers for a new deployment of the application using Docker Compose. It is intended for **initial deployments** or **redeployments** without resetting volumes, removing data, or clearing persistent state. 4 | 5 | ## Usage 6 | 7 | ```bash 8 | chmod +x build.sh 9 | ./build.sh 10 | ``` 11 | 12 | ## What it does 13 | 14 | - Executes `docker compose up -d --build` to: 15 | - Build the Docker images using the `Dockerfile`s defined in the project. 16 | - Start the services in detached mode (`-d`) so the terminal remains available. 17 | - Automatically pull required images if not already present. 18 | - Rebuild containers if source code has changed. 19 | 20 | ## Important Notes 21 | 22 | - This script does **not** remove any existing containers, volumes, or data. 23 | - It is safe to run on top of an existing deployment if you are deploying an updated version of your app. 24 | - Make sure your `.env` and `docker-compose.yml` files are configured properly before running the script. 25 | 26 | ## Related Scripts 27 | 28 | - See [`reset.sh`](reset.md) for a full environment reset, including data deletion and volume pruning. 29 | -------------------------------------------------------------------------------- /examples/lexical-graph-hybrid-dev/docs/docker_compose_services.md: -------------------------------------------------------------------------------- 1 | # Docker Services Overview for GraphRAG Deployment 2 | 3 | This document describes the services defined in the `docker-compose.yml` file used for setting up a GraphRAG environment. It includes containerized services for FalkorDB, a FalkorDB browser UI, and a PostgreSQL database with the `pgvector` extension enabled. 4 | 5 | --- 6 | 7 | ## Services 8 | 9 | ### 1. `falkordb` 10 | - **Image**: `falkordb/falkordb:latest` 11 | - **Description**: Runs the FalkorDB graph database, which uses Redis as its backend. 12 | - **Ports**: 13 | - `6379`: Redis/FalkorDB main port. 14 | - `3000`: Optional REST API for FalkorDB if exposed. 15 | - **Volume**: Persists graph data using `falkor_data`. 16 | - **Network**: Connected to `graphrag_network`. 17 | 18 | ### 2. `falkordb-browser` 19 | - **Image**: `falkordb/falkordb-browser:latest` 20 | - **Description**: Provides a web-based interface for interacting with FalkorDB. 21 | - **Ports**: 22 | - `8092:8080`: Web UI exposed on localhost:8092. 23 | - **Environment Variables**: 24 | - `FALKORDB_BROWSER_REDIS_HOST`: Hostname of the FalkorDB service. 25 | - `FALKORDB_BROWSER_REDIS_PORT`: Port for Redis. 26 | - `FALKORDB_BROWSER_REDIS_USE_TLS`: TLS setting for secure Redis communication (disabled in this setup). 27 | - **Depends On**: `falkordb` 28 | - **Network**: Connected to `graphrag_network`. 29 | 30 | ### 3. `postgres` 31 | - **Image**: `pgvector/pgvector:0.6.2-pg16` 32 | - **Description**: PostgreSQL 16 image with the `pgvector` extension pre-installed for vector search capabilities. 33 | - **Ports**: 34 | - `5432`: PostgreSQL default port. 35 | - **Environment Variables**: 36 | - `POSTGRES_USER`, `POSTGRES_PASSWORD`, `POSTGRES_DB`: Injected from environment or `.env` file. 37 | - **Volumes**: 38 | - `pgvector_data`: Data persistence. 39 | - `./postgres/schema.sql`: Initializes the database schema. 40 | - **Network**: Connected to `graphrag_network`. 41 | 42 | --- 43 | 44 | ## `schema.sql` 45 | 46 | This SQL file is used to bootstrap the PostgreSQL container with necessary extensions and a custom schema: 47 | 48 | ```sql 49 | -- Enable pgvector extension in public schema 50 | CREATE EXTENSION IF NOT EXISTS vector SCHEMA public; 51 | 52 | -- Enable pg_trgm extension in public schema 53 | CREATE EXTENSION IF NOT EXISTS pg_trgm SCHEMA public; 54 | 55 | -- Create schema for GraphRAG 56 | CREATE SCHEMA IF NOT EXISTS graphrag; 57 | ``` 58 | 59 | These extensions are required for vector similarity search and trigram-based indexing within the GraphRAG framework. 60 | 61 | --- 62 | 63 | ## Networks 64 | 65 | - **graphrag_network**: A dedicated Docker bridge network for inter-container communication. 66 | 67 | --- 68 | 69 | ## Volumes 70 | 71 | - `falkor_data`: Persists FalkorDB graph state. 72 | - `pgvector_data`: Persists PostgreSQL data including vector embeddings and schema definitions. 73 | -------------------------------------------------------------------------------- /examples/lexical-graph-hybrid-dev/docs/docker_reset_shell_script.md: -------------------------------------------------------------------------------- 1 | # Docker Environment Reset Script 2 | 3 | This script is used to **fully reset a local Docker-based development environment** for graphrag-toolkit. The script will reset FalkorDB, PGVector, and optionally other components. It performs cleanup of containers, networks, volumes, and extracted data, followed by a fresh container rebuild. 4 | 5 | ## Filename 6 | 7 | Use `reset.sh` (file is located in lexical-graph-contrib/docker) and run it with: 8 | 9 | ```bash 10 | bash reset.sh 11 | ``` 12 | 13 | > **Note:** Make sure the script is executable (`chmod +x reset.sh`) or invoke it with `bash`. 14 | 15 | --- 16 | 17 | ## Script Breakdown 18 | 19 | ```bash 20 | #!/bin/bash 21 | ``` 22 | - Standard shebang to run the script using `bash`. 23 | 24 | --- 25 | 26 | ### 1. Stop and Remove Docker Resources 27 | 28 | ```bash 29 | echo "Stopping and removing containers, volumes, and networks..." 30 | docker compose down -v --remove-orphans 31 | ``` 32 | 33 | - **`docker compose down`** stops and removes containers defined in `docker-compose.yml`. 34 | - **`-v`** removes associated anonymous volumes. 35 | - **`--remove-orphans`** removes containers not defined in the current Compose file but part of the same project network. 36 | 37 | --- 38 | 39 | ### 2. Explicitly Remove Named Containers 40 | 41 | ```bash 42 | echo "Ensuring containers are removed..." 43 | docker rm -f falkordb falkordb-browser pgvector-db 2>/dev/null 44 | ``` 45 | 46 | - Forcefully removes specific named containers, if they still exist. 47 | - Errors are suppressed using `2>/dev/null`. 48 | 49 | --- 50 | 51 | ### 3. Remove Named Volumes 52 | 53 | ```bash 54 | echo "Removing named volumes..." 55 | docker volume rm -f pgvector_data falkor_data 2>/dev/null 56 | ``` 57 | 58 | - Deletes project-specific Docker volumes that might persist after shutdown. 59 | 60 | --- 61 | 62 | ### 4. Prune Dangling Volumes 63 | 64 | ```bash 65 | echo "Pruning dangling volumes (if any)..." 66 | docker volume prune -f 67 | ``` 68 | 69 | - Removes **dangling (unused)** Docker volumes that may be left behind. 70 | 71 | --- 72 | 73 | ### 5. Delete Local Directories 74 | 75 | ```bash 76 | echo "Clearing extracted directory..." 77 | rm -rf extracted 78 | ``` 79 | 80 | - Cleans up the local `./extracted` directory used to store intermediate files (like parsed documents, indexes, or temp outputs). 81 | 82 | --- 83 | 84 | ### 6. Rebuild and Start Containers 85 | 86 | ```bash 87 | echo "Rebuilding containers..." 88 | docker compose up -d --force-recreate 89 | ``` 90 | 91 | - **`-d`** runs containers in detached mode. 92 | - **`--force-recreate`** ensures all containers are recreated even if configuration hasn't changed. 93 | 94 | --- 95 | 96 | ### 7. Final Message 97 | 98 | ```bash 99 | echo "Reset complete." 100 | ``` 101 | 102 | - Indicates successful completion of the reset process. 103 | 104 | --- 105 | 106 | ## Use Cases 107 | 108 | - Full environment reset between development sessions 109 | - Clean-up after corrupt container or volume states 110 | - Ensures a consistent baseline environment for troubleshooting or testing 111 | 112 | --- 113 | 114 | ## Warnings 115 | 116 | - **Data Loss**: This script removes all persistent data and should not be used on production environments. 117 | - **Rebuild Time**: Fresh container creation may take time depending on image sizes and network speed. 118 | 119 | -------------------------------------------------------------------------------- /examples/lexical-graph-hybrid-dev/notebooks/.env: -------------------------------------------------------------------------------- 1 | AWS_REGION="" #Populate with region 2 | AWS_PROFILE="" #Populate with optional AWS_PROFILE 3 | AWS_ACCOUNT="" #Populate with AWS Account Number 4 | DYNAMODB_NAME="" # Populate from the ouput from setup-bedrock-batch.sh 5 | S3_BUCKET_EXTRACK_BUILD_BATCH_NAME="" #Populate from the ouput from setup-bedrock-batch.sh 6 | S3_BATCH_BUCKET_NAME="" # Populate from the ouput from setup-bedrock-batch.sh 7 | EMBEDDINGS_MODEL="cohere.embed-english-v3" 8 | EMBEDDINGS_DIMENSIONS=1024 9 | EXTRACTION_MODEL="us.anthropic.claude-3-5-sonnet-20240620-v1:0" 10 | RESPONSE_MODEL="us.anthropic.claude-3-5-sonnet-20240620-v1:0" 11 | EXTRACTION_NUM_WORKERS=2 12 | EXTRACTION_NUM_THREADS_PER_WORKER=4 13 | EXTRACTION_BATCH_SIZE=100 14 | BUILD_NUM_WORKERS=2 15 | BUILD_BATCH_SIZE=4 16 | BUILD_BATCH_WRITE_SIZE=25 17 | BATCH_WRITES_ENABLED=True 18 | BATCH_ROLE_NAME="" #Populate from the ouput from setup-bedrock-batch.sh 19 | SOURCE_DIR="best-practices" 20 | BATCH_PREFIX="batch" #Batch S3 Prefix 21 | EXTRACT_BUILD_PREFIX="extract-build" #Extract S3 Prefix 22 | DEFAULT_INCLUDE_DOMAIN_LABELS=False 23 | ENABLE_CACHE=False 24 | VECTOR_STORE="postgresql://graphrag:graphragpass@localhost:5432/graphrag_db" #Docker defaults 25 | GRAPH_STORE="falkordb://localhost:6379" #Docker defaults 26 | MAX_BATCH_SIZE=25000 27 | MAX_NUM_CONCURRENT_BATCHES=3 28 | S3_ENCRYPTION_KEY_ID="" 29 | SUBNET_IDS="" 30 | SECURITY_GROUP_IDS="" -------------------------------------------------------------------------------- /examples/lexical-graph-hybrid-dev/notebooks/03-Cloud-Build.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "434fea4e", 6 | "metadata": {}, 7 | "source": "# 02 - Cloud Build" 8 | }, 9 | { 10 | "cell_type": "markdown", 11 | "id": "a9fb5cff", 12 | "metadata": {}, 13 | "source": [ 14 | "## Setup\n", 15 | "\n", 16 | "If you haven't already, install the toolkit and dependencies using the [Setup](./00-Setup.ipynb) notebook." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "40c3f5e1", 22 | "metadata": {}, 23 | "source": [ 24 | "## Build" 25 | ] 26 | }, 27 | { 28 | "metadata": {}, 29 | "cell_type": "code", 30 | "source": [ 31 | "%reload_ext dotenv\n", 32 | "%dotenv\n", 33 | "\n", 34 | "import os\n", 35 | "\n", 36 | "for key, value in os.environ.items():\n", 37 | " print(f\"{key}={value}\")" 38 | ], 39 | "id": "6fee75a08bc1a7e9", 40 | "outputs": [], 41 | "execution_count": null 42 | }, 43 | { 44 | "metadata": {}, 45 | "cell_type": "code", 46 | "source": [ 47 | "%reload_ext dotenv\n", 48 | "%dotenv\n", 49 | "\n", 50 | "import os\n", 51 | "\n", 52 | "from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config\n", 53 | "from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory\n", 54 | "from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory\n", 55 | "from graphrag_toolkit.lexical_graph.indexing.load import S3BasedDocs\n", 56 | "from graphrag_toolkit.lexical_graph.indexing.build import Checkpoint\n", 57 | "\n", 58 | "set_logging_config('INFO')\n", 59 | "\n", 60 | "docs = S3BasedDocs(\n", 61 | " region='us-east-1',\n", 62 | " bucket_name=os.environ['LOCAL_EXTRACT_S3'],\n", 63 | " key_prefix='extract-build',\n", 64 | " collection_id='best-practices'\n", 65 | ")\n", 66 | "checkpoint = Checkpoint('s3-build-checkpoint')\n", 67 | "\n", 68 | "graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])\n", 69 | "vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])\n", 70 | "\n", 71 | "graph_index = LexicalGraphIndex(\n", 72 | " graph_store,\n", 73 | " vector_store\n", 74 | ")\n", 75 | "\n", 76 | "graph_index.build(docs, checkpoint=checkpoint, show_progress=True)\n", 77 | "\n", 78 | "print('Build complete')" 79 | ], 80 | "id": "eaa952bf", 81 | "outputs": [], 82 | "execution_count": null 83 | } 84 | ], 85 | "metadata": { 86 | "kernelspec": { 87 | "display_name": "Python 3", 88 | "language": "python", 89 | "name": "python3" 90 | }, 91 | "language_info": { 92 | "codemirror_mode": { 93 | "name": "ipython", 94 | "version": 3 95 | }, 96 | "file_extension": ".py", 97 | "mimetype": "text/x-python", 98 | "name": "python", 99 | "nbconvert_exporter": "python", 100 | "pygments_lexer": "ipython3", 101 | "version": "3.10.8" 102 | } 103 | }, 104 | "nbformat": 4, 105 | "nbformat_minor": 5 106 | } 107 | -------------------------------------------------------------------------------- /examples/lexical-graph-hybrid-dev/notebooks/best-practices/Retrieval-Augmented-Generation-Options.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/graphrag-toolkit/e6967b5d2589bfd39bc72be2e4da0c9d1a30e865/examples/lexical-graph-hybrid-dev/notebooks/best-practices/Retrieval-Augmented-Generation-Options.pdf -------------------------------------------------------------------------------- /examples/lexical-graph-local-dev/README.md: -------------------------------------------------------------------------------- 1 | ## Lexical Graph Examples 2 | 3 | ### Notebooks 4 | 5 | - [**00-Setup**](./notebooks/00-Setup.ipynb) – Installs the lexical-graph package and additional dependencies. 6 | - [**01-Combined Extract and Build**](./notebooks/01-Combined-Extract-and-Build.ipynb) – An example of [performing continuous ingest](../../docs/lexical-graph/indexing.md#continous-ingest) using the `LexicalGraphIndex.extract_and_build()` method. 7 | - [**03-Querying**](./notebooks/04-Querying.ipynb) – Examples of [querying the graph](../../docs/lexical-graph/querying.md) using the `LexicalGraphQueryEngine` with `SemanticGuidedRetriever`. 8 | 9 | ## Environment Setup 10 | 11 | The notebooks rely on `GRAPH_STORE` and `VECTOR_STORE` environment variables being properly set. These variables define where and how the graph store and vector store connect. 12 | 13 | To set up your local environment: 14 | 15 | 1. Clone the repository and navigate to your working directory. 16 | 2. Run: 17 | 18 | ```bash 19 | ./build.sh 20 | ``` 21 | 22 | This will start and configure the following services in Docker: 23 | 24 | - **FalkorDB** for graph storage 25 | - **FalkorDB Browser** (accessible on `localhost:8092`) for interactive graph exploration 26 | - **PostgreSQL with pgvector** for vector embeddings 27 | 28 | The Postgres container auto-applies the following schema on initialization via `./postgres/schema.sql`: 29 | 30 | ```sql 31 | -- Enable pgvector extension in public schema 32 | CREATE EXTENSION IF NOT EXISTS vector SCHEMA public; 33 | 34 | -- Enable pg_trgm extension in public schema 35 | CREATE EXTENSION IF NOT EXISTS pg_trgm SCHEMA public; 36 | 37 | -- Create schema for GraphRAG 38 | CREATE SCHEMA IF NOT EXISTS graphrag; 39 | ``` 40 | 41 | These extensions are necessary for similarity search and fuzzy matching in GraphRAG. 42 | 43 | ## AWS Foundation Model Access (Optional) 44 | 45 | If you intend to run the CloudFormation templates instead of using Docker: 46 | 47 | - Ensure your AWS account has access to the following Amazon Bedrock foundation models: 48 | - `anthropic.claude-3-5-sonnet-20240620-v1:0` 49 | - `cohere.embed-english-v3` 50 | 51 | Enable model access via the [Bedrock model access console](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html). 52 | 53 | You must deploy to an AWS region where these models are available. 54 | 55 | ## Optional: CloudFormation Stacks 56 | 57 | If you want to deploy infrastructure in AWS, CloudFormation templates are available: 58 | 59 | - `graphrag-toolkit-neptune-db-opensearch-serverless.json` 60 | - `graphrag-toolkit-neptune-db-aurora-postgres.json` 61 | 62 | These templates create: 63 | 64 | - A Neptune serverless DB cluster 65 | - Either OpenSearch Serverless or Aurora PostgreSQL 66 | - A SageMaker notebook instance 67 | - IAM roles with optional policies via the `IamPolicyArn` parameter 68 | - An optional `ExampleNotebooksURL` parameter to auto-load the examples 69 | 70 | > ⚠️ AWS charges apply for cloud resources. 71 | 72 | --- 73 | 74 | Use this guide if you prefer to develop and test locally before migrating to AWS-based deployments. -------------------------------------------------------------------------------- /examples/lexical-graph-local-dev/docker/.env: -------------------------------------------------------------------------------- 1 | # PostgreSQL settings 2 | POSTGRES_USER=graphrag 3 | POSTGRES_PASSWORD=graphragpass 4 | POSTGRES_DB=graphrag_db 5 | POSTGRES_HOST=postgres 6 | POSTGRES_PORT=5432 7 | 8 | # FalkorDB settings (adjust as needed) 9 | FALKORDB_HOST=falkordb 10 | FALKORDB_PORT=6379 11 | 12 | # Other potential settings for graphrag_toolkit 13 | EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2 14 | EMBEDDING_DIMENSIONS=1536 -------------------------------------------------------------------------------- /examples/lexical-graph-local-dev/docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Building and starting containers..." 4 | docker compose up -d --build 5 | 6 | echo "Build and startup complete." 7 | -------------------------------------------------------------------------------- /examples/lexical-graph-local-dev/docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | falkordb: 3 | image: falkordb/falkordb:latest 4 | container_name: falkordb 5 | ports: 6 | - "6379:6379" # Redis/FalkorDB default 7 | - "3000:3000" # Optional for FalkorDB REST if exposed 8 | volumes: 9 | - falkor_data:/data 10 | networks: 11 | - graphrag_network 12 | 13 | falkordb-browser: 14 | image: falkordb/falkordb-browser:latest 15 | container_name: falkordb-browser 16 | ports: 17 | - "8092:8080" # Browser UI exposed on localhost:8080 18 | environment: 19 | FALKORDB_BROWSER_REDIS_HOST: falkordb 20 | FALKORDB_BROWSER_REDIS_PORT: 6379 21 | FALKORDB_BROWSER_REDIS_USE_TLS: "false" 22 | depends_on: 23 | - falkordb 24 | networks: 25 | - graphrag_network 26 | 27 | postgres: 28 | image: pgvector/pgvector:0.6.2-pg16 29 | container_name: pgvector-db 30 | ports: 31 | - "5432:5432" 32 | environment: 33 | - POSTGRES_USER=${POSTGRES_USER} 34 | - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} 35 | - POSTGRES_DB=${POSTGRES_DB} 36 | volumes: 37 | - pgvector_data:/var/lib/postgresql/data 38 | - ./postgres/schema.sql:/docker-entrypoint-initdb.d/schema.sql 39 | networks: 40 | - graphrag_network 41 | 42 | networks: 43 | graphrag_network: 44 | driver: bridge 45 | 46 | volumes: 47 | falkor_data: 48 | pgvector_data: 49 | -------------------------------------------------------------------------------- /examples/lexical-graph-local-dev/docker/postgres/schema.sql: -------------------------------------------------------------------------------- 1 | -- Enable pgvector extension in public schema 2 | CREATE EXTENSION IF NOT EXISTS vector SCHEMA public; 3 | 4 | -- Enable pg_trgm extension in public schema 5 | CREATE EXTENSION IF NOT EXISTS pg_trgm SCHEMA public; 6 | 7 | -- Create schema for GraphRAG 8 | CREATE SCHEMA IF NOT EXISTS graphrag; -------------------------------------------------------------------------------- /examples/lexical-graph-local-dev/docker/reset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Stopping and removing containers, volumes, and networks..." 4 | docker compose down -v --remove-orphans 5 | 6 | echo "Ensuring containers are removed..." 7 | docker rm -f falkordb falkordb-browser pgvector-db 2>/dev/null 8 | 9 | echo "Removing named volumes..." 10 | docker volume rm -f pgvector_data falkor_data 2>/dev/null 11 | 12 | echo "Pruning dangling volumes (if any)..." 13 | docker volume prune -f 14 | 15 | echo "Clearing extracted directory..." 16 | rm -rf extracted 17 | 18 | echo "Rebuilding containers..." 19 | docker compose up -d --force-recreate 20 | 21 | echo "Reset complete." -------------------------------------------------------------------------------- /examples/lexical-graph-local-dev/docs/docker_build.md: -------------------------------------------------------------------------------- 1 | # build.sh 2 | 3 | This script is used to build and start the containers for a new deployment of the application using Docker Compose. It is intended for **initial deployments** or **redeployments** without resetting volumes, removing data, or clearing persistent state. 4 | 5 | ## Usage 6 | 7 | ```bash 8 | chmod +x build.sh 9 | ./build.sh 10 | ``` 11 | 12 | ## What it does 13 | 14 | - Executes `docker compose up -d --build` to: 15 | - Build the Docker images using the `Dockerfile`s defined in the project. 16 | - Start the services in detached mode (`-d`) so the terminal remains available. 17 | - Automatically pull required images if not already present. 18 | - Rebuild containers if source code has changed. 19 | 20 | ## Important Notes 21 | 22 | - This script does **not** remove any existing containers, volumes, or data. 23 | - It is safe to run on top of an existing deployment if you are deploying an updated version of your app. 24 | - Make sure your `.env` and `docker-compose.yml` files are configured properly before running the script. 25 | 26 | ## Related Scripts 27 | 28 | - See [`reset.sh`](reset.md) for a full environment reset, including data deletion and volume pruning. 29 | -------------------------------------------------------------------------------- /examples/lexical-graph-local-dev/docs/docker_reset_script.md: -------------------------------------------------------------------------------- 1 | # Docker Environment Reset Script 2 | 3 | This script is used to **fully reset a local Docker-based development environment** for graphrag-toolkit. The script will reset FalkorDB, PGVector, and optionally other components. It performs cleanup of containers, networks, volumes, and extracted data, followed by a fresh container rebuild. 4 | 5 | ## Filename 6 | 7 | Use `reset.sh` (file is located in lexical-graph-contrib/docker) and run it with: 8 | 9 | ```bash 10 | bash reset.sh 11 | ``` 12 | 13 | > **Note:** Make sure the script is executable (`chmod +x reset.sh`) or invoke it with `bash`. 14 | 15 | --- 16 | 17 | ## Script Breakdown 18 | 19 | ```bash 20 | #!/bin/bash 21 | ``` 22 | - Standard shebang to run the script using `bash`. 23 | 24 | --- 25 | 26 | ### 1. Stop and Remove Docker Resources 27 | 28 | ```bash 29 | echo "Stopping and removing containers, volumes, and networks..." 30 | docker compose down -v --remove-orphans 31 | ``` 32 | 33 | - **`docker compose down`** stops and removes containers defined in `docker-compose.yml`. 34 | - **`-v`** removes associated anonymous volumes. 35 | - **`--remove-orphans`** removes containers not defined in the current Compose file but part of the same project network. 36 | 37 | --- 38 | 39 | ### 2. Explicitly Remove Named Containers 40 | 41 | ```bash 42 | echo "Ensuring containers are removed..." 43 | docker rm -f falkordb falkordb-browser pgvector-db 2>/dev/null 44 | ``` 45 | 46 | - Forcefully removes specific named containers, if they still exist. 47 | - Errors are suppressed using `2>/dev/null`. 48 | 49 | --- 50 | 51 | ### 3. Remove Named Volumes 52 | 53 | ```bash 54 | echo "Removing named volumes..." 55 | docker volume rm -f pgvector_data falkor_data 2>/dev/null 56 | ``` 57 | 58 | - Deletes project-specific Docker volumes that might persist after shutdown. 59 | 60 | --- 61 | 62 | ### 4. Prune Dangling Volumes 63 | 64 | ```bash 65 | echo "Pruning dangling volumes (if any)..." 66 | docker volume prune -f 67 | ``` 68 | 69 | - Removes **dangling (unused)** Docker volumes that may be left behind. 70 | 71 | --- 72 | 73 | ### 5. Delete Local Directories 74 | 75 | ```bash 76 | echo "Clearing extracted directory..." 77 | rm -rf extracted 78 | ``` 79 | 80 | - Cleans up the local `./extracted` directory used to store intermediate files (like parsed documents, indexes, or temp outputs). 81 | 82 | --- 83 | 84 | ### 6. Rebuild and Start Containers 85 | 86 | ```bash 87 | echo "Rebuilding containers..." 88 | docker compose up -d --force-recreate 89 | ``` 90 | 91 | - **`-d`** runs containers in detached mode. 92 | - **`--force-recreate`** ensures all containers are recreated even if configuration hasn't changed. 93 | 94 | --- 95 | 96 | ### 7. Final Message 97 | 98 | ```bash 99 | echo "Reset complete." 100 | ``` 101 | 102 | - Indicates successful completion of the reset process. 103 | 104 | --- 105 | 106 | ## Use Cases 107 | 108 | - Full environment reset between development sessions 109 | - Clean-up after corrupt container or volume states 110 | - Ensures a consistent baseline environment for troubleshooting or testing 111 | 112 | --- 113 | 114 | ## Warnings 115 | 116 | - **Data Loss**: This script removes all persistent data and should not be used on production environments. 117 | - **Rebuild Time**: Fresh container creation may take time depending on image sizes and network speed. 118 | 119 | -------------------------------------------------------------------------------- /examples/lexical-graph-local-dev/docs/docker_services.md: -------------------------------------------------------------------------------- 1 | # Docker Services Overview for GraphRAG Deployment 2 | 3 | This document describes the services defined in the `docker-compose.yml` file used for setting up a GraphRAG environment. It includes containerized services for FalkorDB, a FalkorDB browser UI, and a PostgreSQL database with the `pgvector` extension enabled. 4 | 5 | --- 6 | 7 | ## Services 8 | 9 | ### 1. `falkordb` 10 | - **Image**: `falkordb/falkordb:latest` 11 | - **Description**: Runs the FalkorDB graph database, which uses Redis as its backend. 12 | - **Ports**: 13 | - `6379`: Redis/FalkorDB main port. 14 | - `3000`: Optional REST API for FalkorDB if exposed. 15 | - **Volume**: Persists graph data using `falkor_data`. 16 | - **Network**: Connected to `graphrag_network`. 17 | 18 | ### 2. `falkordb-browser` 19 | - **Image**: `falkordb/falkordb-browser:latest` 20 | - **Description**: Provides a web-based interface for interacting with FalkorDB. 21 | - **Ports**: 22 | - `8092:8080`: Web UI exposed on localhost:8092. 23 | - **Environment Variables**: 24 | - `FALKORDB_BROWSER_REDIS_HOST`: Hostname of the FalkorDB service. 25 | - `FALKORDB_BROWSER_REDIS_PORT`: Port for Redis. 26 | - `FALKORDB_BROWSER_REDIS_USE_TLS`: TLS setting for secure Redis communication (disabled in this setup). 27 | - **Depends On**: `falkordb` 28 | - **Network**: Connected to `graphrag_network`. 29 | 30 | ### 3. `postgres` 31 | - **Image**: `pgvector/pgvector:0.6.2-pg16` 32 | - **Description**: PostgreSQL 16 image with the `pgvector` extension pre-installed for vector search capabilities. 33 | - **Ports**: 34 | - `5432`: PostgreSQL default port. 35 | - **Environment Variables**: 36 | - `POSTGRES_USER`, `POSTGRES_PASSWORD`, `POSTGRES_DB`: Injected from environment or `.env` file. 37 | - **Volumes**: 38 | - `pgvector_data`: Data persistence. 39 | - `./postgres/schema.sql`: Initializes the database schema. 40 | - **Network**: Connected to `graphrag_network`. 41 | 42 | --- 43 | 44 | ## `schema.sql` 45 | 46 | This SQL file is used to bootstrap the PostgreSQL container with necessary extensions and a custom schema: 47 | 48 | ```sql 49 | -- Enable pgvector extension in public schema 50 | CREATE EXTENSION IF NOT EXISTS vector SCHEMA public; 51 | 52 | -- Enable pg_trgm extension in public schema 53 | CREATE EXTENSION IF NOT EXISTS pg_trgm SCHEMA public; 54 | 55 | -- Create schema for GraphRAG 56 | CREATE SCHEMA IF NOT EXISTS graphrag; 57 | ``` 58 | 59 | These extensions are required for vector similarity search and trigram-based indexing within the GraphRAG framework. 60 | 61 | --- 62 | 63 | ## Networks 64 | 65 | - **graphrag_network**: A dedicated Docker bridge network for inter-container communication. 66 | 67 | --- 68 | 69 | ## Volumes 70 | 71 | - `falkor_data`: Persists FalkorDB graph state. 72 | - `pgvector_data`: Persists PostgreSQL data including vector embeddings and schema definitions. 73 | -------------------------------------------------------------------------------- /examples/lexical-graph-local-dev/notebooks/.env: -------------------------------------------------------------------------------- 1 | AWS_REGION="us-east-1" 2 | AWS_PROFILE="padmin" 3 | EMBEDDINGS_MODEL="cohere.embed-english-v3" 4 | EMBEDDINGS_DIMENSIONS=1024 5 | EXTRACTION_MODEL="us.anthropic.claude-3-5-sonnet-20240620-v1:0" 6 | RESPONSE_MODEL="us.anthropic.claude-3-5-sonnet-20240620-v1:0" 7 | EXTRACTION_NUM_WORKERS=2 8 | EXTRACTION_NUM_THREADS_PER_WORKER=4 9 | EXTRACTION_BATCH_SIZE=4 10 | BUILD_NUM_WORKERS=2 11 | BUILD_BATCH_SIZE=4 12 | BUILD_BATCH_WRITE_SIZE=25 13 | BATCH_WRITES_ENABLED=True 14 | DEFAULT_INCLUDE_DOMAIN_LABELS=False 15 | ENABLE_CACHE=False 16 | VECTOR_STORE="postgresql://graphrag:graphragpass@localhost:5432/graphrag_db" 17 | GRAPH_STORE="falkordb://localhost:6379" -------------------------------------------------------------------------------- /examples/lexical-graph-local-dev/notebooks/01-Combined-Extract-and-Build.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3eb1535a", 6 | "metadata": {}, 7 | "source": "# 01 - Combined Extract and Build" 8 | }, 9 | { 10 | "cell_type": "markdown", 11 | "id": "e3f529c1", 12 | "metadata": {}, 13 | "source": [ 14 | "## Setup\n", 15 | "\n", 16 | "If you haven't already, install the toolkit and dependencies using the [Setup](./00-Setup.ipynb) notebook." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "0aa94cc9", 22 | "metadata": {}, 23 | "source": [ 24 | "## Continous ingest\n", 25 | "\n", 26 | "See [Continous ingest](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/lexical-graph/indexing.md#continous-ingest)." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "id": "7ec68542", 32 | "metadata": {}, 33 | "source": [ 34 | "%reload_ext dotenv\n", 35 | "%dotenv\n", 36 | "\n", 37 | "import os\n", 38 | "\n", 39 | "from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config\n", 40 | "from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory\n", 41 | "from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory\n", 42 | "from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory\n", 43 | "\n", 44 | "GraphStoreFactory.register(FalkorDBGraphStoreFactory)\n", 45 | "\n", 46 | "from llama_index.readers.web import SimpleWebPageReader\n", 47 | "\n", 48 | "graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])\n", 49 | "vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])\n", 50 | "\n", 51 | "graph_index = LexicalGraphIndex(\n", 52 | " graph_store, \n", 53 | " vector_store\n", 54 | ")\n", 55 | "\n", 56 | "doc_urls = [\n", 57 | " 'https://docs.aws.amazon.com/neptune/latest/userguide/intro.html',\n", 58 | " 'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/what-is-neptune-analytics.html',\n", 59 | " 'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-features.html',\n", 60 | " 'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-vs-neptune-database.html'\n", 61 | "]\n", 62 | "\n", 63 | "docs = SimpleWebPageReader(\n", 64 | " html_to_text=True,\n", 65 | " metadata_fn=lambda url:{'url': url}\n", 66 | ").load_data(doc_urls)\n", 67 | "\n", 68 | "graph_index.extract_and_build(docs, show_progress=True)\n", 69 | "\n", 70 | "print('Complete')" 71 | ], 72 | "outputs": [], 73 | "execution_count": null 74 | }, 75 | { 76 | "metadata": {}, 77 | "cell_type": "code", 78 | "source": "", 79 | "id": "5e2b536ce6540fb5", 80 | "outputs": [], 81 | "execution_count": null 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.10.8" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 5 105 | } 106 | -------------------------------------------------------------------------------- /examples/lexical-graph-local-dev/notebooks/prompts/system_prompt.txt: -------------------------------------------------------------------------------- 1 | You are a question answering agent. I will provide you with a set of search results. The user will provide you with a question. Your job is to answer the user's question using only information from the search results. If the search results are empty, do not attempt to answer the question. 2 | 3 | 4 | {search_results} 5 | 6 | 7 | ## Instructions 8 | - Think carefully about the question, the source and relevancy of each of the search results, and the logical connections between different search results before answering. 9 | - Ensure you answer each part of the question. 10 | - Reference information from the search results in your answer by adding the 'source' in square brackets at the end of relevant sentences. 11 | - Do NOT directly quote the search results in your answer. 12 | - If the question is a yes/no question, start with either 'Yes' or 'No'. 13 | - If the search results are empty, do not attempt to answer the question. 14 | 15 | Based on the search results, answer the following question as concisely as possible: 16 | -------------------------------------------------------------------------------- /examples/lexical-graph-local-dev/notebooks/prompts/user_prompt.txt: -------------------------------------------------------------------------------- 1 | 2 | {query} 3 | 4 | 5 | Please answer the question above using the information available. Respond only in the following JSON format: 6 | 7 | { 8 | "answer": "", 9 | "supporting_facts": ["fact 1", "fact 2", "..."], 10 | "confidence": "" 11 | } 12 | -------------------------------------------------------------------------------- /examples/lexical-graph/notebooks/01-Combined-Extract-and-Build.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3eb1535a", 6 | "metadata": {}, 7 | "source": [ 8 | "# 01 - Combined Extract and Build" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "e3f529c1", 14 | "metadata": {}, 15 | "source": [ 16 | "## Setup\n", 17 | "\n", 18 | "If you haven't already, install the toolkit and dependencies using the [Setup](./00-Setup.ipynb) notebook." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "0aa94cc9", 24 | "metadata": {}, 25 | "source": [ 26 | "## Continous ingest\n", 27 | "\n", 28 | "See [Continous ingest](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/lexical-graph/indexing.md#continous-ingest)." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "7ec68542", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "%reload_ext dotenv\n", 39 | "%dotenv\n", 40 | "\n", 41 | "import os\n", 42 | "\n", 43 | "from graphrag_toolkit.lexical_graph import LexicalGraphIndex, GraphRAGConfig, set_logging_config\n", 44 | "from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory\n", 45 | "from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory\n", 46 | "\n", 47 | "from llama_index.readers.web import SimpleWebPageReader\n", 48 | "\n", 49 | "set_logging_config('INFO')\n", 50 | "\n", 51 | "graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])\n", 52 | "vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])\n", 53 | "\n", 54 | "graph_index = LexicalGraphIndex(\n", 55 | " graph_store, \n", 56 | " vector_store\n", 57 | ")\n", 58 | "\n", 59 | "doc_urls = [\n", 60 | " 'https://docs.aws.amazon.com/neptune/latest/userguide/intro.html',\n", 61 | " 'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/what-is-neptune-analytics.html',\n", 62 | " 'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-features.html',\n", 63 | " 'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-vs-neptune-database.html'\n", 64 | "]\n", 65 | "\n", 66 | "docs = SimpleWebPageReader(\n", 67 | " html_to_text=True,\n", 68 | " metadata_fn=lambda url:{'url': url}\n", 69 | ").load_data(doc_urls)\n", 70 | "\n", 71 | "graph_index.extract_and_build(docs, show_progress=True)\n", 72 | "\n", 73 | "print('Complete')" 74 | ] 75 | } 76 | ], 77 | "metadata": { 78 | "kernelspec": { 79 | "display_name": "Python 3", 80 | "language": "python", 81 | "name": "python3" 82 | }, 83 | "language_info": { 84 | "codemirror_mode": { 85 | "name": "ipython", 86 | "version": 3 87 | }, 88 | "file_extension": ".py", 89 | "mimetype": "text/x-python", 90 | "name": "python", 91 | "nbconvert_exporter": "python", 92 | "pygments_lexer": "ipython3", 93 | "version": "3.10.8" 94 | } 95 | }, 96 | "nbformat": 4, 97 | "nbformat_minor": 5 98 | } 99 | -------------------------------------------------------------------------------- /images/byokg_rag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/graphrag-toolkit/e6967b5d2589bfd39bc72be2e4da0c9d1a30e865/images/byokg_rag.png -------------------------------------------------------------------------------- /images/extract-and-build.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/graphrag-toolkit/e6967b5d2589bfd39bc72be2e4da0c9d1a30e865/images/extract-and-build.png -------------------------------------------------------------------------------- /images/hybrid-extract-and-build.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/graphrag-toolkit/e6967b5d2589bfd39bc72be2e4da0c9d1a30e865/images/hybrid-extract-and-build.png -------------------------------------------------------------------------------- /images/lexical-graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/graphrag-toolkit/e6967b5d2589bfd39bc72be2e4da0c9d1a30e865/images/lexical-graph.png -------------------------------------------------------------------------------- /images/local-extract-and-build.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/graphrag-toolkit/e6967b5d2589bfd39bc72be2e4da0c9d1a30e865/images/local-extract-and-build.png -------------------------------------------------------------------------------- /images/question-answering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/graphrag-toolkit/e6967b5d2589bfd39bc72be2e4da0c9d1a30e865/images/question-answering.png -------------------------------------------------------------------------------- /lexical-graph-contrib/falkordb/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["pbr>=6.1.1"] 3 | build-backend = "pbr.build" 4 | 5 | [project] 6 | name = "graphrag-toolkit-lexical-graph-falkordb" 7 | version = "1.0.1" 8 | description = "FalkorDB support for the AWS GraphRAG Toolkit, lexical graph" 9 | readme = "README.md" 10 | requires-python = ">=3.10" 11 | dynamic = ["dependencies"] 12 | license = "Apache-2.0" 13 | 14 | [tool.setuptools.dynamic] 15 | dependencies = {file = ["src/requirements.txt"]} 16 | -------------------------------------------------------------------------------- /lexical-graph-contrib/falkordb/src/graphrag_toolkit/lexical_graph/storage/graph/falkordb/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .falkordb_graph_store_factory import FalkorDBGraphStoreFactory 5 | from .falkordb_graph_store import FalkorDBDatabaseClient 6 | -------------------------------------------------------------------------------- /lexical-graph-contrib/falkordb/src/graphrag_toolkit/lexical_graph/storage/graph/falkordb/falkordb_graph_store_factory.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Union 3 | from falkordb.node import Node 4 | from falkordb.edge import Edge 5 | from falkordb.path import Path 6 | 7 | from graphrag_toolkit.lexical_graph.storage.graph import GraphStoreFactoryMethod, GraphStore, get_log_formatting 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | FALKORDB = 'falkordb://' 12 | FALKORDB_DNS = 'falkordb.com' 13 | DEFAULT_DATABASE_NAME = 'graphrag' 14 | QUERY_RESULT_TYPE = Union[List[List[Node]], List[List[List[Path]]], List[List[Edge]]] 15 | 16 | class FalkorDBGraphStoreFactory(GraphStoreFactoryMethod): 17 | 18 | def try_create(self, graph_info:str, **kwargs) -> GraphStore: 19 | endpoint_url = None 20 | if graph_info.startswith(FALKORDB): 21 | endpoint_url = graph_info[len(FALKORDB):] 22 | elif graph_info.endswith(FALKORDB_DNS): 23 | endpoint_url = graph_info 24 | if endpoint_url: 25 | try: 26 | from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBDatabaseClient 27 | logger.debug(f'Opening FalkorDB database [endpoint: {endpoint_url}]') 28 | return FalkorDBDatabaseClient( 29 | endpoint_url=endpoint_url, 30 | log_formatting=get_log_formatting(kwargs), 31 | **kwargs 32 | ) 33 | except ImportError as e: 34 | raise e 35 | 36 | else: 37 | return None -------------------------------------------------------------------------------- /lexical-graph-contrib/falkordb/src/requirements.txt: -------------------------------------------------------------------------------- 1 | FalkorDB 2 | redis -------------------------------------------------------------------------------- /lexical-graph-contrib/falkordb/src/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = graphrag-toolkit-lexical-graph-falkordb -------------------------------------------------------------------------------- /lexical-graph-contrib/falkordb/src/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | setuptools.setup(pbr=True) -------------------------------------------------------------------------------- /lexical-graph/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling", "hatch-requirements-txt"] 3 | build-backend = "hatchling.build" 4 | 5 | [tool.hatch.build.targets.wheel] 6 | packages = ["src/graphrag_toolkit"] 7 | 8 | [project] 9 | name = "graphrag-toolkit-lexical-graph" 10 | version = "3.9.0-SNAPSHOT" 11 | description = "AWS GraphRAG Toolkit, lexical graph" 12 | readme = "README.md" 13 | requires-python = ">=3.10" 14 | dynamic = ["dependencies"] 15 | license = "Apache-2.0" 16 | 17 | [tool.hatch.metadata.hooks.requirements_txt] 18 | files = ["src/graphrag_toolkit/lexical_graph/requirements.txt"] 19 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .tenant_id import TenantId, DEFAULT_TENANT_ID, TenantIdType, to_tenant_id 5 | from .config import GraphRAGConfig as GraphRAGConfig, LLMType, EmbeddingType 6 | from .errors import ModelError, BatchJobError, IndexError 7 | from .logging import set_logging_config, set_advanced_logging_config 8 | from .lexical_graph_query_engine import LexicalGraphQueryEngine 9 | from .lexical_graph_index import LexicalGraphIndex 10 | from .lexical_graph_index import ExtractionConfig, BuildConfig, IndexingConfig 11 | from . import utils 12 | from . import indexing 13 | from . import retrieval 14 | from . import storage 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/errors.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | class ModelError(Exception): 5 | pass 6 | 7 | class BatchJobError(Exception): 8 | pass 9 | 10 | class IndexError(Exception): 11 | pass 12 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .node_handler import NodeHandler 5 | from .utils.pipeline_utils import sink 6 | from .utils.metadata_utils import last_accessed_date 7 | from .id_generator import IdGenerator 8 | from . import build 9 | from . import extract 10 | from . import load 11 | from . import utils 12 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .build_pipeline import BuildPipeline 5 | from .vector_indexing import VectorIndexing 6 | from .graph_construction import GraphConstruction 7 | from .checkpoint import Checkpoint 8 | from .build_filters import BuildFilters, DEFAULT_BUILD_FILTER -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/graph_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | from typing import Dict, Any 6 | 7 | from graphrag_toolkit.lexical_graph.storage.graph import GraphStore 8 | 9 | from llama_index.core.schema import BaseComponent, BaseNode 10 | 11 | class GraphBuilder(BaseComponent): 12 | """ 13 | Handles the construction and management of graph structures. 14 | 15 | GraphBuilder serves as a base class for implementing components that facilitate the 16 | creation, indexing, and management of graphs within a specific application context. It 17 | provides a structure for defining essential methods that subclasses must override to 18 | handle graph building and indexing functionalities. 19 | 20 | Attributes: 21 | index_key (str): Represents the unique key or identifier used to distinguish 22 | the indexing strategy of the graph structure. 23 | """ 24 | def _to_params(self, p:Dict): 25 | """ 26 | Converts a given dictionary into a specific parameters structure expected by 27 | the application or system. 28 | 29 | The function processes the input dictionary and wraps it inside another 30 | dictionary under the key `'params'`. It ensures consistency in the data 31 | format for further use or processing. 32 | 33 | Args: 34 | p (Dict): A dictionary containing the parameters to be converted. 35 | 36 | Returns: 37 | Dict: A dictionary wrapping the input as a value under the key `'params'`. 38 | """ 39 | return { 'params': [p] } 40 | 41 | @classmethod 42 | @abc.abstractmethod 43 | def index_key(cls) -> str: 44 | """ 45 | Defines an abstract class method to retrieve the index key associated with 46 | the implementing class. This method must be implemented by all subclasses to 47 | provide a unique identifier or key for indexing purposes. 48 | 49 | Returns: 50 | str: A string representing the index key for the class. 51 | """ 52 | pass 53 | 54 | @abc.abstractmethod 55 | def build(self, node:BaseNode, graph_client: GraphStore, **kwargs:Any): 56 | """ 57 | An abstract method designated for building a specific process related to a 58 | BaseNode within a GraphStore using additional parameters. 59 | 60 | Args: 61 | node: The node instance of type BaseNode on which the build operation 62 | is performed. 63 | graph_client: The graph storage client of type GraphStore responsible 64 | for managing graph operations. 65 | **kwargs: Arbitrary additional arguments that may be required for the 66 | build operation specific to the implementation. 67 | """ 68 | pass -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/null_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from typing import List, Any 6 | 7 | from graphrag_toolkit.lexical_graph.indexing import NodeHandler 8 | 9 | from llama_index.core.schema import BaseNode 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class NullBuilder(NodeHandler): 14 | """ 15 | Handles the acceptance of nodes without performing any transformations, primarily 16 | used as a pass-through handler. 17 | 18 | The class is designed to process and yield nodes without altering their state. This 19 | can be helpful in scenarios where nodes need to be logged or monitored without any 20 | modification. The class inherits from `NodeHandler`. 21 | 22 | Attributes: 23 | None 24 | """ 25 | def accept(self, nodes: List[BaseNode], **kwargs: Any): 26 | """ 27 | Accepts a list of nodes and processes them, yielding each node while logging its acceptance. This function is designed 28 | to produce a generator for the given nodes after logging their node IDs. 29 | 30 | Args: 31 | nodes (List[BaseNode]): A list of nodes to be processed. Each node is expected to have a `node_id` attribute 32 | which will be used for logging. 33 | **kwargs (Any): Additional arguments that might be used for extended functionality or context, but are not 34 | required for this function's core behavior. 35 | 36 | Yields: 37 | BaseNode: Each node from the input list is yielded after being processed (specifically logged in this case). 38 | """ 39 | for node in nodes: 40 | logger.debug(f'Accepted node [node_id: {node.node_id}]') 41 | yield node -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | TOPICS_KEY = 'aws::graph::topics' 5 | PROPOSITIONS_KEY= 'aws::graph::propositions' 6 | SOURCE_DOC_KEY = 'aws::graph::source_doc' 7 | 8 | DEFAULT_TOPIC = 'context' 9 | DEFAULT_CLASSIFICATION = 'unknown' 10 | DEFAULT_ENTITY_CLASSIFICATIONS = [ 11 | 'Company', 12 | 'Organization', 13 | 'Location', 14 | 'Event', 15 | 'Sports Team', 16 | 'Sports Organization', 17 | 'Person', 18 | 'Role', 19 | 'Character', 20 | 'Product', 21 | 'Service', 22 | 'Media', 23 | 'Creative Work', 24 | 'Game', 25 | 'Software', 26 | 'Financial Instrument' 27 | ] 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .extraction_pipeline import ExtractionPipeline 5 | from .batch_config import BatchConfig 6 | from .llm_proposition_extractor import LLMPropositionExtractor 7 | from .proposition_extractor import PropositionExtractor 8 | from .batch_llm_proposition_extractor import BatchLLMPropositionExtractor 9 | from .batch_topic_extractor import BatchTopicExtractor 10 | from .topic_extractor import TopicExtractor 11 | from .graph_scoped_value_store import GraphScopedValueStore 12 | from .scoped_value_provider import ScopedValueStore, ScopedValueProvider, FixedScopedValueProvider, DEFAULT_SCOPE 13 | from .file_system_tap import FileSystemTap 14 | from .infer_classifications import InferClassifications 15 | from .infer_config import OnExistingClassifications, InferClassificationsConfig 16 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/batch_config.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from dataclasses import dataclass, field 5 | from typing import Optional, List 6 | 7 | @dataclass 8 | class BatchConfig: 9 | """ 10 | Configuration for batch processing settings. 11 | 12 | This class provides a structure for configuring batch processing, including 13 | AWS settings like role ARN, region, and S3 bucket details, as well as network 14 | and batch control parameters. It is designed to facilitate batch operations 15 | by defining a standardized schema for batch-related configurations. 16 | 17 | Attributes: 18 | role_arn (str): ARN of the IAM role used for batch processing. 19 | region (str): AWS region where the batch processing will take place. 20 | bucket_name (str): Name of the S3 bucket used for storing batch-related 21 | data. 22 | key_prefix (Optional[str]): Optional prefix for keys in the S3 bucket. 23 | s3_encryption_key_id (Optional[str]): KMS key ID used for S3 encryption, 24 | if any. 25 | subnet_ids (List[str]): List of subnet IDs used for the network 26 | configuration of the batch processing. 27 | security_group_ids (List[str]): List of security group IDs applied to the 28 | batch processing tasks. 29 | max_batch_size (int): Maximum size of a single batch. Default is 25000. 30 | max_num_concurrent_batches (int): Maximum number of concurrent batches 31 | allowed. Default is 3. 32 | """ 33 | role_arn:str 34 | region:str 35 | bucket_name:str 36 | key_prefix:Optional[str]=None 37 | s3_encryption_key_id:Optional[str]=None 38 | subnet_ids:List[str] = field(default_factory=list) 39 | security_group_ids:List[str] = field(default_factory=list) 40 | max_batch_size:int=25000 41 | max_num_concurrent_batches:int=3 -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/docs_to_nodes.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from typing import List, Any, Sequence 6 | 7 | from graphrag_toolkit.lexical_graph.indexing.build.checkpoint import DoNotCheckpoint 8 | 9 | from llama_index.core.node_parser import NodeParser 10 | from llama_index.core.schema import BaseNode, Document 11 | from llama_index.core.node_parser.node_utils import build_nodes_from_splits 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | class DocsToNodes(NodeParser, DoNotCheckpoint): 16 | """Parses documents into nodes. 17 | 18 | This class is responsible for parsing a collection of documents or nodes into 19 | a corresponding list of nodes. It extends functionality from `NodeParser` and 20 | `DoNotCheckpoint` to ensure compatibility with inheritable features and avoid 21 | saving checkpoints during operations. 22 | 23 | Attributes: 24 | None 25 | """ 26 | def _parse_nodes( 27 | self, 28 | nodes: Sequence[BaseNode], 29 | show_progress: bool = False, 30 | **kwargs: Any, 31 | ) -> List[BaseNode]: 32 | """ 33 | Parses a sequence of nodes into a list of `BaseNode` objects. If a node is of type 34 | `Document`, it converts the node into `BaseNode` by splitting the text and 35 | reconstructing the node. For other node types, it retains the original node. 36 | 37 | Args: 38 | nodes (Sequence[BaseNode]): A sequence of nodes to be parsed. 39 | show_progress (bool): A flag to indicate whether to display progress 40 | during parsing. 41 | **kwargs (Any): Additional keyword arguments for any future extensibility. 42 | 43 | Returns: 44 | List[BaseNode]: A list of parsed `BaseNode` objects. 45 | """ 46 | def to_node(node): 47 | """ 48 | Parses a sequence of nodes and converts documents to nodes where applicable. 49 | 50 | This method processes a given sequence of nodes. If a node is of type Document, 51 | it converts the node into one or more BaseNode instances based on text splits. 52 | For all other node types, it retains the original node. The function also 53 | allows progress tracking if specified. 54 | 55 | Args: 56 | nodes (Sequence[BaseNode]): A sequence of nodes to be parsed and processed. 57 | show_progress (bool): Indicates whether to show progress during parsing. 58 | **kwargs (Any): Additional keyword arguments for customization. 59 | 60 | Returns: 61 | List[BaseNode]: A list of processed BaseNode instances formed from the 62 | input nodes. 63 | """ 64 | if isinstance(node, Document): 65 | return build_nodes_from_splits([node.text], node)[0] 66 | else: 67 | return node 68 | 69 | return [to_node(n) for n in nodes] -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/graph_scoped_value_store.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from typing import List 6 | from graphrag_toolkit.lexical_graph.indexing.extract.scoped_value_provider import ScopedValueStore 7 | from graphrag_toolkit.lexical_graph.storage.graph import GraphStore 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class GraphScopedValueStore(ScopedValueStore): 12 | """Manages and stores values in a graph database with scope-based organization. 13 | 14 | This class allows for storing and retrieving scoped values using a graph database. 15 | Scoped values are organized by a label and associated with a defined scope for easy 16 | management and retrieval. 17 | 18 | Attributes: 19 | graph_store (GraphStore): The graph database store used for executing queries. 20 | """ 21 | graph_store: GraphStore 22 | 23 | def get_scoped_values(self, label:str, scope:str) -> List[str]: 24 | """ 25 | Fetches distinct values associated with a specific label and scope from the graph database. 26 | 27 | This function performs a Cypher query to retrieve distinct values from nodes that match 28 | the specified label and scope in the graph database. The results are then extracted and 29 | returned as a list. 30 | 31 | Args: 32 | label (str): The label used to identify the nodes in the graph. 33 | scope (str): The scope value to filter nodes in the graph. 34 | 35 | Returns: 36 | List[str]: A list of distinct values associated with the input label and scope from 37 | the graph database. 38 | 39 | Raises: 40 | Any exceptions raised by the `execute_query` method within `self.graph_store` or 41 | any database-related issues will propagate to the caller. 42 | """ 43 | cypher = f''' 44 | MATCH (n:`__SYS_SV__{label}__`) 45 | WHERE n.scope=$scope 46 | RETURN DISTINCT n.value AS value 47 | ''' 48 | 49 | params = { 50 | 'scope': scope 51 | } 52 | 53 | results = self.graph_store.execute_query(cypher, params) 54 | 55 | return [result['value'] for result in results] 56 | 57 | def save_scoped_values(self, label:str, scope:str, values:List[str]) -> None: 58 | """ 59 | Saves a list of values associated with a specific label and scope to the graph store. Each value is 60 | processed within the provided scope, and the method ensures a unique combination of scope and value 61 | through the `MERGE` operation in the query. The execution handles retries in case of query failure. 62 | 63 | Args: 64 | label (str): The label used to dynamically define the node label in the query. This allows for 65 | compartmentalization of values within the graph store. 66 | scope (str): A string defining the specific scope in which values will be stored. Used as an 67 | attribute to uniquely identify nodes and group values accordingly. 68 | values (List[str]): A list of string values to be stored in association with the provided scope 69 | and label. 70 | 71 | Returns: 72 | None 73 | """ 74 | cypher = f''' 75 | UNWIND $values AS value 76 | MERGE (:`__SYS_SV__{label}__`{{scope:$scope, value:value}}) 77 | ''' 78 | 79 | params = { 80 | 'scope': scope, 81 | 'values': values 82 | } 83 | 84 | self.graph_store.execute_query_with_retry(cypher, params) 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/infer_config.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from enum import Enum 5 | from dataclasses import dataclass 6 | from typing import Optional 7 | 8 | class OnExistingClassifications(Enum): 9 | """ 10 | Enumeration to define behaviors for handling existing classifications. 11 | 12 | This class is an Enum used to specify how existing classifications 13 | should be handled during operations. It provides three options: 14 | merging with existing classifications, replacing them entirely, 15 | or retaining the existing classifications without changes. 16 | 17 | Attributes: 18 | MERGE_EXISTING: Merges new classifications with the existing ones. 19 | REPLACE_EXISTING: Replaces any existing classifications completely. 20 | RETAIN_EXISTING: Keeps the existing classifications without changes. 21 | """ 22 | MERGE_EXISTING = 1 23 | REPLACE_EXISTING = 2 24 | RETAIN_EXISTING = 3 25 | 26 | @dataclass 27 | class InferClassificationsConfig: 28 | """ 29 | Configuration for inferring classifications in a data processing context. 30 | 31 | This class encapsulates the configuration parameters required for inferring 32 | classifications within a system. It defines the number of samples to process, 33 | the number of iterations to perform, how to handle existing classifications, 34 | and an optional prompt template for customization. 35 | 36 | Attributes: 37 | num_samples (Optional[int]): Number of samples to infer classifications from. 38 | Defaults to 5. 39 | num_iterations (Optional[int]): Number of iterations to perform for the 40 | classification inference process. Defaults to 1. 41 | on_existing_classifications (Optional[OnExistingClassifications]): Strategy 42 | to apply when handling pre-existing classifications. Defaults to 43 | OnExistingClassifications.MERGE_EXISTING. 44 | prompt_template (Optional[str]): Custom template text for classification 45 | prompts, if applicable. Defaults to None. 46 | """ 47 | num_samples:Optional[int]=5 48 | num_iterations:Optional[int]=1 49 | on_existing_classifications:Optional[OnExistingClassifications]=OnExistingClassifications.MERGE_EXISTING 50 | prompt_template:Optional[str]=None -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/pipeline_decorator.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | import six 6 | from typing import Iterable 7 | 8 | from graphrag_toolkit.lexical_graph.indexing.model import SourceDocument 9 | 10 | @six.add_metaclass(abc.ABCMeta) 11 | class PipelineDecorator(): 12 | """ 13 | Abstract base class for defining pipeline decorators. 14 | 15 | This class provides an interface for creating pipeline decorators that 16 | process input documents and transform them through an arbitrary operation. 17 | It is intended to be subclassed, with the abstract methods implemented 18 | to define custom behavior for handling documents in a pipeline. 19 | 20 | Attributes: 21 | None 22 | """ 23 | @abc.abstractmethod 24 | def handle_input_docs(self, docs:Iterable[SourceDocument]) -> Iterable[SourceDocument]: 25 | """ 26 | Abstract method that processes a collection of SourceDocument instances 27 | and returns an iterable of processed SourceDocument instances. 28 | 29 | This method defines an interface for handling input documents that must 30 | be implemented by subclasses. The implementation of the method should 31 | provide the logic for processing the documents within the iterable input. 32 | 33 | Args: 34 | docs (Iterable[SourceDocument]): An iterable collection of SourceDocument 35 | objects to be processed. 36 | 37 | Returns: 38 | Iterable[SourceDocument]: An iterable collection of processed SourceDocument 39 | objects after applying the logic defined in the subclass implementation. 40 | """ 41 | pass 42 | 43 | @abc.abstractmethod 44 | def handle_output_doc(self, doc: SourceDocument) -> SourceDocument: 45 | """ 46 | An abstract method to process and handle an input SourceDocument object 47 | and return the processed SourceDocument. 48 | 49 | Args: 50 | doc (SourceDocument): The input document that needs to be processed. 51 | 52 | Returns: 53 | SourceDocument: The processed document after handling. 54 | """ 55 | pass 56 | 57 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/source_doc_parser.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | from typing import Iterable 6 | 7 | from graphrag_toolkit.lexical_graph.indexing.model import SourceDocument 8 | 9 | from llama_index.core.schema import BaseComponent 10 | 11 | class SourceDocParser(BaseComponent): 12 | """ 13 | Parses source documents and provides an interface for handling document parsing logic. 14 | 15 | This class serves as an abstract base for implementing source document parsing 16 | functionality. The main purpose of the class is to define a generic interface that 17 | subclasses implement to customize how source documents are parsed. It ensures a 18 | consistent API for parsing while delegating the specific implementation of parsing 19 | to the subclasses. 20 | 21 | Attributes: 22 | None 23 | """ 24 | @abc.abstractmethod 25 | def _parse_source_docs(self, source_documents:Iterable[SourceDocument]) -> Iterable[SourceDocument]: 26 | """ 27 | Parses a collection of source documents and processes them into a specified format. 28 | 29 | This method is intended to be overridden by subclasses to provide specific 30 | logic for processing the input documents and transforming them into the desired 31 | output. The input documents should be iterable, and the output must also be an 32 | iterable containing the processed documents. This is an abstract method, and 33 | instantiating the containing class without implementing this method will result 34 | in errors. 35 | 36 | Args: 37 | source_documents: An iterable of `SourceDocument` instances representing 38 | the input documents to be processed. 39 | 40 | Returns: 41 | An iterable of `SourceDocument` instances representing the processed 42 | version of the source documents. 43 | 44 | Raises: 45 | NotImplementedError: If the method is not implemented by the subclass. 46 | """ 47 | pass 48 | 49 | def parse_source_docs(self, source_documents:Iterable[SourceDocument]) -> Iterable[SourceDocument]: 50 | """ 51 | Parses a collection of source documents and processes them through an internal 52 | parsing mechanism. 53 | 54 | Args: 55 | source_documents: An iterable of SourceDocument objects to be parsed. Each 56 | document is expected to contain the necessary structure and metadata 57 | required by the parser. 58 | 59 | Returns: 60 | An iterable of SourceDocument objects that have been processed and parsed 61 | using the internal parsing mechanism. 62 | """ 63 | return self._parse_source_docs(source_documents) -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/load/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .bedrock_knowledge_base import BedrockKnowledgeBaseExport 5 | from .file_based_chunks import FileBasedChunks 6 | from .s3_based_chunks import S3BasedChunks 7 | from .file_based_docs import FileBasedDocs 8 | from .s3_based_docs import S3BasedDocs 9 | from .json_array_reader import JSONArrayReader 10 | from .source_documents import SourceDocuments -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/load/source_documents.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import Callable, List 5 | from llama_index.core import Document 6 | 7 | class SourceDocuments: 8 | """ 9 | Represents a collection of source document generator functions. 10 | 11 | This class is designed to encapsulate a list of callable functions that generate 12 | source documents. It provides an iterable interface to iterate through all source 13 | documents produced by these functions. The class supports handling nested lists 14 | of documents, yielding individual document items. 15 | 16 | Attributes: 17 | source_documents_fns (List[Callable[[], List[Document]]]): A list of callable 18 | functions that, when invoked, return lists of documents or nested lists 19 | of documents. 20 | """ 21 | def __init__(self, source_documents_fns: List[Callable[[], List[Document] ]]): 22 | """ 23 | Initializes an instance of the class, setting up the source document functions. 24 | 25 | Args: 26 | source_documents_fns (List[Callable[[], List[Document]]]): A list of 27 | callables. Each callable, when executed, is expected to return a 28 | list of Document objects. 29 | """ 30 | self.source_documents_fns = source_documents_fns 31 | 32 | def __iter__(self): 33 | """ 34 | Yields items from the nested lists or the iterable objects provided by source_documents_fns. 35 | 36 | This method iterates through the callable objects in the `source_documents_fns` attribute, which 37 | are expected to return iterable collections. It recognizes nested lists, iterates through them, 38 | and yields individual items. If the iterable is already flat, it directly yields the items. 39 | 40 | Yields: 41 | Any: The individual elements extracted from the nested or flat iterable structures 42 | returned by the callables in `source_documents_fns`. 43 | """ 44 | for source_documents_fn in self.source_documents_fns: 45 | for source_documents in source_documents_fn(): 46 | if isinstance(source_documents, list): 47 | for item in source_documents: 48 | if isinstance(item, list): 49 | for i in item: 50 | yield i 51 | else: 52 | yield item 53 | else: 54 | yield source_documents -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/node_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | from typing import List, Any, Generator 6 | from llama_index.core.schema import BaseNode 7 | from llama_index.core.schema import TransformComponent 8 | from llama_index.core.bridge.pydantic import Field 9 | 10 | class NodeHandler(TransformComponent): 11 | """ 12 | Handles the processing and transformation of node data. 13 | 14 | This class is designed to process a collection of nodes with optional 15 | parameters. It serves as a base class for customizable node handling 16 | operations, requiring the implementation of the `accept` method to 17 | define specific processing logic. The `__call__` method is provided 18 | for use as a callable, enabling straightforward invocation of the 19 | processing logic. 20 | 21 | Attributes: 22 | show_progress (bool): Whether to show progress during processing. 23 | """ 24 | show_progress: bool = Field(default=True, description='Whether to show progress.') 25 | 26 | def __call__(self, nodes: List[BaseNode], **kwargs: Any) -> List[BaseNode]: 27 | """ 28 | Processes and filters a list of nodes by applying the accept method to each node. 29 | 30 | The method takes a list of BaseNode objects, applies the accept method, and 31 | returns a new list containing the results. 32 | 33 | Args: 34 | nodes: A list of BaseNode objects that need to be processed. 35 | **kwargs: Additional keyword arguments that can be passed to the accept 36 | method. 37 | 38 | Returns: 39 | A list of BaseNode objects that have been processed by the accept method. 40 | """ 41 | return [n for n in self.accept(nodes, **kwargs)] 42 | 43 | @abc.abstractmethod 44 | def accept(self, nodes: List[BaseNode], **kwargs: Any) -> Generator[BaseNode, None, None]: 45 | """ 46 | Abstract base class for implementing a visitor pattern that can process 47 | a collection of nodes. This requires subclasses to implement the `accept` 48 | method to define their processing logic. 49 | 50 | Args: 51 | nodes: A list of nodes derived from the BaseNode class that are to 52 | be processed by the visitor pattern. 53 | **kwargs: Additional keyword arguments that can be passed during the 54 | processing of the nodes. 55 | 56 | Yields: 57 | BaseNode: Processed node instances derived from BaseNode, one at 58 | a time as the generator progresses. 59 | """ 60 | raise NotImplementedError() -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/utils/metadata_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import datetime 5 | 6 | def get_properties_str(properties, default): 7 | if properties: 8 | return ';'.join(sorted([f'{k}:{v}' for k,v in properties.items()])) 9 | else: 10 | return default 11 | 12 | def last_accessed_date(*args): 13 | return { 14 | 'last_accessed_date': datetime.datetime.now().strftime("%Y-%m-%d") 15 | } -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/utils/pipeline_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from pipe import Pipe 5 | from concurrent.futures import ProcessPoolExecutor 6 | from functools import partial 7 | from typing import List, Optional, Sequence, Any, cast, Callable 8 | 9 | 10 | from llama_index.core.ingestion import IngestionPipeline 11 | from llama_index.core.ingestion.pipeline import run_transformations 12 | from llama_index.core.schema import BaseNode 13 | 14 | def _sink(): 15 | def _sink_from(generator): 16 | for item in generator: 17 | pass 18 | return Pipe(_sink_from) 19 | 20 | sink = _sink() 21 | 22 | def run_pipeline( 23 | pipeline:IngestionPipeline, 24 | node_batches:List[List[BaseNode]], 25 | cache_collection: Optional[str] = None, 26 | in_place: bool = True, 27 | num_workers: int = 1, 28 | **kwargs: Any, 29 | ) -> Sequence[BaseNode]: 30 | transform: Callable[[List[BaseNode]], List[BaseNode]] = partial( 31 | run_transformations, 32 | transformations=pipeline.transformations, 33 | in_place=in_place, 34 | cache=pipeline.cache if not pipeline.disable_cache else None, 35 | cache_collection=cache_collection, 36 | **kwargs 37 | ) 38 | 39 | with ProcessPoolExecutor(max_workers=num_workers) as p: 40 | processed_node_batches = p.map(transform, node_batches) 41 | processed_nodes = sum(processed_node_batches, start=cast(List[BaseNode], [])) 42 | 43 | return processed_nodes -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | """ 5 | This module exposes the core prompt provider interface and registry entry point. 6 | 7 | To avoid circular import errors, concrete provider classes (S3, Bedrock, File, Static) 8 | are not imported here. Use `prompt_provider_config.py` to dynamically construct providers. 9 | """ 10 | 11 | from .prompt_provider_base import PromptProvider 12 | from .prompt_provider_registry import PromptProviderRegistry 13 | 14 | __all__ = [ 15 | "PromptProvider", 16 | "PromptProviderRegistry", 17 | ] 18 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/bedrock_prompt_provider.py: -------------------------------------------------------------------------------- 1 | # graphrag_toolkit/lexical_graph/prompts/bedrock_prompt_provider.py 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_base import PromptProvider 5 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import BedrockPromptProviderConfig 6 | from graphrag_toolkit.lexical_graph.logging import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | class BedrockPromptProvider(PromptProvider): 11 | """Provides prompt templates from AWS Bedrock using specified ARNs and versions. 12 | 13 | This class loads and returns system and user prompt templates from AWS Bedrock, 14 | based on configuration provided at initialization. 15 | """ 16 | 17 | def __init__(self, config: BedrockPromptProviderConfig): 18 | self.config = config 19 | 20 | logger.info( 21 | f"[Prompt Debug] Using BedrockPromptProvider with:\n" 22 | f" system_prompt_arn={config.system_prompt_arn} " 23 | f"(resolved={config.resolved_system_prompt_arn}, version={config.system_prompt_version})\n" 24 | f" user_prompt_arn={config.user_prompt_arn} " 25 | f"(resolved={config.resolved_user_prompt_arn}, version={config.user_prompt_version})\n" 26 | f" region={config.aws_region}, profile={config.aws_profile}" 27 | ) 28 | 29 | def _load_prompt(self, prompt_arn: str, version: str = None) -> str: 30 | """Loads a prompt template from AWS Bedrock using the given ARN and version. 31 | 32 | Args: 33 | prompt_arn: The ARN of the prompt to load. 34 | version: The version of the prompt to load (optional). 35 | 36 | Returns: 37 | The text of the loaded prompt template. 38 | 39 | Raises: 40 | RuntimeError: If the prompt or its text cannot be found or loaded. 41 | """ 42 | try: 43 | kwargs = {"promptIdentifier": prompt_arn} 44 | if version: 45 | kwargs["promptVersion"] = version 46 | 47 | response = self.config.bedrock.get_prompt(**kwargs) 48 | 49 | variants = response.get("variants", []) 50 | if not variants: 51 | raise RuntimeError(f"No variants found for prompt: {prompt_arn}") 52 | 53 | text = variants[0].get("templateConfiguration", {}).get("text", {}).get("text") 54 | if not text: 55 | raise RuntimeError(f"Prompt text not found for: {prompt_arn}") 56 | 57 | return text.strip() 58 | 59 | except Exception as e: 60 | logger.error(f"Failed to load prompt for {prompt_arn}: {str(e)}") 61 | raise RuntimeError(f"Could not load prompt from Bedrock: {prompt_arn}") from e 62 | 63 | def get_system_prompt(self) -> str: 64 | """Retrieves the system prompt template from AWS Bedrock. 65 | 66 | Returns: 67 | The text of the system prompt template. 68 | """ 69 | return self._load_prompt( 70 | self.config.resolved_system_prompt_arn, 71 | self.config.system_prompt_version, 72 | ) 73 | 74 | def get_user_prompt(self) -> str: 75 | """Retrieves the user prompt template from AWS Bedrock. 76 | 77 | Returns: 78 | The text of the user prompt template. 79 | """ 80 | return self._load_prompt( 81 | self.config.resolved_user_prompt_arn, 82 | self.config.user_prompt_version, 83 | ) 84 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/file_prompt_provider.py: -------------------------------------------------------------------------------- 1 | import os 2 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_base import PromptProvider 3 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import FilePromptProviderConfig 4 | from graphrag_toolkit.lexical_graph.logging import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | class FilePromptProvider(PromptProvider): 9 | """ 10 | Loads system and user prompts from the local filesystem using a config object. 11 | """ 12 | 13 | def __init__(self, config: FilePromptProviderConfig, system_prompt_file: str = "system_prompt.txt", user_prompt_file: str = "user_prompt.txt"): 14 | """ 15 | Initializes the FilePromptProvider with a configuration and prompt file names. 16 | 17 | Args: 18 | config: The configuration object specifying the base path for prompt files. 19 | system_prompt_file: The filename for the system prompt (default is "system_prompt.txt"). 20 | user_prompt_file: The filename for the user prompt (default is "user_prompt.txt"). 21 | 22 | Raises: 23 | NotADirectoryError: If the provided base path does not exist or is not a directory. 24 | """ 25 | if not os.path.isdir(config.base_path): 26 | raise NotADirectoryError(f"Invalid or non-existent directory: {config.base_path}") 27 | self.config = config 28 | self.system_prompt_file = system_prompt_file 29 | self.user_prompt_file = user_prompt_file 30 | 31 | logger.info(f"[Prompt Debug] Initialized FilePromptProvider") 32 | logger.info(f"[Prompt Debug] Base path: {self.config.base_path}") 33 | logger.info(f"[Prompt Debug] System prompt file: {self.system_prompt_file}") 34 | logger.info(f"[Prompt Debug] User prompt file: {self.user_prompt_file}") 35 | 36 | def _load_prompt(self, filename: str) -> str: 37 | """ 38 | Loads the contents of a prompt file from the configured base path. 39 | 40 | Args: 41 | filename: The name of the prompt file to load. 42 | 43 | Returns: 44 | The contents of the prompt file as a string. 45 | 46 | Raises: 47 | FileNotFoundError: If the prompt file does not exist. 48 | OSError: If the file cannot be read. 49 | """ 50 | path = os.path.join(self.config.base_path, filename) 51 | if not os.path.exists(path): 52 | raise FileNotFoundError(f"Prompt file not found: {path}") 53 | try: 54 | with open(path, "r", encoding="utf-8") as f: 55 | return f.read().rstrip() 56 | except OSError as e: 57 | raise OSError(f"Failed to read prompt file {path}: {str(e)}") from e 58 | 59 | def get_system_prompt(self) -> str: 60 | """ 61 | Returns the contents of the system prompt file. 62 | 63 | Returns: 64 | The contents of the system prompt file as a string. 65 | """ 66 | return self._load_prompt(self.system_prompt_file) 67 | 68 | def get_user_prompt(self) -> str: 69 | """ 70 | Returns the contents of the user prompt file. 71 | 72 | Returns: 73 | The contents of the user prompt file as a string. 74 | """ 75 | return self._load_prompt(self.user_prompt_file) 76 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_base.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from abc import ABC, abstractmethod 5 | from graphrag_toolkit.lexical_graph.logging import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | class PromptProvider(ABC): 10 | """ 11 | Abstract base class for loading prompts from various sources. 12 | """ 13 | 14 | @abstractmethod 15 | def get_system_prompt(self) -> str: 16 | """ 17 | Returns the system prompt as a string. 18 | """ 19 | pass 20 | 21 | @abstractmethod 22 | def get_user_prompt(self) -> str: 23 | """ 24 | Returns the user prompt as a string. 25 | """ 26 | pass 27 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_config_base.py: -------------------------------------------------------------------------------- 1 | # prompt_provider_config_base.py 2 | from pydantic import BaseModel 3 | from typing import Optional 4 | 5 | class FilePromptProviderConfig(BaseModel): 6 | """ 7 | Configuration model for file-based prompt providers. 8 | 9 | This class defines the required fields for specifying system and user prompt file names. 10 | """ 11 | system_prompt_file: str 12 | user_prompt_file: str 13 | 14 | class S3PromptProviderConfig(BaseModel): 15 | """ 16 | Configuration model for S3-based prompt providers. 17 | 18 | This class defines the required fields for specifying the S3 bucket, key, and optional region for prompt storage. 19 | """ 20 | bucket: str 21 | key: str 22 | region: Optional[str] = None 23 | 24 | class BedrockPromptProviderConfig(BaseModel): 25 | """ 26 | Configuration model for Bedrock-based prompt providers. 27 | 28 | This class defines the required field for specifying the Bedrock prompt ARN. 29 | """ 30 | prompt_arn: str 31 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_factory.py: -------------------------------------------------------------------------------- 1 | # graphrag_toolkit/lexical_graph/prompts/prompt_provider_factory.py 2 | 3 | import os 4 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_base import PromptProvider 5 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import ( 6 | BedrockPromptProviderConfig, 7 | S3PromptProviderConfig, 8 | FilePromptProviderConfig, 9 | StaticPromptProviderConfig, 10 | ) 11 | from graphrag_toolkit.lexical_graph.logging import logging 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class PromptProviderFactory: 17 | """ 18 | Factory class for creating PromptProvider instances based on environment configuration. 19 | 20 | This class selects and builds the appropriate PromptProvider implementation according to the PROMPT_PROVIDER environment variable. 21 | """ 22 | @staticmethod 23 | def get_provider() -> PromptProvider: 24 | """ 25 | Returns a PromptProvider instance based on the PROMPT_PROVIDER environment variable. 26 | 27 | This method selects and builds the appropriate PromptProvider implementation for Bedrock, S3, file, or static sources. 28 | 29 | Returns: 30 | PromptProvider: An instance of the selected PromptProvider implementation. 31 | """ 32 | provider_type = os.getenv("PROMPT_PROVIDER", "static").lower() 33 | 34 | if provider_type == "bedrock": 35 | return BedrockPromptProviderConfig().build() 36 | elif provider_type == "s3": 37 | return S3PromptProviderConfig().build() 38 | elif provider_type == "file": 39 | return FilePromptProviderConfig().build() 40 | else: 41 | # Final fallback to static default prompts 42 | return StaticPromptProviderConfig().build() 43 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_registry.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import Optional, Dict 5 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_base import PromptProvider 6 | from graphrag_toolkit.lexical_graph.logging import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | class PromptProviderRegistry: 11 | """ 12 | Global registry for managing and retrieving named PromptProvider instances. 13 | Supports multiple sources (e.g., Bedrock, S3, File) and default fallback. 14 | """ 15 | 16 | _registry: Dict[str, PromptProvider] = {} 17 | _default_provider_name: Optional[str] = None 18 | 19 | @classmethod 20 | def register(cls, name: str, provider: PromptProvider, default: bool = False) -> None: 21 | """ 22 | Register a prompt provider under a unique name. 23 | Optionally, set it as the default provider. 24 | 25 | Parameters 26 | ---------- 27 | name : str 28 | The unique name for the provider (e.g., "aws-prod", "local-dev"). 29 | provider : PromptProvider 30 | The provider instance to register. 31 | default : bool 32 | Whether to make this the default provider. 33 | """ 34 | cls._registry[name] = provider 35 | if default or cls._default_provider_name is None: 36 | cls._default_provider_name = name 37 | 38 | @classmethod 39 | def get(cls, name: Optional[str] = None) -> Optional[PromptProvider]: 40 | """ 41 | Retrieve a prompt provider by name, or return the default if no name is specified. 42 | 43 | Parameters 44 | ---------- 45 | name : Optional[str] 46 | The name of the provider to retrieve. 47 | 48 | Returns 49 | ------- 50 | Optional[PromptProvider] 51 | The matching provider instance or None. 52 | """ 53 | if name: 54 | return cls._registry.get(name) 55 | if cls._default_provider_name: 56 | return cls._registry.get(cls._default_provider_name) 57 | return None 58 | 59 | @classmethod 60 | def list_registered(cls) -> Dict[str, PromptProvider]: 61 | """ 62 | List all registered prompt providers. 63 | 64 | Returns 65 | ------- 66 | Dict[str, PromptProvider] 67 | A dictionary of provider names and their instances. 68 | """ 69 | return cls._registry.copy() 70 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/s3_prompt_provider.py: -------------------------------------------------------------------------------- 1 | # graphrag_toolkit/lexical_graph/prompts/s3_prompt_provider.py 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_base import PromptProvider 6 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import S3PromptProviderConfig 7 | from graphrag_toolkit.lexical_graph.logging import logging 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class S3PromptProvider(PromptProvider): 12 | """ 13 | Loads system and user prompts from an S3 bucket using provided configuration. 14 | 15 | Attributes: 16 | config (S3PromptProviderConfig): Configuration object including bucket, prefix, 17 | and optionally custom file names for prompts. 18 | """ 19 | 20 | def __init__(self, config: S3PromptProviderConfig): 21 | self.config = config 22 | 23 | def _load_prompt(self, filename: str) -> str: 24 | """ 25 | Loads a prompt file from the configured S3 bucket and returns its contents as a string. 26 | 27 | Args: 28 | filename: The name of the prompt file to load from S3. 29 | 30 | Returns: 31 | The contents of the prompt file as a UTF-8 string. 32 | """ 33 | key = f"{self.config.prefix.rstrip('/')}/{filename}" 34 | logger.info(f"[Prompt Debug] Loading prompt from S3: s3://{self.config.bucket}/{key}") 35 | s3_client = self.config.s3 # session-aware S3 client from config 36 | response = s3_client.get_object(Bucket=self.config.bucket, Key=key) 37 | return response["Body"].read().decode("utf-8").rstrip() 38 | 39 | def get_system_prompt(self) -> str: 40 | """ 41 | Retrieves the system prompt from S3. 42 | 43 | Returns: 44 | The contents of the system prompt file. 45 | """ 46 | return self._load_prompt(self.config.system_prompt_file) 47 | 48 | def get_user_prompt(self) -> str: 49 | """ 50 | Retrieves the user prompt from S3. 51 | 52 | Returns: 53 | The contents of the user prompt file. 54 | """ 55 | return self._load_prompt(self.config.user_prompt_file) 56 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/static_prompt_provider.py: -------------------------------------------------------------------------------- 1 | # static_prompt_provider.py 2 | 3 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_base import PromptProvider 4 | from graphrag_toolkit.lexical_graph.retrieval.prompts import ( 5 | ANSWER_QUESTION_SYSTEM_PROMPT, 6 | ANSWER_QUESTION_USER_PROMPT, 7 | ) 8 | from graphrag_toolkit.lexical_graph.logging import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class StaticPromptProvider(PromptProvider): 13 | """ 14 | Provides static system and user prompts for use in the application. 15 | This class returns predefined prompt strings that do not change at runtime. 16 | """ 17 | def __init__(self): 18 | """ 19 | Initializes a StaticPromptProvider with predefined system and user prompts. 20 | This constructor sets the system and user prompts to static values for consistent retrieval. 21 | """ 22 | self._system_prompt = ANSWER_QUESTION_SYSTEM_PROMPT 23 | self._user_prompt = ANSWER_QUESTION_USER_PROMPT 24 | logger.debug(f"System Prompt (truncated): {self._system_prompt[:60]}...") 25 | logger.debug(f"User Prompt (truncated): {self._user_prompt[:60]}...") 26 | 27 | def get_system_prompt(self) -> str: 28 | """ 29 | Returns the static system prompt string. 30 | This method provides the system prompt that is set during initialization. 31 | 32 | Returns: 33 | The system prompt as a string. 34 | """ 35 | return self._system_prompt 36 | 37 | def get_user_prompt(self) -> str: 38 | """ 39 | Returns the static user prompt string. 40 | This method provides the user prompt that is set during initialization. 41 | 42 | Returns: 43 | The user prompt as a string. 44 | """ 45 | return self._user_prompt 46 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/protocols/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .mcp_server import create_mcp_server -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/requirements.txt: -------------------------------------------------------------------------------- 1 | anthropic-bedrock==0.8.0 2 | boto3>=1.36.1 3 | botocore>=1.36.1 4 | json2xml==5.0.5 5 | llama-index-core==0.12.37 6 | llama-index-embeddings-bedrock==0.5.0 7 | llama-index-llms-anthropic==0.6.19 8 | llama-index-llms-bedrock-converse==0.6.0 9 | lru-dict==1.3.0 10 | pipe==2.2 11 | python-dotenv==1.0.1 12 | smart_open==7.1.0 13 | spacy==3.7.5 14 | tfidf_matcher==0.3.0 -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from . import post_processors 5 | from . import processors 6 | from . import retrievers 7 | from . import utils -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/post_processors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .reranker_mixin import RerankerMixin 5 | from .enrich_source_details import EnrichSourceDetails 6 | from .bedrock_context_format import BedrockContextFormat 7 | from .sentence_reranker import SentenceReranker 8 | from .statement_diversity import StatementDiversityPostProcessor 9 | from .statement_enhancement import StatementEnhancementPostProcessor -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/post_processors/reranker_mixin.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from abc import ABC, abstractmethod 5 | from typing import List, Tuple 6 | 7 | from llama_index.core.postprocessor.types import BaseNodePostprocessor 8 | from llama_index.core.schema import NodeWithScore, QueryBundle 9 | 10 | class RerankerMixin(ABC): 11 | """ 12 | Provides an abstract base class for rerankers with mixin functionality. 13 | 14 | This class serves as a foundational mixin for implementing custom rerankers. 15 | It defines the required interface that any subclass must implement, including 16 | a property to retrieve batch size and a method to rerank given pairs of data. 17 | Subclasses of this mixin are expected to define domain-specific behavior for 18 | reranking operations. 19 | 20 | Attributes: 21 | batch_size (int): Abstract property defining the number of items processed 22 | in a batch by the reranker. 23 | """ 24 | @property 25 | @abstractmethod 26 | def batch_size(self): 27 | """ 28 | Abstract property that defines the batch size for a specific object or operation. 29 | 30 | This property serves as an interface for retrieving or working with the batch size, 31 | making it mandatory to implement in any subclass that inherits from the class which 32 | declares this property. Subclasses must define the behavior and value associated 33 | with this property. 34 | 35 | Attributes: 36 | batch_size: An integer representing the size of the batch used in the context 37 | of the implementation. 38 | 39 | """ 40 | pass 41 | 42 | @abstractmethod 43 | def rerank_pairs(self, pairs: List[Tuple[str, str]], batch_size: int = 128) -> List[float]: 44 | """ 45 | Reranks the given list of key-value pairs by assigning a numerical score to each pair. 46 | The reranking operation is expected to be implemented by subclasses inheriting this 47 | abstract method. The method takes a list of tuples (key-value pairs) and an optional 48 | batch size parameter to process the data in chunks. 49 | 50 | Args: 51 | pairs: 52 | A list of tuples, where each tuple contains two strings representing the 53 | key-value pair to be reranked. 54 | batch_size: 55 | An optional integer specifying the size of data chunks to process in batches. 56 | Default value is 128. 57 | 58 | Returns: 59 | A list of float values corresponding to the reranked scores for the given 60 | key-value pairs. The returned list should maintain the same order as the input list. 61 | 62 | """ 63 | pass 64 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .processor_args import ProcessorArgs 5 | from .processor_base import ProcessorBase 6 | from .clear_chunks import ClearChunks 7 | from .clear_scores import ClearScores 8 | from .dedup_results import DedupResults 9 | from .disaggregate_results import DisaggregateResults 10 | from .filter_by_metadata import FilterByMetadata 11 | from .format_sources import FormatSources 12 | from .populate_statement_strs import PopulateStatementStrs 13 | from .prune_results import PruneResults 14 | from .prune_statements import PruneStatements 15 | from .rerank_statements import RerankStatements 16 | from .rescore_results import RescoreResults 17 | from .simplify_single_topic_results import SimplifySingleTopicResults 18 | from .sort_results import SortResults 19 | from .statements_to_strings import StatementsToStrings 20 | from .truncate_results import TruncateResults 21 | from .truncate_statements import TruncateStatements 22 | from .zero_scores import ZeroScores 23 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/clear_chunks.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig 5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs 6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection, SearchResult, Topic 7 | 8 | from llama_index.core.schema import QueryBundle 9 | 10 | class ClearChunks(ProcessorBase): 11 | """ 12 | Handles the clearing of chunks within topics in a collection of search results. 13 | 14 | The ClearChunks class is responsible for modifying topics by removing their 15 | associated chunks. This is done iteratively over a collection of search results. 16 | It inherits from `ProcessorBase` and utilizes its utility methods to perform 17 | operations on topics and search results. This processor may be used in 18 | situations where textual or data chunks associated with topics need to be 19 | removed from search results for further processing or analysis. 20 | 21 | Attributes: 22 | args (ProcessorArgs): Configuration arguments passed to the processor, 23 | defining its behavior and settings. 24 | filter_config (FilterConfig): Filtering configuration that determines 25 | how the processor handles filtering-related tasks. 26 | """ 27 | def __init__(self, args:ProcessorArgs, filter_config:FilterConfig): 28 | """ 29 | Initializes the instance of the class with the provided arguments and filter configuration. 30 | This sets up the necessary attributes and base class initialization to manage processing 31 | and configuration effectively for the derived use case. 32 | 33 | Args: 34 | args (ProcessorArgs): The processing arguments required for setting up the instance. 35 | filter_config (FilterConfig): The configuration settings for filtering tasks. 36 | """ 37 | super().__init__(args, filter_config) 38 | 39 | def _process_results(self, search_results:SearchResultCollection, query:QueryBundle) -> SearchResultCollection: 40 | """ 41 | Processes search results by clearing the chunks in associated topics. 42 | 43 | This method processes a collection of search results, applying an operation to 44 | clear all chunks associated with the topics in each search result. It modifies 45 | the input search results collection and returns the processed results. 46 | 47 | Args: 48 | search_results: A collection of search results to be processed. 49 | query: A query bundle containing the search query details. 50 | 51 | Returns: 52 | SearchResultCollection: A processed collection of search results where the 53 | chunks in associated topics have been cleared. 54 | """ 55 | def clear_chunks(topic:Topic): 56 | topic.chunks.clear() 57 | return topic 58 | 59 | def clear_search_result_chunks(index:int, search_result:SearchResult): 60 | return self._apply_to_topics(search_result, clear_chunks) 61 | 62 | return self._apply_to_search_results(search_results, clear_search_result_chunks) 63 | 64 | 65 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/clear_scores.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig 5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs 6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection, SearchResult, Topic 7 | 8 | from llama_index.core.schema import QueryBundle 9 | 10 | class ClearScores(ProcessorBase): 11 | """ 12 | Handles the processing of clearing scores from search results. 13 | 14 | This class is designed to process a collection of search results and remove the 15 | scores associated with them. It uses the base ProcessorBase functionality to 16 | apply the clearing operation to each search result in the given collection. 17 | This can be useful in scenarios where the scores are either irrelevant or need 18 | to be redacted for further processing. 19 | 20 | Attributes: 21 | args (ProcessorArgs): Arguments required for initializing the processor, 22 | providing configuration and operational parameters. 23 | filter_config (FilterConfig): Configuration settings for filtering, passed 24 | during instantiation. 25 | """ 26 | def __init__(self, args:ProcessorArgs, filter_config:FilterConfig): 27 | """ 28 | Initializes the class with the provided arguments for processing and filter configuration. 29 | Ensures proper setup by invoking the parent class initializer. 30 | 31 | Args: 32 | args (ProcessorArgs): The arguments required for processing operations. 33 | filter_config (FilterConfig): The configuration for filtering operations. 34 | """ 35 | super().__init__(args, filter_config) 36 | 37 | def _process_results(self, search_results:SearchResultCollection, query:QueryBundle) -> SearchResultCollection: 38 | """ 39 | Processes the given search results by applying a scoring operation through 40 | a specified callback function. This method clears the scores of all 41 | search results within the given collection, setting them to None. 42 | 43 | Args: 44 | search_results: The collection of search results to be processed. 45 | query: The query bundle associated with the search results. 46 | 47 | Returns: 48 | SearchResultCollection: A collection of search results with updated scores. 49 | """ 50 | def clear_score(index:int, search_result:SearchResult): 51 | search_result.score = None 52 | return search_result 53 | 54 | return self._apply_to_search_results(search_results, clear_score) 55 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/disaggregate_results.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig 5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs 6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection, SearchResult 7 | 8 | from llama_index.core.schema import QueryBundle 9 | 10 | class DisaggregateResults(ProcessorBase): 11 | """ 12 | Processes search results to disaggregate topics. 13 | 14 | The DisaggregateResults class extends the ProcessorBase class, and its primary 15 | purpose is to process a collection of search results by disaggregating topics 16 | within each search result. Each topic is evaluated individually with its 17 | corresponding score, allowing more granular analysis or filtering. 18 | 19 | Attributes: 20 | args (ProcessorArgs): Configuration and runtime arguments passed 21 | to the processor. 22 | filter_config (FilterConfig): Configuration details related to 23 | filtering criteria and logic. 24 | """ 25 | def __init__(self, args:ProcessorArgs, filter_config:FilterConfig): 26 | """ 27 | Initializes the Processor class with provided arguments and filter configuration. 28 | 29 | This method sets up the processor by utilizing the given configuration and 30 | arguments, ensuring proper initialization for further processing tasks. 31 | 32 | Args: 33 | args (ProcessorArgs): Arguments necessary for configuring the processor. 34 | filter_config (FilterConfig): Configuration details for filter settings. 35 | 36 | """ 37 | super().__init__(args, filter_config) 38 | 39 | def _process_results(self, search_results:SearchResultCollection, query:QueryBundle) -> SearchResultCollection: 40 | """ 41 | Processes and disaggregates search results based on individual topics and their highest statement scores. 42 | 43 | This method analyzes each search result, iterating through the associated topics, and isolates them into 44 | individual search results with updated scores based on the highest statement score within the topic. The 45 | updated collection of search results is then returned. 46 | 47 | Args: 48 | search_results: A collection of search results to be disaggregated and processed. 49 | query: The query bundle that corresponds to the search results. 50 | 51 | Returns: 52 | SearchResultCollection: An updated collection of search results with disaggregated topics and recalculated 53 | scores. 54 | """ 55 | disaggregated_results = [] 56 | 57 | for search_result in search_results.results: 58 | for topic in search_result.topics: 59 | score = max([s.score for s in topic.statements]) 60 | disaggregated_results.append(SearchResult(topics=[topic], source=search_result.source, score=score)) 61 | 62 | search_results = search_results.with_new_results(results=disaggregated_results) 63 | 64 | return search_results 65 | 66 | 67 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/filter_by_metadata.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig 5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs 6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection, SearchResult 7 | 8 | from llama_index.core.schema import QueryBundle 9 | 10 | class FilterByMetadata(ProcessorBase): 11 | """ 12 | Filters search results based on metadata. 13 | 14 | This class is responsible for filtering search results by examining their metadata. 15 | The filtering is applied to a collection of search results, retaining only those 16 | that meet the criteria defined in the filter configuration. 17 | 18 | Attributes: 19 | args (ProcessorArgs): Arguments required for the processing. 20 | filter_config (FilterConfig): Configuration that defines the metadata filtering rules. 21 | """ 22 | def __init__(self, args:ProcessorArgs, filter_config:FilterConfig): 23 | """ 24 | Initializes the class instance and sets up the basic configuration for processing. 25 | 26 | The constructor initializes the parent class with the provided arguments and 27 | filter configuration. It is responsible for setting up any necessary state 28 | or configurations required by the class for further processing. 29 | 30 | Args: 31 | args (ProcessorArgs): The arguments required for initializing the processor. 32 | filter_config (FilterConfig): The configuration settings for the filtering 33 | process. 34 | """ 35 | super().__init__(args, filter_config) 36 | 37 | def _process_results(self, search_results:SearchResultCollection, query:QueryBundle) -> SearchResultCollection: 38 | """ 39 | Processes search results based on the provided query and applies filters to the search result metadata. 40 | 41 | Filters the search results by evaluating the metadata of each result using the filter configuration. 42 | Only results that satisfy the filter criteria are retained. 43 | 44 | Args: 45 | search_results: A collection of search results to be filtered. 46 | query: The query bundle associated with the search results. 47 | 48 | Returns: 49 | SearchResultCollection: A collection of filtered search results. 50 | """ 51 | def filter_search_result(index:int, search_result:SearchResult): 52 | return search_result if self.filter_config.filter_source_metadata_dictionary(search_result.source.metadata) else None 53 | 54 | return self._apply_to_search_results(search_results, filter_search_result) -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/prune_results.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig 5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs 6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection, SearchResult 7 | 8 | from llama_index.core.schema import QueryBundle 9 | 10 | class PruneResults(ProcessorBase): 11 | """ 12 | Represents a processor that prunes search results based on a score threshold. 13 | 14 | This class inherits from ProcessorBase and processes search results by applying a pruning function. 15 | The pruning removes results that do not meet a predefined score threshold. It is designed for use 16 | cases where it is necessary to filter out low-scoring results from a search result collection. 17 | 18 | Attributes: 19 | args (ProcessorArgs): Arguments containing configuration and settings for the pruning process, 20 | including the results pruning threshold. 21 | filter_config (FilterConfig): Configuration for filtering, providing additional parameters 22 | or constraints that may influence the pruning logic. 23 | """ 24 | def __init__(self, args:ProcessorArgs, filter_config:FilterConfig): 25 | """ 26 | Initializes the base class for processing tasks with specified arguments and 27 | filter configuration. 28 | 29 | Args: 30 | args (ProcessorArgs): The arguments required for processing tasks. 31 | filter_config (FilterConfig): Configuration settings for filtering during 32 | processing. 33 | """ 34 | super().__init__(args, filter_config) 35 | 36 | def _process_results(self, search_results:SearchResultCollection, query:QueryBundle) -> SearchResultCollection: 37 | """ 38 | Processes the search results by applying a pruning function based on the results' scores relative 39 | to a predefined threshold. Any search result with a score below the threshold is excluded. This 40 | method modifies the search results collection to retain only those results meeting the score 41 | criterion. 42 | 43 | Args: 44 | search_results: The collection of search results to be processed. Each result may either 45 | be retained or pruned based on its score relative to the pruning threshold. 46 | query: The query bundle associated with the search results, providing context for processing. 47 | 48 | Returns: 49 | SearchResultCollection: A new collection of search results with only those results whose 50 | scores meet the pruning threshold retained. 51 | """ 52 | def prune_search_result(index:int, search_result:SearchResult): 53 | return search_result if search_result.score >= self.args.results_pruning_threshold else None 54 | 55 | return self._apply_to_search_results(search_results, prune_search_result) 56 | 57 | 58 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/simplify_single_topic_results.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig 5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs 6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection, SearchResult 7 | 8 | from llama_index.core.schema import QueryBundle 9 | 10 | class SimplifySingleTopicResults(ProcessorBase): 11 | """ 12 | Processor that simplifies search results by condensing single-topic results. 13 | 14 | This processor is designed to analyze search results and simplify cases 15 | where a search result contains only one topic. It modifies the search 16 | result structure by transferring the topic and its statements to the 17 | main result attributes and clearing the list of topics. This can be 18 | useful in scenarios where topics are nested in search results and there 19 | is a need to normalize them for easier processing. 20 | 21 | Attributes: 22 | args (ProcessorArgs): Configuration and arguments that dictate 23 | the behavior of the processor. 24 | filter_config (FilterConfig): Configuration that defines filtering 25 | settings for the processor. 26 | """ 27 | def __init__(self, args:ProcessorArgs, filter_config:FilterConfig): 28 | """ 29 | Initializes an instance of the processor class, setting up the base class with the provided 30 | arguments and configuration. This constructor ensures necessary setup for the processing 31 | pipeline. 32 | 33 | Args: 34 | args: The processor arguments providing configuration details required for setting 35 | up the processor instance. 36 | filter_config: The filter configuration specifying parameters and settings for 37 | filtering operations in the processor. 38 | """ 39 | super().__init__(args, filter_config) 40 | 41 | def _process_results(self, search_results:SearchResultCollection, query:QueryBundle) -> SearchResultCollection: 42 | """ 43 | Processes and simplifies the given search results by extracting and consolidating 44 | topics and statements from individual search results. 45 | 46 | This function is intended to manipulate a collection of search results by invoking a 47 | helper function on each element of the collection. The helper function reduces 48 | the complexity in individual search result elements by simplifying topics when applicable. 49 | 50 | Args: 51 | search_results (SearchResultCollection): A collection of search results to process. 52 | query (QueryBundle): The related query for the search results. 53 | 54 | Returns: 55 | SearchResultCollection: The processed collection of search results where each result 56 | may have simplified topics and statements. 57 | """ 58 | def simplify_result(index:int, search_result:SearchResult): 59 | """ 60 | Processor to simplify search results by reducing them to a single topic when applicable. 61 | 62 | This processor iterates through the search results and examines their associated topics. If there is exactly one 63 | topic linked to a result, it promotes this topic to be the primary topic of the result, appending all statements 64 | linked to the topic into the main list of statements for that result, and clears the topic list. 65 | 66 | Method: 67 | - `_process_results`: Processes and simplifies the collection of search results based on the conditions 68 | described above. 69 | """ 70 | if len(search_result.topics) == 1: 71 | topic = search_result.topics[0] 72 | search_result.topic = topic.topic 73 | search_result.statements.extend(topic.statements) 74 | search_result.topics.clear() 75 | return search_result 76 | else: 77 | return search_result 78 | 79 | return self._apply_to_search_results(search_results, simplify_result) -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/sort_results.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig 5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs 6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection 7 | 8 | from llama_index.core.schema import QueryBundle 9 | 10 | class SortResults(ProcessorBase): 11 | """ 12 | SortResults processes and sorts search results based on their score. 13 | 14 | This class inherits from ProcessorBase and provides functionality for sorting 15 | search results in descending order of their scores. It is designed to be used 16 | within a computational pipeline that handles search result processing. 17 | The class ensures that the search results are ordered by relevance as determined 18 | by their scores, which allows subsequent stages in the pipeline to operate on 19 | sorted result data. 20 | 21 | Attributes: 22 | args (ProcessorArgs): Configuration and arguments relevant for the 23 | processing of results. 24 | filter_config (FilterConfig): Configuration for filtering behavior 25 | during processing. 26 | """ 27 | def __init__(self, args:ProcessorArgs, filter_config:FilterConfig): 28 | """ 29 | Initializes a processor with the provided arguments and filter configuration. 30 | 31 | This constructor sets up the necessary parameters by accepting processor 32 | arguments and a filter configuration object. It ensures that the processor 33 | is initialized correctly with all required settings. 34 | 35 | Args: 36 | args: Configuration parameters and settings required for the processor 37 | to operate. 38 | filter_config: A configuration object containing filter specifications 39 | that define processing criteria. 40 | """ 41 | super().__init__(args, filter_config) 42 | 43 | def _process_results(self, search_results:SearchResultCollection, query:QueryBundle) -> SearchResultCollection: 44 | """ 45 | Processes and sorts search results based on their score in descending order. 46 | 47 | This function is responsible for reordering the search results, ensuring that 48 | items with higher scores appear earlier in the collection. It modifies the 49 | `search_results` object in place and returns it after sorting. 50 | 51 | Args: 52 | search_results: A collection of search results to be sorted. 53 | query: A query bundle that was used to generate the search results. 54 | 55 | Returns: 56 | A `SearchResultCollection` object with the results sorted in descending 57 | order of score. 58 | """ 59 | results = search_results.results 60 | search_results.results = sorted(results, key=lambda x: x.score, reverse=True) 61 | return search_results 62 | 63 | 64 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/truncate_results.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig 5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs 6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection 7 | 8 | from llama_index.core.schema import QueryBundle 9 | 10 | class TruncateResults(ProcessorBase): 11 | """ 12 | TruncateResults processes search results by limiting the number of results. 13 | 14 | This class extends the ProcessorBase and is used to truncate the 15 | number of search results to a defined maximum limit specified 16 | in the configuration. It modifies the search results inline 17 | by only keeping the top results up to the configured limit. 18 | 19 | Attributes: 20 | args (ProcessorArgs): Configuration and settings for processing. 21 | filter_config (FilterConfig): Configuration for the filtering process. 22 | """ 23 | def __init__(self, args:ProcessorArgs, filter_config:FilterConfig): 24 | """ 25 | Initializes an instance of the Processor class. This constructor provides 26 | initial setup and configuration using the specified arguments and filter 27 | configuration. 28 | 29 | Args: 30 | args (ProcessorArgs): Arguments for configuring the processor. 31 | filter_config (FilterConfig): Filter configuration details used during 32 | initialization. 33 | """ 34 | super().__init__(args, filter_config) 35 | 36 | def _process_results(self, search_results:SearchResultCollection, query:QueryBundle) -> SearchResultCollection: 37 | """ 38 | Processes the search results by truncating the number of results to a defined maximum. 39 | 40 | This method modifies a SearchResultCollection object by trimming its results 41 | based on the `max_search_results` attribute specified in the `args`. It ensures 42 | that only the top-ranked results up to this maximum limit are retained. 43 | 44 | Args: 45 | search_results: A collection of search results to process. 46 | query: The query information associated with the search results. 47 | 48 | Returns: 49 | A SearchResultCollection object with the results truncated to the specified 50 | maximum number. 51 | 52 | """ 53 | search_results.results = search_results.results[:self.args.max_search_results] 54 | return search_results 55 | 56 | 57 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/retrievers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .chunk_based_search import ChunkBasedSearch 5 | from .entity_based_search import EntityBasedSearch 6 | from .entity_context_search import EntityContextSearch 7 | from .topic_based_search import TopicBasedSearch 8 | from .composite_traversal_based_retriever import CompositeTraversalBasedRetriever, WeightedTraversalBasedRetrieverType 9 | from .keyword_ranking_search import KeywordRankingSearch 10 | from .keyword_entity_search import KeywordEntitySearch 11 | from .rerank_beam_search import RerankingBeamGraphSearch 12 | from .semantic_beam_search import SemanticBeamGraphSearch 13 | from .statement_cosine_seach import StatementCosineSimilaritySearch 14 | from .semantic_guided_retriever import SemanticGuidedRetriever, SemanticGuidedRetrieverType -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/summary/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .graph_summary import GraphSummary, get_domain -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/utils/vector_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import queue 6 | from typing import Optional 7 | 8 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig 9 | from graphrag_toolkit.lexical_graph.storage.vector.vector_store import VectorStore 10 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorArgs 11 | 12 | from llama_index.core.schema import QueryBundle 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | def get_diverse_vss_elements(index_name:str, query_bundle: QueryBundle, vector_store:VectorStore, args:ProcessorArgs, filter_config:Optional[FilterConfig]): 17 | """ 18 | Retrieve diverse elements from a vector search system (VSS) by applying a diversity 19 | factor to limit redundancy among results. 20 | 21 | This function queries a vector store using the provided query, index, and filter 22 | configuration, then applies a diversity mechanism to return results with more 23 | heterogeneity. The diversity factor determines the level of diversification among 24 | the results. 25 | 26 | Args: 27 | index_name (str): Name of the index to search in the vector store. 28 | query_bundle (QueryBundle): Query object containing the necessary details for 29 | executing the search. 30 | vector_store (VectorStore): Vector store instance to query for retrieving the 31 | elements. 32 | args (ProcessorArgs): Arguments object containing configurations for top-k 33 | results and the diversity factor. 34 | filter_config (Optional[FilterConfig]): Optional filter configuration to 35 | refine the query results. 36 | 37 | Returns: 38 | list: A list of diverse elements from the vector store result set. 39 | """ 40 | diversity_factor = args.vss_diversity_factor 41 | vss_top_k = args.vss_top_k 42 | 43 | if not diversity_factor or diversity_factor < 1: 44 | return vector_store.get_index(index_name).top_k(query_bundle, top_k=vss_top_k, filter_config=filter_config) 45 | 46 | top_k = vss_top_k * diversity_factor 47 | 48 | elements = vector_store.get_index(index_name).top_k(query_bundle, top_k=top_k, filter_config=filter_config) 49 | 50 | source_map = {} 51 | 52 | for element in elements: 53 | source_id = element['source']['sourceId'] 54 | if source_id not in source_map: 55 | source_map[source_id] = queue.Queue() 56 | source_map[source_id].put(element) 57 | 58 | elements_by_source = queue.Queue() 59 | 60 | for source_elements in source_map.values(): 61 | elements_by_source.put(source_elements) 62 | 63 | diverse_elements = [] 64 | 65 | while (not elements_by_source.empty()) and len(diverse_elements) < vss_top_k: 66 | source_elements = elements_by_source.get() 67 | diverse_elements.append(source_elements.get()) 68 | if not source_elements.empty(): 69 | elements_by_source.put(source_elements) 70 | 71 | logger.debug(f'Diverse {index_name}s:\n' + '\n--------------\n'.join([str(element) for element in diverse_elements])) 72 | 73 | return diverse_elements -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/storage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .graph_store_factory import GraphStoreFactory, GraphStoreType 5 | from .vector_store_factory import VectorStoreFactory, VectorStoreType 6 | from .constants import INDEX_KEY, ALL_EMBEDDING_INDEXES, DEFAULT_EMBEDDING_INDEXES, LEXICAL_GRAPH_LABELS 7 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/storage/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | INDEX_KEY = 'aws::graph::index' 5 | ALL_EMBEDDING_INDEXES = ['chunk', 'statement', 'topic'] 6 | DEFAULT_EMBEDDING_INDEXES = ['chunk', 'statement'] 7 | LEXICAL_GRAPH_LABELS = [ 8 | '__Source__', 9 | '__Chunk__', 10 | '__Topic__', 11 | '__Statement__', 12 | '__Fact__', 13 | '__Entity__', 14 | '__SYS_SV__EntityClassification__', 15 | '__SYS_SV__StatementTopic__', 16 | '__SYS_Class__' 17 | ] 18 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/storage/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .graph_store import GraphStore, RedactedGraphQueryLogFormatting, NonRedactedGraphQueryLogFormatting, NodeId, get_log_formatting, format_id 5 | from .graph_store_factory_method import GraphStoreFactoryMethod 6 | from .multi_tenant_graph_store import MultiTenantGraphStore 7 | from .dummy_graph_store import DummyGraphStore -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/storage/graph/dummy_graph_store.py: -------------------------------------------------------------------------------- 1 | # Copyright FalkorDB.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | from graphrag_toolkit.lexical_graph.storage.graph import GraphStoreFactoryMethod, GraphStore, get_log_formatting 7 | 8 | DUMMY = 'dummy://' 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class DummyGraphStoreFactory(GraphStoreFactoryMethod): 14 | """ 15 | Factory class for creating instances of DummyGraphStore if applicable. 16 | 17 | This class implements a factory method pattern to create a DummyGraphStore 18 | object based on provided graph information. It attempts to determine whether 19 | the provided information corresponds to a dummy graph store, and if so, 20 | returns a new instance of DummyGraphStore. Otherwise, it returns None. 21 | 22 | Attributes: 23 | No additional class attributes are explicitly defined beyond inherited attributes. 24 | """ 25 | def try_create(self, graph_info: str, **kwargs) -> GraphStore: 26 | """ 27 | Attempts to create a `GraphStore` instance based on the provided `graph_info`. 28 | If `graph_info` starts with the constant `DUMMY`, a `DummyGraphStore` instance 29 | is initialized and returned. Otherwise, the method returns `None`. 30 | 31 | Args: 32 | graph_info (str): Information specifying the type of the graph store to 33 | create. If the value starts with `DUMMY`, a dummy graph store is opened. 34 | **kwargs: Additional keyword arguments used for configuring the graph store, 35 | such as formatting for logs. 36 | 37 | Returns: 38 | GraphStore: A `DummyGraphStore` instance if `graph_info` starts with 39 | `DUMMY`. Otherwise, returns `None`. 40 | """ 41 | if graph_info.startswith(DUMMY): 42 | logger.debug('Opening dummy graph store') 43 | return DummyGraphStore(log_formatting=get_log_formatting(kwargs)) 44 | else: 45 | return None 46 | 47 | 48 | class DummyGraphStore(GraphStore): 49 | """ 50 | Represents a specialized graph store that extends the base functionality of GraphStore. 51 | 52 | This class is designed to execute Cypher queries on a graph database and log the query 53 | information for debugging purposes. It provides an implementation for executing queries with 54 | optional parameters and correlation IDs. The main use case for this class is to interact with 55 | graph databases, primarily for logging and debugging scenarios. 56 | 57 | Attributes: 58 | log_formatting (LogFormatter): An instance of LogFormatter used for formatting log entries. 59 | _logging_prefix (callable): A callable function or method responsible for generating the 60 | logging prefix based on the provided correlation ID. 61 | """ 62 | def execute_query(self, cypher, parameters={}, correlation_id=None): 63 | """ 64 | Executes the given Cypher query with specified parameters and logs the operation. 65 | 66 | The function logs a formatted version of the Cypher query and its parameters with 67 | a correlation identifier for tracking. It provides an empty result as a placeholder. 68 | 69 | Args: 70 | cypher: The Cypher query to be executed. 71 | parameters: A dictionary representing the parameters for the Cypher query. 72 | Defaults to an empty dictionary. 73 | correlation_id: An optional identifier for correlating log entries. Defaults 74 | to None. 75 | 76 | Returns: 77 | A list as a placeholder for query execution results. Currently, it does 78 | not retrieve any actual results. 79 | """ 80 | log_entry_parameters = self.log_formatting.format_log_entry(self._logging_prefix(correlation_id), cypher, 81 | parameters) 82 | logger.debug( 83 | f'[{log_entry_parameters.query_ref}] query: {log_entry_parameters.query}, parameters: {log_entry_parameters.parameters}') 84 | return [] 85 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/storage/graph/graph_store_factory_method.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from graphrag_toolkit.lexical_graph.storage.graph.graph_store import GraphStore 7 | 8 | class GraphStoreFactoryMethod(): 9 | """ 10 | GraphStoreFactoryMethod provides an abstraction for creating GraphStore objects. 11 | 12 | This class defines a factory method pattern that serves as the template for creating 13 | instances of `GraphStore`. It provides a contract that must be implemented by 14 | subclasses, specifying how `GraphStore` objects are instantiated based on 15 | graph configuration details. 16 | 17 | Methods: 18 | try_create(graph_info, **kwargs): Abstract method to attempt the creation 19 | of a `GraphStore` instance based on provided graph configuration 20 | and optional parameters. 21 | """ 22 | @abc.abstractmethod 23 | def try_create(self, graph_info:str, **kwargs) -> GraphStore: 24 | """ 25 | Abstract base class for creating a graph store from provided graph information. 26 | 27 | This class represents the structure that any concrete implementation must adhere 28 | to, ensuring that essential methods related to graph creation are defined. 29 | 30 | Attributes: 31 | graph_info (str): Information or details about the graph structure, which 32 | will be used for constructing the graph store. 33 | **kwargs: Arbitrary keyword arguments that might be required to configure 34 | the creation process. 35 | """ 36 | raise NotImplementedError 37 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .vector_index import VectorIndex, to_embedded_query 5 | from .vector_index_factory_method import VectorIndexFactoryMethod 6 | from .vector_store import VectorStore 7 | from .multi_tenant_vector_store import MultiTenantVectorStore 8 | from .read_only_vector_store import ReadOnlyVectorStore 9 | from .dummy_vector_index import DummyVectorIndex -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/multi_tenant_vector_store.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | from typing import List 4 | from graphrag_toolkit.lexical_graph import TenantId 5 | from graphrag_toolkit.lexical_graph.storage.vector import VectorStore, VectorIndex 6 | 7 | 8 | class MultiTenantVectorStore(VectorStore): 9 | """Provides a multi-tenant wrapper for VectorStore. 10 | 11 | This class allows creating a wrapper around a `VectorStore` object to 12 | support multi-tenancy by associating a specific tenant ID with operations. 13 | It ensures that all indexes retrieved or processed are identified and 14 | associated with the correct tenant context. 15 | 16 | Attributes: 17 | inner (VectorStore): The underlying vector store being wrapped. 18 | tenant_id (TenantId): The tenant ID associated with the operations 19 | performed on the vector store. 20 | """ 21 | @classmethod 22 | def wrap(cls, vector_store:VectorStore, tenant_id:TenantId): 23 | """ 24 | Wraps the given vector_store with a MultiTenantVectorStore if necessary, based on the 25 | tenant_id provided. The method ensures that the given vector_store is returned as-is 26 | if it corresponds to the default tenant or is already an instance of 27 | MultiTenantVectorStore. Otherwise, it wraps the vector_store inside a 28 | MultiTenantVectorStore with the given tenant_id. 29 | 30 | Args: 31 | vector_store: The vector_store to wrap if required. 32 | tenant_id: The tenant identifier used to decide whether wrapping is necessary. 33 | 34 | Returns: 35 | The provided vector_store, wrapped in a MultiTenantVectorStore if necessary, or 36 | the vector_store itself if no wrapping is required. 37 | """ 38 | 39 | if isinstance(vector_store, MultiTenantVectorStore): 40 | return vector_store 41 | return MultiTenantVectorStore(inner=vector_store, tenant_id=tenant_id) 42 | 43 | inner:VectorStore 44 | tenant_id:TenantId 45 | 46 | def get_index(self, index_name): 47 | """ 48 | Retrieves an index from the inner object and associates it with the tenant ID. 49 | 50 | Args: 51 | index_name: Name of the index to retrieve. 52 | 53 | Returns: 54 | The index retrieved, with the tenant_id attribute set to the tenant ID. 55 | """ 56 | index = self.inner.get_index(index_name=index_name) 57 | index.tenant_id = self.tenant_id 58 | return index 59 | 60 | def all_indexes(self) -> List[VectorIndex]: 61 | """ 62 | Returns a list of all VectorIndex instances stored in the inner indexes. 63 | 64 | This method iterates through the keys of the `inner.indexes` dictionary and 65 | retrieves the corresponding VectorIndex object for each key. 66 | 67 | Returns: 68 | List[VectorIndex]: A list containing all VectorIndex objects within the 69 | inner indexes of the current instance. 70 | """ 71 | return [self.get_index(i) for i in self.inner.indexes.keys()] 72 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/opensearch_vector_index_factory.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from typing import List 6 | 7 | from graphrag_toolkit.lexical_graph.storage.vector import VectorIndex, VectorIndexFactoryMethod, to_embedded_query 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | OPENSEARCH_SERVERLESS = 'aoss://' 12 | OPENSEARCH_SERVERLESS_DNS = 'aoss.amazonaws.com' 13 | 14 | class OpenSearchVectorIndexFactory(VectorIndexFactoryMethod): 15 | """Factory class for creating OpenSearch vector indexes. 16 | 17 | This class is responsible for creating OpenSearch vector indexes based 18 | on provided index names and vector index connection information. It 19 | detects whether the connection information corresponds to an OpenSearch 20 | Serverless endpoint or a traditional OpenSearch endpoint and constructs 21 | the corresponding vector indexes. 22 | 23 | Attributes: 24 | No specific attributes are directly defined in this class. The class 25 | relies on the methods and details passed during the instantiation and 26 | method calls. 27 | """ 28 | def try_create(self, index_names:List[str], vector_index_info:str, **kwargs) -> List[VectorIndex]: 29 | """ 30 | Attempts to create a list of vector indexes using the provided index names and vector 31 | index information. This method checks if a supported endpoint configuration is 32 | provided and uses it to initialize appropriate vector indexes. Raises ImportError 33 | if the required module is not available. 34 | 35 | Args: 36 | index_names (List[str]): List of index names to create vector indexes for. 37 | vector_index_info (str): Information defining the type and endpoint of the vector 38 | index, such as an OpenSearch Serverless endpoint. 39 | **kwargs: Additional keyword arguments passed when creating the vector indexes. 40 | 41 | Returns: 42 | List[VectorIndex]: A list of vector index objects created for the provided index 43 | names and endpoint configuration, or None if no suitable endpoint is found. 44 | """ 45 | endpoint = None 46 | if vector_index_info.startswith(OPENSEARCH_SERVERLESS): 47 | endpoint = vector_index_info[len(OPENSEARCH_SERVERLESS):] 48 | elif vector_index_info.startswith('https://') and vector_index_info.endswith(OPENSEARCH_SERVERLESS_DNS): 49 | endpoint = vector_index_info 50 | if endpoint: 51 | try: 52 | from graphrag_toolkit.lexical_graph.storage.vector.opensearch_vector_indexes import OpenSearchIndex 53 | logger.debug(f'Opening OpenSearch vector indexes [index_names: {index_names}, endpoint: {endpoint}]') 54 | return [OpenSearchIndex.for_index(index_name, endpoint, **kwargs) for index_name in index_names] 55 | except ImportError as e: 56 | raise e 57 | 58 | else: 59 | return None -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/pg_vector_index_factory.py: -------------------------------------------------------------------------------- 1 | # Copyright FalkorDB.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from typing import List 6 | 7 | from graphrag_toolkit.lexical_graph.storage.vector import VectorIndex, VectorIndexFactoryMethod 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | POSTGRES = 'postgres://' 12 | POSTGRESQL = 'postgresql://' 13 | 14 | class PGVectorIndexFactory(VectorIndexFactoryMethod): 15 | def try_create(self, index_names:List[str], vector_index_info:str, **kwargs) -> List[VectorIndex]: 16 | """ 17 | Tries to create and return a list of vector indexes using the given parameters. 18 | 19 | Depending on the connection information provided in `vector_index_info`, this method 20 | attempts to open PostgreSQL vector indexes or returns None if the connection string 21 | is not valid or applicable. If the PostgreSQL module is not available, it raises an 22 | ImportError. 23 | 24 | Args: 25 | index_names (List[str]): A list of index names to be used when creating vector indexes. 26 | vector_index_info (str): A string containing information about the vector index 27 | connection, such as a PostgreSQL connection string. 28 | \*\*kwargs: Additional arguments that might be passed to the underlying index creation 29 | utility. 30 | 31 | Returns: 32 | List[VectorIndex]: A list of vector index objects created for the provided index 33 | names if successful, otherwise None. 34 | 35 | Raises: 36 | ImportError: If the PostgreSQL-specific module required for creating the indexes 37 | cannot be imported. 38 | """ 39 | connection_string = None 40 | if vector_index_info.startswith(POSTGRES) or vector_index_info.startswith(POSTGRESQL): 41 | connection_string = vector_index_info 42 | if connection_string: 43 | logger.debug(f'Opening PostgreSQL vector indexes [index_names: {index_names}, connection_string: {connection_string}]') 44 | try: 45 | from graphrag_toolkit.lexical_graph.storage.vector.pg_vector_indexes import PGIndex 46 | return [PGIndex.for_index(index_name, connection_string, **kwargs) for index_name in index_names] 47 | except ImportError as e: 48 | raise e 49 | else: 50 | return None -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/read_only_vector_store.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | from typing import List 4 | from graphrag_toolkit.lexical_graph import TenantId 5 | from graphrag_toolkit.lexical_graph.storage.vector import VectorStore, VectorIndex 6 | 7 | 8 | class ReadOnlyVectorStore(VectorStore): 9 | """ 10 | Represents a read-only wrapper for a VectorStore. 11 | 12 | This class is designed to wrap an existing VectorStore and provide a 13 | read-only interface to its contents. It ensures that any indexes accessed 14 | through this wrapper are not writeable. This class can be useful when you 15 | want to share or use a VectorStore in a context where modifications to it 16 | are not allowed. 17 | 18 | Attributes: 19 | inner (VectorStore): The underlying VectorStore being wrapped. All read 20 | operations are delegated to this inner VectorStore. 21 | """ 22 | @classmethod 23 | def wrap(cls, vector_store:VectorStore): 24 | """ 25 | Wraps the given vector store in a read-only wrapper if it is not already read-only. 26 | 27 | This method ensures that the vector store is encapsulated in a `ReadOnlyVectorStore` 28 | if it is not yet of that type, providing a read-only interface while preserving the 29 | original functionality. 30 | 31 | Args: 32 | vector_store: The vector store instance to wrap. 33 | 34 | Returns: 35 | ReadOnlyVectorStore: A read-only wrapper around the provided vector store. 36 | If the input is already a `ReadOnlyVectorStore`, it is returned unchanged. 37 | """ 38 | if isinstance(vector_store, ReadOnlyVectorStore): 39 | return vector_store 40 | return ReadOnlyVectorStore(inner=vector_store) 41 | 42 | inner:VectorStore 43 | 44 | def get_index(self, index_name): 45 | """ 46 | Retrieves an index by its name, making it non-writeable. 47 | 48 | This method fetches an index from the inner system using the specified 49 | index name and sets its `writeable` attribute to `False`. The modified 50 | index is then returned. 51 | 52 | Args: 53 | index_name: The name of the index to retrieve. 54 | 55 | Returns: 56 | The requested index with its `writeable` attribute set to `False`. 57 | 58 | Raises: 59 | KeyError: If the specified index name does not exist in the inner 60 | system. 61 | """ 62 | index = self.inner.get_index(index_name=index_name) 63 | index.writeable = False 64 | return index 65 | 66 | def all_indexes(self) -> List[VectorIndex]: 67 | """ 68 | Gets all indexes from the inner indexing system. 69 | 70 | This method iterates over the keys of the inner indexing system and retrieves 71 | the corresponding indexes using the `get_index` method. It consolidates these 72 | indexes into a list and returns it. 73 | 74 | Returns: 75 | List[VectorIndex]: A list of all retrieved indexes from the inner indexing 76 | system. 77 | """ 78 | return [self.get_index(i) for i in self.inner.indexes.keys()] 79 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/vector_index_factory_method.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | from typing import List 6 | 7 | from graphrag_toolkit.lexical_graph.storage.vector.vector_index import VectorIndex 8 | 9 | class VectorIndexFactoryMethod(): 10 | """ 11 | A factory method for creating vector indexes. 12 | 13 | This abstract class defines an interface for creating vector indexes. Any subclass should 14 | implement the `try_create` method to specify creation logic. The factory method ensures a 15 | consistent way of generating vector indexes, potentially handling input validation, 16 | additional configurations, or other pre-processing tasks. 17 | 18 | Methods in any subclasses are expected to return valid instances of `VectorIndex` based 19 | on provided input. 20 | 21 | Attributes: 22 | None 23 | """ 24 | @abc.abstractmethod 25 | def try_create(self, index_names:List[str], vector_index_info:str, **kwargs) -> List[VectorIndex]: 26 | raise NotImplementedError -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/vector_store.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from typing import Dict, Optional, List 6 | 7 | from graphrag_toolkit.lexical_graph.storage.constants import ALL_EMBEDDING_INDEXES 8 | from graphrag_toolkit.lexical_graph.storage.vector.vector_index import VectorIndex 9 | from graphrag_toolkit.lexical_graph.storage.vector.dummy_vector_index import DummyVectorIndex 10 | 11 | from llama_index.core.bridge.pydantic import BaseModel, Field 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | class VectorStore(BaseModel): 16 | """ 17 | Represents a storage for managing and retrieving vector indexes. 18 | 19 | The `VectorStore` class is responsible for maintaining a collection of vector 20 | indexes. It supports functionalities such as retrieving specific indexes and 21 | accessing all stored indexes. This class is designed to handle operations 22 | related to vector index management seamlessly. 23 | 24 | Attributes: 25 | indexes (Optional[Dict[str, VectorIndex]]): A dictionary where the keys are 26 | index names and the values are corresponding `VectorIndex` objects. It 27 | stores all vector indexes available in this class instance. Defaults to 28 | an empty dictionary. 29 | """ 30 | indexes:Optional[Dict[str, VectorIndex]] = Field(description='Vector indexes', default_factory=dict) 31 | 32 | def get_index(self, index_name): 33 | """ 34 | Retrieves the vector index associated with the given index name. If the specified index 35 | name is not recognized or has not been registered in the indexes dictionary, it returns 36 | a dummy index instead. The method ensures that only valid index names are processed and 37 | handled appropriately. 38 | 39 | Args: 40 | index_name: The name of the index to retrieve. Must be an entry from the 41 | global `ALL_EMBEDDING_INDEXES` list. 42 | 43 | Returns: 44 | Union[VectorIndex, DummyVectorIndex]: The corresponding vector index if the name is 45 | found in the `indexes` dictionary; otherwise, a dummy vector index configured with 46 | the specified `index_name`. 47 | 48 | Raises: 49 | ValueError: If the provided `index_name` is not one of the allowed entries listed in 50 | `ALL_EMBEDDING_INDEXES`. 51 | """ 52 | if index_name not in ALL_EMBEDDING_INDEXES: 53 | raise ValueError(f'Invalid index name ({index_name}): must be one of {ALL_EMBEDDING_INDEXES}') 54 | if index_name not in self.indexes: 55 | logger.debug(f"Returning dummy index for '{index_name}'") 56 | return DummyVectorIndex(index_name=index_name) 57 | return self.indexes[index_name] 58 | 59 | def all_indexes(self) -> List[VectorIndex]: 60 | """ 61 | Returns a list of all vector indexes stored in the object. 62 | 63 | The method retrieves all vector indexes from the internal storage and 64 | returns them as a list. 65 | 66 | Returns: 67 | List[VectorIndex]: A list containing all vector indexes present in the 68 | internal storage. 69 | """ 70 | return list(self.indexes.values()) 71 | 72 | -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from .fm_observability import FMObservabilityPublisher, ConsoleFMObservabilitySubscriber 5 | from .llm_cache import LLMCache, LLMCacheType -------------------------------------------------------------------------------- /lexical-graph/src/graphrag_toolkit/lexical_graph/utils/bedrock_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import llama_index.llms.bedrock_converse.utils 6 | from typing import Any, Callable 7 | 8 | from tenacity import ( 9 | before_sleep_log, 10 | retry, 11 | retry_if_exception_type, 12 | stop_after_attempt, 13 | wait_random_exponential, 14 | ) 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | 20 | def _create_retry_decorator(client: Any, max_retries: int) -> Callable[[Any], Any]: 21 | """ 22 | Creates a retry decorator with exponential backoff strategy. 23 | 24 | This function returns a retry decorator based on the specified maximum 25 | number of retries and the provided client. It uses exponential backoff 26 | and wait times between retry attempts. This ensures handling of temporary 27 | failures and throttling exceptions raised by the specified client. 28 | 29 | Args: 30 | client: A client object that has exception classes for handling 31 | specific errors such as ThrottlingException, ModelTimeoutException, 32 | and ModelErrorException. 33 | max_retries: An integer specifying the maximum number of retry attempts 34 | to make before giving up. 35 | 36 | Returns: 37 | A callable retry decorator with specified retry policies. 38 | """ 39 | min_seconds = 4 40 | max_seconds = 30 41 | # Wait 2^x * 1 second between each retry starting with 42 | # 4 seconds, then up to 30 seconds, then 30 seconds afterwards 43 | try: 44 | import boto3 # noqa 45 | except ImportError as e: 46 | raise ImportError( 47 | "boto3 package not found, install with 'pip install boto3'" 48 | ) from e 49 | return retry( 50 | reraise=True, 51 | stop=stop_after_attempt(max_retries), 52 | wait=wait_random_exponential(multiplier=1, min=min_seconds, max=max_seconds), 53 | retry=( 54 | retry_if_exception_type(client.exceptions.ThrottlingException) | 55 | retry_if_exception_type(client.exceptions.ModelTimeoutException) | 56 | retry_if_exception_type(client.exceptions.ModelErrorException) 57 | ), 58 | before_sleep=before_sleep_log(logger, logging.WARNING), 59 | ) 60 | 61 | llama_index.llms.bedrock_converse.utils._create_retry_decorator = _create_retry_decorator 62 | -------------------------------------------------------------------------------- /security.md: -------------------------------------------------------------------------------- 1 | # Security issue notifications 2 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 3 | 4 | --------------------------------------------------------------------------------