├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── NOTICE
├── README.md
├── byokg-rag
├── .gitignore
├── README.md
├── pyproject.toml
└── src
│ └── graphrag_toolkit
│ ├── __init__.py
│ └── byokg_rag
│ ├── __init__.py
│ ├── byokg_query_engine.py
│ ├── graph_connectors
│ ├── __init__.py
│ ├── kg_linker.py
│ └── prompts
│ │ ├── kg_linker_prompt.yaml
│ │ └── task_prompts.yaml
│ ├── graph_retrievers
│ ├── __init__.py
│ ├── entity_linker.py
│ ├── graph_reranker.py
│ ├── graph_retrievers.py
│ ├── graph_traversal.py
│ ├── graph_verbalizer.py
│ └── prompts
│ │ └── agent_prompts.yaml
│ ├── graphstore
│ ├── __init__.py
│ └── graphstore.py
│ ├── indexing
│ ├── __init__.py
│ ├── fuzzy_string.py
│ └── index.py
│ ├── llm
│ ├── __init__.py
│ └── bedrock_llms.py
│ ├── prompts
│ └── generation_prompts.yaml
│ ├── requirements.txt
│ └── utils.py
├── docs
├── README.md
└── lexical-graph
│ ├── README.md
│ ├── aws-profile.md
│ ├── batch-extraction.md
│ ├── configuration.md
│ ├── faq.md
│ ├── graph-model.md
│ ├── graph-store-falkor-db.md
│ ├── graph-store-neptune-analytics.md
│ ├── graph-store-neptune-db.md
│ ├── hybrid-deployment.md
│ ├── indexing.md
│ ├── metadata-filtering.md
│ ├── multi-tenancy.md
│ ├── overview.md
│ ├── prompts.md
│ ├── querying.md
│ ├── security.md
│ ├── storage-model.md
│ ├── vector-store-neptune-analytics.md
│ ├── vector-store-opensearch-serverless.md
│ └── vector-store-postgres.md
├── examples
├── README.md
├── byokg-rag
│ ├── byokg_rag_demo.ipynb
│ └── data
│ │ └── freebase_tiny_kg.csv
├── lexical-graph-hybrid-dev
│ ├── aws
│ │ ├── setup-bedrock-batch-doc.md
│ │ └── setup-bedrock-batch.sh
│ ├── docker
│ │ ├── .env
│ │ ├── build.sh
│ │ ├── docker-compose.yml
│ │ ├── postgres
│ │ │ └── schema.sql
│ │ └── reset.sh
│ ├── docs
│ │ ├── docker_build_shell_script.md
│ │ ├── docker_compose_services.md
│ │ └── docker_reset_shell_script.md
│ └── notebooks
│ │ ├── .env
│ │ ├── 00-Setup.ipynb
│ │ ├── 01-Local-Extract-Batch.ipynb
│ │ ├── 02-Cloud-Setup.ipynb
│ │ ├── 03-Cloud-Build.ipynb
│ │ ├── 04-Cloud-Querying.ipynb
│ │ └── best-practices
│ │ └── Retrieval-Augmented-Generation-Options.pdf
├── lexical-graph-local-dev
│ ├── README.md
│ ├── docker
│ │ ├── .env
│ │ ├── build.sh
│ │ ├── docker-compose.yml
│ │ ├── postgres
│ │ │ └── schema.sql
│ │ └── reset.sh
│ ├── docs
│ │ ├── docker_build.md
│ │ ├── docker_reset_script.md
│ │ └── docker_services.md
│ └── notebooks
│ │ ├── .env
│ │ ├── 00-Setup.ipynb
│ │ ├── 01-Combined-Extract-and-Build.ipynb
│ │ ├── 02-Querying.ipynb
│ │ ├── 03-Querying with prompting.ipynb
│ │ └── prompts
│ │ ├── system_prompt.txt
│ │ └── user_prompt.txt
└── lexical-graph
│ ├── README.md
│ ├── cloudformation-templates
│ ├── graphrag-toolkit-neptune-analytics-aurora-postgres.json
│ ├── graphrag-toolkit-neptune-analytics-opensearch-serverless.json
│ ├── graphrag-toolkit-neptune-analytics.json
│ ├── graphrag-toolkit-neptune-db-aurora-postgres-existing-vpc.json
│ ├── graphrag-toolkit-neptune-db-aurora-postgres.json
│ ├── graphrag-toolkit-neptune-db-opensearch-serverless.json
│ └── update-stack.sh
│ └── notebooks
│ ├── 00-Setup.ipynb
│ ├── 01-Combined-Extract-and-Build.ipynb
│ ├── 02-Separate-Extract-and-Build.ipynb
│ ├── 03-Traversal-Based-Querying.ipynb
│ ├── 04-Semantic-Guided-Querying.ipynb
│ ├── 05-Multi-Tenancy.ipynb
│ └── 06-Agentic-GraphRAG.ipynb
├── images
├── byokg_rag.png
├── extract-and-build.png
├── hybrid-extract-and-build.png
├── lexical-graph.png
├── local-extract-and-build.png
└── question-answering.png
├── lexical-graph-contrib
└── falkordb
│ ├── pyproject.toml
│ └── src
│ ├── graphrag_toolkit
│ └── lexical_graph
│ │ └── storage
│ │ └── graph
│ │ └── falkordb
│ │ ├── __init__.py
│ │ ├── falkordb_graph_store.py
│ │ └── falkordb_graph_store_factory.py
│ ├── requirements.txt
│ ├── setup.cfg
│ └── setup.py
├── lexical-graph
├── README.md
├── pyproject.toml
└── src
│ └── graphrag_toolkit
│ ├── __init__.py
│ └── lexical_graph
│ ├── __init__.py
│ ├── config.py
│ ├── errors.py
│ ├── indexing
│ ├── __init__.py
│ ├── build
│ │ ├── __init__.py
│ │ ├── build_filter.py
│ │ ├── build_filters.py
│ │ ├── build_pipeline.py
│ │ ├── checkpoint.py
│ │ ├── chunk_graph_builder.py
│ │ ├── chunk_node_builder.py
│ │ ├── entity_graph_builder.py
│ │ ├── entity_relation_graph_builder.py
│ │ ├── fact_graph_builder.py
│ │ ├── graph_batch_client.py
│ │ ├── graph_builder.py
│ │ ├── graph_construction.py
│ │ ├── graph_summary_builder.py
│ │ ├── node_builder.py
│ │ ├── node_builders.py
│ │ ├── null_builder.py
│ │ ├── source_graph_builder.py
│ │ ├── source_node_builder.py
│ │ ├── statement_graph_builder.py
│ │ ├── statement_node_builder.py
│ │ ├── topic_graph_builder.py
│ │ ├── topic_node_builder.py
│ │ ├── vector_batch_client.py
│ │ └── vector_indexing.py
│ ├── constants.py
│ ├── extract
│ │ ├── __init__.py
│ │ ├── batch_config.py
│ │ ├── batch_llm_proposition_extractor.py
│ │ ├── batch_topic_extractor.py
│ │ ├── docs_to_nodes.py
│ │ ├── extraction_pipeline.py
│ │ ├── file_system_tap.py
│ │ ├── graph_scoped_value_store.py
│ │ ├── id_rewriter.py
│ │ ├── infer_classifications.py
│ │ ├── infer_config.py
│ │ ├── llm_proposition_extractor.py
│ │ ├── pipeline_decorator.py
│ │ ├── proposition_extractor.py
│ │ ├── scoped_value_provider.py
│ │ ├── source_doc_parser.py
│ │ └── topic_extractor.py
│ ├── id_generator.py
│ ├── load
│ │ ├── __init__.py
│ │ ├── bedrock_knowledge_base.py
│ │ ├── file_based_chunks.py
│ │ ├── file_based_docs.py
│ │ ├── json_array_reader.py
│ │ ├── s3_based_chunks.py
│ │ ├── s3_based_docs.py
│ │ └── source_documents.py
│ ├── model.py
│ ├── node_handler.py
│ ├── prompts.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── batch_inference_utils.py
│ │ ├── metadata_utils.py
│ │ ├── pipeline_utils.py
│ │ └── topic_utils.py
│ ├── lexical_graph_index.py
│ ├── lexical_graph_query_engine.py
│ ├── logging.py
│ ├── metadata.py
│ ├── prompts
│ ├── __init__.py
│ ├── bedrock_prompt_provider.py
│ ├── file_prompt_provider.py
│ ├── prompt_provider_base.py
│ ├── prompt_provider_config.py
│ ├── prompt_provider_config_base.py
│ ├── prompt_provider_factory.py
│ ├── prompt_provider_registry.py
│ ├── s3_prompt_provider.py
│ └── static_prompt_provider.py
│ ├── protocols
│ ├── __init__.py
│ └── mcp_server.py
│ ├── requirements.txt
│ ├── retrieval
│ ├── __init__.py
│ ├── model.py
│ ├── post_processors
│ │ ├── __init__.py
│ │ ├── bedrock_context_format.py
│ │ ├── bge_reranker.py
│ │ ├── enrich_source_details.py
│ │ ├── reranker_mixin.py
│ │ ├── sentence_reranker.py
│ │ ├── statement_diversity.py
│ │ └── statement_enhancement.py
│ ├── processors
│ │ ├── __init__.py
│ │ ├── clear_chunks.py
│ │ ├── clear_scores.py
│ │ ├── dedup_results.py
│ │ ├── disaggregate_results.py
│ │ ├── filter_by_metadata.py
│ │ ├── format_sources.py
│ │ ├── populate_statement_strs.py
│ │ ├── processor_args.py
│ │ ├── processor_base.py
│ │ ├── prune_results.py
│ │ ├── prune_statements.py
│ │ ├── rerank_statements.py
│ │ ├── rescore_results.py
│ │ ├── simplify_single_topic_results.py
│ │ ├── sort_results.py
│ │ ├── statements_to_strings.py
│ │ ├── truncate_results.py
│ │ ├── truncate_statements.py
│ │ └── zero_scores.py
│ ├── prompts.py
│ ├── retrievers
│ │ ├── __init__.py
│ │ ├── chunk_based_search.py
│ │ ├── composite_traversal_based_retriever.py
│ │ ├── entity_based_search.py
│ │ ├── entity_context_search.py
│ │ ├── keyword_entity_search.py
│ │ ├── keyword_ranking_search.py
│ │ ├── rerank_beam_search.py
│ │ ├── semantic_beam_search.py
│ │ ├── semantic_guided_base_retriever.py
│ │ ├── semantic_guided_retriever.py
│ │ ├── statement_cosine_seach.py
│ │ ├── topic_based_search.py
│ │ └── traversal_based_base_retriever.py
│ ├── summary
│ │ ├── __init__.py
│ │ └── graph_summary.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── query_decomposition.py
│ │ ├── statement_utils.py
│ │ └── vector_utils.py
│ ├── storage
│ ├── __init__.py
│ ├── constants.py
│ ├── graph
│ │ ├── __init__.py
│ │ ├── dummy_graph_store.py
│ │ ├── graph_store.py
│ │ ├── graph_store_factory_method.py
│ │ ├── graph_utils.py
│ │ ├── multi_tenant_graph_store.py
│ │ └── neptune_graph_stores.py
│ ├── graph_store_factory.py
│ ├── vector
│ │ ├── __init__.py
│ │ ├── dummy_vector_index.py
│ │ ├── multi_tenant_vector_store.py
│ │ ├── neptune_vector_indexes.py
│ │ ├── opensearch_vector_index_factory.py
│ │ ├── opensearch_vector_indexes.py
│ │ ├── pg_vector_index_factory.py
│ │ ├── pg_vector_indexes.py
│ │ ├── read_only_vector_store.py
│ │ ├── vector_index.py
│ │ ├── vector_index_factory_method.py
│ │ └── vector_store.py
│ └── vector_store_factory.py
│ ├── tenant_id.py
│ └── utils
│ ├── __init__.py
│ ├── bedrock_utils.py
│ ├── fm_observability.py
│ ├── io_utils.py
│ └── llm_cache.py
└── security.md
/.gitignore:
--------------------------------------------------------------------------------
1 | /examples/lexical-graph/notebooks/extracted/
2 | /examples/lexical-graph/notebooks/output/
3 | /.venv/
4 | /examples/lexical-graph-local-dev/notebooks/output/
5 | /docs/lexical-graph/.idea/
6 | /examples/lexical-graph-hybrid-dev/notebooks/output/
7 | /examples/lexical-graph-hybrid-dev/notebooks/extracted/
8 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 |
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 |
42 |
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 |
46 |
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 |
52 |
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 |
56 |
57 | ## Licensing
58 |
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## GraphRAG Toolkit
2 |
3 | The graphrag-toolkit is a collection of Python tools for building graph-enhanced Generative AI applications.
4 |
5 | > **4 June 2025** Release 3.8.0 includes a separate BYOKG-RAG package, which allows users to bring their own knowledge graph and perform complex question answering over it.
6 |
7 | > **28 May 2025** Release 3.7.0 includes an MCP server that dynamically generates tools and tool descriptions (one per tenant in a multi-tenant graph).
8 |
9 | Installation instructions and requirements are detailed separately with each tool.
10 |
11 | ### Lexical Graph
12 |
13 | The [lexical-graph](./lexical-graph/) provides a framework for automating the construction of a [hierarchical lexical graph](./docs/lexical-graph/graph-model.md) from unstructured data, and composing question-answering strategies that query this graph when answering user questions.
14 |
15 | ### BYOKG-RAG
16 |
17 | [BYOKG-RAG](./byokg-rag/) is a novel approach to Knowledge Graph Question Answering (KGQA) that combines the power of Large Language Models (LLMs) with structured knowledge graphs. The system allows users to bring their own knowledge graph and perform complex question answering over it.
18 |
19 | ## Security
20 |
21 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
22 |
23 | ## License
24 |
25 | This project is licensed under the Apache-2.0 License.
26 |
27 |
--------------------------------------------------------------------------------
/byokg-rag/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | dist/
3 | build/
4 | *.egg-info/
5 | __pycache__/
6 | *.DS_Store
--------------------------------------------------------------------------------
/byokg-rag/README.md:
--------------------------------------------------------------------------------
1 | # BYOKG-RAG: Bring Your Own Knowledge Graph for Retrieval Augmented Generation
2 |
3 | 
4 |
5 | BYOKG-RAG is a novel approach to Knowledge Graph Question Answering (KGQA) that combines the power of Large Language Models (LLMs) with structured knowledge graphs. The system allows users to bring their own knowledge graph and perform complex question answering over it.
6 |
7 | ## Key Features 🔑
8 |
9 | - **Multi-strategy Retrieval**: Combines multiple retrieval strategies:
10 | - Agentic retrieval for dynamic graph exploration
11 | - Scoring-based retrieval for relevance ranking
12 | - Path-based retrieval for multi-hop reasoning
13 | - Query-based retrieval for direct graph queries
14 | - **LLM-powered Reasoning**: Leverages state-of-the-art LLMs for question understanding and answer generation
15 |
16 | ## System Components ⚙️
17 |
18 | 1. **Graph Store** ([src/graphrag_toolkit/byokg_rag/graphstore](src/graphrag_toolkit/byokg_rag/graphstore))
19 | - Manages the knowledge graph data structure
20 | - Provides interfaces for graph traversal and querying
21 |
22 | 2. **KG Linker** ([src/graphrag_toolkit/byokg_rag/graph_connectors](src/graphrag_toolkit/byokg_rag/graph_connectors))
23 | - Links natural language queries to graph entities and paths
24 | - Uses LLMs to understand question intent
25 | - Extracts relevant entities and relationship patterns
26 |
27 | 3. **Entity Linker** ([src/graphrag_toolkit/byokg_rag/graph_retrievers](src/graphrag_toolkit/byokg_rag/graph_retrievers))
28 | - Matches entities from text to graph nodes
29 | - Handles variations in entity names
30 | - Uses fuzzy string matching for robust entity resolution
31 |
32 | 4. **Triplet Retriever** ([src/graphrag_toolkit/byokg_rag/graph_retrievers](src/graphrag_toolkit/byokg_rag/graph_retrievers))
33 | - Retrieves relevant triplets from the graph
34 | - Navigates the graph starting from linked entities
35 | - Verbalizes triplets in natural language format
36 |
37 | 5. **Path Retriever** ([src/graphrag_toolkit/byokg_rag/graph_retrievers](src/graphrag_toolkit/byokg_rag/graph_retrievers))
38 | - Finds paths between entities in the graph
39 | - Follows metapath patterns for structured traversal
40 | - Connects entities through intermediate nodes
41 |
42 | 6. **Query Engine** ([src/graphrag_toolkit/byokg_rag/byokg_query_engine.py](src/graphrag_toolkit/byokg_rag/byokg_query_engine.py))
43 | - Orchestrates all components
44 | - Processes natural language questions
45 | - Generates answers based on retrieved information
46 |
47 | ## Performance 📈
48 |
49 | Our results show that BYOKG-RAG outperforms existing approaches across multiple knowledge graph benchmarks:
50 |
51 | | KGQA Hit (%) | Wiki-KG | Temp-KG | Med-KG |
52 | |--------------|---------|---------|--------|
53 | | Agent | 77.8 | 57.3 | 59.2 |
54 | | BYOKG-RAG | 80.1 | 65.5 | 65.0 |
55 |
56 | *Note: Full paper with detailed methodology and results coming soon!* 📄
57 |
58 | ## Getting Started 🚀
59 |
60 | The byokg-rag toolkit requires Python and [pip](http://www.pip-installer.org/en/latest/) to install. You can install byokg-rag using pip:
61 |
62 | 1. Install dependencies:
63 | ```bash
64 | pip install .
65 | ```
66 | or
67 | ```
68 | pip install https://github.com/awslabs/graphrag-toolkit/archive/refs/tags/v3.8.1.zip#subdirectory=byokg-rag
69 | ```
70 | (The version number will vary based on the latest GitHub release)
71 |
72 | 2. Run the demo notebook ([byokg_rag_demo.ipynb](../examples/byokg-rag/byokg_rag_demo.ipynb)):
73 | ```
74 | graphrag-toolkit/examples/byokg-rag/byokg_rag_demo.ipynb
75 | ```
76 |
77 | ## Citation 📚
78 |
79 | *Arxiv paper and citation coming soon!*
80 |
81 | ```
82 | @misc{byokg-rag-2025,
83 | author = {Mavromatis, Costas and Adeshina, Soji and Ioannidis, Vassilis N. and Han, Zhen and Zhu, Qi and Robinson, Ian and Thompson, Bryan and Rangwala, Huzefa and Karypis, George},
84 | title = {{BYOKG-RAG}: Multi-Strategy Graph Retrieval for Knowledge Graph Question Answering},
85 | url = {https://github.com/awslabs/graphrag-toolkit},
86 | year = {2025}
87 | }
88 | ```
89 |
90 | ## License ⚖️
91 |
92 | This project is licensed under the Apache-2.0 License.
--------------------------------------------------------------------------------
/byokg-rag/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling", "hatch-requirements-txt"]
3 | build-backend = "hatchling.build"
4 |
5 | [tool.hatch.build.targets.wheel]
6 | packages = ["src/graphrag_toolkit"]
7 |
8 | [project]
9 | name = "graphrag-toolkit-byokg-rag"
10 | version = "0.0.1"
11 | description = "AWS GraphRAG Toolkit, BYOKG RAG"
12 | readme = "README.md"
13 | requires-python = ">=3.10"
14 | dynamic = ["dependencies"]
15 | license = "Apache-2.0"
16 |
17 | [tool.hatch.metadata.hooks.requirements_txt]
18 | files = ["src/graphrag_toolkit/byokg_rag/requirements.txt"]
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/byokg_rag/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/byokg_rag/graph_connectors/__init__.py:
--------------------------------------------------------------------------------
1 | from .kg_linker import *
2 |
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/byokg_rag/graph_connectors/prompts/kg_linker_prompt.yaml:
--------------------------------------------------------------------------------
1 | kg-linker-prompt:
2 | system-prompt: '''You are a highly skilled Graph Query Language interpreter specializing in translating natural language questions into precise graph queries. With expertise in graph database structures, your role is to respond to user tasks, which may involve the following:
3 | - Analyze natural language questions thoroughly
4 | - Identify relevant entities within the question context
5 | - Determine appropriate relationship paths within the graph schema
6 | - Generate executable graph queries that accurately represent the user intent
7 |
8 | Your responses must be technically accurate, follow the exact format requested, and only use relationship types and properties that exist in the provided schema.
9 | You may also be provided with additional graph context, which you should utilize to tackle any related tasks.
10 | '''
11 | user-prompt: '''Given a question, schema, and optional graph context, your role is to perform the following tasks:
12 |
13 | {{task_prompts}}
14 |
15 | ### Important Instructions:
16 | - Respond ONLY to the requested tasks with proper tags
17 | - Do not provide explanations outside of the tagged sections
18 | - Use the exact relationship types and property names from the schema
19 | - Ensure any generated graph query (if any) is syntactically correct and executable
20 | - **When graph context is provided, thoroughly leverage it to improve all aspects of your response**
21 |
22 | Now, please analyze the following:
23 |
24 | Question: {question}
25 | Schema: {schema}
26 | Graph Context: {graph_context}
27 | '''
28 |
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/byokg_rag/graph_retrievers/__init__.py:
--------------------------------------------------------------------------------
1 | from .graph_retrievers import *
2 | from .graph_reranker import *
3 | from .graph_traversal import *
4 | from .graph_verbalizer import *
5 | from .entity_linker import *
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/byokg_rag/graph_retrievers/graph_reranker.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | import numpy as np
3 | import torch
4 |
class GReranker(ABC):
    """
    Abstract base class for GraphRAG rerankers.

    Concrete subclasses score a list of candidate texts against a query
    and return them ordered by relevance.
    """

    def __init__(self):
        """Initialize the graph reranker; subclasses may extend this."""

    @abstractmethod
    def rerank_input_with_query(self, query, input, topk=None):
        """
        Rerank the given input items by relevance to the query.

        Args:
            query (str): The query string.
            input (list): Candidate texts to be reranked.
            topk (int, optional): Number of top results to keep.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError("Method rerank_input_with_query must be implemented")
28 |
class LocalGReranker(GReranker):
    """
    Local reranker on a single machine using BGE reranker cross-encoder models.
    """

    def __init__(self, model_name="BAAI/bge-reranker-base", topk=10, device="cuda"):
        """
        Load the reranker model and tokenizer onto the given device.

        Args:
            model_name (str): One of the supported BAAI BGE reranker checkpoints.
            topk (int): Default number of results kept when reranking.
            device (str): Torch device the model is moved to (default "cuda").
        """
        assert model_name in ["BAAI/bge-reranker-base", "BAAI/bge-reranker-large", "BAAI/bge-reranker-v2-m3"], "Model name not supported"
        self.model_name = model_name
        # transformers is imported lazily so it is only required when a
        # local reranker is actually constructed
        from transformers import AutoModelForSequenceClassification, AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.reranker = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
        self.reranker.eval()

        self.topk = topk

    def calculate_score(self, pairs):
        """
        Score the given (query, text) pairs with the cross-encoder.

        Args:
            pairs (list): List of [query, text] pairs.

        Returns:
            torch.Tensor: One relevance logit per pair.
        """
        if self.model_name not in ["BAAI/bge-reranker-base", "BAAI/bge-reranker-large", "BAAI/bge-reranker-v2-m3"]:
            raise NotImplementedError
        with torch.no_grad():
            encoded = self.tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
            encoded = encoded.to(self.reranker.device)
            return self.reranker(**encoded, return_dict=True).logits.view(-1, ).float()

    def filter_topk(self, query, input, topk=10, return_scores=False):
        """
        Keep the top-k entries of ``input`` ranked by reranker score.

        Args:
            query (str or list): A single query paired with every input, or a
                list of queries zipped element-wise with the inputs.
            input (list): Candidate texts.
            topk (int): Number of entries to keep.
            return_scores (bool): Also return the scores of the kept entries.

        Returns:
            tuple: (top inputs, top indices), or
            (top inputs, top scores, top indices) when return_scores is True.
        """
        if isinstance(query, str):
            pairs = [[query, candidate] for candidate in input]
        else:
            pairs = [list(pair) for pair in zip(query, input)]
        # move scores to CPU before the numpy conversion
        score = self.calculate_score(pairs).cpu()
        # argsort on negated scores -> indices in descending score order;
        # 'stable' preserves the original order of ties
        ids = np.argsort(-np.array(score), kind="stable")
        top_ids = ids[:topk]
        top_input = [input[i] for i in top_ids]

        if return_scores:
            return top_input, [score[i] for i in top_ids], top_ids
        return top_input, top_ids

    def rerank_input_with_query(self, query, input, topk=None, return_scores=False):
        """
        Rerank the given input based on the query.

        Args:
            query (str): The query string.
            input (list): List of input to be reranked.
            topk (int, optional): Number of results to keep; defaults to the
                value supplied at construction time.
            return_scores (bool): Also return the scores of the kept entries.

        Returns:
            list: Reranked list of input (plus scores/indices, see filter_topk).
        """
        return self.filter_topk(query, input, topk=topk or self.topk, return_scores=return_scores)
92 |
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/byokg_rag/graph_retrievers/prompts/agent_prompts.yaml:
--------------------------------------------------------------------------------
1 | relation_selection_prompt: '''
2 | Your task is to select the most appropriate relations based on their relevance to a given question.
3 |
4 | ### Formatting Requirements:
5 | 1. Each selected relation and score MUST be on a new line
6 | 2. Do not use commas, semicolons, or any other delimiters between relations or scores
7 |
8 |
9 | Follow these steps:
10 | 1. Read the provided question carefully.
11 | 2. Analyze each relation in the list and determine its relevance to the question and relation.
12 | 3. Respond by selecting the most relevant relations within tags. Be both frugal on your selection and consider completeness.
13 | 4. The selected relations should be provided line-by-line.
14 |
15 |
16 | Example format:
17 |
18 | Name the president of the country whose main spoken language was English in 1980?
19 |
20 |
21 |
22 | English Language
23 | English
24 |
25 |
26 |
27 | language.human_language.main_country
28 | language.human_language.language_family
29 | language.human_language.iso_639_3_code
30 | base.rosetta.languoid.parent
31 | language.human_language.countries_spoken_in
32 |
33 |
34 |
35 | language.human_language.main_country
36 | base.rosetta.languoid.parent
37 | language.human_language.countries_spoken_in
38 |
39 |
40 | Explanation: language.human_language.main_country relation is highly relevant as it directly relates to the country whose president is being asked for, and the main country where English language is spoken in 1980.
41 | language.human_language.countries_spoken_in relation is also relevant as it provides information on the countries where English language is spoken, which could help narrow down the search for the president.
42 | base.rosetta.languoid.parent relation is less relevant but still provides some context on the language family
43 |
44 | Important Instructions: Always return at least one relation.
45 | Now it is your turn.
46 |
47 |
48 | {question}
49 |
50 |
51 |
52 | {entity}
53 |
54 |
55 |
56 | {relations}
57 |
58 |
59 | Remember to parse your response in tags:
60 | '''
61 |
62 |
63 |
64 | entity_selection_prompt: '''
65 | Given a question and the associated retrieved knowledge graph context (entity, relation, entity), you are asked to select the most important entities to explore in order to answer the question.
66 | Consider important entities only that are necessary for answering the question. Do not select entities, for which we already have all necessary information.
67 |
68 | - Format your response exactly as follows:
69 |
70 | relevant_entity1
71 | relevant_entity2
72 | ...
73 |
74 |
75 | The selected entities must be provided line-by-line (\n).
76 |
77 | Example format:
78 | Question: Name the president of the country whose main spoken language was English in 1980?
79 | Graph Context: English Language -> language.human_language.countries_spoken_in -> England | USA
80 |
81 |
82 | England
83 | USA
84 |
85 |
86 |
87 | The entities should be sorted from the most important to the least important.
88 | Important Instruction: If we can answer the question directly based on the provided graph context, respond with:
89 |
90 | FINISH
91 |
92 |
93 | - Now Respond ONLY to the requested tasks with proper tags
94 |
95 | Question: {question}
96 | Graph Context: {graph_context}
97 | '''
98 |
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/byokg_rag/graphstore/__init__.py:
--------------------------------------------------------------------------------
1 | from .graphstore import *
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/byokg_rag/indexing/__init__.py:
--------------------------------------------------------------------------------
1 | from .index import *
2 | from .fuzzy_string import *
3 |
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/byokg_rag/indexing/fuzzy_string.py:
--------------------------------------------------------------------------------
1 | from thefuzz import fuzz, process
2 | from abc import ABC, abstractmethod
3 | from typing import List
4 | from .index import Index
5 |
6 |
class FuzzyStringIndex(Index):
    """
    An index that matches query strings to vocabulary entries using
    fuzzy string similarity (via the thefuzz library).
    """

    def __init__(self):
        super().__init__()  # ensure proper initialization of the base class
        self.vocab = []

    def reset(self):
        """Drop all vocabulary entries from the index."""
        self.vocab = []

    def query(self, input, topk=1, id_selector=None):
        """
        Match a query to items in the index and return the topk results.

        :param input: str, the query to match
        :param topk: number of items to return
        :param id_selector: a list of ids to retrieve the topk from, i.e. an allowlist
        :return: dict with a 'hits' list of {'document_id', 'document', 'match_score'}
        """
        if id_selector is not None:
            raise NotImplementedError(f"id_selector not implemented for FuzzyString")

        # string matching process from thefuzz library https://pypi.org/project/thefuzz/
        matches = process.extract(input, self.vocab, limit=topk)

        hits = [
            {'document_id': text, 'document': text, 'match_score': similarity}
            for text, similarity in matches
        ]
        return {'hits': hits}

    def match(self, inputs, topk=1, id_selector=None, max_len_difference=4):
        """
        Match a batch of entity strings to the vocabulary.

        :param inputs: list(str) of entities per query to match
        :param topk: number of items to return per entity
        :param id_selector: a list of ids to retrieve the topk from, i.e. an allowlist
        :param max_len_difference: candidates more than this many characters
            shorter than the entity are skipped
        :return: dict with a 'hits' list sorted by descending match score
        """
        if id_selector is not None:
            raise NotImplementedError(f"id_selector not implemented for {self.__class__.__name__}")

        candidates = []
        for entity in inputs:
            # string matching process from thefuzz library https://pypi.org/project/thefuzz/
            for candidate in process.extract(entity, self.vocab, limit=topk):
                # skip matches that are much shorter than the entity string
                if len(candidate[0]) + max_len_difference >= len(entity):
                    candidates.append(candidate)

        # stable sort, best matches first (same ordering as sorted(..., reverse=True))
        candidates.sort(key=lambda pair: pair[1], reverse=True)

        return {'hits': [{'document_id': text,
                          'document': text,
                          'match_score': similarity}
                         for text, similarity in candidates]}

    def add(self, vocab_list):
        """
        Add vocab instances to the index.

        :param vocab_list: list of vocab instances to add
        """
        # set union de-duplicates; note this does not preserve insertion order
        self.vocab = list(set(self.vocab) | set(vocab_list))

    def add_with_ids(self, ids, vocab_list):
        raise NotImplementedError(f"add_with_ids not implemented for {self.__class__.__name__}")
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/byokg_rag/indexing/index.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import List
3 |
class Index(ABC):
    """
    Abstract base class for indexes that store documents and answer
    matching queries over them.
    """

    def __init__(self):
        pass

    @abstractmethod
    def reset(self):
        """
        Reset the index to empty its contents without needing to create a new index object.
        """
        pass

    @abstractmethod
    def query(self, input, topk=1):
        """
        Match a query to items in the index and return the topk results.

        :param input: str, the query to match
        :param topk: number of items to return
        :return:
        """
        pass

    @abstractmethod
    def add(self, documents):
        """
        Add documents to the index.

        :param documents: list of documents to add
        """
        pass

    def add_with_ids(self, ids, documents):
        """
        Add documents with their given ids to the index.

        :param ids: list of document ids, in the same order as documents
        :param documents: list of documents to add
        :return:
        """
        pass

    def as_retriever(self):
        """Wrap this index in a Retriever for batch querying."""
        return Retriever(index=self)

    def as_entity_matcher(self):
        """Wrap this index in an EntityMatcher for entity matching."""
        return EntityMatcher(index=self)
56 |
class Retriever:
    """
    Base class for Retriever. Given a set of queries, the retriever can process the
    input, query the index and potentially post-process the output.
    """

    def __init__(self, index):
        # The underlying Index instance used to answer queries.
        self.index = index

    # NOTE: the original decorated this method with @abstractmethod, but Retriever
    # is a plain class (not an ABC) and the method has a full implementation, so
    # the decorator had no effect and was misleading. Removed.
    def retrieve(self, queries: List[str], topk=1, id_selectors=None, **kwargs):
        """
        Query the index for each query string and collect the per-query results.

        :param queries: list of query strings
        :param topk: number of items to return per query
        :param id_selectors: optional list of per-query allowlists (one list of
            ids per query); an empty allowlist skips retrieval for that query
        :return: list of per-query result dicts
        :raises ValueError: if id_selectors is supplied but is not a list of lists
        """
        items = []
        if isinstance(id_selectors, list):
            if all(isinstance(item, list) for item in id_selectors):
                # id selector only allows one query per time
                for query, id_selector in zip(queries, id_selectors):
                    if len(id_selector) == 0:
                        # if no id is selected skip retrieval
                        items.append({"hits": []})
                    else:
                        items.append(self.index.query(query, topk, id_selector, **kwargs))
            else:
                raise ValueError("id_selectors must be a list of lists")
        else:
            for query in queries:
                items.append(self.index.query(query, topk, **kwargs))
        return items
84 |
class EntityMatcher(Retriever):
    """
    Base class for entity matching. Given a set of extracted entities, the matcher
    returns the matched entities from the vocab of the underlying index.
    """

    # NOTE: the original decorated this method with @abstractmethod, but the class
    # is concrete (no ABCMeta) and the method has an implementation, so the
    # decorator had no effect and was misleading. Removed.
    def retrieve(self, queries: List[str], **kwargs):
        """
        Match the supplied entity strings against the index vocabulary.

        :param queries: list of entity strings to match
        :return: the underlying index's match result
        """
        return self.index.match(queries, **kwargs)
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/byokg_rag/llm/__init__.py:
--------------------------------------------------------------------------------
1 | from .bedrock_llms import *
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/byokg_rag/prompts/generation_prompts.yaml:
--------------------------------------------------------------------------------
1 | generate-response-qa: '''
2 | ### Task: Question Answering
3 | Answer the question using your existing knowledge base or the external information provided in the graph context (if provided).
4 |
5 | You are allowed to perform chain-of-thought or thinking but the final answers should be in tags with the following instructions:
6 | - Provide only direct entity answers that specifically address the question
7 | - Each answer should be a distinct, well-defined entity (person, place, organization, concept, etc.)
8 | - List multiple answers if appropriate, with each answer on a separate line
9 | - Do not include explanations, reasoning, context, or commentary of any kind
10 | - Do not preface or conclude your answer with statements like "Based on my knowledge..." or "The answers are..."
11 | - **If graph context is provided, prioritize answers that can be derived from the context over general knowledge**
12 | - Format your response exactly as follows, where answers are separated by newlines:
13 |
14 |
15 | answer1
16 | answer2
17 | ...
18 |
19 |
20 | If the answer cannot be directly determined by the provided graph context, use your own knowledge.
21 | Try to always output an answer.
22 |
23 | Now, please answer the following:
24 |
25 | Question: {question}
26 | Graph Context: {graph_context}
27 | '''
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/byokg_rag/requirements.txt:
--------------------------------------------------------------------------------
1 | pydantic>=2.8.2
2 | boto3
3 | xmltodict
4 | colorama
5 | pyyaml
6 | pytest
7 | faiss-cpu==1.9.0
8 | thefuzz
9 | langchain_huggingface
10 | ipykernel
11 | torch
12 | transformers>=4.44.2
13 | numpy>=1.26.4
14 | scipy>=1.15.3
15 |
--------------------------------------------------------------------------------
/byokg-rag/src/graphrag_toolkit/byokg_rag/utils.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import os.path as osp
3 | from colorama import Fore, Style
4 | import re
5 | import string
6 |
7 |
def load_yaml(file_path):
    """
    Load a YAML file and return its parsed content.

    Relative paths are resolved against the directory containing this module,
    so bundled prompt/config files can be referenced by relative name.

    :param file_path: absolute path, or path relative to this module's directory
    :return: the parsed YAML content (via yaml.safe_load)
    """
    # osp.isabs is portable; the original startswith('/') check missed
    # Windows-style absolute paths such as 'C:\\...'.
    if not osp.isabs(file_path):
        file_path = osp.join(osp.dirname(osp.abspath(__file__)), file_path)
    # Explicit encoding avoids locale-dependent decoding of prompt files.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = yaml.safe_load(file)
    return content
13 |
def color_print(text, color):
    """Print *text* in the named colorama color (bright), then reset styling."""
    styled_prefix = getattr(Fore, color.upper()) + Style.BRIGHT
    print(styled_prefix + text + Style.RESET_ALL)
16 |
def parse_response(response, pattern):
    """
    Extract the first capture group of *pattern* from *response* and return
    its non-empty lines.

    :param response: text to search; non-str inputs yield an empty list
    :param pattern: regex with one capture group, matched with re.DOTALL
    :return: list of non-empty lines from the captured text (empty on no match)
    """
    if not isinstance(response, str):
        return []

    found = re.search(pattern, response, flags=re.DOTALL)
    if found is None:
        return []

    captured = found.group(1)
    return [line for line in captured.strip().split('\n') if line != ""]
31 |
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | ## Documentation
2 |
3 | - [Lexical Graph](./lexical-graph/)
4 |
5 |
--------------------------------------------------------------------------------
/docs/lexical-graph/README.md:
--------------------------------------------------------------------------------
1 | ## Lexical Graph
2 |
3 | - [Overview](./overview.md)
4 | - [Storage Model](./storage-model.md)
5 | - [Indexing](./indexing.md)
6 | - [Batch Extraction](./batch-extraction.md)
7 | - [Querying](./querying.md)
8 | - [Multi-Tenancy](./multi-tenancy.md)
9 | - [Configuration](./configuration.md)
10 | - [Graph Model](./graph-model.md)
11 | - [Security](./security.md)
12 | - [Hybrid Deployment](./hybrid-deployment.md)
13 | - [FAQ](./faq.md)
14 |
15 | #### Code examples
16 |
17 | The code examples throughout the documentation are formatted to run in a Jupyter notebook. If you’re building an application with a main entry point, put your application logic inside a method, and add an [`if __name__ == '__main__'` block](./faq.md#runtimeerror-please-use-nest_asyncioapply-to-allow-nested-event-loops).
18 |
--------------------------------------------------------------------------------
/docs/lexical-graph/aws-profile.md:
--------------------------------------------------------------------------------
1 | # Using AWS Profiles in `GraphRAGConfig`
2 |
3 | This guide explains how to configure and use **AWS named profiles** in the lexical-graph by leveraging the `GraphRAGConfig` class.
4 |
5 | ## What is an AWS Profile?
6 |
7 | AWS CLI and SDKs allow the use of named profiles to manage different sets of credentials. Each profile typically contains:
8 | - Access key ID
9 | - Secret access key
10 | - (Optional) Session token
11 | - (Optional) Default region
12 |
13 | These profiles are stored in:
14 | - `~/.aws/credentials`
15 | - `~/.aws/config`
16 |
17 | ---
18 |
19 | ## How `GraphRAGConfig` Uses AWS Profiles
20 |
21 | ### 1. **Automatic Detection**
22 | If no profile is explicitly provided, `GraphRAGConfig` attempts to use:
23 | ```python
24 | os.environ.get("AWS_PROFILE")
25 | ```
26 |
27 | If that’s not set, it will fall back to the default AWS behavior.
28 |
29 | ---
30 |
31 | ### 2. **Explicit Profile Setting**
32 |
33 | You can programmatically set a profile:
34 |
35 | ```python
36 | from graphrag_toolkit.config import GraphRAGConfig
37 |
38 | GraphRAGConfig.aws_profile = "padmin"
39 | ```
40 |
41 | This automatically resets any previously cached clients or sessions to ensure all AWS service interactions use the new credentials.
42 |
43 | ---
44 |
45 | ### 3. **Where Profiles are Used**
46 |
47 | When you call:
48 |
49 | ```python
50 | GraphRAGConfig.session
51 | ```
52 |
53 | or use properties like:
54 |
55 | ```python
56 | GraphRAGConfig.bedrock
57 | GraphRAGConfig.s3
58 | GraphRAGConfig.rds
59 | ```
60 |
61 | the SDK creates the respective clients using the active profile and region.
62 |
63 | ---
64 |
65 | ## Example with Environment Variables
66 |
67 | You can export the profile and region before running your app:
68 |
69 | ```bash
70 | export AWS_PROFILE=padmin
71 | export AWS_REGION=us-east-1
72 | python my_app.py
73 | ```
74 |
75 | Or set them inline:
76 |
77 | ```bash
78 | AWS_PROFILE=padmin AWS_REGION=us-east-1 python my_app.py
79 | ```
80 |
81 | ---
82 |
83 | ## Profile-Based Multi-Account Testing
84 |
85 | To test across AWS accounts:
86 | ```python
87 | GraphRAGConfig.aws_profile = "dev-profile"
88 | GraphRAGConfig.aws_region = "us-west-2"
89 |
90 | bedrock = GraphRAGConfig.bedrock # Will use dev-profile in us-west-2
91 | ```
92 |
93 | ---
94 |
95 | ## Common Pitfalls
96 |
97 | - **Missing Profile**: Ensure the profile exists in `~/.aws/credentials` and is not misspelled.
98 | - **Access Denied**: Check IAM permissions for the services you're trying to access.
99 | - **Region mismatch**: Bedrock may only be available in specific regions (e.g., `us-east-1`).
100 |
101 | ---
102 |
103 | ## Summary
104 |
105 | | Use Case | How to Do It |
106 | |-----------------------------|------------------------------------------------------------|
107 | | Default profile | Rely on environment variables or default config |
108 | | Programmatic override | `GraphRAGConfig.aws_profile = "my-profile"` |
109 | | Switch regions | `GraphRAGConfig.aws_region = "us-east-2"` |
110 | | Full override | Set both profile and region before invoking `.session` |
111 | | Create boto3 clients | Use `.bedrock`, `.s3`, or `.rds` properties |
--------------------------------------------------------------------------------
/docs/lexical-graph/graph-store-falkor-db.md:
--------------------------------------------------------------------------------
1 | [[Home](./)]
2 |
3 | ## FalkorDB as a Graph Store
4 |
5 | ### Topics
6 |
7 | - [Overview](#overview)
8 | - [Install package](#install-package)
9 | - [Registering FalkorDB as a graph store](#registering-falkordb-as-a-graph-store)
10 | - [Creating a FalkorDB graph store](#creating-a-falkordb-graph-store)
11 |
12 | ### Overview
13 |
14 | You can use FalkorDB as a graph store.
15 |
16 | ### Install package
17 |
18 | The FalkorDB graph store is contained in a separate contributor package. To install it:
19 |
20 | ```
21 | !pip install https://github.com/awslabs/graphrag-toolkit/archive/refs/tags/v3.8.1.zip#subdirectory=lexical-graph-contrib/falkordb
22 | ```
23 |
24 | ### Registering FalkorDB as a graph store
25 |
26 | Before creating a FalkorDB graph store, you must register the `FalkorDBGraphStoreFactory` with the `GraphStoreFactory`:
27 |
28 | ```python
29 | from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
30 | from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory
31 |
32 | GraphStoreFactory.register(FalkorDBGraphStoreFactory)
33 |
34 | ```
35 |
36 | ### Creating a FalkorDB graph store
37 |
38 | You can use the `GraphStoreFactory.for_graph_store()` static factory method to create an instance of a FalkorDB graph store.
39 |
40 | The FalkorDB graph store currently supports the [SemanticGuidedRetriever](./querying.md#semanticguidedretriever). It does not support the [TraversalBasedRetriever](./querying.md#traversalbasedretriever).
41 |
42 | To create a [FalkorDB Cloud](https://app.falkordb.cloud/) graph store, supply a connection string that begins `falkordb://`, followed by the FalkorDB endpoint:
43 |
44 | ```python
45 | from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
46 | from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory
47 |
48 | falkordb_connection_info = 'falkordb://your-falkordb-endpoint'
49 |
50 | GraphStoreFactory.register(FalkorDBGraphStoreFactory)
51 |
52 | graph_store = GraphStoreFactory.for_graph_store(falkordb_connection_info)
53 | ```
54 |
55 | You may also need to pass a username and password, and specify whether or not to use SSL:
56 |
57 | ```python
58 | from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
59 |
60 | falkordb_connection_info = 'falkordb://'
61 |
62 | graph_store = GraphStoreFactory.for_graph_store(
63 | falkordb_connection_info,
64 | username='',
65 | password='',
66 | ssl=True
67 | )
68 | ```
69 |
70 | To create a local FalkorDB graph store, supply a connection string that has only `falkordb://`:
71 |
72 | ```python
73 | from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
74 |
75 | falkordb_connection_info = 'falkordb://'
76 |
77 | graph_store = GraphStoreFactory.for_graph_store(falkordb_connection_info)
78 | ```
79 |
80 |
--------------------------------------------------------------------------------
/docs/lexical-graph/graph-store-neptune-analytics.md:
--------------------------------------------------------------------------------
1 | [[Home](./)]
2 |
3 | ## Neptune Analytics as a Graph Store
4 |
5 | ### Topics
6 |
7 | - [Overview](#overview)
8 | - [Creating a Neptune Analytics graph store](#creating-a-neptune-analytics-graph-store)
9 |
10 | ### Overview
11 |
12 | You can use Amazon Neptune Analytics as a graph store.
13 |
14 | ### Creating a Neptune Analytics graph store
15 |
16 | Use the `GraphStoreFactory.for_graph_store()` static factory method to create an instance of a Neptune Analytics graph store.
17 |
18 | To create a Neptune Analytics graph store, supply a connection string that begins `neptune-graph://`, followed by the graph's identifier:
19 |
20 | ```
21 | from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
22 |
23 | neptune_connection_info = 'neptune-graph://g-jbzzaqb209'
24 |
25 | graph_store = GraphStoreFactory.for_graph_store(neptune_connection_info)
26 | ```
27 |
28 |
--------------------------------------------------------------------------------
/docs/lexical-graph/graph-store-neptune-db.md:
--------------------------------------------------------------------------------
1 | [[Home](./)]
2 |
3 | ## Neptune Database as a Graph Store
4 |
5 | ### Topics
6 |
7 | - [Overview](#overview)
8 | - [Creating a Neptune Database graph store](#creating-a-neptune-database-graph-store)
9 | - [Connecting to Neptune via a proxy](#connecting-to-neptune-via-a-proxy)
10 |
11 | ### Overview
12 |
13 | You can use Amazon Neptune Database as a graph store. The lexical-graph requires [Neptune engine version](https://docs.aws.amazon.com/neptune/latest/userguide/engine-releases.html) 1.4.1.0 or later.
14 |
15 | ### Creating a Neptune Database graph store
16 |
17 | Use the `GraphStoreFactory.for_graph_store()` static factory method to create an instance of a Neptune Database graph store.
18 |
19 | To create a Neptune Database graph store (engine version 1.4.1.0 or later), supply a connection string that begins `neptune-db://`, followed by an [endpoint](https://docs.aws.amazon.com/neptune/latest/userguide/feature-overview-endpoints.html):
20 |
21 | ```python
22 | from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
23 |
24 | neptune_connection_info = 'neptune-db://mydbcluster.cluster-123456789012.us-east-1.neptune.amazonaws.com:8182'
25 |
26 | graph_store = GraphStoreFactory.for_graph_store(neptune_connection_info)
27 | ```
28 |
29 | #### Connecting to Neptune via a proxy
30 |
31 | To connect to Neptune via a proxy (e.g. a load balancer), you must supply a config dictionary to the `GraphStoreFactory.for_graph_store()` factory method, with a `proxies` dictionary of proxy servers to use by protocol or endpoint:
32 |
33 | ```python
34 | from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
35 |
36 | neptune_connection_info = 'neptune-db://mydbcluster.cluster-123456789012.us-east-1.neptune.amazonaws.com:8182'
37 |
38 | config = {
39 | 'proxies': {
40 | 'http': 'http://proxy-hostname:80'
41 | }
42 | }
43 |
44 | graph_store = GraphStoreFactory.for_graph_store(
45 | neptune_connection_info,
46 | config=config
47 | )
48 | ```
49 |
--------------------------------------------------------------------------------
/docs/lexical-graph/hybrid-deployment.md:
--------------------------------------------------------------------------------
1 |
2 | [[Home](./)]
3 |
4 | ## Hybrid Deployment
5 |
6 | ### Topics
7 |
8 | - [Overview](#overview)
9 | - [Stores and model providers](#stores-and-model-providers)
10 | - [Indexing and querying](#indexing-and-querying)
11 | - [Indexing](#indexing)
12 |
13 | ### Overview
14 |
15 | Hybrid deployment enables flexible deployment: high-throughput LLM inference via SageMaker and Bedrock, and cost-effective local development using containerized graph/vector stores.
16 |
17 | ### Stores and model providers
18 |
19 | The `lexical-graph` library depends on three backend systems: a [*graph store*](./storage-model.md#graph-store), a [*vector store*](./storage-model.md#vector-store), and a *foundation model provider*. The graph store enables storage and querying of a lexical graph built from unstructured, text-based sources. The vector store contains one or more indexes with embeddings for selected graph elements, which help identify starting points for graph queries. The foundation model provider hosts the Large Language Models (LLMs) used for extraction and embedding.
20 |
21 | The library provides built-in support for:
22 |
23 | * Graph stores: [Amazon Neptune Database](https://docs.aws.amazon.com/neptune/latest/userguide/intro.html), [Amazon Neptune Analytics](https://docs.aws.amazon.com/neptune-analytics/latest/userguide/what-is-neptune-analytics.html), and local [FalkorDB](https://falkordb.com/) (via Docker)
24 | * Vector stores: [Amazon OpenSearch Serverless](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless.html), [PostgreSQL with `pgvector`](https://github.com/pgvector/pgvector), Neptune Analytics, and local [PostgreSQL with `pgvector`](https://github.com/pgvector/pgvector)
25 | * Foundation model provider: [Amazon Bedrock](https://aws.amazon.com/bedrock/)
26 |
27 | This hybrid configuration enables flexible deployment: high-throughput LLM inference via SageMaker and Bedrock, and cost-effective local development using containerized graph/vector stores.
28 |
29 | ### Indexing and querying
30 |
31 | The lexical-graph library implements two high-level processes: [_indexing_](./indexing.md) and [_querying_](./querying.md). The indexing process ingests and extracts information from unstructured, text-based source documents and then builds a graph and accompanying vector indexes. The query process retrieves content from the graph and vector indexes, and then supplies this content as context to an LLM to answer a user question.
32 |
33 | #### Indexing
34 |
35 | Indexing is split into two pipeline stages: **Extract** and **Build**.
36 |
37 | The **Extract** stage runs **locally using Docker**:
38 |
39 | * Loads and chunks documents
40 | * Performs two LLM-based extraction steps:
41 |
42 | * *Proposition extraction*: Converts chunked text into well-formed statements
43 | * *Topic/entity/fact extraction*: Identifies relations and concepts
44 | * Stores the extracted results in an **AWS S3 bucket**, serving as the transport medium between stages
45 |
46 | The **Build** stage remains unchanged.
47 |
48 | 
--------------------------------------------------------------------------------
/docs/lexical-graph/multi-tenancy.md:
--------------------------------------------------------------------------------
1 | [[Home](./)]
2 |
3 | ## Multi-Tenancy
4 |
5 | ### Topics
6 |
7 | - [Overview](#overview)
8 | - [Tenant Id](#tenant-id)
9 | - [Indexing and multi-tenancy](#indexing-and-multi-tenancy)
10 | - [Querying and multi-tenancy](#querying-and-multi-tenancy)
11 | - [Implementation details](#implementation-details)
12 |
13 | ### Overview
14 |
15 | Multi-tenancy allows you to host multiple separate lexical graphs in the same underlying graph and vector stores.
16 |
17 | ### Tenant Id
18 |
19 | To use the multi-tenancy feature, you must supply a tenant id when creating a `LexicalGraphIndex` or `LexicalGraphQueryEngine`. A tenant id is a string containing 1-10 lower case characters and numbers. If you don't supply a tenant id, the index and query engine will use the _default tenant_ (i.e. a tenant id value of `None`).
20 |
21 | ### Indexing and multi-tenancy
22 |
23 | The following example creates a `LexicalGraphIndex` for tenant 'user123':
24 |
25 | ```python
26 | from graphrag_toolkit.lexical_graph import LexicalGraphIndex
27 |
28 | graph_store = ...
29 | vector_store = ...
30 |
31 | graph_index = LexicalGraphIndex(
32 | graph_store,
33 | vector_store,
34 | tenant_id='user123'
35 | )
36 | ```
37 |
38 | The `LexicalGraphIndex` always uses the _default tenant_ for the [extract stage](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/indexing.md#extract), even if you supply a different tenant id. The [build stage](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/indexing.md#build), however, will use the tenant id. The reason for this is so that you can extract once, and then build many times, potentially for different tenants.
39 |
40 | ### Querying and multi-tenancy
41 |
42 | The following example creates a `LexicalGraphQueryEngine` for tenant 'user123':
43 |
44 | ```python
45 | from graphrag_toolkit.lexical_graph import LexicalGraphQueryEngine
46 |
47 | graph_store = ...
48 | vector_store = ...
49 |
50 | query_engine = LexicalGraphQueryEngine.for_traversal_based_search(
51 | graph_store,
52 | vector_store,
53 | tenant_id='user123'
54 | )
55 | ```
56 |
57 | If a lexical graph does not exist for the specified tenant id, the underlying retrievers will return an empty set of results.
58 |
59 | ### Implementation details
60 |
61 | Multi-tenancy works by using tenant-specific node labels for nodes in the graph, and tenant-specific indexes in the vector store. For example, chunk nodes in a graph belonging to tenant 'user123' will be labelled `__Chunk__user123__`, while the chunk vector index will be named `chunk_user123`.
62 |
63 | Not every graph and vector store necessarily supports multi-tenancy. Neptune Analytics, when used as a vector store, for example, does not currently support multi-tenancy.
--------------------------------------------------------------------------------
/docs/lexical-graph/prompts.md:
--------------------------------------------------------------------------------
1 |
2 | ## Using Custom Prompt Providers
3 |
4 | The GraphRAG Toolkit supports pluggable prompt providers to allow dynamic loading of prompt templates from various sources. There are four built-in providers:
5 |
6 | ### 1. StaticPromptProvider
7 |
8 | Use this when your system and user prompts are defined as constants in your codebase.
9 |
10 | ```python
11 | from graphrag_toolkit.lexical_graph.prompts.static_prompt_provider import StaticPromptProvider
12 |
13 | prompt_provider = StaticPromptProvider()
14 | ```
15 |
16 | This provider uses the predefined constants `ANSWER_QUESTION_SYSTEM_PROMPT` and `ANSWER_QUESTION_USER_PROMPT`.
17 |
18 | ---
19 |
20 | ### 2. FilePromptProvider
21 |
22 | Use this when your prompts are stored locally on disk.
23 |
24 | ```python
25 | from graphrag_toolkit.lexical_graph.prompts.file_prompt_provider import FilePromptProvider
26 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import FilePromptProviderConfig
27 |
28 | prompt_provider = FilePromptProvider(
29 | FilePromptProviderConfig(base_path="./prompts"),
30 | system_prompt_file="system.txt",
31 | user_prompt_file="user.txt"
32 | )
33 | ```
34 |
35 | The prompt files are read from a directory (`base_path`), and you can override the file names if needed.
36 |
37 | ---
38 |
39 | ### 3. S3PromptProvider
40 |
41 | Use this when your prompts are stored in an Amazon S3 bucket.
42 |
43 | ```python
44 | from graphrag_toolkit.lexical_graph.prompts.s3_prompt_provider import S3PromptProvider
45 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import S3PromptProviderConfig
46 |
47 | prompt_provider = S3PromptProvider(
48 | S3PromptProviderConfig(
49 | bucket="ccms-prompts",
50 | prefix="prompts",
51 | aws_region="us-east-1", # optional if set via env
52 | aws_profile="my-profile", # optional if using default profile
53 | system_prompt_file="my_system.txt", # optional override
54 | user_prompt_file="my_user.txt" # optional override
55 | )
56 | )
57 | ```
58 |
59 | Prompts are loaded using `boto3` and AWS credentials. Ensure your environment or `~/.aws/config` is configured for SSO, roles, or keys.
60 |
61 | ---
62 |
63 | ### 4. BedrockPromptProvider
64 |
65 | Use this when your prompts are stored and versioned using Amazon Bedrock prompt ARNs.
66 |
67 | ```python
68 | from graphrag_toolkit.lexical_graph.prompts.bedrock_prompt_provider import BedrockPromptProvider
69 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import BedrockPromptProviderConfig
70 |
71 | prompt_provider = BedrockPromptProvider(
72 | config=BedrockPromptProviderConfig(
73 | system_prompt_arn="arn:aws:bedrock:us-east-1:123456789012:prompt/my-system",
74 | user_prompt_arn="arn:aws:bedrock:us-east-1:123456789012:prompt/my-user",
75 | system_prompt_version="DRAFT",
76 | user_prompt_version="DRAFT"
77 | )
78 | )
79 | ```
80 |
81 | This provider resolves prompt ARNs dynamically using STS and can fall back to environment variables if needed.
82 |
83 |
--------------------------------------------------------------------------------
/docs/lexical-graph/vector-store-neptune-analytics.md:
--------------------------------------------------------------------------------
1 | [[Home](./)]
2 |
3 | ## Neptune Analytics as a Vector Store
4 |
5 | ### Topics
6 |
7 | - [Overview](#overview)
8 | - [Creating a Neptune Analytics vector store](#creating-a-neptune-analytics-vector-store)
9 |
10 | ### Overview
11 |
12 | You can use Amazon Neptune Analytics as a vector store.
13 |
14 | ### Creating a Neptune Analytics vector store
15 |
16 | Use the `VectorStoreFactory.for_vector_store()` static factory method to create an instance of an Amazon Neptune Analytics vector store.
17 |
18 | To create a Neptune Analytics vector store, supply a connection string that begins `neptune-graph://`, followed by the graph's identifier:
19 |
20 | ```python
21 | from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
22 |
23 | neptune_connection_info = 'neptune-graph://g-jbzzaqb209'
24 |
25 | vector_store = VectorStoreFactory.for_vector_store(neptune_connection_info)
26 | ```
27 |
28 |
--------------------------------------------------------------------------------
/docs/lexical-graph/vector-store-opensearch-serverless.md:
--------------------------------------------------------------------------------
1 | [[Home](./)]
2 |
3 | ## Amazon OpenSearch Serverless as a Vector Store
4 |
5 | ### Topics
6 |
7 | - [Overview](#overview)
8 | - [Install dependencies](#install-dependencies)
9 | - [Creating an OpenSearch Serverless vector store](#creating-an-opensearch-serverless-vector-store)
10 |
11 | ### Overview
12 |
13 | You can use an Amazon OpenSearch Serverless collection as a vector store.
14 |
15 | ### Install dependencies
16 |
17 | The OpenSearch vector store requires both the `opensearch-py` and `llama-index-vector-stores-opensearch` packages:
18 |
19 | ```
20 | pip install opensearch-py llama-index-vector-stores-opensearch
21 | ```
22 |
23 | ### Creating an OpenSearch Serverless vector store
24 |
25 | Use the `VectorStoreFactory.for_vector_store()` static factory method to create an instance of an Amazon OpenSearch Serverless vector store.
26 |
27 | To create an Amazon OpenSearch Serverless vector store, supply a connection string that begins `aoss://`, followed by the https endpoint of the OpenSearch Serverless collection:
28 |
29 | ```python
30 | from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
31 |
32 | opensearch_connection_info = 'aoss://https://123456789012.us-east-1.aoss.amazonaws.com'
33 |
34 | vector_store = VectorStoreFactory.for_vector_store(opensearch_connection_info)
35 | ```
36 |
--------------------------------------------------------------------------------
/docs/lexical-graph/vector-store-postgres.md:
--------------------------------------------------------------------------------
1 | [[Home](./)]
2 |
3 | ## Postgres as a Vector Store
4 |
5 | ### Topics
6 |
7 | - [Overview](#overview)
8 | - [Install dependencies](#install-dependencies)
9 | - [Creating Postgres vector store](#creating-a-postgres-vector-store)
10 | - [Connecting to an IAM auth-enabled Postgres vector store](#connecting-to-an-iam-auth-enabled-postgres-vector-store)
11 |
12 | ### Overview
13 |
14 | You can use a Postgres database with the [pgvector](https://github.com/pgvector/pgvector) extension as a vector store.
15 |
16 | ### Install dependencies
17 |
18 | The Postgres vector store requires both the `psycopg2` and `pgvector` packages:
19 |
20 | ```
21 | pip install psycopg2-binary pgvector
22 | ```
23 |
24 | ### Creating a Postgres vector store
25 |
26 | Use the `VectorStoreFactory.for_vector_store()` static factory method to create an instance of a Postgres vector store.
27 |
28 | To create a Postgres vector store, supply a connection string in the following format:
29 |
30 | ```
31 | postgresql://[user[:password]@][netloc][:port][/dbname][?param1=value1&...]
32 | ```
33 |
34 | For example:
35 |
36 | ```
37 | postgresql://graphrag:!zfg%dGGh@mydbcluster.cluster-123456789012.us-west-2.rds.amazonaws.com:5432/postgres
38 | ```
39 |
40 | #### Connecting to an IAM auth-enabled Postgres vector store
41 |
42 | If your Postgres database supports [AWS Identity and Access Management (IAM) database authentication](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/UsingWithRDS.IAMDBAuth.html), omit the password, and add `enable_iam_db_auth=True` to the connection string query parameters:
43 |
44 | ```
45 | postgresql://graphrag@mydbcluster.cluster-123456789012.us-west-2.rds.amazonaws.com:5432/postgres?enable_iam_db_auth=True
46 | ```
47 |
48 | You will need to create a database user, and [grant the `rds_iam` role](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/UsingWithRDS.IAMDBAuth.DBAccounts.html#UsingWithRDS.IAMDBAuth.DBAccounts.PostgreSQL) to use IAM authentication.
49 |
50 |
51 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | ## Examples
2 |
3 | - [BYOKG-RAG](./byokg-rag/) Example notebook and dataset demonstrating a RAG (Retrieval Augmented Generation) system built on top of a Knowledge Graph.
4 | - [Lexical Graph](./lexical-graph/) Examples of deploying and running the lexical-graph indexing and querying processes on AWS services
5 | - [Lexical Graph hybrid development](./lexical-graph-hybrid-dev/) Examples of running the indexing extract stage locally and the indexing build stage and querying on AWS services
6 | - [Lexical Graph local development](./lexical-graph-local-dev/) Examples of running the lexical-graph indexing and querying processes locally
--------------------------------------------------------------------------------
/examples/lexical-graph-hybrid-dev/aws/setup-bedrock-batch-doc.md:
--------------------------------------------------------------------------------
1 |
2 | # Bedrock Batch Inference Setup Script Documentation
3 |
4 | This script automates the provisioning of the necessary AWS resources to perform **Batch Model Invocation** jobs with Amazon Bedrock.
5 |
6 | ---
7 |
8 | ## What the Script Does
9 |
10 | 1. **Checks AWS Credentials**
11 | Validates that the AWS CLI is authenticated using either:
12 | - SSO (e.g., `aws sso login --profile padmin`)
13 | - or static credentials (via `aws configure`)
14 |
15 | 2. **Retrieves AWS Account and Region Info**
16 | Using the AWS profile, the script resolves:
17 | - `ACCOUNT_ID`
18 | - `REGION`
19 | - (Optional) Current SSO role being used
20 |
21 | 3. **Creates an S3 Bucket**
22 | Creates a bucket named `ccms-rag-extract-` for uploading input/output files used in batch jobs.
23 |
24 | 4. **Creates an IAM Role for Bedrock (Execution Role)**
25 | - Name: `bedrock-batch-inference-role`
26 | - Trusts the `bedrock.amazonaws.com` service
27 | - Permissions:
28 | Allows access to the newly created S3 bucket.
29 |
30 | 5. **Creates an IAM Identity Policy**
31 | - Name: `bedrock-batch-identity-policy`
32 | - Grants permission to:
33 | - Create, List, Get, and Stop Bedrock model invocation jobs
34 | - Pass the execution role to Bedrock
35 |
36 | 6. **Attaches Policies to Role/User**
37 | - Attaches the role permissions to the `bedrock-batch-inference-role`
38 | - Prints instructions to attach the identity policy manually depending on credential type
39 |
40 | 7. **Cleanup**
41 | Temporary policy files are deleted from the local directory.
42 |
43 | ---
44 |
45 | ## Output Resources
46 |
47 | | Resource | Description |
48 | |---------|-------------|
49 | | S3 Bucket | `ccms-rag-extract-` |
50 | | IAM Role | `bedrock-batch-inference-role` |
51 | | IAM Role Policy | Grants S3 access for batch inference |
52 | | IAM Identity Policy | Grants permission to submit and manage Bedrock batch jobs |
53 |
54 | ---
55 |
56 | ## Usage
57 |
58 | ```bash
59 | bash setup-bedrock-batch.sh padmin
60 | ```
61 |
62 | If no profile is specified, it defaults to `padmin`.
63 |
64 | ---
65 |
66 | ## Manual IAM Setup Required (SSO Users)
67 |
68 | If you're using AWS SSO, the script will print:
69 | ```
70 | NOTE: You are using AWS SSO with role:
71 | To complete setup, you need to:
72 | 1. Go to AWS IAM Identity Center
73 | 2. Find your Permission Set
74 | 3. Add the identity policy (arn:aws:iam:::policy/bedrock-batch-identity-policy) to your Permission Set
75 | ```
76 |
77 | If you're using static credentials, you must manually attach the identity policy to the user/role.
78 |
79 | ---
80 |
81 | ## Related Policies
82 |
83 | ### Trust Policy (Role)
84 | ```json
85 | {
86 | "Principal": {
87 | "Service": "bedrock.amazonaws.com"
88 | },
89 | "Condition": {
90 | "StringEquals": {
91 | "aws:SourceAccount": ""
92 | },
93 | "ArnEquals": {
94 | "aws:SourceArn": "arn:aws:bedrock:::model-invocation-job/*"
95 | }
96 | }
97 | }
98 | ```
99 |
100 | ### Role Policy (S3 Access)
101 | ```json
102 | {
103 | "Action": ["s3:GetObject", "s3:ListBucket", "s3:PutObject"],
104 | "Resource": [
105 | "arn:aws:s3:::ccms-rag-extract-",
106 | "arn:aws:s3:::ccms-rag-extract-/*"
107 | ]
108 | }
109 | ```
110 |
111 | ### Identity Policy (Bedrock Access)
112 | ```json
113 | {
114 | "Action": [
115 | "bedrock:CreateModelInvocationJob",
116 | "bedrock:GetModelInvocationJob",
117 | "bedrock:ListModelInvocationJobs",
118 | "bedrock:StopModelInvocationJob",
119 | "iam:PassRole"
120 | ]
121 | }
122 | ```
123 |
124 | ---
125 |
126 | ## Prerequisites
127 |
128 | - AWS CLI installed
129 | - AWS credentials configured for the profile (via SSO or `aws configure`)
130 | - Sufficient permissions to:
131 | - Create IAM roles and policies
132 | - Create S3 buckets
133 |
--------------------------------------------------------------------------------
/examples/lexical-graph-hybrid-dev/docker/.env:
--------------------------------------------------------------------------------
1 | # PostgreSQL settings
2 | POSTGRES_USER=graphrag
3 | POSTGRES_PASSWORD=graphragpass
4 | POSTGRES_DB=graphrag_db
5 | POSTGRES_HOST=postgres
6 | POSTGRES_PORT=5432
7 |
8 | # FalkorDB settings (adjust as needed)
9 | FALKORDB_HOST=falkordb
10 | FALKORDB_PORT=6379
11 |
12 | # Other potential settings for graphrag_toolkit
13 | EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2
14 | EMBEDDING_DIMENSIONS=1536
--------------------------------------------------------------------------------
/examples/lexical-graph-hybrid-dev/docker/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Build (or rebuild) the Docker images and start every docker-compose service.
3 | echo "Building and starting containers..."
4 | docker compose up -d --build  # -d: detached mode; --build: rebuild images if sources changed
5 | 
6 | echo "Build and startup complete."
7 | 
--------------------------------------------------------------------------------
/examples/lexical-graph-hybrid-dev/docker/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 |   falkordb:
3 |     image: falkordb/falkordb:latest
4 |     container_name: falkordb
5 |     ports:
6 |       - "6379:6379" # Redis/FalkorDB default
7 |       - "3000:3000" # Optional for FalkorDB REST if exposed
8 |     volumes:
9 |       - falkor_data:/data
10 |     networks:
11 |       - graphrag_network
12 | 
13 |   falkordb-browser:
14 |     image: falkordb/falkordb-browser:latest
15 |     container_name: falkordb-browser
16 |     ports:
17 |       - "8092:8080" # Browser UI exposed on localhost:8092 (container listens on 8080)
18 |     environment:
19 |       FALKORDB_BROWSER_REDIS_HOST: falkordb
20 |       FALKORDB_BROWSER_REDIS_PORT: 6379
21 |       FALKORDB_BROWSER_REDIS_USE_TLS: "false"
22 |     depends_on:
23 |       - falkordb
24 |     networks:
25 |       - graphrag_network
26 | 
27 |   postgres:
28 |     image: pgvector/pgvector:0.6.2-pg16
29 |     container_name: pgvector-db
30 |     ports:
31 |       - "5432:5432"
32 |     environment:
33 |       - POSTGRES_USER=${POSTGRES_USER}
34 |       - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
35 |       - POSTGRES_DB=${POSTGRES_DB}
36 |     volumes:
37 |       - pgvector_data:/var/lib/postgresql/data
38 |       - ./postgres/schema.sql:/docker-entrypoint-initdb.d/schema.sql # bootstraps extensions/schema on first init of an empty data volume
39 |     networks:
40 |       - graphrag_network
41 | 
42 | networks:
43 |   graphrag_network:
44 |     driver: bridge
45 | 
46 | volumes:
47 |   falkor_data:
48 |   pgvector_data:
49 | 
--------------------------------------------------------------------------------
/examples/lexical-graph-hybrid-dev/docker/postgres/schema.sql:
--------------------------------------------------------------------------------
1 | -- Enable pgvector extension in public schema (vector similarity search over embeddings)
2 | CREATE EXTENSION IF NOT EXISTS vector SCHEMA public;
3 | 
4 | -- Enable pg_trgm extension in public schema (trigram indexes for fuzzy string matching)
5 | CREATE EXTENSION IF NOT EXISTS pg_trgm SCHEMA public;
6 | 
7 | -- Create dedicated schema for GraphRAG objects
8 | CREATE SCHEMA IF NOT EXISTS graphrag;
--------------------------------------------------------------------------------
/examples/lexical-graph-hybrid-dev/docker/reset.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # WARNING: fully resets the local environment — removes containers, named volumes, and ./extracted. All persistent data is lost.
3 | echo "Stopping and removing containers, volumes, and networks..."
4 | docker compose down -v --remove-orphans  # -v: also delete anonymous volumes; --remove-orphans: drop stray project containers
5 | 
6 | echo "Ensuring containers are removed..."
7 | docker rm -f falkordb falkordb-browser pgvector-db 2>/dev/null  # force-remove named containers if any survived; errors suppressed
8 | 
9 | echo "Removing named volumes..."
10 | docker volume rm -f pgvector_data falkor_data 2>/dev/null  # delete persisted graph and vector data
11 | 
12 | echo "Pruning dangling volumes (if any)..."
13 | docker volume prune -f
14 | 
15 | echo "Clearing extracted directory..."
16 | rm -rf extracted  # local intermediate outputs
17 | 
18 | echo "Rebuilding containers..."
19 | docker compose up -d --force-recreate  # recreate containers even if config is unchanged
20 | 
21 | echo "Reset complete."
--------------------------------------------------------------------------------
/examples/lexical-graph-hybrid-dev/docs/docker_build_shell_script.md:
--------------------------------------------------------------------------------
1 | # build.sh
2 |
3 | This script is used to build and start the containers for a new deployment of the application using Docker Compose. It is intended for **initial deployments** or **redeployments** without resetting volumes, removing data, or clearing persistent state.
4 |
5 | ## Usage
6 |
7 | ```bash
8 | chmod +x build.sh
9 | ./build.sh
10 | ```
11 |
12 | ## What it does
13 |
14 | - Executes `docker compose up -d --build` to:
15 | - Build the Docker images using the `Dockerfile`s defined in the project.
16 | - Start the services in detached mode (`-d`) so the terminal remains available.
17 | - Automatically pull required images if not already present.
18 | - Rebuild containers if source code has changed.
19 |
20 | ## Important Notes
21 |
22 | - This script does **not** remove any existing containers, volumes, or data.
23 | - It is safe to run on top of an existing deployment if you are deploying an updated version of your app.
24 | - Make sure your `.env` and `docker-compose.yml` files are configured properly before running the script.
25 |
26 | ## Related Scripts
27 |
28 | - See [`reset.sh`](reset.md) for a full environment reset, including data deletion and volume pruning.
29 |
--------------------------------------------------------------------------------
/examples/lexical-graph-hybrid-dev/docs/docker_compose_services.md:
--------------------------------------------------------------------------------
1 | # Docker Services Overview for GraphRAG Deployment
2 |
3 | This document describes the services defined in the `docker-compose.yml` file used for setting up a GraphRAG environment. It includes containerized services for FalkorDB, a FalkorDB browser UI, and a PostgreSQL database with the `pgvector` extension enabled.
4 |
5 | ---
6 |
7 | ## Services
8 |
9 | ### 1. `falkordb`
10 | - **Image**: `falkordb/falkordb:latest`
11 | - **Description**: Runs the FalkorDB graph database, which uses Redis as its backend.
12 | - **Ports**:
13 | - `6379`: Redis/FalkorDB main port.
14 | - `3000`: Optional REST API for FalkorDB if exposed.
15 | - **Volume**: Persists graph data using `falkor_data`.
16 | - **Network**: Connected to `graphrag_network`.
17 |
18 | ### 2. `falkordb-browser`
19 | - **Image**: `falkordb/falkordb-browser:latest`
20 | - **Description**: Provides a web-based interface for interacting with FalkorDB.
21 | - **Ports**:
22 | - `8092:8080`: Web UI exposed on localhost:8092.
23 | - **Environment Variables**:
24 | - `FALKORDB_BROWSER_REDIS_HOST`: Hostname of the FalkorDB service.
25 | - `FALKORDB_BROWSER_REDIS_PORT`: Port for Redis.
26 | - `FALKORDB_BROWSER_REDIS_USE_TLS`: TLS setting for secure Redis communication (disabled in this setup).
27 | - **Depends On**: `falkordb`
28 | - **Network**: Connected to `graphrag_network`.
29 |
30 | ### 3. `postgres`
31 | - **Image**: `pgvector/pgvector:0.6.2-pg16`
32 | - **Description**: PostgreSQL 16 image with the `pgvector` extension pre-installed for vector search capabilities.
33 | - **Ports**:
34 | - `5432`: PostgreSQL default port.
35 | - **Environment Variables**:
36 | - `POSTGRES_USER`, `POSTGRES_PASSWORD`, `POSTGRES_DB`: Injected from environment or `.env` file.
37 | - **Volumes**:
38 | - `pgvector_data`: Data persistence.
39 | - `./postgres/schema.sql`: Initializes the database schema.
40 | - **Network**: Connected to `graphrag_network`.
41 |
42 | ---
43 |
44 | ## `schema.sql`
45 |
46 | This SQL file is used to bootstrap the PostgreSQL container with necessary extensions and a custom schema:
47 |
48 | ```sql
49 | -- Enable pgvector extension in public schema
50 | CREATE EXTENSION IF NOT EXISTS vector SCHEMA public;
51 |
52 | -- Enable pg_trgm extension in public schema
53 | CREATE EXTENSION IF NOT EXISTS pg_trgm SCHEMA public;
54 |
55 | -- Create schema for GraphRAG
56 | CREATE SCHEMA IF NOT EXISTS graphrag;
57 | ```
58 |
59 | These extensions are required for vector similarity search and trigram-based indexing within the GraphRAG framework.
60 |
61 | ---
62 |
63 | ## Networks
64 |
65 | - **graphrag_network**: A dedicated Docker bridge network for inter-container communication.
66 |
67 | ---
68 |
69 | ## Volumes
70 |
71 | - `falkor_data`: Persists FalkorDB graph state.
72 | - `pgvector_data`: Persists PostgreSQL data including vector embeddings and schema definitions.
73 |
--------------------------------------------------------------------------------
/examples/lexical-graph-hybrid-dev/docs/docker_reset_shell_script.md:
--------------------------------------------------------------------------------
1 | # Docker Environment Reset Script
2 |
3 | This script is used to **fully reset a local Docker-based development environment** for graphrag-toolkit. The script will reset FalkorDB, PGVector, and optionally other components. It performs cleanup of containers, networks, volumes, and extracted data, followed by a fresh container rebuild.
4 |
5 | ## Filename
6 |
7 | Use `reset.sh` (file is located in examples/lexical-graph-hybrid-dev/docker) and run it with:
8 |
9 | ```bash
10 | bash reset.sh
11 | ```
12 |
13 | > **Note:** Make sure the script is executable (`chmod +x reset.sh`) or invoke it with `bash`.
14 |
15 | ---
16 |
17 | ## Script Breakdown
18 |
19 | ```bash
20 | #!/bin/bash
21 | ```
22 | - Standard shebang to run the script using `bash`.
23 |
24 | ---
25 |
26 | ### 1. Stop and Remove Docker Resources
27 |
28 | ```bash
29 | echo "Stopping and removing containers, volumes, and networks..."
30 | docker compose down -v --remove-orphans
31 | ```
32 |
33 | - **`docker compose down`** stops and removes containers defined in `docker-compose.yml`.
34 | - **`-v`** removes associated anonymous volumes.
35 | - **`--remove-orphans`** removes containers not defined in the current Compose file but part of the same project network.
36 |
37 | ---
38 |
39 | ### 2. Explicitly Remove Named Containers
40 |
41 | ```bash
42 | echo "Ensuring containers are removed..."
43 | docker rm -f falkordb falkordb-browser pgvector-db 2>/dev/null
44 | ```
45 |
46 | - Forcefully removes specific named containers, if they still exist.
47 | - Errors are suppressed using `2>/dev/null`.
48 |
49 | ---
50 |
51 | ### 3. Remove Named Volumes
52 |
53 | ```bash
54 | echo "Removing named volumes..."
55 | docker volume rm -f pgvector_data falkor_data 2>/dev/null
56 | ```
57 |
58 | - Deletes project-specific Docker volumes that might persist after shutdown.
59 |
60 | ---
61 |
62 | ### 4. Prune Dangling Volumes
63 |
64 | ```bash
65 | echo "Pruning dangling volumes (if any)..."
66 | docker volume prune -f
67 | ```
68 |
69 | - Removes **dangling (unused)** Docker volumes that may be left behind.
70 |
71 | ---
72 |
73 | ### 5. Delete Local Directories
74 |
75 | ```bash
76 | echo "Clearing extracted directory..."
77 | rm -rf extracted
78 | ```
79 |
80 | - Cleans up the local `./extracted` directory used to store intermediate files (like parsed documents, indexes, or temp outputs).
81 |
82 | ---
83 |
84 | ### 6. Rebuild and Start Containers
85 |
86 | ```bash
87 | echo "Rebuilding containers..."
88 | docker compose up -d --force-recreate
89 | ```
90 |
91 | - **`-d`** runs containers in detached mode.
92 | - **`--force-recreate`** ensures all containers are recreated even if configuration hasn't changed.
93 |
94 | ---
95 |
96 | ### 7. Final Message
97 |
98 | ```bash
99 | echo "Reset complete."
100 | ```
101 |
102 | - Indicates successful completion of the reset process.
103 |
104 | ---
105 |
106 | ## Use Cases
107 |
108 | - Full environment reset between development sessions
109 | - Clean-up after corrupt container or volume states
110 | - Ensures a consistent baseline environment for troubleshooting or testing
111 |
112 | ---
113 |
114 | ## Warnings
115 |
116 | - **Data Loss**: This script removes all persistent data and should not be used on production environments.
117 | - **Rebuild Time**: Fresh container creation may take time depending on image sizes and network speed.
118 |
119 |
--------------------------------------------------------------------------------
/examples/lexical-graph-hybrid-dev/notebooks/.env:
--------------------------------------------------------------------------------
1 | AWS_REGION="" #Populate with region
2 | AWS_PROFILE="" #Populate with optional AWS_PROFILE
3 | AWS_ACCOUNT="" #Populate with AWS Account Number
4 | DYNAMODB_NAME="" # Populate from the output from setup-bedrock-batch.sh
5 | S3_BUCKET_EXTRACK_BUILD_BATCH_NAME="" #Populate from the output from setup-bedrock-batch.sh
6 | S3_BATCH_BUCKET_NAME="" # Populate from the output from setup-bedrock-batch.sh
7 | EMBEDDINGS_MODEL="cohere.embed-english-v3"
8 | EMBEDDINGS_DIMENSIONS=1024
9 | EXTRACTION_MODEL="us.anthropic.claude-3-5-sonnet-20240620-v1:0"
10 | RESPONSE_MODEL="us.anthropic.claude-3-5-sonnet-20240620-v1:0"
11 | EXTRACTION_NUM_WORKERS=2
12 | EXTRACTION_NUM_THREADS_PER_WORKER=4
13 | EXTRACTION_BATCH_SIZE=100
14 | BUILD_NUM_WORKERS=2
15 | BUILD_BATCH_SIZE=4
16 | BUILD_BATCH_WRITE_SIZE=25
17 | BATCH_WRITES_ENABLED=True
18 | BATCH_ROLE_NAME="" #Populate from the output from setup-bedrock-batch.sh
19 | SOURCE_DIR="best-practices"
20 | BATCH_PREFIX="batch" #Batch S3 Prefix
21 | EXTRACT_BUILD_PREFIX="extract-build" #Extract S3 Prefix
22 | DEFAULT_INCLUDE_DOMAIN_LABELS=False
23 | ENABLE_CACHE=False
24 | VECTOR_STORE="postgresql://graphrag:graphragpass@localhost:5432/graphrag_db" #Docker defaults
25 | GRAPH_STORE="falkordb://localhost:6379" #Docker defaults
26 | MAX_BATCH_SIZE=25000
27 | MAX_NUM_CONCURRENT_BATCHES=3
28 | S3_ENCRYPTION_KEY_ID=""
29 | SUBNET_IDS=""
30 | SECURITY_GROUP_IDS=""
--------------------------------------------------------------------------------
/examples/lexical-graph-hybrid-dev/notebooks/03-Cloud-Build.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "434fea4e",
6 | "metadata": {},
7 |    "source": "# 03 - Cloud Build"
8 | },
9 | {
10 | "cell_type": "markdown",
11 | "id": "a9fb5cff",
12 | "metadata": {},
13 | "source": [
14 | "## Setup\n",
15 | "\n",
16 | "If you haven't already, install the toolkit and dependencies using the [Setup](./00-Setup.ipynb) notebook."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "40c3f5e1",
22 | "metadata": {},
23 | "source": [
24 | "## Build"
25 | ]
26 | },
27 | {
28 | "metadata": {},
29 | "cell_type": "code",
30 | "source": [
31 | "%reload_ext dotenv\n",
32 | "%dotenv\n",
33 | "\n",
34 | "import os\n",
35 | "\n",
36 | "for key, value in os.environ.items():\n",
37 | " print(f\"{key}={value}\")"
38 | ],
39 | "id": "6fee75a08bc1a7e9",
40 | "outputs": [],
41 | "execution_count": null
42 | },
43 | {
44 | "metadata": {},
45 | "cell_type": "code",
46 | "source": [
47 | "%reload_ext dotenv\n",
48 | "%dotenv\n",
49 | "\n",
50 | "import os\n",
51 | "\n",
52 | "from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config\n",
53 | "from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory\n",
54 | "from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory\n",
55 | "from graphrag_toolkit.lexical_graph.indexing.load import S3BasedDocs\n",
56 | "from graphrag_toolkit.lexical_graph.indexing.build import Checkpoint\n",
57 | "\n",
58 | "set_logging_config('INFO')\n",
59 | "\n",
60 | "docs = S3BasedDocs(\n",
61 | " region='us-east-1',\n",
62 | " bucket_name=os.environ['LOCAL_EXTRACT_S3'],\n",
63 | " key_prefix='extract-build',\n",
64 | " collection_id='best-practices'\n",
65 | ")\n",
66 | "checkpoint = Checkpoint('s3-build-checkpoint')\n",
67 | "\n",
68 | "graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])\n",
69 | "vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])\n",
70 | "\n",
71 | "graph_index = LexicalGraphIndex(\n",
72 | " graph_store,\n",
73 | " vector_store\n",
74 | ")\n",
75 | "\n",
76 | "graph_index.build(docs, checkpoint=checkpoint, show_progress=True)\n",
77 | "\n",
78 | "print('Build complete')"
79 | ],
80 | "id": "eaa952bf",
81 | "outputs": [],
82 | "execution_count": null
83 | }
84 | ],
85 | "metadata": {
86 | "kernelspec": {
87 | "display_name": "Python 3",
88 | "language": "python",
89 | "name": "python3"
90 | },
91 | "language_info": {
92 | "codemirror_mode": {
93 | "name": "ipython",
94 | "version": 3
95 | },
96 | "file_extension": ".py",
97 | "mimetype": "text/x-python",
98 | "name": "python",
99 | "nbconvert_exporter": "python",
100 | "pygments_lexer": "ipython3",
101 | "version": "3.10.8"
102 | }
103 | },
104 | "nbformat": 4,
105 | "nbformat_minor": 5
106 | }
107 |
--------------------------------------------------------------------------------
/examples/lexical-graph-hybrid-dev/notebooks/best-practices/Retrieval-Augmented-Generation-Options.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/awslabs/graphrag-toolkit/e6967b5d2589bfd39bc72be2e4da0c9d1a30e865/examples/lexical-graph-hybrid-dev/notebooks/best-practices/Retrieval-Augmented-Generation-Options.pdf
--------------------------------------------------------------------------------
/examples/lexical-graph-local-dev/README.md:
--------------------------------------------------------------------------------
1 | ## Lexical Graph Examples
2 |
3 | ### Notebooks
4 |
5 | - [**00-Setup**](./notebooks/00-Setup.ipynb) – Installs the lexical-graph package and additional dependencies.
6 | - [**01-Combined Extract and Build**](./notebooks/01-Combined-Extract-and-Build.ipynb) – An example of [performing continuous ingest](../../docs/lexical-graph/indexing.md#continous-ingest) using the `LexicalGraphIndex.extract_and_build()` method.
7 | - [**04-Querying**](./notebooks/04-Querying.ipynb) – Examples of [querying the graph](../../docs/lexical-graph/querying.md) using the `LexicalGraphQueryEngine` with `SemanticGuidedRetriever`.
8 |
9 | ## Environment Setup
10 |
11 | The notebooks rely on `GRAPH_STORE` and `VECTOR_STORE` environment variables being properly set. These variables define where and how the graph store and vector store connect.
12 |
13 | To set up your local environment:
14 |
15 | 1. Clone the repository and navigate to your working directory.
16 | 2. Run:
17 |
18 | ```bash
19 | ./build.sh
20 | ```
21 |
22 | This will start and configure the following services in Docker:
23 |
24 | - **FalkorDB** for graph storage
25 | - **FalkorDB Browser** (accessible on `localhost:8092`) for interactive graph exploration
26 | - **PostgreSQL with pgvector** for vector embeddings
27 |
28 | The Postgres container auto-applies the following schema on initialization via `./postgres/schema.sql`:
29 |
30 | ```sql
31 | -- Enable pgvector extension in public schema
32 | CREATE EXTENSION IF NOT EXISTS vector SCHEMA public;
33 |
34 | -- Enable pg_trgm extension in public schema
35 | CREATE EXTENSION IF NOT EXISTS pg_trgm SCHEMA public;
36 |
37 | -- Create schema for GraphRAG
38 | CREATE SCHEMA IF NOT EXISTS graphrag;
39 | ```
40 |
41 | These extensions are necessary for similarity search and fuzzy matching in GraphRAG.
42 |
43 | ## AWS Foundation Model Access (Optional)
44 |
45 | If you intend to run the CloudFormation templates instead of using Docker:
46 |
47 | - Ensure your AWS account has access to the following Amazon Bedrock foundation models:
48 | - `anthropic.claude-3-5-sonnet-20240620-v1:0`
49 | - `cohere.embed-english-v3`
50 |
51 | Enable model access via the [Bedrock model access console](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html).
52 |
53 | You must deploy to an AWS region where these models are available.
54 |
55 | ## Optional: CloudFormation Stacks
56 |
57 | If you want to deploy infrastructure in AWS, CloudFormation templates are available:
58 |
59 | - `graphrag-toolkit-neptune-db-opensearch-serverless.json`
60 | - `graphrag-toolkit-neptune-db-aurora-postgres.json`
61 |
62 | These templates create:
63 |
64 | - A Neptune serverless DB cluster
65 | - Either OpenSearch Serverless or Aurora PostgreSQL
66 | - A SageMaker notebook instance
67 | - IAM roles with optional policies via the `IamPolicyArn` parameter
68 | - An optional `ExampleNotebooksURL` parameter to auto-load the examples
69 |
70 | > ⚠️ AWS charges apply for cloud resources.
71 |
72 | ---
73 |
74 | Use this guide if you prefer to develop and test locally before migrating to AWS-based deployments.
--------------------------------------------------------------------------------
/examples/lexical-graph-local-dev/docker/.env:
--------------------------------------------------------------------------------
1 | # PostgreSQL settings
2 | POSTGRES_USER=graphrag
3 | POSTGRES_PASSWORD=graphragpass
4 | POSTGRES_DB=graphrag_db
5 | POSTGRES_HOST=postgres
6 | POSTGRES_PORT=5432
7 |
8 | # FalkorDB settings (adjust as needed)
9 | FALKORDB_HOST=falkordb
10 | FALKORDB_PORT=6379
11 |
12 | # Other potential settings for graphrag_toolkit
13 | EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2
14 | EMBEDDING_DIMENSIONS=1536
--------------------------------------------------------------------------------
/examples/lexical-graph-local-dev/docker/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Build (or rebuild) the Docker images and start every docker-compose service.
3 | echo "Building and starting containers..."
4 | docker compose up -d --build  # -d: detached mode; --build: rebuild images if sources changed
5 | 
6 | echo "Build and startup complete."
7 | 
--------------------------------------------------------------------------------
/examples/lexical-graph-local-dev/docker/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 |   falkordb:
3 |     image: falkordb/falkordb:latest
4 |     container_name: falkordb
5 |     ports:
6 |       - "6379:6379" # Redis/FalkorDB default
7 |       - "3000:3000" # Optional for FalkorDB REST if exposed
8 |     volumes:
9 |       - falkor_data:/data
10 |     networks:
11 |       - graphrag_network
12 | 
13 |   falkordb-browser:
14 |     image: falkordb/falkordb-browser:latest
15 |     container_name: falkordb-browser
16 |     ports:
17 |       - "8092:8080" # Browser UI exposed on localhost:8092 (container listens on 8080)
18 |     environment:
19 |       FALKORDB_BROWSER_REDIS_HOST: falkordb
20 |       FALKORDB_BROWSER_REDIS_PORT: 6379
21 |       FALKORDB_BROWSER_REDIS_USE_TLS: "false"
22 |     depends_on:
23 |       - falkordb
24 |     networks:
25 |       - graphrag_network
26 | 
27 |   postgres:
28 |     image: pgvector/pgvector:0.6.2-pg16
29 |     container_name: pgvector-db
30 |     ports:
31 |       - "5432:5432"
32 |     environment:
33 |       - POSTGRES_USER=${POSTGRES_USER}
34 |       - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
35 |       - POSTGRES_DB=${POSTGRES_DB}
36 |     volumes:
37 |       - pgvector_data:/var/lib/postgresql/data
38 |       - ./postgres/schema.sql:/docker-entrypoint-initdb.d/schema.sql # bootstraps extensions/schema on first init of an empty data volume
39 |     networks:
40 |       - graphrag_network
41 | 
42 | networks:
43 |   graphrag_network:
44 |     driver: bridge
45 | 
46 | volumes:
47 |   falkor_data:
48 |   pgvector_data:
49 | 
--------------------------------------------------------------------------------
/examples/lexical-graph-local-dev/docker/postgres/schema.sql:
--------------------------------------------------------------------------------
1 | -- Enable pgvector extension in public schema (vector similarity search over embeddings)
2 | CREATE EXTENSION IF NOT EXISTS vector SCHEMA public;
3 | 
4 | -- Enable pg_trgm extension in public schema (trigram indexes for fuzzy string matching)
5 | CREATE EXTENSION IF NOT EXISTS pg_trgm SCHEMA public;
6 | 
7 | -- Create dedicated schema for GraphRAG objects
8 | CREATE SCHEMA IF NOT EXISTS graphrag;
--------------------------------------------------------------------------------
/examples/lexical-graph-local-dev/docker/reset.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # WARNING: fully resets the local environment — removes containers, named volumes, and ./extracted. All persistent data is lost.
3 | echo "Stopping and removing containers, volumes, and networks..."
4 | docker compose down -v --remove-orphans  # -v: also delete anonymous volumes; --remove-orphans: drop stray project containers
5 | 
6 | echo "Ensuring containers are removed..."
7 | docker rm -f falkordb falkordb-browser pgvector-db 2>/dev/null  # force-remove named containers if any survived; errors suppressed
8 | 
9 | echo "Removing named volumes..."
10 | docker volume rm -f pgvector_data falkor_data 2>/dev/null  # delete persisted graph and vector data
11 | 
12 | echo "Pruning dangling volumes (if any)..."
13 | docker volume prune -f
14 | 
15 | echo "Clearing extracted directory..."
16 | rm -rf extracted  # local intermediate outputs
17 | 
18 | echo "Rebuilding containers..."
19 | docker compose up -d --force-recreate  # recreate containers even if config is unchanged
20 | 
21 | echo "Reset complete."
--------------------------------------------------------------------------------
/examples/lexical-graph-local-dev/docs/docker_build.md:
--------------------------------------------------------------------------------
1 | # build.sh
2 |
3 | This script is used to build and start the containers for a new deployment of the application using Docker Compose. It is intended for **initial deployments** or **redeployments** without resetting volumes, removing data, or clearing persistent state.
4 |
5 | ## Usage
6 |
7 | ```bash
8 | chmod +x build.sh
9 | ./build.sh
10 | ```
11 |
12 | ## What it does
13 |
14 | - Executes `docker compose up -d --build` to:
15 | - Build the Docker images using the `Dockerfile`s defined in the project.
16 | - Start the services in detached mode (`-d`) so the terminal remains available.
17 | - Automatically pull required images if not already present.
18 | - Rebuild containers if source code has changed.
19 |
20 | ## Important Notes
21 |
22 | - This script does **not** remove any existing containers, volumes, or data.
23 | - It is safe to run on top of an existing deployment if you are deploying an updated version of your app.
24 | - Make sure your `.env` and `docker-compose.yml` files are configured properly before running the script.
25 |
26 | ## Related Scripts
27 |
28 | - See [`reset.sh`](reset.md) for a full environment reset, including data deletion and volume pruning.
29 |
--------------------------------------------------------------------------------
/examples/lexical-graph-local-dev/docs/docker_reset_script.md:
--------------------------------------------------------------------------------
1 | # Docker Environment Reset Script
2 |
3 | This script is used to **fully reset a local Docker-based development environment** for graphrag-toolkit. The script will reset FalkorDB, PGVector, and optionally other components. It performs cleanup of containers, networks, volumes, and extracted data, followed by a fresh container rebuild.
4 |
5 | ## Filename
6 |
7 | Use `reset.sh` (file is located in lexical-graph-contrib/docker) and run it with:
8 |
9 | ```bash
10 | bash reset.sh
11 | ```
12 |
13 | > **Note:** Make sure the script is executable (`chmod +x reset.sh`) or invoke it with `bash`.
14 |
15 | ---
16 |
17 | ## Script Breakdown
18 |
19 | ```bash
20 | #!/bin/bash
21 | ```
22 | - Standard shebang to run the script using `bash`.
23 |
24 | ---
25 |
26 | ### 1. Stop and Remove Docker Resources
27 |
28 | ```bash
29 | echo "Stopping and removing containers, volumes, and networks..."
30 | docker compose down -v --remove-orphans
31 | ```
32 |
33 | - **`docker compose down`** stops and removes containers defined in `docker-compose.yml`.
34 | - **`-v`** removes associated anonymous volumes.
35 | - **`--remove-orphans`** removes containers not defined in the current Compose file but part of the same project network.
36 |
37 | ---
38 |
39 | ### 2. Explicitly Remove Named Containers
40 |
41 | ```bash
42 | echo "Ensuring containers are removed..."
43 | docker rm -f falkordb falkordb-browser pgvector-db 2>/dev/null
44 | ```
45 |
46 | - Forcefully removes specific named containers, if they still exist.
47 | - Errors are suppressed using `2>/dev/null`.
48 |
49 | ---
50 |
51 | ### 3. Remove Named Volumes
52 |
53 | ```bash
54 | echo "Removing named volumes..."
55 | docker volume rm -f pgvector_data falkor_data 2>/dev/null
56 | ```
57 |
58 | - Deletes project-specific Docker volumes that might persist after shutdown.
59 |
60 | ---
61 |
62 | ### 4. Prune Dangling Volumes
63 |
64 | ```bash
65 | echo "Pruning dangling volumes (if any)..."
66 | docker volume prune -f
67 | ```
68 |
69 | - Removes **dangling (unused)** Docker volumes that may be left behind.
70 |
71 | ---
72 |
73 | ### 5. Delete Local Directories
74 |
75 | ```bash
76 | echo "Clearing extracted directory..."
77 | rm -rf extracted
78 | ```
79 |
80 | - Cleans up the local `./extracted` directory used to store intermediate files (like parsed documents, indexes, or temp outputs).
81 |
82 | ---
83 |
84 | ### 6. Rebuild and Start Containers
85 |
86 | ```bash
87 | echo "Rebuilding containers..."
88 | docker compose up -d --force-recreate
89 | ```
90 |
91 | - **`-d`** runs containers in detached mode.
92 | - **`--force-recreate`** ensures all containers are recreated even if configuration hasn't changed.
93 |
94 | ---
95 |
96 | ### 7. Final Message
97 |
98 | ```bash
99 | echo "Reset complete."
100 | ```
101 |
102 | - Indicates successful completion of the reset process.
103 |
104 | ---
105 |
106 | ## Use Cases
107 |
108 | - Full environment reset between development sessions
109 | - Clean-up after corrupt container or volume states
110 | - Ensures a consistent baseline environment for troubleshooting or testing
111 |
112 | ---
113 |
114 | ## Warnings
115 |
116 | - **Data Loss**: This script removes all persistent data and should not be used on production environments.
117 | - **Rebuild Time**: Fresh container creation may take time depending on image sizes and network speed.
118 |
119 |
--------------------------------------------------------------------------------
/examples/lexical-graph-local-dev/docs/docker_services.md:
--------------------------------------------------------------------------------
1 | # Docker Services Overview for GraphRAG Deployment
2 |
3 | This document describes the services defined in the `docker-compose.yml` file used for setting up a GraphRAG environment. It includes containerized services for FalkorDB, a FalkorDB browser UI, and a PostgreSQL database with the `pgvector` extension enabled.
4 |
5 | ---
6 |
7 | ## Services
8 |
9 | ### 1. `falkordb`
10 | - **Image**: `falkordb/falkordb:latest`
11 | - **Description**: Runs the FalkorDB graph database, which uses Redis as its backend.
12 | - **Ports**:
13 | - `6379`: Redis/FalkorDB main port.
14 | - `3000`: Optional REST API for FalkorDB if exposed.
15 | - **Volume**: Persists graph data using `falkor_data`.
16 | - **Network**: Connected to `graphrag_network`.
17 |
18 | ### 2. `falkordb-browser`
19 | - **Image**: `falkordb/falkordb-browser:latest`
20 | - **Description**: Provides a web-based interface for interacting with FalkorDB.
21 | - **Ports**:
22 | - `8092:8080`: Web UI exposed on localhost:8092.
23 | - **Environment Variables**:
24 | - `FALKORDB_BROWSER_REDIS_HOST`: Hostname of the FalkorDB service.
25 | - `FALKORDB_BROWSER_REDIS_PORT`: Port for Redis.
26 | - `FALKORDB_BROWSER_REDIS_USE_TLS`: TLS setting for secure Redis communication (disabled in this setup).
27 | - **Depends On**: `falkordb`
28 | - **Network**: Connected to `graphrag_network`.
29 |
30 | ### 3. `postgres`
31 | - **Image**: `pgvector/pgvector:0.6.2-pg16`
32 | - **Description**: PostgreSQL 16 image with the `pgvector` extension pre-installed for vector search capabilities.
33 | - **Ports**:
34 | - `5432`: PostgreSQL default port.
35 | - **Environment Variables**:
36 | - `POSTGRES_USER`, `POSTGRES_PASSWORD`, `POSTGRES_DB`: Injected from environment or `.env` file.
37 | - **Volumes**:
38 | - `pgvector_data`: Data persistence.
39 | - `./postgres/schema.sql`: Initializes the database schema.
40 | - **Network**: Connected to `graphrag_network`.
41 |
42 | ---
43 |
44 | ## `schema.sql`
45 |
46 | This SQL file is used to bootstrap the PostgreSQL container with necessary extensions and a custom schema:
47 |
48 | ```sql
49 | -- Enable pgvector extension in public schema
50 | CREATE EXTENSION IF NOT EXISTS vector SCHEMA public;
51 |
52 | -- Enable pg_trgm extension in public schema
53 | CREATE EXTENSION IF NOT EXISTS pg_trgm SCHEMA public;
54 |
55 | -- Create schema for GraphRAG
56 | CREATE SCHEMA IF NOT EXISTS graphrag;
57 | ```
58 |
59 | These extensions are required for vector similarity search and trigram-based indexing within the GraphRAG framework.
60 |
61 | ---
62 |
63 | ## Networks
64 |
65 | - **graphrag_network**: A dedicated Docker bridge network for inter-container communication.
66 |
67 | ---
68 |
69 | ## Volumes
70 |
71 | - `falkor_data`: Persists FalkorDB graph state.
72 | - `pgvector_data`: Persists PostgreSQL data including vector embeddings and schema definitions.
73 |
--------------------------------------------------------------------------------
/examples/lexical-graph-local-dev/notebooks/.env:
--------------------------------------------------------------------------------
1 | AWS_REGION="us-east-1"
2 | AWS_PROFILE="padmin"
3 | EMBEDDINGS_MODEL="cohere.embed-english-v3"
4 | EMBEDDINGS_DIMENSIONS=1024
5 | EXTRACTION_MODEL="us.anthropic.claude-3-5-sonnet-20240620-v1:0"
6 | RESPONSE_MODEL="us.anthropic.claude-3-5-sonnet-20240620-v1:0"
7 | EXTRACTION_NUM_WORKERS=2
8 | EXTRACTION_NUM_THREADS_PER_WORKER=4
9 | EXTRACTION_BATCH_SIZE=4
10 | BUILD_NUM_WORKERS=2
11 | BUILD_BATCH_SIZE=4
12 | BUILD_BATCH_WRITE_SIZE=25
13 | BATCH_WRITES_ENABLED=True
14 | DEFAULT_INCLUDE_DOMAIN_LABELS=False
15 | ENABLE_CACHE=False
16 | VECTOR_STORE="postgresql://graphrag:graphragpass@localhost:5432/graphrag_db"
17 | GRAPH_STORE="falkordb://localhost:6379"
--------------------------------------------------------------------------------
/examples/lexical-graph-local-dev/notebooks/01-Combined-Extract-and-Build.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "3eb1535a",
6 | "metadata": {},
7 | "source": "# 01 - Combined Extract and Build"
8 | },
9 | {
10 | "cell_type": "markdown",
11 | "id": "e3f529c1",
12 | "metadata": {},
13 | "source": [
14 | "## Setup\n",
15 | "\n",
16 | "If you haven't already, install the toolkit and dependencies using the [Setup](./00-Setup.ipynb) notebook."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "0aa94cc9",
22 | "metadata": {},
23 | "source": [
24 |     "## Continuous ingest\n",
25 | "\n",
26 |     "See [Continuous ingest](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/lexical-graph/indexing.md#continous-ingest)."
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "id": "7ec68542",
32 | "metadata": {},
33 | "source": [
34 | "%reload_ext dotenv\n",
35 | "%dotenv\n",
36 | "\n",
37 | "import os\n",
38 | "\n",
39 | "from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config\n",
40 | "from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory\n",
41 | "from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory\n",
42 | "from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory\n",
43 | "\n",
44 | "GraphStoreFactory.register(FalkorDBGraphStoreFactory)\n",
45 | "\n",
46 | "from llama_index.readers.web import SimpleWebPageReader\n",
47 | "\n",
48 | "graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])\n",
49 | "vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])\n",
50 | "\n",
51 | "graph_index = LexicalGraphIndex(\n",
52 | " graph_store, \n",
53 | " vector_store\n",
54 | ")\n",
55 | "\n",
56 | "doc_urls = [\n",
57 | " 'https://docs.aws.amazon.com/neptune/latest/userguide/intro.html',\n",
58 | " 'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/what-is-neptune-analytics.html',\n",
59 | " 'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-features.html',\n",
60 | " 'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-vs-neptune-database.html'\n",
61 | "]\n",
62 | "\n",
63 | "docs = SimpleWebPageReader(\n",
64 | " html_to_text=True,\n",
65 | " metadata_fn=lambda url:{'url': url}\n",
66 | ").load_data(doc_urls)\n",
67 | "\n",
68 | "graph_index.extract_and_build(docs, show_progress=True)\n",
69 | "\n",
70 | "print('Complete')"
71 | ],
72 | "outputs": [],
73 | "execution_count": null
74 | },
75 | {
76 | "metadata": {},
77 | "cell_type": "code",
78 | "source": "",
79 | "id": "5e2b536ce6540fb5",
80 | "outputs": [],
81 | "execution_count": null
82 | }
83 | ],
84 | "metadata": {
85 | "kernelspec": {
86 | "display_name": "Python 3",
87 | "language": "python",
88 | "name": "python3"
89 | },
90 | "language_info": {
91 | "codemirror_mode": {
92 | "name": "ipython",
93 | "version": 3
94 | },
95 | "file_extension": ".py",
96 | "mimetype": "text/x-python",
97 | "name": "python",
98 | "nbconvert_exporter": "python",
99 | "pygments_lexer": "ipython3",
100 | "version": "3.10.8"
101 | }
102 | },
103 | "nbformat": 4,
104 | "nbformat_minor": 5
105 | }
106 |
--------------------------------------------------------------------------------
/examples/lexical-graph-local-dev/notebooks/prompts/system_prompt.txt:
--------------------------------------------------------------------------------
1 | You are a question answering agent. I will provide you with a set of search results. The user will provide you with a question. Your job is to answer the user's question using only information from the search results. If the search results are empty, do not attempt to answer the question.
2 |
3 |
4 | {search_results}
5 |
6 |
7 | ## Instructions
8 | - Think carefully about the question, the source and relevancy of each of the search results, and the logical connections between different search results before answering.
9 | - Ensure you answer each part of the question.
10 | - Reference information from the search results in your answer by adding the 'source' in square brackets at the end of relevant sentences.
11 | - Do NOT directly quote the search results in your answer.
12 | - If the question is a yes/no question, start with either 'Yes' or 'No'.
13 | - If the search results are empty, do not attempt to answer the question.
14 |
15 | Based on the search results, answer the following question as concisely as possible:
16 |
--------------------------------------------------------------------------------
/examples/lexical-graph-local-dev/notebooks/prompts/user_prompt.txt:
--------------------------------------------------------------------------------
1 |
2 | {query}
3 |
4 |
5 | Please answer the question above using the information available. Respond only in the following JSON format:
6 |
7 | {
8 | "answer": "",
9 | "supporting_facts": ["fact 1", "fact 2", "..."],
10 | "confidence": ""
11 | }
12 |
--------------------------------------------------------------------------------
/examples/lexical-graph/notebooks/01-Combined-Extract-and-Build.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "3eb1535a",
6 | "metadata": {},
7 | "source": [
8 | "# 01 - Combined Extract and Build"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "e3f529c1",
14 | "metadata": {},
15 | "source": [
16 | "## Setup\n",
17 | "\n",
18 | "If you haven't already, install the toolkit and dependencies using the [Setup](./00-Setup.ipynb) notebook."
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "id": "0aa94cc9",
24 | "metadata": {},
25 | "source": [
26 |     "## Continuous ingest\n",
27 | "\n",
28 |     "See [Continuous ingest](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/lexical-graph/indexing.md#continous-ingest)."
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "id": "7ec68542",
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "%reload_ext dotenv\n",
39 | "%dotenv\n",
40 | "\n",
41 | "import os\n",
42 | "\n",
43 | "from graphrag_toolkit.lexical_graph import LexicalGraphIndex, GraphRAGConfig, set_logging_config\n",
44 | "from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory\n",
45 | "from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory\n",
46 | "\n",
47 | "from llama_index.readers.web import SimpleWebPageReader\n",
48 | "\n",
49 | "set_logging_config('INFO')\n",
50 | "\n",
51 | "graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])\n",
52 | "vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])\n",
53 | "\n",
54 | "graph_index = LexicalGraphIndex(\n",
55 | " graph_store, \n",
56 | " vector_store\n",
57 | ")\n",
58 | "\n",
59 | "doc_urls = [\n",
60 | " 'https://docs.aws.amazon.com/neptune/latest/userguide/intro.html',\n",
61 | " 'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/what-is-neptune-analytics.html',\n",
62 | " 'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-features.html',\n",
63 | " 'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-vs-neptune-database.html'\n",
64 | "]\n",
65 | "\n",
66 | "docs = SimpleWebPageReader(\n",
67 | " html_to_text=True,\n",
68 | " metadata_fn=lambda url:{'url': url}\n",
69 | ").load_data(doc_urls)\n",
70 | "\n",
71 | "graph_index.extract_and_build(docs, show_progress=True)\n",
72 | "\n",
73 | "print('Complete')"
74 | ]
75 | }
76 | ],
77 | "metadata": {
78 | "kernelspec": {
79 | "display_name": "Python 3",
80 | "language": "python",
81 | "name": "python3"
82 | },
83 | "language_info": {
84 | "codemirror_mode": {
85 | "name": "ipython",
86 | "version": 3
87 | },
88 | "file_extension": ".py",
89 | "mimetype": "text/x-python",
90 | "name": "python",
91 | "nbconvert_exporter": "python",
92 | "pygments_lexer": "ipython3",
93 | "version": "3.10.8"
94 | }
95 | },
96 | "nbformat": 4,
97 | "nbformat_minor": 5
98 | }
99 |
--------------------------------------------------------------------------------
/images/byokg_rag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/awslabs/graphrag-toolkit/e6967b5d2589bfd39bc72be2e4da0c9d1a30e865/images/byokg_rag.png
--------------------------------------------------------------------------------
/images/extract-and-build.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/awslabs/graphrag-toolkit/e6967b5d2589bfd39bc72be2e4da0c9d1a30e865/images/extract-and-build.png
--------------------------------------------------------------------------------
/images/hybrid-extract-and-build.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/awslabs/graphrag-toolkit/e6967b5d2589bfd39bc72be2e4da0c9d1a30e865/images/hybrid-extract-and-build.png
--------------------------------------------------------------------------------
/images/lexical-graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/awslabs/graphrag-toolkit/e6967b5d2589bfd39bc72be2e4da0c9d1a30e865/images/lexical-graph.png
--------------------------------------------------------------------------------
/images/local-extract-and-build.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/awslabs/graphrag-toolkit/e6967b5d2589bfd39bc72be2e4da0c9d1a30e865/images/local-extract-and-build.png
--------------------------------------------------------------------------------
/images/question-answering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/awslabs/graphrag-toolkit/e6967b5d2589bfd39bc72be2e4da0c9d1a30e865/images/question-answering.png
--------------------------------------------------------------------------------
/lexical-graph-contrib/falkordb/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["pbr>=6.1.1"]
3 | build-backend = "pbr.build"
4 |
5 | [project]
6 | name = "graphrag-toolkit-lexical-graph-falkordb"
7 | version = "1.0.1"
8 | description = "FalkorDB support for the AWS GraphRAG Toolkit, lexical graph"
9 | readme = "README.md"
10 | requires-python = ">=3.10"
11 | dynamic = ["dependencies"]
12 | license = "Apache-2.0"
13 |
14 | [tool.setuptools.dynamic]
15 | dependencies = {file = ["src/requirements.txt"]}
16 |
--------------------------------------------------------------------------------
/lexical-graph-contrib/falkordb/src/graphrag_toolkit/lexical_graph/storage/graph/falkordb/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .falkordb_graph_store_factory import FalkorDBGraphStoreFactory
5 | from .falkordb_graph_store import FalkorDBDatabaseClient
6 |
--------------------------------------------------------------------------------
/lexical-graph-contrib/falkordb/src/graphrag_toolkit/lexical_graph/storage/graph/falkordb/falkordb_graph_store_factory.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import List, Union
3 | from falkordb.node import Node
4 | from falkordb.edge import Edge
5 | from falkordb.path import Path
6 |
7 | from graphrag_toolkit.lexical_graph.storage.graph import GraphStoreFactoryMethod, GraphStore, get_log_formatting
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 | FALKORDB = 'falkordb://'
12 | FALKORDB_DNS = 'falkordb.com'
13 | DEFAULT_DATABASE_NAME = 'graphrag'
14 | QUERY_RESULT_TYPE = Union[List[List[Node]], List[List[List[Path]]], List[List[Edge]]]
15 |
class FalkorDBGraphStoreFactory(GraphStoreFactoryMethod):
    """Factory method for creating FalkorDB-backed graph stores.

    Recognizes connection strings that either start with the 'falkordb://'
    scheme or end with the 'falkordb.com' DNS suffix.
    """

    def try_create(self, graph_info:str, **kwargs) -> GraphStore:
        """Create a FalkorDB graph store client if graph_info identifies a FalkorDB endpoint.

        Args:
            graph_info: Connection string. Either 'falkordb://<endpoint>' or a
                hostname ending in 'falkordb.com'.
            **kwargs: Additional configuration passed through to the client.

        Returns:
            GraphStore: A FalkorDBDatabaseClient if graph_info matches a
            FalkorDB endpoint, otherwise None.
        """
        endpoint_url = None
        if graph_info.startswith(FALKORDB):
            endpoint_url = graph_info[len(FALKORDB):]
        elif graph_info.endswith(FALKORDB_DNS):
            endpoint_url = graph_info

        if not endpoint_url:
            return None

        # Deferred import so the falkordb dependency is only required when
        # a FalkorDB connection string is actually used. An ImportError
        # propagates to the caller unchanged (the previous
        # `except ImportError as e: raise e` was a no-op).
        from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBDatabaseClient
        logger.debug(f'Opening FalkorDB database [endpoint: {endpoint_url}]')
        return FalkorDBDatabaseClient(
            endpoint_url=endpoint_url,
            log_formatting=get_log_formatting(kwargs),
            **kwargs
        )
--------------------------------------------------------------------------------
/lexical-graph-contrib/falkordb/src/requirements.txt:
--------------------------------------------------------------------------------
1 | FalkorDB
2 | redis
--------------------------------------------------------------------------------
/lexical-graph-contrib/falkordb/src/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = graphrag-toolkit-lexical-graph-falkordb
--------------------------------------------------------------------------------
/lexical-graph-contrib/falkordb/src/setup.py:
--------------------------------------------------------------------------------
# Minimal pbr-based setup shim; package metadata lives in setup.cfg.
import setuptools
setuptools.setup(pbr=True)
--------------------------------------------------------------------------------
/lexical-graph/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling", "hatch-requirements-txt"]
3 | build-backend = "hatchling.build"
4 |
5 | [tool.hatch.build.targets.wheel]
6 | packages = ["src/graphrag_toolkit"]
7 |
8 | [project]
9 | name = "graphrag-toolkit-lexical-graph"
10 | version = "3.9.0-SNAPSHOT"
11 | description = "AWS GraphRAG Toolkit, lexical graph"
12 | readme = "README.md"
13 | requires-python = ">=3.10"
14 | dynamic = ["dependencies"]
15 | license = "Apache-2.0"
16 |
17 | [tool.hatch.metadata.hooks.requirements_txt]
18 | files = ["src/graphrag_toolkit/lexical_graph/requirements.txt"]
19 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .tenant_id import TenantId, DEFAULT_TENANT_ID, TenantIdType, to_tenant_id
5 | from .config import GraphRAGConfig as GraphRAGConfig, LLMType, EmbeddingType
6 | from .errors import ModelError, BatchJobError, IndexError
7 | from .logging import set_logging_config, set_advanced_logging_config
8 | from .lexical_graph_query_engine import LexicalGraphQueryEngine
9 | from .lexical_graph_index import LexicalGraphIndex
10 | from .lexical_graph_index import ExtractionConfig, BuildConfig, IndexingConfig
11 | from . import utils
12 | from . import indexing
13 | from . import retrieval
14 | from . import storage
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/errors.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
class ModelError(Exception):
    """Exception type for model-related failures."""

class BatchJobError(Exception):
    """Exception type for batch-job failures."""

class IndexError(Exception):
    """Exception type for indexing failures.

    NOTE(review): this shadows the builtin `IndexError` when imported
    unqualified; the name is kept as-is for backward compatibility with
    existing callers (it is re-exported from the package `__init__`).
    """
12 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .node_handler import NodeHandler
5 | from .utils.pipeline_utils import sink
6 | from .utils.metadata_utils import last_accessed_date
7 | from .id_generator import IdGenerator
8 | from . import build
9 | from . import extract
10 | from . import load
11 | from . import utils
12 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .build_pipeline import BuildPipeline
5 | from .vector_indexing import VectorIndexing
6 | from .graph_construction import GraphConstruction
7 | from .checkpoint import Checkpoint
8 | from .build_filters import BuildFilters, DEFAULT_BUILD_FILTER
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/graph_builder.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import abc
5 | from typing import Dict, Any
6 |
7 | from graphrag_toolkit.lexical_graph.storage.graph import GraphStore
8 |
9 | from llama_index.core.schema import BaseComponent, BaseNode
10 |
class GraphBuilder(BaseComponent):
    """Abstract base component for writing graph structures to a graph store.

    Concrete subclasses declare which index they handle via `index_key` and
    implement `build` to persist a node's content through a GraphStore client.
    """

    def _to_params(self, p:Dict):
        """Wrap a single parameter dictionary in the batch-parameters shape.

        Args:
            p (Dict): Parameters for one graph operation.

        Returns:
            Dict: `{'params': [p]}` — a one-element parameter batch.
        """
        return dict(params=[p])

    @classmethod
    @abc.abstractmethod
    def index_key(cls) -> str:
        """Return the unique key identifying the index this builder handles.

        Returns:
            str: The index key for the class.
        """

    @abc.abstractmethod
    def build(self, node:BaseNode, graph_client: GraphStore, **kwargs:Any):
        """Persist the given node to the graph via the supplied client.

        Args:
            node: The BaseNode to write.
            graph_client: The GraphStore client used to execute graph
                operations.
            **kwargs: Implementation-specific additional arguments.
        """
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/null_builder.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import logging
5 | from typing import List, Any
6 |
7 | from graphrag_toolkit.lexical_graph.indexing import NodeHandler
8 |
9 | from llama_index.core.schema import BaseNode
10 |
11 | logger = logging.getLogger(__name__)
12 |
class NullBuilder(NodeHandler):
    """Pass-through node handler that yields every node unchanged.

    Useful as a terminal pipeline stage when nodes should be consumed and
    logged without any transformation.
    """

    def accept(self, nodes: List[BaseNode], **kwargs: Any):
        """Yield each node as-is, logging its node_id at debug level.

        Args:
            nodes (List[BaseNode]): Nodes to pass through unchanged.
            **kwargs (Any): Accepted for interface compatibility; unused here.

        Yields:
            BaseNode: Each input node, in order, unmodified.
        """
        for node in nodes:
            # Log-only side effect; the node itself is never altered.
            logger.debug(f'Accepted node [node_id: {node.node_id}]')
            yield node
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
# Metadata keys under the 'aws::graph::' namespace. Presumably used to
# attach extraction results (topics, propositions, source document) to
# node metadata -- confirm against the extract/build pipeline.
TOPICS_KEY = 'aws::graph::topics'
PROPOSITIONS_KEY= 'aws::graph::propositions'
SOURCE_DOC_KEY = 'aws::graph::source_doc'

# Fallback values used when no topic or entity classification is available.
DEFAULT_TOPIC = 'context'
DEFAULT_CLASSIFICATION = 'unknown'
# Default set of entity classification labels.
DEFAULT_ENTITY_CLASSIFICATIONS = [
    'Company',
    'Organization',
    'Location',
    'Event',
    'Sports Team',
    'Sports Organization',
    'Person',
    'Role',
    'Character',
    'Product',
    'Service',
    'Media',
    'Creative Work',
    'Game',
    'Software',
    'Financial Instrument'
]
28 |
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .extraction_pipeline import ExtractionPipeline
5 | from .batch_config import BatchConfig
6 | from .llm_proposition_extractor import LLMPropositionExtractor
7 | from .proposition_extractor import PropositionExtractor
8 | from .batch_llm_proposition_extractor import BatchLLMPropositionExtractor
9 | from .batch_topic_extractor import BatchTopicExtractor
10 | from .topic_extractor import TopicExtractor
11 | from .graph_scoped_value_store import GraphScopedValueStore
12 | from .scoped_value_provider import ScopedValueStore, ScopedValueProvider, FixedScopedValueProvider, DEFAULT_SCOPE
13 | from .file_system_tap import FileSystemTap
14 | from .infer_classifications import InferClassifications
15 | from .infer_config import OnExistingClassifications, InferClassificationsConfig
16 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/batch_config.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from dataclasses import dataclass, field
5 | from typing import Optional, List
6 |
@dataclass
class BatchConfig:
    """Settings that control Bedrock batch processing.

    Bundles the AWS-side configuration (IAM role, region, S3 bucket and
    optional encryption key), the VPC network placement (subnets and
    security groups), and the batch sizing controls into a single value
    object passed to batch extraction components.

    Attributes:
        role_arn (str): ARN of the IAM role assumed for batch processing.
        region (str): AWS region in which batch jobs run.
        bucket_name (str): S3 bucket used for batch inputs and outputs.
        key_prefix (Optional[str]): Optional key prefix within the bucket.
        s3_encryption_key_id (Optional[str]): Optional KMS key ID for S3
            encryption.
        subnet_ids (List[str]): Subnet IDs for the job's network
            configuration (defaults to an empty list per instance).
        security_group_ids (List[str]): Security group IDs applied to the
            job (defaults to an empty list per instance).
        max_batch_size (int): Maximum records per batch (default 25000).
        max_num_concurrent_batches (int): Maximum batches run concurrently
            (default 3).
    """
    role_arn:str
    region:str
    bucket_name:str
    key_prefix:Optional[str]=None
    s3_encryption_key_id:Optional[str]=None
    subnet_ids:List[str] = field(default_factory=list)
    security_group_ids:List[str] = field(default_factory=list)
    max_batch_size:int=25000
    max_num_concurrent_batches:int=3
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/docs_to_nodes.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import logging
5 | from typing import List, Any, Sequence
6 |
7 | from graphrag_toolkit.lexical_graph.indexing.build.checkpoint import DoNotCheckpoint
8 |
9 | from llama_index.core.node_parser import NodeParser
10 | from llama_index.core.schema import BaseNode, Document
11 | from llama_index.core.node_parser.node_utils import build_nodes_from_splits
12 |
13 | logger = logging.getLogger(__name__)
14 |
class DocsToNodes(NodeParser, DoNotCheckpoint):
    """Node parser that converts Document instances into plain nodes.

    Any `Document` in the input is rebuilt as a node via
    `build_nodes_from_splits`, using the document's full text as a single
    split; every other node type passes through untouched. Inherits
    `DoNotCheckpoint` so this step is excluded from checkpointing.
    """
    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """Convert each input node, leaving non-Document nodes unchanged.

        Args:
            nodes (Sequence[BaseNode]): Nodes to convert.
            show_progress (bool): Accepted for interface compatibility; not
                used by this parser.
            **kwargs (Any): Accepted for interface compatibility.

        Returns:
            List[BaseNode]: One output node per input node, in order.
        """
        return [
            build_nodes_from_splits([entry.text], entry)[0]
            if isinstance(entry, Document)
            else entry
            for entry in nodes
        ]
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/graph_scoped_value_store.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import logging
5 | from typing import List
6 | from graphrag_toolkit.lexical_graph.indexing.extract.scoped_value_provider import ScopedValueStore
7 | from graphrag_toolkit.lexical_graph.storage.graph import GraphStore
8 |
9 | logger = logging.getLogger(__name__)
10 |
class GraphScopedValueStore(ScopedValueStore):
    """Scoped value store backed by a graph database.

    Values are persisted as nodes labelled `__SYS_SV__<label>__` carrying
    `scope` and `value` properties, so each (label, scope) pair forms an
    independent collection of values.

    Attributes:
        graph_store (GraphStore): Graph database used to execute the queries.
    """
    graph_store: GraphStore

    def get_scoped_values(self, label:str, scope:str) -> List[str]:
        """Return the distinct values stored under the given label and scope.

        Args:
            label (str): Label segment identifying the value collection.
            scope (str): Scope used to filter the stored values.

        Returns:
            List[str]: Distinct stored values; empty if none match.

        Raises:
            Exceptions raised by `graph_store.execute_query` propagate to the
            caller.
        """
        # NOTE(review): `label` is interpolated into the query text because
        # Cypher cannot parameterize labels; callers must supply trusted labels.
        cypher = f'''
        MATCH (n:`__SYS_SV__{label}__`)
        WHERE n.scope=$scope
        RETURN DISTINCT n.value AS value
        '''

        records = self.graph_store.execute_query(cypher, {'scope': scope})

        return [record['value'] for record in records]

    def save_scoped_values(self, label:str, scope:str, values:List[str]) -> None:
        """Persist values under the given label and scope.

        Uses MERGE so re-saving an existing (scope, value) pair is a no-op;
        the write is executed with retry.

        Args:
            label (str): Label segment identifying the value collection.
            scope (str): Scope to associate with each value.
            values (List[str]): Values to persist.

        Returns:
            None
        """
        cypher = f'''
        UNWIND $values AS value
        MERGE (:`__SYS_SV__{label}__`{{scope:$scope, value:value}})
        '''

        self.graph_store.execute_query_with_retry(cypher, {'scope': scope, 'values': values})
85 |
86 |
87 |
88 |
89 |
90 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/infer_config.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from enum import Enum
5 | from dataclasses import dataclass
6 | from typing import Optional
7 |
class OnExistingClassifications(Enum):
    """
    Strategies for handling classifications that already exist.

    Chooses what happens when new classifications are produced while some are
    already stored: combine the two sets, overwrite the stored set, or leave
    the stored set untouched.

    Attributes:
        MERGE_EXISTING: Combine new classifications with the existing ones.
        REPLACE_EXISTING: Overwrite the existing classifications entirely.
        RETAIN_EXISTING: Keep the existing classifications unchanged.
    """
    # Explicit values are kept stable; they may be referenced outside this module.
    MERGE_EXISTING = 1
    REPLACE_EXISTING = 2
    RETAIN_EXISTING = 3
25 |
@dataclass
class InferClassificationsConfig:
    """
    Tunables for the classification-inference step.

    Controls how many samples are examined, how many inference iterations run,
    what happens when classifications already exist, and an optional custom
    prompt template.

    Attributes:
        num_samples (Optional[int]): Samples to infer classifications from.
        num_iterations (Optional[int]): Number of inference iterations to run.
        on_existing_classifications (Optional[OnExistingClassifications]):
            Strategy applied when classifications already exist.
        prompt_template (Optional[str]): Custom prompt template text, if any.
    """
    num_samples: Optional[int] = 5
    num_iterations: Optional[int] = 1
    on_existing_classifications: Optional[OnExistingClassifications] = OnExistingClassifications.MERGE_EXISTING
    prompt_template: Optional[str] = None
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/pipeline_decorator.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import abc
5 | import six
6 | from typing import Iterable
7 |
8 | from graphrag_toolkit.lexical_graph.indexing.model import SourceDocument
9 |
# NOTE: abc.ABC supplies the ABCMeta metaclass directly on Python 3, replacing
# the Python-2-compatible six.add_metaclass decorator.
class PipelineDecorator(abc.ABC):
    """
    Abstract base class for pipeline decorators.

    A pipeline decorator observes or transforms the documents flowing through
    an extraction pipeline: once over the whole input collection, and once per
    output document. Subclasses implement both hooks to define custom behavior.
    """
    @abc.abstractmethod
    def handle_input_docs(self, docs:Iterable[SourceDocument]) -> Iterable[SourceDocument]:
        """
        Process the pipeline's input documents.

        Args:
            docs (Iterable[SourceDocument]): Input documents to process.

        Returns:
            Iterable[SourceDocument]: The (possibly transformed) documents.
        """
        pass

    @abc.abstractmethod
    def handle_output_doc(self, doc: SourceDocument) -> SourceDocument:
        """
        Process a single output document.

        Args:
            doc (SourceDocument): The document to process.

        Returns:
            SourceDocument: The processed document.
        """
        pass
56 |
57 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/source_doc_parser.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import abc
5 | from typing import Iterable
6 |
7 | from graphrag_toolkit.lexical_graph.indexing.model import SourceDocument
8 |
9 | from llama_index.core.schema import BaseComponent
10 |
class SourceDocParser(BaseComponent):
    """
    Abstract interface for parsing source documents.

    Subclasses override `_parse_source_docs` with their parsing logic; callers
    use the public `parse_source_docs` wrapper. This keeps the public API
    stable while implementations vary.
    """
    @abc.abstractmethod
    def _parse_source_docs(self, source_documents:Iterable[SourceDocument]) -> Iterable[SourceDocument]:
        """
        Transform an iterable of source documents.

        Implementations define how the input documents are processed and must
        return an iterable of the processed documents.

        Args:
            source_documents: Iterable of `SourceDocument` instances to
                process.

        Returns:
            Iterable of processed `SourceDocument` instances.

        Raises:
            NotImplementedError: If the subclass does not implement it.
        """
        pass

    def parse_source_docs(self, source_documents:Iterable[SourceDocument]) -> Iterable[SourceDocument]:
        """
        Public entry point; delegates to the subclass implementation.

        Args:
            source_documents: Iterable of `SourceDocument` instances to parse.

        Returns:
            Iterable of parsed `SourceDocument` instances.
        """
        return self._parse_source_docs(source_documents)
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/load/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .bedrock_knowledge_base import BedrockKnowledgeBaseExport
5 | from .file_based_chunks import FileBasedChunks
6 | from .s3_based_chunks import S3BasedChunks
7 | from .file_based_docs import FileBasedDocs
8 | from .s3_based_docs import S3BasedDocs
9 | from .json_array_reader import JSONArrayReader
10 | from .source_documents import SourceDocuments
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/load/source_documents.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from typing import Callable, List
5 | from llama_index.core import Document
6 |
class SourceDocuments:
    """
    Iterable wrapper around a set of document-producing callables.

    Each callable returns a collection whose entries may be documents, lists
    of documents, or lists of lists of documents; iterating this wrapper
    flattens up to two levels of list nesting and yields individual documents.

    Attributes:
        source_documents_fns (List[Callable[[], List[Document]]]): Callables
            that each return a (possibly nested) collection of documents.
    """
    def __init__(self, source_documents_fns: List[Callable[[], List[Document]]]):
        """
        Record the callables that supply source documents.

        Args:
            source_documents_fns (List[Callable[[], List[Document]]]): A list
                of callables, each returning a collection of documents when
                invoked.
        """
        self.source_documents_fns = source_documents_fns

    def __iter__(self):
        """
        Iterate over all documents produced by the registered callables.

        Flattens up to two levels of list nesting in each callable's output;
        non-list entries are yielded as-is.

        Yields:
            Any: Individual document items.
        """
        for fetch in self.source_documents_fns:
            for batch in fetch():
                if not isinstance(batch, list):
                    yield batch
                    continue
                for entry in batch:
                    if isinstance(entry, list):
                        yield from entry
                    else:
                        yield entry
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/node_handler.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import abc
5 | from typing import List, Any, Generator
6 | from llama_index.core.schema import BaseNode
7 | from llama_index.core.schema import TransformComponent
8 | from llama_index.core.bridge.pydantic import Field
9 |
class NodeHandler(TransformComponent):
    """
    Base class for components that process a stream of nodes.

    Subclasses implement `accept`, a generator over processed nodes; calling
    the handler materializes that generator into a list. Extends
    `TransformComponent` so handlers can be used directly in ingestion
    pipelines.

    Attributes:
        show_progress (bool): Whether to show progress during processing.
    """
    show_progress: bool = Field(default=True, description='Whether to show progress.')

    def __call__(self, nodes: List[BaseNode], **kwargs: Any) -> List[BaseNode]:
        """
        Processes nodes by delegating to `accept` and materializing the results.

        Args:
            nodes: A list of BaseNode objects to process.
            **kwargs: Additional keyword arguments forwarded to `accept`.

        Returns:
            A list of BaseNode objects produced by `accept`.
        """
        # list(...) instead of the redundant [n for n in ...] comprehension.
        return list(self.accept(nodes, **kwargs))

    @abc.abstractmethod
    def accept(self, nodes: List[BaseNode], **kwargs: Any) -> Generator[BaseNode, None, None]:
        """
        Yields processed nodes; must be implemented by subclasses.

        Args:
            nodes: A list of BaseNode objects to process.
            **kwargs: Additional keyword arguments available to the
                implementation.

        Yields:
            BaseNode: Processed node instances, one at a time.

        Raises:
            NotImplementedError: If the subclass does not override this method.
        """
        raise NotImplementedError()
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/utils/metadata_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import datetime
5 |
def get_properties_str(properties, default):
    """
    Render a properties mapping as a stable, sorted 'key:value' string.

    Args:
        properties: Mapping of property names to values; may be None or empty.
        default: Value returned when `properties` is falsy.

    Returns:
        ';'-joined 'key:value' pairs sorted lexicographically, or `default`
        when there are no properties.
    """
    if not properties:
        return default
    # Sorting makes equal mappings render identically regardless of insertion order;
    # a generator avoids materializing an intermediate list inside sorted().
    return ';'.join(sorted(f'{k}:{v}' for k, v in properties.items()))
11 |
def last_accessed_date(*args):
    """
    Build a metadata entry recording today's date as the last-accessed date.

    Args:
        *args: Ignored; accepted so the function can be used where a metadata
            callable receives positional arguments.

    Returns:
        dict: {'last_accessed_date': 'YYYY-MM-DD'} using the local date.
    """
    today = datetime.datetime.now().strftime("%Y-%m-%d")
    return {'last_accessed_date': today}
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/utils/pipeline_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from pipe import Pipe
5 | from concurrent.futures import ProcessPoolExecutor
6 | from functools import partial
7 | from typing import List, Optional, Sequence, Any, cast, Callable
8 |
9 |
10 | from llama_index.core.ingestion import IngestionPipeline
11 | from llama_index.core.ingestion.pipeline import run_transformations
12 | from llama_index.core.schema import BaseNode
13 |
def _sink():
    """Build a pipe stage that fully drains a generator, discarding items."""
    def _drain(generator):
        # Exhaust the generator purely for its side effects.
        for _ in generator:
            pass
    return Pipe(_drain)

# Terminal stage: `pipeline | sink` forces evaluation of a lazy pipeline.
sink = _sink()
21 |
def run_pipeline(
    pipeline:IngestionPipeline,
    node_batches:List[List[BaseNode]],
    cache_collection: Optional[str] = None,
    in_place: bool = True,
    num_workers: int = 1,
    **kwargs: Any,
) -> Sequence[BaseNode]:
    """
    Run a pipeline's transformations over batches of nodes in parallel.

    Each batch is handed to a worker process, which applies the pipeline's
    transformations (honouring the pipeline's cache unless caching is
    disabled). The per-batch results are flattened into a single sequence.

    Args:
        pipeline (IngestionPipeline): Pipeline whose transformations and cache
            settings are used.
        node_batches (List[List[BaseNode]]): Batches of nodes to transform.
        cache_collection (Optional[str]): Cache collection name, if any.
        in_place (bool): Whether transformations may modify nodes in place.
        num_workers (int): Number of worker processes.
        **kwargs (Any): Extra arguments forwarded to `run_transformations`;
            must be picklable since they cross process boundaries.

    Returns:
        Sequence[BaseNode]: All transformed nodes, in batch order.
    """
    from itertools import chain  # local import; module-level imports are outside this function

    transform: Callable[[List[BaseNode]], List[BaseNode]] = partial(
        run_transformations,
        transformations=pipeline.transformations,
        in_place=in_place,
        cache=pipeline.cache if not pipeline.disable_cache else None,
        cache_collection=cache_collection,
        **kwargs
    )

    with ProcessPoolExecutor(max_workers=num_workers) as p:
        processed_node_batches = p.map(transform, node_batches)
        # Flatten in O(total); sum(..., start=[]) is quadratic in the number of batches.
        processed_nodes = list(chain.from_iterable(processed_node_batches))

    return processed_nodes
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | """
5 | This module exposes the core prompt provider interface and registry entry point.
6 |
7 | To avoid circular import errors, concrete provider classes (S3, Bedrock, File, Static)
8 | are not imported here. Use `prompt_provider_config.py` to dynamically construct providers.
9 | """
10 |
11 | from .prompt_provider_base import PromptProvider
12 | from .prompt_provider_registry import PromptProviderRegistry
13 |
14 | __all__ = [
15 | "PromptProvider",
16 | "PromptProviderRegistry",
17 | ]
18 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/bedrock_prompt_provider.py:
--------------------------------------------------------------------------------
1 | # graphrag_toolkit/lexical_graph/prompts/bedrock_prompt_provider.py
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_base import PromptProvider
5 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import BedrockPromptProviderConfig
6 | from graphrag_toolkit.lexical_graph.logging import logging
7 |
8 | logger = logging.getLogger(__name__)
9 |
class BedrockPromptProvider(PromptProvider):
    """Prompt provider backed by AWS Bedrock managed prompts.

    Resolves system and user prompt templates from Bedrock using the ARNs
    (and optional versions) supplied by the configuration object.
    """

    def __init__(self, config: BedrockPromptProviderConfig):
        """Store the configuration and log the resolved prompt settings.

        Args:
            config: Bedrock prompt provider configuration, carrying prompt
                ARNs, versions, region and profile.
        """
        self.config = config

        logger.info(
            f"[Prompt Debug] Using BedrockPromptProvider with:\n"
            f"  system_prompt_arn={config.system_prompt_arn} "
            f"(resolved={config.resolved_system_prompt_arn}, version={config.system_prompt_version})\n"
            f"  user_prompt_arn={config.user_prompt_arn} "
            f"(resolved={config.resolved_user_prompt_arn}, version={config.user_prompt_version})\n"
            f"  region={config.aws_region}, profile={config.aws_profile}"
        )

    def _load_prompt(self, prompt_arn: str, version: str = None) -> str:
        """Fetch a prompt's template text from Bedrock.

        Args:
            prompt_arn: ARN of the prompt to fetch.
            version: Optional prompt version; latest when omitted.

        Returns:
            The prompt template text, stripped of surrounding whitespace.

        Raises:
            RuntimeError: If the prompt cannot be fetched or contains no text.
        """
        try:
            request = {"promptIdentifier": prompt_arn}
            if version:
                request["promptVersion"] = version

            response = self.config.bedrock.get_prompt(**request)

            variants = response.get("variants", [])
            if not variants:
                # Raised inside the try, so the handler below re-wraps it too.
                raise RuntimeError(f"No variants found for prompt: {prompt_arn}")

            template_config = variants[0].get("templateConfiguration", {})
            text = template_config.get("text", {}).get("text")
            if not text:
                raise RuntimeError(f"Prompt text not found for: {prompt_arn}")

            return text.strip()

        except Exception as e:
            logger.error(f"Failed to load prompt for {prompt_arn}: {str(e)}")
            raise RuntimeError(f"Could not load prompt from Bedrock: {prompt_arn}") from e

    def get_system_prompt(self) -> str:
        """Return the system prompt template text from Bedrock."""
        return self._load_prompt(
            self.config.resolved_system_prompt_arn,
            self.config.system_prompt_version,
        )

    def get_user_prompt(self) -> str:
        """Return the user prompt template text from Bedrock."""
        return self._load_prompt(
            self.config.resolved_user_prompt_arn,
            self.config.user_prompt_version,
        )
84 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/file_prompt_provider.py:
--------------------------------------------------------------------------------
1 | import os
2 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_base import PromptProvider
3 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import FilePromptProviderConfig
4 | from graphrag_toolkit.lexical_graph.logging import logging
5 |
6 | logger = logging.getLogger(__name__)
7 |
class FilePromptProvider(PromptProvider):
    """
    Prompt provider that reads system and user prompts from local files.
    """

    def __init__(self, config: FilePromptProviderConfig, system_prompt_file: str = "system_prompt.txt", user_prompt_file: str = "user_prompt.txt"):
        """
        Validate the base path and record the prompt file names.

        Args:
            config: Configuration carrying the base directory for prompt files.
            system_prompt_file: File name of the system prompt.
            user_prompt_file: File name of the user prompt.

        Raises:
            NotADirectoryError: If the configured base path is not an existing
                directory.
        """
        if not os.path.isdir(config.base_path):
            raise NotADirectoryError(f"Invalid or non-existent directory: {config.base_path}")
        self.config = config
        self.system_prompt_file = system_prompt_file
        self.user_prompt_file = user_prompt_file

        logger.info(f"[Prompt Debug] Initialized FilePromptProvider")
        logger.info(f"[Prompt Debug] Base path: {self.config.base_path}")
        logger.info(f"[Prompt Debug] System prompt file: {self.system_prompt_file}")
        logger.info(f"[Prompt Debug] User prompt file: {self.user_prompt_file}")

    def _load_prompt(self, filename: str) -> str:
        """
        Read one prompt file from the configured base path.

        Args:
            filename: Name of the prompt file to read.

        Returns:
            The file contents with trailing whitespace removed.

        Raises:
            FileNotFoundError: If the prompt file does not exist.
            OSError: If the file exists but cannot be read.
        """
        path = os.path.join(self.config.base_path, filename)
        if not os.path.exists(path):
            raise FileNotFoundError(f"Prompt file not found: {path}")
        try:
            with open(path, "r", encoding="utf-8") as handle:
                content = handle.read()
        except OSError as e:
            raise OSError(f"Failed to read prompt file {path}: {str(e)}") from e
        # str.rstrip cannot raise OSError, so stripping outside the try is equivalent.
        return content.rstrip()

    def get_system_prompt(self) -> str:
        """
        Return the contents of the configured system prompt file.
        """
        return self._load_prompt(self.system_prompt_file)

    def get_user_prompt(self) -> str:
        """
        Return the contents of the configured user prompt file.
        """
        return self._load_prompt(self.user_prompt_file)
76 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_base.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from abc import ABC, abstractmethod
5 | from graphrag_toolkit.lexical_graph.logging import logging
6 |
7 | logger = logging.getLogger(__name__)
8 |
class PromptProvider(ABC):
    """
    Common interface for prompt sources.

    Concrete providers (file, S3, Bedrock, static) implement both accessors;
    callers depend only on this interface.
    """

    @abstractmethod
    def get_system_prompt(self) -> str:
        """
        Return the system prompt text.
        """
        pass

    @abstractmethod
    def get_user_prompt(self) -> str:
        """
        Return the user prompt text.
        """
        pass
27 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_config_base.py:
--------------------------------------------------------------------------------
1 | # prompt_provider_config_base.py
2 | from pydantic import BaseModel
3 | from typing import Optional
4 |
class FilePromptProviderConfig(BaseModel):
    """
    Configuration model for file-based prompt providers.

    This class defines the required fields for specifying system and user prompt file names.
    Both fields are mandatory; pydantic raises a ValidationError if either is missing.
    """
    # File name of the system prompt to load.
    system_prompt_file: str
    # File name of the user prompt to load.
    user_prompt_file: str
13 |
class S3PromptProviderConfig(BaseModel):
    """
    Configuration model for S3-based prompt providers.

    This class defines the required fields for specifying the S3 bucket, key, and optional region for prompt storage.

    NOTE(review): S3PromptProvider (s3_prompt_provider.py) reads ``config.prefix``,
    ``config.s3`` and per-prompt file names from the ``prompt_provider_config``
    module, which differ from the fields declared here — confirm which config
    model is the canonical one.
    """
    # Name of the S3 bucket holding the prompt objects.
    bucket: str
    # Object key of the prompt within the bucket.
    key: str
    # Optional AWS region; presumably falls back to the session default when None — verify against caller.
    region: Optional[str] = None
23 |
class BedrockPromptProviderConfig(BaseModel):
    """
    Configuration model for Bedrock-based prompt providers.

    This class defines the required field for specifying the Bedrock prompt ARN.
    """
    # ARN identifying the managed prompt resource in Amazon Bedrock.
    prompt_arn: str
31 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_factory.py:
--------------------------------------------------------------------------------
1 | # graphrag_toolkit/lexical_graph/prompts/prompt_provider_factory.py
2 |
3 | import os
4 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_base import PromptProvider
5 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import (
6 | BedrockPromptProviderConfig,
7 | S3PromptProviderConfig,
8 | FilePromptProviderConfig,
9 | StaticPromptProviderConfig,
10 | )
11 | from graphrag_toolkit.lexical_graph.logging import logging
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 |
class PromptProviderFactory:
    """
    Factory for building PromptProvider instances from environment configuration.

    The PROMPT_PROVIDER environment variable chooses which provider
    configuration is built: "bedrock", "s3", "file", or — for any other
    value, including unset — static default prompts.
    """

    @staticmethod
    def get_provider() -> PromptProvider:
        """
        Build and return the PromptProvider selected by PROMPT_PROVIDER.

        Returns:
            PromptProvider: Provider built from the matching configuration
            class; a static-prompt provider when the variable is unset or
            unrecognized.
        """
        provider_type = os.getenv("PROMPT_PROVIDER", "static").lower()

        # Dispatch table mapping provider-type keys to their config classes;
        # anything not listed falls back to static default prompts.
        config_types = {
            "bedrock": BedrockPromptProviderConfig,
            "s3": S3PromptProviderConfig,
            "file": FilePromptProviderConfig,
        }
        config_cls = config_types.get(provider_type, StaticPromptProviderConfig)
        return config_cls().build()
43 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_registry.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from typing import Optional, Dict
5 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_base import PromptProvider
6 | from graphrag_toolkit.lexical_graph.logging import logging
7 |
8 | logger = logging.getLogger(__name__)
9 |
class PromptProviderRegistry:
    """
    Process-wide registry of named PromptProvider instances.

    Providers from any source (e.g. Bedrock, S3, file) are stored under
    unique names; one of them acts as the default fallback for lookups
    that do not specify a name.
    """

    _registry: Dict[str, PromptProvider] = {}
    _default_provider_name: Optional[str] = None

    @classmethod
    def register(cls, name: str, provider: PromptProvider, default: bool = False) -> None:
        """
        Register *provider* under *name*, optionally making it the default.

        The very first provider registered becomes the default automatically;
        a later registration with ``default=True`` replaces that choice.

        Parameters
        ----------
        name : str
            The unique name for the provider (e.g., "aws-prod", "local-dev").
        provider : PromptProvider
            The provider instance to register.
        default : bool
            Whether to make this the default provider.
        """
        cls._registry[name] = provider
        no_default_yet = cls._default_provider_name is None
        if default or no_default_yet:
            cls._default_provider_name = name

    @classmethod
    def get(cls, name: Optional[str] = None) -> Optional[PromptProvider]:
        """
        Look up a provider by name, falling back to the default provider.

        Parameters
        ----------
        name : Optional[str]
            The name of the provider to retrieve; when falsy, the default
            provider name (if any) is used instead.

        Returns
        -------
        Optional[PromptProvider]
            The matching provider instance, or None when nothing matches.
        """
        lookup_name = name or cls._default_provider_name
        return cls._registry.get(lookup_name) if lookup_name else None

    @classmethod
    def list_registered(cls) -> Dict[str, PromptProvider]:
        """
        Return a snapshot of all registered providers.

        Returns
        -------
        Dict[str, PromptProvider]
            A shallow copy of the name -> provider mapping.
        """
        return dict(cls._registry)
70 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/s3_prompt_provider.py:
--------------------------------------------------------------------------------
1 | # graphrag_toolkit/lexical_graph/prompts/s3_prompt_provider.py
2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 |
5 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_base import PromptProvider
6 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_config import S3PromptProviderConfig
7 | from graphrag_toolkit.lexical_graph.logging import logging
8 |
9 | logger = logging.getLogger(__name__)
10 |
class S3PromptProvider(PromptProvider):
    """
    Loads system and user prompts from an S3 bucket using provided configuration.

    Attributes:
        config (S3PromptProviderConfig): Configuration object including bucket, prefix,
            and optionally custom file names for prompts.
    """

    def __init__(self, config: S3PromptProviderConfig):
        """
        Args:
            config: S3 configuration providing the bucket, key prefix, prompt
                file names, and a session-aware S3 client.
        """
        self.config = config

    def _load_prompt(self, filename: str) -> str:
        """
        Loads a prompt file from the configured S3 bucket and returns its contents as a string.

        Args:
            filename: The name of the prompt file to load from S3.

        Returns:
            The contents of the prompt file as a UTF-8 string, with trailing
            whitespace removed.
        """
        # Bug fix: the key previously hard-coded the literal "(unknown)" instead
        # of interpolating the filename argument, so both the system and user
        # prompts resolved to the same nonexistent object key.
        key = f"{self.config.prefix.rstrip('/')}/{filename}"
        logger.info(f"[Prompt Debug] Loading prompt from S3: s3://{self.config.bucket}/{key}")
        s3_client = self.config.s3  # session-aware S3 client from config
        response = s3_client.get_object(Bucket=self.config.bucket, Key=key)
        return response["Body"].read().decode("utf-8").rstrip()

    def get_system_prompt(self) -> str:
        """
        Retrieves the system prompt from S3.

        Returns:
            The contents of the system prompt file.
        """
        return self._load_prompt(self.config.system_prompt_file)

    def get_user_prompt(self) -> str:
        """
        Retrieves the user prompt from S3.

        Returns:
            The contents of the user prompt file.
        """
        return self._load_prompt(self.config.user_prompt_file)
56 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/static_prompt_provider.py:
--------------------------------------------------------------------------------
1 | # static_prompt_provider.py
2 |
3 | from graphrag_toolkit.lexical_graph.prompts.prompt_provider_base import PromptProvider
4 | from graphrag_toolkit.lexical_graph.retrieval.prompts import (
5 | ANSWER_QUESTION_SYSTEM_PROMPT,
6 | ANSWER_QUESTION_USER_PROMPT,
7 | )
8 | from graphrag_toolkit.lexical_graph.logging import logging
9 |
10 | logger = logging.getLogger(__name__)
11 |
class StaticPromptProvider(PromptProvider):
    """
    Prompt provider backed by fixed, predefined prompt strings.

    The system and user prompts are the package's default answer-question
    prompts; they are bound once at construction and never change at runtime.
    """

    def __init__(self):
        """
        Bind the predefined system and user prompt constants to this instance
        and emit truncated previews of both at debug level.
        """
        self._system_prompt = ANSWER_QUESTION_SYSTEM_PROMPT
        self._user_prompt = ANSWER_QUESTION_USER_PROMPT
        logger.debug(f"System Prompt (truncated): {self._system_prompt[:60]}...")
        logger.debug(f"User Prompt (truncated): {self._user_prompt[:60]}...")

    def get_system_prompt(self) -> str:
        """
        Return the predefined system prompt.

        Returns:
            The system prompt string bound at construction time.
        """
        return self._system_prompt

    def get_user_prompt(self) -> str:
        """
        Return the predefined user prompt.

        Returns:
            The user prompt string bound at construction time.
        """
        return self._user_prompt
46 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/protocols/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .mcp_server import create_mcp_server
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/requirements.txt:
--------------------------------------------------------------------------------
1 | anthropic-bedrock==0.8.0
2 | boto3>=1.36.1
3 | botocore>=1.36.1
4 | json2xml==5.0.5
5 | llama-index-core==0.12.37
6 | llama-index-embeddings-bedrock==0.5.0
7 | llama-index-llms-anthropic==0.6.19
8 | llama-index-llms-bedrock-converse==0.6.0
9 | lru-dict==1.3.0
10 | pipe==2.2
11 | python-dotenv==1.0.1
12 | smart_open==7.1.0
13 | spacy==3.7.5
14 | tfidf_matcher==0.3.0
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from . import post_processors
5 | from . import processors
6 | from . import retrievers
7 | from . import utils
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/post_processors/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .reranker_mixin import RerankerMixin
5 | from .enrich_source_details import EnrichSourceDetails
6 | from .bedrock_context_format import BedrockContextFormat
7 | from .sentence_reranker import SentenceReranker
8 | from .statement_diversity import StatementDiversityPostProcessor
9 | from .statement_enhancement import StatementEnhancementPostProcessor
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/post_processors/reranker_mixin.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from abc import ABC, abstractmethod
5 | from typing import List, Tuple
6 |
7 | from llama_index.core.postprocessor.types import BaseNodePostprocessor
8 | from llama_index.core.schema import NodeWithScore, QueryBundle
9 |
class RerankerMixin(ABC):
    """
    Abstract mixin defining the reranker interface.

    Subclasses provide a batch size and a scoring function over pairs of
    strings; together these form the minimal contract a reranker must
    satisfy to plug into the retrieval pipeline.

    Attributes:
        batch_size (int): Number of pairs a subclass processes per batch.
    """

    @property
    @abstractmethod
    def batch_size(self):
        """
        Number of items the reranker processes in a single batch.

        Subclasses must implement this property and return an integer
        appropriate to their scoring backend.
        """
        ...

    @abstractmethod
    def rerank_pairs(self, pairs: List[Tuple[str, str]], batch_size: int = 128) -> List[float]:
        """
        Assign a numerical score to each pair of strings.

        Args:
            pairs: Tuples of two strings to be scored for reranking.
            batch_size: Optional chunk size for processing the pairs in
                batches. Defaults to 128.

        Returns:
            One float score per input pair, in the same order as *pairs*.
        """
        ...
64 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .processor_args import ProcessorArgs
5 | from .processor_base import ProcessorBase
6 | from .clear_chunks import ClearChunks
7 | from .clear_scores import ClearScores
8 | from .dedup_results import DedupResults
9 | from .disaggregate_results import DisaggregateResults
10 | from .filter_by_metadata import FilterByMetadata
11 | from .format_sources import FormatSources
12 | from .populate_statement_strs import PopulateStatementStrs
13 | from .prune_results import PruneResults
14 | from .prune_statements import PruneStatements
15 | from .rerank_statements import RerankStatements
16 | from .rescore_results import RescoreResults
17 | from .simplify_single_topic_results import SimplifySingleTopicResults
18 | from .sort_results import SortResults
19 | from .statements_to_strings import StatementsToStrings
20 | from .truncate_results import TruncateResults
21 | from .truncate_statements import TruncateStatements
22 | from .zero_scores import ZeroScores
23 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/clear_chunks.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig
5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs
6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection, SearchResult, Topic
7 |
8 | from llama_index.core.schema import QueryBundle
9 |
class ClearChunks(ProcessorBase):
    """
    Processor that strips chunk data from topics in search results.

    Walks every topic of every search result in a collection and empties its
    list of chunks, leaving the rest of the result structure intact. Useful
    when downstream processing needs topics and statements but not the
    underlying chunk text.

    Attributes:
        args (ProcessorArgs): Processor configuration and runtime arguments.
        filter_config (FilterConfig): Filtering configuration for the processor.
    """

    def __init__(self, args: ProcessorArgs, filter_config: FilterConfig):
        """
        Initialize the processor by delegating to the ProcessorBase constructor.

        Args:
            args (ProcessorArgs): Processor configuration and runtime arguments.
            filter_config (FilterConfig): Filtering configuration for the processor.
        """
        super().__init__(args, filter_config)

    def _process_results(self, search_results: SearchResultCollection, query: QueryBundle) -> SearchResultCollection:
        """
        Empty the chunk list of every topic in the given search results.

        Args:
            search_results: Collection of search results to process.
            query: Query bundle associated with the results (not used here).

        Returns:
            SearchResultCollection: The collection with all topic chunks cleared.
        """
        def strip_chunks(topic: Topic):
            # Mutates the topic in place; returning it keeps the helper
            # compatible with _apply_to_topics.
            topic.chunks.clear()
            return topic

        def strip_result_chunks(index: int, search_result: SearchResult):
            return self._apply_to_topics(search_result, strip_chunks)

        return self._apply_to_search_results(search_results, strip_result_chunks)
63 |
64 |
65 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/clear_scores.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig
5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs
6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection, SearchResult, Topic
7 |
8 | from llama_index.core.schema import QueryBundle
9 |
class ClearScores(ProcessorBase):
    """
    Processor that removes scores from search results.

    Sets the score of every search result in a collection to None, which is
    useful when scores are irrelevant to, or must be redacted before, later
    processing stages.

    Attributes:
        args (ProcessorArgs): Processor configuration and runtime arguments.
        filter_config (FilterConfig): Filtering configuration for the processor.
    """

    def __init__(self, args: ProcessorArgs, filter_config: FilterConfig):
        """
        Initialize the processor by delegating to the ProcessorBase constructor.

        Args:
            args (ProcessorArgs): Processor configuration and runtime arguments.
            filter_config (FilterConfig): Filtering configuration for the processor.
        """
        super().__init__(args, filter_config)

    def _process_results(self, search_results: SearchResultCollection, query: QueryBundle) -> SearchResultCollection:
        """
        Set the score of every search result in the collection to None.

        Args:
            search_results: Collection of search results to process.
            query: Query bundle associated with the results (not used here).

        Returns:
            SearchResultCollection: The collection with all scores cleared.
        """
        def reset_score(index: int, search_result: SearchResult):
            search_result.score = None
            return search_result

        return self._apply_to_search_results(search_results, reset_score)
55 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/disaggregate_results.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig
5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs
6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection, SearchResult
7 |
8 | from llama_index.core.schema import QueryBundle
9 |
class DisaggregateResults(ProcessorBase):
    """
    Processor that splits multi-topic search results into one result per topic.

    Each topic of each search result is promoted to its own SearchResult,
    scored with the highest statement score found in that topic, allowing
    more granular downstream analysis or filtering.

    Attributes:
        args (ProcessorArgs): Processor configuration and runtime arguments.
        filter_config (FilterConfig): Filtering configuration for the processor.
    """

    def __init__(self, args: ProcessorArgs, filter_config: FilterConfig):
        """
        Initialize the processor by delegating to the ProcessorBase constructor.

        Args:
            args (ProcessorArgs): Processor configuration and runtime arguments.
            filter_config (FilterConfig): Filtering configuration for the processor.
        """
        super().__init__(args, filter_config)

    def _process_results(self, search_results: SearchResultCollection, query: QueryBundle) -> SearchResultCollection:
        """
        Disaggregate search results into one result per topic.

        For every topic of every search result, a new SearchResult is created
        containing just that topic and the original source, scored with the
        maximum statement score within the topic.

        Args:
            search_results: Collection of search results to disaggregate.
            query: Query bundle associated with the results (not used here).

        Returns:
            SearchResultCollection: A new collection with one result per topic
            and recalculated scores.
        """
        disaggregated_results = []

        for search_result in search_results.results:
            for topic in search_result.topics:
                # Robustness fix: max() over an empty sequence raises
                # ValueError, so topics without statements fall back to 0.0
                # instead of crashing the pipeline.
                score = max((s.score for s in topic.statements), default=0.0)
                disaggregated_results.append(
                    SearchResult(topics=[topic], source=search_result.source, score=score)
                )

        return search_results.with_new_results(results=disaggregated_results)
65 |
66 |
67 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/filter_by_metadata.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig
5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs
6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection, SearchResult
7 |
8 | from llama_index.core.schema import QueryBundle
9 |
class FilterByMetadata(ProcessorBase):
    """
    Processor that filters search results by their source metadata.

    Each search result's source metadata is evaluated against the filter
    configuration; only results whose metadata satisfies the configured
    criteria are retained.

    Attributes:
        args (ProcessorArgs): Processor configuration and runtime arguments.
        filter_config (FilterConfig): Configuration defining the metadata
            filtering rules.
    """

    def __init__(self, args: ProcessorArgs, filter_config: FilterConfig):
        """
        Initialize the processor by delegating to the ProcessorBase constructor.

        Args:
            args (ProcessorArgs): Processor configuration and runtime arguments.
            filter_config (FilterConfig): Configuration defining the metadata
                filtering rules.
        """
        super().__init__(args, filter_config)

    def _process_results(self, search_results: SearchResultCollection, query: QueryBundle) -> SearchResultCollection:
        """
        Keep only search results whose source metadata passes the filter.

        Args:
            search_results: Collection of search results to filter.
            query: Query bundle associated with the results (not used here).

        Returns:
            SearchResultCollection: The collection containing only results
            that satisfy the metadata filter criteria.
        """
        def keep_if_metadata_allowed(index: int, search_result: SearchResult):
            source_metadata = search_result.source.metadata
            if self.filter_config.filter_source_metadata_dictionary(source_metadata):
                return search_result
            return None

        return self._apply_to_search_results(search_results, keep_if_metadata_allowed)
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/prune_results.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig
5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs
6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection, SearchResult
7 |
8 | from llama_index.core.schema import QueryBundle
9 |
class PruneResults(ProcessorBase):
    """
    Processor that removes search results scoring below a threshold.

    Each search result's score is compared against the configured pruning
    threshold; results that fall below it are discarded from the collection.

    Attributes:
        args (ProcessorArgs): Processor configuration, including the
            results_pruning_threshold used to discard low-scoring results.
        filter_config (FilterConfig): Filtering configuration for the processor.
    """

    def __init__(self, args: ProcessorArgs, filter_config: FilterConfig):
        """
        Initialize the processor by delegating to the ProcessorBase constructor.

        Args:
            args (ProcessorArgs): Processor configuration and runtime arguments.
            filter_config (FilterConfig): Filtering configuration for the processor.
        """
        super().__init__(args, filter_config)

    def _process_results(self, search_results: SearchResultCollection, query: QueryBundle) -> SearchResultCollection:
        """
        Discard search results whose score is below the pruning threshold.

        Args:
            search_results: Collection of search results to prune.
            query: Query bundle associated with the results (not used here).

        Returns:
            SearchResultCollection: The collection containing only results
            whose score meets or exceeds args.results_pruning_threshold.
        """
        def keep_above_threshold(index: int, search_result: SearchResult):
            if search_result.score >= self.args.results_pruning_threshold:
                return search_result
            return None

        return self._apply_to_search_results(search_results, keep_above_threshold)
56 |
57 |
58 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/simplify_single_topic_results.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig
5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs
6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection, SearchResult
7 |
8 | from llama_index.core.schema import QueryBundle
9 |
class SimplifySingleTopicResults(ProcessorBase):
    """
    Processor that simplifies search results containing exactly one topic.

    When a search result has a single nested topic, that topic's name is
    promoted to the result's top-level ``topic`` attribute, the topic's
    statements are appended to the result's top-level ``statements`` list,
    and the (now redundant) ``topics`` list is cleared. Results with zero
    or multiple topics are left untouched. This normalizes nested topics
    for easier downstream processing.

    Attributes:
        args (ProcessorArgs): Configuration and arguments that dictate
            the behavior of the processor.
        filter_config (FilterConfig): Configuration that defines filtering
            settings for the processor.
    """
    def __init__(self, args:ProcessorArgs, filter_config:FilterConfig):
        """
        Initializes the processor with the supplied arguments and filter
        configuration by delegating to the base class.

        Args:
            args: The processor arguments providing configuration details
                required for setting up the processor instance.
            filter_config: The filter configuration specifying parameters and
                settings for filtering operations in the processor.
        """
        super().__init__(args, filter_config)

    def _process_results(self, search_results:SearchResultCollection, query:QueryBundle) -> SearchResultCollection:
        """
        Applies the single-topic simplification to each result in the collection.

        Args:
            search_results (SearchResultCollection): A collection of search results to process.
            query (QueryBundle): The related query for the search results (not used here).

        Returns:
            SearchResultCollection: The processed collection of search results where each
            single-topic result has been flattened.
        """
        def simplify_result(index:int, search_result:SearchResult):
            # Flatten only when there is exactly one nested topic; otherwise
            # the result passes through unchanged.
            if len(search_result.topics) == 1:
                topic = search_result.topics[0]
                search_result.topic = topic.topic
                search_result.statements.extend(topic.statements)
                search_result.topics.clear()
            return search_result

        return self._apply_to_search_results(search_results, simplify_result)
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/sort_results.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig
5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs
6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection
7 |
8 | from llama_index.core.schema import QueryBundle
9 |
class SortResults(ProcessorBase):
    """
    Processor that orders search results by relevance score.

    Reorders the results in a SearchResultCollection so that the
    highest-scoring entries come first, allowing downstream pipeline
    stages to operate on rank-ordered data.

    Attributes:
        args (ProcessorArgs): Configuration and arguments for processing.
        filter_config (FilterConfig): Filtering configuration for the
            processor.
    """
    def __init__(self, args:ProcessorArgs, filter_config:FilterConfig):
        """
        Initialize the processor by delegating to the base class.

        Args:
            args: Processor arguments and configuration settings.
            filter_config: Filter configuration used by the processor.
        """
        super().__init__(args, filter_config)

    def _process_results(self, search_results:SearchResultCollection, query:QueryBundle) -> SearchResultCollection:
        """
        Sort the search results by score, highest first.

        Args:
            search_results: Collection of search results to reorder.
            query: The query that produced the results (not used for sorting).

        Returns:
            The same SearchResultCollection with its results sorted in
            descending order of score.
        """
        ranked = sorted(search_results.results, key=lambda result: result.score, reverse=True)
        search_results.results = ranked
        return search_results
62 |
63 |
64 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/processors/truncate_results.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig
5 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorBase, ProcessorArgs
6 | from graphrag_toolkit.lexical_graph.retrieval.model import SearchResultCollection
7 |
8 | from llama_index.core.schema import QueryBundle
9 |
class TruncateResults(ProcessorBase):
    """
    Processor that caps the number of search results.

    Trims a SearchResultCollection so that at most
    ``args.max_search_results`` entries remain, keeping the leading
    (top-ranked) results and discarding the rest.

    Attributes:
        args (ProcessorArgs): Processing configuration, including the
            maximum number of results to keep.
        filter_config (FilterConfig): Filtering configuration for the
            processor.
    """
    def __init__(self, args:ProcessorArgs, filter_config:FilterConfig):
        """
        Initialize the processor by delegating to the base class.

        Args:
            args (ProcessorArgs): Arguments for configuring the processor.
            filter_config (FilterConfig): Filter configuration details used
                during initialization.
        """
        super().__init__(args, filter_config)

    def _process_results(self, search_results:SearchResultCollection, query:QueryBundle) -> SearchResultCollection:
        """
        Keep only the first ``max_search_results`` results.

        Args:
            search_results: Collection whose results list is truncated.
            query: The query associated with the results (unused here).

        Returns:
            The SearchResultCollection holding at most the configured
            maximum number of results.
        """
        limit = self.args.max_search_results
        search_results.results = search_results.results[:limit]
        return search_results
55 |
56 |
57 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/retrievers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .chunk_based_search import ChunkBasedSearch
5 | from .entity_based_search import EntityBasedSearch
6 | from .entity_context_search import EntityContextSearch
7 | from .topic_based_search import TopicBasedSearch
8 | from .composite_traversal_based_retriever import CompositeTraversalBasedRetriever, WeightedTraversalBasedRetrieverType
9 | from .keyword_ranking_search import KeywordRankingSearch
10 | from .keyword_entity_search import KeywordEntitySearch
11 | from .rerank_beam_search import RerankingBeamGraphSearch
12 | from .semantic_beam_search import SemanticBeamGraphSearch
13 | from .statement_cosine_seach import StatementCosineSimilaritySearch
14 | from .semantic_guided_retriever import SemanticGuidedRetriever, SemanticGuidedRetrieverType
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/summary/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .graph_summary import GraphSummary, get_domain
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/retrieval/utils/vector_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import logging
5 | import queue
6 | from typing import Optional
7 |
8 | from graphrag_toolkit.lexical_graph.metadata import FilterConfig
9 | from graphrag_toolkit.lexical_graph.storage.vector.vector_store import VectorStore
10 | from graphrag_toolkit.lexical_graph.retrieval.processors import ProcessorArgs
11 |
12 | from llama_index.core.schema import QueryBundle
13 |
14 | logger = logging.getLogger(__name__)
15 |
def get_diverse_vss_elements(index_name:str, query_bundle: QueryBundle, vector_store:VectorStore, args:ProcessorArgs, filter_config:Optional[FilterConfig]):
    """
    Retrieve top-k results from a vector index while limiting redundancy
    across source documents.

    Candidates are oversampled by ``vss_diversity_factor`` and then reduced
    back to ``vss_top_k`` by round-robin selection across distinct sources,
    so that no single source dominates the result set. When the diversity
    factor is unset or less than 1, a plain top-k query is issued instead.

    Args:
        index_name (str): Name of the index to search in the vector store.
        query_bundle (QueryBundle): Query object containing the necessary
            details for executing the search.
        vector_store (VectorStore): Vector store instance to query.
        args (ProcessorArgs): Supplies ``vss_top_k`` and
            ``vss_diversity_factor``.
        filter_config (Optional[FilterConfig]): Optional filter configuration
            to refine the query results.

    Returns:
        list: Up to ``vss_top_k`` elements, diversified across sources.
    """
    diversity_factor = args.vss_diversity_factor
    vss_top_k = args.vss_top_k

    # No diversification requested: issue a plain top-k query.
    if not diversity_factor or diversity_factor < 1:
        return vector_store.get_index(index_name).top_k(query_bundle, top_k=vss_top_k, filter_config=filter_config)

    # Oversample so there are enough candidates per source to choose from.
    top_k = vss_top_k * diversity_factor
    elements = vector_store.get_index(index_name).top_k(query_bundle, top_k=top_k, filter_config=filter_config)

    # Group candidates by source id; dict insertion order preserves both the
    # order in which sources first appear and the ranked order of elements
    # within each source. (Plain lists suffice here — no threads involved,
    # so the previous queue.Queue machinery was unnecessary.)
    elements_by_source = {}
    for element in elements:
        source_id = element['source']['sourceId']
        elements_by_source.setdefault(source_id, []).append(element)

    # Round-robin across sources: take the i-th candidate of every source
    # that still has one, until vss_top_k elements have been selected or all
    # candidates are exhausted.
    diverse_elements = []
    position = 0
    while len(diverse_elements) < vss_top_k:
        selected_any = False
        for source_elements in elements_by_source.values():
            if position < len(source_elements):
                diverse_elements.append(source_elements[position])
                selected_any = True
                if len(diverse_elements) >= vss_top_k:
                    break
        if not selected_any:
            break
        position += 1

    logger.debug(f'Diverse {index_name}s:\n' + '\n--------------\n'.join([str(element) for element in diverse_elements]))

    return diverse_elements
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .graph_store_factory import GraphStoreFactory, GraphStoreType
5 | from .vector_store_factory import VectorStoreFactory, VectorStoreType
6 | from .constants import INDEX_KEY, ALL_EMBEDDING_INDEXES, DEFAULT_EMBEDDING_INDEXES, LEXICAL_GRAPH_LABELS
7 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
# Key under which graph index metadata is stored/tagged by the storage layer.
# NOTE(review): exact usage is defined at the call sites -- confirm there.
INDEX_KEY = 'aws::graph::index'
# All embedding index names the toolkit recognises.
ALL_EMBEDDING_INDEXES = ['chunk', 'statement', 'topic']
# Embedding indexes created by default ('topic' is excluded unless requested).
DEFAULT_EMBEDDING_INDEXES = ['chunk', 'statement']
# Labels belonging to the lexical graph model; the '__SYS_' entries appear to
# be system/internal labels -- see the graph-model documentation to confirm.
LEXICAL_GRAPH_LABELS = [
    '__Source__',
    '__Chunk__',
    '__Topic__',
    '__Statement__',
    '__Fact__',
    '__Entity__',
    '__SYS_SV__EntityClassification__',
    '__SYS_SV__StatementTopic__',
    '__SYS_Class__'
]
18 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/graph/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .graph_store import GraphStore, RedactedGraphQueryLogFormatting, NonRedactedGraphQueryLogFormatting, NodeId, get_log_formatting, format_id
5 | from .graph_store_factory_method import GraphStoreFactoryMethod
6 | from .multi_tenant_graph_store import MultiTenantGraphStore
7 | from .dummy_graph_store import DummyGraphStore
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/graph/dummy_graph_store.py:
--------------------------------------------------------------------------------
1 | # Copyright FalkorDB.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import logging
5 |
6 | from graphrag_toolkit.lexical_graph.storage.graph import GraphStoreFactoryMethod, GraphStore, get_log_formatting
7 |
8 | DUMMY = 'dummy://'
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
class DummyGraphStoreFactory(GraphStoreFactoryMethod):
    """
    Factory that recognises dummy graph-store connection strings.

    If the supplied connection info uses the ``dummy://`` scheme, a
    DummyGraphStore is created; otherwise the factory declines by
    returning None so that other factories can be tried.
    """
    def try_create(self, graph_info: str, **kwargs) -> GraphStore:
        """
        Create a DummyGraphStore when ``graph_info`` uses the dummy scheme.

        Args:
            graph_info (str): Graph connection information; must start with
                ``dummy://`` for this factory to apply.
            **kwargs: Additional configuration for the graph store, such as
                log formatting options.

        Returns:
            GraphStore: A DummyGraphStore instance when ``graph_info`` starts
            with the dummy scheme, otherwise None.
        """
        if not graph_info.startswith(DUMMY):
            return None
        logger.debug('Opening dummy graph store')
        return DummyGraphStore(log_formatting=get_log_formatting(kwargs))
46 |
47 |
class DummyGraphStore(GraphStore):
    """
    GraphStore implementation that logs queries without executing them.

    Useful for debugging and dry runs: every query is formatted and written
    to the debug log, and an empty result set is returned. No connection to
    an actual graph database is made.

    Attributes:
        log_formatting: Formatter used to produce log entries for each query
            (inherited from GraphStore).
    """
    def execute_query(self, cypher, parameters=None, correlation_id=None):
        """
        Log the given Cypher query and return an empty result.

        Args:
            cypher: The Cypher query that would be executed.
            parameters: Optional dictionary of query parameters. Treated as
                an empty dictionary when omitted.
            correlation_id: Optional identifier for correlating log entries.

        Returns:
            list: Always an empty list; no query is actually executed.
        """
        # None sentinel instead of a mutable default argument ({}), which
        # would be a single dict shared across all calls of the method.
        if parameters is None:
            parameters = {}
        log_entry_parameters = self.log_formatting.format_log_entry(self._logging_prefix(correlation_id), cypher,
                                                                    parameters)
        logger.debug(
            f'[{log_entry_parameters.query_ref}] query: {log_entry_parameters.query}, parameters: {log_entry_parameters.parameters}')
        return []
85 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/graph/graph_store_factory_method.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import abc
5 |
6 | from graphrag_toolkit.lexical_graph.storage.graph.graph_store import GraphStore
7 |
class GraphStoreFactoryMethod(abc.ABC):
    """
    Abstract factory for creating GraphStore instances.

    Concrete subclasses inspect the supplied graph connection info and either
    return a GraphStore instance they know how to build, or None to indicate
    that another factory should be tried. Inheriting from ``abc.ABC`` makes
    the ``@abc.abstractmethod`` decorator effective: the base class cannot be
    instantiated without overriding ``try_create``.
    """
    @abc.abstractmethod
    def try_create(self, graph_info:str, **kwargs) -> GraphStore:
        """
        Attempt to create a GraphStore from the supplied connection info.

        Args:
            graph_info (str): Connection information describing the graph
                store to create.
            **kwargs: Arbitrary keyword arguments that might be required to
                configure the creation process.

        Returns:
            GraphStore: A store instance when this factory recognises
            ``graph_info``, otherwise None.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError
37 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .vector_index import VectorIndex, to_embedded_query
5 | from .vector_index_factory_method import VectorIndexFactoryMethod
6 | from .vector_store import VectorStore
7 | from .multi_tenant_vector_store import MultiTenantVectorStore
8 | from .read_only_vector_store import ReadOnlyVectorStore
9 | from .dummy_vector_index import DummyVectorIndex
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/multi_tenant_vector_store.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | from typing import List
4 | from graphrag_toolkit.lexical_graph import TenantId
5 | from graphrag_toolkit.lexical_graph.storage.vector import VectorStore, VectorIndex
6 |
7 |
class MultiTenantVectorStore(VectorStore):
    """Tenant-aware wrapper around a VectorStore.

    Delegates all index access to the wrapped store while stamping every
    index returned with the tenant identifier, so that downstream
    operations run in the correct tenant context.

    Attributes:
        inner (VectorStore): The wrapped vector store.
        tenant_id (TenantId): Tenant identifier applied to each index
            retrieved through this wrapper.
    """
    @classmethod
    def wrap(cls, vector_store:VectorStore, tenant_id:TenantId):
        """
        Wrap ``vector_store`` for the given tenant, unless already wrapped.

        Args:
            vector_store: The vector store to wrap.
            tenant_id: Tenant identifier to associate with the wrapper.

        Returns:
            The store unchanged when it is already a MultiTenantVectorStore,
            otherwise a new MultiTenantVectorStore around it.
        """

        already_wrapped = isinstance(vector_store, MultiTenantVectorStore)
        if already_wrapped:
            return vector_store
        return MultiTenantVectorStore(inner=vector_store, tenant_id=tenant_id)

    inner:VectorStore
    tenant_id:TenantId

    def get_index(self, index_name):
        """
        Fetch an index from the wrapped store and stamp it with the tenant id.

        Args:
            index_name: Name of the index to retrieve.

        Returns:
            The index with its ``tenant_id`` attribute set to this wrapper's
            tenant.
        """
        tenant_index = self.inner.get_index(index_name=index_name)
        tenant_index.tenant_id = self.tenant_id
        return tenant_index

    def all_indexes(self) -> List[VectorIndex]:
        """
        Return every index of the wrapped store, each stamped with the tenant id.

        Returns:
            List[VectorIndex]: One tenant-stamped index per entry in the
            wrapped store's ``indexes`` mapping.
        """
        return [self.get_index(index_name) for index_name in self.inner.indexes.keys()]
72 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/opensearch_vector_index_factory.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import logging
5 | from typing import List
6 |
7 | from graphrag_toolkit.lexical_graph.storage.vector import VectorIndex, VectorIndexFactoryMethod, to_embedded_query
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 | OPENSEARCH_SERVERLESS = 'aoss://'
12 | OPENSEARCH_SERVERLESS_DNS = 'aoss.amazonaws.com'
13 |
class OpenSearchVectorIndexFactory(VectorIndexFactoryMethod):
    """Factory class for creating OpenSearch vector indexes.

    Recognises OpenSearch Serverless connection info — either an
    ``aoss://`` prefixed value or an ``https://`` endpoint under the
    ``aoss.amazonaws.com`` domain — and builds one OpenSearch vector index
    per requested index name. Declines (returns None) for any other
    connection info so that other factories can be tried.
    """
    def try_create(self, index_names:List[str], vector_index_info:str, **kwargs) -> List[VectorIndex]:
        """
        Attempt to create vector indexes for an OpenSearch endpoint.

        Args:
            index_names (List[str]): List of index names to create vector
                indexes for.
            vector_index_info (str): Connection info identifying the endpoint,
                e.g. an ``aoss://`` value or an OpenSearch Serverless URL.
            **kwargs: Additional keyword arguments passed through when
                creating each vector index.

        Returns:
            List[VectorIndex]: One vector index per requested name when the
            connection info is recognised, otherwise None.

        Raises:
            ImportError: If the OpenSearch-specific module required for
                creating the indexes cannot be imported.
        """
        endpoint = None
        if vector_index_info.startswith(OPENSEARCH_SERVERLESS):
            endpoint = vector_index_info[len(OPENSEARCH_SERVERLESS):]
        elif vector_index_info.startswith('https://') and vector_index_info.endswith(OPENSEARCH_SERVERLESS_DNS):
            endpoint = vector_index_info

        if not endpoint:
            return None

        # Imported lazily so the OpenSearch dependency is only required when
        # an OpenSearch endpoint is actually configured; an ImportError
        # propagates to the caller unchanged (the previous
        # `except ImportError as e: raise e` was a no-op).
        from graphrag_toolkit.lexical_graph.storage.vector.opensearch_vector_indexes import OpenSearchIndex
        logger.debug(f'Opening OpenSearch vector indexes [index_names: {index_names}, endpoint: {endpoint}]')
        return [OpenSearchIndex.for_index(index_name, endpoint, **kwargs) for index_name in index_names]
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/pg_vector_index_factory.py:
--------------------------------------------------------------------------------
1 | # Copyright FalkorDB.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import logging
5 | from typing import List
6 |
7 | from graphrag_toolkit.lexical_graph.storage.vector import VectorIndex, VectorIndexFactoryMethod
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 | POSTGRES = 'postgres://'
12 | POSTGRESQL = 'postgresql://'
13 |
class PGVectorIndexFactory(VectorIndexFactoryMethod):
    """Factory class for creating PostgreSQL (pgvector) vector indexes.

    Recognises ``postgres://`` and ``postgresql://`` connection strings and
    builds one PGIndex per requested index name. Declines (returns None) for
    any other connection info so that other factories can be tried.
    """
    def try_create(self, index_names:List[str], vector_index_info:str, **kwargs) -> List[VectorIndex]:
        """
        Attempt to create vector indexes for a PostgreSQL connection string.

        Args:
            index_names (List[str]): A list of index names to be used when
                creating vector indexes.
            vector_index_info (str): Connection info; must be a PostgreSQL
                connection string for this factory to apply.
            **kwargs: Additional arguments passed to the underlying index
                creation utility.

        Returns:
            List[VectorIndex]: One vector index per requested name when the
            connection string is recognised, otherwise None.

        Raises:
            ImportError: If the PostgreSQL-specific module required for
                creating the indexes cannot be imported.
        """
        if not vector_index_info.startswith((POSTGRES, POSTGRESQL)):
            return None

        connection_string = vector_index_info
        logger.debug(f'Opening PostgreSQL vector indexes [index_names: {index_names}, connection_string: {connection_string}]')
        # Imported lazily so the PostgreSQL dependency is only required when a
        # Postgres connection string is actually configured; an ImportError
        # propagates to the caller unchanged (the previous
        # `except ImportError as e: raise e` was a no-op).
        from graphrag_toolkit.lexical_graph.storage.vector.pg_vector_indexes import PGIndex
        return [PGIndex.for_index(index_name, connection_string, **kwargs) for index_name in index_names]
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/read_only_vector_store.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | from typing import List
4 | from graphrag_toolkit.lexical_graph import TenantId
5 | from graphrag_toolkit.lexical_graph.storage.vector import VectorStore, VectorIndex
6 |
7 |
class ReadOnlyVectorStore(VectorStore):
    """
    Read-only view over an existing VectorStore.

    Wraps another vector store and marks every index obtained through it as
    non-writeable, allowing the store to be shared in contexts where
    modifications must be prevented.

    Attributes:
        inner (VectorStore): The wrapped vector store to which all read
            operations are delegated.
    """
    @classmethod
    def wrap(cls, vector_store:VectorStore):
        """
        Return a read-only wrapper around ``vector_store``.

        If the store is already a ReadOnlyVectorStore it is returned
        unchanged; otherwise it is wrapped in a new read-only view that
        preserves the original functionality.

        Args:
            vector_store: The vector store instance to protect.

        Returns:
            ReadOnlyVectorStore: A read-only view of the supplied store.
        """
        if isinstance(vector_store, ReadOnlyVectorStore):
            return vector_store
        return ReadOnlyVectorStore(inner=vector_store)

    inner:VectorStore

    def get_index(self, index_name):
        """
        Retrieve an index from the wrapped store with writes disabled.

        Args:
            index_name: The name of the index to retrieve.

        Returns:
            The index with its ``writeable`` attribute set to False.
            Any error raised by the wrapped store's lookup propagates
            to the caller.
        """
        read_only_index = self.inner.get_index(index_name=index_name)
        read_only_index.writeable = False
        return read_only_index

    def all_indexes(self) -> List[VectorIndex]:
        """
        Return every index of the wrapped store in read-only form.

        Returns:
            List[VectorIndex]: One non-writeable index per entry in the
            wrapped store's ``indexes`` mapping.
        """
        return [self.get_index(index_name) for index_name in self.inner.indexes.keys()]
79 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/vector_index_factory_method.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import abc
5 | from typing import List
6 |
7 | from graphrag_toolkit.lexical_graph.storage.vector.vector_index import VectorIndex
8 |
class VectorIndexFactoryMethod(abc.ABC):
    """
    Abstract factory for creating vector indexes.

    Subclasses implement `try_create` with provider-specific creation logic
    (e.g. recognising a connection string, validating input, applying extra
    configuration) and return valid `VectorIndex` instances.

    Inheriting from `abc.ABC` makes `@abc.abstractmethod` actually enforced:
    instantiating a subclass that has not implemented `try_create` fails at
    construction time instead of only when the method is called.
    """
    @abc.abstractmethod
    def try_create(self, index_names:List[str], vector_index_info:str, **kwargs) -> List[VectorIndex]:
        """
        Attempt to create vector indexes for the given names.

        Args:
            index_names: Names of the indexes to create.
            vector_index_info: Provider-specific configuration (typically a
                connection string) used to decide whether this factory
                handles the request and how to create the indexes.
            **kwargs: Additional provider-specific options passed through to
                the index constructors.

        Returns:
            List[VectorIndex]: The created indexes; implementations may
            return None when `vector_index_info` is not recognised by this
            factory (see the PostgreSQL factory for an example).

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/vector_store.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import logging
5 | from typing import Dict, Optional, List
6 |
7 | from graphrag_toolkit.lexical_graph.storage.constants import ALL_EMBEDDING_INDEXES
8 | from graphrag_toolkit.lexical_graph.storage.vector.vector_index import VectorIndex
9 | from graphrag_toolkit.lexical_graph.storage.vector.dummy_vector_index import DummyVectorIndex
10 |
11 | from llama_index.core.bridge.pydantic import BaseModel, Field
12 |
13 | logger = logging.getLogger(__name__)
14 |
class VectorStore(BaseModel):
    """
    Container for a named collection of vector indexes.

    Supports lookup of a single index by name — validated against the set of
    known embedding index names — and enumeration of every registered index.
    A valid-but-unregistered name resolves to a dummy index rather than an
    error, so callers can treat missing indexes as no-ops.

    Attributes:
        indexes (Optional[Dict[str, VectorIndex]]): Mapping of index name to
            its `VectorIndex`. Defaults to an empty dictionary.
    """
    indexes:Optional[Dict[str, VectorIndex]] = Field(description='Vector indexes', default_factory=dict)

    def get_index(self, index_name):
        """
        Look up the vector index registered under the given name.

        Args:
            index_name: The name of the index to retrieve. Must be one of
                the entries in `ALL_EMBEDDING_INDEXES`.

        Returns:
            Union[VectorIndex, DummyVectorIndex]: The registered index, or a
            `DummyVectorIndex` configured with `index_name` when no index is
            registered under that name.

        Raises:
            ValueError: If `index_name` is not listed in
                `ALL_EMBEDDING_INDEXES`.
        """
        if index_name not in ALL_EMBEDDING_INDEXES:
            raise ValueError(f'Invalid index name ({index_name}): must be one of {ALL_EMBEDDING_INDEXES}')
        try:
            return self.indexes[index_name]
        except KeyError:
            # A valid but unregistered name deliberately yields a no-op
            # stand-in rather than an error.
            logger.debug(f"Returning dummy index for '{index_name}'")
            return DummyVectorIndex(index_name=index_name)

    def all_indexes(self) -> List[VectorIndex]:
        """
        Return every vector index currently registered in this store.

        Returns:
            List[VectorIndex]: All registered indexes, in insertion order.
        """
        return [*self.indexes.values()]
71 |
72 |
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | from .fm_observability import FMObservabilityPublisher, ConsoleFMObservabilitySubscriber
5 | from .llm_cache import LLMCache, LLMCacheType
--------------------------------------------------------------------------------
/lexical-graph/src/graphrag_toolkit/lexical_graph/utils/bedrock_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 |
4 | import logging
5 | import llama_index.llms.bedrock_converse.utils
6 | from typing import Any, Callable
7 |
8 | from tenacity import (
9 | before_sleep_log,
10 | retry,
11 | retry_if_exception_type,
12 | stop_after_attempt,
13 | wait_random_exponential,
14 | )
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 |
19 |
def _create_retry_decorator(client: Any, max_retries: int) -> Callable[[Any], Any]:
    """
    Build a tenacity retry decorator with exponential backoff for a client.

    The returned decorator retries on the client's throttling, model-timeout
    and model-error exceptions, waiting a randomised exponential interval
    between attempts, logging a warning before each sleep, and re-raising
    the final exception once the attempt budget is exhausted.

    Args:
        client: A client object exposing `exceptions.ThrottlingException`,
            `exceptions.ModelTimeoutException` and
            `exceptions.ModelErrorException`.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        A callable retry decorator with the configured retry policy.

    Raises:
        ImportError: If boto3 is not installed.
    """
    # Fail fast if boto3 is unavailable rather than at first decorated call.
    try:
        import boto3  # noqa
    except ImportError as e:
        raise ImportError(
            "boto3 package not found, install with 'pip install boto3'"
        ) from e

    # Wait 2^x * 1 second between each retry starting with
    # 4 seconds, then up to 30 seconds, then 30 seconds afterwards
    min_wait_seconds = 4
    max_wait_seconds = 30

    retryable_exceptions = (
        retry_if_exception_type(client.exceptions.ThrottlingException)
        | retry_if_exception_type(client.exceptions.ModelTimeoutException)
        | retry_if_exception_type(client.exceptions.ModelErrorException)
    )

    return retry(
        reraise=True,
        stop=stop_after_attempt(max_retries),
        wait=wait_random_exponential(multiplier=1, min=min_wait_seconds, max=max_wait_seconds),
        retry=retryable_exceptions,
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )
60 |
# Monkeypatch llama-index's Bedrock Converse retry factory with the version
# defined above, so the retry policy here (retried exception types, backoff,
# pre-sleep warning logging) applies to Bedrock Converse calls made through
# llama-index. Importing this module is what activates the patch.
llama_index.llms.bedrock_converse.utils._create_retry_decorator = _create_retry_decorator
62 |
--------------------------------------------------------------------------------
/security.md:
--------------------------------------------------------------------------------
1 | # Security issue notifications
If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.
3 |
4 |
--------------------------------------------------------------------------------