├── tests ├── unit │ ├── config │ │ ├── prompt-a.txt │ │ ├── prompt-b.txt │ │ ├── prompt-c.txt │ │ ├── prompt-d.txt │ │ └── __init__.py │ ├── __init__.py │ └── indexing │ │ ├── __init__.py │ │ ├── cache │ │ └── __init__.py │ │ ├── config │ │ ├── __init__.py │ │ ├── default_config_with_overridden_workflows.yml │ │ ├── default_config_with_overridden_input.yml │ │ └── default_config_with_everything_overridden.yml │ │ ├── graph │ │ ├── __init__.py │ │ ├── utils │ │ │ └── __init__.py │ │ └── extractors │ │ │ ├── __init__.py │ │ │ └── community_reports │ │ │ └── __init__.py │ │ ├── verbs │ │ ├── __init__.py │ │ ├── text │ │ │ └── __init__.py │ │ ├── entities │ │ │ ├── __init__.py │ │ │ └── extraction │ │ │ │ ├── __init__.py │ │ │ │ └── strategies │ │ │ │ ├── __init__.py │ │ │ │ └── graph_intelligence │ │ │ │ └── __init__.py │ │ └── helpers │ │ │ ├── __init__.py │ │ │ └── mock_llm.py │ │ ├── storage │ │ └── __init__.py │ │ ├── workflows │ │ ├── __init__.py │ │ └── helpers.py │ │ ├── test_exports.py │ │ └── test_init_content.py ├── __init__.py ├── smoke │ └── __init__.py ├── notebook │ ├── __init__.py │ └── test_notebooks.py ├── integration │ ├── __init__.py │ └── _pipeline │ │ └── __init__.py ├── fixtures │ ├── azure │ │ ├── input │ │ │ └── ABOUT.md │ │ ├── config.json │ │ └── settings.yml │ ├── text │ │ ├── input │ │ │ └── ABOUT.md │ │ └── settings.yml │ └── min-csv │ │ ├── input │ │ └── ABOUT.md │ │ └── settings.yml └── conftest.py ├── examples ├── single_verb │ ├── input │ │ └── data.csv │ ├── __init__.py │ └── pipeline.yml ├── multiple_workflows │ ├── workflows │ │ ├── shared │ │ │ └── shared_fill_value.txt │ │ ├── workflow_1.yml │ │ ├── workflow_3.yml │ │ └── workflow_2.yml │ ├── __init__.py │ └── pipeline.yml ├── __init__.py ├── custom_input │ ├── __init__.py │ ├── pipeline.yml │ └── run.py ├── entity_extraction │ ├── __init__.py │ ├── with_nltk │ │ ├── __init__.py │ │ └── pipeline.yml │ └── with_graph_intelligence │ │ ├── __init__.py │ │ └── pipeline.yml ├── 
use_built_in_workflows │ ├── __init__.py │ └── pipeline.yml ├── interdependent_workflows │ ├── __init__.py │ └── pipeline.yml ├── various_levels_of_configs │ └── __init__.py ├── custom_set_of_available_verbs │ ├── __init__.py │ ├── pipeline.yml │ └── custom_verb_definitions.py ├── custom_set_of_available_workflows │ ├── __init__.py │ ├── pipeline.yml │ └── custom_workflow_definitions.py └── README.md ├── docsite ├── .eleventyignore ├── nbdocsite_template │ └── conf.json ├── .gitignore ├── img │ ├── GraphRag-Figure1.jpg │ └── pipeline-running.png ├── .yarnrc.yml ├── .yarn │ └── sdks │ │ └── integrations.yml ├── data │ └── operation_dulce │ │ └── ABOUT.md ├── posts │ ├── query │ │ └── notebooks │ │ │ └── overview.md │ └── config │ │ └── overview.md ├── package.json └── .eleventy.js ├── scripts ├── spellcheck.sh ├── start-azurite.sh ├── e2e-test.sh └── semver-check.sh ├── .semversioner ├── next-release │ ├── patch-20240701233152787373.json │ ├── patch-20240703182750529114.json │ ├── patch-20240704181236015699.json │ ├── patch-20240705184142723331.json │ └── patch-20240703152422358587.json └── 0.1.0.json ├── graphrag ├── __init__.py ├── index │ ├── py.typed │ ├── graph │ │ ├── __init__.py │ │ ├── embedding │ │ │ ├── __init__.py │ │ │ └── embedding.py │ │ ├── extractors │ │ │ ├── claims │ │ │ │ └── __init__.py │ │ │ ├── summarize │ │ │ │ ├── __init__.py │ │ │ │ └── prompts.py │ │ │ ├── graph │ │ │ │ └── __init__.py │ │ │ ├── __init__.py │ │ │ └── community_reports │ │ │ │ └── __init__.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── normalize_node_names.py │ │ └── visualization │ │ │ ├── __init__.py │ │ │ └── typing.py │ ├── verbs │ │ ├── graph │ │ │ ├── layout │ │ │ │ ├── methods │ │ │ │ │ └── __init__.py │ │ │ │ └── __init__.py │ │ │ ├── embed │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── node_2_vec.py │ │ │ │ ├── __init__.py │ │ │ │ └── typing.py │ │ │ ├── clustering │ │ │ │ ├── strategies │ │ │ │ │ └── __init__.py │ │ │ │ ├── typing.py │ │ │ │ └── 
__init__.py │ │ │ ├── report │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── graph_intelligence │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── defaults.py │ │ │ │ │ └── typing.py │ │ │ │ └── __init__.py │ │ │ ├── merge │ │ │ │ ├── __init__.py │ │ │ │ ├── defaults.py │ │ │ │ └── typing.py │ │ │ └── __init__.py │ │ ├── text │ │ │ ├── embed │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── typing.py │ │ │ │ │ └── mock.py │ │ │ │ └── __init__.py │ │ │ ├── chunk │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── typing.py │ │ │ │ │ └── sentence.py │ │ │ │ ├── __init__.py │ │ │ │ └── typing.py │ │ │ ├── replace │ │ │ │ ├── __init__.py │ │ │ │ └── typing.py │ │ │ ├── translate │ │ │ │ ├── __init__.py │ │ │ │ └── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── defaults.py │ │ │ │ │ ├── typing.py │ │ │ │ │ └── mock.py │ │ │ └── __init__.py │ │ ├── entities │ │ │ ├── extraction │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── graph_intelligence │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── defaults.py │ │ │ │ │ └── typing.py │ │ │ │ └── __init__.py │ │ │ ├── summarize │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── graph_intelligence │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── defaults.py │ │ │ │ │ └── typing.py │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ ├── covariates │ │ │ ├── extract_covariates │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── graph_intelligence │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── defaults.py │ │ │ │ └── __init__.py │ │ │ ├── __init__.py │ │ │ └── typing.py │ │ ├── overrides │ │ │ ├── __init__.py │ │ │ └── concat.py │ │ ├── unzip.py │ │ ├── snapshot.py │ │ └── __init__.py │ ├── workflows │ │ ├── v1 │ │ │ ├── __init__.py │ │ │ ├── create_final_documents.py │ │ │ └── join_text_units_to_covariate_ids.py │ │ ├── __init__.py │ │ └── typing.py │ ├── input │ │ └── __init__.py │ ├── progress │ │ └── __init__.py │ ├── emit │ │ ├── types.py │ │ ├── table_emitter.py │ │ 
├── __init__.py │ │ ├── csv_table_emitter.py │ │ └── json_table_emitter.py │ ├── llm │ │ ├── types.py │ │ └── __init__.py │ ├── utils │ │ ├── topological_sort.py │ │ ├── load_graph.py │ │ ├── uuid.py │ │ ├── hashing.py │ │ ├── is_null.py │ │ ├── dicts.py │ │ ├── string.py │ │ ├── json.py │ │ ├── __init__.py │ │ ├── ds_util.py │ │ ├── tokens.py │ │ └── rate_limiter.py │ ├── text_splitting │ │ ├── check_token_limit.py │ │ └── __init__.py │ ├── cache │ │ └── __init__.py │ ├── typing.py │ ├── storage │ │ ├── __init__.py │ │ └── load_storage.py │ ├── reporting │ │ ├── __init__.py │ │ └── console_workflow_callbacks.py │ ├── errors.py │ ├── bootstrap.py │ ├── config │ │ └── workflow.py │ └── context.py ├── query │ ├── __init__.py │ ├── input │ │ ├── __init__.py │ │ ├── loaders │ │ │ └── __init__.py │ │ └── retrieval │ │ │ └── __init__.py │ ├── llm │ │ ├── __init__.py │ │ ├── oai │ │ │ ├── typing.py │ │ │ └── __init__.py │ │ └── text_utils.py │ ├── question_gen │ │ ├── __init__.py │ │ └── system_prompt.py │ ├── structured_search │ │ ├── __init__.py │ │ ├── global_search │ │ │ ├── __init__.py │ │ │ └── callbacks.py │ │ └── local_search │ │ │ └── __init__.py │ ├── context_builder │ │ ├── __init__.py │ │ └── builders.py │ └── progress.py ├── prompt_tune │ ├── __init__.py │ ├── generator │ │ ├── defaults.py │ │ ├── domain.py │ │ ├── persona.py │ │ ├── __init__.py │ │ ├── entity_summarization_prompt.py │ │ └── community_reporter_role.py │ ├── loader │ │ └── __init__.py │ ├── prompt │ │ ├── domain.py │ │ ├── persona.py │ │ ├── __init__.py │ │ └── community_reporter_role.py │ └── template │ │ ├── __init__.py │ │ └── entity_summarization.py ├── model │ ├── types.py │ ├── named.py │ ├── identified.py │ └── __init__.py ├── llm │ ├── openai │ │ ├── types.py │ │ ├── _json.py │ │ ├── __init__.py │ │ ├── json_parsing_llm.py │ │ ├── openai_history_tracking_llm.py │ │ ├── openai_token_replacing_llm.py │ │ ├── openai_embeddings_llm.py │ │ └── openai_completion_llm.py │ ├── mock │ │ ├── 
__init__.py │ │ └── mock_completion_llm.py │ ├── base │ │ ├── __init__.py │ │ └── _create_cache_key.py │ ├── errors.py │ ├── types │ │ ├── llm_types.py │ │ ├── llm_cache.py │ │ ├── llm.py │ │ ├── llm_callbacks.py │ │ ├── llm_invocation_result.py │ │ ├── llm_config.py │ │ └── __init__.py │ └── limiting │ │ ├── __init__.py │ │ ├── llm_limiter.py │ │ ├── noop_llm_limiter.py │ │ ├── create_limiters.py │ │ ├── composite_limiter.py │ │ └── tpm_rpm_limiter.py ├── config │ ├── input_models │ │ ├── umap_config_input.py │ │ ├── parallelization_parameters_input.py │ │ ├── cluster_graph_config_input.py │ │ ├── snapshots_config_input.py │ │ ├── chunking_config_input.py │ │ ├── summarize_descriptions_config_input.py │ │ ├── community_reports_config_input.py │ │ ├── entity_extraction_config_input.py │ │ ├── global_search_config_input.py │ │ ├── claim_extraction_config_input.py │ │ ├── cache_config_input.py │ │ ├── storage_config_input.py │ │ ├── reporting_config_input.py │ │ ├── llm_config_input.py │ │ ├── embed_graph_config_input.py │ │ ├── local_search_config_input.py │ │ ├── text_embedding_config_input.py │ │ ├── input_config_input.py │ │ └── llm_parameters_input.py │ ├── models │ │ ├── umap_config.py │ │ ├── parallelization_parameters.py │ │ ├── snapshots_config.py │ │ ├── llm_config.py │ │ ├── cluster_graph_config.py │ │ ├── cache_config.py │ │ ├── storage_config.py │ │ ├── reporting_config.py │ │ ├── chunking_config.py │ │ ├── local_search_config.py │ │ └── global_search_config.py │ └── read_dotenv.py └── vector_stores │ └── __init__.py ├── Screenshot 2024-07-09 at 3.36.28 AM.png ├── Screenshot 2024-07-09 at 3.34.31 AM-1.png ├── examples_notebooks └── inputs │ └── operation dulce │ ├── create_final_nodes.parquet │ ├── create_final_entities.parquet │ ├── create_final_covariates.parquet │ ├── create_final_text_units.parquet │ ├── create_final_relationships.parquet │ ├── ABOUT.md │ └── create_final_community_reports.parquet ├── .vscode ├── launch.json └── extensions.json ├── 
CODEOWNERS ├── .github ├── workflows │ ├── spellcheck.yml │ ├── semver.yml │ ├── javascript-ci.yml │ └── python-publish.yml ├── dependabot.yml └── pull_request_template.md ├── cspell.config.yaml ├── requirements.txt ├── .vsts-ci.yml ├── LICENSE └── .gitignore /tests/unit/config/prompt-a.txt: -------------------------------------------------------------------------------- 1 | Hello, World! A -------------------------------------------------------------------------------- /tests/unit/config/prompt-b.txt: -------------------------------------------------------------------------------- 1 | Hello, World! B -------------------------------------------------------------------------------- /tests/unit/config/prompt-c.txt: -------------------------------------------------------------------------------- 1 | Hello, World! C -------------------------------------------------------------------------------- /tests/unit/config/prompt-d.txt: -------------------------------------------------------------------------------- 1 | Hello, World! 
D -------------------------------------------------------------------------------- /examples/single_verb/input/data.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | 2,4 3 | 5,10 -------------------------------------------------------------------------------- /docsite/.eleventyignore: -------------------------------------------------------------------------------- 1 | !posts/index/verbs/*.md 2 | !posts/index/workflows/*.md 3 | -------------------------------------------------------------------------------- /examples/multiple_workflows/workflows/shared/shared_fill_value.txt: -------------------------------------------------------------------------------- 1 | value_from_shared_file -------------------------------------------------------------------------------- /scripts/spellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | npx --yes cspell -c cspell.config.yaml --no-progress lint . -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /scripts/start-azurite.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | npx --yes azurite -L -l ./temp_azurite -d ./temp_azurite/debug.log -------------------------------------------------------------------------------- /tests/smoke/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /docsite/nbdocsite_template/conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "mimetypes": { 3 | "text/markdown": true 4 | } 5 | } -------------------------------------------------------------------------------- /tests/notebook/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/custom_input/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/single_verb/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/entity_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/cache/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /docsite/.gitignore: -------------------------------------------------------------------------------- 1 | _site 2 | _posts 3 | posts/query/notebooks/*.ipynb 4 | posts/query/notebooks/*_nb.md 5 | *.parquet 6 | *.zip -------------------------------------------------------------------------------- /examples/multiple_workflows/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/use_built_in_workflows/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/integration/_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/graph/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/storage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/text/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/entity_extraction/with_nltk/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/interdependent_workflows/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/various_levels_of_configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/graph/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /.semversioner/next-release/patch-20240701233152787373.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "patch", 3 | "description": "Fix docsite base url" 4 | } 5 | -------------------------------------------------------------------------------- /docsite/img/GraphRag-Figure1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/docsite/img/GraphRag-Figure1.jpg -------------------------------------------------------------------------------- /docsite/img/pipeline-running.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/docsite/img/pipeline-running.png -------------------------------------------------------------------------------- /examples/custom_set_of_available_verbs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/custom_set_of_available_workflows/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /graphrag/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The GraphRAG package.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/py.typed: -------------------------------------------------------------------------------- 1 | # This package supports type hinting, 2 | # see https://www.python.org/dev/peps/pep-0561/#packaging-type-information -------------------------------------------------------------------------------- /scripts/e2e-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Use CLI Form 4 | poetry run python -m graphrag.index --config ./examples/single_verb/pipeline.yml -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/entity_extraction/with_graph_intelligence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /Screenshot 2024-07-09 at 3.36.28 AM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/Screenshot 2024-07-09 at 3.36.28 AM.png -------------------------------------------------------------------------------- /tests/unit/indexing/graph/extractors/community_reports/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/extraction/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /.semversioner/next-release/patch-20240703182750529114.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "patch", 3 | "description": "Fix broken prompt tuning link on docs" 4 | } 5 | -------------------------------------------------------------------------------- /Screenshot 2024-07-09 at 3.34.31 AM-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/Screenshot 2024-07-09 at 3.34.31 AM-1.png -------------------------------------------------------------------------------- /docsite/.yarnrc.yml: -------------------------------------------------------------------------------- 1 | compressionLevel: mixed 2 | 3 | enableGlobalCache: false 4 | 5 | nodeLinker: pnp 6 | 7 | yarnPath: .yarn/releases/yarn-4.0.2.cjs 8 | -------------------------------------------------------------------------------- /graphrag/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestration Module.""" 5 | -------------------------------------------------------------------------------- /.semversioner/next-release/patch-20240704181236015699.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "patch", 3 | "description": "Fix for --limit exceeding the dataframe lenght" 4 | } 5 | -------------------------------------------------------------------------------- /examples/custom_set_of_available_workflows/pipeline.yml: -------------------------------------------------------------------------------- 1 | workflows: 2 | - name: my_workflow 3 | config: 4 | derive_output_column: "col_1_multiplied" 5 | -------------------------------------------------------------------------------- /graphrag/query/input/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestration Inputs.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/llm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Orchestration LLM utilities.""" 5 | -------------------------------------------------------------------------------- /examples/multiple_workflows/workflows/workflow_1.yml: -------------------------------------------------------------------------------- 1 | name: workflow_1 2 | steps: 3 | - verb: fill 4 | args: 5 | to: "col_workflow_1" 6 | value: 1 7 | -------------------------------------------------------------------------------- /examples/multiple_workflows/workflows/workflow_3.yml: -------------------------------------------------------------------------------- 1 | name: workflow_3 2 | steps: 3 | - verb: fill 4 | args: 5 | to: "col_workflow_3" 6 | value: 3 7 | -------------------------------------------------------------------------------- /graphrag/query/question_gen/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Question Generation Module.""" 5 | -------------------------------------------------------------------------------- /.semversioner/next-release/patch-20240705184142723331.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "patch", 3 | "description": "Add Minute-based Rate Limiting and fix rpm, tpm settings" 4 | } 5 | -------------------------------------------------------------------------------- /docsite/.yarn/sdks/integrations.yml: -------------------------------------------------------------------------------- 1 | # This file is automatically generated by @yarnpkg/sdks. 2 | # Manual changes might be lost! 3 | 4 | integrations: 5 | - vscode 6 | -------------------------------------------------------------------------------- /graphrag/index/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Structured Search package.""" 5 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/extraction/strategies/graph_intelligence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/multiple_workflows/pipeline.yml: -------------------------------------------------------------------------------- 1 | workflows: 2 | - !include workflows/workflow_1.yml 3 | - !include workflows/workflow_2.yml 4 | - !include workflows/workflow_3.yml -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/layout/methods/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Graph Layout Methods.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/input/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestartion Input Loaders.""" 5 | -------------------------------------------------------------------------------- /.semversioner/next-release/patch-20240703152422358587.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "patch", 3 | "description": "Add cli flag to overlay default values onto a provided config." 4 | } 5 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/embed/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Text Embedding strategies.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/workflows/v1/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine workflows package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Command line interface for the fine_tune module.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/input/retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestration Input Retrieval.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/global_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GlobalSearch module.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/local_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The LocalSearch package.""" 5 | -------------------------------------------------------------------------------- /tests/fixtures/azure/input/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) in an AI-generated science fiction novella, included here for the purposes of integration testing. -------------------------------------------------------------------------------- /tests/fixtures/text/input/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) in an AI-generated science fiction novella, included here for the purposes of integration testing. 
-------------------------------------------------------------------------------- /tests/unit/indexing/config/default_config_with_overridden_workflows.yml: -------------------------------------------------------------------------------- 1 | extends: default 2 | 3 | workflows: 4 | - name: TEST_WORKFLOW 5 | steps: 6 | - verb: TEST_VERB 7 | -------------------------------------------------------------------------------- /docsite/data/operation_dulce/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of integration testing. -------------------------------------------------------------------------------- /tests/fixtures/min-csv/input/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) in an AI-generated science fiction novella, included here for the purposes of integration testing. -------------------------------------------------------------------------------- /tests/unit/indexing/config/default_config_with_overridden_input.yml: -------------------------------------------------------------------------------- 1 | extends: default 2 | input: 3 | file_type: text 4 | base_dir: /some/overridden/dir 5 | file_pattern: test.txt 6 | -------------------------------------------------------------------------------- /examples/entity_extraction/with_nltk/pipeline.yml: -------------------------------------------------------------------------------- 1 | workflows: 2 | - name: "entity_extraction" 3 | config: 4 | entity_extract: 5 | strategy: 6 | type: "nltk" -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/clustering/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Graph Clustering Strategies.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/embed/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine embed strategies package root.""" 5 | -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/create_final_nodes.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/examples_notebooks/inputs/operation dulce/create_final_nodes.parquet -------------------------------------------------------------------------------- /graphrag/index/verbs/text/chunk/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text chunk strategies package root.""" 5 | -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/create_final_entities.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/examples_notebooks/inputs/operation dulce/create_final_entities.parquet -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/report/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph report strategies package root.""" 5 | -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/create_final_covariates.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/examples_notebooks/inputs/operation dulce/create_final_covariates.parquet -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/create_final_text_units.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/examples_notebooks/inputs/operation dulce/create_final_text_units.parquet -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/create_final_relationships.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/examples_notebooks/inputs/operation dulce/create_final_relationships.parquet -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/extraction/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine entities extraction strategies package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/context_builder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Functions to build context for system prompt to generate responses for a user query.""" 5 | -------------------------------------------------------------------------------- /.semversioner/0.1.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Initial Release", 5 | "type": "minor" 6 | } 7 | ], 8 | "created_at": "2024-07-01T21:48:50+00:00", 9 | "version": "0.1.0" 10 | } -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of providing a starting point for notebook experimentation. 4 | -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/create_final_community_reports.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/examples_notebooks/inputs/operation dulce/create_final_community_reports.parquet -------------------------------------------------------------------------------- /graphrag/index/verbs/covariates/extract_covariates/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text extract claims strategies package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/input/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine input package root.""" 5 | 6 | from .load_input import load_input 7 | 8 | __all__ = ["load_input"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/clustering/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing Communities list definition.""" 5 | 6 | Communities = list[tuple[int, str, list[str]]] 7 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | def pytest_addoption(parser): 4 | parser.addoption( 5 | "--run_slow", action="store_true", default=False, help="run slow tests" 6 | ) 7 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/replace/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text replace package root.""" 5 | 6 | from .replace import text_replace 7 | 8 | __all__ = ["text_replace"] 9 | -------------------------------------------------------------------------------- /graphrag/model/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Common types for the GraphRAG knowledge model.""" 5 | 6 | from collections.abc import Callable 7 | 8 | TextEmbedder = Callable[[str], list[float]] 9 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Attach to Node Functions", 6 | "type": "node", 7 | "request": "attach", 8 | "port": 9229, 9 | "preLaunchTask": "func: host start" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/layout/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph layout package root.""" 5 | 6 | from .layout_graph import layout_graph 7 | 8 | __all__ = ["layout_graph"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/merge/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph merge package root.""" 5 | 6 | from .merge_graphs import merge_graphs 7 | 8 | __all__ = ["merge_graphs"] 9 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. Unless a later match takes precedence, 3 | # @global-owner1 and @global-owner2 will be requested for 4 | # review when someone opens a pull request. 
5 | * @microsoft/societal-resilience 6 | 7 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/translate/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text translate package root.""" 5 | 6 | from .text_translate import text_translate 7 | 8 | __all__ = ["text_translate"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/covariates/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine covariates package root.""" 5 | 6 | from .extract_covariates import extract_covariates 7 | 8 | __all__ = ["extract_covariates"] 9 | -------------------------------------------------------------------------------- /examples/custom_set_of_available_verbs/pipeline.yml: -------------------------------------------------------------------------------- 1 | workflows: 2 | - steps: 3 | - verb: "str_append" # should be the key that you pass to the custom_verbs dict below 4 | args: 5 | source_column: "col1" 6 | target_column: "col_1_custom" 7 | string_to_append: " - custom verb" -------------------------------------------------------------------------------- /graphrag/index/graph/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph embedding package root.""" 5 | 6 | from .embedding import NodeEmbeddings, embed_nod2vec 7 | 8 | __all__ = ["NodeEmbeddings", "embed_nod2vec"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/summarize/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Indexing Engine - Summarization Strategies Package.""" 5 | 6 | from .typing import SummarizationStrategy 7 | 8 | __all__ = ["SummarizationStrategy"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Entity Resolution graph intelligence package root.""" 5 | 6 | from .run_graph_intelligence import run 7 | 8 | __all__ = ["run"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph intelligence package root.""" 5 | 6 | from .run_graph_intelligence import run_gi 7 | 8 | __all__ = ["run_gi"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/embed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text embed package root.""" 5 | 6 | from .text_embed import TextEmbedStrategyType, text_embed 7 | 8 | __all__ = ["TextEmbedStrategyType", "text_embed"] 9 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "arcanis.vscode-zipfs", 4 | "ms-python.python", 5 | "charliermarsh.ruff", 6 | "ms-python.vscode-pylance", 7 | "bierner.markdown-mermaid", 8 | "streetsidesoftware.code-spell-checker", 9 | "ronnidc.nunjucks" 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/embed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph embed package root.""" 5 | 6 | from .embed_graph import EmbedGraphStrategyType, embed_graph 7 | 8 | __all__ = ["EmbedGraphStrategyType", "embed_graph"] 9 | -------------------------------------------------------------------------------- /graphrag/llm/openai/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A base class for OpenAI-based LLMs.""" 5 | 6 | from openai import ( 7 | AsyncAzureOpenAI, 8 | AsyncOpenAI, 9 | ) 10 | 11 | OpenAIClientTypes = AsyncOpenAI | AsyncAzureOpenAI 12 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/report/strategies/graph_intelligence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph report strategies graph intelligence package root.""" 5 | 6 | from .run_graph_intelligence import run 7 | 8 | __all__ = ["run"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/chunk/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text chunk package root.""" 5 | 6 | from .text_chunk import ChunkStrategy, ChunkStrategyType, chunk 7 | 8 | __all__ = ["ChunkStrategy", "ChunkStrategyType", "chunk"] 9 | -------------------------------------------------------------------------------- /graphrag/index/progress/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Progress-reporting components.""" 5 | 6 | from .types import NullProgressReporter, PrintProgressReporter, ProgressReporter 7 | 8 | __all__ = ["NullProgressReporter", "PrintProgressReporter", "ProgressReporter"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph clustering package root.""" 5 | 6 | from .cluster_graph import GraphCommunityStrategyType, cluster_graph 7 | 8 | __all__ = ["GraphCommunityStrategyType", "cluster_graph"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text extract claims strategies graph intelligence package root.""" 5 | 6 | from .run_gi_extract_claims import run 7 | 8 | __all__ = ["run"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine entities extraction package root.""" 5 | 6 | from .entity_extract import ExtractEntityStrategyType, entity_extract 7 | 8 | __all__ = ["ExtractEntityStrategyType", "entity_extract"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/summarize/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Root package for resolution entities.""" 5 | 6 | from .description_summarize import SummarizeStrategyType, summarize_descriptions 7 | 8 | __all__ = ["SummarizeStrategyType", "summarize_descriptions"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/overrides/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine overrides package root.""" 5 | 6 | from .aggregate import aggregate 7 | from .concat import concat 8 | from .merge import merge 9 | 10 | __all__ = ["aggregate", "concat", "merge"] 11 | -------------------------------------------------------------------------------- /graphrag/llm/mock/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Mock LLM Implementations.""" 5 | 6 | from .mock_chat_llm import MockChatLLM 7 | from .mock_completion_llm import MockCompletionLLM 8 | 9 | __all__ = [ 10 | "MockChatLLM", 11 | "MockCompletionLLM", 12 | ] 13 | -------------------------------------------------------------------------------- /.github/workflows/spellcheck.yml: -------------------------------------------------------------------------------- 1 | name: Spellcheck 2 | on: 3 | push: 4 | branches: [main] 5 | pull_request: 6 | paths: 7 | - '**/*' 8 | jobs: 9 | spellcheck: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Spellcheck 15 | run: ./scripts/spellcheck.sh 16 | -------------------------------------------------------------------------------- /graphrag/index/emit/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Table Emitter Types.""" 5 | 6 | from enum import Enum 7 | 8 | 9 | class TableEmitterType(str, Enum): 10 | """Table Emitter Types.""" 11 | 12 | Json = "json" 13 | Parquet = "parquet" 14 | CSV = "csv" 15 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine entities package root.""" 5 | 6 | from .extraction import entity_extract 7 | from .summarize import summarize_descriptions 8 | 9 | __all__ = ["entity_extract", "summarize_descriptions"] 10 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/translate/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine translate strategies package root.""" 5 | 6 | from .mock import run as run_mock 7 | from .openai import run as run_openai 8 | 9 | __all__ = ["run_mock", "run_openai"] 10 | -------------------------------------------------------------------------------- /graphrag/llm/base/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Base LLM Implementations.""" 5 | 6 | from .base_llm import BaseLLM 7 | from .caching_llm import CachingLLM 8 | from .rate_limiting_llm import RateLimitingLLM 9 | 10 | __all__ = ["BaseLLM", "CachingLLM", "RateLimitingLLM"] 11 | -------------------------------------------------------------------------------- /.github/workflows/semver.yml: -------------------------------------------------------------------------------- 1 | name: Semver Check 2 | on: 3 | pull_request: 4 | branches: [main] 5 | 6 | jobs: 7 | semver: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | with: 12 | fetch-depth: 0 13 | 14 | - name: Check Semver 15 | run: ./scripts/semver-check.sh -------------------------------------------------------------------------------- /graphrag/index/verbs/covariates/extract_covariates/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text extract claims package root.""" 5 | 6 | from .extract_covariates import ExtractClaimsStrategyType, extract_covariates 7 | 8 | __all__ = ["ExtractClaimsStrategyType", "extract_covariates"] 9 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Default values for the fine-tuning module.""" 5 | 6 | DEFAULT_TASK = """ 7 | Identify the relations and structure of the community of interest, specifically within the {domain} domain. 
8 | """ 9 | 10 | MAX_TOKEN_COUNT = 2000 11 | -------------------------------------------------------------------------------- /graphrag/index/graph/extractors/claims/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph extractors claims package root.""" 5 | 6 | from .claim_extractor import ClaimExtractor 7 | from .prompts import CLAIM_EXTRACTION_PROMPT 8 | 9 | __all__ = ["CLAIM_EXTRACTION_PROMPT", "ClaimExtractor"] 10 | -------------------------------------------------------------------------------- /graphrag/index/llm/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing the 'LLMtype' model.""" 5 | 6 | from collections.abc import Callable 7 | from typing import TypeAlias 8 | 9 | TextSplitter: TypeAlias = Callable[[str], list[str]] 10 | TextListSplitter: TypeAlias = Callable[[list[str]], list[str]] 11 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/replace/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'Replacement' model.""" 5 | 6 | from dataclasses import dataclass 7 | 8 | 9 | @dataclass 10 | class Replacement: 11 | """Replacement class definition.""" 12 | 13 | pattern: str 14 | replacement: str 15 | -------------------------------------------------------------------------------- /tests/fixtures/azure/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_path": "./tests/fixtures/azure", 3 | "input_file_type": "text", 4 | "workflow_config": { 5 | "skip_assert": true, 6 | "azure": { 7 | "input_container": "azurefixture", 8 | "input_base_dir": "input" 9 | } 10 | }, 11 | "query_config": [], 12 | "slow": false 13 | } -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/helpers/mock_llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | from graphrag.llm import CompletionLLM, MockChatLLM 4 | 5 | 6 | def create_mock_llm( 7 | responses: list[str], 8 | ) -> CompletionLLM: 9 | """Creates a mock LLM that returns the given responses.""" 10 | return MockChatLLM(responses) 11 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/translate/strategies/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A file containing TRANSLATION_PROMPT value definition.""" 5 | 6 | TRANSLATION_PROMPT = """ 7 | You are a helpful assistant. Translate into {language} the following text, and make sure all of the text is in {language}. 
8 | """.strip() 9 | -------------------------------------------------------------------------------- /graphrag/index/graph/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph utils package root.""" 5 | 6 | from .normalize_node_names import normalize_node_names 7 | from .stable_lcc import stable_largest_connected_component 8 | 9 | __all__ = ["normalize_node_names", "stable_largest_connected_component"] 10 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/embed/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing different lists and dictionaries.""" 5 | 6 | # Use this for now instead of a wrapper 7 | from typing import Any 8 | 9 | NodeList = list[str] 10 | EmbeddingList = list[Any] 11 | NodeEmbeddings = dict[str, list[float]] 12 | """Label -> Embedding""" 13 | -------------------------------------------------------------------------------- /tests/unit/indexing/config/default_config_with_everything_overridden.yml: -------------------------------------------------------------------------------- 1 | extends: default 2 | 3 | input: 4 | file_type: text 5 | base_dir: /some/overridden/dir 6 | file_pattern: test.txt 7 | 8 | storage: 9 | type: file 10 | 11 | cache: 12 | type: file 13 | 14 | reporting: 15 | type: file 16 | 17 | workflows: 18 | - name: TEST_WORKFLOW 19 | steps: 20 | - verb: TEST_VERB 21 | -------------------------------------------------------------------------------- /graphrag/index/utils/topological_sort.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Topological sort utility method.""" 5 | 6 | from graphlib import TopologicalSorter 7 | 8 | 9 | def topological_sort(graph: dict[str, list[str]]) -> list[str]: 10 | """Topological sort.""" 11 | ts = TopologicalSorter(graph) 12 | return list(ts.static_order()) 13 | -------------------------------------------------------------------------------- /graphrag/config/input_models/umap_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | 9 | class UmapConfigInput(TypedDict): 10 | """Configuration section for UMAP.""" 11 | 12 | enabled: NotRequired[bool | str | None] 13 | -------------------------------------------------------------------------------- /graphrag/index/llm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
def load_graph(graphml: str | nx.Graph) -> nx.Graph:
    """Load a graph from GraphML markup, or pass an existing graph through.

    If *graphml* is a string it is parsed as GraphML *content* (not a file
    path) via ``nx.parse_graphml``; an ``nx.Graph`` instance is returned
    unchanged.
    """
    return nx.parse_graphml(graphml) if isinstance(graphml, str) else graphml
Run 'poetry run semversioner add-change' to update the next release version" 8 | exit 1 9 | fi 10 | echo "OK" 11 | -------------------------------------------------------------------------------- /tests/unit/indexing/test_exports.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | from graphrag.index import ( 4 | create_pipeline_config, 5 | run_pipeline, 6 | run_pipeline_with_config, 7 | ) 8 | 9 | 10 | def test_exported_functions(): 11 | assert callable(create_pipeline_config) 12 | assert callable(run_pipeline_with_config) 13 | assert callable(run_pipeline) 14 | -------------------------------------------------------------------------------- /graphrag/index/utils/uuid.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """UUID utilities.""" 5 | 6 | import uuid 7 | from random import Random, getrandbits 8 | 9 | 10 | def gen_uuid(rd: Random | None = None): 11 | """Generate a random UUID v4.""" 12 | return uuid.UUID( 13 | int=rd.getrandbits(128) if rd is not None else getrandbits(128), version=4 14 | ).hex 15 | -------------------------------------------------------------------------------- /graphrag/model/named.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A package containing the 'Named' protocol.""" 5 | 6 | from dataclasses import dataclass 7 | 8 | from .identified import Identified 9 | 10 | 11 | @dataclass 12 | class Named(Identified): 13 | """A protocol for an item with a name/title.""" 14 | 15 | title: str 16 | """The name/title of the item.""" 17 | -------------------------------------------------------------------------------- /tests/fixtures/text/settings.yml: -------------------------------------------------------------------------------- 1 | claim_extraction: 2 | enabled: true 3 | 4 | embeddings: 5 | vector_store: 6 | type: "azure_ai_search" 7 | url: ${AZURE_AI_SEARCH_URL_ENDPOINT} 8 | api_key: ${AZURE_AI_SEARCH_API_KEY} 9 | collection_name: "simple_text_ci" 10 | query_collection_name: "simple_text_ci_query" 11 | store_in_table: True 12 | 13 | entity_name_description: 14 | title_column: "name" 15 | -------------------------------------------------------------------------------- /graphrag/index/graph/extractors/summarize/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine unipartite graph package root.""" 5 | 6 | from .description_summary_extractor import ( 7 | SummarizationResult, 8 | SummarizeExtractor, 9 | ) 10 | from .prompts import SUMMARIZE_PROMPT 11 | 12 | __all__ = ["SUMMARIZE_PROMPT", "SummarizationResult", "SummarizeExtractor"] 13 | -------------------------------------------------------------------------------- /graphrag/config/input_models/parallelization_parameters_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
class ParallelizationParametersInput(TypedDict):
    """Configuration section for parallelization parameters."""

    # Stagger delay between parallel operations; a str is also accepted
    # (presumably for templated/env-substituted config values — confirm with loader).
    stagger: NotRequired[float | str | None]
    # Number of worker threads; a str is also accepted
    # (presumably for templated/env-substituted config values — confirm with loader).
    num_threads: NotRequired[int | str | None]
class RetriesExhaustedError(RuntimeError):
    """Raised when an operation has used up all of its retry attempts."""

    def __init__(self, name: str, num_retries: int) -> None:
        """Compose the failure message from the operation name and retry count."""
        message = f"Operation '{name}' failed - {num_retries} retries exhausted"
        super().__init__(message)
def check_token_limit(text, max_token):
    """Return 1 if *text* fits within *max_token* tokens, else 0."""
    # Zero overlap so the chunk count reflects total token usage exactly.
    text_splitter = TokenTextSplitter(chunk_size=max_token, chunk_overlap=0)
    docs = text_splitter.split_text(text)
    # More than one chunk means the text exceeded the token budget.
    if len(docs) > 1:
        return 0
    return 1
def gen_md5_hash(item: dict[str, Any], hashcode: Iterable[str]) -> str:
    """Generate an md5 hex digest over selected fields of *item*.

    The values of *item* named by *hashcode* are stringified and concatenated
    in the given order before hashing. Not used for security purposes
    (``usedforsecurity=False``).
    """
    hashed = "".join([str(item[column]) for column in hashcode])
    # hexdigest() already returns a str; the original wrapped it in a
    # redundant f-string, which is dropped here.
    return md5(hashed.encode("utf-8"), usedforsecurity=False).hexdigest()
2 | # Licensed under the MIT License 3 | 4 | """LLM Types.""" 5 | 6 | from typing import TypeAlias 7 | 8 | from .llm import LLM 9 | 10 | EmbeddingInput: TypeAlias = list[str] 11 | EmbeddingOutput: TypeAlias = list[list[float]] 12 | CompletionInput: TypeAlias = str 13 | CompletionOutput: TypeAlias = str 14 | 15 | EmbeddingLLM: TypeAlias = LLM[EmbeddingInput, EmbeddingOutput] 16 | CompletionLLM: TypeAlias = LLM[CompletionInput, CompletionOutput] 17 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A file containing some default responses.""" 5 | 6 | from graphrag.config.enums import LLMType 7 | 8 | MOCK_LLM_RESPONSES = [ 9 | """ 10 | This is a MOCK response for the LLM. It is summarized! 11 | """.strip() 12 | ] 13 | 14 | DEFAULT_LLM_CONFIG = { 15 | "type": LLMType.StaticResponse, 16 | "responses": MOCK_LLM_RESPONSES, 17 | } 18 | -------------------------------------------------------------------------------- /graphrag/config/models/umap_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | 10 | 11 | class UmapConfig(BaseModel): 12 | """Configuration section for UMAP.""" 13 | 14 | enabled: bool = Field( 15 | description="A flag indicating whether to enable UMAP.", 16 | default=defs.UMAP_ENABLED, 17 | ) 18 | -------------------------------------------------------------------------------- /graphrag/index/graph/extractors/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine unipartite graph package root.""" 5 | 6 | from .graph_extractor import ( 7 | DEFAULT_ENTITY_TYPES, 8 | GraphExtractionResult, 9 | GraphExtractor, 10 | ) 11 | from .prompts import GRAPH_EXTRACTION_PROMPT 12 | 13 | __all__ = [ 14 | "DEFAULT_ENTITY_TYPES", 15 | "GRAPH_EXTRACTION_PROMPT", 16 | "GraphExtractionResult", 17 | "GraphExtractor", 18 | ] 19 | -------------------------------------------------------------------------------- /graphrag/config/input_models/snapshots_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | 9 | class SnapshotsConfigInput(TypedDict): 10 | """Configuration section for snapshots.""" 11 | 12 | graphml: NotRequired[bool | str | None] 13 | raw_entities: NotRequired[bool | str | None] 14 | top_level_nodes: NotRequired[bool | str | None] 15 | -------------------------------------------------------------------------------- /examples/custom_input/pipeline.yml: -------------------------------------------------------------------------------- 1 | 2 | # Setup reporting however you'd like 3 | reporting: 4 | type: console 5 | 6 | # Setup storage however you'd like 7 | storage: 8 | type: memory 9 | 10 | # Setup cache however you'd like 11 | cache: 12 | type: memory 13 | 14 | # Just a simple workflow 15 | workflows: 16 | 17 | # This is an anonymous workflow, it doesn't have a name 18 | - steps: 19 | 20 | # Unpack the nodes from the graph 21 | - verb: fill 22 | args: 23 | to: filled_column 24 | value: "Filled Value" -------------------------------------------------------------------------------- /graphrag/model/identified.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A package containing the 'Identified' protocol.""" 5 | 6 | from dataclasses import dataclass 7 | 8 | 9 | @dataclass 10 | class Identified: 11 | """A protocol for an item with an ID.""" 12 | 13 | id: str 14 | """The ID of the item.""" 15 | 16 | short_id: str | None 17 | """Human readable ID used to refer to this community in prompts or texts displayed to users, such as in a report text (optional).""" 18 | -------------------------------------------------------------------------------- /examples/multiple_workflows/workflows/workflow_2.yml: -------------------------------------------------------------------------------- 1 | name: workflow_2 2 | steps: 3 | - verb: fill 4 | args: 5 | to: "col_workflow_2" 6 | value: 2 7 | input: 8 | 9 | # workflow_2 is dependent on workflow_1 10 | # so in workflow_2 output, you'll also see the output from workflow_1 11 | source: "workflow:workflow_1" 12 | 13 | # Example of pulling in values from a shared file 14 | - verb: fill 15 | args: 16 | to: "col_from_shared_file" 17 | value: !include ./shared/shared_fill_value.txt 18 | -------------------------------------------------------------------------------- /graphrag/index/graph/utils/normalize_node_names.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
def normalize_node_names(graph: nx.Graph | nx.DiGraph) -> nx.Graph | nx.DiGraph:
    """Return a relabeled copy of *graph* with normalized node names.

    Each label is upper-cased, stripped of surrounding whitespace, and
    HTML-unescaped. Note that unescaping runs *after* upper()/strip(), so
    escaped entities (e.g. "&amp;") are decoded in their upper-cased form.
    """
    node_mapping = {node: html.unescape(node.upper().strip()) for node in graph.nodes()}  # type: ignore
    # relabel_nodes returns a new graph by default (copy=True).
    return nx.relabel_nodes(graph, node_mapping)
10 | 11 | Text: {input_text} 12 | Domain:""" 13 | -------------------------------------------------------------------------------- /graphrag/llm/limiting/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """LLM limiters module.""" 5 | 6 | from .composite_limiter import CompositeLLMLimiter 7 | from .create_limiters import create_tpm_rpm_limiters 8 | from .llm_limiter import LLMLimiter 9 | from .noop_llm_limiter import NoopLLMLimiter 10 | from .tpm_rpm_limiter import TpmRpmLLMLimiter 11 | 12 | __all__ = [ 13 | "CompositeLLMLimiter", 14 | "LLMLimiter", 15 | "NoopLLMLimiter", 16 | "TpmRpmLLMLimiter", 17 | "create_tpm_rpm_limiters", 18 | ] 19 | -------------------------------------------------------------------------------- /graphrag/index/cache/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine cache package root.""" 5 | 6 | from .json_pipeline_cache import JsonPipelineCache 7 | from .load_cache import load_cache 8 | from .memory_pipeline_cache import InMemoryCache 9 | from .noop_pipeline_cache import NoopPipelineCache 10 | from .pipeline_cache import PipelineCache 11 | 12 | __all__ = [ 13 | "InMemoryCache", 14 | "JsonPipelineCache", 15 | "NoopPipelineCache", 16 | "PipelineCache", 17 | "load_cache", 18 | ] 19 | -------------------------------------------------------------------------------- /graphrag/llm/limiting/llm_limiter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
@dataclass
class PipelineRunResult:
    """Pipeline run result class definition."""

    # Name of the workflow this result belongs to.
    workflow: str
    # Output table produced by the workflow, if any.
    result: pd.DataFrame | None
    # Exceptions collected during the run, if any.
    errors: list[BaseException] | None
Can be a string, a list of strings, or a list of tuples of (id, text).""" 20 | -------------------------------------------------------------------------------- /tests/fixtures/min-csv/settings.yml: -------------------------------------------------------------------------------- 1 | input: 2 | file_type: csv 3 | 4 | embeddings: 5 | vector_store: 6 | type: "lancedb" 7 | uri_db: "./tests/fixtures/min-csv/lancedb" 8 | store_in_table: True 9 | 10 | entity_name_description: 11 | title_column: "name" 12 | # id_column: "id" 13 | # overwrite: true 14 | # entity_name: ... 15 | # relationship_description: ... 16 | # community_report_full_content: ... 17 | # community_report_summary: ... 18 | # community_report_title: ... 19 | # document_raw_content: ... 20 | # text_unit_text: ... 21 | -------------------------------------------------------------------------------- /graphrag/config/input_models/summarize_descriptions_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired 7 | 8 | from .llm_config_input import LLMConfigInput 9 | 10 | 11 | class SummarizeDescriptionsConfigInput(LLMConfigInput): 12 | """Configuration section for description summarization.""" 13 | 14 | prompt: NotRequired[str | None] 15 | max_length: NotRequired[int | str | None] 16 | strategy: NotRequired[dict | None] 17 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/chunk/strategies/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
class NoopLLMLimiter(LLMLimiter):
    """A pass-through limiter that never throttles callers."""

    @property
    def needs_token_count(self) -> bool:
        """This limiter ignores token counts entirely."""
        return False

    async def acquire(self, num_tokens: int = 1) -> None:
        """Grant passage immediately; no rate limiting is applied."""
2 | # Licensed under the MIT License 3 | 4 | """A file containing DEFAULT_NODE_OPERATIONS, DEFAULT_EDGE_OPERATIONS and DEFAULT_CONCAT_SEPARATOR values definition.""" 5 | 6 | from .typing import BasicMergeOperation 7 | 8 | DEFAULT_NODE_OPERATIONS = { 9 | "*": { 10 | "operation": BasicMergeOperation.Replace, 11 | } 12 | } 13 | 14 | DEFAULT_EDGE_OPERATIONS = { 15 | "*": { 16 | "operation": BasicMergeOperation.Replace, 17 | }, 18 | "weight": "sum", 19 | } 20 | 21 | DEFAULT_CONCAT_SEPARATOR = "," 22 | -------------------------------------------------------------------------------- /docsite/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@graphrag/docsite", 3 | "version": "0.0.1", 4 | "private": true, 5 | "scripts": { 6 | "start": "eleventy --serve", 7 | "build": "eleventy && touch _site/.nojekyll", 8 | "build:docs": "yarn build", 9 | "start:docs": "yarn start" 10 | }, 11 | "dependencies": { 12 | "@11ty/eleventy": "^2.0.1", 13 | "@11ty/eleventy-plugin-syntaxhighlight": "^5.0.0", 14 | "@kevingimbel/eleventy-plugin-mermaid": "^2.2.1", 15 | "eleventy-plugin-code-clipboard": "^0.1.1", 16 | "markdown-it": "^14.1.0" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /graphrag/config/input_models/community_reports_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired 7 | 8 | from .llm_config_input import LLMConfigInput 9 | 10 | 11 | class CommunityReportsConfigInput(LLMConfigInput): 12 | """Configuration section for community reports.""" 13 | 14 | prompt: NotRequired[str | None] 15 | max_length: NotRequired[int | str | None] 16 | max_input_length: NotRequired[int | str | None] 17 | strategy: NotRequired[dict | None] 18 | -------------------------------------------------------------------------------- /graphrag/query/llm/oai/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """OpenAI wrapper options.""" 5 | 6 | from enum import Enum 7 | from typing import Any, cast 8 | 9 | import openai 10 | 11 | OPENAI_RETRY_ERROR_TYPES = ( 12 | # TODO: update these when we update to OpenAI 1+ library 13 | cast(Any, openai).RateLimitError, 14 | cast(Any, openai).APIConnectionError, 15 | # TODO: replace with comparable OpenAI 1+ error 16 | ) 17 | 18 | 19 | class OpenaiApiType(str, Enum): 20 | """The OpenAI Flavor.""" 21 | 22 | OpenAI = "openai" 23 | AzureOpenAI = "azure" 24 | -------------------------------------------------------------------------------- /graphrag/config/input_models/entity_extraction_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired 7 | 8 | from .llm_config_input import LLMConfigInput 9 | 10 | 11 | class EntityExtractionConfigInput(LLMConfigInput): 12 | """Configuration section for entity extraction.""" 13 | 14 | prompt: NotRequired[str | None] 15 | entity_types: NotRequired[list[str] | str | None] 16 | max_gleanings: NotRequired[int | str | None] 17 | strategy: NotRequired[dict | None] 18 | -------------------------------------------------------------------------------- /examples/use_built_in_workflows/pipeline.yml: -------------------------------------------------------------------------------- 1 | workflows: 2 | - name: "entity_extraction" 3 | config: 4 | entity_extract: 5 | strategy: 6 | type: "nltk" 7 | 8 | - name: "entity_graph" 9 | config: 10 | cluster_graph: 11 | strategy: 12 | type: "leiden" 13 | embed_graph: 14 | strategy: 15 | type: "node2vec" 16 | num_walks: 10 17 | walk_length: 40 18 | window_size: 2 19 | iterations: 3 20 | random_seed: 597832 21 | layout_graph: 22 | strategy: 23 | type: "umap" -------------------------------------------------------------------------------- /graphrag/index/graph/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
class GlobalSearchConfigInput(TypedDict):
    """The default configuration section for global search."""

    # Token budgets: overall, context data, map phase, and reduce phase.
    # str is also accepted (presumably for templated/env-substituted config — confirm).
    max_tokens: NotRequired[int | str | None]
    data_max_tokens: NotRequired[int | str | None]
    map_max_tokens: NotRequired[int | str | None]
    reduce_max_tokens: NotRequired[int | str | None]
    # Number of concurrent requests used during global search.
    concurrency: NotRequired[int | str | None]
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine workflows package root.""" 5 | 6 | from .load import create_workflow, load_workflows 7 | from .typing import ( 8 | StepDefinition, 9 | VerbDefinitions, 10 | VerbTiming, 11 | WorkflowConfig, 12 | WorkflowDefinitions, 13 | WorkflowToRun, 14 | ) 15 | 16 | __all__ = [ 17 | "StepDefinition", 18 | "VerbDefinitions", 19 | "VerbTiming", 20 | "WorkflowConfig", 21 | "WorkflowDefinitions", 22 | "WorkflowToRun", 23 | "create_workflow", 24 | "load_workflows", 25 | ] 26 | -------------------------------------------------------------------------------- /graphrag/vector_stores/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A package containing vector-storage implementations.""" 5 | 6 | from .azure_ai_search import AzureAISearch 7 | from .base import BaseVectorStore, VectorStoreDocument, VectorStoreSearchResult 8 | from .lancedb import LanceDBVectorStore 9 | from .typing import VectorStoreFactory, VectorStoreType 10 | 11 | __all__ = [ 12 | "AzureAISearch", 13 | "BaseVectorStore", 14 | "LanceDBVectorStore", 15 | "VectorStoreDocument", 16 | "VectorStoreFactory", 17 | "VectorStoreSearchResult", 18 | "VectorStoreType", 19 | ] 20 | -------------------------------------------------------------------------------- /graphrag/config/input_models/claim_extraction_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired 7 | 8 | from .llm_config_input import LLMConfigInput 9 | 10 | 11 | class ClaimExtractionConfigInput(LLMConfigInput): 12 | """Configuration section for claim extraction.""" 13 | 14 | enabled: NotRequired[bool | None] 15 | prompt: NotRequired[str | None] 16 | description: NotRequired[str | None] 17 | max_gleanings: NotRequired[int | str | None] 18 | strategy: NotRequired[dict | None] 19 | -------------------------------------------------------------------------------- /graphrag/index/storage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine storage package root.""" 5 | 6 | from .blob_pipeline_storage import BlobPipelineStorage, create_blob_storage 7 | from .file_pipeline_storage import FilePipelineStorage 8 | from .load_storage import load_storage 9 | from .memory_pipeline_storage import MemoryPipelineStorage 10 | from .typing import PipelineStorage 11 | 12 | __all__ = [ 13 | "BlobPipelineStorage", 14 | "FilePipelineStorage", 15 | "MemoryPipelineStorage", 16 | "PipelineStorage", 17 | "create_blob_storage", 18 | "load_storage", 19 | ] 20 | -------------------------------------------------------------------------------- /graphrag/index/utils/dicts.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A utility module containing methods for inspecting and verifying dictionary types.""" 5 | 6 | 7 | def dict_has_keys_with_types( 8 | data: dict, expected_fields: list[tuple[str, type]] 9 | ) -> bool: 10 | """Return True if the given dictionary has the given keys with the given types.""" 11 | for field, field_type in expected_fields: 12 | if field not in data: 13 | return False 14 | 15 | value = data[field] 16 | if not isinstance(value, field_type): 17 | return False 18 | return True 19 | -------------------------------------------------------------------------------- /graphrag/query/llm/oai/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestration OpenAI Wrappers.""" 5 | 6 | from .base import BaseOpenAILLM, OpenAILLMImpl, OpenAITextEmbeddingImpl 7 | from .chat_openai import ChatOpenAI 8 | from .embedding import OpenAIEmbedding 9 | from .openai import OpenAI 10 | from .typing import OPENAI_RETRY_ERROR_TYPES, OpenaiApiType 11 | 12 | __all__ = [ 13 | "OPENAI_RETRY_ERROR_TYPES", 14 | "BaseOpenAILLM", 15 | "ChatOpenAI", 16 | "OpenAI", 17 | "OpenAIEmbedding", 18 | "OpenAILLMImpl", 19 | "OpenAITextEmbeddingImpl", 20 | "OpenaiApiType", 21 | ] 22 | -------------------------------------------------------------------------------- /graphrag/config/input_models/cache_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | from graphrag.config.enums import CacheType 9 | 10 | 11 | class CacheConfigInput(TypedDict): 12 | """The default configuration section for Cache.""" 13 | 14 | type: NotRequired[CacheType | str | None] 15 | base_dir: NotRequired[str | None] 16 | connection_string: NotRequired[str | None] 17 | container_name: NotRequired[str | None] 18 | storage_account_blob_url: NotRequired[str | None] 19 | -------------------------------------------------------------------------------- /graphrag/config/input_models/storage_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | from graphrag.config.enums import StorageType 9 | 10 | 11 | class StorageConfigInput(TypedDict): 12 | """The default configuration section for Storage.""" 13 | 14 | type: NotRequired[StorageType | str | None] 15 | base_dir: NotRequired[str | None] 16 | connection_string: NotRequired[str | None] 17 | container_name: NotRequired[str | None] 18 | storage_account_blob_url: NotRequired[str | None] 19 | -------------------------------------------------------------------------------- /graphrag/config/input_models/reporting_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | from graphrag.config.enums import ReportingType 9 | 10 | 11 | class ReportingConfigInput(TypedDict): 12 | """The default configuration section for Reporting.""" 13 | 14 | type: NotRequired[ReportingType | str | None] 15 | base_dir: NotRequired[str | None] 16 | connection_string: NotRequired[str | None] 17 | container_name: NotRequired[str | None] 18 | storage_account_blob_url: NotRequired[str | None] 19 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/persona.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for persona generation.""" 5 | 6 | GENERATE_PERSONA_PROMPT = """ 7 | You are an intelligent assistant that helps a human to analyze the information in a text document. 8 | Given a specific type of task and sample text, help the user by generating a 3 to 4 sentence description of an expert who could help solve the problem. 9 | Use a format similar to the following: 10 | You are an expert {{role}}. You are skilled at {{relevant skills}}. You are adept at helping people with {{specific task}}. 11 | 12 | task: {sample_task} 13 | persona description:""" 14 | -------------------------------------------------------------------------------- /graphrag/config/models/parallelization_parameters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """LLM Parameters model.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | 10 | 11 | class ParallelizationParameters(BaseModel): 12 | """LLM Parameters model.""" 13 | 14 | stagger: float = Field( 15 | description="The stagger to use for the LLM service.", 16 | default=defs.PARALLELIZATION_STAGGER, 17 | ) 18 | num_threads: int = Field( 19 | description="The number of threads to use for the LLM service.", 20 | default=defs.PARALLELIZATION_NUM_THREADS, 21 | ) 22 | -------------------------------------------------------------------------------- /graphrag/index/utils/string.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """String utilities.""" 5 | 6 | import html 7 | import re 8 | from typing import Any 9 | 10 | 11 | def clean_str(input: Any) -> str: 12 | """Clean an input string by removing HTML escapes, control characters, and other unwanted characters.""" 13 | # If we get non-string input, just give it back 14 | if not isinstance(input, str): 15 | return input 16 | 17 | result = html.unescape(input.strip()) 18 | # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python 19 | return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) 20 | -------------------------------------------------------------------------------- /graphrag/llm/types/llm_cache.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Typing definitions for the OpenAI DataShaper package.""" 5 | 6 | from typing import Any, Protocol 7 | 8 | 9 | class LLMCache(Protocol): 10 | """LLM Cache interface.""" 11 | 12 | async def has(self, key: str) -> bool: 13 | """Check if the cache has a value.""" 14 | ... 
15 | 16 | async def get(self, key: str) -> Any | None: 17 | """Retrieve a value from the cache.""" 18 | ... 19 | 20 | async def set(self, key: str, value: Any, debug_data: dict | None = None) -> None: 21 | """Write a value into the cache.""" 22 | ... 23 | -------------------------------------------------------------------------------- /graphrag/llm/types/llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """LLM Types.""" 5 | 6 | from typing import Generic, Protocol, TypeVar 7 | 8 | from typing_extensions import Unpack 9 | 10 | from .llm_io import ( 11 | LLMInput, 12 | LLMOutput, 13 | ) 14 | 15 | TIn = TypeVar("TIn", contravariant=True) 16 | TOut = TypeVar("TOut") 17 | 18 | 19 | class LLM(Protocol, Generic[TIn, TOut]): 20 | """LLM Protocol definition.""" 21 | 22 | async def __call__( 23 | self, 24 | input: TIn, 25 | **kwargs: Unpack[LLMInput], 26 | ) -> LLMOutput[TOut]: 27 | """Invoke the LLM, treating the LLM as a function.""" 28 | ... 29 | -------------------------------------------------------------------------------- /graphrag/config/input_models/llm_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from datashaper import AsyncType 7 | from typing_extensions import NotRequired, TypedDict 8 | 9 | from .llm_parameters_input import LLMParametersInput 10 | from .parallelization_parameters_input import ParallelizationParametersInput 11 | 12 | 13 | class LLMConfigInput(TypedDict): 14 | """Base class for LLM-configured steps.""" 15 | 16 | llm: NotRequired[LLMParametersInput | None] 17 | parallelization: NotRequired[ParallelizationParametersInput | None] 18 | async_mode: NotRequired[AsyncType | str | None] 19 | -------------------------------------------------------------------------------- /graphrag/config/input_models/embed_graph_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | 9 | class EmbedGraphConfigInput(TypedDict): 10 | """The default configuration section for Node2Vec.""" 11 | 12 | enabled: NotRequired[bool | str | None] 13 | num_walks: NotRequired[int | str | None] 14 | walk_length: NotRequired[int | str | None] 15 | window_size: NotRequired[int | str | None] 16 | iterations: NotRequired[int | str | None] 17 | random_seed: NotRequired[int | str | None] 18 | strategy: NotRequired[dict | None] 19 | -------------------------------------------------------------------------------- /graphrag/index/reporting/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Reporting utilities and implementations for the indexing engine.""" 5 | 6 | from .blob_workflow_callbacks import BlobWorkflowCallbacks 7 | from .console_workflow_callbacks import ConsoleWorkflowCallbacks 8 | from .file_workflow_callbacks import FileWorkflowCallbacks 9 | from .load_pipeline_reporter import load_pipeline_reporter 10 | from .progress_workflow_callbacks import ProgressWorkflowCallbacks 11 | 12 | __all__ = [ 13 | "BlobWorkflowCallbacks", 14 | "ConsoleWorkflowCallbacks", 15 | "FileWorkflowCallbacks", 16 | "ProgressWorkflowCallbacks", 17 | "load_pipeline_reporter", 18 | ] 19 | -------------------------------------------------------------------------------- /.github/workflows/javascript-ci.yml: -------------------------------------------------------------------------------- 1 | name: JavaScript CI 2 | on: 3 | push: 4 | branches: [main] 5 | pull_request: 6 | branches: [main] 7 | 8 | env: 9 | NODE_VERSION: 18.x 10 | 11 | jobs: 12 | javascript-ci: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | fail-fast: false 16 | steps: 17 | - name: Use Node ${{ env.NODE_VERSION }} 18 | uses: actions/setup-node@v4 19 | with: 20 | node-version: ${{ env.NODE_VERSION }} 21 | 22 | - uses: actions/checkout@v4 23 | 24 | - run: yarn install 25 | working-directory: docsite 26 | name: Install Dependencies 27 | 28 | - run: yarn build 29 | working-directory: docsite 30 | name: Build Docsite -------------------------------------------------------------------------------- /graphrag/index/emit/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Definitions for emitting pipeline artifacts to storage.""" 5 | 6 | from .csv_table_emitter import CSVTableEmitter 7 | from .factories import create_table_emitter, create_table_emitters 8 | from .json_table_emitter import JsonTableEmitter 9 | from .parquet_table_emitter import ParquetTableEmitter 10 | from .table_emitter import TableEmitter 11 | from .types import TableEmitterType 12 | 13 | __all__ = [ 14 | "CSVTableEmitter", 15 | "JsonTableEmitter", 16 | "ParquetTableEmitter", 17 | "TableEmitter", 18 | "TableEmitterType", 19 | "create_table_emitter", 20 | "create_table_emitters", 21 | ] 22 | -------------------------------------------------------------------------------- /graphrag/index/graph/visualization/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | # Use this for now instead of a wrapper 5 | """A module containing 'NodePosition' model.""" 6 | 7 | from dataclasses import dataclass 8 | 9 | 10 | @dataclass 11 | class NodePosition: 12 | """Node position class definition.""" 13 | 14 | label: str 15 | cluster: str 16 | size: float 17 | 18 | x: float 19 | y: float 20 | z: float | None = None 21 | 22 | def to_pandas(self) -> tuple[str, float, float, str, float]: 23 | """To pandas method definition.""" 24 | return self.label, self.x, self.y, self.cluster, self.size 25 | 26 | 27 | GraphLayout = list[NodePosition] 28 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/translate/strategies/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'TextTranslationResult' model.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | from typing import Any 9 | 10 | from datashaper import VerbCallbacks 11 | 12 | from graphrag.index.cache import PipelineCache 13 | 14 | 15 | @dataclass 16 | class TextTranslationResult: 17 | """Text translation result class definition.""" 18 | 19 | translations: list[str] 20 | 21 | 22 | TextTranslationStrategy = Callable[ 23 | [list[str], dict[str, Any], VerbCallbacks, PipelineCache], 24 | Awaitable[TextTranslationResult], 25 | ] 26 | -------------------------------------------------------------------------------- /graphrag/index/errors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GraphRAG indexing error types.""" 5 | 6 | 7 | class NoWorkflowsDefinedError(ValueError): 8 | """Exception for no workflows defined.""" 9 | 10 | def __init__(self): 11 | super().__init__("No workflows defined.") 12 | 13 | 14 | class UndefinedWorkflowError(ValueError): 15 | """Exception for invalid verb input.""" 16 | 17 | def __init__(self): 18 | super().__init__("Workflow name is undefined.") 19 | 20 | 21 | class UnknownWorkflowError(ValueError): 22 | """Exception for invalid verb input.""" 23 | 24 | def __init__(self, name: str): 25 | super().__init__(f"Unknown workflow: {name}") 26 | -------------------------------------------------------------------------------- /graphrag/llm/types/llm_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Typing definitions for the OpenAI DataShaper package.""" 5 | 6 | from collections.abc import Callable 7 | 8 | from .llm_invocation_result import LLMInvocationResult 9 | 10 | ErrorHandlerFn = Callable[[BaseException | None, str | None, dict | None], None] 11 | """Error handler function type definition.""" 12 | 13 | LLMInvocationFn = Callable[[LLMInvocationResult], None] 14 | """Handler for LLM invocation results""" 15 | 16 | OnCacheActionFn = Callable[[str, str | None], None] 17 | """Handler for cache hits""" 18 | 19 | IsResponseValidFn = Callable[[dict], bool] 20 | """A function that checks if an LLM response is valid.""" 21 | -------------------------------------------------------------------------------- /graphrag/index/utils/json.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """JSON cleaning and formatting utilities.""" 5 | 6 | 7 | def clean_up_json(json_str: str): 8 | """Clean up json string.""" 9 | json_str = ( 10 | json_str.replace("\\n", "") 11 | .replace("\n", "") 12 | .replace("\r", "") 13 | .replace('"[{', "[{") 14 | .replace('}]"', "}]") 15 | .replace("\\", "") 16 | .strip() 17 | ) 18 | 19 | # Remove JSON Markdown Frame 20 | if json_str.startswith("```json"): 21 | json_str = json_str[len("```json") :] 22 | if json_str.endswith("```"): 23 | json_str = json_str[: len(json_str) - len("```")] 24 | 25 | return json_str 26 | -------------------------------------------------------------------------------- /graphrag/llm/openai/_json.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """JSON cleaning and formatting utilities.""" 5 | 6 | 7 | def clean_up_json(json_str: str) -> str: 8 | """Clean up json string.""" 9 | json_str = ( 10 | json_str.replace("\\n", "") 11 | .replace("\n", "") 12 | .replace("\r", "") 13 | .replace('"[{', "[{") 14 | .replace('}]"', "}]") 15 | .replace("\\", "") 16 | .strip() 17 | ) 18 | 19 | # Remove JSON Markdown Frame 20 | if json_str.startswith("```json"): 21 | json_str = json_str[len("```json") :] 22 | if json_str.endswith("```"): 23 | json_str = json_str[: len(json_str) - len("```")] 24 | 25 | return json_str 26 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/embed/strategies/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing 'TextEmbeddingResult' model.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | 9 | from datashaper import VerbCallbacks 10 | 11 | from graphrag.index.cache import PipelineCache 12 | 13 | 14 | @dataclass 15 | class TextEmbeddingResult: 16 | """Text embedding result class definition.""" 17 | 18 | embeddings: list[list[float] | None] | None 19 | 20 | 21 | TextEmbeddingStrategy = Callable[ 22 | [ 23 | list[str], 24 | VerbCallbacks, 25 | PipelineCache, 26 | dict, 27 | ], 28 | Awaitable[TextEmbeddingResult], 29 | ] 30 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Indexing Engine Examples 2 | This directory contains several examples of how to use the indexing engine. 3 | 4 | Most examples include two different forms of running the pipeline, both are contained in the examples `run.py` 5 | 1. Using mostly the Python API 6 | 2. 
Using mostly the a pipeline configuration file 7 | 8 | # Running an Example 9 | First run `poetry shell` to activate a virtual environment with the required dependencies. 10 | 11 | Then run `PYTHONPATH="$(pwd)" python examples/path_to_example/run.py` from the `python/graphrag` directory. 12 | 13 | For example to run the single_verb example, you would run the following commands: 14 | 15 | ```bash 16 | cd python/graphrag 17 | poetry shell 18 | PYTHONPATH="$(pwd)" python examples/single_verb/run.py 19 | ``` -------------------------------------------------------------------------------- /graphrag/config/input_models/local_search_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | 9 | class LocalSearchConfigInput(TypedDict): 10 | """The default configuration section for Cache.""" 11 | 12 | text_unit_prop: NotRequired[float | str | None] 13 | community_prop: NotRequired[float | str | None] 14 | conversation_history_max_turns: NotRequired[int | str | None] 15 | top_k_entities: NotRequired[int | str | None] 16 | top_k_relationships: NotRequired[int | str | None] 17 | max_tokens: NotRequired[int | str | None] 18 | llm_max_tokens: NotRequired[int | str | None] 19 | -------------------------------------------------------------------------------- /graphrag/index/verbs/unzip.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing unzip method definition.""" 5 | 6 | from typing import cast 7 | 8 | import pandas as pd 9 | from datashaper import TableContainer, VerbInput, verb 10 | 11 | 12 | # TODO: Check if this is already a thing 13 | # Takes 1|(x,y)|b 14 | # and converts to 15 | # 1|x|y|b 16 | @verb(name="unzip") 17 | def unzip( 18 | input: VerbInput, column: str, to: list[str], **_kwargs: dict 19 | ) -> TableContainer: 20 | """Unpacks a column containing a tuple into multiple columns.""" 21 | table = cast(pd.DataFrame, input.get_input()) 22 | 23 | table[to] = pd.DataFrame(table[column].tolist(), index=table.index) 24 | 25 | return TableContainer(table=table) 26 | -------------------------------------------------------------------------------- /tests/unit/indexing/workflows/helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | mock_verbs = { 4 | "mock_verb": lambda x: x, 5 | "mock_verb_2": lambda x: x, 6 | } 7 | 8 | mock_workflows = { 9 | "mock_workflow": lambda _x: [ 10 | { 11 | "verb": "mock_verb", 12 | "args": { 13 | "column": "test", 14 | }, 15 | } 16 | ], 17 | "mock_workflow_2": lambda _x: [ 18 | { 19 | "verb": "mock_verb", 20 | "args": { 21 | "column": "test", 22 | }, 23 | }, 24 | { 25 | "verb": "mock_verb_2", 26 | "args": { 27 | "column": "test", 28 | }, 29 | }, 30 | ], 31 | } 32 | -------------------------------------------------------------------------------- /examples/entity_extraction/with_graph_intelligence/pipeline.yml: -------------------------------------------------------------------------------- 1 | workflows: 2 | - name: "entity_extraction" 3 | config: 4 | entity_extract: 5 | strategy: 6 | type: "graph_intelligence" 7 | llm: 8 | type: "openai_chat" 9 | 10 | # create a .env file in the same directory as this pipeline.yml file 11 | # end add the following lines to it: 12 | # 
EXAMPLE_OPENAI_API_KEY="YOUR_API_KEY" 13 | api_key: !ENV ${EXAMPLE_OPENAI_API_KEY:None} # None is the default 14 | model: !ENV ${EXAMPLE_OPENAI_MODEL:gpt-3.5-turbo} # gpt-3.5-turbo is the default 15 | max_tokens: !ENV ${EXAMPLE_OPENAI_MAX_TOKENS:2500} # 2500 is the default 16 | temperature: !ENV ${EXAMPLE_OPENAI_TEMPERATURE:0} # 0 is the default 17 | -------------------------------------------------------------------------------- /graphrag/config/read_dotenv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing the read_dotenv utility.""" 5 | 6 | import logging 7 | import os 8 | from pathlib import Path 9 | 10 | from dotenv import dotenv_values 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | def read_dotenv(root: str) -> None: 16 | """Read a .env file in the given root path.""" 17 | env_path = Path(root) / ".env" 18 | if env_path.exists(): 19 | log.info("Loading pipeline .env file") 20 | env_config = dotenv_values(f"{env_path}") 21 | for key, value in env_config.items(): 22 | if key not in os.environ: 23 | os.environ[key] = value or "" 24 | else: 25 | log.info("No .env file found at %s", root) 26 | -------------------------------------------------------------------------------- /graphrag/config/input_models/text_embedding_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired 7 | 8 | from graphrag.config.enums import ( 9 | TextEmbeddingTarget, 10 | ) 11 | 12 | from .llm_config_input import LLMConfigInput 13 | 14 | 15 | class TextEmbeddingConfigInput(LLMConfigInput): 16 | """Configuration section for text embeddings.""" 17 | 18 | batch_size: NotRequired[int | str | None] 19 | batch_max_tokens: NotRequired[int | str | None] 20 | target: NotRequired[TextEmbeddingTarget | str | None] 21 | skip: NotRequired[list[str] | str | None] 22 | vector_store: NotRequired[dict | None] 23 | strategy: NotRequired[dict | None] 24 | -------------------------------------------------------------------------------- /graphrag/index/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Utils methods definition.""" 5 | 6 | from .dicts import dict_has_keys_with_types 7 | from .hashing import gen_md5_hash 8 | from .is_null import is_null 9 | from .json import clean_up_json 10 | from .load_graph import load_graph 11 | from .string import clean_str 12 | from .tokens import num_tokens_from_string, string_from_tokens 13 | from .topological_sort import topological_sort 14 | from .uuid import gen_uuid 15 | 16 | __all__ = [ 17 | "clean_str", 18 | "clean_up_json", 19 | "dict_has_keys_with_types", 20 | "gen_md5_hash", 21 | "gen_uuid", 22 | "is_null", 23 | "load_graph", 24 | "num_tokens_from_string", 25 | "string_from_tokens", 26 | "topological_sort", 27 | ] 28 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/chunk/strategies/sentence.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing run method definition.""" 5 | 6 | from collections.abc import Iterable 7 | from typing import Any 8 | 9 | import nltk 10 | from datashaper import ProgressTicker 11 | 12 | from .typing import TextChunk 13 | 14 | 15 | def run( 16 | input: list[str], _args: dict[str, Any], tick: ProgressTicker 17 | ) -> Iterable[TextChunk]: 18 | """Chunks text into multiple parts. A pipeline verb.""" 19 | for doc_idx, text in enumerate(input): 20 | sentences = nltk.sent_tokenize(text) 21 | for sentence in sentences: 22 | yield TextChunk( 23 | text_chunk=sentence, 24 | source_doc_indices=[doc_idx], 25 | ) 26 | tick(1) 27 | -------------------------------------------------------------------------------- /cspell.config.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://raw.githubusercontent.com/streetsidesoftware/cspell/main/cspell.schema.json 2 | version: "0.2" 3 | allowCompoundWords: true 4 | dictionaryDefinitions: 5 | - name: dictionary 6 | path: "./dictionary.txt" 7 | addWords: true 8 | dictionaries: 9 | - dictionary 10 | ignorePaths: 11 | - cspell.config.yaml 12 | - node_modules 13 | - _site 14 | - /project-words.txt 15 | - default_pipeline.yml 16 | - .turbo 17 | - output/ 18 | - dist/ 19 | - temp_azurite/ 20 | - __pycache__ 21 | - pyproject.toml 22 | - entity_extraction.txt 23 | - package.json 24 | - tests/fixtures/ 25 | - docsite/data/ 26 | - docsite/nbdocsite_template/ 27 | - docsite/posts/query/notebooks/inputs/ 28 | - examples_notebooks/inputs/ 29 | - "*.csv" 30 | - "*.parquet" 31 | - "*.faiss" 32 | - "*.ipynb" 33 | - "*.log" 34 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/report/strategies/graph_intelligence/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A file containing DEFAULT_CHUNK_SIZE and MOCK_RESPONSES definitions.""" 5 | 6 | import json 7 | 8 | DEFAULT_CHUNK_SIZE = 3000 9 | MOCK_RESPONSES = [ 10 | json.dumps({ 11 | "title": "", 12 | "summary": "", 13 | "rating": 2, 14 | "rating_explanation": "", 15 | "findings": [ 16 | { 17 | "summary": "", 18 | "explanation": "", 22 | "explanation": " LLMLimiter: 23 | """Get the limiters for a given model name.""" 24 | tpm = configuration.tokens_per_minute 25 | rpm = configuration.requests_per_minute 26 | return TpmRpmLLMLimiter( 27 | None if tpm == 0 else AsyncLimiter(tpm or 50_000), 28 | None if rpm == 0 else AsyncLimiter(rpm or 10_000), 29 | ) 30 | -------------------------------------------------------------------------------- /graphrag/config/models/snapshots_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | 10 | 11 | class SnapshotsConfig(BaseModel): 12 | """Configuration section for snapshots.""" 13 | 14 | graphml: bool = Field( 15 | description="A flag indicating whether to take snapshots of GraphML.", 16 | default=defs.SNAPSHOTS_GRAPHML, 17 | ) 18 | raw_entities: bool = Field( 19 | description="A flag indicating whether to take snapshots of raw entities.", 20 | default=defs.SNAPSHOTS_RAW_ENTITIES, 21 | ) 22 | top_level_nodes: bool = Field( 23 | description="A flag indicating whether to take snapshots of top-level nodes.", 24 | default=defs.SNAPSHOTS_TOP_LEVEL_NODES, 25 | ) 26 | -------------------------------------------------------------------------------- /examples/custom_set_of_available_verbs/custom_verb_definitions.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 
2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | from datashaper import TableContainer, VerbInput 4 | 5 | 6 | def str_append( 7 | input: VerbInput, source_column: str, target_column: str, string_to_append: str 8 | ): 9 | """A custom verb that appends a string to a column""" 10 | # by convention, we typically use "column" as the input column name and "to" as the output column name, but you can use whatever you want 11 | # just as long as the "args" in the workflow reference match the function signature 12 | input_data = input.get_input() 13 | output_df = input_data.copy() 14 | output_df[target_column] = output_df[source_column].apply( 15 | lambda x: f"{x}{string_to_append}" 16 | ) 17 | return TableContainer(table=output_df) 18 | 19 | 20 | custom_verbs = { 21 | "str_append": str_append, 22 | } 23 | -------------------------------------------------------------------------------- /graphrag/index/graph/extractors/summarize/prompts.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A file containing prompts definition.""" 5 | 6 | SUMMARIZE_PROMPT = """ 7 | You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. 8 | Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. 9 | Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. 10 | If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. 11 | Make sure it is written in third person, and include the entity names so we the have full context. 
12 | 13 | ####### 14 | -Data- 15 | Entities: {entity_name} 16 | Description List: {description_list} 17 | ####### 18 | Output: 19 | """ 20 | -------------------------------------------------------------------------------- /graphrag/index/verbs/overrides/concat.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing concat method definition.""" 5 | 6 | # Copyright (c) 2024 Microsoft Corporation. 7 | # Licensed under the MIT License 8 | from typing import cast 9 | 10 | import pandas as pd 11 | from datashaper import TableContainer, VerbInput, verb 12 | 13 | 14 | @verb(name="concat_override") 15 | def concat( 16 | input: VerbInput, 17 | columnwise: bool = False, 18 | **_kwargs: dict, 19 | ) -> TableContainer: 20 | """Concat method definition.""" 21 | input_table = cast(pd.DataFrame, input.get_input()) 22 | others = cast(list[pd.DataFrame], input.get_others()) 23 | if columnwise: 24 | output = pd.concat([input_table, *others], axis=1) 25 | else: 26 | output = pd.concat([input_table, *others], ignore_index=True) 27 | return TableContainer(table=output) 28 | -------------------------------------------------------------------------------- /graphrag/llm/openai/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """OpenAI LLM implementations.""" 5 | 6 | from .create_openai_client import create_openai_client 7 | from .factories import ( 8 | create_openai_chat_llm, 9 | create_openai_completion_llm, 10 | create_openai_embedding_llm, 11 | ) 12 | from .openai_chat_llm import OpenAIChatLLM 13 | from .openai_completion_llm import OpenAICompletionLLM 14 | from .openai_configuration import OpenAIConfiguration 15 | from .openai_embeddings_llm import OpenAIEmbeddingsLLM 16 | from .types import OpenAIClientTypes 17 | 18 | __all__ = [ 19 | "OpenAIChatLLM", 20 | "OpenAIClientTypes", 21 | "OpenAICompletionLLM", 22 | "OpenAIConfiguration", 23 | "OpenAIEmbeddingsLLM", 24 | "create_openai_chat_llm", 25 | "create_openai_client", 26 | "create_openai_completion_llm", 27 | "create_openai_embedding_llm", 28 | ] 29 | -------------------------------------------------------------------------------- /graphrag/llm/limiting/composite_limiter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing Composite Limiter class definition.""" 5 | 6 | from .llm_limiter import LLMLimiter 7 | 8 | 9 | class CompositeLLMLimiter(LLMLimiter): 10 | """Composite Limiter class definition.""" 11 | 12 | _limiters: list[LLMLimiter] 13 | 14 | def __init__(self, limiters: list[LLMLimiter]): 15 | """Init method definition.""" 16 | self._limiters = limiters 17 | 18 | @property 19 | def needs_token_count(self) -> bool: 20 | """Whether this limiter needs the token count to be passed in.""" 21 | return any(limiter.needs_token_count for limiter in self._limiters) 22 | 23 | async def acquire(self, num_tokens: int = 1) -> None: 24 | """Call method definition.""" 25 | for limiter in self._limiters: 26 | await limiter.acquire(num_tokens) 27 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/template/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for entity extraction, entity summarization, and community report summarization.""" 5 | 6 | from .community_report_summarization import COMMUNITY_REPORT_SUMMARIZATION_PROMPT 7 | from .entity_extraction import ( 8 | EXAMPLE_EXTRACTION_TEMPLATE, 9 | GRAPH_EXTRACTION_JSON_PROMPT, 10 | GRAPH_EXTRACTION_PROMPT, 11 | UNTYPED_EXAMPLE_EXTRACTION_TEMPLATE, 12 | UNTYPED_GRAPH_EXTRACTION_PROMPT, 13 | ) 14 | from .entity_summarization import ENTITY_SUMMARIZATION_PROMPT 15 | 16 | __all__ = [ 17 | "COMMUNITY_REPORT_SUMMARIZATION_PROMPT", 18 | "ENTITY_SUMMARIZATION_PROMPT", 19 | "EXAMPLE_EXTRACTION_TEMPLATE", 20 | "GRAPH_EXTRACTION_JSON_PROMPT", 21 | "GRAPH_EXTRACTION_PROMPT", 22 | "UNTYPED_EXAMPLE_EXTRACTION_TEMPLATE", 23 | "UNTYPED_GRAPH_EXTRACTION_PROMPT", 24 | ] 25 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/summarize/strategies/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'ResolvedEntity' and 'EntityResolutionResult' models.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | from typing import Any 9 | 10 | from datashaper import VerbCallbacks 11 | 12 | from graphrag.index.cache import PipelineCache 13 | 14 | StrategyConfig = dict[str, Any] 15 | 16 | 17 | @dataclass 18 | class SummarizedDescriptionResult: 19 | """Entity summarization result class definition.""" 20 | 21 | items: str | tuple[str, str] 22 | description: str 23 | 24 | 25 | SummarizationStrategy = Callable[ 26 | [ 27 | str | tuple[str, str], 28 | list[str], 29 | VerbCallbacks, 30 | PipelineCache, 31 | StrategyConfig, 32 | ], 33 | Awaitable[SummarizedDescriptionResult], 34 | ] 35 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/global_search/callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """GlobalSearch LLM Callbacks.""" 5 | 6 | from graphrag.query.llm.base import BaseLLMCallback 7 | from graphrag.query.structured_search.base import SearchResult 8 | 9 | 10 | class GlobalSearchLLMCallback(BaseLLMCallback): 11 | """GlobalSearch LLM Callbacks.""" 12 | 13 | def __init__(self): 14 | super().__init__() 15 | self.map_response_contexts = [] 16 | self.map_response_outputs = [] 17 | 18 | def on_map_response_start(self, map_response_contexts: list[str]): 19 | """Handle the start of map response.""" 20 | self.map_response_contexts = map_response_contexts 21 | 22 | def on_map_response_end(self, map_response_outputs: list[SearchResult]): 23 | """Handle the end of map response.""" 24 | self.map_response_outputs = map_response_outputs 25 | -------------------------------------------------------------------------------- /graphrag/index/verbs/snapshot.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing snapshot method definition.""" 5 | 6 | from datashaper import TableContainer, VerbInput, verb 7 | 8 | from graphrag.index.storage import PipelineStorage 9 | 10 | 11 | @verb(name="snapshot") 12 | async def snapshot( 13 | input: VerbInput, 14 | name: str, 15 | formats: list[str], 16 | storage: PipelineStorage, 17 | **_kwargs: dict, 18 | ) -> TableContainer: 19 | """Take a entire snapshot of the tabular data.""" 20 | data = input.get_input() 21 | 22 | for fmt in formats: 23 | if fmt == "parquet": 24 | await storage.set(name + ".parquet", data.to_parquet()) 25 | elif fmt == "json": 26 | await storage.set( 27 | name + ".json", data.to_json(orient="records", lines=True) 28 | ) 29 | 30 | return TableContainer(table=data) 31 | -------------------------------------------------------------------------------- /graphrag/index/emit/csv_table_emitter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """CSVTableEmitter module.""" 5 | 6 | import logging 7 | 8 | import pandas as pd 9 | 10 | from graphrag.index.storage import PipelineStorage 11 | 12 | from .table_emitter import TableEmitter 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | 17 | class CSVTableEmitter(TableEmitter): 18 | """CSVTableEmitter class.""" 19 | 20 | _storage: PipelineStorage 21 | 22 | def __init__(self, storage: PipelineStorage): 23 | """Create a new CSV Table Emitter.""" 24 | self._storage = storage 25 | 26 | async def emit(self, name: str, data: pd.DataFrame) -> None: 27 | """Emit a dataframe to storage.""" 28 | filename = f"{name}.csv" 29 | log.info("emitting CSV table %s", filename) 30 | await self._storage.set( 31 | filename, 32 | data.to_csv(), 33 | ) 34 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/translate/strategies/mock.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing run and _summarize_text methods definitions.""" 5 | 6 | from typing import Any 7 | 8 | from datashaper import VerbCallbacks 9 | 10 | from graphrag.index.cache import PipelineCache 11 | 12 | from .typing import TextTranslationResult 13 | 14 | 15 | async def run( # noqa RUF029 async is required for interface 16 | input: str | list[str], 17 | _args: dict[str, Any], 18 | _reporter: VerbCallbacks, 19 | _cache: PipelineCache, 20 | ) -> TextTranslationResult: 21 | """Run the Claim extraction chain.""" 22 | input = [input] if isinstance(input, str) else input 23 | return TextTranslationResult(translations=[_translate_text(text) for text in input]) 24 | 25 | 26 | def _translate_text(text: str) -> str: 27 | """Translate a single piece of text.""" 28 | return f"{text} translated" 29 | -------------------------------------------------------------------------------- /graphrag/llm/mock/mock_completion_llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """LLM Static Response method definition.""" 5 | 6 | import logging 7 | 8 | from typing_extensions import Unpack 9 | 10 | from graphrag.llm.base import BaseLLM 11 | from graphrag.llm.types import ( 12 | CompletionInput, 13 | CompletionOutput, 14 | LLMInput, 15 | ) 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | 20 | class MockCompletionLLM( 21 | BaseLLM[ 22 | CompletionInput, 23 | CompletionOutput, 24 | ] 25 | ): 26 | """Mock Completion LLM for testing purposes.""" 27 | 28 | def __init__(self, responses: list[str]): 29 | self.responses = responses 30 | self._on_error = None 31 | 32 | async def _execute_llm( 33 | self, 34 | input: CompletionInput, 35 | **kwargs: Unpack[LLMInput], 36 | ) -> CompletionOutput: 37 | return self.responses[0] 38 | -------------------------------------------------------------------------------- /graphrag/config/models/llm_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from datashaper import AsyncType 7 | from pydantic import BaseModel, Field 8 | 9 | import graphrag.config.defaults as defs 10 | 11 | from .llm_parameters import LLMParameters 12 | from .parallelization_parameters import ParallelizationParameters 13 | 14 | 15 | class LLMConfig(BaseModel): 16 | """Base class for LLM-configured steps.""" 17 | 18 | llm: LLMParameters = Field( 19 | description="The LLM configuration to use.", default=LLMParameters() 20 | ) 21 | parallelization: ParallelizationParameters = Field( 22 | description="The parallelization configuration to use.", 23 | default=ParallelizationParameters(), 24 | ) 25 | async_mode: AsyncType = Field( 26 | description="The async mode to use.", default=defs.ASYNC_MODE 27 | ) 28 | -------------------------------------------------------------------------------- /graphrag/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """ 5 | GraphRAG knowledge model package root. 6 | 7 | The GraphRAG knowledge model contains a set of classes that represent the target datamodels for our pipelines and analytics tools. 8 | These models can be augmented and integrated into your own data infrastructure to suit your needs. 
9 | """ 10 | 11 | from .community import Community 12 | from .community_report import CommunityReport 13 | from .covariate import Covariate 14 | from .document import Document 15 | from .entity import Entity 16 | from .identified import Identified 17 | from .named import Named 18 | from .relationship import Relationship 19 | from .text_unit import TextUnit 20 | 21 | __all__ = [ 22 | "Community", 23 | "CommunityReport", 24 | "Covariate", 25 | "Document", 26 | "Entity", 27 | "Identified", 28 | "Named", 29 | "Relationship", 30 | "TextUnit", 31 | ] 32 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/domain.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Domain generation for GraphRAG prompts.""" 5 | 6 | from graphrag.llm.types.llm_types import CompletionLLM 7 | from graphrag.prompt_tune.prompt.domain import GENERATE_DOMAIN_PROMPT 8 | 9 | 10 | async def generate_domain(llm: CompletionLLM, docs: str | list[str]) -> str: 11 | """Generate an LLM persona to use for GraphRAG prompts. 12 | 13 | Parameters 14 | ---------- 15 | - llm (CompletionLLM): The LLM to use for generation 16 | - docs (str | list[str]): The domain to generate a persona for 17 | 18 | Returns 19 | ------- 20 | - str: The generated domain prompt response. 
21 | """ 22 | docs_str = " ".join(docs) if isinstance(docs, list) else docs 23 | domain_prompt = GENERATE_DOMAIN_PROMPT.format(input_text=docs_str) 24 | 25 | response = await llm(domain_prompt) 26 | 27 | return str(response.output) 28 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | environs==11.0.0 2 | datashaper==0.0.49 3 | azure-search-documents==11.4.0 4 | lancedb==0.9.0 5 | uvloop==0.19.0; platform_system != 'Windows' 6 | nest-asyncio==1.6.0; platform_system == 'Windows' 7 | aiolimiter==1.1.0 8 | aiofiles==24.1.0 9 | openai==1.35.7 10 | nltk==3.8.1 11 | tiktoken==0.7.0 12 | numba==0.60.0 13 | numpy==1.25.2 14 | graspologic==3.4.1 15 | networkx==3 16 | fastparquet==2024.2.0 17 | scipy==1.12.0 18 | pyyaml==6.0.1 19 | pyaml-env==1.2.1 20 | python-dotenv==1.0.0 21 | tenacity==8.2.3 22 | swifter==1.4.0 23 | pydantic==2 24 | rich==13.6.0 25 | textual==0.70.0 26 | devtools==0.12.2 27 | typing-extensions==4.12.2 28 | azure-storage-blob==12.19.0 29 | azure-identity==1.17.1 30 | coverage==7.5.4 31 | ipykernel==6.29.4 32 | jupyter==1.0.0 33 | nbconvert==7.16.3 34 | poethepoet==0.26.0 35 | pyright==1.1.368 36 | pytest==8.2.0 37 | pytest-asyncio==0.23.4 38 | pytest-timeout==2.3.1 39 | ruff==0.5.0 40 | semversioner==2.0.3 41 | update-toml==0.2.1 42 | 43 | -------------------------------------------------------------------------------- /graphrag/llm/types/llm_invocation_result.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Typing definitions for the OpenAI DataShaper package.""" 5 | 6 | from dataclasses import dataclass 7 | from typing import Generic, TypeVar 8 | 9 | T = TypeVar("T") 10 | 11 | 12 | @dataclass 13 | class LLMInvocationResult(Generic[T]): 14 | """The result of an LLM invocation.""" 15 | 16 | result: T | None 17 | """The result of the LLM invocation.""" 18 | 19 | name: str 20 | """The operation name of the result""" 21 | 22 | num_retries: int 23 | """The number of retries the invocation took.""" 24 | 25 | total_time: float 26 | """The total time of the LLM invocation.""" 27 | 28 | call_times: list[float] 29 | """The network times of individual invocations.""" 30 | 31 | input_tokens: int 32 | """The number of input tokens.""" 33 | 34 | output_tokens: int 35 | """The number of output tokens.""" 36 | -------------------------------------------------------------------------------- /docsite/.eleventy.js: -------------------------------------------------------------------------------- 1 | const { EleventyHtmlBasePlugin } = require("@11ty/eleventy"); 2 | const syntaxHighlight = require("@11ty/eleventy-plugin-syntaxhighlight"); 3 | const codeClipboard = require("eleventy-plugin-code-clipboard"); 4 | const pluginMermaid = require("@kevingimbel/eleventy-plugin-mermaid"); 5 | const markdownIt = require('markdown-it'); 6 | 7 | module.exports = (eleventyConfig) => { 8 | eleventyConfig.addPlugin(syntaxHighlight); 9 | eleventyConfig.addPlugin(codeClipboard); 10 | eleventyConfig.addPlugin(pluginMermaid); 11 | eleventyConfig.addPlugin(EleventyHtmlBasePlugin, { 12 | baseHref: process.env.DOCSITE_BASE_URL || "" 13 | }); 14 | eleventyConfig.addPassthroughCopy("data"); 15 | eleventyConfig.addPassthroughCopy("img"); 16 | // Ignore auto-generated content 17 | eleventyConfig.setUseGitIgnore(false); 18 | 19 | const markdownLibrary = markdownIt({ 20 | html: true 21 | }).use(codeClipboard.markdownItCopyButton); 22 | 23 | 
eleventyConfig.setLibrary("md", markdownLibrary); 24 | 25 | }; -------------------------------------------------------------------------------- /graphrag/index/emit/json_table_emitter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """JsonTableEmitter module.""" 5 | 6 | import logging 7 | 8 | import pandas as pd 9 | 10 | from graphrag.index.storage import PipelineStorage 11 | 12 | from .table_emitter import TableEmitter 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | 17 | class JsonTableEmitter(TableEmitter): 18 | """JsonTableEmitter class.""" 19 | 20 | _storage: PipelineStorage 21 | 22 | def __init__(self, storage: PipelineStorage): 23 | """Create a new Json Table Emitter.""" 24 | self._storage = storage 25 | 26 | async def emit(self, name: str, data: pd.DataFrame) -> None: 27 | """Emit a dataframe to storage.""" 28 | filename = f"{name}.json" 29 | 30 | log.info("emitting JSON table %s", filename) 31 | await self._storage.set( 32 | filename, 33 | data.to_json(orient="records", lines=True, force_ascii=False), 34 | ) 35 | -------------------------------------------------------------------------------- /graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A file containing MOCK_LLM_RESPONSES definition.""" 5 | 6 | MOCK_LLM_RESPONSES = [ 7 | """ 8 | [ 9 | { 10 | "subject": "COMPANY A", 11 | "object": "GOVERNMENT AGENCY B", 12 | "type": "ANTI-COMPETITIVE PRACTICES", 13 | "status": "TRUE", 14 | "start_date": "2022-01-10T00:00:00", 15 | "end_date": "2022-01-10T00:00:00", 16 | "description": "Company A was found to engage in anti-competitive practices because it was fined for bid rigging in multiple public tenders published by Government Agency B according to an article published on 2022/01/10", 17 | "source_text": ["According to an article published on 2022/01/10, Company A was fined for bid rigging while participating in multiple public tenders published by Government Agency B."] 18 | } 19 | ] 20 | """.strip() 21 | ] 22 | -------------------------------------------------------------------------------- /graphrag/config/models/cluster_graph_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | 10 | 11 | class ClusterGraphConfig(BaseModel): 12 | """Configuration section for clustering graphs.""" 13 | 14 | max_cluster_size: int = Field( 15 | description="The maximum cluster size to use.", default=defs.MAX_CLUSTER_SIZE 16 | ) 17 | strategy: dict | None = Field( 18 | description="The cluster strategy to use.", default=None 19 | ) 20 | 21 | def resolved_strategy(self) -> dict: 22 | """Get the resolved cluster strategy.""" 23 | from graphrag.index.verbs.graph.clustering import GraphCommunityStrategyType 24 | 25 | return self.strategy or { 26 | "type": GraphCommunityStrategyType.leiden, 27 | "max_cluster_size": self.max_cluster_size, 28 | } 29 | -------------------------------------------------------------------------------- /graphrag/index/workflows/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'WorkflowToRun' model.""" 5 | 6 | from collections.abc import Callable 7 | from dataclasses import dataclass as dc_dataclass 8 | from typing import Any 9 | 10 | from datashaper import TableContainer, Workflow 11 | 12 | StepDefinition = dict[str, Any] 13 | """A step definition.""" 14 | 15 | VerbDefinitions = dict[str, Callable[..., TableContainer]] 16 | """A mapping of verb names to their implementations.""" 17 | 18 | WorkflowConfig = dict[str, Any] 19 | """A workflow configuration.""" 20 | 21 | WorkflowDefinitions = dict[str, Callable[[WorkflowConfig], list[StepDefinition]]] 22 | """A mapping of workflow names to their implementations.""" 23 | 24 | VerbTiming = dict[str, float] 25 | """The timings of verbs by id.""" 26 | 27 | 28 | @dc_dataclass 29 | class WorkflowToRun: 30 | """Workflow to run class definition.""" 31 | 32 | workflow: Workflow 33 | config: dict[str, Any] 34 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | version: 2 6 | updates: 7 | - package-ecosystem: "npm" # See documentation for possible values 8 | directory: "docsite/" # Location of package manifests 9 | schedule: 10 | interval: "weekly" 11 | - package-ecosystem: "pip" # See documentation for possible values 12 | directory: "/" # Location of package manifests 13 | schedule: 14 | interval: "weekly" 15 | - package-ecosystem: "github-actions" 16 | # Workflow files stored in the default location of `.github/workflows`. 
(You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.) 17 | directory: "/" 18 | schedule: 19 | interval: "weekly" 20 | -------------------------------------------------------------------------------- /tests/fixtures/azure/settings.yml: -------------------------------------------------------------------------------- 1 | claim_extraction: 2 | enabled: true 3 | 4 | embeddings: 5 | vector_store: 6 | type: "azure_ai_search" 7 | url: ${AZURE_AI_SEARCH_URL_ENDPOINT} 8 | api_key: ${AZURE_AI_SEARCH_API_KEY} 9 | collection_name: "azure_ci" 10 | query_collection_name: "azure_ci_query" 11 | 12 | entity_name_description: 13 | title_column: "name" 14 | 15 | input: 16 | type: blob 17 | file_type: text 18 | connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING} 19 | container_name: azurefixture 20 | base_dir: input 21 | 22 | cache: 23 | type: blob 24 | connection_string: ${BLOB_STORAGE_CONNECTION_STRING} 25 | container_name: cicache 26 | base_dir: cache_azure_ai 27 | 28 | storage: 29 | type: blob 30 | connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING} 31 | container_name: azurefixture 32 | base_dir: output 33 | 34 | reporting: 35 | type: blob 36 | connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING} 37 | container_name: azurefixture 38 | base_dir: reports 39 | -------------------------------------------------------------------------------- /graphrag/llm/types/llm_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """LLM Configuration Protocol definition.""" 5 | 6 | from typing import Protocol 7 | 8 | 9 | class LLMConfig(Protocol): 10 | """LLM Configuration Protocol definition.""" 11 | 12 | @property 13 | def max_retries(self) -> int | None: 14 | """Get the maximum number of retries.""" 15 | ... 
16 | 17 | @property 18 | def max_retry_wait(self) -> float | None: 19 | """Get the maximum retry wait time.""" 20 | ... 21 | 22 | @property 23 | def sleep_on_rate_limit_recommendation(self) -> bool | None: 24 | """Get whether to sleep on rate limit recommendation.""" 25 | ... 26 | 27 | @property 28 | def tokens_per_minute(self) -> int | None: 29 | """Get the number of tokens per minute.""" 30 | ... 31 | 32 | @property 33 | def requests_per_minute(self) -> int | None: 34 | """Get the number of requests per minute.""" 35 | ... 36 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/report/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph report package root.""" 5 | 6 | from .create_community_reports import ( 7 | CreateCommunityReportsStrategyType, 8 | create_community_reports, 9 | ) 10 | from .prepare_community_reports import prepare_community_reports 11 | from .prepare_community_reports_claims import prepare_community_reports_claims 12 | from .prepare_community_reports_edges import prepare_community_reports_edges 13 | from .prepare_community_reports_nodes import prepare_community_reports_nodes 14 | from .restore_community_hierarchy import restore_community_hierarchy 15 | 16 | __all__ = [ 17 | "CreateCommunityReportsStrategyType", 18 | "create_community_reports", 19 | "create_community_reports", 20 | "prepare_community_reports", 21 | "prepare_community_reports_claims", 22 | "prepare_community_reports_edges", 23 | "prepare_community_reports_nodes", 24 | "restore_community_hierarchy", 25 | ] 26 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/persona.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Persona generating module for fine-tuning GraphRAG prompts.""" 5 | 6 | from graphrag.llm.types.llm_types import CompletionLLM 7 | from graphrag.prompt_tune.generator.defaults import DEFAULT_TASK 8 | from graphrag.prompt_tune.prompt import GENERATE_PERSONA_PROMPT 9 | 10 | 11 | async def generate_persona( 12 | llm: CompletionLLM, domain: str, task: str = DEFAULT_TASK 13 | ) -> str: 14 | """Generate an LLM persona to use for GraphRAG prompts. 15 | 16 | Parameters 17 | ---------- 18 | - llm (CompletionLLM): The LLM to use for generation 19 | - domain (str): The domain to generate a persona for 20 | - task (str): The task to generate a persona for. Default is DEFAULT_TASK 21 | """ 22 | formatted_task = task.format(domain=domain) 23 | persona_prompt = GENERATE_PERSONA_PROMPT.format(sample_task=formatted_task) 24 | 25 | response = await llm(persona_prompt) 26 | 27 | return str(response.output) 28 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A file containing some default responses.""" 5 | 6 | from graphrag.config.enums import LLMType 7 | 8 | MOCK_LLM_RESPONSES = [ 9 | """ 10 | ("entity"<|>COMPANY_A<|>COMPANY<|>Company_A is a test company) 11 | ## 12 | ("entity"<|>COMPANY_B<|>COMPANY<|>Company_B owns Company_A and also shares an address with Company_A) 13 | ## 14 | ("entity"<|>PERSON_C<|>PERSON<|>Person_C is director of Company_A) 15 | ## 16 | ("relationship"<|>COMPANY_A<|>COMPANY_B<|>Company_A and Company_B are related because Company_A is 100% owned by Company_B and the two companies also share the same address)<|>2) 17 | ## 18 | ("relationship"<|>COMPANY_A<|>PERSON_C<|>Company_A and Person_C are related because Person_C is director of Company_A<|>1)) 19 | """.strip() 20 | ] 21 | 22 | DEFAULT_LLM_CONFIG = { 23 | "type": LLMType.StaticResponse, 24 | "responses": MOCK_LLM_RESPONSES, 25 | } 26 | -------------------------------------------------------------------------------- /graphrag/config/models/cache_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | from graphrag.config.enums import CacheType 10 | 11 | 12 | class CacheConfig(BaseModel): 13 | """The default configuration section for Cache.""" 14 | 15 | type: CacheType = Field( 16 | description="The cache type to use.", default=defs.CACHE_TYPE 17 | ) 18 | base_dir: str = Field( 19 | description="The base directory for the cache.", default=defs.CACHE_BASE_DIR 20 | ) 21 | connection_string: str | None = Field( 22 | description="The cache connection string to use.", default=None 23 | ) 24 | container_name: str | None = Field( 25 | description="The cache container name to use.", default=None 26 | ) 27 | storage_account_blob_url: str | None = Field( 28 | description="The storage account blob url to use.", default=None 29 | ) 30 | -------------------------------------------------------------------------------- /graphrag/query/question_gen/system_prompt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Question Generation system prompts.""" 5 | 6 | QUESTION_SYSTEM_PROMPT = """ 7 | ---Role--- 8 | 9 | You are a helpful assistant generating a bulleted list of {question_count} questions about data in the tables provided. 10 | 11 | 12 | ---Data tables--- 13 | 14 | {context_data} 15 | 16 | 17 | ---Goal--- 18 | 19 | Given a series of example questions provided by the user, generate a bulleted list of {question_count} candidates for the next question. Use - marks as bullet points. 20 | 21 | These candidate questions should represent the most important or urgent information content or themes in the data tables. 
22 | 23 | The candidate questions should be answerable using the data tables provided, but should not mention any specific data fields or data tables in the question text. 24 | 25 | If the user's questions reference several named entities, then each candidate question should reference all named entities. 26 | 27 | ---Example questions--- 28 | """ 29 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Prompt generation module.""" 5 | 6 | from .community_report_summarization import create_community_summarization_prompt 7 | from .community_reporter_role import generate_community_reporter_role 8 | from .defaults import MAX_TOKEN_COUNT 9 | from .domain import generate_domain 10 | from .entity_extraction_prompt import create_entity_extraction_prompt 11 | from .entity_relationship import generate_entity_relationship_examples 12 | from .entity_summarization_prompt import create_entity_summarization_prompt 13 | from .entity_types import generate_entity_types 14 | from .persona import generate_persona 15 | 16 | __all__ = [ 17 | "MAX_TOKEN_COUNT", 18 | "create_community_summarization_prompt", 19 | "create_entity_extraction_prompt", 20 | "create_entity_summarization_prompt", 21 | "generate_community_reporter_role", 22 | "generate_domain", 23 | "generate_entity_relationship_examples", 24 | "generate_entity_types", 25 | "generate_persona", 26 | ] 27 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Persona, entity type, relationships and domain generation prompts module.""" 5 | 6 | from .community_reporter_role import GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT 7 | from .domain import GENERATE_DOMAIN_PROMPT 8 | from .entity_relationship import ( 9 | ENTITY_RELATIONSHIPS_GENERATION_JSON_PROMPT, 10 | ENTITY_RELATIONSHIPS_GENERATION_PROMPT, 11 | UNTYPED_ENTITY_RELATIONSHIPS_GENERATION_PROMPT, 12 | ) 13 | from .entity_types import ( 14 | ENTITY_TYPE_GENERATION_JSON_PROMPT, 15 | ENTITY_TYPE_GENERATION_PROMPT, 16 | ) 17 | from .persona import GENERATE_PERSONA_PROMPT 18 | 19 | __all__ = [ 20 | "ENTITY_RELATIONSHIPS_GENERATION_JSON_PROMPT", 21 | "ENTITY_RELATIONSHIPS_GENERATION_PROMPT", 22 | "ENTITY_TYPE_GENERATION_JSON_PROMPT", 23 | "ENTITY_TYPE_GENERATION_PROMPT", 24 | "GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT", 25 | "GENERATE_DOMAIN_PROMPT", 26 | "GENERATE_PERSONA_PROMPT", 27 | "UNTYPED_ENTITY_RELATIONSHIPS_GENERATION_PROMPT", 28 | ] 29 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/community_reporter_role.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for community reporter role generation.""" 5 | 6 | GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT = """ 7 | {persona} 8 | Given a sample text, help the user by creating a role definition that will be tasked with community analysis. 9 | Take a look at this example, determine its key parts, and using the domain provided and your expertise, create a new role definition for the provided inputs that follows the same pattern as the example. 10 | Remember, your output should look just like the provided example in structure and content. 
11 | 12 | Example: 13 | A technologist reporter that is analyzing Kevin Scott's "Behind the Tech Podcast", given a list of entities 14 | that belong to the community as well as their relationships and optional associated claims. 15 | The report will be used to inform decision-makers about significant developments associated with the community and their potential impact. 16 | 17 | 18 | Domain: {domain} 19 | Text: {input_text} 20 | Role:""" 21 | -------------------------------------------------------------------------------- /graphrag/config/models/storage_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | from graphrag.config.enums import StorageType 10 | 11 | 12 | class StorageConfig(BaseModel): 13 | """The default configuration section for Storage.""" 14 | 15 | type: StorageType = Field( 16 | description="The storage type to use.", default=defs.STORAGE_TYPE 17 | ) 18 | base_dir: str = Field( 19 | description="The base directory for the storage.", 20 | default=defs.STORAGE_BASE_DIR, 21 | ) 22 | connection_string: str | None = Field( 23 | description="The storage connection string to use.", default=None 24 | ) 25 | container_name: str | None = Field( 26 | description="The storage container name to use.", default=None 27 | ) 28 | storage_account_blob_url: str | None = Field( 29 | description="The storage account blob url to use.", default=None 30 | ) 31 | -------------------------------------------------------------------------------- /graphrag/llm/types/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """LLM Typings.""" 5 | 6 | from .llm import LLM 7 | from .llm_cache import LLMCache 8 | from .llm_callbacks import ( 9 | ErrorHandlerFn, 10 | IsResponseValidFn, 11 | LLMInvocationFn, 12 | OnCacheActionFn, 13 | ) 14 | from .llm_config import LLMConfig 15 | from .llm_invocation_result import LLMInvocationResult 16 | from .llm_io import ( 17 | LLMInput, 18 | LLMOutput, 19 | ) 20 | from .llm_types import ( 21 | CompletionInput, 22 | CompletionLLM, 23 | CompletionOutput, 24 | EmbeddingInput, 25 | EmbeddingLLM, 26 | EmbeddingOutput, 27 | ) 28 | 29 | __all__ = [ 30 | "LLM", 31 | "CompletionInput", 32 | "CompletionLLM", 33 | "CompletionOutput", 34 | "EmbeddingInput", 35 | "EmbeddingLLM", 36 | "EmbeddingOutput", 37 | "ErrorHandlerFn", 38 | "IsResponseValidFn", 39 | "LLMCache", 40 | "LLMConfig", 41 | "LLMInput", 42 | "LLMInvocationFn", 43 | "LLMInvocationResult", 44 | "LLMOutput", 45 | "OnCacheActionFn", 46 | ] 47 | -------------------------------------------------------------------------------- /.vsts-ci.yml: -------------------------------------------------------------------------------- 1 | name: GraphRAG CI 2 | pool: 3 | vmImage: ubuntu-latest 4 | 5 | trigger: 6 | batch: true 7 | branches: 8 | include: 9 | - main 10 | 11 | variables: 12 | isMain: $[eq(variables['Build.SourceBranch'], 'refs/heads/main')] 13 | pythonVersion: "3.10" 14 | poetryVersion: "1.6.1" 15 | nodeVersion: "18.x" 16 | artifactsFullFeedName: "Resilience/resilience_python" 17 | 18 | stages: 19 | - stage: Compliance 20 | dependsOn: [] 21 | jobs: 22 | - job: compliance 23 | displayName: Compliance 24 | pool: 25 | vmImage: windows-latest 26 | steps: 27 | - task: CredScan@3 28 | inputs: 29 | outputFormat: sarif 30 | debugMode: false 31 | 32 | - task: ComponentGovernanceComponentDetection@0 33 | inputs: 34 | scanType: "Register" 35 | verbosity: "Verbose" 36 | alertWarningLevel: "High" 37 | 38 | - task: PublishSecurityAnalysisLogs@3 39 | inputs: 40 | ArtifactName: 
"CodeAnalysisLogs" 41 | ArtifactType: "Container" -------------------------------------------------------------------------------- /graphrag/index/utils/ds_util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A utility module datashaper-specific utility methods.""" 5 | 6 | from typing import cast 7 | 8 | from datashaper import TableContainer, VerbInput 9 | 10 | _NAMED_INPUTS_REQUIRED = "Named inputs are required" 11 | 12 | 13 | def get_required_input_table(input: VerbInput, name: str) -> TableContainer: 14 | """Get a required input table by name.""" 15 | return cast(TableContainer, get_named_input_table(input, name, required=True)) 16 | 17 | 18 | def get_named_input_table( 19 | input: VerbInput, name: str, required: bool = False 20 | ) -> TableContainer | None: 21 | """Get an input table from datashaper verb-inputs by name.""" 22 | named_inputs = input.named 23 | if named_inputs is None: 24 | if not required: 25 | return None 26 | raise ValueError(_NAMED_INPUTS_REQUIRED) 27 | 28 | result = named_inputs.get(name) 29 | if result is None and required: 30 | msg = f"input '${name}' is required" 31 | raise ValueError(msg) 32 | return result 33 | -------------------------------------------------------------------------------- /graphrag/index/reporting/console_workflow_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Console-based reporter for the workflow engine.""" 5 | 6 | from datashaper import NoopWorkflowCallbacks 7 | 8 | 9 | class ConsoleWorkflowCallbacks(NoopWorkflowCallbacks): 10 | """A reporter that writes to a console.""" 11 | 12 | def on_error( 13 | self, 14 | message: str, 15 | cause: BaseException | None = None, 16 | stack: str | None = None, 17 | details: dict | None = None, 18 | ): 19 | """Handle when an error occurs.""" 20 | print(message, str(cause), stack, details) # noqa T201 21 | 22 | def on_warning(self, message: str, details: dict | None = None): 23 | """Handle when a warning occurs.""" 24 | _print_warning(message) 25 | 26 | def on_log(self, message: str, details: dict | None = None): 27 | """Handle when a log message is produced.""" 28 | print(message, details) # noqa T201 29 | 30 | 31 | def _print_warning(skk): 32 | print("\033[93m {}\033[00m".format(skk)) # noqa T201 33 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/extraction/strategies/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'Document' and 'EntityExtractionResult' models.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | from typing import Any 9 | 10 | from datashaper import VerbCallbacks 11 | 12 | from graphrag.index.cache import PipelineCache 13 | 14 | ExtractedEntity = dict[str, Any] 15 | StrategyConfig = dict[str, Any] 16 | EntityTypes = list[str] 17 | 18 | 19 | @dataclass 20 | class Document: 21 | """Document class definition.""" 22 | 23 | text: str 24 | id: str 25 | 26 | 27 | @dataclass 28 | class EntityExtractionResult: 29 | """Entity extraction result class definition.""" 30 | 31 | entities: list[ExtractedEntity] 32 | graphml_graph: str | None 33 | 34 | 35 | EntityExtractStrategy = Callable[ 36 | [ 37 | list[Document], 38 | EntityTypes, 39 | VerbCallbacks, 40 | PipelineCache, 41 | StrategyConfig, 42 | ], 43 | Awaitable[EntityExtractionResult], 44 | ] 45 | -------------------------------------------------------------------------------- /tests/unit/indexing/test_init_content.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | import re 5 | from typing import Any, cast 6 | 7 | import yaml 8 | 9 | from graphrag.config import ( 10 | GraphRagConfig, 11 | create_graphrag_config, 12 | ) 13 | from graphrag.index.init_content import INIT_YAML 14 | 15 | 16 | def test_init_yaml(): 17 | data = yaml.load(INIT_YAML, Loader=yaml.FullLoader) 18 | config = create_graphrag_config(data) 19 | GraphRagConfig.model_validate(config, strict=True) 20 | 21 | 22 | def test_init_yaml_uncommented(): 23 | lines = INIT_YAML.splitlines() 24 | lines = [line for line in lines if "##" not in line] 25 | 26 | def uncomment_line(line: str) -> str: 27 | leading_whitespace = cast(Any, re.search(r"^(\s*)", line)).group(1) 28 | return re.sub(r"^\s*# ", leading_whitespace, line, count=1) 29 | 30 | content = "\n".join([uncomment_line(line) for line in lines]) 31 | data = yaml.load(content, Loader=yaml.FullLoader) 32 | config = create_graphrag_config(data) 33 | GraphRagConfig.model_validate(config, strict=True) 34 | -------------------------------------------------------------------------------- /graphrag/config/models/reporting_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | from graphrag.config.enums import ReportingType 10 | 11 | 12 | class ReportingConfig(BaseModel): 13 | """The default configuration section for Reporting.""" 14 | 15 | type: ReportingType = Field( 16 | description="The reporting type to use.", default=defs.REPORTING_TYPE 17 | ) 18 | base_dir: str = Field( 19 | description="The base directory for reporting.", 20 | default=defs.REPORTING_BASE_DIR, 21 | ) 22 | connection_string: str | None = Field( 23 | description="The reporting connection string to use.", default=None 24 | ) 25 | container_name: str | None = Field( 26 | description="The reporting container name to use.", default=None 27 | ) 28 | storage_account_blob_url: str | None = Field( 29 | description="The storage account blob url to use.", default=None 30 | ) 31 | -------------------------------------------------------------------------------- /graphrag/config/input_models/input_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | from graphrag.config.enums import InputFileType, InputType 9 | 10 | 11 | class InputConfigInput(TypedDict): 12 | """The default configuration section for Input.""" 13 | 14 | type: NotRequired[InputType | str | None] 15 | file_type: NotRequired[InputFileType | str | None] 16 | base_dir: NotRequired[str | None] 17 | connection_string: NotRequired[str | None] 18 | container_name: NotRequired[str | None] 19 | file_encoding: NotRequired[str | None] 20 | file_pattern: NotRequired[str | None] 21 | source_column: NotRequired[str | None] 22 | timestamp_column: NotRequired[str | None] 23 | timestamp_format: NotRequired[str | None] 24 | text_column: NotRequired[str | None] 25 | title_column: NotRequired[str | None] 26 | document_attribute_columns: NotRequired[list[str] | str | None] 27 | storage_account_blob_url: NotRequired[str | None] 28 | -------------------------------------------------------------------------------- /graphrag/query/context_builder/builders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Base classes for global and local context builders.""" 5 | 6 | from abc import ABC, abstractmethod 7 | 8 | import pandas as pd 9 | 10 | from graphrag.query.context_builder.conversation_history import ( 11 | ConversationHistory, 12 | ) 13 | 14 | 15 | class GlobalContextBuilder(ABC): 16 | """Base class for global-search context builders.""" 17 | 18 | @abstractmethod 19 | def build_context( 20 | self, conversation_history: ConversationHistory | None = None, **kwargs 21 | ) -> tuple[str | list[str], dict[str, pd.DataFrame]]: 22 | """Build the context for the global search mode.""" 23 | 24 | 25 | class LocalContextBuilder(ABC): 26 | """Base class for local-search context builders.""" 27 | 28 | @abstractmethod 29 | def build_context( 30 | self, 31 | query: str, 32 | conversation_history: ConversationHistory | None = None, 33 | **kwargs, 34 | ) -> tuple[str | list[str], dict[str, pd.DataFrame]]: 35 | """Build the context for the local search mode.""" 36 | -------------------------------------------------------------------------------- /graphrag/llm/openai/json_parsing_llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """An LLM that unpacks cached JSON responses.""" 5 | 6 | from typing_extensions import Unpack 7 | 8 | from graphrag.llm.types import ( 9 | LLM, 10 | CompletionInput, 11 | CompletionLLM, 12 | CompletionOutput, 13 | LLMInput, 14 | LLMOutput, 15 | ) 16 | 17 | from .utils import try_parse_json_object 18 | 19 | 20 | class JsonParsingLLM(LLM[CompletionInput, CompletionOutput]): 21 | """A delegating LLM that parses JSON from the delegate's text output when the caller requested JSON but none was returned.""" 22 | 23 | _delegate: CompletionLLM 24 | 25 | def __init__(self, delegate: CompletionLLM): 26 | self._delegate = delegate 27 | 28 | async def __call__( 29 | self, 30 | input: CompletionInput, 31 | **kwargs: Unpack[LLMInput], 32 | ) -> LLMOutput[CompletionOutput]: 33 | """Call the delegate, then populate result.json by parsing the text output if kwargs requested JSON and the delegate did not supply it.""" 34 | result = await self._delegate(input, **kwargs) 35 | if kwargs.get("json") and result.json is None and result.output is not None: 36 | result.json = try_parse_json_object(result.output) 37 | return result 38 | -------------------------------------------------------------------------------- /graphrag/llm/openai/openai_history_tracking_llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Chat-based language model.""" 5 | 6 | from typing_extensions import Unpack 7 | 8 | from graphrag.llm.types import ( 9 | LLM, 10 | CompletionInput, 11 | CompletionLLM, 12 | CompletionOutput, 13 | LLMInput, 14 | LLMOutput, 15 | ) 16 | 17 | 18 | class OpenAIHistoryTrackingLLM(LLM[CompletionInput, CompletionOutput]): 19 | """An OpenAI History-Tracking LLM.""" 20 | 21 | _delegate: CompletionLLM 22 | 23 | def __init__(self, delegate: CompletionLLM): 24 | self._delegate = delegate 25 | 26 | async def __call__( 27 | self, 28 | input: CompletionInput, 29 | **kwargs: Unpack[LLMInput], 30 | ) -> LLMOutput[CompletionOutput]: 31 | """Call the LLM.""" 32 | history = kwargs.get("history") or [] 33 | output = await self._delegate(input, **kwargs) 34 | return LLMOutput( 35 | output=output.output, 36 | json=output.json, 37 | history=[*history, {"role": "system", "content": output.output}], 38 | ) 39 | -------------------------------------------------------------------------------- /graphrag/llm/openai/openai_token_replacing_llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Chat-based language model.""" 5 | 6 | from typing_extensions import Unpack 7 | 8 | from graphrag.llm.types import ( 9 | LLM, 10 | CompletionInput, 11 | CompletionLLM, 12 | CompletionOutput, 13 | LLMInput, 14 | LLMOutput, 15 | ) 16 | 17 | from .utils import perform_variable_replacements 18 | 19 | 20 | class OpenAITokenReplacingLLM(LLM[CompletionInput, CompletionOutput]): 21 | """An OpenAI History-Tracking LLM.""" 22 | 23 | _delegate: CompletionLLM 24 | 25 | def __init__(self, delegate: CompletionLLM): 26 | self._delegate = delegate 27 | 28 | async def __call__( 29 | self, 30 | input: CompletionInput, 31 | **kwargs: Unpack[LLMInput], 32 | ) -> LLMOutput[CompletionOutput]: 33 | """Call the LLM with the input and kwargs.""" 34 | variables = kwargs.get("variables") 35 | history = kwargs.get("history") or [] 36 | input = perform_variable_replacements(input, history, variables) 37 | return await self._delegate(input, **kwargs) 38 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/template/entity_summarization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for entity summarization.""" 5 | 6 | ENTITY_SUMMARIZATION_PROMPT = """ 7 | {persona} 8 | Using your expertise, you're asked to generate a comprehensive summary of the data provided below. 9 | Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. 10 | Please concatenate all of these into a single, concise description. Make sure to include information collected from all the descriptions. 11 | If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. 12 | Make sure it is written in third person, and include the entity names so we the have full context. 
13 | 14 | Enrich it as much as you can with relevant information from the nearby text, this is very important. 15 | 16 | If no answer is possible, or the description is empty, only convey information that is provided within the text. 17 | ####### 18 | -Data- 19 | Entities: {{entity_name}} 20 | Description List: {{description_list}} 21 | ####### 22 | Output:""" 23 | -------------------------------------------------------------------------------- /graphrag/index/utils/tokens.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Utilities for working with tokens.""" 5 | 6 | import tiktoken 7 | 8 | DEFAULT_ENCODING_NAME = "cl100k_base" 9 | 10 | 11 | def num_tokens_from_string( 12 | string: str, model: str | None = None, encoding_name: str | None = None 13 | ) -> int: 14 | """Return the number of tokens in a text string.""" 15 | if model is not None: 16 | encoding = tiktoken.encoding_for_model(model) 17 | else: 18 | encoding = tiktoken.get_encoding(encoding_name or DEFAULT_ENCODING_NAME) 19 | return len(encoding.encode(string)) 20 | 21 | 22 | def string_from_tokens( 23 | tokens: list[int], model: str | None = None, encoding_name: str | None = None 24 | ) -> str: 25 | """Return a text string from a list of tokens.""" 26 | if model is not None: 27 | encoding = tiktoken.encoding_for_model(model) 28 | elif encoding_name is not None: 29 | encoding = tiktoken.get_encoding(encoding_name) 30 | else: 31 | msg = "Either model or encoding_name must be specified." 32 | raise ValueError(msg) 33 | return encoding.decode(tokens) 34 | -------------------------------------------------------------------------------- /graphrag/llm/limiting/tpm_rpm_limiter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """TPM RPM Limiter module.""" 5 | 6 | from aiolimiter import AsyncLimiter 7 | 8 | from .llm_limiter import LLMLimiter 9 | 10 | 11 | class TpmRpmLLMLimiter(LLMLimiter): 12 | """TPM RPM Limiter class definition.""" 13 | 14 | _tpm_limiter: AsyncLimiter | None 15 | _rpm_limiter: AsyncLimiter | None 16 | 17 | def __init__( 18 | self, tpm_limiter: AsyncLimiter | None, rpm_limiter: AsyncLimiter | None 19 | ): 20 | """Init method definition.""" 21 | self._tpm_limiter = tpm_limiter 22 | self._rpm_limiter = rpm_limiter 23 | 24 | @property 25 | def needs_token_count(self) -> bool: 26 | """Whether this limiter needs the token count to be passed in.""" 27 | return self._tpm_limiter is not None 28 | 29 | async def acquire(self, num_tokens: int = 1) -> None: 30 | """Call method definition.""" 31 | if self._tpm_limiter is not None: 32 | await self._tpm_limiter.acquire(num_tokens) 33 | if self._rpm_limiter is not None: 34 | await self._rpm_limiter.acquire() 35 | -------------------------------------------------------------------------------- /graphrag/index/graph/extractors/community_reports/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine community reports package root.""" 5 | 6 | import graphrag.index.graph.extractors.community_reports.schemas as schemas 7 | 8 | from .build_mixed_context import build_mixed_context 9 | from .community_reports_extractor import CommunityReportsExtractor 10 | from .prep_community_report_context import prep_community_report_context 11 | from .prompts import COMMUNITY_REPORT_PROMPT 12 | from .sort_context import sort_context 13 | from .utils import ( 14 | filter_claims_to_nodes, 15 | filter_edges_to_nodes, 16 | filter_nodes_to_level, 17 | get_levels, 18 | set_context_exceeds_flag, 19 | set_context_size, 20 | ) 21 | 22 | __all__ = [ 23 | "COMMUNITY_REPORT_PROMPT", 24 | "CommunityReportsExtractor", 25 | "build_mixed_context", 26 | "filter_claims_to_nodes", 27 | "filter_edges_to_nodes", 28 | "filter_nodes_to_level", 29 | "get_levels", 30 | "prep_community_report_context", 31 | "schemas", 32 | "set_context_exceeds_flag", 33 | "set_context_size", 34 | "sort_context", 35 | ] 36 | -------------------------------------------------------------------------------- /graphrag/llm/base/_create_cache_key.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Cache key generation utils.""" 5 | 6 | import hashlib 7 | 8 | 9 | def _llm_string(params: dict) -> str: 10 | # New version of the cache is not including n in the params dictionary 11 | # This avoids creating a new cache key for the same prompt 12 | if "max_tokens" in params and "n" not in params: 13 | params["n"] = None 14 | return str(sorted((k, v) for k, v in params.items())) 15 | 16 | 17 | def _hash(_input: str) -> str: 18 | """Use a deterministic hashing approach.""" 19 | return hashlib.md5(_input.encode()).hexdigest() # noqa S324 20 | 21 | 22 | def create_hash_key(operation: str, prompt: str, parameters: dict) -> str: 23 | """Compute cache key from prompt and associated model and settings. 24 | 25 | Args: 26 | prompt (str): The prompt run through the language model. 27 | llm_string (str): The language model version and settings. 28 | 29 | Returns 30 | ------- 31 | str: The cache key. 32 | """ 33 | llm_string = _llm_string(parameters) 34 | return f"{operation}-{_hash(prompt + llm_string)}" 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /graphrag/index/graph/embedding/embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Utilities to generate graph embeddings.""" 5 | 6 | from dataclasses import dataclass 7 | 8 | import graspologic as gc 9 | import networkx as nx 10 | import numpy as np 11 | 12 | 13 | @dataclass 14 | class NodeEmbeddings: 15 | """Node embeddings class definition.""" 16 | 17 | nodes: list[str] 18 | embeddings: np.ndarray 19 | 20 | 21 | def embed_nod2vec( 22 | graph: nx.Graph | nx.DiGraph, 23 | dimensions: int = 1536, 24 | num_walks: int = 10, 25 | walk_length: int = 40, 26 | window_size: int = 2, 27 | iterations: int = 3, 28 | random_seed: int = 86, 29 | ) -> NodeEmbeddings: 30 | """Generate node embeddings using Node2Vec.""" 31 | # generate embedding 32 | lcc_tensors = gc.embed.node2vec_embed( # type: ignore 33 | graph=graph, 34 | dimensions=dimensions, 35 | window_size=window_size, 36 | iterations=iterations, 37 | num_walks=num_walks, 38 | walk_length=walk_length, 39 | random_seed=random_seed, 40 | ) 41 | return NodeEmbeddings(embeddings=lcc_tensors[0], nodes=lcc_tensors[1]) 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Node Artifacts 
2 | */node_modules/ 3 | docsite/*/src/**/*.js 4 | docsite/*/lib/ 5 | docsite/*/storybook-static/ 6 | docsite/*/docsTemp/ 7 | docsite/*/build/ 8 | .swc/ 9 | dist/ 10 | .idea 11 | # https://yarnpkg.com/advanced/qa#which-files-should-be-gitignored 12 | docsite/.yarn/* 13 | !docsite/.yarn/patches 14 | !docsite/.yarn/releases 15 | !docsite/.yarn/plugins 16 | !docsite/.yarn/sdks 17 | !docsite/.yarn/versions 18 | docsite/.pnp.* 19 | 20 | ./ragtest 21 | 22 | .yarn/* 23 | !.yarn/patches 24 | !.yarn/releases 25 | !.yarn/plugins 26 | !.yarn/sdks 27 | !.yarn/versions 28 | .pnp.* 29 | 30 | # Python Artifacts 31 | python/*/lib/ 32 | # Test Output 33 | .coverage 34 | coverage/ 35 | licenses.txt 36 | examples_notebooks/*/lancedb 37 | examples_notebooks/*/data 38 | tests/fixtures/cache 39 | tests/fixtures/*/cache 40 | tests/fixtures/*/output 41 | lancedb/ 42 | 43 | # Random 44 | .DS_Store 45 | *.log* 46 | .venv 47 | .conda 48 | .tmp 49 | 50 | 51 | .env 52 | build.zip 53 | 54 | .turbo 55 | 56 | __pycache__ 57 | 58 | .pipeline 59 | 60 | # Azurite 61 | temp_azurite/ 62 | __azurite*.json 63 | __blobstorage*.json 64 | __blobstorage__/ 65 | 66 | # Getting started example 67 | ragtest/ 68 | .ragtest/ 69 | .pipelines 70 | .pipeline -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph package root.""" 5 | 6 | from .clustering import cluster_graph 7 | from .compute_edge_combined_degree import compute_edge_combined_degree 8 | from .create import DEFAULT_EDGE_ATTRIBUTES, DEFAULT_NODE_ATTRIBUTES, create_graph 9 | from .embed import embed_graph 10 | from .layout import layout_graph 11 | from .merge import merge_graphs 12 | from .report import ( 13 | create_community_reports, 14 | prepare_community_reports, 15 | prepare_community_reports_claims, 16 | prepare_community_reports_edges, 17 | restore_community_hierarchy, 18 | ) 19 | from .unpack import unpack_graph 20 | 21 | __all__ = [ 22 | "DEFAULT_EDGE_ATTRIBUTES", 23 | "DEFAULT_NODE_ATTRIBUTES", 24 | "cluster_graph", 25 | "compute_edge_combined_degree", 26 | "create_community_reports", 27 | "create_graph", 28 | "embed_graph", 29 | "layout_graph", 30 | "merge_graphs", 31 | "prepare_community_reports", 32 | "prepare_community_reports_claims", 33 | "prepare_community_reports_edges", 34 | "restore_community_hierarchy", 35 | "unpack_graph", 36 | ] 37 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/embed/strategies/mock.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing run and _embed_text methods definitions.""" 5 | 6 | import random 7 | from collections.abc import Iterable 8 | from typing import Any 9 | 10 | from datashaper import ProgressTicker, VerbCallbacks, progress_ticker 11 | 12 | from graphrag.index.cache import PipelineCache 13 | 14 | from .typing import TextEmbeddingResult 15 | 16 | 17 | async def run( # noqa RUF029 async is required for interface 18 | input: list[str], 19 | callbacks: VerbCallbacks, 20 | cache: PipelineCache, 21 | _args: dict[str, Any], 22 | ) -> TextEmbeddingResult: 23 | """Run the Claim extraction chain.""" 24 | input = input if isinstance(input, Iterable) else [input] 25 | ticker = progress_ticker(callbacks.progress, len(input)) 26 | return TextEmbeddingResult( 27 | embeddings=[_embed_text(cache, text, ticker) for text in input] 28 | ) 29 | 30 | 31 | def _embed_text(_cache: PipelineCache, _text: str, tick: ProgressTicker) -> list[float]: 32 | """Embed a single piece of text.""" 33 | tick(1) 34 | return [random.random(), random.random(), random.random()] # noqa S311 35 | -------------------------------------------------------------------------------- /docsite/posts/config/overview.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Configuring GraphRAG Indexing 3 | navtitle: Configuration 4 | tags: [post] 5 | layout: page 6 | date: 2023-01-03 7 | --- 8 | 9 | The GraphRAG system is highly configurable. This page provides an overview of the configuration options available for the GraphRAG indexing engine. 10 | 11 | ## Default Configuration Mode 12 | 13 | The default configuration mode is the simplest way to get started with the GraphRAG system. It is designed to work out-of-the-box with minimal configuration. The primary configuration sections for the Indexing Engine pipelines are described below. 
The main ways to set up GraphRAG in Default Configuration mode are via: 14 | 15 | - [Init command](/posts/config/init) (recommended) 16 | - [Purely using environment variables](/posts/config/env_vars) 17 | - [Using JSON or YAML for deeper control](/posts/config/json_yaml) 18 | 19 | ## Custom Configuration Mode 20 | 21 | Custom configuration mode is an advanced use-case. Most users will want to use the Default Configuration instead. The primary configuration sections for Indexing Engine pipelines are described below. Details about how to use custom configuration are available in the [Custom Configuration Mode](/posts/config/custom) documentation. 22 | -------------------------------------------------------------------------------- /graphrag/index/config/workflow.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing 'PipelineWorkflowReference' model.""" 5 | 6 | from __future__ import annotations 7 | 8 | from typing import Any 9 | 10 | from pydantic import BaseModel 11 | from pydantic import Field as pydantic_Field 12 | 13 | PipelineWorkflowStep = dict[str, Any] 14 | """Represent a step in a workflow.""" 15 | 16 | PipelineWorkflowConfig = dict[str, Any] 17 | """Represent a configuration for a workflow.""" 18 | 19 | 20 | class PipelineWorkflowReference(BaseModel): 21 | """Represent a reference to a workflow, and can optionally be the workflow itself.""" 22 | 23 | name: str | None = pydantic_Field(description="Name of the workflow.", default=None) 24 | """Name of the workflow.""" 25 | 26 | steps: list[PipelineWorkflowStep] | None = pydantic_Field( 27 | description="The optional steps for the workflow.", default=None 28 | ) 29 | """The optional steps for the workflow.""" 30 | 31 | config: PipelineWorkflowConfig | None = pydantic_Field( 32 | description="The optional configuration for the workflow.", default=None 33 | ) 34 | 
"""The optional configuration for the workflow.""" 35 | -------------------------------------------------------------------------------- /graphrag/llm/openai/openai_embeddings_llm.py: -------------------------------------------------------------------------------- 1 | #openai_embeddings_llm.py 2 | 3 | from typing_extensions import Unpack 4 | from graphrag.llm.base import BaseLLM 5 | from graphrag.llm.types import ( 6 | EmbeddingInput, 7 | EmbeddingOutput, 8 | LLMInput, 9 | ) 10 | from .openai_configuration import OpenAIConfiguration 11 | from .types import OpenAIClientTypes 12 | import ollama 13 | 14 | class OpenAIEmbeddingsLLM(BaseLLM[EmbeddingInput, EmbeddingOutput]): 15 | _client: OpenAIClientTypes 16 | _configuration: OpenAIConfiguration 17 | 18 | def __init__(self, client: OpenAIClientTypes, configuration: OpenAIConfiguration): 19 | self._client = client 20 | self._configuration = configuration 21 | 22 | async def _execute_llm( 23 | self, input: EmbeddingInput, **kwargs: Unpack[LLMInput] 24 | ) -> EmbeddingOutput | None: 25 | args = { 26 | "model": self._configuration.model, 27 | **(kwargs.get("model_parameters") or {}), 28 | } 29 | embedding_list = [] 30 | for inp in input: 31 | embedding = ollama.embeddings(model=self._configuration.model, prompt=inp) 32 | embedding_list.append(embedding["embedding"]) 33 | return embedding_list 34 | -------------------------------------------------------------------------------- /examples/interdependent_workflows/pipeline.yml: -------------------------------------------------------------------------------- 1 | workflows: 2 | - name: aggregate_workflow 3 | steps: 4 | - verb: "aggregate" # https://github.com/microsoft/datashaper/blob/main/python/datashaper/datashaper/engine/verbs/aggregate.py 5 | args: 6 | groupby: "type" 7 | column: "col_multiplied" 8 | to: "aggregated_output" 9 | operation: "sum" 10 | input: 11 | source: "workflow:derive_workflow" # reference the derive_workflow, cause this one requires that one to run first 
12 | # Notice, these are out of order, the indexing engine will figure out the right order to run them in 13 | 14 | - name: derive_workflow 15 | steps: 16 | - verb: "derive" # https://github.com/microsoft/datashaper/blob/main/python/datashaper/datashaper/engine/verbs/derive.py 17 | args: 18 | column1: "col1" # from above 19 | column2: "col2" # from above 20 | to: "col_multiplied" # new column name 21 | operator: "*" # multiply the two columns, 22 | # Since we're trying to act on the dataset, we don't need to explicitly specify an input 23 | # "input": { "source": "source" } # use the dataset as the input to this verb. This is the default, so you can omit it. -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/entity_summarization_prompt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Entity summarization prompt generation module.""" 5 | 6 | from pathlib import Path 7 | 8 | from graphrag.prompt_tune.template import ENTITY_SUMMARIZATION_PROMPT 9 | 10 | ENTITY_SUMMARIZATION_FILENAME = "summarize_descriptions.txt" 11 | 12 | 13 | def create_entity_summarization_prompt( 14 | persona: str, 15 | output_path: Path | None = None, 16 | ) -> str: 17 | """Create a prompt for entity summarization. If output_path is provided, write the prompt to a file. 18 | 19 | Parameters 20 | ---------- 21 | - persona (str): The persona to use for the entity summarization prompt 22 | - output_path (Path | None): The path to write the prompt to. If None, the prompt is not written to a file. Default is None.
23 | """ 24 | prompt = ENTITY_SUMMARIZATION_PROMPT.format(persona=persona) 25 | 26 | if output_path: 27 | output_path.mkdir(parents=True, exist_ok=True) 28 | 29 | output_path = output_path / ENTITY_SUMMARIZATION_FILENAME 30 | # Write file to output path 31 | with output_path.open("w") as file: 32 | file.write(prompt) 33 | 34 | return prompt 35 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/community_reporter_role.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Generate a community reporter role for community summarization.""" 5 | 6 | from graphrag.llm.types.llm_types import CompletionLLM 7 | from graphrag.prompt_tune.prompt import ( 8 | GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT, 9 | ) 10 | 11 | 12 | async def generate_community_reporter_role( 13 | llm: CompletionLLM, domain: str, persona: str, docs: str | list[str] 14 | ) -> str: 15 | """Generate an LLM persona to use for GraphRAG prompts. 16 | 17 | Parameters 18 | ---------- 19 | - llm (CompletionLLM): The LLM to use for generation 20 | - domain (str): The domain to generate a persona for 21 | - persona (str): The persona to generate a role for 22 | - docs (str | list[str]): The domain to generate a persona for 23 | 24 | Returns 25 | ------- 26 | - str: The generated domain prompt response. 
27 | """ 28 | docs_str = " ".join(docs) if isinstance(docs, list) else docs 29 | domain_prompt = GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT.format( 30 | domain=domain, persona=persona, input_text=docs_str 31 | ) 32 | 33 | response = await llm(domain_prompt) 34 | 35 | return str(response.output) 36 | -------------------------------------------------------------------------------- /graphrag/config/input_models/llm_parameters_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """LLM Parameters model.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | from graphrag.config.enums import LLMType 9 | 10 | 11 | class LLMParametersInput(TypedDict): 12 | """LLM Parameters model.""" 13 | 14 | api_key: NotRequired[str | None] 15 | type: NotRequired[LLMType | str | None] 16 | model: NotRequired[str | None] 17 | max_tokens: NotRequired[int | str | None] 18 | request_timeout: NotRequired[float | str | None] 19 | api_base: NotRequired[str | None] 20 | api_version: NotRequired[str | None] 21 | organization: NotRequired[str | None] 22 | proxy: NotRequired[str | None] 23 | cognitive_services_endpoint: NotRequired[str | None] 24 | deployment_name: NotRequired[str | None] 25 | model_supports_json: NotRequired[bool | str | None] 26 | tokens_per_minute: NotRequired[int | str | None] 27 | requests_per_minute: NotRequired[int | str | None] 28 | max_retries: NotRequired[int | str | None] 29 | max_retry_wait: NotRequired[float | str | None] 30 | sleep_on_rate_limit_recommendation: NotRequired[bool | str | None] 31 | concurrent_requests: NotRequired[int | str | None] 32 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/embed/strategies/node_2_vec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing run method definition.""" 5 | 6 | from typing import Any 7 | 8 | import networkx as nx 9 | 10 | from graphrag.index.graph.embedding import embed_nod2vec 11 | from graphrag.index.graph.utils import stable_largest_connected_component 12 | from graphrag.index.verbs.graph.embed.typing import NodeEmbeddings 13 | 14 | 15 | def run(graph: nx.Graph, args: dict[str, Any]) -> NodeEmbeddings: 16 | """Run method definition.""" 17 | if args.get("use_lcc", True): 18 | graph = stable_largest_connected_component(graph) 19 | 20 | # create graph embedding using node2vec 21 | embeddings = embed_nod2vec( 22 | graph=graph, 23 | dimensions=args.get("dimensions", 1536), 24 | num_walks=args.get("num_walks", 10), 25 | walk_length=args.get("walk_length", 40), 26 | window_size=args.get("window_size", 2), 27 | iterations=args.get("iterations", 3), 28 | random_seed=args.get("random_seed", 86), 29 | ) 30 | 31 | pairs = zip(embeddings.nodes, embeddings.embeddings.tolist(), strict=True) 32 | sorted_pairs = sorted(pairs, key=lambda x: x[0]) 33 | 34 | return dict(sorted_pairs) 35 | -------------------------------------------------------------------------------- /graphrag/index/context.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | # isort: skip_file 5 | """A module containing the 'PipelineRunStats' and 'PipelineRunContext' models.""" 6 | 7 | from dataclasses import dataclass as dc_dataclass 8 | from dataclasses import field 9 | 10 | from .cache import PipelineCache 11 | from .storage.typing import PipelineStorage 12 | 13 | 14 | @dc_dataclass 15 | class PipelineRunStats: 16 | """Pipeline running stats.""" 17 | 18 | total_runtime: float = field(default=0) 19 | """Float representing the total runtime.""" 20 | 21 | num_documents: int = field(default=0) 22 | """Number of documents.""" 23 | 24 | input_load_time: float = field(default=0) 25 | """Float representing the input load time.""" 26 | 27 | workflows: dict[str, dict[str, float]] = field(default_factory=dict) 28 | """A dictionary of workflows.""" 29 | 30 | 31 | @dc_dataclass 32 | class PipelineRunContext: 33 | """Provides the context for the current pipeline run.""" 34 | 35 | stats: PipelineRunStats 36 | storage: PipelineStorage 37 | cache: PipelineCache 38 | 39 | 40 | # TODO: For now, just has the same props available to it 41 | VerbRunContext = PipelineRunContext 42 | """Provides the context for the current verb run.""" 43 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/report/strategies/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'Finding' and 'CommunityReport' models.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from typing import Any 8 | 9 | from datashaper import VerbCallbacks 10 | from typing_extensions import TypedDict 11 | 12 | from graphrag.index.cache import PipelineCache 13 | 14 | ExtractedEntity = dict[str, Any] 15 | StrategyConfig = dict[str, Any] 16 | RowContext = dict[str, Any] 17 | EntityTypes = list[str] 18 | Claim = dict[str, Any] 19 | 20 | 21 | class Finding(TypedDict): 22 | """Finding class definition.""" 23 | 24 | summary: str 25 | explanation: str 26 | 27 | 28 | class CommunityReport(TypedDict): 29 | """Community report class definition.""" 30 | 31 | community: str | int 32 | title: str 33 | summary: str 34 | full_content: str 35 | full_content_json: str 36 | rank: float 37 | level: int 38 | rank_explanation: str 39 | findings: list[Finding] 40 | 41 | 42 | CommunityReportsStrategy = Callable[ 43 | [ 44 | str | int, 45 | str, 46 | int, 47 | VerbCallbacks, 48 | PipelineCache, 49 | StrategyConfig, 50 | ], 51 | Awaitable[CommunityReport | None], 52 | ] 53 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | ## Description 16 | 17 | [Provide a brief description of the changes made in this pull request.] 18 | 19 | ## Related Issues 20 | 21 | [Reference any related issues or tasks that this pull request addresses.] 22 | 23 | ## Proposed Changes 24 | 25 | [List the specific changes made in this pull request.] 26 | 27 | ## Checklist 28 | 29 | - [ ] I have tested these changes locally. 30 | - [ ] I have reviewed the code changes. 31 | - [ ] I have updated the documentation (if necessary). 32 | - [ ] I have added appropriate unit tests (if applicable). 
33 | 34 | ## Additional Notes 35 | 36 | [Add any additional notes or context that may be helpful for the reviewer(s).] 37 | -------------------------------------------------------------------------------- /graphrag/config/models/chunking_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | 10 | 11 | class ChunkingConfig(BaseModel): 12 | """Configuration section for chunking.""" 13 | 14 | size: int = Field(description="The chunk size to use.", default=defs.CHUNK_SIZE) 15 | overlap: int = Field( 16 | description="The chunk overlap to use.", default=defs.CHUNK_OVERLAP 17 | ) 18 | group_by_columns: list[str] = Field( 19 | description="The chunk by columns to use.", 20 | default=defs.CHUNK_GROUP_BY_COLUMNS, 21 | ) 22 | strategy: dict | None = Field( 23 | description="The chunk strategy to use, overriding the default tokenization strategy", 24 | default=None, 25 | ) 26 | 27 | def resolved_strategy(self) -> dict: 28 | """Get the resolved chunking strategy.""" 29 | from graphrag.index.verbs.text.chunk import ChunkStrategyType 30 | 31 | return self.strategy or { 32 | "type": ChunkStrategyType.tokens, 33 | "chunk_size": self.size, 34 | "chunk_overlap": self.overlap, 35 | "group_by_columns": self.group_by_columns, 36 | } 37 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/merge/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'BasicMergeOperation', 'StringOperation', 'NumericOperation' and 'DetailedAttributeMergeOperation' models.""" 5 | 6 | from dataclasses import dataclass 7 | from enum import Enum 8 | 9 | 10 | class BasicMergeOperation(str, Enum): 11 | """Basic Merge Operation class definition.""" 12 | 13 | Replace = "replace" 14 | Skip = "skip" 15 | 16 | 17 | class StringOperation(str, Enum): 18 | """String Operation class definition.""" 19 | 20 | Concat = "concat" 21 | Replace = "replace" 22 | Skip = "skip" 23 | 24 | 25 | class NumericOperation(str, Enum): 26 | """Numeric Operation class definition.""" 27 | 28 | Sum = "sum" 29 | Average = "average" 30 | Max = "max" 31 | Min = "min" 32 | Multiply = "multiply" 33 | Replace = "replace" 34 | Skip = "skip" 35 | 36 | 37 | @dataclass 38 | class DetailedAttributeMergeOperation: 39 | """Detailed attribute merge operation class definition.""" 40 | 41 | operation: str # StringOperation | NumericOperation 42 | 43 | # concat 44 | separator: str | None = None 45 | delimiter: str | None = None 46 | distinct: bool = False 47 | 48 | 49 | AttributeMergeOperation = str | DetailedAttributeMergeOperation 50 | -------------------------------------------------------------------------------- /graphrag/index/verbs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing get_default_verbs method definition.""" 5 | 6 | from .covariates import extract_covariates 7 | from .entities import entity_extract, summarize_descriptions 8 | from .genid import genid 9 | from .graph import ( 10 | cluster_graph, 11 | create_community_reports, 12 | create_graph, 13 | embed_graph, 14 | layout_graph, 15 | merge_graphs, 16 | unpack_graph, 17 | ) 18 | from .overrides import aggregate, concat, merge 19 | from .snapshot import snapshot 20 | from .snapshot_rows import snapshot_rows 21 | from .spread_json import spread_json 22 | from .text import chunk, text_embed, text_split, text_translate 23 | from .unzip import unzip 24 | from .zip import zip_verb 25 | 26 | __all__ = [ 27 | "aggregate", 28 | "chunk", 29 | "cluster_graph", 30 | "concat", 31 | "create_community_reports", 32 | "create_graph", 33 | "embed_graph", 34 | "entity_extract", 35 | "extract_covariates", 36 | "genid", 37 | "layout_graph", 38 | "merge", 39 | "merge_graphs", 40 | "snapshot", 41 | "snapshot_rows", 42 | "spread_json", 43 | "summarize_descriptions", 44 | "text_embed", 45 | "text_split", 46 | "text_translate", 47 | "unpack_graph", 48 | "unzip", 49 | "zip_verb", 50 | ] 51 | -------------------------------------------------------------------------------- /graphrag/llm/openai/openai_completion_llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A text-completion based LLM.""" 5 | 6 | import logging 7 | 8 | from typing_extensions import Unpack 9 | 10 | from graphrag.llm.base import BaseLLM 11 | from graphrag.llm.types import ( 12 | CompletionInput, 13 | CompletionOutput, 14 | LLMInput, 15 | ) 16 | 17 | from .openai_configuration import OpenAIConfiguration 18 | from .types import OpenAIClientTypes 19 | from .utils import get_completion_llm_args 20 | 21 | log = logging.getLogger(__name__) 22 | 23 | 24 | class OpenAICompletionLLM(BaseLLM[CompletionInput, CompletionOutput]): 25 | """A text-completion based LLM.""" 26 | 27 | _client: OpenAIClientTypes 28 | _configuration: OpenAIConfiguration 29 | 30 | def __init__(self, client: OpenAIClientTypes, configuration: OpenAIConfiguration): 31 | self.client = client 32 | self.configuration = configuration 33 | 34 | async def _execute_llm( 35 | self, 36 | input: CompletionInput, 37 | **kwargs: Unpack[LLMInput], 38 | ) -> CompletionOutput | None: 39 | args = get_completion_llm_args( 40 | kwargs.get("model_parameters"), self.configuration 41 | ) 42 | completion = self.client.completions.create(prompt=input, **args) 43 | return completion.choices[0].text 44 | -------------------------------------------------------------------------------- /tests/notebook/test_notebooks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | import subprocess 4 | import tempfile 5 | from pathlib import Path 6 | 7 | import nbformat 8 | import pytest 9 | 10 | DOCS_PATH = Path("../../docsite") 11 | 12 | notebooks_list = list(DOCS_PATH.rglob("*.ipynb")) 13 | 14 | 15 | def _notebook_run(filepath: Path): 16 | """Execute a notebook via nbconvert and collect output. 
17 | :returns execution errors 18 | """ 19 | with tempfile.NamedTemporaryFile(suffix=".ipynb") as temp_file: 20 | args = [ 21 | "jupyter", 22 | "nbconvert", 23 | "--to", 24 | "notebook", 25 | "--execute", 26 | "-y", 27 | "--no-prompt", 28 | "--output", 29 | temp_file.name, 30 | filepath.absolute().as_posix(), 31 | ] 32 | subprocess.check_call(args) 33 | 34 | temp_file.seek(0) 35 | nb = nbformat.read(temp_file, nbformat.current_nbformat) 36 | 37 | return [ 38 | output 39 | for cell in nb.cells 40 | if "outputs" in cell 41 | for output in cell["outputs"] 42 | if output.output_type == "error" 43 | ] 44 | 45 | 46 | @pytest.mark.parametrize("notebook_path", notebooks_list) 47 | def test_notebook(notebook_path: Path): 48 | assert _notebook_run(notebook_path) == [] 49 | -------------------------------------------------------------------------------- /graphrag/index/utils/rate_limiter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Rate limiter utility.""" 5 | 6 | import asyncio 7 | import time 8 | 9 | 10 | class RateLimiter: 11 | """ 12 | The original TpmRpmLLMLimiter strategy did not account for minute-based rate limiting when scheduled. 13 | 14 | The RateLimiter was introduced to ensure that the CommunityReportsExtractor could be scheduled to adhere to rate configurations on a per-minute basis. 
15 | """ 16 | 17 | # TODO: RateLimiter scheduled: using asyncio for async_mode 18 | 19 | def __init__(self, rate: int, per: int): 20 | self.rate = rate 21 | self.per = per 22 | self.allowance = rate 23 | self.last_check = time.monotonic() 24 | 25 | async def acquire(self): 26 | """Acquire a token from the rate limiter.""" 27 | current = time.monotonic() 28 | elapsed = current - self.last_check 29 | self.last_check = current 30 | self.allowance += elapsed * (self.rate / self.per) 31 | 32 | if self.allowance > self.rate: 33 | self.allowance = self.rate 34 | 35 | if self.allowance < 1.0: 36 | sleep_time = (1.0 - self.allowance) * (self.per / self.rate) 37 | await asyncio.sleep(sleep_time) 38 | self.allowance = 0.0 39 | else: 40 | self.allowance -= 1.0 41 | -------------------------------------------------------------------------------- /examples/custom_input/run.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | import asyncio 4 | import os 5 | 6 | import pandas as pd 7 | 8 | from graphrag.index import run_pipeline_with_config 9 | 10 | pipeline_file = os.path.join( 11 | os.path.dirname(os.path.abspath(__file__)), "./pipeline.yml" 12 | ) 13 | 14 | 15 | async def run(): 16 | # Load your dataset 17 | dataset = _load_dataset_some_unique_way() 18 | 19 | # Load your config without the input section 20 | config = pipeline_file 21 | 22 | # Grab the last result from the pipeline, should be our entity extraction 23 | outputs = [] 24 | async for output in run_pipeline_with_config( 25 | config_or_path=config, dataset=dataset 26 | ): 27 | outputs.append(output) 28 | pipeline_result = outputs[-1] 29 | 30 | if pipeline_result.result is not None: 31 | # Should look something like 32 | # col1 col2 filled_column 33 | # 0 2 4 Filled Value 34 | # 1 5 10 Filled Value 35 | print(pipeline_result.result) 36 | else: 37 | print("No results!") 38 | 39 | 40 | def _load_dataset_some_unique_way() -> pd.DataFrame: 41 | # Totally loaded from some other place 42 | return pd.DataFrame([{"col1": 2, "col2": 4}, {"col1": 5, "col2": 10}]) 43 | 44 | 45 | if __name__ == "__main__": 46 | asyncio.run(run()) 47 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Python Publish 2 | on: 3 | release: 4 | types: [created] 5 | push: 6 | branches: [main] 7 | 8 | env: 9 | POETRY_VERSION: "1.8.3" 10 | PYTHON_VERSION: "3.10" 11 | 12 | jobs: 13 | publish: 14 | name: Upload release to PyPI 15 | if: github.ref == 'refs/heads/main' 16 | runs-on: ubuntu-latest 17 | environment: 18 | name: pypi 19 | url: https://pypi.org/p/graphrag 20 | permissions: 21 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | with: 26 | fetch-depth: 0 27 | fetch-tags: 
true 28 | 29 | - name: Set up Python 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: ${{ env.PYTHON_VERSION }} 33 | 34 | - name: Install Poetry 35 | uses: abatilo/actions-poetry@v3.0.0 36 | with: 37 | poetry-version: ${{ env.POETRY_VERSION }} 38 | 39 | - name: Install dependencies 40 | shell: bash 41 | run: poetry install 42 | 43 | - name: Build Distributable 44 | shell: bash 45 | run: poetry build 46 | 47 | - name: Publish package distributions to PyPI 48 | uses: pypa/gh-action-pypi-publish@release/v1 49 | with: 50 | packages-dir: dist 51 | skip-existing: true 52 | verbose: true 53 | -------------------------------------------------------------------------------- /graphrag/index/workflows/v1/create_final_documents.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing build_steps method definition.""" 5 | 6 | from graphrag.index.config import PipelineWorkflowConfig, PipelineWorkflowStep 7 | 8 | workflow_name = "create_final_documents" 9 | 10 | 11 | def build_steps( 12 | config: PipelineWorkflowConfig, 13 | ) -> list[PipelineWorkflowStep]: 14 | """ 15 | Create the final documents table. 
16 | 17 | ## Dependencies 18 | * `workflow:create_base_documents` 19 | * `workflow:create_base_document_nodes` 20 | """ 21 | base_text_embed = config.get("text_embed", {}) 22 | document_raw_content_embed_config = config.get( 23 | "document_raw_content_embed", base_text_embed 24 | ) 25 | skip_raw_content_embedding = config.get("skip_raw_content_embedding", False) 26 | return [ 27 | { 28 | "verb": "rename", 29 | "args": {"columns": {"text_units": "text_unit_ids"}}, 30 | "input": {"source": "workflow:create_base_documents"}, 31 | }, 32 | { 33 | "verb": "text_embed", 34 | "enabled": not skip_raw_content_embedding, 35 | "args": { 36 | "column": "raw_content", 37 | "to": "raw_content_embedding", 38 | **document_raw_content_embed_config, 39 | }, 40 | }, 41 | ] 42 | -------------------------------------------------------------------------------- /graphrag/index/workflows/v1/join_text_units_to_covariate_ids.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing build_steps method definition.""" 5 | 6 | from graphrag.index.config import PipelineWorkflowConfig, PipelineWorkflowStep 7 | 8 | workflow_name = "join_text_units_to_covariate_ids" 9 | 10 | 11 | def build_steps( 12 | _config: PipelineWorkflowConfig, 13 | ) -> list[PipelineWorkflowStep]: 14 | """ 15 | Create the final text-units table. 
16 | 17 | ## Dependencies 18 | * `workflow:create_final_covariates` 19 | """ 20 | return [ 21 | { 22 | "verb": "select", 23 | "args": {"columns": ["id", "text_unit_id"]}, 24 | "input": {"source": "workflow:create_final_covariates"}, 25 | }, 26 | { 27 | "verb": "aggregate_override", 28 | "args": { 29 | "groupby": ["text_unit_id"], 30 | "aggregations": [ 31 | { 32 | "column": "id", 33 | "operation": "array_agg_distinct", 34 | "to": "covariate_ids", 35 | }, 36 | { 37 | "column": "text_unit_id", 38 | "operation": "any", 39 | "to": "id", 40 | }, 41 | ], 42 | }, 43 | }, 44 | ] 45 | -------------------------------------------------------------------------------- /graphrag/query/progress.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Status Reporter for orchestration.""" 5 | 6 | from abc import ABCMeta, abstractmethod 7 | from typing import Any 8 | 9 | 10 | class StatusReporter(metaclass=ABCMeta): 11 | """Provides a way to report status updates from the pipeline.""" 12 | 13 | @abstractmethod 14 | def error(self, message: str, details: dict[str, Any] | None = None): 15 | """Report an error.""" 16 | 17 | @abstractmethod 18 | def warning(self, message: str, details: dict[str, Any] | None = None): 19 | """Report a warning.""" 20 | 21 | @abstractmethod 22 | def log(self, message: str, details: dict[str, Any] | None = None): 23 | """Report a log.""" 24 | 25 | 26 | class ConsoleStatusReporter(StatusReporter): 27 | """A reporter that writes to a console.""" 28 | 29 | def error(self, message: str, details: dict[str, Any] | None = None): 30 | """Report an error.""" 31 | print(message, details) # noqa T201 32 | 33 | def warning(self, message: str, details: dict[str, Any] | None = None): 34 | """Report a warning.""" 35 | _print_warning(message) 36 | 37 | def log(self, message: str, details: dict[str, Any] | None = None): 38 | """Report a log.""" 39 | 
print(message, details) # noqa T201 40 | 41 | 42 | def _print_warning(skk): 43 | print(f"\033[93m {skk}\033[00m") # noqa T201 44 | -------------------------------------------------------------------------------- /graphrag/index/storage/load_storage.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing load_storage method definition.""" 5 | 6 | from __future__ import annotations 7 | 8 | from typing import cast 9 | 10 | from graphrag.config import StorageType 11 | from graphrag.index.config.storage import ( 12 | PipelineBlobStorageConfig, 13 | PipelineFileStorageConfig, 14 | PipelineStorageConfig, 15 | ) 16 | 17 | from .blob_pipeline_storage import create_blob_storage 18 | from .file_pipeline_storage import create_file_storage 19 | from .memory_pipeline_storage import create_memory_storage 20 | 21 | 22 | def load_storage(config: PipelineStorageConfig): 23 | """Load the storage for a pipeline.""" 24 | match config.type: 25 | case StorageType.memory: 26 | return create_memory_storage() 27 | case StorageType.blob: 28 | config = cast(PipelineBlobStorageConfig, config) 29 | return create_blob_storage( 30 | config.connection_string, 31 | config.storage_account_blob_url, 32 | config.container_name, 33 | config.base_dir, 34 | ) 35 | case StorageType.file: 36 | config = cast(PipelineFileStorageConfig, config) 37 | return create_file_storage(config.base_dir) 38 | case _: 39 | msg = f"Unknown storage type: {config.type}" 40 | raise ValueError(msg) 41 | -------------------------------------------------------------------------------- /examples/custom_set_of_available_workflows/custom_workflow_definitions.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
def _derive_step(to: str) -> dict:
    """Build a single 'derive' verb step multiplying col1 by col2 into *to*."""
    return {
        "verb": "derive",
        "args": {
            "column1": "col1",  # looks for col1 in the dataset
            "column2": "col2",  # looks for col2 in the dataset
            "to": to,  # name of the derived output column
            "operator": "*",
        },
    }


# Pool of custom workflow definitions usable by any number of pipelines.
# Each entry maps a workflow name to a builder that, given the workflow
# config, returns the list of verb steps to execute.
custom_workflows: WorkflowDefinitions = {
    # Output column name is user-configurable via "derive_output_column";
    # falls back to "output_column" when not provided.
    "my_workflow": lambda config: [
        _derive_step(config.get("derive_output_column", "output_column"))
    ],
    "my_unused_workflow": lambda _config: [_derive_step("unused_output_column")],
}
class LocalSearchConfig(BaseModel):
    """The default configuration section for local search.

    NOTE(review): the original docstring said "Cache" — apparently a
    copy-paste from another config model; this class holds local-search
    parameters.
    """

    # Proportion of the budget devoted to text unit data.
    text_unit_prop: float = Field(
        description="The text unit proportion.",
        default=defs.LOCAL_SEARCH_TEXT_UNIT_PROP,
    )
    # Proportion of the budget devoted to community data.
    community_prop: float = Field(
        description="The community proportion.",
        default=defs.LOCAL_SEARCH_COMMUNITY_PROP,
    )
    # Maximum number of prior conversation turns to carry along.
    conversation_history_max_turns: int = Field(
        description="The conversation history maximum turns.",
        default=defs.LOCAL_SEARCH_CONVERSATION_HISTORY_MAX_TURNS,
    )
    # How many mapped entities to keep (top-k).
    top_k_entities: int = Field(
        description="The top k mapped entities.",
        default=defs.LOCAL_SEARCH_TOP_K_MAPPED_ENTITIES,
    )
    # How many mapped relationships to keep (top-k).
    top_k_relationships: int = Field(
        description="The top k mapped relations.",
        default=defs.LOCAL_SEARCH_TOP_K_RELATIONSHIPS,
    )
    # Overall token budget.
    max_tokens: int = Field(
        description="The maximum tokens.", default=defs.LOCAL_SEARCH_MAX_TOKENS
    )
    # Token budget for the LLM itself.
    llm_max_tokens: int = Field(
        description="The LLM maximum tokens.", default=defs.LOCAL_SEARCH_LLM_MAX_TOKENS
    )
"""Text Utilities for LLM."""

from collections.abc import Iterable, Iterator
from itertools import islice

import tiktoken


def num_tokens(text: str, token_encoder: tiktoken.Encoding | None = None) -> int:
    """Return the number of tokens in the given text.

    Falls back to the ``cl100k_base`` encoding when no encoder is given.
    """
    if token_encoder is None:
        token_encoder = tiktoken.get_encoding("cl100k_base")
    return len(token_encoder.encode(text))  # type: ignore


def batched(iterable: Iterable, n: int) -> Iterator[tuple]:
    """
    Batch data into tuples of length n. The last batch may be shorter.

    Taken from Python's cookbook: https://docs.python.org/3/library/itertools.html#itertools.batched

    Fix: the parameter was annotated as ``Iterator`` even though any
    iterable is accepted (``iter()`` is applied below); annotated as
    ``Iterable``, with the generator return type made explicit.

    Raises
    ------
    ValueError
        If ``n`` is less than one.
    """
    # batched('ABCDEFG', 3) --> ABC DEF G
    if n < 1:
        value_error = "n must be at least one"
        raise ValueError(value_error)
    it = iter(iterable)
    # islice returns an empty tuple at exhaustion, ending the walrus loop.
    while batch := tuple(islice(it, n)):
        yield batch


def chunk_text(
    text: str, max_tokens: int, token_encoder: tiktoken.Encoding | None = None
):
    """Chunk text by token length.

    NOTE: yields tuples of token ids (not decoded strings); callers must
    decode each chunk if they need text back.
    """
    if token_encoder is None:
        token_encoder = tiktoken.get_encoding("cl100k_base")
    tokens = token_encoder.encode(text)  # type: ignore
    yield from batched(iter(tokens), max_tokens)
@dataclass
class Covariate:
    """Covariate class definition.

    A single extracted covariate record. Every field is optional so
    strategies can populate only what they extract. Field meanings below
    are inferred from the names (subject/object, status, date range
    suggest claim-like facts) — NOTE(review): confirm against the
    covariate extraction strategies.
    """

    covariate_type: str | None = None  # kind of covariate record
    subject_id: str | None = None  # id of the subject entity
    subject_type: str | None = None  # type label of the subject entity
    object_id: str | None = None  # id of the object entity
    object_type: str | None = None  # type label of the object entity
    type: str | None = None  # covariate-specific type; shadows builtin `type`, kept for schema compatibility
    status: str | None = None  # status value (semantics defined by strategy)
    start_date: str | None = None  # start of validity period (string-formatted)
    end_date: str | None = None  # end of validity period (string-formatted)
    description: str | None = None  # free-text description
    source_text: list[str] | None = None  # source snippets supporting the covariate
    doc_id: str | None = None  # originating document id
    record_id: int | None = None  # record index (scope not visible here)
    id: str | None = None  # unique identifier for this covariate


@dataclass
class CovariateExtractionResult:
    """Covariate extraction result class definition."""

    # All covariates produced by one extraction invocation.
    covariate_data: list[Covariate]


# Async strategy signature for covariate extraction. Positional argument
# meanings are not visible from this file — NOTE(review): confirm against
# the strategy implementations; the final dict is presumably the strategy
# config, preceded by verb callbacks and the pipeline cache.
CovariateExtractStrategy = Callable[
    [
        Iterable[str],
        list[str],
        dict[str, str],
        VerbCallbacks,
        PipelineCache,
        dict[str, Any],
    ],
    Awaitable[CovariateExtractionResult],
]
class GlobalSearchConfig(BaseModel):
    """The default configuration section for global search.

    NOTE(review): the original docstring said "Cache" — apparently a
    copy-paste from another config model; this class holds global-search
    parameters.
    """

    # Sampling temperature for token generation; None defers to defaults.
    temperature: float | None = Field(
        description="The temperature to use for token generation.",
        default=defs.LLM_TEMPERATURE,
    )
    # Nucleus-sampling top-p for token generation; None defers to defaults.
    top_p: float | None = Field(
        description="The top-p value to use for token generation.",
        default=defs.LLM_TOP_P,
    )
    # Overall context-size budget in tokens.
    max_tokens: int = Field(
        description="The maximum context size in tokens.",
        default=defs.GLOBAL_SEARCH_MAX_TOKENS,
    )
    # Token budget for the data stage.
    data_max_tokens: int = Field(
        description="The data llm maximum tokens.",
        default=defs.GLOBAL_SEARCH_DATA_MAX_TOKENS,
    )
    # Token budget for the map stage.
    map_max_tokens: int = Field(
        description="The map llm maximum tokens.",
        default=defs.GLOBAL_SEARCH_MAP_MAX_TOKENS,
    )
    # Token budget for the reduce stage.
    reduce_max_tokens: int = Field(
        description="The reduce llm maximum tokens.",
        default=defs.GLOBAL_SEARCH_REDUCE_MAX_TOKENS,
    )
    # Number of concurrent requests issued during global search.
    concurrency: int = Field(
        description="The number of concurrent requests.",
        default=defs.GLOBAL_SEARCH_CONCURRENCY,
    )