├── tests ├── unit │ ├── config │ │ ├── prompt-a.txt │ │ ├── prompt-b.txt │ │ ├── prompt-c.txt │ │ ├── prompt-d.txt │ │ └── __init__.py │ ├── __init__.py │ └── indexing │ │ ├── __init__.py │ │ ├── cache │ │ └── __init__.py │ │ ├── config │ │ ├── __init__.py │ │ ├── default_config_with_overridden_workflows.yml │ │ ├── default_config_with_overridden_input.yml │ │ └── default_config_with_everything_overridden.yml │ │ ├── graph │ │ ├── __init__.py │ │ ├── utils │ │ │ └── __init__.py │ │ └── extractors │ │ │ ├── __init__.py │ │ │ └── community_reports │ │ │ └── __init__.py │ │ ├── verbs │ │ ├── __init__.py │ │ ├── text │ │ │ └── __init__.py │ │ ├── entities │ │ │ ├── __init__.py │ │ │ └── extraction │ │ │ │ ├── __init__.py │ │ │ │ └── strategies │ │ │ │ ├── __init__.py │ │ │ │ └── graph_intelligence │ │ │ │ └── __init__.py │ │ └── helpers │ │ │ ├── __init__.py │ │ │ └── mock_llm.py │ │ ├── storage │ │ └── __init__.py │ │ ├── workflows │ │ ├── __init__.py │ │ └── helpers.py │ │ ├── test_exports.py │ │ └── test_init_content.py ├── __init__.py ├── smoke │ └── __init__.py ├── notebook │ ├── __init__.py │ └── test_notebooks.py ├── integration │ ├── __init__.py │ └── _pipeline │ │ └── __init__.py ├── fixtures │ ├── azure │ │ ├── input │ │ │ └── ABOUT.md │ │ ├── config.json │ │ └── settings.yml │ ├── text │ │ ├── input │ │ │ └── ABOUT.md │ │ └── settings.yml │ └── min-csv │ │ ├── input │ │ └── ABOUT.md │ │ └── settings.yml └── conftest.py ├── examples ├── single_verb │ ├── input │ │ └── data.csv │ ├── __init__.py │ └── pipeline.yml ├── multiple_workflows │ ├── workflows │ │ ├── shared │ │ │ └── shared_fill_value.txt │ │ ├── workflow_1.yml │ │ ├── workflow_3.yml │ │ └── workflow_2.yml │ ├── __init__.py │ └── pipeline.yml ├── __init__.py ├── custom_input │ ├── __init__.py │ ├── pipeline.yml │ └── run.py ├── entity_extraction │ ├── __init__.py │ ├── with_nltk │ │ ├── __init__.py │ │ └── pipeline.yml │ └── with_graph_intelligence │ │ ├── __init__.py │ │ └── pipeline.yml ├── 
use_built_in_workflows │ ├── __init__.py │ └── pipeline.yml ├── interdependent_workflows │ ├── __init__.py │ └── pipeline.yml ├── various_levels_of_configs │ └── __init__.py ├── custom_set_of_available_verbs │ ├── __init__.py │ ├── pipeline.yml │ └── custom_verb_definitions.py ├── custom_set_of_available_workflows │ ├── __init__.py │ ├── pipeline.yml │ └── custom_workflow_definitions.py └── README.md ├── docsite ├── .eleventyignore ├── nbdocsite_template │ └── conf.json ├── .gitignore ├── img │ ├── GraphRag-Figure1.jpg │ └── pipeline-running.png ├── .yarnrc.yml ├── .yarn │ └── sdks │ │ └── integrations.yml ├── data │ └── operation_dulce │ │ └── ABOUT.md ├── posts │ ├── query │ │ └── notebooks │ │ │ └── overview.md │ └── config │ │ └── overview.md ├── package.json └── .eleventy.js ├── scripts ├── spellcheck.sh ├── start-azurite.sh ├── e2e-test.sh └── semver-check.sh ├── .semversioner ├── next-release │ ├── patch-20240701233152787373.json │ ├── patch-20240703182750529114.json │ ├── patch-20240704181236015699.json │ ├── patch-20240705184142723331.json │ └── patch-20240703152422358587.json └── 0.1.0.json ├── graphrag ├── __init__.py ├── index │ ├── py.typed │ ├── graph │ │ ├── __init__.py │ │ ├── embedding │ │ │ ├── __init__.py │ │ │ └── embedding.py │ │ ├── extractors │ │ │ ├── claims │ │ │ │ └── __init__.py │ │ │ ├── summarize │ │ │ │ ├── __init__.py │ │ │ │ └── prompts.py │ │ │ ├── graph │ │ │ │ └── __init__.py │ │ │ ├── __init__.py │ │ │ └── community_reports │ │ │ │ └── __init__.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── normalize_node_names.py │ │ └── visualization │ │ │ ├── __init__.py │ │ │ └── typing.py │ ├── verbs │ │ ├── graph │ │ │ ├── layout │ │ │ │ ├── methods │ │ │ │ │ └── __init__.py │ │ │ │ └── __init__.py │ │ │ ├── embed │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── node_2_vec.py │ │ │ │ ├── __init__.py │ │ │ │ └── typing.py │ │ │ ├── clustering │ │ │ │ ├── strategies │ │ │ │ │ └── __init__.py │ │ │ │ ├── typing.py │ │ │ │ └── 
__init__.py │ │ │ ├── report │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── graph_intelligence │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── defaults.py │ │ │ │ │ └── typing.py │ │ │ │ └── __init__.py │ │ │ ├── merge │ │ │ │ ├── __init__.py │ │ │ │ ├── defaults.py │ │ │ │ └── typing.py │ │ │ └── __init__.py │ │ ├── text │ │ │ ├── embed │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── typing.py │ │ │ │ │ └── mock.py │ │ │ │ └── __init__.py │ │ │ ├── chunk │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── typing.py │ │ │ │ │ └── sentence.py │ │ │ │ ├── __init__.py │ │ │ │ └── typing.py │ │ │ ├── replace │ │ │ │ ├── __init__.py │ │ │ │ └── typing.py │ │ │ ├── translate │ │ │ │ ├── __init__.py │ │ │ │ └── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── defaults.py │ │ │ │ │ ├── typing.py │ │ │ │ │ └── mock.py │ │ │ └── __init__.py │ │ ├── entities │ │ │ ├── extraction │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── graph_intelligence │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── defaults.py │ │ │ │ │ └── typing.py │ │ │ │ └── __init__.py │ │ │ ├── summarize │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── graph_intelligence │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── defaults.py │ │ │ │ │ └── typing.py │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ ├── covariates │ │ │ ├── extract_covariates │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── graph_intelligence │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── defaults.py │ │ │ │ └── __init__.py │ │ │ ├── __init__.py │ │ │ └── typing.py │ │ ├── overrides │ │ │ ├── __init__.py │ │ │ └── concat.py │ │ ├── unzip.py │ │ ├── snapshot.py │ │ └── __init__.py │ ├── workflows │ │ ├── v1 │ │ │ ├── __init__.py │ │ │ ├── create_final_documents.py │ │ │ └── join_text_units_to_covariate_ids.py │ │ ├── __init__.py │ │ └── typing.py │ ├── input │ │ └── __init__.py │ ├── progress │ │ └── __init__.py │ ├── emit │ │ ├── types.py │ │ ├── table_emitter.py │ │ 
├── __init__.py │ │ ├── csv_table_emitter.py │ │ └── json_table_emitter.py │ ├── llm │ │ ├── types.py │ │ └── __init__.py │ ├── utils │ │ ├── topological_sort.py │ │ ├── load_graph.py │ │ ├── uuid.py │ │ ├── hashing.py │ │ ├── is_null.py │ │ ├── dicts.py │ │ ├── string.py │ │ ├── json.py │ │ ├── __init__.py │ │ ├── ds_util.py │ │ ├── tokens.py │ │ └── rate_limiter.py │ ├── text_splitting │ │ ├── check_token_limit.py │ │ └── __init__.py │ ├── cache │ │ └── __init__.py │ ├── typing.py │ ├── storage │ │ ├── __init__.py │ │ └── load_storage.py │ ├── reporting │ │ ├── __init__.py │ │ └── console_workflow_callbacks.py │ ├── errors.py │ ├── bootstrap.py │ ├── config │ │ └── workflow.py │ └── context.py ├── query │ ├── __init__.py │ ├── input │ │ ├── __init__.py │ │ ├── loaders │ │ │ └── __init__.py │ │ └── retrieval │ │ │ └── __init__.py │ ├── llm │ │ ├── __init__.py │ │ ├── oai │ │ │ ├── typing.py │ │ │ └── __init__.py │ │ └── text_utils.py │ ├── question_gen │ │ ├── __init__.py │ │ └── system_prompt.py │ ├── structured_search │ │ ├── __init__.py │ │ ├── global_search │ │ │ ├── __init__.py │ │ │ └── callbacks.py │ │ └── local_search │ │ │ └── __init__.py │ ├── context_builder │ │ ├── __init__.py │ │ └── builders.py │ └── progress.py ├── prompt_tune │ ├── __init__.py │ ├── generator │ │ ├── defaults.py │ │ ├── domain.py │ │ ├── persona.py │ │ ├── __init__.py │ │ ├── entity_summarization_prompt.py │ │ └── community_reporter_role.py │ ├── loader │ │ └── __init__.py │ ├── prompt │ │ ├── domain.py │ │ ├── persona.py │ │ ├── __init__.py │ │ └── community_reporter_role.py │ └── template │ │ ├── __init__.py │ │ └── entity_summarization.py ├── model │ ├── types.py │ ├── named.py │ ├── identified.py │ └── __init__.py ├── llm │ ├── openai │ │ ├── types.py │ │ ├── _json.py │ │ ├── __init__.py │ │ ├── json_parsing_llm.py │ │ ├── openai_history_tracking_llm.py │ │ ├── openai_token_replacing_llm.py │ │ ├── openai_embeddings_llm.py │ │ └── openai_completion_llm.py │ ├── mock │ │ ├── 
__init__.py │ │ └── mock_completion_llm.py │ ├── base │ │ ├── __init__.py │ │ └── _create_cache_key.py │ ├── errors.py │ ├── types │ │ ├── llm_types.py │ │ ├── llm_cache.py │ │ ├── llm.py │ │ ├── llm_callbacks.py │ │ ├── llm_invocation_result.py │ │ ├── llm_config.py │ │ └── __init__.py │ └── limiting │ │ ├── __init__.py │ │ ├── llm_limiter.py │ │ ├── noop_llm_limiter.py │ │ ├── create_limiters.py │ │ ├── composite_limiter.py │ │ └── tpm_rpm_limiter.py ├── config │ ├── input_models │ │ ├── umap_config_input.py │ │ ├── parallelization_parameters_input.py │ │ ├── cluster_graph_config_input.py │ │ ├── snapshots_config_input.py │ │ ├── chunking_config_input.py │ │ ├── summarize_descriptions_config_input.py │ │ ├── community_reports_config_input.py │ │ ├── entity_extraction_config_input.py │ │ ├── global_search_config_input.py │ │ ├── claim_extraction_config_input.py │ │ ├── cache_config_input.py │ │ ├── storage_config_input.py │ │ ├── reporting_config_input.py │ │ ├── llm_config_input.py │ │ ├── embed_graph_config_input.py │ │ ├── local_search_config_input.py │ │ ├── text_embedding_config_input.py │ │ ├── input_config_input.py │ │ └── llm_parameters_input.py │ ├── models │ │ ├── umap_config.py │ │ ├── parallelization_parameters.py │ │ ├── snapshots_config.py │ │ ├── llm_config.py │ │ ├── cluster_graph_config.py │ │ ├── cache_config.py │ │ ├── storage_config.py │ │ ├── reporting_config.py │ │ ├── chunking_config.py │ │ ├── local_search_config.py │ │ └── global_search_config.py │ └── read_dotenv.py └── vector_stores │ └── __init__.py ├── Screenshot 2024-07-09 at 3.36.28 AM.png ├── Screenshot 2024-07-09 at 3.34.31 AM-1.png ├── examples_notebooks └── inputs │ └── operation dulce │ ├── create_final_nodes.parquet │ ├── create_final_entities.parquet │ ├── create_final_covariates.parquet │ ├── create_final_text_units.parquet │ ├── create_final_relationships.parquet │ ├── ABOUT.md │ └── create_final_community_reports.parquet ├── .vscode ├── launch.json └── extensions.json ├── 
CODEOWNERS ├── .github ├── workflows │ ├── spellcheck.yml │ ├── semver.yml │ ├── javascript-ci.yml │ └── python-publish.yml ├── dependabot.yml └── pull_request_template.md ├── cspell.config.yaml ├── requirements.txt ├── .vsts-ci.yml ├── LICENSE └── .gitignore /tests/unit/config/prompt-a.txt: -------------------------------------------------------------------------------- 1 | Hello, World! A -------------------------------------------------------------------------------- /tests/unit/config/prompt-b.txt: -------------------------------------------------------------------------------- 1 | Hello, World! B -------------------------------------------------------------------------------- /tests/unit/config/prompt-c.txt: -------------------------------------------------------------------------------- 1 | Hello, World! C -------------------------------------------------------------------------------- /tests/unit/config/prompt-d.txt: -------------------------------------------------------------------------------- 1 | Hello, World! 
D -------------------------------------------------------------------------------- /examples/single_verb/input/data.csv: -------------------------------------------------------------------------------- 1 | col1,col2 2 | 2,4 3 | 5,10 -------------------------------------------------------------------------------- /docsite/.eleventyignore: -------------------------------------------------------------------------------- 1 | !posts/index/verbs/*.md 2 | !posts/index/workflows/*.md 3 | -------------------------------------------------------------------------------- /examples/multiple_workflows/workflows/shared/shared_fill_value.txt: -------------------------------------------------------------------------------- 1 | value_from_shared_file -------------------------------------------------------------------------------- /scripts/spellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | npx --yes cspell -c cspell.config.yaml --no-progress lint . -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /scripts/start-azurite.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | npx --yes azurite -L -l ./temp_azurite -d ./temp_azurite/debug.log -------------------------------------------------------------------------------- /tests/smoke/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /docsite/nbdocsite_template/conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "mimetypes": { 3 | "text/markdown": true 4 | } 5 | } -------------------------------------------------------------------------------- /tests/notebook/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/custom_input/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/single_verb/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/entity_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/cache/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /docsite/.gitignore: -------------------------------------------------------------------------------- 1 | _site 2 | _posts 3 | posts/query/notebooks/*.ipynb 4 | posts/query/notebooks/*_nb.md 5 | *.parquet 6 | *.zip -------------------------------------------------------------------------------- /examples/multiple_workflows/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/use_built_in_workflows/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/integration/_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/graph/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/storage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/text/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/entity_extraction/with_nltk/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/interdependent_workflows/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/various_levels_of_configs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/graph/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /.semversioner/next-release/patch-20240701233152787373.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "patch", 3 | "description": "Fix docsite base url" 4 | } 5 | -------------------------------------------------------------------------------- /docsite/img/GraphRag-Figure1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/docsite/img/GraphRag-Figure1.jpg -------------------------------------------------------------------------------- /docsite/img/pipeline-running.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/docsite/img/pipeline-running.png -------------------------------------------------------------------------------- /examples/custom_set_of_available_verbs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/custom_set_of_available_workflows/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /graphrag/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The GraphRAG package.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/py.typed: -------------------------------------------------------------------------------- 1 | # This package supports type hinting, 2 | # see https://www.python.org/dev/peps/pep-0561/#packaging-type-information -------------------------------------------------------------------------------- /scripts/e2e-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Use CLI Form 4 | poetry run python -m graphrag.index --config ./examples/single_verb/pipeline.yml -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/entity_extraction/with_graph_intelligence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /Screenshot 2024-07-09 at 3.36.28 AM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/Screenshot 2024-07-09 at 3.36.28 AM.png -------------------------------------------------------------------------------- /tests/unit/indexing/graph/extractors/community_reports/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/extraction/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /.semversioner/next-release/patch-20240703182750529114.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "patch", 3 | "description": "Fix broken prompt tuning link on docs" 4 | } 5 | -------------------------------------------------------------------------------- /Screenshot 2024-07-09 at 3.34.31 AM-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/Screenshot 2024-07-09 at 3.34.31 AM-1.png -------------------------------------------------------------------------------- /docsite/.yarnrc.yml: -------------------------------------------------------------------------------- 1 | compressionLevel: mixed 2 | 3 | enableGlobalCache: false 4 | 5 | nodeLinker: pnp 6 | 7 | yarnPath: .yarn/releases/yarn-4.0.2.cjs 8 | -------------------------------------------------------------------------------- /graphrag/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestration Module.""" 5 | -------------------------------------------------------------------------------- /.semversioner/next-release/patch-20240704181236015699.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "patch", 3 | "description": "Fix for --limit exceeding the dataframe lenght" 4 | } 5 | -------------------------------------------------------------------------------- /examples/custom_set_of_available_workflows/pipeline.yml: -------------------------------------------------------------------------------- 1 | workflows: 2 | - name: my_workflow 3 | config: 4 | derive_output_column: "col_1_multiplied" 5 | -------------------------------------------------------------------------------- /graphrag/query/input/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestration Inputs.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/llm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Orchestration LLM utilities.""" 5 | -------------------------------------------------------------------------------- /examples/multiple_workflows/workflows/workflow_1.yml: -------------------------------------------------------------------------------- 1 | name: workflow_1 2 | steps: 3 | - verb: fill 4 | args: 5 | to: "col_workflow_1" 6 | value: 1 7 | -------------------------------------------------------------------------------- /examples/multiple_workflows/workflows/workflow_3.yml: -------------------------------------------------------------------------------- 1 | name: workflow_3 2 | steps: 3 | - verb: fill 4 | args: 5 | to: "col_workflow_3" 6 | value: 3 7 | -------------------------------------------------------------------------------- /graphrag/query/question_gen/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Question Generation Module.""" 5 | -------------------------------------------------------------------------------- /.semversioner/next-release/patch-20240705184142723331.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "patch", 3 | "description": "Add Minute-based Rate Limiting and fix rpm, tpm settings" 4 | } 5 | -------------------------------------------------------------------------------- /docsite/.yarn/sdks/integrations.yml: -------------------------------------------------------------------------------- 1 | # This file is automatically generated by @yarnpkg/sdks. 2 | # Manual changes might be lost! 3 | 4 | integrations: 5 | - vscode 6 | -------------------------------------------------------------------------------- /graphrag/index/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Structured Search package.""" 5 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/extraction/strategies/graph_intelligence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /examples/multiple_workflows/pipeline.yml: -------------------------------------------------------------------------------- 1 | workflows: 2 | - !include workflows/workflow_1.yml 3 | - !include workflows/workflow_2.yml 4 | - !include workflows/workflow_3.yml -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/layout/methods/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Graph Layout Methods.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/input/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestartion Input Loaders.""" 5 | -------------------------------------------------------------------------------- /.semversioner/next-release/patch-20240703152422358587.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "patch", 3 | "description": "Add cli flag to overlay default values onto a provided config." 4 | } 5 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/embed/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Text Embedding strategies.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/workflows/v1/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine workflows package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Command line interface for the fine_tune module.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/input/retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestration Input Retrieval.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/global_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GlobalSearch module.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/local_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The LocalSearch package.""" 5 | -------------------------------------------------------------------------------- /tests/fixtures/azure/input/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) in an AI-generated science fiction novella, included here for the purposes of integration testing. -------------------------------------------------------------------------------- /tests/fixtures/text/input/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) in an AI-generated science fiction novella, included here for the purposes of integration testing. 
-------------------------------------------------------------------------------- /tests/unit/indexing/config/default_config_with_overridden_workflows.yml: -------------------------------------------------------------------------------- 1 | extends: default 2 | 3 | workflows: 4 | - name: TEST_WORKFLOW 5 | steps: 6 | - verb: TEST_VERB 7 | -------------------------------------------------------------------------------- /docsite/data/operation_dulce/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of integration testing. -------------------------------------------------------------------------------- /tests/fixtures/min-csv/input/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) in an AI-generated science fiction novella, included here for the purposes of integration testing. -------------------------------------------------------------------------------- /tests/unit/indexing/config/default_config_with_overridden_input.yml: -------------------------------------------------------------------------------- 1 | extends: default 2 | input: 3 | file_type: text 4 | base_dir: /some/overridden/dir 5 | file_pattern: test.txt 6 | -------------------------------------------------------------------------------- /examples/entity_extraction/with_nltk/pipeline.yml: -------------------------------------------------------------------------------- 1 | workflows: 2 | - name: "entity_extraction" 3 | config: 4 | entity_extract: 5 | strategy: 6 | type: "nltk" -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/clustering/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Graph Clustering Strategies.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/embed/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine embed strategies package root.""" 5 | -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/create_final_nodes.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/examples_notebooks/inputs/operation dulce/create_final_nodes.parquet -------------------------------------------------------------------------------- /graphrag/index/verbs/text/chunk/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text chunk strategies package root.""" 5 | -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/create_final_entities.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/examples_notebooks/inputs/operation dulce/create_final_entities.parquet -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/report/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph report strategies package root.""" 5 | -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/create_final_covariates.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/examples_notebooks/inputs/operation dulce/create_final_covariates.parquet -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/create_final_text_units.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/examples_notebooks/inputs/operation dulce/create_final_text_units.parquet -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/create_final_relationships.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/examples_notebooks/inputs/operation dulce/create_final_relationships.parquet -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/extraction/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine entities extraction strategies package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/context_builder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Functions to build context for system prompt to generate responses for a user query.""" 5 | -------------------------------------------------------------------------------- /.semversioner/0.1.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Initial Release", 5 | "type": "minor" 6 | } 7 | ], 8 | "created_at": "2024-07-01T21:48:50+00:00", 9 | "version": "0.1.0" 10 | } -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of providing a starting point for notebook experimentation. 4 | -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/create_final_community_reports.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TheAiSingularity/graphrag-local-ollama/HEAD/examples_notebooks/inputs/operation dulce/create_final_community_reports.parquet -------------------------------------------------------------------------------- /graphrag/index/verbs/covariates/extract_covariates/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text extract claims strategies package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/input/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine input package root.""" 5 | 6 | from .load_input import load_input 7 | 8 | __all__ = ["load_input"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/clustering/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing Communities list definition.""" 5 | 6 | Communities = list[tuple[int, str, list[str]]] 7 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | def pytest_addoption(parser): 4 | parser.addoption( 5 | "--run_slow", action="store_true", default=False, help="run slow tests" 6 | ) 7 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/replace/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text replace package root.""" 5 | 6 | from .replace import text_replace 7 | 8 | __all__ = ["text_replace"] 9 | -------------------------------------------------------------------------------- /graphrag/model/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Common types for the GraphRAG knowledge model.""" 5 | 6 | from collections.abc import Callable 7 | 8 | TextEmbedder = Callable[[str], list[float]] 9 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Attach to Node Functions", 6 | "type": "node", 7 | "request": "attach", 8 | "port": 9229, 9 | "preLaunchTask": "func: host start" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/layout/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph layout package root.""" 5 | 6 | from .layout_graph import layout_graph 7 | 8 | __all__ = ["layout_graph"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/merge/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph merge package root.""" 5 | 6 | from .merge_graphs import merge_graphs 7 | 8 | __all__ = ["merge_graphs"] 9 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. Unless a later match takes precedence, 3 | # @global-owner1 and @global-owner2 will be requested for 4 | # review when someone opens a pull request. 
5 | * @microsoft/societal-resilience 6 | 7 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/translate/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text translate package root.""" 5 | 6 | from .text_translate import text_translate 7 | 8 | __all__ = ["text_translate"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/covariates/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine covariates package root.""" 5 | 6 | from .extract_covariates import extract_covariates 7 | 8 | __all__ = ["extract_covariates"] 9 | -------------------------------------------------------------------------------- /examples/custom_set_of_available_verbs/pipeline.yml: -------------------------------------------------------------------------------- 1 | workflows: 2 | - steps: 3 | - verb: "str_append" # should be the key that you pass to the custom_verbs dict below 4 | args: 5 | source_column: "col1" 6 | target_column: "col_1_custom" 7 | string_to_append: " - custom verb" -------------------------------------------------------------------------------- /graphrag/index/graph/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph embedding package root.""" 5 | 6 | from .embedding import NodeEmbeddings, embed_nod2vec 7 | 8 | __all__ = ["NodeEmbeddings", "embed_nod2vec"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/summarize/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Indexing Engine - Summarization Strategies Package.""" 5 | 6 | from .typing import SummarizationStrategy 7 | 8 | __all__ = ["SummarizationStrategy"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Entity Resolution graph intelligence package root.""" 5 | 6 | from .run_graph_intelligence import run 7 | 8 | __all__ = ["run"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph intelligence package root.""" 5 | 6 | from .run_graph_intelligence import run_gi 7 | 8 | __all__ = ["run_gi"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/embed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text embed package root.""" 5 | 6 | from .text_embed import TextEmbedStrategyType, text_embed 7 | 8 | __all__ = ["TextEmbedStrategyType", "text_embed"] 9 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "arcanis.vscode-zipfs", 4 | "ms-python.python", 5 | "charliermarsh.ruff", 6 | "ms-python.vscode-pylance", 7 | "bierner.markdown-mermaid", 8 | "streetsidesoftware.code-spell-checker", 9 | "ronnidc.nunjucks" 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/embed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph embed package root.""" 5 | 6 | from .embed_graph import EmbedGraphStrategyType, embed_graph 7 | 8 | __all__ = ["EmbedGraphStrategyType", "embed_graph"] 9 | -------------------------------------------------------------------------------- /graphrag/llm/openai/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A base class for OpenAI-based LLMs.""" 5 | 6 | from openai import ( 7 | AsyncAzureOpenAI, 8 | AsyncOpenAI, 9 | ) 10 | 11 | OpenAIClientTypes = AsyncOpenAI | AsyncAzureOpenAI 12 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/report/strategies/graph_intelligence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph report strategies graph intelligence package root.""" 5 | 6 | from .run_graph_intelligence import run 7 | 8 | __all__ = ["run"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/chunk/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text chunk package root.""" 5 | 6 | from .text_chunk import ChunkStrategy, ChunkStrategyType, chunk 7 | 8 | __all__ = ["ChunkStrategy", "ChunkStrategyType", "chunk"] 9 | -------------------------------------------------------------------------------- /graphrag/index/progress/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Progress-reporting components.""" 5 | 6 | from .types import NullProgressReporter, PrintProgressReporter, ProgressReporter 7 | 8 | __all__ = ["NullProgressReporter", "PrintProgressReporter", "ProgressReporter"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph clustering package root.""" 5 | 6 | from .cluster_graph import GraphCommunityStrategyType, cluster_graph 7 | 8 | __all__ = ["GraphCommunityStrategyType", "cluster_graph"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text extract claims strategies graph intelligence package root.""" 5 | 6 | from .run_gi_extract_claims import run 7 | 8 | __all__ = ["run"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine entities extraction package root.""" 5 | 6 | from .entity_extract import ExtractEntityStrategyType, entity_extract 7 | 8 | __all__ = ["ExtractEntityStrategyType", "entity_extract"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/summarize/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Root package for resolution entities.""" 5 | 6 | from .description_summarize import SummarizeStrategyType, summarize_descriptions 7 | 8 | __all__ = ["SummarizeStrategyType", "summarize_descriptions"] 9 | -------------------------------------------------------------------------------- /graphrag/index/verbs/overrides/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine overrides package root.""" 5 | 6 | from .aggregate import aggregate 7 | from .concat import concat 8 | from .merge import merge 9 | 10 | __all__ = ["aggregate", "concat", "merge"] 11 | -------------------------------------------------------------------------------- /graphrag/llm/mock/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Mock LLM Implementations.""" 5 | 6 | from .mock_chat_llm import MockChatLLM 7 | from .mock_completion_llm import MockCompletionLLM 8 | 9 | __all__ = [ 10 | "MockChatLLM", 11 | "MockCompletionLLM", 12 | ] 13 | -------------------------------------------------------------------------------- /.github/workflows/spellcheck.yml: -------------------------------------------------------------------------------- 1 | name: Spellcheck 2 | on: 3 | push: 4 | branches: [main] 5 | pull_request: 6 | paths: 7 | - '**/*' 8 | jobs: 9 | spellcheck: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Spellcheck 15 | run: ./scripts/spellcheck.sh 16 | -------------------------------------------------------------------------------- /graphrag/index/emit/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Table Emitter Types.""" 5 | 6 | from enum import Enum 7 | 8 | 9 | class TableEmitterType(str, Enum): 10 | """Table Emitter Types.""" 11 | 12 | Json = "json" 13 | Parquet = "parquet" 14 | CSV = "csv" 15 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine entities package root.""" 5 | 6 | from .extraction import entity_extract 7 | from .summarize import summarize_descriptions 8 | 9 | __all__ = ["entity_extract", "summarize_descriptions"] 10 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/translate/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine translate strategies package root.""" 5 | 6 | from .mock import run as run_mock 7 | from .openai import run as run_openai 8 | 9 | __all__ = ["run_mock", "run_openai"] 10 | -------------------------------------------------------------------------------- /graphrag/llm/base/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Base LLM Implementations.""" 5 | 6 | from .base_llm import BaseLLM 7 | from .caching_llm import CachingLLM 8 | from .rate_limiting_llm import RateLimitingLLM 9 | 10 | __all__ = ["BaseLLM", "CachingLLM", "RateLimitingLLM"] 11 | -------------------------------------------------------------------------------- /.github/workflows/semver.yml: -------------------------------------------------------------------------------- 1 | name: Semver Check 2 | on: 3 | pull_request: 4 | branches: [main] 5 | 6 | jobs: 7 | semver: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | with: 12 | fetch-depth: 0 13 | 14 | - name: Check Semver 15 | run: ./scripts/semver-check.sh -------------------------------------------------------------------------------- /graphrag/index/verbs/covariates/extract_covariates/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text extract claims package root.""" 5 | 6 | from .extract_covariates import ExtractClaimsStrategyType, extract_covariates 7 | 8 | __all__ = ["ExtractClaimsStrategyType", "extract_covariates"] 9 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Default values for the fine-tuning module.""" 5 | 6 | DEFAULT_TASK = """ 7 | Identify the relations and structure of the community of interest, specifically within the {domain} domain. 
8 | """ 9 | 10 | MAX_TOKEN_COUNT = 2000 11 | -------------------------------------------------------------------------------- /graphrag/index/graph/extractors/claims/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph extractors claims package root.""" 5 | 6 | from .claim_extractor import ClaimExtractor 7 | from .prompts import CLAIM_EXTRACTION_PROMPT 8 | 9 | __all__ = ["CLAIM_EXTRACTION_PROMPT", "ClaimExtractor"] 10 | -------------------------------------------------------------------------------- /graphrag/index/llm/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing the 'LLMtype' model.""" 5 | 6 | from collections.abc import Callable 7 | from typing import TypeAlias 8 | 9 | TextSplitter: TypeAlias = Callable[[str], list[str]] 10 | TextListSplitter: TypeAlias = Callable[[list[str]], list[str]] 11 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/replace/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'Replacement' model.""" 5 | 6 | from dataclasses import dataclass 7 | 8 | 9 | @dataclass 10 | class Replacement: 11 | """Replacement class definition.""" 12 | 13 | pattern: str 14 | replacement: str 15 | -------------------------------------------------------------------------------- /tests/fixtures/azure/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_path": "./tests/fixtures/azure", 3 | "input_file_type": "text", 4 | "workflow_config": { 5 | "skip_assert": true, 6 | "azure": { 7 | "input_container": "azurefixture", 8 | "input_base_dir": "input" 9 | } 10 | }, 11 | "query_config": [], 12 | "slow": false 13 | } -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/helpers/mock_llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | from graphrag.llm import CompletionLLM, MockChatLLM 4 | 5 | 6 | def create_mock_llm( 7 | responses: list[str], 8 | ) -> CompletionLLM: 9 | """Creates a mock LLM that returns the given responses.""" 10 | return MockChatLLM(responses) 11 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/translate/strategies/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A file containing TRANSLATION_PROMPT value definition.""" 5 | 6 | TRANSLATION_PROMPT = """ 7 | You are a helpful assistant. Translate into {language} the following text, and make sure all of the text is in {language}. 
8 | """.strip() 9 | -------------------------------------------------------------------------------- /graphrag/index/graph/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph utils package root.""" 5 | 6 | from .normalize_node_names import normalize_node_names 7 | from .stable_lcc import stable_largest_connected_component 8 | 9 | __all__ = ["normalize_node_names", "stable_largest_connected_component"] 10 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/embed/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing different lists and dictionaries.""" 5 | 6 | # Use this for now instead of a wrapper 7 | from typing import Any 8 | 9 | NodeList = list[str] 10 | EmbeddingList = list[Any] 11 | NodeEmbeddings = dict[str, list[float]] 12 | """Label -> Embedding""" 13 | -------------------------------------------------------------------------------- /tests/unit/indexing/config/default_config_with_everything_overridden.yml: -------------------------------------------------------------------------------- 1 | extends: default 2 | 3 | input: 4 | file_type: text 5 | base_dir: /some/overridden/dir 6 | file_pattern: test.txt 7 | 8 | storage: 9 | type: file 10 | 11 | cache: 12 | type: file 13 | 14 | reporting: 15 | type: file 16 | 17 | workflows: 18 | - name: TEST_WORKFLOW 19 | steps: 20 | - verb: TEST_VERB 21 | -------------------------------------------------------------------------------- /graphrag/index/utils/topological_sort.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Topological sort utility method.""" 5 | 6 | from graphlib import TopologicalSorter 7 | 8 | 9 | def topological_sort(graph: dict[str, list[str]]) -> list[str]: 10 | """Topological sort.""" 11 | ts = TopologicalSorter(graph) 12 | return list(ts.static_order()) 13 | -------------------------------------------------------------------------------- /graphrag/config/input_models/umap_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | 9 | class UmapConfigInput(TypedDict): 10 | """Configuration section for UMAP.""" 11 | 12 | enabled: NotRequired[bool | str | None] 13 | -------------------------------------------------------------------------------- /graphrag/index/llm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
def load_graph(graphml: str | nx.Graph) -> nx.Graph:
    """Load a graph from GraphML markup, or pass an existing graph through.

    If *graphml* is a string it is parsed as GraphML *content* (not a file
    path) via ``nx.parse_graphml``; an ``nx.Graph`` instance is returned
    unchanged.
    """
    return nx.parse_graphml(graphml) if isinstance(graphml, str) else graphml
Run 'poetry run semversioner add-change' to update the next release version" 8 | exit 1 9 | fi 10 | echo "OK" 11 | -------------------------------------------------------------------------------- /tests/unit/indexing/test_exports.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | from graphrag.index import ( 4 | create_pipeline_config, 5 | run_pipeline, 6 | run_pipeline_with_config, 7 | ) 8 | 9 | 10 | def test_exported_functions(): 11 | assert callable(create_pipeline_config) 12 | assert callable(run_pipeline_with_config) 13 | assert callable(run_pipeline) 14 | -------------------------------------------------------------------------------- /graphrag/index/utils/uuid.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """UUID utilities.""" 5 | 6 | import uuid 7 | from random import Random, getrandbits 8 | 9 | 10 | def gen_uuid(rd: Random | None = None): 11 | """Generate a random UUID v4.""" 12 | return uuid.UUID( 13 | int=rd.getrandbits(128) if rd is not None else getrandbits(128), version=4 14 | ).hex 15 | -------------------------------------------------------------------------------- /graphrag/model/named.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A package containing the 'Named' protocol.""" 5 | 6 | from dataclasses import dataclass 7 | 8 | from .identified import Identified 9 | 10 | 11 | @dataclass 12 | class Named(Identified): 13 | """A protocol for an item with a name/title.""" 14 | 15 | title: str 16 | """The name/title of the item.""" 17 | -------------------------------------------------------------------------------- /tests/fixtures/text/settings.yml: -------------------------------------------------------------------------------- 1 | claim_extraction: 2 | enabled: true 3 | 4 | embeddings: 5 | vector_store: 6 | type: "azure_ai_search" 7 | url: ${AZURE_AI_SEARCH_URL_ENDPOINT} 8 | api_key: ${AZURE_AI_SEARCH_API_KEY} 9 | collection_name: "simple_text_ci" 10 | query_collection_name: "simple_text_ci_query" 11 | store_in_table: True 12 | 13 | entity_name_description: 14 | title_column: "name" 15 | -------------------------------------------------------------------------------- /graphrag/index/graph/extractors/summarize/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine unipartite graph package root.""" 5 | 6 | from .description_summary_extractor import ( 7 | SummarizationResult, 8 | SummarizeExtractor, 9 | ) 10 | from .prompts import SUMMARIZE_PROMPT 11 | 12 | __all__ = ["SUMMARIZE_PROMPT", "SummarizationResult", "SummarizeExtractor"] 13 | -------------------------------------------------------------------------------- /graphrag/config/input_models/parallelization_parameters_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
class ParallelizationParametersInput(TypedDict):
    """Configuration section for parallelization parameters."""

    # Stagger delay between parallel operations; a str is also accepted
    # (presumably for templated/env-substituted config values — confirm with loader).
    stagger: NotRequired[float | str | None]
    # Number of worker threads; a str is also accepted
    # (presumably for templated/env-substituted config values — confirm with loader).
    num_threads: NotRequired[int | str | None]
class RetriesExhaustedError(RuntimeError):
    """Raised when an operation has used up all of its retry attempts."""

    def __init__(self, name: str, num_retries: int) -> None:
        """Compose the failure message from the operation name and retry count."""
        message = f"Operation '{name}' failed - {num_retries} retries exhausted"
        super().__init__(message)
def check_token_limit(text, max_token):
    """Return 1 if *text* fits within *max_token* tokens, else 0."""
    # Zero overlap so the chunk count reflects total token usage exactly.
    text_splitter = TokenTextSplitter(chunk_size=max_token, chunk_overlap=0)
    docs = text_splitter.split_text(text)
    # More than one chunk means the text exceeded the token budget.
    if len(docs) > 1:
        return 0
    return 1
def gen_md5_hash(item: dict[str, Any], hashcode: Iterable[str]) -> str:
    """Generate an md5 hex digest over selected fields of *item*.

    The values of *item* named by *hashcode* are stringified and concatenated
    in the given order before hashing. Not used for security purposes
    (``usedforsecurity=False``).
    """
    hashed = "".join([str(item[column]) for column in hashcode])
    # hexdigest() already returns a str; the original wrapped it in a
    # redundant f-string, which is dropped here.
    return md5(hashed.encode("utf-8"), usedforsecurity=False).hexdigest()
2 | # Licensed under the MIT License 3 | 4 | """LLM Types.""" 5 | 6 | from typing import TypeAlias 7 | 8 | from .llm import LLM 9 | 10 | EmbeddingInput: TypeAlias = list[str] 11 | EmbeddingOutput: TypeAlias = list[list[float]] 12 | CompletionInput: TypeAlias = str 13 | CompletionOutput: TypeAlias = str 14 | 15 | EmbeddingLLM: TypeAlias = LLM[EmbeddingInput, EmbeddingOutput] 16 | CompletionLLM: TypeAlias = LLM[CompletionInput, CompletionOutput] 17 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A file containing some default responses.""" 5 | 6 | from graphrag.config.enums import LLMType 7 | 8 | MOCK_LLM_RESPONSES = [ 9 | """ 10 | This is a MOCK response for the LLM. It is summarized! 11 | """.strip() 12 | ] 13 | 14 | DEFAULT_LLM_CONFIG = { 15 | "type": LLMType.StaticResponse, 16 | "responses": MOCK_LLM_RESPONSES, 17 | } 18 | -------------------------------------------------------------------------------- /graphrag/config/models/umap_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | 10 | 11 | class UmapConfig(BaseModel): 12 | """Configuration section for UMAP.""" 13 | 14 | enabled: bool = Field( 15 | description="A flag indicating whether to enable UMAP.", 16 | default=defs.UMAP_ENABLED, 17 | ) 18 | -------------------------------------------------------------------------------- /graphrag/index/graph/extractors/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine unipartite graph package root.""" 5 | 6 | from .graph_extractor import ( 7 | DEFAULT_ENTITY_TYPES, 8 | GraphExtractionResult, 9 | GraphExtractor, 10 | ) 11 | from .prompts import GRAPH_EXTRACTION_PROMPT 12 | 13 | __all__ = [ 14 | "DEFAULT_ENTITY_TYPES", 15 | "GRAPH_EXTRACTION_PROMPT", 16 | "GraphExtractionResult", 17 | "GraphExtractor", 18 | ] 19 | -------------------------------------------------------------------------------- /graphrag/config/input_models/snapshots_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | 9 | class SnapshotsConfigInput(TypedDict): 10 | """Configuration section for snapshots.""" 11 | 12 | graphml: NotRequired[bool | str | None] 13 | raw_entities: NotRequired[bool | str | None] 14 | top_level_nodes: NotRequired[bool | str | None] 15 | -------------------------------------------------------------------------------- /examples/custom_input/pipeline.yml: -------------------------------------------------------------------------------- 1 | 2 | # Setup reporting however you'd like 3 | reporting: 4 | type: console 5 | 6 | # Setup storage however you'd like 7 | storage: 8 | type: memory 9 | 10 | # Setup cache however you'd like 11 | cache: 12 | type: memory 13 | 14 | # Just a simple workflow 15 | workflows: 16 | 17 | # This is an anonymous workflow, it doesn't have a name 18 | - steps: 19 | 20 | # Unpack the nodes from the graph 21 | - verb: fill 22 | args: 23 | to: filled_column 24 | value: "Filled Value" -------------------------------------------------------------------------------- /graphrag/model/identified.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A package containing the 'Identified' protocol.""" 5 | 6 | from dataclasses import dataclass 7 | 8 | 9 | @dataclass 10 | class Identified: 11 | """A protocol for an item with an ID.""" 12 | 13 | id: str 14 | """The ID of the item.""" 15 | 16 | short_id: str | None 17 | """Human readable ID used to refer to this community in prompts or texts displayed to users, such as in a report text (optional).""" 18 | -------------------------------------------------------------------------------- /examples/multiple_workflows/workflows/workflow_2.yml: -------------------------------------------------------------------------------- 1 | name: workflow_2 2 | steps: 3 | - verb: fill 4 | args: 5 | to: "col_workflow_2" 6 | value: 2 7 | input: 8 | 9 | # workflow_2 is dependent on workflow_1 10 | # so in workflow_2 output, you'll also see the output from workflow_1 11 | source: "workflow:workflow_1" 12 | 13 | # Example of pulling in values from a shared file 14 | - verb: fill 15 | args: 16 | to: "col_from_shared_file" 17 | value: !include ./shared/shared_fill_value.txt 18 | -------------------------------------------------------------------------------- /graphrag/index/graph/utils/normalize_node_names.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
def normalize_node_names(graph: nx.Graph | nx.DiGraph) -> nx.Graph | nx.DiGraph:
    """Return a relabeled copy of *graph* with normalized node names.

    Each label is upper-cased, stripped of surrounding whitespace, and
    HTML-unescaped. Note that unescaping runs *after* upper()/strip(), so
    escaped entities (e.g. "&amp;") are decoded in their upper-cased form.
    """
    node_mapping = {node: html.unescape(node.upper().strip()) for node in graph.nodes()}  # type: ignore
    # relabel_nodes returns a new graph by default (copy=True).
    return nx.relabel_nodes(graph, node_mapping)
10 | 11 | Text: {input_text} 12 | Domain:""" 13 | -------------------------------------------------------------------------------- /graphrag/llm/limiting/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """LLM limiters module.""" 5 | 6 | from .composite_limiter import CompositeLLMLimiter 7 | from .create_limiters import create_tpm_rpm_limiters 8 | from .llm_limiter import LLMLimiter 9 | from .noop_llm_limiter import NoopLLMLimiter 10 | from .tpm_rpm_limiter import TpmRpmLLMLimiter 11 | 12 | __all__ = [ 13 | "CompositeLLMLimiter", 14 | "LLMLimiter", 15 | "NoopLLMLimiter", 16 | "TpmRpmLLMLimiter", 17 | "create_tpm_rpm_limiters", 18 | ] 19 | -------------------------------------------------------------------------------- /graphrag/index/cache/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine cache package root.""" 5 | 6 | from .json_pipeline_cache import JsonPipelineCache 7 | from .load_cache import load_cache 8 | from .memory_pipeline_cache import InMemoryCache 9 | from .noop_pipeline_cache import NoopPipelineCache 10 | from .pipeline_cache import PipelineCache 11 | 12 | __all__ = [ 13 | "InMemoryCache", 14 | "JsonPipelineCache", 15 | "NoopPipelineCache", 16 | "PipelineCache", 17 | "load_cache", 18 | ] 19 | -------------------------------------------------------------------------------- /graphrag/llm/limiting/llm_limiter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
@dataclass
class PipelineRunResult:
    """Pipeline run result class definition."""

    # Name of the workflow this result belongs to.
    workflow: str
    # Output table produced by the workflow, if any.
    result: pd.DataFrame | None
    # Exceptions collected during the run, if any.
    errors: list[BaseException] | None
Can be a string, a list of strings, or a list of tuples of (id, text).""" 20 | -------------------------------------------------------------------------------- /tests/fixtures/min-csv/settings.yml: -------------------------------------------------------------------------------- 1 | input: 2 | file_type: csv 3 | 4 | embeddings: 5 | vector_store: 6 | type: "lancedb" 7 | uri_db: "./tests/fixtures/min-csv/lancedb" 8 | store_in_table: True 9 | 10 | entity_name_description: 11 | title_column: "name" 12 | # id_column: "id" 13 | # overwrite: true 14 | # entity_name: ... 15 | # relationship_description: ... 16 | # community_report_full_content: ... 17 | # community_report_summary: ... 18 | # community_report_title: ... 19 | # document_raw_content: ... 20 | # text_unit_text: ... 21 | -------------------------------------------------------------------------------- /graphrag/config/input_models/summarize_descriptions_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired 7 | 8 | from .llm_config_input import LLMConfigInput 9 | 10 | 11 | class SummarizeDescriptionsConfigInput(LLMConfigInput): 12 | """Configuration section for description summarization.""" 13 | 14 | prompt: NotRequired[str | None] 15 | max_length: NotRequired[int | str | None] 16 | strategy: NotRequired[dict | None] 17 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/chunk/strategies/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
class NoopLLMLimiter(LLMLimiter):
    """A pass-through limiter that never throttles callers."""

    @property
    def needs_token_count(self) -> bool:
        """This limiter ignores token counts entirely."""
        return False

    async def acquire(self, num_tokens: int = 1) -> None:
        """Grant passage immediately; no rate limiting is applied."""
2 | # Licensed under the MIT License 3 | 4 | """A file containing DEFAULT_NODE_OPERATIONS, DEFAULT_EDGE_OPERATIONS and DEFAULT_CONCAT_SEPARATOR values definition.""" 5 | 6 | from .typing import BasicMergeOperation 7 | 8 | DEFAULT_NODE_OPERATIONS = { 9 | "*": { 10 | "operation": BasicMergeOperation.Replace, 11 | } 12 | } 13 | 14 | DEFAULT_EDGE_OPERATIONS = { 15 | "*": { 16 | "operation": BasicMergeOperation.Replace, 17 | }, 18 | "weight": "sum", 19 | } 20 | 21 | DEFAULT_CONCAT_SEPARATOR = "," 22 | -------------------------------------------------------------------------------- /docsite/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@graphrag/docsite", 3 | "version": "0.0.1", 4 | "private": true, 5 | "scripts": { 6 | "start": "eleventy --serve", 7 | "build": "eleventy && touch _site/.nojekyll", 8 | "build:docs": "yarn build", 9 | "start:docs": "yarn start" 10 | }, 11 | "dependencies": { 12 | "@11ty/eleventy": "^2.0.1", 13 | "@11ty/eleventy-plugin-syntaxhighlight": "^5.0.0", 14 | "@kevingimbel/eleventy-plugin-mermaid": "^2.2.1", 15 | "eleventy-plugin-code-clipboard": "^0.1.1", 16 | "markdown-it": "^14.1.0" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /graphrag/config/input_models/community_reports_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired 7 | 8 | from .llm_config_input import LLMConfigInput 9 | 10 | 11 | class CommunityReportsConfigInput(LLMConfigInput): 12 | """Configuration section for community reports.""" 13 | 14 | prompt: NotRequired[str | None] 15 | max_length: NotRequired[int | str | None] 16 | max_input_length: NotRequired[int | str | None] 17 | strategy: NotRequired[dict | None] 18 | -------------------------------------------------------------------------------- /graphrag/query/llm/oai/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """OpenAI wrapper options.""" 5 | 6 | from enum import Enum 7 | from typing import Any, cast 8 | 9 | import openai 10 | 11 | OPENAI_RETRY_ERROR_TYPES = ( 12 | # TODO: update these when we update to OpenAI 1+ library 13 | cast(Any, openai).RateLimitError, 14 | cast(Any, openai).APIConnectionError, 15 | # TODO: replace with comparable OpenAI 1+ error 16 | ) 17 | 18 | 19 | class OpenaiApiType(str, Enum): 20 | """The OpenAI Flavor.""" 21 | 22 | OpenAI = "openai" 23 | AzureOpenAI = "azure" 24 | -------------------------------------------------------------------------------- /graphrag/config/input_models/entity_extraction_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired 7 | 8 | from .llm_config_input import LLMConfigInput 9 | 10 | 11 | class EntityExtractionConfigInput(LLMConfigInput): 12 | """Configuration section for entity extraction.""" 13 | 14 | prompt: NotRequired[str | None] 15 | entity_types: NotRequired[list[str] | str | None] 16 | max_gleanings: NotRequired[int | str | None] 17 | strategy: NotRequired[dict | None] 18 | -------------------------------------------------------------------------------- /examples/use_built_in_workflows/pipeline.yml: -------------------------------------------------------------------------------- 1 | workflows: 2 | - name: "entity_extraction" 3 | config: 4 | entity_extract: 5 | strategy: 6 | type: "nltk" 7 | 8 | - name: "entity_graph" 9 | config: 10 | cluster_graph: 11 | strategy: 12 | type: "leiden" 13 | embed_graph: 14 | strategy: 15 | type: "node2vec" 16 | num_walks: 10 17 | walk_length: 40 18 | window_size: 2 19 | iterations: 3 20 | random_seed: 597832 21 | layout_graph: 22 | strategy: 23 | type: "umap" -------------------------------------------------------------------------------- /graphrag/index/graph/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
class GlobalSearchConfigInput(TypedDict):
    """The default configuration section for global search."""

    # Token budgets: overall, context data, map phase, and reduce phase.
    # str is also accepted (presumably for templated/env-substituted config — confirm).
    max_tokens: NotRequired[int | str | None]
    data_max_tokens: NotRequired[int | str | None]
    map_max_tokens: NotRequired[int | str | None]
    reduce_max_tokens: NotRequired[int | str | None]
    # Number of concurrent requests used during global search.
    concurrency: NotRequired[int | str | None]
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine workflows package root.""" 5 | 6 | from .load import create_workflow, load_workflows 7 | from .typing import ( 8 | StepDefinition, 9 | VerbDefinitions, 10 | VerbTiming, 11 | WorkflowConfig, 12 | WorkflowDefinitions, 13 | WorkflowToRun, 14 | ) 15 | 16 | __all__ = [ 17 | "StepDefinition", 18 | "VerbDefinitions", 19 | "VerbTiming", 20 | "WorkflowConfig", 21 | "WorkflowDefinitions", 22 | "WorkflowToRun", 23 | "create_workflow", 24 | "load_workflows", 25 | ] 26 | -------------------------------------------------------------------------------- /graphrag/vector_stores/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A package containing vector-storage implementations.""" 5 | 6 | from .azure_ai_search import AzureAISearch 7 | from .base import BaseVectorStore, VectorStoreDocument, VectorStoreSearchResult 8 | from .lancedb import LanceDBVectorStore 9 | from .typing import VectorStoreFactory, VectorStoreType 10 | 11 | __all__ = [ 12 | "AzureAISearch", 13 | "BaseVectorStore", 14 | "LanceDBVectorStore", 15 | "VectorStoreDocument", 16 | "VectorStoreFactory", 17 | "VectorStoreSearchResult", 18 | "VectorStoreType", 19 | ] 20 | -------------------------------------------------------------------------------- /graphrag/config/input_models/claim_extraction_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired 7 | 8 | from .llm_config_input import LLMConfigInput 9 | 10 | 11 | class ClaimExtractionConfigInput(LLMConfigInput): 12 | """Configuration section for claim extraction.""" 13 | 14 | enabled: NotRequired[bool | None] 15 | prompt: NotRequired[str | None] 16 | description: NotRequired[str | None] 17 | max_gleanings: NotRequired[int | str | None] 18 | strategy: NotRequired[dict | None] 19 | -------------------------------------------------------------------------------- /graphrag/index/storage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine storage package root.""" 5 | 6 | from .blob_pipeline_storage import BlobPipelineStorage, create_blob_storage 7 | from .file_pipeline_storage import FilePipelineStorage 8 | from .load_storage import load_storage 9 | from .memory_pipeline_storage import MemoryPipelineStorage 10 | from .typing import PipelineStorage 11 | 12 | __all__ = [ 13 | "BlobPipelineStorage", 14 | "FilePipelineStorage", 15 | "MemoryPipelineStorage", 16 | "PipelineStorage", 17 | "create_blob_storage", 18 | "load_storage", 19 | ] 20 | -------------------------------------------------------------------------------- /graphrag/index/utils/dicts.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A utility module containing methods for inspecting and verifying dictionary types.""" 5 | 6 | 7 | def dict_has_keys_with_types( 8 | data: dict, expected_fields: list[tuple[str, type]] 9 | ) -> bool: 10 | """Return True if the given dictionary has the given keys with the given types.""" 11 | for field, field_type in expected_fields: 12 | if field not in data: 13 | return False 14 | 15 | value = data[field] 16 | if not isinstance(value, field_type): 17 | return False 18 | return True 19 | -------------------------------------------------------------------------------- /graphrag/query/llm/oai/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestration OpenAI Wrappers.""" 5 | 6 | from .base import BaseOpenAILLM, OpenAILLMImpl, OpenAITextEmbeddingImpl 7 | from .chat_openai import ChatOpenAI 8 | from .embedding import OpenAIEmbedding 9 | from .openai import OpenAI 10 | from .typing import OPENAI_RETRY_ERROR_TYPES, OpenaiApiType 11 | 12 | __all__ = [ 13 | "OPENAI_RETRY_ERROR_TYPES", 14 | "BaseOpenAILLM", 15 | "ChatOpenAI", 16 | "OpenAI", 17 | "OpenAIEmbedding", 18 | "OpenAILLMImpl", 19 | "OpenAITextEmbeddingImpl", 20 | "OpenaiApiType", 21 | ] 22 | -------------------------------------------------------------------------------- /graphrag/config/input_models/cache_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | from graphrag.config.enums import CacheType 9 | 10 | 11 | class CacheConfigInput(TypedDict): 12 | """The default configuration section for Cache.""" 13 | 14 | type: NotRequired[CacheType | str | None] 15 | base_dir: NotRequired[str | None] 16 | connection_string: NotRequired[str | None] 17 | container_name: NotRequired[str | None] 18 | storage_account_blob_url: NotRequired[str | None] 19 | -------------------------------------------------------------------------------- /graphrag/config/input_models/storage_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | from graphrag.config.enums import StorageType 9 | 10 | 11 | class StorageConfigInput(TypedDict): 12 | """The default configuration section for Storage.""" 13 | 14 | type: NotRequired[StorageType | str | None] 15 | base_dir: NotRequired[str | None] 16 | connection_string: NotRequired[str | None] 17 | container_name: NotRequired[str | None] 18 | storage_account_blob_url: NotRequired[str | None] 19 | -------------------------------------------------------------------------------- /graphrag/config/input_models/reporting_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | from graphrag.config.enums import ReportingType 9 | 10 | 11 | class ReportingConfigInput(TypedDict): 12 | """The default configuration section for Reporting.""" 13 | 14 | type: NotRequired[ReportingType | str | None] 15 | base_dir: NotRequired[str | None] 16 | connection_string: NotRequired[str | None] 17 | container_name: NotRequired[str | None] 18 | storage_account_blob_url: NotRequired[str | None] 19 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/persona.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for persona generation.""" 5 | 6 | GENERATE_PERSONA_PROMPT = """ 7 | You are an intelligent assistant that helps a human to analyze the information in a text document. 8 | Given a specific type of task and sample text, help the user by generating a 3 to 4 sentence description of an expert who could help solve the problem. 9 | Use a format similar to the following: 10 | You are an expert {{role}}. You are skilled at {{relevant skills}}. You are adept at helping people with {{specific task}}. 11 | 12 | task: {sample_task} 13 | persona description:""" 14 | -------------------------------------------------------------------------------- /graphrag/config/models/parallelization_parameters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """LLM Parameters model.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | 10 | 11 | class ParallelizationParameters(BaseModel): 12 | """LLM Parameters model.""" 13 | 14 | stagger: float = Field( 15 | description="The stagger to use for the LLM service.", 16 | default=defs.PARALLELIZATION_STAGGER, 17 | ) 18 | num_threads: int = Field( 19 | description="The number of threads to use for the LLM service.", 20 | default=defs.PARALLELIZATION_NUM_THREADS, 21 | ) 22 | -------------------------------------------------------------------------------- /graphrag/index/utils/string.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """String utilities.""" 5 | 6 | import html 7 | import re 8 | from typing import Any 9 | 10 | 11 | def clean_str(input: Any) -> str: 12 | """Clean an input string by removing HTML escapes, control characters, and other unwanted characters.""" 13 | # If we get non-string input, just give it back 14 | if not isinstance(input, str): 15 | return input 16 | 17 | result = html.unescape(input.strip()) 18 | # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python 19 | return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) 20 | -------------------------------------------------------------------------------- /graphrag/llm/types/llm_cache.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Typing definitions for the OpenAI DataShaper package.""" 5 | 6 | from typing import Any, Protocol 7 | 8 | 9 | class LLMCache(Protocol): 10 | """LLM Cache interface.""" 11 | 12 | async def has(self, key: str) -> bool: 13 | """Check if the cache has a value.""" 14 | ... 
15 | 16 | async def get(self, key: str) -> Any | None: 17 | """Retrieve a value from the cache.""" 18 | ... 19 | 20 | async def set(self, key: str, value: Any, debug_data: dict | None = None) -> None: 21 | """Write a value into the cache.""" 22 | ... 23 | -------------------------------------------------------------------------------- /graphrag/llm/types/llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """LLM Types.""" 5 | 6 | from typing import Generic, Protocol, TypeVar 7 | 8 | from typing_extensions import Unpack 9 | 10 | from .llm_io import ( 11 | LLMInput, 12 | LLMOutput, 13 | ) 14 | 15 | TIn = TypeVar("TIn", contravariant=True) 16 | TOut = TypeVar("TOut") 17 | 18 | 19 | class LLM(Protocol, Generic[TIn, TOut]): 20 | """LLM Protocol definition.""" 21 | 22 | async def __call__( 23 | self, 24 | input: TIn, 25 | **kwargs: Unpack[LLMInput], 26 | ) -> LLMOutput[TOut]: 27 | """Invoke the LLM, treating the LLM as a function.""" 28 | ... 29 | -------------------------------------------------------------------------------- /graphrag/config/input_models/llm_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from datashaper import AsyncType 7 | from typing_extensions import NotRequired, TypedDict 8 | 9 | from .llm_parameters_input import LLMParametersInput 10 | from .parallelization_parameters_input import ParallelizationParametersInput 11 | 12 | 13 | class LLMConfigInput(TypedDict): 14 | """Base class for LLM-configured steps.""" 15 | 16 | llm: NotRequired[LLMParametersInput | None] 17 | parallelization: NotRequired[ParallelizationParametersInput | None] 18 | async_mode: NotRequired[AsyncType | str | None] 19 | -------------------------------------------------------------------------------- /graphrag/config/input_models/embed_graph_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | 9 | class EmbedGraphConfigInput(TypedDict): 10 | """The default configuration section for Node2Vec.""" 11 | 12 | enabled: NotRequired[bool | str | None] 13 | num_walks: NotRequired[int | str | None] 14 | walk_length: NotRequired[int | str | None] 15 | window_size: NotRequired[int | str | None] 16 | iterations: NotRequired[int | str | None] 17 | random_seed: NotRequired[int | str | None] 18 | strategy: NotRequired[dict | None] 19 | -------------------------------------------------------------------------------- /graphrag/index/reporting/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Reporting utilities and implementations for the indexing engine.""" 5 | 6 | from .blob_workflow_callbacks import BlobWorkflowCallbacks 7 | from .console_workflow_callbacks import ConsoleWorkflowCallbacks 8 | from .file_workflow_callbacks import FileWorkflowCallbacks 9 | from .load_pipeline_reporter import load_pipeline_reporter 10 | from .progress_workflow_callbacks import ProgressWorkflowCallbacks 11 | 12 | __all__ = [ 13 | "BlobWorkflowCallbacks", 14 | "ConsoleWorkflowCallbacks", 15 | "FileWorkflowCallbacks", 16 | "ProgressWorkflowCallbacks", 17 | "load_pipeline_reporter", 18 | ] 19 | -------------------------------------------------------------------------------- /.github/workflows/javascript-ci.yml: -------------------------------------------------------------------------------- 1 | name: JavaScript CI 2 | on: 3 | push: 4 | branches: [main] 5 | pull_request: 6 | branches: [main] 7 | 8 | env: 9 | NODE_VERSION: 18.x 10 | 11 | jobs: 12 | javascript-ci: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | fail-fast: false 16 | steps: 17 | - name: Use Node ${{ env.NODE_VERSION }} 18 | uses: actions/setup-node@v4 19 | with: 20 | node-version: ${{ env.NODE_VERSION }} 21 | 22 | - uses: actions/checkout@v4 23 | 24 | - run: yarn install 25 | working-directory: docsite 26 | name: Install Dependencies 27 | 28 | - run: yarn build 29 | working-directory: docsite 30 | name: Build Docsite -------------------------------------------------------------------------------- /graphrag/index/emit/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Definitions for emitting pipeline artifacts to storage.""" 5 | 6 | from .csv_table_emitter import CSVTableEmitter 7 | from .factories import create_table_emitter, create_table_emitters 8 | from .json_table_emitter import JsonTableEmitter 9 | from .parquet_table_emitter import ParquetTableEmitter 10 | from .table_emitter import TableEmitter 11 | from .types import TableEmitterType 12 | 13 | __all__ = [ 14 | "CSVTableEmitter", 15 | "JsonTableEmitter", 16 | "ParquetTableEmitter", 17 | "TableEmitter", 18 | "TableEmitterType", 19 | "create_table_emitter", 20 | "create_table_emitters", 21 | ] 22 | -------------------------------------------------------------------------------- /graphrag/index/graph/visualization/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | # Use this for now instead of a wrapper 5 | """A module containing 'NodePosition' model.""" 6 | 7 | from dataclasses import dataclass 8 | 9 | 10 | @dataclass 11 | class NodePosition: 12 | """Node position class definition.""" 13 | 14 | label: str 15 | cluster: str 16 | size: float 17 | 18 | x: float 19 | y: float 20 | z: float | None = None 21 | 22 | def to_pandas(self) -> tuple[str, float, float, str, float]: 23 | """To pandas method definition.""" 24 | return self.label, self.x, self.y, self.cluster, self.size 25 | 26 | 27 | GraphLayout = list[NodePosition] 28 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/translate/strategies/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'TextTranslationResult' model.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | from typing import Any 9 | 10 | from datashaper import VerbCallbacks 11 | 12 | from graphrag.index.cache import PipelineCache 13 | 14 | 15 | @dataclass 16 | class TextTranslationResult: 17 | """Text translation result class definition.""" 18 | 19 | translations: list[str] 20 | 21 | 22 | TextTranslationStrategy = Callable[ 23 | [list[str], dict[str, Any], VerbCallbacks, PipelineCache], 24 | Awaitable[TextTranslationResult], 25 | ] 26 | -------------------------------------------------------------------------------- /graphrag/index/errors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GraphRAG indexing error types.""" 5 | 6 | 7 | class NoWorkflowsDefinedError(ValueError): 8 | """Exception for no workflows defined.""" 9 | 10 | def __init__(self): 11 | super().__init__("No workflows defined.") 12 | 13 | 14 | class UndefinedWorkflowError(ValueError): 15 | """Exception for invalid verb input.""" 16 | 17 | def __init__(self): 18 | super().__init__("Workflow name is undefined.") 19 | 20 | 21 | class UnknownWorkflowError(ValueError): 22 | """Exception for invalid verb input.""" 23 | 24 | def __init__(self, name: str): 25 | super().__init__(f"Unknown workflow: {name}") 26 | -------------------------------------------------------------------------------- /graphrag/llm/types/llm_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Typing definitions for the OpenAI DataShaper package.""" 5 | 6 | from collections.abc import Callable 7 | 8 | from .llm_invocation_result import LLMInvocationResult 9 | 10 | ErrorHandlerFn = Callable[[BaseException | None, str | None, dict | None], None] 11 | """Error handler function type definition.""" 12 | 13 | LLMInvocationFn = Callable[[LLMInvocationResult], None] 14 | """Handler for LLM invocation results""" 15 | 16 | OnCacheActionFn = Callable[[str, str | None], None] 17 | """Handler for cache hits""" 18 | 19 | IsResponseValidFn = Callable[[dict], bool] 20 | """A function that checks if an LLM response is valid.""" 21 | -------------------------------------------------------------------------------- /graphrag/index/utils/json.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """JSON cleaning and formatting utilities.""" 5 | 6 | 7 | def clean_up_json(json_str: str): 8 | """Clean up json string.""" 9 | json_str = ( 10 | json_str.replace("\\n", "") 11 | .replace("\n", "") 12 | .replace("\r", "") 13 | .replace('"[{', "[{") 14 | .replace('}]"', "}]") 15 | .replace("\\", "") 16 | .strip() 17 | ) 18 | 19 | # Remove JSON Markdown Frame 20 | if json_str.startswith("```json"): 21 | json_str = json_str[len("```json") :] 22 | if json_str.endswith("```"): 23 | json_str = json_str[: len(json_str) - len("```")] 24 | 25 | return json_str 26 | -------------------------------------------------------------------------------- /graphrag/llm/openai/_json.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """JSON cleaning and formatting utilities.""" 5 | 6 | 7 | def clean_up_json(json_str: str) -> str: 8 | """Clean up json string.""" 9 | json_str = ( 10 | json_str.replace("\\n", "") 11 | .replace("\n", "") 12 | .replace("\r", "") 13 | .replace('"[{', "[{") 14 | .replace('}]"', "}]") 15 | .replace("\\", "") 16 | .strip() 17 | ) 18 | 19 | # Remove JSON Markdown Frame 20 | if json_str.startswith("```json"): 21 | json_str = json_str[len("```json") :] 22 | if json_str.endswith("```"): 23 | json_str = json_str[: len(json_str) - len("```")] 24 | 25 | return json_str 26 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/embed/strategies/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing 'TextEmbeddingResult' model.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | 9 | from datashaper import VerbCallbacks 10 | 11 | from graphrag.index.cache import PipelineCache 12 | 13 | 14 | @dataclass 15 | class TextEmbeddingResult: 16 | """Text embedding result class definition.""" 17 | 18 | embeddings: list[list[float] | None] | None 19 | 20 | 21 | TextEmbeddingStrategy = Callable[ 22 | [ 23 | list[str], 24 | VerbCallbacks, 25 | PipelineCache, 26 | dict, 27 | ], 28 | Awaitable[TextEmbeddingResult], 29 | ] 30 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Indexing Engine Examples 2 | This directory contains several examples of how to use the indexing engine. 3 | 4 | Most examples include two different forms of running the pipeline, both are contained in the examples `run.py` 5 | 1. Using mostly the Python API 6 | 2. 
Using mostly the a pipeline configuration file 7 | 8 | # Running an Example 9 | First run `poetry shell` to activate a virtual environment with the required dependencies. 10 | 11 | Then run `PYTHONPATH="$(pwd)" python examples/path_to_example/run.py` from the `python/graphrag` directory. 12 | 13 | For example to run the single_verb example, you would run the following commands: 14 | 15 | ```bash 16 | cd python/graphrag 17 | poetry shell 18 | PYTHONPATH="$(pwd)" python examples/single_verb/run.py 19 | ``` -------------------------------------------------------------------------------- /graphrag/config/input_models/local_search_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | 9 | class LocalSearchConfigInput(TypedDict): 10 | """The default configuration section for Cache.""" 11 | 12 | text_unit_prop: NotRequired[float | str | None] 13 | community_prop: NotRequired[float | str | None] 14 | conversation_history_max_turns: NotRequired[int | str | None] 15 | top_k_entities: NotRequired[int | str | None] 16 | top_k_relationships: NotRequired[int | str | None] 17 | max_tokens: NotRequired[int | str | None] 18 | llm_max_tokens: NotRequired[int | str | None] 19 | -------------------------------------------------------------------------------- /graphrag/index/verbs/unzip.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing unzip method definition.""" 5 | 6 | from typing import cast 7 | 8 | import pandas as pd 9 | from datashaper import TableContainer, VerbInput, verb 10 | 11 | 12 | # TODO: Check if this is already a thing 13 | # Takes 1|(x,y)|b 14 | # and converts to 15 | # 1|x|y|b 16 | @verb(name="unzip") 17 | def unzip( 18 | input: VerbInput, column: str, to: list[str], **_kwargs: dict 19 | ) -> TableContainer: 20 | """Unpacks a column containing a tuple into multiple columns.""" 21 | table = cast(pd.DataFrame, input.get_input()) 22 | 23 | table[to] = pd.DataFrame(table[column].tolist(), index=table.index) 24 | 25 | return TableContainer(table=table) 26 | -------------------------------------------------------------------------------- /tests/unit/indexing/workflows/helpers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | mock_verbs = { 4 | "mock_verb": lambda x: x, 5 | "mock_verb_2": lambda x: x, 6 | } 7 | 8 | mock_workflows = { 9 | "mock_workflow": lambda _x: [ 10 | { 11 | "verb": "mock_verb", 12 | "args": { 13 | "column": "test", 14 | }, 15 | } 16 | ], 17 | "mock_workflow_2": lambda _x: [ 18 | { 19 | "verb": "mock_verb", 20 | "args": { 21 | "column": "test", 22 | }, 23 | }, 24 | { 25 | "verb": "mock_verb_2", 26 | "args": { 27 | "column": "test", 28 | }, 29 | }, 30 | ], 31 | } 32 | -------------------------------------------------------------------------------- /examples/entity_extraction/with_graph_intelligence/pipeline.yml: -------------------------------------------------------------------------------- 1 | workflows: 2 | - name: "entity_extraction" 3 | config: 4 | entity_extract: 5 | strategy: 6 | type: "graph_intelligence" 7 | llm: 8 | type: "openai_chat" 9 | 10 | # create a .env file in the same directory as this pipeline.yml file 11 | # end add the following lines to it: 12 | # 
EXAMPLE_OPENAI_API_KEY="YOUR_API_KEY" 13 | api_key: !ENV ${EXAMPLE_OPENAI_API_KEY:None} # None is the default 14 | model: !ENV ${EXAMPLE_OPENAI_MODEL:gpt-3.5-turbo} # gpt-3.5-turbo is the default 15 | max_tokens: !ENV ${EXAMPLE_OPENAI_MAX_TOKENS:2500} # 2500 is the default 16 | temperature: !ENV ${EXAMPLE_OPENAI_TEMPERATURE:0} # 0 is the default 17 | -------------------------------------------------------------------------------- /graphrag/config/read_dotenv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing the read_dotenv utility.""" 5 | 6 | import logging 7 | import os 8 | from pathlib import Path 9 | 10 | from dotenv import dotenv_values 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | def read_dotenv(root: str) -> None: 16 | """Read a .env file in the given root path.""" 17 | env_path = Path(root) / ".env" 18 | if env_path.exists(): 19 | log.info("Loading pipeline .env file") 20 | env_config = dotenv_values(f"{env_path}") 21 | for key, value in env_config.items(): 22 | if key not in os.environ: 23 | os.environ[key] = value or "" 24 | else: 25 | log.info("No .env file found at %s", root) 26 | -------------------------------------------------------------------------------- /graphrag/config/input_models/text_embedding_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired 7 | 8 | from graphrag.config.enums import ( 9 | TextEmbeddingTarget, 10 | ) 11 | 12 | from .llm_config_input import LLMConfigInput 13 | 14 | 15 | class TextEmbeddingConfigInput(LLMConfigInput): 16 | """Configuration section for text embeddings.""" 17 | 18 | batch_size: NotRequired[int | str | None] 19 | batch_max_tokens: NotRequired[int | str | None] 20 | target: NotRequired[TextEmbeddingTarget | str | None] 21 | skip: NotRequired[list[str] | str | None] 22 | vector_store: NotRequired[dict | None] 23 | strategy: NotRequired[dict | None] 24 | -------------------------------------------------------------------------------- /graphrag/index/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Utils methods definition.""" 5 | 6 | from .dicts import dict_has_keys_with_types 7 | from .hashing import gen_md5_hash 8 | from .is_null import is_null 9 | from .json import clean_up_json 10 | from .load_graph import load_graph 11 | from .string import clean_str 12 | from .tokens import num_tokens_from_string, string_from_tokens 13 | from .topological_sort import topological_sort 14 | from .uuid import gen_uuid 15 | 16 | __all__ = [ 17 | "clean_str", 18 | "clean_up_json", 19 | "dict_has_keys_with_types", 20 | "gen_md5_hash", 21 | "gen_uuid", 22 | "is_null", 23 | "load_graph", 24 | "num_tokens_from_string", 25 | "string_from_tokens", 26 | "topological_sort", 27 | ] 28 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/chunk/strategies/sentence.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing run method definition.""" 5 | 6 | from collections.abc import Iterable 7 | from typing import Any 8 | 9 | import nltk 10 | from datashaper import ProgressTicker 11 | 12 | from .typing import TextChunk 13 | 14 | 15 | def run( 16 | input: list[str], _args: dict[str, Any], tick: ProgressTicker 17 | ) -> Iterable[TextChunk]: 18 | """Chunks text into multiple parts. A pipeline verb.""" 19 | for doc_idx, text in enumerate(input): 20 | sentences = nltk.sent_tokenize(text) 21 | for sentence in sentences: 22 | yield TextChunk( 23 | text_chunk=sentence, 24 | source_doc_indices=[doc_idx], 25 | ) 26 | tick(1) 27 | -------------------------------------------------------------------------------- /cspell.config.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://raw.githubusercontent.com/streetsidesoftware/cspell/main/cspell.schema.json 2 | version: "0.2" 3 | allowCompoundWords: true 4 | dictionaryDefinitions: 5 | - name: dictionary 6 | path: "./dictionary.txt" 7 | addWords: true 8 | dictionaries: 9 | - dictionary 10 | ignorePaths: 11 | - cspell.config.yaml 12 | - node_modules 13 | - _site 14 | - /project-words.txt 15 | - default_pipeline.yml 16 | - .turbo 17 | - output/ 18 | - dist/ 19 | - temp_azurite/ 20 | - __pycache__ 21 | - pyproject.toml 22 | - entity_extraction.txt 23 | - package.json 24 | - tests/fixtures/ 25 | - docsite/data/ 26 | - docsite/nbdocsite_template/ 27 | - docsite/posts/query/notebooks/inputs/ 28 | - examples_notebooks/inputs/ 29 | - "*.csv" 30 | - "*.parquet" 31 | - "*.faiss" 32 | - "*.ipynb" 33 | - "*.log" 34 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/report/strategies/graph_intelligence/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A file containing DEFAULT_CHUNK_SIZE and MOCK_RESPONSES definitions.""" 5 | 6 | import json 7 | 8 | DEFAULT_CHUNK_SIZE = 3000 9 | MOCK_RESPONSES = [ 10 | json.dumps({ 11 | "title": "", 12 | "summary": "", 13 | "rating": 2, 14 | "rating_explanation": "", 15 | "findings": [ 16 | { 17 | "summary": "", 18 | "explanation": "", 22 | "explanation": " LLMLimiter: 23 | """Get the limiters for a given model name.""" 24 | tpm = configuration.tokens_per_minute 25 | rpm = configuration.requests_per_minute 26 | return TpmRpmLLMLimiter( 27 | None if tpm == 0 else AsyncLimiter(tpm or 50_000), 28 | None if rpm == 0 else AsyncLimiter(rpm or 10_000), 29 | ) 30 | -------------------------------------------------------------------------------- /graphrag/config/models/snapshots_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | 10 | 11 | class SnapshotsConfig(BaseModel): 12 | """Configuration section for snapshots.""" 13 | 14 | graphml: bool = Field( 15 | description="A flag indicating whether to take snapshots of GraphML.", 16 | default=defs.SNAPSHOTS_GRAPHML, 17 | ) 18 | raw_entities: bool = Field( 19 | description="A flag indicating whether to take snapshots of raw entities.", 20 | default=defs.SNAPSHOTS_RAW_ENTITIES, 21 | ) 22 | top_level_nodes: bool = Field( 23 | description="A flag indicating whether to take snapshots of top-level nodes.", 24 | default=defs.SNAPSHOTS_TOP_LEVEL_NODES, 25 | ) 26 | -------------------------------------------------------------------------------- /examples/custom_set_of_available_verbs/custom_verb_definitions.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 
2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | from datashaper import TableContainer, VerbInput 4 | 5 | 6 | def str_append( 7 | input: VerbInput, source_column: str, target_column: str, string_to_append: str 8 | ): 9 | """A custom verb that appends a string to a column""" 10 | # by convention, we typically use "column" as the input column name and "to" as the output column name, but you can use whatever you want 11 | # just as long as the "args" in the workflow reference match the function signature 12 | input_data = input.get_input() 13 | output_df = input_data.copy() 14 | output_df[target_column] = output_df[source_column].apply( 15 | lambda x: f"{x}{string_to_append}" 16 | ) 17 | return TableContainer(table=output_df) 18 | 19 | 20 | custom_verbs = { 21 | "str_append": str_append, 22 | } 23 | -------------------------------------------------------------------------------- /graphrag/index/graph/extractors/summarize/prompts.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A file containing prompts definition.""" 5 | 6 | SUMMARIZE_PROMPT = """ 7 | You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. 8 | Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. 9 | Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. 10 | If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. 11 | Make sure it is written in third person, and include the entity names so we the have full context. 
12 | 13 | ####### 14 | -Data- 15 | Entities: {entity_name} 16 | Description List: {description_list} 17 | ####### 18 | Output: 19 | """ 20 | -------------------------------------------------------------------------------- /graphrag/index/verbs/overrides/concat.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing concat method definition.""" 5 | 6 | # Copyright (c) 2024 Microsoft Corporation. 7 | # Licensed under the MIT License 8 | from typing import cast 9 | 10 | import pandas as pd 11 | from datashaper import TableContainer, VerbInput, verb 12 | 13 | 14 | @verb(name="concat_override") 15 | def concat( 16 | input: VerbInput, 17 | columnwise: bool = False, 18 | **_kwargs: dict, 19 | ) -> TableContainer: 20 | """Concat method definition.""" 21 | input_table = cast(pd.DataFrame, input.get_input()) 22 | others = cast(list[pd.DataFrame], input.get_others()) 23 | if columnwise: 24 | output = pd.concat([input_table, *others], axis=1) 25 | else: 26 | output = pd.concat([input_table, *others], ignore_index=True) 27 | return TableContainer(table=output) 28 | -------------------------------------------------------------------------------- /graphrag/llm/openai/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """OpenAI LLM implementations.""" 5 | 6 | from .create_openai_client import create_openai_client 7 | from .factories import ( 8 | create_openai_chat_llm, 9 | create_openai_completion_llm, 10 | create_openai_embedding_llm, 11 | ) 12 | from .openai_chat_llm import OpenAIChatLLM 13 | from .openai_completion_llm import OpenAICompletionLLM 14 | from .openai_configuration import OpenAIConfiguration 15 | from .openai_embeddings_llm import OpenAIEmbeddingsLLM 16 | from .types import OpenAIClientTypes 17 | 18 | __all__ = [ 19 | "OpenAIChatLLM", 20 | "OpenAIClientTypes", 21 | "OpenAICompletionLLM", 22 | "OpenAIConfiguration", 23 | "OpenAIEmbeddingsLLM", 24 | "create_openai_chat_llm", 25 | "create_openai_client", 26 | "create_openai_completion_llm", 27 | "create_openai_embedding_llm", 28 | ] 29 | -------------------------------------------------------------------------------- /graphrag/llm/limiting/composite_limiter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing Composite Limiter class definition.""" 5 | 6 | from .llm_limiter import LLMLimiter 7 | 8 | 9 | class CompositeLLMLimiter(LLMLimiter): 10 | """Composite Limiter class definition.""" 11 | 12 | _limiters: list[LLMLimiter] 13 | 14 | def __init__(self, limiters: list[LLMLimiter]): 15 | """Init method definition.""" 16 | self._limiters = limiters 17 | 18 | @property 19 | def needs_token_count(self) -> bool: 20 | """Whether this limiter needs the token count to be passed in.""" 21 | return any(limiter.needs_token_count for limiter in self._limiters) 22 | 23 | async def acquire(self, num_tokens: int = 1) -> None: 24 | """Call method definition.""" 25 | for limiter in self._limiters: 26 | await limiter.acquire(num_tokens) 27 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/template/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for entity extraction, entity summarization, and community report summarization.""" 5 | 6 | from .community_report_summarization import COMMUNITY_REPORT_SUMMARIZATION_PROMPT 7 | from .entity_extraction import ( 8 | EXAMPLE_EXTRACTION_TEMPLATE, 9 | GRAPH_EXTRACTION_JSON_PROMPT, 10 | GRAPH_EXTRACTION_PROMPT, 11 | UNTYPED_EXAMPLE_EXTRACTION_TEMPLATE, 12 | UNTYPED_GRAPH_EXTRACTION_PROMPT, 13 | ) 14 | from .entity_summarization import ENTITY_SUMMARIZATION_PROMPT 15 | 16 | __all__ = [ 17 | "COMMUNITY_REPORT_SUMMARIZATION_PROMPT", 18 | "ENTITY_SUMMARIZATION_PROMPT", 19 | "EXAMPLE_EXTRACTION_TEMPLATE", 20 | "GRAPH_EXTRACTION_JSON_PROMPT", 21 | "GRAPH_EXTRACTION_PROMPT", 22 | "UNTYPED_EXAMPLE_EXTRACTION_TEMPLATE", 23 | "UNTYPED_GRAPH_EXTRACTION_PROMPT", 24 | ] 25 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/summarize/strategies/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'ResolvedEntity' and 'EntityResolutionResult' models.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | from typing import Any 9 | 10 | from datashaper import VerbCallbacks 11 | 12 | from graphrag.index.cache import PipelineCache 13 | 14 | StrategyConfig = dict[str, Any] 15 | 16 | 17 | @dataclass 18 | class SummarizedDescriptionResult: 19 | """Entity summarization result class definition.""" 20 | 21 | items: str | tuple[str, str] 22 | description: str 23 | 24 | 25 | SummarizationStrategy = Callable[ 26 | [ 27 | str | tuple[str, str], 28 | list[str], 29 | VerbCallbacks, 30 | PipelineCache, 31 | StrategyConfig, 32 | ], 33 | Awaitable[SummarizedDescriptionResult], 34 | ] 35 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/global_search/callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """GlobalSearch LLM Callbacks.""" 5 | 6 | from graphrag.query.llm.base import BaseLLMCallback 7 | from graphrag.query.structured_search.base import SearchResult 8 | 9 | 10 | class GlobalSearchLLMCallback(BaseLLMCallback): 11 | """GlobalSearch LLM Callbacks.""" 12 | 13 | def __init__(self): 14 | super().__init__() 15 | self.map_response_contexts = [] 16 | self.map_response_outputs = [] 17 | 18 | def on_map_response_start(self, map_response_contexts: list[str]): 19 | """Handle the start of map response.""" 20 | self.map_response_contexts = map_response_contexts 21 | 22 | def on_map_response_end(self, map_response_outputs: list[SearchResult]): 23 | """Handle the end of map response.""" 24 | self.map_response_outputs = map_response_outputs 25 | -------------------------------------------------------------------------------- /graphrag/index/verbs/snapshot.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing snapshot method definition.""" 5 | 6 | from datashaper import TableContainer, VerbInput, verb 7 | 8 | from graphrag.index.storage import PipelineStorage 9 | 10 | 11 | @verb(name="snapshot") 12 | async def snapshot( 13 | input: VerbInput, 14 | name: str, 15 | formats: list[str], 16 | storage: PipelineStorage, 17 | **_kwargs: dict, 18 | ) -> TableContainer: 19 | """Take a entire snapshot of the tabular data.""" 20 | data = input.get_input() 21 | 22 | for fmt in formats: 23 | if fmt == "parquet": 24 | await storage.set(name + ".parquet", data.to_parquet()) 25 | elif fmt == "json": 26 | await storage.set( 27 | name + ".json", data.to_json(orient="records", lines=True) 28 | ) 29 | 30 | return TableContainer(table=data) 31 | -------------------------------------------------------------------------------- /graphrag/index/emit/csv_table_emitter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """CSVTableEmitter module.""" 5 | 6 | import logging 7 | 8 | import pandas as pd 9 | 10 | from graphrag.index.storage import PipelineStorage 11 | 12 | from .table_emitter import TableEmitter 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | 17 | class CSVTableEmitter(TableEmitter): 18 | """CSVTableEmitter class.""" 19 | 20 | _storage: PipelineStorage 21 | 22 | def __init__(self, storage: PipelineStorage): 23 | """Create a new CSV Table Emitter.""" 24 | self._storage = storage 25 | 26 | async def emit(self, name: str, data: pd.DataFrame) -> None: 27 | """Emit a dataframe to storage.""" 28 | filename = f"{name}.csv" 29 | log.info("emitting CSV table %s", filename) 30 | await self._storage.set( 31 | filename, 32 | data.to_csv(), 33 | ) 34 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/translate/strategies/mock.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing run and _summarize_text methods definitions.""" 5 | 6 | from typing import Any 7 | 8 | from datashaper import VerbCallbacks 9 | 10 | from graphrag.index.cache import PipelineCache 11 | 12 | from .typing import TextTranslationResult 13 | 14 | 15 | async def run( # noqa RUF029 async is required for interface 16 | input: str | list[str], 17 | _args: dict[str, Any], 18 | _reporter: VerbCallbacks, 19 | _cache: PipelineCache, 20 | ) -> TextTranslationResult: 21 | """Run the Claim extraction chain.""" 22 | input = [input] if isinstance(input, str) else input 23 | return TextTranslationResult(translations=[_translate_text(text) for text in input]) 24 | 25 | 26 | def _translate_text(text: str) -> str: 27 | """Translate a single piece of text.""" 28 | return f"{text} translated" 29 | -------------------------------------------------------------------------------- /graphrag/llm/mock/mock_completion_llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """LLM Static Response method definition.""" 5 | 6 | import logging 7 | 8 | from typing_extensions import Unpack 9 | 10 | from graphrag.llm.base import BaseLLM 11 | from graphrag.llm.types import ( 12 | CompletionInput, 13 | CompletionOutput, 14 | LLMInput, 15 | ) 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | 20 | class MockCompletionLLM( 21 | BaseLLM[ 22 | CompletionInput, 23 | CompletionOutput, 24 | ] 25 | ): 26 | """Mock Completion LLM for testing purposes.""" 27 | 28 | def __init__(self, responses: list[str]): 29 | self.responses = responses 30 | self._on_error = None 31 | 32 | async def _execute_llm( 33 | self, 34 | input: CompletionInput, 35 | **kwargs: Unpack[LLMInput], 36 | ) -> CompletionOutput: 37 | return self.responses[0] 38 | -------------------------------------------------------------------------------- /graphrag/config/models/llm_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from datashaper import AsyncType 7 | from pydantic import BaseModel, Field 8 | 9 | import graphrag.config.defaults as defs 10 | 11 | from .llm_parameters import LLMParameters 12 | from .parallelization_parameters import ParallelizationParameters 13 | 14 | 15 | class LLMConfig(BaseModel): 16 | """Base class for LLM-configured steps.""" 17 | 18 | llm: LLMParameters = Field( 19 | description="The LLM configuration to use.", default=LLMParameters() 20 | ) 21 | parallelization: ParallelizationParameters = Field( 22 | description="The parallelization configuration to use.", 23 | default=ParallelizationParameters(), 24 | ) 25 | async_mode: AsyncType = Field( 26 | description="The async mode to use.", default=defs.ASYNC_MODE 27 | ) 28 | -------------------------------------------------------------------------------- /graphrag/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """ 5 | GraphRAG knowledge model package root. 6 | 7 | The GraphRAG knowledge model contains a set of classes that represent the target datamodels for our pipelines and analytics tools. 8 | These models can be augmented and integrated into your own data infrastructure to suit your needs. 
9 | """ 10 | 11 | from .community import Community 12 | from .community_report import CommunityReport 13 | from .covariate import Covariate 14 | from .document import Document 15 | from .entity import Entity 16 | from .identified import Identified 17 | from .named import Named 18 | from .relationship import Relationship 19 | from .text_unit import TextUnit 20 | 21 | __all__ = [ 22 | "Community", 23 | "CommunityReport", 24 | "Covariate", 25 | "Document", 26 | "Entity", 27 | "Identified", 28 | "Named", 29 | "Relationship", 30 | "TextUnit", 31 | ] 32 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/domain.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Domain generation for GraphRAG prompts.""" 5 | 6 | from graphrag.llm.types.llm_types import CompletionLLM 7 | from graphrag.prompt_tune.prompt.domain import GENERATE_DOMAIN_PROMPT 8 | 9 | 10 | async def generate_domain(llm: CompletionLLM, docs: str | list[str]) -> str: 11 | """Generate an LLM persona to use for GraphRAG prompts. 12 | 13 | Parameters 14 | ---------- 15 | - llm (CompletionLLM): The LLM to use for generation 16 | - docs (str | list[str]): The domain to generate a persona for 17 | 18 | Returns 19 | ------- 20 | - str: The generated domain prompt response. 
21 | """ 22 | docs_str = " ".join(docs) if isinstance(docs, list) else docs 23 | domain_prompt = GENERATE_DOMAIN_PROMPT.format(input_text=docs_str) 24 | 25 | response = await llm(domain_prompt) 26 | 27 | return str(response.output) 28 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | environs==11.0.0 2 | datashaper==0.0.49 3 | azure-search-documents==11.4.0 4 | lancedb==0.9.0 5 | uvloop==0.19.0; platform_system != 'Windows' 6 | nest-asyncio==1.6.0; platform_system == 'Windows' 7 | aiolimiter==1.1.0 8 | aiofiles==24.1.0 9 | openai==1.35.7 10 | nltk==3.8.1 11 | tiktoken==0.7.0 12 | numba==0.60.0 13 | numpy==1.25.2 14 | graspologic==3.4.1 15 | networkx==3 16 | fastparquet==2024.2.0 17 | scipy==1.12.0 18 | pyyaml==6.0.1 19 | pyaml-env==1.2.1 20 | python-dotenv==1.0.0 21 | tenacity==8.2.3 22 | swifter==1.4.0 23 | pydantic==2 24 | rich==13.6.0 25 | textual==0.70.0 26 | devtools==0.12.2 27 | typing-extensions==4.12.2 28 | azure-storage-blob==12.19.0 29 | azure-identity==1.17.1 30 | coverage==7.5.4 31 | ipykernel==6.29.4 32 | jupyter==1.0.0 33 | nbconvert==7.16.3 34 | poethepoet==0.26.0 35 | pyright==1.1.368 36 | pytest==8.2.0 37 | pytest-asyncio==0.23.4 38 | pytest-timeout==2.3.1 39 | ruff==0.5.0 40 | semversioner==2.0.3 41 | update-toml==0.2.1 42 | 43 | -------------------------------------------------------------------------------- /graphrag/llm/types/llm_invocation_result.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Typing definitions for the OpenAI DataShaper package.""" 5 | 6 | from dataclasses import dataclass 7 | from typing import Generic, TypeVar 8 | 9 | T = TypeVar("T") 10 | 11 | 12 | @dataclass 13 | class LLMInvocationResult(Generic[T]): 14 | """The result of an LLM invocation.""" 15 | 16 | result: T | None 17 | """The result of the LLM invocation.""" 18 | 19 | name: str 20 | """The operation name of the result""" 21 | 22 | num_retries: int 23 | """The number of retries the invocation took.""" 24 | 25 | total_time: float 26 | """The total time of the LLM invocation.""" 27 | 28 | call_times: list[float] 29 | """The network times of individual invocations.""" 30 | 31 | input_tokens: int 32 | """The number of input tokens.""" 33 | 34 | output_tokens: int 35 | """The number of output tokens.""" 36 | -------------------------------------------------------------------------------- /docsite/.eleventy.js: -------------------------------------------------------------------------------- 1 | const { EleventyHtmlBasePlugin } = require("@11ty/eleventy"); 2 | const syntaxHighlight = require("@11ty/eleventy-plugin-syntaxhighlight"); 3 | const codeClipboard = require("eleventy-plugin-code-clipboard"); 4 | const pluginMermaid = require("@kevingimbel/eleventy-plugin-mermaid"); 5 | const markdownIt = require('markdown-it'); 6 | 7 | module.exports = (eleventyConfig) => { 8 | eleventyConfig.addPlugin(syntaxHighlight); 9 | eleventyConfig.addPlugin(codeClipboard); 10 | eleventyConfig.addPlugin(pluginMermaid); 11 | eleventyConfig.addPlugin(EleventyHtmlBasePlugin, { 12 | baseHref: process.env.DOCSITE_BASE_URL || "" 13 | }); 14 | eleventyConfig.addPassthroughCopy("data"); 15 | eleventyConfig.addPassthroughCopy("img"); 16 | // Ignore auto-generated content 17 | eleventyConfig.setUseGitIgnore(false); 18 | 19 | const markdownLibrary = markdownIt({ 20 | html: true 21 | }).use(codeClipboard.markdownItCopyButton); 22 | 23 | 
eleventyConfig.setLibrary("md", markdownLibrary); 24 | 25 | }; -------------------------------------------------------------------------------- /graphrag/index/emit/json_table_emitter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """JsonTableEmitter module.""" 5 | 6 | import logging 7 | 8 | import pandas as pd 9 | 10 | from graphrag.index.storage import PipelineStorage 11 | 12 | from .table_emitter import TableEmitter 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | 17 | class JsonTableEmitter(TableEmitter): 18 | """JsonTableEmitter class.""" 19 | 20 | _storage: PipelineStorage 21 | 22 | def __init__(self, storage: PipelineStorage): 23 | """Create a new Json Table Emitter.""" 24 | self._storage = storage 25 | 26 | async def emit(self, name: str, data: pd.DataFrame) -> None: 27 | """Emit a dataframe to storage.""" 28 | filename = f"{name}.json" 29 | 30 | log.info("emitting JSON table %s", filename) 31 | await self._storage.set( 32 | filename, 33 | data.to_json(orient="records", lines=True, force_ascii=False), 34 | ) 35 | -------------------------------------------------------------------------------- /graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A file containing MOCK_LLM_RESPONSES definition.""" 5 | 6 | MOCK_LLM_RESPONSES = [ 7 | """ 8 | [ 9 | { 10 | "subject": "COMPANY A", 11 | "object": "GOVERNMENT AGENCY B", 12 | "type": "ANTI-COMPETITIVE PRACTICES", 13 | "status": "TRUE", 14 | "start_date": "2022-01-10T00:00:00", 15 | "end_date": "2022-01-10T00:00:00", 16 | "description": "Company A was found to engage in anti-competitive practices because it was fined for bid rigging in multiple public tenders published by Government Agency B according to an article published on 2022/01/10", 17 | "source_text": ["According to an article published on 2022/01/10, Company A was fined for bid rigging while participating in multiple public tenders published by Government Agency B."] 18 | } 19 | ] 20 | """.strip() 21 | ] 22 | -------------------------------------------------------------------------------- /graphrag/config/models/cluster_graph_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | 10 | 11 | class ClusterGraphConfig(BaseModel): 12 | """Configuration section for clustering graphs.""" 13 | 14 | max_cluster_size: int = Field( 15 | description="The maximum cluster size to use.", default=defs.MAX_CLUSTER_SIZE 16 | ) 17 | strategy: dict | None = Field( 18 | description="The cluster strategy to use.", default=None 19 | ) 20 | 21 | def resolved_strategy(self) -> dict: 22 | """Get the resolved cluster strategy.""" 23 | from graphrag.index.verbs.graph.clustering import GraphCommunityStrategyType 24 | 25 | return self.strategy or { 26 | "type": GraphCommunityStrategyType.leiden, 27 | "max_cluster_size": self.max_cluster_size, 28 | } 29 | -------------------------------------------------------------------------------- /graphrag/index/workflows/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'WorkflowToRun' model.""" 5 | 6 | from collections.abc import Callable 7 | from dataclasses import dataclass as dc_dataclass 8 | from typing import Any 9 | 10 | from datashaper import TableContainer, Workflow 11 | 12 | StepDefinition = dict[str, Any] 13 | """A step definition.""" 14 | 15 | VerbDefinitions = dict[str, Callable[..., TableContainer]] 16 | """A mapping of verb names to their implementations.""" 17 | 18 | WorkflowConfig = dict[str, Any] 19 | """A workflow configuration.""" 20 | 21 | WorkflowDefinitions = dict[str, Callable[[WorkflowConfig], list[StepDefinition]]] 22 | """A mapping of workflow names to their implementations.""" 23 | 24 | VerbTiming = dict[str, float] 25 | """The timings of verbs by id.""" 26 | 27 | 28 | @dc_dataclass 29 | class WorkflowToRun: 30 | """Workflow to run class definition.""" 31 | 32 | workflow: Workflow 33 | config: dict[str, Any] 34 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | version: 2 6 | updates: 7 | - package-ecosystem: "npm" # See documentation for possible values 8 | directory: "docsite/" # Location of package manifests 9 | schedule: 10 | interval: "weekly" 11 | - package-ecosystem: "pip" # See documentation for possible values 12 | directory: "/" # Location of package manifests 13 | schedule: 14 | interval: "weekly" 15 | - package-ecosystem: "github-actions" 16 | # Workflow files stored in the default location of `.github/workflows`. 
(You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.) 17 | directory: "/" 18 | schedule: 19 | interval: "weekly" 20 | -------------------------------------------------------------------------------- /tests/fixtures/azure/settings.yml: -------------------------------------------------------------------------------- 1 | claim_extraction: 2 | enabled: true 3 | 4 | embeddings: 5 | vector_store: 6 | type: "azure_ai_search" 7 | url: ${AZURE_AI_SEARCH_URL_ENDPOINT} 8 | api_key: ${AZURE_AI_SEARCH_API_KEY} 9 | collection_name: "azure_ci" 10 | query_collection_name: "azure_ci_query" 11 | 12 | entity_name_description: 13 | title_column: "name" 14 | 15 | input: 16 | type: blob 17 | file_type: text 18 | connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING} 19 | container_name: azurefixture 20 | base_dir: input 21 | 22 | cache: 23 | type: blob 24 | connection_string: ${BLOB_STORAGE_CONNECTION_STRING} 25 | container_name: cicache 26 | base_dir: cache_azure_ai 27 | 28 | storage: 29 | type: blob 30 | connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING} 31 | container_name: azurefixture 32 | base_dir: output 33 | 34 | reporting: 35 | type: blob 36 | connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING} 37 | container_name: azurefixture 38 | base_dir: reports 39 | -------------------------------------------------------------------------------- /graphrag/llm/types/llm_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """LLM Configuration Protocol definition.""" 5 | 6 | from typing import Protocol 7 | 8 | 9 | class LLMConfig(Protocol): 10 | """LLM Configuration Protocol definition.""" 11 | 12 | @property 13 | def max_retries(self) -> int | None: 14 | """Get the maximum number of retries.""" 15 | ... 
16 | 17 | @property 18 | def max_retry_wait(self) -> float | None: 19 | """Get the maximum retry wait time.""" 20 | ... 21 | 22 | @property 23 | def sleep_on_rate_limit_recommendation(self) -> bool | None: 24 | """Get whether to sleep on rate limit recommendation.""" 25 | ... 26 | 27 | @property 28 | def tokens_per_minute(self) -> int | None: 29 | """Get the number of tokens per minute.""" 30 | ... 31 | 32 | @property 33 | def requests_per_minute(self) -> int | None: 34 | """Get the number of requests per minute.""" 35 | ... 36 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/report/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph report package root.""" 5 | 6 | from .create_community_reports import ( 7 | CreateCommunityReportsStrategyType, 8 | create_community_reports, 9 | ) 10 | from .prepare_community_reports import prepare_community_reports 11 | from .prepare_community_reports_claims import prepare_community_reports_claims 12 | from .prepare_community_reports_edges import prepare_community_reports_edges 13 | from .prepare_community_reports_nodes import prepare_community_reports_nodes 14 | from .restore_community_hierarchy import restore_community_hierarchy 15 | 16 | __all__ = [ 17 | "CreateCommunityReportsStrategyType", 18 | "create_community_reports", 19 | "create_community_reports", 20 | "prepare_community_reports", 21 | "prepare_community_reports_claims", 22 | "prepare_community_reports_edges", 23 | "prepare_community_reports_nodes", 24 | "restore_community_hierarchy", 25 | ] 26 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/persona.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Persona generating module for fine-tuning GraphRAG prompts.""" 5 | 6 | from graphrag.llm.types.llm_types import CompletionLLM 7 | from graphrag.prompt_tune.generator.defaults import DEFAULT_TASK 8 | from graphrag.prompt_tune.prompt import GENERATE_PERSONA_PROMPT 9 | 10 | 11 | async def generate_persona( 12 | llm: CompletionLLM, domain: str, task: str = DEFAULT_TASK 13 | ) -> str: 14 | """Generate an LLM persona to use for GraphRAG prompts. 15 | 16 | Parameters 17 | ---------- 18 | - llm (CompletionLLM): The LLM to use for generation 19 | - domain (str): The domain to generate a persona for 20 | - task (str): The task to generate a persona for. Default is DEFAULT_TASK 21 | """ 22 | formatted_task = task.format(domain=domain) 23 | persona_prompt = GENERATE_PERSONA_PROMPT.format(sample_task=formatted_task) 24 | 25 | response = await llm(persona_prompt) 26 | 27 | return str(response.output) 28 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A file containing some default responses.""" 5 | 6 | from graphrag.config.enums import LLMType 7 | 8 | MOCK_LLM_RESPONSES = [ 9 | """ 10 | ("entity"<|>COMPANY_A<|>COMPANY<|>Company_A is a test company) 11 | ## 12 | ("entity"<|>COMPANY_B<|>COMPANY<|>Company_B owns Company_A and also shares an address with Company_A) 13 | ## 14 | ("entity"<|>PERSON_C<|>PERSON<|>Person_C is director of Company_A) 15 | ## 16 | ("relationship"<|>COMPANY_A<|>COMPANY_B<|>Company_A and Company_B are related because Company_A is 100% owned by Company_B and the two companies also share the same address)<|>2) 17 | ## 18 | ("relationship"<|>COMPANY_A<|>PERSON_C<|>Company_A and Person_C are related because Person_C is director of Company_A<|>1)) 19 | """.strip() 20 | ] 21 | 22 | DEFAULT_LLM_CONFIG = { 23 | "type": LLMType.StaticResponse, 24 | "responses": MOCK_LLM_RESPONSES, 25 | } 26 | -------------------------------------------------------------------------------- /graphrag/config/models/cache_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | from graphrag.config.enums import CacheType 10 | 11 | 12 | class CacheConfig(BaseModel): 13 | """The default configuration section for Cache.""" 14 | 15 | type: CacheType = Field( 16 | description="The cache type to use.", default=defs.CACHE_TYPE 17 | ) 18 | base_dir: str = Field( 19 | description="The base directory for the cache.", default=defs.CACHE_BASE_DIR 20 | ) 21 | connection_string: str | None = Field( 22 | description="The cache connection string to use.", default=None 23 | ) 24 | container_name: str | None = Field( 25 | description="The cache container name to use.", default=None 26 | ) 27 | storage_account_blob_url: str | None = Field( 28 | description="The storage account blob url to use.", default=None 29 | ) 30 | -------------------------------------------------------------------------------- /graphrag/query/question_gen/system_prompt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Question Generation system prompts.""" 5 | 6 | QUESTION_SYSTEM_PROMPT = """ 7 | ---Role--- 8 | 9 | You are a helpful assistant generating a bulleted list of {question_count} questions about data in the tables provided. 10 | 11 | 12 | ---Data tables--- 13 | 14 | {context_data} 15 | 16 | 17 | ---Goal--- 18 | 19 | Given a series of example questions provided by the user, generate a bulleted list of {question_count} candidates for the next question. Use - marks as bullet points. 20 | 21 | These candidate questions should represent the most important or urgent information content or themes in the data tables. 
22 | 23 | The candidate questions should be answerable using the data tables provided, but should not mention any specific data fields or data tables in the question text. 24 | 25 | If the user's questions reference several named entities, then each candidate question should reference all named entities. 26 | 27 | ---Example questions--- 28 | """ 29 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Prompt generation module.""" 5 | 6 | from .community_report_summarization import create_community_summarization_prompt 7 | from .community_reporter_role import generate_community_reporter_role 8 | from .defaults import MAX_TOKEN_COUNT 9 | from .domain import generate_domain 10 | from .entity_extraction_prompt import create_entity_extraction_prompt 11 | from .entity_relationship import generate_entity_relationship_examples 12 | from .entity_summarization_prompt import create_entity_summarization_prompt 13 | from .entity_types import generate_entity_types 14 | from .persona import generate_persona 15 | 16 | __all__ = [ 17 | "MAX_TOKEN_COUNT", 18 | "create_community_summarization_prompt", 19 | "create_entity_extraction_prompt", 20 | "create_entity_summarization_prompt", 21 | "generate_community_reporter_role", 22 | "generate_domain", 23 | "generate_entity_relationship_examples", 24 | "generate_entity_types", 25 | "generate_persona", 26 | ] 27 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Persona, entity type, relationships and domain generation prompts module.""" 5 | 6 | from .community_reporter_role import GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT 7 | from .domain import GENERATE_DOMAIN_PROMPT 8 | from .entity_relationship import ( 9 | ENTITY_RELATIONSHIPS_GENERATION_JSON_PROMPT, 10 | ENTITY_RELATIONSHIPS_GENERATION_PROMPT, 11 | UNTYPED_ENTITY_RELATIONSHIPS_GENERATION_PROMPT, 12 | ) 13 | from .entity_types import ( 14 | ENTITY_TYPE_GENERATION_JSON_PROMPT, 15 | ENTITY_TYPE_GENERATION_PROMPT, 16 | ) 17 | from .persona import GENERATE_PERSONA_PROMPT 18 | 19 | __all__ = [ 20 | "ENTITY_RELATIONSHIPS_GENERATION_JSON_PROMPT", 21 | "ENTITY_RELATIONSHIPS_GENERATION_PROMPT", 22 | "ENTITY_TYPE_GENERATION_JSON_PROMPT", 23 | "ENTITY_TYPE_GENERATION_PROMPT", 24 | "GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT", 25 | "GENERATE_DOMAIN_PROMPT", 26 | "GENERATE_PERSONA_PROMPT", 27 | "UNTYPED_ENTITY_RELATIONSHIPS_GENERATION_PROMPT", 28 | ] 29 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/community_reporter_role.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for community reporter role generation.""" 5 | 6 | GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT = """ 7 | {persona} 8 | Given a sample text, help the user by creating a role definition that will be tasked with community analysis. 9 | Take a look at this example, determine its key parts, and using the domain provided and your expertise, create a new role definition for the provided inputs that follows the same pattern as the example. 10 | Remember, your output should look just like the provided example in structure and content. 
11 | 12 | Example: 13 | A technologist reporter that is analyzing Kevin Scott's "Behind the Tech Podcast", given a list of entities 14 | that belong to the community as well as their relationships and optional associated claims. 15 | The report will be used to inform decision-makers about significant developments associated with the community and their potential impact. 16 | 17 | 18 | Domain: {domain} 19 | Text: {input_text} 20 | Role:""" 21 | -------------------------------------------------------------------------------- /graphrag/config/models/storage_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | from graphrag.config.enums import StorageType 10 | 11 | 12 | class StorageConfig(BaseModel): 13 | """The default configuration section for Storage.""" 14 | 15 | type: StorageType = Field( 16 | description="The storage type to use.", default=defs.STORAGE_TYPE 17 | ) 18 | base_dir: str = Field( 19 | description="The base directory for the storage.", 20 | default=defs.STORAGE_BASE_DIR, 21 | ) 22 | connection_string: str | None = Field( 23 | description="The storage connection string to use.", default=None 24 | ) 25 | container_name: str | None = Field( 26 | description="The storage container name to use.", default=None 27 | ) 28 | storage_account_blob_url: str | None = Field( 29 | description="The storage account blob url to use.", default=None 30 | ) 31 | -------------------------------------------------------------------------------- /graphrag/llm/types/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """LLM Typings.""" 5 | 6 | from .llm import LLM 7 | from .llm_cache import LLMCache 8 | from .llm_callbacks import ( 9 | ErrorHandlerFn, 10 | IsResponseValidFn, 11 | LLMInvocationFn, 12 | OnCacheActionFn, 13 | ) 14 | from .llm_config import LLMConfig 15 | from .llm_invocation_result import LLMInvocationResult 16 | from .llm_io import ( 17 | LLMInput, 18 | LLMOutput, 19 | ) 20 | from .llm_types import ( 21 | CompletionInput, 22 | CompletionLLM, 23 | CompletionOutput, 24 | EmbeddingInput, 25 | EmbeddingLLM, 26 | EmbeddingOutput, 27 | ) 28 | 29 | __all__ = [ 30 | "LLM", 31 | "CompletionInput", 32 | "CompletionLLM", 33 | "CompletionOutput", 34 | "EmbeddingInput", 35 | "EmbeddingLLM", 36 | "EmbeddingOutput", 37 | "ErrorHandlerFn", 38 | "IsResponseValidFn", 39 | "LLMCache", 40 | "LLMConfig", 41 | "LLMInput", 42 | "LLMInvocationFn", 43 | "LLMInvocationResult", 44 | "LLMOutput", 45 | "OnCacheActionFn", 46 | ] 47 | -------------------------------------------------------------------------------- /.vsts-ci.yml: -------------------------------------------------------------------------------- 1 | name: GraphRAG CI 2 | pool: 3 | vmImage: ubuntu-latest 4 | 5 | trigger: 6 | batch: true 7 | branches: 8 | include: 9 | - main 10 | 11 | variables: 12 | isMain: $[eq(variables['Build.SourceBranch'], 'refs/heads/main')] 13 | pythonVersion: "3.10" 14 | poetryVersion: "1.6.1" 15 | nodeVersion: "18.x" 16 | artifactsFullFeedName: "Resilience/resilience_python" 17 | 18 | stages: 19 | - stage: Compliance 20 | dependsOn: [] 21 | jobs: 22 | - job: compliance 23 | displayName: Compliance 24 | pool: 25 | vmImage: windows-latest 26 | steps: 27 | - task: CredScan@3 28 | inputs: 29 | outputFormat: sarif 30 | debugMode: false 31 | 32 | - task: ComponentGovernanceComponentDetection@0 33 | inputs: 34 | scanType: "Register" 35 | verbosity: "Verbose" 36 | alertWarningLevel: "High" 37 | 38 | - task: PublishSecurityAnalysisLogs@3 39 | inputs: 40 | ArtifactName: 
"CodeAnalysisLogs" 41 | ArtifactType: "Container" -------------------------------------------------------------------------------- /graphrag/index/utils/ds_util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A utility module datashaper-specific utility methods.""" 5 | 6 | from typing import cast 7 | 8 | from datashaper import TableContainer, VerbInput 9 | 10 | _NAMED_INPUTS_REQUIRED = "Named inputs are required" 11 | 12 | 13 | def get_required_input_table(input: VerbInput, name: str) -> TableContainer: 14 | """Get a required input table by name.""" 15 | return cast(TableContainer, get_named_input_table(input, name, required=True)) 16 | 17 | 18 | def get_named_input_table( 19 | input: VerbInput, name: str, required: bool = False 20 | ) -> TableContainer | None: 21 | """Get an input table from datashaper verb-inputs by name.""" 22 | named_inputs = input.named 23 | if named_inputs is None: 24 | if not required: 25 | return None 26 | raise ValueError(_NAMED_INPUTS_REQUIRED) 27 | 28 | result = named_inputs.get(name) 29 | if result is None and required: 30 | msg = f"input '${name}' is required" 31 | raise ValueError(msg) 32 | return result 33 | -------------------------------------------------------------------------------- /graphrag/index/reporting/console_workflow_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Console-based reporter for the workflow engine.""" 5 | 6 | from datashaper import NoopWorkflowCallbacks 7 | 8 | 9 | class ConsoleWorkflowCallbacks(NoopWorkflowCallbacks): 10 | """A reporter that writes to a console.""" 11 | 12 | def on_error( 13 | self, 14 | message: str, 15 | cause: BaseException | None = None, 16 | stack: str | None = None, 17 | details: dict | None = None, 18 | ): 19 | """Handle when an error occurs.""" 20 | print(message, str(cause), stack, details) # noqa T201 21 | 22 | def on_warning(self, message: str, details: dict | None = None): 23 | """Handle when a warning occurs.""" 24 | _print_warning(message) 25 | 26 | def on_log(self, message: str, details: dict | None = None): 27 | """Handle when a log message is produced.""" 28 | print(message, details) # noqa T201 29 | 30 | 31 | def _print_warning(skk): 32 | print("\033[93m {}\033[00m".format(skk)) # noqa T201 33 | -------------------------------------------------------------------------------- /graphrag/index/verbs/entities/extraction/strategies/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'Document' and 'EntityExtractionResult' models.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | from typing import Any 9 | 10 | from datashaper import VerbCallbacks 11 | 12 | from graphrag.index.cache import PipelineCache 13 | 14 | ExtractedEntity = dict[str, Any] 15 | StrategyConfig = dict[str, Any] 16 | EntityTypes = list[str] 17 | 18 | 19 | @dataclass 20 | class Document: 21 | """Document class definition.""" 22 | 23 | text: str 24 | id: str 25 | 26 | 27 | @dataclass 28 | class EntityExtractionResult: 29 | """Entity extraction result class definition.""" 30 | 31 | entities: list[ExtractedEntity] 32 | graphml_graph: str | None 33 | 34 | 35 | EntityExtractStrategy = Callable[ 36 | [ 37 | list[Document], 38 | EntityTypes, 39 | VerbCallbacks, 40 | PipelineCache, 41 | StrategyConfig, 42 | ], 43 | Awaitable[EntityExtractionResult], 44 | ] 45 | -------------------------------------------------------------------------------- /tests/unit/indexing/test_init_content.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | import re 5 | from typing import Any, cast 6 | 7 | import yaml 8 | 9 | from graphrag.config import ( 10 | GraphRagConfig, 11 | create_graphrag_config, 12 | ) 13 | from graphrag.index.init_content import INIT_YAML 14 | 15 | 16 | def test_init_yaml(): 17 | data = yaml.load(INIT_YAML, Loader=yaml.FullLoader) 18 | config = create_graphrag_config(data) 19 | GraphRagConfig.model_validate(config, strict=True) 20 | 21 | 22 | def test_init_yaml_uncommented(): 23 | lines = INIT_YAML.splitlines() 24 | lines = [line for line in lines if "##" not in line] 25 | 26 | def uncomment_line(line: str) -> str: 27 | leading_whitespace = cast(Any, re.search(r"^(\s*)", line)).group(1) 28 | return re.sub(r"^\s*# ", leading_whitespace, line, count=1) 29 | 30 | content = "\n".join([uncomment_line(line) for line in lines]) 31 | data = yaml.load(content, Loader=yaml.FullLoader) 32 | config = create_graphrag_config(data) 33 | GraphRagConfig.model_validate(config, strict=True) 34 | -------------------------------------------------------------------------------- /graphrag/config/models/reporting_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | from graphrag.config.enums import ReportingType 10 | 11 | 12 | class ReportingConfig(BaseModel): 13 | """The default configuration section for Reporting.""" 14 | 15 | type: ReportingType = Field( 16 | description="The reporting type to use.", default=defs.REPORTING_TYPE 17 | ) 18 | base_dir: str = Field( 19 | description="The base directory for reporting.", 20 | default=defs.REPORTING_BASE_DIR, 21 | ) 22 | connection_string: str | None = Field( 23 | description="The reporting connection string to use.", default=None 24 | ) 25 | container_name: str | None = Field( 26 | description="The reporting container name to use.", default=None 27 | ) 28 | storage_account_blob_url: str | None = Field( 29 | description="The storage account blob url to use.", default=None 30 | ) 31 | -------------------------------------------------------------------------------- /graphrag/config/input_models/input_config_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | from graphrag.config.enums import InputFileType, InputType 9 | 10 | 11 | class InputConfigInput(TypedDict): 12 | """The default configuration section for Input.""" 13 | 14 | type: NotRequired[InputType | str | None] 15 | file_type: NotRequired[InputFileType | str | None] 16 | base_dir: NotRequired[str | None] 17 | connection_string: NotRequired[str | None] 18 | container_name: NotRequired[str | None] 19 | file_encoding: NotRequired[str | None] 20 | file_pattern: NotRequired[str | None] 21 | source_column: NotRequired[str | None] 22 | timestamp_column: NotRequired[str | None] 23 | timestamp_format: NotRequired[str | None] 24 | text_column: NotRequired[str | None] 25 | title_column: NotRequired[str | None] 26 | document_attribute_columns: NotRequired[list[str] | str | None] 27 | storage_account_blob_url: NotRequired[str | None] 28 | -------------------------------------------------------------------------------- /graphrag/query/context_builder/builders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Base classes for global and local context builders.""" 5 | 6 | from abc import ABC, abstractmethod 7 | 8 | import pandas as pd 9 | 10 | from graphrag.query.context_builder.conversation_history import ( 11 | ConversationHistory, 12 | ) 13 | 14 | 15 | class GlobalContextBuilder(ABC): 16 | """Base class for global-search context builders.""" 17 | 18 | @abstractmethod 19 | def build_context( 20 | self, conversation_history: ConversationHistory | None = None, **kwargs 21 | ) -> tuple[str | list[str], dict[str, pd.DataFrame]]: 22 | """Build the context for the global search mode.""" 23 | 24 | 25 | class LocalContextBuilder(ABC): 26 | """Base class for local-search context builders.""" 27 | 28 | @abstractmethod 29 | def build_context( 30 | self, 31 | query: str, 32 | conversation_history: ConversationHistory | None = None, 33 | **kwargs, 34 | ) -> tuple[str | list[str], dict[str, pd.DataFrame]]: 35 | """Build the context for the local search mode.""" 36 | -------------------------------------------------------------------------------- /graphrag/llm/openai/json_parsing_llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """An LLM that unpacks cached JSON responses.""" 5 | 6 | from typing_extensions import Unpack 7 | 8 | from graphrag.llm.types import ( 9 | LLM, 10 | CompletionInput, 11 | CompletionLLM, 12 | CompletionOutput, 13 | LLMInput, 14 | LLMOutput, 15 | ) 16 | 17 | from .utils import try_parse_json_object 18 | 19 | 20 | class JsonParsingLLM(LLM[CompletionInput, CompletionOutput]): 21 | """A delegating LLM that parses JSON from the delegate's text output when the caller requested JSON but none was returned.""" 22 | 23 | _delegate: CompletionLLM 24 | 25 | def __init__(self, delegate: CompletionLLM): 26 | self._delegate = delegate 27 | 28 | async def __call__( 29 | self, 30 | input: CompletionInput, 31 | **kwargs: Unpack[LLMInput], 32 | ) -> LLMOutput[CompletionOutput]: 33 | """Call the delegate, then populate result.json by parsing the text output if kwargs requested JSON and the delegate did not supply it.""" 34 | result = await self._delegate(input, **kwargs) 35 | if kwargs.get("json") and result.json is None and result.output is not None: 36 | result.json = try_parse_json_object(result.output) 37 | return result 38 | -------------------------------------------------------------------------------- /graphrag/llm/openai/openai_history_tracking_llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Chat-based language model.""" 5 | 6 | from typing_extensions import Unpack 7 | 8 | from graphrag.llm.types import ( 9 | LLM, 10 | CompletionInput, 11 | CompletionLLM, 12 | CompletionOutput, 13 | LLMInput, 14 | LLMOutput, 15 | ) 16 | 17 | 18 | class OpenAIHistoryTrackingLLM(LLM[CompletionInput, CompletionOutput]): 19 | """An OpenAI History-Tracking LLM.""" 20 | 21 | _delegate: CompletionLLM 22 | 23 | def __init__(self, delegate: CompletionLLM): 24 | self._delegate = delegate 25 | 26 | async def __call__( 27 | self, 28 | input: CompletionInput, 29 | **kwargs: Unpack[LLMInput], 30 | ) -> LLMOutput[CompletionOutput]: 31 | """Call the LLM.""" 32 | history = kwargs.get("history") or [] 33 | output = await self._delegate(input, **kwargs) 34 | return LLMOutput( 35 | output=output.output, 36 | json=output.json, 37 | history=[*history, {"role": "system", "content": output.output}], 38 | ) 39 | -------------------------------------------------------------------------------- /graphrag/llm/openai/openai_token_replacing_llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Chat-based language model.""" 5 | 6 | from typing_extensions import Unpack 7 | 8 | from graphrag.llm.types import ( 9 | LLM, 10 | CompletionInput, 11 | CompletionLLM, 12 | CompletionOutput, 13 | LLMInput, 14 | LLMOutput, 15 | ) 16 | 17 | from .utils import perform_variable_replacements 18 | 19 | 20 | class OpenAITokenReplacingLLM(LLM[CompletionInput, CompletionOutput]): 21 | """An OpenAI History-Tracking LLM.""" 22 | 23 | _delegate: CompletionLLM 24 | 25 | def __init__(self, delegate: CompletionLLM): 26 | self._delegate = delegate 27 | 28 | async def __call__( 29 | self, 30 | input: CompletionInput, 31 | **kwargs: Unpack[LLMInput], 32 | ) -> LLMOutput[CompletionOutput]: 33 | """Call the LLM with the input and kwargs.""" 34 | variables = kwargs.get("variables") 35 | history = kwargs.get("history") or [] 36 | input = perform_variable_replacements(input, history, variables) 37 | return await self._delegate(input, **kwargs) 38 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/template/entity_summarization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for entity summarization.""" 5 | 6 | ENTITY_SUMMARIZATION_PROMPT = """ 7 | {persona} 8 | Using your expertise, you're asked to generate a comprehensive summary of the data provided below. 9 | Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. 10 | Please concatenate all of these into a single, concise description. Make sure to include information collected from all the descriptions. 11 | If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. 12 | Make sure it is written in third person, and include the entity names so we the have full context. 
13 | 14 | Enrich it as much as you can with relevant information from the nearby text, this is very important. 15 | 16 | If no answer is possible, or the description is empty, only convey information that is provided within the text. 17 | ####### 18 | -Data- 19 | Entities: {{entity_name}} 20 | Description List: {{description_list}} 21 | ####### 22 | Output:""" 23 | -------------------------------------------------------------------------------- /graphrag/index/utils/tokens.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Utilities for working with tokens.""" 5 | 6 | import tiktoken 7 | 8 | DEFAULT_ENCODING_NAME = "cl100k_base" 9 | 10 | 11 | def num_tokens_from_string( 12 | string: str, model: str | None = None, encoding_name: str | None = None 13 | ) -> int: 14 | """Return the number of tokens in a text string.""" 15 | if model is not None: 16 | encoding = tiktoken.encoding_for_model(model) 17 | else: 18 | encoding = tiktoken.get_encoding(encoding_name or DEFAULT_ENCODING_NAME) 19 | return len(encoding.encode(string)) 20 | 21 | 22 | def string_from_tokens( 23 | tokens: list[int], model: str | None = None, encoding_name: str | None = None 24 | ) -> str: 25 | """Return a text string from a list of tokens.""" 26 | if model is not None: 27 | encoding = tiktoken.encoding_for_model(model) 28 | elif encoding_name is not None: 29 | encoding = tiktoken.get_encoding(encoding_name) 30 | else: 31 | msg = "Either model or encoding_name must be specified." 32 | raise ValueError(msg) 33 | return encoding.decode(tokens) 34 | -------------------------------------------------------------------------------- /graphrag/llm/limiting/tpm_rpm_limiter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """TPM RPM Limiter module.""" 5 | 6 | from aiolimiter import AsyncLimiter 7 | 8 | from .llm_limiter import LLMLimiter 9 | 10 | 11 | class TpmRpmLLMLimiter(LLMLimiter): 12 | """TPM RPM Limiter class definition.""" 13 | 14 | _tpm_limiter: AsyncLimiter | None 15 | _rpm_limiter: AsyncLimiter | None 16 | 17 | def __init__( 18 | self, tpm_limiter: AsyncLimiter | None, rpm_limiter: AsyncLimiter | None 19 | ): 20 | """Init method definition.""" 21 | self._tpm_limiter = tpm_limiter 22 | self._rpm_limiter = rpm_limiter 23 | 24 | @property 25 | def needs_token_count(self) -> bool: 26 | """Whether this limiter needs the token count to be passed in.""" 27 | return self._tpm_limiter is not None 28 | 29 | async def acquire(self, num_tokens: int = 1) -> None: 30 | """Call method definition.""" 31 | if self._tpm_limiter is not None: 32 | await self._tpm_limiter.acquire(num_tokens) 33 | if self._rpm_limiter is not None: 34 | await self._rpm_limiter.acquire() 35 | -------------------------------------------------------------------------------- /graphrag/index/graph/extractors/community_reports/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine community reports package root.""" 5 | 6 | import graphrag.index.graph.extractors.community_reports.schemas as schemas 7 | 8 | from .build_mixed_context import build_mixed_context 9 | from .community_reports_extractor import CommunityReportsExtractor 10 | from .prep_community_report_context import prep_community_report_context 11 | from .prompts import COMMUNITY_REPORT_PROMPT 12 | from .sort_context import sort_context 13 | from .utils import ( 14 | filter_claims_to_nodes, 15 | filter_edges_to_nodes, 16 | filter_nodes_to_level, 17 | get_levels, 18 | set_context_exceeds_flag, 19 | set_context_size, 20 | ) 21 | 22 | __all__ = [ 23 | "COMMUNITY_REPORT_PROMPT", 24 | "CommunityReportsExtractor", 25 | "build_mixed_context", 26 | "filter_claims_to_nodes", 27 | "filter_edges_to_nodes", 28 | "filter_nodes_to_level", 29 | "get_levels", 30 | "prep_community_report_context", 31 | "schemas", 32 | "set_context_exceeds_flag", 33 | "set_context_size", 34 | "sort_context", 35 | ] 36 | -------------------------------------------------------------------------------- /graphrag/llm/base/_create_cache_key.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Cache key generation utils.""" 5 | 6 | import hashlib 7 | 8 | 9 | def _llm_string(params: dict) -> str: 10 | # New version of the cache is not including n in the params dictionary 11 | # This avoids creating a new cache key for the same prompt 12 | if "max_tokens" in params and "n" not in params: 13 | params["n"] = None 14 | return str(sorted((k, v) for k, v in params.items())) 15 | 16 | 17 | def _hash(_input: str) -> str: 18 | """Use a deterministic hashing approach.""" 19 | return hashlib.md5(_input.encode()).hexdigest() # noqa S324 20 | 21 | 22 | def create_hash_key(operation: str, prompt: str, parameters: dict) -> str: 23 | """Compute cache key from prompt and associated model and settings. 24 | 25 | Args: 26 | prompt (str): The prompt run through the language model. 27 | llm_string (str): The language model version and settings. 28 | 29 | Returns 30 | ------- 31 | str: The cache key. 32 | """ 33 | llm_string = _llm_string(parameters) 34 | return f"{operation}-{_hash(prompt + llm_string)}" 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /graphrag/index/graph/embedding/embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Utilities to generate graph embeddings.""" 5 | 6 | from dataclasses import dataclass 7 | 8 | import graspologic as gc 9 | import networkx as nx 10 | import numpy as np 11 | 12 | 13 | @dataclass 14 | class NodeEmbeddings: 15 | """Node embeddings class definition.""" 16 | 17 | nodes: list[str] 18 | embeddings: np.ndarray 19 | 20 | 21 | def embed_nod2vec( 22 | graph: nx.Graph | nx.DiGraph, 23 | dimensions: int = 1536, 24 | num_walks: int = 10, 25 | walk_length: int = 40, 26 | window_size: int = 2, 27 | iterations: int = 3, 28 | random_seed: int = 86, 29 | ) -> NodeEmbeddings: 30 | """Generate node embeddings using Node2Vec.""" 31 | # generate embedding 32 | lcc_tensors = gc.embed.node2vec_embed( # type: ignore 33 | graph=graph, 34 | dimensions=dimensions, 35 | window_size=window_size, 36 | iterations=iterations, 37 | num_walks=num_walks, 38 | walk_length=walk_length, 39 | random_seed=random_seed, 40 | ) 41 | return NodeEmbeddings(embeddings=lcc_tensors[0], nodes=lcc_tensors[1]) 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Node Artifacts 
2 | */node_modules/ 3 | docsite/*/src/**/*.js 4 | docsite/*/lib/ 5 | docsite/*/storybook-static/ 6 | docsite/*/docsTemp/ 7 | docsite/*/build/ 8 | .swc/ 9 | dist/ 10 | .idea 11 | # https://yarnpkg.com/advanced/qa#which-files-should-be-gitignored 12 | docsite/.yarn/* 13 | !docsite/.yarn/patches 14 | !docsite/.yarn/releases 15 | !docsite/.yarn/plugins 16 | !docsite/.yarn/sdks 17 | !docsite/.yarn/versions 18 | docsite/.pnp.* 19 | 20 | ./ragtest 21 | 22 | .yarn/* 23 | !.yarn/patches 24 | !.yarn/releases 25 | !.yarn/plugins 26 | !.yarn/sdks 27 | !.yarn/versions 28 | .pnp.* 29 | 30 | # Python Artifacts 31 | python/*/lib/ 32 | # Test Output 33 | .coverage 34 | coverage/ 35 | licenses.txt 36 | examples_notebooks/*/lancedb 37 | examples_notebooks/*/data 38 | tests/fixtures/cache 39 | tests/fixtures/*/cache 40 | tests/fixtures/*/output 41 | lancedb/ 42 | 43 | # Random 44 | .DS_Store 45 | *.log* 46 | .venv 47 | .conda 48 | .tmp 49 | 50 | 51 | .env 52 | build.zip 53 | 54 | .turbo 55 | 56 | __pycache__ 57 | 58 | .pipeline 59 | 60 | # Azurite 61 | temp_azurite/ 62 | __azurite*.json 63 | __blobstorage*.json 64 | __blobstorage__/ 65 | 66 | # Getting started example 67 | ragtest/ 68 | .ragtest/ 69 | .pipelines 70 | .pipeline -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph package root.""" 5 | 6 | from .clustering import cluster_graph 7 | from .compute_edge_combined_degree import compute_edge_combined_degree 8 | from .create import DEFAULT_EDGE_ATTRIBUTES, DEFAULT_NODE_ATTRIBUTES, create_graph 9 | from .embed import embed_graph 10 | from .layout import layout_graph 11 | from .merge import merge_graphs 12 | from .report import ( 13 | create_community_reports, 14 | prepare_community_reports, 15 | prepare_community_reports_claims, 16 | prepare_community_reports_edges, 17 | restore_community_hierarchy, 18 | ) 19 | from .unpack import unpack_graph 20 | 21 | __all__ = [ 22 | "DEFAULT_EDGE_ATTRIBUTES", 23 | "DEFAULT_NODE_ATTRIBUTES", 24 | "cluster_graph", 25 | "compute_edge_combined_degree", 26 | "create_community_reports", 27 | "create_graph", 28 | "embed_graph", 29 | "layout_graph", 30 | "merge_graphs", 31 | "prepare_community_reports", 32 | "prepare_community_reports_claims", 33 | "prepare_community_reports_edges", 34 | "restore_community_hierarchy", 35 | "unpack_graph", 36 | ] 37 | -------------------------------------------------------------------------------- /graphrag/index/verbs/text/embed/strategies/mock.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing run and _embed_text methods definitions.""" 5 | 6 | import random 7 | from collections.abc import Iterable 8 | from typing import Any 9 | 10 | from datashaper import ProgressTicker, VerbCallbacks, progress_ticker 11 | 12 | from graphrag.index.cache import PipelineCache 13 | 14 | from .typing import TextEmbeddingResult 15 | 16 | 17 | async def run( # noqa RUF029 async is required for interface 18 | input: list[str], 19 | callbacks: VerbCallbacks, 20 | cache: PipelineCache, 21 | _args: dict[str, Any], 22 | ) -> TextEmbeddingResult: 23 | """Run the Claim extraction chain.""" 24 | input = input if isinstance(input, Iterable) else [input] 25 | ticker = progress_ticker(callbacks.progress, len(input)) 26 | return TextEmbeddingResult( 27 | embeddings=[_embed_text(cache, text, ticker) for text in input] 28 | ) 29 | 30 | 31 | def _embed_text(_cache: PipelineCache, _text: str, tick: ProgressTicker) -> list[float]: 32 | """Embed a single piece of text.""" 33 | tick(1) 34 | return [random.random(), random.random(), random.random()] # noqa S311 35 | -------------------------------------------------------------------------------- /docsite/posts/config/overview.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Configuring GraphRAG Indexing 3 | navtitle: Configuration 4 | tags: [post] 5 | layout: page 6 | date: 2023-01-03 7 | --- 8 | 9 | The GraphRAG system is highly configurable. This page provides an overview of the configuration options available for the GraphRAG indexing engine. 10 | 11 | ## Default Configuration Mode 12 | 13 | The default configuration mode is the simplest way to get started with the GraphRAG system. It is designed to work out-of-the-box with minimal configuration. The primary configuration sections for the Indexing Engine pipelines are described below. 
The main ways to set up GraphRAG in Default Configuration mode are via: 14 | 15 | - [Init command](/posts/config/init) (recommended) 16 | - [Purely using environment variables](/posts/config/env_vars) 17 | - [Using JSON or YAML for deeper control](/posts/config/json_yaml) 18 | 19 | ## Custom Configuration Mode 20 | 21 | Custom configuration mode is an advanced use-case. Most users will want to use the Default Configuration instead. The primary configuration sections for Indexing Engine pipelines are described below. Details about how to use custom configuration are available in the [Custom Configuration Mode](/posts/config/custom) documentation. 22 | -------------------------------------------------------------------------------- /graphrag/index/config/workflow.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing 'PipelineWorkflowReference' model.""" 5 | 6 | from __future__ import annotations 7 | 8 | from typing import Any 9 | 10 | from pydantic import BaseModel 11 | from pydantic import Field as pydantic_Field 12 | 13 | PipelineWorkflowStep = dict[str, Any] 14 | """Represent a step in a workflow.""" 15 | 16 | PipelineWorkflowConfig = dict[str, Any] 17 | """Represent a configuration for a workflow.""" 18 | 19 | 20 | class PipelineWorkflowReference(BaseModel): 21 | """Represent a reference to a workflow, and can optionally be the workflow itself.""" 22 | 23 | name: str | None = pydantic_Field(description="Name of the workflow.", default=None) 24 | """Name of the workflow.""" 25 | 26 | steps: list[PipelineWorkflowStep] | None = pydantic_Field( 27 | description="The optional steps for the workflow.", default=None 28 | ) 29 | """The optional steps for the workflow.""" 30 | 31 | config: PipelineWorkflowConfig | None = pydantic_Field( 32 | description="The optional configuration for the workflow.", default=None 33 | ) 34 | 
"""The optional configuration for the workflow.""" 35 | -------------------------------------------------------------------------------- /graphrag/llm/openai/openai_embeddings_llm.py: -------------------------------------------------------------------------------- 1 | #openai_embeddings_llm.py 2 | 3 | from typing_extensions import Unpack 4 | from graphrag.llm.base import BaseLLM 5 | from graphrag.llm.types import ( 6 | EmbeddingInput, 7 | EmbeddingOutput, 8 | LLMInput, 9 | ) 10 | from .openai_configuration import OpenAIConfiguration 11 | from .types import OpenAIClientTypes 12 | import ollama 13 | 14 | class OpenAIEmbeddingsLLM(BaseLLM[EmbeddingInput, EmbeddingOutput]): 15 | _client: OpenAIClientTypes 16 | _configuration: OpenAIConfiguration 17 | 18 | def __init__(self, client: OpenAIClientTypes, configuration: OpenAIConfiguration): 19 | self._client = client 20 | self._configuration = configuration 21 | 22 | async def _execute_llm( 23 | self, input: EmbeddingInput, **kwargs: Unpack[LLMInput] 24 | ) -> EmbeddingOutput | None: 25 | args = { 26 | "model": self._configuration.model, 27 | **(kwargs.get("model_parameters") or {}), 28 | } 29 | embedding_list = [] 30 | for inp in input: 31 | embedding = ollama.embeddings(model=self._configuration.model, prompt=inp) 32 | embedding_list.append(embedding["embedding"]) 33 | return embedding_list 34 | -------------------------------------------------------------------------------- /examples/interdependent_workflows/pipeline.yml: -------------------------------------------------------------------------------- 1 | workflows: 2 | - name: aggregate_workflow 3 | steps: 4 | - verb: "aggregate" # https://github.com/microsoft/datashaper/blob/main/python/datashaper/datashaper/engine/verbs/aggregate.py 5 | args: 6 | groupby: "type" 7 | column: "col_multiplied" 8 | to: "aggregated_output" 9 | operation: "sum" 10 | input: 11 | source: "workflow:derive_workflow" # reference the derive_workflow, cause this one requires that one to run first 
12 | # Notice, these are out of order, the indexing engine will figure out the right order to run them in 13 | 14 | - name: derive_workflow 15 | steps: 16 | - verb: "derive" # https://github.com/microsoft/datashaper/blob/main/python/datashaper/datashaper/engine/verbs/derive.py 17 | args: 18 | column1: "col1" # from above 19 | column2: "col2" # from above 20 | to: "col_multiplied" # new column name 21 | operator: "*" # multiply the two columns, 22 | # Since we're trying to act on the dataset, we don't need to explicitly specify an input 23 | # "input": { "source": "source" } # use the dataset as the input to this verb. This is the default, so you can omit it. -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/entity_summarization_prompt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Entity summarization prompt generation module.""" 5 | 6 | from pathlib import Path 7 | 8 | from graphrag.prompt_tune.template import ENTITY_SUMMARIZATION_PROMPT 9 | 10 | ENTITY_SUMMARIZATION_FILENAME = "summarize_descriptions.txt" 11 | 12 | 13 | def create_entity_summarization_prompt( 14 | persona: str, 15 | output_path: Path | None = None, 16 | ) -> str: 17 | """Create a prompt for entity summarization. If output_path is provided, write the prompt to a file. 18 | 19 | Parameters 20 | ---------- 21 | - persona (str): The persona to use for the entity summarization prompt 22 | - output_path (Path | None): The path to write the prompt to. If None, the prompt is not written to a file. Default is None.
23 | """ 24 | prompt = ENTITY_SUMMARIZATION_PROMPT.format(persona=persona) 25 | 26 | if output_path: 27 | output_path.mkdir(parents=True, exist_ok=True) 28 | 29 | output_path = output_path / ENTITY_SUMMARIZATION_FILENAME 30 | # Write file to output path 31 | with output_path.open("w") as file: 32 | file.write(prompt) 33 | 34 | return prompt 35 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/community_reporter_role.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Generate a community reporter role for community summarization.""" 5 | 6 | from graphrag.llm.types.llm_types import CompletionLLM 7 | from graphrag.prompt_tune.prompt import ( 8 | GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT, 9 | ) 10 | 11 | 12 | async def generate_community_reporter_role( 13 | llm: CompletionLLM, domain: str, persona: str, docs: str | list[str] 14 | ) -> str: 15 | """Generate an LLM persona to use for GraphRAG prompts. 16 | 17 | Parameters 18 | ---------- 19 | - llm (CompletionLLM): The LLM to use for generation 20 | - domain (str): The domain to generate a persona for 21 | - persona (str): The persona to generate a role for 22 | - docs (str | list[str]): The domain to generate a persona for 23 | 24 | Returns 25 | ------- 26 | - str: The generated domain prompt response. 
27 | """ 28 | docs_str = " ".join(docs) if isinstance(docs, list) else docs 29 | domain_prompt = GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT.format( 30 | domain=domain, persona=persona, input_text=docs_str 31 | ) 32 | 33 | response = await llm(domain_prompt) 34 | 35 | return str(response.output) 36 | -------------------------------------------------------------------------------- /graphrag/config/input_models/llm_parameters_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """LLM Parameters model.""" 5 | 6 | from typing_extensions import NotRequired, TypedDict 7 | 8 | from graphrag.config.enums import LLMType 9 | 10 | 11 | class LLMParametersInput(TypedDict): 12 | """LLM Parameters model.""" 13 | 14 | api_key: NotRequired[str | None] 15 | type: NotRequired[LLMType | str | None] 16 | model: NotRequired[str | None] 17 | max_tokens: NotRequired[int | str | None] 18 | request_timeout: NotRequired[float | str | None] 19 | api_base: NotRequired[str | None] 20 | api_version: NotRequired[str | None] 21 | organization: NotRequired[str | None] 22 | proxy: NotRequired[str | None] 23 | cognitive_services_endpoint: NotRequired[str | None] 24 | deployment_name: NotRequired[str | None] 25 | model_supports_json: NotRequired[bool | str | None] 26 | tokens_per_minute: NotRequired[int | str | None] 27 | requests_per_minute: NotRequired[int | str | None] 28 | max_retries: NotRequired[int | str | None] 29 | max_retry_wait: NotRequired[float | str | None] 30 | sleep_on_rate_limit_recommendation: NotRequired[bool | str | None] 31 | concurrent_requests: NotRequired[int | str | None] 32 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/embed/strategies/node_2_vec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing run method definition.""" 5 | 6 | from typing import Any 7 | 8 | import networkx as nx 9 | 10 | from graphrag.index.graph.embedding import embed_nod2vec 11 | from graphrag.index.graph.utils import stable_largest_connected_component 12 | from graphrag.index.verbs.graph.embed.typing import NodeEmbeddings 13 | 14 | 15 | def run(graph: nx.Graph, args: dict[str, Any]) -> NodeEmbeddings: 16 | """Run method definition.""" 17 | if args.get("use_lcc", True): 18 | graph = stable_largest_connected_component(graph) 19 | 20 | # create graph embedding using node2vec 21 | embeddings = embed_nod2vec( 22 | graph=graph, 23 | dimensions=args.get("dimensions", 1536), 24 | num_walks=args.get("num_walks", 10), 25 | walk_length=args.get("walk_length", 40), 26 | window_size=args.get("window_size", 2), 27 | iterations=args.get("iterations", 3), 28 | random_seed=args.get("random_seed", 86), 29 | ) 30 | 31 | pairs = zip(embeddings.nodes, embeddings.embeddings.tolist(), strict=True) 32 | sorted_pairs = sorted(pairs, key=lambda x: x[0]) 33 | 34 | return dict(sorted_pairs) 35 | -------------------------------------------------------------------------------- /graphrag/index/context.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | # isort: skip_file 5 | """A module containing the 'PipelineRunStats' and 'PipelineRunContext' models.""" 6 | 7 | from dataclasses import dataclass as dc_dataclass 8 | from dataclasses import field 9 | 10 | from .cache import PipelineCache 11 | from .storage.typing import PipelineStorage 12 | 13 | 14 | @dc_dataclass 15 | class PipelineRunStats: 16 | """Pipeline running stats.""" 17 | 18 | total_runtime: float = field(default=0) 19 | """Float representing the total runtime.""" 20 | 21 | num_documents: int = field(default=0) 22 | """Number of documents.""" 23 | 24 | input_load_time: float = field(default=0) 25 | """Float representing the input load time.""" 26 | 27 | workflows: dict[str, dict[str, float]] = field(default_factory=dict) 28 | """A dictionary of workflows.""" 29 | 30 | 31 | @dc_dataclass 32 | class PipelineRunContext: 33 | """Provides the context for the current pipeline run.""" 34 | 35 | stats: PipelineRunStats 36 | storage: PipelineStorage 37 | cache: PipelineCache 38 | 39 | 40 | # TODO: For now, just has the same props available to it 41 | VerbRunContext = PipelineRunContext 42 | """Provides the context for the current verb run.""" 43 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/report/strategies/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'Finding' and 'CommunityReport' models.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from typing import Any 8 | 9 | from datashaper import VerbCallbacks 10 | from typing_extensions import TypedDict 11 | 12 | from graphrag.index.cache import PipelineCache 13 | 14 | ExtractedEntity = dict[str, Any] 15 | StrategyConfig = dict[str, Any] 16 | RowContext = dict[str, Any] 17 | EntityTypes = list[str] 18 | Claim = dict[str, Any] 19 | 20 | 21 | class Finding(TypedDict): 22 | """Finding class definition.""" 23 | 24 | summary: str 25 | explanation: str 26 | 27 | 28 | class CommunityReport(TypedDict): 29 | """Community report class definition.""" 30 | 31 | community: str | int 32 | title: str 33 | summary: str 34 | full_content: str 35 | full_content_json: str 36 | rank: float 37 | level: int 38 | rank_explanation: str 39 | findings: list[Finding] 40 | 41 | 42 | CommunityReportsStrategy = Callable[ 43 | [ 44 | str | int, 45 | str, 46 | int, 47 | VerbCallbacks, 48 | PipelineCache, 49 | StrategyConfig, 50 | ], 51 | Awaitable[CommunityReport | None], 52 | ] 53 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | ## Description 16 | 17 | [Provide a brief description of the changes made in this pull request.] 18 | 19 | ## Related Issues 20 | 21 | [Reference any related issues or tasks that this pull request addresses.] 22 | 23 | ## Proposed Changes 24 | 25 | [List the specific changes made in this pull request.] 26 | 27 | ## Checklist 28 | 29 | - [ ] I have tested these changes locally. 30 | - [ ] I have reviewed the code changes. 31 | - [ ] I have updated the documentation (if necessary). 32 | - [ ] I have added appropriate unit tests (if applicable). 
33 | 34 | ## Additional Notes 35 | 36 | [Add any additional notes or context that may be helpful for the reviewer(s).] 37 | -------------------------------------------------------------------------------- /graphrag/config/models/chunking_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | import graphrag.config.defaults as defs 9 | 10 | 11 | class ChunkingConfig(BaseModel): 12 | """Configuration section for chunking.""" 13 | 14 | size: int = Field(description="The chunk size to use.", default=defs.CHUNK_SIZE) 15 | overlap: int = Field( 16 | description="The chunk overlap to use.", default=defs.CHUNK_OVERLAP 17 | ) 18 | group_by_columns: list[str] = Field( 19 | description="The chunk by columns to use.", 20 | default=defs.CHUNK_GROUP_BY_COLUMNS, 21 | ) 22 | strategy: dict | None = Field( 23 | description="The chunk strategy to use, overriding the default tokenization strategy", 24 | default=None, 25 | ) 26 | 27 | def resolved_strategy(self) -> dict: 28 | """Get the resolved chunking strategy.""" 29 | from graphrag.index.verbs.text.chunk import ChunkStrategyType 30 | 31 | return self.strategy or { 32 | "type": ChunkStrategyType.tokens, 33 | "chunk_size": self.size, 34 | "chunk_overlap": self.overlap, 35 | "group_by_columns": self.group_by_columns, 36 | } 37 | -------------------------------------------------------------------------------- /graphrag/index/verbs/graph/merge/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'BasicMergeOperation', 'StringOperation', 'NumericOperation' and 'DetailedAttributeMergeOperation' models.""" 5 | 6 | from dataclasses import dataclass 7 | from enum import Enum 8 | 9 | 10 | class BasicMergeOperation(str, Enum): 11 | """Basic Merge Operation class definition.""" 12 | 13 | Replace = "replace" 14 | Skip = "skip" 15 | 16 | 17 | class StringOperation(str, Enum): 18 | """String Operation class definition.""" 19 | 20 | Concat = "concat" 21 | Replace = "replace" 22 | Skip = "skip" 23 | 24 | 25 | class NumericOperation(str, Enum): 26 | """Numeric Operation class definition.""" 27 | 28 | Sum = "sum" 29 | Average = "average" 30 | Max = "max" 31 | Min = "min" 32 | Multiply = "multiply" 33 | Replace = "replace" 34 | Skip = "skip" 35 | 36 | 37 | @dataclass 38 | class DetailedAttributeMergeOperation: 39 | """Detailed attribute merge operation class definition.""" 40 | 41 | operation: str # StringOperation | NumericOperation 42 | 43 | # concat 44 | separator: str | None = None 45 | delimiter: str | None = None 46 | distinct: bool = False 47 | 48 | 49 | AttributeMergeOperation = str | DetailedAttributeMergeOperation 50 | -------------------------------------------------------------------------------- /graphrag/index/verbs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing get_default_verbs method definition.""" 5 | 6 | from .covariates import extract_covariates 7 | from .entities import entity_extract, summarize_descriptions 8 | from .genid import genid 9 | from .graph import ( 10 | cluster_graph, 11 | create_community_reports, 12 | create_graph, 13 | embed_graph, 14 | layout_graph, 15 | merge_graphs, 16 | unpack_graph, 17 | ) 18 | from .overrides import aggregate, concat, merge 19 | from .snapshot import snapshot 20 | from .snapshot_rows import snapshot_rows 21 | from .spread_json import spread_json 22 | from .text import chunk, text_embed, text_split, text_translate 23 | from .unzip import unzip 24 | from .zip import zip_verb 25 | 26 | __all__ = [ 27 | "aggregate", 28 | "chunk", 29 | "cluster_graph", 30 | "concat", 31 | "create_community_reports", 32 | "create_graph", 33 | "embed_graph", 34 | "entity_extract", 35 | "extract_covariates", 36 | "genid", 37 | "layout_graph", 38 | "merge", 39 | "merge_graphs", 40 | "snapshot", 41 | "snapshot_rows", 42 | "spread_json", 43 | "summarize_descriptions", 44 | "text_embed", 45 | "text_split", 46 | "text_translate", 47 | "unpack_graph", 48 | "unzip", 49 | "zip_verb", 50 | ] 51 | -------------------------------------------------------------------------------- /graphrag/llm/openai/openai_completion_llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A text-completion based LLM.""" 5 | 6 | import logging 7 | 8 | from typing_extensions import Unpack 9 | 10 | from graphrag.llm.base import BaseLLM 11 | from graphrag.llm.types import ( 12 | CompletionInput, 13 | CompletionOutput, 14 | LLMInput, 15 | ) 16 | 17 | from .openai_configuration import OpenAIConfiguration 18 | from .types import OpenAIClientTypes 19 | from .utils import get_completion_llm_args 20 | 21 | log = logging.getLogger(__name__) 22 | 23 | 24 | class OpenAICompletionLLM(BaseLLM[CompletionInput, CompletionOutput]): 25 | """A text-completion based LLM.""" 26 | 27 | _client: OpenAIClientTypes 28 | _configuration: OpenAIConfiguration 29 | 30 | def __init__(self, client: OpenAIClientTypes, configuration: OpenAIConfiguration): 31 | self.client = client 32 | self.configuration = configuration 33 | 34 | async def _execute_llm( 35 | self, 36 | input: CompletionInput, 37 | **kwargs: Unpack[LLMInput], 38 | ) -> CompletionOutput | None: 39 | args = get_completion_llm_args( 40 | kwargs.get("model_parameters"), self.configuration 41 | ) 42 | completion = self.client.completions.create(prompt=input, **args) 43 | return completion.choices[0].text 44 | -------------------------------------------------------------------------------- /tests/notebook/test_notebooks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | import subprocess 4 | import tempfile 5 | from pathlib import Path 6 | 7 | import nbformat 8 | import pytest 9 | 10 | DOCS_PATH = Path("../../docsite") 11 | 12 | notebooks_list = list(DOCS_PATH.rglob("*.ipynb")) 13 | 14 | 15 | def _notebook_run(filepath: Path): 16 | """Execute a notebook via nbconvert and collect output. 
17 | :returns execution errors 18 | """ 19 | with tempfile.NamedTemporaryFile(suffix=".ipynb") as temp_file: 20 | args = [ 21 | "jupyter", 22 | "nbconvert", 23 | "--to", 24 | "notebook", 25 | "--execute", 26 | "-y", 27 | "--no-prompt", 28 | "--output", 29 | temp_file.name, 30 | filepath.absolute().as_posix(), 31 | ] 32 | subprocess.check_call(args) 33 | 34 | temp_file.seek(0) 35 | nb = nbformat.read(temp_file, nbformat.current_nbformat) 36 | 37 | return [ 38 | output 39 | for cell in nb.cells 40 | if "outputs" in cell 41 | for output in cell["outputs"] 42 | if output.output_type == "error" 43 | ] 44 | 45 | 46 | @pytest.mark.parametrize("notebook_path", notebooks_list) 47 | def test_notebook(notebook_path: Path): 48 | assert _notebook_run(notebook_path) == [] 49 | -------------------------------------------------------------------------------- /graphrag/index/utils/rate_limiter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Rate limiter utility.""" 5 | 6 | import asyncio 7 | import time 8 | 9 | 10 | class RateLimiter: 11 | """ 12 | The original TpmRpmLLMLimiter strategy did not account for minute-based rate limiting when scheduled. 13 | 14 | The RateLimiter was introduced to ensure that the CommunityReportsExtractor could be scheduled to adhere to rate configurations on a per-minute basis. 
15 | """ 16 | 17 | # TODO: RateLimiter scheduled: using asyncio for async_mode 18 | 19 | def __init__(self, rate: int, per: int): 20 | self.rate = rate 21 | self.per = per 22 | self.allowance = rate 23 | self.last_check = time.monotonic() 24 | 25 | async def acquire(self): 26 | """Acquire a token from the rate limiter.""" 27 | current = time.monotonic() 28 | elapsed = current - self.last_check 29 | self.last_check = current 30 | self.allowance += elapsed * (self.rate / self.per) 31 | 32 | if self.allowance > self.rate: 33 | self.allowance = self.rate 34 | 35 | if self.allowance < 1.0: 36 | sleep_time = (1.0 - self.allowance) * (self.per / self.rate) 37 | await asyncio.sleep(sleep_time) 38 | self.allowance = 0.0 39 | else: 40 | self.allowance -= 1.0 41 | -------------------------------------------------------------------------------- /examples/custom_input/run.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | import asyncio 4 | import os 5 | 6 | import pandas as pd 7 | 8 | from graphrag.index import run_pipeline_with_config 9 | 10 | pipeline_file = os.path.join( 11 | os.path.dirname(os.path.abspath(__file__)), "./pipeline.yml" 12 | ) 13 | 14 | 15 | async def run(): 16 | # Load your dataset 17 | dataset = _load_dataset_some_unique_way() 18 | 19 | # Load your config without the input section 20 | config = pipeline_file 21 | 22 | # Grab the last result from the pipeline, should be our entity extraction 23 | outputs = [] 24 | async for output in run_pipeline_with_config( 25 | config_or_path=config, dataset=dataset 26 | ): 27 | outputs.append(output) 28 | pipeline_result = outputs[-1] 29 | 30 | if pipeline_result.result is not None: 31 | # Should look something like 32 | # col1 col2 filled_column 33 | # 0 2 4 Filled Value 34 | # 1 5 10 Filled Value 35 | print(pipeline_result.result) 36 | else: 37 | print("No results!") 38 | 39 | 40 | def _load_dataset_some_unique_way() -> pd.DataFrame: 41 | # Totally loaded from some other place 42 | return pd.DataFrame([{"col1": 2, "col2": 4}, {"col1": 5, "col2": 10}]) 43 | 44 | 45 | if __name__ == "__main__": 46 | asyncio.run(run()) 47 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Python Publish 2 | on: 3 | release: 4 | types: [created] 5 | push: 6 | branches: [main] 7 | 8 | env: 9 | POETRY_VERSION: "1.8.3" 10 | PYTHON_VERSION: "3.10" 11 | 12 | jobs: 13 | publish: 14 | name: Upload release to PyPI 15 | if: github.ref == 'refs/heads/main' 16 | runs-on: ubuntu-latest 17 | environment: 18 | name: pypi 19 | url: https://pypi.org/p/graphrag 20 | permissions: 21 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | with: 26 | fetch-depth: 0 27 | fetch-tags: 
true 28 | 29 | - name: Set up Python 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: ${{ env.PYTHON_VERSION }} 33 | 34 | - name: Install Poetry 35 | uses: abatilo/actions-poetry@v3.0.0 36 | with: 37 | poetry-version: ${{ env.POETRY_VERSION }} 38 | 39 | - name: Install dependencies 40 | shell: bash 41 | run: poetry install 42 | 43 | - name: Build Distributable 44 | shell: bash 45 | run: poetry build 46 | 47 | - name: Publish package distributions to PyPI 48 | uses: pypa/gh-action-pypi-publish@release/v1 49 | with: 50 | packages-dir: dist 51 | skip-existing: true 52 | verbose: true 53 | -------------------------------------------------------------------------------- /graphrag/index/workflows/v1/create_final_documents.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing build_steps method definition.""" 5 | 6 | from graphrag.index.config import PipelineWorkflowConfig, PipelineWorkflowStep 7 | 8 | workflow_name = "create_final_documents" 9 | 10 | 11 | def build_steps( 12 | config: PipelineWorkflowConfig, 13 | ) -> list[PipelineWorkflowStep]: 14 | """ 15 | Create the final documents table. 
16 | 17 | ## Dependencies 18 | * `workflow:create_base_documents` 19 | * `workflow:create_base_document_nodes` 20 | """ 21 | base_text_embed = config.get("text_embed", {}) 22 | document_raw_content_embed_config = config.get( 23 | "document_raw_content_embed", base_text_embed 24 | ) 25 | skip_raw_content_embedding = config.get("skip_raw_content_embedding", False) 26 | return [ 27 | { 28 | "verb": "rename", 29 | "args": {"columns": {"text_units": "text_unit_ids"}}, 30 | "input": {"source": "workflow:create_base_documents"}, 31 | }, 32 | { 33 | "verb": "text_embed", 34 | "enabled": not skip_raw_content_embedding, 35 | "args": { 36 | "column": "raw_content", 37 | "to": "raw_content_embedding", 38 | **document_raw_content_embed_config, 39 | }, 40 | }, 41 | ] 42 | -------------------------------------------------------------------------------- /graphrag/index/workflows/v1/join_text_units_to_covariate_ids.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing build_steps method definition.""" 5 | 6 | from graphrag.index.config import PipelineWorkflowConfig, PipelineWorkflowStep 7 | 8 | workflow_name = "join_text_units_to_covariate_ids" 9 | 10 | 11 | def build_steps( 12 | _config: PipelineWorkflowConfig, 13 | ) -> list[PipelineWorkflowStep]: 14 | """ 15 | Create the final text-units table. 
16 | 17 | ## Dependencies 18 | * `workflow:create_final_covariates` 19 | """ 20 | return [ 21 | { 22 | "verb": "select", 23 | "args": {"columns": ["id", "text_unit_id"]}, 24 | "input": {"source": "workflow:create_final_covariates"}, 25 | }, 26 | { 27 | "verb": "aggregate_override", 28 | "args": { 29 | "groupby": ["text_unit_id"], 30 | "aggregations": [ 31 | { 32 | "column": "id", 33 | "operation": "array_agg_distinct", 34 | "to": "covariate_ids", 35 | }, 36 | { 37 | "column": "text_unit_id", 38 | "operation": "any", 39 | "to": "id", 40 | }, 41 | ], 42 | }, 43 | }, 44 | ] 45 | -------------------------------------------------------------------------------- /graphrag/query/progress.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Status Reporter for orchestration.""" 5 | 6 | from abc import ABCMeta, abstractmethod 7 | from typing import Any 8 | 9 | 10 | class StatusReporter(metaclass=ABCMeta): 11 | """Provides a way to report status updates from the pipeline.""" 12 | 13 | @abstractmethod 14 | def error(self, message: str, details: dict[str, Any] | None = None): 15 | """Report an error.""" 16 | 17 | @abstractmethod 18 | def warning(self, message: str, details: dict[str, Any] | None = None): 19 | """Report a warning.""" 20 | 21 | @abstractmethod 22 | def log(self, message: str, details: dict[str, Any] | None = None): 23 | """Report a log.""" 24 | 25 | 26 | class ConsoleStatusReporter(StatusReporter): 27 | """A reporter that writes to a console.""" 28 | 29 | def error(self, message: str, details: dict[str, Any] | None = None): 30 | """Report an error.""" 31 | print(message, details) # noqa T201 32 | 33 | def warning(self, message: str, details: dict[str, Any] | None = None): 34 | """Report a warning.""" 35 | _print_warning(message) 36 | 37 | def log(self, message: str, details: dict[str, Any] | None = None): 38 | """Report a log.""" 39 | 
print(message, details) # noqa T201 40 | 41 | 42 | def _print_warning(skk): 43 | print(f"\033[93m {skk}\033[00m") # noqa T201 44 | -------------------------------------------------------------------------------- /graphrag/index/storage/load_storage.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing load_storage method definition.""" 5 | 6 | from __future__ import annotations 7 | 8 | from typing import cast 9 | 10 | from graphrag.config import StorageType 11 | from graphrag.index.config.storage import ( 12 | PipelineBlobStorageConfig, 13 | PipelineFileStorageConfig, 14 | PipelineStorageConfig, 15 | ) 16 | 17 | from .blob_pipeline_storage import create_blob_storage 18 | from .file_pipeline_storage import create_file_storage 19 | from .memory_pipeline_storage import create_memory_storage 20 | 21 | 22 | def load_storage(config: PipelineStorageConfig): 23 | """Load the storage for a pipeline.""" 24 | match config.type: 25 | case StorageType.memory: 26 | return create_memory_storage() 27 | case StorageType.blob: 28 | config = cast(PipelineBlobStorageConfig, config) 29 | return create_blob_storage( 30 | config.connection_string, 31 | config.storage_account_blob_url, 32 | config.container_name, 33 | config.base_dir, 34 | ) 35 | case StorageType.file: 36 | config = cast(PipelineFileStorageConfig, config) 37 | return create_file_storage(config.base_dir) 38 | case _: 39 | msg = f"Unknown storage type: {config.type}" 40 | raise ValueError(msg) 41 | -------------------------------------------------------------------------------- /examples/custom_set_of_available_workflows/custom_workflow_definitions.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
def _derive_step(to: str) -> dict:
    """Build a single 'derive' verb step multiplying col1 by col2 into *to*."""
    return {
        "verb": "derive",
        "args": {
            "column1": "col1",  # looks for col1 in the dataset
            "column2": "col2",  # looks for col2 in the dataset
            "to": to,  # name of the derived output column
            "operator": "*",
        },
    }


# Pool of custom workflow definitions usable by any number of pipelines.
# Each entry maps a workflow name to a builder that, given the workflow
# config, returns the list of verb steps to execute.
custom_workflows: WorkflowDefinitions = {
    # Output column name is user-configurable via "derive_output_column";
    # falls back to "output_column" when not provided.
    "my_workflow": lambda config: [
        _derive_step(config.get("derive_output_column", "output_column"))
    ],
    "my_unused_workflow": lambda _config: [_derive_step("unused_output_column")],
}
class LocalSearchConfig(BaseModel):
    """The default configuration section for local search.

    NOTE(review): the original docstring said "Cache" — apparently a
    copy-paste from another config model; this class holds local-search
    parameters.
    """

    # Proportion of the budget devoted to text unit data.
    text_unit_prop: float = Field(
        description="The text unit proportion.",
        default=defs.LOCAL_SEARCH_TEXT_UNIT_PROP,
    )
    # Proportion of the budget devoted to community data.
    community_prop: float = Field(
        description="The community proportion.",
        default=defs.LOCAL_SEARCH_COMMUNITY_PROP,
    )
    # Maximum number of prior conversation turns to carry along.
    conversation_history_max_turns: int = Field(
        description="The conversation history maximum turns.",
        default=defs.LOCAL_SEARCH_CONVERSATION_HISTORY_MAX_TURNS,
    )
    # How many mapped entities to keep (top-k).
    top_k_entities: int = Field(
        description="The top k mapped entities.",
        default=defs.LOCAL_SEARCH_TOP_K_MAPPED_ENTITIES,
    )
    # How many mapped relationships to keep (top-k).
    top_k_relationships: int = Field(
        description="The top k mapped relations.",
        default=defs.LOCAL_SEARCH_TOP_K_RELATIONSHIPS,
    )
    # Overall token budget.
    max_tokens: int = Field(
        description="The maximum tokens.", default=defs.LOCAL_SEARCH_MAX_TOKENS
    )
    # Token budget for the LLM itself.
    llm_max_tokens: int = Field(
        description="The LLM maximum tokens.", default=defs.LOCAL_SEARCH_LLM_MAX_TOKENS
    )
"""Text Utilities for LLM."""

from collections.abc import Iterable, Iterator
from itertools import islice

import tiktoken


def num_tokens(text: str, token_encoder: tiktoken.Encoding | None = None) -> int:
    """Return the number of tokens in the given text.

    Falls back to the ``cl100k_base`` encoding when no encoder is given.
    """
    if token_encoder is None:
        token_encoder = tiktoken.get_encoding("cl100k_base")
    return len(token_encoder.encode(text))  # type: ignore


def batched(iterable: Iterable, n: int) -> Iterator[tuple]:
    """
    Batch data into tuples of length n. The last batch may be shorter.

    Taken from Python's cookbook: https://docs.python.org/3/library/itertools.html#itertools.batched

    Fix: the parameter was annotated as ``Iterator`` even though any
    iterable is accepted (``iter()`` is applied below); annotated as
    ``Iterable``, with the generator return type made explicit.

    Raises
    ------
    ValueError
        If ``n`` is less than one.
    """
    # batched('ABCDEFG', 3) --> ABC DEF G
    if n < 1:
        value_error = "n must be at least one"
        raise ValueError(value_error)
    it = iter(iterable)
    # islice returns an empty tuple at exhaustion, ending the walrus loop.
    while batch := tuple(islice(it, n)):
        yield batch


def chunk_text(
    text: str, max_tokens: int, token_encoder: tiktoken.Encoding | None = None
):
    """Chunk text by token length.

    NOTE: yields tuples of token ids (not decoded strings); callers must
    decode each chunk if they need text back.
    """
    if token_encoder is None:
        token_encoder = tiktoken.get_encoding("cl100k_base")
    tokens = token_encoder.encode(text)  # type: ignore
    yield from batched(iter(tokens), max_tokens)
@dataclass
class Covariate:
    """Covariate class definition.

    A single extracted covariate record. Every field is optional so
    strategies can populate only what they extract. Field meanings below
    are inferred from the names (subject/object, status, date range
    suggest claim-like facts) — NOTE(review): confirm against the
    covariate extraction strategies.
    """

    covariate_type: str | None = None  # kind of covariate record
    subject_id: str | None = None  # id of the subject entity
    subject_type: str | None = None  # type label of the subject entity
    object_id: str | None = None  # id of the object entity
    object_type: str | None = None  # type label of the object entity
    type: str | None = None  # covariate-specific type; shadows builtin `type`, kept for schema compatibility
    status: str | None = None  # status value (semantics defined by strategy)
    start_date: str | None = None  # start of validity period (string-formatted)
    end_date: str | None = None  # end of validity period (string-formatted)
    description: str | None = None  # free-text description
    source_text: list[str] | None = None  # source snippets supporting the covariate
    doc_id: str | None = None  # originating document id
    record_id: int | None = None  # record index (scope not visible here)
    id: str | None = None  # unique identifier for this covariate


@dataclass
class CovariateExtractionResult:
    """Covariate extraction result class definition."""

    # All covariates produced by one extraction invocation.
    covariate_data: list[Covariate]


# Async strategy signature for covariate extraction. Positional argument
# meanings are not visible from this file — NOTE(review): confirm against
# the strategy implementations; the final dict is presumably the strategy
# config, preceded by verb callbacks and the pipeline cache.
CovariateExtractStrategy = Callable[
    [
        Iterable[str],
        list[str],
        dict[str, str],
        VerbCallbacks,
        PipelineCache,
        dict[str, Any],
    ],
    Awaitable[CovariateExtractionResult],
]
class GlobalSearchConfig(BaseModel):
    """The default configuration section for global search.

    NOTE(review): the original docstring said "Cache" — apparently a
    copy-paste from another config model; this class holds global-search
    parameters.
    """

    # Sampling temperature for token generation; None defers to defaults.
    temperature: float | None = Field(
        description="The temperature to use for token generation.",
        default=defs.LLM_TEMPERATURE,
    )
    # Nucleus-sampling top-p for token generation; None defers to defaults.
    top_p: float | None = Field(
        description="The top-p value to use for token generation.",
        default=defs.LLM_TOP_P,
    )
    # Overall context-size budget in tokens.
    max_tokens: int = Field(
        description="The maximum context size in tokens.",
        default=defs.GLOBAL_SEARCH_MAX_TOKENS,
    )
    # Token budget for the data stage.
    data_max_tokens: int = Field(
        description="The data llm maximum tokens.",
        default=defs.GLOBAL_SEARCH_DATA_MAX_TOKENS,
    )
    # Token budget for the map stage.
    map_max_tokens: int = Field(
        description="The map llm maximum tokens.",
        default=defs.GLOBAL_SEARCH_MAP_MAX_TOKENS,
    )
    # Token budget for the reduce stage.
    reduce_max_tokens: int = Field(
        description="The reduce llm maximum tokens.",
        default=defs.GLOBAL_SEARCH_REDUCE_MAX_TOKENS,
    )
    # Number of concurrent requests issued during global search.
    concurrency: int = Field(
        description="The number of concurrent requests.",
        default=defs.GLOBAL_SEARCH_CONCURRENCY,
    )