├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── feature_request.yml │ └── general_issue.yml ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── gh-pages.yml │ ├── issues-autoresolve.yml │ ├── python-ci.yml │ ├── python-integration-tests.yml │ ├── python-notebook-tests.yml │ ├── python-publish.yml │ ├── python-smoke-tests.yml │ ├── semver.yml │ └── spellcheck.yml ├── .gitignore ├── .semversioner ├── 0.1.0.json ├── 0.2.0.json ├── 0.2.1.json ├── 0.2.2.json ├── 0.3.0.json ├── 0.3.1.json ├── 0.3.2.json ├── 0.3.3.json ├── 0.3.4.json ├── 0.3.5.json ├── 0.3.6.json ├── 0.4.0.json ├── 0.4.1.json ├── 0.5.0.json ├── 0.9.0.json ├── 1.0.0.json ├── 1.0.1.json ├── 1.1.0.json ├── 1.1.1.json ├── 1.1.2.json ├── 1.2.0.json ├── 2.0.0.json ├── 2.1.0.json ├── 2.2.0.json ├── 2.2.1.json ├── 2.3.0.json └── next-release │ └── patch-20250530204951787463.json ├── .vscode ├── extensions.json ├── launch.json └── settings.json ├── .vsts-ci.yml ├── CHANGELOG.md ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DEVELOPING.md ├── LICENSE ├── RAI_TRANSPARENCY.md ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── breaking-changes.md ├── cspell.config.yaml ├── dictionary.txt ├── docs ├── blog_posts.md ├── cli.md ├── config │ ├── env_vars.md │ ├── init.md │ ├── models.md │ ├── overview.md │ └── yaml.md ├── data │ └── operation_dulce │ │ ├── ABOUT.md │ │ ├── Operation Dulce v2 1 1.md │ │ └── dataset.zip ├── developing.md ├── examples_notebooks │ ├── api_overview.ipynb │ ├── drift_search.ipynb │ ├── global_search.ipynb │ ├── global_search_with_dynamic_community_selection.ipynb │ ├── index_migration_to_v1.ipynb │ ├── index_migration_to_v2.ipynb │ ├── inputs │ │ └── operation dulce │ │ │ ├── ABOUT.md │ │ │ ├── Operation Dulce v2 1 1.md │ │ │ ├── communities.parquet │ │ │ ├── community_reports.parquet │ │ │ ├── covariates.parquet │ │ │ ├── documents.parquet │ │ │ ├── entities.parquet │ │ │ ├── lancedb │ │ │ ├── default-community-full_content.lance │ │ │ │ ├── _transactions │ │ │ │ │ ├── 0-2fed1d8b-daac-41b0-a93a-e115cda75be3.txn │ │ │ │ │ ├── 1-61dbb7c2-aec3-4796-b223-941fc7cc93cc.txn │ │ │ │ │ ├── 2-60012692-a153-48f9-8f4e-c479b44cbf3f.txn │ │ │ │ │ └── 3-0d2dc9a1-094f-4220-83c7-6ad6f26fac2b.txn │ │ │ │ ├── _versions │ │ │ │ │ ├── 1.manifest │ │ │ │ │ ├── 2.manifest │ │ │ │ │ ├── 3.manifest │ │ │ │ │ └── 4.manifest │ │ │ │ └── data │ │ │ │ │ ├── 1e7b2d94-ed06-4aa0-b22e-86a71d416bc6.lance │ │ │ │ │ └── 1ed9f301-ce30-46a8-8c0b-9c2a60a3cf43.lance │ │ │ ├── default-entity-description.lance │ │ │ │ ├── _transactions │ │ │ │ │ ├── 0-92c031e5-7558-451e-9d0f-f5514db9616d.txn │ │ │ │ │ ├── 1-7b3cb8d8-3512-4584-a003-91838fed8911.txn │ │ │ │ │ ├── 2-7de627d2-4c57-49e9-bf73-c17a9582ead4.txn │ │ │ │ │ └── 3-9ad29d69-9a69-43a8-8b26-252ea267958d.txn │ │ │ │ ├── _versions │ │ │ │ │ ├── 1.manifest │ │ │ │ │ ├── 2.manifest │ │ │ │ │ ├── 3.manifest │ │ │ │ │ └── 4.manifest │ │ │ │ └── data │ │ │ │ │ ├── a34575c4-5260-457f-bebe-3f40bc0e2ee3.lance │ │ │ │ │ └── eabd7580-86f5-4022-8aa7-fe0aff816d98.lance │ │ │ └── default-text_unit-text.lance │ │ │ │ ├── _transactions │ │ │ │ ├── 0-fd0434ac-e5cd-4ddd-9dd5-e5048d4edb59.txn │ │ │ │ ├── 1-14bb4b1d-cc00-420b-9b14-3626f0bd8c0b.txn │ │ │ │ ├── 2-8e74264c-f72d-44f5-a6f4-b3b61ae6a43b.txn │ │ │ │ └── 3-7516fb71-9db3-4666-bdef-ea04c1eb9697.txn │ │ │ │ ├── _versions │ │ │ │ ├── 1.manifest │ │ │ │ ├── 2.manifest │ │ │ │ ├── 3.manifest │ │ │ │ └── 4.manifest │ │ │ │ └── data │ │ │ │ ├── 2794bf5b-de3d-4202-ab16-e76bc27c8e6a.lance │ │ │ │ └── 
2f74c8e8-3f35-4209-889c-a13cf0780eb3.lance │ │ │ ├── relationships.parquet │ │ │ └── text_units.parquet │ ├── local_search.ipynb │ └── multi_index_search.ipynb ├── get_started.md ├── img │ ├── GraphRag-Figure1.jpg │ ├── auto-tune-diagram.png │ ├── drift-search-diagram.png │ ├── pipeline-running.png │ └── viz_guide │ │ ├── gephi-appearance-pane.png │ │ ├── gephi-initial-graph-example.png │ │ ├── gephi-layout-forceatlas2-pane.png │ │ ├── gephi-layout-pane.png │ │ └── gephi-network-overview-settings.png ├── index.md ├── index │ ├── architecture.md │ ├── byog.md │ ├── default_dataflow.md │ ├── inputs.md │ ├── methods.md │ ├── outputs.md │ └── overview.md ├── prompt_tuning │ ├── auto_prompt_tuning.md │ ├── manual_prompt_tuning.md │ └── overview.md ├── query │ ├── drift_search.md │ ├── global_search.md │ ├── local_search.md │ ├── multi_index_search.md │ ├── notebooks │ │ └── overview.md │ ├── overview.md │ └── question_generation.md ├── scripts │ └── create_cookie_banner.js ├── stylesheets │ └── extra.css └── visualization_guide.md ├── examples_notebooks ├── community_contrib │ ├── README.md │ ├── neo4j │ │ └── graphrag_import_neo4j_cypher.ipynb │ └── yfiles-jupyter-graphs │ │ └── graph-visualization.ipynb └── inputs │ └── operation dulce │ └── lancedb │ └── entity_description_embeddings.lance │ ├── _latest.manifest │ ├── _transactions │ ├── 0-498c6e24-dd0a-42b9-8f7e-5e3d2ab258b0.txn │ └── 1-bf5aa024-a229-461f-8d78-699841a302fe.txn │ ├── _versions │ ├── 1.manifest │ └── 2.manifest │ └── data │ └── fe64774f-5412-4c9c-8dea-f6ed55c81119.lance ├── graphrag ├── __init__.py ├── __main__.py ├── api │ ├── __init__.py │ ├── index.py │ ├── prompt_tune.py │ └── query.py ├── cache │ ├── __init__.py │ ├── factory.py │ ├── json_pipeline_cache.py │ ├── memory_pipeline_cache.py │ ├── noop_pipeline_cache.py │ └── pipeline_cache.py ├── callbacks │ ├── __init__.py │ ├── blob_workflow_callbacks.py │ ├── console_workflow_callbacks.py │ ├── file_workflow_callbacks.py │ ├── llm_callbacks.py │ ├── noop_query_callbacks.py │ ├── noop_workflow_callbacks.py │ ├── progress_workflow_callbacks.py │ ├── query_callbacks.py │ ├── reporting.py │ ├── workflow_callbacks.py │ └── workflow_callbacks_manager.py ├── cli │ ├── __init__.py │ ├── index.py │ ├── initialize.py │ ├── main.py │ ├── prompt_tune.py │ └── query.py ├── config │ ├── __init__.py │ ├── create_graphrag_config.py │ ├── defaults.py │ ├── embeddings.py │ ├── enums.py │ ├── environment_reader.py │ ├── errors.py │ ├── get_embedding_settings.py │ ├── init_content.py │ ├── load_config.py │ ├── logging.py │ ├── models │ │ ├── __init__.py │ │ ├── basic_search_config.py │ │ ├── cache_config.py │ │ ├── chunking_config.py │ │ ├── cluster_graph_config.py │ │ ├── community_reports_config.py │ │ ├── drift_search_config.py │ │ ├── embed_graph_config.py │ │ ├── extract_claims_config.py │ │ ├── extract_graph_config.py │ │ ├── extract_graph_nlp_config.py │ │ ├── global_search_config.py │ │ ├── graph_rag_config.py │ │ ├── input_config.py │ │ ├── language_model_config.py │ │ ├── local_search_config.py │ │ ├── output_config.py │ │ ├── prune_graph_config.py │ │ ├── reporting_config.py │ │ ├── snapshots_config.py │ │ ├── summarize_descriptions_config.py │ │ ├── text_embedding_config.py │ │ ├── umap_config.py │ │ └── vector_store_config.py │ └── read_dotenv.py ├── data_model │ ├── __init__.py │ ├── community.py │ ├── community_report.py │ ├── covariate.py │ ├── document.py │ ├── entity.py │ ├── identified.py │ ├── named.py │ ├── relationship.py │ ├── schemas.py │ ├── text_unit.py │ └── 
types.py ├── index │ ├── __init__.py │ ├── input │ │ ├── __init__.py │ │ ├── csv.py │ │ ├── factory.py │ │ ├── json.py │ │ ├── text.py │ │ └── util.py │ ├── operations │ │ ├── __init__.py │ │ ├── build_noun_graph │ │ │ ├── __init__.py │ │ │ ├── build_noun_graph.py │ │ │ └── np_extractors │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── cfg_extractor.py │ │ │ │ ├── factory.py │ │ │ │ ├── np_validator.py │ │ │ │ ├── regex_extractor.py │ │ │ │ ├── resource_loader.py │ │ │ │ ├── stop_words.py │ │ │ │ └── syntactic_parsing_extractor.py │ │ ├── chunk_text │ │ │ ├── __init__.py │ │ │ ├── bootstrap.py │ │ │ ├── chunk_text.py │ │ │ ├── strategies.py │ │ │ └── typing.py │ │ ├── cluster_graph.py │ │ ├── compute_degree.py │ │ ├── compute_edge_combined_degree.py │ │ ├── create_graph.py │ │ ├── embed_graph │ │ │ ├── __init__.py │ │ │ ├── embed_graph.py │ │ │ ├── embed_node2vec.py │ │ │ └── typing.py │ │ ├── embed_text │ │ │ ├── __init__.py │ │ │ ├── embed_text.py │ │ │ └── strategies │ │ │ │ ├── __init__.py │ │ │ │ ├── mock.py │ │ │ │ ├── openai.py │ │ │ │ └── typing.py │ │ ├── extract_covariates │ │ │ ├── __init__.py │ │ │ ├── claim_extractor.py │ │ │ ├── extract_covariates.py │ │ │ └── typing.py │ │ ├── extract_graph │ │ │ ├── __init__.py │ │ │ ├── extract_graph.py │ │ │ ├── graph_extractor.py │ │ │ ├── graph_intelligence_strategy.py │ │ │ └── typing.py │ │ ├── finalize_community_reports.py │ │ ├── finalize_entities.py │ │ ├── finalize_relationships.py │ │ ├── graph_to_dataframes.py │ │ ├── layout_graph │ │ │ ├── __init__.py │ │ │ ├── layout_graph.py │ │ │ ├── typing.py │ │ │ ├── umap.py │ │ │ └── zero.py │ │ ├── prune_graph.py │ │ ├── snapshot_graphml.py │ │ ├── summarize_communities │ │ │ ├── __init__.py │ │ │ ├── build_mixed_context.py │ │ │ ├── community_reports_extractor.py │ │ │ ├── explode_communities.py │ │ │ ├── graph_context │ │ │ │ ├── __init__.py │ │ │ │ ├── context_builder.py │ │ │ │ └── sort_context.py │ │ │ ├── strategies.py │ │ │ ├── summarize_communities.py │ │ │ ├── text_unit_context │ │ │ │ ├── __init__.py │ │ │ │ ├── context_builder.py │ │ │ │ ├── prep_text_units.py │ │ │ │ └── sort_context.py │ │ │ ├── typing.py │ │ │ └── utils.py │ │ └── summarize_descriptions │ │ │ ├── __init__.py │ │ │ ├── description_summary_extractor.py │ │ │ ├── graph_intelligence_strategy.py │ │ │ ├── summarize_descriptions.py │ │ │ └── typing.py │ ├── run │ │ ├── __init__.py │ │ ├── run_pipeline.py │ │ └── utils.py │ ├── text_splitting │ │ ├── __init__.py │ │ ├── check_token_limit.py │ │ └── text_splitting.py │ ├── typing │ │ ├── __init__.py │ │ ├── context.py │ │ ├── error_handler.py │ │ ├── pipeline.py │ │ ├── pipeline_run_result.py │ │ ├── state.py │ │ ├── stats.py │ │ └── workflow.py │ ├── update │ │ ├── __init__.py │ │ ├── communities.py │ │ ├── entities.py │ │ ├── incremental_index.py │ │ └── relationships.py │ ├── utils │ │ ├── __init__.py │ │ ├── dataframes.py │ │ ├── derive_from_rows.py │ │ ├── dicts.py │ │ ├── graphs.py │ │ ├── hashing.py │ │ ├── is_null.py │ │ ├── rate_limiter.py │ │ ├── stable_lcc.py │ │ ├── string.py │ │ ├── tokens.py │ │ └── uuid.py │ ├── validate_config.py │ └── workflows │ │ ├── __init__.py │ │ ├── create_base_text_units.py │ │ ├── create_communities.py │ │ ├── create_community_reports.py │ │ ├── create_community_reports_text.py │ │ ├── create_final_documents.py │ │ ├── create_final_text_units.py │ │ ├── extract_covariates.py │ │ ├── extract_graph.py │ │ ├── extract_graph_nlp.py │ │ ├── factory.py │ │ ├── finalize_graph.py │ │ ├── generate_text_embeddings.py │ │ ├── 
prune_graph.py │ │ ├── update_clean_state.py │ │ ├── update_communities.py │ │ ├── update_community_reports.py │ │ ├── update_covariates.py │ │ ├── update_entities_relationships.py │ │ ├── update_final_documents.py │ │ ├── update_text_embeddings.py │ │ └── update_text_units.py ├── language_model │ ├── __init__.py │ ├── cache │ │ ├── __init__.py │ │ └── base.py │ ├── events │ │ ├── __init__.py │ │ └── base.py │ ├── factory.py │ ├── manager.py │ ├── protocol │ │ ├── __init__.py │ │ └── base.py │ ├── providers │ │ ├── __init__.py │ │ └── fnllm │ │ │ ├── __init__.py │ │ │ ├── cache.py │ │ │ ├── events.py │ │ │ ├── models.py │ │ │ └── utils.py │ └── response │ │ ├── __init__.py │ │ ├── base.py │ │ └── base.pyi ├── logger │ ├── __init__.py │ ├── base.py │ ├── console.py │ ├── factory.py │ ├── null_progress.py │ ├── print_progress.py │ ├── progress.py │ ├── rich_progress.py │ └── types.py ├── prompt_tune │ ├── __init__.py │ ├── defaults.py │ ├── generator │ │ ├── __init__.py │ │ ├── community_report_rating.py │ │ ├── community_report_summarization.py │ │ ├── community_reporter_role.py │ │ ├── domain.py │ │ ├── entity_relationship.py │ │ ├── entity_summarization_prompt.py │ │ ├── entity_types.py │ │ ├── extract_graph_prompt.py │ │ ├── language.py │ │ └── persona.py │ ├── loader │ │ ├── __init__.py │ │ └── input.py │ ├── prompt │ │ ├── __init__.py │ │ ├── community_report_rating.py │ │ ├── community_reporter_role.py │ │ ├── domain.py │ │ ├── entity_relationship.py │ │ ├── entity_types.py │ │ ├── language.py │ │ └── persona.py │ ├── template │ │ ├── __init__.py │ │ ├── community_report_summarization.py │ │ ├── entity_summarization.py │ │ └── extract_graph.py │ └── types.py ├── prompts │ ├── __init__.py │ ├── index │ │ ├── __init__.py │ │ ├── community_report.py │ │ ├── community_report_text_units.py │ │ ├── extract_claims.py │ │ ├── extract_graph.py │ │ └── summarize_descriptions.py │ └── query │ │ ├── __init__.py │ │ ├── basic_search_system_prompt.py │ │ ├── drift_search_system_prompt.py │ │ ├── global_search_knowledge_system_prompt.py │ │ ├── global_search_map_system_prompt.py │ │ ├── global_search_reduce_system_prompt.py │ │ ├── local_search_system_prompt.py │ │ └── question_gen_system_prompt.py ├── py.typed ├── query │ ├── __init__.py │ ├── context_builder │ │ ├── __init__.py │ │ ├── builders.py │ │ ├── community_context.py │ │ ├── conversation_history.py │ │ ├── dynamic_community_selection.py │ │ ├── entity_extraction.py │ │ ├── local_context.py │ │ ├── rate_prompt.py │ │ ├── rate_relevancy.py │ │ └── source_context.py │ ├── factory.py │ ├── indexer_adapters.py │ ├── input │ │ ├── __init__.py │ │ ├── loaders │ │ │ ├── __init__.py │ │ │ ├── dfs.py │ │ │ └── utils.py │ │ └── retrieval │ │ │ ├── __init__.py │ │ │ ├── community_reports.py │ │ │ ├── covariates.py │ │ │ ├── entities.py │ │ │ ├── relationships.py │ │ │ └── text_units.py │ ├── llm │ │ ├── __init__.py │ │ └── text_utils.py │ ├── question_gen │ │ ├── __init__.py │ │ ├── base.py │ │ └── local_gen.py │ └── structured_search │ │ ├── __init__.py │ │ ├── base.py │ │ ├── basic_search │ │ ├── __init__.py │ │ ├── basic_context.py │ │ └── search.py │ │ ├── drift_search │ │ ├── __init__.py │ │ ├── action.py │ │ ├── drift_context.py │ │ ├── primer.py │ │ ├── search.py │ │ └── state.py │ │ ├── global_search │ │ ├── __init__.py │ │ ├── community_context.py │ │ └── search.py │ │ └── local_search │ │ ├── __init__.py │ │ ├── mixed_context.py │ │ └── search.py ├── storage │ ├── __init__.py │ ├── blob_pipeline_storage.py │ ├── cosmosdb_pipeline_storage.py 
│ ├── factory.py │ ├── file_pipeline_storage.py │ ├── memory_pipeline_storage.py │ └── pipeline_storage.py ├── utils │ ├── __init__.py │ ├── api.py │ ├── cli.py │ └── storage.py └── vector_stores │ ├── __init__.py │ ├── azure_ai_search.py │ ├── base.py │ ├── cosmosdb.py │ ├── factory.py │ └── lancedb.py ├── mkdocs.yaml ├── poetry.lock ├── pyproject.toml ├── scripts ├── semver-check.sh ├── spellcheck.sh └── start-azurite.sh ├── tests ├── __init__.py ├── conftest.py ├── fixtures │ ├── azure │ │ ├── config.json │ │ ├── input │ │ │ ├── ABOUT.md │ │ │ └── dulce.txt │ │ └── settings.yml │ ├── min-csv │ │ ├── config.json │ │ ├── input │ │ │ ├── ABOUT.md │ │ │ ├── dulce.csv │ │ │ └── dulce.txt │ │ └── settings.yml │ └── text │ │ ├── config.json │ │ ├── input │ │ ├── ABOUT.md │ │ └── dulce.txt │ │ ├── prompts │ │ └── community_report.txt │ │ └── settings.yml ├── integration │ ├── __init__.py │ ├── language_model │ │ ├── __init__.py │ │ └── test_factory.py │ ├── storage │ │ ├── __init__.py │ │ ├── test_blob_pipeline_storage.py │ │ ├── test_cosmosdb_storage.py │ │ ├── test_factory.py │ │ └── test_file_pipeline_storage.py │ └── vector_stores │ │ ├── __init__.py │ │ ├── test_azure_ai_search.py │ │ ├── test_cosmosdb.py │ │ └── test_lancedb.py ├── mock_provider.py ├── notebook │ ├── __init__.py │ └── test_notebooks.py ├── smoke │ ├── __init__.py │ └── test_fixtures.py ├── unit │ ├── __init__.py │ ├── config │ │ ├── __init__.py │ │ ├── fixtures │ │ │ ├── minimal_config │ │ │ │ └── settings.yaml │ │ │ ├── minimal_config_missing_env_var │ │ │ │ └── settings.yaml │ │ │ └── timestamp_dirs │ │ │ │ └── 20240812-120000 │ │ │ │ └── empty.txt │ │ ├── prompt-a.txt │ │ ├── prompt-b.txt │ │ ├── prompt-c.txt │ │ ├── prompt-d.txt │ │ ├── test_config.py │ │ └── utils.py │ ├── indexing │ │ ├── __init__.py │ │ ├── cache │ │ │ ├── __init__.py │ │ │ └── test_file_pipeline_cache.py │ │ ├── graph │ │ │ ├── __init__.py │ │ │ ├── extractors │ │ │ │ ├── __init__.py │ │ │ │ └── community_reports │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_sort_context.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ └── test_stable_lcc.py │ │ ├── input │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ ├── multiple-csvs │ │ │ │ │ ├── input1.csv │ │ │ │ │ ├── input2.csv │ │ │ │ │ └── input3.csv │ │ │ │ ├── multiple-jsons │ │ │ │ │ ├── input1.json │ │ │ │ │ └── input2.json │ │ │ │ ├── multiple-txts │ │ │ │ │ ├── input1.txt │ │ │ │ │ └── input2.txt │ │ │ │ ├── one-csv │ │ │ │ │ └── input.csv │ │ │ │ ├── one-json-multiple-objects │ │ │ │ │ └── input.json │ │ │ │ ├── one-json-one-object │ │ │ │ │ └── input.json │ │ │ │ └── one-txt │ │ │ │ │ └── input.txt │ │ │ ├── test_csv_loader.py │ │ │ ├── test_json_loader.py │ │ │ └── test_txt_loader.py │ │ ├── operations │ │ │ ├── __init__.py │ │ │ └── chunk_text │ │ │ │ ├── __init__.py │ │ │ │ ├── test_chunk_text.py │ │ │ │ └── test_strategies.py │ │ ├── test_init_content.py │ │ ├── text_splitting │ │ │ ├── __init__.py │ │ │ └── test_text_splitting.py │ │ └── verbs │ │ │ ├── __init__.py │ │ │ ├── entities │ │ │ ├── __init__.py │ │ │ └── extraction │ │ │ │ ├── __init__.py │ │ │ │ └── strategies │ │ │ │ ├── __init__.py │ │ │ │ └── graph_intelligence │ │ │ │ ├── __init__.py │ │ │ │ └── test_gi_entity_extraction.py │ │ │ └── helpers │ │ │ ├── __init__.py │ │ │ └── mock_llm.py │ ├── query │ │ ├── __init__.py │ │ ├── context_builder │ │ │ ├── __init__.py │ │ │ └── test_entity_extraction.py │ │ ├── data │ │ │ ├── defaults │ │ │ │ └── output │ │ │ │ │ ├── 20240812-120000 │ │ │ │ │ └── empty.txt │ │ │ │ │ └── 
20240812-121000 │ │ │ │ │ └── empty.txt │ │ │ ├── empty │ │ │ │ └── something-else │ │ │ │ │ └── empty.txt │ │ │ ├── hidden │ │ │ │ └── output │ │ │ │ │ ├── .another │ │ │ │ │ └── empty.txt │ │ │ │ │ ├── .hidden │ │ │ │ │ ├── 20240812-120000 │ │ │ │ │ └── empty.txt │ │ │ │ │ └── 20240812-121000 │ │ │ │ │ └── empty.txt │ │ │ └── non-numeric │ │ │ │ └── output │ │ │ │ ├── 20240812-120000 │ │ │ │ └── empty.txt │ │ │ │ ├── 20240812-121000 │ │ │ │ └── empty.txt │ │ │ │ └── something-else │ │ │ │ └── empty.txt │ │ └── input │ │ │ ├── __init__.py │ │ │ └── retrieval │ │ │ ├── __init__.py │ │ │ └── test_entities.py │ └── utils │ │ ├── __init__.py │ │ └── test_embeddings.py └── verbs │ ├── __init__.py │ ├── data │ ├── communities.parquet │ ├── community_reports.parquet │ ├── covariates.parquet │ ├── documents.parquet │ ├── entities.parquet │ ├── relationships.parquet │ ├── text_units.parquet │ ├── text_units_metadata.parquet │ └── text_units_metadata_included_chunk.parquet │ ├── test_create_base_text_units.py │ ├── test_create_communities.py │ ├── test_create_community_reports.py │ ├── test_create_final_documents.py │ ├── test_create_final_text_units.py │ ├── test_extract_covariates.py │ ├── test_extract_graph.py │ ├── test_extract_graph_nlp.py │ ├── test_finalize_graph.py │ ├── test_generate_text_embeddings.py │ ├── test_pipeline_state.py │ ├── test_prune_graph.py │ └── util.py └── unified-search-app ├── .vsts-ci.yml ├── Dockerfile ├── README.md ├── app ├── __init__.py ├── app_logic.py ├── data_config.py ├── home_page.py ├── knowledge_loader │ ├── __init__.py │ ├── data_prep.py │ ├── data_sources │ │ ├── __init__.py │ │ ├── blob_source.py │ │ ├── default.py │ │ ├── loader.py │ │ ├── local_source.py │ │ └── typing.py │ └── model.py ├── rag │ ├── __init__.py │ └── typing.py ├── state │ ├── __init__.py │ ├── query_variable.py │ ├── session_variable.py │ └── session_variables.py └── ui │ ├── __init__.py │ ├── full_graph.py │ ├── questions_list.py │ ├── report_details.py │ ├── report_list.py │ ├── search.py │ └── sidebar.py ├── images ├── image-1.png ├── image-2.png ├── image-3.png └── image-4.png ├── poetry.lock └── pyproject.toml /.gitattributes: -------------------------------------------------------------------------------- 1 | *.txt text eol=lf 2 | *.md text eol=lf 3 | *.yml text eol=lf 4 | *.html text eol=lf 5 | *.py text eol=lf 6 | *.toml text eol=lf 7 | .gitattributes text eol=lf 8 | .gitignore text eol=lf 9 | *.lock 10 | CODEOWNERS text eol=lf 11 | LICENSE text eol=lf -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | version: 2 6 | updates: 7 | - package-ecosystem: "pip" # See documentation for possible values 8 | directory: "/" # Location of package manifests 9 | schedule: 10 | interval: "weekly" 11 | - package-ecosystem: "github-actions" 12 | # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.) 13 | directory: "/" 14 | schedule: 15 | interval: "weekly" 16 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | ## Description 16 | 17 | [Provide a brief description of the changes made in this pull request.] 18 | 19 | ## Related Issues 20 | 21 | [Reference any related issues or tasks that this pull request addresses.] 22 | 23 | ## Proposed Changes 24 | 25 | [List the specific changes made in this pull request.] 26 | 27 | ## Checklist 28 | 29 | - [ ] I have tested these changes locally. 30 | - [ ] I have reviewed the code changes. 31 | - [ ] I have updated the documentation (if necessary). 32 | - [ ] I have added appropriate unit tests (if applicable). 33 | 34 | ## Additional Notes 35 | 36 | [Add any additional notes or context that may be helpful for the reviewer(s).] 37 | -------------------------------------------------------------------------------- /.github/workflows/gh-pages.yml: -------------------------------------------------------------------------------- 1 | name: gh-pages 2 | on: 3 | push: 4 | branches: [main] 5 | permissions: 6 | contents: write 7 | 8 | env: 9 | POETRY_VERSION: '1.8.3' 10 | PYTHON_VERSION: '3.11' 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | env: 16 | GH_PAGES: 1 17 | DEBUG: 1 18 | GRAPHRAG_API_KEY: ${{ secrets.GRAPHRAG_API_KEY }} 19 | GRAPHRAG_LLM_MODEL: ${{ secrets.GRAPHRAG_LLM_MODEL }} 20 | GRAPHRAG_EMBEDDING_MODEL: ${{ secrets.GRAPHRAG_EMBEDDING_MODEL }} 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | with: 25 | persist-credentials: false 26 | 27 | - name: Set up Python ${{ env.PYTHON_VERSION }} 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: ${{ env.PYTHON_VERSION }} 31 | 32 | - name: Install Poetry ${{ env.POETRY_VERSION }} 33 | uses: abatilo/actions-poetry@v3.0.0 34 | with: 35 | poetry-version: ${{ env.POETRY_VERSION }} 36 | 37 | - name: poetry install 38 | shell: bash 39 | run: poetry install 40 | 41 | - name: mkdocs build 42 | shell: bash 43 | run: poetry run poe build_docs 44 | 45 | - name: List Docsite Contents 46 | run: find site 47 | 48 | - name: Deploy to GitHub Pages 49 | uses: JamesIves/github-pages-deploy-action@v4.6.4 50 | with: 51 | branch: gh-pages 52 | folder: site 53 | clean: true 54 | -------------------------------------------------------------------------------- /.github/workflows/issues-autoresolve.yml: -------------------------------------------------------------------------------- 1 | name: Close inactive issues 2 | on: 3 | schedule: 4 | - cron: "30 1 * * *" 5 | 6 | permissions: 7 | actions: write 8 | issues: write 9 | pull-requests: write 10 | 11 | jobs: 12 | close-issues: 13 | runs-on: ubuntu-latest 14 | permissions: 15 | issues: write 16 | pull-requests: write 17 | steps: 18 | - uses: actions/stale@v9 19 | with: 20 |
days-before-issue-stale: 7 21 | days-before-issue-close: 5 22 | stale-issue-label: "stale" 23 | close-issue-label: "autoresolved" 24 | stale-issue-message: "This issue has been marked stale due to inactivity after repo maintainer or community member responses that request more information or suggest a solution. It will be closed after five additional days." 25 | close-issue-message: "This issue has been closed after being marked as stale for five days. Please reopen if needed." 26 | any-of-labels: "awaiting_response" 27 | days-before-pr-stale: -1 28 | days-before-pr-close: -1 29 | repo-token: ${{ secrets.GITHUB_TOKEN }} 30 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Python Publish (pypi) 2 | on: 3 | release: 4 | types: [created] 5 | push: 6 | branches: [main] 7 | 8 | env: 9 | POETRY_VERSION: "1.8.3" 10 | PYTHON_VERSION: "3.10" 11 | 12 | jobs: 13 | publish: 14 | name: Upload release to PyPI 15 | if: github.ref == 'refs/heads/main' 16 | runs-on: ubuntu-latest 17 | environment: 18 | name: pypi 19 | url: https://pypi.org/p/graphrag 20 | permissions: 21 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | with: 26 | fetch-depth: 0 27 | fetch-tags: true 28 | 29 | - name: Set up Python 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: ${{ env.PYTHON_VERSION }} 33 | 34 | - name: Install Poetry 35 | uses: abatilo/actions-poetry@v3.0.0 36 | with: 37 | poetry-version: ${{ env.POETRY_VERSION }} 38 | 39 | - name: Install dependencies 40 | shell: bash 41 | run: poetry install 42 | 43 | - name: Export Publication Version 44 | run: echo "version=`poetry version --short`" >> $GITHUB_OUTPUT 45 | 46 | - name: Build Distributable 47 | shell: bash 48 | run: poetry build 49 | 50 | - name: Publish package distributions to PyPI 51 | uses: pypa/gh-action-pypi-publish@release/v1 52 | with: 53 | packages-dir: dist 54 | skip-existing: true 55 | verbose: true 56 | -------------------------------------------------------------------------------- /.github/workflows/semver.yml: -------------------------------------------------------------------------------- 1 | name: Semver Check 2 | on: 3 | pull_request: 4 | types: 5 | - opened 6 | - reopened 7 | - synchronize 8 | - ready_for_review 9 | branches: [main] 10 | 11 | jobs: 12 | semver: 13 | # skip draft PRs 14 | if: github.event.pull_request.draft == false 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | fetch-depth: 0 20 | - name: Check Semver 21 | run: ./scripts/semver-check.sh -------------------------------------------------------------------------------- /.github/workflows/spellcheck.yml: -------------------------------------------------------------------------------- 1 | name: Spellcheck 2 | on: 3 | push: 4 | branches: [main] 5 | pull_request: 6 | types: 7 | - opened 8 | - reopened 9 | - synchronize 10 | - ready_for_review 11 | paths: 12 | - "**/*" 13 | jobs: 14 | spellcheck: 15 | # skip draft PRs 16 | if: github.event.pull_request.draft == false 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Spellcheck 22 | run: ./scripts/spellcheck.sh 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # 
Python Artifacts 2 | python/*/lib/ 3 | dist/ 4 | 5 | # Test Output 6 | .coverage 7 | coverage/ 8 | licenses.txt 9 | examples_notebooks/*/data 10 | tests/fixtures/cache 11 | tests/fixtures/*/cache 12 | tests/fixtures/*/output 13 | output/lancedb 14 | 15 | 16 | # Random 17 | .DS_Store 18 | *.log* 19 | .venv 20 | venv/ 21 | .conda 22 | .tmp 23 | 24 | .env 25 | build.zip 26 | 27 | .turbo 28 | 29 | __pycache__ 30 | 31 | .pipeline 32 | 33 | # Azurite 34 | temp_azurite/ 35 | __azurite*.json 36 | __blobstorage*.json 37 | __blobstorage__/ 38 | 39 | # Getting started example 40 | ragtest/ 41 | .ragtest/ 42 | .pipelines 43 | .pipeline 44 | 45 | 46 | # mkdocs 47 | site/ 48 | 49 | # Docs migration 50 | docsite/ 51 | .yarn/ 52 | .pnp* 53 | 54 | # PyCharm 55 | .idea/ 56 | 57 | # Jupyter notebook 58 | .ipynb_checkpoints/ 59 | -------------------------------------------------------------------------------- /.semversioner/0.1.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Initial Release", 5 | "type": "minor" 6 | } 7 | ], 8 | "created_at": "2024-07-01T21:48:50+00:00", 9 | "version": "0.1.0" 10 | } -------------------------------------------------------------------------------- /.semversioner/0.2.2.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add a check if there is no community record added in local search context", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Add separate workflow for Python Tests", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Docs updates", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Run smoke tests on 4o", 17 | "type": "patch" 18 | } 19 | ], 20 | "created_at": "2024-08-08T22:40:57+00:00", 21 | "version": "0.2.2" 22 | } -------------------------------------------------------------------------------- /.semversioner/0.3.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Implement auto templating API.", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "Implement query engine API.", 9 | "type": "minor" 10 | }, 11 | { 12 | "description": "Fix file dumps using json for non ASCII chars", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Stabilize smoke tests for query context building", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "fix query embedding", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "fix sort_context & max_tokens params in verb", 25 | "type": "patch" 26 | } 27 | ], 28 | "created_at": "2024-08-12T23:51:49+00:00", 29 | "version": "0.3.0" 30 | } -------------------------------------------------------------------------------- /.semversioner/0.3.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add preflight check to check LLM connectivity.", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Add streaming support for local/global search to query cli", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Add support for both float and int on schema validation for community report generation", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Avoid running index on gh-pages publishing", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Implement Index API", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "Improves filtering for data dir
inferring", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "Update to nltk 3.9.1", 29 | "type": "patch" 30 | } 31 | ], 32 | "created_at": "2024-08-21T22:46:19+00:00", 33 | "version": "0.3.1" 34 | } -------------------------------------------------------------------------------- /.semversioner/0.3.2.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add context data to query API responses.", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Add missing config parameter documentation for prompt tuning", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Add neo4j community notebook", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Ensure entity types to be str when running prompt tuning", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Fix weight casting during graph extraction", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "Patch \"past\" dependency issues", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "Update developer guide.", 29 | "type": "patch" 30 | }, 31 | { 32 | "description": "Update query type hints.", 33 | "type": "patch" 34 | }, 35 | { 36 | "description": "change-lancedb-placement", 37 | "type": "patch" 38 | } 39 | ], 40 | "created_at": "2024-08-26T23:43:01+00:00", 41 | "version": "0.3.2" 42 | } -------------------------------------------------------------------------------- /.semversioner/0.3.4.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Deep copy txt units on local search to avoid race conditions", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Fix summarization including empty descriptions", 9 | "type": "patch" 10 | } 11 | ], 12 | "created_at": "2024-09-11T22:31:58+00:00", 13 | "version": "0.3.4" 14 | } -------------------------------------------------------------------------------- /.semversioner/0.3.5.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add compound verbs with tests infra.", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Collapse create_final_communities.", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Collapse create_final_text_units.", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Covariate verb collapse.", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Fix duplicates in community context builder", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "Fix prompt tune output path", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "Fix seed hardcoded init", 29 | "type": "patch" 30 | }, 31 | { 32 | "description": "Fix seeded random gen on clustering", 33 | "type": "patch" 34 | }, 35 | { 36 | "description": "Improve logging.", 37 | "type": "patch" 38 | }, 39 | { 40 | "description": "Set default values for cli parameters.", 41 | "type": "patch" 42 | }, 43 | { 44 | "description": "Use static output directories.", 45 | "type": "patch" 46 | } 47 | ], 48 | "created_at": "2024-09-19T15:26:01+00:00", 49 | "version": "0.3.5" 50 | } -------------------------------------------------------------------------------- /.semversioner/0.3.6.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Collapse create_final_relationships.", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Dependency update and cleanup", 9 | "type": 
"patch" 10 | } 11 | ], 12 | "created_at": "2024-09-20T00:09:13+00:00", 13 | "version": "0.3.6" 14 | } -------------------------------------------------------------------------------- /.semversioner/0.4.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add update cli entrypoint for incremental indexing", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Allow some CI/CD jobs to skip PRs dedicated to doc updates only.", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Fix a file paths issue in the viz guide.", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Fix optional covariates update in incremental indexing", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Raise error on empty deltas for inc indexing", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "add visualization guide to doc site", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "fix streaming output error", 29 | "type": "patch" 30 | } 31 | ], 32 | "created_at": "2024-11-08T23:13:05+00:00", 33 | "version": "0.4.1" 34 | } -------------------------------------------------------------------------------- /.semversioner/0.5.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Data model changes.", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "Add Parquet as part of the default emitters when not pressent", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Centralized prompts and export all for easier injection.", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Cleanup of artifact outputs/schemas.", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Config and docs updates.", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "Implement dynamic community selection to global search", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "fix autocompletion of existing files/directory paths.", 29 | "type": "patch" 30 | }, 31 | { 32 | "description": "move import statements out of init files", 33 | "type": "patch" 34 | } 35 | ], 36 | "created_at": "2024-11-16T00:43:06+00:00", 37 | "version": "0.5.0" 38 | } -------------------------------------------------------------------------------- /.semversioner/0.9.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Refactor graph creation.", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "Dependency updates", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Fix Global Search with dynamic Community selection bug", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Fix question gen.", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Optimize Final Community Reports calculation and stabilize cache", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "miscellaneous code cleanup and minor changes for better alignment of style across the codebase.", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "replace llm package with fnllm", 29 | "type": "patch" 30 | }, 31 | { 32 | "description": "replaced md5 hash with sha256", 33 | "type": "patch" 34 | }, 35 | { 36 | "description": "replaced md5 hash with sha512", 37 | "type": "patch" 38 | }, 39 | { 40 | "description": "update API and add a demonstration notebook", 41 | "type": "patch" 42 | } 43 | ], 44 | "created_at": "2024-12-06T20:12:30+00:00", 45 | "version": "0.9.0" 46 | } 
-------------------------------------------------------------------------------- /.semversioner/1.0.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add Parent id to communities data model", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Add migration notebook.", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Create separate community workflow, collapse subflows.", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Dependency Updates", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "cleanup and refactor factory classes.", 21 | "type": "patch" 22 | } 23 | ], 24 | "created_at": "2024-12-11T21:41:49+00:00", 25 | "version": "1.0.0" 26 | } -------------------------------------------------------------------------------- /.semversioner/1.0.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Fix encoding model config parsing", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Fix exception on error callbacks", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Manage llm instances inside a cached singleton. Check for empty dfs after entity/relationship extraction", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Respect encoding_model option", 17 | "type": "patch" 18 | } 19 | ], 20 | "created_at": "2024-12-18T23:12:52+00:00", 21 | "version": "1.0.1" 22 | } -------------------------------------------------------------------------------- /.semversioner/1.1.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Make gleanings independent of encoding", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "Remove DataShaper (first steps).", 9 | "type": "minor" 10 | }, 11 | { 12 | "description": "Remove old pipeline runner.", 13 | "type": "minor" 14 | }, 15 | { 16 | "description": "new search implemented as a new option for the api", 17 | "type": "minor" 18 | }, 19 | { 20 | "description": "Fix gleanings loop check", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "Implement cosmosdb storage option for cache and output", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "Move extractor code to co-locate with operations.", 29 | "type": "patch" 30 | }, 31 | { 32 | "description": "Remove config input models.", 33 | "type": "patch" 34 | }, 35 | { 36 | "description": "Ruff update", 37 | "type": "patch" 38 | }, 39 | { 40 | "description": "Simplify and streamline internal config.", 41 | "type": "patch" 42 | }, 43 | { 44 | "description": "Simplify callbacks model.", 45 | "type": "patch" 46 | }, 47 | { 48 | "description": "Streamline flows.", 49 | "type": "patch" 50 | }, 51 | { 52 | "description": "fix instantiation of storage classes.", 53 | "type": "patch" 54 | } 55 | ], 56 | "created_at": "2025-01-07T20:25:57+00:00", 57 | "version": "1.1.0" 58 | } -------------------------------------------------------------------------------- /.semversioner/1.1.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Fix a bug on creating community hierarchy for dynamic search", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Increase LOCAL_SEARCH_COMMUNITY_PROP to 15%", 9 | "type": "patch" 10 | } 11 | ], 12 | "created_at": "2025-01-08T21:53:16+00:00", 13 | "version": "1.1.1" 14 | } 
-------------------------------------------------------------------------------- /.semversioner/1.1.2.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Basic Rag minor fix", 5 | "type": "patch" 6 | } 7 | ], 8 | "created_at": "2025-01-09T22:29:23+00:00", 9 | "version": "1.1.2" 10 | } -------------------------------------------------------------------------------- /.semversioner/1.2.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add Drift Reduce response and streaming endpoint", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "add cosmosdb vector store", 9 | "type": "minor" 10 | }, 11 | { 12 | "description": "Fix example notebooks", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Set default rate limits.", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "unit tests for text_splitting", 21 | "type": "patch" 22 | } 23 | ], 24 | "created_at": "2025-01-15T20:32:00+00:00", 25 | "version": "1.2.0" 26 | } -------------------------------------------------------------------------------- /.semversioner/2.1.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add support for JSON input files.", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "Updated the prompt tuning client to support csv-metadata injection and updated output file types to match the new naming convention.", 9 | "type": "minor" 10 | }, 11 | { 12 | "description": "Add check for custom model types while config loading", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Adds general-purpose pipeline run state object.", 17 | "type": "patch" 18 | } 19 | ], 20 | "created_at": "2025-03-11T23:53:00+00:00", 21 | "version": "2.1.0" 22 | } -------------------------------------------------------------------------------- /.semversioner/2.2.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Support OpenAI reasoning models.", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "Add option to snapshot raw extracted graph tables.", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Added batching logic to the prompt tuning autoselection embeddings workflow", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Align config classes and docs better.", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Align embeddings table loading with configured fields.", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "Brings parity with our latest NLP extraction approaches.", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "Fix fnllm to 0.2.3", 29 | "type": "patch" 30 | }, 31 | { 32 | "description": "Fixes to basic search.", 33 | "type": "patch" 34 | }, 35 | { 36 | "description": "Update llm args for consistency.", 37 | "type": "patch" 38 | }, 39 | { 40 | "description": "add vector store integration tests", 41 | "type": "patch" 42 | } 43 | ], 44 | "created_at": "2025-04-25T23:30:57+00:00", 45 | "version": "2.2.0" 46 | } -------------------------------------------------------------------------------- /.semversioner/2.2.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Fix Community Report prompt tuning response", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Fix graph
creation missing edge weights.", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Update as workflows", 13 | "type": "patch" 14 | } 15 | ], 16 | "created_at": "2025-04-30T23:50:31+00:00", 17 | "version": "2.2.1" 18 | } -------------------------------------------------------------------------------- /.semversioner/2.3.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Remove Dynamic Max Retries support. Refactor typer typing in cli interface", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "Update fnllm to latest. Update default graphrag configuration", 9 | "type": "minor" 10 | }, 11 | { 12 | "description": "A few fixes and enhancements for better reuse and flow.", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Add full llm response to LLM PRovider output", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Fix Drift Reduce Response for non streaming calls", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "Fix global search prompt to include missing formatting key", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "Upgrade pyarrow dependency to >=17.0.0 to fix CVE-2024-52338", 29 | "type": "patch" 30 | } 31 | ], 32 | "created_at": "2025-05-23T21:02:47+00:00", 33 | "version": "2.3.0" 34 | } -------------------------------------------------------------------------------- /.semversioner/next-release/patch-20250530204951787463.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "patch", 3 | "description": "Update typer." 4 | } 5 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "arcanis.vscode-zipfs", 4 | "ms-python.python", 5 | "charliermarsh.ruff", 6 | "ms-python.vscode-pylance", 7 | "bierner.markdown-mermaid", 8 | "streetsidesoftware.code-spell-checker", 9 | "ronnidc.nunjucks", 10 | "lucien-martijn.parquet-visualizer", 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "_comment": "Use this file to configure the graphrag project for debugging. 
You may create other configuration profiles based on these or select one below to use.", 3 | "version": "0.2.0", 4 | "configurations": [ 5 | { 6 | "name": "Indexer", 7 | "type": "debugpy", 8 | "request": "launch", 9 | "module": "poetry", 10 | "args": [ 11 | "poe", "index", 12 | "--root", "" 13 | ], 14 | }, 15 | { 16 | "name": "Query", 17 | "type": "debugpy", 18 | "request": "launch", 19 | "module": "poetry", 20 | "args": [ 21 | "poe", "query", 22 | "--root", "", 23 | "--method", "global", 24 | "--query", "What are the top themes in this story", 25 | ] 26 | }, 27 | { 28 | "name": "Prompt Tuning", 29 | "type": "debugpy", 30 | "request": "launch", 31 | "module": "poetry", 32 | "args": [ 33 | "poe", "prompt-tune", 34 | "--config", 35 | "/settings.yaml", 36 | ] 37 | } 38 | ] 39 | } -------------------------------------------------------------------------------- /.vsts-ci.yml: -------------------------------------------------------------------------------- 1 | name: GraphRAG CI 2 | pool: 3 | vmImage: ubuntu-latest 4 | 5 | trigger: 6 | batch: true 7 | branches: 8 | include: 9 | - main 10 | 11 | variables: 12 | isMain: $[eq(variables['Build.SourceBranch'], 'refs/heads/main')] 13 | pythonVersion: "3.10" 14 | poetryVersion: "1.6.1" 15 | nodeVersion: "18.x" 16 | artifactsFullFeedName: "Resilience/resilience_python" 17 | 18 | stages: 19 | - stage: Compliance 20 | dependsOn: [] 21 | jobs: 22 | - job: compliance 23 | displayName: Compliance 24 | pool: 25 | vmImage: windows-latest 26 | steps: 27 | - task: CredScan@3 28 | inputs: 29 | outputFormat: sarif 30 | debugMode: false 31 | 32 | - task: ComponentGovernanceComponentDetection@0 33 | inputs: 34 | scanType: "Register" 35 | verbosity: "Verbose" 36 | alertWarningLevel: "High" 37 | 38 | - task: PublishSecurityAnalysisLogs@3 39 | inputs: 40 | ArtifactName: "CodeAnalysisLogs" 41 | ArtifactType: "Container" -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. Unless a later match takes precedence, 3 | # @global-owner1 and @global-owner2 will be requested for 4 | # review when someone opens a pull request. 5 | * @microsoft/societal-resilience @microsoft/graphrag-core-team 6 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new Issue. 8 | 9 | For help and questions about using this project, please file an issue on the repo. 10 | 11 | ## Microsoft Support Policy 12 | 13 | Support for this project is limited to the resources listed above. 14 | -------------------------------------------------------------------------------- /cspell.config.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://raw.githubusercontent.com/streetsidesoftware/cspell/main/cspell.schema.json 2 | version: "0.2" 3 | allowCompoundWords: true 4 | dictionaryDefinitions: 5 | - name: dictionary 6 | path: "./dictionary.txt" 7 | addWords: true 8 | dictionaries: 9 | - dictionary 10 | ignorePaths: 11 | - cspell.config.yaml 12 | - node_modules 13 | - _site 14 | - /project-words.txt 15 | - default_pipeline.yml 16 | - .turbo 17 | - output/ 18 | - dist/ 19 | - temp_azurite/ 20 | - __pycache__ 21 | - pyproject.toml 22 | - entity_extraction.txt 23 | - package.json 24 | - tests/fixtures/ 25 | - examples_notebooks/inputs/ 26 | - docs/examples_notebooks/inputs/ 27 | - "*.csv" 28 | - "*.parquet" 29 | - "*.faiss" 30 | - "*.ipynb" 31 | - "*.log" 32 | -------------------------------------------------------------------------------- /docs/cli.md: -------------------------------------------------------------------------------- 1 | # CLI Reference 2 | 3 | This page documents the command-line interface of the graphrag library.
4 | 5 | ::: mkdocs-typer 6 | :module: graphrag.cli.main 7 | :prog_name: graphrag 8 | :command: app 9 | :depth: 0 10 | -------------------------------------------------------------------------------- /docs/config/overview.md: -------------------------------------------------------------------------------- 1 | # Configuring GraphRAG Indexing 2 | 3 | The GraphRAG system is highly configurable. This page provides an overview of the configuration options available for the GraphRAG indexing engine. 4 | 5 | ## Default Configuration Mode 6 | 7 | The default configuration mode is the simplest way to get started with the GraphRAG system. It is designed to work out-of-the-box with minimal configuration. The main ways to set up GraphRAG in Default Configuration mode are via: 8 | 9 | - [Init command](init.md) (recommended first step) 10 | - [Edit settings.yaml for deeper control](yaml.md) 11 | - [Purely using environment variables](env_vars.md) (not recommended) 12 | -------------------------------------------------------------------------------- /docs/data/operation_dulce/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of integration testing. -------------------------------------------------------------------------------- /docs/data/operation_dulce/dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/data/operation_dulce/dataset.zip -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of providing a starting point for notebook experimentation. 
4 | -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/communities.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/communities.parquet -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/community_reports.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/community_reports.parquet -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/covariates.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/covariates.parquet -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/documents.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/documents.parquet -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/entities.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/entities.parquet -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_transactions/0-2fed1d8b-daac-41b0-a93a-e115cda75be3.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_transactions/0-2fed1d8b-daac-41b0-a93a-e115cda75be3.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_transactions/1-61dbb7c2-aec3-4796-b223-941fc7cc93cc.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_transactions/1-61dbb7c2-aec3-4796-b223-941fc7cc93cc.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_transactions/2-60012692-a153-48f9-8f4e-c479b44cbf3f.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation 
dulce/lancedb/default-community-full_content.lance/_transactions/2-60012692-a153-48f9-8f4e-c479b44cbf3f.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_transactions/3-0d2dc9a1-094f-4220-83c7-6ad6f26fac2b.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_transactions/3-0d2dc9a1-094f-4220-83c7-6ad6f26fac2b.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/1.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/1.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/2.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/2.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/3.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/3.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/4.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/4.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/data/1e7b2d94-ed06-4aa0-b22e-86a71d416bc6.lance: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/data/1e7b2d94-ed06-4aa0-b22e-86a71d416bc6.lance -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/data/1ed9f301-ce30-46a8-8c0b-9c2a60a3cf43.lance: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/data/1ed9f301-ce30-46a8-8c0b-9c2a60a3cf43.lance -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/0-92c031e5-7558-451e-9d0f-f5514db9616d.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/0-92c031e5-7558-451e-9d0f-f5514db9616d.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/1-7b3cb8d8-3512-4584-a003-91838fed8911.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/1-7b3cb8d8-3512-4584-a003-91838fed8911.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/2-7de627d2-4c57-49e9-bf73-c17a9582ead4.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/2-7de627d2-4c57-49e9-bf73-c17a9582ead4.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/3-9ad29d69-9a69-43a8-8b26-252ea267958d.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/3-9ad29d69-9a69-43a8-8b26-252ea267958d.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_versions/1.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_versions/1.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_versions/2.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_versions/2.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation 
dulce/lancedb/default-entity-description.lance/_versions/3.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_versions/3.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_versions/4.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_versions/4.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/data/a34575c4-5260-457f-bebe-3f40bc0e2ee3.lance: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/data/a34575c4-5260-457f-bebe-3f40bc0e2ee3.lance -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/data/eabd7580-86f5-4022-8aa7-fe0aff816d98.lance: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/data/eabd7580-86f5-4022-8aa7-fe0aff816d98.lance -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/0-fd0434ac-e5cd-4ddd-9dd5-e5048d4edb59.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/0-fd0434ac-e5cd-4ddd-9dd5-e5048d4edb59.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/1-14bb4b1d-cc00-420b-9b14-3626f0bd8c0b.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/1-14bb4b1d-cc00-420b-9b14-3626f0bd8c0b.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/2-8e74264c-f72d-44f5-a6f4-b3b61ae6a43b.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/2-8e74264c-f72d-44f5-a6f4-b3b61ae6a43b.txn 
-------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/3-7516fb71-9db3-4666-bdef-ea04c1eb9697.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/3-7516fb71-9db3-4666-bdef-ea04c1eb9697.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/1.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/1.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/2.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/2.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/3.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/3.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/4.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/4.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/data/2794bf5b-de3d-4202-ab16-e76bc27c8e6a.lance: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/data/2794bf5b-de3d-4202-ab16-e76bc27c8e6a.lance -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/data/2f74c8e8-3f35-4209-889c-a13cf0780eb3.lance: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/data/2f74c8e8-3f35-4209-889c-a13cf0780eb3.lance -------------------------------------------------------------------------------- 
/docs/examples_notebooks/inputs/operation dulce/relationships.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/relationships.parquet -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/text_units.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/text_units.parquet -------------------------------------------------------------------------------- /docs/img/GraphRag-Figure1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/GraphRag-Figure1.jpg -------------------------------------------------------------------------------- /docs/img/auto-tune-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/auto-tune-diagram.png -------------------------------------------------------------------------------- /docs/img/drift-search-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/drift-search-diagram.png -------------------------------------------------------------------------------- /docs/img/pipeline-running.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/pipeline-running.png -------------------------------------------------------------------------------- /docs/img/viz_guide/gephi-appearance-pane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/viz_guide/gephi-appearance-pane.png -------------------------------------------------------------------------------- /docs/img/viz_guide/gephi-initial-graph-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/viz_guide/gephi-initial-graph-example.png -------------------------------------------------------------------------------- /docs/img/viz_guide/gephi-layout-forceatlas2-pane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/viz_guide/gephi-layout-forceatlas2-pane.png -------------------------------------------------------------------------------- /docs/img/viz_guide/gephi-layout-pane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/viz_guide/gephi-layout-pane.png -------------------------------------------------------------------------------- /docs/img/viz_guide/gephi-network-overview-settings.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/viz_guide/gephi-network-overview-settings.png -------------------------------------------------------------------------------- /docs/prompt_tuning/overview.md: -------------------------------------------------------------------------------- 1 | # Prompt Tuning ⚙️ 2 | 3 | This page provides an overview of the prompt tuning options available for the GraphRAG indexing engine. 4 | 5 | ## Default Prompts 6 | 7 | The default prompts are the simplest way to get started with the GraphRAG system. They are designed to work out-of-the-box with minimal configuration. More details about each of the default prompts for indexing and query can be found on the [manual tuning](./manual_prompt_tuning.md) page. 8 | 9 | ## Auto Tuning 10 | 11 | Auto Tuning leverages your input data and LLM interactions to create domain-adapted prompts for the generation of the knowledge graph. Running it is highly encouraged, as it will yield better results when executing an Index Run. For more details about how to use it, please refer to the [Auto Tuning](auto_prompt_tuning.md) documentation. 12 | 13 | ## Manual Tuning 14 | 15 | Manual tuning is an advanced use case. Most users will want to use the Auto Tuning feature instead. Details about how to use manual configuration are available in the [manual tuning](manual_prompt_tuning.md) documentation. 16 | -------------------------------------------------------------------------------- /docs/query/notebooks/overview.md: -------------------------------------------------------------------------------- 1 | # API Notebooks 2 | 3 | - [API Overview Notebook](../../examples_notebooks/api_overview.ipynb) 4 | 5 | # Query Engine Notebooks 6 | 7 | For examples of running queries, please refer to the following notebooks: 8 | 9 | - [Global Search Notebook](../../examples_notebooks/global_search.ipynb) 10 | - [Local Search Notebook](../../examples_notebooks/local_search.ipynb) 11 | - [DRIFT Search Notebook](../../examples_notebooks/drift_search.ipynb) 12 | 13 | The test dataset for these notebooks can be found in [dataset.zip](../../data/operation_dulce/dataset.zip){:download}.
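As a complement to the notebooks, the sketch below shows the general shape of a programmatic global search call. It is a minimal, hedged example: `graphrag.api.global_search` and `create_graphrag_config` are exported by the modules included later in this dump, but the project layout under `./ragtest`, loading `settings.yaml` directly into the values dict (skipping environment-variable interpolation), and the exact keyword arguments are assumptions for illustration and may differ between releases.

```python
import asyncio

import pandas as pd
import yaml

import graphrag.api as api
from graphrag.config.create_graphrag_config import create_graphrag_config


async def main() -> None:
    # Assumed project layout: an already-indexed project rooted at ./ragtest
    # whose parquet outputs live in ./ragtest/output.
    with open("./ragtest/settings.yaml") as f:
        settings = yaml.safe_load(f)
    config = create_graphrag_config(values=settings, root_dir="./ragtest")

    entities = pd.read_parquet("./ragtest/output/entities.parquet")
    communities = pd.read_parquet("./ragtest/output/communities.parquet")
    community_reports = pd.read_parquet("./ragtest/output/community_reports.parquet")

    # Keyword arguments below are assumptions based on the bundled notebooks;
    # consult the Global Search notebook for the exact signature in your release.
    response, context = await api.global_search(
        config=config,
        entities=entities,
        communities=communities,
        community_reports=community_reports,
        community_level=2,
        dynamic_community_selection=False,
        response_type="Multiple Paragraphs",
        query="What are the main themes of Operation Dulce?",
    )
    print(response)


asyncio.run(main())
```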
14 | -------------------------------------------------------------------------------- /docs/scripts/create_cookie_banner.js: -------------------------------------------------------------------------------- 1 | function onConsentChanged(categoryPreferences) { 2 | console.log("onConsentChanged", categoryPreferences); 3 | } 4 | 5 | 6 | cb = document.createElement("div"); 7 | cb.id = "cookie-banner"; 8 | document.body.insertBefore(cb, document.body.children[0]); 9 | 10 | window.WcpConsent && WcpConsent.init("en-US", "cookie-banner", function (err, consent) { 11 | if (!err) { 12 | console.log("consent: ", consent); 13 | window.manageConsent = () => consent.manageConsent(); 14 | siteConsent = consent; 15 | } else { 16 | console.log("Error initializing WcpConsent: "+ err); 17 | } 18 | }, onConsentChanged, WcpConsent.themes.light); -------------------------------------------------------------------------------- /docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | [data-md-color-scheme="default"] { 2 | --md-primary-fg-color: #3c4cab; 3 | --md-code-hl-color: #3772d9; 4 | --md-code-hl-comment-color: #6b6b6b; 5 | --md-code-hl-operator-color: #6b6b6b; 6 | --md-footer-fg-color--light: #ffffff; 7 | --md-footer-fg-color--lighter: #ffffff; 8 | } 9 | 10 | [data-md-color-scheme="slate"] { 11 | --md-primary-fg-color: #364499; 12 | --md-code-hl-color: #246be5; 13 | --md-code-hl-constant-color: #9a89ed; 14 | --md-code-hl-number-color: #f16e5f; 15 | --md-footer-fg-color--light: #ffffff; 16 | --md-footer-fg-color--lighter: #ffffff; 17 | } 18 | 19 | .md-tabs__item--active { 20 | background-color: var(--md-primary-bg-color); 21 | } 22 | 23 | .md-tabs__item--active .md-tabs__link { 24 | color: var(--md-code-hl-color); 25 | } 26 | 27 | .md-typeset a { 28 | text-decoration: underline; 29 | } -------------------------------------------------------------------------------- /examples_notebooks/community_contrib/README.md: -------------------------------------------------------------------------------- 1 | ## Disclaimer 2 | 3 | This folder contains community contributed notebooks that are not officially supported by the GraphRAG team. The notebooks are provided as-is and are not guaranteed to work with the latest version of GraphRAG. If you have any questions or issues, please reach out to the author of the notebook directly. 
4 | 5 | For more information on how to contribute to the GraphRAG project, please refer to the [contribution guidelines](https://github.com/microsoft/graphrag/blob/main/CONTRIBUTING.md) 6 | -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_latest.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_latest.manifest -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_transactions/0-498c6e24-dd0a-42b9-8f7e-5e3d2ab258b0.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_transactions/0-498c6e24-dd0a-42b9-8f7e-5e3d2ab258b0.txn -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_transactions/1-bf5aa024-a229-461f-8d78-699841a302fe.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_transactions/1-bf5aa024-a229-461f-8d78-699841a302fe.txn -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_versions/1.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_versions/1.manifest -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_versions/2.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_versions/2.manifest -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/data/fe64774f-5412-4c9c-8dea-f6ed55c81119.lance: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/data/fe64774f-5412-4c9c-8dea-f6ed55c81119.lance -------------------------------------------------------------------------------- /graphrag/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The GraphRAG package.""" 5 | -------------------------------------------------------------------------------- /graphrag/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The GraphRAG package.""" 5 | 6 | from graphrag.cli.main import app 7 | 8 | app(prog_name="graphrag") 9 | -------------------------------------------------------------------------------- /graphrag/api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """API for GraphRAG. 5 | 6 | WARNING: This API is under development and may undergo changes in future releases. 7 | Backwards compatibility is not guaranteed at this time. 8 | """ 9 | 10 | from graphrag.api.index import build_index 11 | from graphrag.api.prompt_tune import generate_indexing_prompts 12 | from graphrag.api.query import ( 13 | basic_search, 14 | basic_search_streaming, 15 | drift_search, 16 | drift_search_streaming, 17 | global_search, 18 | global_search_streaming, 19 | local_search, 20 | local_search_streaming, 21 | multi_index_basic_search, 22 | multi_index_drift_search, 23 | multi_index_global_search, 24 | multi_index_local_search, 25 | ) 26 | from graphrag.prompt_tune.types import DocSelectionType 27 | 28 | __all__ = [ # noqa: RUF022 29 | # index API 30 | "build_index", 31 | # query API 32 | "global_search", 33 | "global_search_streaming", 34 | "local_search", 35 | "local_search_streaming", 36 | "drift_search", 37 | "drift_search_streaming", 38 | "basic_search", 39 | "basic_search_streaming", 40 | "multi_index_basic_search", 41 | "multi_index_drift_search", 42 | "multi_index_global_search", 43 | "multi_index_local_search", 44 | # prompt tuning API 45 | "DocSelectionType", 46 | "generate_indexing_prompts", 47 | ] 48 | -------------------------------------------------------------------------------- /graphrag/cache/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A package containing cache implementations.""" 5 | -------------------------------------------------------------------------------- /graphrag/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A package containing callback implementations.""" 5 | -------------------------------------------------------------------------------- /graphrag/callbacks/console_workflow_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A logger that emits updates from the indexing engine to the console.""" 5 | 6 | from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks 7 | 8 | 9 | class ConsoleWorkflowCallbacks(NoopWorkflowCallbacks): 10 | """A logger that writes to a console.""" 11 | 12 | def error( 13 | self, 14 | message: str, 15 | cause: BaseException | None = None, 16 | stack: str | None = None, 17 | details: dict | None = None, 18 | ): 19 | """Handle when an error occurs.""" 20 | print(message, str(cause), stack, details) # noqa T201 21 | 22 | def warning(self, message: str, details: dict | None = None): 23 | """Handle when a warning occurs.""" 24 | _print_warning(message) 25 | 26 | def log(self, message: str, details: dict | None = None): 27 | """Handle when a log message is produced.""" 28 | print(message, details) # noqa T201 29 | 30 | 31 | def _print_warning(skk): 32 | print("\033[93m {}\033[00m".format(skk)) # noqa T201 33 | -------------------------------------------------------------------------------- /graphrag/callbacks/llm_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """LLM Callbacks.""" 5 | 6 | from typing import Protocol 7 | 8 | 9 | class BaseLLMCallback(Protocol): 10 | """Base class for LLM callbacks.""" 11 | 12 | def on_llm_new_token(self, token: str): 13 | """Handle when a new token is generated.""" 14 | ... 15 | -------------------------------------------------------------------------------- /graphrag/callbacks/noop_query_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """No-op Query Callbacks.""" 5 | 6 | from typing import Any 7 | 8 | from graphrag.callbacks.query_callbacks import QueryCallbacks 9 | from graphrag.query.structured_search.base import SearchResult 10 | 11 | 12 | class NoopQueryCallbacks(QueryCallbacks): 13 | """A no-op implementation of QueryCallbacks.""" 14 | 15 | def on_context(self, context: Any) -> None: 16 | """Handle when context data is constructed.""" 17 | 18 | def on_map_response_start(self, map_response_contexts: list[str]) -> None: 19 | """Handle the start of map operation.""" 20 | 21 | def on_map_response_end(self, map_response_outputs: list[SearchResult]) -> None: 22 | """Handle the end of map operation.""" 23 | 24 | def on_reduce_response_start( 25 | self, reduce_response_context: str | dict[str, Any] 26 | ) -> None: 27 | """Handle the start of reduce operation.""" 28 | 29 | def on_reduce_response_end(self, reduce_response_output: str) -> None: 30 | """Handle the end of reduce operation.""" 31 | 32 | def on_llm_new_token(self, token): 33 | """Handle when a new token is generated.""" 34 | -------------------------------------------------------------------------------- /graphrag/callbacks/progress_workflow_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
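# --- Editor's sketch (not part of the repository) --------------------------------
# Why: the callback classes above (BaseLLMCallback, QueryCallbacks,
# NoopQueryCallbacks) define the hooks the query engine exposes. A minimal
# concrete subclass that streams generated tokens to stdout might look like the
# one below; where it gets passed in is an assumption (anywhere a QueryCallbacks
# instance is accepted, e.g. by the streaming search functions in graphrag/api/query.py).
from graphrag.callbacks.noop_query_callbacks import NoopQueryCallbacks


class StdoutStreamingCallbacks(NoopQueryCallbacks):
    """Print each generated token as it arrives."""

    def on_llm_new_token(self, token):
        # Called once per generated token during a streaming search.
        print(token, end="", flush=True)
# ----------------------------------------------------------------------------------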
2 | # Licensed under the MIT License 3 | 4 | """A workflow callback manager that emits updates.""" 5 | 6 | from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks 7 | from graphrag.logger.base import ProgressLogger 8 | from graphrag.logger.progress import Progress 9 | 10 | 11 | class ProgressWorkflowCallbacks(NoopWorkflowCallbacks): 12 | """A callback manager that delegates to a ProgressLogger.""" 13 | 14 | _root_progress: ProgressLogger 15 | _progress_stack: list[ProgressLogger] 16 | 17 | def __init__(self, progress: ProgressLogger) -> None: 18 | """Create a new ProgressWorkflowCallbacks.""" 19 | self._progress = progress 20 | self._progress_stack = [progress] 21 | 22 | def _pop(self) -> None: 23 | self._progress_stack.pop() 24 | 25 | def _push(self, name: str) -> None: 26 | self._progress_stack.append(self._latest.child(name)) 27 | 28 | @property 29 | def _latest(self) -> ProgressLogger: 30 | return self._progress_stack[-1] 31 | 32 | def workflow_start(self, name: str, instance: object) -> None: 33 | """Execute this callback when a workflow starts.""" 34 | self._push(name) 35 | 36 | def workflow_end(self, name: str, instance: object) -> None: 37 | """Execute this callback when a workflow ends.""" 38 | self._pop() 39 | 40 | def progress(self, progress: Progress) -> None: 41 | """Handle when progress occurs.""" 42 | self._latest(progress) 43 | -------------------------------------------------------------------------------- /graphrag/callbacks/query_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Query Callbacks.""" 5 | 6 | from typing import Any 7 | 8 | from graphrag.callbacks.llm_callbacks import BaseLLMCallback 9 | from graphrag.query.structured_search.base import SearchResult 10 | 11 | 12 | class QueryCallbacks(BaseLLMCallback): 13 | """Callbacks used during query execution.""" 14 | 15 | def on_context(self, context: Any) -> None: 16 | """Handle when context data is constructed.""" 17 | 18 | def on_map_response_start(self, map_response_contexts: list[str]) -> None: 19 | """Handle the start of map operation.""" 20 | 21 | def on_map_response_end(self, map_response_outputs: list[SearchResult]) -> None: 22 | """Handle the end of map operation.""" 23 | 24 | def on_reduce_response_start( 25 | self, reduce_response_context: str | dict[str, Any] 26 | ) -> None: 27 | """Handle the start of reduce operation.""" 28 | 29 | def on_reduce_response_end(self, reduce_response_output: str) -> None: 30 | """Handle the end of reduce operation.""" 31 | 32 | def on_llm_new_token(self, token) -> None: 33 | """Handle when a new token is generated.""" 34 | -------------------------------------------------------------------------------- /graphrag/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """CLI for GraphRAG.""" 5 | -------------------------------------------------------------------------------- /graphrag/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """The config package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/config/create_graphrag_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration, loaded from environment variables.""" 5 | 6 | from pathlib import Path 7 | from typing import Any 8 | 9 | from graphrag.config.models.graph_rag_config import GraphRagConfig 10 | 11 | 12 | def create_graphrag_config( 13 | values: dict[str, Any] | None = None, 14 | root_dir: str | None = None, 15 | ) -> GraphRagConfig: 16 | """Load Configuration Parameters from a dictionary. 17 | 18 | Parameters 19 | ---------- 20 | values : dict[str, Any] | None 21 | Dictionary of configuration values to pass into pydantic model. 22 | root_dir : str | None 23 | Root directory for the project. 24 | 25 | Returns 26 | ------- 27 | GraphRagConfig 28 | The configuration object. 29 | 30 | Raises 31 | ------ 32 | ValidationError 33 | If the configuration values do not satisfy pydantic validation. 34 | """ 35 | values = values or {} 36 | if root_dir: 37 | root_path = Path(root_dir).resolve() 38 | values["root_dir"] = str(root_path) 39 | return GraphRagConfig(**values) 40 | -------------------------------------------------------------------------------- /graphrag/config/get_embedding_settings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing get_embedding_settings.""" 5 | 6 | from graphrag.config.models.graph_rag_config import GraphRagConfig 7 | 8 | 9 | def get_embedding_settings( 10 | settings: GraphRagConfig, 11 | vector_store_params: dict | None = None, 12 | ) -> dict: 13 | """Transform GraphRAG config into settings for workflows.""" 14 | # TEMP 15 | embeddings_llm_settings = settings.get_language_model_config( 16 | settings.embed_text.model_id 17 | ) 18 | vector_store_settings = settings.get_vector_store_config( 19 | settings.embed_text.vector_store_id 20 | ).model_dump() 21 | 22 | # 23 | # If we get to this point, settings.vector_store is defined, and there's a specific setting for this embedding. 24 | # settings.vector_store.base contains connection information, or may be undefined 25 | # settings.vector_store. contains the specific settings for this embedding 26 | # 27 | strategy = settings.embed_text.resolved_strategy( 28 | embeddings_llm_settings 29 | ) # get the default strategy 30 | strategy.update({ 31 | "vector_store": { 32 | **(vector_store_params or {}), 33 | **(vector_store_settings), 34 | } 35 | }) # update the default strategy with the vector store settings 36 | # This ensures the vector store config is part of the strategy and not the global config 37 | return { 38 | "strategy": strategy, 39 | } 40 | -------------------------------------------------------------------------------- /graphrag/config/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
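# --- Editor's sketch (not part of the repository) --------------------------------
# Why: create_graphrag_config above documents its parameters but shows no call
# site. This illustrates one, assuming the values dict mirrors an initialized
# settings.yaml. The "models"/"default_chat_model" keys, the "openai_chat" type,
# and the model fields are assumptions for illustration only and are not
# verified against the current schema; additional fields may be required.
import os

from graphrag.config.create_graphrag_config import create_graphrag_config

example_values = {
    "models": {  # assumed top-level key, mirroring an initialized settings.yaml
        "default_chat_model": {
            "type": "openai_chat",  # assumed enum value
            "model": "gpt-4o-mini",  # any model supported by your provider
            "api_key": os.environ.get("GRAPHRAG_API_KEY", ""),
        },
    },
}

# root_dir is resolved to an absolute path and injected into the values (see the
# function body above); invalid values surface here as a pydantic ValidationError.
config = create_graphrag_config(values=example_values, root_dir="./ragtest")
# ----------------------------------------------------------------------------------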
2 | # Licensed under the MIT License 3 | 4 | """Interfaces for Default Config parameterization.""" 5 | -------------------------------------------------------------------------------- /graphrag/config/models/basic_search_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from graphrag.config.defaults import graphrag_config_defaults 9 | 10 | 11 | class BasicSearchConfig(BaseModel): 12 | """The default configuration section for Basic Search.""" 13 | 14 | prompt: str | None = Field( 15 | description="The basic search prompt to use.", 16 | default=graphrag_config_defaults.basic_search.prompt, 17 | ) 18 | chat_model_id: str = Field( 19 | description="The model ID to use for basic search.", 20 | default=graphrag_config_defaults.basic_search.chat_model_id, 21 | ) 22 | embedding_model_id: str = Field( 23 | description="The model ID to use for text embeddings.", 24 | default=graphrag_config_defaults.basic_search.embedding_model_id, 25 | ) 26 | k: int = Field( 27 | description="The number of text units to include in search context.", 28 | default=graphrag_config_defaults.basic_search.k, 29 | ) 30 | max_context_tokens: int = Field( 31 | description="The maximum number of context tokens.", 32 | default=graphrag_config_defaults.basic_search.max_context_tokens, 33 | ) 34 | -------------------------------------------------------------------------------- /graphrag/config/models/cache_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from graphrag.config.defaults import graphrag_config_defaults 9 | from graphrag.config.enums import CacheType 10 | 11 | 12 | class CacheConfig(BaseModel): 13 | """The default configuration section for Cache.""" 14 | 15 | type: CacheType = Field( 16 | description="The cache type to use.", 17 | default=graphrag_config_defaults.cache.type, 18 | ) 19 | base_dir: str = Field( 20 | description="The base directory for the cache.", 21 | default=graphrag_config_defaults.cache.base_dir, 22 | ) 23 | connection_string: str | None = Field( 24 | description="The cache connection string to use.", 25 | default=graphrag_config_defaults.cache.connection_string, 26 | ) 27 | container_name: str | None = Field( 28 | description="The cache container name to use.", 29 | default=graphrag_config_defaults.cache.container_name, 30 | ) 31 | storage_account_blob_url: str | None = Field( 32 | description="The storage account blob url to use.", 33 | default=graphrag_config_defaults.cache.storage_account_blob_url, 34 | ) 35 | cosmosdb_account_url: str | None = Field( 36 | description="The cosmosdb account url to use.", 37 | default=graphrag_config_defaults.cache.cosmosdb_account_url, 38 | ) 39 | -------------------------------------------------------------------------------- /graphrag/config/models/cluster_graph_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from graphrag.config.defaults import graphrag_config_defaults 9 | 10 | 11 | class ClusterGraphConfig(BaseModel): 12 | """Configuration section for clustering graphs.""" 13 | 14 | max_cluster_size: int = Field( 15 | description="The maximum cluster size to use.", 16 | default=graphrag_config_defaults.cluster_graph.max_cluster_size, 17 | ) 18 | use_lcc: bool = Field( 19 | description="Whether to use the largest connected component.", 20 | default=graphrag_config_defaults.cluster_graph.use_lcc, 21 | ) 22 | seed: int = Field( 23 | description="The seed to use for the clustering.", 24 | default=graphrag_config_defaults.cluster_graph.seed, 25 | ) 26 | -------------------------------------------------------------------------------- /graphrag/config/models/output_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from graphrag.config.defaults import graphrag_config_defaults 9 | from graphrag.config.enums import OutputType 10 | 11 | 12 | class OutputConfig(BaseModel): 13 | """The default configuration section for Output.""" 14 | 15 | type: OutputType = Field( 16 | description="The output type to use.", 17 | default=graphrag_config_defaults.output.type, 18 | ) 19 | base_dir: str = Field( 20 | description="The base directory for the output.", 21 | default=graphrag_config_defaults.output.base_dir, 22 | ) 23 | connection_string: str | None = Field( 24 | description="The storage connection string to use.", 25 | default=graphrag_config_defaults.output.connection_string, 26 | ) 27 | container_name: str | None = Field( 28 | description="The storage container name to use.", 29 | default=graphrag_config_defaults.output.container_name, 30 | ) 31 | storage_account_blob_url: str | None = Field( 32 | description="The storage account blob url to use.", 33 | default=graphrag_config_defaults.output.storage_account_blob_url, 34 | ) 35 | cosmosdb_account_url: str | None = Field( 36 | description="The cosmosdb account url to use.", 37 | default=graphrag_config_defaults.output.cosmosdb_account_url, 38 | ) 39 | -------------------------------------------------------------------------------- /graphrag/config/models/reporting_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from graphrag.config.defaults import graphrag_config_defaults 9 | from graphrag.config.enums import ReportingType 10 | 11 | 12 | class ReportingConfig(BaseModel): 13 | """The default configuration section for Reporting.""" 14 | 15 | type: ReportingType = Field( 16 | description="The reporting type to use.", 17 | default=graphrag_config_defaults.reporting.type, 18 | ) 19 | base_dir: str = Field( 20 | description="The base directory for reporting.", 21 | default=graphrag_config_defaults.reporting.base_dir, 22 | ) 23 | connection_string: str | None = Field( 24 | description="The reporting connection string to use.", 25 | default=graphrag_config_defaults.reporting.connection_string, 26 | ) 27 | container_name: str | None = Field( 28 | description="The reporting container name to use.", 29 | default=graphrag_config_defaults.reporting.container_name, 30 | ) 31 | storage_account_blob_url: str | None = Field( 32 | description="The storage account blob url to use.", 33 | default=graphrag_config_defaults.reporting.storage_account_blob_url, 34 | ) 35 | -------------------------------------------------------------------------------- /graphrag/config/models/snapshots_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from graphrag.config.defaults import graphrag_config_defaults 9 | 10 | 11 | class SnapshotsConfig(BaseModel): 12 | """Configuration section for snapshots.""" 13 | 14 | embeddings: bool = Field( 15 | description="A flag indicating whether to take snapshots of embeddings.", 16 | default=graphrag_config_defaults.snapshots.embeddings, 17 | ) 18 | graphml: bool = Field( 19 | description="A flag indicating whether to take snapshots of GraphML.", 20 | default=graphrag_config_defaults.snapshots.graphml, 21 | ) 22 | raw_graph: bool = Field( 23 | description="A flag indicating whether to take snapshots of the raw extracted graph (entities and relationships) before merging.", 24 | default=graphrag_config_defaults.snapshots.raw_graph, 25 | ) 26 | -------------------------------------------------------------------------------- /graphrag/config/models/umap_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from graphrag.config.defaults import graphrag_config_defaults 9 | 10 | 11 | class UmapConfig(BaseModel): 12 | """Configuration section for UMAP.""" 13 | 14 | enabled: bool = Field( 15 | description="A flag indicating whether to enable UMAP.", 16 | default=graphrag_config_defaults.umap.enabled, 17 | ) 18 | -------------------------------------------------------------------------------- /graphrag/config/read_dotenv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing the read_dotenv utility.""" 5 | 6 | import logging 7 | import os 8 | from pathlib import Path 9 | 10 | from dotenv import dotenv_values 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | def read_dotenv(root: str) -> None: 16 | """Read a .env file in the given root path.""" 17 | env_path = Path(root) / ".env" 18 | if env_path.exists(): 19 | log.info("Loading pipeline .env file") 20 | env_config = dotenv_values(f"{env_path}") 21 | for key, value in env_config.items(): 22 | if key not in os.environ: 23 | os.environ[key] = value or "" 24 | else: 25 | log.info("No .env file found at %s", root) 26 | -------------------------------------------------------------------------------- /graphrag/data_model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Knowledge model package.""" 5 | -------------------------------------------------------------------------------- /graphrag/data_model/identified.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A package containing the 'Identified' protocol.""" 5 | 6 | from dataclasses import dataclass 7 | 8 | 9 | @dataclass 10 | class Identified: 11 | """A protocol for an item with an ID.""" 12 | 13 | id: str 14 | """The ID of the item.""" 15 | 16 | short_id: str | None 17 | """Human readable ID used to refer to this community in prompts or texts displayed to users, such as in a report text (optional).""" 18 | -------------------------------------------------------------------------------- /graphrag/data_model/named.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A package containing the 'Named' protocol.""" 5 | 6 | from dataclasses import dataclass 7 | 8 | from graphrag.data_model.identified import Identified 9 | 10 | 11 | @dataclass 12 | class Named(Identified): 13 | """A protocol for an item with a name/title.""" 14 | 15 | title: str 16 | """The name/title of the item.""" 17 | -------------------------------------------------------------------------------- /graphrag/data_model/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Common types for the GraphRAG knowledge model.""" 5 | 6 | from collections.abc import Callable 7 | 8 | TextEmbedder = Callable[[str], list[float]] 9 | -------------------------------------------------------------------------------- /graphrag/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The indexing engine package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/input/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine input package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/input/text.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing load method definition.""" 5 | 6 | import logging 7 | from pathlib import Path 8 | 9 | import pandas as pd 10 | 11 | from graphrag.config.models.input_config import InputConfig 12 | from graphrag.index.input.util import load_files 13 | from graphrag.index.utils.hashing import gen_sha512_hash 14 | from graphrag.logger.base import ProgressLogger 15 | from graphrag.storage.pipeline_storage import PipelineStorage 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | 20 | async def load_text( 21 | config: InputConfig, 22 | progress: ProgressLogger | None, 23 | storage: PipelineStorage, 24 | ) -> pd.DataFrame: 25 | """Load text inputs from a directory.""" 26 | 27 | async def load_file(path: str, group: dict | None = None) -> pd.DataFrame: 28 | if group is None: 29 | group = {} 30 | text = await storage.get(path, encoding=config.encoding) 31 | new_item = {**group, "text": text} 32 | new_item["id"] = gen_sha512_hash(new_item, new_item.keys()) 33 | new_item["title"] = str(Path(path).name) 34 | new_item["creation_date"] = await storage.get_creation_date(path) 35 | return pd.DataFrame([new_item]) 36 | 37 | return await load_files(load_file, config, storage, progress) 38 | -------------------------------------------------------------------------------- /graphrag/index/operations/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Reusable data frame operations.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/build_noun_graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine noun graph package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/build_noun_graph/np_extractors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """NLP-based graph extractors.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/build_noun_graph/np_extractors/np_validator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Util functions to tag noun phrases for filtering.""" 5 | 6 | 7 | def is_compound(tokens: list[str]) -> bool: 8 | """List of tokens forms a compound noun phrase.""" 9 | return any( 10 | "-" in token and len(token.strip()) > 1 and len(token.strip().split("-")) > 1 11 | for token in tokens 12 | ) 13 | 14 | 15 | def has_valid_token_length(tokens: list[str], max_length: int) -> bool: 16 | """Check if all tokens have valid length.""" 17 | return all(len(token) <= max_length for token in tokens) 18 | 19 | 20 | def is_valid_entity(entity: tuple[str, str], tokens: list[str]) -> bool: 21 | """Check if the entity is valid.""" 22 | return (entity[1] not in ["CARDINAL", "ORDINAL"] and len(tokens) > 0) or ( 23 | entity[1] in ["CARDINAL", "ORDINAL"] 24 | and (len(tokens) > 1 or is_compound(tokens)) 25 | ) 26 | -------------------------------------------------------------------------------- /graphrag/index/operations/build_noun_graph/np_extractors/resource_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Util functions needed for nltk-based noun-phrase extractors (i.e. TextBlob).""" 5 | 6 | import nltk 7 | 8 | 9 | def download_if_not_exists(resource_name) -> bool: 10 | """Download nltk resources if they haven't been already.""" 11 | # look under all possible categories 12 | root_categories = [ 13 | "corpora", 14 | "tokenizers", 15 | "taggers", 16 | "chunkers", 17 | "classifiers", 18 | "stemmers", 19 | "stopwords", 20 | "languages", 21 | "frequent", 22 | "gate", 23 | "models", 24 | "mt", 25 | "sentiment", 26 | "similarity", 27 | ] 28 | for category in root_categories: 29 | try: 30 | # if found, stop looking and avoid downloading 31 | nltk.find(f"{category}/{resource_name}") 32 | return True # noqa: TRY300 33 | except LookupError: 34 | continue 35 | 36 | # is not found, download 37 | nltk.download(resource_name) 38 | return False 39 | -------------------------------------------------------------------------------- /graphrag/index/operations/build_noun_graph/np_extractors/stop_words.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Custom list of stop words to be excluded by noun phrase extractors.""" 5 | 6 | EN_STOP_WORDS = [ 7 | "stuff", 8 | "thing", 9 | "things", 10 | "bunch", 11 | "bit", 12 | "bits", 13 | "people", 14 | "person", 15 | "okay", 16 | "hey", 17 | "hi", 18 | "hello", 19 | "laughter", 20 | "oh", 21 | ] 22 | -------------------------------------------------------------------------------- /graphrag/index/operations/chunk_text/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text chunk package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/chunk_text/bootstrap.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
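# --- Editor's sketch (not part of the repository) --------------------------------
# Why: the validator helpers above (is_compound, has_valid_token_length,
# is_valid_entity) encode filtering rules whose intent is easier to see with
# concrete inputs. The assertions below follow directly from the code shown
# above; the example tokens and entity labels are made up for illustration.
from graphrag.index.operations.build_noun_graph.np_extractors.np_validator import (
    has_valid_token_length,
    is_compound,
    is_valid_entity,
)

assert is_compound(["state-of-the-art"]) is True  # hyphenated token -> compound
assert is_compound(["operation", "dulce"]) is False  # no hyphenated tokens
assert has_valid_token_length(["alpha", "beta"], max_length=20) is True

# A bare number tagged CARDINAL is rejected unless it is multi-token or compound.
assert is_valid_entity(("42", "CARDINAL"), ["42"]) is False
assert is_valid_entity(("Dulce Base", "FAC"), ["Dulce", "Base"]) is True
# ----------------------------------------------------------------------------------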
2 | # Licensed under the MIT License 3 | 4 | """Bootstrap definition.""" 5 | 6 | import warnings 7 | 8 | # Ignore warnings from numba 9 | warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*") 10 | warnings.filterwarnings("ignore", message=".*Use no seed for parallelism.*") 11 | 12 | initialized_nltk = False 13 | 14 | 15 | def bootstrap(): 16 | """Download and initialize the required nltk resources, once per process.""" 17 | global initialized_nltk 18 | if not initialized_nltk: 19 | import nltk 20 | from nltk.corpus import wordnet as wn 21 | 22 | nltk.download("punkt") 23 | nltk.download("punkt_tab") 24 | nltk.download("averaged_perceptron_tagger") 25 | nltk.download("averaged_perceptron_tagger_eng") 26 | nltk.download("maxent_ne_chunker") 27 | nltk.download("maxent_ne_chunker_tab") 28 | nltk.download("words") 29 | nltk.download("wordnet") 30 | wn.ensure_loaded() 31 | initialized_nltk = True 32 | -------------------------------------------------------------------------------- /graphrag/index/operations/chunk_text/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing 'TextChunk' model.""" 5 | 6 | from collections.abc import Callable, Iterable 7 | from dataclasses import dataclass 8 | 9 | from graphrag.config.models.chunking_config import ChunkingConfig 10 | from graphrag.logger.progress import ProgressTicker 11 | 12 | 13 | @dataclass 14 | class TextChunk: 15 | """Text chunk class definition.""" 16 | 17 | text_chunk: str 18 | source_doc_indices: list[int] 19 | n_tokens: int | None = None 20 | 21 | 22 | ChunkInput = str | list[str] | list[tuple[str, str]] 23 | """Input to a chunking strategy. Can be a string, a list of strings, or a list of tuples of (id, text).""" 24 | 25 | ChunkStrategy = Callable[ 26 | [list[str], ChunkingConfig, ProgressTicker], Iterable[TextChunk] 27 | ] 28 | -------------------------------------------------------------------------------- /graphrag/index/operations/compute_degree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing compute_degree definition.""" 5 | 6 | import networkx as nx 7 | import pandas as pd 8 | 9 | 10 | def compute_degree(graph: nx.Graph) -> pd.DataFrame: 11 | """Create a new DataFrame with the degree of each node in the graph.""" 12 | return pd.DataFrame([ 13 | {"title": node, "degree": int(degree)} 14 | for node, degree in graph.degree # type: ignore 15 | ]) 16 | -------------------------------------------------------------------------------- /graphrag/index/operations/compute_edge_combined_degree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
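A minimal usage sketch for compute_degree on a toy graph (assumes graphrag, networkx, and pandas are installed; the graph below is hypothetical):

import networkx as nx

from graphrag.index.operations.compute_degree import compute_degree

graph = nx.Graph()
graph.add_edges_from([("a", "b"), ("a", "c"), ("b", "c"), ("c", "d")])

degree_df = compute_degree(graph)
print(degree_df)
# One row per node with columns "title" and "degree",
# e.g. c has degree 3 and d has degree 1.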
2 | # Licensed under the MIT License 3 | 4 | """A module containing compute_edge_combined_degree methods definition.""" 5 | 6 | from typing import cast 7 | 8 | import pandas as pd 9 | 10 | 11 | def compute_edge_combined_degree( 12 | edge_df: pd.DataFrame, 13 | node_degree_df: pd.DataFrame, 14 | node_name_column: str, 15 | node_degree_column: str, 16 | edge_source_column: str, 17 | edge_target_column: str, 18 | ) -> pd.Series: 19 | """Compute the combined degree for each edge in a graph.""" 20 | 21 | def join_to_degree(df: pd.DataFrame, column: str) -> pd.DataFrame: 22 | degree_column = _degree_colname(column) 23 | result = df.merge( 24 | node_degree_df.rename( 25 | columns={node_name_column: column, node_degree_column: degree_column} 26 | ), 27 | on=column, 28 | how="left", 29 | ) 30 | result[degree_column] = result[degree_column].fillna(0) 31 | return result 32 | 33 | output_df = join_to_degree(edge_df, edge_source_column) 34 | output_df = join_to_degree(output_df, edge_target_column) 35 | output_df["combined_degree"] = ( 36 | output_df[_degree_colname(edge_source_column)] 37 | + output_df[_degree_colname(edge_target_column)] 38 | ) 39 | return cast("pd.Series", output_df["combined_degree"]) 40 | 41 | 42 | def _degree_colname(column: str) -> str: 43 | return f"{column}_degree" 44 | -------------------------------------------------------------------------------- /graphrag/index/operations/create_graph.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing create_graph definition.""" 5 | 6 | import networkx as nx 7 | import pandas as pd 8 | 9 | 10 | def create_graph( 11 | edges: pd.DataFrame, 12 | edge_attr: list[str | int] | None = None, 13 | nodes: pd.DataFrame | None = None, 14 | node_id: str = "title", 15 | ) -> nx.Graph: 16 | """Create a networkx graph from nodes and edges dataframes.""" 17 | graph = nx.from_pandas_edgelist(edges, edge_attr=edge_attr) 18 | 19 | if nodes is not None: 20 | nodes.set_index(node_id, inplace=True) 21 | graph.add_nodes_from((n, dict(d)) for n, d in nodes.iterrows()) 22 | 23 | return graph 24 | -------------------------------------------------------------------------------- /graphrag/index/operations/embed_graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph embed package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/embed_graph/embed_node2vec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
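A small end-to-end sketch combining create_graph, compute_degree, and compute_edge_combined_degree (assumes graphrag and pandas are installed; the edge list is made up):

import pandas as pd

from graphrag.index.operations.compute_degree import compute_degree
from graphrag.index.operations.compute_edge_combined_degree import (
    compute_edge_combined_degree,
)
from graphrag.index.operations.create_graph import create_graph

edges = pd.DataFrame({
    "source": ["a", "a", "b"],
    "target": ["b", "c", "c"],
    "weight": [1.0, 2.0, 1.0],
})

graph = create_graph(edges, edge_attr=["weight"])
node_degrees = compute_degree(graph)

# Each edge's combined degree is the sum of its endpoints' node degrees.
edges["combined_degree"] = compute_edge_combined_degree(
    edges,
    node_degrees,
    node_name_column="title",
    node_degree_column="degree",
    edge_source_column="source",
    edge_target_column="target",
)
print(edges)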
2 | # Licensed under the MIT License 3 | 4 | """Utilities to generate graph embeddings.""" 5 | 6 | from dataclasses import dataclass 7 | 8 | import networkx as nx 9 | import numpy as np 10 | 11 | 12 | @dataclass 13 | class NodeEmbeddings: 14 | """Node embeddings class definition.""" 15 | 16 | nodes: list[str] 17 | embeddings: np.ndarray 18 | 19 | 20 | def embed_node2vec( 21 | graph: nx.Graph | nx.DiGraph, 22 | dimensions: int = 1536, 23 | num_walks: int = 10, 24 | walk_length: int = 40, 25 | window_size: int = 2, 26 | iterations: int = 3, 27 | random_seed: int = 86, 28 | ) -> NodeEmbeddings: 29 | """Generate node embeddings using Node2Vec.""" 30 | # NOTE: This import is done here to reduce the initial import time of the graphrag package 31 | import graspologic as gc 32 | 33 | # generate embedding 34 | lcc_tensors = gc.embed.node2vec_embed( # type: ignore 35 | graph=graph, 36 | dimensions=dimensions, 37 | window_size=window_size, 38 | iterations=iterations, 39 | num_walks=num_walks, 40 | walk_length=walk_length, 41 | random_seed=random_seed, 42 | ) 43 | return NodeEmbeddings(embeddings=lcc_tensors[0], nodes=lcc_tensors[1]) 44 | -------------------------------------------------------------------------------- /graphrag/index/operations/embed_graph/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing different lists and dictionaries.""" 5 | 6 | # Use this for now instead of a wrapper 7 | from typing import Any 8 | 9 | NodeList = list[str] 10 | EmbeddingList = list[Any] 11 | NodeEmbeddings = dict[str, list[float]] 12 | """Label -> Embedding""" 13 | -------------------------------------------------------------------------------- /graphrag/index/operations/embed_text/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text embed package root.""" 5 | 6 | from graphrag.index.operations.embed_text.embed_text import ( 7 | TextEmbedStrategyType, 8 | embed_text, 9 | ) 10 | 11 | __all__ = ["TextEmbedStrategyType", "embed_text"] 12 | -------------------------------------------------------------------------------- /graphrag/index/operations/embed_text/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine embed strategies package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/embed_text/strategies/mock.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing run and _embed_text methods definitions.""" 5 | 6 | import random 7 | from collections.abc import Iterable 8 | from typing import Any 9 | 10 | from graphrag.cache.pipeline_cache import PipelineCache 11 | from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks 12 | from graphrag.index.operations.embed_text.strategies.typing import TextEmbeddingResult 13 | from graphrag.logger.progress import ProgressTicker, progress_ticker 14 | 15 | 16 | async def run( # noqa RUF029 async is required for interface 17 | input: list[str], 18 | callbacks: WorkflowCallbacks, 19 | cache: PipelineCache, 20 | _args: dict[str, Any], 21 | ) -> TextEmbeddingResult: 22 | """Run the mock text embedding strategy.""" 23 | input = input if isinstance(input, Iterable) else [input] 24 | ticker = progress_ticker(callbacks.progress, len(input)) 25 | return TextEmbeddingResult( 26 | embeddings=[_embed_text(cache, text, ticker) for text in input] 27 | ) 28 | 29 | 30 | def _embed_text(_cache: PipelineCache, _text: str, tick: ProgressTicker) -> list[float]: 31 | """Embed a single piece of text.""" 32 | tick(1) 33 | return [random.random(), random.random(), random.random()] # noqa S311 34 | -------------------------------------------------------------------------------- /graphrag/index/operations/embed_text/strategies/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing 'TextEmbeddingResult' model.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | 9 | from graphrag.cache.pipeline_cache import PipelineCache 10 | from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks 11 | 12 | 13 | @dataclass 14 | class TextEmbeddingResult: 15 | """Text embedding result class definition.""" 16 | 17 | embeddings: list[list[float] | None] | None 18 | 19 | 20 | TextEmbeddingStrategy = Callable[ 21 | [ 22 | list[str], 23 | WorkflowCallbacks, 24 | PipelineCache, 25 | dict, 26 | ], 27 | Awaitable[TextEmbeddingResult], 28 | ] 29 | -------------------------------------------------------------------------------- /graphrag/index/operations/extract_covariates/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text extract claims package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/extract_covariates/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'Covariate' and 'CovariateExtractionResult' models.""" 5 | 6 | from collections.abc import Awaitable, Callable, Iterable 7 | from dataclasses import dataclass 8 | from typing import Any 9 | 10 | from graphrag.cache.pipeline_cache import PipelineCache 11 | from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks 12 | 13 | 14 | @dataclass 15 | class Covariate: 16 | """Covariate class definition.""" 17 | 18 | covariate_type: str | None = None 19 | subject_id: str | None = None 20 | object_id: str | None = None 21 | type: str | None = None 22 | status: str | None = None 23 | start_date: str | None = None 24 | end_date: str | None = None 25 | description: str | None = None 26 | source_text: list[str] | None = None 27 | doc_id: str | None = None 28 | record_id: int | None = None 29 | id: str | None = None 30 | 31 | 32 | @dataclass 33 | class CovariateExtractionResult: 34 | """Covariate extraction result class definition.""" 35 | 36 | covariate_data: list[Covariate] 37 | 38 | 39 | CovariateExtractStrategy = Callable[ 40 | [ 41 | Iterable[str], 42 | list[str], 43 | dict[str, str], 44 | WorkflowCallbacks, 45 | PipelineCache, 46 | dict[str, Any], 47 | ], 48 | Awaitable[CovariateExtractionResult], 49 | ] 50 | -------------------------------------------------------------------------------- /graphrag/index/operations/extract_graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine entities extraction package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/extract_graph/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'Document' and 'EntityExtractionResult' models.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | from enum import Enum 9 | from typing import Any 10 | 11 | import networkx as nx 12 | 13 | from graphrag.cache.pipeline_cache import PipelineCache 14 | from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks 15 | 16 | ExtractedEntity = dict[str, Any] 17 | ExtractedRelationship = dict[str, Any] 18 | StrategyConfig = dict[str, Any] 19 | EntityTypes = list[str] 20 | 21 | 22 | @dataclass 23 | class Document: 24 | """Document class definition.""" 25 | 26 | text: str 27 | id: str 28 | 29 | 30 | @dataclass 31 | class EntityExtractionResult: 32 | """Entity extraction result class definition.""" 33 | 34 | entities: list[ExtractedEntity] 35 | relationships: list[ExtractedRelationship] 36 | graph: nx.Graph | None 37 | 38 | 39 | EntityExtractStrategy = Callable[ 40 | [ 41 | list[Document], 42 | EntityTypes, 43 | WorkflowCallbacks, 44 | PipelineCache, 45 | StrategyConfig, 46 | ], 47 | Awaitable[EntityExtractionResult], 48 | ] 49 | 50 | 51 | class ExtractEntityStrategyType(str, Enum): 52 | """ExtractEntityStrategyType class definition.""" 53 | 54 | graph_intelligence = "graph_intelligence" 55 | nltk = "nltk" 56 | 57 | def __repr__(self): 58 | """Get a string representation.""" 59 | return f'"{self.value}"' 60 | -------------------------------------------------------------------------------- /graphrag/index/operations/finalize_community_reports.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """All the steps to transform final community reports.""" 5 | 6 | from uuid import uuid4 7 | 8 | import pandas as pd 9 | 10 | from graphrag.data_model.schemas import COMMUNITY_REPORTS_FINAL_COLUMNS 11 | 12 | 13 | def finalize_community_reports( 14 | reports: pd.DataFrame, 15 | communities: pd.DataFrame, 16 | ) -> pd.DataFrame: 17 | """All the steps to transform final community reports.""" 18 | # Merge with communities to add shared fields 19 | community_reports = reports.merge( 20 | communities.loc[:, ["community", "parent", "children", "size", "period"]], 21 | on="community", 22 | how="left", 23 | copy=False, 24 | ) 25 | 26 | community_reports["community"] = community_reports["community"].astype(int) 27 | community_reports["human_readable_id"] = community_reports["community"] 28 | community_reports["id"] = [uuid4().hex for _ in range(len(community_reports))] 29 | 30 | return community_reports.loc[ 31 | :, 32 | COMMUNITY_REPORTS_FINAL_COLUMNS, 33 | ] 34 | -------------------------------------------------------------------------------- /graphrag/index/operations/graph_to_dataframes.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """A module containing graph_to_dataframes definition.""" 5 | 6 | import networkx as nx 7 | import pandas as pd 8 | 9 | 10 | def graph_to_dataframes( 11 | graph: nx.Graph, 12 | node_columns: list[str] | None = None, 13 | edge_columns: list[str] | None = None, 14 | node_id: str = "title", 15 | ) -> tuple[pd.DataFrame, pd.DataFrame]: 16 | """Deconstructs an nx.Graph into nodes and edges dataframes.""" 17 | # nx graph nodes are a tuple, and creating a df from them results in the id being the index 18 | nodes = pd.DataFrame.from_dict(dict(graph.nodes(data=True)), orient="index") 19 | nodes[node_id] = nodes.index 20 | nodes.reset_index(inplace=True, drop=True) 21 | 22 | edges = nx.to_pandas_edgelist(graph) 23 | 24 | # we don't deal in directed graphs, but we do need to ensure consistent ordering for df joins 25 | # nx loses the initial ordering 26 | edges["min_source"] = edges[["source", "target"]].min(axis=1) 27 | edges["max_target"] = edges[["source", "target"]].max(axis=1) 28 | edges = edges.drop(columns=["source", "target"]).rename( 29 | columns={"min_source": "source", "max_target": "target"} # type: ignore 30 | ) 31 | 32 | if node_columns: 33 | nodes = nodes.loc[:, node_columns] 34 | 35 | if edge_columns: 36 | edges = edges.loc[:, edge_columns] 37 | 38 | return (nodes, edges) 39 | -------------------------------------------------------------------------------- /graphrag/index/operations/layout_graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph layout package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/layout_graph/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | # Use this for now instead of a wrapper 5 | """A module containing 'NodePosition' model.""" 6 | 7 | from dataclasses import dataclass 8 | 9 | 10 | @dataclass 11 | class NodePosition: 12 | """Node position class definition.""" 13 | 14 | label: str 15 | cluster: str 16 | size: float 17 | 18 | x: float 19 | y: float 20 | z: float | None = None 21 | 22 | def to_pandas(self) -> tuple[str, float, float, str, float]: 23 | """To pandas method definition.""" 24 | return self.label, self.x, self.y, self.cluster, self.size 25 | 26 | 27 | GraphLayout = list[NodePosition] 28 | -------------------------------------------------------------------------------- /graphrag/index/operations/snapshot_graphml.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
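A short sketch of the graph-to-dataframes round trip (assumes graphrag and networkx are installed; the node attributes are hypothetical):

import networkx as nx

from graphrag.index.operations.graph_to_dataframes import graph_to_dataframes

graph = nx.Graph()
graph.add_node("a", type="PERSON")
graph.add_node("b", type="ORG")
graph.add_edge("b", "a", weight=1.0)

nodes, edges = graph_to_dataframes(graph)
print(nodes)  # columns: type, title
print(edges)  # columns: weight, source, target; the min/max normalization guarantees source <= target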
2 | # Licensed under the MIT License 3 | 4 | """A module containing snapshot_graphml method definition.""" 5 | 6 | import networkx as nx 7 | 8 | from graphrag.storage.pipeline_storage import PipelineStorage 9 | 10 | 11 | async def snapshot_graphml( 12 | input: str | nx.Graph, 13 | name: str, 14 | storage: PipelineStorage, 15 | ) -> None: 16 | """Take an entire snapshot of a graph to standard graphml format.""" 17 | graphml = input if isinstance(input, str) else "\n".join(nx.generate_graphml(input)) 18 | await storage.set(name + ".graphml", graphml) 19 | -------------------------------------------------------------------------------- /graphrag/index/operations/summarize_communities/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Community summarization modules.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/summarize_communities/explode_communities.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Explode a list of communities into nodes for filtering.""" 5 | 6 | import pandas as pd 7 | 8 | from graphrag.data_model.schemas import ( 9 | COMMUNITY_ID, 10 | ) 11 | 12 | 13 | def explode_communities( 14 | communities: pd.DataFrame, entities: pd.DataFrame 15 | ) -> pd.DataFrame: 16 | """Explode a list of communities into nodes for filtering.""" 17 | community_join = communities.explode("entity_ids").loc[ 18 | :, ["community", "level", "entity_ids"] 19 | ] 20 | nodes = entities.merge( 21 | community_join, left_on="id", right_on="entity_ids", how="left" 22 | ) 23 | return nodes.loc[nodes.loc[:, COMMUNITY_ID] != -1] 24 | -------------------------------------------------------------------------------- /graphrag/index/operations/summarize_communities/graph_context/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Package of context builders for graph-based reports.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/summarize_communities/text_unit_context/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Package of context builders for text unit-based reports.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/summarize_communities/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """A module containing community report generation utilities.""" 5 | 6 | import pandas as pd 7 | 8 | import graphrag.data_model.schemas as schemas 9 | 10 | 11 | def get_levels( 12 | df: pd.DataFrame, level_column: str = schemas.COMMUNITY_LEVEL 13 | ) -> list[int]: 14 | """Get the levels of the communities.""" 15 | levels = df[level_column].dropna().unique() 16 | levels = [int(lvl) for lvl in levels if lvl != -1] 17 | return sorted(levels, reverse=True) 18 | -------------------------------------------------------------------------------- /graphrag/index/operations/summarize_descriptions/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Root package for description summarization.""" 5 | 6 | from graphrag.index.operations.summarize_descriptions.summarize_descriptions import ( 7 | summarize_descriptions, 8 | ) 9 | from graphrag.index.operations.summarize_descriptions.typing import ( 10 | SummarizationStrategy, 11 | SummarizeStrategyType, 12 | ) 13 | 14 | __all__ = [ 15 | "SummarizationStrategy", 16 | "SummarizeStrategyType", 17 | "summarize_descriptions", 18 | ] 19 | -------------------------------------------------------------------------------- /graphrag/index/operations/summarize_descriptions/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing 'SummarizedDescriptionResult' model.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | from enum import Enum 9 | from typing import Any, NamedTuple 10 | 11 | from graphrag.cache.pipeline_cache import PipelineCache 12 | from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks 13 | 14 | StrategyConfig = dict[str, Any] 15 | 16 | 17 | @dataclass 18 | class SummarizedDescriptionResult: 19 | """Entity summarization result class definition.""" 20 | 21 | id: str | tuple[str, str] 22 | description: str 23 | 24 | 25 | SummarizationStrategy = Callable[ 26 | [ 27 | str | tuple[str, str], 28 | list[str], 29 | WorkflowCallbacks, 30 | PipelineCache, 31 | StrategyConfig, 32 | ], 33 | Awaitable[SummarizedDescriptionResult], 34 | ] 35 | 36 | 37 | class DescriptionSummarizeRow(NamedTuple): 38 | """DescriptionSummarizeRow class definition.""" 39 | 40 | graph: Any 41 | 42 | 43 | class SummarizeStrategyType(str, Enum): 44 | """SummarizeStrategyType class definition.""" 45 | 46 | graph_intelligence = "graph_intelligence" 47 | 48 | def __repr__(self): 49 | """Get a string representation.""" 50 | return f'"{self.value}"' 51 | -------------------------------------------------------------------------------- /graphrag/index/run/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Run module for GraphRAG.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/text_splitting/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
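A minimal sketch of get_levels (assumes graphrag and pandas are installed; the level column name is passed explicitly here rather than relying on the schema constant):

import pandas as pd

from graphrag.index.operations.summarize_communities.utils import get_levels

communities = pd.DataFrame({
    "community": [0, 1, 2, 3, 4],
    "level": [0, 0, 1, 1, -1],
})

# Deepest level first; the -1 placeholder level is filtered out.
print(get_levels(communities, level_column="level"))  # [1, 0]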
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine Text Splitting package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/text_splitting/check_token_limit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Token limit method definition.""" 5 | 6 | from graphrag.index.text_splitting.text_splitting import TokenTextSplitter 7 | 8 | 9 | def check_token_limit(text, max_token): 10 | """Check token limit.""" 11 | text_splitter = TokenTextSplitter(chunk_size=max_token, chunk_overlap=0) 12 | docs = text_splitter.split_text(text) 13 | if len(docs) > 1: 14 | return 0 15 | return 1 16 | -------------------------------------------------------------------------------- /graphrag/index/typing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Root typings for GraphRAG.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/typing/context.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | # isort: skip_file 5 | """A module containing the 'PipelineRunContext' models.""" 6 | 7 | from dataclasses import dataclass 8 | 9 | from graphrag.cache.pipeline_cache import PipelineCache 10 | from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks 11 | from graphrag.index.typing.state import PipelineState 12 | from graphrag.index.typing.stats import PipelineRunStats 13 | from graphrag.storage.pipeline_storage import PipelineStorage 14 | 15 | 16 | @dataclass 17 | class PipelineRunContext: 18 | """Provides the context for the current pipeline run.""" 19 | 20 | stats: PipelineRunStats 21 | storage: PipelineStorage 22 | "Long-term storage for pipeline verbs to use. Items written here will be written to the storage provider." 23 | cache: PipelineCache 24 | "Cache instance for reading previous LLM responses." 25 | callbacks: WorkflowCallbacks 26 | "Callbacks to be called during the pipeline run." 27 | state: PipelineState 28 | "Arbitrary property bag for runtime state, persistent pre-computes, or experimental features." 29 | -------------------------------------------------------------------------------- /graphrag/index/typing/error_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Shared error handler types.""" 5 | 6 | from collections.abc import Callable 7 | 8 | ErrorHandlerFn = Callable[[BaseException | None, str | None, dict | None], None] 9 | -------------------------------------------------------------------------------- /graphrag/index/typing/pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing the Pipeline class.""" 5 | 6 | from collections.abc import Generator 7 | 8 | from graphrag.index.typing.workflow import Workflow 9 | 10 | 11 | class Pipeline: 12 | """Encapsulates running workflows.""" 13 | 14 | def __init__(self, workflows: list[Workflow]): 15 | self.workflows = workflows 16 | 17 | def run(self) -> Generator[Workflow]: 18 | """Return a Generator over the pipeline workflows.""" 19 | yield from self.workflows 20 | 21 | def names(self) -> list[str]: 22 | """Return the names of the workflows in the pipeline.""" 23 | return [name for name, _ in self.workflows] 24 | -------------------------------------------------------------------------------- /graphrag/index/typing/pipeline_run_result.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing the PipelineRunResult class.""" 5 | 6 | from dataclasses import dataclass 7 | from typing import Any 8 | 9 | from graphrag.index.typing.state import PipelineState 10 | 11 | 12 | @dataclass 13 | class PipelineRunResult: 14 | """Pipeline run result class definition.""" 15 | 16 | workflow: str 17 | """The name of the workflow that was executed.""" 18 | result: Any | None 19 | """The result of the workflow function. This can be anything - we use it only for logging downstream, and expect each workflow function to write official outputs to the provided storage.""" 20 | state: PipelineState 21 | """Ongoing pipeline context state object.""" 22 | errors: list[BaseException] | None 23 | -------------------------------------------------------------------------------- /graphrag/index/typing/state.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Pipeline state types.""" 5 | 6 | from typing import Any 7 | 8 | PipelineState = dict[Any, Any] 9 | -------------------------------------------------------------------------------- /graphrag/index/typing/stats.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Pipeline stats types.""" 5 | 6 | from dataclasses import dataclass, field 7 | 8 | 9 | @dataclass 10 | class PipelineRunStats: 11 | """Pipeline running stats.""" 12 | 13 | total_runtime: float = field(default=0) 14 | """Float representing the total runtime.""" 15 | 16 | num_documents: int = field(default=0) 17 | """Number of documents.""" 18 | 19 | input_load_time: float = field(default=0) 20 | """Float representing the input load time.""" 21 | 22 | workflows: dict[str, dict[str, float]] = field(default_factory=dict) 23 | """A dictionary of workflows.""" 24 | -------------------------------------------------------------------------------- /graphrag/index/typing/workflow.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
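A toy sketch of the (name, function) workflow tuples that Pipeline wraps (assumes the graphrag package is installed; the no-op workflow and the None stand-ins for config and context are purely illustrative):

import asyncio

from graphrag.index.typing.pipeline import Pipeline
from graphrag.index.typing.workflow import WorkflowFunctionOutput


async def noop_workflow(_config, _context) -> WorkflowFunctionOutput:
    """Do nothing; illustrates the (name, function) tuple shape only."""
    return WorkflowFunctionOutput(result=None)


pipeline = Pipeline([("noop", noop_workflow)])
print(pipeline.names())  # ['noop']

for name, workflow in pipeline.run():
    # None stands in for the GraphRagConfig and PipelineRunContext a real runner would pass.
    print(name, asyncio.run(workflow(None, None)))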
2 | # Licensed under the MIT License 3 | 4 | """Pipeline workflow types.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | from typing import Any 9 | 10 | from graphrag.config.models.graph_rag_config import GraphRagConfig 11 | from graphrag.index.typing.context import PipelineRunContext 12 | 13 | 14 | @dataclass 15 | class WorkflowFunctionOutput: 16 | """Data container for Workflow function results.""" 17 | 18 | result: Any | None 19 | """The result of the workflow function. This can be anything - we use it only for logging downstream, and expect each workflow function to write official outputs to the provided storage.""" 20 | 21 | 22 | WorkflowFunction = Callable[ 23 | [GraphRagConfig, PipelineRunContext], 24 | Awaitable[WorkflowFunctionOutput], 25 | ] 26 | Workflow = tuple[str, WorkflowFunction] 27 | -------------------------------------------------------------------------------- /graphrag/index/update/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Incremental Indexing main module definition.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Utils methods definition.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/utils/dicts.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A utility module containing methods for inspecting and verifying dictionary types.""" 5 | 6 | 7 | def dict_has_keys_with_types( 8 | data: dict, expected_fields: list[tuple[str, type]], inplace: bool = False 9 | ) -> bool: 10 | """Return True if the given dictionary has the given keys with the given types.""" 11 | for field, field_type in expected_fields: 12 | if field not in data: 13 | return False 14 | 15 | value = data[field] 16 | try: 17 | cast_value = field_type(value) 18 | if inplace: 19 | data[field] = cast_value 20 | except (TypeError, ValueError): 21 | return False 22 | return True 23 | -------------------------------------------------------------------------------- /graphrag/index/utils/hashing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Hashing utilities.""" 5 | 6 | from collections.abc import Iterable 7 | from hashlib import sha512 8 | from typing import Any 9 | 10 | 11 | def gen_sha512_hash(item: dict[str, Any], hashcode: Iterable[str]): 12 | """Generate a SHA512 hash.""" 13 | hashed = "".join([str(item[column]) for column in hashcode]) 14 | return f"{sha512(hashed.encode('utf-8'), usedforsecurity=False).hexdigest()}" 15 | -------------------------------------------------------------------------------- /graphrag/index/utils/is_null.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
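A small sketch of the dictionary and hashing utilities above (assumes the graphrag package is installed; the record values are made up):

from graphrag.index.utils.dicts import dict_has_keys_with_types
from graphrag.index.utils.hashing import gen_sha512_hash

record = {"title": "Operation Dulce", "degree": "3"}

# "3" is castable to int, so the check passes; inplace=True also rewrites the value.
print(dict_has_keys_with_types(record, [("title", str), ("degree", int)], inplace=True))  # True
print(record["degree"])  # 3 (now an int)

# Hash only the named columns; the same input always yields the same id.
print(gen_sha512_hash(record, ["title"]))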
2 | # Licensed under the MIT License 3 | 4 | """Defines the is_null utility.""" 5 | 6 | import math 7 | from typing import Any 8 | 9 | 10 | def is_null(value: Any) -> bool: 11 | """Check if value is null or is nan.""" 12 | 13 | def is_none() -> bool: 14 | return value is None 15 | 16 | def is_nan() -> bool: 17 | return isinstance(value, float) and math.isnan(value) 18 | 19 | return is_none() or is_nan() 20 | -------------------------------------------------------------------------------- /graphrag/index/utils/rate_limiter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Rate limiter utility.""" 5 | 6 | import asyncio 7 | import time 8 | 9 | 10 | class RateLimiter: 11 | """ 12 | The original TpmRpmLLMLimiter strategy did not account for minute-based rate limiting when scheduled. 13 | 14 | The RateLimiter was introduced to ensure that the CommunityReportsExtractor could be scheduled to adhere to rate configurations on a per-minute basis. 15 | """ 16 | 17 | # TODO: RateLimiter scheduled: using asyncio for async_mode 18 | 19 | def __init__(self, rate: int, per: int): 20 | self.rate = rate 21 | self.per = per 22 | self.allowance = rate 23 | self.last_check = time.monotonic() 24 | 25 | async def acquire(self): 26 | """Acquire a token from the rate limiter.""" 27 | current = time.monotonic() 28 | elapsed = current - self.last_check 29 | self.last_check = current 30 | self.allowance += elapsed * (self.rate / self.per) 31 | 32 | if self.allowance > self.rate: 33 | self.allowance = self.rate 34 | 35 | if self.allowance < 1.0: 36 | sleep_time = (1.0 - self.allowance) * (self.per / self.rate) 37 | await asyncio.sleep(sleep_time) 38 | self.allowance = 0.0 39 | else: 40 | self.allowance -= 1.0 41 | -------------------------------------------------------------------------------- /graphrag/index/utils/string.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """String utilities.""" 5 | 6 | import html 7 | import re 8 | from typing import Any 9 | 10 | 11 | def clean_str(input: Any) -> str: 12 | """Clean an input string by removing HTML escapes, control characters, and other unwanted characters.""" 13 | # If we get non-string input, just give it back 14 | if not isinstance(input, str): 15 | return input 16 | 17 | result = html.unescape(input.strip()) 18 | # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python 19 | return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) 20 | -------------------------------------------------------------------------------- /graphrag/index/utils/uuid.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """UUID utilities.""" 5 | 6 | import uuid 7 | from random import Random, getrandbits 8 | 9 | 10 | def gen_uuid(rd: Random | None = None): 11 | """Generate a random UUID v4.""" 12 | return uuid.UUID( 13 | int=rd.getrandbits(128) if rd is not None else getrandbits(128), version=4 14 | ).hex 15 | -------------------------------------------------------------------------------- /graphrag/index/workflows/update_clean_state.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
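A minimal sketch of RateLimiter pacing a loop of requests (assumes the graphrag package is installed; the rate of 5 per second is arbitrary):

import asyncio
import time

from graphrag.index.utils.rate_limiter import RateLimiter


async def main() -> None:
    limiter = RateLimiter(rate=5, per=1)  # roughly 5 acquisitions per second
    start = time.monotonic()
    for i in range(10):
        await limiter.acquire()
        print(f"request {i} at {time.monotonic() - start:.2f}s")


asyncio.run(main())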
2 | # Licensed under the MIT License 3 | 4 | """A module containing run_workflow method definition.""" 5 | 6 | import logging 7 | 8 | from graphrag.config.models.graph_rag_config import GraphRagConfig 9 | from graphrag.index.typing.context import PipelineRunContext 10 | from graphrag.index.typing.workflow import WorkflowFunctionOutput 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | async def run_workflow( # noqa: RUF029 16 | _config: GraphRagConfig, 17 | context: PipelineRunContext, 18 | ) -> WorkflowFunctionOutput: 19 | """Clean the state after the update.""" 20 | logger.info("Cleaning State") 21 | keys_to_delete = [ 22 | key_name 23 | for key_name in context.state 24 | if key_name.startswith("incremental_update_") 25 | ] 26 | 27 | for key_name in keys_to_delete: 28 | del context.state[key_name] 29 | 30 | return WorkflowFunctionOutput(result=None) 31 | -------------------------------------------------------------------------------- /graphrag/index/workflows/update_final_documents.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing run_workflow method definition.""" 5 | 6 | import logging 7 | 8 | from graphrag.config.models.graph_rag_config import GraphRagConfig 9 | from graphrag.index.run.utils import get_update_storages 10 | from graphrag.index.typing.context import PipelineRunContext 11 | from graphrag.index.typing.workflow import WorkflowFunctionOutput 12 | from graphrag.index.update.incremental_index import concat_dataframes 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | async def run_workflow( 18 | config: GraphRagConfig, 19 | context: PipelineRunContext, 20 | ) -> WorkflowFunctionOutput: 21 | """Update the documents from an incremental index run.""" 22 | logger.info("Updating Documents") 23 | output_storage, previous_storage, delta_storage = get_update_storages( 24 | config, context.state["update_timestamp"] 25 | ) 26 | 27 | final_documents = await concat_dataframes( 28 | "documents", previous_storage, delta_storage, output_storage 29 | ) 30 | 31 | context.state["incremental_update_final_documents"] = final_documents 32 | 33 | return WorkflowFunctionOutput(result=None) 34 | -------------------------------------------------------------------------------- /graphrag/language_model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Language Models module. Allows for provider registrations while providing some out-of-the-box solutions.""" 5 | -------------------------------------------------------------------------------- /graphrag/language_model/cache/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Cache provider definitions for Language Models.""" 5 | -------------------------------------------------------------------------------- /graphrag/language_model/cache/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """Base cache protocol definition.""" 5 | 6 | from typing import Any, Protocol 7 | 8 | 9 | class ModelCache(Protocol): 10 | """Base cache protocol.""" 11 | 12 | async def has(self, key: str) -> bool: 13 | """Check if the cache has a value.""" 14 | ... 15 | 16 | async def get(self, key: str) -> Any | None: 17 | """Retrieve a value from the cache.""" 18 | ... 19 | 20 | async def set( 21 | self, key: str, value: Any, metadata: dict[str, Any] | None = None 22 | ) -> None: 23 | """Write a value into the cache.""" 24 | ... 25 | 26 | async def remove(self, key: str) -> None: 27 | """Remove a value from the cache.""" 28 | ... 29 | 30 | async def clear(self) -> None: 31 | """Clear the cache.""" 32 | ... 33 | 34 | def child(self, key: str) -> Any: 35 | """Create a child cache.""" 36 | ... 37 | -------------------------------------------------------------------------------- /graphrag/language_model/events/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Model Event handler modules.""" 5 | -------------------------------------------------------------------------------- /graphrag/language_model/events/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Base model events protocol.""" 5 | 6 | from typing import Any, Protocol 7 | 8 | 9 | class ModelEventHandler(Protocol): 10 | """Protocol for Model event handling.""" 11 | 12 | async def on_error( 13 | self, 14 | error: BaseException | None, 15 | traceback: str | None = None, 16 | arguments: dict[str, Any] | None = None, 17 | ) -> None: 18 | """Handle a model error.""" 19 | ... 20 | -------------------------------------------------------------------------------- /graphrag/language_model/protocol/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Base protocol definitions for LLMs.""" 5 | 6 | from .base import ChatModel, EmbeddingModel 7 | 8 | __all__ = ["ChatModel", "EmbeddingModel"] 9 | -------------------------------------------------------------------------------- /graphrag/language_model/providers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Model Providers module.""" 5 | -------------------------------------------------------------------------------- /graphrag/language_model/providers/fnllm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """FNLLM provider module.""" 5 | -------------------------------------------------------------------------------- /graphrag/language_model/providers/fnllm/cache.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation.
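A toy in-memory implementation of the ModelCache protocol, to illustrate the expected surface (a sketch, not part of the package; the graphrag import is used for the type annotation only):

from typing import Any

from graphrag.language_model.cache.base import ModelCache


class InMemoryModelCache:
    """Toy dict-backed cache that satisfies the ModelCache protocol."""

    def __init__(self) -> None:
        self._store: dict[str, Any] = {}

    async def has(self, key: str) -> bool:
        return key in self._store

    async def get(self, key: str) -> Any | None:
        return self._store.get(key)

    async def set(
        self, key: str, value: Any, metadata: dict[str, Any] | None = None
    ) -> None:
        self._store[key] = value

    async def remove(self, key: str) -> None:
        self._store.pop(key, None)

    async def clear(self) -> None:
        self._store.clear()

    def child(self, key: str) -> "InMemoryModelCache":
        return self  # no namespacing in this toy version


cache: ModelCache = InMemoryModelCache()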
2 | # Licensed under the MIT License 3 | 4 | """FNLLM Cache provider.""" 5 | 6 | from typing import Any 7 | 8 | from fnllm.caching import Cache as FNLLMCache 9 | 10 | from graphrag.cache.pipeline_cache import PipelineCache 11 | 12 | 13 | class FNLLMCacheProvider(FNLLMCache): 14 | """A cache for the pipeline.""" 15 | 16 | def __init__(self, cache: PipelineCache): 17 | self._cache = cache 18 | 19 | async def has(self, key: str) -> bool: 20 | """Check if the cache has a value.""" 21 | return await self._cache.has(key) 22 | 23 | async def get(self, key: str) -> Any | None: 24 | """Retrieve a value from the cache.""" 25 | return await self._cache.get(key) 26 | 27 | async def set( 28 | self, key: str, value: Any, metadata: dict[str, Any] | None = None 29 | ) -> None: 30 | """Write a value into the cache.""" 31 | await self._cache.set(key, value, metadata) 32 | 33 | async def remove(self, key: str) -> None: 34 | """Remove a value from the cache.""" 35 | await self._cache.delete(key) 36 | 37 | async def clear(self) -> None: 38 | """Clear the cache.""" 39 | await self._cache.clear() 40 | 41 | def child(self, key: str) -> "FNLLMCacheProvider": 42 | """Create a child cache.""" 43 | child_cache = self._cache.child(key) 44 | return FNLLMCacheProvider(child_cache) 45 | -------------------------------------------------------------------------------- /graphrag/language_model/providers/fnllm/events.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """FNLLM llm events provider.""" 5 | 6 | from typing import Any 7 | 8 | from fnllm.events import LLMEvents 9 | 10 | from graphrag.index.typing.error_handler import ErrorHandlerFn 11 | 12 | 13 | class FNLLMEvents(LLMEvents): 14 | """FNLLM events handler that calls the error handler.""" 15 | 16 | def __init__(self, on_error: ErrorHandlerFn): 17 | self._on_error = on_error 18 | 19 | async def on_error( 20 | self, 21 | error: BaseException | None, 22 | traceback: str | None = None, 23 | arguments: dict[str, Any] | None = None, 24 | ) -> None: 25 | """Handle an fnllm error.""" 26 | self._on_error(error, traceback, arguments) 27 | -------------------------------------------------------------------------------- /graphrag/language_model/response/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing Model response definitions.""" 5 | -------------------------------------------------------------------------------- /graphrag/language_model/response/base.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | from typing import Any, Generic, Protocol, TypeVar 5 | 6 | from pydantic import BaseModel 7 | 8 | _T = TypeVar("_T", bound=BaseModel, covariant=True) 9 | 10 | class ModelOutput(Protocol): 11 | @property 12 | def content(self) -> str: ... 13 | @property 14 | def full_response(self) -> dict[str, Any] | None: ... 15 | 16 | class ModelResponse(Protocol, Generic[_T]): 17 | @property 18 | def output(self) -> ModelOutput: ... 19 | @property 20 | def parsed_response(self) -> _T | None: ... 21 | @property 22 | def history(self) -> list[Any]: ... 
23 | 24 | class BaseModelOutput(BaseModel): 25 | content: str 26 | full_response: dict[str, Any] | None 27 | 28 | def __init__( 29 | self, 30 | content: str, 31 | full_response: dict[str, Any] | None = None, 32 | ) -> None: ... 33 | 34 | class BaseModelResponse(BaseModel, Generic[_T]): 35 | output: BaseModelOutput 36 | parsed_response: _T | None 37 | history: list[Any] 38 | tool_calls: list[Any] 39 | metrics: Any | None 40 | cache_hit: bool | None 41 | 42 | def __init__( 43 | self, 44 | output: BaseModelOutput, 45 | parsed_response: _T | None = None, 46 | history: list[Any] = ..., # default provided by Pydantic 47 | tool_calls: list[Any] = ..., # default provided by Pydantic 48 | metrics: Any | None = None, 49 | cache_hit: bool | None = None, 50 | ) -> None: ... 51 | -------------------------------------------------------------------------------- /graphrag/logger/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Logger utilities and implementations.""" 5 | -------------------------------------------------------------------------------- /graphrag/logger/console.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Console Log.""" 5 | 6 | from typing import Any 7 | 8 | from graphrag.logger.base import StatusLogger 9 | 10 | 11 | class ConsoleReporter(StatusLogger): 12 | """A logger that writes to a console.""" 13 | 14 | def error(self, message: str, details: dict[str, Any] | None = None): 15 | """Log an error.""" 16 | print(message, details) # noqa T201 17 | 18 | def warning(self, message: str, details: dict[str, Any] | None = None): 19 | """Log a warning.""" 20 | _print_warning(message) 21 | 22 | def log(self, message: str, details: dict[str, Any] | None = None): 23 | """Log a log.""" 24 | print(message, details) # noqa T201 25 | 26 | 27 | def _print_warning(skk): 28 | print(f"\033[93m {skk}\033[00m") # noqa T201 29 | -------------------------------------------------------------------------------- /graphrag/logger/null_progress.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
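A brief usage sketch for ConsoleReporter (assumes the graphrag package is installed; the messages and details below are hypothetical):

from graphrag.logger.console import ConsoleReporter

reporter = ConsoleReporter()
reporter.log("pipeline started", {"workflows": 3})
reporter.warning("falling back to defaults")  # printed in yellow
reporter.error("workflow failed", {"name": "extract_graph"})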
2 | # Licensed under the MIT License 3 | 4 | """Null Progress Reporter.""" 5 | 6 | from graphrag.logger.base import Progress, ProgressLogger 7 | 8 | 9 | class NullProgressLogger(ProgressLogger): 10 | """A progress logger that does nothing.""" 11 | 12 | def __call__(self, update: Progress) -> None: 13 | """Update progress.""" 14 | 15 | def dispose(self) -> None: 16 | """Dispose of the progress logger.""" 17 | 18 | def child(self, prefix: str, transient: bool = True) -> ProgressLogger: 19 | """Create a child progress bar.""" 20 | return self 21 | 22 | def force_refresh(self) -> None: 23 | """Force a refresh.""" 24 | 25 | def stop(self) -> None: 26 | """Stop the progress logger.""" 27 | 28 | def error(self, message: str) -> None: 29 | """Log an error.""" 30 | 31 | def warning(self, message: str) -> None: 32 | """Log a warning.""" 33 | 34 | def info(self, message: str) -> None: 35 | """Log information.""" 36 | 37 | def success(self, message: str) -> None: 38 | """Log success.""" 39 | -------------------------------------------------------------------------------- /graphrag/logger/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Logging types. 5 | 6 | This module defines the types of loggers that can be used. 7 | """ 8 | 9 | from enum import Enum 10 | 11 | 12 | # Note: Code in this module was not included in the factory module because it negatively impacts the CLI experience. 13 | class LoggerType(str, Enum): 14 | """The type of logger to use.""" 15 | 16 | RICH = "rich" 17 | PRINT = "print" 18 | NONE = "none" 19 | 20 | def __str__(self): 21 | """Return a string representation of the enum value.""" 22 | return self.value 23 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The prompt-tuning package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Default values for the prompt-tuning module. 5 | 6 | Note: These values get accessed from the CLI to set default behavior. 7 | To maintain fast responsiveness from the CLI, do not add long-running code in this file and be mindful of imports. 8 | """ 9 | 10 | DEFAULT_TASK = """ 11 | Identify the relations and structure of the community of interest, specifically within the {domain} domain. 12 | """ 13 | 14 | K = 15 15 | LIMIT = 15 16 | MAX_TOKEN_COUNT = 2000 17 | MIN_CHUNK_SIZE = 200 18 | N_SUBSET_MAX = 300 19 | MIN_CHUNK_OVERLAP = 0 20 | PROMPT_TUNING_MODEL_ID = "default_chat_model" 21 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Prompt generation module.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/community_report_rating.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Generate a rating description for community report rating.""" 5 | 6 | from graphrag.language_model.protocol.base import ChatModel 7 | from graphrag.prompt_tune.prompt.community_report_rating import ( 8 | GENERATE_REPORT_RATING_PROMPT, 9 | ) 10 | 11 | 12 | async def generate_community_report_rating( 13 | model: ChatModel, domain: str, persona: str, docs: str | list[str] 14 | ) -> str: 15 | """Generate a rating description for community report rating. 16 | 17 | Parameters 18 | ---------- 19 | - llm (CompletionLLM): The LLM to use for generation 20 | - domain (str): The domain to generate a rating for 21 | - persona (str): The persona to generate a rating for 22 | - docs (str | list[str]): Documents used to contextualize the rating 23 | 24 | Returns 25 | ------- 26 | - str: The generated rating description prompt response. 27 | """ 28 | docs_str = " ".join(docs) if isinstance(docs, list) else docs 29 | domain_prompt = GENERATE_REPORT_RATING_PROMPT.format( 30 | domain=domain, persona=persona, input_text=docs_str 31 | ) 32 | 33 | response = await model.achat(domain_prompt) 34 | 35 | return str(response.output.content).strip() 36 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/community_reporter_role.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Generate a community reporter role for community summarization.""" 5 | 6 | from graphrag.language_model.protocol.base import ChatModel 7 | from graphrag.prompt_tune.prompt.community_reporter_role import ( 8 | GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT, 9 | ) 10 | 11 | 12 | async def generate_community_reporter_role( 13 | model: ChatModel, domain: str, persona: str, docs: str | list[str] 14 | ) -> str: 15 | """Generate a community reporter role for community summarization. 16 | 17 | Parameters 18 | ---------- 19 | - llm (CompletionLLM): The LLM to use for generation 20 | - domain (str): The domain to generate a role for 21 | - persona (str): The persona to generate a role for 22 | - docs (str | list[str]): Documents used to contextualize the role 23 | 24 | Returns 25 | ------- 26 | - str: The generated community reporter role prompt response. 27 | """ 28 | docs_str = " ".join(docs) if isinstance(docs, list) else docs 29 | domain_prompt = GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT.format( 30 | domain=domain, persona=persona, input_text=docs_str 31 | ) 32 | 33 | response = await model.achat(domain_prompt) 34 | 35 | return str(response.output.content) 36 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/domain.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """Domain generation for GraphRAG prompts.""" 5 | 6 | from graphrag.language_model.protocol.base import ChatModel 7 | from graphrag.prompt_tune.prompt.domain import GENERATE_DOMAIN_PROMPT 8 | 9 | 10 | async def generate_domain(model: ChatModel, docs: str | list[str]) -> str: 11 | """Generate a domain to use for GraphRAG prompts. 12 | 13 | Parameters 14 | ---------- 15 | - llm (CompletionLLM): The LLM to use for generation 16 | - docs (str | list[str]): The docs to generate a domain from 17 | 18 | Returns 19 | ------- 20 | - str: The generated domain prompt response. 21 | """ 22 | docs_str = " ".join(docs) if isinstance(docs, list) else docs 23 | domain_prompt = GENERATE_DOMAIN_PROMPT.format(input_text=docs_str) 24 | 25 | response = await model.achat(domain_prompt) 26 | 27 | return str(response.output.content) 28 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/entity_summarization_prompt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Entity summarization prompt generation module.""" 5 | 6 | from pathlib import Path 7 | 8 | from graphrag.prompt_tune.template.entity_summarization import ( 9 | ENTITY_SUMMARIZATION_PROMPT, 10 | ) 11 | 12 | ENTITY_SUMMARIZATION_FILENAME = "summarize_descriptions.txt" 13 | 14 | 15 | def create_entity_summarization_prompt( 16 | persona: str, 17 | language: str, 18 | output_path: Path | None = None, 19 | ) -> str: 20 | """ 21 | Create a prompt for entity summarization. 22 | 23 | Parameters 24 | ---------- 25 | - persona (str): The persona to use for the entity summarization prompt 26 | - language (str): The language to use for the entity summarization prompt 27 | - output_path (Path | None): The path to write the prompt to. Default is None. 28 | """ 29 | prompt = ENTITY_SUMMARIZATION_PROMPT.format(persona=persona, language=language) 30 | 31 | if output_path: 32 | output_path.mkdir(parents=True, exist_ok=True) 33 | 34 | output_path = output_path / ENTITY_SUMMARIZATION_FILENAME 35 | # Write file to output path 36 | with output_path.open("wb") as file: 37 | file.write(prompt.encode(encoding="utf-8", errors="strict")) 38 | 39 | return prompt 40 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/language.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Language detection for GraphRAG prompts.""" 5 | 6 | from graphrag.language_model.protocol.base import ChatModel 7 | from graphrag.prompt_tune.prompt.language import DETECT_LANGUAGE_PROMPT 8 | 9 | 10 | async def detect_language(model: ChatModel, docs: str | list[str]) -> str: 11 | """Detect input language to use for GraphRAG prompts. 12 | 13 | Parameters 14 | ---------- 15 | - llm (CompletionLLM): The LLM to use for generation 16 | - docs (str | list[str]): The docs to detect language from 17 | 18 | Returns 19 | ------- 20 | - str: The detected language.
21 | """ 22 | docs_str = " ".join(docs) if isinstance(docs, list) else docs 23 | language_prompt = DETECT_LANGUAGE_PROMPT.format(input_text=docs_str) 24 | 25 | response = await model.achat(language_prompt) 26 | 27 | return str(response.output.content) 28 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/persona.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Persona generating module for fine-tuning GraphRAG prompts.""" 5 | 6 | from graphrag.language_model.protocol.base import ChatModel 7 | from graphrag.prompt_tune.defaults import DEFAULT_TASK 8 | from graphrag.prompt_tune.prompt.persona import GENERATE_PERSONA_PROMPT 9 | 10 | 11 | async def generate_persona( 12 | model: ChatModel, domain: str, task: str = DEFAULT_TASK 13 | ) -> str: 14 | """Generate an LLM persona to use for GraphRAG prompts. 15 | 16 | Parameters 17 | ---------- 18 | - llm (CompletionLLM): The LLM to use for generation 19 | - domain (str): The domain to generate a persona for 20 | - task (str): The task to generate a persona for. Default is DEFAULT_TASK 21 | """ 22 | formatted_task = task.format(domain=domain) 23 | persona_prompt = GENERATE_PERSONA_PROMPT.format(sample_task=formatted_task) 24 | 25 | response = await model.achat(persona_prompt) 26 | 27 | return str(response.output.content) 28 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/loader/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning config and data loader module.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Persona, entity type, relationships and domain generation prompts module.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/community_reporter_role.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for community reporter role generation.""" 5 | 6 | GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT = """ 7 | {persona} 8 | Given a sample text, help the user by creating a role definition that will be tasked with community analysis. 9 | Take a look at this example, determine its key parts, and using the domain provided and your expertise, create a new role definition for the provided inputs that follows the same pattern as the example. 10 | Remember, your output should look just like the provided example in structure and content. 11 | 12 | Example: 13 | A technologist reporter that is analyzing Kevin Scott's "Behind the Tech Podcast", given a list of entities 14 | that belong to the community as well as their relationships and optional associated claims. 15 | The report will be used to inform decision-makers about significant developments associated with the community and their potential impact. 
16 | 17 | 18 | Domain: {domain} 19 | Text: {input_text} 20 | Role:""" 21 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/domain.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for domain generation.""" 5 | 6 | GENERATE_DOMAIN_PROMPT = """ 7 | You are an intelligent assistant that helps a human to analyze the information in a text document. 8 | Given a sample text, help the user by assigning a descriptive domain that summarizes what the text is about. 9 | Example domains are: "Social studies", "Algorithmic analysis", "Medical science", among others. 10 | 11 | Text: {input_text} 12 | Domain:""" 13 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/language.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for language detection.""" 5 | 6 | DETECT_LANGUAGE_PROMPT = """ 7 | You are an intelligent assistant that helps a human to analyze the information in a text document. 8 | Given a sample text, help the user by determining what's the primary language of the provided texts. 9 | Examples are: "English", "Spanish", "Japanese", "Portuguese" among others. Reply ONLY with the language name. 10 | 11 | Text: {input_text} 12 | Language:""" 13 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/persona.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for persona generation.""" 5 | 6 | GENERATE_PERSONA_PROMPT = """ 7 | You are an intelligent assistant that helps a human to analyze the information in a text document. 8 | Given a specific type of task and sample text, help the user by generating a 3 to 4 sentence description of an expert who could help solve the problem. 9 | Use a format similar to the following: 10 | You are an expert {{role}}. You are skilled at {{relevant skills}}. You are adept at helping people with {{specific task}}. 11 | 12 | task: {sample_task} 13 | persona description:""" 14 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/template/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for entity extraction, entity summarization, and community report summarization.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/template/entity_summarization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for entity summarization.""" 5 | 6 | ENTITY_SUMMARIZATION_PROMPT = """ 7 | {persona} 8 | Using your expertise, you're asked to generate a comprehensive summary of the data provided below. 9 | Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. 
10 | Please concatenate all of these into a single, concise description in {language}. Make sure to include information collected from all the descriptions. 11 | If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. 12 | Make sure it is written in third person, and include the entity names so we have the full context. 13 | 14 | Enrich it as much as you can with relevant information from the nearby text, this is very important. 15 | 16 | If no answer is possible, or the description is empty, only convey information that is provided within the text. 17 | ####### 18 | -Data- 19 | Entities: {{entity_name}} 20 | Description List: {{description_list}} 21 | ####### 22 | Output:""" 23 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Types for prompt tuning.""" 5 | 6 | from enum import Enum 7 | 8 | 9 | class DocSelectionType(str, Enum): 10 | """The type of document selection to use.""" 11 | 12 | ALL = "all" 13 | RANDOM = "random" 14 | TOP = "top" 15 | AUTO = "auto" 16 | 17 | def __str__(self): 18 | """Return the string representation of the enum value.""" 19 | return self.value 20 | -------------------------------------------------------------------------------- /graphrag/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """All prompts for the GraphRAG system.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompts/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """All prompts for the indexing engine.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompts/index/summarize_descriptions.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A file containing prompts definition.""" 5 | 6 | SUMMARIZE_PROMPT = """ 7 | You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. 8 | Given one or more entities, and a list of descriptions, all related to the same entity or group of entities. 9 | Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. 10 | If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. 11 | Make sure it is written in third person, and include the entity names so we have the full context. 12 | Limit the final description length to {max_length} words. 13 | 14 | ####### 15 | -Data- 16 | Entities: {entity_name} 17 | Description List: {description_list} 18 | ####### 19 | Output: 20 | """ 21 | -------------------------------------------------------------------------------- /graphrag/prompts/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """All prompts for the query engine.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompts/query/global_search_knowledge_system_prompt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Global Search system prompts.""" 5 | 6 | GENERAL_KNOWLEDGE_INSTRUCTION = """ 7 | The response may also include relevant real-world knowledge outside the dataset, but it must be explicitly annotated with a verification tag [LLM: verify]. For example: 8 | "This is an example sentence supported by real-world knowledge [LLM: verify]." 9 | """ 10 | -------------------------------------------------------------------------------- /graphrag/prompts/query/question_gen_system_prompt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Question Generation system prompts.""" 5 | 6 | QUESTION_SYSTEM_PROMPT = """ 7 | ---Role--- 8 | 9 | You are a helpful assistant generating a bulleted list of {question_count} questions about data in the tables provided. 10 | 11 | 12 | ---Data tables--- 13 | 14 | {context_data} 15 | 16 | 17 | ---Goal--- 18 | 19 | Given a series of example questions provided by the user, generate a bulleted list of {question_count} candidates for the next question. Use - marks as bullet points. 20 | 21 | These candidate questions should represent the most important or urgent information content or themes in the data tables. 22 | 23 | The candidate questions should be answerable using the data tables provided, but should not mention any specific data fields or data tables in the question text. 24 | 25 | If the user's questions reference several named entities, then each candidate question should reference all named entities. 26 | 27 | ---Example questions--- 28 | """ 29 | -------------------------------------------------------------------------------- /graphrag/py.typed: -------------------------------------------------------------------------------- 1 | # This package supports type hinting, 2 | # see https://www.python.org/dev/peps/pep-0561/#packaging-type-information -------------------------------------------------------------------------------- /graphrag/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The query engine package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/context_builder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Functions to build context for system prompt to generate responses for a user query.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/context_builder/rate_prompt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Global search with dynamic community selection prompt.""" 5 | 6 | RATE_QUERY = """ 7 | ---Role--- 8 | You are a helpful assistant responsible for deciding whether the provided information is useful in answering a given question, even if it is only partially relevant. 9 | ---Goal--- 10 | On a scale from 0 to 5, please rate how relevant or helpful the provided information is in answering the question. 11 | ---Information--- 12 | {description} 13 | ---Question--- 14 | {question} 15 | ---Target response length and format--- 16 | Please respond in the following JSON format with two entries: 17 | - "reason": the reasoning of your rating, please include information that you have considered. 18 | - "rating": the relevancy rating from 0 to 5, where 0 is the least relevant and 5 is the most relevant. 19 | {{ 20 | "reason": str, 21 | "rating": int 22 | }} 23 | """ 24 | -------------------------------------------------------------------------------- /graphrag/query/input/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestration Inputs.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/input/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestration Input Loaders.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/input/retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestration Input Retrieval.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/llm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Orchestration LLM utilities.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/question_gen/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Question Generation Module.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Structured Search package.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/basic_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """The BasicSearch package.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/drift_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """DriftSearch module.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/global_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GlobalSearch module.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/local_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The LocalSearch package.""" 5 | -------------------------------------------------------------------------------- /graphrag/storage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The storage package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Util functions for the GraphRAG package.""" 5 | -------------------------------------------------------------------------------- /graphrag/utils/storage.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Storage functions for the GraphRAG run module.""" 5 | 6 | import logging 7 | from io import BytesIO 8 | 9 | import pandas as pd 10 | 11 | from graphrag.storage.pipeline_storage import PipelineStorage 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | async def load_table_from_storage(name: str, storage: PipelineStorage) -> pd.DataFrame: 17 | """Load a parquet from the storage instance.""" 18 | filename = f"{name}.parquet" 19 | if not await storage.has(filename): 20 | msg = f"Could not find {filename} in storage!" 
21 | raise ValueError(msg) 22 | try: 23 | log.info("reading table from storage: %s", filename) 24 | return pd.read_parquet(BytesIO(await storage.get(filename, as_bytes=True))) 25 | except Exception: 26 | log.exception("error loading table from storage: %s", filename) 27 | raise 28 | 29 | 30 | async def write_table_to_storage( 31 | table: pd.DataFrame, name: str, storage: PipelineStorage 32 | ) -> None: 33 | """Write a table to storage.""" 34 | await storage.set(f"{name}.parquet", table.to_parquet()) 35 | 36 | 37 | async def delete_table_from_storage(name: str, storage: PipelineStorage) -> None: 38 | """Delete a table from storage.""" 39 | await storage.delete(f"{name}.parquet") 40 | 41 | 42 | async def storage_has_table(name: str, storage: PipelineStorage) -> bool: 43 | """Check if a table exists in storage.""" 44 | return await storage.has(f"{name}.parquet") 45 | -------------------------------------------------------------------------------- /graphrag/vector_stores/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A package containing vector store implementations.""" 5 | -------------------------------------------------------------------------------- /scripts/semver-check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | changes=$(git diff --name-only origin/main) 3 | has_change_doc=$(echo $changes | grep .semversioner/next-release) 4 | has_impacting_changes=$(echo $changes | grep graphrag) 5 | 6 | if [ "$has_impacting_changes" ] && [ -z "$has_change_doc" ]; then 7 | echo "Check failed. Run 'poetry run semversioner add-change' to update the next release version" 8 | exit 1 9 | fi 10 | echo "OK" 11 | -------------------------------------------------------------------------------- /scripts/spellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | npx --yes cspell -c cspell.config.yaml --no-progress lint . -------------------------------------------------------------------------------- /scripts/start-azurite.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | npx --yes azurite -L -l ./temp_azurite -d ./temp_azurite/debug.log -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | 5 | """Tests for the GraphRAG LLM module.""" 6 | 7 | # Register MOCK providers 8 | from graphrag.config.enums import ModelType 9 | from graphrag.language_model.factory import ModelFactory 10 | from tests.mock_provider import MockChatLLM, MockEmbeddingLLM 11 | 12 | ModelFactory.register_chat(ModelType.MockChat, lambda **kwargs: MockChatLLM(**kwargs)) 13 | ModelFactory.register_embedding( 14 | ModelType.MockEmbedding, lambda **kwargs: MockEmbeddingLLM(**kwargs) 15 | ) 16 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | 5 | def pytest_addoption(parser): 6 | parser.addoption( 7 | "--run_slow", action="store_true", default=False, help="run slow tests" 8 | ) 9 | -------------------------------------------------------------------------------- /tests/fixtures/azure/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_path": "./tests/fixtures/azure", 3 | "input_file_type": "text", 4 | "workflow_config": { 5 | "skip_assert": true, 6 | "azure": { 7 | "input_container": "azurefixture", 8 | "input_base_dir": "input" 9 | } 10 | }, 11 | "query_config": [], 12 | "slow": false 13 | } -------------------------------------------------------------------------------- /tests/fixtures/azure/input/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of integration testing. -------------------------------------------------------------------------------- /tests/fixtures/azure/settings.yml: -------------------------------------------------------------------------------- 1 | extract_claims: 2 | enabled: true 3 | 4 | vector_store: 5 | default_vector_store: 6 | type: "azure_ai_search" 7 | url: ${AZURE_AI_SEARCH_URL_ENDPOINT} 8 | api_key: ${AZURE_AI_SEARCH_API_KEY} 9 | container_name: "azure_ci" 10 | 11 | input: 12 | type: blob 13 | file_type: text 14 | connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING} 15 | container_name: azurefixture 16 | base_dir: input 17 | 18 | cache: 19 | type: blob 20 | connection_string: ${BLOB_STORAGE_CONNECTION_STRING} 21 | container_name: cicache 22 | base_dir: cache_azure_ai 23 | 24 | storage: 25 | type: blob 26 | connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING} 27 | container_name: azurefixture 28 | base_dir: output 29 | 30 | reporting: 31 | type: blob 32 | connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING} 33 | container_name: azurefixture 34 | base_dir: reports 35 | -------------------------------------------------------------------------------- /tests/fixtures/min-csv/input/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of integration testing.
-------------------------------------------------------------------------------- /tests/fixtures/min-csv/settings.yml: -------------------------------------------------------------------------------- 1 | models: 2 | default_chat_model: 3 | azure_auth_type: api_key 4 | type: ${GRAPHRAG_LLM_TYPE} 5 | api_key: ${GRAPHRAG_API_KEY} 6 | api_base: ${GRAPHRAG_API_BASE} 7 | api_version: ${GRAPHRAG_API_VERSION} 8 | deployment_name: ${GRAPHRAG_LLM_DEPLOYMENT_NAME} 9 | model: ${GRAPHRAG_LLM_MODEL} 10 | tokens_per_minute: ${GRAPHRAG_LLM_TPM} 11 | requests_per_minute: ${GRAPHRAG_LLM_RPM} 12 | model_supports_json: true 13 | concurrent_requests: 50 14 | async_mode: threaded 15 | default_embedding_model: 16 | azure_auth_type: api_key 17 | type: ${GRAPHRAG_EMBEDDING_TYPE} 18 | api_key: ${GRAPHRAG_API_KEY} 19 | api_base: ${GRAPHRAG_API_BASE} 20 | api_version: ${GRAPHRAG_API_VERSION} 21 | deployment_name: ${GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME} 22 | model: ${GRAPHRAG_EMBEDDING_MODEL} 23 | tokens_per_minute: null 24 | requests_per_minute: null 25 | concurrent_requests: 50 26 | async_mode: threaded 27 | 28 | vector_store: 29 | default_vector_store: 30 | type: "lancedb" 31 | db_uri: "./tests/fixtures/min-csv/lancedb" 32 | container_name: "lancedb_ci" 33 | overwrite: True 34 | 35 | input: 36 | file_type: csv 37 | 38 | snapshots: 39 | embeddings: True 40 | 41 | drift_search: 42 | n_depth: 1 43 | drift_k_followups: 3 44 | primer_folds: 3 -------------------------------------------------------------------------------- /tests/fixtures/text/input/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of integration testing.
-------------------------------------------------------------------------------- /tests/fixtures/text/settings.yml: -------------------------------------------------------------------------------- 1 | models: 2 | default_chat_model: 3 | azure_auth_type: api_key 4 | type: ${GRAPHRAG_LLM_TYPE} 5 | api_key: ${GRAPHRAG_API_KEY} 6 | api_base: ${GRAPHRAG_API_BASE} 7 | api_version: ${GRAPHRAG_API_VERSION} 8 | deployment_name: ${GRAPHRAG_LLM_DEPLOYMENT_NAME} 9 | model: ${GRAPHRAG_LLM_MODEL} 10 | tokens_per_minute: ${GRAPHRAG_LLM_TPM} 11 | requests_per_minute: ${GRAPHRAG_LLM_RPM} 12 | model_supports_json: true 13 | concurrent_requests: 50 14 | async_mode: threaded 15 | default_embedding_model: 16 | azure_auth_type: api_key 17 | type: ${GRAPHRAG_EMBEDDING_TYPE} 18 | api_key: ${GRAPHRAG_API_KEY} 19 | api_base: ${GRAPHRAG_API_BASE} 20 | api_version: ${GRAPHRAG_API_VERSION} 21 | deployment_name: ${GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME} 22 | model: ${GRAPHRAG_EMBEDDING_MODEL} 23 | tokens_per_minute: null 24 | requests_per_minute: null 25 | concurrent_requests: 50 26 | async_mode: threaded 27 | 28 | vector_store: 29 | default_vector_store: 30 | type: "azure_ai_search" 31 | url: ${AZURE_AI_SEARCH_URL_ENDPOINT} 32 | api_key: ${AZURE_AI_SEARCH_API_KEY} 33 | container_name: "simple_text_ci" 34 | 35 | extract_claims: 36 | enabled: true 37 | 38 | community_reports: 39 | prompt: "prompts/community_report.txt" 40 | max_length: 2000 41 | max_input_length: 8000 42 | 43 | snapshots: 44 | embeddings: True 45 | 46 | drift_search: 47 | n_depth: 1 48 | drift_k_followups: 3 49 | primer_folds: 3 50 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/integration/language_model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/integration/storage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/integration/vector_stores/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Integration tests for vector store implementations.""" 5 | -------------------------------------------------------------------------------- /tests/notebook/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/notebook/test_notebooks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | import subprocess 4 | from pathlib import Path 5 | 6 | import nbformat 7 | import pytest 8 | 9 | NOTEBOOKS_PATH = Path("examples_notebooks") 10 | EXCLUDED_PATH = NOTEBOOKS_PATH / "community_contrib" 11 | 12 | notebooks_list = [ 13 | notebook 14 | for notebook in NOTEBOOKS_PATH.rglob("*.ipynb") 15 | if EXCLUDED_PATH not in notebook.parents 16 | ] 17 | 18 | 19 | def _notebook_run(filepath: Path): 20 | """Execute a notebook via nbconvert and collect output. 21 | :returns execution errors 22 | """ 23 | args = [ 24 | "jupyter", 25 | "nbconvert", 26 | "--to", 27 | "notebook", 28 | "--execute", 29 | "-y", 30 | "--no-prompt", 31 | "--stdout", 32 | str(filepath.absolute().resolve()), 33 | ] 34 | notebook = subprocess.check_output(args) 35 | nb = nbformat.reads(notebook, nbformat.current_nbformat) 36 | 37 | return [ 38 | output 39 | for cell in nb.cells 40 | if "outputs" in cell 41 | for output in cell["outputs"] 42 | if output.output_type == "error" 43 | ] 44 | 45 | 46 | @pytest.mark.parametrize("notebook_path", notebooks_list) 47 | def test_notebook(notebook_path: Path): 48 | assert _notebook_run(notebook_path) == [] 49 | -------------------------------------------------------------------------------- /tests/smoke/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/config/fixtures/minimal_config/settings.yaml: -------------------------------------------------------------------------------- 1 | models: 2 | default_chat_model: 3 | api_key: ${CUSTOM_API_KEY} 4 | type: openai_chat 5 | model: gpt-4-turbo-preview 6 | default_embedding_model: 7 | api_key: ${CUSTOM_API_KEY} 8 | type: openai_embedding 9 | model: text-embedding-3-small -------------------------------------------------------------------------------- /tests/unit/config/fixtures/minimal_config_missing_env_var/settings.yaml: -------------------------------------------------------------------------------- 1 | models: 2 | default_chat_model: 3 | api_key: ${SOME_NON_EXISTENT_ENV_VAR} 4 | type: openai_chat 5 | model: gpt-4-turbo-preview 6 | default_embedding_model: 7 | api_key: ${SOME_NON_EXISTENT_ENV_VAR} 8 | type: openai_embedding 9 | model: text-embedding-3-small -------------------------------------------------------------------------------- /tests/unit/config/fixtures/timestamp_dirs/20240812-120000/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/config/fixtures/timestamp_dirs/20240812-120000/empty.txt -------------------------------------------------------------------------------- /tests/unit/config/prompt-a.txt: -------------------------------------------------------------------------------- 1 | Hello, World! A -------------------------------------------------------------------------------- /tests/unit/config/prompt-b.txt: -------------------------------------------------------------------------------- 1 | Hello, World! B -------------------------------------------------------------------------------- /tests/unit/config/prompt-c.txt: -------------------------------------------------------------------------------- 1 | Hello, World! C -------------------------------------------------------------------------------- /tests/unit/config/prompt-d.txt: -------------------------------------------------------------------------------- 1 | Hello, World! D -------------------------------------------------------------------------------- /tests/unit/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/cache/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/graph/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/graph/extractors/community_reports/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/graph/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/input/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/multiple-csvs/input1.csv: -------------------------------------------------------------------------------- 1 | title,text 2 | Hello,Hi how are you today? 3 | Goodbye,I'm outta here -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/multiple-csvs/input2.csv: -------------------------------------------------------------------------------- 1 | title,text 2 | Adios,See you later -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/multiple-csvs/input3.csv: -------------------------------------------------------------------------------- 1 | title,text 2 | Hi,I'm here -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/multiple-jsons/input1.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "title": "Hello", 3 | "text": "Hi how are you today?" 4 | }, { 5 | "title": "Goodbye", 6 | "text": "I'm outta here" 7 | }, { 8 | "title": "Adios", 9 | "text": "See you later" 10 | }] 11 | -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/multiple-jsons/input2.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Hi", 3 | "text": "I'm here" 4 | } -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/multiple-txts/input1.txt: -------------------------------------------------------------------------------- 1 | Hi how are you today? -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/multiple-txts/input2.txt: -------------------------------------------------------------------------------- 1 | I'm outta here -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/one-csv/input.csv: -------------------------------------------------------------------------------- 1 | title,text 2 | Hello,Hi how are you today? 3 | Goodbye,I'm outta here -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/one-json-multiple-objects/input.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "title": "Hello", 3 | "text": "Hi how are you today?" 
4 | }, { 5 | "title": "Goodbye", 6 | "text": "I'm outta here" 7 | }, { 8 | "title": "Adios", 9 | "text": "See you later" 10 | }] 11 | -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/one-json-one-object/input.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Hello", 3 | "text": "Hi how are you today?" 4 | } -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/one-txt/input.txt: -------------------------------------------------------------------------------- 1 | Hi how are you today? -------------------------------------------------------------------------------- /tests/unit/indexing/operations/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/operations/chunk_text/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/test_init_content.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | import re 5 | from typing import Any, cast 6 | 7 | import yaml 8 | 9 | from graphrag.config.create_graphrag_config import create_graphrag_config 10 | from graphrag.config.init_content import INIT_YAML 11 | from graphrag.config.models.graph_rag_config import GraphRagConfig 12 | 13 | 14 | def test_init_yaml(): 15 | data = yaml.load(INIT_YAML, Loader=yaml.FullLoader) 16 | config = create_graphrag_config(data) 17 | GraphRagConfig.model_validate(config, strict=True) 18 | 19 | 20 | def test_init_yaml_uncommented(): 21 | lines = INIT_YAML.splitlines() 22 | lines = [line for line in lines if "##" not in line] 23 | 24 | def uncomment_line(line: str) -> str: 25 | leading_whitespace = cast("Any", re.search(r"^(\s*)", line)).group(1) 26 | return re.sub(r"^\s*# ", leading_whitespace, line, count=1) 27 | 28 | content = "\n".join([uncomment_line(line) for line in lines]) 29 | data = yaml.load(content, Loader=yaml.FullLoader) 30 | config = create_graphrag_config(data) 31 | GraphRagConfig.model_validate(config, strict=True) 32 | -------------------------------------------------------------------------------- /tests/unit/indexing/text_splitting/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/extraction/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/extraction/strategies/graph_intelligence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/helpers/mock_llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | from pydantic import BaseModel 4 | 5 | from graphrag.language_model.manager import ModelManager 6 | from graphrag.language_model.protocol.base import ChatModel 7 | 8 | 9 | def create_mock_llm(responses: list[str | BaseModel], name: str = "mock") -> ChatModel: 10 | """Creates a mock LLM that returns the given responses.""" 11 | return ModelManager().get_or_create_chat_model( 12 | name, "mock_chat", responses=responses 13 | ) 14 | -------------------------------------------------------------------------------- /tests/unit/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/query/context_builder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/query/data/defaults/output/20240812-120000/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/defaults/output/20240812-120000/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/data/defaults/output/20240812-121000/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/defaults/output/20240812-121000/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/data/empty/something-else/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/empty/something-else/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/data/hidden/output/.another/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/hidden/output/.another/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/data/hidden/output/.hidden: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/hidden/output/.hidden -------------------------------------------------------------------------------- /tests/unit/query/data/hidden/output/20240812-120000/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/hidden/output/20240812-120000/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/data/hidden/output/20240812-121000/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/hidden/output/20240812-121000/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/data/non-numeric/output/20240812-120000/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/non-numeric/output/20240812-120000/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/data/non-numeric/output/20240812-121000/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/non-numeric/output/20240812-121000/empty.txt -------------------------------------------------------------------------------- 
/tests/unit/query/data/non-numeric/output/something-else/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/non-numeric/output/something-else/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/input/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/query/input/retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/utils/test_embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | import pytest 5 | 6 | from graphrag.config.embeddings import create_collection_name 7 | 8 | 9 | def test_create_collection_name(): 10 | collection = create_collection_name("default", "entity.title") 11 | assert collection == "default-entity-title" 12 | 13 | 14 | def test_create_collection_name_invalid_embedding_throws(): 15 | with pytest.raises(KeyError): 16 | create_collection_name("default", "invalid.name") 17 | 18 | 19 | def test_create_collection_name_invalid_embedding_does_not_throw(): 20 | collection = create_collection_name("default", "invalid.name", validate=False) 21 | assert collection == "default-invalid-name" 22 | -------------------------------------------------------------------------------- /tests/verbs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/verbs/data/communities.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/communities.parquet -------------------------------------------------------------------------------- /tests/verbs/data/community_reports.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/community_reports.parquet -------------------------------------------------------------------------------- /tests/verbs/data/covariates.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/covariates.parquet -------------------------------------------------------------------------------- /tests/verbs/data/documents.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/documents.parquet -------------------------------------------------------------------------------- /tests/verbs/data/entities.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/entities.parquet -------------------------------------------------------------------------------- /tests/verbs/data/relationships.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/relationships.parquet -------------------------------------------------------------------------------- /tests/verbs/data/text_units.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/text_units.parquet -------------------------------------------------------------------------------- /tests/verbs/data/text_units_metadata.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/text_units_metadata.parquet -------------------------------------------------------------------------------- /tests/verbs/data/text_units_metadata_included_chunk.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/text_units_metadata_included_chunk.parquet -------------------------------------------------------------------------------- /tests/verbs/test_create_communities.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
/tests/verbs/test_create_communities.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | from graphrag.config.create_graphrag_config import create_graphrag_config
5 | from graphrag.data_model.schemas import COMMUNITIES_FINAL_COLUMNS
6 | from graphrag.index.workflows.create_communities import (
7 |     run_workflow,
8 | )
9 | from graphrag.utils.storage import load_table_from_storage
10 | 
11 | from .util import (
12 |     DEFAULT_MODEL_CONFIG,
13 |     compare_outputs,
14 |     create_test_context,
15 |     load_test_table,
16 | )
17 | 
18 | 
19 | async def test_create_communities():
20 |     expected = load_test_table("communities")
21 | 
22 |     context = await create_test_context(
23 |         storage=[
24 |             "entities",
25 |             "relationships",
26 |         ],
27 |     )
28 | 
29 |     config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
30 | 
31 |     await run_workflow(
32 |         config,
33 |         context,
34 |     )
35 | 
36 |     actual = await load_table_from_storage("communities", context.storage)
37 | 
38 |     columns = list(expected.columns.values)
39 |     # don't compare period since it is created with the current date each time
40 |     columns.remove("period")
41 |     compare_outputs(
42 |         actual,
43 |         expected,
44 |         columns=columns,
45 |     )
46 | 
47 |     for column in COMMUNITIES_FINAL_COLUMNS:
48 |         assert column in actual.columns
49 | 
--------------------------------------------------------------------------------
/tests/verbs/test_create_final_text_units.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | from graphrag.config.create_graphrag_config import create_graphrag_config
5 | from graphrag.data_model.schemas import TEXT_UNITS_FINAL_COLUMNS
6 | from graphrag.index.workflows.create_final_text_units import (
7 |     run_workflow,
8 | )
9 | from graphrag.utils.storage import load_table_from_storage
10 | 
11 | from .util import (
12 |     DEFAULT_MODEL_CONFIG,
13 |     compare_outputs,
14 |     create_test_context,
15 |     load_test_table,
16 | )
17 | 
18 | 
19 | async def test_create_final_text_units():
20 |     expected = load_test_table("text_units")
21 | 
22 |     context = await create_test_context(
23 |         storage=[
24 |             "text_units",
25 |             "entities",
26 |             "relationships",
27 |             "covariates",
28 |         ],
29 |     )
30 | 
31 |     config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
32 |     config.extract_claims.enabled = True
33 | 
34 |     await run_workflow(config, context)
35 | 
36 |     actual = await load_table_from_storage("text_units", context.storage)
37 | 
38 |     for column in TEXT_UNITS_FINAL_COLUMNS:
39 |         assert column in actual.columns
40 | 
41 |     compare_outputs(actual, expected)
42 | 
--------------------------------------------------------------------------------
/tests/verbs/test_extract_graph_nlp.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | from graphrag.config.create_graphrag_config import create_graphrag_config
5 | from graphrag.index.workflows.extract_graph_nlp import (
6 |     run_workflow,
7 | )
8 | from graphrag.utils.storage import load_table_from_storage
9 | 
10 | from .util import (
11 |     DEFAULT_MODEL_CONFIG,
12 |     create_test_context,
13 | )
14 | 
15 | 
16 | async def test_extract_graph_nlp():
17 |     context = await create_test_context(
18 |         storage=["text_units"],
19 |     )
20 | 
21 |     config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
22 | 
23 |     await run_workflow(config, context)
24 | 
25 |     nodes_actual = await load_table_from_storage("entities", context.storage)
26 |     edges_actual = await load_table_from_storage("relationships", context.storage)
27 | 
28 |     # this will be the raw count of entities and edges with no pruning
29 |     # with NLP it is deterministic, so we can assert exact row counts
30 |     assert len(nodes_actual) == 1148
31 |     assert len(nodes_actual.columns) == 5
32 |     assert len(edges_actual) == 29445
33 |     assert len(edges_actual.columns) == 5
34 | 
--------------------------------------------------------------------------------
/tests/verbs/test_prune_graph.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | from graphrag.config.create_graphrag_config import create_graphrag_config
5 | from graphrag.config.models.prune_graph_config import PruneGraphConfig
6 | from graphrag.index.workflows.prune_graph import (
7 |     run_workflow,
8 | )
9 | from graphrag.utils.storage import load_table_from_storage
10 | 
11 | from .util import (
12 |     DEFAULT_MODEL_CONFIG,
13 |     create_test_context,
14 | )
15 | 
16 | 
17 | async def test_prune_graph():
18 |     context = await create_test_context(
19 |         storage=["entities", "relationships"],
20 |     )
21 | 
22 |     config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
23 |     config.prune_graph = PruneGraphConfig(
24 |         min_node_freq=4, min_node_degree=0, min_edge_weight_pct=0
25 |     )
26 | 
27 |     await run_workflow(config, context)
28 | 
29 |     nodes_actual = await load_table_from_storage("entities", context.storage)
30 | 
31 |     assert len(nodes_actual) == 21
32 | 
--------------------------------------------------------------------------------
/unified-search-app/.vsts-ci.yml:
--------------------------------------------------------------------------------
1 | name: unified-search-app
2 | pool:
3 |   vmImage: ubuntu-latest
4 | 
5 | trigger:
6 |   batch: true
7 |   branches:
8 |     include:
9 |       - main
10 |   paths:
11 |     include:
12 |       - unified-search-app
13 | 
14 | 
15 | stages:
16 |   - stage: Build_deploy
17 |     dependsOn: []
18 |     jobs:
19 |       - job: build
20 |         displayName: Build and deploy
21 |         pool:
22 |           vmImage: ubuntu-latest
23 |         steps:
24 |           - task: UsePythonVersion@0
25 |             inputs:
26 |               versionSpec: "3.11"
27 |             displayName: "Use Python 3.11"
28 | 
29 |           - task: Docker@2
30 |             inputs:
31 |               containerRegistry: '$(containerRegistry)'
32 |               repository: 'main'
33 |               command: 'buildAndPush'
34 |               Dockerfile: 'unified-search-app/Dockerfile'
35 |               tags: 'latest'
36 |           - task: AzureAppServiceManage@0
37 |             inputs:
38 |               azureSubscription: '$(subscription)'
39 |               Action: 'Restart Azure App Service'
40 |               WebAppName: '$(webApp)'
--------------------------------------------------------------------------------
/unified-search-app/Dockerfile:
--------------------------------------------------------------------------------
1 | 
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Dockerfile
4 | # https://eng.ms/docs/more/containers-secure-supply-chain/approved-images
5 | FROM mcr.microsoft.com/oryx/python:3.11
6 | 
7 | RUN curl -fsSL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor -o /usr/share/keyrings/microsoft-prod.gpg
8 | RUN apt-get update -y
9 | 
10 | # Install dependencies
11 | WORKDIR ./
12 | COPY . .
13 | RUN curl -sSL https://install.python-poetry.org | python -
14 | ENV PATH="${PATH}:/root/.local/bin"
15 | RUN poetry config virtualenvs.in-project true
16 | RUN poetry install --no-root
17 | 
18 | # Run application
19 | EXPOSE 8501
20 | ENTRYPOINT ["poetry","run","streamlit", "run", "./app/home_page.py"]
--------------------------------------------------------------------------------
/unified-search-app/app/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """App module."""
5 | 
--------------------------------------------------------------------------------
/unified-search-app/app/data_config.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """Data config module."""
5 | 
6 | # This file is used to store configurations for the graph-indexed data and the LLM/embeddings models used in the app.
7 | 
8 | # name of the table in the graph-indexed data where the communities are stored
9 | communities_table = "output/communities"
10 | 
11 | # name of the table in the graph-indexed data where the community reports are stored
12 | community_report_table = "output/community_reports"
13 | 
14 | # name of the table in the graph-indexed data where the entity embeddings are stored
15 | entity_table = "output/entities"
16 | 
17 | # name of the table in the graph-indexed data where the entity relationships are stored
18 | relationship_table = "output/relationships"
19 | 
20 | # name of the table in the graph-indexed data where the entity covariates are stored
21 | covariate_table = "output/covariates"
22 | 
23 | # name of the table in the graph-indexed data where the text units are stored
24 | text_unit_table = "output/text_units"
25 | 
26 | # default number of suggested questions to generate for the user, used in all search types
27 | # this should be adjusted based on the token limits of the LLM model being used
28 | # The following setting is for gpt-4-1106-preview (i.e. gpt-4-turbo)
29 | # For gpt-4 (token-limit = 8k), a smaller value could be used:
30 | default_suggested_questions = 5
31 | 
32 | # default timeout for streamlit cache
33 | default_ttl = 60 * 60 * 24 * 7
--------------------------------------------------------------------------------
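data_config.py only declares where the graph-indexed tables live; the app's knowledge_loader package does the actual reading (including from blob storage). As a rough local-files sketch of how those table names could be resolved into dataframes (load_graph_tables and the .parquet suffix handling are assumptions for illustration, not the app's loader):

import pandas as pd

import data_config


def load_graph_tables(data_root: str) -> dict[str, pd.DataFrame]:
    """Read the graph-indexed parquet tables referenced in data_config from a local folder."""
    tables = {
        "communities": data_config.communities_table,
        "community_reports": data_config.community_report_table,
        "entities": data_config.entity_table,
        "relationships": data_config.relationship_table,
        "covariates": data_config.covariate_table,
        "text_units": data_config.text_unit_table,
    }
    return {
        name: pd.read_parquet(f"{data_root}/{path}.parquet")
        for name, path in tables.items()
    }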
/unified-search-app/app/knowledge_loader/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """Knowledge loader module."""
5 | 
--------------------------------------------------------------------------------
/unified-search-app/app/knowledge_loader/data_sources/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """Data sources module."""
5 | 
--------------------------------------------------------------------------------
/unified-search-app/app/knowledge_loader/data_sources/default.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """Data sources default module."""
5 | 
6 | import os
7 | 
8 | container_name = "data"
9 | blob_container_name = os.getenv("BLOB_CONTAINER_NAME", container_name)
10 | blob_account_name = os.getenv("BLOB_ACCOUNT_NAME")
11 | 
12 | local_data_root = os.getenv("DATA_ROOT")
13 | 
14 | LISTING_FILE = "listing.json"
15 | 
16 | if local_data_root is None and blob_account_name is None:
17 |     error_message = (
18 |         "Either DATA_ROOT or BLOB_ACCOUNT_NAME environment variable must be set."
19 |     )
20 |     raise ValueError(error_message)
--------------------------------------------------------------------------------
/unified-search-app/app/rag/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """Rag module."""
5 | 
--------------------------------------------------------------------------------
/unified-search-app/app/rag/typing.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """Typing module."""
5 | 
6 | from dataclasses import dataclass
7 | from enum import Enum
8 | 
9 | import pandas as pd
10 | 
11 | 
12 | class SearchType(Enum):
13 |     """SearchType class definition."""
14 | 
15 |     Basic = "basic"
16 |     Local = "local"
17 |     Global = "global"
18 |     Drift = "drift"
19 | 
20 | 
21 | @dataclass
22 | class SearchResult:
23 |     """SearchResult class definition."""
24 | 
25 |     # create a dataclass to store the search result of each algorithm
26 |     search_type: SearchType
27 |     response: str
28 |     context: dict[str, pd.DataFrame]
--------------------------------------------------------------------------------
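SearchResult is the common envelope each search algorithm hands back to the UI: which engine produced the answer, the response text, and the context tables that grounded it. A toy construction (the response string and context table are made-up sample values):

import pandas as pd

from rag.typing import SearchResult, SearchType

result = SearchResult(
    search_type=SearchType.Local,
    response="Operation Dulce is the fictional mission described in the sample dataset.",
    context={"entities": pd.DataFrame({"title": ["Operation Dulce"], "degree": [12]})},
)
print(result.search_type.value)  # "local"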
18 | """ 19 | 20 | def __init__(self, key: str, default: Any | None): 21 | """Init method definition.""" 22 | self._key = key 23 | val = st.query_params[key].lower() if key in st.query_params else default 24 | if val == "true": 25 | val = True 26 | elif val == "false": 27 | val = False 28 | if key not in st.session_state: 29 | st.session_state[key] = val 30 | 31 | @property 32 | def key(self) -> str: 33 | """Key property definition.""" 34 | return self._key 35 | 36 | @property 37 | def value(self) -> Any: 38 | """Value property definition.""" 39 | return st.session_state[self._key] 40 | 41 | @value.setter 42 | def value(self, value: Any) -> None: 43 | """Value setter definition.""" 44 | st.session_state[self._key] = value 45 | st.query_params[self._key] = f"{value}".lower() 46 | -------------------------------------------------------------------------------- /unified-search-app/app/ui/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """App UI module.""" 5 | -------------------------------------------------------------------------------- /unified-search-app/app/ui/questions_list.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Question list module.""" 5 | 6 | import streamlit as st 7 | from state.session_variables import SessionVariables 8 | 9 | 10 | def create_questions_list_ui(sv: SessionVariables): 11 | """Return question list UI component.""" 12 | selection = st.dataframe( 13 | sv.generated_questions.value, 14 | use_container_width=True, 15 | hide_index=True, 16 | selection_mode="single-row", 17 | column_config={"value": "question"}, 18 | on_select="rerun", 19 | ) 20 | rows = selection.selection.rows 21 | if len(rows) > 0: 22 | question_index = selection.selection.rows[0] 23 | sv.selected_question.value = sv.generated_questions.value[question_index] 24 | -------------------------------------------------------------------------------- /unified-search-app/app/ui/report_list.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
/unified-search-app/app/ui/report_list.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """Report list module."""
5 | 
6 | import streamlit as st
7 | from state.session_variables import SessionVariables
8 | 
9 | 
10 | def create_report_list_ui(sv: SessionVariables):
11 |     """Return report list UI component."""
12 |     selection = st.dataframe(
13 |         sv.community_reports.value,
14 |         height=1000,
15 |         hide_index=True,
16 |         column_order=["id", "title"],
17 |         selection_mode="single-row",
18 |         on_select="rerun",
19 |     )
20 |     rows = selection.selection.rows
21 |     if len(rows) > 0:
22 |         report_index = selection.selection.rows[0]
23 |         sv.selected_report.value = sv.community_reports.value.iloc[report_index]
24 |     else:
25 |         sv.selected_report.value = None
--------------------------------------------------------------------------------
/unified-search-app/images/image-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/unified-search-app/images/image-1.png
--------------------------------------------------------------------------------
/unified-search-app/images/image-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/unified-search-app/images/image-2.png
--------------------------------------------------------------------------------
/unified-search-app/images/image-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/unified-search-app/images/image-3.png
--------------------------------------------------------------------------------
/unified-search-app/images/image-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/unified-search-app/images/image-4.png
--------------------------------------------------------------------------------
/unified-search-app/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "unified-copilot"
3 | version = "1.0.0"
4 | description = ""
5 | authors = ["GraphRAG team"]
6 | readme = "README.md"
7 | package-mode = false
8 | 
9 | [tool.poetry.dependencies]
10 | python = ">=3.10,<3.12"
11 | streamlit = "1.43.0"
12 | azure-search-documents = "^11.4.0"
13 | azure-storage-blob = "^12.20.0"
14 | azure-identity = "^1.16.0"
15 | graphrag = "2.0.0"
16 | altair = "^5.3.0"
17 | streamlit-agraph = "^0.0.45"
18 | st-tabs = "^0.1.1"
19 | spacy = ">=3.8.4,<4.0.0"
20 | 
21 | [tool.poetry.group.dev.dependencies]
22 | poethepoet = "^0.26.1"
23 | ipykernel = "^6.29.4"
24 | pyright = "^1.1.349"
25 | ruff = "^0.4.7"
26 | 
27 | [build-system]
28 | requires = ["poetry-core"]
29 | build-backend = "poetry.core.masonry.api"
30 | 
31 | [tool.poe.tasks]
32 | start = "streamlit run app/home_page.py"
33 | start_prod = "streamlit run app/home_page.py --server.port=8501 --server.address=0.0.0.0"
34 | 
35 | [tool.pyright]
36 | include = ["app"]
37 | exclude = ["**/node_modules", "**/__pycache__"]
38 | 
--------------------------------------------------------------------------------