├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── feature_request.yml │ └── general_issue.yml ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── gh-pages.yml │ ├── issues-autoresolve.yml │ ├── python-ci.yml │ ├── python-integration-tests.yml │ ├── python-notebook-tests.yml │ ├── python-publish.yml │ ├── python-smoke-tests.yml │ ├── semver.yml │ └── spellcheck.yml ├── .gitignore ├── .semversioner ├── 0.1.0.json ├── 0.2.0.json ├── 0.2.1.json ├── 0.2.2.json ├── 0.3.0.json ├── 0.3.1.json ├── 0.3.2.json ├── 0.3.3.json ├── 0.3.4.json ├── 0.3.5.json ├── 0.3.6.json ├── 0.4.0.json ├── 0.4.1.json ├── 0.5.0.json ├── 0.9.0.json ├── 1.0.0.json ├── 1.0.1.json ├── 1.1.0.json ├── 1.1.1.json ├── 1.1.2.json ├── 1.2.0.json ├── 2.0.0.json ├── 2.1.0.json ├── 2.2.0.json ├── 2.2.1.json ├── 2.3.0.json └── next-release │ └── patch-20250530204951787463.json ├── .vscode ├── extensions.json ├── launch.json └── settings.json ├── .vsts-ci.yml ├── CHANGELOG.md ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DEVELOPING.md ├── LICENSE ├── RAI_TRANSPARENCY.md ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── breaking-changes.md ├── cspell.config.yaml ├── dictionary.txt ├── docs ├── blog_posts.md ├── cli.md ├── config │ ├── env_vars.md │ ├── init.md │ ├── models.md │ ├── overview.md │ └── yaml.md ├── data │ └── operation_dulce │ │ ├── ABOUT.md │ │ ├── Operation Dulce v2 1 1.md │ │ └── dataset.zip ├── developing.md ├── examples_notebooks │ ├── api_overview.ipynb │ ├── drift_search.ipynb │ ├── global_search.ipynb │ ├── global_search_with_dynamic_community_selection.ipynb │ ├── index_migration_to_v1.ipynb │ ├── index_migration_to_v2.ipynb │ ├── inputs │ │ └── operation dulce │ │ │ ├── ABOUT.md │ │ │ ├── Operation Dulce v2 1 1.md │ │ │ ├── communities.parquet │ │ │ ├── community_reports.parquet │ │ │ ├── covariates.parquet │ │ │ ├── documents.parquet │ │ │ ├── entities.parquet │ │ │ ├── lancedb │ │ │ ├── default-community-full_content.lance │ │ │ │ ├── _transactions │ │ │ │ │ ├── 0-2fed1d8b-daac-41b0-a93a-e115cda75be3.txn │ │ │ │ │ ├── 1-61dbb7c2-aec3-4796-b223-941fc7cc93cc.txn │ │ │ │ │ ├── 2-60012692-a153-48f9-8f4e-c479b44cbf3f.txn │ │ │ │ │ └── 3-0d2dc9a1-094f-4220-83c7-6ad6f26fac2b.txn │ │ │ │ ├── _versions │ │ │ │ │ ├── 1.manifest │ │ │ │ │ ├── 2.manifest │ │ │ │ │ ├── 3.manifest │ │ │ │ │ └── 4.manifest │ │ │ │ └── data │ │ │ │ │ ├── 1e7b2d94-ed06-4aa0-b22e-86a71d416bc6.lance │ │ │ │ │ └── 1ed9f301-ce30-46a8-8c0b-9c2a60a3cf43.lance │ │ │ ├── default-entity-description.lance │ │ │ │ ├── _transactions │ │ │ │ │ ├── 0-92c031e5-7558-451e-9d0f-f5514db9616d.txn │ │ │ │ │ ├── 1-7b3cb8d8-3512-4584-a003-91838fed8911.txn │ │ │ │ │ ├── 2-7de627d2-4c57-49e9-bf73-c17a9582ead4.txn │ │ │ │ │ └── 3-9ad29d69-9a69-43a8-8b26-252ea267958d.txn │ │ │ │ ├── _versions │ │ │ │ │ ├── 1.manifest │ │ │ │ │ ├── 2.manifest │ │ │ │ │ ├── 3.manifest │ │ │ │ │ └── 4.manifest │ │ │ │ └── data │ │ │ │ │ ├── a34575c4-5260-457f-bebe-3f40bc0e2ee3.lance │ │ │ │ │ └── eabd7580-86f5-4022-8aa7-fe0aff816d98.lance │ │ │ └── default-text_unit-text.lance │ │ │ │ ├── _transactions │ │ │ │ ├── 0-fd0434ac-e5cd-4ddd-9dd5-e5048d4edb59.txn │ │ │ │ ├── 1-14bb4b1d-cc00-420b-9b14-3626f0bd8c0b.txn │ │ │ │ ├── 2-8e74264c-f72d-44f5-a6f4-b3b61ae6a43b.txn │ │ │ │ └── 3-7516fb71-9db3-4666-bdef-ea04c1eb9697.txn │ │ │ │ ├── _versions │ │ │ │ ├── 1.manifest │ │ │ │ ├── 2.manifest │ │ │ │ ├── 3.manifest │ │ │ │ └── 4.manifest │ │ │ │ └── data │ │ │ │ ├── 2794bf5b-de3d-4202-ab16-e76bc27c8e6a.lance │ │ │ │ └── 
2f74c8e8-3f35-4209-889c-a13cf0780eb3.lance │ │ │ ├── relationships.parquet │ │ │ └── text_units.parquet │ ├── local_search.ipynb │ └── multi_index_search.ipynb ├── get_started.md ├── img │ ├── GraphRag-Figure1.jpg │ ├── auto-tune-diagram.png │ ├── drift-search-diagram.png │ ├── pipeline-running.png │ └── viz_guide │ │ ├── gephi-appearance-pane.png │ │ ├── gephi-initial-graph-example.png │ │ ├── gephi-layout-forceatlas2-pane.png │ │ ├── gephi-layout-pane.png │ │ └── gephi-network-overview-settings.png ├── index.md ├── index │ ├── architecture.md │ ├── byog.md │ ├── default_dataflow.md │ ├── inputs.md │ ├── methods.md │ ├── outputs.md │ └── overview.md ├── prompt_tuning │ ├── auto_prompt_tuning.md │ ├── manual_prompt_tuning.md │ └── overview.md ├── query │ ├── drift_search.md │ ├── global_search.md │ ├── local_search.md │ ├── multi_index_search.md │ ├── notebooks │ │ └── overview.md │ ├── overview.md │ └── question_generation.md ├── scripts │ └── create_cookie_banner.js ├── stylesheets │ └── extra.css └── visualization_guide.md ├── examples_notebooks ├── community_contrib │ ├── README.md │ ├── neo4j │ │ └── graphrag_import_neo4j_cypher.ipynb │ └── yfiles-jupyter-graphs │ │ └── graph-visualization.ipynb └── inputs │ └── operation dulce │ └── lancedb │ └── entity_description_embeddings.lance │ ├── _latest.manifest │ ├── _transactions │ ├── 0-498c6e24-dd0a-42b9-8f7e-5e3d2ab258b0.txn │ └── 1-bf5aa024-a229-461f-8d78-699841a302fe.txn │ ├── _versions │ ├── 1.manifest │ └── 2.manifest │ └── data │ └── fe64774f-5412-4c9c-8dea-f6ed55c81119.lance ├── graphrag ├── __init__.py ├── __main__.py ├── api │ ├── __init__.py │ ├── index.py │ ├── prompt_tune.py │ └── query.py ├── cache │ ├── __init__.py │ ├── factory.py │ ├── json_pipeline_cache.py │ ├── memory_pipeline_cache.py │ ├── noop_pipeline_cache.py │ └── pipeline_cache.py ├── callbacks │ ├── __init__.py │ ├── blob_workflow_callbacks.py │ ├── console_workflow_callbacks.py │ ├── file_workflow_callbacks.py │ ├── llm_callbacks.py │ ├── noop_query_callbacks.py │ ├── noop_workflow_callbacks.py │ ├── progress_workflow_callbacks.py │ ├── query_callbacks.py │ ├── reporting.py │ ├── workflow_callbacks.py │ └── workflow_callbacks_manager.py ├── cli │ ├── __init__.py │ ├── index.py │ ├── initialize.py │ ├── main.py │ ├── prompt_tune.py │ └── query.py ├── config │ ├── __init__.py │ ├── create_graphrag_config.py │ ├── defaults.py │ ├── embeddings.py │ ├── enums.py │ ├── environment_reader.py │ ├── errors.py │ ├── get_embedding_settings.py │ ├── init_content.py │ ├── load_config.py │ ├── logging.py │ ├── models │ │ ├── __init__.py │ │ ├── basic_search_config.py │ │ ├── cache_config.py │ │ ├── chunking_config.py │ │ ├── cluster_graph_config.py │ │ ├── community_reports_config.py │ │ ├── drift_search_config.py │ │ ├── embed_graph_config.py │ │ ├── extract_claims_config.py │ │ ├── extract_graph_config.py │ │ ├── extract_graph_nlp_config.py │ │ ├── global_search_config.py │ │ ├── graph_rag_config.py │ │ ├── input_config.py │ │ ├── language_model_config.py │ │ ├── local_search_config.py │ │ ├── output_config.py │ │ ├── prune_graph_config.py │ │ ├── reporting_config.py │ │ ├── snapshots_config.py │ │ ├── summarize_descriptions_config.py │ │ ├── text_embedding_config.py │ │ ├── umap_config.py │ │ └── vector_store_config.py │ └── read_dotenv.py ├── data_model │ ├── __init__.py │ ├── community.py │ ├── community_report.py │ ├── covariate.py │ ├── document.py │ ├── entity.py │ ├── identified.py │ ├── named.py │ ├── relationship.py │ ├── schemas.py │ ├── text_unit.py │ └── 
types.py ├── index │ ├── __init__.py │ ├── input │ │ ├── __init__.py │ │ ├── csv.py │ │ ├── factory.py │ │ ├── json.py │ │ ├── text.py │ │ └── util.py │ ├── operations │ │ ├── __init__.py │ │ ├── build_noun_graph │ │ │ ├── __init__.py │ │ │ ├── build_noun_graph.py │ │ │ └── np_extractors │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── cfg_extractor.py │ │ │ │ ├── factory.py │ │ │ │ ├── np_validator.py │ │ │ │ ├── regex_extractor.py │ │ │ │ ├── resource_loader.py │ │ │ │ ├── stop_words.py │ │ │ │ └── syntactic_parsing_extractor.py │ │ ├── chunk_text │ │ │ ├── __init__.py │ │ │ ├── bootstrap.py │ │ │ ├── chunk_text.py │ │ │ ├── strategies.py │ │ │ └── typing.py │ │ ├── cluster_graph.py │ │ ├── compute_degree.py │ │ ├── compute_edge_combined_degree.py │ │ ├── create_graph.py │ │ ├── embed_graph │ │ │ ├── __init__.py │ │ │ ├── embed_graph.py │ │ │ ├── embed_node2vec.py │ │ │ └── typing.py │ │ ├── embed_text │ │ │ ├── __init__.py │ │ │ ├── embed_text.py │ │ │ └── strategies │ │ │ │ ├── __init__.py │ │ │ │ ├── mock.py │ │ │ │ ├── openai.py │ │ │ │ └── typing.py │ │ ├── extract_covariates │ │ │ ├── __init__.py │ │ │ ├── claim_extractor.py │ │ │ ├── extract_covariates.py │ │ │ └── typing.py │ │ ├── extract_graph │ │ │ ├── __init__.py │ │ │ ├── extract_graph.py │ │ │ ├── graph_extractor.py │ │ │ ├── graph_intelligence_strategy.py │ │ │ └── typing.py │ │ ├── finalize_community_reports.py │ │ ├── finalize_entities.py │ │ ├── finalize_relationships.py │ │ ├── graph_to_dataframes.py │ │ ├── layout_graph │ │ │ ├── __init__.py │ │ │ ├── layout_graph.py │ │ │ ├── typing.py │ │ │ ├── umap.py │ │ │ └── zero.py │ │ ├── prune_graph.py │ │ ├── snapshot_graphml.py │ │ ├── summarize_communities │ │ │ ├── __init__.py │ │ │ ├── build_mixed_context.py │ │ │ ├── community_reports_extractor.py │ │ │ ├── explode_communities.py │ │ │ ├── graph_context │ │ │ │ ├── __init__.py │ │ │ │ ├── context_builder.py │ │ │ │ └── sort_context.py │ │ │ ├── strategies.py │ │ │ ├── summarize_communities.py │ │ │ ├── text_unit_context │ │ │ │ ├── __init__.py │ │ │ │ ├── context_builder.py │ │ │ │ ├── prep_text_units.py │ │ │ │ └── sort_context.py │ │ │ ├── typing.py │ │ │ └── utils.py │ │ └── summarize_descriptions │ │ │ ├── __init__.py │ │ │ ├── description_summary_extractor.py │ │ │ ├── graph_intelligence_strategy.py │ │ │ ├── summarize_descriptions.py │ │ │ └── typing.py │ ├── run │ │ ├── __init__.py │ │ ├── run_pipeline.py │ │ └── utils.py │ ├── text_splitting │ │ ├── __init__.py │ │ ├── check_token_limit.py │ │ └── text_splitting.py │ ├── typing │ │ ├── __init__.py │ │ ├── context.py │ │ ├── error_handler.py │ │ ├── pipeline.py │ │ ├── pipeline_run_result.py │ │ ├── state.py │ │ ├── stats.py │ │ └── workflow.py │ ├── update │ │ ├── __init__.py │ │ ├── communities.py │ │ ├── entities.py │ │ ├── incremental_index.py │ │ └── relationships.py │ ├── utils │ │ ├── __init__.py │ │ ├── dataframes.py │ │ ├── derive_from_rows.py │ │ ├── dicts.py │ │ ├── graphs.py │ │ ├── hashing.py │ │ ├── is_null.py │ │ ├── rate_limiter.py │ │ ├── stable_lcc.py │ │ ├── string.py │ │ ├── tokens.py │ │ └── uuid.py │ ├── validate_config.py │ └── workflows │ │ ├── __init__.py │ │ ├── create_base_text_units.py │ │ ├── create_communities.py │ │ ├── create_community_reports.py │ │ ├── create_community_reports_text.py │ │ ├── create_final_documents.py │ │ ├── create_final_text_units.py │ │ ├── extract_covariates.py │ │ ├── extract_graph.py │ │ ├── extract_graph_nlp.py │ │ ├── factory.py │ │ ├── finalize_graph.py │ │ ├── generate_text_embeddings.py │ │ ├── 
prune_graph.py │ │ ├── update_clean_state.py │ │ ├── update_communities.py │ │ ├── update_community_reports.py │ │ ├── update_covariates.py │ │ ├── update_entities_relationships.py │ │ ├── update_final_documents.py │ │ ├── update_text_embeddings.py │ │ └── update_text_units.py ├── language_model │ ├── __init__.py │ ├── cache │ │ ├── __init__.py │ │ └── base.py │ ├── events │ │ ├── __init__.py │ │ └── base.py │ ├── factory.py │ ├── manager.py │ ├── protocol │ │ ├── __init__.py │ │ └── base.py │ ├── providers │ │ ├── __init__.py │ │ └── fnllm │ │ │ ├── __init__.py │ │ │ ├── cache.py │ │ │ ├── events.py │ │ │ ├── models.py │ │ │ └── utils.py │ └── response │ │ ├── __init__.py │ │ ├── base.py │ │ └── base.pyi ├── logger │ ├── __init__.py │ ├── base.py │ ├── console.py │ ├── factory.py │ ├── null_progress.py │ ├── print_progress.py │ ├── progress.py │ ├── rich_progress.py │ └── types.py ├── prompt_tune │ ├── __init__.py │ ├── defaults.py │ ├── generator │ │ ├── __init__.py │ │ ├── community_report_rating.py │ │ ├── community_report_summarization.py │ │ ├── community_reporter_role.py │ │ ├── domain.py │ │ ├── entity_relationship.py │ │ ├── entity_summarization_prompt.py │ │ ├── entity_types.py │ │ ├── extract_graph_prompt.py │ │ ├── language.py │ │ └── persona.py │ ├── loader │ │ ├── __init__.py │ │ └── input.py │ ├── prompt │ │ ├── __init__.py │ │ ├── community_report_rating.py │ │ ├── community_reporter_role.py │ │ ├── domain.py │ │ ├── entity_relationship.py │ │ ├── entity_types.py │ │ ├── language.py │ │ └── persona.py │ ├── template │ │ ├── __init__.py │ │ ├── community_report_summarization.py │ │ ├── entity_summarization.py │ │ └── extract_graph.py │ └── types.py ├── prompts │ ├── __init__.py │ ├── index │ │ ├── __init__.py │ │ ├── community_report.py │ │ ├── community_report_text_units.py │ │ ├── extract_claims.py │ │ ├── extract_graph.py │ │ └── summarize_descriptions.py │ └── query │ │ ├── __init__.py │ │ ├── basic_search_system_prompt.py │ │ ├── drift_search_system_prompt.py │ │ ├── global_search_knowledge_system_prompt.py │ │ ├── global_search_map_system_prompt.py │ │ ├── global_search_reduce_system_prompt.py │ │ ├── local_search_system_prompt.py │ │ └── question_gen_system_prompt.py ├── py.typed ├── query │ ├── __init__.py │ ├── context_builder │ │ ├── __init__.py │ │ ├── builders.py │ │ ├── community_context.py │ │ ├── conversation_history.py │ │ ├── dynamic_community_selection.py │ │ ├── entity_extraction.py │ │ ├── local_context.py │ │ ├── rate_prompt.py │ │ ├── rate_relevancy.py │ │ └── source_context.py │ ├── factory.py │ ├── indexer_adapters.py │ ├── input │ │ ├── __init__.py │ │ ├── loaders │ │ │ ├── __init__.py │ │ │ ├── dfs.py │ │ │ └── utils.py │ │ └── retrieval │ │ │ ├── __init__.py │ │ │ ├── community_reports.py │ │ │ ├── covariates.py │ │ │ ├── entities.py │ │ │ ├── relationships.py │ │ │ └── text_units.py │ ├── llm │ │ ├── __init__.py │ │ └── text_utils.py │ ├── question_gen │ │ ├── __init__.py │ │ ├── base.py │ │ └── local_gen.py │ └── structured_search │ │ ├── __init__.py │ │ ├── base.py │ │ ├── basic_search │ │ ├── __init__.py │ │ ├── basic_context.py │ │ └── search.py │ │ ├── drift_search │ │ ├── __init__.py │ │ ├── action.py │ │ ├── drift_context.py │ │ ├── primer.py │ │ ├── search.py │ │ └── state.py │ │ ├── global_search │ │ ├── __init__.py │ │ ├── community_context.py │ │ └── search.py │ │ └── local_search │ │ ├── __init__.py │ │ ├── mixed_context.py │ │ └── search.py ├── storage │ ├── __init__.py │ ├── blob_pipeline_storage.py │ ├── cosmosdb_pipeline_storage.py 
│ ├── factory.py │ ├── file_pipeline_storage.py │ ├── memory_pipeline_storage.py │ └── pipeline_storage.py ├── utils │ ├── __init__.py │ ├── api.py │ ├── cli.py │ └── storage.py └── vector_stores │ ├── __init__.py │ ├── azure_ai_search.py │ ├── base.py │ ├── cosmosdb.py │ ├── factory.py │ └── lancedb.py ├── mkdocs.yaml ├── poetry.lock ├── pyproject.toml ├── scripts ├── semver-check.sh ├── spellcheck.sh └── start-azurite.sh ├── tests ├── __init__.py ├── conftest.py ├── fixtures │ ├── azure │ │ ├── config.json │ │ ├── input │ │ │ ├── ABOUT.md │ │ │ └── dulce.txt │ │ └── settings.yml │ ├── min-csv │ │ ├── config.json │ │ ├── input │ │ │ ├── ABOUT.md │ │ │ ├── dulce.csv │ │ │ └── dulce.txt │ │ └── settings.yml │ └── text │ │ ├── config.json │ │ ├── input │ │ ├── ABOUT.md │ │ └── dulce.txt │ │ ├── prompts │ │ └── community_report.txt │ │ └── settings.yml ├── integration │ ├── __init__.py │ ├── language_model │ │ ├── __init__.py │ │ └── test_factory.py │ ├── storage │ │ ├── __init__.py │ │ ├── test_blob_pipeline_storage.py │ │ ├── test_cosmosdb_storage.py │ │ ├── test_factory.py │ │ └── test_file_pipeline_storage.py │ └── vector_stores │ │ ├── __init__.py │ │ ├── test_azure_ai_search.py │ │ ├── test_cosmosdb.py │ │ └── test_lancedb.py ├── mock_provider.py ├── notebook │ ├── __init__.py │ └── test_notebooks.py ├── smoke │ ├── __init__.py │ └── test_fixtures.py ├── unit │ ├── __init__.py │ ├── config │ │ ├── __init__.py │ │ ├── fixtures │ │ │ ├── minimal_config │ │ │ │ └── settings.yaml │ │ │ ├── minimal_config_missing_env_var │ │ │ │ └── settings.yaml │ │ │ └── timestamp_dirs │ │ │ │ └── 20240812-120000 │ │ │ │ └── empty.txt │ │ ├── prompt-a.txt │ │ ├── prompt-b.txt │ │ ├── prompt-c.txt │ │ ├── prompt-d.txt │ │ ├── test_config.py │ │ └── utils.py │ ├── indexing │ │ ├── __init__.py │ │ ├── cache │ │ │ ├── __init__.py │ │ │ └── test_file_pipeline_cache.py │ │ ├── graph │ │ │ ├── __init__.py │ │ │ ├── extractors │ │ │ │ ├── __init__.py │ │ │ │ └── community_reports │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_sort_context.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ └── test_stable_lcc.py │ │ ├── input │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ ├── multiple-csvs │ │ │ │ │ ├── input1.csv │ │ │ │ │ ├── input2.csv │ │ │ │ │ └── input3.csv │ │ │ │ ├── multiple-jsons │ │ │ │ │ ├── input1.json │ │ │ │ │ └── input2.json │ │ │ │ ├── multiple-txts │ │ │ │ │ ├── input1.txt │ │ │ │ │ └── input2.txt │ │ │ │ ├── one-csv │ │ │ │ │ └── input.csv │ │ │ │ ├── one-json-multiple-objects │ │ │ │ │ └── input.json │ │ │ │ ├── one-json-one-object │ │ │ │ │ └── input.json │ │ │ │ └── one-txt │ │ │ │ │ └── input.txt │ │ │ ├── test_csv_loader.py │ │ │ ├── test_json_loader.py │ │ │ └── test_txt_loader.py │ │ ├── operations │ │ │ ├── __init__.py │ │ │ └── chunk_text │ │ │ │ ├── __init__.py │ │ │ │ ├── test_chunk_text.py │ │ │ │ └── test_strategies.py │ │ ├── test_init_content.py │ │ ├── text_splitting │ │ │ ├── __init__.py │ │ │ └── test_text_splitting.py │ │ └── verbs │ │ │ ├── __init__.py │ │ │ ├── entities │ │ │ ├── __init__.py │ │ │ └── extraction │ │ │ │ ├── __init__.py │ │ │ │ └── strategies │ │ │ │ ├── __init__.py │ │ │ │ └── graph_intelligence │ │ │ │ ├── __init__.py │ │ │ │ └── test_gi_entity_extraction.py │ │ │ └── helpers │ │ │ ├── __init__.py │ │ │ └── mock_llm.py │ ├── query │ │ ├── __init__.py │ │ ├── context_builder │ │ │ ├── __init__.py │ │ │ └── test_entity_extraction.py │ │ ├── data │ │ │ ├── defaults │ │ │ │ └── output │ │ │ │ │ ├── 20240812-120000 │ │ │ │ │ └── empty.txt │ │ │ │ │ └── 
20240812-121000 │ │ │ │ │ └── empty.txt │ │ │ ├── empty │ │ │ │ └── something-else │ │ │ │ │ └── empty.txt │ │ │ ├── hidden │ │ │ │ └── output │ │ │ │ │ ├── .another │ │ │ │ │ └── empty.txt │ │ │ │ │ ├── .hidden │ │ │ │ │ ├── 20240812-120000 │ │ │ │ │ └── empty.txt │ │ │ │ │ └── 20240812-121000 │ │ │ │ │ └── empty.txt │ │ │ └── non-numeric │ │ │ │ └── output │ │ │ │ ├── 20240812-120000 │ │ │ │ └── empty.txt │ │ │ │ ├── 20240812-121000 │ │ │ │ └── empty.txt │ │ │ │ └── something-else │ │ │ │ └── empty.txt │ │ └── input │ │ │ ├── __init__.py │ │ │ └── retrieval │ │ │ ├── __init__.py │ │ │ └── test_entities.py │ └── utils │ │ ├── __init__.py │ │ └── test_embeddings.py └── verbs │ ├── __init__.py │ ├── data │ ├── communities.parquet │ ├── community_reports.parquet │ ├── covariates.parquet │ ├── documents.parquet │ ├── entities.parquet │ ├── relationships.parquet │ ├── text_units.parquet │ ├── text_units_metadata.parquet │ └── text_units_metadata_included_chunk.parquet │ ├── test_create_base_text_units.py │ ├── test_create_communities.py │ ├── test_create_community_reports.py │ ├── test_create_final_documents.py │ ├── test_create_final_text_units.py │ ├── test_extract_covariates.py │ ├── test_extract_graph.py │ ├── test_extract_graph_nlp.py │ ├── test_finalize_graph.py │ ├── test_generate_text_embeddings.py │ ├── test_pipeline_state.py │ ├── test_prune_graph.py │ └── util.py └── unified-search-app ├── .vsts-ci.yml ├── Dockerfile ├── README.md ├── app ├── __init__.py ├── app_logic.py ├── data_config.py ├── home_page.py ├── knowledge_loader │ ├── __init__.py │ ├── data_prep.py │ ├── data_sources │ │ ├── __init__.py │ │ ├── blob_source.py │ │ ├── default.py │ │ ├── loader.py │ │ ├── local_source.py │ │ └── typing.py │ └── model.py ├── rag │ ├── __init__.py │ └── typing.py ├── state │ ├── __init__.py │ ├── query_variable.py │ ├── session_variable.py │ └── session_variables.py └── ui │ ├── __init__.py │ ├── full_graph.py │ ├── questions_list.py │ ├── report_details.py │ ├── report_list.py │ ├── search.py │ └── sidebar.py ├── images ├── image-1.png ├── image-2.png ├── image-3.png └── image-4.png ├── poetry.lock └── pyproject.toml /.gitattributes: -------------------------------------------------------------------------------- 1 | *.txt text eol=lf 2 | *.md text eol=lf 3 | *.yml text eol=lf 4 | *.html text eol=lf 5 | *.py text eol=lf 6 | *.toml text eol=lf 7 | .gitattributes text eol=lf 8 | .gitignore text eol=lf 9 | *.lock 10 | CODEOWNERS text eol=lf 11 | LICENSE text eol=lf -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | version: 2 6 | updates: 7 | - package-ecosystem: "pip" # See documentation for possible values 8 | directory: "/" # Location of package manifests 9 | schedule: 10 | interval: "weekly" 11 | - package-ecosystem: "github-actions" 12 | # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.) 13 | directory: "/" 14 | schedule: 15 | interval: "weekly" 16 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | ## Description 16 | 17 | [Provide a brief description of the changes made in this pull request.] 18 | 19 | ## Related Issues 20 | 21 | [Reference any related issues or tasks that this pull request addresses.] 22 | 23 | ## Proposed Changes 24 | 25 | [List the specific changes made in this pull request.] 26 | 27 | ## Checklist 28 | 29 | - [ ] I have tested these changes locally. 30 | - [ ] I have reviewed the code changes. 31 | - [ ] I have updated the documentation (if necessary). 32 | - [ ] I have added appropriate unit tests (if applicable). 33 | 34 | ## Additional Notes 35 | 36 | [Add any additional notes or context that may be helpful for the reviewer(s).] 37 | -------------------------------------------------------------------------------- /.github/workflows/gh-pages.yml: -------------------------------------------------------------------------------- 1 | name: gh-pages 2 | on: 3 | push: 4 | branches: [main] 5 | permissions: 6 | contents: write 7 | 8 | env: 9 | POETRY_VERSION: '1.8.3' 10 | PYTHON_VERSION: '3.11' 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | env: 16 | GH_PAGES: 1 17 | DEBUG: 1 18 | GRAPHRAG_API_KEY: ${{ secrets.GRAPHRAG_API_KEY }} 19 | GRAPHRAG_LLM_MODEL: ${{ secrets.GRAPHRAG_LLM_MODEL }} 20 | GRAPHRAG_EMBEDDING_MODEL: ${{ secrets.GRAPHRAG_EMBEDDING_MODEL }} 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | with: 25 | persist-credentials: false 26 | 27 | - name: Set up Python ${{ env.PYTHON_VERSION }} 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: ${{ env.PYTHON_VERSION }} 31 | 32 | - name: Install Poetry ${{ env.POETRY_VERSION }} 33 | uses: abatilo/actions-poetry@v3.0.0 34 | with: 35 | poetry-version: ${{ env.POETRY_VERSION }} 36 | 37 | - name: poetry install 38 | shell: bash 39 | run: poetry install 40 | 41 | - name: mkdocs build 42 | shell: bash 43 | run: poetry run poe build_docs 44 | 45 | - name: List Docsite Contents 46 | run: find site 47 | 48 | - name: Deploy to GitHub Pages 49 | uses: JamesIves/github-pages-deploy-action@v4.6.4 50 | with: 51 | branch: gh-pages 52 | folder: site 53 | clean: true 54 | -------------------------------------------------------------------------------- /.github/workflows/issues-autoresolve.yml: -------------------------------------------------------------------------------- 1 | name: Close inactive issues 2 | on: 3 | schedule: 4 | - cron: "30 1 * * *" 5 | 6 | permissions: 7 | actions: write 8 | issues: write 9 | pull-requests: write 10 | 11 | jobs: 12 | close-issues: 13 | runs-on: ubuntu-latest 14 | permissions: 15 | issues: write 16 | pull-requests: write 17 | steps: 18 | - uses: actions/stale@v9 19 | with: 20 |
days-before-issue-stale: 7 21 | days-before-issue-close: 5 22 | stale-issue-label: "stale" 23 | close-issue-label: "autoresolved" 24 | stale-issue-message: "This issue has been marked stale due to inactivity after repo maintainer or community member responses that request more information or suggest a solution. It will be closed after five additional days." 25 | close-issue-message: "This issue has been closed after being marked as stale for five days. Please reopen if needed." 26 | any-of-labels: "awaiting_response" 27 | days-before-pr-stale: -1 28 | days-before-pr-close: -1 29 | repo-token: ${{ secrets.GITHUB_TOKEN }} 30 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Python Publish (pypi) 2 | on: 3 | release: 4 | types: [created] 5 | push: 6 | branches: [main] 7 | 8 | env: 9 | POETRY_VERSION: "1.8.3" 10 | PYTHON_VERSION: "3.10" 11 | 12 | jobs: 13 | publish: 14 | name: Upload release to PyPI 15 | if: github.ref == 'refs/heads/main' 16 | runs-on: ubuntu-latest 17 | environment: 18 | name: pypi 19 | url: https://pypi.org/p/graphrag 20 | permissions: 21 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | with: 26 | fetch-depth: 0 27 | fetch-tags: true 28 | 29 | - name: Set up Python 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: ${{ env.PYTHON_VERSION }} 33 | 34 | - name: Install Poetry 35 | uses: abatilo/actions-poetry@v3.0.0 36 | with: 37 | poetry-version: ${{ env.POETRY_VERSION }} 38 | 39 | - name: Install dependencies 40 | shell: bash 41 | run: poetry install 42 | 43 | - name: Export Publication Version 44 | run: echo "version=`poetry version --short`" >> $GITHUB_OUTPUT 45 | 46 | - name: Build Distributable 47 | shell: bash 48 | run: poetry build 49 | 50 | - name: Publish package distributions to PyPI 51 | uses: pypa/gh-action-pypi-publish@release/v1 52 | with: 53 | packages-dir: dist 54 | skip-existing: true 55 | verbose: true 56 | -------------------------------------------------------------------------------- /.github/workflows/semver.yml: -------------------------------------------------------------------------------- 1 | name: Semver Check 2 | on: 3 | pull_request: 4 | types: 5 | - opened 6 | - reopened 7 | - synchronize 8 | - ready_for_review 9 | branches: [main] 10 | 11 | jobs: 12 | semver: 13 | # skip draft PRs 14 | if: github.event.pull_request.draft == false 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | fetch-depth: 0 20 | - name: Check Semver 21 | run: ./scripts/semver-check.sh -------------------------------------------------------------------------------- /.github/workflows/spellcheck.yml: -------------------------------------------------------------------------------- 1 | name: Spellcheck 2 | on: 3 | push: 4 | branches: [main] 5 | pull_request: 6 | types: 7 | - opened 8 | - reopened 9 | - synchronize 10 | - ready_for_review 11 | paths: 12 | - "**/*" 13 | jobs: 14 | spellcheck: 15 | # skip draft PRs 16 | if: github.event.pull_request.draft == false 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Spellcheck 22 | run: ./scripts/spellcheck.sh 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # 
Python Artifacts 2 | python/*/lib/ 3 | dist/ 4 | 5 | # Test Output 6 | .coverage 7 | coverage/ 8 | licenses.txt 9 | examples_notebooks/*/data 10 | tests/fixtures/cache 11 | tests/fixtures/*/cache 12 | tests/fixtures/*/output 13 | output/lancedb 14 | 15 | 16 | # Random 17 | .DS_Store 18 | *.log* 19 | .venv 20 | venv/ 21 | .conda 22 | .tmp 23 | 24 | .env 25 | build.zip 26 | 27 | .turbo 28 | 29 | __pycache__ 30 | 31 | .pipeline 32 | 33 | # Azurite 34 | temp_azurite/ 35 | __azurite*.json 36 | __blobstorage*.json 37 | __blobstorage__/ 38 | 39 | # Getting started example 40 | ragtest/ 41 | .ragtest/ 42 | .pipelines 43 | .pipeline 44 | 45 | 46 | # mkdocs 47 | site/ 48 | 49 | # Docs migration 50 | docsite/ 51 | .yarn/ 52 | .pnp* 53 | 54 | # PyCharm 55 | .idea/ 56 | 57 | # Jupyter notebook 58 | .ipynb_checkpoints/ 59 | -------------------------------------------------------------------------------- /.semversioner/0.1.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Initial Release", 5 | "type": "minor" 6 | } 7 | ], 8 | "created_at": "2024-07-01T21:48:50+00:00", 9 | "version": "0.1.0" 10 | } -------------------------------------------------------------------------------- /.semversioner/0.2.2.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add a check if there is no community record added in local search context", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Add separate workflow for Python Tests", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Docs updates", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Run smoke tests on 4o", 17 | "type": "patch" 18 | } 19 | ], 20 | "created_at": "2024-08-08T22:40:57+00:00", 21 | "version": "0.2.2" 22 | } -------------------------------------------------------------------------------- /.semversioner/0.3.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Implement auto templating API.", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "Implement query engine API.", 9 | "type": "minor" 10 | }, 11 | { 12 | "description": "Fix file dumps using json for non ASCII chars", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Stabilize smoke tests for query context building", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "fix query embedding", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "fix sort_context & max_tokens params in verb", 25 | "type": "patch" 26 | } 27 | ], 28 | "created_at": "2024-08-12T23:51:49+00:00", 29 | "version": "0.3.0" 30 | } -------------------------------------------------------------------------------- /.semversioner/0.3.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add preflight check to check LLM connectivity.", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Add streaming support for local/global search to query cli", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Add support for both float and int on schema validation for community report generation", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Avoid running index on gh-pages publishing", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Implement Index API", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "Improves filtering for data dir
inferring", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "Update to nltk 3.9.1", 29 | "type": "patch" 30 | } 31 | ], 32 | "created_at": "2024-08-21T22:46:19+00:00", 33 | "version": "0.3.1" 34 | } -------------------------------------------------------------------------------- /.semversioner/0.3.2.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add context data to query API responses.", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Add missing config parameter documentation for prompt tuning", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Add neo4j community notebook", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Ensure entity types to be str when running prompt tuning", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Fix weight casting during graph extraction", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "Patch \"past\" dependency issues", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "Update developer guide.", 29 | "type": "patch" 30 | }, 31 | { 32 | "description": "Update query type hints.", 33 | "type": "patch" 34 | }, 35 | { 36 | "description": "change-lancedb-placement", 37 | "type": "patch" 38 | } 39 | ], 40 | "created_at": "2024-08-26T23:43:01+00:00", 41 | "version": "0.3.2" 42 | } -------------------------------------------------------------------------------- /.semversioner/0.3.4.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Deep copy txt units on local search to avoid race conditions", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Fix summarization including empty descriptions", 9 | "type": "patch" 10 | } 11 | ], 12 | "created_at": "2024-09-11T22:31:58+00:00", 13 | "version": "0.3.4" 14 | } -------------------------------------------------------------------------------- /.semversioner/0.3.5.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add compound verbs with tests infra.", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Collapse create_final_communities.", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Collapse create_final_text_units.", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Covariate verb collapse.", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Fix duplicates in community context builder", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "Fix prompt tune output path", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "Fix seed hardcoded init", 29 | "type": "patch" 30 | }, 31 | { 32 | "description": "Fix seeded random gen on clustering", 33 | "type": "patch" 34 | }, 35 | { 36 | "description": "Improve logging.", 37 | "type": "patch" 38 | }, 39 | { 40 | "description": "Set default values for cli parameters.", 41 | "type": "patch" 42 | }, 43 | { 44 | "description": "Use static output directories.", 45 | "type": "patch" 46 | } 47 | ], 48 | "created_at": "2024-09-19T15:26:01+00:00", 49 | "version": "0.3.5" 50 | } -------------------------------------------------------------------------------- /.semversioner/0.3.6.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Collapse create_final_relationships.", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Dependency update and cleanup", 9 | "type": 
"patch" 10 | } 11 | ], 12 | "created_at": "2024-09-20T00:09:13+00:00", 13 | "version": "0.3.6" 14 | } -------------------------------------------------------------------------------- /.semversioner/0.4.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add update cli entrypoint for incremental indexing", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Allow some CI/CD jobs to skip PRs dedicated to doc updates only.", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Fix a file paths issue in the viz guide.", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Fix optional covariates update in incremental indexing", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Raise error on empty deltas for inc indexing", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "add visualization guide to doc site", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "fix streaming output error", 29 | "type": "patch" 30 | } 31 | ], 32 | "created_at": "2024-11-08T23:13:05+00:00", 33 | "version": "0.4.1" 34 | } -------------------------------------------------------------------------------- /.semversioner/0.5.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Data model changes.", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "Add Parquet as part of the default emitters when not pressent", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Centralized prompts and export all for easier injection.", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Cleanup of artifact outputs/schemas.", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Config and docs updates.", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "Implement dynamic community selection to global search", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "fix autocompletion of existing files/directory paths.", 29 | "type": "patch" 30 | }, 31 | { 32 | "description": "move import statements out of init files", 33 | "type": "patch" 34 | } 35 | ], 36 | "created_at": "2024-11-16T00:43:06+00:00", 37 | "version": "0.5.0" 38 | } -------------------------------------------------------------------------------- /.semversioner/0.9.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Refactor graph creation.", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "Dependency updates", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Fix Global Search with dynamic Community selection bug", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Fix question gen.", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Optimize Final Community Reports calculation and stabilize cache", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "miscellaneous code cleanup and minor changes for better alignment of style across the codebase.", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "replace llm package with fnllm", 29 | "type": "patch" 30 | }, 31 | { 32 | "description": "replaced md5 hash with sha256", 33 | "type": "patch" 34 | }, 35 | { 36 | "description": "replaced md5 hash with sha512", 37 | "type": "patch" 38 | }, 39 | { 40 | "description": "update API and add a demonstration notebook", 41 | "type": "patch" 42 | } 43 | ], 44 | "created_at": "2024-12-06T20:12:30+00:00", 45 | "version": "0.9.0" 46 | } 
-------------------------------------------------------------------------------- /.semversioner/1.0.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add Parent id to communities data model", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Add migration notebook.", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Create separate community workflow, collapse subflows.", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Dependency Updates", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "cleanup and refactor factory classes.", 21 | "type": "patch" 22 | } 23 | ], 24 | "created_at": "2024-12-11T21:41:49+00:00", 25 | "version": "1.0.0" 26 | } -------------------------------------------------------------------------------- /.semversioner/1.0.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Fix encoding model config parsing", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Fix exception on error callbacks", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Manage llm instances inside a cached singleton. Check for empty dfs after entity/relationship extraction", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Respect encoding_model option", 17 | "type": "patch" 18 | } 19 | ], 20 | "created_at": "2024-12-18T23:12:52+00:00", 21 | "version": "1.0.1" 22 | } -------------------------------------------------------------------------------- /.semversioner/1.1.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Make gleanings independent of encoding", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "Remove DataShaper (first steps).", 9 | "type": "minor" 10 | }, 11 | { 12 | "description": "Remove old pipeline runner.", 13 | "type": "minor" 14 | }, 15 | { 16 | "description": "new search implemented as a new option for the api", 17 | "type": "minor" 18 | }, 19 | { 20 | "description": "Fix gleanings loop check", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "Implement cosmosdb storage option for cache and output", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "Move extractor code to co-locate with operations.", 29 | "type": "patch" 30 | }, 31 | { 32 | "description": "Remove config input models.", 33 | "type": "patch" 34 | }, 35 | { 36 | "description": "Ruff update", 37 | "type": "patch" 38 | }, 39 | { 40 | "description": "Simplify and streamline internal config.", 41 | "type": "patch" 42 | }, 43 | { 44 | "description": "Simplify callbacks model.", 45 | "type": "patch" 46 | }, 47 | { 48 | "description": "Streamline flows.", 49 | "type": "patch" 50 | }, 51 | { 52 | "description": "fix instantiation of storage classes.", 53 | "type": "patch" 54 | } 55 | ], 56 | "created_at": "2025-01-07T20:25:57+00:00", 57 | "version": "1.1.0" 58 | } -------------------------------------------------------------------------------- /.semversioner/1.1.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Fix a bug on creating community hierarchy for dynamic search", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Increase LOCAL_SEARCH_COMMUNITY_PROP to 15%", 9 | "type": "patch" 10 | } 11 | ], 12 | "created_at": "2025-01-08T21:53:16+00:00", 13 | "version": "1.1.1" 14 | } 
-------------------------------------------------------------------------------- /.semversioner/1.1.2.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Basic Rag minor fix", 5 | "type": "patch" 6 | } 7 | ], 8 | "created_at": "2025-01-09T22:29:23+00:00", 9 | "version": "1.1.2" 10 | } -------------------------------------------------------------------------------- /.semversioner/1.2.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add Drift Reduce response and streaming endpoint", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "add cosmosdb vector store", 9 | "type": "minor" 10 | }, 11 | { 12 | "description": "Fix example notebooks", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Set default rate limits.", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "unit tests for text_splitting", 21 | "type": "patch" 22 | } 23 | ], 24 | "created_at": "2025-01-15T20:32:00+00:00", 25 | "version": "1.2.0" 26 | } -------------------------------------------------------------------------------- /.semversioner/2.1.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Add support for JSON input files.", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "Updated the prompt tuning client to support csv-metadata injection and updated output file types to match the new naming convention.", 9 | "type": "minor" 10 | }, 11 | { 12 | "description": "Add check for custom model types while config loading", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Adds general-purpose pipeline run state object.", 17 | "type": "patch" 18 | } 19 | ], 20 | "created_at": "2025-03-11T23:53:00+00:00", 21 | "version": "2.1.0" 22 | } -------------------------------------------------------------------------------- /.semversioner/2.2.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Support OpenAI reasoning models.", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "Add option to snapshot raw extracted graph tables.", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Added batching logic to the prompt tuning autoselection embeddings workflow", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Align config classes and docs better.", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Align embeddings table loading with configured fields.", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "Brings parity with our latest NLP extraction approaches.", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "Fix fnllm to 0.2.3", 29 | "type": "patch" 30 | }, 31 | { 32 | "description": "Fixes to basic search.", 33 | "type": "patch" 34 | }, 35 | { 36 | "description": "Update llm args for consistency.", 37 | "type": "patch" 38 | }, 39 | { 40 | "description": "add vector store integration tests", 41 | "type": "patch" 42 | } 43 | ], 44 | "created_at": "2025-04-25T23:30:57+00:00", 45 | "version": "2.2.0" 46 | } -------------------------------------------------------------------------------- /.semversioner/2.2.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Fix Community Report prompt tuning response", 5 | "type": "patch" 6 | }, 7 | { 8 | "description": "Fix graph
creation missing edge weights.", 9 | "type": "patch" 10 | }, 11 | { 12 | "description": "Update as workflows", 13 | "type": "patch" 14 | } 15 | ], 16 | "created_at": "2025-04-30T23:50:31+00:00", 17 | "version": "2.2.1" 18 | } -------------------------------------------------------------------------------- /.semversioner/2.3.0.json: -------------------------------------------------------------------------------- 1 | { 2 | "changes": [ 3 | { 4 | "description": "Remove Dynamic Max Retries support. Refactor typer typing in cli interface", 5 | "type": "minor" 6 | }, 7 | { 8 | "description": "Update fnllm to latest. Update default graphrag configuration", 9 | "type": "minor" 10 | }, 11 | { 12 | "description": "A few fixes and enhancements for better reuse and flow.", 13 | "type": "patch" 14 | }, 15 | { 16 | "description": "Add full llm response to LLM PRovider output", 17 | "type": "patch" 18 | }, 19 | { 20 | "description": "Fix Drift Reduce Response for non streaming calls", 21 | "type": "patch" 22 | }, 23 | { 24 | "description": "Fix global search prompt to include missing formatting key", 25 | "type": "patch" 26 | }, 27 | { 28 | "description": "Upgrade pyarrow dependency to >=17.0.0 to fix CVE-2024-52338", 29 | "type": "patch" 30 | } 31 | ], 32 | "created_at": "2025-05-23T21:02:47+00:00", 33 | "version": "2.3.0" 34 | } -------------------------------------------------------------------------------- /.semversioner/next-release/patch-20250530204951787463.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "patch", 3 | "description": "Update typer." 4 | } 5 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "arcanis.vscode-zipfs", 4 | "ms-python.python", 5 | "charliermarsh.ruff", 6 | "ms-python.vscode-pylance", 7 | "bierner.markdown-mermaid", 8 | "streetsidesoftware.code-spell-checker", 9 | "ronnidc.nunjucks", 10 | "lucien-martijn.parquet-visualizer", 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "_comment": "Use this file to configure the graphrag project for debugging. 
You may create other configuration profiles based on these or select one below to use.", 3 | "version": "0.2.0", 4 | "configurations": [ 5 | { 6 | "name": "Indexer", 7 | "type": "debugpy", 8 | "request": "launch", 9 | "module": "poetry", 10 | "args": [ 11 | "poe", "index", 12 | "--root", "" 13 | ], 14 | }, 15 | { 16 | "name": "Query", 17 | "type": "debugpy", 18 | "request": "launch", 19 | "module": "poetry", 20 | "args": [ 21 | "poe", "query", 22 | "--root", "", 23 | "--method", "global", 24 | "--query", "What are the top themes in this story", 25 | ] 26 | }, 27 | { 28 | "name": "Prompt Tuning", 29 | "type": "debugpy", 30 | "request": "launch", 31 | "module": "poetry", 32 | "args": [ 33 | "poe", "prompt-tune", 34 | "--config", 35 | "/settings.yaml", 36 | ] 37 | } 38 | ] 39 | } -------------------------------------------------------------------------------- /.vsts-ci.yml: -------------------------------------------------------------------------------- 1 | name: GraphRAG CI 2 | pool: 3 | vmImage: ubuntu-latest 4 | 5 | trigger: 6 | batch: true 7 | branches: 8 | include: 9 | - main 10 | 11 | variables: 12 | isMain: $[eq(variables['Build.SourceBranch'], 'refs/heads/main')] 13 | pythonVersion: "3.10" 14 | poetryVersion: "1.6.1" 15 | nodeVersion: "18.x" 16 | artifactsFullFeedName: "Resilience/resilience_python" 17 | 18 | stages: 19 | - stage: Compliance 20 | dependsOn: [] 21 | jobs: 22 | - job: compliance 23 | displayName: Compliance 24 | pool: 25 | vmImage: windows-latest 26 | steps: 27 | - task: CredScan@3 28 | inputs: 29 | outputFormat: sarif 30 | debugMode: false 31 | 32 | - task: ComponentGovernanceComponentDetection@0 33 | inputs: 34 | scanType: "Register" 35 | verbosity: "Verbose" 36 | alertWarningLevel: "High" 37 | 38 | - task: PublishSecurityAnalysisLogs@3 39 | inputs: 40 | ArtifactName: "CodeAnalysisLogs" 41 | ArtifactType: "Container" -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. Unless a later match takes precedence, 3 | # @global-owner1 and @global-owner2 will be requested for 4 | # review when someone opens a pull request. 5 | * @microsoft/societal-resilience @microsoft/graphrag-core-team 6 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new Issue. 8 | 9 | For help and questions about using this project, please file an issue on the repo. 10 | 11 | ## Microsoft Support Policy 12 | 13 | Support for this project is limited to the resources listed above. 14 | -------------------------------------------------------------------------------- /cspell.config.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://raw.githubusercontent.com/streetsidesoftware/cspell/main/cspell.schema.json 2 | version: "0.2" 3 | allowCompoundWords: true 4 | dictionaryDefinitions: 5 | - name: dictionary 6 | path: "./dictionary.txt" 7 | addWords: true 8 | dictionaries: 9 | - dictionary 10 | ignorePaths: 11 | - cspell.config.yaml 12 | - node_modules 13 | - _site 14 | - /project-words.txt 15 | - default_pipeline.yml 16 | - .turbo 17 | - output/ 18 | - dist/ 19 | - temp_azurite/ 20 | - __pycache__ 21 | - pyproject.toml 22 | - entity_extraction.txt 23 | - package.json 24 | - tests/fixtures/ 25 | - examples_notebooks/inputs/ 26 | - docs/examples_notebooks/inputs/ 27 | - "*.csv" 28 | - "*.parquet" 29 | - "*.faiss" 30 | - "*.ipynb" 31 | - "*.log" 32 | -------------------------------------------------------------------------------- /docs/cli.md: -------------------------------------------------------------------------------- 1 | # CLI Reference 2 | 3 | This page documents the command-line interface of the graphrag library.
4 | 5 | ::: mkdocs-typer 6 | :module: graphrag.cli.main 7 | :prog_name: graphrag 8 | :command: app 9 | :depth: 0 10 | -------------------------------------------------------------------------------- /docs/config/overview.md: -------------------------------------------------------------------------------- 1 | # Configuring GraphRAG Indexing 2 | 3 | The GraphRAG system is highly configurable. This page provides an overview of the configuration options available for the GraphRAG indexing engine. 4 | 5 | ## Default Configuration Mode 6 | 7 | The default configuration mode is the simplest way to get started with the GraphRAG system. It is designed to work out-of-the-box with minimal configuration. The main ways to set up GraphRAG in Default Configuration mode are via: 8 | 9 | - [Init command](init.md) (recommended first step) 10 | - [Edit settings.yaml for deeper control](yaml.md) 11 | - [Purely using environment variables](env_vars.md) (not recommended) 12 | -------------------------------------------------------------------------------- /docs/data/operation_dulce/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of integration testing. -------------------------------------------------------------------------------- /docs/data/operation_dulce/dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/data/operation_dulce/dataset.zip -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of providing a starting point for notebook experimentation. 
4 | -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/communities.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/communities.parquet -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/community_reports.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/community_reports.parquet -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/covariates.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/covariates.parquet -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/documents.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/documents.parquet -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/entities.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/entities.parquet -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_transactions/0-2fed1d8b-daac-41b0-a93a-e115cda75be3.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_transactions/0-2fed1d8b-daac-41b0-a93a-e115cda75be3.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_transactions/1-61dbb7c2-aec3-4796-b223-941fc7cc93cc.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_transactions/1-61dbb7c2-aec3-4796-b223-941fc7cc93cc.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_transactions/2-60012692-a153-48f9-8f4e-c479b44cbf3f.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation 
dulce/lancedb/default-community-full_content.lance/_transactions/2-60012692-a153-48f9-8f4e-c479b44cbf3f.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_transactions/3-0d2dc9a1-094f-4220-83c7-6ad6f26fac2b.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_transactions/3-0d2dc9a1-094f-4220-83c7-6ad6f26fac2b.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/1.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/1.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/2.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/2.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/3.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/3.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/4.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/_versions/4.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/data/1e7b2d94-ed06-4aa0-b22e-86a71d416bc6.lance: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/data/1e7b2d94-ed06-4aa0-b22e-86a71d416bc6.lance -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/data/1ed9f301-ce30-46a8-8c0b-9c2a60a3cf43.lance: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-community-full_content.lance/data/1ed9f301-ce30-46a8-8c0b-9c2a60a3cf43.lance -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/0-92c031e5-7558-451e-9d0f-f5514db9616d.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/0-92c031e5-7558-451e-9d0f-f5514db9616d.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/1-7b3cb8d8-3512-4584-a003-91838fed8911.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/1-7b3cb8d8-3512-4584-a003-91838fed8911.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/2-7de627d2-4c57-49e9-bf73-c17a9582ead4.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/2-7de627d2-4c57-49e9-bf73-c17a9582ead4.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/3-9ad29d69-9a69-43a8-8b26-252ea267958d.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_transactions/3-9ad29d69-9a69-43a8-8b26-252ea267958d.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_versions/1.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_versions/1.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_versions/2.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_versions/2.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation 
dulce/lancedb/default-entity-description.lance/_versions/3.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_versions/3.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_versions/4.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/_versions/4.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/data/a34575c4-5260-457f-bebe-3f40bc0e2ee3.lance: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/data/a34575c4-5260-457f-bebe-3f40bc0e2ee3.lance -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/data/eabd7580-86f5-4022-8aa7-fe0aff816d98.lance: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance/data/eabd7580-86f5-4022-8aa7-fe0aff816d98.lance -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/0-fd0434ac-e5cd-4ddd-9dd5-e5048d4edb59.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/0-fd0434ac-e5cd-4ddd-9dd5-e5048d4edb59.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/1-14bb4b1d-cc00-420b-9b14-3626f0bd8c0b.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/1-14bb4b1d-cc00-420b-9b14-3626f0bd8c0b.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/2-8e74264c-f72d-44f5-a6f4-b3b61ae6a43b.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/2-8e74264c-f72d-44f5-a6f4-b3b61ae6a43b.txn 
-------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/3-7516fb71-9db3-4666-bdef-ea04c1eb9697.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_transactions/3-7516fb71-9db3-4666-bdef-ea04c1eb9697.txn -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/1.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/1.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/2.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/2.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/3.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/3.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/4.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/_versions/4.manifest -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/data/2794bf5b-de3d-4202-ab16-e76bc27c8e6a.lance: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/data/2794bf5b-de3d-4202-ab16-e76bc27c8e6a.lance -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/data/2f74c8e8-3f35-4209-889c-a13cf0780eb3.lance: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/lancedb/default-text_unit-text.lance/data/2f74c8e8-3f35-4209-889c-a13cf0780eb3.lance -------------------------------------------------------------------------------- 
/docs/examples_notebooks/inputs/operation dulce/relationships.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/relationships.parquet -------------------------------------------------------------------------------- /docs/examples_notebooks/inputs/operation dulce/text_units.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/examples_notebooks/inputs/operation dulce/text_units.parquet -------------------------------------------------------------------------------- /docs/img/GraphRag-Figure1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/GraphRag-Figure1.jpg -------------------------------------------------------------------------------- /docs/img/auto-tune-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/auto-tune-diagram.png -------------------------------------------------------------------------------- /docs/img/drift-search-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/drift-search-diagram.png -------------------------------------------------------------------------------- /docs/img/pipeline-running.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/pipeline-running.png -------------------------------------------------------------------------------- /docs/img/viz_guide/gephi-appearance-pane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/viz_guide/gephi-appearance-pane.png -------------------------------------------------------------------------------- /docs/img/viz_guide/gephi-initial-graph-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/viz_guide/gephi-initial-graph-example.png -------------------------------------------------------------------------------- /docs/img/viz_guide/gephi-layout-forceatlas2-pane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/viz_guide/gephi-layout-forceatlas2-pane.png -------------------------------------------------------------------------------- /docs/img/viz_guide/gephi-layout-pane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/viz_guide/gephi-layout-pane.png -------------------------------------------------------------------------------- /docs/img/viz_guide/gephi-network-overview-settings.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/docs/img/viz_guide/gephi-network-overview-settings.png -------------------------------------------------------------------------------- /docs/prompt_tuning/overview.md: -------------------------------------------------------------------------------- 1 | # Prompt Tuning ⚙️ 2 | 3 | This page provides an overview of the prompt tuning options available for the GraphRAG indexing engine. 4 | 5 | ## Default Prompts 6 | 7 | The default prompts are the simplest way to get started with the GraphRAG system. They are designed to work out-of-the-box with minimal configuration. More details about each of the default prompts for indexing and query can be found on the [manual tuning](./manual_prompt_tuning.md) page. 8 | 9 | ## Auto Tuning 10 | 11 | Auto Tuning leverages your input data and LLM interactions to create domain-adapted prompts for the generation of the knowledge graph. Running it is highly encouraged, as it will yield better results when executing an Index Run. For more details about how to use it, please refer to the [Auto Tuning](auto_prompt_tuning.md) documentation. 12 | 13 | ## Manual Tuning 14 | 15 | Manual tuning is an advanced use case. Most users will want to use the Auto Tuning feature instead. Details about how to use manual configuration are available in the [manual tuning](manual_prompt_tuning.md) documentation. 16 | -------------------------------------------------------------------------------- /docs/query/notebooks/overview.md: -------------------------------------------------------------------------------- 1 | # API Notebooks 2 | 3 | - [API Overview Notebook](../../examples_notebooks/api_overview.ipynb) 4 | 5 | # Query Engine Notebooks 6 | 7 | For examples of running queries, please refer to the following notebooks: 8 | 9 | - [Global Search Notebook](../../examples_notebooks/global_search.ipynb) 10 | - [Local Search Notebook](../../examples_notebooks/local_search.ipynb) 11 | - [DRIFT Search Notebook](../../examples_notebooks/drift_search.ipynb) 12 | 13 | The test dataset for these notebooks can be found in [dataset.zip](../../data/operation_dulce/dataset.zip){:download}.
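As a complement to the notebooks, the sketch below shows the general shape of a programmatic global search call. It is a minimal, hedged example: `graphrag.api.global_search` and `create_graphrag_config` are exported by the modules included later in this dump, but the project layout under `./ragtest`, loading `settings.yaml` directly into the values dict (skipping environment-variable interpolation), and the exact keyword arguments are assumptions for illustration and may differ between releases.

```python
import asyncio

import pandas as pd
import yaml

import graphrag.api as api
from graphrag.config.create_graphrag_config import create_graphrag_config


async def main() -> None:
    # Assumed project layout: an already-indexed project rooted at ./ragtest
    # whose parquet outputs live in ./ragtest/output.
    with open("./ragtest/settings.yaml") as f:
        settings = yaml.safe_load(f)
    config = create_graphrag_config(values=settings, root_dir="./ragtest")

    entities = pd.read_parquet("./ragtest/output/entities.parquet")
    communities = pd.read_parquet("./ragtest/output/communities.parquet")
    community_reports = pd.read_parquet("./ragtest/output/community_reports.parquet")

    # Keyword arguments below are assumptions based on the bundled notebooks;
    # consult the Global Search notebook for the exact signature in your release.
    response, context = await api.global_search(
        config=config,
        entities=entities,
        communities=communities,
        community_reports=community_reports,
        community_level=2,
        dynamic_community_selection=False,
        response_type="Multiple Paragraphs",
        query="What are the main themes of Operation Dulce?",
    )
    print(response)


asyncio.run(main())
```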
14 | -------------------------------------------------------------------------------- /docs/scripts/create_cookie_banner.js: -------------------------------------------------------------------------------- 1 | function onConsentChanged(categoryPreferences) { 2 | console.log("onConsentChanged", categoryPreferences); 3 | } 4 | 5 | 6 | cb = document.createElement("div"); 7 | cb.id = "cookie-banner"; 8 | document.body.insertBefore(cb, document.body.children[0]); 9 | 10 | window.WcpConsent && WcpConsent.init("en-US", "cookie-banner", function (err, consent) { 11 | if (!err) { 12 | console.log("consent: ", consent); 13 | window.manageConsent = () => consent.manageConsent(); 14 | siteConsent = consent; 15 | } else { 16 | console.log("Error initializing WcpConsent: "+ err); 17 | } 18 | }, onConsentChanged, WcpConsent.themes.light); -------------------------------------------------------------------------------- /docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | [data-md-color-scheme="default"] { 2 | --md-primary-fg-color: #3c4cab; 3 | --md-code-hl-color: #3772d9; 4 | --md-code-hl-comment-color: #6b6b6b; 5 | --md-code-hl-operator-color: #6b6b6b; 6 | --md-footer-fg-color--light: #ffffff; 7 | --md-footer-fg-color--lighter: #ffffff; 8 | } 9 | 10 | [data-md-color-scheme="slate"] { 11 | --md-primary-fg-color: #364499; 12 | --md-code-hl-color: #246be5; 13 | --md-code-hl-constant-color: #9a89ed; 14 | --md-code-hl-number-color: #f16e5f; 15 | --md-footer-fg-color--light: #ffffff; 16 | --md-footer-fg-color--lighter: #ffffff; 17 | } 18 | 19 | .md-tabs__item--active { 20 | background-color: var(--md-primary-bg-color); 21 | } 22 | 23 | .md-tabs__item--active .md-tabs__link { 24 | color: var(--md-code-hl-color); 25 | } 26 | 27 | .md-typeset a { 28 | text-decoration: underline; 29 | } -------------------------------------------------------------------------------- /examples_notebooks/community_contrib/README.md: -------------------------------------------------------------------------------- 1 | ## Disclaimer 2 | 3 | This folder contains community contributed notebooks that are not officially supported by the GraphRAG team. The notebooks are provided as-is and are not guaranteed to work with the latest version of GraphRAG. If you have any questions or issues, please reach out to the author of the notebook directly. 
4 | 5 | For more information on how to contribute to the GraphRAG project, please refer to the [contribution guidelines](https://github.com/microsoft/graphrag/blob/main/CONTRIBUTING.md) 6 | -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_latest.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_latest.manifest -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_transactions/0-498c6e24-dd0a-42b9-8f7e-5e3d2ab258b0.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_transactions/0-498c6e24-dd0a-42b9-8f7e-5e3d2ab258b0.txn -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_transactions/1-bf5aa024-a229-461f-8d78-699841a302fe.txn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_transactions/1-bf5aa024-a229-461f-8d78-699841a302fe.txn -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_versions/1.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_versions/1.manifest -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_versions/2.manifest: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/_versions/2.manifest -------------------------------------------------------------------------------- /examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/data/fe64774f-5412-4c9c-8dea-f6ed55c81119.lance: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/examples_notebooks/inputs/operation dulce/lancedb/entity_description_embeddings.lance/data/fe64774f-5412-4c9c-8dea-f6ed55c81119.lance -------------------------------------------------------------------------------- /graphrag/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The GraphRAG package.""" 5 | -------------------------------------------------------------------------------- /graphrag/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The GraphRAG package.""" 5 | 6 | from graphrag.cli.main import app 7 | 8 | app(prog_name="graphrag") 9 | -------------------------------------------------------------------------------- /graphrag/api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """API for GraphRAG. 5 | 6 | WARNING: This API is under development and may undergo changes in future releases. 7 | Backwards compatibility is not guaranteed at this time. 8 | """ 9 | 10 | from graphrag.api.index import build_index 11 | from graphrag.api.prompt_tune import generate_indexing_prompts 12 | from graphrag.api.query import ( 13 | basic_search, 14 | basic_search_streaming, 15 | drift_search, 16 | drift_search_streaming, 17 | global_search, 18 | global_search_streaming, 19 | local_search, 20 | local_search_streaming, 21 | multi_index_basic_search, 22 | multi_index_drift_search, 23 | multi_index_global_search, 24 | multi_index_local_search, 25 | ) 26 | from graphrag.prompt_tune.types import DocSelectionType 27 | 28 | __all__ = [ # noqa: RUF022 29 | # index API 30 | "build_index", 31 | # query API 32 | "global_search", 33 | "global_search_streaming", 34 | "local_search", 35 | "local_search_streaming", 36 | "drift_search", 37 | "drift_search_streaming", 38 | "basic_search", 39 | "basic_search_streaming", 40 | "multi_index_basic_search", 41 | "multi_index_drift_search", 42 | "multi_index_global_search", 43 | "multi_index_local_search", 44 | # prompt tuning API 45 | "DocSelectionType", 46 | "generate_indexing_prompts", 47 | ] 48 | -------------------------------------------------------------------------------- /graphrag/cache/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A package containing cache implementations.""" 5 | -------------------------------------------------------------------------------- /graphrag/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A package containing callback implementations.""" 5 | -------------------------------------------------------------------------------- /graphrag/callbacks/console_workflow_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A logger that emits updates from the indexing engine to the console.""" 5 | 6 | from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks 7 | 8 | 9 | class ConsoleWorkflowCallbacks(NoopWorkflowCallbacks): 10 | """A logger that writes to a console.""" 11 | 12 | def error( 13 | self, 14 | message: str, 15 | cause: BaseException | None = None, 16 | stack: str | None = None, 17 | details: dict | None = None, 18 | ): 19 | """Handle when an error occurs.""" 20 | print(message, str(cause), stack, details) # noqa T201 21 | 22 | def warning(self, message: str, details: dict | None = None): 23 | """Handle when a warning occurs.""" 24 | _print_warning(message) 25 | 26 | def log(self, message: str, details: dict | None = None): 27 | """Handle when a log message is produced.""" 28 | print(message, details) # noqa T201 29 | 30 | 31 | def _print_warning(skk): 32 | print("\033[93m {}\033[00m".format(skk)) # noqa T201 33 | -------------------------------------------------------------------------------- /graphrag/callbacks/llm_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """LLM Callbacks.""" 5 | 6 | from typing import Protocol 7 | 8 | 9 | class BaseLLMCallback(Protocol): 10 | """Base class for LLM callbacks.""" 11 | 12 | def on_llm_new_token(self, token: str): 13 | """Handle when a new token is generated.""" 14 | ... 15 | -------------------------------------------------------------------------------- /graphrag/callbacks/noop_query_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """No-op Query Callbacks.""" 5 | 6 | from typing import Any 7 | 8 | from graphrag.callbacks.query_callbacks import QueryCallbacks 9 | from graphrag.query.structured_search.base import SearchResult 10 | 11 | 12 | class NoopQueryCallbacks(QueryCallbacks): 13 | """A no-op implementation of QueryCallbacks.""" 14 | 15 | def on_context(self, context: Any) -> None: 16 | """Handle when context data is constructed.""" 17 | 18 | def on_map_response_start(self, map_response_contexts: list[str]) -> None: 19 | """Handle the start of map operation.""" 20 | 21 | def on_map_response_end(self, map_response_outputs: list[SearchResult]) -> None: 22 | """Handle the end of map operation.""" 23 | 24 | def on_reduce_response_start( 25 | self, reduce_response_context: str | dict[str, Any] 26 | ) -> None: 27 | """Handle the start of reduce operation.""" 28 | 29 | def on_reduce_response_end(self, reduce_response_output: str) -> None: 30 | """Handle the end of reduce operation.""" 31 | 32 | def on_llm_new_token(self, token): 33 | """Handle when a new token is generated.""" 34 | -------------------------------------------------------------------------------- /graphrag/callbacks/progress_workflow_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
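# --- Editor's sketch (not part of the repository) --------------------------------
# Why: the callback classes above (BaseLLMCallback, QueryCallbacks,
# NoopQueryCallbacks) define the hooks the query engine exposes. A minimal
# concrete subclass that streams generated tokens to stdout might look like the
# one below; where it gets passed in is an assumption (anywhere a QueryCallbacks
# instance is accepted, e.g. by the streaming search functions in graphrag/api/query.py).
from graphrag.callbacks.noop_query_callbacks import NoopQueryCallbacks


class StdoutStreamingCallbacks(NoopQueryCallbacks):
    """Print each generated token as it arrives."""

    def on_llm_new_token(self, token):
        # Called once per generated token during a streaming search.
        print(token, end="", flush=True)
# ----------------------------------------------------------------------------------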
2 | # Licensed under the MIT License 3 | 4 | """A workflow callback manager that emits updates.""" 5 | 6 | from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks 7 | from graphrag.logger.base import ProgressLogger 8 | from graphrag.logger.progress import Progress 9 | 10 | 11 | class ProgressWorkflowCallbacks(NoopWorkflowCallbacks): 12 | """A callback manager that delegates to a ProgressLogger.""" 13 | 14 | _root_progress: ProgressLogger 15 | _progress_stack: list[ProgressLogger] 16 | 17 | def __init__(self, progress: ProgressLogger) -> None: 18 | """Create a new ProgressWorkflowCallbacks.""" 19 | self._progress = progress 20 | self._progress_stack = [progress] 21 | 22 | def _pop(self) -> None: 23 | self._progress_stack.pop() 24 | 25 | def _push(self, name: str) -> None: 26 | self._progress_stack.append(self._latest.child(name)) 27 | 28 | @property 29 | def _latest(self) -> ProgressLogger: 30 | return self._progress_stack[-1] 31 | 32 | def workflow_start(self, name: str, instance: object) -> None: 33 | """Execute this callback when a workflow starts.""" 34 | self._push(name) 35 | 36 | def workflow_end(self, name: str, instance: object) -> None: 37 | """Execute this callback when a workflow ends.""" 38 | self._pop() 39 | 40 | def progress(self, progress: Progress) -> None: 41 | """Handle when progress occurs.""" 42 | self._latest(progress) 43 | -------------------------------------------------------------------------------- /graphrag/callbacks/query_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Query Callbacks.""" 5 | 6 | from typing import Any 7 | 8 | from graphrag.callbacks.llm_callbacks import BaseLLMCallback 9 | from graphrag.query.structured_search.base import SearchResult 10 | 11 | 12 | class QueryCallbacks(BaseLLMCallback): 13 | """Callbacks used during query execution.""" 14 | 15 | def on_context(self, context: Any) -> None: 16 | """Handle when context data is constructed.""" 17 | 18 | def on_map_response_start(self, map_response_contexts: list[str]) -> None: 19 | """Handle the start of map operation.""" 20 | 21 | def on_map_response_end(self, map_response_outputs: list[SearchResult]) -> None: 22 | """Handle the end of map operation.""" 23 | 24 | def on_reduce_response_start( 25 | self, reduce_response_context: str | dict[str, Any] 26 | ) -> None: 27 | """Handle the start of reduce operation.""" 28 | 29 | def on_reduce_response_end(self, reduce_response_output: str) -> None: 30 | """Handle the end of reduce operation.""" 31 | 32 | def on_llm_new_token(self, token) -> None: 33 | """Handle when a new token is generated.""" 34 | -------------------------------------------------------------------------------- /graphrag/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """CLI for GraphRAG.""" 5 | -------------------------------------------------------------------------------- /graphrag/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """The config package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/config/create_graphrag_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration, loaded from environment variables.""" 5 | 6 | from pathlib import Path 7 | from typing import Any 8 | 9 | from graphrag.config.models.graph_rag_config import GraphRagConfig 10 | 11 | 12 | def create_graphrag_config( 13 | values: dict[str, Any] | None = None, 14 | root_dir: str | None = None, 15 | ) -> GraphRagConfig: 16 | """Load Configuration Parameters from a dictionary. 17 | 18 | Parameters 19 | ---------- 20 | values : dict[str, Any] | None 21 | Dictionary of configuration values to pass into pydantic model. 22 | root_dir : str | None 23 | Root directory for the project. 24 | 25 | Returns 26 | ------- 27 | GraphRagConfig 28 | The configuration object. 29 | 30 | Raises 31 | ------ 32 | ValidationError 33 | If the configuration values do not satisfy pydantic validation. 34 | """ 35 | values = values or {} 36 | if root_dir: 37 | root_path = Path(root_dir).resolve() 38 | values["root_dir"] = str(root_path) 39 | return GraphRagConfig(**values) 40 | -------------------------------------------------------------------------------- /graphrag/config/get_embedding_settings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing get_embedding_settings.""" 5 | 6 | from graphrag.config.models.graph_rag_config import GraphRagConfig 7 | 8 | 9 | def get_embedding_settings( 10 | settings: GraphRagConfig, 11 | vector_store_params: dict | None = None, 12 | ) -> dict: 13 | """Transform GraphRAG config into settings for workflows.""" 14 | # TEMP 15 | embeddings_llm_settings = settings.get_language_model_config( 16 | settings.embed_text.model_id 17 | ) 18 | vector_store_settings = settings.get_vector_store_config( 19 | settings.embed_text.vector_store_id 20 | ).model_dump() 21 | 22 | # 23 | # If we get to this point, settings.vector_store is defined, and there's a specific setting for this embedding. 24 | # settings.vector_store.base contains connection information, or may be undefined 25 | # settings.vector_store. contains the specific settings for this embedding 26 | # 27 | strategy = settings.embed_text.resolved_strategy( 28 | embeddings_llm_settings 29 | ) # get the default strategy 30 | strategy.update({ 31 | "vector_store": { 32 | **(vector_store_params or {}), 33 | **(vector_store_settings), 34 | } 35 | }) # update the default strategy with the vector store settings 36 | # This ensures the vector store config is part of the strategy and not the global config 37 | return { 38 | "strategy": strategy, 39 | } 40 | -------------------------------------------------------------------------------- /graphrag/config/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
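# --- Editor's sketch (not part of the repository) --------------------------------
# Why: create_graphrag_config above documents its parameters but shows no call
# site. This illustrates one, assuming the values dict mirrors an initialized
# settings.yaml. The "models"/"default_chat_model" keys, the "openai_chat" type,
# and the model fields are assumptions for illustration only and are not
# verified against the current schema; additional fields may be required.
import os

from graphrag.config.create_graphrag_config import create_graphrag_config

example_values = {
    "models": {  # assumed top-level key, mirroring an initialized settings.yaml
        "default_chat_model": {
            "type": "openai_chat",  # assumed enum value
            "model": "gpt-4o-mini",  # any model supported by your provider
            "api_key": os.environ.get("GRAPHRAG_API_KEY", ""),
        },
    },
}

# root_dir is resolved to an absolute path and injected into the values (see the
# function body above); invalid values surface here as a pydantic ValidationError.
config = create_graphrag_config(values=example_values, root_dir="./ragtest")
# ----------------------------------------------------------------------------------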
2 | # Licensed under the MIT License 3 | 4 | """Interfaces for Default Config parameterization.""" 5 | -------------------------------------------------------------------------------- /graphrag/config/models/basic_search_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from graphrag.config.defaults import graphrag_config_defaults 9 | 10 | 11 | class BasicSearchConfig(BaseModel): 12 | """The default configuration section for Basic Search.""" 13 | 14 | prompt: str | None = Field( 15 | description="The basic search prompt to use.", 16 | default=graphrag_config_defaults.basic_search.prompt, 17 | ) 18 | chat_model_id: str = Field( 19 | description="The model ID to use for basic search.", 20 | default=graphrag_config_defaults.basic_search.chat_model_id, 21 | ) 22 | embedding_model_id: str = Field( 23 | description="The model ID to use for text embeddings.", 24 | default=graphrag_config_defaults.basic_search.embedding_model_id, 25 | ) 26 | k: int = Field( 27 | description="The number of text units to include in search context.", 28 | default=graphrag_config_defaults.basic_search.k, 29 | ) 30 | max_context_tokens: int = Field( 31 | description="The maximum number of context tokens.", 32 | default=graphrag_config_defaults.basic_search.max_context_tokens, 33 | ) 34 | -------------------------------------------------------------------------------- /graphrag/config/models/cache_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from graphrag.config.defaults import graphrag_config_defaults 9 | from graphrag.config.enums import CacheType 10 | 11 | 12 | class CacheConfig(BaseModel): 13 | """The default configuration section for Cache.""" 14 | 15 | type: CacheType = Field( 16 | description="The cache type to use.", 17 | default=graphrag_config_defaults.cache.type, 18 | ) 19 | base_dir: str = Field( 20 | description="The base directory for the cache.", 21 | default=graphrag_config_defaults.cache.base_dir, 22 | ) 23 | connection_string: str | None = Field( 24 | description="The cache connection string to use.", 25 | default=graphrag_config_defaults.cache.connection_string, 26 | ) 27 | container_name: str | None = Field( 28 | description="The cache container name to use.", 29 | default=graphrag_config_defaults.cache.container_name, 30 | ) 31 | storage_account_blob_url: str | None = Field( 32 | description="The storage account blob url to use.", 33 | default=graphrag_config_defaults.cache.storage_account_blob_url, 34 | ) 35 | cosmosdb_account_url: str | None = Field( 36 | description="The cosmosdb account url to use.", 37 | default=graphrag_config_defaults.cache.cosmosdb_account_url, 38 | ) 39 | -------------------------------------------------------------------------------- /graphrag/config/models/cluster_graph_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from graphrag.config.defaults import graphrag_config_defaults 9 | 10 | 11 | class ClusterGraphConfig(BaseModel): 12 | """Configuration section for clustering graphs.""" 13 | 14 | max_cluster_size: int = Field( 15 | description="The maximum cluster size to use.", 16 | default=graphrag_config_defaults.cluster_graph.max_cluster_size, 17 | ) 18 | use_lcc: bool = Field( 19 | description="Whether to use the largest connected component.", 20 | default=graphrag_config_defaults.cluster_graph.use_lcc, 21 | ) 22 | seed: int = Field( 23 | description="The seed to use for the clustering.", 24 | default=graphrag_config_defaults.cluster_graph.seed, 25 | ) 26 | -------------------------------------------------------------------------------- /graphrag/config/models/output_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from graphrag.config.defaults import graphrag_config_defaults 9 | from graphrag.config.enums import OutputType 10 | 11 | 12 | class OutputConfig(BaseModel): 13 | """The default configuration section for Output.""" 14 | 15 | type: OutputType = Field( 16 | description="The output type to use.", 17 | default=graphrag_config_defaults.output.type, 18 | ) 19 | base_dir: str = Field( 20 | description="The base directory for the output.", 21 | default=graphrag_config_defaults.output.base_dir, 22 | ) 23 | connection_string: str | None = Field( 24 | description="The storage connection string to use.", 25 | default=graphrag_config_defaults.output.connection_string, 26 | ) 27 | container_name: str | None = Field( 28 | description="The storage container name to use.", 29 | default=graphrag_config_defaults.output.container_name, 30 | ) 31 | storage_account_blob_url: str | None = Field( 32 | description="The storage account blob url to use.", 33 | default=graphrag_config_defaults.output.storage_account_blob_url, 34 | ) 35 | cosmosdb_account_url: str | None = Field( 36 | description="The cosmosdb account url to use.", 37 | default=graphrag_config_defaults.output.cosmosdb_account_url, 38 | ) 39 | -------------------------------------------------------------------------------- /graphrag/config/models/reporting_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from graphrag.config.defaults import graphrag_config_defaults 9 | from graphrag.config.enums import ReportingType 10 | 11 | 12 | class ReportingConfig(BaseModel): 13 | """The default configuration section for Reporting.""" 14 | 15 | type: ReportingType = Field( 16 | description="The reporting type to use.", 17 | default=graphrag_config_defaults.reporting.type, 18 | ) 19 | base_dir: str = Field( 20 | description="The base directory for reporting.", 21 | default=graphrag_config_defaults.reporting.base_dir, 22 | ) 23 | connection_string: str | None = Field( 24 | description="The reporting connection string to use.", 25 | default=graphrag_config_defaults.reporting.connection_string, 26 | ) 27 | container_name: str | None = Field( 28 | description="The reporting container name to use.", 29 | default=graphrag_config_defaults.reporting.container_name, 30 | ) 31 | storage_account_blob_url: str | None = Field( 32 | description="The storage account blob url to use.", 33 | default=graphrag_config_defaults.reporting.storage_account_blob_url, 34 | ) 35 | -------------------------------------------------------------------------------- /graphrag/config/models/snapshots_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from graphrag.config.defaults import graphrag_config_defaults 9 | 10 | 11 | class SnapshotsConfig(BaseModel): 12 | """Configuration section for snapshots.""" 13 | 14 | embeddings: bool = Field( 15 | description="A flag indicating whether to take snapshots of embeddings.", 16 | default=graphrag_config_defaults.snapshots.embeddings, 17 | ) 18 | graphml: bool = Field( 19 | description="A flag indicating whether to take snapshots of GraphML.", 20 | default=graphrag_config_defaults.snapshots.graphml, 21 | ) 22 | raw_graph: bool = Field( 23 | description="A flag indicating whether to take snapshots of the raw extracted graph (entities and relationships) before merging.", 24 | default=graphrag_config_defaults.snapshots.raw_graph, 25 | ) 26 | -------------------------------------------------------------------------------- /graphrag/config/models/umap_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Parameterization settings for the default configuration.""" 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from graphrag.config.defaults import graphrag_config_defaults 9 | 10 | 11 | class UmapConfig(BaseModel): 12 | """Configuration section for UMAP.""" 13 | 14 | enabled: bool = Field( 15 | description="A flag indicating whether to enable UMAP.", 16 | default=graphrag_config_defaults.umap.enabled, 17 | ) 18 | -------------------------------------------------------------------------------- /graphrag/config/read_dotenv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing the read_dotenv utility.""" 5 | 6 | import logging 7 | import os 8 | from pathlib import Path 9 | 10 | from dotenv import dotenv_values 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | def read_dotenv(root: str) -> None: 16 | """Read a .env file in the given root path.""" 17 | env_path = Path(root) / ".env" 18 | if env_path.exists(): 19 | log.info("Loading pipeline .env file") 20 | env_config = dotenv_values(f"{env_path}") 21 | for key, value in env_config.items(): 22 | if key not in os.environ: 23 | os.environ[key] = value or "" 24 | else: 25 | log.info("No .env file found at %s", root) 26 | -------------------------------------------------------------------------------- /graphrag/data_model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Knowledge model package.""" 5 | -------------------------------------------------------------------------------- /graphrag/data_model/identified.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A package containing the 'Identified' protocol.""" 5 | 6 | from dataclasses import dataclass 7 | 8 | 9 | @dataclass 10 | class Identified: 11 | """A protocol for an item with an ID.""" 12 | 13 | id: str 14 | """The ID of the item.""" 15 | 16 | short_id: str | None 17 | """Human readable ID used to refer to this community in prompts or texts displayed to users, such as in a report text (optional).""" 18 | -------------------------------------------------------------------------------- /graphrag/data_model/named.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A package containing the 'Named' protocol.""" 5 | 6 | from dataclasses import dataclass 7 | 8 | from graphrag.data_model.identified import Identified 9 | 10 | 11 | @dataclass 12 | class Named(Identified): 13 | """A protocol for an item with a name/title.""" 14 | 15 | title: str 16 | """The name/title of the item.""" 17 | -------------------------------------------------------------------------------- /graphrag/data_model/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Common types for the GraphRAG knowledge model.""" 5 | 6 | from collections.abc import Callable 7 | 8 | TextEmbedder = Callable[[str], list[float]] 9 | -------------------------------------------------------------------------------- /graphrag/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The indexing engine package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/input/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine input package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/input/text.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing load method definition.""" 5 | 6 | import logging 7 | from pathlib import Path 8 | 9 | import pandas as pd 10 | 11 | from graphrag.config.models.input_config import InputConfig 12 | from graphrag.index.input.util import load_files 13 | from graphrag.index.utils.hashing import gen_sha512_hash 14 | from graphrag.logger.base import ProgressLogger 15 | from graphrag.storage.pipeline_storage import PipelineStorage 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | 20 | async def load_text( 21 | config: InputConfig, 22 | progress: ProgressLogger | None, 23 | storage: PipelineStorage, 24 | ) -> pd.DataFrame: 25 | """Load text inputs from a directory.""" 26 | 27 | async def load_file(path: str, group: dict | None = None) -> pd.DataFrame: 28 | if group is None: 29 | group = {} 30 | text = await storage.get(path, encoding=config.encoding) 31 | new_item = {**group, "text": text} 32 | new_item["id"] = gen_sha512_hash(new_item, new_item.keys()) 33 | new_item["title"] = str(Path(path).name) 34 | new_item["creation_date"] = await storage.get_creation_date(path) 35 | return pd.DataFrame([new_item]) 36 | 37 | return await load_files(load_file, config, storage, progress) 38 | -------------------------------------------------------------------------------- /graphrag/index/operations/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Reusable data frame operations.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/build_noun_graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine noun graph package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/build_noun_graph/np_extractors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """NLP-based graph extractors.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/build_noun_graph/np_extractors/np_validator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Util functions to tag noun phrases for filtering.""" 5 | 6 | 7 | def is_compound(tokens: list[str]) -> bool: 8 | """List of tokens forms a compound noun phrase.""" 9 | return any( 10 | "-" in token and len(token.strip()) > 1 and len(token.strip().split("-")) > 1 11 | for token in tokens 12 | ) 13 | 14 | 15 | def has_valid_token_length(tokens: list[str], max_length: int) -> bool: 16 | """Check if all tokens have valid length.""" 17 | return all(len(token) <= max_length for token in tokens) 18 | 19 | 20 | def is_valid_entity(entity: tuple[str, str], tokens: list[str]) -> bool: 21 | """Check if the entity is valid.""" 22 | return (entity[1] not in ["CARDINAL", "ORDINAL"] and len(tokens) > 0) or ( 23 | entity[1] in ["CARDINAL", "ORDINAL"] 24 | and (len(tokens) > 1 or is_compound(tokens)) 25 | ) 26 | -------------------------------------------------------------------------------- /graphrag/index/operations/build_noun_graph/np_extractors/resource_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Util functions needed for nltk-based noun-phrase extractors (i.e. TextBlob).""" 5 | 6 | import nltk 7 | 8 | 9 | def download_if_not_exists(resource_name) -> bool: 10 | """Download nltk resources if they haven't been already.""" 11 | # look under all possible categories 12 | root_categories = [ 13 | "corpora", 14 | "tokenizers", 15 | "taggers", 16 | "chunkers", 17 | "classifiers", 18 | "stemmers", 19 | "stopwords", 20 | "languages", 21 | "frequent", 22 | "gate", 23 | "models", 24 | "mt", 25 | "sentiment", 26 | "similarity", 27 | ] 28 | for category in root_categories: 29 | try: 30 | # if found, stop looking and avoid downloading 31 | nltk.find(f"{category}/{resource_name}") 32 | return True # noqa: TRY300 33 | except LookupError: 34 | continue 35 | 36 | # is not found, download 37 | nltk.download(resource_name) 38 | return False 39 | -------------------------------------------------------------------------------- /graphrag/index/operations/build_noun_graph/np_extractors/stop_words.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Custom list of stop words to be excluded by noun phrase extractors.""" 5 | 6 | EN_STOP_WORDS = [ 7 | "stuff", 8 | "thing", 9 | "things", 10 | "bunch", 11 | "bit", 12 | "bits", 13 | "people", 14 | "person", 15 | "okay", 16 | "hey", 17 | "hi", 18 | "hello", 19 | "laughter", 20 | "oh", 21 | ] 22 | -------------------------------------------------------------------------------- /graphrag/index/operations/chunk_text/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text chunk package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/chunk_text/bootstrap.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
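# --- Editor's sketch (not part of the repository) --------------------------------
# Why: the validator helpers above (is_compound, has_valid_token_length,
# is_valid_entity) encode filtering rules whose intent is easier to see with
# concrete inputs. The assertions below follow directly from the code shown
# above; the example tokens and entity labels are made up for illustration.
from graphrag.index.operations.build_noun_graph.np_extractors.np_validator import (
    has_valid_token_length,
    is_compound,
    is_valid_entity,
)

assert is_compound(["state-of-the-art"]) is True  # hyphenated token -> compound
assert is_compound(["operation", "dulce"]) is False  # no hyphenated tokens
assert has_valid_token_length(["alpha", "beta"], max_length=20) is True

# A bare number tagged CARDINAL is rejected unless it is multi-token or compound.
assert is_valid_entity(("42", "CARDINAL"), ["42"]) is False
assert is_valid_entity(("Dulce Base", "FAC"), ["Dulce", "Base"]) is True
# ----------------------------------------------------------------------------------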
2 | # Licensed under the MIT License 3 | 4 | """Bootstrap definition.""" 5 | 6 | import warnings 7 | 8 | # Ignore warnings from numba 9 | warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*") 10 | warnings.filterwarnings("ignore", message=".*Use no seed for parallelism.*") 11 | 12 | initialized_nltk = False 13 | 14 | 15 | def bootstrap(): 16 | """Download and initialize the required nltk resources, once per process.""" 17 | global initialized_nltk 18 | if not initialized_nltk: 19 | import nltk 20 | from nltk.corpus import wordnet as wn 21 | 22 | nltk.download("punkt") 23 | nltk.download("punkt_tab") 24 | nltk.download("averaged_perceptron_tagger") 25 | nltk.download("averaged_perceptron_tagger_eng") 26 | nltk.download("maxent_ne_chunker") 27 | nltk.download("maxent_ne_chunker_tab") 28 | nltk.download("words") 29 | nltk.download("wordnet") 30 | wn.ensure_loaded() 31 | initialized_nltk = True 32 | -------------------------------------------------------------------------------- /graphrag/index/operations/chunk_text/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing 'TextChunk' model.""" 5 | 6 | from collections.abc import Callable, Iterable 7 | from dataclasses import dataclass 8 | 9 | from graphrag.config.models.chunking_config import ChunkingConfig 10 | from graphrag.logger.progress import ProgressTicker 11 | 12 | 13 | @dataclass 14 | class TextChunk: 15 | """Text chunk class definition.""" 16 | 17 | text_chunk: str 18 | source_doc_indices: list[int] 19 | n_tokens: int | None = None 20 | 21 | 22 | ChunkInput = str | list[str] | list[tuple[str, str]] 23 | """Input to a chunking strategy. Can be a string, a list of strings, or a list of tuples of (id, text).""" 24 | 25 | ChunkStrategy = Callable[ 26 | [list[str], ChunkingConfig, ProgressTicker], Iterable[TextChunk] 27 | ] 28 | -------------------------------------------------------------------------------- /graphrag/index/operations/compute_degree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing compute_degree definition.""" 5 | 6 | import networkx as nx 7 | import pandas as pd 8 | 9 | 10 | def compute_degree(graph: nx.Graph) -> pd.DataFrame: 11 | """Create a new DataFrame with the degree of each node in the graph.""" 12 | return pd.DataFrame([ 13 | {"title": node, "degree": int(degree)} 14 | for node, degree in graph.degree # type: ignore 15 | ]) 16 | -------------------------------------------------------------------------------- /graphrag/index/operations/compute_edge_combined_degree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
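A minimal usage sketch for compute_degree on a toy graph (assumes graphrag, networkx, and pandas are installed; the graph below is hypothetical):

import networkx as nx

from graphrag.index.operations.compute_degree import compute_degree

graph = nx.Graph()
graph.add_edges_from([("a", "b"), ("a", "c"), ("b", "c"), ("c", "d")])

degree_df = compute_degree(graph)
print(degree_df)
# One row per node with columns "title" and "degree",
# e.g. c has degree 3 and d has degree 1.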
2 | # Licensed under the MIT License 3 | 4 | """A module containing compute_edge_combined_degree methods definition.""" 5 | 6 | from typing import cast 7 | 8 | import pandas as pd 9 | 10 | 11 | def compute_edge_combined_degree( 12 | edge_df: pd.DataFrame, 13 | node_degree_df: pd.DataFrame, 14 | node_name_column: str, 15 | node_degree_column: str, 16 | edge_source_column: str, 17 | edge_target_column: str, 18 | ) -> pd.Series: 19 | """Compute the combined degree for each edge in a graph.""" 20 | 21 | def join_to_degree(df: pd.DataFrame, column: str) -> pd.DataFrame: 22 | degree_column = _degree_colname(column) 23 | result = df.merge( 24 | node_degree_df.rename( 25 | columns={node_name_column: column, node_degree_column: degree_column} 26 | ), 27 | on=column, 28 | how="left", 29 | ) 30 | result[degree_column] = result[degree_column].fillna(0) 31 | return result 32 | 33 | output_df = join_to_degree(edge_df, edge_source_column) 34 | output_df = join_to_degree(output_df, edge_target_column) 35 | output_df["combined_degree"] = ( 36 | output_df[_degree_colname(edge_source_column)] 37 | + output_df[_degree_colname(edge_target_column)] 38 | ) 39 | return cast("pd.Series", output_df["combined_degree"]) 40 | 41 | 42 | def _degree_colname(column: str) -> str: 43 | return f"{column}_degree" 44 | -------------------------------------------------------------------------------- /graphrag/index/operations/create_graph.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing create_graph definition.""" 5 | 6 | import networkx as nx 7 | import pandas as pd 8 | 9 | 10 | def create_graph( 11 | edges: pd.DataFrame, 12 | edge_attr: list[str | int] | None = None, 13 | nodes: pd.DataFrame | None = None, 14 | node_id: str = "title", 15 | ) -> nx.Graph: 16 | """Create a networkx graph from nodes and edges dataframes.""" 17 | graph = nx.from_pandas_edgelist(edges, edge_attr=edge_attr) 18 | 19 | if nodes is not None: 20 | nodes.set_index(node_id, inplace=True) 21 | graph.add_nodes_from((n, dict(d)) for n, d in nodes.iterrows()) 22 | 23 | return graph 24 | -------------------------------------------------------------------------------- /graphrag/index/operations/embed_graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph embed package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/embed_graph/embed_node2vec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
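A small end-to-end sketch combining create_graph, compute_degree, and compute_edge_combined_degree (assumes graphrag and pandas are installed; the edge list is made up):

import pandas as pd

from graphrag.index.operations.compute_degree import compute_degree
from graphrag.index.operations.compute_edge_combined_degree import (
    compute_edge_combined_degree,
)
from graphrag.index.operations.create_graph import create_graph

edges = pd.DataFrame({
    "source": ["a", "a", "b"],
    "target": ["b", "c", "c"],
    "weight": [1.0, 2.0, 1.0],
})

graph = create_graph(edges, edge_attr=["weight"])
node_degrees = compute_degree(graph)

# Each edge's combined degree is the sum of its endpoints' node degrees.
edges["combined_degree"] = compute_edge_combined_degree(
    edges,
    node_degrees,
    node_name_column="title",
    node_degree_column="degree",
    edge_source_column="source",
    edge_target_column="target",
)
print(edges)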
2 | # Licensed under the MIT License 3 | 4 | """Utilities to generate graph embeddings.""" 5 | 6 | from dataclasses import dataclass 7 | 8 | import networkx as nx 9 | import numpy as np 10 | 11 | 12 | @dataclass 13 | class NodeEmbeddings: 14 | """Node embeddings class definition.""" 15 | 16 | nodes: list[str] 17 | embeddings: np.ndarray 18 | 19 | 20 | def embed_node2vec( 21 | graph: nx.Graph | nx.DiGraph, 22 | dimensions: int = 1536, 23 | num_walks: int = 10, 24 | walk_length: int = 40, 25 | window_size: int = 2, 26 | iterations: int = 3, 27 | random_seed: int = 86, 28 | ) -> NodeEmbeddings: 29 | """Generate node embeddings using Node2Vec.""" 30 | # NOTE: This import is done here to reduce the initial import time of the graphrag package 31 | import graspologic as gc 32 | 33 | # generate embedding 34 | lcc_tensors = gc.embed.node2vec_embed( # type: ignore 35 | graph=graph, 36 | dimensions=dimensions, 37 | window_size=window_size, 38 | iterations=iterations, 39 | num_walks=num_walks, 40 | walk_length=walk_length, 41 | random_seed=random_seed, 42 | ) 43 | return NodeEmbeddings(embeddings=lcc_tensors[0], nodes=lcc_tensors[1]) 44 | -------------------------------------------------------------------------------- /graphrag/index/operations/embed_graph/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing different lists and dictionaries.""" 5 | 6 | # Use this for now instead of a wrapper 7 | from typing import Any 8 | 9 | NodeList = list[str] 10 | EmbeddingList = list[Any] 11 | NodeEmbeddings = dict[str, list[float]] 12 | """Label -> Embedding""" 13 | -------------------------------------------------------------------------------- /graphrag/index/operations/embed_text/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text embed package root.""" 5 | 6 | from graphrag.index.operations.embed_text.embed_text import ( 7 | TextEmbedStrategyType, 8 | embed_text, 9 | ) 10 | 11 | __all__ = ["TextEmbedStrategyType", "embed_text"] 12 | -------------------------------------------------------------------------------- /graphrag/index/operations/embed_text/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine embed strategies package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/embed_text/strategies/mock.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing run and _embed_text methods definitions.""" 5 | 6 | import random 7 | from collections.abc import Iterable 8 | from typing import Any 9 | 10 | from graphrag.cache.pipeline_cache import PipelineCache 11 | from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks 12 | from graphrag.index.operations.embed_text.strategies.typing import TextEmbeddingResult 13 | from graphrag.logger.progress import ProgressTicker, progress_ticker 14 | 15 | 16 | async def run( # noqa RUF029 async is required for interface 17 | input: list[str], 18 | callbacks: WorkflowCallbacks, 19 | cache: PipelineCache, 20 | _args: dict[str, Any], 21 | ) -> TextEmbeddingResult: 22 | """Run the mock text embedding strategy.""" 23 | input = input if isinstance(input, Iterable) else [input] 24 | ticker = progress_ticker(callbacks.progress, len(input)) 25 | return TextEmbeddingResult( 26 | embeddings=[_embed_text(cache, text, ticker) for text in input] 27 | ) 28 | 29 | 30 | def _embed_text(_cache: PipelineCache, _text: str, tick: ProgressTicker) -> list[float]: 31 | """Embed a single piece of text.""" 32 | tick(1) 33 | return [random.random(), random.random(), random.random()] # noqa S311 34 | -------------------------------------------------------------------------------- /graphrag/index/operations/embed_text/strategies/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing 'TextEmbeddingResult' model.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | 9 | from graphrag.cache.pipeline_cache import PipelineCache 10 | from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks 11 | 12 | 13 | @dataclass 14 | class TextEmbeddingResult: 15 | """Text embedding result class definition.""" 16 | 17 | embeddings: list[list[float] | None] | None 18 | 19 | 20 | TextEmbeddingStrategy = Callable[ 21 | [ 22 | list[str], 23 | WorkflowCallbacks, 24 | PipelineCache, 25 | dict, 26 | ], 27 | Awaitable[TextEmbeddingResult], 28 | ] 29 | -------------------------------------------------------------------------------- /graphrag/index/operations/extract_covariates/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine text extract claims package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/extract_covariates/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'Covariate' and 'CovariateExtractionResult' models.""" 5 | 6 | from collections.abc import Awaitable, Callable, Iterable 7 | from dataclasses import dataclass 8 | from typing import Any 9 | 10 | from graphrag.cache.pipeline_cache import PipelineCache 11 | from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks 12 | 13 | 14 | @dataclass 15 | class Covariate: 16 | """Covariate class definition.""" 17 | 18 | covariate_type: str | None = None 19 | subject_id: str | None = None 20 | object_id: str | None = None 21 | type: str | None = None 22 | status: str | None = None 23 | start_date: str | None = None 24 | end_date: str | None = None 25 | description: str | None = None 26 | source_text: list[str] | None = None 27 | doc_id: str | None = None 28 | record_id: int | None = None 29 | id: str | None = None 30 | 31 | 32 | @dataclass 33 | class CovariateExtractionResult: 34 | """Covariate extraction result class definition.""" 35 | 36 | covariate_data: list[Covariate] 37 | 38 | 39 | CovariateExtractStrategy = Callable[ 40 | [ 41 | Iterable[str], 42 | list[str], 43 | dict[str, str], 44 | WorkflowCallbacks, 45 | PipelineCache, 46 | dict[str, Any], 47 | ], 48 | Awaitable[CovariateExtractionResult], 49 | ] 50 | -------------------------------------------------------------------------------- /graphrag/index/operations/extract_graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine entities extraction package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/extract_graph/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing 'Document' and 'EntityExtractionResult' models.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | from enum import Enum 9 | from typing import Any 10 | 11 | import networkx as nx 12 | 13 | from graphrag.cache.pipeline_cache import PipelineCache 14 | from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks 15 | 16 | ExtractedEntity = dict[str, Any] 17 | ExtractedRelationship = dict[str, Any] 18 | StrategyConfig = dict[str, Any] 19 | EntityTypes = list[str] 20 | 21 | 22 | @dataclass 23 | class Document: 24 | """Document class definition.""" 25 | 26 | text: str 27 | id: str 28 | 29 | 30 | @dataclass 31 | class EntityExtractionResult: 32 | """Entity extraction result class definition.""" 33 | 34 | entities: list[ExtractedEntity] 35 | relationships: list[ExtractedRelationship] 36 | graph: nx.Graph | None 37 | 38 | 39 | EntityExtractStrategy = Callable[ 40 | [ 41 | list[Document], 42 | EntityTypes, 43 | WorkflowCallbacks, 44 | PipelineCache, 45 | StrategyConfig, 46 | ], 47 | Awaitable[EntityExtractionResult], 48 | ] 49 | 50 | 51 | class ExtractEntityStrategyType(str, Enum): 52 | """ExtractEntityStrategyType class definition.""" 53 | 54 | graph_intelligence = "graph_intelligence" 55 | nltk = "nltk" 56 | 57 | def __repr__(self): 58 | """Get a string representation.""" 59 | return f'"{self.value}"' 60 | -------------------------------------------------------------------------------- /graphrag/index/operations/finalize_community_reports.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """All the steps to transform final community reports.""" 5 | 6 | from uuid import uuid4 7 | 8 | import pandas as pd 9 | 10 | from graphrag.data_model.schemas import COMMUNITY_REPORTS_FINAL_COLUMNS 11 | 12 | 13 | def finalize_community_reports( 14 | reports: pd.DataFrame, 15 | communities: pd.DataFrame, 16 | ) -> pd.DataFrame: 17 | """All the steps to transform final community reports.""" 18 | # Merge with communities to add shared fields 19 | community_reports = reports.merge( 20 | communities.loc[:, ["community", "parent", "children", "size", "period"]], 21 | on="community", 22 | how="left", 23 | copy=False, 24 | ) 25 | 26 | community_reports["community"] = community_reports["community"].astype(int) 27 | community_reports["human_readable_id"] = community_reports["community"] 28 | community_reports["id"] = [uuid4().hex for _ in range(len(community_reports))] 29 | 30 | return community_reports.loc[ 31 | :, 32 | COMMUNITY_REPORTS_FINAL_COLUMNS, 33 | ] 34 | -------------------------------------------------------------------------------- /graphrag/index/operations/graph_to_dataframes.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """A module containing graph_to_dataframes definition.""" 5 | 6 | import networkx as nx 7 | import pandas as pd 8 | 9 | 10 | def graph_to_dataframes( 11 | graph: nx.Graph, 12 | node_columns: list[str] | None = None, 13 | edge_columns: list[str] | None = None, 14 | node_id: str = "title", 15 | ) -> tuple[pd.DataFrame, pd.DataFrame]: 16 | """Deconstructs an nx.Graph into nodes and edges dataframes.""" 17 | # nx graph nodes are a tuple, and creating a df from them results in the id being the index 18 | nodes = pd.DataFrame.from_dict(dict(graph.nodes(data=True)), orient="index") 19 | nodes[node_id] = nodes.index 20 | nodes.reset_index(inplace=True, drop=True) 21 | 22 | edges = nx.to_pandas_edgelist(graph) 23 | 24 | # we don't deal in directed graphs, but we do need to ensure consistent ordering for df joins 25 | # nx loses the initial ordering 26 | edges["min_source"] = edges[["source", "target"]].min(axis=1) 27 | edges["max_target"] = edges[["source", "target"]].max(axis=1) 28 | edges = edges.drop(columns=["source", "target"]).rename( 29 | columns={"min_source": "source", "max_target": "target"} # type: ignore 30 | ) 31 | 32 | if node_columns: 33 | nodes = nodes.loc[:, node_columns] 34 | 35 | if edge_columns: 36 | edges = edges.loc[:, edge_columns] 37 | 38 | return (nodes, edges) 39 | -------------------------------------------------------------------------------- /graphrag/index/operations/layout_graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine graph layout package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/layout_graph/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | # Use this for now instead of a wrapper 5 | """A module containing 'NodePosition' model.""" 6 | 7 | from dataclasses import dataclass 8 | 9 | 10 | @dataclass 11 | class NodePosition: 12 | """Node position class definition.""" 13 | 14 | label: str 15 | cluster: str 16 | size: float 17 | 18 | x: float 19 | y: float 20 | z: float | None = None 21 | 22 | def to_pandas(self) -> tuple[str, float, float, str, float]: 23 | """To pandas method definition.""" 24 | return self.label, self.x, self.y, self.cluster, self.size 25 | 26 | 27 | GraphLayout = list[NodePosition] 28 | -------------------------------------------------------------------------------- /graphrag/index/operations/snapshot_graphml.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
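A short sketch of the graph-to-dataframes round trip (assumes graphrag and networkx are installed; the node attributes are hypothetical):

import networkx as nx

from graphrag.index.operations.graph_to_dataframes import graph_to_dataframes

graph = nx.Graph()
graph.add_node("a", type="PERSON")
graph.add_node("b", type="ORG")
graph.add_edge("b", "a", weight=1.0)

nodes, edges = graph_to_dataframes(graph)
print(nodes)  # columns: type, title
print(edges)  # columns: weight, source, target; the min/max normalization guarantees source <= target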
2 | # Licensed under the MIT License 3 | 4 | """A module containing snapshot_graphml method definition.""" 5 | 6 | import networkx as nx 7 | 8 | from graphrag.storage.pipeline_storage import PipelineStorage 9 | 10 | 11 | async def snapshot_graphml( 12 | input: str | nx.Graph, 13 | name: str, 14 | storage: PipelineStorage, 15 | ) -> None: 16 | """Take an entire snapshot of a graph to standard graphml format.""" 17 | graphml = input if isinstance(input, str) else "\n".join(nx.generate_graphml(input)) 18 | await storage.set(name + ".graphml", graphml) 19 | -------------------------------------------------------------------------------- /graphrag/index/operations/summarize_communities/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Community summarization modules.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/summarize_communities/explode_communities.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Explode a list of communities into nodes for filtering.""" 5 | 6 | import pandas as pd 7 | 8 | from graphrag.data_model.schemas import ( 9 | COMMUNITY_ID, 10 | ) 11 | 12 | 13 | def explode_communities( 14 | communities: pd.DataFrame, entities: pd.DataFrame 15 | ) -> pd.DataFrame: 16 | """Explode a list of communities into nodes for filtering.""" 17 | community_join = communities.explode("entity_ids").loc[ 18 | :, ["community", "level", "entity_ids"] 19 | ] 20 | nodes = entities.merge( 21 | community_join, left_on="id", right_on="entity_ids", how="left" 22 | ) 23 | return nodes.loc[nodes.loc[:, COMMUNITY_ID] != -1] 24 | -------------------------------------------------------------------------------- /graphrag/index/operations/summarize_communities/graph_context/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Package of context builders for graph-based reports.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/summarize_communities/text_unit_context/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Package of context builders for text unit-based reports.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/operations/summarize_communities/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """A module containing community report generation utilities.""" 5 | 6 | import pandas as pd 7 | 8 | import graphrag.data_model.schemas as schemas 9 | 10 | 11 | def get_levels( 12 | df: pd.DataFrame, level_column: str = schemas.COMMUNITY_LEVEL 13 | ) -> list[int]: 14 | """Get the levels of the communities.""" 15 | levels = df[level_column].dropna().unique() 16 | levels = [int(lvl) for lvl in levels if lvl != -1] 17 | return sorted(levels, reverse=True) 18 | -------------------------------------------------------------------------------- /graphrag/index/operations/summarize_descriptions/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Root package for description summarization.""" 5 | 6 | from graphrag.index.operations.summarize_descriptions.summarize_descriptions import ( 7 | summarize_descriptions, 8 | ) 9 | from graphrag.index.operations.summarize_descriptions.typing import ( 10 | SummarizationStrategy, 11 | SummarizeStrategyType, 12 | ) 13 | 14 | __all__ = [ 15 | "SummarizationStrategy", 16 | "SummarizeStrategyType", 17 | "summarize_descriptions", 18 | ] 19 | -------------------------------------------------------------------------------- /graphrag/index/operations/summarize_descriptions/typing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing 'SummarizedDescriptionResult' model.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | from enum import Enum 9 | from typing import Any, NamedTuple 10 | 11 | from graphrag.cache.pipeline_cache import PipelineCache 12 | from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks 13 | 14 | StrategyConfig = dict[str, Any] 15 | 16 | 17 | @dataclass 18 | class SummarizedDescriptionResult: 19 | """Entity summarization result class definition.""" 20 | 21 | id: str | tuple[str, str] 22 | description: str 23 | 24 | 25 | SummarizationStrategy = Callable[ 26 | [ 27 | str | tuple[str, str], 28 | list[str], 29 | WorkflowCallbacks, 30 | PipelineCache, 31 | StrategyConfig, 32 | ], 33 | Awaitable[SummarizedDescriptionResult], 34 | ] 35 | 36 | 37 | class DescriptionSummarizeRow(NamedTuple): 38 | """DescriptionSummarizeRow class definition.""" 39 | 40 | graph: Any 41 | 42 | 43 | class SummarizeStrategyType(str, Enum): 44 | """SummarizeStrategyType class definition.""" 45 | 46 | graph_intelligence = "graph_intelligence" 47 | 48 | def __repr__(self): 49 | """Get a string representation.""" 50 | return f'"{self.value}"' 51 | -------------------------------------------------------------------------------- /graphrag/index/run/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Run module for GraphRAG.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/text_splitting/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
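A minimal sketch of get_levels (assumes graphrag and pandas are installed; the level column name is passed explicitly here rather than relying on the schema constant):

import pandas as pd

from graphrag.index.operations.summarize_communities.utils import get_levels

communities = pd.DataFrame({
    "community": [0, 1, 2, 3, 4],
    "level": [0, 0, 1, 1, -1],
})

# Deepest level first; the -1 placeholder level is filtered out.
print(get_levels(communities, level_column="level"))  # [1, 0]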
2 | # Licensed under the MIT License 3 | 4 | """The Indexing Engine Text Splitting package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/text_splitting/check_token_limit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Token limit method definition.""" 5 | 6 | from graphrag.index.text_splitting.text_splitting import TokenTextSplitter 7 | 8 | 9 | def check_token_limit(text, max_token): 10 | """Check token limit.""" 11 | text_splitter = TokenTextSplitter(chunk_size=max_token, chunk_overlap=0) 12 | docs = text_splitter.split_text(text) 13 | if len(docs) > 1: 14 | return 0 15 | return 1 16 | -------------------------------------------------------------------------------- /graphrag/index/typing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Root typings for GraphRAG.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/typing/context.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | # isort: skip_file 5 | """A module containing the 'PipelineRunContext' models.""" 6 | 7 | from dataclasses import dataclass 8 | 9 | from graphrag.cache.pipeline_cache import PipelineCache 10 | from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks 11 | from graphrag.index.typing.state import PipelineState 12 | from graphrag.index.typing.stats import PipelineRunStats 13 | from graphrag.storage.pipeline_storage import PipelineStorage 14 | 15 | 16 | @dataclass 17 | class PipelineRunContext: 18 | """Provides the context for the current pipeline run.""" 19 | 20 | stats: PipelineRunStats 21 | storage: PipelineStorage 22 | "Long-term storage for pipeline verbs to use. Items written here will be written to the storage provider." 23 | cache: PipelineCache 24 | "Cache instance for reading previous LLM responses." 25 | callbacks: WorkflowCallbacks 26 | "Callbacks to be called during the pipeline run." 27 | state: PipelineState 28 | "Arbitrary property bag for runtime state, persistent pre-computes, or experimental features." 29 | -------------------------------------------------------------------------------- /graphrag/index/typing/error_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Shared error handler types.""" 5 | 6 | from collections.abc import Callable 7 | 8 | ErrorHandlerFn = Callable[[BaseException | None, str | None, dict | None], None] 9 | -------------------------------------------------------------------------------- /graphrag/index/typing/pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """A module containing the Pipeline class.""" 5 | 6 | from collections.abc import Generator 7 | 8 | from graphrag.index.typing.workflow import Workflow 9 | 10 | 11 | class Pipeline: 12 | """Encapsulates running workflows.""" 13 | 14 | def __init__(self, workflows: list[Workflow]): 15 | self.workflows = workflows 16 | 17 | def run(self) -> Generator[Workflow]: 18 | """Return a Generator over the pipeline workflows.""" 19 | yield from self.workflows 20 | 21 | def names(self) -> list[str]: 22 | """Return the names of the workflows in the pipeline.""" 23 | return [name for name, _ in self.workflows] 24 | -------------------------------------------------------------------------------- /graphrag/index/typing/pipeline_run_result.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing the PipelineRunResult class.""" 5 | 6 | from dataclasses import dataclass 7 | from typing import Any 8 | 9 | from graphrag.index.typing.state import PipelineState 10 | 11 | 12 | @dataclass 13 | class PipelineRunResult: 14 | """Pipeline run result class definition.""" 15 | 16 | workflow: str 17 | """The name of the workflow that was executed.""" 18 | result: Any | None 19 | """The result of the workflow function. This can be anything - we use it only for logging downstream, and expect each workflow function to write official outputs to the provided storage.""" 20 | state: PipelineState 21 | """Ongoing pipeline context state object.""" 22 | errors: list[BaseException] | None 23 | -------------------------------------------------------------------------------- /graphrag/index/typing/state.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Pipeline state types.""" 5 | 6 | from typing import Any 7 | 8 | PipelineState = dict[Any, Any] 9 | -------------------------------------------------------------------------------- /graphrag/index/typing/stats.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Pipeline stats types.""" 5 | 6 | from dataclasses import dataclass, field 7 | 8 | 9 | @dataclass 10 | class PipelineRunStats: 11 | """Pipeline running stats.""" 12 | 13 | total_runtime: float = field(default=0) 14 | """Float representing the total runtime.""" 15 | 16 | num_documents: int = field(default=0) 17 | """Number of documents.""" 18 | 19 | input_load_time: float = field(default=0) 20 | """Float representing the input load time.""" 21 | 22 | workflows: dict[str, dict[str, float]] = field(default_factory=dict) 23 | """A dictionary of workflows.""" 24 | -------------------------------------------------------------------------------- /graphrag/index/typing/workflow.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
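A toy sketch of the (name, function) workflow tuples that Pipeline wraps (assumes the graphrag package is installed; the no-op workflow and the None stand-ins for config and context are purely illustrative):

import asyncio

from graphrag.index.typing.pipeline import Pipeline
from graphrag.index.typing.workflow import WorkflowFunctionOutput


async def noop_workflow(_config, _context) -> WorkflowFunctionOutput:
    """Do nothing; illustrates the (name, function) tuple shape only."""
    return WorkflowFunctionOutput(result=None)


pipeline = Pipeline([("noop", noop_workflow)])
print(pipeline.names())  # ['noop']

for name, workflow in pipeline.run():
    # None stands in for the GraphRagConfig and PipelineRunContext a real runner would pass.
    print(name, asyncio.run(workflow(None, None)))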
2 | # Licensed under the MIT License 3 | 4 | """Pipeline workflow types.""" 5 | 6 | from collections.abc import Awaitable, Callable 7 | from dataclasses import dataclass 8 | from typing import Any 9 | 10 | from graphrag.config.models.graph_rag_config import GraphRagConfig 11 | from graphrag.index.typing.context import PipelineRunContext 12 | 13 | 14 | @dataclass 15 | class WorkflowFunctionOutput: 16 | """Data container for Workflow function results.""" 17 | 18 | result: Any | None 19 | """The result of the workflow function. This can be anything - we use it only for logging downstream, and expect each workflow function to write official outputs to the provided storage.""" 20 | 21 | 22 | WorkflowFunction = Callable[ 23 | [GraphRagConfig, PipelineRunContext], 24 | Awaitable[WorkflowFunctionOutput], 25 | ] 26 | Workflow = tuple[str, WorkflowFunction] 27 | -------------------------------------------------------------------------------- /graphrag/index/update/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Incremental Indexing main module definition.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Utils methods definition.""" 5 | -------------------------------------------------------------------------------- /graphrag/index/utils/dicts.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A utility module containing methods for inspecting and verifying dictionary types.""" 5 | 6 | 7 | def dict_has_keys_with_types( 8 | data: dict, expected_fields: list[tuple[str, type]], inplace: bool = False 9 | ) -> bool: 10 | """Return True if the given dictionary has the given keys with the given types.""" 11 | for field, field_type in expected_fields: 12 | if field not in data: 13 | return False 14 | 15 | value = data[field] 16 | try: 17 | cast_value = field_type(value) 18 | if inplace: 19 | data[field] = cast_value 20 | except (TypeError, ValueError): 21 | return False 22 | return True 23 | -------------------------------------------------------------------------------- /graphrag/index/utils/hashing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Hashing utilities.""" 5 | 6 | from collections.abc import Iterable 7 | from hashlib import sha512 8 | from typing import Any 9 | 10 | 11 | def gen_sha512_hash(item: dict[str, Any], hashcode: Iterable[str]): 12 | """Generate a SHA512 hash.""" 13 | hashed = "".join([str(item[column]) for column in hashcode]) 14 | return f"{sha512(hashed.encode('utf-8'), usedforsecurity=False).hexdigest()}" 15 | -------------------------------------------------------------------------------- /graphrag/index/utils/is_null.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
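A small sketch of the dictionary and hashing utilities above (assumes the graphrag package is installed; the record values are made up):

from graphrag.index.utils.dicts import dict_has_keys_with_types
from graphrag.index.utils.hashing import gen_sha512_hash

record = {"title": "Operation Dulce", "degree": "3"}

# "3" is castable to int, so the check passes; inplace=True also rewrites the value.
print(dict_has_keys_with_types(record, [("title", str), ("degree", int)], inplace=True))  # True
print(record["degree"])  # 3 (now an int)

# Hash only the named columns; the same input always yields the same id.
print(gen_sha512_hash(record, ["title"]))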
2 | # Licensed under the MIT License 3 | 4 | """Defines the is_null utility.""" 5 | 6 | import math 7 | from typing import Any 8 | 9 | 10 | def is_null(value: Any) -> bool: 11 | """Check if value is null or is nan.""" 12 | 13 | def is_none() -> bool: 14 | return value is None 15 | 16 | def is_nan() -> bool: 17 | return isinstance(value, float) and math.isnan(value) 18 | 19 | return is_none() or is_nan() 20 | -------------------------------------------------------------------------------- /graphrag/index/utils/rate_limiter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Rate limiter utility.""" 5 | 6 | import asyncio 7 | import time 8 | 9 | 10 | class RateLimiter: 11 | """ 12 | The original TpmRpmLLMLimiter strategy did not account for minute-based rate limiting when scheduled. 13 | 14 | The RateLimiter was introduced to ensure that the CommunityReportsExtractor could be scheduled to adhere to rate configurations on a per-minute basis. 15 | """ 16 | 17 | # TODO: RateLimiter scheduled: using asyncio for async_mode 18 | 19 | def __init__(self, rate: int, per: int): 20 | self.rate = rate 21 | self.per = per 22 | self.allowance = rate 23 | self.last_check = time.monotonic() 24 | 25 | async def acquire(self): 26 | """Acquire a token from the rate limiter.""" 27 | current = time.monotonic() 28 | elapsed = current - self.last_check 29 | self.last_check = current 30 | self.allowance += elapsed * (self.rate / self.per) 31 | 32 | if self.allowance > self.rate: 33 | self.allowance = self.rate 34 | 35 | if self.allowance < 1.0: 36 | sleep_time = (1.0 - self.allowance) * (self.per / self.rate) 37 | await asyncio.sleep(sleep_time) 38 | self.allowance = 0.0 39 | else: 40 | self.allowance -= 1.0 41 | -------------------------------------------------------------------------------- /graphrag/index/utils/string.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """String utilities.""" 5 | 6 | import html 7 | import re 8 | from typing import Any 9 | 10 | 11 | def clean_str(input: Any) -> str: 12 | """Clean an input string by removing HTML escapes, control characters, and other unwanted characters.""" 13 | # If we get non-string input, just give it back 14 | if not isinstance(input, str): 15 | return input 16 | 17 | result = html.unescape(input.strip()) 18 | # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python 19 | return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) 20 | -------------------------------------------------------------------------------- /graphrag/index/utils/uuid.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """UUID utilities.""" 5 | 6 | import uuid 7 | from random import Random, getrandbits 8 | 9 | 10 | def gen_uuid(rd: Random | None = None): 11 | """Generate a random UUID v4.""" 12 | return uuid.UUID( 13 | int=rd.getrandbits(128) if rd is not None else getrandbits(128), version=4 14 | ).hex 15 | -------------------------------------------------------------------------------- /graphrag/index/workflows/update_clean_state.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
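A minimal sketch of RateLimiter pacing a loop of requests (assumes the graphrag package is installed; the rate of 5 per second is arbitrary):

import asyncio
import time

from graphrag.index.utils.rate_limiter import RateLimiter


async def main() -> None:
    limiter = RateLimiter(rate=5, per=1)  # roughly 5 acquisitions per second
    start = time.monotonic()
    for i in range(10):
        await limiter.acquire()
        print(f"request {i} at {time.monotonic() - start:.2f}s")


asyncio.run(main())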
2 | # Licensed under the MIT License 3 | 4 | """A module containing run_workflow method definition.""" 5 | 6 | import logging 7 | 8 | from graphrag.config.models.graph_rag_config import GraphRagConfig 9 | from graphrag.index.typing.context import PipelineRunContext 10 | from graphrag.index.typing.workflow import WorkflowFunctionOutput 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | async def run_workflow( # noqa: RUF029 16 | _config: GraphRagConfig, 17 | context: PipelineRunContext, 18 | ) -> WorkflowFunctionOutput: 19 | """Clean the state after the update.""" 20 | logger.info("Cleaning State") 21 | keys_to_delete = [ 22 | key_name 23 | for key_name in context.state 24 | if key_name.startswith("incremental_update_") 25 | ] 26 | 27 | for key_name in keys_to_delete: 28 | del context.state[key_name] 29 | 30 | return WorkflowFunctionOutput(result=None) 31 | -------------------------------------------------------------------------------- /graphrag/index/workflows/update_final_documents.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing run_workflow method definition.""" 5 | 6 | import logging 7 | 8 | from graphrag.config.models.graph_rag_config import GraphRagConfig 9 | from graphrag.index.run.utils import get_update_storages 10 | from graphrag.index.typing.context import PipelineRunContext 11 | from graphrag.index.typing.workflow import WorkflowFunctionOutput 12 | from graphrag.index.update.incremental_index import concat_dataframes 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | async def run_workflow( 18 | config: GraphRagConfig, 19 | context: PipelineRunContext, 20 | ) -> WorkflowFunctionOutput: 21 | """Update the documents from an incremental index run.""" 22 | logger.info("Updating Documents") 23 | output_storage, previous_storage, delta_storage = get_update_storages( 24 | config, context.state["update_timestamp"] 25 | ) 26 | 27 | final_documents = await concat_dataframes( 28 | "documents", previous_storage, delta_storage, output_storage 29 | ) 30 | 31 | context.state["incremental_update_final_documents"] = final_documents 32 | 33 | return WorkflowFunctionOutput(result=None) 34 | -------------------------------------------------------------------------------- /graphrag/language_model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Language Models module. Allows for provider registrations while providing some out-of-the-box solutions.""" 5 | -------------------------------------------------------------------------------- /graphrag/language_model/cache/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Cache provider definitions for Language Models.""" 5 | -------------------------------------------------------------------------------- /graphrag/language_model/cache/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """Base cache protocol definition.""" 5 | 6 | from typing import Any, Protocol 7 | 8 | 9 | class ModelCache(Protocol): 10 | """Base cache protocol.""" 11 | 12 | async def has(self, key: str) -> bool: 13 | """Check if the cache has a value.""" 14 | ... 15 | 16 | async def get(self, key: str) -> Any | None: 17 | """Retrieve a value from the cache.""" 18 | ... 19 | 20 | async def set( 21 | self, key: str, value: Any, metadata: dict[str, Any] | None = None 22 | ) -> None: 23 | """Write a value into the cache.""" 24 | ... 25 | 26 | async def remove(self, key: str) -> None: 27 | """Remove a value from the cache.""" 28 | ... 29 | 30 | async def clear(self) -> None: 31 | """Clear the cache.""" 32 | ... 33 | 34 | def child(self, key: str) -> Any: 35 | """Create a child cache.""" 36 | ... 37 | -------------------------------------------------------------------------------- /graphrag/language_model/events/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Model Event handler modules.""" 5 | -------------------------------------------------------------------------------- /graphrag/language_model/events/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Base model events protocol.""" 5 | 6 | from typing import Any, Protocol 7 | 8 | 9 | class ModelEventHandler(Protocol): 10 | """Protocol for Model event handling.""" 11 | 12 | async def on_error( 13 | self, 14 | error: BaseException | None, 15 | traceback: str | None = None, 16 | arguments: dict[str, Any] | None = None, 17 | ) -> None: 18 | """Handle a model error.""" 19 | ... 20 | -------------------------------------------------------------------------------- /graphrag/language_model/protocol/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Base protocol definitions for LLMs.""" 5 | 6 | from .base import ChatModel, EmbeddingModel 7 | 8 | __all__ = ["ChatModel", "EmbeddingModel"] 9 | -------------------------------------------------------------------------------- /graphrag/language_model/providers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Model Providers module.""" 5 | -------------------------------------------------------------------------------- /graphrag/language_model/providers/fnllm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """FNLLM provider module.""" 5 | -------------------------------------------------------------------------------- /graphrag/language_model/providers/fnllm/cache.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation.
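A toy in-memory implementation of the ModelCache protocol, to illustrate the expected surface (a sketch, not part of the package; the graphrag import is used for the type annotation only):

from typing import Any

from graphrag.language_model.cache.base import ModelCache


class InMemoryModelCache:
    """Toy dict-backed cache that satisfies the ModelCache protocol."""

    def __init__(self) -> None:
        self._store: dict[str, Any] = {}

    async def has(self, key: str) -> bool:
        return key in self._store

    async def get(self, key: str) -> Any | None:
        return self._store.get(key)

    async def set(
        self, key: str, value: Any, metadata: dict[str, Any] | None = None
    ) -> None:
        self._store[key] = value

    async def remove(self, key: str) -> None:
        self._store.pop(key, None)

    async def clear(self) -> None:
        self._store.clear()

    def child(self, key: str) -> "InMemoryModelCache":
        return self  # no namespacing in this toy version


cache: ModelCache = InMemoryModelCache()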
2 | # Licensed under the MIT License 3 | 4 | """FNLLM Cache provider.""" 5 | 6 | from typing import Any 7 | 8 | from fnllm.caching import Cache as FNLLMCache 9 | 10 | from graphrag.cache.pipeline_cache import PipelineCache 11 | 12 | 13 | class FNLLMCacheProvider(FNLLMCache): 14 | """A cache for the pipeline.""" 15 | 16 | def __init__(self, cache: PipelineCache): 17 | self._cache = cache 18 | 19 | async def has(self, key: str) -> bool: 20 | """Check if the cache has a value.""" 21 | return await self._cache.has(key) 22 | 23 | async def get(self, key: str) -> Any | None: 24 | """Retrieve a value from the cache.""" 25 | return await self._cache.get(key) 26 | 27 | async def set( 28 | self, key: str, value: Any, metadata: dict[str, Any] | None = None 29 | ) -> None: 30 | """Write a value into the cache.""" 31 | await self._cache.set(key, value, metadata) 32 | 33 | async def remove(self, key: str) -> None: 34 | """Remove a value from the cache.""" 35 | await self._cache.delete(key) 36 | 37 | async def clear(self) -> None: 38 | """Clear the cache.""" 39 | await self._cache.clear() 40 | 41 | def child(self, key: str) -> "FNLLMCacheProvider": 42 | """Create a child cache.""" 43 | child_cache = self._cache.child(key) 44 | return FNLLMCacheProvider(child_cache) 45 | -------------------------------------------------------------------------------- /graphrag/language_model/providers/fnllm/events.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """FNLLM llm events provider.""" 5 | 6 | from typing import Any 7 | 8 | from fnllm.events import LLMEvents 9 | 10 | from graphrag.index.typing.error_handler import ErrorHandlerFn 11 | 12 | 13 | class FNLLMEvents(LLMEvents): 14 | """FNLLM events handler that calls the error handler.""" 15 | 16 | def __init__(self, on_error: ErrorHandlerFn): 17 | self._on_error = on_error 18 | 19 | async def on_error( 20 | self, 21 | error: BaseException | None, 22 | traceback: str | None = None, 23 | arguments: dict[str, Any] | None = None, 24 | ) -> None: 25 | """Handle an fnllm error.""" 26 | self._on_error(error, traceback, arguments) 27 | -------------------------------------------------------------------------------- /graphrag/language_model/response/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A module containing Model response definitions.""" 5 | -------------------------------------------------------------------------------- /graphrag/language_model/response/base.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | from typing import Any, Generic, Protocol, TypeVar 5 | 6 | from pydantic import BaseModel 7 | 8 | _T = TypeVar("_T", bound=BaseModel, covariant=True) 9 | 10 | class ModelOutput(Protocol): 11 | @property 12 | def content(self) -> str: ... 13 | @property 14 | def full_response(self) -> dict[str, Any] | None: ... 15 | 16 | class ModelResponse(Protocol, Generic[_T]): 17 | @property 18 | def output(self) -> ModelOutput: ... 19 | @property 20 | def parsed_response(self) -> _T | None: ... 21 | @property 22 | def history(self) -> list[Any]: ... 
23 | 24 | class BaseModelOutput(BaseModel): 25 | content: str 26 | full_response: dict[str, Any] | None 27 | 28 | def __init__( 29 | self, 30 | content: str, 31 | full_response: dict[str, Any] | None = None, 32 | ) -> None: ... 33 | 34 | class BaseModelResponse(BaseModel, Generic[_T]): 35 | output: BaseModelOutput 36 | parsed_response: _T | None 37 | history: list[Any] 38 | tool_calls: list[Any] 39 | metrics: Any | None 40 | cache_hit: bool | None 41 | 42 | def __init__( 43 | self, 44 | output: BaseModelOutput, 45 | parsed_response: _T | None = None, 46 | history: list[Any] = ..., # default provided by Pydantic 47 | tool_calls: list[Any] = ..., # default provided by Pydantic 48 | metrics: Any | None = None, 49 | cache_hit: bool | None = None, 50 | ) -> None: ... 51 | -------------------------------------------------------------------------------- /graphrag/logger/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Logger utilities and implementations.""" 5 | -------------------------------------------------------------------------------- /graphrag/logger/console.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Console Log.""" 5 | 6 | from typing import Any 7 | 8 | from graphrag.logger.base import StatusLogger 9 | 10 | 11 | class ConsoleReporter(StatusLogger): 12 | """A logger that writes to a console.""" 13 | 14 | def error(self, message: str, details: dict[str, Any] | None = None): 15 | """Log an error.""" 16 | print(message, details) # noqa T201 17 | 18 | def warning(self, message: str, details: dict[str, Any] | None = None): 19 | """Log a warning.""" 20 | _print_warning(message) 21 | 22 | def log(self, message: str, details: dict[str, Any] | None = None): 23 | """Log a log.""" 24 | print(message, details) # noqa T201 25 | 26 | 27 | def _print_warning(skk): 28 | print(f"\033[93m {skk}\033[00m") # noqa T201 29 | -------------------------------------------------------------------------------- /graphrag/logger/null_progress.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
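A brief usage sketch for ConsoleReporter (assumes the graphrag package is installed; the messages and details below are hypothetical):

from graphrag.logger.console import ConsoleReporter

reporter = ConsoleReporter()
reporter.log("pipeline started", {"workflows": 3})
reporter.warning("falling back to defaults")  # printed in yellow
reporter.error("workflow failed", {"name": "extract_graph"})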
2 | # Licensed under the MIT License 3 | 4 | """Null Progress Reporter.""" 5 | 6 | from graphrag.logger.base import Progress, ProgressLogger 7 | 8 | 9 | class NullProgressLogger(ProgressLogger): 10 | """A progress logger that does nothing.""" 11 | 12 | def __call__(self, update: Progress) -> None: 13 | """Update progress.""" 14 | 15 | def dispose(self) -> None: 16 | """Dispose of the progress logger.""" 17 | 18 | def child(self, prefix: str, transient: bool = True) -> ProgressLogger: 19 | """Create a child progress bar.""" 20 | return self 21 | 22 | def force_refresh(self) -> None: 23 | """Force a refresh.""" 24 | 25 | def stop(self) -> None: 26 | """Stop the progress logger.""" 27 | 28 | def error(self, message: str) -> None: 29 | """Log an error.""" 30 | 31 | def warning(self, message: str) -> None: 32 | """Log a warning.""" 33 | 34 | def info(self, message: str) -> None: 35 | """Log information.""" 36 | 37 | def success(self, message: str) -> None: 38 | """Log success.""" 39 | -------------------------------------------------------------------------------- /graphrag/logger/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Logging types. 5 | 6 | This module defines the types of loggers that can be used. 7 | """ 8 | 9 | from enum import Enum 10 | 11 | 12 | # Note: Code in this module was not included in the factory module because it negatively impacts the CLI experience. 13 | class LoggerType(str, Enum): 14 | """The type of logger to use.""" 15 | 16 | RICH = "rich" 17 | PRINT = "print" 18 | NONE = "none" 19 | 20 | def __str__(self): 21 | """Return a string representation of the enum value.""" 22 | return self.value 23 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The prompt-tuning package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Default values for the prompt-tuning module. 5 | 6 | Note: These values get accessed from the CLI to set default behavior. 7 | To maintain fast responsiveness from the CLI, do not add long-running code in this file and be mindful of imports. 8 | """ 9 | 10 | DEFAULT_TASK = """ 11 | Identify the relations and structure of the community of interest, specifically within the {domain} domain. 12 | """ 13 | 14 | K = 15 15 | LIMIT = 15 16 | MAX_TOKEN_COUNT = 2000 17 | MIN_CHUNK_SIZE = 200 18 | N_SUBSET_MAX = 300 19 | MIN_CHUNK_OVERLAP = 0 20 | PROMPT_TUNING_MODEL_ID = "default_chat_model" 21 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Prompt generation module.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/community_report_rating.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Generate a rating description for community report rating.""" 5 | 6 | from graphrag.language_model.protocol.base import ChatModel 7 | from graphrag.prompt_tune.prompt.community_report_rating import ( 8 | GENERATE_REPORT_RATING_PROMPT, 9 | ) 10 | 11 | 12 | async def generate_community_report_rating( 13 | model: ChatModel, domain: str, persona: str, docs: str | list[str] 14 | ) -> str: 15 | """Generate a rating description for community report rating. 16 | 17 | Parameters 18 | ---------- 19 | - llm (CompletionLLM): The LLM to use for generation 20 | - domain (str): The domain to generate a rating for 21 | - persona (str): The persona to generate a rating for 22 | - docs (str | list[str]): Documents used to contextualize the rating 23 | 24 | Returns 25 | ------- 26 | - str: The generated rating description prompt response. 27 | """ 28 | docs_str = " ".join(docs) if isinstance(docs, list) else docs 29 | domain_prompt = GENERATE_REPORT_RATING_PROMPT.format( 30 | domain=domain, persona=persona, input_text=docs_str 31 | ) 32 | 33 | response = await model.achat(domain_prompt) 34 | 35 | return str(response.output.content).strip() 36 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/community_reporter_role.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Generate a community reporter role for community summarization.""" 5 | 6 | from graphrag.language_model.protocol.base import ChatModel 7 | from graphrag.prompt_tune.prompt.community_reporter_role import ( 8 | GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT, 9 | ) 10 | 11 | 12 | async def generate_community_reporter_role( 13 | model: ChatModel, domain: str, persona: str, docs: str | list[str] 14 | ) -> str: 15 | """Generate a community reporter role for community summarization. 16 | 17 | Parameters 18 | ---------- 19 | - llm (CompletionLLM): The LLM to use for generation 20 | - domain (str): The domain to generate a role for 21 | - persona (str): The persona to generate a role for 22 | - docs (str | list[str]): Documents used to contextualize the role 23 | 24 | Returns 25 | ------- 26 | - str: The generated community reporter role prompt response. 27 | """ 28 | docs_str = " ".join(docs) if isinstance(docs, list) else docs 29 | domain_prompt = GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT.format( 30 | domain=domain, persona=persona, input_text=docs_str 31 | ) 32 | 33 | response = await model.achat(domain_prompt) 34 | 35 | return str(response.output.content) 36 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/domain.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """Domain generation for GraphRAG prompts.""" 5 | 6 | from graphrag.language_model.protocol.base import ChatModel 7 | from graphrag.prompt_tune.prompt.domain import GENERATE_DOMAIN_PROMPT 8 | 9 | 10 | async def generate_domain(model: ChatModel, docs: str | list[str]) -> str: 11 | """Generate a domain to use for GraphRAG prompts. 12 | 13 | Parameters 14 | ---------- 15 | - llm (CompletionLLM): The LLM to use for generation 16 | - docs (str | list[str]): The docs to generate a domain from 17 | 18 | Returns 19 | ------- 20 | - str: The generated domain prompt response. 21 | """ 22 | docs_str = " ".join(docs) if isinstance(docs, list) else docs 23 | domain_prompt = GENERATE_DOMAIN_PROMPT.format(input_text=docs_str) 24 | 25 | response = await model.achat(domain_prompt) 26 | 27 | return str(response.output.content) 28 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/entity_summarization_prompt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Entity summarization prompt generation module.""" 5 | 6 | from pathlib import Path 7 | 8 | from graphrag.prompt_tune.template.entity_summarization import ( 9 | ENTITY_SUMMARIZATION_PROMPT, 10 | ) 11 | 12 | ENTITY_SUMMARIZATION_FILENAME = "summarize_descriptions.txt" 13 | 14 | 15 | def create_entity_summarization_prompt( 16 | persona: str, 17 | language: str, 18 | output_path: Path | None = None, 19 | ) -> str: 20 | """ 21 | Create a prompt for entity summarization. 22 | 23 | Parameters 24 | ---------- 25 | - persona (str): The persona to use for the entity summarization prompt 26 | - language (str): The language to use for the entity summarization prompt 27 | - output_path (Path | None): The path to write the prompt to. Default is None. 28 | """ 29 | prompt = ENTITY_SUMMARIZATION_PROMPT.format(persona=persona, language=language) 30 | 31 | if output_path: 32 | output_path.mkdir(parents=True, exist_ok=True) 33 | 34 | output_path = output_path / ENTITY_SUMMARIZATION_FILENAME 35 | # Write file to output path 36 | with output_path.open("wb") as file: 37 | file.write(prompt.encode(encoding="utf-8", errors="strict")) 38 | 39 | return prompt 40 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/language.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Language detection for GraphRAG prompts.""" 5 | 6 | from graphrag.language_model.protocol.base import ChatModel 7 | from graphrag.prompt_tune.prompt.language import DETECT_LANGUAGE_PROMPT 8 | 9 | 10 | async def detect_language(model: ChatModel, docs: str | list[str]) -> str: 11 | """Detect input language to use for GraphRAG prompts. 12 | 13 | Parameters 14 | ---------- 15 | - llm (CompletionLLM): The LLM to use for generation 16 | - docs (str | list[str]): The docs to detect language from 17 | 18 | Returns 19 | ------- 20 | - str: The detected language.
21 | """ 22 | docs_str = " ".join(docs) if isinstance(docs, list) else docs 23 | language_prompt = DETECT_LANGUAGE_PROMPT.format(input_text=docs_str) 24 | 25 | response = await model.achat(language_prompt) 26 | 27 | return str(response.output.content) 28 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/generator/persona.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Persona generating module for fine-tuning GraphRAG prompts.""" 5 | 6 | from graphrag.language_model.protocol.base import ChatModel 7 | from graphrag.prompt_tune.defaults import DEFAULT_TASK 8 | from graphrag.prompt_tune.prompt.persona import GENERATE_PERSONA_PROMPT 9 | 10 | 11 | async def generate_persona( 12 | model: ChatModel, domain: str, task: str = DEFAULT_TASK 13 | ) -> str: 14 | """Generate an LLM persona to use for GraphRAG prompts. 15 | 16 | Parameters 17 | ---------- 18 | - llm (CompletionLLM): The LLM to use for generation 19 | - domain (str): The domain to generate a persona for 20 | - task (str): The task to generate a persona for. Default is DEFAULT_TASK 21 | """ 22 | formatted_task = task.format(domain=domain) 23 | persona_prompt = GENERATE_PERSONA_PROMPT.format(sample_task=formatted_task) 24 | 25 | response = await model.achat(persona_prompt) 26 | 27 | return str(response.output.content) 28 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/loader/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning config and data loader module.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Persona, entity type, relationships and domain generation prompts module.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/community_reporter_role.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for community reporter role generation.""" 5 | 6 | GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT = """ 7 | {persona} 8 | Given a sample text, help the user by creating a role definition that will be tasked with community analysis. 9 | Take a look at this example, determine its key parts, and using the domain provided and your expertise, create a new role definition for the provided inputs that follows the same pattern as the example. 10 | Remember, your output should look just like the provided example in structure and content. 11 | 12 | Example: 13 | A technologist reporter that is analyzing Kevin Scott's "Behind the Tech Podcast", given a list of entities 14 | that belong to the community as well as their relationships and optional associated claims. 15 | The report will be used to inform decision-makers about significant developments associated with the community and their potential impact. 
16 | 17 | 18 | Domain: {domain} 19 | Text: {input_text} 20 | Role:""" 21 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/domain.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for domain generation.""" 5 | 6 | GENERATE_DOMAIN_PROMPT = """ 7 | You are an intelligent assistant that helps a human to analyze the information in a text document. 8 | Given a sample text, help the user by assigning a descriptive domain that summarizes what the text is about. 9 | Example domains are: "Social studies", "Algorithmic analysis", "Medical science", among others. 10 | 11 | Text: {input_text} 12 | Domain:""" 13 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/language.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for language detection.""" 5 | 6 | DETECT_LANGUAGE_PROMPT = """ 7 | You are an intelligent assistant that helps a human to analyze the information in a text document. 8 | Given a sample text, help the user by determining what's the primary language of the provided texts. 9 | Examples are: "English", "Spanish", "Japanese", "Portuguese" among others. Reply ONLY with the language name. 10 | 11 | Text: {input_text} 12 | Language:""" 13 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/prompt/persona.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for persona generation.""" 5 | 6 | GENERATE_PERSONA_PROMPT = """ 7 | You are an intelligent assistant that helps a human to analyze the information in a text document. 8 | Given a specific type of task and sample text, help the user by generating a 3 to 4 sentence description of an expert who could help solve the problem. 9 | Use a format similar to the following: 10 | You are an expert {{role}}. You are skilled at {{relevant skills}}. You are adept at helping people with {{specific task}}. 11 | 12 | task: {sample_task} 13 | persona description:""" 14 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/template/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for entity extraction, entity summarization, and community report summarization.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/template/entity_summarization.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Fine-tuning prompts for entity summarization.""" 5 | 6 | ENTITY_SUMMARIZATION_PROMPT = """ 7 | {persona} 8 | Using your expertise, you're asked to generate a comprehensive summary of the data provided below. 9 | Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. 
10 | Please concatenate all of these into a single, concise description in {language}. Make sure to include information collected from all the descriptions. 11 | If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. 12 | Make sure it is written in third person, and include the entity names so we have the full context. 13 | 14 | Enrich it as much as you can with relevant information from the nearby text, this is very important. 15 | 16 | If no answer is possible, or the description is empty, only convey information that is provided within the text. 17 | ####### 18 | -Data- 19 | Entities: {{entity_name}} 20 | Description List: {{description_list}} 21 | ####### 22 | Output:""" 23 | -------------------------------------------------------------------------------- /graphrag/prompt_tune/types.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Types for prompt tuning.""" 5 | 6 | from enum import Enum 7 | 8 | 9 | class DocSelectionType(str, Enum): 10 | """The type of document selection to use.""" 11 | 12 | ALL = "all" 13 | RANDOM = "random" 14 | TOP = "top" 15 | AUTO = "auto" 16 | 17 | def __str__(self): 18 | """Return the string representation of the enum value.""" 19 | return self.value 20 | -------------------------------------------------------------------------------- /graphrag/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """All prompts for the GraphRAG system.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompts/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """All prompts for the indexing engine.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompts/index/summarize_descriptions.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A file containing prompts definition.""" 5 | 6 | SUMMARIZE_PROMPT = """ 7 | You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. 8 | Given one or more entities, and a list of descriptions, all related to the same entity or group of entities. 9 | Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. 10 | If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. 11 | Make sure it is written in third person, and include the entity names so we have the full context. 12 | Limit the final description length to {max_length} words. 13 | 14 | ####### 15 | -Data- 16 | Entities: {entity_name} 17 | Description List: {description_list} 18 | ####### 19 | Output: 20 | """ 21 | -------------------------------------------------------------------------------- /graphrag/prompts/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """All prompts for the query engine.""" 5 | -------------------------------------------------------------------------------- /graphrag/prompts/query/global_search_knowledge_system_prompt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Global Search system prompts.""" 5 | 6 | GENERAL_KNOWLEDGE_INSTRUCTION = """ 7 | The response may also include relevant real-world knowledge outside the dataset, but it must be explicitly annotated with a verification tag [LLM: verify]. For example: 8 | "This is an example sentence supported by real-world knowledge [LLM: verify]." 9 | """ 10 | -------------------------------------------------------------------------------- /graphrag/prompts/query/question_gen_system_prompt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Question Generation system prompts.""" 5 | 6 | QUESTION_SYSTEM_PROMPT = """ 7 | ---Role--- 8 | 9 | You are a helpful assistant generating a bulleted list of {question_count} questions about data in the tables provided. 10 | 11 | 12 | ---Data tables--- 13 | 14 | {context_data} 15 | 16 | 17 | ---Goal--- 18 | 19 | Given a series of example questions provided by the user, generate a bulleted list of {question_count} candidates for the next question. Use - marks as bullet points. 20 | 21 | These candidate questions should represent the most important or urgent information content or themes in the data tables. 22 | 23 | The candidate questions should be answerable using the data tables provided, but should not mention any specific data fields or data tables in the question text. 24 | 25 | If the user's questions reference several named entities, then each candidate question should reference all named entities. 26 | 27 | ---Example questions--- 28 | """ 29 | -------------------------------------------------------------------------------- /graphrag/py.typed: -------------------------------------------------------------------------------- 1 | # This package supports type hinting, 2 | # see https://www.python.org/dev/peps/pep-0561/#packaging-type-information -------------------------------------------------------------------------------- /graphrag/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The query engine package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/context_builder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Functions to build context for system prompt to generate responses for a user query.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/context_builder/rate_prompt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | 4 | """Global search with dynamic community selection prompt.""" 5 | 6 | RATE_QUERY = """ 7 | ---Role--- 8 | You are a helpful assistant responsible for deciding whether the provided information is useful in answering a given question, even if it is only partially relevant. 9 | ---Goal--- 10 | On a scale from 0 to 5, please rate how relevant or helpful the provided information is in answering the question. 11 | ---Information--- 12 | {description} 13 | ---Question--- 14 | {question} 15 | ---Target response length and format--- 16 | Please respond in the following JSON format with two entries: 17 | - "reason": the reasoning of your rating, please include information that you have considered. 18 | - "rating": the relevancy rating from 0 to 5, where 0 is the least relevant and 5 is the most relevant. 19 | {{ 20 | "reason": str, 21 | "rating": int 22 | }} 23 | """ 24 | -------------------------------------------------------------------------------- /graphrag/query/input/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestration Inputs.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/input/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestration Input Loaders.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/input/retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GraphRAG Orchestration Input Retrieval.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/llm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Orchestration LLM utilities.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/question_gen/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Question Generation Module.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Structured Search package.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/basic_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | """The BasicSearch package.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/drift_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """DriftSearch module.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/global_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """GlobalSearch module.""" 5 | -------------------------------------------------------------------------------- /graphrag/query/structured_search/local_search/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The LocalSearch package.""" 5 | -------------------------------------------------------------------------------- /graphrag/storage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """The storage package root.""" 5 | -------------------------------------------------------------------------------- /graphrag/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Util functions for the GraphRAG package.""" 5 | -------------------------------------------------------------------------------- /graphrag/utils/storage.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Storage functions for the GraphRAG run module.""" 5 | 6 | import logging 7 | from io import BytesIO 8 | 9 | import pandas as pd 10 | 11 | from graphrag.storage.pipeline_storage import PipelineStorage 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | async def load_table_from_storage(name: str, storage: PipelineStorage) -> pd.DataFrame: 17 | """Load a parquet from the storage instance.""" 18 | filename = f"{name}.parquet" 19 | if not await storage.has(filename): 20 | msg = f"Could not find {filename} in storage!" 
21 | raise ValueError(msg) 22 | try: 23 | log.info("reading table from storage: %s", filename) 24 | return pd.read_parquet(BytesIO(await storage.get(filename, as_bytes=True))) 25 | except Exception: 26 | log.exception("error loading table from storage: %s", filename) 27 | raise 28 | 29 | 30 | async def write_table_to_storage( 31 | table: pd.DataFrame, name: str, storage: PipelineStorage 32 | ) -> None: 33 | """Write a table to storage.""" 34 | await storage.set(f"{name}.parquet", table.to_parquet()) 35 | 36 | 37 | async def delete_table_from_storage(name: str, storage: PipelineStorage) -> None: 38 | """Delete a table from storage.""" 39 | await storage.delete(f"{name}.parquet") 40 | 41 | 42 | async def storage_has_table(name: str, storage: PipelineStorage) -> bool: 43 | """Check if a table exists in storage.""" 44 | return await storage.has(f"{name}.parquet") 45 | -------------------------------------------------------------------------------- /graphrag/vector_stores/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """A package containing vector store implementations.""" 5 | -------------------------------------------------------------------------------- /scripts/semver-check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | changes=$(git diff --name-only origin/main) 3 | has_change_doc=$(echo $changes | grep .semversioner/next-release) 4 | has_impacting_changes=$(echo $changes | grep graphrag) 5 | 6 | if [ "$has_impacting_changes" ] && [ -z "$has_change_doc" ]; then 7 | echo "Check failed. Run 'poetry run semversioner add-change' to update the next release version" 8 | exit 1 9 | fi 10 | echo "OK" 11 | -------------------------------------------------------------------------------- /scripts/spellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | npx --yes cspell -c cspell.config.yaml --no-progress lint . -------------------------------------------------------------------------------- /scripts/start-azurite.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | npx --yes azurite -L -l ./temp_azurite -d ./temp_azurite/debug.log -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | 5 | """Tests for the GraphRAG LLM module.""" 6 | 7 | # Register MOCK providers 8 | from graphrag.config.enums import ModelType 9 | from graphrag.language_model.factory import ModelFactory 10 | from tests.mock_provider import MockChatLLM, MockEmbeddingLLM 11 | 12 | ModelFactory.register_chat(ModelType.MockChat, lambda **kwargs: MockChatLLM(**kwargs)) 13 | ModelFactory.register_embedding( 14 | ModelType.MockEmbedding, lambda **kwargs: MockEmbeddingLLM(**kwargs) 15 | ) 16 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License 3 | 4 | 5 | def pytest_addoption(parser): 6 | parser.addoption( 7 | "--run_slow", action="store_true", default=False, help="run slow tests" 8 | ) 9 | -------------------------------------------------------------------------------- /tests/fixtures/azure/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_path": "./tests/fixtures/azure", 3 | "input_file_type": "text", 4 | "workflow_config": { 5 | "skip_assert": true, 6 | "azure": { 7 | "input_container": "azurefixture", 8 | "input_base_dir": "input" 9 | } 10 | }, 11 | "query_config": [], 12 | "slow": false 13 | } -------------------------------------------------------------------------------- /tests/fixtures/azure/input/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of integration testing. -------------------------------------------------------------------------------- /tests/fixtures/azure/settings.yml: -------------------------------------------------------------------------------- 1 | extract_claims: 2 | enabled: true 3 | 4 | vector_store: 5 | default_vector_store: 6 | type: "azure_ai_search" 7 | url: ${AZURE_AI_SEARCH_URL_ENDPOINT} 8 | api_key: ${AZURE_AI_SEARCH_API_KEY} 9 | container_name: "azure_ci" 10 | 11 | input: 12 | type: blob 13 | file_type: text 14 | connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING} 15 | container_name: azurefixture 16 | base_dir: input 17 | 18 | cache: 19 | type: blob 20 | connection_string: ${BLOB_STORAGE_CONNECTION_STRING} 21 | container_name: cicache 22 | base_dir: cache_azure_ai 23 | 24 | storage: 25 | type: blob 26 | connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING} 27 | container_name: azurefixture 28 | base_dir: output 29 | 30 | reporting: 31 | type: blob 32 | connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING} 33 | container_name: azurefixture 34 | base_dir: reports 35 | -------------------------------------------------------------------------------- /tests/fixtures/min-csv/input/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of integration testing.
-------------------------------------------------------------------------------- /tests/fixtures/min-csv/settings.yml: -------------------------------------------------------------------------------- 1 | models: 2 | default_chat_model: 3 | azure_auth_type: api_key 4 | type: ${GRAPHRAG_LLM_TYPE} 5 | api_key: ${GRAPHRAG_API_KEY} 6 | api_base: ${GRAPHRAG_API_BASE} 7 | api_version: ${GRAPHRAG_API_VERSION} 8 | deployment_name: ${GRAPHRAG_LLM_DEPLOYMENT_NAME} 9 | model: ${GRAPHRAG_LLM_MODEL} 10 | tokens_per_minute: ${GRAPHRAG_LLM_TPM} 11 | requests_per_minute: ${GRAPHRAG_LLM_RPM} 12 | model_supports_json: true 13 | concurrent_requests: 50 14 | async_mode: threaded 15 | default_embedding_model: 16 | azure_auth_type: api_key 17 | type: ${GRAPHRAG_EMBEDDING_TYPE} 18 | api_key: ${GRAPHRAG_API_KEY} 19 | api_base: ${GRAPHRAG_API_BASE} 20 | api_version: ${GRAPHRAG_API_VERSION} 21 | deployment_name: ${GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME} 22 | model: ${GRAPHRAG_EMBEDDING_MODEL} 23 | tokens_per_minute: null 24 | requests_per_minute: null 25 | concurrent_requests: 50 26 | async_mode: threaded 27 | 28 | vector_store: 29 | default_vector_store: 30 | type: "lancedb" 31 | db_uri: "./tests/fixtures/min-csv/lancedb" 32 | container_name: "lancedb_ci" 33 | overwrite: True 34 | 35 | input: 36 | file_type: csv 37 | 38 | snapshots: 39 | embeddings: True 40 | 41 | drift_search: 42 | n_depth: 1 43 | drift_k_followups: 3 44 | primer_folds: 3 -------------------------------------------------------------------------------- /tests/fixtures/text/input/ABOUT.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document (Operation Dulce) is an AI-generated science fiction novella, included here for the purposes of integration testing.
-------------------------------------------------------------------------------- /tests/fixtures/text/settings.yml: -------------------------------------------------------------------------------- 1 | models: 2 | default_chat_model: 3 | azure_auth_type: api_key 4 | type: ${GRAPHRAG_LLM_TYPE} 5 | api_key: ${GRAPHRAG_API_KEY} 6 | api_base: ${GRAPHRAG_API_BASE} 7 | api_version: ${GRAPHRAG_API_VERSION} 8 | deployment_name: ${GRAPHRAG_LLM_DEPLOYMENT_NAME} 9 | model: ${GRAPHRAG_LLM_MODEL} 10 | tokens_per_minute: ${GRAPHRAG_LLM_TPM} 11 | requests_per_minute: ${GRAPHRAG_LLM_RPM} 12 | model_supports_json: true 13 | concurrent_requests: 50 14 | async_mode: threaded 15 | default_embedding_model: 16 | azure_auth_type: api_key 17 | type: ${GRAPHRAG_EMBEDDING_TYPE} 18 | api_key: ${GRAPHRAG_API_KEY} 19 | api_base: ${GRAPHRAG_API_BASE} 20 | api_version: ${GRAPHRAG_API_VERSION} 21 | deployment_name: ${GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME} 22 | model: ${GRAPHRAG_EMBEDDING_MODEL} 23 | tokens_per_minute: null 24 | requests_per_minute: null 25 | concurrent_requests: 50 26 | async_mode: threaded 27 | 28 | vector_store: 29 | default_vector_store: 30 | type: "azure_ai_search" 31 | url: ${AZURE_AI_SEARCH_URL_ENDPOINT} 32 | api_key: ${AZURE_AI_SEARCH_API_KEY} 33 | container_name: "simple_text_ci" 34 | 35 | extract_claims: 36 | enabled: true 37 | 38 | community_reports: 39 | prompt: "prompts/community_report.txt" 40 | max_length: 2000 41 | max_input_length: 8000 42 | 43 | snapshots: 44 | embeddings: True 45 | 46 | drift_search: 47 | n_depth: 1 48 | drift_k_followups: 3 49 | primer_folds: 3 50 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/integration/language_model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/integration/storage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/integration/vector_stores/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Integration tests for vector store implementations.""" 5 | -------------------------------------------------------------------------------- /tests/notebook/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/notebook/test_notebooks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | import subprocess 4 | from pathlib import Path 5 | 6 | import nbformat 7 | import pytest 8 | 9 | NOTEBOOKS_PATH = Path("examples_notebooks") 10 | EXCLUDED_PATH = NOTEBOOKS_PATH / "community_contrib" 11 | 12 | notebooks_list = [ 13 | notebook 14 | for notebook in NOTEBOOKS_PATH.rglob("*.ipynb") 15 | if EXCLUDED_PATH not in notebook.parents 16 | ] 17 | 18 | 19 | def _notebook_run(filepath: Path): 20 | """Execute a notebook via nbconvert and collect output. 21 | :returns execution errors 22 | """ 23 | args = [ 24 | "jupyter", 25 | "nbconvert", 26 | "--to", 27 | "notebook", 28 | "--execute", 29 | "-y", 30 | "--no-prompt", 31 | "--stdout", 32 | str(filepath.absolute().resolve()), 33 | ] 34 | notebook = subprocess.check_output(args) 35 | nb = nbformat.reads(notebook, nbformat.current_nbformat) 36 | 37 | return [ 38 | output 39 | for cell in nb.cells 40 | if "outputs" in cell 41 | for output in cell["outputs"] 42 | if output.output_type == "error" 43 | ] 44 | 45 | 46 | @pytest.mark.parametrize("notebook_path", notebooks_list) 47 | def test_notebook(notebook_path: Path): 48 | assert _notebook_run(notebook_path) == [] 49 | -------------------------------------------------------------------------------- /tests/smoke/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/config/fixtures/minimal_config/settings.yaml: -------------------------------------------------------------------------------- 1 | models: 2 | default_chat_model: 3 | api_key: ${CUSTOM_API_KEY} 4 | type: openai_chat 5 | model: gpt-4-turbo-preview 6 | default_embedding_model: 7 | api_key: ${CUSTOM_API_KEY} 8 | type: openai_embedding 9 | model: text-embedding-3-small -------------------------------------------------------------------------------- /tests/unit/config/fixtures/minimal_config_missing_env_var/settings.yaml: -------------------------------------------------------------------------------- 1 | models: 2 | default_chat_model: 3 | api_key: ${SOME_NON_EXISTENT_ENV_VAR} 4 | type: openai_chat 5 | model: gpt-4-turbo-preview 6 | default_embedding_model: 7 | api_key: ${SOME_NON_EXISTENT_ENV_VAR} 8 | type: openai_embedding 9 | model: text-embedding-3-small -------------------------------------------------------------------------------- /tests/unit/config/fixtures/timestamp_dirs/20240812-120000/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/config/fixtures/timestamp_dirs/20240812-120000/empty.txt -------------------------------------------------------------------------------- /tests/unit/config/prompt-a.txt: -------------------------------------------------------------------------------- 1 | Hello, World! A -------------------------------------------------------------------------------- /tests/unit/config/prompt-b.txt: -------------------------------------------------------------------------------- 1 | Hello, World! B -------------------------------------------------------------------------------- /tests/unit/config/prompt-c.txt: -------------------------------------------------------------------------------- 1 | Hello, World! C -------------------------------------------------------------------------------- /tests/unit/config/prompt-d.txt: -------------------------------------------------------------------------------- 1 | Hello, World! D -------------------------------------------------------------------------------- /tests/unit/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/cache/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/graph/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/graph/extractors/community_reports/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/graph/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/input/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/multiple-csvs/input1.csv: -------------------------------------------------------------------------------- 1 | title,text 2 | Hello,Hi how are you today? 3 | Goodbye,I'm outta here -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/multiple-csvs/input2.csv: -------------------------------------------------------------------------------- 1 | title,text 2 | Adios,See you later -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/multiple-csvs/input3.csv: -------------------------------------------------------------------------------- 1 | title,text 2 | Hi,I'm here -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/multiple-jsons/input1.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "title": "Hello", 3 | "text": "Hi how are you today?" 4 | }, { 5 | "title": "Goodbye", 6 | "text": "I'm outta here" 7 | }, { 8 | "title": "Adios", 9 | "text": "See you later" 10 | }] 11 | -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/multiple-jsons/input2.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Hi", 3 | "text": "I'm here" 4 | } -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/multiple-txts/input1.txt: -------------------------------------------------------------------------------- 1 | Hi how are you today? -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/multiple-txts/input2.txt: -------------------------------------------------------------------------------- 1 | I'm outta here -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/one-csv/input.csv: -------------------------------------------------------------------------------- 1 | title,text 2 | Hello,Hi how are you today? 3 | Goodbye,I'm outta here -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/one-json-multiple-objects/input.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "title": "Hello", 3 | "text": "Hi how are you today?" 
4 | }, { 5 | "title": "Goodbye", 6 | "text": "I'm outta here" 7 | }, { 8 | "title": "Adios", 9 | "text": "See you later" 10 | }] 11 | -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/one-json-one-object/input.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Hello", 3 | "text": "Hi how are you today?" 4 | } -------------------------------------------------------------------------------- /tests/unit/indexing/input/data/one-txt/input.txt: -------------------------------------------------------------------------------- 1 | Hi how are you today? -------------------------------------------------------------------------------- /tests/unit/indexing/operations/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/operations/chunk_text/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/test_init_content.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | import re 5 | from typing import Any, cast 6 | 7 | import yaml 8 | 9 | from graphrag.config.create_graphrag_config import create_graphrag_config 10 | from graphrag.config.init_content import INIT_YAML 11 | from graphrag.config.models.graph_rag_config import GraphRagConfig 12 | 13 | 14 | def test_init_yaml(): 15 | data = yaml.load(INIT_YAML, Loader=yaml.FullLoader) 16 | config = create_graphrag_config(data) 17 | GraphRagConfig.model_validate(config, strict=True) 18 | 19 | 20 | def test_init_yaml_uncommented(): 21 | lines = INIT_YAML.splitlines() 22 | lines = [line for line in lines if "##" not in line] 23 | 24 | def uncomment_line(line: str) -> str: 25 | leading_whitespace = cast("Any", re.search(r"^(\s*)", line)).group(1) 26 | return re.sub(r"^\s*# ", leading_whitespace, line, count=1) 27 | 28 | content = "\n".join([uncomment_line(line) for line in lines]) 29 | data = yaml.load(content, Loader=yaml.FullLoader) 30 | config = create_graphrag_config(data) 31 | GraphRagConfig.model_validate(config, strict=True) 32 | -------------------------------------------------------------------------------- /tests/unit/indexing/text_splitting/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/extraction/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/entities/extraction/strategies/graph_intelligence/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/indexing/verbs/helpers/mock_llm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | from pydantic import BaseModel 4 | 5 | from graphrag.language_model.manager import ModelManager 6 | from graphrag.language_model.protocol.base import ChatModel 7 | 8 | 9 | def create_mock_llm(responses: list[str | BaseModel], name: str = "mock") -> ChatModel: 10 | """Creates a mock LLM that returns the given responses.""" 11 | return ModelManager().get_or_create_chat_model( 12 | name, "mock_chat", responses=responses 13 | ) 14 | -------------------------------------------------------------------------------- /tests/unit/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/query/context_builder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/query/data/defaults/output/20240812-120000/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/defaults/output/20240812-120000/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/data/defaults/output/20240812-121000/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/defaults/output/20240812-121000/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/data/empty/something-else/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/empty/something-else/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/data/hidden/output/.another/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/hidden/output/.another/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/data/hidden/output/.hidden: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/hidden/output/.hidden -------------------------------------------------------------------------------- /tests/unit/query/data/hidden/output/20240812-120000/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/hidden/output/20240812-120000/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/data/hidden/output/20240812-121000/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/hidden/output/20240812-121000/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/data/non-numeric/output/20240812-120000/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/non-numeric/output/20240812-120000/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/data/non-numeric/output/20240812-121000/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/non-numeric/output/20240812-121000/empty.txt -------------------------------------------------------------------------------- 
/tests/unit/query/data/non-numeric/output/something-else/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/unit/query/data/non-numeric/output/something-else/empty.txt -------------------------------------------------------------------------------- /tests/unit/query/input/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/query/input/retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/unit/utils/test_embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | import pytest 5 | 6 | from graphrag.config.embeddings import create_collection_name 7 | 8 | 9 | def test_create_collection_name(): 10 | collection = create_collection_name("default", "entity.title") 11 | assert collection == "default-entity-title" 12 | 13 | 14 | def test_create_collection_name_invalid_embedding_throws(): 15 | with pytest.raises(KeyError): 16 | create_collection_name("default", "invalid.name") 17 | 18 | 19 | def test_create_collection_name_invalid_embedding_does_not_throw(): 20 | collection = create_collection_name("default", "invalid.name", validate=False) 21 | assert collection == "default-invalid-name" 22 | -------------------------------------------------------------------------------- /tests/verbs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
2 | # Licensed under the MIT License 3 | -------------------------------------------------------------------------------- /tests/verbs/data/communities.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/communities.parquet -------------------------------------------------------------------------------- /tests/verbs/data/community_reports.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/community_reports.parquet -------------------------------------------------------------------------------- /tests/verbs/data/covariates.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/covariates.parquet -------------------------------------------------------------------------------- /tests/verbs/data/documents.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/documents.parquet -------------------------------------------------------------------------------- /tests/verbs/data/entities.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/entities.parquet -------------------------------------------------------------------------------- /tests/verbs/data/relationships.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/relationships.parquet -------------------------------------------------------------------------------- /tests/verbs/data/text_units.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/text_units.parquet -------------------------------------------------------------------------------- /tests/verbs/data/text_units_metadata.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/text_units_metadata.parquet -------------------------------------------------------------------------------- /tests/verbs/data/text_units_metadata_included_chunk.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/tests/verbs/data/text_units_metadata_included_chunk.parquet -------------------------------------------------------------------------------- /tests/verbs/test_create_communities.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
/tests/verbs/test_create_communities.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | from graphrag.config.create_graphrag_config import create_graphrag_config
5 | from graphrag.data_model.schemas import COMMUNITIES_FINAL_COLUMNS
6 | from graphrag.index.workflows.create_communities import (
7 |     run_workflow,
8 | )
9 | from graphrag.utils.storage import load_table_from_storage
10 | 
11 | from .util import (
12 |     DEFAULT_MODEL_CONFIG,
13 |     compare_outputs,
14 |     create_test_context,
15 |     load_test_table,
16 | )
17 | 
18 | 
19 | async def test_create_communities():
20 |     expected = load_test_table("communities")
21 | 
22 |     context = await create_test_context(
23 |         storage=[
24 |             "entities",
25 |             "relationships",
26 |         ],
27 |     )
28 | 
29 |     config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
30 | 
31 |     await run_workflow(
32 |         config,
33 |         context,
34 |     )
35 | 
36 |     actual = await load_table_from_storage("communities", context.storage)
37 | 
38 |     columns = list(expected.columns.values)
39 |     # don't compare period since it is created with the current date each time
40 |     columns.remove("period")
41 |     compare_outputs(
42 |         actual,
43 |         expected,
44 |         columns=columns,
45 |     )
46 | 
47 |     for column in COMMUNITIES_FINAL_COLUMNS:
48 |         assert column in actual.columns
49 | 
--------------------------------------------------------------------------------
/tests/verbs/test_create_final_text_units.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | from graphrag.config.create_graphrag_config import create_graphrag_config
5 | from graphrag.data_model.schemas import TEXT_UNITS_FINAL_COLUMNS
6 | from graphrag.index.workflows.create_final_text_units import (
7 |     run_workflow,
8 | )
9 | from graphrag.utils.storage import load_table_from_storage
10 | 
11 | from .util import (
12 |     DEFAULT_MODEL_CONFIG,
13 |     compare_outputs,
14 |     create_test_context,
15 |     load_test_table,
16 | )
17 | 
18 | 
19 | async def test_create_final_text_units():
20 |     expected = load_test_table("text_units")
21 | 
22 |     context = await create_test_context(
23 |         storage=[
24 |             "text_units",
25 |             "entities",
26 |             "relationships",
27 |             "covariates",
28 |         ],
29 |     )
30 | 
31 |     config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
32 |     config.extract_claims.enabled = True
33 | 
34 |     await run_workflow(config, context)
35 | 
36 |     actual = await load_table_from_storage("text_units", context.storage)
37 | 
38 |     for column in TEXT_UNITS_FINAL_COLUMNS:
39 |         assert column in actual.columns
40 | 
41 |     compare_outputs(actual, expected)
42 | 
--------------------------------------------------------------------------------
/tests/verbs/test_extract_graph_nlp.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | from graphrag.config.create_graphrag_config import create_graphrag_config
5 | from graphrag.index.workflows.extract_graph_nlp import (
6 |     run_workflow,
7 | )
8 | from graphrag.utils.storage import load_table_from_storage
9 | 
10 | from .util import (
11 |     DEFAULT_MODEL_CONFIG,
12 |     create_test_context,
13 | )
14 | 
15 | 
16 | async def test_extract_graph_nlp():
17 |     context = await create_test_context(
18 |         storage=["text_units"],
19 |     )
20 | 
21 |     config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
22 | 
23 |     await run_workflow(config, context)
24 | 
25 |     nodes_actual = await load_table_from_storage("entities", context.storage)
26 |     edges_actual = await load_table_from_storage("relationships", context.storage)
27 | 
28 |     # this will be the raw count of entities and edges with no pruning
29 |     # with NLP it is deterministic, so we can assert exact row counts
30 |     assert len(nodes_actual) == 1148
31 |     assert len(nodes_actual.columns) == 5
32 |     assert len(edges_actual) == 29445
33 |     assert len(edges_actual.columns) == 5
34 | 
--------------------------------------------------------------------------------
/tests/verbs/test_prune_graph.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | from graphrag.config.create_graphrag_config import create_graphrag_config
5 | from graphrag.config.models.prune_graph_config import PruneGraphConfig
6 | from graphrag.index.workflows.prune_graph import (
7 |     run_workflow,
8 | )
9 | from graphrag.utils.storage import load_table_from_storage
10 | 
11 | from .util import (
12 |     DEFAULT_MODEL_CONFIG,
13 |     create_test_context,
14 | )
15 | 
16 | 
17 | async def test_prune_graph():
18 |     context = await create_test_context(
19 |         storage=["entities", "relationships"],
20 |     )
21 | 
22 |     config = create_graphrag_config({"models": DEFAULT_MODEL_CONFIG})
23 |     config.prune_graph = PruneGraphConfig(
24 |         min_node_freq=4, min_node_degree=0, min_edge_weight_pct=0
25 |     )
26 | 
27 |     await run_workflow(config, context)
28 | 
29 |     nodes_actual = await load_table_from_storage("entities", context.storage)
30 | 
31 |     assert len(nodes_actual) == 21
32 | 
--------------------------------------------------------------------------------
/unified-search-app/.vsts-ci.yml:
--------------------------------------------------------------------------------
1 | name: unified-search-app
2 | pool:
3 |   vmImage: ubuntu-latest
4 | 
5 | trigger:
6 |   batch: true
7 |   branches:
8 |     include:
9 |       - main
10 |   paths:
11 |     include:
12 |       - unified-search-app
13 | 
14 | 
15 | stages:
16 |   - stage: Build_deploy
17 |     dependsOn: []
18 |     jobs:
19 |       - job: build
20 |         displayName: Build and deploy
21 |         pool:
22 |           vmImage: ubuntu-latest
23 |         steps:
24 |           - task: UsePythonVersion@0
25 |             inputs:
26 |               versionSpec: "3.11"
27 |             displayName: "Use Python 3.11"
28 | 
29 |           - task: Docker@2
30 |             inputs:
31 |               containerRegistry: '$(containerRegistry)'
32 |               repository: 'main'
33 |               command: 'buildAndPush'
34 |               Dockerfile: 'unified-search-app/Dockerfile'
35 |               tags: 'latest'
36 |           - task: AzureAppServiceManage@0
37 |             inputs:
38 |               azureSubscription: '$(subscription)'
39 |               Action: 'Restart Azure App Service'
40 |               WebAppName: '$(webApp)'
--------------------------------------------------------------------------------
/unified-search-app/Dockerfile:
--------------------------------------------------------------------------------
1 | 
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Dockerfile
4 | # https://eng.ms/docs/more/containers-secure-supply-chain/approved-images
5 | FROM mcr.microsoft.com/oryx/python:3.11
6 | 
7 | RUN curl -fsSL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor -o /usr/share/keyrings/microsoft-prod.gpg
8 | RUN apt-get update -y
9 | 
10 | # Install dependencies
11 | WORKDIR ./
12 | COPY . .
13 | RUN curl -sSL https://install.python-poetry.org | python -
14 | ENV PATH="${PATH}:/root/.local/bin"
15 | RUN poetry config virtualenvs.in-project true
16 | RUN poetry install --no-root
17 | 
18 | # Run application
19 | EXPOSE 8501
20 | ENTRYPOINT ["poetry","run","streamlit", "run", "./app/home_page.py"]
--------------------------------------------------------------------------------
/unified-search-app/app/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """App module."""
5 | 
--------------------------------------------------------------------------------
/unified-search-app/app/data_config.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """Data config module."""
5 | 
6 | # This file is used to store configurations for the graph-indexed data and the LLM/embeddings models used in the app.
7 | 
8 | # name of the table in the graph-indexed data where the communities are stored
9 | communities_table = "output/communities"
10 | 
11 | # name of the table in the graph-indexed data where the community reports are stored
12 | community_report_table = "output/community_reports"
13 | 
14 | # name of the table in the graph-indexed data where the entity embeddings are stored
15 | entity_table = "output/entities"
16 | 
17 | # name of the table in the graph-indexed data where the entity relationships are stored
18 | relationship_table = "output/relationships"
19 | 
20 | # name of the table in the graph-indexed data where the entity covariates are stored
21 | covariate_table = "output/covariates"
22 | 
23 | # name of the table in the graph-indexed data where the text units are stored
24 | text_unit_table = "output/text_units"
25 | 
26 | # default number of suggested questions to generate for the user, used in all search types
27 | # this should be adjusted based on the token limits of the LLM model being used
28 | # The following setting is for gpt-4-1106-preview (i.e. gpt-4-turbo)
29 | # For gpt-4 (token-limit = 8k), a smaller value could be used:
30 | default_suggested_questions = 5
31 | 
32 | # default timeout for streamlit cache
33 | default_ttl = 60 * 60 * 24 * 7
--------------------------------------------------------------------------------
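data_config.py only declares where the graph-indexed tables live; the app's knowledge_loader package does the actual reading (including from blob storage). As a rough local-files sketch of how those table names could be resolved into dataframes (load_graph_tables and the .parquet suffix handling are assumptions for illustration, not the app's loader):

import pandas as pd

import data_config


def load_graph_tables(data_root: str) -> dict[str, pd.DataFrame]:
    """Read the graph-indexed parquet tables referenced in data_config from a local folder."""
    tables = {
        "communities": data_config.communities_table,
        "community_reports": data_config.community_report_table,
        "entities": data_config.entity_table,
        "relationships": data_config.relationship_table,
        "covariates": data_config.covariate_table,
        "text_units": data_config.text_unit_table,
    }
    return {
        name: pd.read_parquet(f"{data_root}/{path}.parquet")
        for name, path in tables.items()
    }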
/unified-search-app/app/knowledge_loader/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """Knowledge loader module."""
5 | 
--------------------------------------------------------------------------------
/unified-search-app/app/knowledge_loader/data_sources/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """Data sources module."""
5 | 
--------------------------------------------------------------------------------
/unified-search-app/app/knowledge_loader/data_sources/default.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """Data sources default module."""
5 | 
6 | import os
7 | 
8 | container_name = "data"
9 | blob_container_name = os.getenv("BLOB_CONTAINER_NAME", container_name)
10 | blob_account_name = os.getenv("BLOB_ACCOUNT_NAME")
11 | 
12 | local_data_root = os.getenv("DATA_ROOT")
13 | 
14 | LISTING_FILE = "listing.json"
15 | 
16 | if local_data_root is None and blob_account_name is None:
17 |     error_message = (
18 |         "Either DATA_ROOT or BLOB_ACCOUNT_NAME environment variable must be set."
19 |     )
20 |     raise ValueError(error_message)
--------------------------------------------------------------------------------
/unified-search-app/app/rag/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """Rag module."""
5 | 
--------------------------------------------------------------------------------
/unified-search-app/app/rag/typing.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """Typing module."""
5 | 
6 | from dataclasses import dataclass
7 | from enum import Enum
8 | 
9 | import pandas as pd
10 | 
11 | 
12 | class SearchType(Enum):
13 |     """SearchType class definition."""
14 | 
15 |     Basic = "basic"
16 |     Local = "local"
17 |     Global = "global"
18 |     Drift = "drift"
19 | 
20 | 
21 | @dataclass
22 | class SearchResult:
23 |     """SearchResult class definition."""
24 | 
25 |     # create a dataclass to store the search result of each algorithm
26 |     search_type: SearchType
27 |     response: str
28 |     context: dict[str, pd.DataFrame]
--------------------------------------------------------------------------------
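SearchResult is the common envelope each search algorithm hands back to the UI: which engine produced the answer, the response text, and the context tables that grounded it. A toy construction (the response string and context table are made-up sample values):

import pandas as pd

from rag.typing import SearchResult, SearchType

result = SearchResult(
    search_type=SearchType.Local,
    response="Operation Dulce is the fictional mission described in the sample dataset.",
    context={"entities": pd.DataFrame({"title": ["Operation Dulce"], "degree": [12]})},
)
print(result.search_type.value)  # "local"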
18 | """ 19 | 20 | def __init__(self, key: str, default: Any | None): 21 | """Init method definition.""" 22 | self._key = key 23 | val = st.query_params[key].lower() if key in st.query_params else default 24 | if val == "true": 25 | val = True 26 | elif val == "false": 27 | val = False 28 | if key not in st.session_state: 29 | st.session_state[key] = val 30 | 31 | @property 32 | def key(self) -> str: 33 | """Key property definition.""" 34 | return self._key 35 | 36 | @property 37 | def value(self) -> Any: 38 | """Value property definition.""" 39 | return st.session_state[self._key] 40 | 41 | @value.setter 42 | def value(self, value: Any) -> None: 43 | """Value setter definition.""" 44 | st.session_state[self._key] = value 45 | st.query_params[self._key] = f"{value}".lower() 46 | -------------------------------------------------------------------------------- /unified-search-app/app/ui/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """App UI module.""" 5 | -------------------------------------------------------------------------------- /unified-search-app/app/ui/questions_list.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 2 | # Licensed under the MIT License 3 | 4 | """Question list module.""" 5 | 6 | import streamlit as st 7 | from state.session_variables import SessionVariables 8 | 9 | 10 | def create_questions_list_ui(sv: SessionVariables): 11 | """Return question list UI component.""" 12 | selection = st.dataframe( 13 | sv.generated_questions.value, 14 | use_container_width=True, 15 | hide_index=True, 16 | selection_mode="single-row", 17 | column_config={"value": "question"}, 18 | on_select="rerun", 19 | ) 20 | rows = selection.selection.rows 21 | if len(rows) > 0: 22 | question_index = selection.selection.rows[0] 23 | sv.selected_question.value = sv.generated_questions.value[question_index] 24 | -------------------------------------------------------------------------------- /unified-search-app/app/ui/report_list.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Microsoft Corporation. 
/unified-search-app/app/ui/report_list.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Microsoft Corporation.
2 | # Licensed under the MIT License
3 | 
4 | """Report list module."""
5 | 
6 | import streamlit as st
7 | from state.session_variables import SessionVariables
8 | 
9 | 
10 | def create_report_list_ui(sv: SessionVariables):
11 |     """Return report list UI component."""
12 |     selection = st.dataframe(
13 |         sv.community_reports.value,
14 |         height=1000,
15 |         hide_index=True,
16 |         column_order=["id", "title"],
17 |         selection_mode="single-row",
18 |         on_select="rerun",
19 |     )
20 |     rows = selection.selection.rows
21 |     if len(rows) > 0:
22 |         report_index = selection.selection.rows[0]
23 |         sv.selected_report.value = sv.community_reports.value.iloc[report_index]
24 |     else:
25 |         sv.selected_report.value = None
--------------------------------------------------------------------------------
/unified-search-app/images/image-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/unified-search-app/images/image-1.png
--------------------------------------------------------------------------------
/unified-search-app/images/image-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/unified-search-app/images/image-2.png
--------------------------------------------------------------------------------
/unified-search-app/images/image-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/unified-search-app/images/image-3.png
--------------------------------------------------------------------------------
/unified-search-app/images/image-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/graphrag/17e431cf42ac4969199c736547a0cc6840e49f2c/unified-search-app/images/image-4.png
--------------------------------------------------------------------------------
/unified-search-app/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "unified-copilot"
3 | version = "1.0.0"
4 | description = ""
5 | authors = ["GraphRAG team"]
6 | readme = "README.md"
7 | package-mode = false
8 | 
9 | [tool.poetry.dependencies]
10 | python = ">=3.10,<3.12"
11 | streamlit = "1.43.0"
12 | azure-search-documents = "^11.4.0"
13 | azure-storage-blob = "^12.20.0"
14 | azure-identity = "^1.16.0"
15 | graphrag = "2.0.0"
16 | altair = "^5.3.0"
17 | streamlit-agraph = "^0.0.45"
18 | st-tabs = "^0.1.1"
19 | spacy = ">=3.8.4,<4.0.0"
20 | 
21 | [tool.poetry.group.dev.dependencies]
22 | poethepoet = "^0.26.1"
23 | ipykernel = "^6.29.4"
24 | pyright = "^1.1.349"
25 | ruff = "^0.4.7"
26 | 
27 | [build-system]
28 | requires = ["poetry-core"]
29 | build-backend = "poetry.core.masonry.api"
30 | 
31 | [tool.poe.tasks]
32 | start = "streamlit run app/home_page.py"
33 | start_prod = "streamlit run app/home_page.py --server.port=8501 --server.address=0.0.0.0"
34 | 
35 | [tool.pyright]
36 | include = ["app"]
37 | exclude = ["**/node_modules", "**/__pycache__"]
38 | 
--------------------------------------------------------------------------------