├── .azdo └── azure-pipeline.yaml ├── .devcontainer ├── Dockerfile ├── devcontainer.json └── entrypoint.sh ├── .dockerignore ├── .editorconfig ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── dependabot.yml └── workflows │ ├── dev.yaml │ └── tests.yaml ├── .pre-commit-config.yaml ├── .secrets.baseline ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── TRANSPARENCY.md ├── backend ├── .coveragerc ├── README.md ├── graphrag_app │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── data.py │ │ ├── graph.py │ │ ├── index.py │ │ ├── prompt_tuning.py │ │ ├── query.py │ │ ├── query_streaming.py │ │ └── source.py │ ├── logger │ │ ├── __init__.py │ │ ├── application_insights_workflow_callbacks.py │ │ ├── blob_workflow_callbacks.py │ │ ├── console_workflow_callbacks.py │ │ ├── load_logger.py │ │ ├── pipeline_job_updater.py │ │ └── typing.py │ ├── main.py │ ├── typing │ │ ├── __init__.py │ │ ├── models.py │ │ └── pipeline.py │ └── utils │ │ ├── __init__.py │ │ ├── azure_clients.py │ │ ├── common.py │ │ └── pipeline.py ├── manifests │ ├── cronjob.yaml │ └── job.yaml ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── scripts │ ├── indexer.py │ ├── job-scheduler.py │ └── settings.yaml └── tests │ ├── __init__.py │ ├── conftest.py │ ├── data │ └── synthetic-dataset │ │ ├── ABOUT.md │ │ └── output │ │ ├── create_final_communities.parquet │ │ ├── create_final_community_reports.parquet │ │ ├── create_final_covariates.parquet │ │ ├── create_final_documents.parquet │ │ ├── create_final_entities.parquet │ │ ├── create_final_nodes.parquet │ │ ├── create_final_relationships.parquet │ │ ├── create_final_text_units.parquet │ │ ├── graph.graphml │ │ └── stats.json │ ├── integration │ ├── test_api_data.py │ ├── test_api_default.py │ ├── test_api_graph.py │ ├── test_api_index.py │ ├── test_api_index_configuration.py │ ├── test_api_prompt_tuning.py │ ├── test_api_source.py │ └── test_utils_pipeline.py │ └── 
unit │ ├── test_azure_clients.py │ ├── test_common.py │ ├── test_load_logger.py │ ├── test_logger_app_insights_callbacks.py │ ├── test_logger_blob_callbacks.py │ └── test_logger_console_callbacks.py ├── docker ├── Dockerfile-backend └── Dockerfile-frontend ├── docs ├── DEPLOYMENT-GUIDE.md ├── DEVELOPMENT-GUIDE.md └── assets │ ├── graphrag-architecture-diagram.png │ └── graphrag-architecture-diagram.vsdx ├── frontend ├── .streamlit │ └── config.toml ├── README.md ├── app.py ├── deploy.sh ├── frontend_deploy.parameters.json ├── poetry.lock ├── pyproject.toml ├── src │ ├── __init__.py │ ├── components │ │ ├── __init__.py │ │ ├── index_pipeline.py │ │ ├── login_sidebar.py │ │ ├── prompt_configuration.py │ │ ├── query.py │ │ ├── tabs.py │ │ └── upload_files_component.py │ ├── enums.py │ ├── functions.py │ └── graphrag_api.py └── style.css ├── infra ├── abbreviations.json ├── core │ ├── acr │ │ └── acr.bicep │ ├── ai-search │ │ └── ai-search.bicep │ ├── aks │ │ └── aks.bicep │ ├── aoai │ │ └── aoai.bicep │ ├── apim │ │ ├── apim.bicep │ │ ├── apim.graphrag-api.bicep │ │ ├── apim.graphrag-docs-api.bicep │ │ ├── openapi.json │ │ └── policies │ │ │ └── apiPolicy.xml │ ├── cosmosdb │ │ └── cosmosdb.bicep │ ├── identity │ │ └── identity.bicep │ ├── log-analytics │ │ └── log.bicep │ ├── monitor │ │ ├── app-insights.bicep │ │ └── private-link-scope.bicep │ ├── rbac │ │ ├── aks-rbac.bicep │ │ ├── aoai-rbac.bicep │ │ └── workload-identity-rbac.bicep │ ├── scripts │ │ └── deployment-script.bicep │ ├── storage │ │ └── storage.bicep │ └── vnet │ │ ├── nsg.bicep │ │ ├── private-dns-vnet-link.bicep │ │ ├── private-dns-zone-a-record.bicep │ │ ├── private-dns-zone-groups.json │ │ ├── private-dns-zone.bicep │ │ ├── private-endpoint.bicep │ │ ├── privatelink-private-dns-zones.bicep │ │ ├── vnet-dns-link.bicep │ │ └── vnet.bicep ├── deploy.parameters.json ├── deploy.sh ├── helm │ ├── README.md │ └── graphrag │ │ ├── .helmignore │ │ ├── Chart.yaml │ │ ├── LICENSE │ │ ├── templates │ │ ├── 
NOTES.txt │ │ ├── _helpers.tpl │ │ ├── graphrag-clusterrole.yaml │ │ ├── graphrag-configmap.yaml │ │ ├── graphrag-ingress.yaml │ │ ├── graphrag-master-deployment.yaml │ │ ├── graphrag-master-hpa.yaml │ │ ├── graphrag-master-service.yaml │ │ ├── graphrag-nginx-internal-controller.yaml │ │ ├── graphrag-rolebinding.yaml │ │ ├── graphrag-serviceaccount.yaml │ │ └── tests │ │ │ └── test-connection.yaml │ │ └── values.yaml ├── main.bicep └── managed-app │ ├── README.md │ ├── createUiDefinition.json │ ├── mainTemplate.json │ ├── scripts │ └── install-graphrag.sh │ └── viewDefinition.json ├── notebooks ├── 1-Quickstart.ipynb ├── 2-Advanced_Getting_Started.ipynb ├── README.md └── get-wiki-articles.py └── openapi.json /.azdo/azure-pipeline.yaml: -------------------------------------------------------------------------------- 1 | trigger: 2 | - main 3 | 4 | # the `resources` specify the location and version of the 1ES Pipeline Template (PT). 5 | resources: 6 | repositories: 7 | - repository: 1ESPipelineTemplates 8 | type: git 9 | name: 1ESPipelineTemplates/1ESPipelineTemplates 10 | ref: refs/tags/release 11 | 12 | extends: 13 | # this pipeline extends an existing 1ES PT which injects various SDL and compliance tasks 14 | template: v1/1ES.Official.PipelineTemplate.yml@1ESPipelineTemplates 15 | parameters: 16 | pool: 17 | name: OCTO1ES_HostedPool 18 | image: SMTOCTO1ESAgentWindowsVM 19 | os: windows 20 | sdl: 21 | skipComponentGovernanceDetection: false 22 | policheck: 23 | enabled: true 24 | sourceAnalysisPool: 25 | name: OCTO1ES_HostedPool 26 | image: SMTOCTO1ESAgentWindowsVM 27 | os: windows 28 | stages: 29 | - stage: Component_Governance 30 | jobs: 31 | - job: CG_Prep 32 | steps: 33 | # Component Governance (CG) does not support pyproject.toml yet. 34 | # For this reason, we export dependencies into a requirements.txt file. 35 | # CG will auto-detect the requirements.txt file and use it to scan for dependencies. 
36 | - script: | 37 | pip install poetry poetry-plugin-export 38 | poetry export --directory=backend --format=requirements.txt --without-hashes --without-urls --all-extras --all-groups -o requirements.txt 39 | displayName: "Export python dependencies to requirements.txt for CG" 40 | -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # For more information about the base image visit: 2 | # https://mcr.microsoft.com/en-us/artifact/mar/devcontainers/python/about 3 | FROM mcr.microsoft.com/devcontainers/python:3.10-bookworm 4 | 5 | # disable common warning messages 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | ENV PIP_ROOT_USER_ACTION=ignore 8 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 9 | 10 | # configure environment 11 | ARG ENVNAME="GraphRAG" 12 | ARG USERNAME=vscode 13 | ARG WORKDIR=/${ENVNAME} 14 | 15 | # install python, pip, git, and other required tools 16 | RUN apt-get update && apt-get install -y \ 17 | ca-certificates \ 18 | libicu-dev \ 19 | git \ 20 | curl \ 21 | sudo \ 22 | pre-commit \ 23 | wget \ 24 | jq \ 25 | apt-transport-https \ 26 | lsb-release \ 27 | gnupg \ 28 | software-properties-common 29 | # install Azure CLI 30 | RUN curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash 31 | # install bicep and kubectl 32 | RUN az bicep install && az aks install-cli 33 | # install helm 34 | RUN curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 \ 35 | && chmod 700 get_helm.sh \ 36 | && ./get_helm.sh \ 37 | && rm ./get_helm.sh 38 | # install yq 39 | RUN wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq \ 40 | && chmod +x /usr/bin/yq 41 | 42 | # install docker 43 | RUN curl -fsSL https://get.docker.com -o install-docker.sh \ 44 | && sh install-docker.sh \ 45 | && rm install-docker.sh 46 | 47 | # cleanup to keep the image size down 48 | RUN rm 
-rf /var/lib/apt/lists/* \ 49 | && apt-get purge -y --auto-remove \ 50 | && apt-get autoremove \ 51 | && apt-get clean 52 | 53 | # create poetry virtual environments inside the project directory (as a .venv folder) 54 | ENV POETRY_VIRTUALENVS_IN_PROJECT=true 55 | 56 | # a non-root user (vscode) already exists in the base image. Add it to the sudo group and the docker group 57 | RUN echo "${USERNAME}:${USERNAME}" | chpasswd \ 58 | && adduser ${USERNAME} sudo \ 59 | && adduser ${USERNAME} docker \ 60 | && echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers 61 | 62 | # switch to non-root user 63 | USER ${USERNAME} 64 | 65 | # install poetry 66 | RUN curl -sSL https://install.python-poetry.org | python3 - 67 | 68 | # add the local bin to the PATH for the non-root user 69 | ENV PATH="/home/${USERNAME}/.local/bin:${PATH}" 70 | # Add venv to beginning of path so we don't have to activate it 71 | ENV PATH=/graphrag-accelerator/.venv/bin:$PATH 72 | 73 | # copy the project files into the container and set ownership 74 | COPY --chown=${USERNAME}:${USERNAME} . 
${WORKDIR} 75 | 76 | COPY entrypoint.sh /usr/local/bin/entrypoint.sh 77 | 78 | # Create directories for vscode server and extensions 79 | RUN mkdir -p ~/.vscode-server/extensions \ 80 | && chown -R $USERNAME:$USERNAME ~/.vscode-server 81 | 82 | ENTRYPOINT [ "/usr/local/bin/entrypoint.sh" ] 83 | CMD ["bash"] 84 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "graphrag-accelerator", 3 | "build": { 4 | "dockerfile": "./Dockerfile", 5 | "args": { 6 | "DOCKER_GROUP_ID": "${localEnv:DOCKER_GROUP_ID}" 7 | } 8 | }, 9 | "forwardPorts": [ 7071 ], 10 | "runArgs": [ 11 | "--network", "host" // use host networking so that the dev container can access the API when running the container locally 12 | ], 13 | "remoteUser": "vscode", 14 | "remoteEnv": { 15 | // We add the .venv to the beginning of the path env in the Dockerfile 16 | // so that we use the proper python, however vscode rewrites/overwrites 17 | // the PATH in the image and puts /usr/local/bin in front of our .venv 18 | // path. This fixes that issue. 
19 | "PATH": "${containerEnv:PATH}", 20 | // Add src folder to PYTHONPATH so that we can import modules that 21 | // are in the source dir 22 | "PYTHONPATH": "/graphrag-accelerator/backend/:$PATH" 23 | // disable SSL verification for Azure CLI if working in CodeSpaces 24 | // "AZURE_CLI_DISABLE_CONNECTION_VERIFICATION": "1" 25 | }, 26 | "mounts": [ 27 | // NOTE: we reference both HOME and USERPROFILE environment variables to simultaneously support both Windows and Unix environments 28 | // in most default situations, only one variable will exist (Windows has USERPROFILE and unix has HOME) and a reference to the other variable will result in an empty string 29 | // Keep command history 30 | "type=volume,source=graphrag-bashhistory,target=/home/vscode/command_history", 31 | "type=volume,source=graphrag-devcontainer-vscode-server,target=/home/vscode/.vscode-server/extensions", 32 | // Mounts the login details from the host machine so azcli works seamlessly in the container 33 | // "type=bind,source=${localEnv:HOME}${localEnv:USERPROFILE}/.azure,target=/home/vscode/.azure", 34 | // Mounts the ssh details from the host machine - this allows the container to connect to ssh hosts 35 | "type=bind,source=${localEnv:HOME}${localEnv:USERPROFILE}/.ssh,target=/home/vscode/.ssh", 36 | // Mount docker socket for docker builds 37 | "type=bind,source=/var/run/docker.sock,target=/var/run/docker.sock" 38 | ], 39 | "customizations": { 40 | "vscode": { 41 | // Set *default* container specific settings.json values on container create. 
42 | "settings": { 43 | "python.pythonPath": "/graphrag-accelerator/.venv/bin/python", 44 | "python.defaultInterpreterPath": "/graphrag-accelerator/.venv/bin/python", 45 | "python.languageServer": "Pylance", 46 | "files.watcherExclude": { 47 | "**/.git/objects/**": true, 48 | "**/.git/subtree-cache/**": true, 49 | "**/node_modules/*/**": true, 50 | "**/.python_packages/*/**": true 51 | }, 52 | "files.associations": { 53 | "*.workbook": "[jsonc]" 54 | }, 55 | "ruff.interpreter": [ 56 | "/graphrag-accelerator/.venv/bin/python" 57 | ], 58 | "ruff.lint.args": [ 59 | "--config", 60 | "/graphrag-accelerator/pyproject.toml" 61 | ], 62 | "ruff.lint.run": "onType" 63 | }, 64 | // Add the IDs of extensions you want installed when the container is created. 65 | "extensions": [ 66 | "donjayamanne.githistory", 67 | "codezombiech.gitignore", 68 | "GitHub.copilot", 69 | "GitHub.copilot-chat", 70 | "ms-azuretools.vscode-docker", 71 | "ms-azuretools.vscode-bicep", 72 | "ms-dotnettools.vscode-dotnet-runtime", 73 | "ms-kubernetes-tools.vscode-kubernetes-tools", 74 | "ms-python.python", 75 | "ms-python.vscode-pylance", 76 | "ms-toolsai.datawrangler", 77 | "ms-toolsai.jupyter", 78 | "ms-toolsai.jupyter-keymap", 79 | "ms-toolsai.vscode-jupyter-cell-tags", 80 | "ms-toolsai.vscode-jupyter-slideshow", 81 | "ziyasal.vscode-open-in-github", 82 | "charliermarsh.ruff" 83 | ] 84 | } 85 | }, 86 | "postCreateCommand": "bash /usr/local/bin/entrypoint.sh", 87 | "workspaceMount": "source=${localWorkspaceFolder},target=/graphrag-accelerator,type=bind,consistency=cached", 88 | "workspaceFolder": "/graphrag-accelerator" 89 | } -------------------------------------------------------------------------------- /.devcontainer/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################ 4 | ### Docker configuration ### 5 | ################################ 6 | sudo chmod 666 /var/run/docker.sock 7 | 8 | 
################################ 9 | ### Dependency configuration ### 10 | ################################ 11 | 12 | # Install graphrag dependencies 13 | ROOT_DIR=/graphrag-accelerator 14 | cd ${ROOT_DIR} 15 | poetry install --no-interaction -v --directory ${ROOT_DIR}/backend 16 | 17 | ######################### 18 | ### Git configuration ### 19 | ######################### 20 | git config --global --add safe.directory ${ROOT_DIR} 21 | pre-commit install 22 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | notebooks 2 | 3 | # Environments 4 | .env 5 | env/ 6 | ENV/ 7 | **/env.bak 8 | **/venv.bak 9 | **/.venv 10 | **/venv 11 | 12 | .github 13 | .git 14 | **/__pycache__ 15 | *.pyc 16 | *.pyo 17 | *.pyd 18 | **/.pytest_cache 19 | **/.ruff_cache 20 | **/.DS_Store -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | insert_final_newline = true 6 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. Unless a later match takes precedence, 3 | # @graphrag-core-team and @societal-resilience-graphrag will be requested for 4 | # review when someone opens a pull request. 
5 | * @Azure-Samples/graphrag-core-team @Azure-Samples/societal-resilience-graphrag 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Version [e.g. 22] 29 | 30 | **Additional context** 31 | Add any other context about the problem here. 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" 9 | directory: "/backend" 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/dev.yaml: -------------------------------------------------------------------------------- 1 | name: Dev Build 2 | on: 3 | workflow_dispatch: # triggered manually via the GitHub UI 4 | pull_request: # triggered when a PR is created or updated 5 | types: 6 | - opened 7 | - reopened 8 | - synchronize 9 | - ready_for_review 10 | paths: # only trigger on changes in specific directories 11 | - '.github/**/*.yaml' 12 | - 'backend/**' 13 | - 'docker/**' 14 | - 'infra/**' 15 | - 'poetry.lock' 16 | - 'pyproject.toml' 17 | jobs: 18 | lint-check: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v4 23 | - name: Setup python 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: '3.10' 27 | - run: pip install ruff 28 | - run: | 29 | ruff check 30 | build-devcontainer: 31 | needs: [lint-check] 32 | runs-on: ubuntu-latest 33 | if: ${{ !github.event.pull_request.draft }} 34 | steps: 35 | - name: Checkout repository 36 | uses: actions/checkout@v4 37 | - name: Build docker image 38 | uses: docker/build-push-action@v2 39 | with: 40 | context: .devcontainer 41 | push: false 42 | build-backend: 43 | needs: [lint-check] 44 | runs-on: ubuntu-latest 45 | 
if: ${{ !github.event.pull_request.draft }} 46 | steps: 47 | - name: Checkout repository 48 | uses: actions/checkout@v4 49 | - name: Build docker image 50 | uses: docker/build-push-action@v2 51 | with: 52 | context: . 53 | file: docker/Dockerfile-backend 54 | push: false 55 | build-frontend: 56 | needs: [lint-check] 57 | runs-on: ubuntu-latest 58 | if: ${{ !github.event.pull_request.draft }} 59 | steps: 60 | - name: Checkout repository 61 | uses: actions/checkout@v4 62 | - name: Build docker image 63 | uses: docker/build-push-action@v2 64 | with: 65 | context: . 66 | file: docker/Dockerfile-frontend 67 | push: false 68 | -------------------------------------------------------------------------------- /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Testing 2 | on: 3 | workflow_dispatch: # triggered manually via the GitHub UI 4 | pull_request: # triggered when a PR is created or updated 5 | types: 6 | - opened 7 | - reopened 8 | - synchronize 9 | - ready_for_review 10 | paths: # only trigger on changes in specific directories 11 | - '.github/**/*.yaml' 12 | - 'backend/**' 13 | - 'docker/**' 14 | - 'backend/poetry.lock' 15 | - 'backend/pyproject.toml' 16 | 17 | env: 18 | PYTHON_VERSION: '3.10' 19 | 20 | jobs: 21 | tests: 22 | runs-on: windows-latest 23 | steps: 24 | - name: Checkout repository 25 | uses: actions/checkout@v4 26 | 27 | - name: Install python 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: '3.10' 31 | 32 | - name: Install Azurite 33 | shell: bash 34 | run: | 35 | npm install -g azurite 36 | azurite --silent & 37 | 38 | # For more information on installation/setup of Azure Cosmos DB Emulator 39 | # https://learn.microsoft.com/en-us/azure/cosmos-db/how-to-develop-emulator?tabs=docker-linux%2Cpython&pivots=api-nosql 40 | # Note: the emulator is only available on Windows runners. 
It can take longer than the default timeout to start up, so we increase the timeout. 41 | # If a job fails due to timeout, restarting the CI/CD job usually resolves the problem. 42 | - name: Install Azure Cosmos DB emulator 43 | run: | 44 | Write-Host "Launching Cosmos DB Emulator" 45 | Import-Module "$env:ProgramFiles\Azure Cosmos DB Emulator\PSModules\Microsoft.Azure.CosmosDB.Emulator" 46 | Start-CosmosDbEmulator -Timeout 500 47 | 48 | - name: Install dependencies 49 | working-directory: ${{ github.workspace }}/backend 50 | run: | 51 | pip install poetry 52 | poetry config virtualenvs.create false 53 | poetry install --with test 54 | 55 | - name: Run pytests 56 | working-directory: ${{ github.workspace }}/backend 57 | run: | 58 | pytest --cov=graphrag_app --junitxml=test-results.xml tests/ 59 | 60 | - name: Upload test results 61 | uses: actions/upload-artifact@v4 62 | with: 63 | name: pytest-results 64 | path: ${{ github.workspace }}/backend/test-results.xml 65 | # Use always() to always run this step to publish test results when there are test failures 66 | if: ${{ always() }} 67 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: "tests/data" 2 | repos: 3 | - repo: https://github.com/kynan/nbstripout 4 | rev: 0.7.1 5 | hooks: 6 | - id: nbstripout 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v4.6.0 9 | hooks: 10 | - id: requirements-txt-fixer 11 | - id: mixed-line-ending 12 | - id: trailing-whitespace 13 | - id: check-json 14 | exclude: "devcontainer.json" 15 | - id: pretty-format-json 16 | args: 17 | - "--autofix" 18 | exclude: 'tests/|devcontainer.json|^.*\.ipynb$' 19 | - id: end-of-file-fixer 20 | files: \.(py|sh|bash|conf|yaml|yml|toml|ini)$ 21 | - repo: https://github.com/astral-sh/ruff-pre-commit 22 | rev: v0.4.6 23 | hooks: 24 | # Run the linter. 
25 | - id: ruff 26 | types_or: [ python, pyi, jupyter ] 27 | # Run the formatter. 28 | - id: ruff-format 29 | types_or: [ python, pyi, jupyter ] 30 | - repo: https://github.com/Yelp/detect-secrets 31 | rev: v1.5.0 32 | hooks: 33 | - id: detect-secrets 34 | args: ['--baseline', '.secrets.baseline'] 35 | -------------------------------------------------------------------------------- /.secrets.baseline: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1.5.0", 3 | "plugins_used": [ 4 | { 5 | "name": "ArtifactoryDetector" 6 | }, 7 | { 8 | "name": "AWSKeyDetector" 9 | }, 10 | { 11 | "name": "AzureStorageKeyDetector" 12 | }, 13 | { 14 | "name": "Base64HighEntropyString", 15 | "limit": 4.5 16 | }, 17 | { 18 | "name": "BasicAuthDetector" 19 | }, 20 | { 21 | "name": "CloudantDetector" 22 | }, 23 | { 24 | "name": "DiscordBotTokenDetector" 25 | }, 26 | { 27 | "name": "GitHubTokenDetector" 28 | }, 29 | { 30 | "name": "GitLabTokenDetector" 31 | }, 32 | { 33 | "name": "HexHighEntropyString", 34 | "limit": 3.0 35 | }, 36 | { 37 | "name": "IbmCloudIamDetector" 38 | }, 39 | { 40 | "name": "IbmCosHmacDetector" 41 | }, 42 | { 43 | "name": "IPPublicDetector" 44 | }, 45 | { 46 | "name": "JwtTokenDetector" 47 | }, 48 | { 49 | "name": "KeywordDetector", 50 | "keyword_exclude": "" 51 | }, 52 | { 53 | "name": "MailchimpDetector" 54 | }, 55 | { 56 | "name": "NpmDetector" 57 | }, 58 | { 59 | "name": "OpenAIDetector" 60 | }, 61 | { 62 | "name": "PrivateKeyDetector" 63 | }, 64 | { 65 | "name": "PypiTokenDetector" 66 | }, 67 | { 68 | "name": "SendGridDetector" 69 | }, 70 | { 71 | "name": "SlackDetector" 72 | }, 73 | { 74 | "name": "SoftlayerDetector" 75 | }, 76 | { 77 | "name": "SquareOAuthDetector" 78 | }, 79 | { 80 | "name": "StripeDetector" 81 | }, 82 | { 83 | "name": "TelegramBotTokenDetector" 84 | }, 85 | { 86 | "name": "TwilioKeyDetector" 87 | } 88 | ], 89 | "filters_used": [ 90 | { 91 | "path": 
"detect_secrets.filters.allowlist.is_line_allowlisted" 92 | }, 93 | { 94 | "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies", 95 | "min_level": 2 96 | }, 97 | { 98 | "path": "detect_secrets.filters.heuristic.is_indirect_reference" 99 | }, 100 | { 101 | "path": "detect_secrets.filters.heuristic.is_likely_id_string" 102 | }, 103 | { 104 | "path": "detect_secrets.filters.heuristic.is_lock_file" 105 | }, 106 | { 107 | "path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string" 108 | }, 109 | { 110 | "path": "detect_secrets.filters.heuristic.is_potential_uuid" 111 | }, 112 | { 113 | "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign" 114 | }, 115 | { 116 | "path": "detect_secrets.filters.heuristic.is_sequential_string" 117 | }, 118 | { 119 | "path": "detect_secrets.filters.heuristic.is_swagger_file" 120 | }, 121 | { 122 | "path": "detect_secrets.filters.heuristic.is_templated_secret" 123 | } 124 | ], 125 | "results": { 126 | "backend/pytest.ini": [ 127 | { 128 | "type": "Secret Keyword", 129 | "filename": "backend/pytest.ini", 130 | "hashed_secret": "589c2d30c725c063a05a59110ea5888a80a28f15", 131 | "is_verified": false, 132 | "line_number": 7 133 | }, 134 | { 135 | "type": "Azure Storage Account access key", 136 | "filename": "backend/pytest.ini", 137 | "hashed_secret": "7388811af1e10afcb96c331748597e7a75e27e7d", 138 | "is_verified": false, 139 | "line_number": 7 140 | }, 141 | { 142 | "type": "Secret Keyword", 143 | "filename": "backend/pytest.ini", 144 | "hashed_secret": "1655679f8bfda925b76ee655dfac4519d90d3431", 145 | "is_verified": false, 146 | "line_number": 8 147 | }, 148 | { 149 | "type": "Azure Storage Account access key", 150 | "filename": "backend/pytest.ini", 151 | "hashed_secret": "5666459779d6a76bea73453137803fd27d8f79cd", 152 | "is_verified": false, 153 | "line_number": 8 154 | } 155 | ] 156 | }, 157 | "generated_at": "2024-12-17T06:41:24Z" 158 | } 159 | 
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to GraphRAG Accelerator 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 6 | 7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 10 | 11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 
14 | 15 | - [Code of Conduct](#coc) 16 | - [Issues and Bugs](#issue) 17 | - [Feature Requests](#feature) 18 | - [Submission Guidelines](#submit) 19 | 20 | ## Code of Conduct 21 | Help us keep this project open and inclusive. Please read and follow our [Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 22 | 23 | ## Found an Issue? 24 | If you find a bug in the source code or a mistake in the documentation, you can help us by 25 | [submitting an issue](#submit-issue) to the GitHub Repository. Even better, you can 26 | [submit a Pull Request](#submit-pr) with a fix. 27 | 28 | ## Want a Feature? 29 | You can *request* a new feature by [submitting an issue](#submit-issue) to the GitHub 30 | Repository. If you would like to *implement* a new feature, please submit an issue with 31 | a proposal for your work first, to be sure that we can use it. 32 | 33 | * **Small Features** can be crafted and directly [submitted as a Pull Request](#submit-pr). 34 | 35 | ## Submission Guidelines 36 | 37 | ### Submitting an Issue 38 | Before you submit an issue, search the archive; maybe your question was already answered. 39 | 40 | If your issue appears to be a bug, and hasn't been reported, open a new issue. 41 | Help us to maximize the effort we can spend fixing issues and adding new 42 | features, by not reporting duplicate issues. Providing the following information will increase the 43 | chances of your issue being dealt with quickly: 44 | 45 | * **Overview of the Issue** - if an error is being thrown a non-minified stack trace helps 46 | * **Version** - what version is affected (e.g. 0.1.2) 47 | * **Motivation for or Use Case** - explain what you are trying to do and why the current behavior is a bug for you 48 | * **Browsers and Operating System** - is this a problem with all browsers? 49 | * **Reproduce the Error** - provide a live example or an unambiguous set of steps 50 | * **Related Issues** - has a similar issue been reported before? 
51 | * **Suggest a Fix** - if you can't fix the bug yourself, perhaps you can point to what might be 52 | causing the problem (line of code or commit) 53 | 54 | You can file new issues by providing the above information at the corresponding repository's issues link: https://github.com/Azure-Samples/graphrag-accelerator/issues/new. 55 | 56 | ### Submitting a Pull Request (PR) 57 | Before you submit your Pull Request (PR) consider the following guidelines: 58 | 59 | * Search the repository (https://github.com/Azure-Samples/graphrag-accelerator/pulls) for an open or closed PR 60 | that relates to your submission. You don't want to duplicate effort. 61 | 62 | * Make your changes in a new git fork: 63 | 64 | * Commit your changes using a descriptive commit message 65 | * Push your fork to GitHub: 66 | * In GitHub, create a pull request 67 | * If we suggest changes then: 68 | * Make the required updates. 69 | * Rebase your fork and force push to your GitHub repository (this will update your Pull Request): 70 | 71 | ```shell 72 | git rebase master -i 73 | git push -f 74 | ``` 75 | 76 | That's it! Thank you for your contribution! 77 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Microsoft Corporation. 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GraphRAG Accelerator 2 | 3 | ## ⚠️ ATTENTION 4 | This repository is no longer maintained. We sincerely appreciate the interest and all contributors to the Graphrag Solution Accelerator. 5 | 6 | 🚀 Future development and updates - Please visit the [graphrag library](https://github.com/microsoft/graphrag) for future updates and continued collaboration with the graphrag community at Microsoft. 7 | 8 | --- 9 | 10 | [![Open in Dev Containers](https://img.shields.io/static/v1?label=Dev%20Containers&message=Open&color=blue&logo=visualstudiocode)](https://vscode.dev/redirect?url=vscode://ms-vscode-remote.remote-containers/cloneInVolume?url=https://github.com/Azure-Samples/graphrag-accelerator) 11 | 12 | Welcome to the GraphRAG solution accelerator! This accelerator builds on top of the [graphrag](https://github.com/microsoft/graphrag) python package and exposes API endpoints hosted on Azure, which can be used to trigger indexing pipelines and enable querying of the graphrag knowledge graph. 13 | 14 | This repository presents a methodology for running a hosted service using knowledge graph memory structures to enhance LLM outputs. Please note that the provided code serves as a demonstration and is not an officially supported Microsoft offering. 
15 | 16 | ⚠️ Warning: The GraphRAG Accelerator uses multiple Azure services and may incur substantial costs. It is meant to host a high-utilization API with auto-scaling and user access control. Please see the [deployment bicep](infra/main.bicep) for further detail on the services used. 17 | 18 | ⚠️ Warning: GraphRAG indexing can be an expensive operation. Please read all documentation to understand the process and costs involved, and start with a small amount of data. 19 | 20 | For FAQ and our roadmap, please visit `aka.ms/graphrag` 21 | 22 | ![](docs/assets/graphrag-architecture-diagram.png) 23 | 24 | ## Getting Started with GraphRAG on Azure 25 | 26 | ### Deployment Guide 27 | To deploy the solution accelerator, see the [deployment guide](docs/DEPLOYMENT-GUIDE.md). This will result in a full deployment of graphrag as an API. 28 | Afterwards, check out the [Quickstart](notebooks/1-Quickstart.ipynb) notebook for a demonstration of various API calls. 29 | 30 | ## Development Guide 31 | Interested in contributing? Check out the [development guide](docs/DEVELOPMENT-GUIDE.md). 32 | 33 | ### How to file issues and get help 34 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing issues before filing new issues to avoid duplicates. For new issues, file your bug or feature request as a new Issue. 35 | 36 | ## Contributing 37 | 38 | This project welcomes contributions and suggestions. Most contributions require you to 39 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 40 | and actually do, grant us the rights to use your contribution. For details, visit 41 | https://cla.microsoft.com. 42 | 43 | When you submit a pull request, a CLA-bot will automatically determine whether you need 44 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the 45 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA. 
46 | 47 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 48 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 49 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 50 | 51 | # Trademarks 52 | 53 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft’s Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party’s policies. 54 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 
36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 40 | 41 | -------------------------------------------------------------------------------- /TRANSPARENCY.md: -------------------------------------------------------------------------------- 1 | # GraphRAG: Responsible AI FAQ 2 | 3 | ## What is GraphRAG? 4 | GraphRAG is an AI-based content interpretation and search capability. Using LLMs, it parses data to create a knowledge graph and answer user questions about a user-provided private dataset. 5 | 6 | ## What can GraphRAG do? 7 | GraphRAG is able to connect information across large volumes of information and use these connections to answer questions that are difficult or impossible to answer using keyword and vector-based search mechanisms. Building on the previous question, provide semi-technical, high-level information on how the system offers functionality for various uses. This lets a system using GraphRAG to answer questions where the answers span many documents as well as thematic questions such as “what are the top themes in this dataset?.” 8 | 9 | ## What are GraphRAG’s intended use(s)? 10 | - GraphRAG is intended to support critical information discovery and analysis use cases where the information required to arrive at a useful insight spans many documents, is noisy, is mixed with mis and/or dis-information, or when the questions users aim to answer are more abstract or thematic than the underlying data can directly answer. 11 | - GraphRAG is designed to be used in settings where users are already trained on responsible analytic approaches and critical reasoning is expected. GraphRAG is capable of providing high degrees of insight on complex information topics, however human analysis by a domain expert of the answers is needed in order to verify and augment GraphRAG’s generated responses. 
12 | - GraphRAG is intended to be deployed and used with a domain specific corpus of text data. GraphRAG itself does not collect user data, but users are encouraged to verify data privacy policies of the chosen LLM used to configure GraphRAG. 13 | 14 | ## How was GraphRAG evaluated? What metrics are used to measure performance? 15 | 16 | GraphRAG has been evaluated in multiple ways. The primary concerns are 1) accurate representation of the data set, 2) providing transparency and groundedness of responses, 3) resilience to prompt and data corpus injection attacks, and 4) low hallucination rates. Details on how each of these has been evaluated is outlined below by number. 17 | 1. Accurate representation of the dataset has been tested by both manual inspection and automated testing against a “gold answer” that is created from randomly selected subsets of a test corpus. 18 | 1. GraphRAG has been tested against datasets with known confusors and noise in multiple domains. These tests include both automated evaluation of answer detail (as compared to vector search approaches) as well as manual inspection using questions that are known to be difficult or impossible for other search systems to answer. 19 | 1. Transparency and groundedness of responses is tested via automated answer coverage evaluation and human inspection of the underlying context returned. 20 | 1. We test both user prompt injection attacks (“jailbreaks”) and cross prompt injection attacks (“data attacks”) using manual and semi-automated techniques. 21 | 1. Hallucination rates are evaluated using claim coverage metrics, manual inspection of answer and source, and adversarial attacks to attempt a forced hallucination through adversarial and exceptionally challenging datasets. 22 | 23 | ## What are the limitations of GraphRAG? How can users minimize the impact of GraphRAG’s limitations when using the system? 24 | - GraphRAG depends on well-constructed indexing examples. For general applications (e.g. 
content oriented around people, places, organizations, things, etc.) we provide example indexing prompts. For unique datasets, effective indexing can depend on proper identification of domain-specific concepts. 25 | - Indexing is a relatively expensive operation; a best practice to mitigate indexing is to create a small test dataset in the target domain to ensure indexer performance prior to large indexing operations. 26 | - GraphRAG is designed to accept well-formatted UTF-8 text only. Input data that does not conform to this specification will cause issues in indexing with unreliable effects. 27 | 28 | ## What operational factors and settings allow for effective and responsible use of GraphRAG? 29 | - GraphRAG is designed for use by users with domain sophistication and experience working through difficult information challenges. While the approach is generally robust to injection attacks and identifying conflicting sources of information, the system is designed for trusted users. Proper human analysis of responses is important to generate reliable insights, and the provenance of information should be traced to ensure human agreement with the inferences made as part of the answer generation. 30 | - GraphRAG yields the most effective results on natural language text data that is collectively focused on an overall topic or theme, and that is entity rich – entities being people, places, things, or objects that can be uniquely identified. 31 | - GraphRAG has been evaluated for its resilience to prompt and data corpus injection attacks and has been probed for specific types of harms. However, the LLM that the user configures with GraphRAG may produce inappropriate or offensive content which may make it inappropriate to deploy for sensitive contexts without additional mitigations that are specific to the use case and model. 
Developers should assess outputs for their context and use available safety classifiers, model specific safety filters and features (such as [https://azure.microsoft.com/en-us/products/ai-services/ai-content-safety](https://azure.microsoft.com/en-us/products/ai-services/ai-content-safety)), or custom solutions appropriate for their use case. The use of content safety filters is recommended to prevent XPIA and UPIA attacks, as well as to limit harmful content generation by malicious users. Discretion is advised when modifying or removing filters for applications that require it. 32 | -------------------------------------------------------------------------------- /backend/.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | **/__init__.py 4 | -------------------------------------------------------------------------------- /backend/README.md: -------------------------------------------------------------------------------- 1 | # Web App 2 | This directory contains the source code for a FastAPI application that implements a REST API wrapper around the graphrag library. The app has been packaged up as a python package for a cleaner install/deployment experience. 
3 | 4 | ## Package Layout 5 | The code has the following structure: 6 | ```shell 7 | backend 8 | ├── README.md 9 | ├── graphrag_app # contains the main application files 10 | │   ├── __init__.py 11 | │   ├── api # endpoint definitions 12 | │   ├── logger # custom loggers designed for graphrag use 13 | │   ├── main.py # initializes the FastAPI application 14 | │   ├── typing # data validation models 15 | │   └── utils # utility/helper functions 16 | ├── manifests # k8s manifest files 17 | ├── poetry.lock 18 | ├── pyproject.toml 19 | ├── pytest.ini 20 | ├── scripts # miscellaneous scripts that get executed in k8s 21 | └── tests # pytests (integration tests + unit tests) 22 | ``` 23 | -------------------------------------------------------------------------------- /backend/graphrag_app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/graphrag_app/__init__.py -------------------------------------------------------------------------------- /backend/graphrag_app/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/graphrag_app/api/__init__.py -------------------------------------------------------------------------------- /backend/graphrag_app/api/graph.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | import os 5 | import traceback 6 | 7 | from fastapi import ( 8 | APIRouter, 9 | Depends, 10 | HTTPException, 11 | status, 12 | ) 13 | from fastapi.responses import StreamingResponse 14 | 15 | from graphrag_app.logger.load_logger import load_pipeline_logger 16 | from graphrag_app.utils.azure_clients import AzureClientManager 17 | from graphrag_app.utils.common import ( 18 | sanitize_name, 19 | subscription_key_check, 20 | validate_index_file_exist, 21 | ) 22 | 23 | graph_route = APIRouter( 24 | prefix="/graph", 25 | tags=["Graph Operations"], 26 | ) 27 | if os.getenv("KUBERNETES_SERVICE_HOST"): 28 | graph_route.dependencies.append(Depends(subscription_key_check)) 29 | 30 | 31 | @graph_route.get( 32 | "/graphml/{container_name}", 33 | summary="Retrieve a GraphML file of the knowledge graph", 34 | response_description="GraphML file successfully downloaded", 35 | status_code=status.HTTP_200_OK, 36 | ) 37 | async def get_graphml_file( 38 | container_name, sanitized_container_name: str = Depends(sanitize_name) 39 | ): 40 | # validate graphml file existence 41 | azure_client_manager = AzureClientManager() 42 | graphml_filename = "graph.graphml" 43 | blob_filepath = f"output/{graphml_filename}" # expected file location of the graph based on the workflow 44 | validate_index_file_exist(sanitized_container_name, blob_filepath) 45 | try: 46 | blob_client = azure_client_manager.get_blob_service_client().get_blob_client( 47 | container=sanitized_container_name, blob=blob_filepath 48 | ) 49 | blob_stream = blob_client.download_blob().chunks() 50 | return StreamingResponse( 51 | blob_stream, 52 | media_type="application/octet-stream", 53 | headers={"Content-Disposition": f"attachment; filename={graphml_filename}"}, 54 | ) 55 | except Exception as e: 56 | logger = load_pipeline_logger() 57 | logger.error( 58 | message="Could not fetch graphml file", 59 | cause=e, 60 | stack=traceback.format_exc(), 61 | ) 62 | raise HTTPException( 63 | status_code=500, 64 | detail=f"Could 
@prompt_tuning_route.get(
    "/prompts",
    summary="Generate custom graphrag prompts based on user-provided data.",
    description="Generating custom prompts from user-provided data may take several minutes to run based on the amount of data used.",
    status_code=status.HTTP_200_OK,
)
async def generate_prompts(
    container_name: str,
    limit: int = 5,
    sanitized_container_name: str = Depends(sanitize_name),
):
    """Generate custom indexing prompts from a sample of the container's data.

    Produces the entity extraction, entity summarization and community
    summarization prompts.

    Args:
        container_name: Storage container name as supplied by the caller; only
            used in error messages and log details.
        limit: Forwarded as ``limit`` to ``api.generate_indexing_prompts``.
        sanitized_container_name: Result of the ``sanitize_name`` dependency;
            this is the name actually used against blob storage.

    Returns:
        A dict with the keys ``entity_extraction_prompt``,
        ``entity_summarization_prompt`` and ``community_summarization_prompt``.

    Raises:
        HTTPException: status 500 when the storage container does not exist or
            when prompt generation fails.
    """
    # the storage container must exist before anything else is attempted
    container_client = (
        AzureClientManager()
        .get_blob_service_client()
        .get_container_client(sanitized_container_name)
    )
    if not container_client.exists():
        raise HTTPException(
            status_code=500,
            detail=f"Storage container '{container_name}' does not exist.",
        )

    # build the graphrag config from scripts/settings.yaml (three directories
    # above this module), pointing its input at the requested container
    settings_file = Path(__file__).resolve().parents[2] / "scripts/settings.yaml"
    with settings_file.open("r") as settings_stream:
        settings = yaml.safe_load(settings_stream)
    settings["input"]["container_name"] = sanitized_container_name
    graphrag_config = create_graphrag_config(values=settings, root_dir=".")

    try:
        generated: tuple[str, str, str] = await api.generate_indexing_prompts(
            config=graphrag_config,
            root=".",
            limit=limit,
            selection_method="random",
        )
    except Exception as e:
        # log the full failure, but hand the caller a generic 500
        load_pipeline_logger().error(
            message="Auto-prompt generation failed.",
            cause=e,
            stack=traceback.format_exc(),
            details={"storage_name": container_name},
        )
        raise HTTPException(
            status_code=500,
            detail=f"Error generating prompts for data in '{container_name}'. Please try a lower limit.",
        )

    # a plain dict; FastAPI takes care of serializing it into the response
    return {
        "entity_extraction_prompt": generated[0],
        "entity_summarization_prompt": generated[1],
        "community_summarization_prompt": generated[2],
    }
class BlobWorkflowCallbacks(NoopWorkflowCallbacks):
    """Workflow callbacks that write pretty-printed log records to an append blob.

    Every record is appended as one block. Once ``_max_block_count`` blocks have
    been written through this instance, logging continues in a new,
    timestamp-named blob in the same container.
    """

    _blob_service_client: BlobServiceClient
    _container_name: str
    _index_name: str
    _num_workflow_steps: int
    # annotation only: a class-level ``[]`` would be shared by every instance
    _processed_workflow_steps: list[str]
    _max_block_count: int = 25000  # 25k blocks per blob
    _num_blocks: int = 0
    _blob_name: str

    def __init__(
        self,
        blob_service_client: BlobServiceClient,
        container_name: str,
        blob_name: str = "",
        index_name: str = "",
        num_workflow_steps: int = 0,
    ):
        """Create a new instance of the BlobWorkflowCallbacks class.

        Args:
            blob_service_client (BlobServiceClient): Client used to reach the storage account.
            container_name (str): The name of the container.
            blob_name (str, optional): The name of the blob. Defaults to "",
                in which case a timestamp-based name is generated.
            index_name (str, optional): The name of the index. Defaults to "".
            num_workflow_steps (int, optional): Total number of workflow steps,
                used as the denominator of the "(k/N)" progress text.
                Defaults to 0, which omits the progress text.
        """
        self._blob_service_client = blob_service_client
        self._container_name = container_name
        self._index_name = index_name
        self._num_workflow_steps = num_workflow_steps
        self._processed_workflow_steps = []  # maintain a running list of workflow steps that get processed
        self._open_blob(blob_name)

    def _open_blob(self, blob_name: str = "") -> None:
        """Point the logger at ``blob_name``, creating the append blob if needed."""
        self._blob_name = (
            blob_name
            or f"{datetime.now().strftime('%Y-%m-%d-%H:%M:%S:%f')}.logs.txt"
        )
        self._blob_client = self._blob_service_client.get_blob_client(
            self._container_name, self._blob_name
        )
        if not self._blob_client.exists():
            self._blob_client.create_append_blob()
            self._num_blocks = 0  # refresh block counter

    def _write_log(self, log: dict[str, Any]):
        """Write a log message to blob storage."""
        # Switch to a fresh blob once the block budget is used up. Only the
        # blob is replaced: index name, step total and processed steps are
        # kept, so later records still carry the right prefix and progress.
        if self._num_blocks >= self._max_block_count:
            self._open_blob()
        self._blob_client.append_block(pformat(log, indent=2) + "\n")
        self._num_blocks += 1

    def _workflow_message(self, name: str, outcome: str) -> str:
        """Build the "Index: ... -- Workflow (k/N): <name> <outcome>." text."""
        prefix = f"Index: {self._index_name} -- " if self._index_name else ""
        progress = (
            f" ({len(self._processed_workflow_steps)}/{self._num_workflow_steps})"
            if self._num_workflow_steps
            else ""
        )  # will take the form "(1/4)"
        return f"{prefix}Workflow{progress}: {name} {outcome}."

    def _workflow_details(self, name: str) -> dict[str, str]:
        """Build the ``details`` payload shared by workflow start/end records."""
        details = {"workflow_name": name}
        if self._index_name:
            details["index_name"] = self._index_name
        return details

    def workflow_start(self, name: str, instance: object) -> None:
        """Execute this callback when a workflow starts."""
        self._workflow_name = name
        self._processed_workflow_steps.append(name)
        self._write_log({
            "type": "on_workflow_start",
            "data": self._workflow_message(name, "started"),
            "details": self._workflow_details(name),
        })

    def workflow_end(self, name: str, instance: object) -> None:
        """Execute this callback when a workflow ends."""
        self._write_log({
            "type": "on_workflow_end",
            "data": self._workflow_message(name, "complete"),
            "details": self._workflow_details(name),
        })

    def error(
        self,
        message: str,
        cause: BaseException | None = None,
        stack: str | None = None,
        details: dict | None = None,
    ):
        """Report an error."""
        self._write_log({
            "type": "error",
            "data": message,
            "cause": str(cause),
            "stack": stack,
            "details": details,
        })

    def warning(self, message: str, details: dict | None = None):
        """Report a warning."""
        self._write_log({"type": "warning", "data": message, "details": details})

    def log(self, message: str, details: dict | None = None):
        """Report a generic log message."""
        self._write_log({"type": "log", "data": message, "details": details})
def load_pipeline_logger(
    logging_dir: str = "",
    index_name: str = "",
    num_workflow_steps: int = 0,
) -> WorkflowCallbacks:
    """Build a callback manager holding every enabled logger.

    The same function serves both the FastAPI app (called without arguments,
    yielding generic loggers) and indexing jobs (called with an index name and
    step count, yielding index-specific loggers).

    Args:
        logging_dir: Optional prefix for the blob log location; also passed as
            ``dir`` to the file logger.
        index_name: Forwarded to each logger that accepts an index name.
        num_workflow_steps: Forwarded to each logger that accepts a step count.

    Returns:
        A ``WorkflowCallbacksManager`` with the blob and console loggers
        registered, plus the Application Insights logger when
        ``APPLICATIONINSIGHTS_CONNECTION_STRING`` is set.
    """
    enabled: List[Logger] = [
        Logger[key] for key in ("BLOB", "CONSOLE", "APP_INSIGHTS")
    ]

    client_manager = AzureClientManager()
    manager = WorkflowCallbacksManager()
    for kind in enabled:
        if kind == Logger.BLOB:
            # logs live under "logs", optionally nested below logging_dir
            blob_path = os.path.join(logging_dir, "logs") if logging_dir else "logs"
            service = client_manager.get_blob_service_client()
            # the first path component is the container; create it on demand
            root = Path(blob_path).parts[0]
            if not service.get_container_client(root).exists():
                service.create_container(root)
            blob_callbacks = BlobWorkflowCallbacks(
                blob_service_client=service,
                container_name=blob_path,
                index_name=index_name,
                num_workflow_steps=num_workflow_steps,
            )
            manager.register(blob_callbacks)
        elif kind == Logger.FILE:
            manager.register(FileWorkflowCallbacks(dir=logging_dir))
        elif kind == Logger.APP_INSIGHTS:
            # only enabled when a connection string is configured
            if os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING"):
                insights_callbacks = ApplicationInsightsWorkflowCallbacks(
                    index_name=index_name,
                    num_workflow_steps=num_workflow_steps,
                )
                manager.register(insights_callbacks)
        elif kind == Logger.CONSOLE:
            console_callbacks = ConsoleWorkflowCallbacks(
                index_name=index_name, num_workflow_steps=num_workflow_steps
            )
            manager.register(console_callbacks)
        else:
            print(f"WARNING: unknown logger type: {kind}. Skipping.")
    return manager
class PipelineJobUpdater(NoopWorkflowCallbacks):
    """Workflow callbacks that mirror workflow progress onto a pipeline job.

    Only ``workflow_start`` and ``workflow_end`` are overridden; every other
    callback keeps the behavior inherited from ``NoopWorkflowCallbacks``.
    """

    def __init__(self, pipeline_job: PipelineJob):
        """Remember the job to update.

        Args:
            pipeline_job (PipelineJob): The pipeline object associated with the job.
        """
        self._pipeline_job = pipeline_job

    def workflow_start(self, name: str, instance: object) -> None:
        """Mark the job as running and record which workflow has started."""
        job = self._pipeline_job
        job.status = PipelineJobState.RUNNING
        job.progress = f"Workflow {name} started."

    def workflow_end(self, name: str, instance: object) -> None:
        """Record the finished workflow and refresh the job's progress fields."""
        job = self._pipeline_job
        job.completed_workflows.append(name)
        # explicit save after the in-place list append
        # NOTE(review): progress/percent_complete are assigned after this call;
        # presumably their setters persist on their own - confirm in PipelineJob
        job.update_db()
        job.progress = f"Workflow {name} complete."
        job.percent_complete = job.calculate_percent_complete()
class PipelineAppInsightsReportingConfig(
    PipelineReportingConfig[Literal["app_insights"]]
):
    """Represents the ApplicationInsights reporting configuration for the pipeline."""

    type: Literal["app_insights"] = Logger.APP_INSIGHTS.name.lower()
    """The type of reporting."""

    # None is the declared default, so the annotation admits it explicitly.
    connection_string: str | None = pydantic_Field(
        description="The connection string for the App Insights instance.",
        default=None,
    )
    """The connection string for the App Insights instance."""

    # None is the declared default, so the annotation admits it explicitly.
    logger_name: str | None = pydantic_Field(
        description="The name for logger instance", default=None
    )
    """The name for logger instance"""

    # This field is the numeric log level, not the logger name; the description
    # now states the level and its actual default.
    logger_level: int = pydantic_Field(
        description="The logging level of the logger. Defaults to logging.INFO.",
        default=logging.INFO,
    )
    """The logging level of the logger. Defaults to logging.INFO."""
class PipelineJobState(str, Enum):
    """Lifecycle states of a pipeline job; each member is also a plain string."""

    SCHEDULED = "scheduled"
    RUNNING = "running"
    FAILED = "failed"
    COMPLETE = "complete"

    def __repr__(self):
        """Return the member's value wrapped in double quotes."""
        return '"{}"'.format(self.value)
3 | 4 | # NOTE: the location of this file is important as it gets referenced by the graphrag_app/main.py script 5 | # and depends on the relative path to this file when uvicorn is run 6 | 7 | apiVersion: batch/v1 8 | kind: CronJob 9 | metadata: 10 | name: graphrag-index-manager 11 | spec: 12 | schedule: "*/5 * * * *" 13 | jobTemplate: 14 | spec: 15 | ttlSecondsAfterFinished: 180 16 | template: 17 | metadata: 18 | labels: 19 | azure.workload.identity/use: "true" 20 | spec: 21 | serviceAccountName: PLACEHOLDER 22 | restartPolicy: OnFailure 23 | containers: 24 | - name: index-job-manager 25 | image: PLACEHOLDER 26 | # override default WORKDIR with absolute path to the scripts directory 27 | workingDir: "/backend/scripts" 28 | imagePullPolicy: Always 29 | resources: 30 | requests: 31 | cpu: "0.5" 32 | memory: "0.5Gi" 33 | limits: 34 | cpu: "1" 35 | memory: "1Gi" 36 | envFrom: 37 | - configMapRef: 38 | name: graphrag 39 | command: 40 | - python 41 | - "job-scheduler.py" 42 | -------------------------------------------------------------------------------- /backend/manifests/job.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | # NOTE: the location of this file is important as it gets referenced by the job-scheduler.py script 5 | # and depends on the relative path to this file when uvicorn is run 6 | 7 | apiVersion: batch/v1 8 | kind: Job 9 | metadata: 10 | name: PLACEHOLDER 11 | spec: 12 | ttlSecondsAfterFinished: 300 13 | backoffLimit: 3 14 | template: 15 | metadata: 16 | labels: 17 | azure.workload.identity/use: "true" 18 | spec: 19 | serviceAccountName: PLACEHOLDER 20 | restartPolicy: OnFailure 21 | nodeSelector: 22 | workload: graphrag-indexing 23 | containers: 24 | - name: graphrag 25 | image: PLACEHOLDER 26 | # override default WORKDIR with absolute path to the scripts directory 27 | workingDir: "/backend/scripts" 28 | imagePullPolicy: Always 29 | resources: 30 | requests: 31 | cpu: "5" 32 | memory: "36Gi" 33 | limits: 34 | cpu: "8" 35 | memory: "64Gi" 36 | envFrom: 37 | - configMapRef: 38 | name: graphrag 39 | command: [PLACEHOLDER] 40 | -------------------------------------------------------------------------------- /backend/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "graphrag-app" 3 | description = "A web API wrapper around the official GraphRAG library." 
4 | # we make the version defined here match the graphrag library version 5 | version = "1.2.0" 6 | license = "MIT" 7 | authors = [ 8 | {name = "Josh Bradley", email = "joshbradley@microsoft.com"}, 9 | {name = "Newman Cheng", email = "newmancheng@microsoft.com"}, 10 | {name = "Christine DiFonzo", email = "cdifonzo@microsoft.com"}, 11 | {name = "Gabriel Nieves", email = "gnievesponce@microsoft.com"}, 12 | {name = "Douglas Orbaker", email = "dorbaker@microsoft.com"}, 13 | {name = "Shane Solomon", email = "shane.solomon@microsoft.com"}, 14 | {name = "Kenny Zhang", email = "zhangken@microsoft.com"}, 15 | ] 16 | requires-python = '>=3.10, <3.13' 17 | package-mode=false 18 | 19 | [tool.poetry.group.dev.dependencies] 20 | detect-secrets = ">=1.5.0" 21 | devtools = ">=0.12.2" 22 | flake8 = ">=6.1.0" 23 | ipython = "*" 24 | jupyter = "*" 25 | pre-commit = ">=3.6.0" 26 | ruff = ">=0.1.13" 27 | 28 | [tool.poetry.group.test] 29 | optional = true 30 | 31 | [tool.poetry.group.test.dependencies] 32 | pytest = ">=8.2.1" 33 | pytest-asyncio = "^0.25.0" 34 | pytest-cov = "^6.0.0" 35 | pytest-env = "^1.1.5" 36 | pytest-xdist = "^3.6.1" 37 | wikipedia = ">=1.4.0" 38 | 39 | [tool.poetry.group.backend.dependencies] 40 | adlfs = ">=2024.7.0" 41 | attrs = ">=23.2.0" 42 | azure-core = ">=1.30.1" 43 | azure-cosmos = ">=4.5.1" 44 | azure-identity = ">=1.15.0" 45 | azure-monitor-opentelemetry = "^1.6.4" 46 | azure-search-documents = ">=11.4.0" 47 | azure-storage-blob = ">=12.19.0" 48 | environs = ">=9.5.0" 49 | fastapi = ">=0.110.0" 50 | fastparquet = ">=2023.10.1" 51 | fsspec = ">=2024.2.0" 52 | graphrag = "==1.2.0" 53 | httpx = ">=0.25.2" 54 | kubernetes = ">=29.0.0" 55 | markitdown = {extras = ["all"], version = "^0.1.1"} 56 | networkx = ">=3.2.1" 57 | nltk = "*" 58 | pandas = ">=2.2.1" 59 | pyaml-env = ">=1.2.1" 60 | pyarrow = ">=15.0.0" 61 | pydantic = ">=1.10.14" 62 | python-multipart = ">=0.0.6" 63 | requests = "*" 64 | rich = ">=13.7.1" 65 | tiktoken = ">=0.6.0" 66 | urllib3 = 
">=2.2.2" 67 | uvicorn = ">=0.23.2" 68 | 69 | [tool.ruff] 70 | indent-width = 4 71 | line-length = 88 72 | target-version = "py310" 73 | 74 | [tool.ruff.format] 75 | preview = true 76 | quote-style = "double" 77 | 78 | [tool.ruff.lint] 79 | ignore = ["E402", "E501", "F821"] 80 | preview = true 81 | select = ["E", "F", "I"] 82 | 83 | [build-system] 84 | build-backend = "poetry.core.masonry.api" 85 | requires = ["poetry-core"] 86 | -------------------------------------------------------------------------------- /backend/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | required_plugins = anyio pytest-asyncio pytest-cov pytest-env pytest-xdist 3 | asyncio_default_fixture_loop_scope="function" 4 | asyncio_mode=auto 5 | ; NOTE: we use well known credentials for the Cosmos DB emulator and Azure Storage emulator. 6 | ; If executing these pytests locally, users may need to modify the cosmosdb connection string to use http protocol instead of https. 7 | ; This depends on how the cosmosdb emulator has been configured (by the user) to run. 8 | env = 9 | COSMOS_CONNECTION_STRING=AccountEndpoint=https://127.0.0.1:8081/;AccountKey=C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw== 10 | STORAGE_CONNECTION_STRING=DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1; 11 | TESTING=1 12 | -------------------------------------------------------------------------------- /backend/scripts/settings.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
3 | 4 | # this yaml file serves as a configuration template for the graphrag indexing jobs 5 | # some values are hardcoded while others denoted by PLACEHOLDER will be dynamically set 6 | 7 | ###################### LLM settings ###################### 8 | encoding_model: cl100k_base # this needs to be matched to your model! 9 | 10 | llm: 11 | type: azure_openai_chat 12 | api_base: $GRAPHRAG_API_BASE 13 | api_version: $GRAPHRAG_API_VERSION 14 | model: $GRAPHRAG_LLM_MODEL 15 | deployment_name: $GRAPHRAG_LLM_DEPLOYMENT_NAME 16 | audience: $COGNITIVE_SERVICES_AUDIENCE 17 | model_supports_json: True 18 | tokens_per_minute: 80_000 19 | requests_per_minute: 480 20 | concurrent_requests: 25 21 | max_retries: 250 22 | max_retry_wait: 60.0 23 | sleep_on_rate_limit_recommendation: True 24 | 25 | parallelization: 26 | num_threads: 10 27 | stagger: 0.25 28 | 29 | async_mode: threaded # or asyncio 30 | 31 | embeddings: 32 | vector_store: 33 | type: azure_ai_search 34 | collection_name: PLACEHOLDER 35 | title_column: name 36 | overwrite: True 37 | url: $AI_SEARCH_URL 38 | audience: $AI_SEARCH_AUDIENCE 39 | llm: 40 | type: azure_openai_embedding 41 | api_base: $GRAPHRAG_API_BASE 42 | api_version: $GRAPHRAG_API_VERSION 43 | batch_size: 10 44 | model: $GRAPHRAG_EMBEDDING_MODEL 45 | deployment_name: $GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME 46 | audience: $COGNITIVE_SERVICES_AUDIENCE 47 | tokens_per_minute: 350_000 48 | requests_per_minute: 2_100 49 | 50 | ###################### Input settings ###################### 51 | input: 52 | type: blob 53 | file_type: text 54 | base_dir: . 
55 | file_encoding: utf-8 56 | file_pattern: .*\.txt$ 57 | storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL 58 | container_name: PLACEHOLDER 59 | 60 | chunks: 61 | size: 1_200 62 | overlap: 100 63 | group_by_columns: [id] 64 | 65 | ###################### Storage settings ###################### 66 | cache: 67 | type: blob 68 | storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL 69 | container_name: PLACEHOLDER 70 | base_dir: cache 71 | 72 | reporting: 73 | type: blob 74 | storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL 75 | container_name: PLACEHOLDER 76 | base_dir: logs 77 | 78 | storage: 79 | type: blob 80 | storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL 81 | container_name: PLACEHOLDER 82 | base_dir: output 83 | 84 | ###################### Workflow settings ###################### 85 | skip_workflows: [] 86 | 87 | entity_extraction: 88 | prompt: PLACEHOLDER 89 | entity_types: [organization, person, geo, event] 90 | max_gleanings: 1 91 | 92 | summarize_descriptions: 93 | prompt: PLACEHOLDER 94 | max_length: 500 95 | 96 | claim_extraction: 97 | enabled: false 98 | prompt: "prompts/claim_extraction.txt" 99 | description: "Any claims or facts that could be relevant to information discovery." 100 | max_gleanings: 1 101 | 102 | community_reports: 103 | prompt: PLACEHOLDER 104 | max_length: 2_000 105 | max_input_length: 8_000 106 | 107 | cluster_graph: 108 | max_cluster_size: 10 109 | 110 | embed_graph: 111 | enabled: false 112 | 113 | umap: 114 | enabled: false 115 | 116 | snapshots: 117 | graphml: True 118 | embeddings: false 119 | transient: false 120 | 121 | ###################### Query settings ###################### 122 | ## The prompt locations are required here, but each search method has a number of optional knobs that can be tuned. 
@pytest.fixture(scope="session")
def blob_with_data_container_name(blob_service_client: BlobServiceClient):
    """Provide the name of a storage container that holds one small text blob.

    Creates the (sanitized) container if needed, uploads ``data.txt`` into it,
    yields the human-readable container name, and deletes the container on
    teardown.
    """
    # create a storage container and upload some data
    container_name = "container-with-data"
    sanitized_name = sanitize_name(container_name)
    # Only create the container when it is missing, so one left behind by an
    # interrupted session does not break setup.
    if not blob_service_client.get_container_client(sanitized_name).exists():
        blob_service_client.create_container(sanitized_name)
    blob_client = blob_service_client.get_blob_client(sanitized_name, "data.txt")
    blob_client.upload_blob(data="Hello, World!", overwrite=True)
    yield container_name
    # cleanup
    blob_service_client.delete_container(sanitized_name)
@pytest.fixture(scope="session")
def cosmos_client() -> Generator[CosmosClient, None, None]:
    """Yield a CosmosDB client after creating the database and containers graphrag expects at startup time."""
    # setup
    connection_string = os.environ["COSMOS_CONNECTION_STRING"]
    cosmos = CosmosClient.from_connection_string(connection_string)
    database = cosmos.create_database_if_not_exists(id="graphrag")
    # both containers are partitioned on the item id
    for container_id in ("container-store", "jobs"):
        database.create_container_if_not_exists(
            id=container_id, partition_key=PartitionKey(path="/id")
        )
    yield cosmos  # run the test
    # teardown
    cosmos.delete_database("graphrag")
@pytest.fixture(scope="session")
def container_with_index_files(
    blob_service_client: BlobServiceClient, cosmos_client: CosmosClient
):
    """Provide a container populated with the synthetic index files.

    Uploads every file under ``data/synthetic-dataset/output`` to the
    ``output/`` prefix of the container, registers the container as an index in
    the ``container-store`` table, and removes both on teardown.
    """
    container_name = "container-with-index-files"
    sanitized_name = sanitize_name(container_name)
    container_client = blob_service_client.get_container_client(sanitized_name)
    if not container_client.exists():
        blob_service_client.create_container(sanitized_name)

    # upload each file of the synthetic index into the container
    data_root = Path(__file__).parent / "data/synthetic-dataset/output"
    for entry in data_root.iterdir():
        target_blob = blob_service_client.get_blob_client(
            sanitized_name, f"output/{entry.name}"
        )
        with open(f"{data_root}/{entry.name}", "rb") as stream:
            target_blob.upload_blob(stream, overwrite=True)

    # register the container in the container-store table in cosmos db
    database = cosmos_client.get_database_client("graphrag")
    container_store_client = database.get_container_client("container-store")
    container_store_client.upsert_item({
        "id": sanitized_name,
        "human_readable_name": container_name,
        "type": "index",
    })
    yield container_name
    # cleanup
    blob_service_client.delete_container(sanitized_name)
    container_store_client.delete_item(sanitized_name, sanitized_name)
About 2 | 3 | This is an index built on a book (Operation Dulce) that is an AI-generated science fiction novella, included here for the purposes of integration testing. 4 | 5 | To regenerate the index and update pytests, the original book can be retrieved from [here](https://github.com/microsoft/graphrag/tree/main/docs/data/operation_dulce). 6 | -------------------------------------------------------------------------------- /backend/tests/data/synthetic-dataset/output/create_final_communities.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_communities.parquet -------------------------------------------------------------------------------- /backend/tests/data/synthetic-dataset/output/create_final_community_reports.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_community_reports.parquet -------------------------------------------------------------------------------- /backend/tests/data/synthetic-dataset/output/create_final_covariates.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_covariates.parquet -------------------------------------------------------------------------------- /backend/tests/data/synthetic-dataset/output/create_final_documents.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_documents.parquet -------------------------------------------------------------------------------- /backend/tests/data/synthetic-dataset/output/create_final_entities.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_entities.parquet -------------------------------------------------------------------------------- /backend/tests/data/synthetic-dataset/output/create_final_nodes.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_nodes.parquet -------------------------------------------------------------------------------- /backend/tests/data/synthetic-dataset/output/create_final_relationships.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_relationships.parquet -------------------------------------------------------------------------------- /backend/tests/data/synthetic-dataset/output/create_final_text_units.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_text_units.parquet -------------------------------------------------------------------------------- 
/backend/tests/data/synthetic-dataset/output/stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "total_runtime": 358.0534498691559, 3 | "num_documents": 1, 4 | "input_load_time": 0, 5 | "workflows": { 6 | "create_base_text_units": { 7 | "overall": 2.060708999633789 8 | }, 9 | "create_final_documents": { 10 | "overall": 0.043251991271972656 11 | }, 12 | "extract_graph": { 13 | "overall": 162.8238878250122 14 | }, 15 | "compute_communities": { 16 | "overall": 14.345926284790039 17 | }, 18 | "create_final_entities": { 19 | "overall": 0.04870915412902832 20 | }, 21 | "create_final_relationships": { 22 | "overall": 0.05901288986206055 23 | }, 24 | "create_final_nodes": { 25 | "overall": 0.07453203201293945 26 | }, 27 | "create_final_communities": { 28 | "overall": 0.127485990524292 29 | }, 30 | "create_final_covariates": { 31 | "overall": 142.99078702926636 32 | }, 33 | "create_final_text_units": { 34 | "overall": 0.12473607063293457 35 | }, 36 | "create_final_community_reports": { 37 | "overall": 31.13183307647705 38 | }, 39 | "generate_text_embeddings": { 40 | "overall": 3.978173017501831 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /backend/tests/integration/test_api_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | """ 4 | Integration tests for the /data API endpoints. 
def test_upload_files(cosmos_client: CosmosClient, client):
    """Test uploading files to a data blob container.

    A scratch file ``test.txt`` is written to the working directory, posted to
    ``/data``, and always removed afterwards, even when the request or the
    assertion fails.
    """
    try:
        # create a single file
        with open("test.txt", "wb") as f:
            f.write(b"Hello, world!")
        # send the file in the request
        with open("test.txt", "rb") as f:
            response = client.post(
                "/data",
                files={"files": ("test.txt", f)},
                params={"container_name": "testContainer"},
            )
        # check the response
        assert response.status_code == 200
    finally:
        # remove the sample file regardless of the test outcome
        if os.path.exists("test.txt"):
            os.remove("test.txt")
def test_get_graphml_file(client, container_with_graphml_file: str):
    """Test that the graphml endpoint returns the uploaded file's bytes."""
    response = client.get(f"/graph/graphml/{container_with_graphml_file}")
    assert response.status_code == 200
    response.raise_for_status()
    # reassemble the body from 1 KiB chunks
    payload = b"".join(response.iter_bytes(chunk_size=1024))
    assert payload == b"a fake graphml file"
@pytest_asyncio.fixture
def mock_generate_indexing_prompts():
    """Patch ``graphrag.api.generate_indexing_prompts`` with an AsyncMock that returns three synthetic prompts."""
    synthetic_prompts = (
        "synthetic-prompt1",
        "synthetic-prompt2",
        "synthetic-prompt3",
    )
    target = "graphrag.api.generate_indexing_prompts"
    with patch(target, new_callable=AsyncMock) as patched:
        patched.return_value = synthetic_prompts
        yield patched
def test_generate_prompts(
    blob_with_data_container_name, mock_generate_indexing_prompts, client
):
    """Test that prompt generation for a data container returns HTTP 200."""
    query = {"container_name": blob_with_data_container_name}
    response = client.get("/index/config/prompts", params=query)
    assert response.status_code == 200
5 | """ 6 | 7 | from fastapi.testclient import TestClient 8 | 9 | 10 | def test_get_report(container_with_index_files: str, client: TestClient): 11 | """Test retrieving a report via the graphrag_app.api.source.get_report_info() function.""" 12 | # retrieve a report that exists 13 | response = client.get(f"/source/report/{container_with_index_files}/1") 14 | assert response.status_code == 200 15 | # # retrieve a report that does not exist 16 | # response = client.get(f"/source/report/{container_with_index_files}/-1") 17 | # assert response.status_code == 500 18 | 19 | 20 | def test_get_chunk_info(container_with_index_files: str, client: TestClient): 21 | """Test retrieving a text chunk.""" 22 | response = client.get( 23 | f"/source/text/{container_with_index_files}/c4197a012ea9e7d2618450cbb197852dec47c40883d4a69e0ea473a8111319c80d608ae5fa66acc2d3f95cd845277b3acd8186d7fa326803dde09681da29790c" 24 | ) 25 | assert response.status_code == 200 26 | 27 | 28 | def test_get_entity_info(container_with_index_files: str, client: TestClient): 29 | """Test retrieving an entity description.""" 30 | response = client.get(f"/source/entity/{container_with_index_files}/1") 31 | assert response.status_code == 200 32 | 33 | 34 | def test_get_relationship_info(container_with_index_files: str, client: TestClient): 35 | """Test retrieving a relationship description.""" 36 | response = client.get(f"/source/relationship/{container_with_index_files}/1") 37 | assert response.status_code == 200 38 | -------------------------------------------------------------------------------- /backend/tests/integration/test_utils_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | """ 4 | Integration tests for the PipelineJob class. 
5 | """ 6 | 7 | from typing import Generator 8 | 9 | import pytest 10 | 11 | from graphrag_app.typing.pipeline import PipelineJobState 12 | from graphrag_app.utils.pipeline import PipelineJob 13 | 14 | 15 | @pytest.fixture() 16 | def cosmos_index_job_entry(cosmos_client) -> Generator[str, None, None]: 17 | """Create an entry for an indexing job in the appropriate CosmosDB database and container 18 | that graphrag expects when first scheduling an indexing job.""" 19 | 20 | db_client = cosmos_client.get_database_client("graphrag") 21 | container_client = db_client.get_container_client("jobs") 22 | synthetic_job_entry = { 23 | "id": "testID", 24 | "epoch_request_time": 0, 25 | "human_readable_index_name": "test_human_readable_index_name", 26 | "sanitized_index_name": "test_sanitized_index_name", 27 | "human_readable_storage_name": "test_human_readable_storage_name", 28 | "sanitized_storage_name": "test_sanitized_storage_name", 29 | "all_workflows": ["workflow1", "workflow2"], 30 | "completed_workflows": ["workflow1"], 31 | "failed_workflows": ["workflow2"], 32 | "status": PipelineJobState.COMPLETE, 33 | "percent_complete": 50.0, 34 | "progress": "some progress", 35 | } 36 | container_client.upsert_item(synthetic_job_entry) 37 | yield synthetic_job_entry["id"] 38 | # teardown 39 | container_client.delete_item( 40 | synthetic_job_entry["id"], partition_key=synthetic_job_entry["id"] 41 | ) 42 | 43 | 44 | def test_pipeline_job_interface(cosmos_index_job_entry): 45 | """Test the graphrag_app.utils.pipeline.PipelineJob class interface.""" 46 | pipeline_job = PipelineJob() 47 | 48 | # test creating a new entry 49 | pipeline_job.create_item( 50 | id="synthetic_id", 51 | human_readable_index_name="test_human_readable_index_name", 52 | human_readable_storage_name="test_human_readable_storage_name", 53 | entity_extraction_prompt="fake entity extraction prompt", 54 | community_report_prompt="fake community report prompt", 55 | summarize_descriptions_prompt="fake summarize 
descriptions prompt", 56 | ) 57 | assert pipeline_job.item_exist("synthetic_id") 58 | 59 | # test loading an existing entry 60 | pipeline_job = pipeline_job.load_item(cosmos_index_job_entry) 61 | assert pipeline_job.id == "testID" 62 | assert pipeline_job.human_readable_index_name == "test_human_readable_index_name" 63 | assert pipeline_job.sanitized_index_name == "test_sanitized_index_name" 64 | assert ( 65 | pipeline_job.human_readable_storage_name == "test_human_readable_storage_name" 66 | ) 67 | assert pipeline_job.sanitized_storage_name == "test_sanitized_storage_name" 68 | assert pipeline_job.all_workflows == ["workflow1", "workflow2"] 69 | assert pipeline_job.completed_workflows == ["workflow1"] 70 | assert pipeline_job.failed_workflows == ["workflow2"] 71 | assert pipeline_job.status == PipelineJobState.COMPLETE 72 | assert pipeline_job.percent_complete == 50.0 73 | assert pipeline_job.progress == "some progress" 74 | assert pipeline_job.calculate_percent_complete() == 50.0 75 | 76 | # test setters and getters 77 | pipeline_job.id = "newID" 78 | assert pipeline_job.id == "newID" 79 | pipeline_job.epoch_request_time = 1 80 | assert pipeline_job.epoch_request_time == 1 81 | 82 | pipeline_job.human_readable_index_name = "new_human_readable_index_name" 83 | assert pipeline_job.human_readable_index_name == "new_human_readable_index_name" 84 | pipeline_job.sanitized_index_name = "new_sanitized_index_name" 85 | assert pipeline_job.sanitized_index_name == "new_sanitized_index_name" 86 | 87 | pipeline_job.human_readable_storage_name = "new_human_readable_storage_name" 88 | assert pipeline_job.human_readable_storage_name == "new_human_readable_storage_name" 89 | pipeline_job.sanitized_storage_name = "new_sanitized_storage_name" 90 | assert pipeline_job.sanitized_storage_name == "new_sanitized_storage_name" 91 | 92 | pipeline_job.entity_extraction_prompt = "new_entity_extraction_prompt" 93 | assert pipeline_job.entity_extraction_prompt == 
"new_entity_extraction_prompt" 94 | pipeline_job.community_report_prompt = "new_community_report_prompt" 95 | assert pipeline_job.community_report_prompt == "new_community_report_prompt" 96 | pipeline_job.summarize_descriptions_prompt = "new_summarize_descriptions_prompt" 97 | assert ( 98 | pipeline_job.summarize_descriptions_prompt 99 | == "new_summarize_descriptions_prompt" 100 | ) 101 | 102 | pipeline_job.all_workflows = ["new_workflow1", "new_workflow2", "new_workflow3"] 103 | assert len(pipeline_job.all_workflows) == 3 104 | 105 | pipeline_job.completed_workflows = ["new_workflow1", "new_workflow2"] 106 | assert len(pipeline_job.completed_workflows) == 2 107 | 108 | pipeline_job.failed_workflows = ["new_workflow3"] 109 | assert len(pipeline_job.failed_workflows) == 1 110 | -------------------------------------------------------------------------------- /backend/tests/unit/test_azure_clients.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
from azure.cosmos import CosmosClient
from azure.storage.blob import BlobServiceClient
from azure.storage.blob.aio import BlobServiceClient as BlobServiceClientAsync

from graphrag_app.utils.azure_clients import (
    AzureClientManager,
    _BlobServiceClientSingleton,
    _BlobServiceClientSingletonAsync,
    _CosmosClientSingleton,
)


def test_get_cosmos_singleton():
    """Check that repeated lookups return one shared CosmosClient."""
    instances = [_CosmosClientSingleton.get_instance() for _ in range(2)]
    assert all(isinstance(item, CosmosClient) for item in instances)
    # identity, not equality: both names must point at the very same object
    assert instances[0] is instances[1]


def test_get_storage_singleton():
    """Check that repeated lookups return one shared BlobServiceClient."""
    instances = [_BlobServiceClientSingleton.get_instance() for _ in range(2)]
    assert all(isinstance(item, BlobServiceClient) for item in instances)
    # identity, not equality: both names must point at the very same object
    assert instances[0] is instances[1]


def test_get_storage_async_singleton():
    """Check that repeated lookups return one shared async BlobServiceClient."""
    instances = [_BlobServiceClientSingletonAsync.get_instance() for _ in range(2)]
    assert all(isinstance(item, BlobServiceClientAsync) for item in instances)
    # identity, not equality: both names must point at the very same object
    assert instances[0] is instances[1]


def test_azure_client_manager():
    """Check that AzureClientManager hands out a client of the expected type from each getter."""
    manager = AzureClientManager()
    assert isinstance(manager, AzureClientManager)
    # (getter, expected client type) pairs, exercised in declaration order
    getters_and_types = (
        (manager.get_cosmos_client, CosmosClient),
        (manager.get_blob_service_client, BlobServiceClient),
        (manager.get_blob_service_client_async, BlobServiceClientAsync),
    )
    for getter, expected_type in getters_and_types:
        assert isinstance(getter(), expected_type)
-------------------------------------------------------------------------------- /backend/tests/unit/test_common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import pytest 5 | 6 | from graphrag_app.utils.common import ( 7 | desanitize_name, 8 | sanitize_name, 9 | validate_index_file_exist, 10 | ) 11 | 12 | 13 | def test_desanitize_name(container_with_graphml_file): 14 | """Test the graphrag_app.utils.common.desanitize_name function.""" 15 | # test retrieving a valid container name 16 | original_name = container_with_graphml_file 17 | sanitized_name = sanitize_name(original_name) 18 | assert desanitize_name(sanitized_name) == original_name 19 | # test retrieving an invalid container name 20 | assert desanitize_name("nonexistent-container") is None 21 | 22 | 23 | def test_validate_index_file_exist(container_with_graphml_file): 24 | """Test the graphrag_app.utils.common.validate_index_file_exist function.""" 25 | original_name = container_with_graphml_file 26 | sanitized_name = sanitize_name(original_name) 27 | # test with a valid index and valid file 28 | assert validate_index_file_exist(sanitized_name, "output/graph.graphml") is None 29 | # test with a valid index and non-existent file 30 | with pytest.raises(ValueError): 31 | validate_index_file_exist(sanitized_name, "non-existent-file") 32 | # test non-existent index and valid file 33 | with pytest.raises(ValueError): 34 | validate_index_file_exist("nonexistent-index", "output/graph.graphml") 35 | -------------------------------------------------------------------------------- /backend/tests/unit/test_load_logger.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | 5 | from graphrag_app.logger.load_logger import load_pipeline_logger 6 | 7 | 8 | @pytest.fixture 9 | def 
mock_app_insights_workflow_callbacks(): 10 | with patch( 11 | "graphrag_app.logger.application_insights_workflow_callbacks.ApplicationInsightsWorkflowCallbacks" 12 | ) as mock_app_insights_workflow_callbacks: 13 | yield mock_app_insights_workflow_callbacks 14 | 15 | 16 | @pytest.fixture 17 | def mock_file_workflow_callbacks(): 18 | with patch( 19 | "graphrag.index.reporting.file_workflow_callbacks.FileWorkflowCallbacks" 20 | ) as mock_file_workflow_callbacks: 21 | yield mock_file_workflow_callbacks 22 | 23 | 24 | @pytest.fixture 25 | def mock_blob_workflow_callbacks(): 26 | with patch( 27 | "graphrag_app.logger.blob_workflow_callbacks.BlobWorkflowCallbacks" 28 | ) as mock_blob_workflow_callbacks: 29 | yield mock_blob_workflow_callbacks 30 | 31 | 32 | @pytest.fixture 33 | def mock_console_workflow_callbacks(): 34 | with patch( 35 | "graphrag_app.logger.console_workflow_callbacks.ConsoleWorkflowCallbacks" 36 | ) as mock_console_workflow_callbacks: 37 | yield mock_console_workflow_callbacks 38 | 39 | 40 | @pytest.mark.skip(reason="This test is currently not complete") 41 | def test_load_pipeline_logger_with_console( 42 | mock_app_insights_workflow_callbacks, 43 | mock_blob_workflow_callbacks, 44 | mock_console_workflow_callbacks, 45 | mock_file_workflow_callbacks, 46 | ): 47 | """Test load_pipeline_logger.""" 48 | loggers = load_pipeline_logger( 49 | logging_dir="logs", 50 | loggers=["app_insights", "blob", "console", "file"], 51 | index_name="test-index", 52 | num_workflow_steps=4, 53 | ) 54 | assert len(loggers._callbacks) == 4 55 | -------------------------------------------------------------------------------- /backend/tests/unit/test_logger_app_insights_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
import logging
from unittest.mock import MagicMock, patch

import pytest

from graphrag_app.logger.application_insights_workflow_callbacks import (
    ApplicationInsightsWorkflowCallbacks,
)


@pytest.fixture
def mock_logger():
    """Yield a Logger-spec'd mock that the patched ``logging.getLogger`` returns."""
    fake_logger = MagicMock(spec=logging.Logger)
    with patch(
        "graphrag_app.logger.application_insights_workflow_callbacks.logging.getLogger",
        return_value=fake_logger,
    ):
        # the patch stays active for the whole test because we yield inside it
        yield fake_logger


@pytest.fixture
def workflow_callbacks(mock_logger):
    """Yield a callbacks object built with ``__init__`` stubbed out and its state set by hand."""
    preset_state = {
        "_connection_string": "mock_connection_string",
        "_index_name": "mock_index_name",
        "_num_workflow_steps": 4,
        "_logger": mock_logger,
        "_processed_workflow_steps": [],
        "_properties": {},
    }
    with patch(
        "graphrag_app.logger.application_insights_workflow_callbacks.ApplicationInsightsWorkflowCallbacks.__init__",
        return_value=None,
    ):
        callbacks = ApplicationInsightsWorkflowCallbacks()
        # __init__ did nothing, so populate the private attributes directly
        for attr_name, attr_value in preset_state.items():
            setattr(callbacks, attr_name, attr_value)
        yield callbacks


def test_workflow_start(workflow_callbacks, mock_logger):
    """workflow_start() should emit an info-level log record."""
    placeholder_instance = object()
    workflow_callbacks.workflow_start("test_workflow", placeholder_instance)
    assert mock_logger.info.called


def test_workflow_end(workflow_callbacks, mock_logger):
    """workflow_end() should emit an info-level log record."""
    placeholder_instance = object()
    workflow_callbacks.workflow_end("test_workflow", placeholder_instance)
    assert mock_logger.info.called


def test_log(workflow_callbacks, mock_logger):
    """log() should emit an info-level log record."""
    workflow_callbacks.log("test_log_message")
    assert mock_logger.info.called


def test_warning(workflow_callbacks, mock_logger):
    """warning() should emit a warning-level log record."""
    workflow_callbacks.warning("test_warning")
    assert mock_logger.warning.called


def test_error(workflow_callbacks, mock_logger):
    """error() should emit an error-level log record."""
    cause = Exception("test_exception")
    workflow_callbacks.error("test_error", cause)
    assert mock_logger.error.called
-------------------------------------------------------------------------------- /backend/tests/unit/test_logger_blob_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | from unittest.mock import patch 5 | 6 | import pytest 7 | 8 | from graphrag_app.logger.blob_workflow_callbacks import BlobWorkflowCallbacks 9 | 10 | 11 | @pytest.fixture 12 | def mock_blob_service_client(): 13 | with patch( 14 | "graphrag_app.logger.blob_workflow_callbacks.BlobServiceClient" 15 | ) as mock_blob_service_client: 16 | yield mock_blob_service_client 17 | 18 | 19 | @pytest.fixture 20 | def workflow_callbacks(mock_blob_service_client): 21 | with patch( 22 | "graphrag_app.logger.blob_workflow_callbacks.BlobWorkflowCallbacks.__init__", 23 | return_value=None, 24 | ): 25 | instance = BlobWorkflowCallbacks() 26 | instance._blob_service_client = mock_blob_service_client 27 | instance._index_name = "mock_index_name" 28 | instance._container_name = "logs" 29 | instance._blob_name = "logs/logs.txt" 30 | instance._num_workflow_steps = 4 31 | instance._processed_workflow_steps = [] 32 | instance._workflow_name = "" 33 | yield instance 34 | 35 | 36 | def test_on_workflow_start(workflow_callbacks): 37 | workflow_callbacks.workflow_start("test_workflow", object()) 38 | # check if blob workflow callbacks _write_log() method was called 39 | assert workflow_callbacks._blob_service_client.get_blob_client().append_block.called 40 | 41 | 42 | def test_on_workflow_end(workflow_callbacks): 43 | workflow_callbacks.workflow_end("test_workflow", object()) 44 | assert workflow_callbacks._blob_service_client.get_blob_client().append_block.called 45 | 46 | 47 | def test_on_error(workflow_callbacks): 48 | workflow_callbacks.error("test_error", Exception("test_exception")) 49 | assert workflow_callbacks._blob_service_client.get_blob_client().append_block.called 50 | 
-------------------------------------------------------------------------------- /backend/tests/unit/test_logger_console_callbacks.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import logging 5 | from unittest.mock import MagicMock, patch 6 | 7 | import pytest 8 | 9 | from graphrag_app.logger.console_workflow_callbacks import ConsoleWorkflowCallbacks 10 | 11 | 12 | @pytest.fixture 13 | def mock_logger(): 14 | with patch( 15 | "graphrag_app.logger.console_workflow_callbacks.logging.getLogger" 16 | ) as mock_get_logger: 17 | mock_logger_instance = MagicMock(spec=logging.Logger) 18 | mock_get_logger.return_value = mock_logger_instance 19 | yield mock_logger_instance 20 | 21 | 22 | @pytest.fixture 23 | def workflow_callbacks(mock_logger): 24 | with patch( 25 | "graphrag_app.logger.console_workflow_callbacks.ConsoleWorkflowCallbacks.__init__", 26 | return_value=None, 27 | ): 28 | instance = ConsoleWorkflowCallbacks() 29 | instance._logger = mock_logger 30 | instance._index_name = "mock_index_name" 31 | instance._num_workflow_steps = 4 32 | instance._processed_workflow_steps = [] 33 | instance._properties = {} 34 | yield instance 35 | 36 | 37 | def test_workflow_start(workflow_callbacks, mock_logger): 38 | workflow_callbacks.workflow_start("test_workflow", object()) 39 | assert mock_logger.info.called 40 | 41 | 42 | def test_workflow_end(workflow_callbacks, mock_logger): 43 | workflow_callbacks.workflow_end("test_workflow", object()) 44 | assert mock_logger.info.called 45 | 46 | 47 | def test_log(workflow_callbacks, mock_logger): 48 | workflow_callbacks.log("test_log_message") 49 | assert mock_logger.info.called 50 | 51 | 52 | def test_warning(workflow_callbacks, mock_logger): 53 | workflow_callbacks.warning("test_warning") 54 | assert mock_logger.warning.called 55 | 56 | 57 | def test_error(workflow_callbacks, mock_logger): 58 | 
workflow_callbacks.error("test_error", Exception("test_exception")) 59 | assert mock_logger.error.called 60 | -------------------------------------------------------------------------------- /docker/Dockerfile-backend: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | # For more information about the base image: https://mcr.microsoft.com/en-us/artifact/mar/devcontainers/python/about 5 | FROM mcr.microsoft.com/devcontainers/python:3.10-bookworm 6 | 7 | # Patch Debian to remediate CVE findings 8 | # Apply Debian bookworm-updates by running a full system upgrade 9 | RUN echo "deb http://deb.debian.org/debian bookworm-updates main" >> /etc/apt/sources.list.d/bookworm-updates.list \ 10 | && echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/sources.list.d/backports.list \ 11 | && apt-get update \ 12 | && apt-get upgrade -y \ 13 | && apt-get autoremove -y \ 14 | && apt-get clean \ 15 | && rm -rf /var/lib/apt/lists/* 16 | 17 | # default graphrag version will be 0.0.0 unless overridden by --build-arg 18 | ARG GRAPHRAG_VERSION=0.0.0 19 | ENV GRAPHRAG_VERSION=v${GRAPHRAG_VERSION} 20 | ENV PIP_ROOT_USER_ACTION=ignore 21 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 22 | ENV SETUPTOOLS_USE_DISTUTILS=stdlib 23 | ENV TIKTOKEN_CACHE_DIR=/opt/tiktoken_cache/ 24 | 25 | # CVE finding in pip < 23.3 - Upgrade pip to version 23.3 or greater 26 | RUN pip install --upgrade pip 27 | 28 | COPY backend /backend 29 | RUN cd backend \ 30 | && pip install poetry \ 31 | && poetry config virtualenvs.create false \ 32 | && poetry install 33 | 34 | # download all nltk data that graphrag requires 35 | RUN python -c "import nltk;nltk.download(['punkt','averaged_perceptron_tagger','maxent_ne_chunker','words','wordnet'])" 36 | # download tiktoken model encodings 37 | RUN python -c "import tiktoken; tiktoken.encoding_for_model('gpt-3.5-turbo'); 
tiktoken.encoding_for_model('gpt-4'); tiktoken.encoding_for_model('gpt-4o');" 38 | 39 | # CVE finding in cryptography <= 44.0.0 - cache version 44.0.1 of cryptography via pip 40 | RUN pip install cryptography==44.0.1 41 | 42 | WORKDIR /backend 43 | EXPOSE 80 44 | CMD ["uvicorn", "graphrag_app.main:app", "--host", "0.0.0.0", "--port", "80"] 45 | -------------------------------------------------------------------------------- /docker/Dockerfile-frontend: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | # For more information about the base image visit: 5 | # https://mcr.microsoft.com/en-us/artifact/mar/devcontainers/python/about 6 | FROM mcr.microsoft.com/devcontainers/python:3.10-bookworm 7 | 8 | ENV PIP_ROOT_USER_ACTION=ignore 9 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 10 | ENV SETUPTOOLS_USE_DISTUTILS=stdlib 11 | 12 | COPY frontend /frontend 13 | RUN cd frontend \ 14 | && pip install poetry \ 15 | && poetry config virtualenvs.create false \ 16 | && poetry install 17 | 18 | WORKDIR /frontend 19 | EXPOSE 8080 20 | CMD ["streamlit", "run", "app.py", "--server.port", "8080"] 21 | -------------------------------------------------------------------------------- /docs/DEVELOPMENT-GUIDE.md: -------------------------------------------------------------------------------- 1 | # Development Guide 2 | 3 | This document is for developers interested in contributing to GraphRAG. 4 | 5 | ## Quickstart 6 | Development is best done in a unix environment (Linux, Mac, or [Windows WSL](https://learn.microsoft.com/en-us/windows/wsl/install)). 7 | 8 | 1. Clone the GraphRAG repository. 9 | 1. Follow all directions in the [deployment guide](DEPLOYMENT-GUIDE.md) to install required tools and deploy an instance of the GraphRAG service in Azure. Alternatively, this repo provides a devcontainer with all tools preinstalled. 10 | 1. 
New unit tests and integration tests are currently being added to improve the developer experience when testing code changes locally. 11 | 12 | ### Testing 13 | 14 | A small collection of unit tests and integration tests has been written to test functionality of the API. To get started, first ensure that all test dependencies have been installed. 15 | 16 | ```shell 17 | cd /backend 18 | poetry install --with test 19 | ``` 20 | 21 | Some tests require the [azurite emulator](https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azurite?toc=%2Fazure%2Fstorage%2Fblobs%2Ftoc.json&bc=%2Fazure%2Fstorage%2Fblobs%2Fbreadcrumb%2Ftoc.json&tabs=docker-hub%2Cblob-storage) and [cosmosdb emulator](https://learn.microsoft.com/en-us/azure/cosmos-db/how-to-develop-emulator?tabs=docker-linux%2Ccsharp&pivots=api-nosql) to be running locally (these are set up automatically in the CI/CD pipeline). You may start these emulators by running them in the background as docker containers. 22 | 23 | ```shell 24 | docker run -d -p 10000:10000 -p 10001:10001 -p 10002:10002 mcr.microsoft.com/azure-storage/azurite:latest 25 | docker run -d -p 8081:8081 -p 1234:1234 mcr.microsoft.com/cosmosdb/linux/azure-cosmos-emulator:vnext-preview 26 | ``` 27 | 28 | To run the tests: 29 | 30 | ```shell 31 | cd /backend 32 | pytest -s --cov=graphrag_app tests 33 | ``` 34 | 35 | ### Deployment (CI/CD) 36 | This repository uses GitHub Actions for continuous integration and continuous deployment (CI/CD). 37 | 38 | ### Style Guide: 39 | * We follow [PEP 8](https://peps.python.org/pep-0008) standards and naming conventions as closely as possible. 40 | 41 | * [ruff](https://docs.astral.sh/ruff) is used for linting and code formatting. A pre-commit hook has been set up to automatically apply settings to this repo. To make use of this tool without explicitly calling it, install the pre-commit hook. 
42 | ``` 43 | > pre-commit install 44 | ``` 45 | 46 | ### Versioning 47 | We use [SemVer](https://aka.ms/StartRight/README-Template/semver) for semantic versioning. 48 | -------------------------------------------------------------------------------- /docs/assets/graphrag-architecture-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/docs/assets/graphrag-architecture-diagram.png -------------------------------------------------------------------------------- /docs/assets/graphrag-architecture-diagram.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/docs/assets/graphrag-architecture-diagram.vsdx -------------------------------------------------------------------------------- /frontend/.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | [server] 5 | enableXsrfProtection = false 6 | -------------------------------------------------------------------------------- /frontend/README.md: -------------------------------------------------------------------------------- 1 | # Frontend Application Launch Instructions 2 | A small frontend application (a streamlit app) is provided to demonstrate how to build and deploy a UI on top of the solution accelerator API. 3 | This application is optional and not required for the solution accelerator API to function properly. 4 | 5 | ### 1. Deploy the GraphRAG solution accelerator 6 | Follow instructions from the [deployment guide](../docs/DEPLOYMENT-GUIDE.md) to deploy a full instance of the solution accelerator. 7 | 8 | ### 2. 
(optional) Create a `.env` file: 9 | If a `.env` file is not provided, the UI will prompt the user for additional login information. 10 | 11 | | Variable Name | Required | Example | Description | 12 | | :--- | --- | :--- | ---: | 13 | DEPLOYMENT_URL | No | `https://<apim-name>.azure-api.net` | Base URL of the deployed graphrag API. Also referred to as the APIM Gateway URL. 14 | APIM_SUBSCRIPTION_KEY | No | | A [subscription key](https://learn.microsoft.com/en-us/azure/api-management/api-management-subscriptions) generated by APIM. 15 | DEPLOYER_EMAIL | No | deployer@email.com | Email address of the person/organization that deployed the solution accelerator. 16 | 17 | ## Run UI locally 18 | 19 | The frontend application can run locally as a docker container. 20 | 21 | ``` 22 | # cd to the root directory of the repo 23 | > docker build -t graphrag:frontend -f docker/Dockerfile-frontend . 24 | > docker run --env-file <env_file> -p 8080:8080 graphrag:frontend 25 | ``` 26 | To access the app, visit `localhost:8080` in your browser. 27 | 28 | ## Host UI in Azure 29 | The frontend application can also be hosted in Azure as a Web App using the provided `frontend/deploy.sh` script. 30 | 31 | ### 1. Create Azure App Registration 32 | 33 | To enable authentication and authorization for the frontend application, you need to create an Azure App Registration with ID tokens enabled. You may need Owner level permissions on the subscription for some steps. 34 | This app registration will be used for Authentication and Authorization to the frontend web app (not the backend). Follow the steps below: 35 | 36 | 1. Go to the [Azure portal](https://portal.azure.com) and sign in with your Azure account. 37 | 2. Navigate to the **Microsoft Entra ID** service. 38 | 3. Select **App registrations** from the left-hand menu. 39 | 4. Click on the **+ New registration** button. 40 | 5. Provide a name for your app registration and select the appropriate account type. 41 | 6. 
Under the **Redirect URIs** section, select the `Web` platform from the dropdown menu. Add `http://localhost:8080/.auth/login/aad/callback` to the URL text field. The deployment script will update this field later with the actual URL of the webapp. 42 | 7. Save the app registration. 43 | 8. Under the **Manage** section select **Authentication**. Select **ID tokens** as the supported token type. 44 | 9. In the **Overview** section of the registered app, take note of the **Application (client) ID**, **Object ID** and **Directory (tenant) ID**. This information will be used later. 45 | 46 | ### 2. Populate the deploy parameters 47 | 48 | Please fill out `frontend/frontend_deploy.parameters.json` with the required values described below. 49 | 50 | 1. Replace the placeholder values with actual values for the following required variables. You may also add optional variables to the JSON file if you wish to override the default values: 51 | 52 | | Variable Name | Required | Example | Description | 53 | | :------------------- | :------- | :------------------------------------- | :-------------------------------------------------------------- | 54 | | LOCATION | Yes | eastus | The Azure region where resources will be deployed. | 55 | | RESOURCE_GROUP | Yes | my-resource-group | The name of the Azure resource group where resources will be created. At this time, the name must follow [Azure Container Registry](https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/resource-name-rules#microsoftcontainerregistry) naming guidelines. | 56 | | SUBSCRIPTION_ID | Yes | 12345678-1234-1234-1234-1234567890ab | The ID of the Azure subscription where the resources will be deployed. | 57 | | AAD_CLIENT_ID | Yes | 12345678-1234-1234-1234-1234567890ab | The client ID of the Microsoft Entra ID (AAD) app registration. | 58 | | AAD_OBJECT_ID | Yes | 12345678-1234-1234-1234-1234567890ab | The object ID of the Microsoft Entra ID (AAD) app registration. 
| 59 | | AAD_TENANT_ID | Yes | 12345678-1234-1234-1234-1234567890ab | The ID of the Microsoft Entra ID (AAD) tenant. | 60 | | AAD_TOKEN_ISSUER_URL | No | https://login.microsoftonline.com/12345678-1234-1234-1234-1234567890ab/v2.0 | The URL of the Microsoft Entra ID (AAD) token issuer. Defaults to the tenant-specific issuer URL. | 61 | | IMAGE_NAME | No | graphrag:frontend | The name of the Docker image for the frontend application. Defaults to `graphrag:frontend`. | 62 | | REGISTRY_NAME | No | myresourcegroupreg | The name of the Azure Container Registry. Defaults to the resource group name with `reg` appended. | 63 | | APP_SERVICE_PLAN | No | myresourcegroup-asp | The name of the Azure App Service plan. Defaults to the resource group name with `asp` appended. | 64 | | WEB_APP | No | myresourcegroup-playground | The name of the Azure Web App. Defaults to the resource group name with `playground` appended. | 65 | | WEB_APP_IDENTITY | No | myresourcegroup-playground-identity | The name of the managed identity for the Azure Web App. Defaults to the web app name with `identity` appended. | 66 | 67 | Save the `frontend/frontend_deploy.parameters.json` file after populating the values. If you would like the webapp to automatically connect 68 | to the solution accelerator backend API, create and populate a `.env` file as described in step 2; otherwise, the webapp will ask for login credentials to the APIM service that was deployed as part of the backend API. 69 | 70 | ### 3. Run the deploy script 71 | 72 | Prerequisite: Please install az-cli version >=2.64.0. 73 | To deploy the frontend application: 74 | 75 | 1. Open a terminal and navigate to the `/frontend` directory. 76 | 2. Run the deploy script: 77 | 78 | ``` 79 | # cd to graphrag-accelerator/frontend directory 80 | > bash deploy.sh -p frontend_deploy.parameters.json 81 | ``` 82 | 83 | Once the frontend application has been deployed, please take note of the URL that is displayed at the end of the script. 
It will have the form `(https://PLACEHOLDER.azurewebsites.net)`. The Web App service will take 2-3 minutes initially to load the first time. This is expected behavior. 84 | -------------------------------------------------------------------------------- /frontend/app.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | import os 5 | 6 | import streamlit as st 7 | 8 | from src.components import tabs 9 | from src.components.index_pipeline import IndexPipeline 10 | from src.enums import EnvVars 11 | from src.functions import initialize_app 12 | from src.graphrag_api import GraphragAPI 13 | 14 | # Load environment variables 15 | initialized = initialize_app() 16 | st.session_state["initialized"] = True if initialized else False 17 | 18 | 19 | def graphrag_app(initialized: bool): 20 | st.title("Microsoft GraphRAG Copilot") 21 | main_tab, prompt_gen_tab, prompt_edit_tab, index_tab, query_tab = st.tabs([ 22 | "**Intro**", 23 | "**1. Prompt Generation**", 24 | "**2. Prompt Configuration**", 25 | "**3. Index**", 26 | "**4. 
Query**", 27 | ]) 28 | # display only the main tab if a connection to an existing APIM has not been initialized 29 | with main_tab: 30 | tabs.get_main_tab(initialized) 31 | if initialized: 32 | # setup API request information 33 | COLUMN_WIDTHS = [0.275, 0.45, 0.275] 34 | apim_url = st.session_state[EnvVars.DEPLOYMENT_URL.value] 35 | apim_key = st.session_state[EnvVars.APIM_SUBSCRIPTION_KEY.value] 36 | # perform health check to verify connectivity 37 | client = GraphragAPI(apim_url, apim_key) 38 | if not client.health_check_passed(): 39 | st.error("APIM Connection Error") 40 | st.stop() 41 | indexPipe = IndexPipeline(client, COLUMN_WIDTHS) 42 | # display tabs 43 | with prompt_gen_tab: 44 | tabs.get_prompt_generation_tab(client, COLUMN_WIDTHS) 45 | with prompt_edit_tab: 46 | tabs.get_prompt_configuration_tab() 47 | with index_tab: 48 | tabs.get_index_tab(indexPipe) 49 | with query_tab: 50 | tabs.get_query_tab(client) 51 | deployer_email = os.getenv("DEPLOYER_EMAIL", "deployer@email.com") 52 | footer = f""" 53 | 56 | """ 57 | st.markdown(footer, unsafe_allow_html=True) 58 | 59 | 60 | if __name__ == "__main__": 61 | graphrag_app(st.session_state["initialized"]) 62 | -------------------------------------------------------------------------------- /frontend/frontend_deploy.parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "SUBSCRIPTION_ID": "", 3 | "RESOURCE_GROUP": "", 4 | "LOCATION": "", 5 | "AAD_CLIENT_ID": "", 6 | "AAD_OBJECT_ID": "", 7 | "AAD_TENANT_ID": "" 8 | } 9 | -------------------------------------------------------------------------------- /frontend/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "graphrag-solution-accelerator" 3 | version = "0.1.1" 4 | description = "" 5 | authors = [ 6 | "Josh Bradley ", 7 | "Newman Cheng ", 8 | "Christine DiFonzo ", 9 | "Gabriel Nieves ", 10 | "Douglas Orbaker ", 11 | "Chris Sanchez ", 12 
| "Shane Solomon ", 13 | ] 14 | readme = "README.md" 15 | license = "MIT" 16 | package-mode = false 17 | 18 | [tool.poetry.dependencies] 19 | python = "~3.10" 20 | 21 | [tool.poetry.group.dev.dependencies] 22 | detect-secrets = ">=1.5.0" 23 | devtools = ">=0.12.2" 24 | flake8 = ">=6.1.0" 25 | ipython = "*" 26 | jupyter = "*" 27 | pre-commit = ">=3.6.0" 28 | ruff = ">=0.1.13" 29 | 30 | [tool.poetry.group.test.dependencies] 31 | pytest = ">=8.2.1" 32 | wikipedia = ">=1.4.0" 33 | 34 | [tool.poetry.group.frontend.dependencies] 35 | requests = "*" 36 | streamlit = ">=1.38.0" 37 | streamlit-nested-layout = "*" 38 | 39 | [tool.ruff] 40 | target-version = "py310" 41 | line-length = 88 42 | indent-width = 4 43 | 44 | [tool.ruff.format] 45 | preview = true 46 | quote-style = "double" 47 | 48 | [tool.ruff.lint] 49 | preview = true 50 | select = ["E", "F", "I"] 51 | ignore = ["E402", "E501", "F821"] 52 | 53 | [build-system] 54 | requires = ["poetry-core"] 55 | build-backend = "poetry.core.masonry.api" 56 | -------------------------------------------------------------------------------- /frontend/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/frontend/src/__init__.py -------------------------------------------------------------------------------- /frontend/src/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/frontend/src/components/__init__.py -------------------------------------------------------------------------------- /frontend/src/components/login_sidebar.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 
import streamlit as st

from src.enums import EnvVars
from src.graphrag_api import GraphragAPI


def login():
    """
    Login component that displays in the sidebar.

    Renders a form asking for the APIM Gateway URL and Subscription Key. When
    the form is submitted, a ``GraphragAPI`` client is built from the entered
    values and its health check is called. If the check passes, both values
    are stored in ``st.session_state`` (under the ``EnvVars`` names), the
    ``initialized`` flag is set to True and the script is rerun. Otherwise an
    error message is displayed and nothing is stored.
    """
    with st.sidebar:
        st.title(
            "Login",
            help="Enter your APIM credentials to get started. Refreshing the browser will require you to login again.",
        )
        with st.form(key="login-form", clear_on_submit=True):
            apim_url = st.text_input("APIM Gateway URL", key="apim-url")
            # The subscription key is a secret: mask it so it is not shown in
            # clear text on screen while the user types or pastes it.
            apim_sub_key = st.text_input(
                "APIM Subscription Key", key="subscription-key", type="password"
            )
            form_submit = st.form_submit_button("Login")
            if form_submit:
                # Copy/pasted credentials frequently carry stray whitespace;
                # drop it before validating and before persisting the values.
                apim_url = apim_url.strip()
                apim_sub_key = apim_sub_key.strip()
                client = GraphragAPI(apim_url, apim_sub_key)
                if client.health_check_passed():
                    st.success("Login Successful")
                    st.session_state[EnvVars.DEPLOYMENT_URL.value] = apim_url
                    st.session_state[EnvVars.APIM_SUBSCRIPTION_KEY.value] = apim_sub_key
                    st.session_state["initialized"] = True
                    # Rerun so the rest of the app picks up the new state.
                    st.rerun()
                else:
                    st.error("Login Failed")
                    st.error("Please check the APIM Gateway URL and Subscription Key")
import os

import streamlit as st

from src.enums import PromptFileNames, PromptKeys, PromptTextAreas
from src.functions import zip_directory

# Session-state flag: True once the prompts have been saved (editors locked).
SAVED_PROMPT_VAR = "saved_prompts"


def save_prompts(
    local_dir: str = "./edited_prompts/", zip_file_path: str = "edited_prompts.zip"
):
    """
    Save prompts in memory and on disk as a zip file.

    Marks the prompts as saved, copies the current content of each prompt text
    area into its ``PromptKeys`` session-state entry, writes every prompt to
    its ``PromptFileNames`` file inside ``local_dir`` and finally zips that
    directory.

    Args:
        local_dir: Directory the individual prompt files are written to. It is
            created if it does not exist.
        zip_file_path: Path handed to ``zip_directory`` as the archive target.
    """
    st.session_state[SAVED_PROMPT_VAR] = True
    # Pair the three enums by member name (ENTITY / SUMMARY / COMMUNITY) rather
    # than by declaration order, so reordering one enum cannot silently write a
    # prompt under the wrong key or file name.
    for prompt in PromptKeys:
        text_area_key = PromptTextAreas[prompt.name].value
        st.session_state[prompt.value] = st.session_state[text_area_key]
    os.makedirs(local_dir, exist_ok=True)
    for prompt in PromptKeys:
        outpath = os.path.join(local_dir, PromptFileNames[prompt.name].value)
        with open(outpath, "w", encoding="utf-8") as f:
            f.write(st.session_state[prompt.value])
    zip_directory(local_dir, zip_file_path)


def edit_prompts():
    """
    Re-edit the prompts.

    Clears the saved flag so the prompt text areas become editable again.
    """
    st.session_state[SAVED_PROMPT_VAR] = False


def prompt_editor(prompt_values: list[str]):
    """
    Container for prompt configurations.

    Renders one tab per prompt, each holding a text area pre-filled with the
    corresponding prompt. The text areas are disabled while the prompts are
    marked as saved.

    Args:
        prompt_values: Exactly three prompt texts, in the order entity
            extraction, summarize descriptions, community reports.
    """
    # Default to "not saved" (editable) when the flag has not been set yet
    # instead of failing on a missing session-state entry.
    saved_prompts = st.session_state.get(SAVED_PROMPT_VAR, False)

    entity_ext_prompt, summ_prompt, comm_report_prompt = prompt_values

    # (tab label, widget label, initial text, widget key) for each editor. The
    # widget keys come from PromptTextAreas because save_prompts reads the
    # edited text back through that same enum.
    editors = [
        (
            "**Entity Extraction**",
            "Entity Prompt",
            entity_ext_prompt,
            PromptTextAreas.ENTITY,
        ),
        (
            "**Summarize Descriptions**",
            "Summarize Prompt",
            summ_prompt,
            PromptTextAreas.SUMMARY,
        ),
        (
            "**Community Reports**",
            "Community Reports Prompt",
            comm_report_prompt,
            PromptTextAreas.COMMUNITY,
        ),
    ]

    with st.container(border=True):
        tab_labels = [tab_label for tab_label, *_ in editors]
        prompt_tabs = st.tabs(tabs=tab_labels)
        for tab, (_, label, value, text_area) in zip(prompt_tabs, editors):
            with tab:
                st.text_area(
                    label=label,
                    value=value,
                    max_chars=20000,
                    key=text_area.value,
                    label_visibility="hidden",
                    disabled=saved_prompts,
                )
import json

import streamlit as st

from src.graphrag_api import GraphragAPI

UPLOAD_HELP_MESSAGE = """
This functionality is disabled while an existing Storage Container is selected.
Please deselect the existing Storage Container to upload new data.
"""


def upload_files(
    client: GraphragAPI, key_prefix: str, disable_other_input: bool = False
):
    """
    Reusable component to upload files to Blob Storage Container.

    Renders a storage-name input, a multi-file ``.txt`` uploader and an
    "Upload Files" button. When the button is pressed with a storage name and
    at least one file, the files are sent through ``client.upload_files`` and
    a success or error message is shown depending on the response status.

    Args:
        client: API client whose ``upload_files`` method performs the upload.
        key_prefix: Prefix for the Streamlit widget keys, so the component can
            be rendered more than once on a page.
        disable_other_input: When True, all three widgets are disabled.

    Returns:
        The value of the upload button, i.e. True on the run in which it was
        clicked (regardless of whether the upload succeeded).
    """
    input_storage_name = st.text_input(
        label="Enter Storage Name",
        key=f"{key_prefix}-storage-name-input",
        disabled=disable_other_input,
        help=UPLOAD_HELP_MESSAGE,
    )
    file_upload = st.file_uploader(
        "Upload Data",
        type=["txt"],
        key=f"{key_prefix}-file-uploader",
        accept_multiple_files=True,
        disabled=disable_other_input,
    )

    uploaded = st.button(
        "Upload Files",
        key=f"{key_prefix}-upload-button",
        disabled=disable_other_input or input_storage_name == "",
    )
    if uploaded and input_storage_name != "":
        if not file_upload:
            # Tell the user why nothing happened instead of silently ignoring
            # the click.
            st.warning("Please select at least one file to upload.")
        else:
            # One ("files", (name, content, mime type)) tuple per selected file.
            file_payloads = [
                ("files", (file.name, file.read(), file.type))
                for file in file_upload
            ]

            response = client.upload_files(file_payloads, input_storage_name)
            if response.status_code == 200:
                st.success("Files uploaded successfully!")
            else:
                # Error bodies are not guaranteed to be JSON (a gateway may
                # answer with plain text or HTML); fall back to the raw text
                # rather than crashing the page with a decode error.
                try:
                    detail = json.loads(response.text)
                except ValueError:
                    detail = response.text
                st.error(f"Error: {detail}")
    return uploaded
from enum import Enum


class PromptKeys(str, Enum):
    """Identifiers under which each prompt's text is kept in session state."""

    ENTITY = "entity_extraction"
    SUMMARY = "summarize_descriptions"
    COMMUNITY = "community_report"


class PromptFileNames(str, Enum):
    """File name used for each prompt when the prompts are written to disk."""

    # Member names mirror PromptKeys so the two enums can be paired up.
    ENTITY = "entity_extraction_prompt.txt"
    SUMMARY = "summarize_descriptions_prompt.txt"
    COMMUNITY = "community_report_prompt.txt"


class PromptTextAreas(str, Enum):
    """Streamlit widget keys of the text areas in which prompts are edited."""

    ENTITY = "entity_text_area"
    SUMMARY = "summary_text_area"
    COMMUNITY = "community_text_area"


class StorageIndexVars(str, Enum):
    """Names for storage/index selection state.

    NOTE(review): presumably used as session-state keys like the other enums
    in this module -- usage is not visible here, confirm against callers.
    """

    SELECTED_STORAGE = "selected_storage"
    INPUT_STORAGE = "input_storage"
    SELECTED_INDEX = "selected_index"


class EnvVars(str, Enum):
    """Names of the APIM connection settings (gateway URL and subscription key)."""

    APIM_SUBSCRIPTION_KEY = "APIM_SUBSCRIPTION_KEY"
    DEPLOYMENT_URL = "DEPLOYMENT_URL"
4 | */ 5 | 6 | @import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.1/css/all.min.css'); 7 | 8 | #root > div:nth-child(1) > div.withScreencast > div > div > div > section.main.st-emotion-cache-uf99v8.ea3mdgi8 > div.block-container.st-emotion-cache-z5fcl4.ea3mdgi5 > div > div > div > div.st-emotion-cache-ocqkz7.e1f1d6gn5 > div:nth-child(4) > div > div > div > div > div{ 9 | margin-top: 1.6em; 10 | } 11 | 12 | 13 | [data-testid="stHeadingDivider"] { 14 | background-color: #3d9df3; /* Set your desired color */ 15 | height: 1px; 16 | } 17 | 18 | #microsoft-graphrag-copilot > div > span { 19 | text-align: center; 20 | margin-top: -1em; 21 | } 22 | 23 | /* Tooltip container */ 24 | .tooltip { 25 | position: relative; 26 | display: inline-block; 27 | border-bottom: 1px dotted black; /* If you want dots under the hoverable text */ 28 | } 29 | 30 | /* Tooltip text */ 31 | .tooltip .tooltiptext { 32 | visibility: hidden; 33 | width: 120px; 34 | background-color: #555; 35 | color: #fff; 36 | text-align: center; 37 | border-radius: 6px; 38 | padding: 5px; 39 | position: absolute; 40 | z-index: 1; 41 | bottom: 125%; 42 | left: 50%; 43 | margin-left: -60px; 44 | opacity: 0; 45 | transition: opacity 0.3s; 46 | } 47 | 48 | /* Show the tooltip text when you hover over the tooltip container */ 49 | .tooltip:hover .tooltiptext { 50 | visibility: visible; 51 | opacity: 1; 52 | } 53 | 54 | .gray-box { 55 | 56 | background-color: #ffffff; 57 | padding: 10px; 58 | width: 80%; 59 | } 60 | 61 | .center-container { 62 | margin-top: -10em; 63 | display: flex; 64 | align-items: center; 65 | justify-content: center; 66 | height: 100vh; 67 | } 68 | 69 | .footer { 70 | display: flex; 71 | justify-content: center; 72 | align-items: center; 73 | position: fixed; 74 | left: 0; 75 | bottom: 0; 76 | width: 100%; 77 | background-color: #f1f1f1; 78 | text-align: center; 79 | padding: 5px; 80 | z-index: 1000; 81 | } 82 | .footer p{ 83 | font-size: 12px; 84 | } 85 | 86 | /* CSS */ 87 | 
button[kind="primary"] { 88 | background-color: #1d9445; 89 | border: 0; 90 | border-radius: 56px; 91 | color: #fff; 92 | cursor: pointer; 93 | display: inline-block; 94 | font-family: system-ui,-apple-system,system-ui,"Segoe UI",Roboto,Ubuntu,"Helvetica Neue",sans-serif; 95 | font-size: 58px; 96 | font-weight: 600; 97 | outline: 0; 98 | padding: 16px 21px; 99 | position: relative; 100 | text-align: center; 101 | text-decoration: none; 102 | transition: all .3s; 103 | user-select: none; 104 | -webkit-user-select: none; 105 | touch-action: manipulation; 106 | } 107 | 108 | button[kind="primary"]:before { 109 | background-color: initial; 110 | background-image: linear-gradient(#fff 0, rgba(255, 255, 255, 0) 100%); 111 | border-radius: 125px; 112 | content: ""; 113 | height: 50%; 114 | left: 4%; 115 | opacity: .5; 116 | position: absolute; 117 | top: 0; 118 | transition: all .3s; 119 | width: 62%; 120 | } 121 | 122 | button[kind="primary"]:hover { 123 | box-shadow: rgba(255, 255, 255, .2) 0 3px 15px inset, rgba(0, 0, 0, .1) 0 3px 5px, rgba(0, 0, 0, .1) 0 10px 13px; 124 | transform: scale(1.05); 125 | } 126 | 127 | @media (min-width: 768px) { 128 | button[kind="primary"] { 129 | padding: 15px 34px; 130 | margin: 20px auto; 131 | } 132 | } 133 | 134 | .element-container:has(>.stTextArea), .stTextArea { 135 | display: block; 136 | margin-left: auto; 137 | margin-right: auto; 138 | } 139 | .stTextArea textarea { 140 | height: 500px; 141 | /*background-color: #a7b0a4;*/ 142 | } 143 | -------------------------------------------------------------------------------- /infra/abbreviations.json: -------------------------------------------------------------------------------- 1 | { 2 | "analysisServicesServers": "as", 3 | "apiManagementService": "apim-", 4 | "appConfigurationConfigurationStores": "appcs-", 5 | "appContainerApps": "ca-", 6 | "appManagedEnvironments": "cae-", 7 | "authorizationPolicyDefinitions": "policy-", 8 | "automationAutomationAccounts": "aa-", 9 | 
"azureOpenAI": "aoai-", 10 | "blueprintBlueprints": "bp-", 11 | "blueprintBlueprintsArtifacts": "bpa-", 12 | "cacheRedis": "redis-", 13 | "cdnProfiles": "cdnp-", 14 | "cdnProfilesEndpoints": "cdne-", 15 | "cognitiveServicesAccounts": "cog-", 16 | "cognitiveServicesFormRecognizer": "cog-fr-", 17 | "cognitiveServicesTextAnalytics": "cog-ta-", 18 | "computeAvailabilitySets": "avail-", 19 | "computeCloudServices": "cld-", 20 | "computeDiskEncryptionSets": "des", 21 | "computeDisks": "disk", 22 | "computeDisksOs": "osdisk", 23 | "computeGalleries": "gal", 24 | "computeSnapshots": "snap-", 25 | "computeVirtualMachineScaleSets": "vmss-", 26 | "computeVirtualMachines": "vm", 27 | "containerInstanceContainerGroups": "ci", 28 | "containerRegistryRegistries": "cr", 29 | "containerServiceManagedClusters": "aks-", 30 | "dBforMySQLServers": "mysql-", 31 | "dBforPostgreSQLServers": "psql-", 32 | "dataFactoryFactories": "adf-", 33 | "dataLakeAnalyticsAccounts": "dla", 34 | "dataLakeStoreAccounts": "dls", 35 | "dataMigrationServices": "dms-", 36 | "databricksWorkspaces": "dbw-", 37 | "devicesIotHubs": "iot-", 38 | "devicesProvisioningServices": "provs-", 39 | "devicesProvisioningServicesCertificates": "pcert-", 40 | "documentDBDatabaseAccounts": "cosmos-", 41 | "eventGridDomains": "evgd-", 42 | "eventGridDomainsTopics": "evgt-", 43 | "eventGridEventSubscriptions": "evgs-", 44 | "eventHubNamespaces": "evhns-", 45 | "eventHubNamespacesEventHubs": "evh-", 46 | "hdInsightClustersHadoop": "hadoop-", 47 | "hdInsightClustersHbase": "hbase-", 48 | "hdInsightClustersKafka": "kafka-", 49 | "hdInsightClustersMl": "mls-", 50 | "hdInsightClustersSpark": "spark-", 51 | "hdInsightClustersStorm": "storm-", 52 | "hybridComputeMachines": "arcs-", 53 | "insightsActionGroups": "ag-", 54 | "insightsComponents": "appi-", 55 | "keyVaultVaults": "kv-", 56 | "kubernetesConnectedClusters": "arck", 57 | "kustoClusters": "dec", 58 | "kustoClustersDatabases": "dedb", 59 | "logicIntegrationAccounts": "ia-", 60 
| "logicWorkflows": "logic-", 61 | "machineLearningServicesWorkspaces": "mlw-", 62 | "managedIdentityUserAssignedIdentities": "id-", 63 | "managementManagementGroups": "mg-", 64 | "migrateAssessmentProjects": "migr-", 65 | "networkApplicationGateways": "agw-", 66 | "networkApplicationSecurityGroups": "asg-", 67 | "networkAzureFirewalls": "afw-", 68 | "networkBastionHosts": "bas-", 69 | "networkConnections": "con-", 70 | "networkDnsZones": "dnsz-", 71 | "networkExpressRouteCircuits": "erc-", 72 | "networkFirewallPolicies": "afwp-", 73 | "networkFirewallPoliciesRuleGroups": "wafrg", 74 | "networkFirewallPoliciesWebApplication": "waf", 75 | "networkFrontDoors": "fd-", 76 | "networkFrontdoorWebApplicationFirewallPolicies": "fdfp-", 77 | "networkLoadBalancersExternal": "lbe-", 78 | "networkLoadBalancersInboundNatRules": "rule-", 79 | "networkLoadBalancersInternal": "lbi-", 80 | "networkLocalNetworkGateways": "lgw-", 81 | "networkNatGateways": "ng-", 82 | "networkNetworkInterfaces": "nic-", 83 | "networkNetworkSecurityGroups": "nsg-", 84 | "networkNetworkSecurityGroupsSecurityRules": "nsgsr-", 85 | "networkNetworkWatchers": "nw-", 86 | "networkPrivateDnsZones": "pdnsz-", 87 | "networkPrivateLinkScope": "pls-", 88 | "networkPrivateLinkServices": "pl-", 89 | "networkPublicIPAddresses": "pip-", 90 | "networkPublicIPPrefixes": "ippre-", 91 | "networkRouteFilters": "rf-", 92 | "networkRouteTables": "rt-", 93 | "networkRouteTablesRoutes": "udr-", 94 | "networkTrafficManagerProfiles": "traf-", 95 | "networkVirtualNetworkGateways": "vgw-", 96 | "networkVirtualNetworks": "vnet-", 97 | "networkVirtualNetworksSubnets": "snet-", 98 | "networkVirtualNetworksVirtualNetworkPeerings": "peer-", 99 | "networkVirtualWans": "vwan-", 100 | "networkVpnGateways": "vpng-", 101 | "networkVpnGatewaysVpnConnections": "vcn-", 102 | "networkVpnGatewaysVpnSites": "vst-", 103 | "notificationHubsNamespaces": "ntfns-", 104 | "notificationHubsNamespacesNotificationHubs": "ntf-", 105 | 
"operationalInsightsWorkspaces": "log-", 106 | "portalDashboards": "dash-", 107 | "powerBIDedicatedCapacities": "pbi-", 108 | "privateEndpoint": "pep-", 109 | "purviewAccounts": "pview-", 110 | "recoveryServicesVaults": "rsv-", 111 | "resourcesResourceGroups": "rg-", 112 | "searchSearchServices": "srch-", 113 | "serviceBusNamespaces": "sb-", 114 | "serviceBusNamespacesQueues": "sbq-", 115 | "serviceBusNamespacesTopics": "sbt-", 116 | "serviceEndPointPolicies": "se-", 117 | "serviceFabricClusters": "sf-", 118 | "signalRServiceSignalR": "sigr", 119 | "sqlManagedInstances": "sqlmi-", 120 | "sqlServers": "sql-", 121 | "sqlServersDataWarehouse": "sqldw-", 122 | "sqlServersDatabases": "sqldb-", 123 | "sqlServersDatabasesStretch": "sqlstrdb-", 124 | "storSimpleManagers": "ssimp", 125 | "storageStorageAccounts": "st", 126 | "storageStorageAccountsVm": "stvm", 127 | "streamAnalyticsCluster": "asa-", 128 | "synapseWorkspaces": "syn", 129 | "synapseWorkspacesAnalyticsWorkspaces": "synw", 130 | "synapseWorkspacesSqlPoolsDedicated": "syndp", 131 | "synapseWorkspacesSqlPoolsSpark": "synsp", 132 | "timeSeriesInsightsEnvironments": "tsi-", 133 | "webServerFarms": "plan-", 134 | "webSitesAppService": "app-", 135 | "webSitesAppServiceEnvironment": "ase-", 136 | "webSitesFunctions": "func-", 137 | "webStaticSites": "stapp-" 138 | } -------------------------------------------------------------------------------- /infra/core/acr/acr.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | @description('The name of the Container Registry resource. 
Will be automatically generated if not provided.') 5 | param registryName string 6 | 7 | @description('The location of the Container Registry resource.') 8 | param location string = resourceGroup().location 9 | 10 | resource registry 'Microsoft.ContainerRegistry/registries@2023-11-01-preview' = { 11 | name: registryName 12 | location: location 13 | sku: { 14 | name: 'Standard' 15 | } 16 | properties: { 17 | adminUserEnabled: false 18 | encryption: { 19 | status: 'disabled' 20 | } 21 | dataEndpointEnabled: false 22 | publicNetworkAccess: 'Enabled' 23 | networkRuleBypassOptions: 'AzureServices' 24 | zoneRedundancy: 'Disabled' 25 | anonymousPullEnabled: false 26 | metadataSearch: 'Disabled' 27 | } 28 | } 29 | 30 | output name string = registry.name 31 | output id string = registry.id 32 | output loginServer string = registry.properties.loginServer 33 | -------------------------------------------------------------------------------- /infra/core/ai-search/ai-search.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | @description('The name of the AI Search instance.') 5 | param name string 6 | 7 | @description('The location of the Managed Cluster resource.') 8 | param location string = resourceGroup().location 9 | 10 | @allowed(['enabled', 'disabled']) 11 | param publicNetworkAccess string = 'enabled' 12 | 13 | resource search 'Microsoft.Search/searchServices@2024-06-01-preview' = { 14 | name: name 15 | location: location 16 | sku: { 17 | name: 'standard' 18 | } 19 | properties: { 20 | disableLocalAuth: true 21 | replicaCount: 1 22 | partitionCount: 1 23 | publicNetworkAccess: publicNetworkAccess 24 | networkRuleSet: { 25 | ipRules: [] 26 | bypass: 'AzureServices' 27 | } 28 | semanticSearch: 'disabled' 29 | } 30 | } 31 | 32 | output name string = search.name 33 | output id string = search.id 34 | -------------------------------------------------------------------------------- /infra/core/aoai/aoai.bicep: -------------------------------------------------------------------------------- 1 | @description('Name of the Azure OpenAI instance') 2 | param openAiName string 3 | 4 | @description('Location for the Azure OpenAI instance') 5 | param location string = resourceGroup().location 6 | 7 | @description('LLM model name') 8 | param llmModelName string = 'gpt-4o' 9 | 10 | @description('LLM model deployment name') 11 | param llmModelDeploymentName string = 'gpt-4o' 12 | 13 | @description('LLM Model API version') 14 | param llmModelVersion string 15 | 16 | @description('Embedding model name') 17 | param embeddingModelName string = 'text-embedding-ada-002' 18 | 19 | @description('Embedding model deployment name') 20 | param embeddingModelDeploymentName string = 'text-embedding-ada-002' 21 | 22 | @description('Embedding Model API version') 23 | param embeddingModelVersion string 24 | 25 | @description('TPM quota for the LLM model (x1000)') 26 | param llmTpmQuota int = 1 27 | 28 | @description('TPM quota for the embedding model (x1000)') 29 | param embeddingTpmQuota int = 1 30 | 31 
| resource aoai 'Microsoft.CognitiveServices/accounts@2024-10-01' = { 32 | name: openAiName 33 | location: location 34 | sku: { 35 | name: 'S0' 36 | } 37 | kind: 'OpenAI' 38 | properties: { 39 | publicNetworkAccess: 'Enabled' 40 | disableLocalAuth: true 41 | } 42 | } 43 | 44 | resource llmDeployment 'Microsoft.CognitiveServices/accounts/deployments@2024-10-01' = { 45 | parent: aoai 46 | name: llmModelDeploymentName // model deployment name 47 | sku: { 48 | name: 'GlobalStandard' 49 | capacity: llmTpmQuota 50 | } 51 | properties: { 52 | model: { 53 | format: 'OpenAI' 54 | name: llmModelName // model name 55 | version: llmModelVersion 56 | } 57 | currentCapacity: llmTpmQuota 58 | } 59 | } 60 | 61 | resource embeddingDeployment 'Microsoft.CognitiveServices/accounts/deployments@2024-10-01' = { 62 | parent: aoai 63 | name: embeddingModelDeploymentName // model deployment name 64 | // NOTE: simultaneous AOAI model deployments are not supported at this time. As a workaround, use dependsOn to force the models to get deployed sequentially. 
65 | dependsOn: [llmDeployment] 66 | sku: { 67 | name: 'Standard' 68 | capacity: embeddingTpmQuota 69 | } 70 | properties: { 71 | model: { 72 | format: 'OpenAI' 73 | name: embeddingModelName // model name 74 | version: embeddingModelVersion 75 | } 76 | currentCapacity: embeddingTpmQuota 77 | } 78 | } 79 | 80 | output name string = aoai.name 81 | output id string = aoai.id 82 | output endpoint string = aoai.properties.endpoint 83 | output llmModel string = llmDeployment.properties.model.name 84 | output llmModelDeploymentName string = llmDeployment.name 85 | output llmModelQuota int = llmDeployment.sku.capacity 86 | output llmModelVersion string = llmDeployment.apiVersion 87 | output embeddingModel string = embeddingDeployment.properties.model.name 88 | output embeddingModelDeploymentName string = embeddingDeployment.name 89 | output embeddingModelQuota int = embeddingDeployment.sku.capacity 90 | output embeddingModelVersion string = embeddingDeployment.apiVersion 91 | -------------------------------------------------------------------------------- /infra/core/apim/apim.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | @description('The name of the API Management service instance') 5 | param apiManagementName string 6 | 7 | @description('The email address of the owner of the service') 8 | @minLength(1) 9 | param publisherEmail string 10 | 11 | @description('The name of the owner of the service') 12 | @minLength(1) 13 | param publisherName string 14 | 15 | @description('The pricing tier of this API Management service') 16 | @allowed([ 17 | 'Developer' 18 | 'StandardV2' 19 | ]) 20 | param sku string = 'Developer' 21 | 22 | @description('The instance size of this API Management service. 
This should be a multiple of the number of availability zones getting deployed.') 23 | param skuCount int = 1 24 | 25 | @description('Application Insights resource ID') 26 | param appInsightsId string 27 | 28 | @description('Application Insights instrumentation key') 29 | param appInsightsInstrumentationKey string 30 | 31 | @description('Azure region where the resources will be deployed') 32 | param location string = resourceGroup().location 33 | 34 | @description('Numbers for availability zones, for example, 1,2,3.') 35 | param availabilityZones array = [ 36 | '1' 37 | '2' 38 | ] 39 | 40 | @description('Name for the public IP address used to access the API Management service.') 41 | param publicIpName string = 'apimPublicIP' 42 | 43 | @description('SKU for the public IP address used to access the API Management service.') 44 | @allowed([ 45 | 'Standard' 46 | ]) 47 | param publicIpSku string = 'Standard' 48 | 49 | @description('Allocation method for the public IP address used to access the API Management service. 
Standard SKU public IP requires `Static` allocation.') 50 | @allowed([ 51 | 'Static' 52 | ]) 53 | param publicIPAllocationMethod string = 'Static' 54 | 55 | @description('Unique DNS name for the public IP address used to access the API management service.') 56 | param dnsLabelPrefix string = toLower('${publicIpName}-${uniqueString(resourceGroup().id)}') 57 | 58 | param restoreAPIM bool = false 59 | param subnetId string 60 | 61 | resource publicIp 'Microsoft.Network/publicIPAddresses@2024-01-01' = { 62 | name: publicIpName 63 | location: location 64 | sku: { 65 | name: publicIpSku 66 | } 67 | properties: { 68 | publicIPAllocationMethod: publicIPAllocationMethod 69 | publicIPAddressVersion: 'IPv4' 70 | dnsSettings: { 71 | domainNameLabel: dnsLabelPrefix 72 | } 73 | } 74 | } 75 | 76 | resource apiManagementService 'Microsoft.ApiManagement/service@2023-09-01-preview' = { 77 | name: apiManagementName 78 | location: location 79 | sku: { 80 | name: sku 81 | capacity: skuCount 82 | } 83 | zones: ((length(availabilityZones) == 0) ? 
null : availabilityZones) 84 | properties: { 85 | restore: restoreAPIM 86 | publisherEmail: publisherEmail 87 | publisherName: publisherName 88 | virtualNetworkType: 'External' 89 | publicIpAddressId: publicIp.id 90 | virtualNetworkConfiguration: { 91 | subnetResourceId: subnetId 92 | } 93 | customProperties: { 94 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA': 'false' 95 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA': 'false' 96 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TLS_RSA_WITH_AES_128_GCM_SHA256': 'false' 97 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TLS_RSA_WITH_AES_256_CBC_SHA256': 'false' 98 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TLS_RSA_WITH_AES_128_CBC_SHA256': 'false' 99 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TLS_RSA_WITH_AES_256_CBC_SHA': 'false' 100 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TLS_RSA_WITH_AES_128_CBC_SHA': 'false' 101 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TripleDes168': 'false' 102 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Protocols.Tls10': 'false' 103 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Protocols.Tls11': 'false' 104 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Protocols.Ssl30': 'false' 105 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Backend.Protocols.Tls10': 'false' 106 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Backend.Protocols.Tls11': 'false' 107 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Backend.Protocols.Ssl30': 'false' 108 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Protocols.Server.Http2': 'false' 109 | } 110 | } 111 | } 112 | 113 | resource apimLogger 'Microsoft.ApiManagement/service/loggers@2024-06-01-preview' = { 114 | name: 'apimLogger' 115 | parent: apiManagementService 116 | 
properties: { 117 | credentials: { 118 | instrumentationKey: appInsightsInstrumentationKey 119 | } 120 | description: 'Application Insights for APIM' 121 | loggerType: 'applicationInsights' 122 | resourceId: appInsightsId 123 | } 124 | } 125 | 126 | resource apimDiagnostics 'Microsoft.ApiManagement/service/diagnostics@2023-09-01-preview' = { 127 | name: 'applicationinsights' 128 | parent: apiManagementService 129 | properties: { 130 | loggerId: apimLogger.id 131 | alwaysLog: 'allErrors' 132 | verbosity: 'information' 133 | sampling: { 134 | percentage: 100 135 | samplingType: 'fixed' 136 | } 137 | } 138 | } 139 | 140 | output name string = apiManagementService.name 141 | output id string = apiManagementService.id 142 | output apimGatewayUrl string = apiManagementService.properties.gatewayUrl 143 | -------------------------------------------------------------------------------- /infra/core/apim/apim.graphrag-api.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | param apiManagementName string 5 | param name string 6 | param backendUrl string 7 | 8 | resource api 'Microsoft.ApiManagement/service/apis@2023-09-01-preview' = { 9 | name: '${apiManagementName}/${name}' 10 | properties: { 11 | displayName: 'GraphRAG' 12 | apiRevision: '1' 13 | subscriptionRequired: true 14 | serviceUrl: backendUrl 15 | path: '' 16 | protocols: ['https'] 17 | authenticationSettings: { 18 | oAuth2AuthenticationSettings: [] 19 | openidAuthenticationSettings: [] 20 | } 21 | subscriptionKeyParameterNames: { 22 | header: 'Ocp-Apim-Subscription-Key' 23 | query: 'subscription-key' 24 | } 25 | isCurrent: true 26 | format: 'openapi+json' 27 | value: string(loadJsonContent('openapi.json')) // local file will be dynamically created by deployment script 28 | } 29 | resource apiPolicy 'policies@2022-08-01' = { 30 | name: 'policy' 31 | properties: { 32 | format: 'rawxml' 33 | value: loadTextContent('policies/apiPolicy.xml') 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /infra/core/apim/apim.graphrag-docs-api.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | param apiManagementName string 5 | param backendUrl string 6 | 7 | resource api_docs 'Microsoft.ApiManagement/service/apis@2023-09-01-preview' = { 8 | name: '${apiManagementName}/documentation' 9 | properties: { 10 | displayName: 'documentation' 11 | apiRevision: '1' 12 | subscriptionRequired: false 13 | serviceUrl: '${backendUrl}/manpage' 14 | path: 'manpage' 15 | protocols: ['https'] 16 | authenticationSettings: { 17 | oAuth2AuthenticationSettings: [] 18 | openidAuthenticationSettings: [] 19 | } 20 | subscriptionKeyParameterNames: { 21 | header: 'Ocp-Apim-Subscription-Key' 22 | query: 'subscription-key' 23 | } 24 | isCurrent: true 25 | } 26 | 27 | resource documentation_docs 'operations@2023-09-01-preview' = { 28 | name: 'docs' 29 | properties: { 30 | displayName: 'docs' 31 | method: 'GET' 32 | urlTemplate: '/docs' 33 | templateParameters: [] 34 | responses: [] 35 | } 36 | } 37 | 38 | resource documentation_openapi 'operations@2023-09-01-preview' = { 39 | name: 'openapi' 40 | properties: { 41 | displayName: 'openapi' 42 | method: 'GET' 43 | urlTemplate: '/openapi.json' 44 | templateParameters: [] 45 | responses: [] 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /infra/core/apim/openapi.json: -------------------------------------------------------------------------------- 1 | { 2 | "comment": "This file is a placeholder for an OpenAPI specification. It will get replaced with an actual OpenAPI spec during the deployment process." 3 | } -------------------------------------------------------------------------------- /infra/core/apim/policies/apiPolicy.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | * 7 | 8 | 9 | * 10 | 11 | 12 |
*
13 |
14 | 15 |
*
16 |
17 |
18 |
19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
-------------------------------------------------------------------------------- /infra/core/cosmosdb/cosmosdb.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | @description('The name of the CosmosDB resource.') 5 | param cosmosDbName string 6 | 7 | @description('The location of the CosmosDB resource.') 8 | param location string = resourceGroup().location 9 | 10 | @allowed(['Enabled', 'Disabled']) 11 | param publicNetworkAccess string = 'Disabled' 12 | 13 | var maxThroughput = 1000 14 | 15 | resource cosmosDb 'Microsoft.DocumentDB/databaseAccounts@2024-11-15' = { 16 | name: cosmosDbName 17 | location: location 18 | tags: { 19 | defaultExperience: 'Core (SQL)' 20 | 'hidden-cosmos-mmspecial': '' 21 | } 22 | kind: 'GlobalDocumentDB' 23 | identity: { 24 | type: 'SystemAssigned' 25 | } 26 | properties: { 27 | publicNetworkAccess: publicNetworkAccess 28 | enableAutomaticFailover: false 29 | enableMultipleWriteLocations: false 30 | isVirtualNetworkFilterEnabled: false 31 | virtualNetworkRules: [] 32 | disableKeyBasedMetadataWriteAccess: false 33 | enableFreeTier: false 34 | enableAnalyticalStorage: false 35 | analyticalStorageConfiguration: { 36 | schemaType: 'WellDefined' 37 | } 38 | databaseAccountOfferType: 'Standard' 39 | defaultIdentity: 'FirstPartyIdentity' 40 | networkAclBypass: 'None' 41 | disableLocalAuth: true 42 | enablePartitionMerge: false 43 | minimalTlsVersion: 'Tls12' 44 | consistencyPolicy: { 45 | defaultConsistencyLevel: 'Session' 46 | maxIntervalInSeconds: 5 47 | maxStalenessPrefix: 100 48 | } 49 | locations: [ 50 | { 51 | locationName: location 52 | failoverPriority: 0 53 | isZoneRedundant: false 54 | } 55 | ] 56 | cors: [] 57 | capabilities: [] 58 | ipRules: [] 59 | backupPolicy: { 60 | type: 'Periodic' 61 | periodicModeProperties: { 62 | backupIntervalInMinutes: 240 63 | backupRetentionIntervalInHours: 8 64 | 
backupStorageRedundancy: 'Geo' 65 | } 66 | } 67 | networkAclBypassResourceIds: [] 68 | capacity: { 69 | totalThroughputLimit: maxThroughput 70 | } 71 | } 72 | } 73 | 74 | // create a single database that is used to maintain state information for graphrag indexing 75 | // NOTE: The current CosmosDB role assignments are not sufficient to allow the aks workload identity to create databases and containers so we must do it in bicep at deployment time. 76 | // TODO: Identify and assign appropriate RBAC roles that allow the workload identity to create new databases and containers instead of relying on this bicep implementation. 77 | resource graphragDatabase 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases@2024-11-15' = { 78 | parent: cosmosDb 79 | name: 'graphrag' 80 | properties: { 81 | options: { 82 | autoscaleSettings: { 83 | maxThroughput: maxThroughput 84 | } 85 | } 86 | resource: { 87 | id: 'graphrag' 88 | } 89 | } 90 | } 91 | 92 | resource jobsContainer 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers@2024-11-15' = { 93 | parent: graphragDatabase 94 | name: 'jobs' 95 | properties: { 96 | resource: { 97 | id: 'jobs' 98 | indexingPolicy: { 99 | indexingMode: 'consistent' 100 | automatic: true 101 | includedPaths: [ 102 | { 103 | path: '/*' 104 | } 105 | ] 106 | excludedPaths: [ 107 | { 108 | path: '/"_etag"/?' 
109 | } 110 | ] 111 | } 112 | partitionKey: { 113 | paths: [ 114 | '/id' 115 | ] 116 | kind: 'Hash' 117 | version: 2 118 | } 119 | uniqueKeyPolicy: { 120 | uniqueKeys: [] 121 | } 122 | conflictResolutionPolicy: { 123 | mode: 'LastWriterWins' 124 | conflictResolutionPath: '/_ts' 125 | } 126 | } 127 | } 128 | } 129 | 130 | resource containerStoreContainer 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers@2024-11-15' = { 131 | parent: graphragDatabase 132 | name: 'container-store' 133 | properties: { 134 | resource: { 135 | id: 'container-store' 136 | indexingPolicy: { 137 | indexingMode: 'consistent' 138 | automatic: true 139 | includedPaths: [ 140 | { 141 | path: '/*' 142 | } 143 | ] 144 | excludedPaths: [ 145 | { 146 | path: '/"_etag"/?' 147 | } 148 | ] 149 | } 150 | partitionKey: { 151 | paths: [ 152 | '/id' 153 | ] 154 | kind: 'Hash' 155 | version: 2 156 | } 157 | uniqueKeyPolicy: { 158 | uniqueKeys: [] 159 | } 160 | conflictResolutionPolicy: { 161 | mode: 'LastWriterWins' 162 | conflictResolutionPath: '/_ts' 163 | } 164 | } 165 | } 166 | } 167 | 168 | output name string = cosmosDb.name 169 | output id string = cosmosDb.id 170 | output endpoint string = cosmosDb.properties.documentEndpoint 171 | -------------------------------------------------------------------------------- /infra/core/identity/identity.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | @description('The name of the identity') 5 | param name string 6 | 7 | @description('The location of the identity') 8 | param location string = resourceGroup().location 9 | 10 | @description('federated name: FederatedIdentityCredentialProperties. 
See https://learn.microsoft.com/en-us/azure/templates/microsoft.managedidentity/userassignedidentities/federatedidentitycredentials?pivots=deployment-language-bicep#federatedidentitycredentialproperties') 11 | param federatedCredentials object = {} 12 | 13 | resource identity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' = { 14 | name: name 15 | location: location 16 | } 17 | 18 | resource federatedCredentialResources 'Microsoft.ManagedIdentity/userAssignedIdentities/federatedIdentityCredentials@2023-01-31' = [ 19 | for federatedCredential in items(federatedCredentials): { 20 | name: federatedCredential.key 21 | parent: identity 22 | properties: federatedCredential.value 23 | } 24 | ] 25 | 26 | output name string = identity.name 27 | output id string = identity.id 28 | output clientId string = identity.properties.clientId 29 | output principalId string = identity.properties.principalId 30 | -------------------------------------------------------------------------------- /infra/core/log-analytics/log.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | @description('The name of the Log Analytics resource.') 5 | param name string 6 | 7 | @description('The location of the Log Analytics resource.') 8 | param location string = resourceGroup().location 9 | 10 | @description('The public network access for ingestion.') 11 | param publicNetworkAccessForIngestion string = 'Disabled' 12 | 13 | resource logAnalyticsWorkspace 'Microsoft.OperationalInsights/workspaces@2022-10-01' = { 14 | name: name 15 | location: location 16 | properties: { 17 | retentionInDays: 30 18 | publicNetworkAccessForIngestion: publicNetworkAccessForIngestion 19 | publicNetworkAccessForQuery: 'Enabled' 20 | features: { 21 | immediatePurgeDataOn30Days: true 22 | } 23 | } 24 | } 25 | 26 | output name string = logAnalyticsWorkspace.name 27 | output id string = logAnalyticsWorkspace.id 28 | -------------------------------------------------------------------------------- /infra/core/monitor/app-insights.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | @description('Application Insights resource name') 5 | param appInsightsName string = 'appi' 6 | 7 | @description('Azure region where the resources will be deployed') 8 | param location string = resourceGroup().location 9 | 10 | @description('Application Insights public network access for ingestion') 11 | param appInsightsPublicNetworkAccessForIngestion string = 'Disabled' 12 | 13 | @description('Workspace id of a Log Analytics resource.') 14 | param logAnalyticsWorkspaceId string 15 | 16 | resource appInsights 'Microsoft.Insights/components@2020-02-02' = { 17 | name: appInsightsName 18 | location: location 19 | kind: 'web' 20 | properties: { 21 | Application_Type: 'web' 22 | WorkspaceResourceId: logAnalyticsWorkspaceId 23 | publicNetworkAccessForIngestion: appInsightsPublicNetworkAccessForIngestion 24 | publicNetworkAccessForQuery: 'Enabled' 25 | } 26 | } 27 | 28 | output name string = appInsights.name 29 | output id string = appInsights.id 30 | output connectionString string = appInsights.properties.ConnectionString 31 | output instrumentationKey string = appInsights.properties.InstrumentationKey 32 | -------------------------------------------------------------------------------- /infra/core/monitor/private-link-scope.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | param privateLinkScopeName string 5 | param privateLinkScopedResources array = [] 6 | param queryAccessMode string = 'Open' 7 | param ingestionAccessMode string = 'PrivateOnly' 8 | 9 | resource privateLinkScope 'Microsoft.Insights/privateLinkScopes@2021-07-01-preview' = { 10 | name: privateLinkScopeName 11 | location: 'global' 12 | properties: { 13 | accessModeSettings: { 14 | queryAccessMode: queryAccessMode 15 | ingestionAccessMode: ingestionAccessMode 16 | } 17 | } 18 | } 19 | 20 | resource scopedResources 'Microsoft.Insights/privateLinkScopes/scopedResources@2021-07-01-preview' = [ 21 | for id in privateLinkScopedResources: { 22 | name: uniqueString(id) 23 | parent: privateLinkScope 24 | properties: { 25 | linkedResourceId: id 26 | } 27 | } 28 | ] 29 | 30 | output name string = privateLinkScope.name 31 | output id string = privateLinkScope.id 32 | -------------------------------------------------------------------------------- /infra/core/rbac/aks-rbac.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | @description('Array of objects with fields principalId, principalType, roleDefinitionId') 5 | param roleAssignments array = [] 6 | 7 | resource roleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = [ 8 | for role in roleAssignments: { 9 | // note: the guid must be globally unique and deterministic (reproducible) across Azure 10 | name: guid( 11 | subscription().subscriptionId, 12 | resourceGroup().name, 13 | role.principalId, 14 | role.principalType, 15 | role.roleDefinitionId 16 | ) 17 | scope: resourceGroup() 18 | properties: role 19 | } 20 | ] 21 | -------------------------------------------------------------------------------- /infra/core/rbac/aoai-rbac.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 
2 | // Licensed under the MIT License. 3 | // This generic Bicep module can be used to assign RBAC roles to an Azure OpenAI resource at any defined scope 4 | 5 | param name string 6 | 7 | @description('Array of objects with fields principalId, principalType, roleDefinitionId') 8 | param roleAssignments array = [] 9 | 10 | resource aoai 'Microsoft.CognitiveServices/accounts@2024-10-01' existing = { 11 | name: name 12 | } 13 | 14 | resource roleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = [ 15 | for role in roleAssignments: { 16 | // note: the guid must be globally unique and deterministic (reproducible) across Azure 17 | name: guid(aoai.id, role.principalId, role.principalType, role.roleDefinitionId) 18 | scope: aoai 19 | properties: { 20 | principalId: role.principalId 21 | roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', role.roleDefinitionId) 22 | principalType: role.principalType 23 | } 24 | } 25 | ] 26 | 27 | // output the name, id, and endpoint of the Azure OpenAI resource 28 | output name string = aoai.name 29 | output id string = aoai.id 30 | output endpoint string = aoai.properties.endpoint 31 | -------------------------------------------------------------------------------- /infra/core/storage/storage.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | @description('The name of the Storage Account resource.') 5 | param name string 6 | 7 | @description('The location of the Storage Account resource.') 8 | param location string = resourceGroup().location 9 | 10 | @allowed(['Hot', 'Cool', 'Premium']) 11 | param accessTier string = 'Hot' 12 | 13 | @allowed(['AzureDnsZone', 'Standard']) 14 | param dnsEndpointType string = 'Standard' 15 | 16 | @allowed(['Enabled', 'Disabled']) 17 | param publicNetworkAccess string = 'Disabled' 18 | 19 | param tags object = {} 20 | param allowBlobPublicAccess bool = false 21 | param allowCrossTenantReplication bool = true 22 | param allowSharedKeyAccess bool = false 23 | param defaultToOAuthAuthentication bool = false 24 | param deleteRetentionPolicy object = {} 25 | param kind string = 'StorageV2' 26 | param minimumTlsVersion string = 'TLS1_2' 27 | param containers array = [] 28 | 29 | resource storage 'Microsoft.Storage/storageAccounts@2023-01-01' = { 30 | name: name 31 | location: location 32 | tags: tags 33 | kind: kind 34 | sku: { name: 'Standard_LRS' } 35 | properties: { 36 | accessTier: accessTier 37 | allowBlobPublicAccess: allowBlobPublicAccess 38 | allowCrossTenantReplication: allowCrossTenantReplication 39 | allowSharedKeyAccess: allowSharedKeyAccess 40 | defaultToOAuthAuthentication: defaultToOAuthAuthentication 41 | dnsEndpointType: dnsEndpointType 42 | isHnsEnabled: true 43 | minimumTlsVersion: minimumTlsVersion 44 | networkAcls: { 45 | bypass: 'AzureServices' 46 | defaultAction: 'Allow' 47 | } 48 | publicNetworkAccess: publicNetworkAccess 49 | } 50 | 51 | resource blobServices 'blobServices' = if (!empty(containers)) { 52 | name: 'default' 53 | properties: { 54 | deleteRetentionPolicy: deleteRetentionPolicy 55 | } 56 | resource container 'containers' = [ 57 | for container in containers: { 58 | name: container.name 59 | properties: { 60 | publicAccess: container.?publicAccess ?? 
'None' 61 | } 62 | } 63 | ] 64 | } 65 | } 66 | 67 | output name string = storage.name 68 | output id string = storage.id 69 | output primaryEndpoints object = storage.properties.primaryEndpoints 70 | -------------------------------------------------------------------------------- /infra/core/vnet/private-dns-vnet-link.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | param vnetId string 5 | param privateDnsZoneName string 6 | var vnet_id_hash = uniqueString(vnetId) 7 | 8 | resource dnsZone 'Microsoft.Network/privateDnsZones@2024-06-01' = { 9 | name: privateDnsZoneName 10 | location: 'global' 11 | properties: {} 12 | } 13 | 14 | resource dnsZoneLinks 'Microsoft.Network/privateDnsZones/virtualNetworkLinks@2024-06-01' = { 15 | name: 'vnet-link-${privateDnsZoneName}-${vnet_id_hash}' 16 | location: 'global' 17 | parent: dnsZone 18 | properties: { 19 | registrationEnabled: false 20 | virtualNetwork: { 21 | id: vnetId 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /infra/core/vnet/private-dns-zone-a-record.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | @description('DNS name') 5 | param name string 6 | 7 | @description('DNS zone name to create the record in') 8 | param dnsZoneName string 9 | 10 | @description('TTL in seconds') 11 | param ttl int = 900 12 | 13 | @description('The IP address') 14 | param ipv4Address string 15 | 16 | resource dnsZone 'Microsoft.Network/privateDnsZones@2024-06-01' existing = { 17 | name: dnsZoneName 18 | } 19 | 20 | resource aRecord 'Microsoft.Network/privateDnsZones/A@2024-06-01' = { 21 | name: name 22 | parent: dnsZone 23 | properties: { 24 | ttl: ttl 25 | aRecords: [ 26 | { 27 | ipv4Address: ipv4Address 28 | } 29 | ] 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /infra/core/vnet/private-dns-zone-groups.json: -------------------------------------------------------------------------------- 1 | { 2 | "azureCloud": { 3 | "aiSearch": "privatelink.search.windows.net", 4 | "azureMonitor": [ 5 | "privatelink.monitor.azure.com", 6 | "privatelink.oms.opinsights.azure.com", 7 | "privatelink.agentsvc.azure-automation.net", 8 | "privatelink.ods.opinsights.azure.com" 9 | ], 10 | "blobStorage": "privatelink.blob.core.windows.net", 11 | "cosmosDB": "privatelink.documents.azure.com" 12 | }, 13 | "azureusgovernment": { 14 | "aiSearch": "privatelink.search.azure.us", 15 | "azureMonitor": [ 16 | "privatelink.monitor.azure.us", 17 | "privatelink.oms.opinsights.azure.us", 18 | "privatelink.agentsvc.azure-automation.us", 19 | "privatelink.ods.opinsights.azure.us" 20 | ], 21 | "blobStorage": "privatelink.blob.core.usgovcloudapi.net", 22 | "cosmosDB": "privatelink.documents.azure.us" 23 | } 24 | } -------------------------------------------------------------------------------- /infra/core/vnet/private-dns-zone.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
 3 | 4 | @description('The name of the private DNS zone.') 5 | param name string 6 | 7 | @description('The name of the existing virtual network the DNS zone should be linked to.') 8 | param vnetName string 9 | 10 | resource vnet 'Microsoft.Network/virtualNetworks@2024-05-01' existing = { 11 | name: vnetName 12 | } 13 | 14 | resource dnsZone 'Microsoft.Network/privateDnsZones@2024-06-01' = { 15 | name: name 16 | location: 'global' 17 | properties: {} 18 | } 19 | 20 | resource dnsZoneLinks 'Microsoft.Network/privateDnsZones/virtualNetworkLinks@2024-06-01' = { 21 | name: vnetName 22 | location: 'global' 23 | parent: dnsZone 24 | properties: { 25 | registrationEnabled: false 26 | virtualNetwork: { 27 | id: vnet.id 28 | } 29 | } 30 | } 31 | 32 | output name string = dnsZone.name 33 | output id string = dnsZone.id 34 | -------------------------------------------------------------------------------- /infra/core/vnet/private-endpoint.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
 3 | 4 | @description('Resource ID of the service the private endpoint is for') 5 | param privateLinkServiceId string 6 | 7 | @description('The resource ID of the subnet to deploy the private endpoint to') 8 | param subnetId string 9 | 10 | @description('Array of private DNS zone configs (each with a name and properties.privateDnsZoneId) to associate with the private endpoint') 11 | param privateDnsZoneConfigs array 12 | 13 | param privateEndpointName string 14 | param groupId string 15 | param location string = resourceGroup().location 16 | 17 | resource privateEndpoint 'Microsoft.Network/privateEndpoints@2024-05-01' = { 18 | name: privateEndpointName 19 | location: location 20 | properties: { 21 | privateLinkServiceConnections: [ 22 | { 23 | name: privateEndpointName 24 | properties: { 25 | privateLinkServiceId: privateLinkServiceId 26 | groupIds: [groupId] 27 | } 28 | } 29 | ] 30 | subnet: { 31 | id: subnetId 32 | } 33 | } 34 | } 35 | 36 | resource privateDnsZoneGroup 'Microsoft.Network/privateEndpoints/privateDnsZoneGroups@2024-05-01' = { 37 | name: groupId 38 | parent: privateEndpoint 39 | properties: { 40 | privateDnsZoneConfigs: privateDnsZoneConfigs 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /infra/core/vnet/privatelink-private-dns-zones.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 
3 | 4 | @description('The virtual network ID to link to') 5 | param linkedVnetId string 6 | 7 | var privateDnsZoneData = loadJsonContent('private-dns-zone-groups.json') // for more information: https://learn.microsoft.com/en-us/azure/azure-government/compare-azure-government-global-azure 8 | var cloudName = toLower(environment().name) 9 | 10 | var aiSearchPrivateDnsZoneName = privateDnsZoneData[cloudName].aiSearch 11 | var blobStoragePrivateDnsZoneName = privateDnsZoneData[cloudName].blobStorage 12 | var cosmosDbPrivateDnsZoneName = privateDnsZoneData[cloudName].cosmosDb 13 | var storagePrivateDnsZoneNames = [blobStoragePrivateDnsZoneName] 14 | var azureMonitorPrivateDnsZones = privateDnsZoneData[cloudName].azureMonitor 15 | 16 | var privateDnsZones = union( 17 | azureMonitorPrivateDnsZones, 18 | storagePrivateDnsZoneNames, 19 | [cosmosDbPrivateDnsZoneName], 20 | [aiSearchPrivateDnsZoneName] 21 | ) 22 | 23 | resource privateDnsZoneResources 'Microsoft.Network/privateDnsZones@2024-06-01' = [ 24 | for name in privateDnsZones: { 25 | name: name 26 | location: 'global' 27 | } 28 | ] 29 | 30 | module dnsVnetLinks 'vnet-dns-link.bicep' = [ 31 | for (privateDnsZoneName, index) in privateDnsZones: { 32 | name: replace(privateDnsZoneName, '.', '-') 33 | params: { 34 | privateDnsZoneName: privateDnsZoneResources[index].name 35 | vnetId: linkedVnetId 36 | } 37 | } 38 | ] 39 | 40 | output azureMonitorPrivateDnsZoneConfigs array = [ 41 | for zoneName in union(azureMonitorPrivateDnsZones, [blobStoragePrivateDnsZoneName]): { 42 | name: privateDnsZoneResources[indexOf(privateDnsZones, zoneName)].name 43 | properties: { 44 | #disable-next-line use-resource-id-functions 45 | privateDnsZoneId: privateDnsZoneResources[indexOf(privateDnsZones, zoneName)].id 46 | } 47 | } 48 | ] 49 | 50 | output blobStoragePrivateDnsZoneConfigs array = [ 51 | { 52 | name: blobStoragePrivateDnsZoneName 53 | properties: { 54 | #disable-next-line use-resource-id-functions 55 | privateDnsZoneId: 
privateDnsZoneResources[indexOf(privateDnsZones, blobStoragePrivateDnsZoneName)].id 56 | } 57 | } 58 | ] 59 | 60 | output cosmosDbPrivateDnsZoneConfigs array = [ 61 | { 62 | name: privateDnsZoneResources[indexOf(privateDnsZones, cosmosDbPrivateDnsZoneName)].name 63 | properties: { 64 | #disable-next-line use-resource-id-functions 65 | privateDnsZoneId: privateDnsZoneResources[indexOf(privateDnsZones, cosmosDbPrivateDnsZoneName)].id 66 | } 67 | } 68 | ] 69 | 70 | output aiSearchPrivateDnsZoneConfigs array = [ 71 | { 72 | name: privateDnsZoneResources[indexOf(privateDnsZones, aiSearchPrivateDnsZoneName)].name 73 | properties: { 74 | #disable-next-line use-resource-id-functions 75 | privateDnsZoneId: privateDnsZoneResources[indexOf(privateDnsZones, aiSearchPrivateDnsZoneName)].id 76 | } 77 | } 78 | ] 79 | 80 | output privateDnsZones array = privateDnsZones 81 | -------------------------------------------------------------------------------- /infra/core/vnet/vnet-dns-link.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT License. 3 | 4 | param privateDnsZoneName string 5 | param vnetId string 6 | 7 | resource privateDnsZone 'Microsoft.Network/privateDnsZones@2024-06-01' existing = { 8 | name: privateDnsZoneName 9 | } 10 | 11 | resource dnsVnetLinks 'Microsoft.Network/privateDnsZones/virtualNetworkLinks@2024-06-01' = { 12 | name: '${replace(privateDnsZoneName, '.', '-')}-${uniqueString(vnetId)}' 13 | parent: privateDnsZone 14 | location: 'global' 15 | properties: { 16 | registrationEnabled: false 17 | resolutionPolicy: 'Default' 18 | virtualNetwork: { 19 | id: vnetId 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /infra/core/vnet/vnet.bicep: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 
2 | // Licensed under the MIT License. 3 | 4 | @description('Name of the vnet resource.') 5 | param vnetName string 6 | 7 | @description('Azure region where the resource will be deployed.') 8 | param location string = resourceGroup().location 9 | 10 | @description('Optional prefix to prepend to subnet names.') 11 | param subnetPrefix string = 'snet-' 12 | 13 | @description('APIM tier - used to determine if subnet delegations are required.') 14 | @allowed(['Developer', 'StandardV2']) 15 | param apimTier string 16 | 17 | @description('NSG resource ID.') 18 | param nsgID string 19 | 20 | resource vnet 'Microsoft.Network/virtualNetworks@2024-05-01' = { 21 | name: vnetName 22 | location: location 23 | properties: { 24 | addressSpace: { 25 | addressPrefixes: [ 26 | '10.1.0.0/16' 27 | ] 28 | } 29 | subnets: [ 30 | { 31 | name: '${subnetPrefix}apim' 32 | properties: { 33 | addressPrefix: '10.1.0.0/24' 34 | networkSecurityGroup: { 35 | id: nsgID 36 | } 37 | delegations: (apimTier == 'Developer') 38 | ? 
 [] 39 | : [ 40 | { 41 | name: 'Microsoft.Web/serverFarms' 42 | properties: { 43 | serviceName: 'Microsoft.Web/serverFarms' 44 | } 45 | } 46 | ] 47 | } 48 | } 49 | { 50 | name: '${subnetPrefix}aks' 51 | properties: { 52 | addressPrefix: '10.1.1.0/24' 53 | serviceEndpoints: [ 54 | { 55 | service: 'Microsoft.Storage' 56 | } 57 | { 58 | service: 'Microsoft.Sql' 59 | } 60 | { 61 | service: 'Microsoft.EventHub' 62 | } 63 | ] 64 | } 65 | } 66 | ] 67 | } 68 | } 69 | 70 | output name string = vnet.name 71 | output id string = vnet.id 72 | output apimSubnetId string = vnet.properties.subnets[0].id 73 | output aksSubnetId string = vnet.properties.subnets[1].id 74 | -------------------------------------------------------------------------------- /infra/deploy.parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "LOCATION": "__LOCATION__", 3 | "RESOURCE_GROUP": "__RESOURCE_GROUP__" 4 | } -------------------------------------------------------------------------------- /infra/helm/README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | This Helm chart installs GraphRAG into a Kubernetes cluster. 4 | 5 | ## Developer Notes 6 | When updating the Helm chart, you can validate your changes locally by rendering the templates with `helm template`, for example: 7 | 8 | ```shell 9 | helm template test ./graphrag \ 10 | --namespace graphrag \ 11 | --set "master.image.repository=registry.azurecr.io/graphrag" \ 12 | --set "master.image.tag=latest" 13 | ``` 14 | -------------------------------------------------------------------------------- /infra/helm/graphrag/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /infra/helm/graphrag/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT License. 3 | 4 | apiVersion: v2 5 | name: graphrag 6 | description: GraphRAG - a graph-based RAG search engine 7 | type: application 8 | version: 0.0.1 9 | 10 | # This is the version number of the application being deployed. This version number should be 11 | # incremented each time you make changes to the application. For graphrag, we define the appVersion to match 12 | # the version of the graphrag library being used. 13 | appVersion: "1.2.0" 14 | -------------------------------------------------------------------------------- /infra/helm/graphrag/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Microsoft Corporation. 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /infra/helm/graphrag/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Thank you for installing {{ .Chart.Name }}. 2 | 3 | Your release is named {{ .Release.Name }}. 4 | 5 | To learn more about the release, try: 6 | 7 | $ helm status {{ .Release.Name }} 8 | $ helm get all {{ .Release.Name }} -------------------------------------------------------------------------------- /infra/helm/graphrag/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "graphrag.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 
12 | */}} 13 | {{- define "graphrag.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create a graphrag-master fully qualified app name. 28 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 29 | If release name contains chart name it will be used as a full name. 30 | */}} 31 | {{- define "graphrag.master.fullname" -}} 32 | {{- if .Values.master.fullnameOverride }} 33 | {{- .Values.master.fullnameOverride | trunc 63 | trimSuffix "-" }} 34 | {{- else }} 35 | {{- $name := default .Chart.Name .Values.nameOverride }} 36 | {{- if contains $name .Release.Name }} 37 | {{- printf "%s-%s" .Release.Name .Values.master.name | trunc 63 | trimSuffix "-" }} 38 | {{- else }} 39 | {{- printf "%s-%s-%s" .Release.Name $name .Values.master.name | trunc 63 | trimSuffix "-" }} 40 | {{- end }} 41 | {{- end }} 42 | {{- end }} 43 | 44 | {{/* 45 | Create chart name and version as used by the chart label. 46 | */}} 47 | {{- define "graphrag.chart" -}} 48 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 49 | {{- end }} 50 | 51 | {{/* 52 | Common labels 53 | */}} 54 | {{- define "graphrag.common.labels" -}} 55 | azure.workload.identity/use: "true" 56 | helm.sh/chart: {{ include "graphrag.chart" . }} 57 | {{ include "graphrag.common.selectorLabels" . 
}} 58 | {{- if .Chart.AppVersion }} 59 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 60 | {{- end }} 61 | app.kubernetes.io/managed-by: {{ .Release.Service }} 62 | {{- end }} 63 | 64 | {{- define "graphrag.labels" -}} 65 | {{ include "graphrag.common.labels" . }} 66 | {{- end }} 67 | 68 | {{/* 69 | Selector labels 70 | */}} 71 | {{- define "graphrag.common.selectorLabels" -}} 72 | app.kubernetes.io/name: {{ include "graphrag.name" . }} 73 | app.kubernetes.io/instance: {{ .Release.Name }} 74 | {{- end }} 75 | 76 | {{- define "graphrag.master.labels" -}} 77 | {{ include "graphrag.common.labels" . }} 78 | {{ include "graphrag.master.selectorLabels" . }} 79 | {{- end -}} 80 | 81 | {{- define "graphrag.master.selectorLabels" -}} 82 | {{ include "graphrag.common.selectorLabels" . }} 83 | component: {{ .Values.master.name | quote }} 84 | {{- end -}} 85 | 86 | {{/* 87 | Create the name of the service account to use 88 | */}} 89 | {{- define "graphrag.serviceAccountName" -}} 90 | {{- if .Values.serviceAccount.create }} 91 | {{- default (include "graphrag.fullname" .) .Values.serviceAccount.name }} 92 | {{- else }} 93 | {{- default "default" .Values.serviceAccount.name }} 94 | {{- end }} 95 | {{- end }} 96 | -------------------------------------------------------------------------------- /infra/helm/graphrag/templates/graphrag-clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "graphrag.fullname" . }} 5 | labels: 6 | {{- include "graphrag.labels" . 
| nindent 4 }} 7 | rules: 8 | - apiGroups: [""] 9 | resources: ["pods"] 10 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 11 | - apiGroups: ["batch", "extensions"] 12 | resources: ["*"] 13 | verbs: ["*"] 14 | -------------------------------------------------------------------------------- /infra/helm/graphrag/templates/graphrag-configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | labels: 5 | {{- include "graphrag.labels" . | nindent 4 }} 6 | name: {{ include "graphrag.fullname" . }} 7 | data: 8 | {{- toYaml .Values.graphragConfig | nindent 2 }} 9 | AKS_NAMESPACE: {{ .Release.Namespace }} 10 | -------------------------------------------------------------------------------- /infra/helm/graphrag/templates/graphrag-ingress.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.ingress.enabled -}} 2 | {{- $fullName := include "graphrag.fullname" . -}} 3 | {{- $masterFullName := include "graphrag.master.fullname" . -}} 4 | {{- $masterSvcPort := .Values.master.service.port -}} 5 | {{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} 6 | apiVersion: networking.k8s.io/v1 7 | {{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} 8 | apiVersion: networking.k8s.io/v1beta1 9 | {{- else -}} 10 | apiVersion: extensions/v1beta1 11 | {{- end }} 12 | kind: Ingress 13 | metadata: 14 | name: {{ $fullName }} 15 | namespace: {{ .Release.Namespace }} 16 | labels: 17 | {{- include "graphrag.labels" . | nindent 4 }} 18 | {{- with .Values.ingress.annotations }} 19 | annotations: 20 | {{- toYaml . 
| nindent 4 }} 21 | {{- end }} 22 | spec: 23 | {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} 24 | ingressClassName: {{ .Values.ingress.className }} 25 | {{- end }} 26 | {{- if .Values.ingress.tls }} 27 | tls: 28 | {{- range .Values.ingress.tls }} 29 | - hosts: 30 | {{- range .hosts }} 31 | - {{ . | quote }} 32 | {{- end }} 33 | secretName: {{ .secretName }} 34 | {{- end }} 35 | {{- end }} 36 | rules: 37 | - host: {{ .Values.ingress.host | quote }} 38 | http: 39 | paths: 40 | - path: "/" 41 | pathType: "Prefix" 42 | backend: 43 | {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} 44 | service: 45 | name: {{ $masterFullName }} 46 | port: 47 | number: {{ $masterSvcPort }} 48 | {{- else }} 49 | serviceName: {{ $masterFullName }} 50 | servicePort: {{ $masterSvcPort }} 51 | {{- end }} 52 | {{- end }} 53 | -------------------------------------------------------------------------------- /infra/helm/graphrag/templates/graphrag-master-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ include "graphrag.master.fullname" . }} 5 | labels: 6 | {{- include "graphrag.master.labels" . | nindent 4 }} 7 | spec: 8 | {{- if not .Values.master.autoscaling.enabled }} 9 | replicas: {{ .Values.master.replicaCount }} 10 | {{- end }} 11 | selector: 12 | matchLabels: 13 | {{- include "graphrag.master.selectorLabels" . | nindent 6 }} 14 | template: 15 | metadata: 16 | {{- with .Values.master.podAnnotations }} 17 | annotations: 18 | {{- toYaml . | nindent 8 }} 19 | {{- end }} 20 | labels: 21 | date: "{{ now | unixEpoch }}" 22 | {{- include "graphrag.master.labels" . | nindent 8 }} 23 | {{- with .Values.master.podLabels }} 24 | {{- toYaml . | nindent 8 }} 25 | {{- end }} 26 | spec: 27 | serviceAccountName: {{ include "graphrag.serviceAccountName" . 
}} 28 | securityContext: 29 | {{- toYaml .Values.master.podSecurityContext | nindent 8 }} 30 | {{- with .Values.master.imagePullSecrets }} 31 | imagePullSecrets: 32 | {{- toYaml . | nindent 8 }} 33 | {{- end }} 34 | containers: 35 | - name: {{ .Values.master.name }} 36 | securityContext: 37 | {{- toYaml .Values.master.securityContext | nindent 12 }} 38 | image: "{{ .Values.master.image.repository }}:{{ .Values.master.image.tag | default .Chart.AppVersion }}" 39 | imagePullPolicy: {{ .Values.master.image.pullPolicy }} 40 | envFrom: 41 | - configMapRef: 42 | name: {{ include "graphrag.fullname" . }} 43 | ports: 44 | - name: http 45 | containerPort: {{ .Values.master.service.port }} 46 | protocol: TCP 47 | livenessProbe: 48 | {{- toYaml .Values.master.livenessProbe | nindent 12 }} 49 | readinessProbe: 50 | {{- toYaml .Values.master.readinessProbe | nindent 12 }} 51 | resources: 52 | {{- toYaml .Values.master.resources | nindent 12 }} 53 | volumeMounts: 54 | {{- with .Values.master.volumeMounts }} 55 | {{- toYaml . | nindent 12 }} 56 | {{- end }} 57 | {{- with .Values.master.volumes }} 58 | {{- toYaml . | nindent 8 }} 59 | {{- end }} 60 | {{- with .Values.master.nodeSelector }} 61 | nodeSelector: 62 | {{- toYaml . | nindent 8 }} 63 | {{- end }} 64 | {{- with .Values.master.affinity }} 65 | affinity: 66 | {{- toYaml . | nindent 8 }} 67 | {{- end }} 68 | {{- with .Values.master.tolerations }} 69 | tolerations: 70 | {{- toYaml . | nindent 8 }} 71 | {{- end }} 72 | -------------------------------------------------------------------------------- /infra/helm/graphrag/templates/graphrag-master-hpa.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.master.autoscaling.enabled }} 2 | apiVersion: autoscaling/v2 3 | kind: HorizontalPodAutoscaler 4 | metadata: 5 | name: {{ include "graphrag.master.fullname" . }} 6 | labels: 7 | {{- include "graphrag.master.labels" . 
| nindent 4 }} 8 | spec: 9 | scaleTargetRef: 10 | apiVersion: apps/v1 11 | kind: Deployment 12 | name: {{ include "graphrag.master.fullname" . }} 13 | minReplicas: {{ .Values.master.autoscaling.minReplicas }} 14 | maxReplicas: {{ .Values.master.autoscaling.maxReplicas }} 15 | metrics: 16 | {{- if .Values.master.autoscaling.targetCPUUtilizationPercentage }} 17 | - type: Resource 18 | resource: 19 | name: cpu 20 | target: 21 | type: Utilization 22 | averageUtilization: {{ .Values.master.autoscaling.targetCPUUtilizationPercentage }} 23 | {{- end }} 24 | {{- if .Values.master.autoscaling.targetMemoryUtilizationPercentage }} 25 | - type: Resource 26 | resource: 27 | name: memory 28 | target: 29 | type: Utilization 30 | averageUtilization: {{ .Values.master.autoscaling.targetMemoryUtilizationPercentage }} 31 | {{- end }} 32 | {{- end }} 33 | -------------------------------------------------------------------------------- /infra/helm/graphrag/templates/graphrag-master-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "graphrag.master.fullname" . }} 5 | {{- if .Values.master.service.annotations }} 6 | annotations: 7 | {{- range $key, $value := .Values.master.service.annotations }} 8 | {{ $key }}: {{ $value | quote }} 9 | {{- end }} 10 | {{- end }} 11 | labels: 12 | {{- include "graphrag.master.labels" . | nindent 4 }} 13 | spec: 14 | type: {{ .Values.master.service.type }} 15 | ports: 16 | - port: {{ .Values.master.service.port }} 17 | selector: 18 | {{- include "graphrag.master.selectorLabels" . 
| nindent 4 }} 19 | -------------------------------------------------------------------------------- /infra/helm/graphrag/templates/graphrag-nginx-internal-controller.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.ingress.createIngressClass -}} 2 | apiVersion: approuting.kubernetes.azure.com/v1alpha1 3 | kind: NginxIngressController 4 | metadata: 5 | name: {{ .Values.ingress.className }} 6 | spec: 7 | ingressClassName: {{ .Values.ingress.className }} 8 | controllerNamePrefix: {{ .Values.ingress.className }} 9 | {{- with .Values.ingress.loadBalancerAnnotations }} 10 | loadBalancerAnnotations: 11 | {{- toYaml . | nindent 4 }} 12 | {{- end }} 13 | {{- end }} 14 | -------------------------------------------------------------------------------- /infra/helm/graphrag/templates/graphrag-rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "graphrag.fullname" . }} 5 | labels: 6 | {{- include "graphrag.labels" . | nindent 4 }} 7 | subjects: 8 | - kind: ServiceAccount 9 | name: {{ include "graphrag.serviceAccountName" . }} 10 | namespace: {{ .Release.Namespace }} 11 | roleRef: 12 | kind: ClusterRole 13 | name: {{ include "graphrag.fullname" . }} 14 | apiGroup: rbac.authorization.k8s.io 15 | -------------------------------------------------------------------------------- /infra/helm/graphrag/templates/graphrag-serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "graphrag.serviceAccountName" . }} 6 | labels: 7 | {{- include "graphrag.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . 
| nindent 4 }} 11 | {{- end }} 12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }} 13 | {{- end }} 14 | -------------------------------------------------------------------------------- /infra/helm/graphrag/templates/tests/test-connection.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: "{{ include "graphrag.master.fullname" . }}-test-connection" 5 | labels: 6 | {{- include "graphrag.master.labels" . | nindent 4 }} 7 | annotations: 8 | "helm.sh/hook": test 9 | spec: 10 | containers: 11 | - name: wget 12 | image: busybox 13 | command: ['wget'] 14 | args: ['{{ include "graphrag.master.fullname" . }}:{{ .Values.master.service.port }}'] 15 | restartPolicy: Never 16 | -------------------------------------------------------------------------------- /infra/helm/graphrag/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for the graphrag helm chart. 2 | 3 | nameOverride: "" 4 | fullnameOverride: "" 5 | 6 | serviceAccount: 7 | # Specifies whether a service account should be created 8 | create: true 9 | # Automatically mount a ServiceAccount's API credentials? 10 | automount: true 11 | # Annotations to add to the service account 12 | annotations: 13 | azure.workload.identity/client-id: "" 14 | # Name of the service account to use. 
15 | # If not set and create is true, a name is generated using the fullname template 16 | name: "" 17 | 18 | ingress: 19 | enabled: true 20 | className: nginx-internal 21 | createIngressClass: true 22 | host: graphrag.graphrag.io 23 | tls: [] 24 | annotations: 25 | nginx.ingress.kubernetes.io/proxy-connect-timeout: "900" 26 | nginx.ingress.kubernetes.io/proxy-send-timeout: "900" 27 | nginx.ingress.kubernetes.io/proxy-read-timeout: "900" 28 | nginx.ingress.kubernetes.io/proxy-body-size: 500m 29 | loadBalancerAnnotations: 30 | service.beta.kubernetes.io/azure-load-balancer-internal: "true" 31 | 32 | graphragConfig: 33 | AI_SEARCH_AUDIENCE: "https://search.azure.com" 34 | AI_SEARCH_URL: "" 35 | APPLICATIONINSIGHTS_CONNECTION_STRING: "" 36 | # Must set hidden env variable to true to disable statsbeat. For more information: https://github.com/Azure/azure-sdk-for-python/issues/34804 37 | APPLICATIONINSIGHTS_STATSBEAT_DISABLED_ALL: "True" 38 | COSMOS_URI_ENDPOINT: "" 39 | GRAPHRAG_API_BASE: "" 40 | GRAPHRAG_API_VERSION: "" 41 | COGNITIVE_SERVICES_AUDIENCE: "https://cognitiveservices.azure.com/.default" 42 | GRAPHRAG_LLM_MODEL: "" 43 | GRAPHRAG_LLM_DEPLOYMENT_NAME: "" 44 | GRAPHRAG_EMBEDDING_MODEL: "" 45 | GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME: "" 46 | STORAGE_ACCOUNT_BLOB_URL: "" 47 | 48 | master: 49 | name: "master" 50 | replicaCount: 1 51 | image: 52 | repository: "" 53 | pullPolicy: Always 54 | # Override the image tag whose default is the chart appVersion. 
55 | tag: "" 56 | imagePullSecrets: [] 57 | podAnnotations: {} 58 | podLabels: {} 59 | podSecurityContext: 60 | {} 61 | # fsGroup: 2000 62 | 63 | securityContext: 64 | {} 65 | # capabilities: 66 | # drop: 67 | # - ALL 68 | # readOnlyRootFilesystem: true 69 | # runAsNonRoot: true 70 | # runAsUser: 1000 71 | 72 | service: 73 | annotations: {} 74 | type: ClusterIP 75 | port: 80 76 | 77 | resources: 78 | # We recommend not modifying the default resources below unless you know what you're doing 79 | # and have investigated graphrag's baseline spec requirements to ensure the application 80 | # can run properly. 81 | limits: 82 | cpu: 8 83 | memory: "16Gi" 84 | requests: 85 | cpu: 4 86 | memory: "10Gi" 87 | 88 | livenessProbe: 89 | httpGet: 90 | path: /manpage/docs 91 | port: http 92 | failureThreshold: 50 93 | initialDelaySeconds: 30 94 | periodSeconds: 20 95 | 96 | readinessProbe: 97 | httpGet: 98 | path: /manpage/docs 99 | port: http 100 | failureThreshold: 50 101 | initialDelaySeconds: 30 102 | periodSeconds: 20 103 | 104 | autoscaling: 105 | enabled: true 106 | minReplicas: 1 107 | maxReplicas: 20 108 | targetMemoryUtilizationPercentage: 50 109 | # targetCPUUtilizationPercentage: 50 110 | 111 | # Additional volumes on the output Deployment definition. 112 | volumes: [] 113 | # - name: foo 114 | # secret: 115 | # secretName: mysecret 116 | # optional: false 117 | 118 | # Additional volumeMounts on the output Deployment definition. 
119 | volumeMounts: [] 120 | # - name: foo 121 | # mountPath: "/etc/foo" 122 | # readOnly: true 123 | 124 | nodeSelector: {} 125 | 126 | tolerations: [] 127 | 128 | affinity: 129 | nodeAffinity: 130 | requiredDuringSchedulingIgnoredDuringExecution: 131 | nodeSelectorTerms: 132 | - matchExpressions: 133 | - key: workload 134 | operator: In 135 | values: 136 | - graphrag 137 | -------------------------------------------------------------------------------- /infra/managed-app/README.md: -------------------------------------------------------------------------------- 1 | # Managed App Instructions 2 | 3 | This guide walks through the process to convert the graphrag solution accelerator into a managed app. 4 | 5 | ### Prerequisites 6 | 1. Create an ACR 7 | 1. Push both the graphrag backend docker image and the graphrag helm chart to the registry. 8 | ```shell 9 | # push docker image 10 | az acr login --name .azurecr.io 11 | cd 12 | az acr build --registry azurecr.io -f docker/Dockerfile-backend --image graphrag:latest . 13 | # push helm chart 14 | cd /infra/helm 15 | helm package graphrag 16 | helm push graphrag-.tgz oci://.azurecr.io/helm 17 | ``` 18 | 1. A managed app [requires a storage account to deploy](https://learn.microsoft.com/en-us/azure/azure-resource-manager/managed-applications/publish-service-catalog-bring-your-own-storage?tabs=azure-powershell) an Azure Managed App Definition. Create a storage account and take note of the name and SAS key for later. 19 | 1. Enable anonymous access on the blob container that will host the managed app deployment package (a zip file). 20 | 1. The Azure built-in service principal `Managed Applications on Behalf Application` **MUST** be granted the role of `Contributor` and `Role Based Access Control Administrator` on any Azure subscription where the app will be deployed. 21 | 22 | ### Steps to build a Managed App 23 | 24 | ### 1.
Auto format the bicep code (optional) 25 | 26 | As a precaution, auto-format and lint the bicep code to detect any mistakes early-on. 27 | 28 | ```bash 29 | cd /infra 30 | find . -type f -name "*.bicep" -exec az bicep format --file {} \; 31 | find . -type f -name "*.bicep" -exec az bicep lint --file {} \; 32 | ``` 33 | 34 | ### 2. Create & test the Azure portal interface 35 | 36 | Use the [Azure Portal Sandbox](https://portal.azure.com/#blade/Microsoft_Azure_CreateUIDef/SandboxBlade) to test and make UI changes defined in [createUiDefinition.json](createUiDefinition.json). To make additional changes to the Azure portal experience, check out the [documentation](https://learn.microsoft.com/en-us/azure/azure-resource-manager/managed-applications/create-uidefinition-overview) and copy the contents of `createUiDefinition.json` into the sandbox environment. 37 | 38 | ### 3. Prepare the deployment package 39 | 40 | A *deployment package* is a zip file comprised of several files. This will include an ARM template and other files from the previous steps, along with additional code relevant to the deployment (i.e. artifacts) 41 | 42 | The names of certain files (`mainTemplate.json` and `createUiDefinition.json`) should not be modified and are case-sensitive. Azure expects these files to be included in the final managed app deployment package. 43 | 44 | A local copy of the backend docker image needs to be built in order to retrieve a copy of the openapi json spec associated with GraphRAG's REST API. This api spec file will become part of the final deployment package. 45 | ```shell 46 | cd 47 | docker build -t graphrag:latest -f docker/Dockerfile-backend . 
48 | docker run -d -p 8080:80 graphrag:latest 49 | ``` 50 | 51 | Now create the deployment package: 52 | ```bash 53 | cd /infra 54 | 55 | # get the openapi specification file 56 | curl --fail-with-body -o core/apim/openapi.json http://localhost:8080/manpage/openapi.json 57 | 58 | # compile bicep -> ARM 59 | az bicep build --file main.bicep --outfile managed-app/mainTemplate.json 60 | 61 | # zip up all files 62 | cd managed-app 63 | tar -a -cf managed-app-deployment-pkg.zip scripts createUiDefinition.json mainTemplate.json viewDefinition.json 64 | ``` 65 | 66 | The final deployment package should have the following file structure: 67 | ```bash 68 | managed-app-deployment-pkg.zip 69 | ├── scripts 70 | │ └── install-graphrag.sh 71 | ├── createUiDefinition.json 72 | ├── mainTemplate.json 73 | └── viewDefinition.json 74 | ``` 75 | 76 | Upload the zip file to an Azure Storage location in preparation for the next step. 77 | 78 | ### 4. Create a Service Catalog Managed App Definition 79 | 80 | Click [here](https://ms.portal.azure.com/#view/Microsoft_Azure_Marketplace/GalleryItemDetailsBladeNopdl/id/Microsoft.ApplianceDefinition/selectionMode~/false/resourceGroupId//resourceGroupLocation//dontDiscardJourney~/false/selectedMenuId/home/launchingContext~/%7B%22galleryItemId%22%3A%22Microsoft.ApplianceDefinition%22%2C%22source%22%3A%5B%22GalleryFeaturedMenuItemPart%22%2C%22VirtualizedTileDetails%22%5D%2C%22menuItemId%22%3A%22home%22%2C%22subMenuItemId%22%3A%22Search%20results%22%2C%22telemetryId%22%3A%2220409084-39a1-4800-bbce-d0b26a6f46a4%22%7D/searchTelemetryId/d7d20e05-ca16-47f7-bed5-9c7b8d2fa641) or from within the Azure Portal, go to Marketplace and create a `Service Catalog Managed App Definition`. You will be asked to provide a uri link to the uploaded `managed-app-deployment-pkg.zip` file during the creation process. 81 | 82 | ### 5. Deploy the managed app 83 | 84 | There are two deployment options to consider when deploying a managed app. 
As an app in the Marketplace or as a one-click button: 85 | 86 | * Marketplace App 87 | 88 | 1. In the Azure Portal, find and click on the managed app definition resource created in the previous step. 89 | 2. A button option `Deploy from definition` will be available. 90 | 3. Click on it and proceed through the same setup experience (defined by the `createUiDefinition.json` file) that a consumer would experience when installing the managed app. 91 | 4. Additional work is needed to [publish the app](https://learn.microsoft.com/en-us/partner-center/marketplace-offers/plan-azure-application-offer) as an official app in the Azure Marketplace. 92 | 93 | * 1-click Deployment Button 94 | If `mainTemplate.json` is hosted somewhere publicly (e.g. on GitHub), a deployment button can be created that deploys the app when clicked, like the example below. 95 | 96 | [![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure-Samples%2Fgraphrag-accelerator%2Frefs%2Fheads%2Fmain%2Finfra%2Fmanaged-app%2FmainTemplate.json) 97 | -------------------------------------------------------------------------------- /infra/managed-app/scripts/install-graphrag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install kubectl 4 | set -e 5 | az aks install-cli --only-show-errors 6 | az login --identity 7 | 8 | # Get AKS credentials 9 | # requires "Azure Kubernetes Service Cluster Admin" role and "Azure Kubernetes Service RBAC Cluster Admin" role 10 | az aks get-credentials \ 11 | --admin \ 12 | --name $AKS_NAME \ 13 | --resource-group $RESOURCE_GROUP --only-show-errors 14 | 15 | # Define a namespace to install graphrag in 16 | aksNamespace="graphrag" 17 | 18 | # Setup an image pull secret for AKS to access ACR 19 | # NOTE: use an image pull secret instead of managed identity RBAC roles to seamlessly enable ACR access from any
subscription/tenant 20 | aksSecretName="regcred" 21 | kubectl create namespace $aksNamespace 22 | kubectl create secret docker-registry $aksSecretName \ 23 | --docker-server=$ACR_SERVER \ 24 | --docker-username=$ACR_TOKEN_NAME \ 25 | --docker-password=$ACR_TOKEN_PASSWORD \ 26 | --namespace $aksNamespace 27 | 28 | # Install helm 29 | curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 -o get_helm.sh -s 30 | chmod 700 get_helm.sh 31 | ./get_helm.sh &>/dev/null 32 | 33 | # Login to ACR and retrieve helm chart 34 | # A token for the ACR should be generated ahead of time 35 | helm registry login $ACR_SERVER --username $ACR_TOKEN_NAME --password $ACR_TOKEN_PASSWORD 36 | helm pull "oci://$ACR_SERVER/helm/graphrag" --untar 37 | 38 | # Install the helm chart 39 | helm upgrade -i graphrag ./graphrag -f ./graphrag/values.yaml \ 40 | --namespace $aksNamespace --create-namespace \ 41 | --set "serviceAccount.name=$AKS_SERVICE_ACCOUNT_NAME" \ 42 | --set "serviceAccount.annotations.azure\.workload\.identity/client-id=$WORKLOAD_IDENTITY_CLIENT_ID" \ 43 | --set "master.imagePullSecrets[0].name=$aksSecretName" \ 44 | --set "master.image.repository=$ACR_SERVER/$IMAGE_NAME" \ 45 | --set "master.image.tag=$IMAGE_VERSION" \ 46 | --set "ingress.host=$APP_HOSTNAME" \ 47 | --set "graphragConfig.AI_SEARCH_URL=https://$AI_SEARCH_NAME.$AI_SEARCH_ENDPOINT_SUFFIX" \ 48 | --set "graphragConfig.AI_SEARCH_AUDIENCE=$AI_SEARCH_AUDIENCE" \ 49 | --set "graphragConfig.APPLICATIONINSIGHTS_CONNECTION_STRING=$APP_INSIGHTS_CONNECTION_STRING" \ 50 | --set "graphragConfig.COGNITIVE_SERVICES_AUDIENCE=$COGNITIVE_SERVICES_AUDIENCE" \ 51 | --set "graphragConfig.COSMOS_URI_ENDPOINT=$COSMOSDB_ENDPOINT" \ 52 | --set "graphragConfig.GRAPHRAG_API_BASE=$AOAI_ENDPOINT" \ 53 | --set "graphragConfig.GRAPHRAG_API_VERSION=$AOAI_LLM_MODEL_API_VERSION" \ 54 | --set "graphragConfig.GRAPHRAG_LLM_MODEL=$AOAI_LLM_MODEL"\ 55 | --set "graphragConfig.GRAPHRAG_LLM_DEPLOYMENT_NAME=$AOAI_LLM_MODEL_DEPLOYMENT_NAME" 
\ 56 | --set "graphragConfig.GRAPHRAG_EMBEDDING_MODEL=$AOAI_EMBEDDING_MODEL" \ 57 | --set "graphragConfig.GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME=$AOAI_EMBEDDING_MODEL_DEPLOYMENT_NAME" \ 58 | --set "graphragConfig.STORAGE_ACCOUNT_BLOB_URL=$STORAGE_ACCOUNT_BLOB_URL" 59 | -------------------------------------------------------------------------------- /infra/managed-app/viewDefinition.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/viewdefinition/0.0.1-preview/ViewDefinition.json#", 3 | "contentVersion": "0.0.0.1", 4 | "views": [ 5 | { 6 | "kind": "Overview", 7 | "properties": { 8 | "header": "Welcome to GraphRAG!", 9 | "description": "Enabling customers to build and leverage the power of knowledge graphs every day...

Getting Started: Look under the `Settings` -> `Parameters and Outputs` section, and find `azure_apim_gateway_url`.

GraphRAG Swagger Docs: `/manpage/docs`" 10 | } 11 | } 12 | ] 13 | } -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Instructions 2 | 3 | 1. Create a dataset to use with GraphRAG. You may provide your own data or use the `get-wiki-articles.py` script to download a small set of wikipedia articles for demonstration purposes. 4 | 5 | ```shell 6 | > python get-wiki-articles.py testdata 7 | ``` 8 | For a faster example with less data 9 | ```shell 10 | > python get-wiki-articles.py --short-summary --num-articles 1 testdata 11 | ``` 12 | 13 | 2. Follow instructions in the `1-Quickstart.ipynb` notebook to explore the GraphRAG API, by building an index of the data in `testdata` and executing queries. 14 | -------------------------------------------------------------------------------- /notebooks/get-wiki-articles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Microsoft Corporation. 3 | # Licensed under the MIT License. 4 | 5 | """ 6 | This script downloads a few sample wikipedia articles that can be used for demo or quickstart purposes in conjunction with the solution accelerator. 
import argparse
import os

import wikipedia

# Article names looked up on wikipedia; --num-articles selects a prefix of
# this list, so its length bounds the option's valid range.
us_states = [
    "Alaska",
    "California",
    "Washington (state)",
    "Washington_D.C.",
    "New York (state)",
]


def main():
    """Download sample wikipedia articles as text files.

    Parses the command line, creates the target directory if it does not
    exist, and writes one ``<title>_wiki_article.txt`` file per article,
    where the title is lowercased and its spaces are replaced with
    underscores. With ``--short-summary`` the page summary is saved instead
    of the full content.

    A failure on one article is reported on stdout and the remaining
    articles are still attempted.
    """
    parser = argparse.ArgumentParser(description="Wikipedia Download Script")
    parser.add_argument(
        "directory",
        help="Directory to download sample wikipedia articles to.",
        # nargs="?" makes the positional optional; without it argparse always
        # requires the argument and the declared default is never applied.
        nargs="?",
        default="testdata",
    )
    parser.add_argument(
        "--short-summary",
        help="Retrieve short summary article content.",
        action="store_true",
    )
    parser.add_argument(
        "--num-articles",
        help="Number of wikipedia articles to download. Default=%(default)s",
        # Derive the bounds from the list so they cannot drift apart.
        default=len(us_states),
        choices=range(1, len(us_states) + 1),
        type=int,
    )
    args = parser.parse_args()
    os.makedirs(args.directory, exist_ok=True)
    for state in us_states[: args.num_articles]:
        try:
            # Look the page up once and reuse it for both title and body.
            page = wikipedia.page(state)
            title = page.title.lower().replace(" ", "_")
            content = (page.summary if args.short_summary else page.content).strip()
            filename = os.path.join(args.directory, f"{title}_wiki_article.txt")
            with open(filename, "w", encoding="utf-8") as f:
                f.write(content)
            print(f"Saving wiki article '{title}' to {filename}")
        except Exception as err:
            # Deliberately best-effort per article. Report the requested name
            # rather than `title`, which is unbound (or left over from the
            # previous iteration) when the lookup itself fails.
            print(f"Error fetching wiki article '{state}': {err}")


if __name__ == "__main__":
    main()