├── .azdo
└── azure-pipeline.yaml
├── .devcontainer
├── Dockerfile
├── devcontainer.json
└── entrypoint.sh
├── .dockerignore
├── .editorconfig
├── .github
├── CODEOWNERS
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ └── feature_request.md
├── dependabot.yml
└── workflows
│ ├── dev.yaml
│ └── tests.yaml
├── .pre-commit-config.yaml
├── .secrets.baseline
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── SECURITY.md
├── TRANSPARENCY.md
├── backend
├── .coveragerc
├── README.md
├── graphrag_app
│ ├── __init__.py
│ ├── api
│ │ ├── __init__.py
│ │ ├── data.py
│ │ ├── graph.py
│ │ ├── index.py
│ │ ├── prompt_tuning.py
│ │ ├── query.py
│ │ ├── query_streaming.py
│ │ └── source.py
│ ├── logger
│ │ ├── __init__.py
│ │ ├── application_insights_workflow_callbacks.py
│ │ ├── blob_workflow_callbacks.py
│ │ ├── console_workflow_callbacks.py
│ │ ├── load_logger.py
│ │ ├── pipeline_job_updater.py
│ │ └── typing.py
│ ├── main.py
│ ├── typing
│ │ ├── __init__.py
│ │ ├── models.py
│ │ └── pipeline.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── azure_clients.py
│ │ ├── common.py
│ │ └── pipeline.py
├── manifests
│ ├── cronjob.yaml
│ └── job.yaml
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── scripts
│ ├── indexer.py
│ ├── job-scheduler.py
│ └── settings.yaml
└── tests
│ ├── __init__.py
│ ├── conftest.py
│ ├── data
│ └── synthetic-dataset
│ │ ├── ABOUT.md
│ │ └── output
│ │ ├── create_final_communities.parquet
│ │ ├── create_final_community_reports.parquet
│ │ ├── create_final_covariates.parquet
│ │ ├── create_final_documents.parquet
│ │ ├── create_final_entities.parquet
│ │ ├── create_final_nodes.parquet
│ │ ├── create_final_relationships.parquet
│ │ ├── create_final_text_units.parquet
│ │ ├── graph.graphml
│ │ └── stats.json
│ ├── integration
│ ├── test_api_data.py
│ ├── test_api_default.py
│ ├── test_api_graph.py
│ ├── test_api_index.py
│ ├── test_api_index_configuration.py
│ ├── test_api_prompt_tuning.py
│ ├── test_api_source.py
│ └── test_utils_pipeline.py
│ └── unit
│ ├── test_azure_clients.py
│ ├── test_common.py
│ ├── test_load_logger.py
│ ├── test_logger_app_insights_callbacks.py
│ ├── test_logger_blob_callbacks.py
│ └── test_logger_console_callbacks.py
├── docker
├── Dockerfile-backend
└── Dockerfile-frontend
├── docs
├── DEPLOYMENT-GUIDE.md
├── DEVELOPMENT-GUIDE.md
└── assets
│ ├── graphrag-architecture-diagram.png
│ └── graphrag-architecture-diagram.vsdx
├── frontend
├── .streamlit
│ └── config.toml
├── README.md
├── app.py
├── deploy.sh
├── frontend_deploy.parameters.json
├── poetry.lock
├── pyproject.toml
├── src
│ ├── __init__.py
│ ├── components
│ │ ├── __init__.py
│ │ ├── index_pipeline.py
│ │ ├── login_sidebar.py
│ │ ├── prompt_configuration.py
│ │ ├── query.py
│ │ ├── tabs.py
│ │ └── upload_files_component.py
│ ├── enums.py
│ ├── functions.py
│ └── graphrag_api.py
└── style.css
├── infra
├── abbreviations.json
├── core
│ ├── acr
│ │ └── acr.bicep
│ ├── ai-search
│ │ └── ai-search.bicep
│ ├── aks
│ │ └── aks.bicep
│ ├── aoai
│ │ └── aoai.bicep
│ ├── apim
│ │ ├── apim.bicep
│ │ ├── apim.graphrag-api.bicep
│ │ ├── apim.graphrag-docs-api.bicep
│ │ ├── openapi.json
│ │ └── policies
│ │ │ └── apiPolicy.xml
│ ├── cosmosdb
│ │ └── cosmosdb.bicep
│ ├── identity
│ │ └── identity.bicep
│ ├── log-analytics
│ │ └── log.bicep
│ ├── monitor
│ │ ├── app-insights.bicep
│ │ └── private-link-scope.bicep
│ ├── rbac
│ │ ├── aks-rbac.bicep
│ │ ├── aoai-rbac.bicep
│ │ └── workload-identity-rbac.bicep
│ ├── scripts
│ │ └── deployment-script.bicep
│ ├── storage
│ │ └── storage.bicep
│ └── vnet
│ │ ├── nsg.bicep
│ │ ├── private-dns-vnet-link.bicep
│ │ ├── private-dns-zone-a-record.bicep
│ │ ├── private-dns-zone-groups.json
│ │ ├── private-dns-zone.bicep
│ │ ├── private-endpoint.bicep
│ │ ├── privatelink-private-dns-zones.bicep
│ │ ├── vnet-dns-link.bicep
│ │ └── vnet.bicep
├── deploy.parameters.json
├── deploy.sh
├── helm
│ ├── README.md
│ └── graphrag
│ │ ├── .helmignore
│ │ ├── Chart.yaml
│ │ ├── LICENSE
│ │ ├── templates
│ │ ├── NOTES.txt
│ │ ├── _helpers.tpl
│ │ ├── graphrag-clusterrole.yaml
│ │ ├── graphrag-configmap.yaml
│ │ ├── graphrag-ingress.yaml
│ │ ├── graphrag-master-deployment.yaml
│ │ ├── graphrag-master-hpa.yaml
│ │ ├── graphrag-master-service.yaml
│ │ ├── graphrag-nginx-internal-controller.yaml
│ │ ├── graphrag-rolebinding.yaml
│ │ ├── graphrag-serviceaccount.yaml
│ │ └── tests
│ │ │ └── test-connection.yaml
│ │ └── values.yaml
├── main.bicep
└── managed-app
│ ├── README.md
│ ├── createUiDefinition.json
│ ├── mainTemplate.json
│ ├── scripts
│ └── install-graphrag.sh
│ └── viewDefinition.json
├── notebooks
├── 1-Quickstart.ipynb
├── 2-Advanced_Getting_Started.ipynb
├── README.md
└── get-wiki-articles.py
└── openapi.json
/.azdo/azure-pipeline.yaml:
--------------------------------------------------------------------------------
1 | trigger:
2 | - main
3 |
4 | # the `resources` section specifies the location and version of the 1ES Pipeline Template (PT).
5 | resources:
6 | repositories:
7 | - repository: 1ESPipelineTemplates
8 | type: git
9 | name: 1ESPipelineTemplates/1ESPipelineTemplates
10 | ref: refs/tags/release
11 |
12 | extends:
13 | # this pipeline extends an existing 1ES PT which injects various SDL and compliance tasks
14 | template: v1/1ES.Official.PipelineTemplate.yml@1ESPipelineTemplates
15 | parameters:
16 | pool:
17 | name: OCTO1ES_HostedPool
18 | image: SMTOCTO1ESAgentWindowsVM
19 | os: windows
20 | sdl:
21 | skipComponentGovernanceDetection: false
22 | policheck:
23 | enabled: true
24 | sourceAnalysisPool:
25 | name: OCTO1ES_HostedPool
26 | image: SMTOCTO1ESAgentWindowsVM
27 | os: windows
28 | stages:
29 | - stage: Component_Governance
30 | jobs:
31 | - job: CG_Prep
32 | steps:
33 | # Component Governance (CG) does not support pyproject.toml yet.
34 | # For this reason, we export dependencies into a requirements.txt file.
35 | # CG will auto-detect the requirements.txt file and use it to scan for dependencies.
36 | - script: |
37 | pip install poetry poetry-plugin-export
38 | poetry export --directory=backend --format=requirements.txt --without-hashes --without-urls --all-extras --all-groups -o requirements.txt
39 | displayName: "Export python dependencies to requirements.txt for CG"
40 |
--------------------------------------------------------------------------------
/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
1 | # For more information about the base image visit:
2 | # https://mcr.microsoft.com/en-us/artifact/mar/devcontainers/python/about
3 | FROM mcr.microsoft.com/devcontainers/python:3.10-bookworm
4 |
5 | # disable common warning messages
6 | ENV DEBIAN_FRONTEND=noninteractive
7 | ENV PIP_ROOT_USER_ACTION=ignore
8 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1
9 |
10 | # configure environment
11 | ARG ENVNAME="GraphRAG"
12 | ARG USERNAME=vscode
13 | ARG WORKDIR=/${ENVNAME}
14 |
15 | # install git, curl, and other required system packages
16 | RUN apt-get update && apt-get install -y \
17 | ca-certificates \
18 | libicu-dev \
19 | git \
20 | curl \
21 | sudo \
22 | pre-commit \
23 | wget \
24 | jq \
25 | apt-transport-https \
26 | lsb-release \
27 | gnupg \
28 | software-properties-common
29 | # install Azure CLI
30 | RUN curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
31 | # install bicep and kubectl
32 | RUN az bicep install && az aks install-cli
33 | # install helm
34 | RUN curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 \
35 | && chmod 700 get_helm.sh \
36 | && ./get_helm.sh \
37 | && rm ./get_helm.sh
38 | # install yq
39 | RUN wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq \
40 | && chmod +x /usr/bin/yq
41 |
42 | # install docker
43 | RUN curl -fsSL https://get.docker.com -o install-docker.sh \
44 | && sh install-docker.sh \
45 | && rm install-docker.sh
46 |
47 | # cleanup to keep the image size down (-y keeps autoremove non-interactive; apt lists are removed last)
48 | RUN apt-get purge -y --auto-remove \
49 | && apt-get autoremove -y \
50 | && apt-get clean \
51 | && rm -rf /var/lib/apt/lists/*
52 |
53 | # create the poetry virtual environment inside the project directory (as .venv)
54 | ENV POETRY_VIRTUALENVS_IN_PROJECT=true
55 |
56 | # a non-root user (vscode) already exists in the base image. Add it to the sudo and docker groups
57 | RUN echo "${USERNAME}:${USERNAME}" | chpasswd \
58 | && adduser ${USERNAME} sudo \
59 | && adduser ${USERNAME} docker \
60 | && echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
61 |
62 | # switch to non-root user
63 | USER ${USERNAME}
64 |
65 | # install poetry
66 | RUN curl -sSL https://install.python-poetry.org | python3 -
67 |
68 | # add the local bin to the PATH for the non-root user
69 | ENV PATH="/home/${USERNAME}/.local/bin:${PATH}"
70 | # Add venv to beginning of path so we don't have to activate it
71 | ENV PATH=/graphrag-accelerator/.venv/bin:$PATH
72 |
73 | # copy the project files into the container and set ownership
74 | COPY --chown=${USERNAME}:${USERNAME} . ${WORKDIR}
75 |
76 | COPY entrypoint.sh /usr/local/bin/entrypoint.sh
77 |
78 | # Create directories for vscode server and extensions
79 | RUN mkdir -p ~/.vscode-server/extensions \
80 | && chown -R $USERNAME:$USERNAME ~/.vscode-server
81 |
82 | ENTRYPOINT [ "/usr/local/bin/entrypoint.sh" ]
83 | CMD ["bash"]
84 |
--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "graphrag-accelerator",
3 | "build": {
4 | "dockerfile": "./Dockerfile",
5 | "args": {
6 | "DOCKER_GROUP_ID": "${localEnv:DOCKER_GROUP_ID}"
7 | }
8 | },
9 | "forwardPorts": [ 7071 ],
10 | "runArgs": [
11 | "--network", "host" // use host networking so that the dev container can access the API when running the container locally
12 | ],
13 | "remoteUser": "vscode",
14 | "remoteEnv": {
15 | // We add the .venv to the beginning of the path env in the Dockerfile
16 | // so that we use the proper python, however vscode rewrites/overwrites
17 | // the PATH in the image and puts /usr/local/bin in front of our .venv
18 | // path. This fixes that issue.
19 | "PATH": "${containerEnv:PATH}",
20 | // Add the backend folder to PYTHONPATH so that we can import modules that
21 | // are in the source dir (only module search paths belong here, not $PATH)
22 | "PYTHONPATH": "/graphrag-accelerator/backend/"
23 | // disable SSL verification for Azure CLI if working in CodeSpaces
24 | // "AZURE_CLI_DISABLE_CONNECTION_VERIFICATION": "1"
25 | },
26 | "mounts": [
27 | // NOTE: we reference both HOME and USERPROFILE environment variables to simultaneously support both Windows and Unix environments
28 | // in most default situations, only one variable will exist (Windows has USERPROFILE and unix has HOME) and a reference to the other variable will result in an empty string
29 | // Keep command history
30 | "type=volume,source=graphrag-bashhistory,target=/home/vscode/command_history",
31 | "type=volume,source=graphrag-devcontainer-vscode-server,target=/home/vscode/.vscode-server/extensions",
32 | // Mounts the login details from the host machine so azcli works seamlessly in the container
33 | // "type=bind,source=${localEnv:HOME}${localEnv:USERPROFILE}/.azure,target=/home/vscode/.azure",
34 | // Mounts the ssh details from the host machine - this allows the container to connect to ssh hosts
35 | "type=bind,source=${localEnv:HOME}${localEnv:USERPROFILE}/.ssh,target=/home/vscode/.ssh",
36 | // Mount docker socket for docker builds
37 | "type=bind,source=/var/run/docker.sock,target=/var/run/docker.sock"
38 | ],
39 | "customizations": {
40 | "vscode": {
41 | // Set *default* container specific settings.json values on container create.
42 | "settings": {
43 | "python.pythonPath": "/graphrag-accelerator/.venv/bin/python",
44 | "python.defaultInterpreterPath": "/graphrag-accelerator/.venv/bin/python",
45 | "python.languageServer": "Pylance",
46 | "files.watcherExclude": {
47 | "**/.git/objects/**": true,
48 | "**/.git/subtree-cache/**": true,
49 | "**/node_modules/*/**": true,
50 | "**/.python_packages/*/**": true
51 | },
52 | "files.associations": {
53 | "*.workbook": "[jsonc]"
54 | },
55 | "ruff.interpreter": [
56 | "/graphrag-accelerator/.venv/bin/python"
57 | ],
58 | "ruff.lint.args": [
59 | "--config",
60 | "/graphrag-accelerator/pyproject.toml"
61 | ],
62 | "ruff.lint.run": "onType"
63 | },
64 | // Add the IDs of extensions you want installed when the container is created.
65 | "extensions": [
66 | "donjayamanne.githistory",
67 | "codezombiech.gitignore",
68 | "GitHub.copilot",
69 | "GitHub.copilot-chat",
70 | "ms-azuretools.vscode-docker",
71 | "ms-azuretools.vscode-bicep",
72 | "ms-dotnettools.vscode-dotnet-runtime",
73 | "ms-kubernetes-tools.vscode-kubernetes-tools",
74 | "ms-python.python",
75 | "ms-python.vscode-pylance",
76 | "ms-toolsai.datawrangler",
77 | "ms-toolsai.jupyter",
78 | "ms-toolsai.jupyter-keymap",
79 | "ms-toolsai.vscode-jupyter-cell-tags",
80 | "ms-toolsai.vscode-jupyter-slideshow",
81 | "ziyasal.vscode-open-in-github",
82 | "charliermarsh.ruff"
83 | ]
84 | }
85 | },
86 | "postCreateCommand": "bash /usr/local/bin/entrypoint.sh",
87 | "workspaceMount": "source=${localWorkspaceFolder},target=/graphrag-accelerator,type=bind,consistency=cached",
88 | "workspaceFolder": "/graphrag-accelerator"
89 | }
--------------------------------------------------------------------------------
/.devcontainer/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Dev container setup: docker socket access, backend dependencies, git hooks.
3 | ################################
4 | ### Docker configuration ###
5 | ################################
6 | sudo chmod 666 /var/run/docker.sock
7 |
8 | ################################
9 | ### Dependency configuration ###
10 | ################################
11 |
12 | # Install graphrag dependencies (abort early if the workspace directory is missing)
13 | ROOT_DIR=/graphrag-accelerator
14 | cd "${ROOT_DIR}" || exit 1
15 | poetry install --no-interaction -v --directory "${ROOT_DIR}/backend"
16 |
17 | #########################
18 | ### Git configuration ###
19 | #########################
20 | git config --global --add safe.directory "${ROOT_DIR}"
21 | pre-commit install
22 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | notebooks
2 |
3 | # Environments
4 | .env
5 | env/
6 | ENV/
7 | **/env.bak
8 | **/venv.bak
9 | **/.venv
10 | **/venv
11 |
12 | .github
13 | .git
14 | **/__pycache__
15 | *.pyc
16 | *.pyo
17 | *.pyd
18 | **/.pytest_cache
19 | **/.ruff_cache
20 | **/.DS_Store
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 |
3 | [*]
4 | end_of_line = lf
5 | insert_final_newline = true
6 |
--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # These owners will be the default owners for everything in
2 | # the repo. Unless a later match takes precedence,
3 | # @graphrag-core-team and @societal-resilience-graphrag will be requested for
4 | # review when someone opens a pull request.
5 | * @Azure-Samples/graphrag-core-team @Azure-Samples/societal-resilience-graphrag
6 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: "[BUG]"
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Version [e.g. 22]
29 |
30 | **Additional context**
31 | Add any other context about the problem here.
32 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: "[FEATURE]"
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
5 |
6 | version: 2
7 | updates:
8 | - package-ecosystem: "pip"
9 | directory: "/backend"
10 | schedule:
11 | interval: "weekly"
12 |
--------------------------------------------------------------------------------
/.github/workflows/dev.yaml:
--------------------------------------------------------------------------------
1 | name: Dev Build
2 | on:
3 | workflow_dispatch: # triggered manually via the GitHub UI
4 | pull_request: # triggered when a PR is created or updated
5 | types:
6 | - opened
7 | - reopened
8 | - synchronize
9 | - ready_for_review
10 | paths: # only trigger on changes in specific directories
11 | - '.github/**/*.yaml'
12 | - 'backend/**'
13 | - 'docker/**'
14 | - 'infra/**'
15 | - '**/poetry.lock' # dependency files live in backend/ and frontend/, not the repo root
16 | - '**/pyproject.toml'
17 | jobs:
18 | lint-check:
19 | runs-on: ubuntu-latest
20 | steps:
21 | - name: Checkout repository
22 | uses: actions/checkout@v4
23 | - name: Setup python
24 | uses: actions/setup-python@v5
25 | with:
26 | python-version: '3.10'
27 | - run: pip install ruff
28 | - run: |
29 | ruff check
30 | build-devcontainer:
31 | needs: [lint-check]
32 | runs-on: ubuntu-latest
33 | if: ${{ !github.event.pull_request.draft }}
34 | steps:
35 | - name: Checkout repository
36 | uses: actions/checkout@v4
37 | - name: Build docker image
38 | uses: docker/build-push-action@v2
39 | with:
40 | context: .devcontainer
41 | push: false
42 | build-backend:
43 | needs: [lint-check]
44 | runs-on: ubuntu-latest
45 | if: ${{ !github.event.pull_request.draft }}
46 | steps:
47 | - name: Checkout repository
48 | uses: actions/checkout@v4
49 | - name: Build docker image
50 | uses: docker/build-push-action@v2
51 | with:
52 | context: .
53 | file: docker/Dockerfile-backend
54 | push: false
55 | build-frontend:
56 | needs: [lint-check]
57 | runs-on: ubuntu-latest
58 | if: ${{ !github.event.pull_request.draft }}
59 | steps:
60 | - name: Checkout repository
61 | uses: actions/checkout@v4
62 | - name: Build docker image
63 | uses: docker/build-push-action@v2
64 | with:
65 | context: .
66 | file: docker/Dockerfile-frontend
67 | push: false
68 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yaml:
--------------------------------------------------------------------------------
1 | name: Testing
2 | on:
3 | workflow_dispatch: # triggered manually via the GitHub UI
4 | pull_request: # triggered when a PR is created or updated
5 | types:
6 | - opened
7 | - reopened
8 | - synchronize
9 | - ready_for_review
10 | paths: # only trigger on changes in specific directories
11 | - '.github/**/*.yaml'
12 | - 'backend/**'
13 | - 'docker/**'
14 | - 'backend/poetry.lock'
15 | - 'backend/pyproject.toml'
16 |
17 | env:
18 | PYTHON_VERSION: '3.10'
19 |
20 | jobs:
21 | tests:
22 | runs-on: windows-latest
23 | steps:
24 | - name: Checkout repository
25 | uses: actions/checkout@v4
26 |
27 | - name: Install python
28 | uses: actions/setup-python@v5
29 | with:
30 | python-version: ${{ env.PYTHON_VERSION }} # version is defined once in the workflow-level env
31 |
32 | - name: Install Azurite
33 | shell: bash
34 | run: |
35 | npm install -g azurite
36 | azurite --silent &
37 |
38 | # For more information on installation/setup of Azure Cosmos DB Emulator
39 | # https://learn.microsoft.com/en-us/azure/cosmos-db/how-to-develop-emulator?tabs=docker-linux%2Cpython&pivots=api-nosql
40 | # Note: the emulator is only available on Windows runners. It can take longer than the default to initially startup so we increase the default timeout.
41 | # If a job fails due to timeout, restarting the cicd job usually resolves the problem.
42 | - name: Install Azure Cosmos DB emulator
43 | run: |
44 | Write-Host "Launching Cosmos DB Emulator"
45 | Import-Module "$env:ProgramFiles\Azure Cosmos DB Emulator\PSModules\Microsoft.Azure.CosmosDB.Emulator"
46 | Start-CosmosDbEmulator -Timeout 500
47 |
48 | - name: Install dependencies
49 | working-directory: ${{ github.workspace }}/backend
50 | run: |
51 | pip install poetry
52 | poetry config virtualenvs.create false
53 | poetry install --with test
54 |
55 | - name: Run pytests
56 | working-directory: ${{ github.workspace }}/backend
57 | run: |
58 | pytest --cov=graphrag_app --junitxml=test-results.xml tests/
59 |
60 | - name: Upload test results
61 | uses: actions/upload-artifact@v4
62 | with:
63 | name: pytest-results
64 | path: ${{ github.workspace }}/backend/test-results.xml
65 | # Use always() to always run this step to publish test results when there are test failures
66 | if: ${{ always() }}
67 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | exclude: "tests/data"
2 | repos:
3 | - repo: https://github.com/kynan/nbstripout
4 | rev: 0.7.1
5 | hooks:
6 | - id: nbstripout
7 | - repo: https://github.com/pre-commit/pre-commit-hooks
8 | rev: v4.6.0
9 | hooks:
10 | - id: requirements-txt-fixer
11 | - id: mixed-line-ending
12 | - id: trailing-whitespace
13 | - id: check-json
14 | exclude: "devcontainer.json"
15 | - id: pretty-format-json
16 | args:
17 | - "--autofix"
18 | exclude: 'tests/|devcontainer.json|^.*\.ipynb$'
19 | - id: end-of-file-fixer
20 | files: \.(py|sh|bash|conf|yaml|yml|toml|ini)$
21 | - repo: https://github.com/astral-sh/ruff-pre-commit
22 | rev: v0.4.6
23 | hooks:
24 | # Run the linter.
25 | - id: ruff
26 | types_or: [ python, pyi, jupyter ]
27 | # Run the formatter.
28 | - id: ruff-format
29 | types_or: [ python, pyi, jupyter ]
30 | - repo: https://github.com/Yelp/detect-secrets
31 | rev: v1.5.0
32 | hooks:
33 | - id: detect-secrets
34 | args: ['--baseline', '.secrets.baseline']
35 |
--------------------------------------------------------------------------------
/.secrets.baseline:
--------------------------------------------------------------------------------
1 | {
2 | "version": "1.5.0",
3 | "plugins_used": [
4 | {
5 | "name": "ArtifactoryDetector"
6 | },
7 | {
8 | "name": "AWSKeyDetector"
9 | },
10 | {
11 | "name": "AzureStorageKeyDetector"
12 | },
13 | {
14 | "name": "Base64HighEntropyString",
15 | "limit": 4.5
16 | },
17 | {
18 | "name": "BasicAuthDetector"
19 | },
20 | {
21 | "name": "CloudantDetector"
22 | },
23 | {
24 | "name": "DiscordBotTokenDetector"
25 | },
26 | {
27 | "name": "GitHubTokenDetector"
28 | },
29 | {
30 | "name": "GitLabTokenDetector"
31 | },
32 | {
33 | "name": "HexHighEntropyString",
34 | "limit": 3.0
35 | },
36 | {
37 | "name": "IbmCloudIamDetector"
38 | },
39 | {
40 | "name": "IbmCosHmacDetector"
41 | },
42 | {
43 | "name": "IPPublicDetector"
44 | },
45 | {
46 | "name": "JwtTokenDetector"
47 | },
48 | {
49 | "name": "KeywordDetector",
50 | "keyword_exclude": ""
51 | },
52 | {
53 | "name": "MailchimpDetector"
54 | },
55 | {
56 | "name": "NpmDetector"
57 | },
58 | {
59 | "name": "OpenAIDetector"
60 | },
61 | {
62 | "name": "PrivateKeyDetector"
63 | },
64 | {
65 | "name": "PypiTokenDetector"
66 | },
67 | {
68 | "name": "SendGridDetector"
69 | },
70 | {
71 | "name": "SlackDetector"
72 | },
73 | {
74 | "name": "SoftlayerDetector"
75 | },
76 | {
77 | "name": "SquareOAuthDetector"
78 | },
79 | {
80 | "name": "StripeDetector"
81 | },
82 | {
83 | "name": "TelegramBotTokenDetector"
84 | },
85 | {
86 | "name": "TwilioKeyDetector"
87 | }
88 | ],
89 | "filters_used": [
90 | {
91 | "path": "detect_secrets.filters.allowlist.is_line_allowlisted"
92 | },
93 | {
94 | "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
95 | "min_level": 2
96 | },
97 | {
98 | "path": "detect_secrets.filters.heuristic.is_indirect_reference"
99 | },
100 | {
101 | "path": "detect_secrets.filters.heuristic.is_likely_id_string"
102 | },
103 | {
104 | "path": "detect_secrets.filters.heuristic.is_lock_file"
105 | },
106 | {
107 | "path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string"
108 | },
109 | {
110 | "path": "detect_secrets.filters.heuristic.is_potential_uuid"
111 | },
112 | {
113 | "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign"
114 | },
115 | {
116 | "path": "detect_secrets.filters.heuristic.is_sequential_string"
117 | },
118 | {
119 | "path": "detect_secrets.filters.heuristic.is_swagger_file"
120 | },
121 | {
122 | "path": "detect_secrets.filters.heuristic.is_templated_secret"
123 | }
124 | ],
125 | "results": {
126 | "backend/pytest.ini": [
127 | {
128 | "type": "Secret Keyword",
129 | "filename": "backend/pytest.ini",
130 | "hashed_secret": "589c2d30c725c063a05a59110ea5888a80a28f15",
131 | "is_verified": false,
132 | "line_number": 7
133 | },
134 | {
135 | "type": "Azure Storage Account access key",
136 | "filename": "backend/pytest.ini",
137 | "hashed_secret": "7388811af1e10afcb96c331748597e7a75e27e7d",
138 | "is_verified": false,
139 | "line_number": 7
140 | },
141 | {
142 | "type": "Secret Keyword",
143 | "filename": "backend/pytest.ini",
144 | "hashed_secret": "1655679f8bfda925b76ee655dfac4519d90d3431",
145 | "is_verified": false,
146 | "line_number": 8
147 | },
148 | {
149 | "type": "Azure Storage Account access key",
150 | "filename": "backend/pytest.ini",
151 | "hashed_secret": "5666459779d6a76bea73453137803fd27d8f79cd",
152 | "is_verified": false,
153 | "line_number": 8
154 | }
155 | ]
156 | },
157 | "generated_at": "2024-12-17T06:41:24Z"
158 | }
159 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to GraphRAG Accelerator
2 |
3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
5 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
6 |
7 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
8 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
9 | provided by the bot. You will only need to do this once across all repos using our CLA.
10 |
11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
14 |
15 | - [Code of Conduct](#code-of-conduct)
16 | - [Issues and Bugs](#found-an-issue)
17 | - [Feature Requests](#want-a-feature)
18 | - [Submission Guidelines](#submission-guidelines)
19 |
20 | ## Code of Conduct
21 | Help us keep this project open and inclusive. Please read and follow our [Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
22 |
23 | ## Found an Issue?
24 | If you find a bug in the source code or a mistake in the documentation, you can help us by
25 | [submitting an issue](#submitting-an-issue) to the GitHub Repository. Even better, you can
26 | [submit a Pull Request](#submitting-a-pull-request-pr) with a fix.
27 |
28 | ## Want a Feature?
29 | You can *request* a new feature by [submitting an issue](#submitting-an-issue) to the GitHub
30 | Repository. If you would like to *implement* a new feature, please submit an issue with
31 | a proposal for your work first, to be sure that we can use it.
32 |
33 | * **Small Features** can be crafted and directly [submitted as a Pull Request](#submitting-a-pull-request-pr).
34 |
35 | ## Submission Guidelines
36 |
37 | ### Submitting an Issue
38 | Before you submit an issue, search the archive, maybe your question was already answered.
39 |
40 | If your issue appears to be a bug, and hasn't been reported, open a new issue.
41 | Help us to maximize the effort we can spend fixing issues and adding new
42 | features, by not reporting duplicate issues. Providing the following information will increase the
43 | chances of your issue being dealt with quickly:
44 |
45 | * **Overview of the Issue** - if an error is being thrown a non-minified stack trace helps
46 | * **Version** - what version is affected (e.g. 0.1.2)
47 | * **Motivation for or Use Case** - explain what you are trying to do and why the current behavior is a bug for you
48 | * **Browsers and Operating System** - is this a problem with all browsers?
49 | * **Reproduce the Error** - provide a live example or an unambiguous set of steps
50 | * **Related Issues** - has a similar issue been reported before?
51 | * **Suggest a Fix** - if you can't fix the bug yourself, perhaps you can point to what might be
52 | causing the problem (line of code or commit)
53 |
54 | You can file new issues by providing the above information at the repository's issues link: https://github.com/Azure-Samples/graphrag-accelerator/issues/new.
55 |
56 | ### Submitting a Pull Request (PR)
57 | Before you submit your Pull Request (PR) consider the following guidelines:
58 |
59 | * Search the repository (https://github.com/Azure-Samples/graphrag-accelerator/pulls) for an open or closed PR
60 | that relates to your submission. You don't want to duplicate effort.
61 |
62 | * Make your changes in a new git fork:
63 |
64 | * Commit your changes using a descriptive commit message
65 | * Push your fork to GitHub:
66 | * In GitHub, create a pull request
67 | * If we suggest changes then:
68 | * Make the required updates.
69 | * Rebase your fork and force push to your GitHub repository (this will update your Pull Request):
70 |
71 | ```shell
72 | git rebase main -i
73 | git push -f
74 | ```
75 |
76 | That's it! Thank you for your contribution!
77 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) Microsoft Corporation.
2 |
3 | MIT License
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GraphRAG Accelerator
2 |
3 | ## ⚠️ ATTENTION
4 | This repository is no longer maintained. We sincerely appreciate the interest and all contributors to the Graphrag Solution Accelerator.
5 |
6 | 🚀 Future development and updates - Please visit the [graphrag library](https://github.com/microsoft/graphrag) for future updates and continued collaboration with the graphrag community at Microsoft.
7 |
8 | ---
9 |
10 | [](https://vscode.dev/redirect?url=vscode://ms-vscode-remote.remote-containers/cloneInVolume?url=https://github.com/Azure-Samples/graphrag-accelerator)
11 |
12 | Welcome to the GraphRAG solution accelerator! This accelerator builds on top of the [graphrag](https://github.com/microsoft/graphrag) python package and exposes API endpoints hosted on Azure, which can be used to trigger indexing pipelines and enable querying of the graphrag knowledge graph.
13 |
14 | This repository presents a methodology for running a hosted service using knowledge graph memory structures to enhance LLM outputs. Please note that the provided code serves as a demonstration and is not an officially supported Microsoft offering.
15 |
16 | ⚠️ Warning: The GraphRAG Accelerator uses multiple Azure services and may incur substantial costs. It is meant to host a high-utilization API with auto-scaling and user access control. Please see the [deployment bicep](infra/main.bicep) for further detail on the services used.
17 |
18 | ⚠️ Warning: GraphRAG indexing can be an expensive operation. Please read all documentation to understand the process and costs involved, and start with a small amount of data.
19 |
20 | For FAQ and our roadmap, please visit `aka.ms/graphrag`
21 |
22 | 
23 |
24 | ## Getting Started with GraphRAG on Azure
25 |
26 | ### Deployment Guide
27 | To deploy the solution accelerator, see the [deployment guide](docs/DEPLOYMENT-GUIDE.md). This will result in a full deployment of graphrag as an API.
28 | Afterwards, check out the [Quickstart](notebooks/1-Quickstart.ipynb) notebook for a demonstration of various API calls.
29 |
30 | ## Development Guide
31 | Interested in contributing? Check out the [development guide](docs/DEVELOPMENT-GUIDE.md).
32 |
33 | ### How to file issues and get help
34 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing issues before filing new issues to avoid duplicates. For new issues, file your bug or feature request as a new Issue.
35 |
36 | ## Contributing
37 |
38 | This project welcomes contributions and suggestions. Most contributions require you to
39 | agree to a Contributor License Agreement (CLA) declaring that you have the right to,
40 | and actually do, grant us the rights to use your contribution. For details, visit
41 | https://cla.microsoft.com.
42 |
43 | When you submit a pull request, a CLA-bot will automatically determine whether you need
44 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the
45 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA.
46 |
47 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
48 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
49 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
50 |
51 | # Trademarks
52 |
53 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft’s Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party’s policies.
54 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40 |
41 |
--------------------------------------------------------------------------------
/TRANSPARENCY.md:
--------------------------------------------------------------------------------
1 | # GraphRAG: Responsible AI FAQ
2 |
3 | ## What is GraphRAG?
4 | GraphRAG is an AI-based content interpretation and search capability. Using LLMs, it parses data to create a knowledge graph and answer user questions about a user-provided private dataset.
5 |
6 | ## What can GraphRAG do?
7 | GraphRAG is able to connect information across large volumes of information and use these connections to answer questions that are difficult or impossible to answer using keyword and vector-based search mechanisms. This lets a system using GraphRAG answer questions where the answers span many documents, as well as thematic questions such as “what are the top themes in this dataset?”
8 |
9 | ## What are GraphRAG’s intended use(s)?
10 | - GraphRAG is intended to support critical information discovery and analysis use cases where the information required to arrive at a useful insight spans many documents, is noisy, is mixed with mis- and/or disinformation, or when the questions users aim to answer are more abstract or thematic than the underlying data can directly answer.
11 | - GraphRAG is designed to be used in settings where users are already trained on responsible analytic approaches and critical reasoning is expected. GraphRAG is capable of providing high degrees of insight on complex information topics, however human analysis by a domain expert of the answers is needed in order to verify and augment GraphRAG’s generated responses.
12 | - GraphRAG is intended to be deployed and used with a domain specific corpus of text data. GraphRAG itself does not collect user data, but users are encouraged to verify data privacy policies of the chosen LLM used to configure GraphRAG.
13 |
14 | ## How was GraphRAG evaluated? What metrics are used to measure performance?
15 |
16 | GraphRAG has been evaluated in multiple ways. The primary concerns are 1) accurate representation of the data set, 2) providing transparency and groundedness of responses, 3) resilience to prompt and data corpus injection attacks, and 4) low hallucination rates. Details on how each of these has been evaluated are outlined below by number.
17 | 1. Accurate representation of the dataset has been tested by both manual inspection and automated testing against a “gold answer” that is created from randomly selected subsets of a test corpus.
18 | 1. GraphRAG has been tested against datasets with known confusors and noise in multiple domains. These tests include both automated evaluation of answer detail (as compared to vector search approaches) as well as manual inspection using questions that are known to be difficult or impossible for other search systems to answer.
19 | 1. Transparency and groundedness of responses is tested via automated answer coverage evaluation and human inspection of the underlying context returned.
20 | 1. We test both user prompt injection attacks (“jailbreaks”) and cross prompt injection attacks (“data attacks”) using manual and semi-automated techniques.
21 | 1. Hallucination rates are evaluated using claim coverage metrics, manual inspection of answer and source, and adversarial attacks to attempt a forced hallucination through adversarial and exceptionally challenging datasets.
22 |
23 | ## What are the limitations of GraphRAG? How can users minimize the impact of GraphRAG’s limitations when using the system?
24 | - GraphRAG depends on well-constructed indexing examples. For general applications (e.g. content oriented around people, places, organizations, things, etc.) we provide example indexing prompts. For unique datasets, effective indexing can depend on proper identification of domain-specific concepts.
25 | - Indexing is a relatively expensive operation; a best practice to mitigate indexing is to create a small test dataset in the target domain to ensure indexer performance prior to large indexing operations.
26 | - GraphRAG is designed to accept well-formatted UTF-8 text only. Input data that does not conform to this specification will cause issues in indexing with unreliable effects.
27 |
28 | ## What operational factors and settings allow for effective and responsible use of GraphRAG?
29 | - GraphRAG is designed for use by users with domain sophistication and experience working through difficult information challenges. While the approach is generally robust to injection attacks and identifying conflicting sources of information, the system is designed for trusted users. Proper human analysis of responses is important to generate reliable insights, and the provenance of information should be traced to ensure human agreement with the inferences made as part of the answer generation.
30 | - GraphRAG yields the most effective results on natural language text data that is collectively focused on an overall topic or theme, and that is entity rich – entities being people, places, things, or objects that can be uniquely identified.
31 | - GraphRAG has been evaluated for its resilience to prompt and data corpus injection attacks and has been probed for specific types of harms. However, the LLM that the user configures with GraphRAG may produce inappropriate or offensive content which may make it inappropriate to deploy for sensitive contexts without additional mitigations that are specific to the use case and model. Developers should assess outputs for their context and use available safety classifiers, model specific safety filters and features (such as [https://azure.microsoft.com/en-us/products/ai-services/ai-content-safety](https://azure.microsoft.com/en-us/products/ai-services/ai-content-safety)), or custom solutions appropriate for their use case. The use of content safety filters is recommended to prevent XPIA and UPIA attacks, as well as to limit harmful content generation by malicious users. Discretion is advised when modifying or removing filters for applications that require it.
32 |
--------------------------------------------------------------------------------
/backend/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 | **/__init__.py
4 |
--------------------------------------------------------------------------------
/backend/README.md:
--------------------------------------------------------------------------------
1 | # Web App
2 | This directory contains the source code for a FastAPI application that implements a REST API wrapper around the graphrag library. The app has been packaged up as a python package for a cleaner install/deployment experience.
3 |
4 | ## Package Layout
5 | The code has the following structure:
6 | ```shell
7 | backend
8 | ├── README.md
9 | ├── graphrag_app # contains the main application files
10 | │ ├── __init__.py
11 | │ ├── api # endpoint definitions
12 | │ ├── logger # custom loggers designed for graphrag use
13 | │ ├── main.py # initializes the FastAPI application
14 | │ ├── typing # data validation models
15 | │ └── utils # utility/helper functions
16 | ├── manifests # k8s manifest files
17 | ├── poetry.lock
18 | ├── pyproject.toml
19 | ├── pytest.ini
20 | ├── scripts # miscellaneous scripts that get executed in k8s
21 | └── tests # pytests (integration tests + unit tests)
22 | ```
23 |
--------------------------------------------------------------------------------
/backend/graphrag_app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/graphrag_app/__init__.py
--------------------------------------------------------------------------------
/backend/graphrag_app/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/graphrag_app/api/__init__.py
--------------------------------------------------------------------------------
/backend/graphrag_app/api/graph.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import os
5 | import traceback
6 |
7 | from fastapi import (
8 | APIRouter,
9 | Depends,
10 | HTTPException,
11 | status,
12 | )
13 | from fastapi.responses import StreamingResponse
14 |
15 | from graphrag_app.logger.load_logger import load_pipeline_logger
16 | from graphrag_app.utils.azure_clients import AzureClientManager
17 | from graphrag_app.utils.common import (
18 | sanitize_name,
19 | subscription_key_check,
20 | validate_index_file_exist,
21 | )
22 |
# Router that groups the knowledge-graph endpoints under the "/graph" prefix.
graph_route = APIRouter(prefix="/graph", tags=["Graph Operations"])

# The subscription-key dependency is attached only when the
# KUBERNETES_SERVICE_HOST environment variable is set to a non-empty value.
if os.environ.get("KUBERNETES_SERVICE_HOST"):
    graph_route.dependencies.append(Depends(subscription_key_check))
29 |
30 |
@graph_route.get(
    "/graphml/{container_name}",
    summary="Retrieve a GraphML file of the knowledge graph",
    response_description="GraphML file successfully downloaded",
    status_code=status.HTTP_200_OK,
)
async def get_graphml_file(
    container_name: str, sanitized_container_name: str = Depends(sanitize_name)
):
    # NOTE: documented with comments rather than a docstring so the generated
    # OpenAPI description of this endpoint stays unchanged.
    #
    # Purpose: stream the "output/graph.graphml" blob of an index back to the
    # caller as a file attachment.
    #
    # Parameters:
    #   container_name: name taken from the URL path; only used here to build
    #       the error message returned to the client.
    #   sanitized_container_name: value produced by the `sanitize_name`
    #       dependency; used as the blob container name.
    #
    # Returns: a StreamingResponse that forwards the blob's chunks.
    #
    # Raises: HTTPException(500) when the blob client / download cannot be set
    #   up. Anything raised by `validate_index_file_exist` is not caught here
    #   because that call sits outside the try block.

    # validate graphml file existence
    azure_client_manager = AzureClientManager()
    graphml_filename = "graph.graphml"
    blob_filepath = f"output/{graphml_filename}"  # expected file location of the graph based on the workflow
    validate_index_file_exist(sanitized_container_name, blob_filepath)
    try:
        blob_client = azure_client_manager.get_blob_service_client().get_blob_client(
            container=sanitized_container_name, blob=blob_filepath
        )
        # chunks() yields the blob piece by piece, so the file is streamed
        # instead of being loaded into memory in one go
        blob_stream = blob_client.download_blob().chunks()
        return StreamingResponse(
            blob_stream,
            media_type="application/octet-stream",
            headers={"Content-Disposition": f"attachment; filename={graphml_filename}"},
        )
    except Exception as e:
        logger = load_pipeline_logger()
        logger.error(
            message="Could not fetch graphml file",
            cause=e,
            stack=traceback.format_exc(),
        )
        # chain the original error so the root cause stays attached to the
        # HTTP error for anyone inspecting the exception
        raise HTTPException(
            status_code=500,
            detail=f"Could not fetch graphml file for '{container_name}'.",
        ) from e
66 |
--------------------------------------------------------------------------------
/backend/graphrag_app/api/prompt_tuning.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import os
5 | import traceback
6 | from pathlib import Path
7 |
8 | import graphrag.api as api
9 | import yaml
10 | from fastapi import (
11 | APIRouter,
12 | Depends,
13 | HTTPException,
14 | status,
15 | )
16 | from graphrag.config.create_graphrag_config import create_graphrag_config
17 |
18 | from graphrag_app.logger.load_logger import load_pipeline_logger
19 | from graphrag_app.utils.azure_clients import AzureClientManager
20 | from graphrag_app.utils.common import sanitize_name, subscription_key_check
21 |
# Router for the prompt-tuning endpoints, mounted under "/index/config".
prompt_tuning_route = APIRouter(
    prefix="/index/config",
    tags=["Prompt Tuning"],
)

# The subscription-key dependency is attached only when the
# KUBERNETES_SERVICE_HOST environment variable is set to a non-empty value.
if os.environ.get("KUBERNETES_SERVICE_HOST"):
    prompt_tuning_route.dependencies.append(Depends(subscription_key_check))
25 |
26 |
@prompt_tuning_route.get(
    "/prompts",
    summary="Generate custom graphrag prompts based on user-provided data.",
    description="Generating custom prompts from user-provided data may take several minutes to run based on the amount of data used.",
    status_code=status.HTTP_200_OK,
)
async def generate_prompts(
    container_name: str,
    limit: int = 5,
    sanitized_container_name: str = Depends(sanitize_name),
):
    """
    Automatically generate custom prompts for entity extraction,
    community reports, and summarize descriptions based on a sample of provided data.

    Args:
        container_name: Name of the storage container as given by the caller;
            used in error messages and log details.
        limit: Forwarded as ``limit`` to ``api.generate_indexing_prompts``.
        sanitized_container_name: Value produced by the ``sanitize_name``
            dependency; used for the container lookup and as the input
            container in the pipeline configuration.

    Returns:
        A dict with the keys ``entity_extraction_prompt``,
        ``entity_summarization_prompt`` and ``community_summarization_prompt``.

    Raises:
        HTTPException: 500 when the storage container does not exist or when
            prompt generation fails.
    """
    # check for storage container existence
    azure_client_manager = AzureClientManager()
    blob_service_client = azure_client_manager.get_blob_service_client()
    if not blob_service_client.get_container_client(sanitized_container_name).exists():
        raise HTTPException(
            status_code=500,
            detail=f"Storage container '{container_name}' does not exist.",
        )

    # load pipeline configuration file (settings.yaml) for input data and other settings
    root_dir = Path(__file__).resolve().parent.parent.parent
    # read with an explicit encoding so parsing does not depend on the host locale
    with (root_dir / "scripts/settings.yaml").open("r", encoding="utf-8") as f:
        data = yaml.safe_load(f)
    data["input"]["container_name"] = sanitized_container_name
    graphrag_config = create_graphrag_config(values=data, root_dir=".")

    # generate prompts
    try:
        prompts: tuple[str, str, str] = await api.generate_indexing_prompts(
            config=graphrag_config,
            root=".",
            limit=limit,
            selection_method="random",
        )
    except Exception as e:
        logger = load_pipeline_logger()
        error_details = {
            "storage_name": container_name,
        }
        logger.error(
            message="Auto-prompt generation failed.",
            cause=e,
            stack=traceback.format_exc(),
            details=error_details,
        )
        # chain the original error so the root cause stays attached
        raise HTTPException(
            status_code=500,
            detail=f"Error generating prompts for data in '{container_name}'. Please try a lower limit.",
        ) from e

    prompt_content = {
        "entity_extraction_prompt": prompts[0],
        "entity_summarization_prompt": prompts[1],
        "community_summarization_prompt": prompts[2],
    }
    return prompt_content  # plain dict; FastAPI serializes it into the JSON response
88 |
--------------------------------------------------------------------------------
/backend/graphrag_app/logger/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from graphrag_app.logger.application_insights_workflow_callbacks import (
5 | ApplicationInsightsWorkflowCallbacks,
6 | )
7 | from graphrag_app.logger.console_workflow_callbacks import ConsoleWorkflowCallbacks
8 | from graphrag_app.logger.load_logger import load_pipeline_logger
9 | from graphrag_app.logger.pipeline_job_updater import PipelineJobUpdater
10 | from graphrag_app.logger.typing import (
11 | Logger,
12 | PipelineAppInsightsReportingConfig,
13 | PipelineReportingConfigTypes,
14 | )
15 |
# Public names re-exported by the logger package (kept in sorted order).
__all__ = [
    "ApplicationInsightsWorkflowCallbacks",
    "ConsoleWorkflowCallbacks",
    "Logger",
    "PipelineAppInsightsReportingConfig",
    "PipelineJobUpdater",
    "PipelineReportingConfigTypes",
    "load_pipeline_logger",
]
25 |
--------------------------------------------------------------------------------
/backend/graphrag_app/logger/blob_workflow_callbacks.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from datetime import datetime
5 | from typing import (
6 | Any,
7 | )
8 |
9 | from azure.storage.blob import BlobServiceClient
10 | from devtools import pformat
11 | from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks
12 |
13 |
class BlobWorkflowCallbacks(NoopWorkflowCallbacks):
    """A reporter that writes log records to an append blob in blob storage."""

    _blob_service_client: BlobServiceClient
    _container_name: str
    _index_name: str
    _num_workflow_steps: int
    # class-level default only; every instance gets its own list in __init__
    _processed_workflow_steps: list[str] = []
    _max_block_count: int = 25000  # 25k blocks per blob
    _num_blocks = 0
    _blob_name: str

    def __init__(
        self,
        blob_service_client: BlobServiceClient,
        container_name: str,
        blob_name: str = "",
        index_name: str = "",
        num_workflow_steps: int = 0,
    ):
        """Create a new instance of the BlobWorkflowCallbacks class.

        Args:
            blob_service_client (BlobServiceClient): Client used to reach the storage account.
            container_name (str): The name of the container.
            blob_name (str, optional): The name of the blob. When empty, a
                timestamp-based name ending in ".logs.txt" is generated. Defaults to "".
            index_name (str, optional): The name of the index. Defaults to "".
            num_workflow_steps (int): Total number of workflow steps; used as the
                denominator of the "(x/y)" progress marker. Defaults to 0.
        """
        self._blob_service_client = blob_service_client
        self._container_name = container_name
        self._index_name = index_name
        self._num_workflow_steps = num_workflow_steps
        self._processed_workflow_steps = []  # maintain a running list of workflow steps that get processed
        self._open_blob(blob_name)

    def _open_blob(self, blob_name: str = "") -> None:
        """Point the reporter at a blob, creating it when missing, and reset the block counter."""
        self._blob_name = (
            blob_name
            or f"{datetime.now().strftime('%Y-%m-%d-%H:%M:%S:%f')}.logs.txt"
        )
        self._blob_client = self._blob_service_client.get_blob_client(
            self._container_name, self._blob_name
        )
        if not self._blob_client.exists():
            self._blob_client.create_append_blob()
        self._num_blocks = 0  # refresh block counter

    def _write_log(self, log: dict[str, Any]):
        """Write a log message to blob storage."""
        # Roll over to a fresh blob once the block budget is used up. Only the
        # blob target changes: index name, step count and the list of processed
        # steps are kept, so later messages still carry the index prefix and
        # the "(x/y)" progress marker.
        if self._num_blocks >= self._max_block_count:
            self._open_blob()
        self._blob_client.append_block(pformat(log, indent=2) + "\n")
        self._num_blocks += 1

    def _workflow_message(self, name: str, state: str) -> str:
        """Build the "Index: ... -- Workflow (x/y): <name> <state>." message."""
        prefix = f"Index: {self._index_name} -- " if self._index_name else ""
        workflow_progress = (
            f" ({len(self._processed_workflow_steps)}/{self._num_workflow_steps})"
            if self._num_workflow_steps
            else ""
        )  # will take the form "(1/4)"
        return f"{prefix}Workflow{workflow_progress}: {name} {state}."

    def _workflow_details(self, name: str) -> dict[str, str]:
        """Build the details payload attached to workflow start/end records."""
        details = {"workflow_name": name}
        if self._index_name:
            details["index_name"] = self._index_name
        return details

    def workflow_start(self, name: str, instance: object) -> None:
        """Execute this callback when a workflow starts."""
        self._workflow_name = name
        # record the step first so the progress marker counts the current workflow
        self._processed_workflow_steps.append(name)
        self._write_log({
            "type": "on_workflow_start",
            "data": self._workflow_message(name, "started"),
            "details": self._workflow_details(name),
        })

    def workflow_end(self, name: str, instance: object) -> None:
        """Execute this callback when a workflow ends."""
        self._write_log({
            "type": "on_workflow_end",
            "data": self._workflow_message(name, "complete"),
            "details": self._workflow_details(name),
        })

    def error(
        self,
        message: str,
        cause: BaseException | None = None,
        stack: str | None = None,
        details: dict | None = None,
    ):
        """Report an error."""
        self._write_log({
            "type": "error",
            "data": message,
            "cause": str(cause),
            "stack": stack,
            "details": details,
        })

    def warning(self, message: str, details: dict | None = None):
        """Report a warning."""
        self._write_log({"type": "warning", "data": message, "details": details})

    def log(self, message: str, details: dict | None = None):
        """Report a generic log message."""
        self._write_log({"type": "log", "data": message, "details": details})
141 |
--------------------------------------------------------------------------------
/backend/graphrag_app/logger/load_logger.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import os
5 | from pathlib import Path
6 | from typing import List
7 |
8 | from graphrag.callbacks.file_workflow_callbacks import FileWorkflowCallbacks
9 | from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
10 | from graphrag.callbacks.workflow_callbacks_manager import WorkflowCallbacksManager
11 |
12 | from graphrag_app.logger.application_insights_workflow_callbacks import (
13 | ApplicationInsightsWorkflowCallbacks,
14 | )
15 | from graphrag_app.logger.blob_workflow_callbacks import BlobWorkflowCallbacks
16 | from graphrag_app.logger.console_workflow_callbacks import ConsoleWorkflowCallbacks
17 | from graphrag_app.logger.typing import Logger
18 | from graphrag_app.utils.azure_clients import AzureClientManager
19 |
20 |
def load_pipeline_logger(
    logging_dir: str = "",
    index_name: str = "",
    num_workflow_steps: int = 0,
) -> WorkflowCallbacks:
    """Build a callbacks manager with the blob, console and App Insights loggers.

    The loggers serve two scenarios:
    1. While an indexing job runs, pass ``index_name`` / ``num_workflow_steps``
       so the loggers are tied to that job.
    2. While the fastapi app runs, call it without arguments to get generic loggers.

    Args:
        logging_dir: Optional prefix joined in front of "logs"; its first path
            component is used as the container that must exist.
        index_name: Forwarded to each logger that accepts it.
        num_workflow_steps: Forwarded to each logger that accepts it.

    Returns:
        The ``WorkflowCallbacksManager`` holding every registered logger.
    """
    enabled: List[Logger] = [
        Logger[type_name] for type_name in ("BLOB", "CONSOLE", "APP_INSIGHTS")
    ]

    client_manager = AzureClientManager()
    manager = WorkflowCallbacksManager()
    for kind in enabled:
        if kind == Logger.BLOB:
            # logs go to "logs", optionally nested below logging_dir
            log_target = (
                os.path.join(logging_dir, "logs") if logging_dir else "logs"
            )
            blob_service = client_manager.get_blob_service_client()
            # make sure the root container exists before any blob is written
            root_container = Path(log_target).parts[0]
            root_client = blob_service.get_container_client(root_container)
            if not root_client.exists():
                blob_service.create_container(root_container)
            blob_logger = BlobWorkflowCallbacks(
                blob_service_client=blob_service,
                container_name=log_target,
                index_name=index_name,
                num_workflow_steps=num_workflow_steps,
            )
            manager.register(blob_logger)
        elif kind == Logger.FILE:
            manager.register(FileWorkflowCallbacks(dir=logging_dir))
        elif kind == Logger.APP_INSIGHTS:
            # registered only when a connection string is configured
            if os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING"):
                insights_logger = ApplicationInsightsWorkflowCallbacks(
                    index_name=index_name,
                    num_workflow_steps=num_workflow_steps,
                )
                manager.register(insights_logger)
        elif kind == Logger.CONSOLE:
            console_logger = ConsoleWorkflowCallbacks(
                index_name=index_name, num_workflow_steps=num_workflow_steps
            )
            manager.register(console_logger)
        else:
            print(f"WARNING: unknown logger type: {kind}. Skipping.")
    return manager
79 |
--------------------------------------------------------------------------------
/backend/graphrag_app/logger/pipeline_job_updater.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks
5 |
6 | from graphrag_app.typing.pipeline import PipelineJobState
7 | from graphrag_app.utils.pipeline import PipelineJob
8 |
9 |
class PipelineJobUpdater(NoopWorkflowCallbacks):
    """Workflow callback that mirrors workflow progress onto a PipelineJob.

    Only ``workflow_start`` and ``workflow_end`` are overridden; every other
    callback keeps the implementation inherited from NoopWorkflowCallbacks.
    """

    def __init__(self, pipeline_job: PipelineJob):
        """
        Bind this callback to the job it should keep up to date.

        Args:
            pipeline_job (PipelineJob): The pipeline object associated with the job.
        """
        self._pipeline_job = pipeline_job

    def workflow_start(self, name: str, instance: object) -> None:
        """Mark the job as running and record which workflow has started."""
        job = self._pipeline_job
        job.status = PipelineJobState.RUNNING
        job.progress = f"Workflow {name} started."

    def workflow_end(self, name: str, instance: object) -> None:
        """Record a finished workflow and refresh the job's progress fields."""
        job = self._pipeline_job
        job.completed_workflows.append(name)
        # explicit database update right after the in-place list append
        job.update_db()
        job.progress = f"Workflow {name} complete."
        percent_done = job.calculate_percent_complete()
        job.percent_complete = percent_done
37 |
--------------------------------------------------------------------------------
/backend/graphrag_app/logger/typing.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import logging
5 | from enum import Enum
6 | from typing import Literal
7 |
8 | from graphrag.index.config.reporting import (
9 | PipelineReportingConfig,
10 | PipelineReportingConfigTypes,
11 | )
12 | from pydantic import Field as pydantic_Field
13 |
14 |
class Logger(Enum):
    """Logger kinds known to the app; each value is an (int, lowercase name) tuple."""

    BLOB = (1, "blob")
    CONSOLE = (2, "console")
    FILE = (3, "file")
    APP_INSIGHTS = (4, "app_insights")
20 |
21 |
class PipelineAppInsightsReportingConfig(
    PipelineReportingConfig[Literal["app_insights"]]
):
    """Represents the ApplicationInsights reporting configuration for the pipeline."""

    type: Literal["app_insights"] = Logger.APP_INSIGHTS.name.lower()
    """The type of reporting."""

    # The default is None, so the annotation has to admit None as well.
    connection_string: str | None = pydantic_Field(
        description="The connection string for the App Insights instance.",
        default=None,
    )
    """The connection string for the App Insights instance."""

    # Same as above: optional, defaults to None.
    logger_name: str | None = pydantic_Field(
        description="The name for logger instance", default=None
    )
    """The name for logger instance"""

    logger_level: int = pydantic_Field(
        description="The logging level of the logger. Defaults to logging.INFO.",
        default=logging.INFO,
    )
    """The logging level of the logger. Defaults to logging.INFO."""
45 |
46 |
# Extend the reporting-config union with PipelineAppInsightsReportingConfig.
# NOTE: this rebinds the name in this module only; the object imported from
# graphrag.index.config.reporting is not modified.
PipelineReportingConfigTypes = (
    PipelineReportingConfigTypes | PipelineAppInsightsReportingConfig
)
51 |
--------------------------------------------------------------------------------
/backend/graphrag_app/typing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/graphrag_app/typing/__init__.py
--------------------------------------------------------------------------------
/backend/graphrag_app/typing/models.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from typing import (
5 | Any,
6 | List,
7 | )
8 |
9 | from pydantic import BaseModel
10 |
11 |
class BaseResponse(BaseModel):
    """Response model carrying a single status string."""

    status: str
14 |
15 |
class ClaimResponse(BaseModel):
    """Response model for one claim: its types, description, subject/object ids and sources."""

    covariate_type: str
    type: str
    description: str
    subject_id: str
    object_id: str
    source_text: str
    text_unit_id: str
    document_ids: List[str]
25 |
26 |
class EntityResponse(BaseModel):
    """Response model for an entity: name, description and a list of text units."""

    name: str
    description: str
    text_units: list[str]
31 |
32 |
class GraphRequest(BaseModel):
    """Request model with an index name, a query string and an optional community level."""

    index_name: str
    query: str
    # optional; None when the caller does not specify a level
    community_level: int | None = None
37 |
38 |
class GraphResponse(BaseModel):
    """Response model pairing a result with its context data (both typed as Any)."""

    result: Any
    context_data: Any
42 |
43 |
class GraphDataResponse(BaseModel):
    """Response model with integer `nodes` and `edges` fields (presumably counts — verify against the caller)."""

    nodes: int
    edges: int
47 |
48 |
class IndexNameList(BaseModel):
    """Model wrapping a list of index names."""

    index_name: List[str]
51 |
52 |
class IndexStatusResponse(BaseModel):
    """Response model reporting an index's status, percent complete and progress text."""

    status_code: int
    index_name: str
    storage_name: str
    status: str
    percent_complete: float
    progress: str
60 |
61 |
class ReportResponse(BaseModel):
    """Response model carrying a report's text."""

    text: str
64 |
65 |
class RelationshipResponse(BaseModel):
    """Response model for a relationship between a source and a target, each given by name and integer id."""

    source: str
    source_id: int
    target: str
    target_id: int
    description: str
    text_units: list[str]
73 |
74 |
class StorageNameList(BaseModel):
    """Model wrapping a list of storage names."""

    storage_name: List[str]
77 |
78 |
class TextUnitResponse(BaseModel):
    """Response model carrying a text unit's text and its source document."""

    text: str
    source_document: str
82 |
--------------------------------------------------------------------------------
/backend/graphrag_app/typing/pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from enum import Enum
5 |
6 |
class PipelineJobState(str, Enum):
    """States of a pipeline job; every member is also a plain string."""

    SCHEDULED = "scheduled"
    RUNNING = "running"
    FAILED = "failed"
    COMPLETE = "complete"

    def __repr__(self):
        """Return the member's value wrapped in double quotes."""
        quoted_value = '"{}"'.format(self.value)
        return quoted_value
16 |
--------------------------------------------------------------------------------
/backend/graphrag_app/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
--------------------------------------------------------------------------------
/backend/manifests/cronjob.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | # NOTE: the location of this file is important as it gets referenced by the graphrag_app/main.py script
5 | # and depends on the relative path to this file when uvicorn is run
6 |
7 | apiVersion: batch/v1
8 | kind: CronJob
9 | metadata:
10 | name: graphrag-index-manager
11 | spec:
12 | schedule: "*/5 * * * *"
13 | jobTemplate:
14 | spec:
15 | ttlSecondsAfterFinished: 180
16 | template:
17 | metadata:
18 | labels:
19 | azure.workload.identity/use: "true"
20 | spec:
21 | serviceAccountName: PLACEHOLDER
22 | restartPolicy: OnFailure
23 | containers:
24 | - name: index-job-manager
25 | image: PLACEHOLDER
26 | # override default WORKDIR with absolute path to the scripts directory
27 | workingDir: "/backend/scripts"
28 | imagePullPolicy: Always
29 | resources:
30 | requests:
31 | cpu: "0.5"
32 | memory: "0.5Gi"
33 | limits:
34 | cpu: "1"
35 | memory: "1Gi"
36 | envFrom:
37 | - configMapRef:
38 | name: graphrag
39 | command:
40 | - python
41 | - "job-scheduler.py"
42 |
--------------------------------------------------------------------------------
/backend/manifests/job.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | # NOTE: the location of this file is important as it gets referenced by the job-scheduler.py script
5 | # and depends on the relative path to this file when that script is run
6 |
7 | apiVersion: batch/v1
8 | kind: Job
9 | metadata:
10 | name: PLACEHOLDER
11 | spec:
12 | ttlSecondsAfterFinished: 300
13 | backoffLimit: 3
14 | template:
15 | metadata:
16 | labels:
17 | azure.workload.identity/use: "true"
18 | spec:
19 | serviceAccountName: PLACEHOLDER
20 | restartPolicy: OnFailure
21 | nodeSelector:
22 | workload: graphrag-indexing
23 | containers:
24 | - name: graphrag
25 | image: PLACEHOLDER
26 | # override default WORKDIR with absolute path to the scripts directory
27 | workingDir: "/backend/scripts"
28 | imagePullPolicy: Always
29 | resources:
30 | requests:
31 | cpu: "5"
32 | memory: "36Gi"
33 | limits:
34 | cpu: "8"
35 | memory: "64Gi"
36 | envFrom:
37 | - configMapRef:
38 | name: graphrag
39 | command: [PLACEHOLDER]
40 |
--------------------------------------------------------------------------------
/backend/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "graphrag-app"
3 | description = "A web API wrapper around the official GraphRAG library."
4 | # we make the version defined here match the graphrag library version
5 | version = "1.2.0"
6 | license = "MIT"
7 | authors = [
8 | {name = "Josh Bradley", email = "joshbradley@microsoft.com"},
9 | {name = "Newman Cheng", email = "newmancheng@microsoft.com"},
10 | {name = "Christine DiFonzo", email = "cdifonzo@microsoft.com"},
11 | {name = "Gabriel Nieves", email = "gnievesponce@microsoft.com"},
12 | {name = "Douglas Orbaker", email = "dorbaker@microsoft.com"},
13 | {name = "Shane Solomon", email = "shane.solomon@microsoft.com"},
14 | {name = "Kenny Zhang", email = "zhangken@microsoft.com"},
15 | ]
16 | requires-python = '>=3.10, <3.13'
17 | package-mode=false
18 |
19 | [tool.poetry.group.dev.dependencies]
20 | detect-secrets = ">=1.5.0"
21 | devtools = ">=0.12.2"
22 | flake8 = ">=6.1.0"
23 | ipython = "*"
24 | jupyter = "*"
25 | pre-commit = ">=3.6.0"
26 | ruff = ">=0.1.13"
27 |
28 | [tool.poetry.group.test]
29 | optional = true
30 |
31 | [tool.poetry.group.test.dependencies]
32 | pytest = ">=8.2.1"
33 | pytest-asyncio = "^0.25.0"
34 | pytest-cov = "^6.0.0"
35 | pytest-env = "^1.1.5"
36 | pytest-xdist = "^3.6.1"
37 | wikipedia = ">=1.4.0"
38 |
39 | [tool.poetry.group.backend.dependencies]
40 | adlfs = ">=2024.7.0"
41 | attrs = ">=23.2.0"
42 | azure-core = ">=1.30.1"
43 | azure-cosmos = ">=4.5.1"
44 | azure-identity = ">=1.15.0"
45 | azure-monitor-opentelemetry = "^1.6.4"
46 | azure-search-documents = ">=11.4.0"
47 | azure-storage-blob = ">=12.19.0"
48 | environs = ">=9.5.0"
49 | fastapi = ">=0.110.0"
50 | fastparquet = ">=2023.10.1"
51 | fsspec = ">=2024.2.0"
52 | graphrag = "==1.2.0"
53 | httpx = ">=0.25.2"
54 | kubernetes = ">=29.0.0"
55 | markitdown = {extras = ["all"], version = "^0.1.1"}
56 | networkx = ">=3.2.1"
57 | nltk = "*"
58 | pandas = ">=2.2.1"
59 | pyaml-env = ">=1.2.1"
60 | pyarrow = ">=15.0.0"
61 | pydantic = ">=1.10.14"
62 | python-multipart = ">=0.0.6"
63 | requests = "*"
64 | rich = ">=13.7.1"
65 | tiktoken = ">=0.6.0"
66 | urllib3 = ">=2.2.2"
67 | uvicorn = ">=0.23.2"
68 |
69 | [tool.ruff]
70 | indent-width = 4
71 | line-length = 88
72 | target-version = "py310"
73 |
74 | [tool.ruff.format]
75 | preview = true
76 | quote-style = "double"
77 |
78 | [tool.ruff.lint]
79 | ignore = ["E402", "E501", "F821"]
80 | preview = true
81 | select = ["E", "F", "I"]
82 |
83 | [build-system]
84 | build-backend = "poetry.core.masonry.api"
85 | requires = ["poetry-core"]
86 |
--------------------------------------------------------------------------------
/backend/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | required_plugins = anyio pytest-asyncio pytest-cov pytest-env pytest-xdist
3 | asyncio_default_fixture_loop_scope="function"
4 | asyncio_mode=auto
5 | ; NOTE: we use well known credentials for the Cosmos DB emulator and Azure Storage emulator.
6 | ; If executing these pytests locally, users may need to modify the cosmosdb connection string to use http protocol instead of https.
7 | ; This depends on how the cosmosdb emulator has been configured (by the user) to run.
8 | env =
9 | COSMOS_CONNECTION_STRING=AccountEndpoint=https://127.0.0.1:8081/;AccountKey=C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw==
10 | STORAGE_CONNECTION_STRING=DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;
11 | TESTING=1
12 |
--------------------------------------------------------------------------------
/backend/scripts/settings.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | # this yaml file serves as a configuration template for the graphrag indexing jobs
5 | # some values are hardcoded while others denoted by PLACEHOLDER will be dynamically set
6 |
7 | ###################### LLM settings ######################
8 | encoding_model: cl100k_base # this needs to be matched to your model!
9 |
10 | llm:
11 | type: azure_openai_chat
12 | api_base: $GRAPHRAG_API_BASE
13 | api_version: $GRAPHRAG_API_VERSION
14 | model: $GRAPHRAG_LLM_MODEL
15 | deployment_name: $GRAPHRAG_LLM_DEPLOYMENT_NAME
16 | audience: $COGNITIVE_SERVICES_AUDIENCE
17 | model_supports_json: True
18 | tokens_per_minute: 80_000
19 | requests_per_minute: 480
20 | concurrent_requests: 25
21 | max_retries: 250
22 | max_retry_wait: 60.0
23 | sleep_on_rate_limit_recommendation: True
24 |
25 | parallelization:
26 | num_threads: 10
27 | stagger: 0.25
28 |
29 | async_mode: threaded # or asyncio
30 |
31 | embeddings:
32 | vector_store:
33 | type: azure_ai_search
34 | collection_name: PLACEHOLDER
35 | title_column: name
36 | overwrite: True
37 | url: $AI_SEARCH_URL
38 | audience: $AI_SEARCH_AUDIENCE
39 | llm:
40 | type: azure_openai_embedding
41 | api_base: $GRAPHRAG_API_BASE
42 | api_version: $GRAPHRAG_API_VERSION
43 | batch_size: 10
44 | model: $GRAPHRAG_EMBEDDING_MODEL
45 | deployment_name: $GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME
46 | audience: $COGNITIVE_SERVICES_AUDIENCE
47 | tokens_per_minute: 350_000
48 | requests_per_minute: 2_100
49 |
50 | ###################### Input settings ######################
51 | input:
52 | type: blob
53 | file_type: text
54 | base_dir: .
55 | file_encoding: utf-8
56 | file_pattern: .*\.txt$
57 | storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
58 | container_name: PLACEHOLDER
59 |
60 | chunks:
61 | size: 1_200
62 | overlap: 100
63 | group_by_columns: [id]
64 |
65 | ###################### Storage settings ######################
66 | cache:
67 | type: blob
68 | storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
69 | container_name: PLACEHOLDER
70 | base_dir: cache
71 |
72 | reporting:
73 | type: blob
74 | storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
75 | container_name: PLACEHOLDER
76 | base_dir: logs
77 |
78 | storage:
79 | type: blob
80 | storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
81 | container_name: PLACEHOLDER
82 | base_dir: output
83 |
84 | ###################### Workflow settings ######################
85 | skip_workflows: []
86 |
87 | entity_extraction:
88 | prompt: PLACEHOLDER
89 | entity_types: [organization, person, geo, event]
90 | max_gleanings: 1
91 |
92 | summarize_descriptions:
93 | prompt: PLACEHOLDER
94 | max_length: 500
95 |
96 | claim_extraction:
97 | enabled: false
98 | prompt: "prompts/claim_extraction.txt"
99 | description: "Any claims or facts that could be relevant to information discovery."
100 | max_gleanings: 1
101 |
102 | community_reports:
103 | prompt: PLACEHOLDER
104 | max_length: 2_000
105 | max_input_length: 8_000
106 |
107 | cluster_graph:
108 | max_cluster_size: 10
109 |
110 | embed_graph:
111 | enabled: false
112 |
113 | umap:
114 | enabled: false
115 |
116 | snapshots:
117 | graphml: True
118 | embeddings: false
119 | transient: false
120 |
121 | ###################### Query settings ######################
122 | ## The prompt locations are required here, but each search method has a number of optional knobs that can be tuned.
123 | ## See the config docs: https://microsoft.github.io/graphrag/config/yaml/#query
124 | local_search:
125 | prompt: PLACEHOLDER
126 |
127 | global_search:
128 | map_prompt: PLACEHOLDER
129 | reduce_prompt: PLACEHOLDER
130 | knowledge_prompt: PLACEHOLDER
131 |
132 | drift_search:
133 | prompt: PLACEHOLDER
134 | reduce_prompt: PLACEHOLDER
135 |
136 | basic_search:
137 | prompt: PLACEHOLDER
138 |
--------------------------------------------------------------------------------
/backend/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/__init__.py
--------------------------------------------------------------------------------
/backend/tests/conftest.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import os
5 | from pathlib import Path
6 | from typing import Generator
7 |
8 | import pytest
9 | from azure.cosmos import CosmosClient, PartitionKey
10 | from azure.storage.blob import BlobServiceClient
11 | from fastapi.testclient import TestClient
12 |
13 | from graphrag_app.main import app
14 | from graphrag_app.utils.common import sanitize_name
15 |
16 |
@pytest.fixture(scope="session")
def blob_with_data_container_name(blob_service_client: BlobServiceClient):
    """Create a storage container holding one small text blob.

    Yields the human-readable container name; the sanitized container is
    deleted during fixture teardown.
    """
    container_name = "container-with-data"
    sanitized_name = sanitize_name(container_name)
    # A container left behind by an aborted run would make create_container fail,
    # so only create it when missing (same guard as the other container fixtures).
    if not blob_service_client.get_container_client(sanitized_name).exists():
        blob_service_client.create_container(sanitized_name)
    blob_client = blob_service_client.get_blob_client(sanitized_name, "data.txt")
    blob_client.upload_blob(data="Hello, World!", overwrite=True)
    yield container_name
    # cleanup
    blob_service_client.delete_container(sanitized_name)
28 |
29 |
@pytest.fixture(scope="session")
def blob_service_client() -> Generator[BlobServiceClient, None, None]:
    """Yield a BlobServiceClient built from the STORAGE_CONNECTION_STRING env variable."""
    connection_string = os.environ["STORAGE_CONNECTION_STRING"]
    service_client = BlobServiceClient.from_connection_string(connection_string)
    yield service_client
    # nothing to tear down
37 |
38 |
@pytest.fixture(scope="session")
def cosmos_client() -> Generator[CosmosClient, None, None]:
    """Yield a CosmosClient once the "graphrag" database and its containers exist.

    The whole "graphrag" database is deleted again on teardown.
    """
    # setup
    connection_string = os.environ["COSMOS_CONNECTION_STRING"]
    client = CosmosClient.from_connection_string(connection_string)
    db_client = client.create_database_if_not_exists(id="graphrag")
    for container_id in ("container-store", "jobs"):
        db_client.create_container_if_not_exists(
            id=container_id, partition_key=PartitionKey(path="/id")
        )
    yield client  # run the test
    # teardown
    client.delete_database("graphrag")
54 |
55 |
@pytest.fixture(scope="session")
def container_with_graphml_file(
    blob_service_client: BlobServiceClient, cosmos_client: CosmosClient
):
    """Provide a container that mimics a valid index and holds a fake graphml file."""
    container_name = "container-with-graphml"
    sanitized_name = sanitize_name(container_name)
    container_client = blob_service_client.get_container_client(sanitized_name)
    if not container_client.exists():
        blob_service_client.create_container(sanitized_name)
    graphml_blob = blob_service_client.get_blob_client(
        sanitized_name, "output/graph.graphml"
    )
    graphml_blob.upload_blob(data="a fake graphml file", overwrite=True)
    # register the container in the container-store table in cosmos db
    database = cosmos_client.get_database_client("graphrag")
    container_store_client = database.get_container_client("container-store")
    index_entry = {
        "id": sanitized_name,
        "human_readable_name": container_name,
        "type": "index",
    }
    container_store_client.upsert_item(index_entry)
    yield container_name
    # cleanup
    blob_service_client.delete_container(sanitized_name)
    # container_store_client.delete_item(sanitized_name, sanitized_name)
82 |
83 |
@pytest.fixture(scope="session")
def container_with_index_files(
    blob_service_client: BlobServiceClient, cosmos_client: CosmosClient
):
    """Provide a container populated with the synthetic dataset's index output files."""
    container_name = "container-with-index-files"
    sanitized_name = sanitize_name(container_name)
    container_client = blob_service_client.get_container_client(sanitized_name)
    if not container_client.exists():
        blob_service_client.create_container(sanitized_name)

    # upload every file of the synthetic index under the "output/" prefix
    data_root = Path(__file__).parent / "data/synthetic-dataset/output"
    for local_path in data_root.iterdir():
        target_blob = blob_service_client.get_blob_client(
            sanitized_name, f"output/{local_path.name}"
        )
        with local_path.open("rb") as data:
            target_blob.upload_blob(data, overwrite=True)

    # register the container in the container-store table in cosmos db
    database = cosmos_client.get_database_client("graphrag")
    container_store_client = database.get_container_client("container-store")
    index_entry = {
        "id": sanitized_name,
        "human_readable_name": container_name,
        "type": "index",
    }
    container_store_client.upsert_item(index_entry)
    yield container_name
    # cleanup
    blob_service_client.delete_container(sanitized_name)
    container_store_client.delete_item(sanitized_name, sanitized_name)
118 |
119 |
@pytest.fixture(scope="session")
def client(request) -> Generator[TestClient, None, None]:
    """Yield a TestClient wrapping the app, kept open inside its context manager."""
    with TestClient(app) as test_client:
        yield test_client
124 |
--------------------------------------------------------------------------------
/backend/tests/data/synthetic-dataset/ABOUT.md:
--------------------------------------------------------------------------------
1 | # About
2 |
3 | This index was built from *Operation Dulce*, an AI-generated science fiction novella, and is included here for integration testing.
4 |
5 | To regenerate the index and update pytests, the original book can be retrieved from [here](https://github.com/microsoft/graphrag/tree/main/docs/data/operation_dulce).
6 |
--------------------------------------------------------------------------------
/backend/tests/data/synthetic-dataset/output/create_final_communities.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_communities.parquet
--------------------------------------------------------------------------------
/backend/tests/data/synthetic-dataset/output/create_final_community_reports.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_community_reports.parquet
--------------------------------------------------------------------------------
/backend/tests/data/synthetic-dataset/output/create_final_covariates.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_covariates.parquet
--------------------------------------------------------------------------------
/backend/tests/data/synthetic-dataset/output/create_final_documents.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_documents.parquet
--------------------------------------------------------------------------------
/backend/tests/data/synthetic-dataset/output/create_final_entities.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_entities.parquet
--------------------------------------------------------------------------------
/backend/tests/data/synthetic-dataset/output/create_final_nodes.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_nodes.parquet
--------------------------------------------------------------------------------
/backend/tests/data/synthetic-dataset/output/create_final_relationships.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_relationships.parquet
--------------------------------------------------------------------------------
/backend/tests/data/synthetic-dataset/output/create_final_text_units.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/backend/tests/data/synthetic-dataset/output/create_final_text_units.parquet
--------------------------------------------------------------------------------
/backend/tests/data/synthetic-dataset/output/stats.json:
--------------------------------------------------------------------------------
1 | {
2 | "total_runtime": 358.0534498691559,
3 | "num_documents": 1,
4 | "input_load_time": 0,
5 | "workflows": {
6 | "create_base_text_units": {
7 | "overall": 2.060708999633789
8 | },
9 | "create_final_documents": {
10 | "overall": 0.043251991271972656
11 | },
12 | "extract_graph": {
13 | "overall": 162.8238878250122
14 | },
15 | "compute_communities": {
16 | "overall": 14.345926284790039
17 | },
18 | "create_final_entities": {
19 | "overall": 0.04870915412902832
20 | },
21 | "create_final_relationships": {
22 | "overall": 0.05901288986206055
23 | },
24 | "create_final_nodes": {
25 | "overall": 0.07453203201293945
26 | },
27 | "create_final_communities": {
28 | "overall": 0.127485990524292
29 | },
30 | "create_final_covariates": {
31 | "overall": 142.99078702926636
32 | },
33 | "create_final_text_units": {
34 | "overall": 0.12473607063293457
35 | },
36 | "create_final_community_reports": {
37 | "overall": 31.13183307647705
38 | },
39 | "generate_text_embeddings": {
40 | "overall": 3.978173017501831
41 | }
42 | }
43 | }
--------------------------------------------------------------------------------
/backend/tests/integration/test_api_data.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | """
4 | Integration tests for the /data API endpoints.
5 | """
6 |
7 | import os
8 |
9 | from azure.cosmos import CosmosClient
10 |
11 |
def test_upload_files(cosmos_client: CosmosClient, client):
    """Test uploading files to a data blob container."""
    sample_file = "test.txt"
    # create a single file
    with open(sample_file, "wb") as f:
        f.write(b"Hello, world!")
    try:
        # send the file in the request
        with open(sample_file, "rb") as f:
            response = client.post(
                "/data",
                files={"files": ("test.txt", f)},
                params={"container_name": "testContainer"},
            )
        # check the response
        assert response.status_code == 200
    finally:
        # always remove the sample file, even when the request or assertion fails
        if os.path.exists(sample_file):
            os.remove(sample_file)
29 |
30 |
def test_delete_files(cosmos_client: CosmosClient, client):
    """Deleting the "testContainer" data container should return HTTP 200."""
    delete_response = client.delete("/data/testContainer")
    assert delete_response.status_code == 200
36 |
37 |
def test_get_list_of_data_containers(cosmos_client: CosmosClient, client):
    """Listing the data blob containers should return HTTP 200."""
    listing_response = client.get("/data")
    assert listing_response.status_code == 200
42 |
--------------------------------------------------------------------------------
/backend/tests/integration/test_api_default.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | """
4 | Integration tests for the /health API endpoint.
5 | """
6 |
7 |
def test_health_check(client):
    """The /health endpoint should return HTTP 200."""
    health_response = client.get("/health")
    assert health_response.status_code == 200
12 |
--------------------------------------------------------------------------------
/backend/tests/integration/test_api_graph.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | """
4 | Integration tests for the /graph API endpoints.
5 | """
6 |
7 |
def test_get_graphml_file(client, container_with_graphml_file: str):
    """Fetch the graphml file of the fixture container and compare its bytes."""
    response = client.get(f"/graph/graphml/{container_with_graphml_file}")
    assert response.status_code == 200
    response.raise_for_status()
    # reassemble the body from 1 KiB chunks
    full_data = b"".join(response.iter_bytes(chunk_size=1024))
    assert full_data == b"a fake graphml file"
18 |
--------------------------------------------------------------------------------
/backend/tests/integration/test_api_index.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | """
4 | Integration tests for the /index API endpoints.
5 | """
6 |
7 | from azure.cosmos import CosmosClient
8 |
9 |
def test_get_list_of_index_containers_empty(client, cosmos_client: CosmosClient):
    """Listing the blob containers that hold an index should return HTTP 200."""
    listing_response = client.get("/index")
    assert listing_response.status_code == 200
14 |
15 |
def test_schedule_index_without_data(client, cosmos_client: CosmosClient):
    """Scheduling an index job against a missing data container should return HTTP 412."""
    query_params = {
        "index_container_name": "myindex",
        "storage_container_name": "nonexistent-data-container",
    }
    response = client.post("/index", params=query_params)
    assert response.status_code == 412
26 |
27 |
28 | # def test_schedule_index_with_data(client, cosmos_client, blob_with_data_container_name):
29 | # """Test scheduling an index job with real data."""
30 | # response = client.post("/index", files=None, params={"storage_container_name": blob_with_data_container_name, "index_container_name": "myindex"})
31 | # print(response.json())
32 | # assert response.status_code == 200
33 |
--------------------------------------------------------------------------------
/backend/tests/integration/test_api_index_configuration.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | """
4 | Integration tests for the /index/config API endpoints.
5 | """
6 |
7 | from unittest.mock import AsyncMock, patch
8 |
9 | import pytest_asyncio
10 |
11 |
@pytest_asyncio.fixture
def mock_generate_indexing_prompts():
    """Patch graphrag.api.generate_indexing_prompts with an AsyncMock returning three synthetic prompts."""
    synthetic_prompts = (
        "synthetic-prompt1",
        "synthetic-prompt2",
        "synthetic-prompt3",
    )
    patcher = patch("graphrag.api.generate_indexing_prompts", new_callable=AsyncMock)
    with patcher as mock:
        mock.return_value = synthetic_prompts
        yield mock
23 |
24 |
def test_generate_prompts(
    blob_with_data_container_name, mock_generate_indexing_prompts, client
):
    """Check that ``GET /index/config/prompts`` responds with HTTP 200.

    The container name supplied by the ``blob_with_data_container_name``
    fixture is sent as the ``container_name`` query parameter, while
    ``mock_generate_indexing_prompts`` keeps prompt generation mocked.
    """
    query = {"container_name": blob_with_data_container_name}
    result = client.get("/index/config/prompts", params=query)
    assert result.status_code == 200
34 |
--------------------------------------------------------------------------------
/backend/tests/integration/test_api_prompt_tuning.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | """
4 | Integration tests for the /index/config API endpoints.
5 | """
6 |
7 | from unittest.mock import AsyncMock, patch
8 |
9 | import pytest_asyncio
10 |
11 |
@pytest_asyncio.fixture
def mock_generate_indexing_prompts():
    """Yield an ``AsyncMock`` standing in for ``graphrag.api.generate_indexing_prompts``.

    While the fixture is active the patched function returns three
    synthetic prompt strings as a tuple.
    """
    # NOTE(review): the fixture function is synchronous although it uses
    # pytest_asyncio.fixture - confirm whether pytest.fixture was meant.
    target = "graphrag.api.generate_indexing_prompts"
    fake_prompts = (
        "synthetic-prompt1",
        "synthetic-prompt2",
        "synthetic-prompt3",
    )
    with patch(target, new_callable=AsyncMock) as generate_mock:
        generate_mock.return_value = fake_prompts
        yield generate_mock
23 |
24 |
def test_generate_prompts(
    blob_with_data_container_name, mock_generate_indexing_prompts, client
):
    """Request generated prompts for a data container and expect HTTP 200.

    Calls ``GET /index/config/prompts`` with ``container_name`` set to the
    value of the ``blob_with_data_container_name`` fixture. The
    ``mock_generate_indexing_prompts`` fixture is active during the call.
    """
    endpoint = "/index/config/prompts"
    prompt_response = client.get(
        endpoint, params={"container_name": blob_with_data_container_name}
    )
    assert prompt_response.status_code == 200
34 |
--------------------------------------------------------------------------------
/backend/tests/integration/test_api_source.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | """
4 | Integration tests for the /source API endpoints.
5 | """
6 |
7 | from fastapi.testclient import TestClient
8 |
9 |
def test_get_report(container_with_index_files: str, client: TestClient):
    """Check that report ``1`` can be retrieved from an indexed container.

    Issues ``GET /source/report/<container>/1`` and expects HTTP 200.
    """
    # retrieve a report that exists
    report_url = f"/source/report/{container_with_index_files}/1"
    result = client.get(report_url)
    assert result.status_code == 200
    # The negative case (a report that does not exist) is currently disabled:
    # response = client.get(f"/source/report/{container_with_index_files}/-1")
    # assert response.status_code == 500
18 |
19 |
def test_get_chunk_info(container_with_index_files: str, client: TestClient):
    """Check that a text chunk can be retrieved by its id.

    Issues ``GET /source/text/<container>/<chunk id>`` for a fixed chunk id
    and expects HTTP 200.
    """
    chunk_id = "c4197a012ea9e7d2618450cbb197852dec47c40883d4a69e0ea473a8111319c80d608ae5fa66acc2d3f95cd845277b3acd8186d7fa326803dde09681da29790c"
    chunk_url = f"/source/text/{container_with_index_files}/{chunk_id}"
    result = client.get(chunk_url)
    assert result.status_code == 200
26 |
27 |
def test_get_entity_info(container_with_index_files: str, client: TestClient):
    """Check that entity ``1`` can be retrieved from an indexed container.

    Issues ``GET /source/entity/<container>/1`` and expects HTTP 200.
    """
    entity_url = f"/source/entity/{container_with_index_files}/1"
    result = client.get(entity_url)
    assert result.status_code == 200
32 |
33 |
def test_get_relationship_info(container_with_index_files: str, client: TestClient):
    """Check that relationship ``1`` can be retrieved from an indexed container.

    Issues ``GET /source/relationship/<container>/1`` and expects HTTP 200.
    """
    relationship_url = f"/source/relationship/{container_with_index_files}/1"
    result = client.get(relationship_url)
    assert result.status_code == 200
38 |
--------------------------------------------------------------------------------
/backend/tests/integration/test_utils_pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 | """
4 | Integration tests for the PipelineJob class.
5 | """
6 |
7 | from typing import Generator
8 |
9 | import pytest
10 |
11 | from graphrag_app.typing.pipeline import PipelineJobState
12 | from graphrag_app.utils.pipeline import PipelineJob
13 |
14 |
@pytest.fixture()
def cosmos_index_job_entry(cosmos_client) -> Generator[str, None, None]:
    """Insert a synthetic indexing-job document into CosmosDB and yield its id.

    The document is upserted into the ``jobs`` container of the ``graphrag``
    database. Once the test has finished, the document is deleted again,
    using its id as the partition key.
    """
    jobs_container = cosmos_client.get_database_client(
        "graphrag"
    ).get_container_client("jobs")

    job_document = {
        "id": "testID",
        "epoch_request_time": 0,
        "human_readable_index_name": "test_human_readable_index_name",
        "sanitized_index_name": "test_sanitized_index_name",
        "human_readable_storage_name": "test_human_readable_storage_name",
        "sanitized_storage_name": "test_sanitized_storage_name",
        "all_workflows": ["workflow1", "workflow2"],
        "completed_workflows": ["workflow1"],
        "failed_workflows": ["workflow2"],
        "status": PipelineJobState.COMPLETE,
        "percent_complete": 50.0,
        "progress": "some progress",
    }
    jobs_container.upsert_item(job_document)

    yield job_document["id"]

    # teardown: remove the synthetic document again
    jobs_container.delete_item(
        job_document["id"], partition_key=job_document["id"]
    )
42 |
43 |
def test_pipeline_job_interface(cosmos_index_job_entry):
    """Exercise the ``graphrag_app.utils.pipeline.PipelineJob`` class interface.

    The test runs three stages in order:

    1. create a new entry and confirm that ``item_exist`` reports it,
    2. load the entry whose id is provided by ``cosmos_index_job_entry`` and
       compare the loaded attributes with the expected values,
    3. assign new values to the job's attributes and read them back.
    """
    job = PipelineJob()

    # Stage 1: create a new entry.
    # NOTE(review): nothing in this test removes the "synthetic_id" entry
    # afterwards - confirm whether it is cleaned up elsewhere.
    job.create_item(
        id="synthetic_id",
        human_readable_index_name="test_human_readable_index_name",
        human_readable_storage_name="test_human_readable_storage_name",
        entity_extraction_prompt="fake entity extraction prompt",
        community_report_prompt="fake community report prompt",
        summarize_descriptions_prompt="fake summarize descriptions prompt",
    )
    assert job.item_exist("synthetic_id")

    # Stage 2: load an existing entry and check each attribute in turn.
    job = job.load_item(cosmos_index_job_entry)
    expected_after_load = (
        ("id", "testID"),
        ("human_readable_index_name", "test_human_readable_index_name"),
        ("sanitized_index_name", "test_sanitized_index_name"),
        ("human_readable_storage_name", "test_human_readable_storage_name"),
        ("sanitized_storage_name", "test_sanitized_storage_name"),
        ("all_workflows", ["workflow1", "workflow2"]),
        ("completed_workflows", ["workflow1"]),
        ("failed_workflows", ["workflow2"]),
        ("status", PipelineJobState.COMPLETE),
        ("percent_complete", 50.0),
        ("progress", "some progress"),
    )
    for attribute, expected in expected_after_load:
        assert getattr(job, attribute) == expected
    assert job.calculate_percent_complete() == 50.0

    # Stage 3a: setters and getters for scalar attributes (set, then read back).
    scalar_updates = (
        ("id", "newID"),
        ("epoch_request_time", 1),
        ("human_readable_index_name", "new_human_readable_index_name"),
        ("sanitized_index_name", "new_sanitized_index_name"),
        ("human_readable_storage_name", "new_human_readable_storage_name"),
        ("sanitized_storage_name", "new_sanitized_storage_name"),
        ("entity_extraction_prompt", "new_entity_extraction_prompt"),
        ("community_report_prompt", "new_community_report_prompt"),
        ("summarize_descriptions_prompt", "new_summarize_descriptions_prompt"),
    )
    for attribute, new_value in scalar_updates:
        setattr(job, attribute, new_value)
        assert getattr(job, attribute) == new_value

    # Stage 3b: workflow lists - only the resulting length is checked.
    workflow_updates = (
        ("all_workflows", ["new_workflow1", "new_workflow2", "new_workflow3"], 3),
        ("completed_workflows", ["new_workflow1", "new_workflow2"], 2),
        ("failed_workflows", ["new_workflow3"], 1),
    )
    for attribute, new_workflows, expected_count in workflow_updates:
        setattr(job, attribute, new_workflows)
        assert len(getattr(job, attribute)) == expected_count
110 |
--------------------------------------------------------------------------------
/backend/tests/unit/test_azure_clients.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from azure.cosmos import CosmosClient
5 | from azure.storage.blob import BlobServiceClient
6 | from azure.storage.blob.aio import BlobServiceClient as BlobServiceClientAsync
7 |
8 | from graphrag_app.utils.azure_clients import (
9 | AzureClientManager,
10 | _BlobServiceClientSingleton,
11 | _BlobServiceClientSingletonAsync,
12 | _CosmosClientSingleton,
13 | )
14 |
15 |
def test_get_cosmos_singleton():
    """Verify that ``_CosmosClientSingleton.get_instance()`` behaves as a singleton.

    Two consecutive calls must each return a ``CosmosClient`` and both
    results must be the very same object.
    """
    first = _CosmosClientSingleton.get_instance()
    second = _CosmosClientSingleton.get_instance()
    for instance in (first, second):
        assert isinstance(instance, CosmosClient)
    # both names must reference the same object
    assert first is second
23 |
24 |
def test_get_storage_singleton():
    """Verify that ``_BlobServiceClientSingleton.get_instance()`` behaves as a singleton.

    Two consecutive calls must each return a ``BlobServiceClient`` and both
    results must be the very same object.
    """
    first = _BlobServiceClientSingleton.get_instance()
    second = _BlobServiceClientSingleton.get_instance()
    for instance in (first, second):
        assert isinstance(instance, BlobServiceClient)
    # both names must reference the same object
    assert first is second
32 |
33 |
def test_get_storage_async_singleton():
    """Verify that ``_BlobServiceClientSingletonAsync.get_instance()`` behaves as a singleton.

    Two consecutive calls must each return the async ``BlobServiceClient``
    (imported here as ``BlobServiceClientAsync``) and both results must be
    the very same object.
    """
    first = _BlobServiceClientSingletonAsync.get_instance()
    second = _BlobServiceClientSingletonAsync.get_instance()
    for instance in (first, second):
        assert isinstance(instance, BlobServiceClientAsync)
    # both names must reference the same object
    assert first is second
41 |
42 |
def test_azure_client_manager():
    """Verify that ``AzureClientManager`` hands out clients of the expected types.

    Checks the manager instance itself and then the return types of
    ``get_cosmos_client``, ``get_blob_service_client`` and
    ``get_blob_service_client_async``.
    """
    manager = AzureClientManager()
    assert isinstance(manager, AzureClientManager)

    cosmos = manager.get_cosmos_client()
    assert isinstance(cosmos, CosmosClient)

    blob_sync = manager.get_blob_service_client()
    assert isinstance(blob_sync, BlobServiceClient)

    blob_async = manager.get_blob_service_client_async()
    assert isinstance(blob_async, BlobServiceClientAsync)
51 |
--------------------------------------------------------------------------------
/backend/tests/unit/test_common.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import pytest
5 |
6 | from graphrag_app.utils.common import (
7 | desanitize_name,
8 | sanitize_name,
9 | validate_index_file_exist,
10 | )
11 |
12 |
def test_desanitize_name(container_with_graphml_file):
    """Test the ``graphrag_app.utils.common.desanitize_name`` function.

    A name produced by ``sanitize_name`` for the fixture's container must
    map back to the original name, and an unknown sanitized name must
    yield ``None``.
    """
    # round trip: sanitize, then desanitize, a valid container name
    sanitized = sanitize_name(container_with_graphml_file)
    recovered = desanitize_name(sanitized)
    assert recovered == container_with_graphml_file

    # an invalid container name cannot be resolved
    unresolved = desanitize_name("nonexistent-container")
    assert unresolved is None
21 |
22 |
def test_validate_index_file_exist(container_with_graphml_file):
    """Test the ``graphrag_app.utils.common.validate_index_file_exist`` function.

    Covers three cases: a valid index with an existing file (returns
    ``None``), a valid index with a missing file and a missing index
    (both raise ``ValueError``).
    """
    graphml_path = "output/graph.graphml"
    sanitized = sanitize_name(container_with_graphml_file)

    # valid index, valid file
    outcome = validate_index_file_exist(sanitized, graphml_path)
    assert outcome is None

    # valid index, non-existent file
    with pytest.raises(ValueError):
        validate_index_file_exist(sanitized, "non-existent-file")

    # non-existent index, valid file
    with pytest.raises(ValueError):
        validate_index_file_exist("nonexistent-index", graphml_path)
35 |
--------------------------------------------------------------------------------
/backend/tests/unit/test_load_logger.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import patch
2 |
3 | import pytest
4 |
5 | from graphrag_app.logger.load_logger import load_pipeline_logger
6 |
7 |
@pytest.fixture
def mock_app_insights_workflow_callbacks():
    """Patch ``ApplicationInsightsWorkflowCallbacks`` and yield the resulting mock.

    The patch is active for the duration of the test that requests this
    fixture.
    """
    # NOTE(review): the patch targets the module that defines the class -
    # confirm that load_logger resolves the class through that module.
    target = "graphrag_app.logger.application_insights_workflow_callbacks.ApplicationInsightsWorkflowCallbacks"
    with patch(target) as patched:
        yield patched
14 |
15 |
@pytest.fixture
def mock_file_workflow_callbacks():
    """Patch graphrag's ``FileWorkflowCallbacks`` and yield the resulting mock.

    The patch is active for the duration of the test that requests this
    fixture.
    """
    # NOTE(review): the patch targets the module that defines the class -
    # confirm that load_logger resolves the class through that module.
    target = "graphrag.index.reporting.file_workflow_callbacks.FileWorkflowCallbacks"
    with patch(target) as patched:
        yield patched
22 |
23 |
@pytest.fixture
def mock_blob_workflow_callbacks():
    """Patch ``BlobWorkflowCallbacks`` and yield the resulting mock.

    The patch is active for the duration of the test that requests this
    fixture.
    """
    # NOTE(review): the patch targets the module that defines the class -
    # confirm that load_logger resolves the class through that module.
    target = "graphrag_app.logger.blob_workflow_callbacks.BlobWorkflowCallbacks"
    with patch(target) as patched:
        yield patched
30 |
31 |
@pytest.fixture
def mock_console_workflow_callbacks():
    """Patch ``ConsoleWorkflowCallbacks`` and yield the resulting mock.

    The patch is active for the duration of the test that requests this
    fixture.
    """
    # NOTE(review): the patch targets the module that defines the class -
    # confirm that load_logger resolves the class through that module.
    target = "graphrag_app.logger.console_workflow_callbacks.ConsoleWorkflowCallbacks"
    with patch(target) as patched:
        yield patched
38 |
39 |
@pytest.mark.skip(reason="This test is currently not complete")
def test_load_pipeline_logger_with_console(
    mock_app_insights_workflow_callbacks,
    mock_blob_workflow_callbacks,
    mock_console_workflow_callbacks,
    mock_file_workflow_callbacks,
):
    """Test ``load_pipeline_logger`` with all four logger kinds requested.

    Requests the ``app_insights``, ``blob``, ``console`` and ``file``
    loggers and expects the returned object to hold four callbacks in its
    ``_callbacks`` attribute. The test is currently skipped.
    """
    requested_loggers = ["app_insights", "blob", "console", "file"]
    combined = load_pipeline_logger(
        logging_dir="logs",
        loggers=requested_loggers,
        index_name="test-index",
        num_workflow_steps=4,
    )
    callback_count = len(combined._callbacks)
    assert callback_count == 4
55 |
--------------------------------------------------------------------------------
/backend/tests/unit/test_logger_app_insights_callbacks.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import logging
5 | from unittest.mock import MagicMock, patch
6 |
7 | import pytest
8 |
9 | from graphrag_app.logger.application_insights_workflow_callbacks import (
10 | ApplicationInsightsWorkflowCallbacks,
11 | )
12 |
13 |
@pytest.fixture
def mock_logger():
    """Yield a ``MagicMock`` logger returned by a patched ``logging.getLogger``.

    ``logging.getLogger`` is patched through the
    ``application_insights_workflow_callbacks`` module path and configured
    to return a mock constrained to the ``logging.Logger`` interface.
    """
    fake_logger = MagicMock(spec=logging.Logger)
    target = "graphrag_app.logger.application_insights_workflow_callbacks.logging.getLogger"
    with patch(target) as get_logger_patch:
        get_logger_patch.return_value = fake_logger
        yield fake_logger
22 |
23 |
@pytest.fixture
def workflow_callbacks(mock_logger):
    """Yield an ``ApplicationInsightsWorkflowCallbacks`` built without its real ``__init__``.

    ``__init__`` is patched to do nothing, and the private attributes are
    then assigned by hand, with ``_logger`` set to the ``mock_logger``
    fixture.
    """
    init_target = "graphrag_app.logger.application_insights_workflow_callbacks.ApplicationInsightsWorkflowCallbacks.__init__"
    with patch(init_target, return_value=None):
        callbacks = ApplicationInsightsWorkflowCallbacks()
        manual_state = {
            "_connection_string": "mock_connection_string",
            "_index_name": "mock_index_name",
            "_num_workflow_steps": 4,
            "_logger": mock_logger,
            "_processed_workflow_steps": [],
            "_properties": {},
        }
        for attribute, value in manual_state.items():
            setattr(callbacks, attribute, value)
        yield callbacks
38 |
39 |
def test_workflow_start(workflow_callbacks, mock_logger):
    """Check that ``workflow_start`` calls the logger's ``info`` method."""
    placeholder_instance = object()
    workflow_callbacks.workflow_start("test_workflow", placeholder_instance)
    info_was_called = mock_logger.info.called
    assert info_was_called
43 |
44 |
def test_workflow_end(workflow_callbacks, mock_logger):
    """Check that ``workflow_end`` calls the logger's ``info`` method."""
    placeholder_instance = object()
    workflow_callbacks.workflow_end("test_workflow", placeholder_instance)
    info_was_called = mock_logger.info.called
    assert info_was_called
48 |
49 |
def test_log(workflow_callbacks, mock_logger):
    """Check that ``log`` calls the logger's ``info`` method."""
    message = "test_log_message"
    workflow_callbacks.log(message)
    info_was_called = mock_logger.info.called
    assert info_was_called
53 |
54 |
def test_warning(workflow_callbacks, mock_logger):
    """Check that ``warning`` calls the logger's ``warning`` method."""
    message = "test_warning"
    workflow_callbacks.warning(message)
    warning_was_called = mock_logger.warning.called
    assert warning_was_called
58 |
59 |
def test_error(workflow_callbacks, mock_logger):
    """Check that ``error`` calls the logger's ``error`` method."""
    cause = Exception("test_exception")
    workflow_callbacks.error("test_error", cause)
    error_was_called = mock_logger.error.called
    assert error_was_called
63 |
--------------------------------------------------------------------------------
/backend/tests/unit/test_logger_blob_callbacks.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from unittest.mock import patch
5 |
6 | import pytest
7 |
8 | from graphrag_app.logger.blob_workflow_callbacks import BlobWorkflowCallbacks
9 |
10 |
@pytest.fixture
def mock_blob_service_client():
    """Patch ``BlobServiceClient`` as seen by ``blob_workflow_callbacks`` and yield the mock.

    The patch is active for the duration of the test that requests this
    fixture.
    """
    target = "graphrag_app.logger.blob_workflow_callbacks.BlobServiceClient"
    with patch(target) as patched_client:
        yield patched_client
17 |
18 |
@pytest.fixture
def workflow_callbacks(mock_blob_service_client):
    """Yield a ``BlobWorkflowCallbacks`` built without its real ``__init__``.

    ``__init__`` is patched to do nothing, and the private attributes are
    then assigned by hand, with ``_blob_service_client`` set to the
    ``mock_blob_service_client`` fixture.
    """
    init_target = "graphrag_app.logger.blob_workflow_callbacks.BlobWorkflowCallbacks.__init__"
    with patch(init_target, return_value=None):
        callbacks = BlobWorkflowCallbacks()
        manual_state = {
            "_blob_service_client": mock_blob_service_client,
            "_index_name": "mock_index_name",
            "_container_name": "logs",
            "_blob_name": "logs/logs.txt",
            "_num_workflow_steps": 4,
            "_processed_workflow_steps": [],
            "_workflow_name": "",
        }
        for attribute, value in manual_state.items():
            setattr(callbacks, attribute, value)
        yield callbacks
34 |
35 |
def test_on_workflow_start(workflow_callbacks):
    """Check that ``workflow_start`` appends a block through the mocked blob client."""
    workflow_callbacks.workflow_start("test_workflow", object())
    # an append_block call on the mocked blob client is taken as evidence
    # that the callbacks wrote a log entry
    blob_client = workflow_callbacks._blob_service_client.get_blob_client()
    assert blob_client.append_block.called
40 |
41 |
def test_on_workflow_end(workflow_callbacks):
    """Check that ``workflow_end`` appends a block through the mocked blob client."""
    workflow_callbacks.workflow_end("test_workflow", object())
    blob_client = workflow_callbacks._blob_service_client.get_blob_client()
    assert blob_client.append_block.called
45 |
46 |
def test_on_error(workflow_callbacks):
    """Check that ``error`` appends a block through the mocked blob client."""
    cause = Exception("test_exception")
    workflow_callbacks.error("test_error", cause)
    blob_client = workflow_callbacks._blob_service_client.get_blob_client()
    assert blob_client.append_block.called
50 |
--------------------------------------------------------------------------------
/backend/tests/unit/test_logger_console_callbacks.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import logging
5 | from unittest.mock import MagicMock, patch
6 |
7 | import pytest
8 |
9 | from graphrag_app.logger.console_workflow_callbacks import ConsoleWorkflowCallbacks
10 |
11 |
@pytest.fixture
def mock_logger():
    """Yield a ``MagicMock`` logger returned by a patched ``logging.getLogger``.

    ``logging.getLogger`` is patched through the
    ``console_workflow_callbacks`` module path and configured to return a
    mock constrained to the ``logging.Logger`` interface.
    """
    fake_logger = MagicMock(spec=logging.Logger)
    target = "graphrag_app.logger.console_workflow_callbacks.logging.getLogger"
    with patch(target) as get_logger_patch:
        get_logger_patch.return_value = fake_logger
        yield fake_logger
20 |
21 |
@pytest.fixture
def workflow_callbacks(mock_logger):
    """Yield a ``ConsoleWorkflowCallbacks`` built without its real ``__init__``.

    ``__init__`` is patched to do nothing, and the private attributes are
    then assigned by hand, with ``_logger`` set to the ``mock_logger``
    fixture.
    """
    init_target = "graphrag_app.logger.console_workflow_callbacks.ConsoleWorkflowCallbacks.__init__"
    with patch(init_target, return_value=None):
        callbacks = ConsoleWorkflowCallbacks()
        manual_state = {
            "_logger": mock_logger,
            "_index_name": "mock_index_name",
            "_num_workflow_steps": 4,
            "_processed_workflow_steps": [],
            "_properties": {},
        }
        for attribute, value in manual_state.items():
            setattr(callbacks, attribute, value)
        yield callbacks
35 |
36 |
def test_workflow_start(workflow_callbacks, mock_logger):
    """Check that the console callbacks' ``workflow_start`` calls ``logger.info``."""
    placeholder_instance = object()
    workflow_callbacks.workflow_start("test_workflow", placeholder_instance)
    info_was_called = mock_logger.info.called
    assert info_was_called
40 |
41 |
def test_workflow_end(workflow_callbacks, mock_logger):
    """Check that the console callbacks' ``workflow_end`` calls ``logger.info``."""
    placeholder_instance = object()
    workflow_callbacks.workflow_end("test_workflow", placeholder_instance)
    info_was_called = mock_logger.info.called
    assert info_was_called
45 |
46 |
def test_log(workflow_callbacks, mock_logger):
    """Check that the console callbacks' ``log`` calls ``logger.info``."""
    message = "test_log_message"
    workflow_callbacks.log(message)
    info_was_called = mock_logger.info.called
    assert info_was_called
50 |
51 |
def test_warning(workflow_callbacks, mock_logger):
    """Check that the console callbacks' ``warning`` calls ``logger.warning``."""
    message = "test_warning"
    workflow_callbacks.warning(message)
    warning_was_called = mock_logger.warning.called
    assert warning_was_called
55 |
56 |
def test_error(workflow_callbacks, mock_logger):
    """Check that the console callbacks' ``error`` calls ``logger.error``."""
    cause = Exception("test_exception")
    workflow_callbacks.error("test_error", cause)
    error_was_called = mock_logger.error.called
    assert error_was_called
60 |
--------------------------------------------------------------------------------
/docker/Dockerfile-backend:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | # For more information about the base image: https://mcr.microsoft.com/en-us/artifact/mar/devcontainers/python/about
5 | FROM mcr.microsoft.com/devcontainers/python:3.10-bookworm
6 |
7 | # Patch Debian to remediate CVE findings
8 | # Apply Debian bookworm-updates by running a full system upgrade
9 | RUN echo "deb http://deb.debian.org/debian bookworm-updates main" >> /etc/apt/sources.list.d/bookworm-updates.list \
10 | && echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/sources.list.d/backports.list \
11 | && apt-get update \
12 | && apt-get upgrade -y \
13 | && apt-get autoremove -y \
14 | && apt-get clean \
15 | && rm -rf /var/lib/apt/lists/*
16 |
17 | # default graphrag version will be 0.0.0 unless overridden by --build-arg
18 | ARG GRAPHRAG_VERSION=0.0.0
19 | ENV GRAPHRAG_VERSION=v${GRAPHRAG_VERSION}
20 | ENV PIP_ROOT_USER_ACTION=ignore
21 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1
22 | ENV SETUPTOOLS_USE_DISTUTILS=stdlib
23 | ENV TIKTOKEN_CACHE_DIR=/opt/tiktoken_cache/
24 |
25 | # CVE finding in pip < 23.3 - Upgrade pip to version 23.3 or greater
26 | RUN pip install --upgrade pip
27 |
28 | COPY backend /backend
29 | RUN cd backend \
30 | && pip install poetry \
31 | && poetry config virtualenvs.create false \
32 | && poetry install
33 |
34 | # download all nltk data that graphrag requires
35 | RUN python -c "import nltk;nltk.download(['punkt','averaged_perceptron_tagger','maxent_ne_chunker','words','wordnet'])"
36 | # download tiktoken model encodings
37 | RUN python -c "import tiktoken; tiktoken.encoding_for_model('gpt-3.5-turbo'); tiktoken.encoding_for_model('gpt-4'); tiktoken.encoding_for_model('gpt-4o');"
38 |
39 | # CVE finding in cryptography <= 44.0.0 - pin cryptography to version 44.0.1 via pip
40 | RUN pip install cryptography==44.0.1
41 |
42 | WORKDIR /backend
43 | EXPOSE 80
44 | CMD ["uvicorn", "graphrag_app.main:app", "--host", "0.0.0.0", "--port", "80"]
45 |
--------------------------------------------------------------------------------
/docker/Dockerfile-frontend:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | # For more information about the base image visit:
5 | # https://mcr.microsoft.com/en-us/artifact/mar/devcontainers/python/about
6 | FROM mcr.microsoft.com/devcontainers/python:3.10-bookworm
7 |
8 | ENV PIP_ROOT_USER_ACTION=ignore
9 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1
10 | ENV SETUPTOOLS_USE_DISTUTILS=stdlib
11 |
12 | COPY frontend /frontend
13 | RUN cd frontend \
14 | && pip install poetry \
15 | && poetry config virtualenvs.create false \
16 | && poetry install
17 |
18 | WORKDIR /frontend
19 | EXPOSE 8080
20 | CMD ["streamlit", "run", "app.py", "--server.port", "8080"]
21 |
--------------------------------------------------------------------------------
/docs/DEVELOPMENT-GUIDE.md:
--------------------------------------------------------------------------------
1 | # Development Guide
2 |
3 | This document is for developers interested in contributing to GraphRAG.
4 |
5 | ## Quickstart
6 | Development is best done in a unix environment (Linux, Mac, or [Windows WSL](https://learn.microsoft.com/en-us/windows/wsl/install)).
7 |
8 | 1. Clone the GraphRAG repository.
9 | 1. Follow all directions in the [deployment guide](DEPLOYMENT-GUIDE.md) to install required tools and deploy an instance of the GraphRAG service in Azure. Alternatively, this repo provides a devcontainer with all tools preinstalled.
10 | 1. New unit tests and integration tests are currently being added to improve the developer experience when testing code changes locally.
11 |
12 | ### Testing
13 |
14 | A small collection of unit tests and integration tests has been written to test the functionality of the API. To get started, first ensure that all test dependencies have been installed.
15 |
16 | ```shell
17 | cd /backend
18 | poetry install --with test
19 | ```
20 |
21 | Some tests require the [azurite emulator](https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azurite?toc=%2Fazure%2Fstorage%2Fblobs%2Ftoc.json&bc=%2Fazure%2Fstorage%2Fblobs%2Fbreadcrumb%2Ftoc.json&tabs=docker-hub%2Cblob-storage) and [cosmosdb emulator](https://learn.microsoft.com/en-us/azure/cosmos-db/how-to-develop-emulator?tabs=docker-linux%2Ccsharp&pivots=api-nosql) to be running locally (these are set up automatically in the CI/CD pipeline). You may start these emulators by running them in the background as docker containers.
22 |
23 | ```shell
24 | docker run -d -p 10000:10000 -p 10001:10001 -p 10002:10002 mcr.microsoft.com/azure-storage/azurite:latest
25 | docker run -d -p 8081:8081 -p 1234:1234 mcr.microsoft.com/cosmosdb/linux/azure-cosmos-emulator:vnext-preview
26 | ```
27 |
28 | To run the tests:
29 |
30 | ```shell
31 | cd /backend
32 | pytest -s --cov=src tests
33 | ```
34 |
35 | ### Deployment (CI/CD)
36 | This repository uses GitHub Actions for continuous integration and continuous deployment (CI/CD).
37 |
38 | ### Style Guide:
39 | * We follow [PEP 8](https://peps.python.org/pep-0008) standards and naming conventions as closely as possible.
40 |
41 | * [ruff](https://docs.astral.sh/ruff) is used for linting and code formatting. A pre-commit hook has been set up to automatically apply settings to this repo. To make use of this tool without explicitly calling it, install the pre-commit hook.
42 | ```
43 | > pre-commit install
44 | ```
45 |
46 | ### Versioning
47 | We use [SemVer](https://aka.ms/StartRight/README-Template/semver) for semantic versioning.
48 |
--------------------------------------------------------------------------------
/docs/assets/graphrag-architecture-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/docs/assets/graphrag-architecture-diagram.png
--------------------------------------------------------------------------------
/docs/assets/graphrag-architecture-diagram.vsdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/docs/assets/graphrag-architecture-diagram.vsdx
--------------------------------------------------------------------------------
/frontend/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | [server]
5 | enableXsrfProtection = false
6 |
--------------------------------------------------------------------------------
/frontend/README.md:
--------------------------------------------------------------------------------
1 | # Frontend Application Launch Instructions
2 | A small frontend application (a streamlit app) is provided to demonstrate how to build and deploy a UI on top of the solution accelerator API.
3 | This application is optional and not required for the solution accelerator API to function properly.
4 |
5 | ### 1. Deploy the GraphRAG solution accelerator
6 | Follow instructions from the [deployment guide](../docs/DEPLOYMENT-GUIDE.md) to deploy a full instance of the solution accelerator.
7 |
8 | ### 2. (optional) Create a `.env` file:
9 | If a `.env` file is not provided, the UI will prompt the user for additional login information.
10 |
11 | | Variable Name | Required | Example | Description |
12 | | :--- | --- | :--- | ---: |
13 | DEPLOYMENT_URL | No | https://<apim-name>.azure-api.net | Base url of the deployed graphrag API. Also referred to as the APIM Gateway URL.
14 | APIM_SUBSCRIPTION_KEY | No | | A [subscription key](https://learn.microsoft.com/en-us/azure/api-management/api-management-subscriptions) generated by APIM.
15 | DEPLOYER_EMAIL | No | deployer@email.com | Email address of the person/organization that deployed the solution accelerator.
16 |
17 | ## Run UI locally
18 |
19 | The frontend application can run locally as a docker container.
20 |
21 | ```
22 | # cd to the root directory of the repo
23 | > docker build -t graphrag:frontend -f docker/Dockerfile-frontend .
24 | > docker run --env-file .env -p 8080:8080 graphrag:frontend
25 | ```
26 | To access the app, visit `localhost:8080` in your browser.
27 |
28 | ## Host UI in Azure
29 | The frontend application can also be hosted in Azure as a Web App using the provided `frontend/deploy.sh` script.
30 |
31 | ### 1. Create Azure App Registration
32 |
33 | To enable authentication and authorization for the frontend application, you need to create an Azure App Registration with ID tokens enabled. You may need Owner level permissions on the subscription for some steps.
34 | This app registration will be used for Authentication and Authorization to the frontend web app (not the backend). Follow the steps below:
35 |
36 | 1. Go to the [Azure portal](https://portal.azure.com) and sign in with your Azure account.
37 | 2. Navigate to the **Microsoft Entra ID** service.
38 | 3. Select **App registrations** from the left-hand menu.
39 | 4. Click on the **+ New registration** button.
40 | 5. Provide a name for your app registration and select the appropriate account type.
41 | 6. Under the **Redirect URIs** section, select the `Web` platform from dropdown menu. Add `http://localhost:8080/.auth/login/aad/callback` to the URL text field. The deployment script will update this field later with the actual URL of the webapp.
42 | 7. Save the app registration.
43 | 8. Under the **Manage** section select **Authentication**. Select **ID tokens** as the supported token type.
44 | 9. In the **Overview** section of the registered app, take note of the **Application (client) ID**, **Object ID** and **Directory (tenant) ID**. This information will be used later.
45 |
46 | ### 2. Populate the deploy parameters
47 |
48 | Please fill out `frontend/frontend_deploy.parameters.json` with the required values described below.
49 |
50 | 1. Replace the placeholder values with actual values for the following required variables. You may also add optional variables to the JSON file if you wish to override the default values:
51 |
52 | | Variable Name | Required | Example | Description |
53 | | :------------------- | :------- | :------------------------------------- | :-------------------------------------------------------------- |
54 | | LOCATION | Yes | eastus | The Azure region where resources will be deployed. |
55 | | RESOURCE_GROUP | Yes | my-resource-group | The name of the Azure resource group where resources will be created. At this time, the name must follow [Azure Container Registry](https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/resource-name-rules#microsoftcontainerregistry) naming guidelines. |
56 | | SUBSCRIPTION_ID | Yes | 12345678-1234-1234-1234-1234567890ab | The ID of the Azure subscription where the resources will be deployed. |
57 | | AAD_CLIENT_ID | Yes | 12345678-1234-1234-1234-1234567890ab | The client ID of the Microsoft Entra ID (AAD) app registration. |
58 | | AAD_OBJECT_ID | Yes | 12345678-1234-1234-1234-1234567890ab | The object ID of the Microsoft Entra ID (AAD) app registration. |
59 | | AAD_TENANT_ID | Yes | 12345678-1234-1234-1234-1234567890ab | The ID of the Microsoft Entra ID (AAD) tenant. |
60 | | AAD_TOKEN_ISSUER_URL | No | https://login.microsoftonline.com/12345678-1234-1234-1234-1234567890ab/v2.0 | The URL of the Microsoft Entra ID (AAD) token issuer. Defaults to the tenant-specific issuer URL. |
61 | | IMAGE_NAME | No | graphrag:frontend | The name of the Docker image for the frontend application. Defaults to `graphrag:frontend`. |
62 | | REGISTRY_NAME | No | myresourcegroupreg | The name of the Azure Container Registry. Defaults to the resource group name with `reg` appended. |
63 | | APP_SERVICE_PLAN | No | myresourcegroup-asp | The name of the Azure App Service plan. Defaults to the resource group name with `asp` appended. |
64 | | WEB_APP | No | myresourcegroup-playground | The name of the Azure Web App. Defaults to the resource group name with `playground` appended. |
65 | | WEB_APP_IDENTITY | No | myresourcegroup-playground-identity | The name of the managed identity for the Azure Web App. Defaults to the web app name with `identity` appended. |
66 |
67 | Save the `frontend/frontend_deploy.parameters.json` file after populating the values. If you would like the webapp to automatically connect
68 | to the solution accelerator backend API, create and populate a `.env` file described in step 2, otherwise the webapp will ask for login credentials to the APIM service that was deployed as part of the backend API.
69 |
70 | ### 3. Run the deploy script
71 |
72 | Prerequisite: the Azure CLI (`az`) version 2.64.0 or later must be installed.
73 | To deploy the frontend application:
74 |
75 | 1. Open a terminal and navigate to the `/frontend` directory.
76 | 2. Run the deploy script:
77 |
78 | ```
79 | # cd to graphrag-accelerator/frontend directory
80 | > bash deploy.sh -p frontend_deploy.parameters.json
81 | ```
82 |
83 | Once the frontend application has been deployed, please take note of the URL that is displayed at the end of the script. It will have the form `(https://PLACEHOLDER.azurewebsites.net)`. The Web App service will take 2-3 minutes to load the first time. This is expected behavior.
84 |
--------------------------------------------------------------------------------
/frontend/app.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import os
5 |
6 | import streamlit as st
7 |
8 | from src.components import tabs
9 | from src.components.index_pipeline import IndexPipeline
10 | from src.enums import EnvVars
11 | from src.functions import initialize_app
12 | from src.graphrag_api import GraphragAPI
13 |
# Load environment variables and record whether the app could be initialized.
# bool() normalizes whatever initialize_app() returns to a strict True/False.
initialized = initialize_app()
st.session_state["initialized"] = bool(initialized)
17 |
18 |
19 | def graphrag_app(initialized: bool):
20 | st.title("Microsoft GraphRAG Copilot")
21 | main_tab, prompt_gen_tab, prompt_edit_tab, index_tab, query_tab = st.tabs([
22 | "**Intro**",
23 | "**1. Prompt Generation**",
24 | "**2. Prompt Configuration**",
25 | "**3. Index**",
26 | "**4. Query**",
27 | ])
28 | # display only the main tab if a connection to an existing APIM has not been initialized
29 | with main_tab:
30 | tabs.get_main_tab(initialized)
31 | if initialized:
32 | # setup API request information
33 | COLUMN_WIDTHS = [0.275, 0.45, 0.275]
34 | apim_url = st.session_state[EnvVars.DEPLOYMENT_URL.value]
35 | apim_key = st.session_state[EnvVars.APIM_SUBSCRIPTION_KEY.value]
36 | # perform health check to verify connectivity
37 | client = GraphragAPI(apim_url, apim_key)
38 | if not client.health_check_passed():
39 | st.error("APIM Connection Error")
40 | st.stop()
41 | indexPipe = IndexPipeline(client, COLUMN_WIDTHS)
42 | # display tabs
43 | with prompt_gen_tab:
44 | tabs.get_prompt_generation_tab(client, COLUMN_WIDTHS)
45 | with prompt_edit_tab:
46 | tabs.get_prompt_configuration_tab()
47 | with index_tab:
48 | tabs.get_index_tab(indexPipe)
49 | with query_tab:
50 | tabs.get_query_tab(client)
51 | deployer_email = os.getenv("DEPLOYER_EMAIL", "deployer@email.com")
52 | footer = f"""
53 |
56 | """
57 | st.markdown(footer, unsafe_allow_html=True)
58 |
59 |
60 | if __name__ == "__main__":
61 | graphrag_app(st.session_state["initialized"])
62 |
--------------------------------------------------------------------------------
/frontend/frontend_deploy.parameters.json:
--------------------------------------------------------------------------------
1 | {
2 | "SUBSCRIPTION_ID": "",
3 | "RESOURCE_GROUP": "",
4 | "LOCATION": "",
5 | "AAD_CLIENT_ID": "",
6 | "AAD_OBJECT_ID": "",
7 | "AAD_TENANT_ID": ""
8 | }
9 |
--------------------------------------------------------------------------------
/frontend/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "graphrag-solution-accelerator"
3 | version = "0.1.1"
4 | description = ""
5 | authors = [
6 | "Josh Bradley ",
7 | "Newman Cheng ",
8 | "Christine DiFonzo ",
9 | "Gabriel Nieves ",
10 | "Douglas Orbaker ",
11 | "Chris Sanchez ",
12 | "Shane Solomon ",
13 | ]
14 | readme = "README.md"
15 | license = "MIT"
16 | package-mode = false
17 |
18 | [tool.poetry.dependencies]
19 | python = "~3.10"
20 |
21 | [tool.poetry.group.dev.dependencies]
22 | detect-secrets = ">=1.5.0"
23 | devtools = ">=0.12.2"
24 | flake8 = ">=6.1.0"
25 | ipython = "*"
26 | jupyter = "*"
27 | pre-commit = ">=3.6.0"
28 | ruff = ">=0.1.13"
29 |
30 | [tool.poetry.group.test.dependencies]
31 | pytest = ">=8.2.1"
32 | wikipedia = ">=1.4.0"
33 |
34 | [tool.poetry.group.frontend.dependencies]
35 | requests = "*"
36 | streamlit = ">=1.38.0"
37 | streamlit-nested-layout = "*"
38 |
39 | [tool.ruff]
40 | target-version = "py310"
41 | line-length = 88
42 | indent-width = 4
43 |
44 | [tool.ruff.format]
45 | preview = true
46 | quote-style = "double"
47 |
48 | [tool.ruff.lint]
49 | preview = true
50 | select = ["E", "F", "I"]
51 | ignore = ["E402", "E501", "F821"]
52 |
53 | [build-system]
54 | requires = ["poetry-core"]
55 | build-backend = "poetry.core.masonry.api"
56 |
--------------------------------------------------------------------------------
/frontend/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/frontend/src/__init__.py
--------------------------------------------------------------------------------
/frontend/src/components/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/graphrag-accelerator/c660c84cff1f7dfc7c82a62ee566ab0b2dc8e6b6/frontend/src/components/__init__.py
--------------------------------------------------------------------------------
/frontend/src/components/login_sidebar.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import streamlit as st
5 |
6 | from src.enums import EnvVars
7 | from src.graphrag_api import GraphragAPI
8 |
9 |
def login():
    """
    Render the login form in the sidebar and validate the entered credentials.

    The user enters the APIM Gateway URL and Subscription Key. When the form
    is submitted, a GraphragAPI client is built from those values and its
    health check is called. If the check passes, both values are stored in
    ``st.session_state`` under the ``EnvVars`` keys, ``"initialized"`` is set
    to True and the script is rerun. Otherwise an error is displayed.
    """
    with st.sidebar:
        st.title(
            "Login",
            help="Enter your APIM credentials to get started. Refreshing the browser will require you to login again.",
        )
        with st.form(key="login-form", clear_on_submit=True):
            apim_url = st.text_input("APIM Gateway URL", key="apim-url")
            # The subscription key is a secret: mask the input so the key is
            # not displayed in clear text while it is being typed.
            apim_sub_key = st.text_input(
                "APIM Subscription Key", key="subscription-key", type="password"
            )
            form_submit = st.form_submit_button("Login")
            if form_submit:
                client = GraphragAPI(apim_url, apim_sub_key)
                if client.health_check_passed():
                    st.success("Login Successful")
                    st.session_state[EnvVars.DEPLOYMENT_URL.value] = apim_url
                    st.session_state[EnvVars.APIM_SUBSCRIPTION_KEY.value] = apim_sub_key
                    st.session_state["initialized"] = True
                    # Rerun so the rest of the app picks up the new state.
                    st.rerun()
                else:
                    st.error("Login Failed")
                    st.error("Please check the APIM Gateway URL and Subscription Key")
38 |
--------------------------------------------------------------------------------
/frontend/src/components/prompt_configuration.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import os
5 |
6 | import streamlit as st
7 |
8 | from src.enums import PromptFileNames, PromptKeys, PromptTextAreas
9 | from src.functions import zip_directory
10 |
11 | SAVED_PROMPT_VAR = "saved_prompts"
12 |
13 |
def save_prompts(
    local_dir: str = "./edited_prompts/", zip_file_path: str = "edited_prompts.zip"
):
    """
    Persist the edited prompts in session state and on disk.

    Marks the prompts as saved, copies the current content of each prompt
    text area into its ``PromptKeys`` session-state entry, writes every prompt
    to its ``PromptFileNames`` file inside ``local_dir`` and finally zips that
    directory into ``zip_file_path``.
    """
    state = st.session_state
    state[SAVED_PROMPT_VAR] = True

    # Copy widget values (keyed by PromptTextAreas) to their saved-prompt keys.
    key_to_text_area = (
        (PromptKeys.ENTITY, PromptTextAreas.ENTITY),
        (PromptKeys.SUMMARY, PromptTextAreas.SUMMARY),
        (PromptKeys.COMMUNITY, PromptTextAreas.COMMUNITY),
    )
    for prompt_key, text_area in key_to_text_area:
        state[prompt_key.value] = state[text_area.value]

    os.makedirs(local_dir, exist_ok=True)
    # PromptKeys and PromptFileNames are paired by declaration order.
    for prompt_key, prompt_file in zip(PromptKeys, PromptFileNames):
        target_path = os.path.join(local_dir, prompt_file.value)
        with open(target_path, "w", encoding="utf-8") as handle:
            handle.write(state[prompt_key.value])

    zip_directory(local_dir, zip_file_path)
36 |
37 |
def edit_prompts():
    """
    Re-edit the prompts.

    Clears the saved flag in ``st.session_state``. ``prompt_editor`` reads this
    flag to decide whether its text areas are disabled.
    """
    st.session_state[SAVED_PROMPT_VAR] = False
43 |
44 |
def prompt_editor(prompt_values: list[str]):
    """
    Container for prompt configurations.

    Renders one tab per prompt (entity extraction, summarize descriptions,
    community reports), each holding a text area pre-filled with the
    corresponding entry of ``prompt_values``. The text areas are disabled
    while the saved flag in ``st.session_state`` is set.

    Args:
        prompt_values: exactly three prompt strings, in the order entity
            extraction, summarize descriptions, community reports.
    """
    saved_prompts = st.session_state[SAVED_PROMPT_VAR]

    entity_ext_prompt, summ_prompt, comm_report_prompt = prompt_values

    # (tab label, text-area label, initial text, widget key) per prompt.
    # Widget keys come from PromptTextAreas so they stay in sync with the
    # keys save_prompts reads back from session state.
    editors = (
        (
            "**Entity Extraction**",
            "Entity Prompt",
            entity_ext_prompt,
            PromptTextAreas.ENTITY,
        ),
        (
            "**Summarize Descriptions**",
            "Summarize Prompt",
            summ_prompt,
            PromptTextAreas.SUMMARY,
        ),
        (
            "**Community Reports**",
            "Community Reports Prompt",
            comm_report_prompt,
            PromptTextAreas.COMMUNITY,
        ),
    )

    with st.container(border=True):
        tab_labels = [tab_label for tab_label, _, _, _ in editors]
        tab_handles = st.tabs(tabs=tab_labels)
        for tab, (_, area_label, prompt_text, text_area) in zip(tab_handles, editors):
            with tab:
                st.text_area(
                    label=area_label,
                    value=prompt_text,
                    max_chars=20000,
                    key=text_area.value,
                    label_visibility="hidden",
                    disabled=saved_prompts,
                )
90 |
--------------------------------------------------------------------------------
/frontend/src/components/upload_files_component.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | import json
5 |
6 | import streamlit as st
7 |
8 | from src.graphrag_api import GraphragAPI
9 |
10 | UPLOAD_HELP_MESSAGE = """
11 | This functionality is disabled while an existing Storage Container is selected.
12 | Please deselect the existing Storage Container to upload new data.
13 | """
14 |
15 |
def upload_files(
    client: GraphragAPI, key_prefix: str, disable_other_input: bool = False
):
    """
    Reusable component to upload files to Blob Storage Container.

    Renders a storage-name input, a multi-file ``.txt`` uploader and an upload
    button. When the button is clicked with at least one file and a non-empty
    storage name, the files are sent through ``client.upload_files`` and a
    success or error message is shown.

    Args:
        client: API client used to perform the upload.
        key_prefix: prefix that keeps the widget keys unique per usage.
        disable_other_input: when True, all three widgets are disabled.

    Returns:
        The value returned by the upload button (True on the run in which it
        was clicked).
    """
    input_storage_name = st.text_input(
        label="Enter Storage Name",
        key=f"{key_prefix}-storage-name-input",
        disabled=disable_other_input,
        help=UPLOAD_HELP_MESSAGE,
    )
    file_upload = st.file_uploader(
        "Upload Data",
        type=["txt"],
        key=f"{key_prefix}-file-uploader",
        accept_multiple_files=True,
        disabled=disable_other_input,
    )

    uploaded = st.button(
        "Upload Files",
        key=f"{key_prefix}-upload-button",
        disabled=disable_other_input or input_storage_name == "",
    )
    if uploaded and file_upload and input_storage_name != "":
        # One ("files", (name, content, mime type)) tuple per selected file.
        file_payloads = [
            ("files", (file.name, file.read(), file.type)) for file in file_upload
        ]

        response = client.upload_files(file_payloads, input_storage_name)
        if response.status_code == 200:
            st.success("Files uploaded successfully!")
        else:
            # The error body is not guaranteed to be JSON (e.g. a gateway
            # error page); fall back to the raw text instead of crashing.
            try:
                error_detail = json.loads(response.text)
            except ValueError:
                error_detail = response.text
            st.error(f"Error: {error_detail}")
    return uploaded
57 |
--------------------------------------------------------------------------------
/frontend/src/enums.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | from enum import Enum
5 |
6 |
class PromptKeys(str, Enum):
    """Session-state keys under which the saved prompt texts are stored."""

    # NOTE: member order matters. save_prompts() pairs these members with
    # PromptFileNames by position via zip().
    ENTITY = "entity_extraction"
    SUMMARY = "summarize_descriptions"
    COMMUNITY = "community_report"
11 |
12 |
class PromptFileNames(str, Enum):
    """File names used when the edited prompts are written to disk."""

    # NOTE: member order matters. save_prompts() pairs these members with
    # PromptKeys by position via zip().
    ENTITY = "entity_extraction_prompt.txt"
    SUMMARY = "summarize_descriptions_prompt.txt"
    COMMUNITY = "community_report_prompt.txt"
17 |
18 |
class PromptTextAreas(str, Enum):
    """Widget keys of the prompt text areas, as read back by save_prompts()."""

    # These values must match the `key=` arguments of the text areas rendered
    # by prompt_editor().
    ENTITY = "entity_text_area"
    SUMMARY = "summary_text_area"
    COMMUNITY = "community_text_area"
23 |
24 |
class StorageIndexVars(str, Enum):
    """String constants naming the storage and index selections."""

    # NOTE(review): presumably used as st.session_state keys by the index and
    # query components; confirm against the callers.
    SELECTED_STORAGE = "selected_storage"
    INPUT_STORAGE = "input_storage"
    SELECTED_INDEX = "selected_index"
29 |
30 |
class EnvVars(str, Enum):
    """Names of the APIM connection settings.

    The values are used as ``st.session_state`` keys holding the APIM
    subscription key and the APIM gateway (deployment) URL.
    """

    # NOTE(review): the class name suggests these are also environment
    # variable names; confirm against initialize_app().
    APIM_SUBSCRIPTION_KEY = "APIM_SUBSCRIPTION_KEY"
    DEPLOYMENT_URL = "DEPLOYMENT_URL"
34 |
--------------------------------------------------------------------------------
/frontend/style.css:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) Microsoft Corporation.
3 | Licensed under the MIT License.
4 | */
5 |
6 | @import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.1/css/all.min.css');
7 |
8 | #root > div:nth-child(1) > div.withScreencast > div > div > div > section.main.st-emotion-cache-uf99v8.ea3mdgi8 > div.block-container.st-emotion-cache-z5fcl4.ea3mdgi5 > div > div > div > div.st-emotion-cache-ocqkz7.e1f1d6gn5 > div:nth-child(4) > div > div > div > div > div{
9 | margin-top: 1.6em;
10 | }
11 |
12 |
13 | [data-testid="stHeadingDivider"] {
14 | background-color: #3d9df3; /* Set your desired color */
15 | height: 1px;
16 | }
17 |
18 | #microsoft-graphrag-copilot > div > span {
19 | text-align: center;
20 | margin-top: -1em;
21 | }
22 |
23 | /* Tooltip container */
24 | .tooltip {
25 | position: relative;
26 | display: inline-block;
27 | border-bottom: 1px dotted black; /* If you want dots under the hoverable text */
28 | }
29 |
30 | /* Tooltip text */
31 | .tooltip .tooltiptext {
32 | visibility: hidden;
33 | width: 120px;
34 | background-color: #555;
35 | color: #fff;
36 | text-align: center;
37 | border-radius: 6px;
38 | padding: 5px;
39 | position: absolute;
40 | z-index: 1;
41 | bottom: 125%;
42 | left: 50%;
43 | margin-left: -60px;
44 | opacity: 0;
45 | transition: opacity 0.3s;
46 | }
47 |
48 | /* Show the tooltip text when you hover over the tooltip container */
49 | .tooltip:hover .tooltiptext {
50 | visibility: visible;
51 | opacity: 1;
52 | }
53 |
54 | .gray-box {
55 |
56 | background-color: #ffffff;
57 | padding: 10px;
58 | width: 80%;
59 | }
60 |
61 | .center-container {
62 | margin-top: -10em;
63 | display: flex;
64 | align-items: center;
65 | justify-content: center;
66 | height: 100vh;
67 | }
68 |
69 | .footer {
70 | display: flex;
71 | justify-content: center;
72 | align-items: center;
73 | position: fixed;
74 | left: 0;
75 | bottom: 0;
76 | width: 100%;
77 | background-color: #f1f1f1;
78 | text-align: center;
79 | padding: 5px;
80 | z-index: 1000;
81 | }
82 | .footer p{
83 | font-size: 12px;
84 | }
85 |
86 | /* CSS */
87 | button[kind="primary"] {
88 | background-color: #1d9445;
89 | border: 0;
90 | border-radius: 56px;
91 | color: #fff;
92 | cursor: pointer;
93 | display: inline-block;
94 | font-family: system-ui,-apple-system,system-ui,"Segoe UI",Roboto,Ubuntu,"Helvetica Neue",sans-serif;
95 | font-size: 58px;
96 | font-weight: 600;
97 | outline: 0;
98 | padding: 16px 21px;
99 | position: relative;
100 | text-align: center;
101 | text-decoration: none;
102 | transition: all .3s;
103 | user-select: none;
104 | -webkit-user-select: none;
105 | touch-action: manipulation;
106 | }
107 |
108 | button[kind="primary"]:before {
109 | background-color: initial;
110 | background-image: linear-gradient(#fff 0, rgba(255, 255, 255, 0) 100%);
111 | border-radius: 125px;
112 | content: "";
113 | height: 50%;
114 | left: 4%;
115 | opacity: .5;
116 | position: absolute;
117 | top: 0;
118 | transition: all .3s;
119 | width: 62%;
120 | }
121 |
122 | button[kind="primary"]:hover {
123 | box-shadow: rgba(255, 255, 255, .2) 0 3px 15px inset, rgba(0, 0, 0, .1) 0 3px 5px, rgba(0, 0, 0, .1) 0 10px 13px;
124 | transform: scale(1.05);
125 | }
126 |
127 | @media (min-width: 768px) {
128 | button[kind="primary"] {
129 | padding: 15px 34px;
130 | margin: 20px auto;
131 | }
132 | }
133 |
134 | .element-container:has(>.stTextArea), .stTextArea {
135 | display: block;
136 | margin-left: auto;
137 | margin-right: auto;
138 | }
139 | .stTextArea textarea {
140 | height: 500px;
141 | /*background-color: #a7b0a4;*/
142 | }
143 |
--------------------------------------------------------------------------------
/infra/abbreviations.json:
--------------------------------------------------------------------------------
1 | {
2 | "analysisServicesServers": "as",
3 | "apiManagementService": "apim-",
4 | "appConfigurationConfigurationStores": "appcs-",
5 | "appContainerApps": "ca-",
6 | "appManagedEnvironments": "cae-",
7 | "authorizationPolicyDefinitions": "policy-",
8 | "automationAutomationAccounts": "aa-",
9 | "azureOpenAI": "aoai-",
10 | "blueprintBlueprints": "bp-",
11 | "blueprintBlueprintsArtifacts": "bpa-",
12 | "cacheRedis": "redis-",
13 | "cdnProfiles": "cdnp-",
14 | "cdnProfilesEndpoints": "cdne-",
15 | "cognitiveServicesAccounts": "cog-",
16 | "cognitiveServicesFormRecognizer": "cog-fr-",
17 | "cognitiveServicesTextAnalytics": "cog-ta-",
18 | "computeAvailabilitySets": "avail-",
19 | "computeCloudServices": "cld-",
20 | "computeDiskEncryptionSets": "des",
21 | "computeDisks": "disk",
22 | "computeDisksOs": "osdisk",
23 | "computeGalleries": "gal",
24 | "computeSnapshots": "snap-",
25 | "computeVirtualMachineScaleSets": "vmss-",
26 | "computeVirtualMachines": "vm",
27 | "containerInstanceContainerGroups": "ci",
28 | "containerRegistryRegistries": "cr",
29 | "containerServiceManagedClusters": "aks-",
30 | "dBforMySQLServers": "mysql-",
31 | "dBforPostgreSQLServers": "psql-",
32 | "dataFactoryFactories": "adf-",
33 | "dataLakeAnalyticsAccounts": "dla",
34 | "dataLakeStoreAccounts": "dls",
35 | "dataMigrationServices": "dms-",
36 | "databricksWorkspaces": "dbw-",
37 | "devicesIotHubs": "iot-",
38 | "devicesProvisioningServices": "provs-",
39 | "devicesProvisioningServicesCertificates": "pcert-",
40 | "documentDBDatabaseAccounts": "cosmos-",
41 | "eventGridDomains": "evgd-",
42 | "eventGridDomainsTopics": "evgt-",
43 | "eventGridEventSubscriptions": "evgs-",
44 | "eventHubNamespaces": "evhns-",
45 | "eventHubNamespacesEventHubs": "evh-",
46 | "hdInsightClustersHadoop": "hadoop-",
47 | "hdInsightClustersHbase": "hbase-",
48 | "hdInsightClustersKafka": "kafka-",
49 | "hdInsightClustersMl": "mls-",
50 | "hdInsightClustersSpark": "spark-",
51 | "hdInsightClustersStorm": "storm-",
52 | "hybridComputeMachines": "arcs-",
53 | "insightsActionGroups": "ag-",
54 | "insightsComponents": "appi-",
55 | "keyVaultVaults": "kv-",
56 | "kubernetesConnectedClusters": "arck",
57 | "kustoClusters": "dec",
58 | "kustoClustersDatabases": "dedb",
59 | "logicIntegrationAccounts": "ia-",
60 | "logicWorkflows": "logic-",
61 | "machineLearningServicesWorkspaces": "mlw-",
62 | "managedIdentityUserAssignedIdentities": "id-",
63 | "managementManagementGroups": "mg-",
64 | "migrateAssessmentProjects": "migr-",
65 | "networkApplicationGateways": "agw-",
66 | "networkApplicationSecurityGroups": "asg-",
67 | "networkAzureFirewalls": "afw-",
68 | "networkBastionHosts": "bas-",
69 | "networkConnections": "con-",
70 | "networkDnsZones": "dnsz-",
71 | "networkExpressRouteCircuits": "erc-",
72 | "networkFirewallPolicies": "afwp-",
73 | "networkFirewallPoliciesRuleGroups": "wafrg",
74 | "networkFirewallPoliciesWebApplication": "waf",
75 | "networkFrontDoors": "fd-",
76 | "networkFrontdoorWebApplicationFirewallPolicies": "fdfp-",
77 | "networkLoadBalancersExternal": "lbe-",
78 | "networkLoadBalancersInboundNatRules": "rule-",
79 | "networkLoadBalancersInternal": "lbi-",
80 | "networkLocalNetworkGateways": "lgw-",
81 | "networkNatGateways": "ng-",
82 | "networkNetworkInterfaces": "nic-",
83 | "networkNetworkSecurityGroups": "nsg-",
84 | "networkNetworkSecurityGroupsSecurityRules": "nsgsr-",
85 | "networkNetworkWatchers": "nw-",
86 | "networkPrivateDnsZones": "pdnsz-",
87 | "networkPrivateLinkScope": "pls-",
88 | "networkPrivateLinkServices": "pl-",
89 | "networkPublicIPAddresses": "pip-",
90 | "networkPublicIPPrefixes": "ippre-",
91 | "networkRouteFilters": "rf-",
92 | "networkRouteTables": "rt-",
93 | "networkRouteTablesRoutes": "udr-",
94 | "networkTrafficManagerProfiles": "traf-",
95 | "networkVirtualNetworkGateways": "vgw-",
96 | "networkVirtualNetworks": "vnet-",
97 | "networkVirtualNetworksSubnets": "snet-",
98 | "networkVirtualNetworksVirtualNetworkPeerings": "peer-",
99 | "networkVirtualWans": "vwan-",
100 | "networkVpnGateways": "vpng-",
101 | "networkVpnGatewaysVpnConnections": "vcn-",
102 | "networkVpnGatewaysVpnSites": "vst-",
103 | "notificationHubsNamespaces": "ntfns-",
104 | "notificationHubsNamespacesNotificationHubs": "ntf-",
105 | "operationalInsightsWorkspaces": "log-",
106 | "portalDashboards": "dash-",
107 | "powerBIDedicatedCapacities": "pbi-",
108 | "privateEndpoint": "pep-",
109 | "purviewAccounts": "pview-",
110 | "recoveryServicesVaults": "rsv-",
111 | "resourcesResourceGroups": "rg-",
112 | "searchSearchServices": "srch-",
113 | "serviceBusNamespaces": "sb-",
114 | "serviceBusNamespacesQueues": "sbq-",
115 | "serviceBusNamespacesTopics": "sbt-",
116 | "serviceEndPointPolicies": "se-",
117 | "serviceFabricClusters": "sf-",
118 | "signalRServiceSignalR": "sigr",
119 | "sqlManagedInstances": "sqlmi-",
120 | "sqlServers": "sql-",
121 | "sqlServersDataWarehouse": "sqldw-",
122 | "sqlServersDatabases": "sqldb-",
123 | "sqlServersDatabasesStretch": "sqlstrdb-",
124 | "storSimpleManagers": "ssimp",
125 | "storageStorageAccounts": "st",
126 | "storageStorageAccountsVm": "stvm",
127 | "streamAnalyticsCluster": "asa-",
128 | "synapseWorkspaces": "syn",
129 | "synapseWorkspacesAnalyticsWorkspaces": "synw",
130 | "synapseWorkspacesSqlPoolsDedicated": "syndp",
131 | "synapseWorkspacesSqlPoolsSpark": "synsp",
132 | "timeSeriesInsightsEnvironments": "tsi-",
133 | "webServerFarms": "plan-",
134 | "webSitesAppService": "app-",
135 | "webSitesAppServiceEnvironment": "ase-",
136 | "webSitesFunctions": "func-",
137 | "webStaticSites": "stapp-"
138 | }
--------------------------------------------------------------------------------
/infra/core/acr/acr.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | @description('The name of the Container Registry resource. Will be automatically generated if not provided.')
5 | param registryName string
6 |
7 | @description('The location of the Container Registry resource.')
8 | param location string = resourceGroup().location
9 |
10 | resource registry 'Microsoft.ContainerRegistry/registries@2023-11-01-preview' = {
11 | name: registryName
12 | location: location
13 | sku: {
14 | name: 'Standard'
15 | }
16 | properties: {
17 | adminUserEnabled: false
18 | encryption: {
19 | status: 'disabled'
20 | }
21 | dataEndpointEnabled: false
22 | publicNetworkAccess: 'Enabled'
23 | networkRuleBypassOptions: 'AzureServices'
24 | zoneRedundancy: 'Disabled'
25 | anonymousPullEnabled: false
26 | metadataSearch: 'Disabled'
27 | }
28 | }
29 |
30 | output name string = registry.name
31 | output id string = registry.id
32 | output loginServer string = registry.properties.loginServer
33 |
--------------------------------------------------------------------------------
/infra/core/ai-search/ai-search.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | @description('The name of the AI Search instance.')
5 | param name string
6 |
@description('The location of the AI Search instance.')
param location string = resourceGroup().location
9 |
10 | @allowed(['enabled', 'disabled'])
11 | param publicNetworkAccess string = 'enabled'
12 |
13 | resource search 'Microsoft.Search/searchServices@2024-06-01-preview' = {
14 | name: name
15 | location: location
16 | sku: {
17 | name: 'standard'
18 | }
19 | properties: {
20 | disableLocalAuth: true
21 | replicaCount: 1
22 | partitionCount: 1
23 | publicNetworkAccess: publicNetworkAccess
24 | networkRuleSet: {
25 | ipRules: []
26 | bypass: 'AzureServices'
27 | }
28 | semanticSearch: 'disabled'
29 | }
30 | }
31 |
32 | output name string = search.name
33 | output id string = search.id
34 |
--------------------------------------------------------------------------------
/infra/core/aoai/aoai.bicep:
--------------------------------------------------------------------------------
1 | @description('Name of the Azure OpenAI instance')
2 | param openAiName string
3 |
4 | @description('Location for the Azure OpenAI instance')
5 | param location string = resourceGroup().location
6 |
7 | @description('LLM model name')
8 | param llmModelName string = 'gpt-4o'
9 |
10 | @description('LLM model deployment name')
11 | param llmModelDeploymentName string = 'gpt-4o'
12 |
13 | @description('LLM Model API version')
14 | param llmModelVersion string
15 |
16 | @description('Embedding model name')
17 | param embeddingModelName string = 'text-embedding-ada-002'
18 |
19 | @description('Embedding model deployment name')
20 | param embeddingModelDeploymentName string = 'text-embedding-ada-002'
21 |
22 | @description('Embedding Model API version')
23 | param embeddingModelVersion string
24 |
25 | @description('TPM quota for the LLM model (x1000)')
26 | param llmTpmQuota int = 1
27 |
28 | @description('TPM quota for the embedding model (x1000)')
29 | param embeddingTpmQuota int = 1
30 |
31 | resource aoai 'Microsoft.CognitiveServices/accounts@2024-10-01' = {
32 | name: openAiName
33 | location: location
34 | sku: {
35 | name: 'S0'
36 | }
37 | kind: 'OpenAI'
38 | properties: {
39 | publicNetworkAccess: 'Enabled'
40 | disableLocalAuth: true
41 | }
42 | }
43 |
44 | resource llmDeployment 'Microsoft.CognitiveServices/accounts/deployments@2024-10-01' = {
45 | parent: aoai
46 | name: llmModelDeploymentName // model deployment name
47 | sku: {
48 | name: 'GlobalStandard'
49 | capacity: llmTpmQuota
50 | }
51 | properties: {
52 | model: {
53 | format: 'OpenAI'
54 | name: llmModelName // model name
55 | version: llmModelVersion
56 | }
57 | currentCapacity: llmTpmQuota
58 | }
59 | }
60 |
61 | resource embeddingDeployment 'Microsoft.CognitiveServices/accounts/deployments@2024-10-01' = {
62 | parent: aoai
63 | name: embeddingModelDeploymentName // model deployment name
64 | // NOTE: simultaneous AOAI model deployments are not supported at this time. As a workaround, use dependsOn to force the models to get deployed sequentially.
65 | dependsOn: [llmDeployment]
66 | sku: {
67 | name: 'Standard'
68 | capacity: embeddingTpmQuota
69 | }
70 | properties: {
71 | model: {
72 | format: 'OpenAI'
73 | name: embeddingModelName // model name
74 | version: embeddingModelVersion
75 | }
76 | currentCapacity: embeddingTpmQuota
77 | }
78 | }
79 |
80 | output name string = aoai.name
81 | output id string = aoai.id
82 | output endpoint string = aoai.properties.endpoint
83 | output llmModel string = llmDeployment.properties.model.name
84 | output llmModelDeploymentName string = llmDeployment.name
85 | output llmModelQuota int = llmDeployment.sku.capacity
86 | output llmModelVersion string = llmDeployment.apiVersion
87 | output embeddingModel string = embeddingDeployment.properties.model.name
88 | output embeddingModelDeploymentName string = embeddingDeployment.name
89 | output embeddingModelQuota int = embeddingDeployment.sku.capacity
90 | output embeddingModelVersion string = embeddingDeployment.apiVersion
91 |
--------------------------------------------------------------------------------
/infra/core/apim/apim.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | @description('The name of the API Management service instance')
5 | param apiManagementName string
6 |
7 | @description('The email address of the owner of the service')
8 | @minLength(1)
9 | param publisherEmail string
10 |
11 | @description('The name of the owner of the service')
12 | @minLength(1)
13 | param publisherName string
14 |
15 | @description('The pricing tier of this API Management service')
16 | @allowed([
17 | 'Developer'
18 | 'StandardV2'
19 | ])
20 | param sku string = 'Developer'
21 |
22 | @description('The instance size of this API Management service. This should be a multiple of the number of availability zones getting deployed.')
23 | param skuCount int = 1
24 |
25 | @description('Application Insights resource ID')
26 | param appInsightsId string
27 |
28 | @description('Application Insights instrumentation key')
29 | param appInsightsInstrumentationKey string
30 |
31 | @description('Azure region where the resources will be deployed')
32 | param location string = resourceGroup().location
33 |
34 | @description('Numbers for availability zones, for example, 1,2,3.')
35 | param availabilityZones array = [
36 | '1'
37 | '2'
38 | ]
39 |
40 | @description('Name for the public IP address used to access the API Management service.')
41 | param publicIpName string = 'apimPublicIP'
42 |
43 | @description('SKU for the public IP address used to access the API Management service.')
44 | @allowed([
45 | 'Standard'
46 | ])
47 | param publicIpSku string = 'Standard'
48 |
49 | @description('Allocation method for the public IP address used to access the API Management service. Standard SKU public IP requires `Static` allocation.')
50 | @allowed([
51 | 'Static'
52 | ])
53 | param publicIPAllocationMethod string = 'Static'
54 |
55 | @description('Unique DNS name for the public IP address used to access the API management service.')
56 | param dnsLabelPrefix string = toLower('${publicIpName}-${uniqueString(resourceGroup().id)}')
57 |
58 | param restoreAPIM bool = false
59 | param subnetId string
60 |
61 | resource publicIp 'Microsoft.Network/publicIPAddresses@2024-01-01' = {
62 | name: publicIpName
63 | location: location
64 | sku: {
65 | name: publicIpSku
66 | }
67 | properties: {
68 | publicIPAllocationMethod: publicIPAllocationMethod
69 | publicIPAddressVersion: 'IPv4'
70 | dnsSettings: {
71 | domainNameLabel: dnsLabelPrefix
72 | }
73 | }
74 | }
75 |
76 | resource apiManagementService 'Microsoft.ApiManagement/service@2023-09-01-preview' = { // API Management instance attached to the given subnet and the public IP declared above.
77 | name: apiManagementName
78 | location: location
79 | sku: {
80 | name: sku // NOTE(review): sku defaults to 'Developer' while availabilityZones defaults to ['1', '2'] — confirm the chosen SKU supports zones.
81 | capacity: skuCount
82 | }
83 | zones: ((length(availabilityZones) == 0) ? null : availabilityZones) // Pass null rather than an empty array when no zones are requested.
84 | properties: {
85 | restore: restoreAPIM
86 | publisherEmail: publisherEmail
87 | publisherName: publisherName
88 | virtualNetworkType: 'External'
89 | publicIpAddressId: publicIp.id
90 | virtualNetworkConfiguration: {
91 | subnetResourceId: subnetId
92 | }
93 | customProperties: { // Every cipher, protocol, and HTTP/2 toggle listed below is set to 'false'.
94 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA': 'false'
95 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA': 'false'
96 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TLS_RSA_WITH_AES_128_GCM_SHA256': 'false'
97 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TLS_RSA_WITH_AES_256_CBC_SHA256': 'false'
98 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TLS_RSA_WITH_AES_128_CBC_SHA256': 'false'
99 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TLS_RSA_WITH_AES_256_CBC_SHA': 'false'
100 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TLS_RSA_WITH_AES_128_CBC_SHA': 'false'
101 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Ciphers.TripleDes168': 'false'
102 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Protocols.Tls10': 'false'
103 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Protocols.Tls11': 'false'
104 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Protocols.Ssl30': 'false'
105 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Backend.Protocols.Tls10': 'false'
106 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Backend.Protocols.Tls11': 'false'
107 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Security.Backend.Protocols.Ssl30': 'false'
108 | 'Microsoft.WindowsAzure.ApiManagement.Gateway.Protocols.Server.Http2': 'false'
109 | }
110 | }
111 | }
112 |
113 | resource apimLogger 'Microsoft.ApiManagement/service/loggers@2024-06-01-preview' = { // NOTE(review): API version differs from the 2023-09-01-preview used by the parent service and diagnostics — confirm this is intentional.
114 | name: 'apimLogger'
115 | parent: apiManagementService
116 | properties: {
117 | credentials: {
118 | instrumentationKey: appInsightsInstrumentationKey
119 | }
120 | description: 'Application Insights for APIM'
121 | loggerType: 'applicationInsights'
122 | resourceId: appInsightsId // Resource ID of the Application Insights component this logger targets.
123 | }
124 | }
125 |
126 | resource apimDiagnostics 'Microsoft.ApiManagement/service/diagnostics@2023-09-01-preview' = { // Service-level diagnostics wired to the Application Insights logger above.
127 | name: 'applicationinsights'
128 | parent: apiManagementService
129 | properties: {
130 | loggerId: apimLogger.id
131 | alwaysLog: 'allErrors'
132 | verbosity: 'information'
133 | sampling: {
134 | percentage: 100 // Fixed sampling at 100 percent.
135 | samplingType: 'fixed'
136 | }
137 | }
138 | }
139 |
140 | output name string = apiManagementService.name
141 | output id string = apiManagementService.id
142 | output apimGatewayUrl string = apiManagementService.properties.gatewayUrl
143 |
--------------------------------------------------------------------------------
/infra/core/apim/apim.graphrag-api.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | param apiManagementName string
5 | param name string
6 | param backendUrl string
7 |
8 | resource api 'Microsoft.ApiManagement/service/apis@2023-09-01-preview' = { // GraphRAG API imported into an existing APIM service from an OpenAPI document.
9 | name: '${apiManagementName}/${name}' // Child resource addressed by '<service>/<api>' name instead of a `parent` reference.
10 | properties: {
11 | displayName: 'GraphRAG'
12 | apiRevision: '1'
13 | subscriptionRequired: true
14 | serviceUrl: backendUrl
15 | path: '' // Empty path: the API is served from the gateway root.
16 | protocols: ['https']
17 | authenticationSettings: {
18 | oAuth2AuthenticationSettings: []
19 | openidAuthenticationSettings: []
20 | }
21 | subscriptionKeyParameterNames: {
22 | header: 'Ocp-Apim-Subscription-Key'
23 | query: 'subscription-key'
24 | }
25 | isCurrent: true
26 | format: 'openapi+json'
27 | value: string(loadJsonContent('openapi.json')) // local file will be dynamically created by deployment script
28 | }
29 | resource apiPolicy 'policies@2022-08-01' = { // NOTE(review): nested policy uses API version 2022-08-01 while the parent uses 2023-09-01-preview — confirm this is intentional.
30 | name: 'policy'
31 | properties: {
32 | format: 'rawxml'
33 | value: loadTextContent('policies/apiPolicy.xml') // Policy XML is embedded from the local file at compile time.
34 | }
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/infra/core/apim/apim.graphrag-docs-api.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | param apiManagementName string
5 | param backendUrl string
6 |
7 | resource api_docs 'Microsoft.ApiManagement/service/apis@2023-09-01-preview' = { // Documentation API exposing the backend's '/manpage' routes through APIM.
8 | name: '${apiManagementName}/documentation'
9 | properties: {
10 | displayName: 'documentation'
11 | apiRevision: '1'
12 | subscriptionRequired: false // No subscription key is required for this API.
13 | serviceUrl: '${backendUrl}/manpage'
14 | path: 'manpage'
15 | protocols: ['https']
16 | authenticationSettings: {
17 | oAuth2AuthenticationSettings: []
18 | openidAuthenticationSettings: []
19 | }
20 | subscriptionKeyParameterNames: {
21 | header: 'Ocp-Apim-Subscription-Key'
22 | query: 'subscription-key'
23 | }
24 | isCurrent: true
25 | }
26 |
27 | resource documentation_docs 'operations@2023-09-01-preview' = { // GET /docs operation.
28 | name: 'docs'
29 | properties: {
30 | displayName: 'docs'
31 | method: 'GET'
32 | urlTemplate: '/docs'
33 | templateParameters: []
34 | responses: []
35 | }
36 | }
37 |
38 | resource documentation_openapi 'operations@2023-09-01-preview' = { // GET /openapi.json operation.
39 | name: 'openapi'
40 | properties: {
41 | displayName: 'openapi'
42 | method: 'GET'
43 | urlTemplate: '/openapi.json'
44 | templateParameters: []
45 | responses: []
46 | }
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/infra/core/apim/openapi.json:
--------------------------------------------------------------------------------
1 | {
2 | "comment": "This file is a placeholder for an OpenAPI specification. It will get replaced with an actual OpenAPI spec during the deployment process."
3 | }
--------------------------------------------------------------------------------
/infra/core/apim/policies/apiPolicy.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | *
7 |
8 |
9 | *
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/infra/core/cosmosdb/cosmosdb.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | @description('The name of the CosmosDB resource.')
5 | param cosmosDbName string
6 |
7 | @description('The location of the CosmosDB resource.')
8 | param location string = resourceGroup().location
9 |
10 | @allowed(['Enabled', 'Disabled'])
11 | param publicNetworkAccess string = 'Disabled'
12 |
13 | var maxThroughput = 1000
14 |
15 | resource cosmosDb 'Microsoft.DocumentDB/databaseAccounts@2024-11-15' = { // Single-region Cosmos DB account with a system-assigned identity.
16 | name: cosmosDbName
17 | location: location
18 | tags: {
19 | defaultExperience: 'Core (SQL)'
20 | 'hidden-cosmos-mmspecial': ''
21 | }
22 | kind: 'GlobalDocumentDB'
23 | identity: {
24 | type: 'SystemAssigned'
25 | }
26 | properties: {
27 | publicNetworkAccess: publicNetworkAccess // Parameter defaults to 'Disabled'.
28 | enableAutomaticFailover: false
29 | enableMultipleWriteLocations: false
30 | isVirtualNetworkFilterEnabled: false
31 | virtualNetworkRules: []
32 | disableKeyBasedMetadataWriteAccess: false
33 | enableFreeTier: false
34 | enableAnalyticalStorage: false
35 | analyticalStorageConfiguration: {
36 | schemaType: 'WellDefined'
37 | }
38 | databaseAccountOfferType: 'Standard'
39 | defaultIdentity: 'FirstPartyIdentity'
40 | networkAclBypass: 'None'
41 | disableLocalAuth: true // Local (key-based) authentication is turned off for this account.
42 | enablePartitionMerge: false
43 | minimalTlsVersion: 'Tls12'
44 | consistencyPolicy: {
45 | defaultConsistencyLevel: 'Session'
46 | maxIntervalInSeconds: 5
47 | maxStalenessPrefix: 100
48 | }
49 | locations: [
50 | {
51 | locationName: location // The only configured location; same region as the account.
52 | failoverPriority: 0
53 | isZoneRedundant: false
54 | }
55 | ]
56 | cors: []
57 | capabilities: []
58 | ipRules: []
59 | backupPolicy: {
60 | type: 'Periodic'
61 | periodicModeProperties: {
62 | backupIntervalInMinutes: 240
63 | backupRetentionIntervalInHours: 8
64 | backupStorageRedundancy: 'Geo'
65 | }
66 | }
67 | networkAclBypassResourceIds: []
68 | capacity: {
69 | totalThroughputLimit: maxThroughput // Same value (1000) that caps the database autoscale setting below.
70 | }
71 | }
72 | }
73 |
74 | // create a single database that is used to maintain state information for graphrag indexing
75 | // NOTE: The current CosmosDB role assignments are not sufficient to allow the aks workload identity to create databases and containers so we must do it in bicep at deployment time.
76 | // TODO: Identify and assign appropriate RBAC roles that allow the workload identity to create new databases and containers instead of relying on this bicep implementation.
77 | resource graphragDatabase 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases@2024-11-15' = { // SQL database 'graphrag' created at deployment time (see the NOTE above).
78 | parent: cosmosDb
79 | name: 'graphrag'
80 | properties: {
81 | options: {
82 | autoscaleSettings: {
83 | maxThroughput: maxThroughput // Autoscale ceiling; equals the account's totalThroughputLimit.
84 | }
85 | }
86 | resource: {
87 | id: 'graphrag'
88 | }
89 | }
90 | }
91 |
92 | resource jobsContainer 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers@2024-11-15' = { // Container 'jobs' inside the graphrag database.
93 | parent: graphragDatabase
94 | name: 'jobs'
95 | properties: {
96 | resource: {
97 | id: 'jobs'
98 | indexingPolicy: {
99 | indexingMode: 'consistent'
100 | automatic: true
101 | includedPaths: [
102 | {
103 | path: '/*' // Index every path...
104 | }
105 | ]
106 | excludedPaths: [
107 | {
108 | path: '/"_etag"/?' // ...except the _etag property.
109 | }
110 | ]
111 | }
112 | partitionKey: {
113 | paths: [
114 | '/id' // Items are partitioned by their own id.
115 | ]
116 | kind: 'Hash'
117 | version: 2
118 | }
119 | uniqueKeyPolicy: {
120 | uniqueKeys: []
121 | }
122 | conflictResolutionPolicy: {
123 | mode: 'LastWriterWins'
124 | conflictResolutionPath: '/_ts'
125 | }
126 | }
127 | }
128 | }
129 |
130 | resource containerStoreContainer 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers@2024-11-15' = { // Container 'container-store'; same indexing, partitioning, and conflict settings as 'jobs'.
131 | parent: graphragDatabase
132 | name: 'container-store'
133 | properties: {
134 | resource: {
135 | id: 'container-store'
136 | indexingPolicy: {
137 | indexingMode: 'consistent'
138 | automatic: true
139 | includedPaths: [
140 | {
141 | path: '/*' // Index every path...
142 | }
143 | ]
144 | excludedPaths: [
145 | {
146 | path: '/"_etag"/?' // ...except the _etag property.
147 | }
148 | ]
149 | }
150 | partitionKey: {
151 | paths: [
152 | '/id' // Items are partitioned by their own id.
153 | ]
154 | kind: 'Hash'
155 | version: 2
156 | }
157 | uniqueKeyPolicy: {
158 | uniqueKeys: []
159 | }
160 | conflictResolutionPolicy: {
161 | mode: 'LastWriterWins'
162 | conflictResolutionPath: '/_ts'
163 | }
164 | }
165 | }
166 | }
167 |
168 | output name string = cosmosDb.name
169 | output id string = cosmosDb.id
170 | output endpoint string = cosmosDb.properties.documentEndpoint
171 |
--------------------------------------------------------------------------------
/infra/core/identity/identity.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | @description('The name of the identity')
5 | param name string
6 |
7 | @description('The location of the identity')
8 | param location string = resourceGroup().location
9 |
10 | @description('federated name: FederatedIdentityCredentialProperties. See https://learn.microsoft.com/en-us/azure/templates/microsoft.managedidentity/userassignedidentities/federatedidentitycredentials?pivots=deployment-language-bicep#federatedidentitycredentialproperties')
11 | param federatedCredentials object = {}
12 |
13 | resource identity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' = { // User-assigned managed identity; its clientId and principalId are exported as outputs.
14 | name: name
15 | location: location
16 | }
17 |
18 | resource federatedCredentialResources 'Microsoft.ManagedIdentity/userAssignedIdentities/federatedIdentityCredentials@2023-01-31' = [
19 | for federatedCredential in items(federatedCredentials): { // One credential per entry of the federatedCredentials object.
20 | name: federatedCredential.key // The object key becomes the credential name.
21 | parent: identity
22 | properties: federatedCredential.value // The object value is used verbatim as the credential properties.
23 | }
24 | ]
25 |
26 | output name string = identity.name
27 | output id string = identity.id
28 | output clientId string = identity.properties.clientId
29 | output principalId string = identity.properties.principalId
30 |
--------------------------------------------------------------------------------
/infra/core/log-analytics/log.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | @description('The name of the Log Analytics resource.')
5 | param name string
6 |
7 | @description('The location of the Log Analytics resource.')
8 | param location string = resourceGroup().location
9 |
10 | @description('The public network access for ingestion.')
11 | param publicNetworkAccessForIngestion string = 'Disabled'
12 |
13 | resource logAnalyticsWorkspace 'Microsoft.OperationalInsights/workspaces@2022-10-01' = { // Log Analytics workspace with fixed 30-day retention.
14 | name: name
15 | location: location
16 | properties: {
17 | retentionInDays: 30
18 | publicNetworkAccessForIngestion: publicNetworkAccessForIngestion // Parameter defaults to 'Disabled'.
19 | publicNetworkAccessForQuery: 'Enabled' // Query access is always enabled; it is not parameterized.
20 | features: {
21 | immediatePurgeDataOn30Days: true
22 | }
23 | }
24 | }
25 |
26 | output name string = logAnalyticsWorkspace.name
27 | output id string = logAnalyticsWorkspace.id
28 |
--------------------------------------------------------------------------------
/infra/core/monitor/app-insights.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | @description('Application Insights resource name')
5 | param appInsightsName string = 'appi'
6 |
7 | @description('Azure region where the resources will be deployed')
8 | param location string = resourceGroup().location
9 |
10 | @description('Application Insights public network access for ingestion')
11 | param appInsightsPublicNetworkAccessForIngestion string = 'Disabled'
12 |
13 | @description('Workspace id of a Log Analytics resource.')
14 | param logAnalyticsWorkspaceId string
15 |
16 | resource appInsights 'Microsoft.Insights/components@2020-02-02' = { // Application Insights component linked to the given Log Analytics workspace.
17 | name: appInsightsName
18 | location: location
19 | kind: 'web'
20 | properties: {
21 | Application_Type: 'web'
22 | WorkspaceResourceId: logAnalyticsWorkspaceId
23 | publicNetworkAccessForIngestion: appInsightsPublicNetworkAccessForIngestion // Parameter defaults to 'Disabled'.
24 | publicNetworkAccessForQuery: 'Enabled' // Query access is always enabled; it is not parameterized.
25 | }
26 | }
27 |
28 | output name string = appInsights.name
29 | output id string = appInsights.id
30 | output connectionString string = appInsights.properties.ConnectionString
31 | output instrumentationKey string = appInsights.properties.InstrumentationKey
32 |
--------------------------------------------------------------------------------
/infra/core/monitor/private-link-scope.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | param privateLinkScopeName string
5 | param privateLinkScopedResources array = []
6 | param queryAccessMode string = 'Open'
7 | param ingestionAccessMode string = 'PrivateOnly'
8 |
9 | resource privateLinkScope 'Microsoft.Insights/privateLinkScopes@2021-07-01-preview' = { // Private link scope deployed to the 'global' location.
10 | name: privateLinkScopeName
11 | location: 'global'
12 | properties: {
13 | accessModeSettings: {
14 | queryAccessMode: queryAccessMode // Parameter defaults to 'Open'.
15 | ingestionAccessMode: ingestionAccessMode // Parameter defaults to 'PrivateOnly'.
16 | }
17 | }
18 | }
19 |
20 | resource scopedResources 'Microsoft.Insights/privateLinkScopes/scopedResources@2021-07-01-preview' = [
21 | for id in privateLinkScopedResources: { // One scoped resource per resource ID in the input array.
22 | name: uniqueString(id) // Deterministic name derived from the linked resource ID.
23 | parent: privateLinkScope
24 | properties: {
25 | linkedResourceId: id
26 | }
27 | }
28 | ]
29 |
30 | output name string = privateLinkScope.name
31 | output id string = privateLinkScope.id
32 |
--------------------------------------------------------------------------------
/infra/core/rbac/aks-rbac.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | @description('Array of objects with fields principalId, principalType, roleDefinitionId')
5 | param roleAssignments array = []
6 |
7 | resource roleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = [
8 | for role in roleAssignments: { // One resource-group-scoped role assignment per input entry.
9 | // note: the guid must be globally unique and deterministic (reproducible) across Azure
10 | name: guid(
11 | subscription().subscriptionId,
12 | resourceGroup().name,
13 | role.principalId,
14 | role.principalType,
15 | role.roleDefinitionId
16 | )
17 | scope: resourceGroup()
18 | properties: role // NOTE(review): passed through unchanged, unlike aoai-rbac.bicep which wraps roleDefinitionId in resourceId(...) — presumably callers supply a full role definition resource ID here; verify.
19 | }
20 | ]
21 |
--------------------------------------------------------------------------------
/infra/core/rbac/aoai-rbac.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 | // This Bicep module assigns RBAC roles scoped to an existing Azure OpenAI resource.
4 |
5 | param name string
6 |
7 | @description('Array of objects with fields principalId, principalType, roleDefinitionId')
8 | param roleAssignments array = []
9 |
10 | resource aoai 'Microsoft.CognitiveServices/accounts@2024-10-01' existing = { // Reference to an already-deployed account; nothing is created here.
11 | name: name
12 | }
13 |
14 | resource roleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = [
15 | for role in roleAssignments: { // One role assignment per input entry, scoped to the existing account.
16 | // note: the guid must be globally unique and deterministic (reproducible) across Azure
17 | name: guid(aoai.id, role.principalId, role.principalType, role.roleDefinitionId)
18 | scope: aoai
19 | properties: {
20 | principalId: role.principalId
21 | roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', role.roleDefinitionId) // The input value is expanded into a role definition resource ID.
22 | principalType: role.principalType
23 | }
24 | }
25 | ]
26 |
27 | // output the name, id, and endpoint of the Azure OpenAI resource
28 | output name string = aoai.name
29 | output id string = aoai.id
30 | output endpoint string = aoai.properties.endpoint
31 |
--------------------------------------------------------------------------------
/infra/core/storage/storage.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | @description('The name of the Storage Account resource.')
5 | param name string
6 |
7 | @description('The location of the Storage Account resource.')
8 | param location string = resourceGroup().location
9 |
10 | @allowed(['Hot', 'Cool', 'Premium'])
11 | param accessTier string = 'Hot'
12 |
13 | @allowed(['AzureDnsZone', 'Standard'])
14 | param dnsEndpointType string = 'Standard'
15 |
16 | @allowed(['Enabled', 'Disabled'])
17 | param publicNetworkAccess string = 'Disabled'
18 |
19 | param tags object = {}
20 | param allowBlobPublicAccess bool = false
21 | param allowCrossTenantReplication bool = true
22 | param allowSharedKeyAccess bool = false
23 | param defaultToOAuthAuthentication bool = false
24 | param deleteRetentionPolicy object = {}
25 | param kind string = 'StorageV2'
26 | param minimumTlsVersion string = 'TLS1_2'
27 | param containers array = []
28 |
29 | resource storage 'Microsoft.Storage/storageAccounts@2023-01-01' = { // Storage account with optional blob containers.
30 | name: name
31 | location: location
32 | tags: tags
33 | kind: kind
34 | sku: { name: 'Standard_LRS' } // SKU is hard-coded; it is not parameterized.
35 | properties: {
36 | accessTier: accessTier
37 | allowBlobPublicAccess: allowBlobPublicAccess
38 | allowCrossTenantReplication: allowCrossTenantReplication
39 | allowSharedKeyAccess: allowSharedKeyAccess // Parameter defaults to false.
40 | defaultToOAuthAuthentication: defaultToOAuthAuthentication
41 | dnsEndpointType: dnsEndpointType
42 | isHnsEnabled: true // Hierarchical namespace is always enabled.
43 | minimumTlsVersion: minimumTlsVersion
44 | networkAcls: {
45 | bypass: 'AzureServices'
46 | defaultAction: 'Allow'
47 | }
48 | publicNetworkAccess: publicNetworkAccess // Parameter defaults to 'Disabled'.
49 | }
50 |
51 | resource blobServices 'blobServices' = if (!empty(containers)) { // Only deployed when at least one container is requested.
52 | name: 'default'
53 | properties: {
54 | deleteRetentionPolicy: deleteRetentionPolicy
55 | }
56 | resource container 'containers' = [
57 | for container in containers: {
58 | name: container.name
59 | properties: {
60 | publicAccess: container.?publicAccess ?? 'None' // Falls back to 'None' when the entry has no publicAccess.
61 | }
62 | }
63 | ]
64 | }
65 | }
66 |
67 | output name string = storage.name
68 | output id string = storage.id
69 | output primaryEndpoints object = storage.properties.primaryEndpoints
70 |
--------------------------------------------------------------------------------
/infra/core/vnet/private-dns-vnet-link.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | param vnetId string
5 | param privateDnsZoneName string
6 | var vnet_id_hash = uniqueString(vnetId)
7 |
8 | resource dnsZone 'Microsoft.Network/privateDnsZones@2024-06-01' = { // Declares the private DNS zone itself (not an `existing` reference).
9 | name: privateDnsZoneName
10 | location: 'global'
11 | properties: {}
12 | }
13 |
14 | resource dnsZoneLinks 'Microsoft.Network/privateDnsZones/virtualNetworkLinks@2024-06-01' = { // Links the zone to the given virtual network.
15 | name: 'vnet-link-${privateDnsZoneName}-${vnet_id_hash}' // Name includes a hash of the vnet ID.
16 | location: 'global'
17 | parent: dnsZone
18 | properties: {
19 | registrationEnabled: false
20 | virtualNetwork: {
21 | id: vnetId
22 | }
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/infra/core/vnet/private-dns-zone-a-record.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | @description('DNS name')
5 | param name string
6 |
7 | @description('DNS zone name to create the record in')
8 | param dnsZoneName string
9 |
10 | @description('TTL in seconds')
11 | param ttl int = 900
12 |
13 | @description('The IP address')
14 | param ipv4Address string
15 |
16 | resource dnsZone 'Microsoft.Network/privateDnsZones@2024-06-01' existing = { // The zone must already exist; it is only referenced here.
17 | name: dnsZoneName
18 | }
19 |
20 | resource aRecord 'Microsoft.Network/privateDnsZones/A@2024-06-01' = { // A record holding a single IPv4 address in the referenced zone.
21 | name: name
22 | parent: dnsZone
23 | properties: {
24 | ttl: ttl // Parameter defaults to 900 seconds.
25 | aRecords: [
26 | {
27 | ipv4Address: ipv4Address
28 | }
29 | ]
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/infra/core/vnet/private-dns-zone-groups.json:
--------------------------------------------------------------------------------
1 | {
2 | "azureCloud": {
3 | "aiSearch": "privatelink.search.windows.net",
4 | "azureMonitor": [
5 | "privatelink.monitor.azure.com",
6 | "privatelink.oms.opinsights.azure.com",
7 | "privatelink.agentsvc.azure-automation.net",
8 | "privatelink.ods.opinsights.azure.com"
9 | ],
10 | "blobStorage": "privatelink.blob.core.windows.net",
11 | "cosmosDB": "privatelink.documents.azure.com"
12 | },
13 | "azureusgovernment": {
14 | "aiSearch": "privatelink.search.azure.us",
15 | "azureMonitor": [
16 | "privatelink.monitor.azure.us",
17 | "privatelink.oms.opinsights.azure.us",
18 | "privatelink.agentsvc.azure-automation.us",
19 | "privatelink.ods.opinsights.azure.us"
20 | ],
21 | "blobStorage": "privatelink.blob.core.usgovcloudapi.net",
22 | "cosmosDB": "privatelink.documents.azure.us"
23 | }
24 | }
--------------------------------------------------------------------------------
/infra/core/vnet/private-dns-zone.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | @description('The name of the private DNS zone.')
5 | param name string
6 |
7 | @description('The name of the virtual networks the DNS zone should be associated with.')
8 | param vnetName string
9 |
10 | resource vnet 'Microsoft.Network/virtualNetworks@2024-05-01' existing = { // Looked up by name in the current resource group scope to obtain its ID.
11 | name: vnetName
12 | }
13 |
14 | resource dnsZone 'Microsoft.Network/privateDnsZones@2024-06-01' = { // Private DNS zone whose name and id are exported as outputs.
15 | name: name
16 | location: 'global'
17 | properties: {}
18 | }
19 |
20 | resource dnsZoneLinks 'Microsoft.Network/privateDnsZones/virtualNetworkLinks@2024-06-01' = { // Links the zone to the existing virtual network.
21 | name: vnetName // The link is named after the virtual network.
22 | location: 'global'
23 | parent: dnsZone
24 | properties: {
25 | registrationEnabled: false
26 | virtualNetwork: {
27 | id: vnet.id
28 | }
29 | }
30 | }
31 |
32 | output name string = dnsZone.name
33 | output id string = dnsZone.id
34 |
--------------------------------------------------------------------------------
/infra/core/vnet/private-endpoint.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | @description('Resource ID of service the private endpoint is for')
5 | param privateLinkServiceId string
6 |
7 | @description('The resource ID of the subnet to deploy the private endpoint to')
8 | param subnetId string
9 |
10 | @description('Array of private DNS zone configs to associate with the private endpoint')
11 | param privateDnsZoneConfigs array // Assigned unchanged to the DNS zone group's privateDnsZoneConfigs property.
12 |
13 | param privateEndpointName string
14 | param groupId string
15 | param location string = resourceGroup().location
16 |
17 | resource privateEndpoint 'Microsoft.Network/privateEndpoints@2024-05-01' = { // Private endpoint for one target service and one group ID.
18 | name: privateEndpointName
19 | location: location
20 | properties: {
21 | privateLinkServiceConnections: [
22 | {
23 | name: privateEndpointName // The connection reuses the endpoint's name.
24 | properties: {
25 | privateLinkServiceId: privateLinkServiceId
26 | groupIds: [groupId]
27 | }
28 | }
29 | ]
30 | subnet: {
31 | id: subnetId
32 | }
33 | }
34 | }
35 |
36 | resource privateDnsZoneGroup 'Microsoft.Network/privateEndpoints/privateDnsZoneGroups@2024-05-01' = { // Associates the endpoint with the supplied private DNS zone configs.
37 | name: groupId // The zone group is named after the group ID.
38 | parent: privateEndpoint
39 | properties: {
40 | privateDnsZoneConfigs: privateDnsZoneConfigs
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/infra/core/vnet/privatelink-private-dns-zones.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | @description('The virtual network ID to link to')
5 | param linkedVnetId string
6 |
7 | var privateDnsZoneData = loadJsonContent('private-dns-zone-groups.json') // for more information: https://learn.microsoft.com/en-us/azure/azure-government/compare-azure-government-global-azure
8 | var cloudName = toLower(environment().name) // NOTE(review): the JSON top-level key is spelled 'azureCloud'; this lower-cased lookup presumably relies on case-insensitive property resolution — confirm.
9 |
10 | var aiSearchPrivateDnsZoneName = privateDnsZoneData[cloudName].aiSearch
11 | var blobStoragePrivateDnsZoneName = privateDnsZoneData[cloudName].blobStorage
12 | var cosmosDbPrivateDnsZoneName = privateDnsZoneData[cloudName].cosmosDB // Key spelled exactly as in private-dns-zone-groups.json.
13 | var storagePrivateDnsZoneNames = [blobStoragePrivateDnsZoneName]
14 | var azureMonitorPrivateDnsZones = privateDnsZoneData[cloudName].azureMonitor
15 |
16 | var privateDnsZones = union( // Combined list of zone names; the resource loop and indexOf lookups below use its order.
17 | azureMonitorPrivateDnsZones,
18 | storagePrivateDnsZoneNames,
19 | [cosmosDbPrivateDnsZoneName],
20 | [aiSearchPrivateDnsZoneName]
21 | )
22 |
23 | resource privateDnsZoneResources 'Microsoft.Network/privateDnsZones@2024-06-01' = [
24 | for name in privateDnsZones: { // One private DNS zone per name, in the same order as privateDnsZones.
25 | name: name
26 | location: 'global'
27 | }
28 | ]
29 |
30 | module dnsVnetLinks 'vnet-dns-link.bicep' = [
31 | for (privateDnsZoneName, index) in privateDnsZones: { // One vnet-link module deployment per zone.
32 | name: replace(privateDnsZoneName, '.', '-') // Deployment name is the zone name with dots replaced by dashes.
33 | params: {
34 | privateDnsZoneName: privateDnsZoneResources[index].name // Referencing the resource (not the string) makes the link depend on the zone.
35 | vnetId: linkedVnetId
36 | }
37 | }
38 | ]
39 |
40 | output azureMonitorPrivateDnsZoneConfigs array = [
41 | for zoneName in union(azureMonitorPrivateDnsZones, [blobStoragePrivateDnsZoneName]): { // Covers the Azure Monitor zones plus the blob storage zone.
42 | name: privateDnsZoneResources[indexOf(privateDnsZones, zoneName)].name // indexOf maps the zone name back to its position in the resource loop.
43 | properties: {
44 | #disable-next-line use-resource-id-functions
45 | privateDnsZoneId: privateDnsZoneResources[indexOf(privateDnsZones, zoneName)].id
46 | }
47 | }
48 | ]
49 |
50 | output blobStoragePrivateDnsZoneConfigs array = [
51 | { // Single config entry for the blob storage zone.
52 | name: blobStoragePrivateDnsZoneName // Uses the variable directly; the sibling outputs read the name from the zone resource instead.
53 | properties: {
54 | #disable-next-line use-resource-id-functions
55 | privateDnsZoneId: privateDnsZoneResources[indexOf(privateDnsZones, blobStoragePrivateDnsZoneName)].id
56 | }
57 | }
58 | ]
59 |
60 | output cosmosDbPrivateDnsZoneConfigs array = [
61 | { // Single config entry for the Cosmos DB zone.
62 | name: privateDnsZoneResources[indexOf(privateDnsZones, cosmosDbPrivateDnsZoneName)].name
63 | properties: {
64 | #disable-next-line use-resource-id-functions
65 | privateDnsZoneId: privateDnsZoneResources[indexOf(privateDnsZones, cosmosDbPrivateDnsZoneName)].id
66 | }
67 | }
68 | ]
69 |
70 | output aiSearchPrivateDnsZoneConfigs array = [
71 | { // Single config entry for the AI Search zone.
72 | name: privateDnsZoneResources[indexOf(privateDnsZones, aiSearchPrivateDnsZoneName)].name
73 | properties: {
74 | #disable-next-line use-resource-id-functions
75 | privateDnsZoneId: privateDnsZoneResources[indexOf(privateDnsZones, aiSearchPrivateDnsZoneName)].id
76 | }
77 | }
78 | ]
79 |
80 | output privateDnsZones array = privateDnsZones
81 |
--------------------------------------------------------------------------------
/infra/core/vnet/vnet-dns-link.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | param privateDnsZoneName string
5 | param vnetId string
6 |
7 | resource privateDnsZone 'Microsoft.Network/privateDnsZones@2024-06-01' existing = { // The zone must already exist; it is only referenced here.
8 | name: privateDnsZoneName
9 | }
10 |
11 | resource dnsVnetLinks 'Microsoft.Network/privateDnsZones/virtualNetworkLinks@2024-06-01' = { // Links the existing zone to the given virtual network.
12 | name: '${replace(privateDnsZoneName, '.', '-')}-${uniqueString(vnetId)}' // Zone name with dots replaced by dashes, suffixed with a hash of the vnet ID.
13 | parent: privateDnsZone
14 | location: 'global'
15 | properties: {
16 | registrationEnabled: false
17 | resolutionPolicy: 'Default'
18 | virtualNetwork: {
19 | id: vnetId
20 | }
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/infra/core/vnet/vnet.bicep:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT License.
3 |
4 | @description('Name of the vnet resource.')
5 | param vnetName string
6 |
7 | @description('Azure region where the resource will be deployed.')
8 | param location string = resourceGroup().location
9 |
10 | @description('Optional prefix to prepend to subnet names.')
11 | param subnetPrefix string = 'snet-'
12 |
13 | @description('APIM tier - used to determine if subnet delegations are required.')
14 | @allowed(['Developer', 'StandardV2'])
15 | param apimTier string
16 |
17 | @description('NSG resource ID.')
18 | param nsgID string
19 |
20 | resource vnet 'Microsoft.Network/virtualNetworks@2024-05-01' = { // Virtual network with two subnets: APIM first, AKS second.
21 | name: vnetName
22 | location: location
23 | properties: {
24 | addressSpace: {
25 | addressPrefixes: [
26 | '10.1.0.0/16'
27 | ]
28 | }
29 | subnets: [ // Order matters: the outputs of this file read subnets by index.
30 | {
31 | name: '${subnetPrefix}apim'
32 | properties: {
33 | addressPrefix: '10.1.0.0/24'
34 | networkSecurityGroup: {
35 | id: nsgID
36 | }
37 | delegations: (apimTier == 'Developer') // No delegation for 'Developer'; any other tier delegates the subnet to Microsoft.Web/serverFarms.
38 | ? []
39 | : [
40 | {
41 | name: 'Microsoft.Web/serverFarms'
42 | properties: {
43 | serviceName: 'Microsoft.Web/serverFarms'
44 | }
45 | }
46 | ]
47 | }
48 | }
49 | {
50 | name: '${subnetPrefix}aks' // This subnet declares service endpoints but no network security group.
51 | properties: {
52 | addressPrefix: '10.1.1.0/24'
53 | serviceEndpoints: [
54 | {
55 | service: 'Microsoft.Storage'
56 | }
57 | {
58 | service: 'Microsoft.Sql'
59 | }
60 | {
61 | service: 'Microsoft.EventHub'
62 | }
63 | ]
64 | }
65 | }
66 | ]
67 | }
68 | }
69 |
70 | output name string = vnet.name
71 | output id string = vnet.id
72 | output apimSubnetId string = vnet.properties.subnets[0].id // Index 0 is the '<subnetPrefix>apim' subnet; depends on declaration order above.
73 | output aksSubnetId string = vnet.properties.subnets[1].id // Index 1 is the '<subnetPrefix>aks' subnet; depends on declaration order above.
74 |
--------------------------------------------------------------------------------
/infra/deploy.parameters.json:
--------------------------------------------------------------------------------
1 | {
2 | "LOCATION": "__LOCATION__",
3 | "RESOURCE_GROUP": "__RESOURCE_GROUP__"
4 | }
--------------------------------------------------------------------------------
/infra/helm/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | This helm chart was created to install graphrag into a kubernetes cluster.
4 |
5 | ## Developer Notes
6 | When updating the helm chart, you can validate your changes locally by rendering the templates with the following `helm` command:
7 |
8 | ```shell
9 | helm template test ./graphrag \
10 | --namespace graphrag \
11 | --set "master.image.repository=registry.azurecr.io/graphrag" \
12 | --set "master.image.tag=latest"
13 | ```
14 |
--------------------------------------------------------------------------------
/infra/helm/graphrag/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 |
--------------------------------------------------------------------------------
/infra/helm/graphrag/Chart.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT License.
3 |
4 | apiVersion: v2
5 | name: graphrag
6 | description: GraphRAG - a graph-based RAG search engine
7 | type: application
8 | version: 0.0.1
9 |
10 | # This is the version number of the application being deployed. This version number should be
11 | # incremented each time you make changes to the application. For graphrag, we define the appVersion to match
12 | # the version of the graphrag library being used.
13 | appVersion: "1.2.0"
14 |
--------------------------------------------------------------------------------
/infra/helm/graphrag/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) Microsoft Corporation.
2 |
3 | MIT License
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/infra/helm/graphrag/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | Thank you for installing {{ .Chart.Name }}.
2 |
3 | Your release is named {{ .Release.Name }}.
4 |
5 | To learn more about the release, try:
6 |
7 | $ helm status {{ .Release.Name }}
8 | $ helm get all {{ .Release.Name }}
--------------------------------------------------------------------------------
/infra/helm/graphrag/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 | {{/*
2 | Expand the name of the chart.
3 | */}}
4 | {{- define "graphrag.name" -}}
5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
6 | {{- end }}
7 |
8 | {{/*
9 | Create a default fully qualified app name.
10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
11 | If release name contains chart name it will be used as a full name.
12 | */}}
13 | {{- define "graphrag.fullname" -}}
14 | {{- if .Values.fullnameOverride }}
15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
16 | {{- else }}
17 | {{- $name := default .Chart.Name .Values.nameOverride }}
18 | {{- if contains $name .Release.Name }}
19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }}
20 | {{- else }}
21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
22 | {{- end }}
23 | {{- end }}
24 | {{- end }}
25 |
26 | {{/*
27 | Create a graphrag-master fully qualified app name.
28 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
29 | If release name contains chart name it will be used as a full name.
30 | */}}
31 | {{- define "graphrag.master.fullname" -}}
32 | {{- if .Values.master.fullnameOverride }}
33 | {{- .Values.master.fullnameOverride | trunc 63 | trimSuffix "-" }}
34 | {{- else }}
35 | {{- $name := default .Chart.Name .Values.nameOverride }}
36 | {{- if contains $name .Release.Name }}
37 | {{- printf "%s-%s" .Release.Name .Values.master.name | trunc 63 | trimSuffix "-" }}
38 | {{- else }}
39 | {{- printf "%s-%s-%s" .Release.Name $name .Values.master.name | trunc 63 | trimSuffix "-" }}
40 | {{- end }}
41 | {{- end }}
42 | {{- end }}
43 |
44 | {{/*
45 | Create chart name and version as used by the chart label.
46 | */}}
47 | {{- define "graphrag.chart" -}}
48 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
49 | {{- end }}
50 |
51 | {{/*
52 | Common labels
53 | */}}
54 | {{- define "graphrag.common.labels" -}}
55 | azure.workload.identity/use: "true"
56 | helm.sh/chart: {{ include "graphrag.chart" . }}
57 | {{ include "graphrag.common.selectorLabels" . }}
58 | {{- if .Chart.AppVersion }}
59 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
60 | {{- end }}
61 | app.kubernetes.io/managed-by: {{ .Release.Service }}
62 | {{- end }}
63 |
64 | {{- define "graphrag.labels" -}}
65 | {{ include "graphrag.common.labels" . }}
66 | {{- end }}
67 |
68 | {{/*
69 | Selector labels
70 | */}}
71 | {{- define "graphrag.common.selectorLabels" -}}
72 | app.kubernetes.io/name: {{ include "graphrag.name" . }}
73 | app.kubernetes.io/instance: {{ .Release.Name }}
74 | {{- end }}
75 | {{/* Labels for graphrag-master resources: common labels plus the same `component` label used by graphrag.master.selectorLabels (included once to avoid duplicate keys). */}}
76 | {{- define "graphrag.master.labels" -}}
77 | {{ include "graphrag.common.labels" . }}
78 | component: {{ .Values.master.name | quote }}
79 | {{- end -}}
80 |
81 | {{- define "graphrag.master.selectorLabels" -}}
82 | {{ include "graphrag.common.selectorLabels" . }}
83 | component: {{ .Values.master.name | quote }}
84 | {{- end -}}
85 |
86 | {{/*
87 | Create the name of the service account to use
88 | */}}
89 | {{- define "graphrag.serviceAccountName" -}}
90 | {{- if .Values.serviceAccount.create }}
91 | {{- default (include "graphrag.fullname" .) .Values.serviceAccount.name }}
92 | {{- else }}
93 | {{- default "default" .Values.serviceAccount.name }}
94 | {{- end }}
95 | {{- end }}
96 |
--------------------------------------------------------------------------------
/infra/helm/graphrag/templates/graphrag-clusterrole.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: ClusterRole
3 | metadata:
4 | name: {{ include "graphrag.fullname" . }}
5 | labels:
6 | {{- include "graphrag.labels" . | nindent 4 }}
7 | rules:
8 | - apiGroups: [""]
9 | resources: ["pods"]
10 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
11 | - apiGroups: ["batch", "extensions"]
12 | resources: ["*"]
13 | verbs: ["*"]
14 |
--------------------------------------------------------------------------------
/infra/helm/graphrag/templates/graphrag-configmap.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 |   labels:
5 |     {{- include "graphrag.labels" . | nindent 4 }}
6 |   name: {{ include "graphrag.fullname" . }}
7 | data:
8 |   {{- toYaml .Values.graphragConfig | nindent 2 }}
9 |   AKS_NAMESPACE: {{ .Release.Namespace | quote }}  # quoted: ConfigMap data values must be strings, even for an all-digit namespace
10 |
--------------------------------------------------------------------------------
/infra/helm/graphrag/templates/graphrag-ingress.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.ingress.enabled -}}
2 | {{- $fullName := include "graphrag.fullname" . -}}
3 | {{- $masterFullName := include "graphrag.master.fullname" . -}}
4 | {{- $masterSvcPort := .Values.master.service.port -}}
5 | {{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
6 | apiVersion: networking.k8s.io/v1
7 | {{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
8 | apiVersion: networking.k8s.io/v1beta1
9 | {{- else -}}
10 | apiVersion: extensions/v1beta1
11 | {{- end }}
12 | kind: Ingress
13 | metadata:
14 | name: {{ $fullName }}
15 | namespace: {{ .Release.Namespace }}
16 | labels:
17 | {{- include "graphrag.labels" . | nindent 4 }}
18 | {{- with .Values.ingress.annotations }}
19 | annotations:
20 | {{- toYaml . | nindent 4 }}
21 | {{- end }}
22 | spec:
23 | {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
24 | ingressClassName: {{ .Values.ingress.className }}
25 | {{- end }}
26 | {{- if .Values.ingress.tls }}
27 | tls:
28 | {{- range .Values.ingress.tls }}
29 | - hosts:
30 | {{- range .hosts }}
31 | - {{ . | quote }}
32 | {{- end }}
33 | secretName: {{ .secretName }}
34 | {{- end }}
35 | {{- end }}
36 | rules:
37 | - host: {{ .Values.ingress.host | quote }}
38 | http:
39 | paths:
40 | - path: "/"
41 | pathType: "Prefix"
42 | backend:
43 | {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }}
44 | service:
45 | name: {{ $masterFullName }}
46 | port:
47 | number: {{ $masterSvcPort }}
48 | {{- else }}
49 | serviceName: {{ $masterFullName }}
50 | servicePort: {{ $masterSvcPort }}
51 | {{- end }}
52 | {{- end }}
53 |
--------------------------------------------------------------------------------
/infra/helm/graphrag/templates/graphrag-master-deployment.yaml:
--------------------------------------------------------------------------------
1 | # Deployment for the graphrag master component. `replicas` is only set when autoscaling is disabled; otherwise the HPA template controls it.
2 | apiVersion: apps/v1
3 | kind: Deployment
4 | metadata:
5 |   name: {{ include "graphrag.master.fullname" . }}
6 |   labels:
7 |     {{- include "graphrag.master.labels" . | nindent 4 }}
8 | spec:
9 |   {{- if not .Values.master.autoscaling.enabled }}
10 |   replicas: {{ .Values.master.replicaCount }}
11 |   {{- end }}
12 |   selector:
13 |     matchLabels:
14 |       {{- include "graphrag.master.selectorLabels" . | nindent 6 }}
15 |   template:
16 |     metadata:
17 |       {{- with .Values.master.podAnnotations }}
18 |       annotations:
19 |         {{- toYaml . | nindent 8 }}
20 |       {{- end }}
21 |       labels:
22 |         date: "{{ now | unixEpoch }}"  # new value on every render, so the pod template changes on each upgrade
23 |         {{- include "graphrag.master.labels" . | nindent 8 }}
24 |         {{- with .Values.master.podLabels }}
25 |         {{- toYaml . | nindent 8 }}
26 |         {{- end }}
27 |     spec:
28 |       serviceAccountName: {{ include "graphrag.serviceAccountName" . }}
29 |       securityContext:
30 |         {{- toYaml .Values.master.podSecurityContext | nindent 8 }}
31 |       {{- with .Values.master.imagePullSecrets }}
32 |       imagePullSecrets:
33 |         {{- toYaml . | nindent 8 }}
34 |       {{- end }}
35 |       containers:
36 |         - name: {{ .Values.master.name }}
37 |           securityContext:
38 |             {{- toYaml .Values.master.securityContext | nindent 12 }}
39 |           image: "{{ .Values.master.image.repository }}:{{ .Values.master.image.tag | default .Chart.AppVersion }}"
40 |           imagePullPolicy: {{ .Values.master.image.pullPolicy }}
41 |           envFrom:
42 |             - configMapRef:
43 |                 name: {{ include "graphrag.fullname" . }}
44 |           ports:
45 |             - name: http
46 |               containerPort: {{ .Values.master.service.port }}
47 |               protocol: TCP
48 |           livenessProbe:
49 |             {{- toYaml .Values.master.livenessProbe | nindent 12 }}
50 |           readinessProbe:
51 |             {{- toYaml .Values.master.readinessProbe | nindent 12 }}
52 |           resources:
53 |             {{- toYaml .Values.master.resources | nindent 12 }}
54 |           volumeMounts:
55 |           {{- with .Values.master.volumeMounts }}
56 |             {{- toYaml . | nindent 12 }}
57 |           {{- end }}
58 |       {{- with .Values.master.volumes }}
59 |       volumes:
60 |         {{- toYaml . | nindent 8 }}
61 |       {{- end }}
62 |       {{- with .Values.master.nodeSelector }}
63 |       nodeSelector:
64 |         {{- toYaml . | nindent 8 }}
65 |       {{- end }}
66 |       {{- with .Values.master.affinity }}
67 |       affinity:
68 |         {{- toYaml . | nindent 8 }}
69 |       {{- end }}
70 |       {{- with .Values.master.tolerations }}
71 |       tolerations:
72 |         {{- toYaml . | nindent 8 }}
73 |       {{- end }}
74 |
--------------------------------------------------------------------------------
/infra/helm/graphrag/templates/graphrag-master-hpa.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.master.autoscaling.enabled }}
2 | apiVersion: autoscaling/v2
3 | kind: HorizontalPodAutoscaler
4 | metadata:
5 | name: {{ include "graphrag.master.fullname" . }}
6 | labels:
7 | {{- include "graphrag.master.labels" . | nindent 4 }}
8 | spec:
9 | scaleTargetRef:
10 | apiVersion: apps/v1
11 | kind: Deployment
12 | name: {{ include "graphrag.master.fullname" . }}
13 | minReplicas: {{ .Values.master.autoscaling.minReplicas }}
14 | maxReplicas: {{ .Values.master.autoscaling.maxReplicas }}
15 | metrics:
16 | {{- if .Values.master.autoscaling.targetCPUUtilizationPercentage }}
17 | - type: Resource
18 | resource:
19 | name: cpu
20 | target:
21 | type: Utilization
22 | averageUtilization: {{ .Values.master.autoscaling.targetCPUUtilizationPercentage }}
23 | {{- end }}
24 | {{- if .Values.master.autoscaling.targetMemoryUtilizationPercentage }}
25 | - type: Resource
26 | resource:
27 | name: memory
28 | target:
29 | type: Utilization
30 | averageUtilization: {{ .Values.master.autoscaling.targetMemoryUtilizationPercentage }}
31 | {{- end }}
32 | {{- end }}
33 |
--------------------------------------------------------------------------------
/infra/helm/graphrag/templates/graphrag-master-service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "graphrag.master.fullname" . }}
5 | {{- if .Values.master.service.annotations }}
6 | annotations:
7 | {{- range $key, $value := .Values.master.service.annotations }}
8 | {{ $key }}: {{ $value | quote }}
9 | {{- end }}
10 | {{- end }}
11 | labels:
12 | {{- include "graphrag.master.labels" . | nindent 4 }}
13 | spec:
14 | type: {{ .Values.master.service.type }}
15 | ports:
16 | - port: {{ .Values.master.service.port }}
17 | selector:
18 | {{- include "graphrag.master.selectorLabels" . | nindent 4 }}
19 |
--------------------------------------------------------------------------------
/infra/helm/graphrag/templates/graphrag-nginx-internal-controller.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.ingress.createIngressClass -}}
2 | apiVersion: approuting.kubernetes.azure.com/v1alpha1
3 | kind: NginxIngressController
4 | metadata:
5 | name: {{ .Values.ingress.className }}
6 | spec:
7 | ingressClassName: {{ .Values.ingress.className }}
8 | controllerNamePrefix: {{ .Values.ingress.className }}
9 | {{- with .Values.ingress.loadBalancerAnnotations }}
10 | loadBalancerAnnotations:
11 | {{- toYaml . | nindent 4 }}
12 | {{- end }}
13 | {{- end }}
14 |
--------------------------------------------------------------------------------
/infra/helm/graphrag/templates/graphrag-rolebinding.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: ClusterRoleBinding
3 | metadata:
4 | name: {{ include "graphrag.fullname" . }}
5 | labels:
6 | {{- include "graphrag.labels" . | nindent 4 }}
7 | subjects:
8 | - kind: ServiceAccount
9 | name: {{ include "graphrag.serviceAccountName" . }}
10 | namespace: {{ .Release.Namespace }}
11 | roleRef:
12 | kind: ClusterRole
13 | name: {{ include "graphrag.fullname" . }}
14 | apiGroup: rbac.authorization.k8s.io
15 |
--------------------------------------------------------------------------------
/infra/helm/graphrag/templates/graphrag-serviceaccount.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.serviceAccount.create -}}
2 | apiVersion: v1
3 | kind: ServiceAccount
4 | metadata:
5 | name: {{ include "graphrag.serviceAccountName" . }}
6 | labels:
7 | {{- include "graphrag.labels" . | nindent 4 }}
8 | {{- with .Values.serviceAccount.annotations }}
9 | annotations:
10 | {{- toYaml . | nindent 4 }}
11 | {{- end }}
12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
13 | {{- end }}
14 |
--------------------------------------------------------------------------------
/infra/helm/graphrag/templates/tests/test-connection.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: "{{ include "graphrag.master.fullname" . }}-test-connection"
5 | labels:
6 | {{- include "graphrag.master.labels" . | nindent 4 }}
7 | annotations:
8 | "helm.sh/hook": test
9 | spec:
10 | containers:
11 | - name: wget
12 | image: busybox
13 | command: ['wget']
14 | args: ['{{ include "graphrag.master.fullname" . }}:{{ .Values.master.service.port }}']
15 | restartPolicy: Never
16 |
--------------------------------------------------------------------------------
/infra/helm/graphrag/values.yaml:
--------------------------------------------------------------------------------
1 | # Default values for the graphrag helm chart.
2 |
3 | nameOverride: ""
4 | fullnameOverride: ""
5 |
6 | serviceAccount:
7 | # Specifies whether a service account should be created
8 | create: true
9 | # Automatically mount a ServiceAccount's API credentials?
10 | automount: true
11 | # Annotations to add to the service account
12 | annotations:
13 | azure.workload.identity/client-id: ""
14 | # Name of the service account to use.
15 | # If not set and create is true, a name is generated using the fullname template
16 | name: ""
17 |
18 | ingress:
19 | enabled: true
20 | className: nginx-internal
21 | createIngressClass: true
22 | host: graphrag.graphrag.io
23 | tls: []
24 | annotations:
25 | nginx.ingress.kubernetes.io/proxy-connect-timeout: "900"
26 | nginx.ingress.kubernetes.io/proxy-send-timeout: "900"
27 | nginx.ingress.kubernetes.io/proxy-read-timeout: "900"
28 | nginx.ingress.kubernetes.io/proxy-body-size: 500m
29 | loadBalancerAnnotations:
30 | service.beta.kubernetes.io/azure-load-balancer-internal: "true"
31 |
32 | graphragConfig:
33 | AI_SEARCH_AUDIENCE: "https://search.azure.com"
34 | AI_SEARCH_URL: ""
35 | APPLICATIONINSIGHTS_CONNECTION_STRING: ""
36 | # Must set hidden env variable to true to disable statsbeat. For more information: https://github.com/Azure/azure-sdk-for-python/issues/34804
37 | APPLICATIONINSIGHTS_STATSBEAT_DISABLED_ALL: "True"
38 | COSMOS_URI_ENDPOINT: ""
39 | GRAPHRAG_API_BASE: ""
40 | GRAPHRAG_API_VERSION: ""
41 | COGNITIVE_SERVICES_AUDIENCE: "https://cognitiveservices.azure.com/.default"
42 | GRAPHRAG_LLM_MODEL: ""
43 | GRAPHRAG_LLM_DEPLOYMENT_NAME: ""
44 | GRAPHRAG_EMBEDDING_MODEL: ""
45 | GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME: ""
46 | STORAGE_ACCOUNT_BLOB_URL: ""
47 |
48 | master:
49 | name: "master"
50 | replicaCount: 1
51 | image:
52 | repository: ""
53 | pullPolicy: Always
54 | # Override the image tag whose default is the chart appVersion.
55 | tag: ""
56 | imagePullSecrets: []
57 | podAnnotations: {}
58 | podLabels: {}
59 | podSecurityContext:
60 | {}
61 | # fsGroup: 2000
62 |
63 | securityContext:
64 | {}
65 | # capabilities:
66 | # drop:
67 | # - ALL
68 | # readOnlyRootFilesystem: true
69 | # runAsNonRoot: true
70 | # runAsUser: 1000
71 |
72 | service:
73 | annotations: {}
74 | type: ClusterIP
75 | port: 80
76 |
77 | resources:
78 | # We recommend not modifying the default resources below unless you know what you're doing
79 | # and have investigated graphrag's baseline spec requirements to ensure the application
80 | # can run properly.
81 | limits:
82 | cpu: 8
83 | memory: "16Gi"
84 | requests:
85 | cpu: 4
86 | memory: "10Gi"
87 |
88 | livenessProbe:
89 | httpGet:
90 | path: /manpage/docs
91 | port: http
92 | failureThreshold: 50
93 | initialDelaySeconds: 30
94 | periodSeconds: 20
95 |
96 | readinessProbe:
97 | httpGet:
98 | path: /manpage/docs
99 | port: http
100 | failureThreshold: 50
101 | initialDelaySeconds: 30
102 | periodSeconds: 20
103 |
104 | autoscaling:
105 | enabled: true
106 | minReplicas: 1
107 | maxReplicas: 20
108 | targetMemoryUtilizationPercentage: 50
109 | # targetCPUUtilizationPercentage: 50
110 |
111 | # Additional volumes on the output Deployment definition.
112 | volumes: []
113 | # - name: foo
114 | # secret:
115 | # secretName: mysecret
116 | # optional: false
117 |
118 | # Additional volumeMounts on the output Deployment definition.
119 | volumeMounts: []
120 | # - name: foo
121 | # mountPath: "/etc/foo"
122 | # readOnly: true
123 |
124 | nodeSelector: {}
125 |
126 | tolerations: []
127 |
128 | affinity:
129 | nodeAffinity:
130 | requiredDuringSchedulingIgnoredDuringExecution:
131 | nodeSelectorTerms:
132 | - matchExpressions:
133 | - key: workload
134 | operator: In
135 | values:
136 | - graphrag
137 |
--------------------------------------------------------------------------------
/infra/managed-app/README.md:
--------------------------------------------------------------------------------
1 | # Managed App Instructions
2 |
3 | This guide walks through the process to convert the graphrag solution accelerator into a managed app.
4 |
5 | ### Prerequisites
6 | 1. Create an ACR
7 | 1. Push both the graphrag backend docker image and the graphrag helm chart to the registry.
8 | ```shell
9 | # push docker image
10 | az acr login --name <registry-name>.azurecr.io
11 | cd <graphrag-accelerator-repo>
12 | az acr build --registry <registry-name>.azurecr.io -f docker/Dockerfile-backend --image graphrag:latest .
13 | # push helm chart
14 | cd <graphrag-accelerator-repo>/infra/helm
15 | helm package graphrag
16 | helm push graphrag-<version>.tgz oci://<registry-name>.azurecr.io/helm
17 | ```
18 | 1. A managed app [requires a storage account to deploy](https://learn.microsoft.com/en-us/azure/azure-resource-manager/managed-applications/publish-service-catalog-bring-your-own-storage?tabs=azure-powershell) an Azure Managed App Definition. Create a storage account and take note of the name and SAS key for later.
19 | 1. Enable anonymous access on the blob container that will host the managed app deployment package (a zip file).
20 | 1. The Azure built-in service principal `Managed Applications on Behalf Application` **MUST** be granted the roles of `Contributor` and `Role Based Access Control Administrator` on any Azure subscription where the app will be deployed.
21 |
22 | ### Steps to build a Managed App
23 |
24 | ### 1. Auto format the bicep code (optional)
25 |
26 | As a precaution, auto-format and lint the bicep code to detect any mistakes early on.
27 |
28 | ```bash
29 | cd <graphrag-accelerator-repo>/infra
30 | find . -type f -name "*.bicep" -exec az bicep format --file {} \;
31 | find . -type f -name "*.bicep" -exec az bicep lint --file {} \;
32 | ```
33 |
34 | ### 2. Create & test the Azure portal interface
35 |
36 | Use the [Azure Portal Sandbox](https://portal.azure.com/#blade/Microsoft_Azure_CreateUIDef/SandboxBlade) to test and make UI changes defined in [createUiDefinition.json](createUiDefinition.json). To make additional changes to the Azure portal experience, check out the [documentation](https://learn.microsoft.com/en-us/azure/azure-resource-manager/managed-applications/create-uidefinition-overview) and copy the contents of `createUiDefinition.json` into the sandbox environment.
37 |
38 | ### 3. Prepare the deployment package
39 |
40 | A *deployment package* is a zip file composed of several files: an ARM template and other files from the previous steps, along with additional code relevant to the deployment (i.e. artifacts).
41 |
42 | The names of certain files (`mainTemplate.json` and `createUiDefinition.json`) should not be modified and are case-sensitive. Azure expects these files to be included in the final managed app deployment package.
43 |
44 | A local copy of the backend docker image needs to be built in order to retrieve a copy of the openapi json spec associated with GraphRAG's REST API. This api spec file will become part of the final deployment package.
45 | ```shell
46 | cd <graphrag-accelerator-repo>
47 | docker build -t graphrag:latest -f docker/Dockerfile-backend .
48 | docker run -d -p 8080:80 graphrag:latest
49 | ```
50 |
51 | Now create the deployment package:
52 | ```bash
53 | cd <graphrag-accelerator-repo>/infra
54 |
55 | # get the openapi specification file
56 | curl --fail-with-body -o core/apim/openapi.json http://localhost:8080/manpage/openapi.json
57 |
58 | # compile bicep -> ARM
59 | az bicep build --file main.bicep --outfile managed-app/mainTemplate.json
60 |
61 | # zip up all files
62 | cd managed-app
63 | tar -a -cf managed-app-deployment-pkg.zip scripts createUiDefinition.json mainTemplate.json viewDefinition.json
64 | ```
65 |
66 | The final deployment package should have the following file structure:
67 | ```bash
68 | managed-app-deployment-pkg.zip
69 | ├── scripts
70 | │ └── install-graphrag.sh
71 | ├── createUiDefinition.json
72 | ├── mainTemplate.json
73 | └── viewDefinition.json
74 | ```
75 |
76 | Upload the zip file to an Azure Storage location in preparation for the next step.
77 |
78 | ### 4. Create a Service Catalog Managed App Definition
79 |
80 | Click [here](https://ms.portal.azure.com/#view/Microsoft_Azure_Marketplace/GalleryItemDetailsBladeNopdl/id/Microsoft.ApplianceDefinition/selectionMode~/false/resourceGroupId//resourceGroupLocation//dontDiscardJourney~/false/selectedMenuId/home/launchingContext~/%7B%22galleryItemId%22%3A%22Microsoft.ApplianceDefinition%22%2C%22source%22%3A%5B%22GalleryFeaturedMenuItemPart%22%2C%22VirtualizedTileDetails%22%5D%2C%22menuItemId%22%3A%22home%22%2C%22subMenuItemId%22%3A%22Search%20results%22%2C%22telemetryId%22%3A%2220409084-39a1-4800-bbce-d0b26a6f46a4%22%7D/searchTelemetryId/d7d20e05-ca16-47f7-bed5-9c7b8d2fa641) or from within the Azure Portal, go to Marketplace and create a `Service Catalog Managed App Definition`. You will be asked to provide a uri link to the uploaded `managed-app-deployment-pkg.zip` file during the creation process.
81 |
82 | ### 5. Deploy the managed app
83 |
84 | There are two options for deploying a managed app: as an app in the Marketplace, or through a one-click deployment button.
85 |
86 | * Marketplace App
87 |
88 | 1. In the Azure Portal, find and click on the managed app definition resource created in the previous step.
89 | 2. A button option `Deploy from definition` will be available.
90 | 3. Click on it and proceed through the same setup experience (defined by the `createUiDefinition.json` file) that a consumer would experience when installing the managed app.
91 | 4. Additional work is needed to [publish the app](https://learn.microsoft.com/en-us/partner-center/marketplace-offers/plan-azure-application-offer) as an official app in the Azure Marketplace
92 |
93 | * 1-click Deployment Button
94 | If `mainTemplate.json` is hosted somewhere publicly (e.g. on GitHub), a deployment button can be created that deploys the app when clicked, like the example below.
95 |
96 | [![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure-Samples%2Fgraphrag-accelerator%2Frefs%2Fheads%2Fmain%2Finfra%2Fmanaged-app%2FmainTemplate.json)
97 |
--------------------------------------------------------------------------------
/infra/managed-app/scripts/install-graphrag.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Installs graphrag on AKS: sets up kubectl and helm, creates the ACR image pull secret, then installs the helm release.
3 | set -eo pipefail  # abort on the first failing command, including inside pipelines
4 | # Install kubectl
5 | az aks install-cli --only-show-errors
6 | az login --identity
7 |
8 | # Get AKS credentials
9 | # requires "Azure Kubernetes Service Cluster Admin" role and "Azure Kubernetes Service RBAC Cluster Admin" role
10 | az aks get-credentials \
11 |     --admin \
12 |     --name "$AKS_NAME" \
13 |     --resource-group "$RESOURCE_GROUP" --only-show-errors
14 |
15 | # Define a namespace to install graphrag in
16 | aksNamespace="graphrag"
17 |
18 | # Setup an image pull secret for AKS to access ACR (create-or-update, so re-running the script does not fail)
19 | # NOTE: use an image pull secret instead of managed identity RBAC roles to seamlessly enable ACR access from any subscription/tenant
20 | aksSecretName="regcred"
21 | kubectl create namespace "$aksNamespace" --dry-run=client -o yaml | kubectl apply -f -
22 | kubectl create secret docker-registry "$aksSecretName" \
23 |     --docker-server="$ACR_SERVER" \
24 |     --docker-username="$ACR_TOKEN_NAME" \
25 |     --docker-password="$ACR_TOKEN_PASSWORD" \
26 |     --namespace "$aksNamespace" --dry-run=client -o yaml | kubectl apply -f -
27 |
28 | # Install helm (-f makes curl fail on an HTTP error instead of saving the error page as the installer)
29 | curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 -o get_helm.sh
30 | chmod 700 get_helm.sh
31 | ./get_helm.sh &>/dev/null
32 |
33 | # Login to ACR and retrieve helm chart
34 | # A token for the ACR should be generated ahead of time; it is passed on stdin to keep it out of the process list
35 | printf '%s' "$ACR_TOKEN_PASSWORD" | helm registry login "$ACR_SERVER" --username "$ACR_TOKEN_NAME" --password-stdin
36 | helm pull "oci://$ACR_SERVER/helm/graphrag" --untar
37 |
38 | # Install the helm chart
39 | helm upgrade -i graphrag ./graphrag -f ./graphrag/values.yaml \
40 |     --namespace "$aksNamespace" --create-namespace \
41 |     --set "serviceAccount.name=$AKS_SERVICE_ACCOUNT_NAME" \
42 |     --set "serviceAccount.annotations.azure\.workload\.identity/client-id=$WORKLOAD_IDENTITY_CLIENT_ID" \
43 |     --set "master.imagePullSecrets[0].name=$aksSecretName" \
44 |     --set "master.image.repository=$ACR_SERVER/$IMAGE_NAME" \
45 |     --set "master.image.tag=$IMAGE_VERSION" \
46 |     --set "ingress.host=$APP_HOSTNAME" \
47 |     --set "graphragConfig.AI_SEARCH_URL=https://$AI_SEARCH_NAME.$AI_SEARCH_ENDPOINT_SUFFIX" \
48 |     --set "graphragConfig.AI_SEARCH_AUDIENCE=$AI_SEARCH_AUDIENCE" \
49 |     --set "graphragConfig.APPLICATIONINSIGHTS_CONNECTION_STRING=$APP_INSIGHTS_CONNECTION_STRING" \
50 |     --set "graphragConfig.COGNITIVE_SERVICES_AUDIENCE=$COGNITIVE_SERVICES_AUDIENCE" \
51 |     --set "graphragConfig.COSMOS_URI_ENDPOINT=$COSMOSDB_ENDPOINT" \
52 |     --set "graphragConfig.GRAPHRAG_API_BASE=$AOAI_ENDPOINT" \
53 |     --set "graphragConfig.GRAPHRAG_API_VERSION=$AOAI_LLM_MODEL_API_VERSION" \
54 |     --set "graphragConfig.GRAPHRAG_LLM_MODEL=$AOAI_LLM_MODEL" \
55 |     --set "graphragConfig.GRAPHRAG_LLM_DEPLOYMENT_NAME=$AOAI_LLM_MODEL_DEPLOYMENT_NAME" \
56 |     --set "graphragConfig.GRAPHRAG_EMBEDDING_MODEL=$AOAI_EMBEDDING_MODEL" \
57 |     --set "graphragConfig.GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME=$AOAI_EMBEDDING_MODEL_DEPLOYMENT_NAME" \
58 |     --set "graphragConfig.STORAGE_ACCOUNT_BLOB_URL=$STORAGE_ACCOUNT_BLOB_URL"
59 |
--------------------------------------------------------------------------------
/infra/managed-app/viewDefinition.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://schema.management.azure.com/schemas/viewdefinition/0.0.1-preview/ViewDefinition.json#",
3 | "contentVersion": "0.0.0.1",
4 | "views": [
5 | {
6 | "kind": "Overview",
7 | "properties": {
8 | "header": "Welcome to GraphRAG!",
9 | "description": "Enabling customers to build and leverage the power of knowledge graphs every day...
Getting Started: Look under the `Settings` -> `Parameters and Outputs` section, and find `azure_apim_gateway_url`.
GraphRAG Swagger Docs: `/manpage/docs`"
10 | }
11 | }
12 | ]
13 | }
--------------------------------------------------------------------------------
/notebooks/README.md:
--------------------------------------------------------------------------------
1 | # Instructions
2 |
3 | 1. Create a dataset to use with GraphRAG. You may provide your own data or use the `get-wiki-articles.py` script to download a small set of wikipedia articles for demonstration purposes.
4 |
5 | ```shell
6 | > python get-wiki-articles.py testdata
7 | ```
8 | For a faster example with less data, download only the short summary of a single article:
9 | ```shell
10 | > python get-wiki-articles.py --short-summary --num-articles 1 testdata
11 | ```
12 |
13 | 2. Follow instructions in the `1-Quickstart.ipynb` notebook to explore the GraphRAG API, by building an index of the data in `testdata` and executing queries.
14 |
--------------------------------------------------------------------------------
/notebooks/get-wiki-articles.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright (c) Microsoft Corporation.
3 | # Licensed under the MIT License.
4 |
5 | """
6 | This script downloads a few sample wikipedia articles that can be used for demo or quickstart purposes in conjunction with the solution accelerator.
7 | """
8 |
9 | import argparse
10 | import os
11 |
12 | import wikipedia
13 |
# Wikipedia page titles to download, in order. main() takes the first
# `--num-articles` entries of this list, so the order determines which
# articles a smaller run fetches.
# NOTE: despite the name, the list also contains Washington, D.C.
us_states = [
    "Alaska",
    "California",
    "Washington (state)",
    "Washington_D.C.",
    "New York (state)",
]
21 |
22 |
def main(argv=None):
    """Download sample Wikipedia articles as text files.

    Parses the command line, creates the target directory if needed, and
    writes one ``<title>_wiki_article.txt`` file per article for the first
    ``--num-articles`` entries of ``us_states``. A failure on one article is
    reported and does not stop the remaining downloads.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None``, argparse falls back to ``sys.argv[1:]``,
            which is the behavior of the plain ``main()`` call.
    """
    parser = argparse.ArgumentParser(description="Wikipedia Download Script")
    parser.add_argument(
        "directory",
        help="Directory to download sample wikipedia articles to.",
        # Without nargs="?" a positional argument is always required and the
        # declared default would never be used.
        nargs="?",
        default="testdata",
    )
    parser.add_argument(
        "--short-summary",
        help="Retrieve short summary article content.",
        action="store_true",
    )
    parser.add_argument(
        "--num-articles",
        help="Number of wikipedia articles to download. Default=5",
        default=5,
        choices=range(1, 6),
        type=int,
    )
    args = parser.parse_args(argv)
    os.makedirs(args.directory, exist_ok=True)
    for state in us_states[: args.num_articles]:
        try:
            # Fetch the page once and reuse it; every wikipedia.page() call
            # is a separate network round trip.
            page = wikipedia.page(state)
            title = page.title.lower().replace(" ", "_")
            content = (page.summary if args.short_summary else page.content).strip()
            filename = os.path.join(args.directory, f"{title}_wiki_article.txt")
            with open(filename, "w", encoding="utf-8") as f:
                f.write(content)
            print(f"Saving wiki article '{title}' to {filename}")
        except Exception as exc:
            # Deliberately broad: this is a best-effort demo downloader, so one
            # bad article must not abort the rest. Report `state` (always
            # bound) rather than `title`, which is unset when the fetch fails.
            print(f"Error fetching wiki article {state}: {exc}")
60 |
61 | if __name__ == "__main__":
62 | main()
63 |
--------------------------------------------------------------------------------