├── .env.example ├── .github └── workflows │ └── python-version-compatibility.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── NOTICE ├── README.md ├── aikg ├── config │ ├── __init__.py │ ├── chat.py │ ├── chroma.py │ ├── common.py │ └── sparql.py ├── flows │ ├── chroma_build.py │ ├── chroma_examples.py │ └── insert_triples.py ├── models.py ├── notebooks │ ├── nl_sparql.ipynb │ └── sphn_example.ipynb ├── server.py └── utils │ ├── chat.py │ ├── chroma.py │ ├── io.py │ ├── llm.py │ └── rdf.py ├── data ├── models │ └── .gitkeep └── test_data.trig ├── docker-compose.yml ├── k8s ├── README.md ├── base │ ├── chatllm │ │ ├── deployment.yaml │ │ ├── kustomization.yaml │ │ ├── params.env │ │ └── service.yaml │ ├── chroma │ │ ├── configmap.yaml │ │ ├── deployment.yaml │ │ ├── kustomization.yaml │ │ ├── params.env │ │ ├── pvc.yaml │ │ └── service.yaml │ ├── graphdb │ │ ├── configmap.yaml │ │ ├── deployment.yaml │ │ ├── kustomization.yaml │ │ ├── params.env │ │ ├── pvc.yaml │ │ └── service.yaml │ ├── kg-llm │ │ ├── deployment.yaml │ │ ├── kustomization.yaml │ │ ├── params.env │ │ ├── pvc.yaml │ │ └── service.yaml │ └── kustomization.yaml └── overlays │ └── custom-config │ ├── kustomization.yaml │ └── params.env ├── pyproject.toml ├── scripts └── standalone_server.sh ├── tests ├── __init__.py ├── chat.test.yml ├── conftest.py ├── test_load_data.py └── test_rdf.py └── uv.lock /.env.example: -------------------------------------------------------------------------------- 1 | COMPOSE_PROJECT_NAME=kg_llm 2 | CONDA_CHANNELS=conda-forge 3 | SERVER_PORT=8001 4 | CHROMA_HOST=kg_llm_chroma 5 | CHROMA_PORT=8000 6 | CHROMA_COLLECTION="test" 7 | SPARQL_USER="admin" 8 | SPARQL_PASSWORD="admin" 9 | SPARQL_ENDPOINT="http://localhost:7200/repositories/test" 10 | -------------------------------------------------------------------------------- /.github/workflows/python-version-compatibility.yml: -------------------------------------------------------------------------------- 1 | name: python-version-compatibility 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - 'pyproject.toml' 7 | 8 | jobs: 9 | build: 10 | strategy: 11 | fail-fast: true 12 | matrix: 13 | os: [ "ubuntu-latest" ] 14 | python-version: [ "3.10", "3.11", "3.12", "3.13" ] 15 | runs-on: ${{ matrix.os }} 16 | steps: 17 | - name: Check out repository 18 | uses: actions/checkout@v4 19 | 20 | - name: Set up python ${{ matrix.python-version }} 21 | id: setup-python 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | 26 | - name: Install uv 27 | uses: astral-sh/setup-uv@v5 28 | 29 | - name: Install the project 30 | run: uv sync --all-extras --dev 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # project 2 | .idea/ 3 | .vscode/ 4 | .metaflow/ 5 | .env 6 | data/ 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | pip-wheel-metadata/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject 
date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/psf/black 9 | rev: 22.10.0 10 | hooks: 11 | - id: black 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile for the fastAPI chatbot server 2 | # It uses poetry to setup the environment 3 | FROM nvidia/cuda:11.7.1-base-ubuntu22.04 4 | 5 | # Set the working directory 6 | WORKDIR /app 7 | 8 | # Install system dependencies 9 | ENV DEBIAN_FRONTEND=noninteractive 10 | RUN apt-get update && apt-get -y install \ 11 | git \ 12 | python3-dev \ 13 | g++-11 \ 14 | build-essential \ 15 | curl wget tzdata 16 | 17 | # Install poetry 18 | ENV POETRY_HOME="/opt/poetry" \ 19 | POETRY_NO_INTERACTION=1 \ 20 | POETRY_VERSION=1.5.0 21 | ENV PATH="$PATH:$POETRY_HOME/bin" 22 | RUN curl -sSL https://install.python-poetry.org | python3 - 23 | 24 | # Copy the source code into docker image 25 | COPY . 
/app 26 | 27 | # Install project and dependencies 28 | RUN rm -f poetry.lock && make install 29 | RUN poetry run python -m ipykernel install --user --name aikg 30 | 31 | # Run the server 32 | ENTRYPOINT ["/bin/bash", "-c", "poetry run uvicorn aikg.server:app --host 0.0.0.0 --port 80"] 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | IMAGE := 'ghcr.io/sdsc-ordes/kg-llm-interface:latest' 2 | 3 | .PHONY: install 4 | install: ## Install with the poetry and add pre-commit hooks 5 | @echo "🔨 Installing packages with uv" 6 | @uv sync 7 | @uv run pre-commit install 8 | 9 | .PHONY: check 10 | check: ## Run code quality tools. 
11 | @echo "🕵️ Checking uv lock file consistency with 'pyproject.toml': Running uv lock --check" 12 | @uv lock --check 13 | @echo "🕵️ Linting code: Running pre-commit" 14 | @uv run pre-commit run -a 15 | 16 | .PHONY: test 17 | test: ## Test the code with pytest 18 | @echo "🧪 Testing code: Running pytest" 19 | @uv run pytest 20 | 21 | .PHONY: server 22 | server: 23 | @echo "🖥️ Running server" 24 | @uv run uvicorn --reload aikg.server:app --port 8001 25 | 26 | .PHONY: deploy 27 | deploy: 28 | @echo "🚀 Deploying all the services" 29 | @kubectl apply -k kubernetes/overlays/data-retriever 30 | 31 | .PHONY: notebook 32 | notebook: docker-build ## Start a jupyter notebook server in a docker container 33 | @echo "🗒️ Starting a containerized notebook server" 34 | @docker run -p 8888:8888 --rm -it --entrypoint 'poetry' $(IMAGE) \ 35 | run jupyter lab --allow-root --port 8888 --ip "0.0.0.0" 36 | 37 | docker-build: Dockerfile 38 | @echo "🐳 Building docker image" 39 | @docker build -t $(IMAGE) . 40 | 41 | .PHONY: help 42 | help: 43 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' 44 | 45 | .DEFAULT_GOAL := help 46 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | kg-llm-interface 2 | Copyright 2023 - Swiss Data Science Center (SDSC) 3 | A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | Langchain-powered natural language interface to RDF knowledge-graphs. 3 | 4 | ## Installation 5 | 6 | This repository uses `uv` for package management. A Makefile rule is provided to install the dependencies: 7 | 8 | ```bash 9 | make install 10 | ``` 11 | 12 | ## Configuration 13 | 14 | Configuration variables are loaded from the `.env` file or environment variables. A template configuration file is provided in `.env.example`. 15 | 16 | The chat configuration (`config.chat.ChatConfig`) uses OpenAI by default; however, you can run this tool with open-source LLMs using a framework such as llamafile, openllm or localGPT. When doing so, simply provide your LLM server URL using `openai_api_base` and the model name using `model`. 17 | 18 | 19 | ## Quickstart 20 | 21 | You can read and run the [example notebook](aikg/notebooks/nl_sparql.ipynb) to get a quick overview of the system. 22 | The notebook supports using the OpenAI API and can run locally on a laptop. 23 | 24 | To run the notebook in a containerized environment, run: 25 | 26 | `make notebook` 27 | 28 | ## Server 29 | 30 | The server can be deployed as a standalone service using the script `scripts/standalone_server.sh`. It will start a uvicorn server on port 8001, use ChromaDB in client-only mode and use an RDF file as the knowledge graph. This should work for small datasets. 31 | 32 | 33 | ## Pipelines 34 | 35 | Pipelines are used to execute one-time operations for preparing data before the chat server can operate. They load their configuration from the `.env` file as well, but the variables can be overridden using yaml files (run with `--help` for more info).
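For instance, the Chroma build flow accepts a YAML override such as the file below. The keys follow the [aikg.config.chroma.ChromaConfig](aikg/config/chroma.py) schema and the values shown are only illustrative:

```yaml
# chroma.yaml
host: localhost
port: 8000
collection_name: schema
embedding_model: all-mpnet-base-v2
persist_directory: .chroma/
```

The file can then be passed to a flow, e.g. `python aikg/flows/chroma_build.py --chroma-cfg-path chroma.yaml` (typer derives the flag name from the `chroma_cfg_path` option).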
36 | 37 | ### Insert triples 38 | 39 | ```mermaid 40 | flowchart LR 41 | RDF[RDF file] -->|insert_triples.py| SPARQL(SPARQL endpoint) 42 | ``` 43 | 44 | Insert data from an input RDF file to a SPARQL endpoint. The input file can be in any format supported by rdflib (ttl, json-ld, rdf/xml, ...). 45 | 46 | Location: [insert_triples.py](aikg/flows/insert_triples.py): 47 | 48 | SPARQL configuration can be overridden by providing a yaml file following the [aikg.config.sparql.SparqlConfig](aikg/config/sparql.py) schema: 49 | 50 | `python aikg/flows/insert_triples.py --sparql-cfg-path sparql.yaml` 51 | 52 | ```yaml 53 | # sparql.yaml 54 | endpoint: http://localhost:3030/ds/query 55 | user: admin 56 | password: admin 57 | ``` 58 | 59 | CLI usage: `python aikg/flows/insert_triples.py` 60 | 61 | ### Chroma build 62 | 63 | ```mermaid 64 | flowchart LR 65 | SPARQL(SPARQL endpoint) -->|chroma_build.py| CHROMA(ChromaDB) 66 | ``` 67 | 68 | Build the ChromaDB index from a SPARQL endpoint. 69 | 70 | Location: [chroma_build.py](aikg/flows/chroma_build.py): 71 | 72 | CLI usage: `python aikg/flows/chroma_build.py` 73 | 74 | Chroma and SPARQL configurations can be overridden by providing a yaml file following the [aikg.config.chroma.ChromaConfig](aikg/config/chroma.py) or [aikg.config.sparql.SparqlConfig](aikg/config/sparql.py) schemas respectively. 75 | 76 | 77 | ## Containerized service 78 | 79 | :warning: WIP, not functional yet 80 | 81 | The chat server can be deployed along with the front-end, SPARQL endpoint and ChromaDB server using Kubernetes. 82 | 83 | ```mermaid 84 | sequenceDiagram 85 | Front-end->>+Chat server: question 86 | Chat server->>+ChromaDB: question 87 | ChromaDB -->ChromaDB: embed 88 | ChromaDB-->>-Chat server: ontology triples 89 | Chat server-->Chat server: generate query 90 | Chat server-->>+SPARQL endpoint: query 91 | SPARQL endpoint-->SPARQL endpoint: run query 92 | SPARQL endpoint-->>-Chat server: result 93 | Chat server-->>-Front-end: answer 94 | ``` 95 | 96 | ## Contributing 97 | 98 | All contributions are welcome. New functions and classes should have associated docstrings following the [numpy style guide](https://numpydoc.readthedocs.io/en/latest/format.html). 99 | 100 | The code formatting standard we use is [black](https://github.com/psf/black), with `--line-length=79` to follow [PEP8](https://peps.python.org/pep-0008/) recommendations. We use [pytest](https://docs.pytest.org/en/7.2.x/) as our testing framework. This project uses [pyproject.toml](https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/) to define package information, requirements and tooling configuration. 101 | 102 | Tests can be executed with `make test`. Tests use [testcontainers](https://testcontainers.com) to temporarily deploy the required services. 103 | -------------------------------------------------------------------------------- /aikg/config/__init__.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from aikg.config.chat import ChatConfig 19 | from aikg.config.chroma import ChromaConfig 20 | from aikg.config.sparql import SparqlConfig 21 | -------------------------------------------------------------------------------- /aikg/config/chat.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import os 19 | from pydantic import BaseModel 20 | 21 | 22 | class ChatConfig(BaseModel): 23 | """Chatbot configuration. 24 | 25 | Attributes: 26 | model_id: The HuggingFace ID of the model to use for text generation. 27 | max_new_tokens: The maximum number of tokens to generate. 28 | max_input_size: The maximum number of tokens in the input. 29 | num_output: The number of outputs to generate. 30 | max_chunk_overlap: The maximum number of tokens to overlap between chunks. 31 | prompt_template: The template for the prompt to inject into the model. The template should contain the following variables: context_str, query_str. 32 | """ 33 | 34 | openai_api_base: str = os.environ.get( 35 | "OPENAI_API_BASE", "https://api.openai.com/v1/" 36 | ) 37 | openai_api_key: str = os.environ.get("OPENAI_API_KEY", "") 38 | model: str = os.environ.get("OPENAI_MODEL", "gpt-4o") 39 | answer_template: str = """ 40 | We have provided the contextual facts below. 41 | ----------------- 42 | {result_str} 43 | ----------------- 44 | Answer the question using only the context and no 45 | prior knowledge. If the context does not contain any fact related to 46 | the question, simply answer the words 'Not found'. The answer should be 47 | maximum 2 sentences directly reflecting the facts from relevant facts while ignoring 48 | irrelevant ones. 49 | Question: {question_str} 50 | Answer: 51 | """ 52 | 53 | sparql_template: str = """ 54 | Use the question and the additional information to generate a sparql query against a knowledge graph where the p and q items are 55 | completely unknown to you. You will need to discover the p and q items before you can generate the sparql. 56 | Do not assume you know the p and q items for any concepts. 57 | After you generate the sparql, you should display it. 
58 | 59 | When generating sparql: 60 | * Never enclose the sparql in back-quotes 61 | * Do not include any human text, only the query and nothing else 62 | 63 | {examples_str} 64 | 65 | Use the following format: 66 | 67 | Question: the input question for which you must provide a natural language answer 68 | Information: the additional information you get with the query, in RDF format. This will help you generate the sparql query with the correct format. 69 | 70 | Question: {question_str} 71 | Information: 72 | {context_str} 73 | Answer: 74 | """ 75 | -------------------------------------------------------------------------------- /aikg/config/chroma.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import os 19 | from pydantic import BaseModel 20 | 21 | 22 | class ChromaConfig(BaseModel): 23 | """ 24 | Attributes: 25 | host: 26 | The host of the ChromaDB server. If set to "local", chroma will run in client-only mode. 27 | port: 28 | The port of the ChromaDB server. 29 | collection_name: 30 | The name of the ChromaDB collection to store the index in. 31 | collection_examples: 32 | The name of the ChromaDB collection to store examples in. 33 | embedding_model_id: 34 | The HuggingFace ID of the embedding model to use. 35 | batch_size: 36 | The number of documents to vectorize and store in each batch. 37 | persist_directory: 38 | If set to client-only mode, local path where the db is saved. 39 | """ 40 | 41 | host: str = os.environ.get("CHROMA_HOST", "127.0.0.1") 42 | port: int = int(os.environ.get("CHROMA_PORT", "8000")) 43 | collection_name: str = os.environ.get("CHROMA_COLLECTION", "schema") 44 | collection_examples: str = os.environ.get("CHROMA_EXAMPLES", "examples") 45 | batch_size: int = int(os.environ.get("CHROMA_BATCH_SIZE", "50")) 46 | embedding_model: str = os.environ.get("CHROMA_MODEL", "all-mpnet-base-v2") 47 | persist_directory: str = os.environ.get("CHROMA_PERSIST_DIR", ".chroma/") 48 | -------------------------------------------------------------------------------- /aikg/config/common.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from pathlib import Path 19 | from typing import Type, TypeVar 20 | 21 | from pydantic import BaseModel 22 | import yaml 23 | 24 | Config = TypeVar("Config", bound=BaseModel) 25 | 26 | 27 | def parse_yaml_config(config_path: Path, config_class: Type[Config]) -> Config: 28 | """Parse a YAML config file into a pydantic model. 29 | 30 | Args: 31 | config_path: Path to YAML config file. 32 | config_class: The pydantic model to parse the config into. 33 | 34 | Returns: 35 | The parsed config. 36 | """ 37 | # Load dict from YAML file 38 | config_dict = yaml.safe_load(config_path.read_text()) 39 | return config_class.parse_obj(config_dict) 40 | -------------------------------------------------------------------------------- /aikg/config/sparql.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import os 19 | from pydantic import BaseModel 20 | 21 | 22 | class SparqlConfig(BaseModel): 23 | """ 24 | Attributes: 25 | endpoint: The SPARQL endpoint to connect to. Can also be a local path to an RDF file. 26 | repo: The name of the repository or dataset to query. 27 | user: The username to use for authentication. 28 | password: The password to use for authentication. 29 | """ 30 | 31 | endpoint: str = os.environ.get( 32 | "SPARQL_ENDPOINT", "http://localhost:7200/repositories/test" 33 | ) 34 | 35 | user: str = os.environ.get("SPARQL_USER", "admin") 36 | password: str = os.environ.get("SPARQL_PASSWORD", "admin") 37 | -------------------------------------------------------------------------------- /aikg/flows/chroma_build.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | """This flow builds a ChromaDB vector index from RDF data in a SPARQL endpoint. 19 | 20 | For each subject in the target graph, a document is generated. The document consists of: 21 | * A human readable body made up of the annotations (rdfs:comment, rdf:label) associated with the subject. 22 | * Triples with the subject attached as metadata. 23 | 24 | The documents are then stored in a vector database. The embedding is computed using the document body, 25 | and triples included as metadata. The index is persisted to disk and can be subsequently loaded into memory 26 | for querying.""" 27 | 28 | from pathlib import Path 29 | from typing import Optional, Tuple 30 | from typing_extensions import Annotated 31 | import uuid 32 | 33 | from chromadb.api import ClientAPI, Collection 34 | from dotenv import load_dotenv 35 | from langchain.schema import Document 36 | from more_itertools import chunked 37 | from prefect import flow, task 38 | from prefect import get_run_logger 39 | from rdflib import ConjunctiveGraph, Graph 40 | from SPARQLWrapper import SPARQLWrapper 41 | import typer 42 | 43 | from aikg.config import ChromaConfig, SparqlConfig 44 | from aikg.config.common import parse_yaml_config 45 | import aikg.utils.rdf as akrdf 46 | import aikg.utils.chroma as akchroma 47 | 48 | 49 | @task 50 | def init_chromadb( 51 | host: str, 52 | port: int, 53 | collection_name: str, 54 | embedding_model: str, 55 | persist_directory: str, 56 | ) -> Tuple[ClientAPI, Collection]: 57 | """Prepare chromadb client.""" 58 | client = akchroma.setup_client(host, port, persist_directory=persist_directory) 59 | coll = akchroma.setup_collection(client, collection_name, embedding_model) 60 | 61 | return client, coll 62 | 63 | 64 | @task 65 | def sparql_to_documents( 66 | kg: Graph | SPARQLWrapper, graph: Optional[str] = None 67 | ) -> list[Document]: 68 | return list(akrdf.get_subjects_docs(kg, graph=graph)) 69 | 70 | 71 | @task 72 | def index_batch(batch: list[Document]): 73 | """Sends a batch of document for indexing in the vector store""" 74 | coll.add( 75 | ids=[str(uuid.uuid4()) for _ in batch], 76 | documents=[doc.page_content for doc in batch], 77 | metadatas=[doc.metadata for doc in batch], 78 | ) 79 | 80 | 81 | @flow 82 | def chroma_build_flow( 83 | chroma_cfg: ChromaConfig = ChromaConfig(), 84 | sparql_cfg: SparqlConfig = SparqlConfig(), 85 | graph: Optional[str] = None, 86 | ): 87 | """Build a ChromaDB vector index from RDF data in a SPARQL endpoint. 88 | 89 | Parameters 90 | ---------- 91 | chroma_cfg: 92 | ChromaDB configuration. 93 | sparql_cfg: 94 | SPARQL endpoint configuration. 95 | graph: 96 | URI of named graph from which to select subjects to embed. 97 | By default, all subjects are used. 
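    Examples
    --------
    A minimal local run, assuming the bundled test data and client-only
    Chroma (``host="local"``); values are illustrative:

    >>> chroma_build_flow(
    ...     ChromaConfig(host="local"),
    ...     SparqlConfig(endpoint="data/test_data.trig"),
    ... )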
98 | """ 99 | load_dotenv() 100 | logger = get_run_logger() 101 | logger.info("INFO Started") 102 | # Connect to external resources 103 | global coll 104 | client, coll = init_chromadb( 105 | chroma_cfg.host, 106 | chroma_cfg.port, 107 | chroma_cfg.collection_name, 108 | chroma_cfg.embedding_model, 109 | chroma_cfg.persist_directory, 110 | ) 111 | kg = akrdf.setup_kg( 112 | sparql_cfg.endpoint, 113 | user=sparql_cfg.user, 114 | password=sparql_cfg.password, 115 | ) 116 | 117 | # Create subject documents 118 | docs = sparql_to_documents( 119 | kg, 120 | graph=graph, 121 | ) 122 | 123 | # Vectorize and index documents by batches to reduce overhead 124 | logger.info(f"Indexing by batches of {chroma_cfg.batch_size} items") 125 | embed_counter = 0 126 | for batch in chunked(docs, chroma_cfg.batch_size): 127 | embed_counter += len(batch) 128 | index_batch(batch) 129 | logger.info(f"Indexed {embed_counter} items.") 130 | 131 | 132 | def cli( 133 | chroma_cfg_path: Annotated[ 134 | Optional[Path], 135 | typer.Option(help="YAML file with Chroma client configuration."), 136 | ] = None, 137 | sparql_cfg_path: Annotated[ 138 | Optional[Path], 139 | typer.Option(help="YAML file with SPARQL endpoint configuration."), 140 | ] = None, 141 | graph: Annotated[ 142 | Optional[str], 143 | typer.Option( 144 | help="URI of named graph from which to select triples to embed. If not set, the default graph is used.", 145 | ), 146 | ] = None, 147 | ): 148 | """Command line wrapper for RDF to ChromaDB index flow.""" 149 | chroma_cfg = ( 150 | parse_yaml_config(chroma_cfg_path, ChromaConfig) 151 | if chroma_cfg_path 152 | else ChromaConfig() 153 | ) 154 | sparql_cfg = ( 155 | parse_yaml_config(sparql_cfg_path, SparqlConfig) 156 | if sparql_cfg_path 157 | else SparqlConfig() 158 | ) 159 | chroma_build_flow(chroma_cfg, sparql_cfg, graph=graph) 160 | 161 | 162 | if __name__ == "__main__": 163 | typer.run(cli) 164 | -------------------------------------------------------------------------------- /aikg/flows/chroma_examples.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | """This flow builds a ChromaDB vector index from examples consisting of pairs of questions and SPARQL queries. 19 | 20 | For each subject in the target graph, a document is generated. The document consists of: 21 | * A human readable question (document body) 22 | * A corresponding SPARQL query (document metadata) 23 | 24 | The documents are then stored in a vector database. The embedding is computed using the document body (questions), 25 | and SPAQRL queries included as metadata. 
The index is persisted to disk and can be subsequently loaded into memory 26 | for querying.""" 27 | 28 | from pathlib import Path 29 | from typing import Optional, Tuple 30 | from typing_extensions import Annotated 31 | import uuid 32 | import os 33 | 34 | from chromadb.api import ClientAPI, Collection 35 | from dotenv import load_dotenv 36 | from langchain.schema import Document 37 | from more_itertools import chunked 38 | from prefect import flow, task 39 | from prefect import get_run_logger 40 | import typer 41 | 42 | from aikg.config import ChromaConfig 43 | from aikg.config.common import parse_yaml_config 44 | import aikg.utils.io as akio 45 | import aikg.utils.chroma as akchroma 46 | 47 | 48 | @task 49 | def init_chromadb( 50 | host: str, 51 | port: int, 52 | collection_name: str, 53 | embedding_model: str, 54 | persist_directory: str, 55 | ) -> Tuple[ClientAPI, Collection]: 56 | """Prepare chromadb client.""" 57 | client = akchroma.setup_client(host, port, persist_directory=persist_directory) 58 | coll = akchroma.setup_collection(client, collection_name, embedding_model) 59 | 60 | return client, coll 61 | 62 | 63 | @task 64 | def index_batch(batch: list[Document]): 65 | """Sends a batch of document for indexing in the vector store""" 66 | coll.add( 67 | ids=[str(uuid.uuid4()) for _ in batch], 68 | documents=[doc.page_content for doc in batch], 69 | metadatas=[doc.metadata for doc in batch], 70 | ) 71 | 72 | 73 | @task 74 | def get_sparql_examples(dir: Path) -> list[Document]: 75 | # find files 76 | files = [] 77 | for file_name in os.listdir(dir): 78 | files.append(os.path.join(dir, file_name)) 79 | # provide each file as text stream to be parsed 80 | return [akio.parse_sparql_example(open(ex)) for ex in files] 81 | 82 | 83 | @flow 84 | def chroma_build_examples_flow( 85 | chroma_input_dir: Path, 86 | chroma_cfg: ChromaConfig = ChromaConfig(), 87 | ): 88 | """Build a ChromaDB vector index from examples. 89 | 90 | Parameters 91 | ---------- 92 | chroma_input_dir: 93 | Directory containing files with example question-query pairs. The files should be in sparql format, with the first line being the question as a comment. 94 | chroma_cfg: 95 | ChromaDB configuration. 
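    Examples
    --------
    A minimal local run; ``examples/`` is a placeholder for any directory of
    example query files in the format described above:

    >>> chroma_build_examples_flow(Path("examples/"), ChromaConfig(host="local"))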
96 |     """ 97 | load_dotenv() 98 | logger = get_run_logger() 99 | logger.info("INFO Started") 100 | # Connect to external resources 101 | global coll 102 | client, coll = init_chromadb( 103 | chroma_cfg.host, 104 | chroma_cfg.port, 105 | chroma_cfg.collection_examples, 106 | chroma_cfg.embedding_model, 107 | chroma_cfg.persist_directory, 108 | ) 109 | 110 | # Load example documents 111 | docs = get_sparql_examples( 112 | dir=chroma_input_dir, 113 | ) 114 | 115 | # Vectorize and index documents by batches to reduce overhead 116 | logger.info(f"Indexing by batches of {chroma_cfg.batch_size} items") 117 | embed_counter = 0 118 | for batch in chunked(docs, chroma_cfg.batch_size): 119 | embed_counter += len(batch) 120 | index_batch(batch) 121 | logger.info(f"Indexed {embed_counter} items.") 122 | 123 | 124 | def cli( 125 | chroma_input_dir: Annotated[ 126 | Path, 127 | typer.Argument( 128 | help="Path to directory with example SPARQL queries", 129 | exists=True, 130 | file_okay=False, 131 | dir_okay=True, 132 | ), 133 | ], 134 | chroma_cfg_path: Annotated[ 135 | Optional[Path], 136 | typer.Option(help="YAML file with Chroma client configuration."), 137 | ] = None, 138 | ): 139 | """Command line wrapper for SPARQL examples to ChromaDB index flow.""" 140 | chroma_cfg = ( 141 | parse_yaml_config(chroma_cfg_path, ChromaConfig) 142 | if chroma_cfg_path 143 | else ChromaConfig() 144 | ) 145 | chroma_build_examples_flow(chroma_input_dir, chroma_cfg) 146 | 147 | 148 | if __name__ == "__main__": 149 | typer.run(cli) 150 | -------------------------------------------------------------------------------- /aikg/flows/insert_triples.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | """This flow populates a SPARQL endpoint from RDF data in a file.""" 19 | import os 20 | from pathlib import Path 21 | from typing import Optional 22 | from typing_extensions import Annotated 23 | 24 | from dotenv import load_dotenv 25 | from prefect import flow, get_run_logger, task 26 | from SPARQLWrapper import SPARQLWrapper 27 | import typer 28 | 29 | from aikg.config.common import parse_yaml_config 30 | from aikg.config import SparqlConfig 31 | 32 | 33 | @task 34 | def setup_sparql_endpoint( 35 | endpoint: str, user: Optional[str] = None, password: Optional[str] = None 36 | ) -> SPARQLWrapper: 37 | """Connect to SPARQL endpoint and set up credentials. 38 | 39 | Parameters 40 | ---------- 41 | endpoint: 42 | URL of the SPARQL endpoint. 43 | user: 44 | Username to use for authentication. 45 | password: 46 | Password to use for authentication.
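    Notes
    -----
    The update endpoint is assumed to be ``<endpoint>/statements``, the
    convention used by RDF4J-based stores such as GraphDB.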
47 | """ 48 | # Setup sparql endpoint 49 | sparql = SPARQLWrapper(endpoint, updateEndpoint=endpoint + "/statements") 50 | if user and password: 51 | sparql.setCredentials(user, password) 52 | return sparql 53 | 54 | 55 | @task 56 | def insert_triples( 57 | rdf_file: Path, 58 | endpoint: SPARQLWrapper, 59 | graph: Optional[str] = None, 60 | chunk_size: int = 1000, 61 | ): 62 | """Insert triples from source file into SPARQL endpoint. 63 | 64 | Parameters 65 | ---------- 66 | rdf_file: 67 | Path to RDF file to load into the SPARQL endpoint. 68 | endpoint: 69 | SPARQL endpoint to load RDF data into. 70 | graph: 71 | URI of named graph to load RDF data into. 72 | If set to None, the default graph is used. 73 | chunk_size: 74 | Number of triples per insert operation. 75 | """ 76 | from rdflib import Dataset 77 | from rdflib.util import guess_format 78 | 79 | format = guess_format(str(rdf_file)) 80 | if format not in ["nt", "nquads"]: 81 | raise ValueError("Unsupported RDF format, must be ntriples or nquads.") 82 | 83 | cur = 0 84 | tot = os.path.getsize(rdf_file) 85 | with open(rdf_file, "r", encoding="utf-8") as source: 86 | # Run INSERT DATA queries by chunks of triples 87 | while True: 88 | data = "".join([source.readline() for _ in range(chunk_size)]) 89 | if data == "": 90 | break 91 | 92 | ds = Dataset() 93 | ds.parse(data=data, format=format) 94 | 95 | query = "\n".join( 96 | [f"PREFIX {prefix}: {ns.n3()}" for prefix, ns in ds.namespaces()] 97 | ) 98 | query += f"\nINSERT DATA {{" 99 | if graph: 100 | query += f"\n\tGRAPH <{graph}> {{" 101 | query += " .\n".join( 102 | [f"\t\t{s.n3()} {p.n3()} {o.n3()}" for (s, p, o, _) in ds.quads()] 103 | ) 104 | if graph: 105 | query += f"\n\t}}" 106 | query += f" . \n\n}}\n" 107 | endpoint.setQuery(query) 108 | endpoint.queryType = "INSERT" 109 | endpoint.method = "POST" 110 | endpoint.setReturnFormat("json") 111 | endpoint.query() 112 | cur += len(data.encode("utf-8")) 113 | print(f"inserted triples: {round(100 * cur / tot, 2)}%") 114 | 115 | 116 | @flow 117 | def sparql_insert_flow( 118 | rdf_file: Path, 119 | sparql_cfg: SparqlConfig = SparqlConfig(), 120 | graph: Optional[str] = None, 121 | ): 122 | """Workflow to connect to a SPARQL endpoint and send insert 123 | queries to load triples from a local file. 124 | 125 | Parameters 126 | ---------- 127 | rdf_file: 128 | Path to source RDF file. 129 | sparql_cfg: 130 | Configuration for the target SPARQL endpoint. 131 | """ 132 | load_dotenv() 133 | logger = get_run_logger() 134 | sparql = setup_sparql_endpoint( 135 | sparql_cfg.endpoint, sparql_cfg.user, sparql_cfg.password 136 | ) 137 | logger.info("SPARQL endpoint connected") 138 | insert_triples(rdf_file, sparql, graph) 139 | logger.info("all triples inserted") 140 | 141 | 142 | def cli( 143 | rdf_file: Annotated[ 144 | Path, 145 | typer.Argument( 146 | help="RDF file to load into the SPARQL endpoint, in turtle or n-triples format.", 147 | exists=True, 148 | file_okay=True, 149 | dir_okay=False, 150 | ), 151 | ], 152 | sparql_cfg_path: Annotated[ 153 | Optional[Path], 154 | typer.Option(help="YAML file with SPARQL endpoint configuration."), 155 | ] = None, 156 | graph: Annotated[ 157 | Optional[str], 158 | typer.Option( 159 | help="URI of named graph to load RDF data into. 
If not set, the default graph is used.", 160 | ), 161 | ] = None, 162 | ): 163 | """Command line wrapper to insert triples to a SPARQL endpoint.""" 164 | sparql_cfg = ( 165 | parse_yaml_config(sparql_cfg_path, SparqlConfig) 166 | if sparql_cfg_path 167 | else SparqlConfig() 168 | ) 169 | sparql_insert_flow(rdf_file, sparql_cfg, graph) 170 | 171 | 172 | if __name__ == "__main__": 173 | typer.run(cli) 174 | -------------------------------------------------------------------------------- /aikg/models.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from datetime import datetime, timedelta 19 | from pydantic import BaseModel 20 | import uuid 21 | 22 | 23 | class Message(BaseModel): 24 | text: str 25 | time: datetime 26 | sender: str 27 | triples: str | None = None 28 | 29 | 30 | class Conversation(BaseModel): 31 | """A conversation, represented as a list of messages 32 | and a unique identifier (uid).""" 33 | 34 | thread: list[Message] 35 | uid: str | None = str(uuid.uuid4()) 36 | 37 | @property 38 | def start_time(self) -> datetime | None: 39 | try: 40 | return self.thread[0].time 41 | except IndexError: 42 | return None 43 | 44 | @property 45 | def end_time(self) -> datetime | None: 46 | try: 47 | self.thread[-1].time 48 | except IndexError: 49 | return None 50 | 51 | @property 52 | def duration(self) -> timedelta | None: 53 | if self.start_time is None or self.end_time is None: 54 | return None 55 | return self.end_time - self.start_time 56 | 57 | @property 58 | def actors(self) -> list[str]: 59 | return list(set([m.sender for m in self.thread])) 60 | -------------------------------------------------------------------------------- /aikg/notebooks/nl_sparql.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "## Question to SPARQL query generation\n", 9 | "\n", 10 | "In this notebook, we generate a SPARQL query from an input plain english question and execute it against a knowledge graph.\n", 11 | "\n", 12 | "Below are the two prompts we will use for the language model. First, the `SPARQL_TEMPLATE` is used to construct a SPARQL query from an input quersion and context. Then, the output will be executed against the knowledge graph and the `ANSWER_TEMPLATE` will be used to generate a human-readable answer to described the results." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "\n", 22 | "SPARQL_TEMPLATE = \"\"\"\n", 23 | "Generate a SPARQL query to answer the input question. 
A sample of the knowledge graph schema is provided to help construct the query.\n", 24 | "After you generate the sparql, you should display it.\n", 25 | "When generating sparql:\n", 26 | "* never enclose the sparql in back-quotes.\n", 27 | "* always include the prefix declarations.\n", 28 | "* prefer using OPTIONAL when selecting multiple variables.\n", 29 | "* Allow case-insensitive matching of strings.\n", 30 | "\n", 31 | "Use the following format:\n", 32 | "\n", 33 | "Question: the input question for which you must generate a SPARQL query\n", 34 | "Information: the schema information in RDF format. This will help you generate the sparql query with the correct format.\n", 35 | "\n", 36 | "Question: {question_str}\n", 37 | "Information:\n", 38 | "{context_str}\n", 39 | "Answer:\n", 40 | "\"\"\"\n", 41 | "\n", 42 | "ANSWER_TEMPLATE = \"\"\"\n", 43 | "The following describe a user question, associated SPARQL query and the result from executing the query.\n", 44 | "Based on this information, write an answer in simple terms that describes the results.\n", 45 | "When appropriate, use markdown formatting to format the results into a table or bullet points.\n", 46 | "\n", 47 | "Question:\n", 48 | "{question_str}\n", 49 | "Query:\n", 50 | "{query_str}\n", 51 | "Result:\n", 52 | "{result_str}\n", 53 | "Answer:\n", 54 | "\"\"\"" 55 | ] 56 | }, 57 | { 58 | "attachments": {}, 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "We setup a minimal configuration, with the vector database (Chroma) running in client-only mode, and a small RDF file acting as the knowledge graph. This file contains both the instance data and the ontology. The ontology is enclosed in a named graph inside the file.\n", 63 | "\n", 64 | "For the sake of the demo, we use a small model for embeddings (MiniLM-L6-V2) and rely on the OpenAI key for text geneartion for text generation. A local model can be used instead, but it will require high RAM and ideally a GPU." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 19, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "from aikg.config import ChatConfig, ChromaConfig, SparqlConfig\n", 74 | "\n", 75 | "chroma_config = ChromaConfig(\n", 76 | " host=\"local\",\n", 77 | " port=8000,\n", 78 | " collection_name=\"test\",\n", 79 | " embedding_model=\"all-MiniLM-L6-v2\",\n", 80 | ")\n", 81 | "sparql_config = SparqlConfig(\n", 82 | " endpoint=\"../data/test_data.trig\",\n", 83 | ")\n", 84 | "chat_config = ChatConfig(\n", 85 | " answer_template=ANSWER_TEMPLATE,\n", 86 | " sparql_template=SPARQL_TEMPLATE\n", 87 | ")\n" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 25, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "import os\n", 97 | "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 20, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stderr", 107 | "output_type": "stream", 108 | "text": [ 109 | "/home/stefan/kg-llm-interface/.conda/lib/python3.11/site-packages/langchain/llms/openai.py:173: UserWarning: You are trying to use a chat model. This way of initializing it is no longer supported. Instead, please use: `from langchain.chat_models import ChatOpenAI`\n", 110 | " warnings.warn(\n", 111 | "/home/stefan/kg-llm-interface/.conda/lib/python3.11/site-packages/langchain/llms/openai.py:753: UserWarning: You are trying to use a chat model. This way of initializing it is no longer supported. 
Instead, please use: `from langchain.chat_models import ChatOpenAI`\n", 112 | " warnings.warn(\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "\n", 118 | "from aikg.utils.llm import setup_llm_chain\n", 119 | "from aikg.utils.rdf import setup_kg\n", 120 | "\n", 121 | "# Use OpenAI API\n", 122 | "from langchain.llms import OpenAI\n", 123 | "llm = OpenAI(model_name=\"gpt-3.5-turbo-0125\")\n", 124 | "\n", 125 | "# For now, both chains share the same model to spare memory\n", 126 | "answer_chain = setup_llm_chain(llm, chat_config.answer_template)\n", 127 | "sparql_chain = setup_llm_chain(llm, chat_config.sparql_template)\n", 128 | "kg = setup_kg(**sparql_config.dict())" 129 | ] 130 | }, 131 | { 132 | "attachments": {}, 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "First, we need to embed the ontology into the vector database. This will allow us to retrieve semantically similar concepts from the ontology based on the question.\n", 137 | "\n", 138 | "In the example RDF file, the ontology is enclosed in a named graph called `https://example.org/ontology`. " 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 14, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/html": [ 149 | "
14:17:06.998 | INFO    | prefect.engine - Created flow run 'electric-terrier' for flow 'chroma-build-flow'\n",
150 |        "
\n" 151 | ], 152 | "text/plain": [ 153 | "14:17:06.998 | \u001b[36mINFO\u001b[0m | prefect.engine - Created flow run\u001b[35m 'electric-terrier'\u001b[0m for flow\u001b[1;35m 'chroma-build-flow'\u001b[0m\n" 154 | ] 155 | }, 156 | "metadata": {}, 157 | "output_type": "display_data" 158 | }, 159 | { 160 | "data": { 161 | "text/html": [ 162 | "
14:17:07.070 | INFO    | Flow run 'electric-terrier' - INFO Started\n",
163 |        "
\n" 164 | ], 165 | "text/plain": [ 166 | "14:17:07.070 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - \u001b[36mINFO\u001b[0m Started\n" 167 | ] 168 | }, 169 | "metadata": {}, 170 | "output_type": "display_data" 171 | }, 172 | { 173 | "data": { 174 | "text/html": [ 175 | "
14:17:07.141 | INFO    | Flow run 'electric-terrier' - Created task run 'init_chromadb-0' for task 'init_chromadb'\n",
176 |        "
\n" 177 | ], 178 | "text/plain": [ 179 | "14:17:07.141 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Created task run 'init_chromadb-0' for task 'init_chromadb'\n" 180 | ] 181 | }, 182 | "metadata": {}, 183 | "output_type": "display_data" 184 | }, 185 | { 186 | "data": { 187 | "text/html": [ 188 | "
14:17:07.145 | INFO    | Flow run 'electric-terrier' - Executing 'init_chromadb-0' immediately...\n",
189 |        "
\n" 190 | ], 191 | "text/plain": [ 192 | "14:17:07.145 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Executing 'init_chromadb-0' immediately...\n" 193 | ] 194 | }, 195 | "metadata": {}, 196 | "output_type": "display_data" 197 | }, 198 | { 199 | "name": "stderr", 200 | "output_type": "stream", 201 | "text": [ 202 | "/home/stefan/kg-llm-interface/.conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 203 | " from .autonotebook import tqdm as notebook_tqdm\n" 204 | ] 205 | }, 206 | { 207 | "data": { 208 | "text/html": [ 209 | "
14:17:13.949 | INFO    | Task run 'init_chromadb-0' - Finished in state Completed()\n",
210 |        "
\n" 211 | ], 212 | "text/plain": [ 213 | "14:17:13.949 | \u001b[36mINFO\u001b[0m | Task run 'init_chromadb-0' - Finished in state \u001b[32mCompleted\u001b[0m()\n" 214 | ] 215 | }, 216 | "metadata": {}, 217 | "output_type": "display_data" 218 | }, 219 | { 220 | "data": { 221 | "text/html": [ 222 | "
14:17:14.028 | INFO    | Flow run 'electric-terrier' - Created task run 'sparql_to_documents-0' for task 'sparql_to_documents'\n",
223 |        "
\n" 224 | ], 225 | "text/plain": [ 226 | "14:17:14.028 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Created task run 'sparql_to_documents-0' for task 'sparql_to_documents'\n" 227 | ] 228 | }, 229 | "metadata": {}, 230 | "output_type": "display_data" 231 | }, 232 | { 233 | "data": { 234 | "text/html": [ 235 | "
14:17:14.033 | INFO    | Flow run 'electric-terrier' - Executing 'sparql_to_documents-0' immediately...\n",
236 |        "
\n" 237 | ], 238 | "text/plain": [ 239 | "14:17:14.033 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Executing 'sparql_to_documents-0' immediately...\n" 240 | ] 241 | }, 242 | "metadata": {}, 243 | "output_type": "display_data" 244 | }, 245 | { 246 | "data": { 247 | "text/html": [ 248 | "
14:17:14.575 | INFO    | Task run 'sparql_to_documents-0' - Finished in state Completed()\n",
249 |        "
\n" 250 | ], 251 | "text/plain": [ 252 | "14:17:14.575 | \u001b[36mINFO\u001b[0m | Task run 'sparql_to_documents-0' - Finished in state \u001b[32mCompleted\u001b[0m()\n" 253 | ] 254 | }, 255 | "metadata": {}, 256 | "output_type": "display_data" 257 | }, 258 | { 259 | "data": { 260 | "text/html": [ 261 | "
14:17:14.580 | INFO    | Flow run 'electric-terrier' - Indexing by batches of 50 items\n",
262 |        "
\n" 263 | ], 264 | "text/plain": [ 265 | "14:17:14.580 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Indexing by batches of 50 items\n" 266 | ] 267 | }, 268 | "metadata": {}, 269 | "output_type": "display_data" 270 | }, 271 | { 272 | "data": { 273 | "text/html": [ 274 | "
14:17:14.646 | INFO    | Flow run 'electric-terrier' - Created task run 'index_batch-0' for task 'index_batch'\n",
275 |        "
\n" 276 | ], 277 | "text/plain": [ 278 | "14:17:14.646 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Created task run 'index_batch-0' for task 'index_batch'\n" 279 | ] 280 | }, 281 | "metadata": {}, 282 | "output_type": "display_data" 283 | }, 284 | { 285 | "data": { 286 | "text/html": [ 287 | "
14:17:14.649 | INFO    | Flow run 'electric-terrier' - Executing 'index_batch-0' immediately...\n",
288 |        "
\n" 289 | ], 290 | "text/plain": [ 291 | "14:17:14.649 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Executing 'index_batch-0' immediately...\n" 292 | ] 293 | }, 294 | "metadata": {}, 295 | "output_type": "display_data" 296 | }, 297 | { 298 | "data": { 299 | "text/html": [ 300 | "
14:17:17.929 | INFO    | Task run 'index_batch-0' - Finished in state Completed()\n",
301 |        "
\n" 302 | ], 303 | "text/plain": [ 304 | "14:17:17.929 | \u001b[36mINFO\u001b[0m | Task run 'index_batch-0' - Finished in state \u001b[32mCompleted\u001b[0m()\n" 305 | ] 306 | }, 307 | "metadata": {}, 308 | "output_type": "display_data" 309 | }, 310 | { 311 | "data": { 312 | "text/html": [ 313 | "
14:17:17.936 | INFO    | Flow run 'electric-terrier' - Indexed 13 items.\n",
314 |        "
\n" 315 | ], 316 | "text/plain": [ 317 | "14:17:17.936 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Indexed 13 items.\n" 318 | ] 319 | }, 320 | "metadata": {}, 321 | "output_type": "display_data" 322 | }, 323 | { 324 | "data": { 325 | "text/html": [ 326 | "
14:17:17.998 | INFO    | Flow run 'electric-terrier' - Finished in state Completed('All states completed.')\n",
327 |        "
\n" 328 | ], 329 | "text/plain": [ 330 | "14:17:17.998 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Finished in state \u001b[32mCompleted\u001b[0m('All states completed.')\n" 331 | ] 332 | }, 333 | "metadata": {}, 334 | "output_type": "display_data" 335 | }, 336 | { 337 | "data": { 338 | "text/plain": [ 339 | "[Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `tuple`')),\n", 340 | " Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `list`')),\n", 341 | " Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `NoneType`'))]" 342 | ] 343 | }, 344 | "execution_count": 14, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "from aikg.flows.chroma_build import chroma_build_flow\n", 351 | "chroma_build_flow(chroma_config, sparql_config, graph=\"https://example.org/ontology\")" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 21, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "\n", 361 | "from aikg.utils.chroma import setup_client, setup_collection\n", 362 | "client = setup_client(\n", 363 | " chroma_config.host,\n", 364 | " chroma_config.port,\n", 365 | " chroma_config.persist_directory,\n", 366 | ")\n", 367 | "collection = setup_collection(\n", 368 | " client,\n", 369 | " chroma_config.collection_name,\n", 370 | " chroma_config.embedding_model,\n", 371 | ")\n" 372 | ] 373 | }, 374 | { 375 | "attachments": {}, 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "The Chroma collection now contains the ontology concepts as vectors. We can retrieve the most similar concepts to a given question.\n", 380 | "Notice that the property \"programmingLanguage\" is retrieved, even though the question does not contain the word \"programming\"." 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 22, 386 | "metadata": {}, 387 | "outputs": [ 388 | { 389 | "name": "stdout", 390 | "output_type": "stream", 391 | "text": [ 392 | " \"programming language\" .\n", 393 | " .\n", 394 | " \"The computer programming language.\" .\n", 395 | " .\n", 396 | " .\n", 397 | "\n", 398 | " \"Computer programming source code. 
Example: Full (compile ready) solutions, code snippet samples, scripts, templates.\" .\n", 399 | " .\n", 400 | " \"SoftwareSourceCode\" .\n", 401 | " .\n", 402 | "\n", 403 | " .\n", 404 | " .\n", 405 | " .\n", 406 | " \"Link to the repository where the un-compiled, human readable code and related code is located (SVN, GitHub, CodePlex).\" .\n", 407 | " \"codeRepository\" .\n", 408 | "\n", 409 | " \"license\" .\n", 410 | " .\n", 411 | " .\n", 412 | " .\n", 413 | " \"A license document that applies to this content, typically indicated by URL.\" .\n", 414 | " .\n", 415 | "\n", 416 | " \"name\" .\n", 417 | " .\n", 418 | " .\n", 419 | " \"The name of the item.\" .\n", 420 | " .\n", 421 | " .\n", 422 | " .\n", 423 | "\n" 424 | ] 425 | } 426 | ], 427 | "source": [ 428 | "QUESTION = \"What softwares are written in Python?\"\n", 429 | "results = collection.query(query_texts=QUESTION, n_results=5)\n", 430 | "print('\\n'.join([res.get(\"triples\", \"\") for res in results['metadatas'][0]]))\n" 431 | ] 432 | }, 433 | { 434 | "attachments": {}, 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "Then, we can generate the SPARQL query." 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 23, 444 | "metadata": {}, 445 | "outputs": [ 446 | { 447 | "name": "stdout", 448 | "output_type": "stream", 449 | "text": [ 450 | "PREFIX rdf: \n", 451 | "PREFIX rdfs: \n", 452 | "PREFIX xsd: \n", 453 | "\n", 454 | "SELECT DISTINCT ?softwareName\n", 455 | "WHERE {\n", 456 | " ?software rdf:type .\n", 457 | " ?software ?language .\n", 458 | " FILTER regex(str(?language), \"python\", \"i\") .\n", 459 | " ?software ?softwareName .\n", 460 | "}\n" 461 | ] 462 | } 463 | ], 464 | "source": [ 465 | "from aikg.utils.chat import generate_sparql\n", 466 | "query = generate_sparql(QUESTION, collection, sparql_chain)\n", 467 | "print(query)" 468 | ] 469 | }, 470 | { 471 | "attachments": {}, 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "and execute it:" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 24, 481 | "metadata": {}, 482 | "outputs": [ 483 | { 484 | "name": "stdout", 485 | "output_type": "stream", 486 | "text": [ 487 | "[['softwareName'], ['SDSC-ORD/gimie'], ['SDSC-ORD/zarr_linked_data']]\n" 488 | ] 489 | } 490 | ], 491 | "source": [ 492 | "from aikg.utils.rdf import query_kg\n", 493 | "results = query_kg(kg, query)\n", 494 | "print(results)" 495 | ] 496 | }, 497 | { 498 | "attachments": {}, 499 | "cell_type": "markdown", 500 | "metadata": {}, 501 | "source": [ 502 | "We can now generate a human-readable answer from the results of the query:" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 32, 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "text/plain": [ 513 | "'The query returned two softwares written in Python: SDSC-ORD/gimie and SDSC-ORD/zarr_linked_data.'" 514 | ] 515 | }, 516 | "execution_count": 32, 517 | "metadata": {}, 518 | "output_type": "execute_result" 519 | } 520 | ], 521 | "source": [ 522 | "from aikg.utils.chat import generate_answer\n", 523 | "generate_answer(QUESTION, query, results, answer_chain)" 524 | ] 525 | } 526 | ], 527 | "metadata": { 528 | "kernelspec": { 529 | "display_name": "aikg-URVQdnEY-py3.10", 530 | "language": "python", 531 | "name": "python3" 532 | }, 533 | "language_info": { 534 | "codemirror_mode": { 535 | "name": "ipython", 536 | "version": 3 537 | }, 538 | "file_extension": ".py", 539 | "mimetype": "text/x-python", 540 | 
"name": "python", 541 | "nbconvert_exporter": "python", 542 | "pygments_lexer": "ipython3", 543 | "version": "3.11.8" 544 | }, 545 | "orig_nbformat": 4 546 | }, 547 | "nbformat": 4, 548 | "nbformat_minor": 2 549 | } 550 | -------------------------------------------------------------------------------- /aikg/notebooks/sphn_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "## Example use-case\n", 9 | "\n", 10 | "In this notebook, we showcase a simple question answering task. We will use the [SPHN ontology](https://www.biomedit.ch/rdf/sphn-ontology/sphn), along with a small mock dataset which contains information artificial medical data." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "\n", 20 | "SPARQL_TEMPLATE = \"\"\"\n", 21 | "Generate a SPARQL query to answer the input question. A sample of the knowledge graph schema is provided to help construct the query.\n", 22 | "After you generate the sparql, you should display it.\n", 23 | "When generating sparql:\n", 24 | "* never enclose the sparql in back-quotes.\n", 25 | "* always include the prefix declarations.\n", 26 | "* prefer using OPTIONAL when selecting multiple variables.\n", 27 | "* Allow case-insensitive matching of strings.\n", 28 | "\n", 29 | "Use the following format:\n", 30 | "\n", 31 | "Question: the input question for which you must generate a SPARQL query\n", 32 | "Information: the schema information in RDF format. This will help you generate the sparql query with the correct format.\n", 33 | "\n", 34 | "Question: {question_str}\n", 35 | "Information:\n", 36 | "{context_str}\n", 37 | "Answer:\n", 38 | "\"\"\"\n", 39 | "\n", 40 | "ANSWER_TEMPLATE = \"\"\"\n", 41 | "The following describe a user question, associated SPARQL query and the result from executing the query.\n", 42 | "Based on this information, write an answer in simple terms that describes the results.\n", 43 | "When appropriate, use markdown formatting to format the results into a table or bullet points.\n", 44 | "\n", 45 | "Question:\n", 46 | "{question_str}\n", 47 | "Query:\n", 48 | "{query_str}\n", 49 | "Result:\n", 50 | "{result_str}\n", 51 | "Answer:\n", 52 | "\"\"\"" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "We setup a similar configuration as in the nl_sparql notebook, but we have one sparql configuration for the ontology, and one for the instance data, each living in different files." 
60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "from aikg.config import ChatConfig, ChromaConfig, SparqlConfig\n", 69 | "\n", 70 | "chroma_config = ChromaConfig(\n", 71 | " host=\"local\",\n", 72 | " port=8000,\n", 73 | " collection_name=\"test\",\n", 74 | " embedding_model=\"all-MiniLM-L6-v2\",\n", 75 | " persist_directory=\"/tmp/chroma-test/\",\n", 76 | ")\n", 77 | "ontology_config = SparqlConfig(\n", 78 | " endpoint=\"../sphn/sphn_ontology_2023_2.ttl\",\n", 79 | ")\n", 80 | "kg_config = SparqlConfig(\n", 81 | " endpoint=\"../sphn/sphn_mock_data_2023_2.ttl\",\n", 82 | ")\n", 83 | "\n", 84 | "chat_config = ChatConfig(\n", 85 | " model_id=\"lmsys/vicuna-7b-v1.3\",\n", 86 | " max_new_tokens=48,\n", 87 | " max_input_size=2048,\n", 88 | " num_output=256,\n", 89 | " max_chunk_overlap=20,\n", 90 | " answer_template=ANSWER_TEMPLATE,\n", 91 | " sparql_template=SPARQL_TEMPLATE\n", 92 | ")\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 3, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "import os\n", 102 | "os.environ[\"OPENAI_API_KEY\"] = \"sk-...a\"" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 4, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stderr", 112 | "output_type": "stream", 113 | "text": [ 114 | "Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#gYear, Converter=\n", 115 | "Traceback (most recent call last):\n", 116 | " File \"/home/cmatthey/.cache/pypoetry/virtualenvs/aikg-ULDgE_fB-py3.10/lib/python3.10/site-packages/rdflib/term.py\", line 2084, in _castLexicalToPython\n", 117 | " return conv_func(lexical) # type: ignore[arg-type]\n", 118 | " File \"/home/cmatthey/.cache/pypoetry/virtualenvs/aikg-ULDgE_fB-py3.10/lib/python3.10/site-packages/isodate/isodates.py\", line 203, in parse_date\n", 119 | " raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)\n", 120 | "isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: '-1508+14:00'\n", 121 | "Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#gYear, Converter=\n", 122 | "Traceback (most recent call last):\n", 123 | " File \"/home/cmatthey/.cache/pypoetry/virtualenvs/aikg-ULDgE_fB-py3.10/lib/python3.10/site-packages/rdflib/term.py\", line 2084, in _castLexicalToPython\n", 124 | " return conv_func(lexical) # type: ignore[arg-type]\n", 125 | " File \"/home/cmatthey/.cache/pypoetry/virtualenvs/aikg-ULDgE_fB-py3.10/lib/python3.10/site-packages/isodate/isodates.py\", line 203, in parse_date\n", 126 | " raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)\n", 127 | "isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: '-2358+01:14'\n", 128 | "Failed to convert Literal lexical form to value. 
Datatype=http://www.w3.org/2001/XMLSchema#gYear, Converter=\n", 129 | "Traceback (most recent call last):\n", 130 | " File \"/home/cmatthey/.cache/pypoetry/virtualenvs/aikg-ULDgE_fB-py3.10/lib/python3.10/site-packages/rdflib/term.py\", line 2084, in _castLexicalToPython\n", 131 | " return conv_func(lexical) # type: ignore[arg-type]\n", 132 | " File \"/home/cmatthey/.cache/pypoetry/virtualenvs/aikg-ULDgE_fB-py3.10/lib/python3.10/site-packages/isodate/isodates.py\", line 203, in parse_date\n", 133 | " raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)\n", 134 | "isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: '-2221+14:00'\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "\n", 140 | "from aikg.utils.llm import setup_llm_chain\n", 141 | "from aikg.utils.rdf import setup_kg\n", 142 | "\n", 143 | "\n", 144 | "# Use OpenAI API\n", 145 | "from langchain.llms import OpenAI\n", 146 | "llm = OpenAI(model_name=\"text-davinci-003\")\n", 147 | "\n", 148 | "# For now, both chains share the same model to spare memory\n", 149 | "answer_chain = setup_llm_chain(llm, chat_config.answer_template)\n", 150 | "sparql_chain = setup_llm_chain(llm, chat_config.sparql_template)\n", 151 | "kg = setup_kg(**kg_config.dict())\n", 152 | "\n", 153 | "# Embed ontology\n", 154 | "from aikg.flows.chroma_build import chroma_build_flow\n", 155 | "chroma_build_flow(chroma_config, ontology_config)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 5, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stderr", 165 | "output_type": "stream", 166 | "text": [ 167 | "/home/cmatthey/.cache/pypoetry/virtualenvs/aikg-ULDgE_fB-py3.10/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 168 | " from .autonotebook import tqdm as notebook_tqdm\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "\n", 174 | "from aikg.utils.chroma import setup_client, setup_collection\n", 175 | "client = setup_client(\n", 176 | " chroma_config.host,\n", 177 | " chroma_config.port,\n", 178 | " chroma_config.persist_directory,\n", 179 | ")\n", 180 | "collection = setup_collection(\n", 181 | " client,\n", 182 | " chroma_config.collection_name,\n", 183 | " chroma_config.embedding_model,\n", 184 | ")\n" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 45, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "QUESTION = \"Please give me the number of healthcare encounters recorded per year.\"" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 46, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "\n", 206 | "PREFIX ns1: \n", 207 | "PREFIX ns2: \n", 208 | "PREFIX owl: \n", 209 | "PREFIX rdf: \n", 210 | "PREFIX rdfs: \n", 211 | "PREFIX xsd: \n", 212 | "\n", 213 | "SELECT (COUNT(*) AS ?encounters) (YEAR(?startDateTime) AS ?year)\n", 214 | "WHERE {\n", 215 | " ?encounter a ns2:HealthcareEncounter ;\n", 216 | " ns2:hasStartDateTime ?startDateTime .\n", 217 | "}\n", 218 | "GROUP BY ?year\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "from aikg.utils.chat import generate_sparql\n", 224 | "query = generate_sparql(QUESTION, collection, sparql_chain)\n", 225 | "print(query)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 47, 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "name": "stdout", 235 | "output_type": "stream", 236 | "text": [ 237 | "[(rdflib.term.Literal('2', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')), rdflib.term.Literal('2009', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))]\n" 238 | ] 239 | } 240 | ], 241 | "source": [ 242 | "from aikg.utils.rdf import query_kg\n", 243 | "results = query_kg(kg, query)\n", 244 | "print(results)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 48, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "In 2009 there were 2 healthcare encounters recorded.\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "from aikg.utils.chat import generate_answer\n", 262 | "print(generate_answer(QUESTION, query, results, answer_chain))" 263 | ] 264 | } 265 | ], 266 | "metadata": { 267 | "kernelspec": { 268 | "display_name": "aikg-URVQdnEY-py3.10", 269 | "language": "python", 270 | "name": "python3" 271 | }, 272 | "language_info": { 273 | "codemirror_mode": { 274 | "name": "ipython", 275 | "version": 3 276 | }, 277 | "file_extension": ".py", 278 | "mimetype": "text/x-python", 279 | "name": "python", 280 | "nbconvert_exporter": "python", 281 | "pygments_lexer": "ipython3", 282 | "version": "3.10.6" 283 | }, 284 | "orig_nbformat": 4 285 | }, 286 | "nbformat": 4, 287 | "nbformat_minor": 2 288 | } 289 | -------------------------------------------------------------------------------- /aikg/server.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische 
Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | """This is the chat server. It receives JSON messages with a question, 19 | fetches context for that question in a vector store and injects them into a prompt. 20 | It then sends the prompt to a LLM and returns the response to the client. 21 | """ 22 | from datetime import datetime 23 | import logging 24 | import os 25 | import sys 26 | 27 | from dotenv import load_dotenv 28 | from fastapi import FastAPI 29 | from langchain.chat_models import ChatOpenAI 30 | from pathlib import Path 31 | 32 | from aikg.config import ChatConfig, ChromaConfig, SparqlConfig 33 | from aikg.config.common import parse_yaml_config 34 | from aikg.models import Conversation, Message 35 | from aikg.utils.chat import generate_answer, generate_examples, generate_sparql 36 | from aikg.utils.llm import setup_llm_chain 37 | from aikg.utils.chroma import setup_collection, setup_client 38 | from aikg.utils.rdf import setup_kg, query_kg 39 | 40 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) 41 | 42 | load_dotenv() 43 | chroma_config = ChromaConfig() 44 | sparql_config = SparqlConfig() 45 | if os.environ.get("CHAT_CONFIG"): 46 | chat_config = parse_yaml_config(Path(os.environ["CHAT_CONFIG"]), ChatConfig) 47 | else: 48 | chat_config = ChatConfig() 49 | 50 | 51 | client = setup_client( 52 | chroma_config.host, 53 | chroma_config.port, 54 | chroma_config.persist_directory, 55 | ) 56 | collection = setup_collection( 57 | client, 58 | chroma_config.collection_name, 59 | chroma_config.embedding_model, 60 | ) 61 | 62 | llm = ChatOpenAI( 63 | model_name=chat_config.model, 64 | openai_api_key=chat_config.openai_api_key, 65 | openai_api_base=chat_config.openai_api_base, 66 | ) 67 | 68 | answer_chain = setup_llm_chain(llm, chat_config.answer_template) 69 | sparql_chain = setup_llm_chain(llm, chat_config.sparql_template) 70 | kg = setup_kg(**sparql_config.dict()) 71 | app = FastAPI() 72 | 73 | 74 | @app.get("/") 75 | def index(): 76 | return { 77 | "title": "Hello, welcome to the knowledge graph chatbot!", 78 | "description": "This is a simple chatbot that uses a knowledge graph to answer questions.", 79 | "usage": "Ask a single question using /ask?question='...', or only generate the query using /sparql?question='...'.", 80 | } 81 | 82 | 83 | @app.get("/test/") 84 | async def test() -> Message: 85 | return Message(text="Hello, world!", sender="AI", time=datetime.now()) 86 | 87 | 88 | @app.get("/ask/") 89 | async def ask(question: str) -> Message: 90 | """Generate sparql query from question 91 | and execute query on kg and return an answer based on results.""" 92 | ... 
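    # The endpoint chains three helpers: generate_sparql embeds the question and
    # retrieves the closest schema triples from the Chroma collection before asking
    # the LLM chain to write a SPARQL query; query_kg executes that query against
    # the configured knowledge graph; generate_answer turns the raw rows into a
    # short human-readable summary.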
93 | query = generate_sparql(question, collection, sparql_chain, limit=15) 94 | results = query_kg(kg, query) 95 | answer = generate_answer(question, query, results, answer_chain) 96 | return Message(text=answer, sender="AI", time=datetime.now()) 97 | 98 | 99 | @app.get("/sparql/") 100 | async def sparql(question: str) -> Message: 101 | """Generate and return sparql query from question.""" 102 | query = generate_sparql(question, collection, sparql_chain) 103 | return Message(text=query, sender="AI", time=datetime.now()) 104 | -------------------------------------------------------------------------------- /aikg/utils/chat.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | """Utilities to help processing chatbot prompts or answers.""" 19 | from typing import Any, Iterable 20 | 21 | from chromadb.api import Collection 22 | from rdflib import Graph 23 | from langchain import LLMChain 24 | 25 | 26 | def keep_first_line(text: str) -> str: 27 | r"""Truncate a string to the first non-empty line. 28 | 29 | Examples 30 | -------- 31 | >>> keep_first_line("\nFirst line.\nSecond line.") 32 | 'First line.' 33 | """ 34 | return text.lstrip("\n").split("\n")[0].strip(" ") 35 | 36 | 37 | def drop_if_keyword(text: str, keyword: str = "Not found.") -> str: 38 | """If input keyword occurs in text, replace it with the keyword. 39 | 40 | Examples 41 | -------- 42 | >>> drop_if_keyword("Not found. Some made up answer.", keyword="Not found.") 43 | 'Not found.' 
44 | """ 45 | if keyword in text: 46 | return keyword 47 | return text 48 | 49 | 50 | def post_process_answer(answer: str) -> str: 51 | """Post-process an answer by keeping only the first line and dropping 52 | it if it contains the keyword 'Not found.'.""" 53 | text = keep_first_line(answer) 54 | text = drop_if_keyword(text) 55 | return text 56 | 57 | 58 | def generate_sparql( 59 | question: str, 60 | collection: Collection, 61 | llm_chain: LLMChain, 62 | examples: str = "", 63 | limit: int = 5, 64 | ) -> str: 65 | """Retrieve k-nearest documents from the vector store and synthesize 66 | SPARQL query.""" 67 | 68 | # Retrieve documents and triples from top k subjects 69 | results = collection.query(query_texts=question, n_results=limit) 70 | # Extract triples and concatenate as a ntriples string 71 | triples = "\n".join([res.get("triples", "") for res in results["metadatas"][0]]) 72 | # Convert to turtle for better readability and fewer tokens 73 | triples = Graph().parse(data=triples).serialize(format="turtle") 74 | query = llm_chain.run( 75 | question_str=question, context_str=triples, examples_str=examples 76 | ) 77 | return query 78 | 79 | 80 | def generate_examples( 81 | question: str, 82 | collection: Collection, 83 | limit: int = 5, 84 | ) -> str: 85 | """Retrieve k-nearest questions from the examples in the vector store and return them 86 | together with their correponding query.""" 87 | 88 | # Retrieve documents and triples from top k subjects 89 | examples = collection.query(query_texts=question, n_results=limit) 90 | # Extract relevant information from dict 91 | example_docs = examples["documents"][0] 92 | example_meta = examples["metadatas"][0] 93 | # 94 | example_prompt = "Examples: \n\n" 95 | for doc, meta in zip(example_docs, example_meta): 96 | example_prompt += f""" 97 | Question: 98 | {doc} 99 | Query: 100 | {meta['query']} 101 | """ 102 | return example_prompt 103 | 104 | 105 | def generate_answer( 106 | question: str, 107 | query: str, 108 | results: Iterable[Any], 109 | llm_chain: LLMChain, 110 | ) -> str: 111 | """ 112 | Given a question, associated SPARQL query and execution result, 113 | use a LLM to generate a natural language answer describing the results. 114 | """ 115 | # Extract triples and concatenate as a ntriples string 116 | fmt_results = ["\n".join(map(str, results))] 117 | answer = llm_chain.run( 118 | query_str=query, question_str=question, result_str=fmt_results 119 | ) 120 | return answer 121 | -------------------------------------------------------------------------------- /aikg/utils/chroma.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | import chromadb 19 | from chromadb.api import ClientAPI, Collection 20 | 21 | 22 | def setup_client(host: str, port: int, persist_directory: str = ".chroma") -> ClientAPI: 23 | """Prepare chromadb client. If host is 'local', chromadb will run in client-only mode.""" 24 | if host == "local": 25 | chroma_client = chromadb.PersistentClient(path=persist_directory) 26 | else: 27 | chroma_client = chromadb.HttpClient(host=host, port=str(port)) 28 | return chroma_client 29 | 30 | 31 | def setup_collection( 32 | client: ClientAPI, 33 | collection_name: str, 34 | embedding_model: str, 35 | ) -> Collection: 36 | """Setup the connection to ChromaDB collection.""" 37 | 38 | from chromadb.utils import embedding_functions 39 | 40 | embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction( 41 | model_name=embedding_model 42 | ) 43 | collection = client.get_or_create_collection( 44 | collection_name, embedding_function=embedding_function 45 | ) 46 | return collection 47 | -------------------------------------------------------------------------------- /aikg/utils/io.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import requests 19 | from pathlib import Path 20 | from typing import TextIO 21 | from langchain.schema import Document 22 | from tqdm import tqdm 23 | 24 | 25 | def download_file(url: str, output_path: str | Path): 26 | # send a GET request to the URL to download the file. Stream since it's large 27 | response = requests.get(url, stream=True) 28 | 29 | # open the file in binary mode and write the contents of the response to it in chunks 30 | # This is a large file, so be prepared to wait. 31 | with open(output_path, "wb") as f: 32 | for chunk in tqdm(response.iter_content(chunk_size=8192)): 33 | if chunk: 34 | f.write(chunk) 35 | 36 | 37 | def parse_sparql_example(example: TextIO) -> Document: 38 | """ 39 | Parse a text stream as input with first line being a question (starting with #) 40 | and the remaining lines being a (SPARQL) query. 
We reformat this content into a document 41 | where the page content is the question and the query is attached as metadata 42 | """ 43 | # Create temp variable to process text stream 44 | example_temp = [] 45 | example_temp.append(example.read()) 46 | # Splitting the file content into lines 47 | lines = example_temp[0].split("\n") 48 | # Extracting the question (removing '#' from the first line) 49 | question = lines[0].strip()[1:] 50 | # Extracting the SPARQL query from the remaining lines 51 | sparql_query = "\n".join(lines[1:]) 52 | # Create example document for the output 53 | example_doc = Document(page_content=question, metadata={"query": sparql_query}) 54 | return example_doc 55 | -------------------------------------------------------------------------------- /aikg/utils/llm.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import re 19 | 20 | from langchain import LLMChain, PromptTemplate 21 | from langchain.llms.base import LLM 22 | 23 | 24 | def setup_llm_chain(llm: LLM, prompt_template: str) -> LLMChain: 25 | """Prepare the prompt injection and text generation system.""" 26 | # Auto-detecting prompt variables surrounded by single curly braces 27 | variables = re.findall(r"[^{]{([^} \n]+)}[^}]", prompt_template) 28 | prompt = PromptTemplate( 29 | template=prompt_template, 30 | input_variables=variables, 31 | ) 32 | return LLMChain(prompt=prompt, llm=llm) 33 | -------------------------------------------------------------------------------- /aikg/utils/rdf.py: -------------------------------------------------------------------------------- 1 | # kg-llm-interface 2 | # Copyright 2023 - Swiss Data Science Center (SDSC) 3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and 4 | # Eidgenössische Technische Hochschule Zürich (ETHZ). 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
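# Usage sketch (illustrative, based on the notebooks; the SELECT query is only an
# example): setup_kg returns an rdflib graph for a local RDF file, or a SPARQLWrapper
# client when given an endpoint URL; query_kg returns the results as a list of rows.
#
#     from aikg.utils.rdf import setup_kg, query_kg
#
#     kg = setup_kg("../data/test_data.trig")
#     rows = query_kg(kg, "SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 5")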
17 | 18 | from itertools import groupby 19 | from pathlib import Path 20 | from typing import Any, Dict, Iterable, Iterator, List, Optional 21 | 22 | from langchain.schema import Document 23 | from rdflib import ConjunctiveGraph, Graph 24 | from rdflib.exceptions import ParserError 25 | from SPARQLWrapper import SPARQLWrapper, CSV 26 | from urllib.parse import urlparse 27 | 28 | # Retrieve triples of human readable labels/values from a SPARQL endpoint. 29 | TRIPLE_LABEL_QUERY = """ 30 | PREFIX rdfs: 31 | 32 | SELECT ?s ?p ?o ?sLab ?pLab ?oClean 33 | WHERE 34 | {{ 35 | ?s ?p ?o . 36 | ?s rdfs:label ?sLab . 37 | ?p rdfs:label ?pLab . 38 | OPTIONAL {{ 39 | ?o rdfs:label ?oLab . 40 | FILTER(LANG(?oLab) = "{lang}") 41 | }} 42 | BIND(COALESCE(?oLab, ?o) AS ?oLabOrUri) 43 | BIND( 44 | IF (isLiteral(?o), ?o, STR(?oLabOrUri)) 45 | AS ?oLabOrVal 46 | ) 47 | FILTER(LANG(?sLab) = "{lang}") 48 | FILTER(LANG(?pLab) = "{lang}") 49 | FILTER(LANG(?oLabOrVal) = "{lang}" || LANG(?oLabOrVal) = "") 50 | BIND (REPLACE(STR(?oLabOrVal), "^.*[#/:]([^/:#]*)$", "$1") as ?oClean) 51 | {graph_mask} 52 | }} 53 | """ 54 | 55 | # Retrieve each subject and its annotations 56 | SUBJECT_DOC_QUERY = """ 57 | PREFIX rdfs: 58 | PREFIX skos: 59 | PREFIX sh: 60 | PREFIX schema: 61 | SELECT DISTINCT ?s (SAMPLE(?sLab) as ?sLabel) ?sCom 62 | WHERE 63 | {{ 64 | VALUES ?labelProp {{skos:prefLabel rdfs:label sh:name schema:name}} 65 | VALUES ?defProp {{rdfs:comment skos:definition sh:description schema:description }} 66 | ?s ?labelProp ?sLab . 67 | OPTIONAL {{ 68 | ?s ?defProp ?sCom . 69 | }} 70 | FILTER(LANG(?sLab) = "{lang}" || LANG(?sLab) = "") 71 | FILTER(LANG(?sCom) = "{lang}" || LANG(?sCom) = "") 72 | {graph_mask} 73 | }} 74 | GROUP BY ?s ?sCom 75 | """ 76 | 77 | 78 | def is_uri(uri: str): 79 | """Checks if input is a valid URI.""" 80 | 81 | try: 82 | result = urlparse(uri) 83 | return all([result.scheme, result.netloc]) 84 | except AttributeError: 85 | return False 86 | 87 | 88 | def make_graph_mask(graph: Optional[str] = None) -> str: 89 | if graph: 90 | return f"FILTER EXISTS {{ GRAPH <{graph}> {{ ?s ?p ?o }} }}" 91 | else: 92 | return "" 93 | 94 | 95 | def setup_kg( 96 | endpoint: str, user: Optional[str] = None, password: Optional[str] = None 97 | ) -> Graph | SPARQLWrapper: 98 | """Try to connect to SPARQL endpoint. If not a URL, attempt 99 | to parse RDF file with rdflib.""" 100 | 101 | if is_uri(endpoint): 102 | kg = SPARQLWrapper(endpoint) 103 | kg.setReturnFormat(CSV) 104 | if user and password: 105 | kg.setCredentials(user, password) 106 | else: 107 | kg = ConjunctiveGraph() 108 | kg.parse(endpoint) 109 | return kg 110 | 111 | 112 | def split_documents_from_endpoint( 113 | kg: Graph | SPARQLWrapper, 114 | graph: Optional[str] = None, 115 | ) -> Iterator[Document]: 116 | """Load subject-based documents from a SPARQL endpoint. 117 | 118 | Parameters 119 | ---------- 120 | endpoint: 121 | URL of the SPARQL endpoint. 122 | user: 123 | Username to use for authentication. 124 | password: 125 | Password to use for authentication. 126 | graph: 127 | URI of named graph to load RDF data from. 128 | If not specified, all subjects are used. 
129 | """ 130 | 131 | graph_mask = make_graph_mask(graph) 132 | 133 | # Load the query results 134 | # Query results contain 6 columns: 135 | # subject, predicate, object, subject label, predicate label, object label 136 | results = query_kg(kg, TRIPLE_LABEL_QUERY.format(lang="en", graph_mask=graph_mask)) 137 | # skip header if present 138 | if not is_uri(results[0][0]): 139 | results = results[1:] 140 | # Exclude empty / incomplete results (e.g. missing labels) 141 | results = filter(lambda x: len(list(x)) == 6, results) 142 | results = sorted(results, key=lambda x: x[0])[1:] 143 | # Yield triples and text by subject 144 | for k, g in groupby(results, lambda x: x[0]): 145 | # Original triples about subject k 146 | data = list(g) 147 | triples = "\n".join([f"<{s}> <{p}> <{o}>" for s, p, o, _, _, _ in data]) 148 | # Human-readable "triples" about subject k 149 | doc = "\n".join([" ".join(elem[3:]) for elem in data]) 150 | yield Document(page_content=doc, metadata={"subject": k, "triples": triples}) 151 | 152 | 153 | def get_subjects_docs( 154 | kg: Graph | SPARQLWrapper, graph: Optional[str] = None 155 | ) -> List[Document]: 156 | """Given an RDF graph, iterate over subjects, extract human-readable 157 | RDFS annotations. For each subject, retrieve a "text document" with 158 | original triples attached as metadata.""" 159 | 160 | results = query_kg( 161 | kg, SUBJECT_DOC_QUERY.format(lang="en", graph_mask=make_graph_mask(graph)) 162 | ) 163 | docs = [] 164 | # skip header if present 165 | if not is_uri(results[0][0]): 166 | results = results[1:] 167 | 168 | for sub, label, comment in results: 169 | text = f""" 170 | {label} 171 | {comment or ''} 172 | """ 173 | triples = query_kg(kg, f"DESCRIBE <{sub}>") 174 | 175 | g = Graph() 176 | # SPARQLWrapper returns a ntriple string, rdflib a list of triples 177 | try: 178 | g.parse(data=triples[0][0], format="nt") 179 | except (RuntimeError, ParserError): 180 | for triple in triples: 181 | g.add(triple) 182 | meta = {"triples": g.serialize(format="nt")} 183 | docs.append(Document(page_content=text, metadata=meta)) 184 | return docs 185 | 186 | 187 | def query_kg(kg: Graph | SPARQLWrapper, query: str) -> List[List[Any]]: 188 | """Query a knowledge graph, either an rdflib Graph or a SPARQLWrapper. 
189 | Results are returned as a list of lists representing a table.""" 190 | query2fmt = {"DESCRIBE": "nt", "SELECT": "csv", "CONSTRUCT": "nt"} 191 | if isinstance(kg, Graph): 192 | resp = kg.query(query) 193 | fmt = query2fmt[resp.type] 194 | raw_results = resp.serialize(format=fmt) 195 | 196 | elif isinstance(kg, SPARQLWrapper): 197 | kg.setQuery(query) 198 | fmt = query2fmt[kg.queryType] 199 | kg.setReturnFormat(fmt) 200 | raw_results = kg.query().convert() 201 | else: 202 | raise ValueError(f"Invalid type for kg: {type(kg)}") 203 | if fmt == "csv": 204 | import csv 205 | 206 | lines = raw_results.decode("utf-8").splitlines() 207 | return [row for row in csv.reader(lines, quotechar='"', delimiter=",") if row] 208 | else: 209 | return [[raw_results]] 210 | 211 | return results 212 | -------------------------------------------------------------------------------- /data/models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdsc-ordes/kg-llm-interface/dd5f967d2dfbd2fb718450f54aaa4b69635c2642/data/models/.gitkeep -------------------------------------------------------------------------------- /data/test_data.trig: -------------------------------------------------------------------------------- 1 | @prefix ex: . 2 | @prefix rdf: . 3 | @prefix rdfs: . 4 | @prefix schema1: . 5 | @prefix xsd: . 6 | 7 | ex:ontology { 8 | schema1:Organization a rdfs:Class ; 9 | rdfs:label "Organization" ; 10 | rdfs:comment "An organization such as a school, NGO, corporation, club, etc." . 11 | 12 | schema1:Person a rdfs:Class ; 13 | rdfs:label "Person" ; 14 | rdfs:comment "A person (alive, dead, undead, or fictional)." . 15 | 16 | schema1:SoftwareSourceCode a rdfs:Class ; 17 | rdfs:label "SoftwareSourceCode" ; 18 | rdfs:comment "Computer programming source code. Example: Full (compile ready) solutions, code snippet samples, scripts, templates." ; 19 | rdfs:subClassOf schema1:CreativeWork . 20 | 21 | schema1:affiliation a rdf:Property ; 22 | rdfs:label "affiliation" ; 23 | rdfs:comment "An organization that this person is affiliated with. For example, a school/university, a club, or a team." ; 24 | rdfs:domain schema1:Person ; 25 | rdfs:range schema1:Organization . 26 | 27 | schema1:author a rdf:Property ; 28 | rdfs:label "author" ; 29 | rdfs:comment "The author of this content or rating." ; 30 | rdfs:domain schema1:SoftwareSourceCode ; 31 | rdfs:range schema1:Person . 32 | 33 | schema1:datePublished a rdf:Property ; 34 | rdfs:label "datePublished" ; 35 | rdfs:comment "Date of first broadcast/publication." ; 36 | rdfs:domain schema1:SoftwareSourceCode ; 37 | rdfs:range xsd:date . 38 | 39 | schema1:description a rdf:Property ; 40 | rdfs:label "description" ; 41 | rdfs:comment "A description of the item." ; 42 | rdfs:domain schema1:SoftwareSourceCode ; 43 | rdfs:range xsd:string . 44 | 45 | schema1:keywords a rdf:Property ; 46 | rdfs:label "keywords" ; 47 | rdfs:comment "Keywords or tags used to describe some item. Multiple textual entries in a keywords list are typically delimited by commas, or by repeating the property." ; 48 | rdfs:domain schema1:SoftwareSourceCode ; 49 | rdfs:range xsd:string . 50 | 51 | schema1:license a rdf:Property ; 52 | rdfs:label "license" ; 53 | rdfs:comment "A license document that applies to this content, typically indicated by URL." ; 54 | rdfs:domain schema1:SoftwareSourceCode ; 55 | rdfs:range xsd:anyUri, 56 | xsd:string . 
57 | 58 | schema1:name a rdf:Property ; 59 | rdfs:label "name" ; 60 | rdfs:comment "The name of the item." ; 61 | rdfs:domain schema1:Organization, 62 | schema1:Person, 63 | schema1:SoftwareSourceCode ; 64 | rdfs:range xsd:string . 65 | 66 | schema1:programmingLanguage a rdf:Property ; 67 | rdfs:label "programming language" ; 68 | rdfs:comment "The computer programming language." ; 69 | rdfs:domain schema1:SoftwareSourceCode ; 70 | rdfs:range xsd:string . 71 | 72 | schema1:codeRepository a rdf:Property ; 73 | rdfs:label "codeRepository" ; 74 | rdfs:comment "Link to the repository where the un-compiled, human readable code and related code is located (SVN, GitHub, CodePlex)." ; 75 | rdfs:domain schema1:SoftwareSourceCode ; 76 | rdfs:range xsd:anyUri . 77 | } 78 | 79 | 80 | a schema1:SoftwareSourceCode ; 81 | schema1:codeRepository ; 82 | schema1:author ; 83 | schema1:name "SDSC-ORD/gimie" ; 84 | schema1:contributor , 85 | , 86 | , 87 | , 88 | , 89 | , 90 | ; 91 | schema1:dateCreated "2022-12-07" ; 92 | schema1:dateModified "2023-06-07" ; 93 | schema1:description "Extract linked metadata from repositories" ; 94 | schema1:downloadUrl "https://github.com/SDSC-ORD/gimie/archive/refs/tags/0.5.0.tar.gz" ; 95 | schema1:keywords "fair-data", 96 | "git", 97 | "linked-open-data", 98 | "metadata-extraction", 99 | "python", 100 | "scientific-software" ; 101 | schema1:license ; 102 | schema1:programmingLanguage "Python" ; 103 | schema1:version "0.5.0" . 104 | 105 | a schema1:Organization ; 106 | schema1:description "Cross-disciplinary community around research data, voluntary EPFL's researchers and staff with keen interest in research data." ; 107 | schema1:legalName "EPFL Data Champions" ; 108 | schema1:logo ; 109 | schema1:name "EPFL-Data-Champions" . 110 | 111 | a schema1:Organization ; 112 | schema1:description "" ; 113 | schema1:legalName "biocypher" ; 114 | schema1:logo ; 115 | schema1:name "biocypher" . 116 | 117 | a schema1:Person ; 118 | schema1:affiliation , 119 | , 120 | , 121 | ; 122 | schema1:identifier "cmdoret" ; 123 | schema1:name "Cyril Matthey-Doret" . 124 | 125 | a schema1:Organization ; 126 | schema1:description "" ; 127 | schema1:legalName "Romain Koszul Laboratory" ; 128 | schema1:logo ; 129 | schema1:name "koszullab" . 130 | 131 | a schema1:Person ; 132 | schema1:affiliation , 133 | ; 134 | schema1:identifier "martinfontanet" . 135 | 136 | a schema1:Person ; 137 | schema1:affiliation ; 138 | schema1:identifier "rmfranken" . 139 | 140 | a schema1:Person ; 141 | schema1:affiliation ; 142 | schema1:identifier "sabinem" ; 143 | schema1:name "Sabine Maennel" . 144 | 145 | a schema1:Person ; 146 | schema1:affiliation , 147 | ; 148 | schema1:identifier "sabrinaossey" ; 149 | schema1:name "sabrinaossey" . 150 | 151 | a schema1:Person ; 152 | schema1:affiliation , 153 | ; 154 | schema1:identifier "supermaxiste" . 155 | 156 | a schema1:Person ; 157 | schema1:affiliation ; 158 | schema1:identifier "vancauwe" ; 159 | schema1:name "Laure Vancau" . 160 | 161 | a schema1:Organization ; 162 | schema1:description "An ETH Domain initiative for accelerating the adoption of data science" ; 163 | schema1:legalName "Swiss Data Science Center" ; 164 | schema1:logo ; 165 | schema1:name "SwissDataScienceCenter" . 166 | 167 | a schema1:Organization ; 168 | schema1:description "Open Research Data team at the Swiss Data Science Center." ; 169 | schema1:legalName "Swiss Data Science Center - ORD" ; 170 | schema1:logo ; 171 | schema1:name "SDSC-ORD" . 
172 | 173 | 174 | 175 | a schema1:SoftwareSourceCode ; 176 | schema1:codeRepository ; 177 | schema1:author ; 178 | schema1:name "SDSC-ORD/kg-llm-interface" ; 179 | schema1:contributor ; 180 | schema1:dateCreated "2023-04-19" ; 181 | schema1:dateModified "2023-07-05" ; 182 | schema1:description "Langchain-powered natural language interface to RDF knowledge-graphs." ; 183 | schema1:license ; 184 | schema1:programmingLanguage "Jupyter Notebook" . 185 | 186 | a schema1:Organization ; 187 | schema1:description "Cross-disciplinary community around research data, voluntary EPFL's researchers and staff with keen interest in research data." ; 188 | schema1:legalName "EPFL Data Champions" ; 189 | schema1:logo ; 190 | schema1:name "EPFL-Data-Champions" . 191 | 192 | a schema1:Organization ; 193 | schema1:description "An ETH Domain initiative for accelerating the adoption of data science" ; 194 | schema1:legalName "Swiss Data Science Center" ; 195 | schema1:logo ; 196 | schema1:name "SwissDataScienceCenter" . 197 | 198 | a schema1:Person ; 199 | schema1:affiliation , 200 | , 201 | , 202 | ; 203 | schema1:identifier "cmdoret" ; 204 | schema1:name "Cyril Matthey-Doret" . 205 | 206 | a schema1:Organization ; 207 | schema1:description "" ; 208 | schema1:legalName "Romain Koszul Laboratory" ; 209 | schema1:logo ; 210 | schema1:name "koszullab" . 211 | 212 | a schema1:Organization ; 213 | schema1:description "Open Research Data team at the Swiss Data Science Center." ; 214 | schema1:legalName "Swiss Data Science Center - ORD" ; 215 | schema1:logo ; 216 | schema1:name "SDSC-ORD" . 217 | 218 | 219 | 220 | a schema1:SoftwareSourceCode ; 221 | schema1:codeRepository ; 222 | schema1:author ; 223 | schema1:name "SDSC-ORD/zarr_linked_data" ; 224 | schema1:contributor ; 225 | schema1:dateCreated "2023-04-06" ; 226 | schema1:dateModified "2023-05-09" ; 227 | schema1:description "The project seeks to make a dataflow composed both of the Zarr data format and linked metadata." ; 228 | schema1:license ; 229 | schema1:programmingLanguage "Python" . 230 | 231 | a schema1:Person ; 232 | schema1:affiliation ; 233 | schema1:identifier "vancauwe" ; 234 | schema1:name "Laure Vancau" . 235 | 236 | a schema1:Organization ; 237 | schema1:description "Open Research Data team at the Swiss Data Science Center." ; 238 | schema1:legalName "Swiss Data Science Center - ORD" ; 239 | schema1:logo ; 240 | schema1:name "SDSC-ORD" . 241 | 242 | a schema1:SoftwareSourceCode ; 243 | schema1:codeRepository ; 244 | schema1:author ; 245 | schema1:name "SDSC-ORD/pxRRead" ; 246 | schema1:contributor ; 247 | schema1:dateCreated "2023-02-20" ; 248 | schema1:dateModified "2023-06-02" ; 249 | schema1:description "Read a px file of fso statistical data" ; 250 | schema1:keywords "parsing", 251 | "statistical-data" ; 252 | schema1:license ; 253 | schema1:programmingLanguage "R" . 254 | 255 | a schema1:Person ; 256 | schema1:affiliation ; 257 | schema1:identifier "sabinem" ; 258 | schema1:name "Sabine Maennel" . 259 | 260 | a schema1:Organization ; 261 | schema1:description "Open Research Data team at the Swiss Data Science Center." ; 262 | schema1:legalName "Swiss Data Science Center - ORD" ; 263 | schema1:logo ; 264 | schema1:name "SDSC-ORD" . 
265 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.9' 2 | 3 | networks: 4 | net: 5 | driver: bridge 6 | 7 | services: 8 | 9 | # chat-frontend: 10 | # image: ghcr.io/sdsc-ord/chatllm:latest 11 | # container_name: kg_llm_frontend 12 | # ports: 13 | # - 8000:8000 14 | # networks: 15 | # net: 16 | # profiles: 17 | # - frontend 18 | 19 | chat-server: 20 | build: 21 | context: . 22 | dockerfile: Dockerfile 23 | command: poetry run uvicorn aikg.server:app --reload --port 8001 --host 0.0.0.0 24 | container_name: kg_llm_server 25 | depends_on: 26 | - chroma-server 27 | ports: 28 | - ${SERVER_PORT:-8001}:8001 29 | networks: 30 | net: 31 | env_file: 32 | - .env 33 | 34 | chroma-server: 35 | image: ghcr.io/chroma-core/chroma:0.3.23 36 | volumes: 37 | - index_data:/index_data 38 | command: uvicorn chromadb.app:app --reload --workers 1 --host 0.0.0.0 --port 8000 --log-config log_config.yml 39 | container_name: kg_llm_chroma 40 | environment: 41 | - CHROMA_DB_IMPL=clickhouse 42 | - CLICKHOUSE_HOST=clickhouse 43 | - CLICKHOUSE_PORT=8123 44 | ports: 45 | - ${CHROMA_PORT:-8000}:8000 46 | depends_on: 47 | - clickhouse 48 | profiles: 49 | - db 50 | networks: 51 | net: 52 | 53 | 54 | clickhouse: 55 | image: clickhouse/clickhouse-server:22.9-alpine 56 | container_name: kg_llm_clickhouse 57 | environment: 58 | - ALLOW_EMPTY_PASSWORD=yes 59 | - CLICKHOUSE_TCP_PORT=8999 60 | - CLICKHOUSE_HTTP_PORT=8123 61 | ports: 62 | - '8123:8123' 63 | - '8999:8999' 64 | volumes: 65 | - clickhouse_data:/bitnami/clickhouse 66 | - backups:/backups 67 | - ./config/backup_disk.xml:/etc/clickhouse-server/config.d/backup_disk.xml 68 | - ./config/chroma_users.xml:/etc/clickhouse-server/users.d/chroma.xml 69 | profiles: 70 | - db 71 | networks: 72 | net: 73 | 74 | 75 | volumes: 76 | clickhouse_data: 77 | driver: local 78 | index_data: 79 | driver: local 80 | backups: 81 | driver: local 82 | -------------------------------------------------------------------------------- /k8s/README.md: -------------------------------------------------------------------------------- 1 | # Deploying Apps with Kubernetes 2 | This guide provides concise instructions on how to deploy and manage apps in your project using Kubernetes. 3 | 4 | ## Prerequisites 5 | Make sure you have kubectl installed and configured to interact with your Kubernetes cluster. 6 | 7 | Then create a `kg-llm` namespace, this is the default namespace used to deploy apps in this repository. 8 | ```sh 9 | kubectl create ns kg-llm 10 | ``` 11 | 12 | ## Deploy example config 13 | We provide an example config using a kustomize overlay, located under `overlays/custom-config`. 14 | This config downloads example data from the web and injects it into the graphdb and chroma services using init containers, providing a ready-to-use knowledge graph interface with pre-loaded data. 15 | 16 | To deploy it, simply run: 17 | 18 | ```sh 19 | kubectl apply -k overlays/custom-config 20 | ``` 21 | 22 | > [!TIP] 23 | > The easiest way to get started with custom data is to copy the provided config into a separate folder (e.g. `overlays/my-config`) and edit it. 
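For reference, a minimal custom overlay usually just points back at the base and overrides the parameters you care about. The sketch below is illustrative only (it assumes you want to override the chroma settings); the shipped `overlays/custom-config` may differ:

```yaml
# overlays/my-config/kustomization.yaml (illustrative sketch)
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - ../../base

configMapGenerator:
  - name: chroma-config
    namespace: kg-llm
    behavior: merge
    envs:
      - params.env

generatorOptions:
  disableNameSuffixHash: true
```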
24 | 25 | 26 | ## Deploy a Single App 27 | ```sh 28 | kubectl apply -k <path-to-app> 29 | ``` 30 | For example, to deploy only graphdb in this repository: 31 | 32 | ```sh 33 | kubectl apply -k ./base/graphdb 34 | ``` 35 | 36 | ## Deploying Multiple Apps 37 | To deploy all apps at once, execute the following command from the `k8s` folder: 38 | 39 | ```sh 40 | kubectl apply -k ./base 41 | ``` 42 | 43 | ## Removing an App or Apps 44 | To remove an app or multiple apps, execute the following command: 45 | 46 | ```sh 47 | kubectl delete -k <path-to-app> 48 | ``` 49 | -------------------------------------------------------------------------------- /k8s/base/chatllm/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: chatllm-server 5 | namespace: kg-llm 6 | spec: 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | app: chatllm 11 | template: 12 | metadata: 13 | labels: 14 | app: chatllm 15 | spec: 16 | containers: 17 | - name: chatllm-container 18 | image: daniilzhyrov/chatllm:latest 19 | ports: 20 | - containerPort: 0 21 | name: http 22 | -------------------------------------------------------------------------------- /k8s/base/chatllm/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - deployment.yaml 6 | - service.yaml 7 | 8 | configMapGenerator: 9 | - name: chatllm-config 10 | namespace: kg-llm 11 | envs: 12 | - params.env 13 | generatorOptions: 14 | disableNameSuffixHash: true 15 | 16 | replacements: 17 | - source: 18 | kind: ConfigMap 19 | name: chatllm-config 20 | fieldPath: data.TARGET_PORT 21 | targets: 22 | - select: 23 | kind: Deployment 24 | name: chatllm-server 25 | fieldPaths: 26 | - spec.template.spec.containers.[name=chatllm-container].ports.[name=http].containerPort 27 | - select: 28 | kind: Service 29 | name: chatllm-service 30 | fieldPaths: 31 | - spec.ports.[name=http].targetPort 32 | - source: 33 | kind: ConfigMap 34 | name: chatllm-config 35 | fieldPath: data.EXTERNAL_PORT 36 | targets: 37 | - select: 38 | kind: Service 39 | name: chatllm-service 40 | fieldPaths: 41 | - spec.ports.[name=http].port 42 | -------------------------------------------------------------------------------- /k8s/base/chatllm/params.env: -------------------------------------------------------------------------------- 1 | TARGET_PORT=8501 2 | EXTERNAL_PORT=8501 3 | -------------------------------------------------------------------------------- /k8s/base/chatllm/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: chatllm-service 5 | namespace: kg-llm 6 | spec: 7 | selector: 8 | app: chatllm 9 | ports: 10 | - protocol: TCP 11 | port: 0 12 | targetPort: 0 13 | name: http 14 | -------------------------------------------------------------------------------- /k8s/base/chroma/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: backup-disk-config 5 | namespace: kg-llm 6 | data: 7 | backup_disk.xml: | 8 | 9 | 10 | 11 | 12 | local 13 | /etc/clickhouse-server/ 14 | 15 | 16 | 17 | 18 | backups 19 | /etc/clickhouse-server/ 20 | 21 | 22 | --- 23 | apiVersion: v1 24 | kind: ConfigMap 25 | metadata: 26 | name: chroma-users-config 27 | namespace: kg-llm 28 | data: 29 | chroma.xml: |
30 | 31 | 32 | 33 | 1 34 | 1 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /k8s/base/chroma/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: chroma-server 5 | namespace: kg-llm 6 | spec: 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | app: chroma 11 | template: 12 | metadata: 13 | labels: 14 | app: chroma 15 | spec: 16 | restartPolicy: Always 17 | containers: 18 | - name: kg-llm-chroma 19 | image: ghcr.io/chroma-core/chroma:0.4.24 20 | command: 21 | - uvicorn 22 | args: ["chromadb.app:app", "--reload", "--workers", "1", "--host", "0.0.0.0", "--port", "$(CHROMA_PORT)", "--log-config", "chromadb/log_config.yml"] 23 | env: 24 | - name: IS_PERSISTENT 25 | value: "true" 26 | - name: CHROMA_PORT 27 | value: chroma_port_placeholder 28 | ports: 29 | - containerPort: 0 30 | name: chroma-port 31 | volumeMounts: 32 | - name: chroma-index-data 33 | mountPath: /index_data 34 | volumes: 35 | - name: chroma-index-data 36 | persistentVolumeClaim: 37 | claimName: chroma-index-data 38 | -------------------------------------------------------------------------------- /k8s/base/chroma/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - deployment.yaml 6 | - service.yaml 7 | - configmap.yaml 8 | - pvc.yaml 9 | 10 | configMapGenerator: 11 | - name: chroma-config 12 | namespace: kg-llm 13 | envs: 14 | - params.env 15 | 16 | generatorOptions: 17 | disableNameSuffixHash: true 18 | 19 | replacements: 20 | - source: 21 | kind: ConfigMap 22 | name: chroma-config 23 | fieldPath: data.CHROMA_PORT 24 | targets: 25 | - select: 26 | kind: Deployment 27 | name: chroma-server 28 | fieldPaths: 29 | - spec.template.spec.containers.[name=kg-llm-chroma].ports.[name=chroma-port].containerPort 30 | - spec.template.spec.containers.[name=kg-llm-chroma].env.[name=CHROMA_PORT].value 31 | - select: 32 | kind: Service 33 | name: chroma-service 34 | fieldPaths: 35 | - spec.ports.[name=http].targetPort 36 | - spec.ports.[name=http].port 37 | -------------------------------------------------------------------------------- /k8s/base/chroma/params.env: -------------------------------------------------------------------------------- 1 | CHROMA_PORT=8000 2 | CHROMA_SERVICE_NAME=chroma-service 3 | CHROMA_SERVICE_NAMESPACE=kg-llm -------------------------------------------------------------------------------- /k8s/base/chroma/pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: chroma-index-data 5 | namespace: kg-llm 6 | spec: 7 | accessModes: 8 | - ReadWriteOnce 9 | resources: 10 | requests: 11 | storage: 10Gi 12 | -------------------------------------------------------------------------------- /k8s/base/chroma/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: chroma-service 5 | namespace: kg-llm 6 | spec: 7 | type: ClusterIP 8 | selector: 9 | app: chroma 10 | ports: 11 | - protocol: TCP 12 | port: 8000 13 | targetPort: 8000 14 | name: http 15 | -------------------------------------------------------------------------------- /k8s/base/graphdb/configmap.yaml:
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: graphdb-repo-config 5 | namespace: kg-llm 6 | data: 7 | graphdb_config.ttl: | 8 | @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . 9 | @prefix rep: <http://www.openrdf.org/config/repository#> . 10 | @prefix sr: <http://www.openrdf.org/config/repository/sail#> . 11 | @prefix sail: <http://www.openrdf.org/config/sail#> . 12 | @prefix owlim: <http://www.ontotext.com/trree/owlim#> . 13 | 14 | [] a rep:Repository ; 15 | rep:repositoryID "test" ; 16 | rdfs:label "test" ; 17 | rep:repositoryImpl [ 18 | rep:repositoryType "graphdb:SailRepository" ; 19 | sr:sailImpl [ 20 | sail:sailType "graphdb:Sail" ; 21 | owlim:base-URL "http://www.ontotext.com/" ; 22 | # other configurations... 23 | ] 24 | ]. 25 | -------------------------------------------------------------------------------- /k8s/base/graphdb/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: graphdb 5 | namespace: kg-llm 6 | 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: graphdb 12 | template: 13 | metadata: 14 | labels: 15 | app: graphdb 16 | spec: 17 | containers: 18 | - name: graphdb 19 | image: ontotext/graphdb:10.2.0 20 | ports: 21 | - containerPort: 0 22 | name: graphdb 23 | volumeMounts: 24 | - name: graphdb-home 25 | mountPath: /opt/graphdb/home 26 | volumes: 27 | - name: graphdb-home 28 | persistentVolumeClaim: 29 | claimName: graphdb-home 30 | 31 | -------------------------------------------------------------------------------- /k8s/base/graphdb/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - deployment.yaml 6 | - service.yaml 7 | - pvc.yaml 8 | - configmap.yaml 9 | 10 | configMapGenerator: 11 | - name: graphdb-config 12 | namespace: kg-llm 13 | envs: 14 | - params.env 15 | 16 | generatorOptions: 17 | disableNameSuffixHash: true 18 | 19 | replacements: 20 | - source: 21 | kind: ConfigMap 22 | name: graphdb-config 23 | fieldPath: data.GRAPHDB_PORT 24 | targets: 25 | - select: 26 | kind: Deployment 27 | name: graphdb 28 | fieldPaths: 29 | - spec.template.spec.containers.[name=graphdb].ports.[name=graphdb].containerPort 30 | - select: 31 | kind: Service 32 | name: graphdb-service 33 | fieldPaths: 34 | - spec.ports.[name=http].targetPort 35 | - spec.ports.[name=http].port 36 | -------------------------------------------------------------------------------- /k8s/base/graphdb/params.env: -------------------------------------------------------------------------------- 1 | GRAPHDB_PORT=7200 2 | GRAPHDB_SERVICE_NAME=graphdb-service 3 | GRAPHDB_SERVICE_NAMESPACE=kg-llm 4 | 5 | -------------------------------------------------------------------------------- /k8s/base/graphdb/pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: graphdb-home 5 | namespace: kg-llm 6 | spec: 7 | accessModes: 8 | - ReadWriteOnce 9 | resources: 10 | requests: 11 | storage: 10Gi 12 | -------------------------------------------------------------------------------- /k8s/base/graphdb/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: graphdb-service 5 | namespace: kg-llm 6 | spec: 7 | type: ClusterIP 8 | selector: 9 | app: graphdb 10 | ports: 11 | - protocol: TCP 12 | port: 0 13 | targetPort: 0 14 | name: http 15 |
-------------------------------------------------------------------------------- /k8s/base/kg-llm/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: kg-llm 5 | namespace: kg-llm 6 | spec: 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | app: kg-llm 11 | template: 12 | metadata: 13 | labels: 14 | app: kg-llm 15 | spec: 16 | restartPolicy: Always 17 | containers: 18 | - name: kg-llm-container 19 | image: ghcr.io/sdsc-ordes/kg-llm-interface:latest 20 | env: 21 | - name: GRAPHDB_HOST 22 | value: "servicename.servicenamespace.svc.cluster.local" 23 | - name: GRAPHDB_PORT 24 | value: "0" 25 | - name: GRAPHDB_REPO 26 | value: "test" 27 | - name: SPARQL_ENDPOINT 28 | value: "http://$(GRAPHDB_HOST):$(GRAPHDB_PORT)/repositories/$(GRAPHDB_REPO)" 29 | - name: CHROMA_HOST 30 | value: "servicename.servicenamespace.svc.cluster.local" 31 | - name: CHROMA_PORT 32 | value: "0" 33 | - name: OPENAI_API_KEY 34 | value: "0" 35 | ports: 36 | - containerPort: 80 37 | initContainers: 38 | - name: graphdb-upload-container 39 | image: ghcr.io/sdsc-ordes/kg-llm-interface:latest 40 | command: ["/bin/sh", "-c"] 41 | args: 42 | - | 43 | cd /app; 44 | # setup graphdb 45 | curl -v -u admin:admin -X POST http://${GRAPHDB_HOST}:${GRAPHDB_PORT}/rest/repositories -H 'Content-Type: multipart/form-data' -F config=@/app/graphdb_config.ttl || true; 46 | export SPARQL_ENDPOINT="http://${GRAPHDB_HOST}:${GRAPHDB_PORT}/repositories/test"; 47 | # download and uncompress nq.gz instance data 48 | wget -O - ${DATA} | gzip -dc > sample.nq; 49 | # download nt ontology and inject named graph -> combine with instance as nq 50 | wget -O - ${ONTOLOGY} | sed 's|^\(.*\) \.|\1 .|' >> sample.nq ; 51 | poetry run python3 aikg/flows/insert_triples.py /app/sample.nq; 52 | volumeMounts: 53 | - name: graphdb-repo-config-volume 54 | mountPath: /app/graphdb_config.ttl 55 | subPath: graphdb_config.ttl 56 | env: 57 | - name: GRAPHDB_HOST 58 | value: graphdb_host_placeholder 59 | - name: GRAPHDB_PORT 60 | value: graphdb_port_placeholder 61 | envFrom: 62 | - configMapRef: 63 | name: kg-llm-config 64 | - name: chroma-upload-container 65 | image: ghcr.io/sdsc-ordes/kg-llm-interface:latest 66 | command: ["/bin/sh", "-c"] 67 | args: 68 | - | 69 | set -e; 70 | cd /app; 71 | wget -O ontology.nt ${ONTOLOGY}; 72 | echo "endpoint: ontology.nt" > ontology.yaml 73 | poetry run python3 aikg/flows/chroma_build.py --sparql-cfg-path ontology.yaml; 74 | env: 75 | - name: CHROMA_HOST 76 | value: chroma_host_placeholder 77 | - name: CHROMA_PORT 78 | value: chroma_port_placeholder 79 | envFrom: 80 | - configMapRef: 81 | name: kg-llm-config 82 | 83 | volumes: 84 | - name: graphdb-repo-config-volume 85 | configMap: 86 | name: graphdb-repo-config 87 | -------------------------------------------------------------------------------- /k8s/base/kg-llm/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - deployment.yaml 6 | - service.yaml 7 | - pvc.yaml 8 | 9 | configMapGenerator: 10 | - name: kg-llm-config 11 | namespace: kg-llm 12 | envs: 13 | - params.env 14 | 15 | generatorOptions: 16 | disableNameSuffixHash: true 17 | 18 | replacements: 19 | - source: 20 | kind: ConfigMap 21 | name: kg-llm-config 22 | fieldPath: data.GRAPHDB_SERVICE_NAME 23 | targets: 24 | - select: 25 | kind: Deployment 26 | name: kg-llm 27 | fieldPaths: 
28 | - spec.template.spec.containers.[name=kg-llm-container].env.[name=GRAPHDB_HOST].value 29 | - spec.template.spec.initContainers.[name=graphdb-upload-container].env.[name=GRAPHDB_HOST].value 30 | - source: 31 | kind: ConfigMap 32 | name: kg-llm-config 33 | fieldPath: data.GRAPHDB_PORT 34 | targets: 35 | - select: 36 | kind: Deployment 37 | name: kg-llm 38 | fieldPaths: 39 | - spec.template.spec.containers.[name=kg-llm-container].env.[name=GRAPHDB_PORT].value 40 | - spec.template.spec.initContainers.[name=graphdb-upload-container].env.[name=GRAPHDB_PORT].value 41 | - source: 42 | kind: ConfigMap 43 | name: kg-llm-config 44 | fieldPath: data.CHROMA_SERVICE_NAME 45 | targets: 46 | - select: 47 | kind: Deployment 48 | name: kg-llm 49 | fieldPaths: 50 | - spec.template.spec.containers.[name=kg-llm-container].env.[name=CHROMA_HOST].value 51 | - spec.template.spec.initContainers.[name=chroma-upload-container].env.[name=CHROMA_HOST].value 52 | - source: 53 | kind: ConfigMap 54 | name: kg-llm-config 55 | fieldPath: data.CHROMA_PORT 56 | targets: 57 | - select: 58 | kind: Deployment 59 | name: kg-llm 60 | fieldPaths: 61 | - spec.template.spec.containers.[name=kg-llm-container].env.[name=CHROMA_PORT].value 62 | - spec.template.spec.initContainers.[name=chroma-upload-container].env.[name=CHROMA_PORT].value 63 | - source: 64 | kind: ConfigMap 65 | name: kg-llm-config 66 | fieldPath: data.OPENAI_API_KEY 67 | targets: 68 | - select: 69 | kind: Deployment 70 | name: kg-llm 71 | fieldPaths: 72 | - spec.template.spec.containers.[name=kg-llm-container].env.[name=OPENAI_API_KEY].value 73 | -------------------------------------------------------------------------------- /k8s/base/kg-llm/params.env: -------------------------------------------------------------------------------- 1 | ONTOLOGY=https://www.pokemonkg.org/ontology/ontology.nt 2 | DATA=https://www.pokemonkg.org/download/dump/poke-a.nq.gz 3 | 4 | #figure out how to remove this: can we throw out the entire CHROMA part in the kustomization file ? 
5 | CHROMA_PORT=8000 6 | CHROMA_SERVICE_NAME=chroma-service 7 | GRAPHDB_PORT=7200 8 | GRAPHDB_SERVICE_NAME=graphdb-service 9 | GRAPHDB_REPO="test" 10 | 11 | OPENAI_API_KEY=sk-...xxxxx 12 | -------------------------------------------------------------------------------- /k8s/base/kg-llm/pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: graphdb-config-pvc 5 | namespace: kg-llm 6 | spec: 7 | accessModes: 8 | - ReadWriteOnce 9 | resources: 10 | requests: 11 | storage: 1Gi 12 | -------------------------------------------------------------------------------- /k8s/base/kg-llm/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: kg-llm-service 5 | namespace: kg-llm 6 | spec: 7 | type: NodePort 8 | selector: 9 | app: kg-llm 10 | ports: 11 | - protocol: TCP 12 | port: 80 13 | targetPort: 80 14 | -------------------------------------------------------------------------------- /k8s/base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - chatllm 6 | - kg-llm 7 | - chroma 8 | - graphdb 9 | -------------------------------------------------------------------------------- /k8s/overlays/custom-config/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | resources: 5 | - ../../base 6 | 7 | configMapGenerator: 8 | - name: custom-config 9 | namespace: kg-llm 10 | envs: 11 | - params.env 12 | 13 | generatorOptions: 14 | disableNameSuffixHash: true 15 | -------------------------------------------------------------------------------- /k8s/overlays/custom-config/params.env: -------------------------------------------------------------------------------- 1 | CHROMA_SERVICE_NAME=chroma-service 2 | CHROMA_SERVICE_NAMESPACE=kg-llm 3 | CHROMA_PORT=8000 4 | GRAPHDB_SERVICE_NAME=graphdb-service 5 | GRAPHDB_SERVICE_NAMESPACE=kg-llm 6 | GRAPHDB_PORT=7200 7 | TRIPLES_PATH=https://link-to-dump.com/data.ttl.gz 8 | ONTOLOGY_PATH=https://github.com/user/repo/blob/master/ontology.ttl.gz 9 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | authors = [ 3 | {name = "cyril.matthey-doret", email = "cyril.matthey-doret@epfl.ch"}, 4 | ] 5 | license = {text = "MIT"} 6 | requires-python = "<4.0,>=3.10" 7 | dependencies = [ 8 | "tqdm<5.0.0,>=4.65.0", 9 | "sentence-transformers<3.0.0,>=2.2.2", 10 | "python-dotenv<2.0.0,>=1.0.0", 11 | "ipykernel<7.0.0,>=6.22.0", 12 | "rdflib==6.3.0", 13 | "chromadb<1.0.0,>=0.4.22", 14 | "more-itertools<10.0.0,>=9.1.0", 15 | "prefect<3.0.0,>=2.10.6", 16 | "typer<1.0.0,>=0.9.0", 17 | "bokeh==2.4.3", 18 | "sparqlwrapper<3.0.0,>=2.0.0", 19 | "fastapi<1.0.0,>=0.95.1", 20 | "uvicorn<1.0.0,>=0.22.0", 21 | "typing-extensions<5.0.0,>=4.6.3", 22 | "protobuf==3.20", 23 | "jupyterlab<5.0.0,>=4.0.2", 24 | "langchain<1.0.0,>=0.0.230", 25 | "openai<1.0.0,>=0.27.8", 26 | "poethepoet<1.0.0,>=0.21.0", 27 | "html5lib<2.0,>=1.1", 28 | "anyio==3.7.1", 29 | "testcontainers<4.0.0,>=3.7.1", 30 | "torch==2.6.0+cpu", 31 | "torchvision==0.21.0+cpu", 32 | ] 33 | name = "aikg" 34 | version = "0.1.0" 35 | 
description = "Langchain-powered natural language interface to RDF knowledge-graphs" 36 | readme = "README.md" 37 | 38 | [dependency-groups] 39 | local = [ 40 | "transformers<5.0.0,>=4.28.1", 41 | ] 42 | dev = [ 43 | "black<24.0.0,>=23.3.0", 44 | "pytest<8.0.0,>=7.3.1", 45 | "pre-commit<4.0.0,>=3.2.2", 46 | ] 47 | 48 | [tool.uv.sources] 49 | torch = [ 50 | { index = "pytorch-cpu" }, 51 | ] 52 | torchvision = [ 53 | { index = "pytorch-cpu" }, 54 | ] 55 | 56 | [[tool.uv.index]] 57 | name = "pytorch-cpu" 58 | url = "https://download.pytorch.org/whl/cpu" 59 | explicit = true 60 | -------------------------------------------------------------------------------- /scripts/standalone_server.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script starts the server with a local Chroma instance and using 3 | # an RDF file as knowledge graph. 4 | 5 | # Local SPARQL+Chroma configs 6 | export SPARQL_ENDPOINT="data/test_data.trig" 7 | export CHROMA_HOST="local" 8 | export CHROMA_MODEL="all-MiniLM-L6-v2" 9 | export CHROMA_PERSIST_DIR="/tmp/chroma-test" 10 | export CHAT_CONFIG="tests/chat.test.yml" 11 | 12 | # Embed in Chroma 13 | python aikg/flows/chroma_build.py --graph https://example.org/ontology 14 | # Run server 15 | uvicorn "aikg.server:app" --port 8001 16 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdsc-ordes/kg-llm-interface/dd5f967d2dfbd2fb718450f54aaa4b69635c2642/tests/__init__.py -------------------------------------------------------------------------------- /tests/chat.test.yml: -------------------------------------------------------------------------------- 1 | 2 | model_id: lmsys/vicuna-7b-v1.3 3 | answer_template: | 4 | The following describe a user question, associated SPARQL query and the result from the query. 5 | Based on this information, write an answer in simple terms that describes the results. 6 | 7 | Question: 8 | {question_str} 9 | Query: 10 | {query_str} 11 | Result: 12 | {result_str} 13 | Answer: 14 | sparql_template: | 15 | 16 | Use the question and the additional information to generate a sparql query against 17 | a knowledge graph where the p and q items are 18 | 19 | completely unknown to you. You will need to discover the p and q items before you 20 | can generate the sparql. 21 | 22 | Do not assume you know the p and q items for any concepts. 23 | 24 | After you generate the sparql, you should display it. 25 | 26 | When generating sparql, Never enclose the sparql in back-quotes 27 | 28 | Use the following format: 29 | 30 | Question: the input question for which you must provide a natural language answer 31 | 32 | Information: the additional information you get with the query, in RDF format. This 33 | will help you generate the sparql query with the correct format. 
34 | 35 | 36 | Question: {question_str} 37 | 38 | Information: 39 | 40 | {context_str} 41 | 42 | Answer: 43 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | from pathlib import Path 3 | import pytest 4 | import shutil 5 | import tempfile 6 | 7 | from aikg.utils.io import download_file 8 | 9 | TEST_SCHEMA_URL = "https://www.pokemonkg.org/ontology/ontology.nt" 10 | TEST_INSTANCES_URL = "https://www.pokemonkg.org/download/dump/poke-a.nq.gz" 11 | 12 | 13 | @pytest.fixture(scope="module") 14 | def instance_file() -> Path: 15 | """Download and gunzip remote instance test file.""" 16 | gz_path = tempfile.NamedTemporaryFile(suffix=".nq.gz", delete=False).name 17 | download_file(TEST_INSTANCES_URL, gz_path) 18 | path = gz_path.removesuffix(".gz") 19 | # gunzip downloaded file 20 | with gzip.open(gz_path, "rb") as f_in: 21 | with open(path, "wb") as f_out: 22 | shutil.copyfileobj(f_in, f_out) 23 | 24 | return Path(path) 25 | 26 | 27 | @pytest.fixture(scope="module") 28 | def schema_file() -> Path: 29 | """Download remote schema test file.""" 30 | path = tempfile.NamedTemporaryFile(suffix=".nt", delete=False).name 31 | download_file(TEST_SCHEMA_URL, path) 32 | return Path(path) 33 | 34 | 35 | @pytest.fixture(scope="module") 36 | def small_instance_file(instance_file) -> Path: 37 | """Create a small instance file for testing, truncated to the first 1000 lines.""" 38 | path = tempfile.NamedTemporaryFile(suffix=".nq", delete=False).name 39 | 40 | with open(instance_file) as f, open(path, "w") as f_out: 41 | for i, line in enumerate(f): 42 | if i > 1000: 43 | break 44 | f_out.write(line) 45 | return Path(path) 46 | -------------------------------------------------------------------------------- /tests/test_load_data.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | 4 | from testcontainers.core.container import DockerContainer 5 | from aikg.config import ChromaConfig, SparqlConfig 6 | from aikg.flows.chroma_build import chroma_build_flow 7 | from aikg.flows.insert_triples import sparql_insert_flow 8 | 9 | REPO_CONFIG = """ 10 | @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . 11 | @prefix rep: <http://www.openrdf.org/config/repository#> . 12 | @prefix sr: <http://www.openrdf.org/config/repository/sail#> . 13 | @prefix sail: <http://www.openrdf.org/config/sail#> . 14 | @prefix owlim: <http://www.ontotext.com/trree/owlim#> . 15 | 16 | [] a rep:Repository ; 17 | rep:repositoryID "test" ; 18 | rdfs:label "test" ; 19 | rep:repositoryImpl [ 20 | rep:repositoryType "graphdb:SailRepository" ; 21 | sr:sailImpl [ 22 | sail:sailType "graphdb:Sail" ; 23 | owlim:base-URL "http://www.ontotext.com/" ; 24 | # other configurations... 25 | ] 26 | ]. 27 | """ 28 | 29 | 30 | def test_init_data(schema_file, small_instance_file): 31 | with ( 32 | DockerContainer("ontotext/graphdb:10.2.2").with_bind_ports(7200, 7200) 33 | ) as graphdb: 34 | # container ready + margin for graphdb to start 35 | graphdb.get_exposed_port(7200) 36 | time.sleep(5) 37 | # Create test repo 38 | resp = requests.post( 39 | "http://localhost:7200/rest/repositories", files={"config": REPO_CONFIG} 40 | ) 41 | sparql_insert_flow(schema_file, SparqlConfig()) 42 | sparql_insert_flow(small_instance_file, SparqlConfig()) 43 | chroma_build_flow(ChromaConfig(host="local")) 44 | -------------------------------------------------------------------------------- /tests/test_rdf.py: -------------------------------------------------------------------------------- 1 | # Test RDF functionality to interact with a knowledge graph.
2 | # The kg may be a SPARQL endpoint or a local RDF file. 3 | from aikg.config import SparqlConfig 4 | from aikg.utils.rdf import query_kg, setup_kg 5 | import pytest 6 | 7 | rdflib_config = SparqlConfig( 8 | endpoint="data/test_data.trig", 9 | ) 10 | sparql_config = SparqlConfig( 11 | endpoint="https://sparql.uniprot.org/", 12 | ) 13 | 14 | CONFIGS = [rdflib_config, sparql_config] 15 | QUERIES = [ 16 | "SELECT * WHERE { ?s ?p ?o } LIMIT 10", 17 | "DESCRIBE ?s WHERE { ?s ?p ?o } LIMIT 10", 18 | ] 19 | 20 | 21 | @pytest.fixture 22 | def sparql_kg(): 23 | """A public SPARQL endpoint.""" 24 | return setup_kg(sparql_config.endpoint, sparql_config.user, sparql_config.password) 25 | 26 | 27 | @pytest.fixture 28 | def rdflib_kg(): 29 | """A local RDF file.""" 30 | return setup_kg(rdflib_config.endpoint, rdflib_config.user, rdflib_config.password) 31 | 32 | 33 | @pytest.mark.parametrize("kg", ["sparql_kg", "rdflib_kg"]) 34 | @pytest.mark.parametrize("query", QUERIES) 35 | def test_run_query_kg(kg, query, request): 36 | """Test if a query on a kg returns at least one result.""" 37 | res = query_kg(request.getfixturevalue(kg), query) 38 | assert len(res) >= 1 39 | 40 | 41 | @pytest.mark.parametrize("query", QUERIES) 42 | def test_compare_query_kg(sparql_kg, rdflib_kg, query): 43 | """Test if the same query on rdflib and sparql yields 44 | the same output dimensions.""" 45 | rdflib_res = query_kg(rdflib_kg, query) 46 | sparql_res = query_kg(sparql_kg, query) 47 | assert len(sparql_res) == len(rdflib_res) 48 | assert all([len(x) == len(y) for x, y in zip(sparql_res, rdflib_res)]) 49 | --------------------------------------------------------------------------------