├── .env.example
├── .github
│   └── workflows
│       └── python-version-compatibility.yml
├── .gitignore
├── .pre-commit-config.yaml
├── Dockerfile
├── LICENSE
├── Makefile
├── NOTICE
├── README.md
├── aikg
│   ├── config
│   │   ├── __init__.py
│   │   ├── chat.py
│   │   ├── chroma.py
│   │   ├── common.py
│   │   └── sparql.py
│   ├── flows
│   │   ├── chroma_build.py
│   │   ├── chroma_examples.py
│   │   └── insert_triples.py
│   ├── models.py
│   ├── notebooks
│   │   ├── nl_sparql.ipynb
│   │   └── sphn_example.ipynb
│   ├── server.py
│   └── utils
│       ├── chat.py
│       ├── chroma.py
│       ├── io.py
│       ├── llm.py
│       └── rdf.py
├── data
│   ├── models
│   │   └── .gitkeep
│   └── test_data.trig
├── docker-compose.yml
├── k8s
│   ├── README.md
│   ├── base
│   │   ├── chatllm
│   │   │   ├── deployment.yaml
│   │   │   ├── kustomization.yaml
│   │   │   ├── params.env
│   │   │   └── service.yaml
│   │   ├── chroma
│   │   │   ├── configmap.yaml
│   │   │   ├── deployment.yaml
│   │   │   ├── kustomization.yaml
│   │   │   ├── params.env
│   │   │   ├── pvc.yaml
│   │   │   └── service.yaml
│   │   ├── graphdb
│   │   │   ├── configmap.yaml
│   │   │   ├── deployment.yaml
│   │   │   ├── kustomization.yaml
│   │   │   ├── params.env
│   │   │   ├── pvc.yaml
│   │   │   └── service.yaml
│   │   ├── kg-llm
│   │   │   ├── deployment.yaml
│   │   │   ├── kustomization.yaml
│   │   │   ├── params.env
│   │   │   ├── pvc.yaml
│   │   │   └── service.yaml
│   │   └── kustomization.yaml
│   └── overlays
│       └── custom-config
│           ├── kustomization.yaml
│           └── params.env
├── pyproject.toml
├── scripts
│   └── standalone_server.sh
├── tests
│   ├── __init__.py
│   ├── chat.test.yml
│   ├── conftest.py
│   ├── test_load_data.py
│   └── test_rdf.py
└── uv.lock
/.env.example:
--------------------------------------------------------------------------------
1 | COMPOSE_PROJECT_NAME=kg_llm
2 | CONDA_CHANNELS=conda-forge
3 | SERVER_PORT=8001
4 | CHROMA_HOST=kg_llm_chroma
5 | CHROMA_PORT=8000
6 | CHROMA_COLLECTION="test"
7 | SPARQL_USER="admin"
8 | SPARQL_PASSWORD="admin"
9 | SPARQL_ENDPOINT="http://localhost:7200/repositories/test"
10 |
--------------------------------------------------------------------------------
/.github/workflows/python-version-compatibility.yml:
--------------------------------------------------------------------------------
1 | name: python-version-compatibility
2 |
3 | on:
4 | pull_request:
5 | paths:
6 | - 'pyproject.toml'
7 |
8 | jobs:
9 | build:
10 | strategy:
11 | fail-fast: true
12 | matrix:
13 | os: [ "ubuntu-latest" ]
14 | python-version: [ "3.10", "3.11", "3.12", "3.13" ]
15 | runs-on: ${{ matrix.os }}
16 | steps:
17 | - name: Check out repository
18 | uses: actions/checkout@v4
19 |
20 | - name: Set up python ${{ matrix.python-version }}
21 | id: setup-python
22 | uses: actions/setup-python@v5
23 | with:
24 | python-version: ${{ matrix.python-version }}
25 |
26 | - name: Install uv
27 | uses: astral-sh/setup-uv@v5
28 |
29 | - name: Install the project
30 | run: uv sync --all-extras --dev
31 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # project
2 | .idea/
3 | .vscode/
4 | .metaflow/
5 | .env
6 | data/
7 | # Byte-compiled / optimized / DLL files
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 |
12 | # C extensions
13 | *.so
14 |
15 | # Distribution / packaging
16 | .Python
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | wheels/
29 | pip-wheel-metadata/
30 | share/python-wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 | MANIFEST
35 |
36 | # PyInstaller
37 | # Usually these files are written by a python script from a template
38 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
39 | *.manifest
40 | *.spec
41 |
42 | # Installer logs
43 | pip-log.txt
44 | pip-delete-this-directory.txt
45 |
46 | # Unit test / coverage reports
47 | htmlcov/
48 | .tox/
49 | .nox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | coverage.xml
55 | *.cover
56 | *.py,cover
57 | .hypothesis/
58 | .pytest_cache/
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 | db.sqlite3
68 | db.sqlite3-journal
69 |
70 | # Flask stuff:
71 | instance/
72 | .webassets-cache
73 |
74 | # Scrapy stuff:
75 | .scrapy
76 |
77 | # Sphinx documentation
78 | docs/_build/
79 |
80 | # PyBuilder
81 | target/
82 |
83 | # Jupyter Notebook
84 | .ipynb_checkpoints
85 |
86 | # IPython
87 | profile_default/
88 | ipython_config.py
89 |
90 | # pyenv
91 | .python-version
92 |
93 | # pipenv
94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
97 | # install all needed dependencies.
98 | #Pipfile.lock
99 |
100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
101 | __pypackages__/
102 |
103 | # Celery stuff
104 | celerybeat-schedule
105 | celerybeat.pid
106 |
107 | # SageMath parsed files
108 | *.sage.py
109 |
110 | # Environments
111 | .env
112 | .venv
113 | env/
114 | venv/
115 | ENV/
116 | env.bak/
117 | venv.bak/
118 |
119 | # Spyder project settings
120 | .spyderproject
121 | .spyproject
122 |
123 | # Rope project settings
124 | .ropeproject
125 |
126 | # mkdocs documentation
127 | /site
128 |
129 | # mypy
130 | .mypy_cache/
131 | .dmypy.json
132 | dmypy.json
133 |
134 | # Pyre type checker
135 | .pyre/
136 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v2.3.0
4 | hooks:
5 | - id: check-yaml
6 | - id: end-of-file-fixer
7 | - id: trailing-whitespace
8 | - repo: https://github.com/psf/black
9 | rev: 22.10.0
10 | hooks:
11 | - id: black
12 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Dockerfile for the fastAPI chatbot server
2 | # It uses Poetry to set up the environment
3 | FROM nvidia/cuda:11.7.1-base-ubuntu22.04
4 |
5 | # Set the working directory
6 | WORKDIR /app
7 |
8 | # Install system dependencies
9 | ENV DEBIAN_FRONTEND=noninteractive
10 | RUN apt-get update && apt-get -y install \
11 | git \
12 | python3-dev \
13 | g++-11 \
14 | build-essential \
15 | curl wget tzdata
16 |
17 | # Install poetry
18 | ENV POETRY_HOME="/opt/poetry" \
19 | POETRY_NO_INTERACTION=1 \
20 | POETRY_VERSION=1.5.0
21 | ENV PATH="$PATH:$POETRY_HOME/bin"
22 | RUN curl -sSL https://install.python-poetry.org | python3 -
23 |
24 | # Copy the source code into docker image
25 | COPY . /app
26 |
27 | # Install project and dependencies
28 | RUN rm -f poetry.lock && make install
29 | RUN poetry run python -m ipykernel install --user --name aikg
30 |
31 | # Run the server
32 | ENTRYPOINT ["/bin/bash", "-c", "poetry run uvicorn aikg.server:app --host 0.0.0.0 --port 80"]
33 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | IMAGE := 'ghcr.io/sdsc-ordes/kg-llm-interface:latest'
2 |
3 | .PHONY: install
4 | install: ## Install dependencies with uv and add pre-commit hooks
5 | @echo "🔨 Installing packages with uv"
6 | @uv sync
7 | @uv run pre-commit install
8 |
9 | .PHONY: check
10 | check: ## Run code quality tools.
11 | 	@echo "🕵️ Checking uv lock file consistency with 'pyproject.toml': Running uv lock --check"
12 | @uv lock --check
13 | @echo "🕵️ Linting code: Running pre-commit"
14 | @uv run pre-commit run -a
15 |
16 | .PHONY: test
17 | test: ## Test the code with pytest
18 | @echo "🧪 Testing code: Running pytest"
19 | @uv run pytest
20 |
21 | .PHONY: server
22 | server:
23 | @echo "🖥️ Running server"
24 | @uv run uvicorn --reload aikg.server:app --port 8001
25 |
26 | .PHONY: deploy
27 | deploy:
28 | @echo "🚀 Deploying all the services"
29 | 	@kubectl apply -k k8s/overlays/custom-config
30 |
31 | .PHONY: notebook
32 | notebook: docker-build ## Start a jupyter notebook server in a docker container
33 | @echo "🗒️ Starting a containerized notebook server"
34 | @docker run -p 8888:8888 --rm -it --entrypoint 'poetry' $(IMAGE) \
35 | run jupyter lab --allow-root --port 8888 --ip "0.0.0.0"
36 |
37 | docker-build: Dockerfile
38 | @echo "🐳 Building docker image"
39 | @docker build -t $(IMAGE) .
40 |
41 | .PHONY: help
42 | help:
43 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
44 |
45 | .DEFAULT_GOAL := help
46 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | kg-llm-interface
2 | Copyright 2023 - Swiss Data Science Center (SDSC)
3 | A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | Eidgenössische Technische Hochschule Zürich (ETHZ).
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | Langchain-powered natural language interface to RDF knowledge-graphs.
3 |
4 | ## Installation
5 |
6 | This repository uses `uv` for package management. A Makefile rule is provided to install the dependencies:
7 |
8 | ```bash
9 | make install
10 | ```
11 |
12 | ## Configuration
13 |
14 | Configuration variables are loaded from the `.env` file or environment variables. A template configuration file is provided in `.env.example`.
15 |
16 | The chat configuration (`config.chat.ChatConfig`) uses OpenAI by default; however, you can run this tool with open-source LLMs using a framework such as llamafile, OpenLLM or localGPT. When doing so, simply provide your LLM server URL using `openai_api_base` and the model name using `model`.
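For example, a `.env` pointing the chat configuration at a locally hosted OpenAI-compatible server might look like this (host, port and model name are illustrative):

```bash
OPENAI_API_BASE=http://localhost:8080/v1/
OPENAI_API_KEY=none  # many local servers do not check the key
OPENAI_MODEL=mistral-7b-instruct
```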
17 |
18 |
19 | ## Quickstart
20 |
21 | You can read and run the [example notebook](aikg/notebooks/nl_sparql.ipynb) to get a quick overview of the system.
22 | The notebook supports using the OpenAI API and can run locally on a laptop.
23 |
24 | To run the notebook in a containerized environment, run:
25 |
26 | `make notebook`
27 |
28 | ## Server
29 |
30 | The server can be deployed as a standalone service using the script `scripts/standalone_server.sh`. It will start a uvicorn server on port 8001, run ChromaDB in client-only mode and use an RDF file as the knowledge graph. This should work for small datasets.
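Assuming the `.env` file has been configured, the standalone server can be started from the repository root:

```bash
bash scripts/standalone_server.sh
```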
31 |
32 |
33 | ## Pipelines
34 |
35 | Pipelines execute one-time operations that prepare data before the chat server can operate. They also load their configuration from the `.env` file, but the variables can be overridden using YAML files (run any flow with `--help` for more info, as shown below).
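For example:

```bash
# show the options accepted by a pipeline
python aikg/flows/insert_triples.py --help
```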
36 |
37 | ### Insert triples
38 |
39 | ```mermaid
40 | flowchart LR
41 | RDF[RDF file] -->|insert_triples.py| SPARQL(SPARQL endpoint)
42 | ```
43 |
44 | Insert data from an input RDF file into a SPARQL endpoint. The input file must be in a line-based RDF format (N-Triples or N-Quads), since triples are read and inserted in chunks of lines.
45 |
46 | Location: [insert_triples.py](aikg/flows/insert_triples.py):
47 |
48 | SPARQL configuration can be overridden by providing a YAML file following the [aikg.config.sparql.SparqlConfig](aikg/config/sparql.py) schema:
49 |
50 | `python aikg/flows/insert_triples.py --sparql-cfg-path sparql.yaml`
51 |
52 | ```yaml
53 | # sparql.yaml
54 | endpoint: http://localhost:3030/ds/query
55 | user: admin
56 | password: admin
57 | ```
58 |
59 | CLI usage: `python aikg/flows/insert_triples.py`
60 |
61 | ### Chroma build
62 |
63 | ```mermaid
64 | flowchart LR
65 | SPARQL(SPARQL endpoint) -->|chroma_build.py| CHROMA(ChromaDB)
66 | ```
67 |
68 | Build the ChromaDB index from a SPARQL endpoint.
69 |
70 | Location: [chroma_build.py](aikg/flows/chroma_build.py):
71 |
72 | CLI usage: `python aikg/flows/chroma_build.py`
73 |
74 | Chroma and SPARQL configurations can be overridden by providing a YAML file following the [aikg.config.chroma.ChromaConfig](aikg/config/chroma.py) or [aikg.config.sparql.SparqlConfig](aikg/config/sparql.py) schemas, respectively.
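For example, a minimal Chroma override could look like this (values are illustrative; field names follow `ChromaConfig`):

```yaml
# chroma.yaml
host: localhost
port: 8000
collection_name: schema
embedding_model: all-mpnet-base-v2
batch_size: 50
```

`python aikg/flows/chroma_build.py --chroma-cfg-path chroma.yaml`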
75 |
76 |
77 | ## Containerized service
78 |
79 | :warning: WIP, not functional yet
80 |
81 | The chat server can be deployed along with the front-end, SPARQL endpoint and ChromaDB server using the Kubernetes manifests in `k8s/`.
82 |
83 | ```mermaid
84 | sequenceDiagram
85 | Front-end->>+Chat server: question
86 | Chat server->>+ChromaDB: question
87 | ChromaDB -->ChromaDB: embed
88 | ChromaDB-->>-Chat server: ontology triples
89 | Chat server-->Chat server: generate query
90 | Chat server-->>+SPARQL endpoint: query
91 | SPARQL endpoint-->SPARQL endpoint: run query
92 | SPARQL endpoint-->>-Chat server: result
93 | Chat server-->>-Front-end: answer
94 | ```
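Once the manifests are finalized, a deployment would be applied with kustomize, e.g. (assuming a configured `kubectl` context):

```bash
kubectl apply -k k8s/overlays/custom-config
```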
95 |
96 | ## Contributing
97 |
98 | All contributions are welcome. New functions and classes should have associated docstrings following the [numpy style guide](https://numpydoc.readthedocs.io/en/latest/format.html).
99 |
100 | The code formatting standard we use is [black](https://github.com/psf/black), with `--line-length=79` to follow [PEP8](https://peps.python.org/pep-0008/) recommendations. We use [pytest](https://docs.pytest.org/en/7.2.x/) as our testing framework. This project uses [pyproject.toml](https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/) to define package information, requirements and tooling configuration.
101 |
102 | Tests can be executed with `make test`. Tests use [testcontainers](https://testcontainers.com) to temporarily deploy the required services.
103 |
--------------------------------------------------------------------------------
/aikg/config/__init__.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from aikg.config.chat import ChatConfig
19 | from aikg.config.chroma import ChromaConfig
20 | from aikg.config.sparql import SparqlConfig
21 |
--------------------------------------------------------------------------------
/aikg/config/chat.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | import os
19 | from pydantic import BaseModel
20 |
21 |
22 | class ChatConfig(BaseModel):
23 | """Chatbot configuration.
24 |
25 | Attributes:
26 |         openai_api_base: Base URL of the OpenAI-compatible API used for text generation.
27 |         openai_api_key: API key used to authenticate against that API.
28 |         model: Name of the model to use for text generation.
29 |         answer_template: Prompt template used to turn query results into a natural language answer. It should contain the variables: result_str, question_str.
30 |         sparql_template: Prompt template used to generate the SPARQL query. It should contain
31 |             the variables: examples_str, question_str, context_str.
32 | """
33 |
34 | openai_api_base: str = os.environ.get(
35 | "OPENAI_API_BASE", "https://api.openai.com/v1/"
36 | )
37 | openai_api_key: str = os.environ.get("OPENAI_API_KEY", "")
38 | model: str = os.environ.get("OPENAI_MODEL", "gpt-4o")
39 | answer_template: str = """
40 | We have provided the contextual facts below.
41 | -----------------
42 | {result_str}
43 | -----------------
44 | Answer the question using only the context and no
45 | prior knowledge. If the context does not contain any fact related to
46 | the question, simply answer the words 'Not found'. The answer should be
47 |     maximum 2 sentences directly reflecting the relevant facts while ignoring
48 | irrelevant ones.
49 | Question: {question_str}
50 | Answer:
51 | """
52 |
53 | sparql_template: str = """
54 | Use the question and the additional information to generate a sparql query against a knowledge graph where the p and q items are
55 | completely unknown to you. You will need to discover the p and q items before you can generate the sparql.
56 | Do not assume you know the p and q items for any concepts.
57 | After you generate the sparql, you should display it.
58 |
59 | When generating sparql:
60 | * Never enclose the sparql in back-quotes
61 | * Do not include any human text, only the query and nothing else
62 |
63 | {examples_str}
64 |
65 | Use the following format:
66 |
67 | Question: the input question for which you must provide a natural language answer
68 | Information: the additional information you get with the query, in RDF format. This will help you generate the sparql query with the correct format.
69 |
70 | Question: {question_str}
71 | Information:
72 | {context_str}
73 | Answer:
74 | """
75 |
--------------------------------------------------------------------------------
/aikg/config/chroma.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | import os
19 | from pydantic import BaseModel
20 |
21 |
22 | class ChromaConfig(BaseModel):
23 | """
24 | Attributes:
25 | host:
26 | The host of the ChromaDB server. If set to "local", chroma will run in client-only mode.
27 | port:
28 | The port of the ChromaDB server.
29 | collection_name:
30 | The name of the ChromaDB collection to store the index in.
31 | collection_examples:
32 | The name of the ChromaDB collection to store examples in.
33 |         embedding_model:
34 | The HuggingFace ID of the embedding model to use.
35 | batch_size:
36 | The number of documents to vectorize and store in each batch.
37 | persist_directory:
38 | If set to client-only mode, local path where the db is saved.
39 | """
40 |
41 | host: str = os.environ.get("CHROMA_HOST", "127.0.0.1")
42 | port: int = int(os.environ.get("CHROMA_PORT", "8000"))
43 | collection_name: str = os.environ.get("CHROMA_COLLECTION", "schema")
44 | collection_examples: str = os.environ.get("CHROMA_EXAMPLES", "examples")
45 | batch_size: int = int(os.environ.get("CHROMA_BATCH_SIZE", "50"))
46 | embedding_model: str = os.environ.get("CHROMA_MODEL", "all-mpnet-base-v2")
47 | persist_directory: str = os.environ.get("CHROMA_PERSIST_DIR", ".chroma/")
48 |
--------------------------------------------------------------------------------
/aikg/config/common.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from pathlib import Path
19 | from typing import Type, TypeVar
20 |
21 | from pydantic import BaseModel
22 | import yaml
23 |
24 | Config = TypeVar("Config", bound=BaseModel)
25 |
26 |
27 | def parse_yaml_config(config_path: Path, config_class: Type[Config]) -> Config:
28 | """Parse a YAML config file into a pydantic model.
29 |
30 | Args:
31 | config_path: Path to YAML config file.
32 | config_class: The pydantic model to parse the config into.
33 |
34 | Returns:
35 | The parsed config.
36 | """
37 | # Load dict from YAML file
38 | config_dict = yaml.safe_load(config_path.read_text())
39 | return config_class.parse_obj(config_dict)
40 |
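# Example usage (illustrative):
#
#     from pathlib import Path
#     from aikg.config import SparqlConfig
#     from aikg.config.common import parse_yaml_config
#
#     cfg = parse_yaml_config(Path("sparql.yaml"), SparqlConfig)
#     print(cfg.endpoint)  # value from sparql.yaml, or the field default if omitted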
--------------------------------------------------------------------------------
/aikg/config/sparql.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | import os
19 | from pydantic import BaseModel
20 |
21 |
22 | class SparqlConfig(BaseModel):
23 | """
24 | Attributes:
25 | endpoint: The SPARQL endpoint to connect to. Can also be a local path to an RDF file.
26 |         user: The username to use for authentication.
27 |         password: The password to use for authentication.
29 | """
30 |
31 | endpoint: str = os.environ.get(
32 | "SPARQL_ENDPOINT", "http://localhost:7200/repositories/test"
33 | )
34 |
35 | user: str = os.environ.get("SPARQL_USER", "admin")
36 | password: str = os.environ.get("SPARQL_PASSWORD", "admin")
37 |
--------------------------------------------------------------------------------
/aikg/flows/chroma_build.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | """This flow builds a ChromaDB vector index from RDF data in a SPARQL endpoint.
19 |
20 | For each subject in the target graph, a document is generated. The document consists of:
21 | * A human-readable body made up of the annotations (rdfs:comment, rdfs:label) associated with the subject.
22 | * Triples with the subject attached as metadata.
23 |
24 | The documents are then stored in a vector database. The embedding is computed using the document body,
25 | and triples included as metadata. The index is persisted to disk and can be subsequently loaded into memory
26 | for querying."""
27 |
28 | from pathlib import Path
29 | from typing import Optional, Tuple
30 | from typing_extensions import Annotated
31 | import uuid
32 |
33 | from chromadb.api import ClientAPI, Collection
34 | from dotenv import load_dotenv
35 | from langchain.schema import Document
36 | from more_itertools import chunked
37 | from prefect import flow, task
38 | from prefect import get_run_logger
39 | from rdflib import ConjunctiveGraph, Graph
40 | from SPARQLWrapper import SPARQLWrapper
41 | import typer
42 |
43 | from aikg.config import ChromaConfig, SparqlConfig
44 | from aikg.config.common import parse_yaml_config
45 | import aikg.utils.rdf as akrdf
46 | import aikg.utils.chroma as akchroma
47 |
48 |
49 | @task
50 | def init_chromadb(
51 | host: str,
52 | port: int,
53 | collection_name: str,
54 | embedding_model: str,
55 | persist_directory: str,
56 | ) -> Tuple[ClientAPI, Collection]:
57 | """Prepare chromadb client."""
58 | client = akchroma.setup_client(host, port, persist_directory=persist_directory)
59 | coll = akchroma.setup_collection(client, collection_name, embedding_model)
60 |
61 | return client, coll
62 |
63 |
64 | @task
65 | def sparql_to_documents(
66 | kg: Graph | SPARQLWrapper, graph: Optional[str] = None
67 | ) -> list[Document]:
68 | return list(akrdf.get_subjects_docs(kg, graph=graph))
69 |
70 |
71 | @task
72 | def index_batch(batch: list[Document]):
73 |     """Send a batch of documents for indexing in the vector store."""
74 | coll.add(
75 | ids=[str(uuid.uuid4()) for _ in batch],
76 | documents=[doc.page_content for doc in batch],
77 | metadatas=[doc.metadata for doc in batch],
78 | )
79 |
80 |
81 | @flow
82 | def chroma_build_flow(
83 | chroma_cfg: ChromaConfig = ChromaConfig(),
84 | sparql_cfg: SparqlConfig = SparqlConfig(),
85 | graph: Optional[str] = None,
86 | ):
87 | """Build a ChromaDB vector index from RDF data in a SPARQL endpoint.
88 |
89 | Parameters
90 | ----------
91 | chroma_cfg:
92 | ChromaDB configuration.
93 | sparql_cfg:
94 | SPARQL endpoint configuration.
95 | graph:
96 | URI of named graph from which to select subjects to embed.
97 | By default, all subjects are used.
98 | """
99 | load_dotenv()
100 | logger = get_run_logger()
101 | logger.info("INFO Started")
102 | # Connect to external resources
103 | global coll
104 | client, coll = init_chromadb(
105 | chroma_cfg.host,
106 | chroma_cfg.port,
107 | chroma_cfg.collection_name,
108 | chroma_cfg.embedding_model,
109 | chroma_cfg.persist_directory,
110 | )
111 | kg = akrdf.setup_kg(
112 | sparql_cfg.endpoint,
113 | user=sparql_cfg.user,
114 | password=sparql_cfg.password,
115 | )
116 |
117 | # Create subject documents
118 | docs = sparql_to_documents(
119 | kg,
120 | graph=graph,
121 | )
122 |
123 | # Vectorize and index documents by batches to reduce overhead
124 | logger.info(f"Indexing by batches of {chroma_cfg.batch_size} items")
125 | embed_counter = 0
126 | for batch in chunked(docs, chroma_cfg.batch_size):
127 | embed_counter += len(batch)
128 | index_batch(batch)
129 | logger.info(f"Indexed {embed_counter} items.")
130 |
131 |
132 | def cli(
133 | chroma_cfg_path: Annotated[
134 | Optional[Path],
135 | typer.Option(help="YAML file with Chroma client configuration."),
136 | ] = None,
137 | sparql_cfg_path: Annotated[
138 | Optional[Path],
139 | typer.Option(help="YAML file with SPARQL endpoint configuration."),
140 | ] = None,
141 | graph: Annotated[
142 | Optional[str],
143 | typer.Option(
144 | help="URI of named graph from which to select triples to embed. If not set, the default graph is used.",
145 | ),
146 | ] = None,
147 | ):
148 | """Command line wrapper for RDF to ChromaDB index flow."""
149 | chroma_cfg = (
150 | parse_yaml_config(chroma_cfg_path, ChromaConfig)
151 | if chroma_cfg_path
152 | else ChromaConfig()
153 | )
154 | sparql_cfg = (
155 | parse_yaml_config(sparql_cfg_path, SparqlConfig)
156 | if sparql_cfg_path
157 | else SparqlConfig()
158 | )
159 | chroma_build_flow(chroma_cfg, sparql_cfg, graph=graph)
160 |
161 |
162 | if __name__ == "__main__":
163 | typer.run(cli)
164 |
--------------------------------------------------------------------------------
/aikg/flows/chroma_examples.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | """This flow builds a ChromaDB vector index from examples consisting of pairs of questions and SPARQL queries.
19 |
20 | For each example pair, a document is generated. The document consists of:
21 | * A human readable question (document body)
22 | * A corresponding SPARQL query (document metadata)
23 |
24 | The documents are then stored in a vector database. The embedding is computed using the document body (questions),
25 | and SPARQL queries included as metadata. The index is persisted to disk and can be subsequently loaded into memory
26 | for querying."""
27 |
28 | from pathlib import Path
29 | from typing import Optional, Tuple
30 | from typing_extensions import Annotated
31 | import uuid
32 | import os
33 |
34 | from chromadb.api import ClientAPI, Collection
35 | from dotenv import load_dotenv
36 | from langchain.schema import Document
37 | from more_itertools import chunked
38 | from prefect import flow, task
39 | from prefect import get_run_logger
40 | import typer
41 |
42 | from aikg.config import ChromaConfig
43 | from aikg.config.common import parse_yaml_config
44 | import aikg.utils.io as akio
45 | import aikg.utils.chroma as akchroma
46 |
47 |
48 | @task
49 | def init_chromadb(
50 | host: str,
51 | port: int,
52 | collection_name: str,
53 | embedding_model: str,
54 | persist_directory: str,
55 | ) -> Tuple[ClientAPI, Collection]:
56 | """Prepare chromadb client."""
57 | client = akchroma.setup_client(host, port, persist_directory=persist_directory)
58 | coll = akchroma.setup_collection(client, collection_name, embedding_model)
59 |
60 | return client, coll
61 |
62 |
63 | @task
64 | def index_batch(batch: list[Document]):
65 |     """Send a batch of documents for indexing in the vector store."""
66 | coll.add(
67 | ids=[str(uuid.uuid4()) for _ in batch],
68 | documents=[doc.page_content for doc in batch],
69 | metadatas=[doc.metadata for doc in batch],
70 | )
71 |
72 |
73 | @task
74 | def get_sparql_examples(dir: Path) -> list[Document]:
75 | # find files
76 | files = []
77 | for file_name in os.listdir(dir):
78 | files.append(os.path.join(dir, file_name))
79 | # provide each file as text stream to be parsed
80 | return [akio.parse_sparql_example(open(ex)) for ex in files]
81 |
82 |
83 | @flow
84 | def chroma_build_examples_flow(
85 | chroma_input_dir: Path,
86 | chroma_cfg: ChromaConfig = ChromaConfig(),
87 | ):
88 | """Build a ChromaDB vector index from examples.
89 |
90 | Parameters
91 | ----------
92 | chroma_input_dir:
93 | Directory containing files with example question-query pairs. The files should be in sparql format, with the first line being the question as a comment.
94 | chroma_cfg:
95 | ChromaDB configuration.
96 | """
97 | load_dotenv()
98 | logger = get_run_logger()
99 | logger.info("INFO Started")
100 | # Connect to external resources
101 | global coll
102 | client, coll = init_chromadb(
103 | chroma_cfg.host,
104 | chroma_cfg.port,
105 | chroma_cfg.collection_examples,
106 | chroma_cfg.embedding_model,
107 | chroma_cfg.persist_directory,
108 | )
109 |
110 |     # Create example documents
111 | docs = get_sparql_examples(
112 | dir=chroma_input_dir,
113 | )
114 |
115 | # Vectorize and index documents by batches to reduce overhead
116 | logger.info(f"Indexing by batches of {chroma_cfg.batch_size} items")
117 | embed_counter = 0
118 | for batch in chunked(docs, chroma_cfg.batch_size):
119 | embed_counter += len(batch)
120 |         index_batch(batch)
121 | logger.info(f"Indexed {embed_counter} items.")
122 |
123 |
124 | def cli(
125 | chroma_input_dir: Annotated[
126 | Path,
127 | typer.Argument(
128 | help="Path to directory with example SPARQL queries",
129 | exists=True,
130 | file_okay=False,
131 | dir_okay=True,
132 | ),
133 | ],
134 | chroma_cfg_path: Annotated[
135 | Optional[Path],
136 | typer.Option(default=None, help="YAML file with Chroma client configuration."),
137 | ] = None,
138 | ):
139 | """Command line wrapper for SPARQL examples to ChromaDB index flow."""
140 | chroma_cfg = (
141 | parse_yaml_config(chroma_cfg_path, ChromaConfig)
142 | if chroma_cfg_path
143 | else ChromaConfig()
144 | )
145 | chroma_build_examples_flow(chroma_input_dir, chroma_cfg)
146 |
147 |
148 | if __name__ == "__main__":
149 | typer.run(cli)
150 |
--------------------------------------------------------------------------------
/aikg/flows/insert_triples.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | """This flow populates a SPARQL endpoint from RDF data in a file."""
19 | import os
20 | from pathlib import Path
21 | from typing import Optional
22 | from typing_extensions import Annotated
23 |
24 | from dotenv import load_dotenv
25 | from prefect import flow, get_run_logger, task
26 | from SPARQLWrapper import SPARQLWrapper
27 | import typer
28 |
29 | from aikg.config.common import parse_yaml_config
30 | from aikg.config import SparqlConfig
31 |
32 |
33 | @task
34 | def setup_sparql_endpoint(
35 | endpoint: str, user: Optional[str] = None, password: Optional[str] = None
36 | ) -> SPARQLWrapper:
37 | """Connect to SPARQL endpoint and setup credentials.
38 |
39 | Parameters
40 | ----------
41 | endpoint:
42 | URL of the SPARQL endpoint.
43 | user:
44 | Username to use for authentication.
45 | password:
46 | Password to use for authentication.
47 | """
48 | # Setup sparql endpoint
49 | sparql = SPARQLWrapper(endpoint, updateEndpoint=endpoint + "/statements")
50 | if user and password:
51 | sparql.setCredentials(user, password)
52 | return sparql
53 |
54 |
55 | @task
56 | def insert_triples(
57 | rdf_file: Path,
58 | endpoint: SPARQLWrapper,
59 | graph: Optional[str] = None,
60 | chunk_size: int = 1000,
61 | ):
62 | """Insert triples from source file into SPARQL endpoint.
63 |
64 | Parameters
65 | ----------
66 | rdf_file:
67 | Path to RDF file to load into the SPARQL endpoint.
68 | endpoint:
69 | SPARQL endpoint to load RDF data into.
70 | graph:
71 | URI of named graph to load RDF data into.
72 | If set to None, the default graph is used.
73 | chunk_size:
74 | Number of triples per insert operation.
75 | """
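    # Shape of the generated update query (illustrative; the GRAPH block is only
    # added when a named graph is provided):
    #
    #   PREFIX ex: <http://example.org/>
    #   INSERT DATA {
    #       GRAPH <http://example.org/graph> {
    #           ex:s ex:p ex:o .
    #       }
    #   }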
76 | from rdflib import Dataset
77 | from rdflib.util import guess_format
78 |
79 | format = guess_format(str(rdf_file))
80 | if format not in ["nt", "nquads"]:
81 | raise ValueError("Unsupported RDF format, must be ntriples or nquads.")
82 |
83 | cur = 0
84 | tot = os.path.getsize(rdf_file)
85 | with open(rdf_file, "r", encoding="utf-8") as source:
86 | # Run INSERT DATA queries by chunks of triples
87 | while True:
88 | data = "".join([source.readline() for _ in range(chunk_size)])
89 | if data == "":
90 | break
91 |
92 | ds = Dataset()
93 | ds.parse(data=data, format=format)
94 |
95 | query = "\n".join(
96 | [f"PREFIX {prefix}: {ns.n3()}" for prefix, ns in ds.namespaces()]
97 | )
98 | query += f"\nINSERT DATA {{"
99 | if graph:
100 | query += f"\n\tGRAPH <{graph}> {{"
101 | query += " .\n".join(
102 | [f"\t\t{s.n3()} {p.n3()} {o.n3()}" for (s, p, o, _) in ds.quads()]
103 | )
104 | if graph:
105 | query += f"\n\t}}"
106 | query += f" . \n\n}}\n"
107 | endpoint.setQuery(query)
108 | endpoint.queryType = "INSERT"
109 | endpoint.method = "POST"
110 | endpoint.setReturnFormat("json")
111 | endpoint.query()
112 | cur += len(data.encode("utf-8"))
113 | print(f"inserted triples: {round(100 * cur / tot, 2)}%")
114 |
115 |
116 | @flow
117 | def sparql_insert_flow(
118 | rdf_file: Path,
119 | sparql_cfg: SparqlConfig = SparqlConfig(),
120 | graph: Optional[str] = None,
121 | ):
122 | """Workflow to connect to a SPARQL endpoint and send insert
123 | queries to load triples from a local file.
124 |
125 | Parameters
126 | ----------
127 | rdf_file:
128 | Path to source RDF file.
129 | sparql_cfg:
130 | Configuration for the target SPARQL endpoint.
131 | """
132 | load_dotenv()
133 | logger = get_run_logger()
134 | sparql = setup_sparql_endpoint(
135 | sparql_cfg.endpoint, sparql_cfg.user, sparql_cfg.password
136 | )
137 | logger.info("SPARQL endpoint connected")
138 | insert_triples(rdf_file, sparql, graph)
139 | logger.info("all triples inserted")
140 |
141 |
142 | def cli(
143 | rdf_file: Annotated[
144 | Path,
145 | typer.Argument(
146 |             help="RDF file to load into the SPARQL endpoint, in n-triples or n-quads format.",
147 | exists=True,
148 | file_okay=True,
149 | dir_okay=False,
150 | ),
151 | ],
152 | sparql_cfg_path: Annotated[
153 | Optional[Path],
154 | typer.Option(help="YAML file with SPARQL endpoint configuration."),
155 | ] = None,
156 | graph: Annotated[
157 | Optional[str],
158 | typer.Option(
159 | help="URI of named graph to load RDF data into. If not set, the default graph is used.",
160 | ),
161 | ] = None,
162 | ):
163 | """Command line wrapper to insert triples to a SPARQL endpoint."""
164 | sparql_cfg = (
165 | parse_yaml_config(sparql_cfg_path, SparqlConfig)
166 | if sparql_cfg_path
167 | else SparqlConfig()
168 | )
169 | sparql_insert_flow(rdf_file, sparql_cfg, graph)
170 |
171 |
172 | if __name__ == "__main__":
173 | typer.run(cli)
174 |
--------------------------------------------------------------------------------
/aikg/models.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from datetime import datetime, timedelta
19 | from pydantic import BaseModel
20 | import uuid
21 |
22 |
23 | class Message(BaseModel):
24 | text: str
25 | time: datetime
26 | sender: str
27 | triples: str | None = None
28 |
29 |
30 | class Conversation(BaseModel):
31 | """A conversation, represented as a list of messages
32 | and a unique identifier (uid)."""
33 |
34 | thread: list[Message]
35 | uid: str | None = str(uuid.uuid4())
36 |
37 | @property
38 | def start_time(self) -> datetime | None:
39 | try:
40 | return self.thread[0].time
41 | except IndexError:
42 | return None
43 |
44 | @property
45 | def end_time(self) -> datetime | None:
46 | try:
47 |             return self.thread[-1].time
48 | except IndexError:
49 | return None
50 |
51 | @property
52 | def duration(self) -> timedelta | None:
53 | if self.start_time is None or self.end_time is None:
54 | return None
55 | return self.end_time - self.start_time
56 |
57 | @property
58 | def actors(self) -> list[str]:
59 | return list(set([m.sender for m in self.thread]))
60 |
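# Example usage (illustrative):
#
#     from datetime import datetime
#
#     msg = Message(text="Hello", time=datetime.now(), sender="user")
#     conv = Conversation(thread=[msg])
#     conv.actors    # ["user"]
#     conv.duration  # timedelta(0) for a single-message thread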
--------------------------------------------------------------------------------
/aikg/notebooks/nl_sparql.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "## Question to SPARQL query generation\n",
9 | "\n",
10 | "In this notebook, we generate a SPARQL query from an input plain english question and execute it against a knowledge graph.\n",
11 | "\n",
12 |     "Below are the two prompts we will use for the language model. First, the `SPARQL_TEMPLATE` is used to construct a SPARQL query from an input question and context. Then, the output will be executed against the knowledge graph and the `ANSWER_TEMPLATE` will be used to generate a human-readable answer describing the results."
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 1,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "\n",
22 | "SPARQL_TEMPLATE = \"\"\"\n",
23 | "Generate a SPARQL query to answer the input question. A sample of the knowledge graph schema is provided to help construct the query.\n",
24 | "After you generate the sparql, you should display it.\n",
25 | "When generating sparql:\n",
26 | "* never enclose the sparql in back-quotes.\n",
27 | "* always include the prefix declarations.\n",
28 | "* prefer using OPTIONAL when selecting multiple variables.\n",
29 | "* Allow case-insensitive matching of strings.\n",
30 | "\n",
31 | "Use the following format:\n",
32 | "\n",
33 | "Question: the input question for which you must generate a SPARQL query\n",
34 | "Information: the schema information in RDF format. This will help you generate the sparql query with the correct format.\n",
35 | "\n",
36 | "Question: {question_str}\n",
37 | "Information:\n",
38 | "{context_str}\n",
39 | "Answer:\n",
40 | "\"\"\"\n",
41 | "\n",
42 | "ANSWER_TEMPLATE = \"\"\"\n",
43 | "The following describe a user question, associated SPARQL query and the result from executing the query.\n",
44 | "Based on this information, write an answer in simple terms that describes the results.\n",
45 | "When appropriate, use markdown formatting to format the results into a table or bullet points.\n",
46 | "\n",
47 | "Question:\n",
48 | "{question_str}\n",
49 | "Query:\n",
50 | "{query_str}\n",
51 | "Result:\n",
52 | "{result_str}\n",
53 | "Answer:\n",
54 | "\"\"\""
55 | ]
56 | },
57 | {
58 | "attachments": {},
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 |     "We set up a minimal configuration, with the vector database (Chroma) running in client-only mode and a small RDF file acting as the knowledge graph. This file contains both the instance data and the ontology. The ontology is enclosed in a named graph inside the file.\n",
63 | "\n",
64 |     "For the sake of the demo, we use a small model for embeddings (MiniLM-L6-v2) and rely on the OpenAI API for text generation. A local model can be used instead, but it will require high RAM and ideally a GPU."
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 19,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "from aikg.config import ChatConfig, ChromaConfig, SparqlConfig\n",
74 | "\n",
75 | "chroma_config = ChromaConfig(\n",
76 | " host=\"local\",\n",
77 | " port=8000,\n",
78 | " collection_name=\"test\",\n",
79 | " embedding_model=\"all-MiniLM-L6-v2\",\n",
80 | ")\n",
81 | "sparql_config = SparqlConfig(\n",
82 | " endpoint=\"../data/test_data.trig\",\n",
83 | ")\n",
84 | "chat_config = ChatConfig(\n",
85 | " answer_template=ANSWER_TEMPLATE,\n",
86 | " sparql_template=SPARQL_TEMPLATE\n",
87 | ")\n"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 25,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "import os\n",
97 | "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 20,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stderr",
107 | "output_type": "stream",
108 | "text": [
109 | "/home/stefan/kg-llm-interface/.conda/lib/python3.11/site-packages/langchain/llms/openai.py:173: UserWarning: You are trying to use a chat model. This way of initializing it is no longer supported. Instead, please use: `from langchain.chat_models import ChatOpenAI`\n",
110 | " warnings.warn(\n",
111 | "/home/stefan/kg-llm-interface/.conda/lib/python3.11/site-packages/langchain/llms/openai.py:753: UserWarning: You are trying to use a chat model. This way of initializing it is no longer supported. Instead, please use: `from langchain.chat_models import ChatOpenAI`\n",
112 | " warnings.warn(\n"
113 | ]
114 | }
115 | ],
116 | "source": [
117 | "\n",
118 | "from aikg.utils.llm import setup_llm_chain\n",
119 | "from aikg.utils.rdf import setup_kg\n",
120 | "\n",
121 | "# Use OpenAI API\n",
122 | "from langchain.llms import OpenAI\n",
123 | "llm = OpenAI(model_name=\"gpt-3.5-turbo-0125\")\n",
124 | "\n",
125 | "# For now, both chains share the same model to spare memory\n",
126 | "answer_chain = setup_llm_chain(llm, chat_config.answer_template)\n",
127 | "sparql_chain = setup_llm_chain(llm, chat_config.sparql_template)\n",
128 | "kg = setup_kg(**sparql_config.dict())"
129 | ]
130 | },
131 | {
132 | "attachments": {},
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "First, we need to embed the ontology into the vector database. This will allow us to retrieve semantically similar concepts from the ontology based on the question.\n",
137 | "\n",
138 | "In the example rdf file, the ontology is enclosed in a named graph calles `http://example.org/ontology`. "
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 14,
144 | "metadata": {},
145 | "outputs": [
146 | {
147 | "data": {
148 | "text/html": [
149 | "
14:17:06.998 | INFO | prefect.engine - Created flow run 'electric-terrier' for flow 'chroma-build-flow'\n",
150 | "
\n"
151 | ],
152 | "text/plain": [
153 | "14:17:06.998 | \u001b[36mINFO\u001b[0m | prefect.engine - Created flow run\u001b[35m 'electric-terrier'\u001b[0m for flow\u001b[1;35m 'chroma-build-flow'\u001b[0m\n"
154 | ]
155 | },
156 | "metadata": {},
157 | "output_type": "display_data"
158 | },
159 | {
160 | "data": {
161 | "text/html": [
162 | "14:17:07.070 | INFO | Flow run 'electric-terrier' - INFO Started\n",
163 | "
\n"
164 | ],
165 | "text/plain": [
166 | "14:17:07.070 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - \u001b[36mINFO\u001b[0m Started\n"
167 | ]
168 | },
169 | "metadata": {},
170 | "output_type": "display_data"
171 | },
172 | {
173 | "data": {
174 | "text/html": [
175 | "14:17:07.141 | INFO | Flow run 'electric-terrier' - Created task run 'init_chromadb-0' for task 'init_chromadb'\n",
176 | "
\n"
177 | ],
178 | "text/plain": [
179 | "14:17:07.141 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Created task run 'init_chromadb-0' for task 'init_chromadb'\n"
180 | ]
181 | },
182 | "metadata": {},
183 | "output_type": "display_data"
184 | },
185 | {
186 | "data": {
187 | "text/html": [
188 | "14:17:07.145 | INFO | Flow run 'electric-terrier' - Executing 'init_chromadb-0' immediately...\n",
189 | "
\n"
190 | ],
191 | "text/plain": [
192 | "14:17:07.145 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Executing 'init_chromadb-0' immediately...\n"
193 | ]
194 | },
195 | "metadata": {},
196 | "output_type": "display_data"
197 | },
198 | {
199 | "name": "stderr",
200 | "output_type": "stream",
201 | "text": [
202 | "/home/stefan/kg-llm-interface/.conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
203 | " from .autonotebook import tqdm as notebook_tqdm\n"
204 | ]
205 | },
206 | {
207 | "data": {
208 | "text/html": [
209 | "14:17:13.949 | INFO | Task run 'init_chromadb-0' - Finished in state Completed()\n",
210 | "
\n"
211 | ],
212 | "text/plain": [
213 | "14:17:13.949 | \u001b[36mINFO\u001b[0m | Task run 'init_chromadb-0' - Finished in state \u001b[32mCompleted\u001b[0m()\n"
214 | ]
215 | },
216 | "metadata": {},
217 | "output_type": "display_data"
218 | },
219 | {
220 | "data": {
221 | "text/html": [
222 | "14:17:14.028 | INFO | Flow run 'electric-terrier' - Created task run 'sparql_to_documents-0' for task 'sparql_to_documents'\n",
223 | "
\n"
224 | ],
225 | "text/plain": [
226 | "14:17:14.028 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Created task run 'sparql_to_documents-0' for task 'sparql_to_documents'\n"
227 | ]
228 | },
229 | "metadata": {},
230 | "output_type": "display_data"
231 | },
232 | {
233 | "data": {
234 | "text/html": [
235 | "14:17:14.033 | INFO | Flow run 'electric-terrier' - Executing 'sparql_to_documents-0' immediately...\n",
236 | "
\n"
237 | ],
238 | "text/plain": [
239 | "14:17:14.033 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Executing 'sparql_to_documents-0' immediately...\n"
240 | ]
241 | },
242 | "metadata": {},
243 | "output_type": "display_data"
244 | },
245 | {
246 | "data": {
247 | "text/html": [
248 | "14:17:14.575 | INFO | Task run 'sparql_to_documents-0' - Finished in state Completed()\n",
249 | "
\n"
250 | ],
251 | "text/plain": [
252 | "14:17:14.575 | \u001b[36mINFO\u001b[0m | Task run 'sparql_to_documents-0' - Finished in state \u001b[32mCompleted\u001b[0m()\n"
253 | ]
254 | },
255 | "metadata": {},
256 | "output_type": "display_data"
257 | },
258 | {
259 | "data": {
260 | "text/html": [
261 | "14:17:14.580 | INFO | Flow run 'electric-terrier' - Indexing by batches of 50 items\n",
262 | "
\n"
263 | ],
264 | "text/plain": [
265 | "14:17:14.580 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Indexing by batches of 50 items\n"
266 | ]
267 | },
268 | "metadata": {},
269 | "output_type": "display_data"
270 | },
271 | {
272 | "data": {
273 | "text/html": [
274 | "14:17:14.646 | INFO | Flow run 'electric-terrier' - Created task run 'index_batch-0' for task 'index_batch'\n",
275 | "
\n"
276 | ],
277 | "text/plain": [
278 | "14:17:14.646 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Created task run 'index_batch-0' for task 'index_batch'\n"
279 | ]
280 | },
281 | "metadata": {},
282 | "output_type": "display_data"
283 | },
284 | {
285 | "data": {
286 | "text/html": [
287 | "14:17:14.649 | INFO | Flow run 'electric-terrier' - Executing 'index_batch-0' immediately...\n",
288 | "
\n"
289 | ],
290 | "text/plain": [
291 | "14:17:14.649 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Executing 'index_batch-0' immediately...\n"
292 | ]
293 | },
294 | "metadata": {},
295 | "output_type": "display_data"
296 | },
297 | {
298 | "data": {
299 | "text/html": [
300 | "14:17:17.929 | INFO | Task run 'index_batch-0' - Finished in state Completed()\n",
301 | "
\n"
302 | ],
303 | "text/plain": [
304 | "14:17:17.929 | \u001b[36mINFO\u001b[0m | Task run 'index_batch-0' - Finished in state \u001b[32mCompleted\u001b[0m()\n"
305 | ]
306 | },
307 | "metadata": {},
308 | "output_type": "display_data"
309 | },
310 | {
311 | "data": {
312 | "text/html": [
313 | "14:17:17.936 | INFO | Flow run 'electric-terrier' - Indexed 13 items.\n",
314 | "
\n"
315 | ],
316 | "text/plain": [
317 | "14:17:17.936 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Indexed 13 items.\n"
318 | ]
319 | },
320 | "metadata": {},
321 | "output_type": "display_data"
322 | },
323 | {
324 | "data": {
325 | "text/html": [
326 | "14:17:17.998 | INFO | Flow run 'electric-terrier' - Finished in state Completed('All states completed.')\n",
327 | "
\n"
328 | ],
329 | "text/plain": [
330 | "14:17:17.998 | \u001b[36mINFO\u001b[0m | Flow run\u001b[35m 'electric-terrier'\u001b[0m - Finished in state \u001b[32mCompleted\u001b[0m('All states completed.')\n"
331 | ]
332 | },
333 | "metadata": {},
334 | "output_type": "display_data"
335 | },
336 | {
337 | "data": {
338 | "text/plain": [
339 | "[Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `tuple`')),\n",
340 | " Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `list`')),\n",
341 | " Completed(message=None, type=COMPLETED, result=UnpersistedResult(type='unpersisted', artifact_type='result', artifact_description='Unpersisted result of type `NoneType`'))]"
342 | ]
343 | },
344 | "execution_count": 14,
345 | "metadata": {},
346 | "output_type": "execute_result"
347 | }
348 | ],
349 | "source": [
350 | "from aikg.flows.chroma_build import chroma_build_flow\n",
351 | "chroma_build_flow(chroma_config, sparql_config, graph=\"https://example.org/ontology\")"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 21,
357 | "metadata": {},
358 | "outputs": [],
359 | "source": [
360 | "\n",
361 | "from aikg.utils.chroma import setup_client, setup_collection\n",
362 | "client = setup_client(\n",
363 | " chroma_config.host,\n",
364 | " chroma_config.port,\n",
365 | " chroma_config.persist_directory,\n",
366 | ")\n",
367 | "collection = setup_collection(\n",
368 | " client,\n",
369 | " chroma_config.collection_name,\n",
370 | " chroma_config.embedding_model,\n",
371 | ")\n"
372 | ]
373 | },
374 | {
375 | "attachments": {},
376 | "cell_type": "markdown",
377 | "metadata": {},
378 | "source": [
379 | "The Chroma collection now contains the ontology concepts as vectors. We can retrieve the most similar concepts to a given question.\n",
380 | "Notice that the property \"programmingLanguage\" is retrieved, even though the question does not contain the word \"programming\"."
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": 22,
386 | "metadata": {},
387 | "outputs": [
388 | {
389 | "name": "stdout",
390 | "output_type": "stream",
391 | "text": [
392 | " \"programming language\" .\n",
393 | " .\n",
394 | " \"The computer programming language.\" .\n",
395 | " .\n",
396 | " .\n",
397 | "\n",
398 | " \"Computer programming source code. Example: Full (compile ready) solutions, code snippet samples, scripts, templates.\" .\n",
399 | " .\n",
400 | " \"SoftwareSourceCode\" .\n",
401 | " .\n",
402 | "\n",
403 | " .\n",
404 | " .\n",
405 | " .\n",
406 | " \"Link to the repository where the un-compiled, human readable code and related code is located (SVN, GitHub, CodePlex).\" .\n",
407 | " \"codeRepository\" .\n",
408 | "\n",
409 | " \"license\" .\n",
410 | " .\n",
411 | " .\n",
412 | " .\n",
413 | " \"A license document that applies to this content, typically indicated by URL.\" .\n",
414 | " .\n",
415 | "\n",
416 | " \"name\" .\n",
417 | " .\n",
418 | " .\n",
419 | " \"The name of the item.\" .\n",
420 | " .\n",
421 | " .\n",
422 | " .\n",
423 | "\n"
424 | ]
425 | }
426 | ],
427 | "source": [
428 | "QUESTION = \"What softwares are written in Python?\"\n",
429 | "results = collection.query(query_texts=QUESTION, n_results=5)\n",
430 | "print('\\n'.join([res.get(\"triples\", \"\") for res in results['metadatas'][0]]))\n"
431 | ]
432 | },
433 | {
434 | "attachments": {},
435 | "cell_type": "markdown",
436 | "metadata": {},
437 | "source": [
438 | "Then, we can generate the SPARQL query."
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "execution_count": 23,
444 | "metadata": {},
445 | "outputs": [
446 | {
447 | "name": "stdout",
448 | "output_type": "stream",
449 | "text": [
450 | "PREFIX rdf: \n",
451 | "PREFIX rdfs: \n",
452 | "PREFIX xsd: \n",
453 | "\n",
454 | "SELECT DISTINCT ?softwareName\n",
455 | "WHERE {\n",
456 | " ?software rdf:type .\n",
457 | " ?software ?language .\n",
458 | " FILTER regex(str(?language), \"python\", \"i\") .\n",
459 | " ?software ?softwareName .\n",
460 | "}\n"
461 | ]
462 | }
463 | ],
464 | "source": [
465 | "from aikg.utils.chat import generate_sparql\n",
466 | "query = generate_sparql(QUESTION, collection, sparql_chain)\n",
467 | "print(query)"
468 | ]
469 | },
470 | {
471 | "attachments": {},
472 | "cell_type": "markdown",
473 | "metadata": {},
474 | "source": [
475 | "and execute it:"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": 24,
481 | "metadata": {},
482 | "outputs": [
483 | {
484 | "name": "stdout",
485 | "output_type": "stream",
486 | "text": [
487 | "[['softwareName'], ['SDSC-ORD/gimie'], ['SDSC-ORD/zarr_linked_data']]\n"
488 | ]
489 | }
490 | ],
491 | "source": [
492 | "from aikg.utils.rdf import query_kg\n",
493 | "results = query_kg(kg, query)\n",
494 | "print(results)"
495 | ]
496 | },
497 | {
498 | "attachments": {},
499 | "cell_type": "markdown",
500 | "metadata": {},
501 | "source": [
502 | "We can now generate a human-readable answer from the results of the query:"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": 32,
508 | "metadata": {},
509 | "outputs": [
510 | {
511 | "data": {
512 | "text/plain": [
513 | "'The query returned two softwares written in Python: SDSC-ORD/gimie and SDSC-ORD/zarr_linked_data.'"
514 | ]
515 | },
516 | "execution_count": 32,
517 | "metadata": {},
518 | "output_type": "execute_result"
519 | }
520 | ],
521 | "source": [
522 | "from aikg.utils.chat import generate_answer\n",
523 | "generate_answer(QUESTION, query, results, answer_chain)"
524 | ]
525 | }
526 | ],
527 | "metadata": {
528 | "kernelspec": {
529 | "display_name": "aikg-URVQdnEY-py3.10",
530 | "language": "python",
531 | "name": "python3"
532 | },
533 | "language_info": {
534 | "codemirror_mode": {
535 | "name": "ipython",
536 | "version": 3
537 | },
538 | "file_extension": ".py",
539 | "mimetype": "text/x-python",
540 | "name": "python",
541 | "nbconvert_exporter": "python",
542 | "pygments_lexer": "ipython3",
543 | "version": "3.11.8"
544 | },
545 | "orig_nbformat": 4
546 | },
547 | "nbformat": 4,
548 | "nbformat_minor": 2
549 | }
550 |
--------------------------------------------------------------------------------
/aikg/notebooks/sphn_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "## Example use-case\n",
9 | "\n",
10 | "In this notebook, we showcase a simple question answering task. We will use the [SPHN ontology](https://www.biomedit.ch/rdf/sphn-ontology/sphn), along with a small mock dataset which contains information artificial medical data."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "\n",
20 | "SPARQL_TEMPLATE = \"\"\"\n",
21 | "Generate a SPARQL query to answer the input question. A sample of the knowledge graph schema is provided to help construct the query.\n",
22 | "After you generate the sparql, you should display it.\n",
23 | "When generating sparql:\n",
24 | "* never enclose the sparql in back-quotes.\n",
25 | "* always include the prefix declarations.\n",
26 | "* prefer using OPTIONAL when selecting multiple variables.\n",
27 | "* Allow case-insensitive matching of strings.\n",
28 | "\n",
29 | "Use the following format:\n",
30 | "\n",
31 | "Question: the input question for which you must generate a SPARQL query\n",
32 | "Information: the schema information in RDF format. This will help you generate the sparql query with the correct format.\n",
33 | "\n",
34 | "Question: {question_str}\n",
35 | "Information:\n",
36 | "{context_str}\n",
37 | "Answer:\n",
38 | "\"\"\"\n",
39 | "\n",
40 | "ANSWER_TEMPLATE = \"\"\"\n",
41 | "The following describe a user question, associated SPARQL query and the result from executing the query.\n",
42 | "Based on this information, write an answer in simple terms that describes the results.\n",
43 | "When appropriate, use markdown formatting to format the results into a table or bullet points.\n",
44 | "\n",
45 | "Question:\n",
46 | "{question_str}\n",
47 | "Query:\n",
48 | "{query_str}\n",
49 | "Result:\n",
50 | "{result_str}\n",
51 | "Answer:\n",
52 | "\"\"\""
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "We setup a similar configuration as in the nl_sparql notebook, but we have one sparql configuration for the ontology, and one for the instance data, each living in different files."
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 2,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "from aikg.config import ChatConfig, ChromaConfig, SparqlConfig\n",
69 | "\n",
70 | "chroma_config = ChromaConfig(\n",
71 | " host=\"local\",\n",
72 | " port=8000,\n",
73 | " collection_name=\"test\",\n",
74 | " embedding_model=\"all-MiniLM-L6-v2\",\n",
75 | " persist_directory=\"/tmp/chroma-test/\",\n",
76 | ")\n",
77 | "ontology_config = SparqlConfig(\n",
78 | " endpoint=\"../sphn/sphn_ontology_2023_2.ttl\",\n",
79 | ")\n",
80 | "kg_config = SparqlConfig(\n",
81 | " endpoint=\"../sphn/sphn_mock_data_2023_2.ttl\",\n",
82 | ")\n",
83 | "\n",
84 | "chat_config = ChatConfig(\n",
85 | " model_id=\"lmsys/vicuna-7b-v1.3\",\n",
86 | " max_new_tokens=48,\n",
87 | " max_input_size=2048,\n",
88 | " num_output=256,\n",
89 | " max_chunk_overlap=20,\n",
90 | " answer_template=ANSWER_TEMPLATE,\n",
91 | " sparql_template=SPARQL_TEMPLATE\n",
92 | ")\n"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 3,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "import os\n",
102 | "os.environ[\"OPENAI_API_KEY\"] = \"sk-...a\""
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 4,
108 | "metadata": {},
109 | "outputs": [
110 | {
111 | "name": "stderr",
112 | "output_type": "stream",
113 | "text": [
114 | "Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#gYear, Converter=\n",
115 | "Traceback (most recent call last):\n",
116 | " File \"/home/cmatthey/.cache/pypoetry/virtualenvs/aikg-ULDgE_fB-py3.10/lib/python3.10/site-packages/rdflib/term.py\", line 2084, in _castLexicalToPython\n",
117 | " return conv_func(lexical) # type: ignore[arg-type]\n",
118 | " File \"/home/cmatthey/.cache/pypoetry/virtualenvs/aikg-ULDgE_fB-py3.10/lib/python3.10/site-packages/isodate/isodates.py\", line 203, in parse_date\n",
119 | " raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)\n",
120 | "isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: '-1508+14:00'\n",
121 | "Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#gYear, Converter=\n",
122 | "Traceback (most recent call last):\n",
123 | " File \"/home/cmatthey/.cache/pypoetry/virtualenvs/aikg-ULDgE_fB-py3.10/lib/python3.10/site-packages/rdflib/term.py\", line 2084, in _castLexicalToPython\n",
124 | " return conv_func(lexical) # type: ignore[arg-type]\n",
125 | " File \"/home/cmatthey/.cache/pypoetry/virtualenvs/aikg-ULDgE_fB-py3.10/lib/python3.10/site-packages/isodate/isodates.py\", line 203, in parse_date\n",
126 | " raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)\n",
127 | "isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: '-2358+01:14'\n",
128 | "Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#gYear, Converter=\n",
129 | "Traceback (most recent call last):\n",
130 | " File \"/home/cmatthey/.cache/pypoetry/virtualenvs/aikg-ULDgE_fB-py3.10/lib/python3.10/site-packages/rdflib/term.py\", line 2084, in _castLexicalToPython\n",
131 | " return conv_func(lexical) # type: ignore[arg-type]\n",
132 | " File \"/home/cmatthey/.cache/pypoetry/virtualenvs/aikg-ULDgE_fB-py3.10/lib/python3.10/site-packages/isodate/isodates.py\", line 203, in parse_date\n",
133 | " raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)\n",
134 | "isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: '-2221+14:00'\n"
135 | ]
136 | }
137 | ],
138 | "source": [
139 | "\n",
140 | "from aikg.utils.llm import setup_llm_chain\n",
141 | "from aikg.utils.rdf import setup_kg\n",
142 | "\n",
143 | "\n",
144 | "# Use OpenAI API\n",
145 | "from langchain.llms import OpenAI\n",
146 | "llm = OpenAI(model_name=\"text-davinci-003\")\n",
147 | "\n",
148 | "# For now, both chains share the same model to spare memory\n",
149 | "answer_chain = setup_llm_chain(llm, chat_config.answer_template)\n",
150 | "sparql_chain = setup_llm_chain(llm, chat_config.sparql_template)\n",
151 | "kg = setup_kg(**kg_config.dict())\n",
152 | "\n",
153 | "# Embed ontology\n",
154 | "from aikg.flows.chroma_build import chroma_build_flow\n",
155 | "chroma_build_flow(chroma_config, ontology_config)"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 5,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "name": "stderr",
165 | "output_type": "stream",
166 | "text": [
167 | "/home/cmatthey/.cache/pypoetry/virtualenvs/aikg-ULDgE_fB-py3.10/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
168 | " from .autonotebook import tqdm as notebook_tqdm\n"
169 | ]
170 | }
171 | ],
172 | "source": [
173 | "\n",
174 | "from aikg.utils.chroma import setup_client, setup_collection\n",
175 | "client = setup_client(\n",
176 | " chroma_config.host,\n",
177 | " chroma_config.port,\n",
178 | " chroma_config.persist_directory,\n",
179 | ")\n",
180 | "collection = setup_collection(\n",
181 | " client,\n",
182 | " chroma_config.collection_name,\n",
183 | " chroma_config.embedding_model,\n",
184 | ")\n"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 45,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "QUESTION = \"Please give me the number of healthcare encounters recorded per year.\""
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 46,
199 | "metadata": {},
200 | "outputs": [
201 | {
202 | "name": "stdout",
203 | "output_type": "stream",
204 | "text": [
205 | "\n",
206 | "PREFIX ns1: \n",
207 | "PREFIX ns2: \n",
208 | "PREFIX owl: \n",
209 | "PREFIX rdf: \n",
210 | "PREFIX rdfs: \n",
211 | "PREFIX xsd: \n",
212 | "\n",
213 | "SELECT (COUNT(*) AS ?encounters) (YEAR(?startDateTime) AS ?year)\n",
214 | "WHERE {\n",
215 | " ?encounter a ns2:HealthcareEncounter ;\n",
216 | " ns2:hasStartDateTime ?startDateTime .\n",
217 | "}\n",
218 | "GROUP BY ?year\n"
219 | ]
220 | }
221 | ],
222 | "source": [
223 | "from aikg.utils.chat import generate_sparql\n",
224 | "query = generate_sparql(QUESTION, collection, sparql_chain)\n",
225 | "print(query)"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 47,
231 | "metadata": {},
232 | "outputs": [
233 | {
234 | "name": "stdout",
235 | "output_type": "stream",
236 | "text": [
237 | "[(rdflib.term.Literal('2', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')), rdflib.term.Literal('2009', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))]\n"
238 | ]
239 | }
240 | ],
241 | "source": [
242 | "from aikg.utils.rdf import query_kg\n",
243 | "results = query_kg(kg, query)\n",
244 | "print(results)"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 48,
250 | "metadata": {},
251 | "outputs": [
252 | {
253 | "name": "stdout",
254 | "output_type": "stream",
255 | "text": [
256 | "In 2009 there were 2 healthcare encounters recorded.\n"
257 | ]
258 | }
259 | ],
260 | "source": [
261 | "from aikg.utils.chat import generate_answer\n",
262 | "print(generate_answer(QUESTION, query, results, answer_chain))"
263 | ]
264 | }
265 | ],
266 | "metadata": {
267 | "kernelspec": {
268 | "display_name": "aikg-URVQdnEY-py3.10",
269 | "language": "python",
270 | "name": "python3"
271 | },
272 | "language_info": {
273 | "codemirror_mode": {
274 | "name": "ipython",
275 | "version": 3
276 | },
277 | "file_extension": ".py",
278 | "mimetype": "text/x-python",
279 | "name": "python",
280 | "nbconvert_exporter": "python",
281 | "pygments_lexer": "ipython3",
282 | "version": "3.10.6"
283 | },
284 | "orig_nbformat": 4
285 | },
286 | "nbformat": 4,
287 | "nbformat_minor": 2
288 | }
289 |
--------------------------------------------------------------------------------
/aikg/server.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | """This is the chat server. It receives JSON messages with a question,
19 | fetches context for that question in a vector store and injects them into a prompt.
20 | It then sends the prompt to a LLM and returns the response to the client.
21 | """
22 | from datetime import datetime
23 | import logging
24 | import os
25 | import sys
26 |
27 | from dotenv import load_dotenv
28 | from fastapi import FastAPI
29 | from langchain.chat_models import ChatOpenAI
30 | from pathlib import Path
31 |
32 | from aikg.config import ChatConfig, ChromaConfig, SparqlConfig
33 | from aikg.config.common import parse_yaml_config
34 | from aikg.models import Conversation, Message
35 | from aikg.utils.chat import generate_answer, generate_examples, generate_sparql
36 | from aikg.utils.llm import setup_llm_chain
37 | from aikg.utils.chroma import setup_collection, setup_client
38 | from aikg.utils.rdf import setup_kg, query_kg
39 |
40 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
41 |
42 | load_dotenv()
43 | chroma_config = ChromaConfig()
44 | sparql_config = SparqlConfig()
45 | if os.environ.get("CHAT_CONFIG"):
46 | chat_config = parse_yaml_config(Path(os.environ["CHAT_CONFIG"]), ChatConfig)
47 | else:
48 | chat_config = ChatConfig()
49 |
50 |
51 | client = setup_client(
52 | chroma_config.host,
53 | chroma_config.port,
54 | chroma_config.persist_directory,
55 | )
56 | collection = setup_collection(
57 | client,
58 | chroma_config.collection_name,
59 | chroma_config.embedding_model,
60 | )
61 |
62 | llm = ChatOpenAI(
63 | model_name=chat_config.model,
64 | openai_api_key=chat_config.openai_api_key,
65 | openai_api_base=chat_config.openai_api_base,
66 | )
67 |
68 | answer_chain = setup_llm_chain(llm, chat_config.answer_template)
69 | sparql_chain = setup_llm_chain(llm, chat_config.sparql_template)
70 | kg = setup_kg(**sparql_config.dict())
71 | app = FastAPI()
72 |
73 |
74 | @app.get("/")
75 | def index():
76 | return {
77 | "title": "Hello, welcome to the knowledge graph chatbot!",
78 | "description": "This is a simple chatbot that uses a knowledge graph to answer questions.",
79 | "usage": "Ask a single question using /ask?question='...', or only generate the query using /sparql?question='...'.",
80 | }
81 |
82 |
83 | @app.get("/test/")
84 | async def test() -> Message:
85 | return Message(text="Hello, world!", sender="AI", time=datetime.now())
86 |
87 |
88 | @app.get("/ask/")
89 | async def ask(question: str) -> Message:
90 | """Generate sparql query from question
91 | and execute query on kg and return an answer based on results."""
92 | ...
93 | query = generate_sparql(question, collection, sparql_chain, limit=15)
94 | results = query_kg(kg, query)
95 | answer = generate_answer(question, query, results, answer_chain)
96 | return Message(text=answer, sender="AI", time=datetime.now())
97 |
98 |
99 | @app.get("/sparql/")
100 | async def sparql(question: str) -> Message:
101 | """Generate and return sparql query from question."""
102 | query = generate_sparql(question, collection, sparql_chain)
103 | return Message(text=query, sender="AI", time=datetime.now())
104 |
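105 | # Example requests once the server is running (e.g. started locally with
106 | # `uvicorn aikg.server:app --port 8001`, as in docker-compose.yml); the question
107 | # text below is only an illustration:
108 | #
109 | #   curl "http://localhost:8001/test/"
110 | #   curl "http://localhost:8001/sparql/?question=What%20softwares%20are%20written%20in%20Python%3F"
111 | #   curl "http://localhost:8001/ask/?question=What%20softwares%20are%20written%20in%20Python%3F"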
--------------------------------------------------------------------------------
/aikg/utils/chat.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | """Utilities to help processing chatbot prompts or answers."""
19 | from typing import Any, Iterable
20 |
21 | from chromadb.api import Collection
22 | from rdflib import Graph
23 | from langchain import LLMChain
24 |
25 |
26 | def keep_first_line(text: str) -> str:
27 | r"""Truncate a string to the first non-empty line.
28 |
29 | Examples
30 | --------
31 | >>> keep_first_line("\nFirst line.\nSecond line.")
32 | 'First line.'
33 | """
34 | return text.lstrip("\n").split("\n")[0].strip(" ")
35 |
36 |
37 | def drop_if_keyword(text: str, keyword: str = "Not found.") -> str:
38 | """If input keyword occurs in text, replace it with the keyword.
39 |
40 | Examples
41 | --------
42 | >>> drop_if_keyword("Not found. Some made up answer.", keyword="Not found.")
43 | 'Not found.'
44 | """
45 | if keyword in text:
46 | return keyword
47 | return text
48 |
49 |
50 | def post_process_answer(answer: str) -> str:
51 | """Post-process an answer by keeping only the first line and dropping
52 | it if it contains the keyword 'Not found.'."""
53 | text = keep_first_line(answer)
54 | text = drop_if_keyword(text)
55 | return text
56 |
57 |
58 | def generate_sparql(
59 | question: str,
60 | collection: Collection,
61 | llm_chain: LLMChain,
62 | examples: str = "",
63 | limit: int = 5,
64 | ) -> str:
65 | """Retrieve k-nearest documents from the vector store and synthesize
66 | SPARQL query."""
67 |
68 | # Retrieve documents and triples from top k subjects
69 | results = collection.query(query_texts=question, n_results=limit)
70 | # Extract triples and concatenate as a ntriples string
71 | triples = "\n".join([res.get("triples", "") for res in results["metadatas"][0]])
72 | # Convert to turtle for better readability and fewer tokens
73 | triples = Graph().parse(data=triples).serialize(format="turtle")
74 | query = llm_chain.run(
75 | question_str=question, context_str=triples, examples_str=examples
76 | )
77 | return query
78 |
79 |
80 | def generate_examples(
81 | question: str,
82 | collection: Collection,
83 | limit: int = 5,
84 | ) -> str:
85 | """Retrieve k-nearest questions from the examples in the vector store and return them
86 | together with their corresponding query."""
87 |
88 | # Retrieve documents and triples from top k subjects
89 | examples = collection.query(query_texts=question, n_results=limit)
90 | # Extract relevant information from dict
91 | example_docs = examples["documents"][0]
92 | example_meta = examples["metadatas"][0]
93 | # Build a prompt section listing each example question with its query
94 | example_prompt = "Examples: \n\n"
95 | for doc, meta in zip(example_docs, example_meta):
96 | example_prompt += f"""
97 | Question:
98 | {doc}
99 | Query:
100 | {meta['query']}
101 | """
102 | return example_prompt
103 |
104 |
105 | def generate_answer(
106 | question: str,
107 | query: str,
108 | results: Iterable[Any],
109 | llm_chain: LLMChain,
110 | ) -> str:
111 | """
112 | Given a question, associated SPARQL query and execution result,
113 | use a LLM to generate a natural language answer describing the results.
114 | """
115 | # Concatenate the query results into a single string for the prompt
116 | fmt_results = ["\n".join(map(str, results))]
117 | answer = llm_chain.run(
118 | query_str=query, question_str=question, result_str=fmt_results
119 | )
120 | return answer
121 |
--------------------------------------------------------------------------------
/aikg/utils/chroma.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | import chromadb
19 | from chromadb.api import ClientAPI, Collection
20 |
21 |
22 | def setup_client(host: str, port: int, persist_directory: str = ".chroma") -> ClientAPI:
23 | """Prepare chromadb client. If host is 'local', chromadb will run in client-only mode."""
24 | if host == "local":
25 | chroma_client = chromadb.PersistentClient(path=persist_directory)
26 | else:
27 | chroma_client = chromadb.HttpClient(host=host, port=str(port))
28 | return chroma_client
29 |
30 |
31 | def setup_collection(
32 | client: ClientAPI,
33 | collection_name: str,
34 | embedding_model: str,
35 | ) -> Collection:
36 | """Setup the connection to ChromaDB collection."""
37 |
38 | from chromadb.utils import embedding_functions
39 |
40 | embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
41 | model_name=embedding_model
42 | )
43 | collection = client.get_or_create_collection(
44 | collection_name, embedding_function=embedding_function
45 | )
46 | return collection
47 |
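48 |
49 | if __name__ == "__main__":
50 |     # Minimal usage sketch: a local, persistent client with the same values as
51 |     # the example notebooks (collection name and embedding model are illustrative).
52 |     client = setup_client("local", 8000, persist_directory="/tmp/chroma-test/")
53 |     collection = setup_collection(client, "test", "all-MiniLM-L6-v2")
54 |     print(f"Collection '{collection.name}' holds {collection.count()} documents.")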
--------------------------------------------------------------------------------
/aikg/utils/io.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | import requests
19 | from pathlib import Path
20 | from typing import TextIO
21 | from langchain.schema import Document
22 | from tqdm import tqdm
23 |
24 |
25 | def download_file(url: str, output_path: str | Path):
26 | # send a GET request to the URL to download the file. Stream since it's large
27 | response = requests.get(url, stream=True)
28 |
29 | # open the file in binary mode and write the contents of the response to it in chunks
30 | # This is a large file, so be prepared to wait.
31 | with open(output_path, "wb") as f:
32 | for chunk in tqdm(response.iter_content(chunk_size=8192)):
33 | if chunk:
34 | f.write(chunk)
35 |
36 |
37 | def parse_sparql_example(example: TextIO) -> Document:
38 | """
39 | Parse a text stream whose first line is a question (starting with '#')
40 | and whose remaining lines are a SPARQL query. The content is reformatted into a document
41 | whose page content is the question, with the query attached as metadata.
42 | """
43 | # Create temp variable to process text stream
44 | example_temp = []
45 | example_temp.append(example.read())
46 | # Splitting the file content into lines
47 | lines = example_temp[0].split("\n")
48 | # Extracting the question (removing '#' from the first line)
49 | question = lines[0].strip()[1:]
50 | # Extracting the SPARQL query from the remaining lines
51 | sparql_query = "\n".join(lines[1:])
52 | # Create example document for the output
53 | example_doc = Document(page_content=question, metadata={"query": sparql_query})
54 | return example_doc
55 |
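56 |
57 | if __name__ == "__main__":
58 |     # Illustrative example: parse a question/query pair from an in-memory stream.
59 |     # Run from the repository root (e.g. `python -m aikg.utils.io`) so the stdlib
60 |     # `io` module is not shadowed by this file.
61 |     from io import StringIO
62 |
63 |     example = StringIO("#Which softwares are written in Python?\nSELECT ?s WHERE { ?s ?p ?o }")
64 |     doc = parse_sparql_example(example)
65 |     print(doc.page_content)       # Which softwares are written in Python?
66 |     print(doc.metadata["query"])  # SELECT ?s WHERE { ?s ?p ?o }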
--------------------------------------------------------------------------------
/aikg/utils/llm.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | import re
19 |
20 | from langchain import LLMChain, PromptTemplate
21 | from langchain.llms.base import LLM
22 |
23 |
24 | def setup_llm_chain(llm: LLM, prompt_template: str) -> LLMChain:
25 | """Prepare the prompt injection and text generation system."""
26 | # Auto-detecting prompt variables surrounded by single curly braces
27 | variables = re.findall(r"[^{]{([^} \n]+)}[^}]", prompt_template)
28 | prompt = PromptTemplate(
29 | template=prompt_template,
30 | input_variables=variables,
31 | )
32 | return LLMChain(prompt=prompt, llm=llm)
33 |
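34 |
35 | if __name__ == "__main__":
36 |     # Illustrative check of the prompt-variable auto-detection (no LLM call is made);
37 |     # the template mirrors the prompts used in the notebooks.
38 |     template = "Question: {question_str}\nInformation:\n{context_str}\nAnswer:\n"
39 |     print(re.findall(r"[^{]{([^} \n]+)}[^}]", template))  # ['question_str', 'context_str']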
--------------------------------------------------------------------------------
/aikg/utils/rdf.py:
--------------------------------------------------------------------------------
1 | # kg-llm-interface
2 | # Copyright 2023 - Swiss Data Science Center (SDSC)
3 | # A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
4 | # Eidgenössische Technische Hochschule Zürich (ETHZ).
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from itertools import groupby
19 | from pathlib import Path
20 | from typing import Any, Dict, Iterable, Iterator, List, Optional
21 |
22 | from langchain.schema import Document
23 | from rdflib import ConjunctiveGraph, Graph
24 | from rdflib.exceptions import ParserError
25 | from SPARQLWrapper import SPARQLWrapper, CSV
26 | from urllib.parse import urlparse
27 |
28 | # Retrieve triples of human readable labels/values from a SPARQL endpoint.
29 | TRIPLE_LABEL_QUERY = """
30 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
31 |
32 | SELECT ?s ?p ?o ?sLab ?pLab ?oClean
33 | WHERE
34 | {{
35 | ?s ?p ?o .
36 | ?s rdfs:label ?sLab .
37 | ?p rdfs:label ?pLab .
38 | OPTIONAL {{
39 | ?o rdfs:label ?oLab .
40 | FILTER(LANG(?oLab) = "{lang}")
41 | }}
42 | BIND(COALESCE(?oLab, ?o) AS ?oLabOrUri)
43 | BIND(
44 | IF (isLiteral(?o), ?o, STR(?oLabOrUri))
45 | AS ?oLabOrVal
46 | )
47 | FILTER(LANG(?sLab) = "{lang}")
48 | FILTER(LANG(?pLab) = "{lang}")
49 | FILTER(LANG(?oLabOrVal) = "{lang}" || LANG(?oLabOrVal) = "")
50 | BIND (REPLACE(STR(?oLabOrVal), "^.*[#/:]([^/:#]*)$", "$1") as ?oClean)
51 | {graph_mask}
52 | }}
53 | """
54 |
55 | # Retrieve each subject and its annotations
56 | SUBJECT_DOC_QUERY = """
57 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
58 | PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
59 | PREFIX sh: <http://www.w3.org/ns/shacl#>
60 | PREFIX schema: <http://schema.org/>
61 | SELECT DISTINCT ?s (SAMPLE(?sLab) as ?sLabel) ?sCom
62 | WHERE
63 | {{
64 | VALUES ?labelProp {{skos:prefLabel rdfs:label sh:name schema:name}}
65 | VALUES ?defProp {{rdfs:comment skos:definition sh:description schema:description }}
66 | ?s ?labelProp ?sLab .
67 | OPTIONAL {{
68 | ?s ?defProp ?sCom .
69 | }}
70 | FILTER(LANG(?sLab) = "{lang}" || LANG(?sLab) = "")
71 | FILTER(LANG(?sCom) = "{lang}" || LANG(?sCom) = "")
72 | {graph_mask}
73 | }}
74 | GROUP BY ?s ?sCom
75 | """
76 |
77 |
78 | def is_uri(uri: str):
79 | """Checks if input is a valid URI."""
80 |
81 | try:
82 | result = urlparse(uri)
83 | return all([result.scheme, result.netloc])
84 | except AttributeError:
85 | return False
86 |
87 |
88 | def make_graph_mask(graph: Optional[str] = None) -> str:
89 | if graph:
90 | return f"FILTER EXISTS {{ GRAPH <{graph}> {{ ?s ?p ?o }} }}"
91 | else:
92 | return ""
93 |
94 |
95 | def setup_kg(
96 | endpoint: str, user: Optional[str] = None, password: Optional[str] = None
97 | ) -> Graph | SPARQLWrapper:
98 | """Try to connect to SPARQL endpoint. If not a URL, attempt
99 | to parse RDF file with rdflib."""
100 |
101 | if is_uri(endpoint):
102 | kg = SPARQLWrapper(endpoint)
103 | kg.setReturnFormat(CSV)
104 | if user and password:
105 | kg.setCredentials(user, password)
106 | else:
107 | kg = ConjunctiveGraph()
108 | kg.parse(endpoint)
109 | return kg
110 |
111 |
112 | def split_documents_from_endpoint(
113 | kg: Graph | SPARQLWrapper,
114 | graph: Optional[str] = None,
115 | ) -> Iterator[Document]:
116 | """Load subject-based documents from a SPARQL endpoint.
117 |
118 | Parameters
119 | ----------
120 | kg:
121 | Knowledge graph to query: either a local rdflib Graph or a
122 | SPARQLWrapper for a remote SPARQL endpoint (see setup_kg,
123 | which also handles authentication).
126 | graph:
127 | URI of named graph to load RDF data from.
128 | If not specified, all subjects are used.
129 | """
130 |
131 | graph_mask = make_graph_mask(graph)
132 |
133 | # Load the query results
134 | # Query results contain 6 columns:
135 | # subject, predicate, object, subject label, predicate label, object label
136 | results = query_kg(kg, TRIPLE_LABEL_QUERY.format(lang="en", graph_mask=graph_mask))
137 | # skip header if present
138 | if not is_uri(results[0][0]):
139 | results = results[1:]
140 | # Exclude empty / incomplete results (e.g. missing labels)
141 | results = filter(lambda x: len(list(x)) == 6, results)
142 | results = sorted(results, key=lambda x: x[0])[1:]
143 | # Yield triples and text by subject
144 | for k, g in groupby(results, lambda x: x[0]):
145 | # Original triples about subject k
146 | data = list(g)
147 | triples = "\n".join([f"<{s}> <{p}> <{o}>" for s, p, o, _, _, _ in data])
148 | # Human-readable "triples" about subject k
149 | doc = "\n".join([" ".join(elem[3:]) for elem in data])
150 | yield Document(page_content=doc, metadata={"subject": k, "triples": triples})
151 |
152 |
153 | def get_subjects_docs(
154 | kg: Graph | SPARQLWrapper, graph: Optional[str] = None
155 | ) -> List[Document]:
156 | """Given an RDF graph, iterate over subjects, extract human-readable
157 | RDFS annotations. For each subject, retrieve a "text document" with
158 | original triples attached as metadata."""
159 |
160 | results = query_kg(
161 | kg, SUBJECT_DOC_QUERY.format(lang="en", graph_mask=make_graph_mask(graph))
162 | )
163 | docs = []
164 | # skip header if present
165 | if not is_uri(results[0][0]):
166 | results = results[1:]
167 |
168 | for sub, label, comment in results:
169 | text = f"""
170 | {label}
171 | {comment or ''}
172 | """
173 | triples = query_kg(kg, f"DESCRIBE <{sub}>")
174 |
175 | g = Graph()
176 | # SPARQLWrapper returns a ntriple string, rdflib a list of triples
177 | try:
178 | g.parse(data=triples[0][0], format="nt")
179 | except (RuntimeError, ParserError):
180 | for triple in triples:
181 | g.add(triple)
182 | meta = {"triples": g.serialize(format="nt")}
183 | docs.append(Document(page_content=text, metadata=meta))
184 | return docs
185 |
186 |
187 | def query_kg(kg: Graph | SPARQLWrapper, query: str) -> List[List[Any]]:
188 | """Query a knowledge graph, either an rdflib Graph or a SPARQLWrapper.
189 | Results are returned as a list of lists representing a table."""
190 | query2fmt = {"DESCRIBE": "nt", "SELECT": "csv", "CONSTRUCT": "nt"}
191 | if isinstance(kg, Graph):
192 | resp = kg.query(query)
193 | fmt = query2fmt[resp.type]
194 | raw_results = resp.serialize(format=fmt)
195 |
196 | elif isinstance(kg, SPARQLWrapper):
197 | kg.setQuery(query)
198 | fmt = query2fmt[kg.queryType]
199 | kg.setReturnFormat(fmt)
200 | raw_results = kg.query().convert()
201 | else:
202 | raise ValueError(f"Invalid type for kg: {type(kg)}")
203 | if fmt == "csv":
204 | import csv
205 |
206 | lines = raw_results.decode("utf-8").splitlines()
207 | return [row for row in csv.reader(lines, quotechar='"', delimiter=",") if row]
208 | else:
209 | return [[raw_results]]
210 |
212 |
--------------------------------------------------------------------------------
/data/models/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdsc-ordes/kg-llm-interface/dd5f967d2dfbd2fb718450f54aaa4b69635c2642/data/models/.gitkeep
--------------------------------------------------------------------------------
/data/test_data.trig:
--------------------------------------------------------------------------------
1 | @prefix ex: .
2 | @prefix rdf: .
3 | @prefix rdfs: .
4 | @prefix schema1: .
5 | @prefix xsd: .
6 |
7 | ex:ontology {
8 | schema1:Organization a rdfs:Class ;
9 | rdfs:label "Organization" ;
10 | rdfs:comment "An organization such as a school, NGO, corporation, club, etc." .
11 |
12 | schema1:Person a rdfs:Class ;
13 | rdfs:label "Person" ;
14 | rdfs:comment "A person (alive, dead, undead, or fictional)." .
15 |
16 | schema1:SoftwareSourceCode a rdfs:Class ;
17 | rdfs:label "SoftwareSourceCode" ;
18 | rdfs:comment "Computer programming source code. Example: Full (compile ready) solutions, code snippet samples, scripts, templates." ;
19 | rdfs:subClassOf schema1:CreativeWork .
20 |
21 | schema1:affiliation a rdf:Property ;
22 | rdfs:label "affiliation" ;
23 | rdfs:comment "An organization that this person is affiliated with. For example, a school/university, a club, or a team." ;
24 | rdfs:domain schema1:Person ;
25 | rdfs:range schema1:Organization .
26 |
27 | schema1:author a rdf:Property ;
28 | rdfs:label "author" ;
29 | rdfs:comment "The author of this content or rating." ;
30 | rdfs:domain schema1:SoftwareSourceCode ;
31 | rdfs:range schema1:Person .
32 |
33 | schema1:datePublished a rdf:Property ;
34 | rdfs:label "datePublished" ;
35 | rdfs:comment "Date of first broadcast/publication." ;
36 | rdfs:domain schema1:SoftwareSourceCode ;
37 | rdfs:range xsd:date .
38 |
39 | schema1:description a rdf:Property ;
40 | rdfs:label "description" ;
41 | rdfs:comment "A description of the item." ;
42 | rdfs:domain schema1:SoftwareSourceCode ;
43 | rdfs:range xsd:string .
44 |
45 | schema1:keywords a rdf:Property ;
46 | rdfs:label "keywords" ;
47 | rdfs:comment "Keywords or tags used to describe some item. Multiple textual entries in a keywords list are typically delimited by commas, or by repeating the property." ;
48 | rdfs:domain schema1:SoftwareSourceCode ;
49 | rdfs:range xsd:string .
50 |
51 | schema1:license a rdf:Property ;
52 | rdfs:label "license" ;
53 | rdfs:comment "A license document that applies to this content, typically indicated by URL." ;
54 | rdfs:domain schema1:SoftwareSourceCode ;
55 | rdfs:range xsd:anyUri,
56 | xsd:string .
57 |
58 | schema1:name a rdf:Property ;
59 | rdfs:label "name" ;
60 | rdfs:comment "The name of the item." ;
61 | rdfs:domain schema1:Organization,
62 | schema1:Person,
63 | schema1:SoftwareSourceCode ;
64 | rdfs:range xsd:string .
65 |
66 | schema1:programmingLanguage a rdf:Property ;
67 | rdfs:label "programming language" ;
68 | rdfs:comment "The computer programming language." ;
69 | rdfs:domain schema1:SoftwareSourceCode ;
70 | rdfs:range xsd:string .
71 |
72 | schema1:codeRepository a rdf:Property ;
73 | rdfs:label "codeRepository" ;
74 | rdfs:comment "Link to the repository where the un-compiled, human readable code and related code is located (SVN, GitHub, CodePlex)." ;
75 | rdfs:domain schema1:SoftwareSourceCode ;
76 | rdfs:range xsd:anyUri .
77 | }
78 |
79 |
80 | a schema1:SoftwareSourceCode ;
81 | schema1:codeRepository ;
82 | schema1:author ;
83 | schema1:name "SDSC-ORD/gimie" ;
84 | schema1:contributor ,
85 | ,
86 | ,
87 | ,
88 | ,
89 | ,
90 | ;
91 | schema1:dateCreated "2022-12-07" ;
92 | schema1:dateModified "2023-06-07" ;
93 | schema1:description "Extract linked metadata from repositories" ;
94 | schema1:downloadUrl "https://github.com/SDSC-ORD/gimie/archive/refs/tags/0.5.0.tar.gz" ;
95 | schema1:keywords "fair-data",
96 | "git",
97 | "linked-open-data",
98 | "metadata-extraction",
99 | "python",
100 | "scientific-software" ;
101 | schema1:license ;
102 | schema1:programmingLanguage "Python" ;
103 | schema1:version "0.5.0" .
104 |
105 | a schema1:Organization ;
106 | schema1:description "Cross-disciplinary community around research data, voluntary EPFL's researchers and staff with keen interest in research data." ;
107 | schema1:legalName "EPFL Data Champions" ;
108 | schema1:logo ;
109 | schema1:name "EPFL-Data-Champions" .
110 |
111 | a schema1:Organization ;
112 | schema1:description "" ;
113 | schema1:legalName "biocypher" ;
114 | schema1:logo ;
115 | schema1:name "biocypher" .
116 |
117 | a schema1:Person ;
118 | schema1:affiliation ,
119 | ,
120 | ,
121 | ;
122 | schema1:identifier "cmdoret" ;
123 | schema1:name "Cyril Matthey-Doret" .
124 |
125 | a schema1:Organization ;
126 | schema1:description "" ;
127 | schema1:legalName "Romain Koszul Laboratory" ;
128 | schema1:logo ;
129 | schema1:name "koszullab" .
130 |
131 | a schema1:Person ;
132 | schema1:affiliation ,
133 | ;
134 | schema1:identifier "martinfontanet" .
135 |
136 | a schema1:Person ;
137 | schema1:affiliation ;
138 | schema1:identifier "rmfranken" .
139 |
140 | a schema1:Person ;
141 | schema1:affiliation ;
142 | schema1:identifier "sabinem" ;
143 | schema1:name "Sabine Maennel" .
144 |
145 | a schema1:Person ;
146 | schema1:affiliation ,
147 | ;
148 | schema1:identifier "sabrinaossey" ;
149 | schema1:name "sabrinaossey" .
150 |
151 | a schema1:Person ;
152 | schema1:affiliation ,
153 | ;
154 | schema1:identifier "supermaxiste" .
155 |
156 | a schema1:Person ;
157 | schema1:affiliation ;
158 | schema1:identifier "vancauwe" ;
159 | schema1:name "Laure Vancau" .
160 |
161 | a schema1:Organization ;
162 | schema1:description "An ETH Domain initiative for accelerating the adoption of data science" ;
163 | schema1:legalName "Swiss Data Science Center" ;
164 | schema1:logo ;
165 | schema1:name "SwissDataScienceCenter" .
166 |
167 | a schema1:Organization ;
168 | schema1:description "Open Research Data team at the Swiss Data Science Center." ;
169 | schema1:legalName "Swiss Data Science Center - ORD" ;
170 | schema1:logo ;
171 | schema1:name "SDSC-ORD" .
172 |
173 |
174 |
175 | a schema1:SoftwareSourceCode ;
176 | schema1:codeRepository ;
177 | schema1:author ;
178 | schema1:name "SDSC-ORD/kg-llm-interface" ;
179 | schema1:contributor ;
180 | schema1:dateCreated "2023-04-19" ;
181 | schema1:dateModified "2023-07-05" ;
182 | schema1:description "Langchain-powered natural language interface to RDF knowledge-graphs." ;
183 | schema1:license ;
184 | schema1:programmingLanguage "Jupyter Notebook" .
185 |
186 | a schema1:Organization ;
187 | schema1:description "Cross-disciplinary community around research data, voluntary EPFL's researchers and staff with keen interest in research data." ;
188 | schema1:legalName "EPFL Data Champions" ;
189 | schema1:logo ;
190 | schema1:name "EPFL-Data-Champions" .
191 |
192 | a schema1:Organization ;
193 | schema1:description "An ETH Domain initiative for accelerating the adoption of data science" ;
194 | schema1:legalName "Swiss Data Science Center" ;
195 | schema1:logo ;
196 | schema1:name "SwissDataScienceCenter" .
197 |
198 | a schema1:Person ;
199 | schema1:affiliation ,
200 | ,
201 | ,
202 | ;
203 | schema1:identifier "cmdoret" ;
204 | schema1:name "Cyril Matthey-Doret" .
205 |
206 | a schema1:Organization ;
207 | schema1:description "" ;
208 | schema1:legalName "Romain Koszul Laboratory" ;
209 | schema1:logo ;
210 | schema1:name "koszullab" .
211 |
212 | a schema1:Organization ;
213 | schema1:description "Open Research Data team at the Swiss Data Science Center." ;
214 | schema1:legalName "Swiss Data Science Center - ORD" ;
215 | schema1:logo ;
216 | schema1:name "SDSC-ORD" .
217 |
218 |
219 |
220 | a schema1:SoftwareSourceCode ;
221 | schema1:codeRepository ;
222 | schema1:author ;
223 | schema1:name "SDSC-ORD/zarr_linked_data" ;
224 | schema1:contributor ;
225 | schema1:dateCreated "2023-04-06" ;
226 | schema1:dateModified "2023-05-09" ;
227 | schema1:description "The project seeks to make a dataflow composed both of the Zarr data format and linked metadata." ;
228 | schema1:license ;
229 | schema1:programmingLanguage "Python" .
230 |
231 | a schema1:Person ;
232 | schema1:affiliation ;
233 | schema1:identifier "vancauwe" ;
234 | schema1:name "Laure Vancau" .
235 |
236 | a schema1:Organization ;
237 | schema1:description "Open Research Data team at the Swiss Data Science Center." ;
238 | schema1:legalName "Swiss Data Science Center - ORD" ;
239 | schema1:logo ;
240 | schema1:name "SDSC-ORD" .
241 |
242 | a schema1:SoftwareSourceCode ;
243 | schema1:codeRepository ;
244 | schema1:author ;
245 | schema1:name "SDSC-ORD/pxRRead" ;
246 | schema1:contributor ;
247 | schema1:dateCreated "2023-02-20" ;
248 | schema1:dateModified "2023-06-02" ;
249 | schema1:description "Read a px file of fso statistical data" ;
250 | schema1:keywords "parsing",
251 | "statistical-data" ;
252 | schema1:license ;
253 | schema1:programmingLanguage "R" .
254 |
255 | a schema1:Person ;
256 | schema1:affiliation ;
257 | schema1:identifier "sabinem" ;
258 | schema1:name "Sabine Maennel" .
259 |
260 | a schema1:Organization ;
261 | schema1:description "Open Research Data team at the Swiss Data Science Center." ;
262 | schema1:legalName "Swiss Data Science Center - ORD" ;
263 | schema1:logo ;
264 | schema1:name "SDSC-ORD" .
265 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.9'
2 |
3 | networks:
4 | net:
5 | driver: bridge
6 |
7 | services:
8 |
9 | # chat-frontend:
10 | # image: ghcr.io/sdsc-ord/chatllm:latest
11 | # container_name: kg_llm_frontend
12 | # ports:
13 | # - 8000:8000
14 | # networks:
15 | # net:
16 | # profiles:
17 | # - frontend
18 |
19 | chat-server:
20 | build:
21 | context: .
22 | dockerfile: Dockerfile
23 | command: poetry run uvicorn aikg.server:app --reload --port 8001 --host 0.0.0.0
24 | container_name: kg_llm_server
25 | depends_on:
26 | - chroma-server
27 | ports:
28 | - ${SERVER_PORT:-8001}:8001
29 | networks:
30 | net:
31 | env_file:
32 | - .env
33 |
34 | chroma-server:
35 | image: ghcr.io/chroma-core/chroma:0.3.23
36 | volumes:
37 | - index_data:/index_data
38 | command: uvicorn chromadb.app:app --reload --workers 1 --host 0.0.0.0 --port 8000 --log-config log_config.yml
39 | container_name: kg_llm_chroma
40 | environment:
41 | - CHROMA_DB_IMPL=clickhouse
42 | - CLICKHOUSE_HOST=clickhouse
43 | - CLICKHOUSE_PORT=8123
44 | ports:
45 | - ${CHROMA_PORT:-8000}:8000
46 | depends_on:
47 | - clickhouse
48 | profiles:
49 | - db
50 | networks:
51 | net:
52 |
53 |
54 | clickhouse:
55 | image: clickhouse/clickhouse-server:22.9-alpine
56 | container_name: kg_llm_clickhouse
57 | environment:
58 | - ALLOW_EMPTY_PASSWORD=yes
59 | - CLICKHOUSE_TCP_PORT=8999
60 | - CLICKHOUSE_HTTP_PORT=8123
61 | ports:
62 | - '8123:8123'
63 | - '8999:8999'
64 | volumes:
65 | - clickhouse_data:/bitnami/clickhouse
66 | - backups:/backups
67 | - ./config/backup_disk.xml:/etc/clickhouse-server/config.d/backup_disk.xml
68 | - ./config/chroma_users.xml:/etc/clickhouse-server/users.d/chroma.xml
69 | profiles:
70 | - db
71 | networks:
72 | net:
73 |
74 |
75 | volumes:
76 | clickhouse_data:
77 | driver: local
78 | index_data:
79 | driver: local
80 | backups:
81 | driver: local
82 |
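# Note: the chroma-server and clickhouse services are gated behind the "db"
# profile; include it when bringing up the stack, e.g.:
#   docker compose --profile db up -d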
--------------------------------------------------------------------------------
/k8s/README.md:
--------------------------------------------------------------------------------
1 | # Deploying Apps with Kubernetes
2 | This guide provides concise instructions for deploying and managing the apps in this repository with Kubernetes.
3 |
4 | ## Prerequisites
5 | Make sure you have kubectl installed and configured to interact with your Kubernetes cluster.
6 |
7 | Then create a `kg-llm` namespace; it is the default namespace used to deploy the apps in this repository.
8 | ```sh
9 | kubectl create ns kg-llm
10 | ```
11 |
12 | ## Deploy example config
13 | We provide an example config using a kustomize overlay, located under `overlays/custom-config`.
14 | This config downloads example data from the web and injects it into the graphdb and chroma services using init containers, providing a ready-to-use knowledge graph interface with pre-loaded data.
15 |
16 | To deploy it, simply run:
17 |
18 | ```sh
19 | kubectl apply -k overlays/custom-config
20 | ```
21 |
22 | > [!TIP]
23 | > The easiest way to get started with custom data is to copy the provided config into a separate folder (e.g. `overlays/my-config`) and edit it.
24 |
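For example, a minimal sketch of that workflow (the `overlays/my-config` name and `$EDITOR` are illustrative, not files provided by the repository):

```sh
# Copy the example overlay, point it at your own data, then apply it
cp -r overlays/custom-config overlays/my-config
$EDITOR overlays/my-config/params.env   # e.g. adjust TRIPLES_PATH and ONTOLOGY_PATH
kubectl apply -k overlays/my-config
```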
25 |
26 | ## Deploy a Single App
27 |
28 | Deploy a single app with `kubectl apply -k <path-to-app>`.
29 |
30 | For example, to deploy only graphdb in this repository:
31 |
32 | ```sh
33 | kubectl apply -k ./base/graphdb
34 | ```
35 |
36 | ## Deploying Multiple Apps
37 | To deploy all apps at once, run the following command from the `k8s` folder:
38 |
39 | ```sh
40 | kubectl apply -k ./base
41 | ```
42 |
43 | ## Removing an App or Apps
44 | To remove one or more apps, run `kubectl delete -k` with the corresponding directory:
45 |
46 | ```sh
47 | kubectl delete -k <path-to-app>
48 | ```
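For example, to tear down everything that was deployed from `./base`:

```sh
kubectl delete -k ./base
```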
49 |
--------------------------------------------------------------------------------
/k8s/base/chatllm/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: chatllm-server
5 | namespace: kg-llm
6 | spec:
7 | replicas: 1
8 | selector:
9 | matchLabels:
10 | app: chatllm
11 | template:
12 | metadata:
13 | labels:
14 | app: chatllm
15 | spec:
16 | containers:
17 | - name: chatllm-container
18 | image: daniilzhyrov/chatllm:latest
19 | ports:
20 | - containerPort: 0
21 | name: http
22 |
--------------------------------------------------------------------------------
/k8s/base/chatllm/kustomization.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kustomize.config.k8s.io/v1beta1
2 | kind: Kustomization
3 |
4 | resources:
5 | - deployment.yaml
6 | - service.yaml
7 |
8 | configMapGenerator:
9 | - name: chatllm-config
10 | namespace: kg-llm
11 | env: params.env
12 |
13 | generatorOptions:
14 | disableNameSuffixHash: true
15 |
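# Inject TARGET_PORT and EXTERNAL_PORT from params.env (via the generated
# chatllm-config ConfigMap) into the Deployment and Service port fields below.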
16 | replacements:
17 | - source:
18 | kind: ConfigMap
19 | name: chatllm-config
20 | fieldPath: data.TARGET_PORT
21 | targets:
22 | - select:
23 | kind: Deployment
24 | name: chatllm-server
25 | fieldPaths:
26 | - spec.template.spec.containers.[name=chatllm-container].ports.[name=http].containerPort
27 | - select:
28 | kind: Service
29 | name: chatllm-service
30 | fieldPaths:
31 | - spec.ports.[name=http].targetPort
32 | - source:
33 | kind: ConfigMap
34 | name: chatllm-config
35 | fieldPath: data.EXTERNAL_PORT
36 | targets:
37 | - select:
38 | kind: Service
39 | name: chatllm-service
40 | fieldPaths:
41 | - spec.ports.[name=http].port
42 |
--------------------------------------------------------------------------------
/k8s/base/chatllm/params.env:
--------------------------------------------------------------------------------
1 | TARGET_PORT=8501
2 | EXTERNAL_PORT=8501
3 |
--------------------------------------------------------------------------------
/k8s/base/chatllm/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: chatllm-service
5 | namespace: kg-llm
6 | spec:
7 | selector:
8 | app: chatllm
9 | ports:
10 | - protocol: TCP
11 | port: 0
12 | targetPort: 0
13 | name: http
14 |
--------------------------------------------------------------------------------
/k8s/base/chroma/configmap.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 | name: backup-disk-config
5 | namespace: kg-llm
6 | data:
7 | backup_disk.xml: |
8 |     <clickhouse>
9 |       <storage_configuration>
10 |         <disks>
11 |           <backups>
12 |             <type>local</type>
13 |             <path>/etc/clickhouse-server/</path>
14 |           </backups>
15 |         </disks>
16 |       </storage_configuration>
17 |       <backups>
18 |         <allowed_disk>backups</allowed_disk>
19 |         <allowed_path>/etc/clickhouse-server/</allowed_path>
20 |       </backups>
21 |     </clickhouse>
22 | ---
23 | apiVersion: v1
24 | kind: ConfigMap
25 | metadata:
26 | name: chroma-users-config
27 | namespace: kg-llm
28 | data:
29 | chroma.xml: |
30 |
31 |
32 |
33 | 1
34 | 1
35 |
36 |
37 |
38 |
--------------------------------------------------------------------------------
/k8s/base/chroma/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: chroma-server
5 | namespace: kg-llm
6 | spec:
7 | replicas: 1
8 | selector:
9 | matchLabels:
10 | app: chroma
11 | template:
12 | metadata:
13 | labels:
14 | app: chroma
15 | spec:
16 | restartPolicy: Always
17 | containers:
18 | - name: kg-llm-chroma
19 | image: ghcr.io/chroma-core/chroma:0.4.24
20 | command:
21 | - uvicorn
22 | args: ["chromadb.app:app", "--reload", "--workers", "1", "--host", "0.0.0.0", "--port", $(CHROMA_PORT), "--log-config", "chromadb/log_config.yml"]
23 | env:
24 | - name: IS_PERSISTENT
25 | value: "true"
26 | - name: CHROMA_PORT
27 | value: chroma_port_placeholder
28 | ports:
29 | - containerPort: 0
30 | name: chroma-port
31 | volumeMounts:
32 | - name: chroma-index-data
33 | mountPath: /index_data
34 | volumes:
35 | - name: chroma-index-data
36 | persistentVolumeClaim:
37 | claimName: chroma-index-data
38 |
--------------------------------------------------------------------------------
/k8s/base/chroma/kustomization.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kustomize.config.k8s.io/v1beta1
2 | kind: Kustomization
3 |
4 | resources:
5 | - deployment.yaml
6 | - service.yaml
7 | - configmap.yaml
8 | - pvc.yaml
9 |
10 | configMapGenerator:
11 | - name: chroma-config
12 | namespace: kg-llm
13 | envs:
14 | - params.env
15 |
16 | generatorOptions:
17 | disableNameSuffixHash: true
18 |
19 | replacements:
20 | - source:
21 | kind: ConfigMap
22 | name: chroma-config
23 | fieldPath: data.CHROMA_PORT
24 | targets:
25 | - select:
26 | kind: Deployment
27 | name: chroma-server
28 | fieldPaths:
29 | - spec.template.spec.containers.[name=kg-llm-chroma].ports.[name=chroma-port].containerPort
30 | - spec.template.spec.containers.[name=kg-llm-chroma].env.[name=CHROMA_PORT].value
31 | - select:
32 | kind: Service
33 | name: chroma-service
34 | fieldPaths:
35 | - spec.ports.[name=http].targetPort
36 | - spec.ports.[name=http].port
37 |
--------------------------------------------------------------------------------
/k8s/base/chroma/params.env:
--------------------------------------------------------------------------------
1 | CHROMA_PORT=8000
2 | CHROMA_SERVICE_NAME=chroma-service
3 | CHROMA_SERVICE_NAMESPACE=kg-llm
--------------------------------------------------------------------------------
/k8s/base/chroma/pvc.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: chroma-index-data
5 | namespace: kg-llm
6 | spec:
7 | accessModes:
8 | - ReadWriteOnce
9 | resources:
10 | requests:
11 | storage: 10Gi
12 |
--------------------------------------------------------------------------------
/k8s/base/chroma/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: chroma-service
5 | namespace: kg-llm
6 | spec:
7 | type: ClusterIP
8 | selector:
9 | app: chroma
10 | ports:
11 | - protocol: TCP
12 | port: 8000
13 | targetPort: 8000
14 | name: http
15 |
--------------------------------------------------------------------------------
/k8s/base/graphdb/configmap.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 | name: graphdb-repo-config
5 | namespace: kg-llm
6 | data:
7 | graphdb_config.ttl: |
8 | @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
9 | @prefix rep: <http://www.openrdf.org/config/repository#> .
10 | @prefix sr: <http://www.openrdf.org/config/repository/sail#> .
11 | @prefix sail: <http://www.openrdf.org/config/sail#> .
12 | @prefix owlim: <http://www.ontotext.com/trree/owlim#> .
13 |
14 | [] a rep:Repository ;
15 | rep:repositoryID "test" ;
16 | rdfs:label "test" ;
17 | rep:repositoryImpl [
18 | rep:repositoryType "graphdb:SailRepository" ;
19 | sr:sailImpl [
20 | sail:sailType "graphdb:Sail" ;
21 | owlim:base-URL "http://www.ontotext.com/" ;
22 | # other configurations...
23 | ]
24 | ].
25 |
--------------------------------------------------------------------------------
/k8s/base/graphdb/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: graphdb
5 | namespace: kg-llm
6 |
7 | spec:
8 | replicas: 1
9 | selector:
10 | matchLabels:
11 | app: graphdb
12 | template:
13 | metadata:
14 | labels:
15 | app: graphdb
16 | spec:
17 | containers:
18 | - name: graphdb
19 | image: ontotext/graphdb:10.2.0
20 | ports:
21 | - containerPort: 0
22 | name: graphdb
23 | volumeMounts:
24 | - name: graphdb-home
25 | mountPath: /opt/graphdb/home
26 | volumes:
27 | - name: graphdb-home
28 | persistentVolumeClaim:
29 | claimName: graphdb-home
30 |
31 |
--------------------------------------------------------------------------------
/k8s/base/graphdb/kustomization.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kustomize.config.k8s.io/v1beta1
2 | kind: Kustomization
3 |
4 | resources:
5 | - deployment.yaml
6 | - service.yaml
7 | - pvc.yaml
8 | - configmap.yaml
9 |
10 | configMapGenerator:
11 | - name: graphdb-config
12 | namespace: kg-llm
13 | envs:
14 | - params.env
15 |
16 | generatorOptions:
17 | disableNameSuffixHash: true
18 |
19 | replacements:
20 | - source:
21 | kind: ConfigMap
22 | name: graphdb-config
23 | fieldPath: data.GRAPHDB_PORT
24 | targets:
25 | - select:
26 | kind: Deployment
27 | name: graphdb
28 | fieldPaths:
29 | - spec.template.spec.containers.[name=graphdb].ports.[name=graphdb].containerPort
30 | - select:
31 | kind: Service
32 | name: graphdb-service
33 | fieldPaths:
34 | - spec.ports.[name=http].targetPort
35 | - spec.ports.[name=http].port
36 |
--------------------------------------------------------------------------------
/k8s/base/graphdb/params.env:
--------------------------------------------------------------------------------
1 | GRAPHDB_PORT=7200
2 | GRAPHDB_SERVICE_NAME=graphdb-service
3 | GRAPHDB_SERVICE_NAMESPACE=kg-llm
4 |
5 |
--------------------------------------------------------------------------------
/k8s/base/graphdb/pvc.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: graphdb-home
5 | namespace: kg-llm
6 | spec:
7 | accessModes:
8 | - ReadWriteOnce
9 | resources:
10 | requests:
11 | storage: 10Gi
12 |
--------------------------------------------------------------------------------
/k8s/base/graphdb/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: graphdb-service
5 | namespace: kg-llm
6 | spec:
7 | type: ClusterIP
8 | selector:
9 | app: graphdb
10 | ports:
11 | - protocol: TCP
12 | port: 0
13 | targetPort: 0
14 | name: http
15 |
--------------------------------------------------------------------------------
/k8s/base/kg-llm/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: kg-llm
5 | namespace: kg-llm
6 | spec:
7 | replicas: 1
8 | selector:
9 | matchLabels:
10 | app: kg-llm
11 | template:
12 | metadata:
13 | labels:
14 | app: kg-llm
15 | spec:
16 | restartPolicy: Always
17 | containers:
18 | - name: kg-llm-container
19 | image: ghcr.io/sdsc-ordes/kg-llm-interface:latest
20 | env:
21 | - name: GRAPHDB_HOST
22 | value: "servicename.servicenamespace.svc.cluster.local"
23 | - name: GRAPHDB_PORT
24 | value: "0"
25 | - name: GRAPHDB_REPO
26 | value: "test"
27 | - name: SPARQL_ENDPOINT
28 | value: "http://$(GRAPHDB_HOST):$(GRAPHDB_PORT)/repositories/$(GRAPHDB_REPO)"
29 | - name: CHROMA_HOST
30 | value: "servicename.servicenamespace.svc.cluster.local"
31 | - name: CHROMA_PORT
32 | value: "0"
33 | - name: OPENAI_API_KEY
34 | value: "0"
35 | ports:
36 | - containerPort: 80
37 | initContainers:
38 | - name: graphdb-upload-container
39 | image: ghcr.io/sdsc-ordes/kg-llm-interface:latest
40 | command: ["/bin/sh", "-c"]
41 | args:
42 | - |
43 | cd /app;
44 | # setup graphdb
45 | curl -v -u admin:admin -X POST http://${GRAPHDB_HOST}:${GRAPHDB_PORT}/rest/repositories -H 'Content-Type: multipart/form-data' -F config=@/app/graphdb_config.ttl || true;
46 | export SPARQL_ENDPOINT="http://${GRAPHDB_HOST}:${GRAPHDB_PORT}/repositories/test";
47 | # download and uncompress nq.gz instance data
48 | wget -O - ${DATA} | gzip -dc > sample.nq;
49 | # download nt ontology and inject named graph -> combine with instance as nq
50 | wget -O - ${ONTOLOGY} | sed 's|^\(.*\) \.|\1 .|' >> sample.nq ;
51 | poetry run python3 aikg/flows/insert_triples.py /app/sample.nq;
52 | volumeMounts:
53 | - name: graphdb-repo-config-volume
54 | mountPath: /app/graphdb_config.ttl
55 | subPath: graphdb_config.ttl
56 | env:
57 | - name: GRAPHDB_HOST
58 | value: graphdb_host_placeholder
59 | - name: GRAPHDB_PORT
60 | value: graphdb_port_placeholder
61 | envFrom:
62 | - configMapRef:
63 | name: kg-llm-config
64 | - name: chroma-upload-container
65 | image: ghcr.io/sdsc-ordes/kg-llm-interface:latest
66 | command: ["/bin/sh", "-c"]
67 | args:
68 | - |
69 | set -e;
70 | cd /app;
71 | wget -O ontology.nt ${ONTOLOGY};
72 | echo "endpoint: ontology.nt" > ontology.yaml
73 | poetry run python3 aikg/flows/chroma_build.py --sparql-cfg-path ontology.yaml;
74 | env:
75 | - name: CHROMA_HOST
76 | value: chroma_host_placeholder
77 | - name: CHROMA_PORT
78 | value: chroma_port_placeholder
79 | envFrom:
80 | - configMapRef:
81 | name: kg-llm-config
82 |
83 | volumes:
84 | - name: graphdb-repo-config-volume
85 | configMap:
86 | name: graphdb-repo-config
87 |
--------------------------------------------------------------------------------
/k8s/base/kg-llm/kustomization.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kustomize.config.k8s.io/v1beta1
2 | kind: Kustomization
3 |
4 | resources:
5 | - deployment.yaml
6 | - service.yaml
7 | - pvc.yaml
8 |
9 | configMapGenerator:
10 | - name: kg-llm-config
11 | namespace: kg-llm
12 | envs:
13 | - params.env
14 |
15 | generatorOptions:
16 | disableNameSuffixHash: true
17 |
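# Wire the service names, ports and API key from params.env (via the generated
# kg-llm-config ConfigMap) into the env vars of the containers and init
# containers defined in deployment.yaml.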
18 | replacements:
19 | - source:
20 | kind: ConfigMap
21 | name: kg-llm-config
22 | fieldPath: data.GRAPHDB_SERVICE_NAME
23 | targets:
24 | - select:
25 | kind: Deployment
26 | name: kg-llm
27 | fieldPaths:
28 | - spec.template.spec.containers.[name=kg-llm-container].env.[name=GRAPHDB_HOST].value
29 | - spec.template.spec.initContainers.[name=graphdb-upload-container].env.[name=GRAPHDB_HOST].value
30 | - source:
31 | kind: ConfigMap
32 | name: kg-llm-config
33 | fieldPath: data.GRAPHDB_PORT
34 | targets:
35 | - select:
36 | kind: Deployment
37 | name: kg-llm
38 | fieldPaths:
39 | - spec.template.spec.containers.[name=kg-llm-container].env.[name=GRAPHDB_PORT].value
40 | - spec.template.spec.initContainers.[name=graphdb-upload-container].env.[name=GRAPHDB_PORT].value
41 | - source:
42 | kind: ConfigMap
43 | name: kg-llm-config
44 | fieldPath: data.CHROMA_SERVICE_NAME
45 | targets:
46 | - select:
47 | kind: Deployment
48 | name: kg-llm
49 | fieldPaths:
50 | - spec.template.spec.containers.[name=kg-llm-container].env.[name=CHROMA_HOST].value
51 | - spec.template.spec.initContainers.[name=chroma-upload-container].env.[name=CHROMA_HOST].value
52 | - source:
53 | kind: ConfigMap
54 | name: kg-llm-config
55 | fieldPath: data.CHROMA_PORT
56 | targets:
57 | - select:
58 | kind: Deployment
59 | name: kg-llm
60 | fieldPaths:
61 | - spec.template.spec.containers.[name=kg-llm-container].env.[name=CHROMA_PORT].value
62 | - spec.template.spec.initContainers.[name=chroma-upload-container].env.[name=CHROMA_PORT].value
63 | - source:
64 | kind: ConfigMap
65 | name: kg-llm-config
66 | fieldPath: data.OPENAI_API_KEY
67 | targets:
68 | - select:
69 | kind: Deployment
70 | name: kg-llm
71 | fieldPaths:
72 | - spec.template.spec.containers.[name=kg-llm-container].env.[name=OPENAI_API_KEY].value
73 |
--------------------------------------------------------------------------------
/k8s/base/kg-llm/params.env:
--------------------------------------------------------------------------------
1 | ONTOLOGY=https://www.pokemonkg.org/ontology/ontology.nt
2 | DATA=https://www.pokemonkg.org/download/dump/poke-a.nq.gz
3 |
4 | # TODO: figure out whether the CHROMA entries below (and the corresponding replacements in kustomization.yaml) can be removed
5 | CHROMA_PORT=8000
6 | CHROMA_SERVICE_NAME=chroma-service
7 | GRAPHDB_PORT=7200
8 | GRAPHDB_SERVICE_NAME=graphdb-service
9 | GRAPHDB_REPO="test"
10 |
11 | OPENAI_API_KEY=sk-...xxxxx
12 |
--------------------------------------------------------------------------------
/k8s/base/kg-llm/pvc.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: graphdb-config-pvc
5 | namespace: kg-llm
6 | spec:
7 | accessModes:
8 | - ReadWriteOnce
9 | resources:
10 | requests:
11 | storage: 1Gi
12 |
--------------------------------------------------------------------------------
/k8s/base/kg-llm/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: kg-llm-service
5 | namespace: kg-llm
6 | spec:
7 | type: NodePort
8 | selector:
9 | app: kg-llm
10 | ports:
11 | - protocol: TCP
12 | port: 80
13 | targetPort: 80
14 |
--------------------------------------------------------------------------------
/k8s/base/kustomization.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kustomize.config.k8s.io/v1beta1
2 | kind: Kustomization
3 |
4 | resources:
5 | - chatllm
6 | - kg-llm
7 | - chroma
8 | - graphdb
9 |
--------------------------------------------------------------------------------
/k8s/overlays/custom-config/kustomization.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kustomize.config.k8s.io/v1beta1
2 | kind: Kustomization
3 |
4 | resources:
5 | - ../../base
6 |
7 | configMapGenerator:
8 | - name: custom-config
9 | namespace: kg-llm
10 | envs:
11 | - params.env
12 |
13 | generatorOptions:
14 | disableNameSuffixHash: true
15 |
--------------------------------------------------------------------------------
/k8s/overlays/custom-config/params.env:
--------------------------------------------------------------------------------
1 | CHROMA_SERVICE_NAME=chroma-service
2 | CHROMA_SERVICE_NAMESPACE=kg-llm
3 | CHROMA_PORT=8000
4 | GRAPHDB_SERVICE_NAME=graphdb-service
5 | GRAPHDB_SERVICE_NAMESPACE=kg-llm
6 | GRAPHDB_PORT=7200
7 | TRIPLES_PATH=https://link-to-dump.com/data.ttl.gz
8 | ONTOLOGY_PATH=https://github.com/user/repo/blob/master/ontology.ttl.gz
9 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | authors = [
3 | {name = "cyril.matthey-doret", email = "cyril.matthey-doret@epfl.ch"},
4 | ]
5 | license = {text = "MIT"}
6 | requires-python = "<4.0,>=3.10"
7 | dependencies = [
8 | "tqdm<5.0.0,>=4.65.0",
9 | "sentence-transformers<3.0.0,>=2.2.2",
10 | "python-dotenv<2.0.0,>=1.0.0",
11 | "ipykernel<7.0.0,>=6.22.0",
12 | "rdflib==6.3.0",
13 | "chromadb<1.0.0,>=0.4.22",
14 | "more-itertools<10.0.0,>=9.1.0",
15 | "prefect<3.0.0,>=2.10.6",
16 | "typer<1.0.0,>=0.9.0",
17 | "bokeh==2.4.3",
18 | "sparqlwrapper<3.0.0,>=2.0.0",
19 | "fastapi<1.0.0,>=0.95.1",
20 | "uvicorn<1.0.0,>=0.22.0",
21 | "typing-extensions<5.0.0,>=4.6.3",
22 | "protobuf==3.20",
23 | "jupyterlab<5.0.0,>=4.0.2",
24 | "langchain<1.0.0,>=0.0.230",
25 | "openai<1.0.0,>=0.27.8",
26 | "poethepoet<1.0.0,>=0.21.0",
27 | "html5lib<2.0,>=1.1",
28 | "anyio==3.7.1",
29 | "testcontainers<4.0.0,>=3.7.1",
30 | "torch==2.6.0+cpu",
31 | "torchvision==0.21.0+cpu",
32 | ]
33 | name = "aikg"
34 | version = "0.1.0"
35 | description = "Langchain-powered natural language interface to RDF knowledge-graphs"
36 | readme = "README.md"
37 |
38 | [dependency-groups]
39 | local = [
40 | "transformers<5.0.0,>=4.28.1",
41 | ]
42 | dev = [
43 | "black<24.0.0,>=23.3.0",
44 | "pytest<8.0.0,>=7.3.1",
45 | "pre-commit<4.0.0,>=3.2.2",
46 | ]
47 |
48 | [tool.uv.sources]
49 | torch = [
50 | { index = "pytorch-cpu" },
51 | ]
52 | torchvision = [
53 | { index = "pytorch-cpu" },
54 | ]
55 |
56 | [[tool.uv.index]]
57 | name = "pytorch-cpu"
58 | url = "https://download.pytorch.org/whl/cpu"
59 | explicit = true
60 |
--------------------------------------------------------------------------------
/scripts/standalone_server.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # This script starts the server with a local Chroma instance and an
3 | # RDF file as the knowledge graph.
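# Run it from the repository root (e.g. `bash scripts/standalone_server.sh`)
# so the relative data/, tests/ and aikg/ paths below resolve correctly.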
4 |
5 | # Local SPARQL+Chroma configs
6 | export SPARQL_ENDPOINT="data/test_data.trig"
7 | export CHROMA_HOST="local"
8 | export CHROMA_MODEL="all-MiniLM-L6-v2"
9 | export CHROMA_PERSIST_DIR="/tmp/chroma-test"
10 | export CHAT_CONFIG="tests/chat.test.yml"
11 |
12 | # Embed in Chroma
13 | python aikg/flows/chroma_build.py --graph https://example.org/ontology
14 | # Run server
15 | uvicorn "aikg.server:app" --port 8001
16 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdsc-ordes/kg-llm-interface/dd5f967d2dfbd2fb718450f54aaa4b69635c2642/tests/__init__.py
--------------------------------------------------------------------------------
/tests/chat.test.yml:
--------------------------------------------------------------------------------
1 |
2 | model_id: lmsys/vicuna-7b-v1.3
3 | answer_template: |
4 | The following describe a user question, associated SPARQL query and the result from the query.
5 | Based on this information, write an answer in simple terms that describes the results.
6 |
7 | Question:
8 | {question_str}
9 | Query:
10 | {query_str}
11 | Result:
12 | {result_str}
13 | Answer:
14 | sparql_template: |
15 |
16 | Use the question and the additional information to generate a sparql query against
17 | a knowledge graph where the p and q items are
18 |
19 | completely unknown to you. You will need to discover the p and q items before you
20 | can generate the sparql.
21 |
22 | Do not assume you know the p and q items for any concepts.
23 |
24 | After you generate the sparql, you should display it.
25 |
26 | When generating sparql, Never enclose the sparql in back-quotes
27 |
28 | Use the following format:
29 |
30 | Question: the input question for which you must provide a natural language answer
31 |
32 | Information: the additional information you get with the query, in RDF format. This
33 | will help you generate the sparql query with the correct format.
34 |
35 |
36 | Question: {question_str}
37 |
38 | Information:
39 |
40 | {context_str}
41 |
42 | Answer:
43 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | from pathlib import Path
3 | import pytest
4 | import shutil
5 | import tempfile
6 |
7 | from aikg.utils.io import download_file
8 |
9 | TEST_SCHEMA_URL = "https://www.pokemonkg.org/ontology/ontology.nt"
10 | TEST_INSTANCES_URL = "https://www.pokemonkg.org/download/dump/poke-a.nq.gz"
11 |
12 |
13 | @pytest.fixture(scope="module")
14 | def instance_file() -> Path:
15 | """Download and gunzip remote instance test file."""
16 | gz_path = tempfile.NamedTemporaryFile(suffix=".nq.gz", delete=False).name
17 | download_file(TEST_INSTANCES_URL, gz_path)
18 | path = gz_path.removesuffix(".gz")
19 | # gunzip downloaded file
20 | with gzip.open(gz_path, "rb") as f_in:
21 | with open(path, "wb") as f_out:
22 | shutil.copyfileobj(f_in, f_out)
23 |
24 | return Path(path)
25 |
26 |
27 | @pytest.fixture(scope="module")
28 | def schema_file() -> Path:
29 | """Download remote schema test file."""
30 | path = tempfile.NamedTemporaryFile(suffix=".nt", delete=False).name
31 | download_file(TEST_SCHEMA_URL, path)
32 | return Path(path)
33 |
34 |
35 | @pytest.fixture(scope="module")
36 | def small_instance_file(instance_file) -> Path:
37 | """Create a small instance file for testing, truncated to 100 lines."""
38 | path = tempfile.NamedTemporaryFile(suffix=".nq", delete=False).name
39 |
40 | with open(instance_file) as f, open(path, "w") as f_out:
41 | for i, line in enumerate(f):
42 | if i > 1000:
43 | break
44 | f_out.write(line)
45 | return Path(path)
46 |
--------------------------------------------------------------------------------
/tests/test_load_data.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import time
3 |
4 | from testcontainers.core.container import DockerContainer
5 | from aikg.config import ChromaConfig, SparqlConfig
6 | from aikg.flows.chroma_build import chroma_build_flow
7 | from aikg.flows.insert_triples import sparql_insert_flow
8 |
9 | REPO_CONFIG = """
10 | @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
11 | @prefix rep: <http://www.openrdf.org/config/repository#> .
12 | @prefix sr: <http://www.openrdf.org/config/repository/sail#> .
13 | @prefix sail: <http://www.openrdf.org/config/sail#> .
14 | @prefix owlim: <http://www.ontotext.com/trree/owlim#> .
15 |
16 | [] a rep:Repository ;
17 | rep:repositoryID "test" ;
18 | rdfs:label "test" ;
19 | rep:repositoryImpl [
20 | rep:repositoryType "graphdb:SailRepository" ;
21 | sr:sailImpl [
22 | sail:sailType "graphdb:Sail" ;
23 | owlim:base-URL "http://www.ontotext.com/" ;
24 | # other configurations...
25 | ]
26 | ].
27 | """
28 |
29 |
30 | def test_init_data(schema_file, small_instance_file):
31 | with (
32 | DockerContainer("ontotext/graphdb:10.2.2").with_bind_ports(7200, 7200)
33 | ) as graphdb:
34 | # container ready + margin for graphdb to start
35 | graphdb.get_exposed_port(7200)
36 | time.sleep(5)
37 | # Create test repo
38 | resp = requests.post(
39 | "http://localhost:7200/rest/repositories", files={"config": REPO_CONFIG}
40 | )
41 | sparql_insert_flow(schema_file, SparqlConfig())
42 | sparql_insert_flow(small_instance_file, SparqlConfig())
43 | chroma_build_flow(ChromaConfig(host="local"))
44 |
--------------------------------------------------------------------------------
/tests/test_rdf.py:
--------------------------------------------------------------------------------
1 | # Test RDF functionality to interact with a knowledge graph.
2 | # The kg may be a SPARQL endpoint or a local RDF file.
3 | from aikg.config import SparqlConfig
4 | from aikg.utils.rdf import query_kg, setup_kg
5 | import pytest
6 |
7 | rdflib_config = SparqlConfig(
8 | endpoint="data/test_data.trig",
9 | )
10 | sparql_config = SparqlConfig(
11 | endpoint="https://sparql.uniprot.org/",
12 | )
13 |
14 | CONFIGS = [rdflib_config, sparql_config]
15 | QUERIES = [
16 | "SELECT * WHERE { ?s ?p ?o } LIMIT 10",
17 | "DESCRIBE ?s WHERE { ?s ?p ?o } LIMIT 10",
18 | ]
19 |
20 |
21 | @pytest.fixture
22 | def sparql_kg():
23 | """A public SPARQL endpoint."""
24 | return setup_kg(sparql_config.endpoint, sparql_config.user, sparql_config.password)
25 |
26 |
27 | @pytest.fixture
28 | def rdflib_kg():
29 | """A local RDF file."""
30 | return setup_kg(rdflib_config.endpoint, rdflib_config.user, rdflib_config.password)
31 |
32 |
33 | @pytest.mark.parametrize("kg", ["sparql_kg", "rdflib_kg"])
34 | @pytest.mark.parametrize("query", QUERIES)
35 | def test_run_query_kg(kg, query, request):
36 | """Test if a query on a kg returns at least one result."""
37 | res = query_kg(request.getfixturevalue(kg), query)
38 | assert len(res) >= 1
39 |
40 |
41 | @pytest.mark.parametrize("query", QUERIES)
42 | def test_compare_query_kg(sparql_kg, rdflib_kg, query):
43 | """Test if the same query on rdflib and sparql yields
44 | the same output dimensions."""
45 | rdflib_res = query_kg(rdflib_kg, query)
46 | sparql_res = query_kg(sparql_kg, query)
47 | assert len(sparql_res) == len(rdflib_res)
48 | assert all([len(x) == len(y) for x, y in zip(sparql_res, rdflib_res)])
49 |
--------------------------------------------------------------------------------