├── .cargo
│   └── config.toml
├── .github
│   └── workflows
│       ├── build.yml
│       ├── release.yml
│       └── test_api_server.yml
├── .gitignore
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── check_code_before_commit.sh
├── docs
│   ├── assets
│   │   └── kw_search.png
│   ├── keyword_search.md
│   └── vectordb.md
├── src
│   ├── backend
│   │   ├── ggml.rs
│   │   └── mod.rs
│   ├── error.rs
│   ├── main.rs
│   └── utils.rs
└── tests
    ├── test_chat.hurl
    ├── test_embeddings.hurl
    └── test_rag.hurl
/.cargo/config.toml:
--------------------------------------------------------------------------------
1 | [build]
2 | target = "wasm32-wasip1"
3 | rustflags = ["--cfg", "wasmedge", "--cfg", "tokio_unstable"]
4 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 |
3 | on:
4 | push:
5 | branches:
6 | - dev
7 | - main
8 | - release-*
9 | - feat-*
10 | - ci-*
11 | - refactor-*
12 | - fix-*
13 | - test-*
14 | paths:
15 | - '.github/workflows/build.yml'
16 | - '**/Cargo.toml'
17 | - '**/Cargo.lock'
18 | - '**/*.rs'
19 | - '**/*.sh'
20 | pull_request:
21 | branches:
22 | - dev
23 | - main
24 | types: [opened, synchronize, reopened]
25 | paths:
26 | - '.github/workflows/**'
27 | - '**/Cargo.toml'
28 | - '**/*.rs'
29 | - '**/*.sh'
30 |
31 | jobs:
32 | build-wasm:
33 | runs-on: ${{ matrix.os }}
34 | strategy:
35 | matrix:
36 | os: [ubuntu-22.04, macos-13, macos-14, macos-15]
37 | steps:
38 | - name: Clone project
39 | id: checkout
40 | uses: actions/checkout@v3
41 |
42 | - name: Install Rust-nightly
43 | uses: actions-rust-lang/setup-rust-toolchain@v1
44 | with:
45 | toolchain: nightly
46 | target: wasm32-wasip1
47 | components: rustfmt, clippy
48 |
49 | - name: Install Rust-stable
50 | uses: actions-rust-lang/setup-rust-toolchain@v1
51 | with:
52 | target: wasm32-wasip1
53 |
54 | - name: Download wasi-sdk for x86_64-macos
55 | if: matrix.os == 'macos-13'
56 | run: |
57 | curl -LO https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-24/wasi-sdk-24.0-x86_64-macos.tar.gz
58 | tar -xzvf wasi-sdk-24.0-x86_64-macos.tar.gz
59 | mv wasi-sdk-24.0-x86_64-macos wasi-sdk-24.0
60 |
61 | - name: Download wasi-sdk for arm64-macos
62 | if: matrix.os == 'macos-14' || matrix.os == 'macos-15'
63 | run: |
64 | curl -LO https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-24/wasi-sdk-24.0-arm64-macos.tar.gz
65 | tar -xzvf wasi-sdk-24.0-arm64-macos.tar.gz
66 | mv wasi-sdk-24.0-arm64-macos wasi-sdk-24.0
67 |
68 | - name: Build api-server for linux
69 | id: build_api_server_linux
70 | if: startsWith(matrix.os, 'ubuntu')
71 | env:
72 | RUSTFLAGS: "--cfg wasmedge --cfg tokio_unstable"
73 | run: |
74 | cargo +nightly fmt --all -- --check
75 | cargo +nightly clippy --target wasm32-wasip1 -- -D warnings
76 | cargo build --target wasm32-wasip1 --release
77 |
78 | - name: Build api-server for macos
79 | id: build_api_server_macos
80 | if: startsWith(matrix.os, 'macos')
81 | env:
82 | WASI_SDK_PATH: /Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0
83 | CC: "/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/bin/clang --sysroot=/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/share/wasi-sysroot"
84 | RUSTFLAGS: "--cfg wasmedge --cfg tokio_unstable"
85 | run: |
86 | cargo +nightly fmt --all -- --check
87 | cargo +nightly clippy --target wasm32-wasip1 -- -D warnings
88 | cargo build --target wasm32-wasip1 --release
89 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 | workflow_dispatch: # manual trigger release
5 | inputs:
6 | create_release:
7 | description: 'Create new release'
8 | required: true
9 | type: boolean
10 | release_version:
11 | description: "Version (e.g. 1.0.0)"
12 | required: true
13 | type: string
14 |
15 | jobs:
16 | build-and-release:
17 | runs-on: ubuntu-latest
18 | steps:
19 | - name: Clone project
20 | id: checkout
21 | uses: actions/checkout@v3
22 |
23 | - name: Setup rustup
24 | id: rustup
25 | uses: actions-rust-lang/setup-rust-toolchain@v1
26 | with:
27 | target: wasm32-wasip1
28 |
29 | - name: Build rag-api-server
30 | id: build_rag_api_server
31 | env:
32 | RUSTFLAGS: "--cfg wasmedge --cfg tokio_unstable"
33 | run: |
34 | cargo clean
35 | cargo build --target wasm32-wasip1 --release
36 | cp target/wasm32-wasip1/release/rag-api-server.wasm rag-api-server.wasm
37 |
38 | - name: Calculate checksum
39 | id: checksum
40 | run: |
41 | sha256sum *.wasm > SHA256SUM
42 |
43 | echo "Debug info(SHA256SUM):"
44 | cat SHA256SUM
45 |
46 | - name: Tag and release names
47 | id: tag_and_release_names
48 | run: |
49 | echo "tag_name=${{ github.event.inputs.release_version }}" >> $GITHUB_OUTPUT
50 | echo "release_name=LlamaEdge-RAG ${{ github.event.inputs.release_version }}" >> $GITHUB_OUTPUT
51 |
52 | - name: Create Release and Upload Release Asset
53 | if: ${{ github.event.inputs.create_release == 'true' && github.ref == 'refs/heads/main'}}
54 | uses: softprops/action-gh-release@v1
55 | with:
56 | name: ${{ steps.tag_and_release_names.outputs.release_name }}
57 | tag_name: ${{ steps.tag_and_release_names.outputs.tag_name }}
58 | body: TODO New Release.
59 | draft: true
60 | prerelease: true
61 | files: |
62 | rag-api-server.wasm
63 | SHA256SUM
64 |
--------------------------------------------------------------------------------
/.github/workflows/test_api_server.yml:
--------------------------------------------------------------------------------
1 | name: Test API Server
2 |
3 | on:
4 | push:
5 | branches:
6 | - dev
7 | - main
8 | - release-*
9 | - feat-*
10 | - ci-*
11 | - refactor-*
12 | - fix-*
13 | - test-*
14 | paths:
15 | - '.github/workflows/test_api_server.yml'
16 | - '**/Cargo.toml'
17 | - '**/Cargo.lock'
18 | - '**/*.rs'
19 | - '**/*.sh'
20 | - '**/.cargo/config.toml'
21 | - 'tests/*.hurl'
22 | pull_request:
23 | branches:
24 | - dev
25 | - main
26 | types: [opened, synchronize, reopened]
27 | paths:
28 | - '.github/workflows/**'
29 | - '**/Cargo.toml'
30 | - '**/*.rs'
31 | - '**/*.sh'
32 | - 'tests/*.hurl'
33 |
34 | jobs:
35 | test-api-server-ubuntu:
36 | runs-on: ubuntu-latest
37 | strategy:
38 | matrix:
39 | wasmedge_version: [0.14.1]
40 | ggml_version: [b5074]
41 | steps:
42 | - name: Clone project
43 | id: checkout
44 | uses: actions/checkout@v3
45 |
46 | - name: Install Rust-nightly
47 | uses: actions-rust-lang/setup-rust-toolchain@v1
48 | with:
49 | toolchain: nightly
50 | target: wasm32-wasip1
51 | components: rustfmt, clippy
52 |
53 | - name: Install Rust-stable
54 | uses: actions-rust-lang/setup-rust-toolchain@v1
55 | with:
56 | target: wasm32-wasip1
57 |
58 | - name: Install WasmEdge
59 | run: |
60 | curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s -- -v ${{ matrix.wasmedge_version }} --ggmlbn=${{ matrix.ggml_version }}
61 | ls -al $HOME/.wasmedge/bin
62 |
63 | - name: Install Hurl
64 | run: |
65 | curl --location --remote-name https://github.com/Orange-OpenSource/hurl/releases/download/5.0.1/hurl_5.0.1_amd64.deb
66 | sudo apt update && sudo apt install ./hurl_5.0.1_amd64.deb
67 |
68 | - name: Install Qdrant and download snapshot
69 | run: |
70 | # Download Qdrant
71 | curl -LO https://github.com/qdrant/qdrant/releases/download/v1.11.4/qdrant-x86_64-unknown-linux-musl.tar.gz
72 | tar -xvf qdrant-x86_64-unknown-linux-musl.tar.gz
73 | rm qdrant-x86_64-unknown-linux-musl.tar.gz
74 |
75 | # Download snapshot
76 | curl -LO https://huggingface.co/datasets/gaianet/paris/resolve/main/paris_768_nomic-embed-text-v1.5-f16.snapshot
77 | mv paris_768_nomic-embed-text-v1.5-f16.snapshot default.snapshot
78 |
79 | ls -al
80 |
81 | - name: Build rag-api-server on linux
82 | env:
83 | RUSTFLAGS: "--cfg wasmedge --cfg tokio_unstable"
84 | run: |
85 | cargo build -p rag-api-server --release
86 | cp target/wasm32-wasip1/release/rag-api-server.wasm ./rag-api-server.wasm
87 |
88 | - name: Download models
89 | run: |
90 | curl -LO https://huggingface.co/second-state/Qwen2-1.5B-Instruct-GGUF/resolve/main/Qwen2-1.5B-Instruct-Q3_K_M.gguf
91 | curl -LO https://huggingface.co/second-state/Nomic-embed-text-v1.5-Embedding-GGUF/resolve/main/nomic-embed-text-v1.5-f16.gguf
92 |
93 | - name: Start Qdrant
94 | run: |
95 | nohup ./qdrant > ./start-qdrant.log 2>&1 &
96 | sleep 5
97 | cat start-qdrant.log
98 |
99 | - name: Import the default.snapshot file to Qdrant
100 | run: |
101 | curl -s -X POST http://localhost:6333/collections/default/snapshots/upload?priority=snapshot -H 'Content-Type:multipart/form-data' -F 'snapshot=@default.snapshot'
102 |
103 | - name: Start rag-api-server for testing chat completions
104 | run: |
105 | nohup $HOME/.wasmedge/bin/wasmedge --dir .:. --nn-preload default:GGML:AUTO:Qwen2-1.5B-Instruct-Q3_K_M.gguf --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5-f16.gguf rag-api-server.wasm --model-name Qwen2-1.5B-Instruct,nomic-embed-text-v1.5 --ctx-size 4096,512 --batch-size 16,512 --prompt-template chatml,embedding --rag-policy last-user-message --socket-addr 0.0.0.0:8080 > ./start-llamaedge.log 2>&1 &
106 | sleep 30
107 | cat start-llamaedge.log
108 |
109 | # - name: Run test_chat.hurl
110 | # run: |
111 | # hurl --test --jobs 1 ./tests/test_chat.hurl
112 |
113 | - name: Run test_embeddings.hurl
114 | run: |
115 | hurl --test --jobs 1 ./tests/test_embeddings.hurl
116 |
117 | # - name: Run test_rag.hurl
118 | # run: |
119 | # hurl --test --jobs 1 ./tests/test_rag.hurl
120 |
121 | - name: Stop rag-api-server for testing chat completions
122 | run: |
123 | pkill -f wasmedge
124 |
125 | - name: Stop Qdrant
126 | run: |
127 | pkill -f qdrant
128 |
129 | test-api-server-macos-13:
130 | runs-on: macos-13
131 | needs: test-api-server-ubuntu
132 | strategy:
133 | matrix:
134 | wasmedge_version: [0.14.1]
135 | ggml_version: [b5074]
136 | steps:
137 | - name: Clone project
138 | id: checkout
139 | uses: actions/checkout@v3
140 |
141 | - name: Install Rust-nightly
142 | uses: actions-rust-lang/setup-rust-toolchain@v1
143 | with:
144 | toolchain: nightly
145 | target: wasm32-wasip1
146 | components: rustfmt, clippy
147 |
148 | - name: Install Rust-stable
149 | uses: actions-rust-lang/setup-rust-toolchain@v1
150 | with:
151 | target: wasm32-wasip1
152 |
153 | - name: Download wasi-sdk for x86_64-macos
154 | run: |
155 | curl -LO https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-24/wasi-sdk-24.0-x86_64-macos.tar.gz
156 | tar -xzvf wasi-sdk-24.0-x86_64-macos.tar.gz
157 | mv wasi-sdk-24.0-x86_64-macos wasi-sdk-24.0
158 |
159 | - name: Install WasmEdge
160 | run: |
161 | curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s -- -v ${{ matrix.wasmedge_version }} --ggmlbn=${{ matrix.ggml_version }}
162 | ls -al $HOME/.wasmedge/bin
163 |
164 | - name: Install Hurl
165 | run: |
166 | brew install hurl
167 |
168 | - name: Install Qdrant and download snapshot
169 | run: |
170 | # Download Qdrant
171 | curl -LO https://github.com/qdrant/qdrant/releases/download/v1.11.4/qdrant-x86_64-apple-darwin.tar.gz
172 | tar -xzvf qdrant-x86_64-apple-darwin.tar.gz
173 | rm qdrant-x86_64-apple-darwin.tar.gz
174 |
175 | # Download snapshot
176 | curl -LO https://huggingface.co/datasets/gaianet/paris/resolve/main/paris_768_nomic-embed-text-v1.5-f16.snapshot
177 | mv paris_768_nomic-embed-text-v1.5-f16.snapshot default.snapshot
178 |
179 | ls -al
180 |
181 | - name: Build rag-api-server on macos-13
182 | env:
183 | WASI_SDK_PATH: /Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0
184 | CC: "/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/bin/clang --sysroot=/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/share/wasi-sysroot"
185 | RUSTFLAGS: "--cfg wasmedge --cfg tokio_unstable"
186 | run: |
187 | cargo build -p rag-api-server --release
188 | cp target/wasm32-wasip1/release/rag-api-server.wasm ./rag-api-server.wasm
189 |
190 | - name: Download models
191 | run: |
192 | curl -LO https://huggingface.co/second-state/Qwen2-1.5B-Instruct-GGUF/resolve/main/Qwen2-1.5B-Instruct-Q3_K_M.gguf
193 | curl -LO https://huggingface.co/second-state/Nomic-embed-text-v1.5-Embedding-GGUF/resolve/main/nomic-embed-text-v1.5-f16.gguf
194 |
195 | - name: Start Qdrant
196 | run: |
197 | nohup ./qdrant > ./start-qdrant.log 2>&1 &
198 | sleep 5
199 | cat start-qdrant.log
200 |
201 | - name: Import the default.snapshot file to Qdrant
202 | run: |
203 | curl -s -X POST http://localhost:6333/collections/default/snapshots/upload?priority=snapshot -H 'Content-Type:multipart/form-data' -F 'snapshot=@default.snapshot'
204 |
205 | - name: Start rag-api-server for testing chat completions
206 | run: |
207 | nohup $HOME/.wasmedge/bin/wasmedge --dir .:. --nn-preload default:GGML:AUTO:Qwen2-1.5B-Instruct-Q3_K_M.gguf --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5-f16.gguf rag-api-server.wasm --model-name Qwen2-1.5B-Instruct,nomic-embed-text-v1.5 --ctx-size 4096,512 --batch-size 16,512 --prompt-template chatml,embedding --rag-policy last-user-message --socket-addr 0.0.0.0:8080 > ./start-llamaedge.log 2>&1 &
208 | sleep 30
209 | cat start-llamaedge.log
210 |
211 | # - name: Run test_chat.hurl
212 | # run: |
213 | # hurl --test --jobs 1 ./tests/test_chat.hurl
214 |
215 | - name: Run test_embeddings.hurl
216 | run: |
217 | hurl --test --jobs 1 ./tests/test_embeddings.hurl
218 |
219 | # - name: Run test_rag.hurl
220 | # run: |
221 | # hurl --test --jobs 1 ./tests/test_rag.hurl
222 |
223 | - name: Stop rag-api-server for testing chat completions
224 | run: |
225 | pkill -f wasmedge
226 |
227 | - name: Stop Qdrant
228 | run: |
229 | pkill -f qdrant
230 |
231 | test-api-server-macos-14:
232 | runs-on: macos-14
233 | needs: test-api-server-macos-13
234 | strategy:
235 | matrix:
236 | wasmedge_version: [0.14.1]
237 | ggml_version: [b5074]
238 | steps:
239 | - name: Clone project
240 | id: checkout
241 | uses: actions/checkout@v3
242 |
243 | - name: Install Rust-nightly
244 | uses: actions-rust-lang/setup-rust-toolchain@v1
245 | with:
246 | toolchain: nightly
247 | target: wasm32-wasip1
248 | components: rustfmt, clippy
249 |
250 | - name: Install Rust-stable
251 | uses: actions-rust-lang/setup-rust-toolchain@v1
252 | with:
253 | target: wasm32-wasip1
254 |
255 | - name: Download wasi-sdk for arm64-macos
256 | run: |
257 | curl -LO https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-24/wasi-sdk-24.0-arm64-macos.tar.gz
258 | tar -xzvf wasi-sdk-24.0-arm64-macos.tar.gz
259 | mv wasi-sdk-24.0-arm64-macos wasi-sdk-24.0
260 |
261 | - name: Install WasmEdge
262 | run: |
263 | curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s -- -v ${{ matrix.wasmedge_version }} --ggmlbn=${{ matrix.ggml_version }}
264 | ls -al $HOME/.wasmedge/bin
265 |
266 | - name: Install Hurl
267 | run: |
268 | brew install hurl
269 |
270 | - name: Install Qdrant and download snapshot
271 | run: |
272 | # Download Qdrant
273 | curl -LO https://github.com/qdrant/qdrant/releases/download/v1.11.4/qdrant-aarch64-apple-darwin.tar.gz
274 | tar -xzvf qdrant-aarch64-apple-darwin.tar.gz
275 | rm qdrant-aarch64-apple-darwin.tar.gz
276 |
277 | # Download snapshot
278 | curl -LO https://huggingface.co/datasets/gaianet/paris/resolve/main/paris_768_nomic-embed-text-v1.5-f16.snapshot
279 | mv paris_768_nomic-embed-text-v1.5-f16.snapshot default.snapshot
280 |
281 | ls -al
282 |
283 | - name: Build rag-api-server on macos-14
284 | env:
285 | WASI_SDK_PATH: /Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0
286 | CC: "/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/bin/clang --sysroot=/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/share/wasi-sysroot"
287 | RUSTFLAGS: "--cfg wasmedge --cfg tokio_unstable"
288 | run: |
289 | cargo build -p rag-api-server --release
290 | cp target/wasm32-wasip1/release/rag-api-server.wasm ./rag-api-server.wasm
291 |
292 | - name: Download models
293 | run: |
294 | curl -LO https://huggingface.co/second-state/Qwen2-1.5B-Instruct-GGUF/resolve/main/Qwen2-1.5B-Instruct-Q3_K_M.gguf
295 | curl -LO https://huggingface.co/second-state/Nomic-embed-text-v1.5-Embedding-GGUF/resolve/main/nomic-embed-text-v1.5-f16.gguf
296 |
297 | - name: Start Qdrant
298 | run: |
299 | nohup ./qdrant > ./start-qdrant.log 2>&1 &
300 | sleep 5
301 | cat start-qdrant.log
302 |
303 | - name: Import the default.snapshot file to Qdrant
304 | run: |
305 | curl -s -X POST http://localhost:6333/collections/default/snapshots/upload?priority=snapshot -H 'Content-Type:multipart/form-data' -F 'snapshot=@default.snapshot'
306 |
307 | - name: Start rag-api-server for testing chat completions
308 | run: |
309 | nohup $HOME/.wasmedge/bin/wasmedge --dir .:. --nn-preload default:GGML:AUTO:Qwen2-1.5B-Instruct-Q3_K_M.gguf --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5-f16.gguf rag-api-server.wasm --model-name Qwen2-1.5B-Instruct,nomic-embed-text-v1.5 --ctx-size 4096,512 --batch-size 16,512 --prompt-template chatml,embedding --rag-policy last-user-message --socket-addr 0.0.0.0:8080 > ./start-llamaedge.log 2>&1 &
310 | sleep 30
311 | cat start-llamaedge.log
312 |
313 | # - name: Run test_chat.hurl
314 | # run: |
315 | # hurl --test --jobs 1 ./tests/test_chat.hurl
316 |
317 | - name: Run test_embeddings.hurl
318 | run: |
319 | hurl --test --jobs 1 ./tests/test_embeddings.hurl
320 |
321 | # - name: Run test_rag.hurl
322 | # run: |
323 | # hurl --test --jobs 1 ./tests/test_rag.hurl
324 |
325 | - name: Stop rag-api-server for testing chat completions
326 | run: |
327 | pkill -f wasmedge
328 |
329 | - name: Stop Qdrant
330 | run: |
331 | pkill -f qdrant
332 |
333 | test-api-server-macos-15:
334 | runs-on: macos-15
335 | needs: test-api-server-macos-14
336 | strategy:
337 | matrix:
338 | wasmedge_version: [0.14.1]
339 | ggml_version: [b5074]
340 | steps:
341 | - name: Clone project
342 | id: checkout
343 | uses: actions/checkout@v3
344 |
345 | - name: Install Rust-nightly
346 | uses: actions-rust-lang/setup-rust-toolchain@v1
347 | with:
348 | toolchain: nightly
349 | target: wasm32-wasip1
350 | components: rustfmt, clippy
351 |
352 | - name: Install Rust-stable
353 | uses: actions-rust-lang/setup-rust-toolchain@v1
354 | with:
355 | target: wasm32-wasip1
356 |
357 | - name: Download wasi-sdk for arm64-macos
358 | run: |
359 | curl -LO https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-24/wasi-sdk-24.0-arm64-macos.tar.gz
360 | tar -xzvf wasi-sdk-24.0-arm64-macos.tar.gz
361 | mv wasi-sdk-24.0-arm64-macos wasi-sdk-24.0
362 |
363 | - name: Install WasmEdge
364 | run: |
365 | curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s -- -v ${{ matrix.wasmedge_version }} --ggmlbn=${{ matrix.ggml_version }}
366 | ls -al $HOME/.wasmedge/bin
367 |
368 | - name: Install Hurl
369 | run: |
370 | brew install hurl
371 |
372 | - name: Install Qdrant and download snapshot
373 | run: |
374 | # Download Qdrant
375 | curl -LO https://github.com/qdrant/qdrant/releases/download/v1.11.4/qdrant-aarch64-apple-darwin.tar.gz
376 | tar -xzvf qdrant-aarch64-apple-darwin.tar.gz
377 | rm qdrant-aarch64-apple-darwin.tar.gz
378 |
379 | # Download snapshot
380 | curl -LO https://huggingface.co/datasets/gaianet/paris/resolve/main/paris_768_nomic-embed-text-v1.5-f16.snapshot
381 | mv paris_768_nomic-embed-text-v1.5-f16.snapshot default.snapshot
382 |
383 | ls -al
384 |
385 | - name: Build rag-api-server on macos-15
386 | env:
387 | WASI_SDK_PATH: /Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0
388 | CC: "/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/bin/clang --sysroot=/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/share/wasi-sysroot"
389 | RUSTFLAGS: "--cfg wasmedge --cfg tokio_unstable"
390 | run: |
391 | cargo build -p rag-api-server --release
392 | cp target/wasm32-wasip1/release/rag-api-server.wasm ./rag-api-server.wasm
393 |
394 | - name: Download models
395 | run: |
396 | curl -LO https://huggingface.co/second-state/Qwen2-1.5B-Instruct-GGUF/resolve/main/Qwen2-1.5B-Instruct-Q3_K_M.gguf
397 | curl -LO https://huggingface.co/second-state/Nomic-embed-text-v1.5-Embedding-GGUF/resolve/main/nomic-embed-text-v1.5-f16.gguf
398 |
399 | - name: Start Qdrant
400 | run: |
401 | nohup ./qdrant > ./start-qdrant.log 2>&1 &
402 | sleep 5
403 | cat start-qdrant.log
404 |
405 | - name: Import the default.snapshot file to Qdrant
406 | run: |
407 | curl -s -X POST http://localhost:6333/collections/default/snapshots/upload?priority=snapshot -H 'Content-Type:multipart/form-data' -F 'snapshot=@default.snapshot'
408 |
409 | - name: Start rag-api-server for testing chat completions
410 | run: |
411 | nohup $HOME/.wasmedge/bin/wasmedge --dir .:. --nn-preload default:GGML:AUTO:Qwen2-1.5B-Instruct-Q3_K_M.gguf --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5-f16.gguf rag-api-server.wasm --model-name Qwen2-1.5B-Instruct,nomic-embed-text-v1.5 --ctx-size 4096,512 --batch-size 16,512 --prompt-template chatml,embedding --rag-policy last-user-message --socket-addr 0.0.0.0:8080 > ./start-llamaedge.log 2>&1 &
412 | sleep 30
413 | cat start-llamaedge.log
414 |
415 | # - name: Run test_chat.hurl
416 | # run: |
417 | # hurl --test --jobs 1 ./tests/test_chat.hurl
418 |
419 | - name: Run test_embeddings.hurl
420 | run: |
421 | hurl --test --jobs 1 ./tests/test_embeddings.hurl
422 |
423 | # - name: Run test_rag.hurl
424 | # run: |
425 | # hurl --test --jobs 1 ./tests/test_rag.hurl
426 |
427 | - name: Stop rag-api-server for testing chat completions
428 | run: |
429 | pkill -f wasmedge
430 |
431 | - name: Stop Qdrant
432 | run: |
433 | pkill -f qdrant
434 |
435 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "rag-api-server"
3 | version = "0.13.15"
4 | edition = "2021"
5 |
6 | [dependencies]
7 | anyhow = "1"
8 | chat-prompts = { version = "=0.26.1" }
9 | chrono = "0.4.38"
10 | clap = { version = "4.4.6", features = ["cargo"] }
11 | either = "1.12.0"
12 | endpoints = { version = "=0.25.1", features = ["rag", "index"] }
13 | futures = { version = "0.3.6", default-features = false, features = ["async-await", "std"] }
14 | futures-util = "0.3"
15 | hyper = { version = "0.14", features = ["full"] }
16 | llama-core = { version = "=0.30.0", features = ["logging", "rag", "index"] }
17 | log = { version = "0.4.21", features = ["std", "kv", "kv_serde"] }
18 | mime_guess = "2.0.4"
19 | multipart-2021 = "0.19.0"
20 | once_cell = "1.18"
21 | reqwest = { version = "0.11", default-features = false, features = ["json", "stream", "rustls-tls"] }
22 | serde = { version = "1.0", features = ["derive"] }
23 | serde_json = "1.0"
24 | thiserror = "1"
25 | tokio = { version = "^1.36", features = ["io-util", "fs", "net", "time", "rt", "macros"] }
26 | url = "^2.5"
27 | uuid = { version = "1.4", features = ["v4", "fast-rng", "macro-diagnostics"] }
28 | walkdir = "2.5.0"
29 | wasi-logger = { version = "0.1.2", features = ["kv"] }
30 |
31 | [patch.crates-io]
32 | socket2 = { git = "https://github.com/second-state/socket2.git", branch = "v0.5.x" }
33 | reqwest = { git = "https://github.com/second-state/wasi_reqwest.git", branch = "0.11.x" }
34 | hyper = { git = "https://github.com/second-state/wasi_hyper.git", branch = "v0.14.x" }
35 | tokio = { git = "https://github.com/second-state/wasi_tokio.git", branch = "v1.36.x" }
36 |
37 | [features]
38 | default = []
39 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LlamaEdge-RAG API Server
2 |
3 |
4 |
5 |
6 |
7 | - [LlamaEdge-RAG API Server](#llamaedge-rag-api-server)
8 | - [Introduction](#introduction)
9 | - [Endpoints](#endpoints)
10 | - [List models](#list-models)
11 | - [Chat completions](#chat-completions)
12 | - [Upload a file](#upload-a-file)
13 | - [List all files](#list-all-files)
14 | - [Retrieve information about a specific file](#retrieve-information-about-a-specific-file)
15 | - [Retrieve the content of a specific file](#retrieve-the-content-of-a-specific-file)
16 | - [Download a specific file](#download-a-specific-file)
17 | - [Delete a file](#delete-a-file)
18 | - [Segment a file to chunks](#segment-a-file-to-chunks)
19 | - [Compute embeddings for user query or file chunks](#compute-embeddings-for-user-query-or-file-chunks)
20 | - [Generate embeddings from a file](#generate-embeddings-from-a-file)
21 | - [Get server information](#get-server-information)
22 | - [Retrieve context](#retrieve-context)
23 | - [Setup](#setup)
24 | - [Build](#build)
25 | - [Execute](#execute)
26 | - [Usage Example](#usage-example)
27 | - [Set Log Level](#set-log-level)
28 |
29 |
30 |
31 | ## Introduction
32 |
33 | The LlamaEdge-RAG API server provides a group of OpenAI-compatible web APIs for Retrieval-Augmented Generation (RAG) applications. The server is implemented in WebAssembly (Wasm) and runs on the [WasmEdge Runtime](https://github.com/WasmEdge/WasmEdge).
34 |
35 | ### Endpoints
36 |
37 | #### List models
38 |
39 | `rag-api-server` provides a POST API `/v1/models` to list currently available models.
40 |
41 | Example
42 |
43 | You can use `curl` to test it in a new terminal:
44 |
45 | ```bash
46 | curl -X POST http://localhost:8080/v1/models -H 'accept:application/json'
47 | ```
48 |
49 | If the command runs successfully, you should see output similar to the following in your terminal:
50 |
51 | ```json
52 | {
53 | "object":"list",
54 | "data":[
55 | {
56 | "id":"llama-2-chat",
57 | "created":1697084821,
58 | "object":"model",
59 | "owned_by":"Not specified"
60 | }
61 | ]
62 | }
63 | ```
64 |
65 |
66 |
67 | #### Chat completions
68 |
69 | Ask a question using OpenAI's JSON message format.
70 |
71 | Example
72 |
73 | ```bash
74 | curl -X POST http://localhost:8080/v1/chat/completions \
75 | -H 'accept:application/json' \
76 | -H 'Content-Type: application/json' \
77 | -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."}, {"role":"user", "content": "Who is Robert Oppenheimer?"}], "model":"llama-2-chat"}'
78 | ```
79 |
80 | Here is the response.
81 |
82 | ```json
83 | {
84 | "id":"",
85 | "object":"chat.completion",
86 | "created":1697092593,
87 | "model":"llama-2-chat",
88 | "choices":[
89 | {
90 | "index":0,
91 | "message":{
92 | "role":"assistant",
93 | "content":"Robert Oppenheimer was an American theoretical physicist and director of the Manhattan Project, which developed the atomic bomb during World War II. He is widely regarded as one of the most important physicists of the 20th century and is known for his contributions to the development of quantum mechanics and the theory of the atomic nucleus. Oppenheimer was also a prominent figure in the post-war nuclear weapons debate, advocating for international control and regulation of nuclear weapons."
94 | },
95 | "finish_reason":"stop"
96 | }
97 | ],
98 | "usage":{
99 | "prompt_tokens":9,
100 | "completion_tokens":12,
101 | "total_tokens":21
102 | }
103 | }
104 | ```
105 |
106 |
107 |
108 | #### Upload a file
109 |
110 | In RAG applications, uploading files is a necessary step.
111 |
112 | Example: Upload a file
113 |
114 | The following command uploads a text file [paris.txt](https://huggingface.co/datasets/gaianet/paris/raw/main/paris.txt) to the API server via the `/v1/files` endpoint:
115 |
116 | ```bash
117 | curl -X POST http://127.0.0.1:8080/v1/files -F "file=@paris.txt"
118 | ```
119 |
120 | If the command is successful, you should see output similar to the following in your terminal:
121 |
122 | ```json
123 | {
124 | "id": "file_4bc24593-2a57-4646-af16-028855e7802e",
125 | "bytes": 2161,
126 | "created_at": 1711611801,
127 | "filename": "paris.txt",
128 | "object": "file",
129 | "purpose": "assistants"
130 | }
131 | ```
132 |
133 | The `id` and `filename` fields are needed in subsequent steps, for example, when segmenting the uploaded file into chunks for computing embeddings.
134 |
135 |
136 |
137 | #### List all files
138 |
139 | The `GET /v1/files` endpoint is used to list all files on the server.
140 |
141 | Example: List files
142 |
143 | The following command lists all files on the server via the `/v1/files` endpoint:
144 |
145 | ```bash
146 | curl -X GET http://127.0.0.1:8080/v1/files
147 | ```
148 |
149 | If the command is successful, you should see output similar to the following in your terminal:
150 |
151 | ```json
152 | {
153 | "object": "list",
154 | "data": [
155 | {
156 | "id": "file_33d9188d-5060-4141-8c52-ae148fd15f6a",
157 | "bytes": 17039,
158 | "created_at": 1718296362,
159 | "filename": "test-123.m4a",
160 | "object": "file",
161 | "purpose": "assistants"
162 | },
163 | {
164 | "id": "file_8c6439da-df59-4b9a-bb5e-dba4b2f23c04",
165 | "bytes": 17039,
166 | "created_at": 1718294169,
167 | "filename": "test-123.m4a",
168 | "object": "file",
169 | "purpose": "assistants"
170 | }
171 | ]
172 | }
173 | ```
174 |
175 |
176 |
177 | #### Retrieve information about a specific file
178 |
179 | The `GET /v1/files/{file_id}` endpoint is used to retrieve information about a specific file on the server.
180 |
181 | Example: Retrieve information about a specific file
182 |
183 | The following command retrieves information about a specific file on the server via the `/v1/files/{file_id}` endpoint:
184 |
185 | ```bash
186 | curl -X GET http://localhost:10086/v1/files/file_b892bc81-35e9-44a6-8c01-ae915c1d3832
187 | ```
188 |
189 | If the command is successful, you should see output similar to the following in your terminal:
190 |
191 | ```json
192 | {
193 | "id": "file_b892bc81-35e9-44a6-8c01-ae915c1d3832",
194 | "bytes": 2161,
195 | "created_at": 1715832065,
196 | "filename": "paris.txt",
197 | "object": "file",
198 | "purpose": "assistants"
199 | }
200 | ```
201 |
202 |
203 |
204 | #### Retrieve the content of a specific file
205 |
206 | The `GET /v1/files/{file_id}/content` endpoint is used to retrieve the content of a specific file on the server.
207 |
208 | Example: Retrieve the content of a specific file
209 |
210 | The following command retrieves the content of a specific file on the server via the `/v1/files/{file_id}/content` endpoint:
211 |
212 | ```bash
213 | curl -X GET http://localhost:10086/v1/files/file_b892bc81-35e9-44a6-8c01-ae915c1d3832/content
214 | ```
215 |
216 |
217 |
218 | #### Download a specific file
219 |
220 | The `GET /v1/files/download/{file_id}` endpoint is used to download a specific file from the server.
221 |
222 | Example: Download a specific file
223 |
224 | The following command downloads a specific file from the server via the `/v1/files/download/{file_id}` endpoint:
225 |
226 | ```bash
227 | curl -X GET http://localhost:10086/v1/files/download/file_b892bc81-35e9-44a6-8c01-ae915c1d3832
228 | ```
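
The response body is the raw content of the file, so you will typically want to save it locally. A minimal sketch (the output filename here is arbitrary):

```bash
# Save the downloaded file content to a local file instead of printing it to stdout.
curl -X GET http://localhost:10086/v1/files/download/file_b892bc81-35e9-44a6-8c01-ae915c1d3832 \
  --output paris.txt
```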
229 |
230 |
231 |
232 | #### Delete a file
233 |
234 | The `DELETE /v1/files/{file_id}` endpoint is used to delete a specific file on the server.
235 |
236 | Example: Delete a specific file
237 |
238 | The following command deletes a specific file on the server via the `/v1/files/{file_id}` endpoint:
239 |
240 | ```bash
241 | curl -X DELETE http://localhost:10086/v1/files/file_6a6d8046-fd98-410a-b70e-0a0142ec9a39
242 | ```
243 |
244 | If the command is successful, you should see output similar to the following in your terminal:
245 |
246 | ```json
247 | {
248 | "id": "file_6a6d8046-fd98-410a-b70e-0a0142ec9a39",
249 | "object": "file",
250 | "deleted": true
251 | }
252 | ```
253 |
254 |
255 |
256 | #### Segment a file to chunks
257 |
258 | To segment the uploaded file to chunks for computing embeddings, use the `/v1/chunks` API.
259 |
260 | Example
261 |
262 | The following command sends the uploaded file ID and filename to the API server and gets the chunks:
263 |
264 | ```bash
265 | curl -X POST http://localhost:8080/v1/chunks \
266 | -H 'accept:application/json' \
267 | -H 'Content-Type: application/json' \
268 | -d '{"id":"file_4bc24593-2a57-4646-af16-028855e7802e", "filename":"paris.txt"}'
269 | ```
270 |
271 | The following is an example response with the generated chunks:
272 |
273 | ```json
274 | {
275 | "id": "file_4bc24593-2a57-4646-af16-028855e7802e",
276 | "filename": "paris.txt",
277 | "chunks": [
278 | "Paris, city and capital of France, ..., for Paris has retained its importance as a centre for education and intellectual pursuits.",
279 | "Paris’s site at a crossroads ..., drawing to itself much of the talent and vitality of the provinces."
280 | ]
281 | }
282 | ```
283 |
284 |
285 |
286 | #### Compute embeddings for user query or file chunks
287 |
288 | To compute embeddings for a user query or for file chunks, use the `/v1/embeddings` API.
289 |
290 | Example
291 |
292 | The following command sends a query to the API server and gets the embeddings in return:
293 |
294 | ```bash
295 | curl -X POST http://localhost:8080/v1/embeddings \
296 | -H 'accept:application/json' \
297 | -H 'Content-Type: application/json' \
298 | -d '{"model": "e5-mistral-7b-instruct-Q5_K_M", "input":["Paris, city and capital of France, ..., for Paris has retained its importance as a centre for education and intellectual pursuits.", "Paris’s site at a crossroads ..., drawing to itself much of the talent and vitality of the provinces."]}'
299 | ```
300 |
301 | The returned embeddings look like the following:
302 |
303 | ```json
304 | {
305 | "object": "list",
306 | "data": [
307 | {
308 | "index": 0,
309 | "object": "embedding",
310 | "embedding": [
311 | 0.1428378969,
312 | -0.0447309874,
313 | 0.007660218049,
314 | ...
315 | -0.0128974719,
316 | -0.03543198109,
317 | 0.03974733502,
318 | 0.00946635101,
319 | -0.01531364303
320 | ]
321 | },
322 | {
323 | "index": 1,
324 | "object": "embedding",
325 | "embedding": [
326 | 0.0697753951,
327 | -0.0001159032545,
328 | 0.02073983476,
329 | ...
330 | 0.03565846011,
331 | -0.04550019652,
332 | 0.02691745944,
333 | 0.02498772368,
334 | -0.003226313973
335 | ]
336 | }
337 | ],
338 | "model": "e5-mistral-7b-instruct-Q5_K_M",
339 | "usage": {
340 | "prompt_tokens": 491,
341 | "completion_tokens": 0,
342 | "total_tokens": 491
343 | }
344 | }
345 | ```
346 |
347 |
348 |
349 | #### Generate embeddings from a file
350 |
351 | The `/v1/create/rag` endpoint provides a one-click way to convert a text or markdown file directly into embeddings. Its effect is equivalent to running `/v1/files` + `/v1/chunks` + `/v1/embeddings` sequentially. Note that the endpoint depends on the `--chunk-capacity` CLI option, which defaults to `100`; you can set it to a different value when starting the LlamaEdge-RAG API server, as in the sketch below.
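
A minimal sketch of raising the chunk capacity at startup (the model files, model names, and values below are illustrative placeholders, not defaults of this project):

```bash
# Sketch only: substitute your own GGUF files, model names, and prompt templates.
wasmedge --dir .:. \
  --nn-preload default:GGML:AUTO:Llama-2-7b-chat-hf-Q5_K_M.gguf \
  --nn-preload embedding:GGML:AUTO:all-MiniLM-L6-v2-ggml-model-f16.gguf \
  rag-api-server.wasm \
  --model-name Llama-2-7b-chat-hf-Q5_K_M,all-MiniLM-L6-v2-ggml-model-f16 \
  --prompt-template llama-2-chat,embedding \
  --ctx-size 4096,384 \
  --chunk-capacity 150
```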
352 |
353 | Example
354 |
355 | The following command uploads a text file [paris.txt](https://huggingface.co/datasets/gaianet/paris/raw/main/paris.txt) to the API server via the `/v1/create/rag` endpoint:
356 |
357 | ```bash
358 | curl -X POST http://127.0.0.1:8080/v1/create/rag -F "file=@paris.txt"
359 | ```
360 |
361 | The returned embeddings look like the following:
362 |
363 | ```json
364 | {
365 | "object": "list",
366 | "data": [
367 | {
368 | "index": 0,
369 | "object": "embedding",
370 | "embedding": [
371 | 0.1428378969,
372 | -0.0447309874,
373 | 0.007660218049,
374 | ...
375 | -0.0128974719,
376 | -0.03543198109,
377 | 0.03974733502,
378 | 0.00946635101,
379 | -0.01531364303
380 | ]
381 | },
382 | {
383 | "index": 1,
384 | "object": "embedding",
385 | "embedding": [
386 | 0.0697753951,
387 | -0.0001159032545,
388 | 0.02073983476,
389 | ...
390 | 0.03565846011,
391 | -0.04550019652,
392 | 0.02691745944,
393 | 0.02498772368,
394 | -0.003226313973
395 | ]
396 | }
397 | ],
398 | "model": "e5-mistral-7b-instruct-Q5_K_M",
399 | "usage": {
400 | "prompt_tokens": 491,
401 | "completion_tokens": 0,
402 | "total_tokens": 491
403 | }
404 | }
405 | ```
406 |
407 |
408 |
409 | #### Get server information
410 |
411 | The `/v1/info` endpoint provides information about the API server, including the server version, model parameters, etc.
412 |
413 | Example
414 |
415 | You can use `curl` to test it in a new terminal:
416 |
417 | ```bash
418 | curl -X POST http://localhost:8080/v1/info -H 'accept:application/json'
419 | ```
420 |
421 | If the command runs successfully, you should see output similar to the following in your terminal:
422 |
423 | ```json
424 | {
425 | "version": "0.3.4",
426 | "plugin_version": "b2694 (commit 0d56246f)",
427 | "port": "8080",
428 | "models": [
429 | {
430 | "name": "Llama-2-7b-chat-hf-Q5_K_M",
431 | "type": "chat",
432 | "prompt_template": "Llama2Chat",
433 | "n_predict": 1024,
434 | "n_gpu_layers": 100,
435 | "ctx_size": 4096,
436 | "batch_size": 512,
437 | "temperature": 1.0,
438 | "top_p": 1.0,
439 | "repeat_penalty": 1.1,
440 | "presence_penalty": 0.0,
441 | "frequency_penalty": 0.0
442 | },
443 | {
444 | "name": "all-MiniLM-L6-v2-ggml-model-f16",
445 | "type": "embedding",
446 | "prompt_template": "Llama2Chat",
447 | "n_predict": 1024,
448 | "n_gpu_layers": 100,
449 | "ctx_size": 384,
450 | "batch_size": 512,
451 | "temperature": 1.0,
452 | "top_p": 1.0,
453 | "repeat_penalty": 1.1,
454 | "presence_penalty": 0.0,
455 | "frequency_penalty": 0.0
456 | }
457 | ],
458 | "qdrant_config": {
459 | "url": "http://localhost:6333",
460 | "collection_name": "default",
461 | "limit": 5,
462 | "score_threshold": 0.4
463 | }
464 | }
465 | ```
466 |
467 |
468 |
469 | #### Retrieve context
470 |
471 | The `/v1/retrieve` endpoint accepts a query and returns the retrieval results.
472 |
473 | Example
474 |
475 | You can use `curl` to test it in a new terminal:
476 |
477 | ```bash
478 | curl -X POST http://localhost:8080/v1/retrieve \
479 | -H 'accept:application/json' \
480 | -H 'Content-Type: application/json' \
481 | -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."}, {"role":"user", "content": "What is the location of Paris, France along the Seine River?"}], "model":"llama-2-chat"}'
482 | ```
483 |
484 | If the command runs successfully, you should see output similar to the following in your terminal:
485 |
486 | ```json
487 | {
488 | "points": [
489 | {
490 | "source": "\"Paris is located in northern central France, in a north-bending arc of the river Seine whose crest includes two islands, the Île Saint-Louis and the larger Île de la Cité, which form the oldest part of the city. The river's mouth on the English Channel is about 233 mi downstream from the city. The city is spread widely on both banks of the river. Overall, the city is relatively flat, and the lowest point is 35 m above sea level. Paris has several prominent hills, the highest of which is Montmartre at 130 m.\\n\"",
491 | "score": 0.74011195
492 | },
493 | {
494 | "source": "\"The Paris region is the most active water transport area in France, with most of the cargo handled by Ports of Paris in facilities located around Paris. The rivers Loire, Rhine, Rhône, Me\\n\"",
495 | "score": 0.63990676
496 | },
497 | {
498 | "source": "\"Paris\\nCountry\\tFrance\\nRegion\\nÎle-de-France\\r\\nDepartment\\nParis\\nIntercommunality\\nMétropole du Grand Paris\\nSubdivisions\\n20 arrondissements\\nGovernment\\n • Mayor (2020–2026)\\tAnne Hidalgo (PS)\\r\\nArea\\n1\\t105.4 km2 (40.7 sq mi)\\n • Urban\\n (2020)\\t2,853.5 km2 (1,101.7 sq mi)\\n • Metro\\n (2020)\\t18,940.7 km2 (7,313.0 sq mi)\\nPopulation\\n (2023)\\n2,102,650\\n • Rank\\t9th in Europe\\n1st in France\\r\\n • Density\\t20,000/km2 (52,000/sq mi)\\n • Urban\\n (2019)\\n10,858,852\\n • Urban density\\t3,800/km2 (9,900/sq mi)\\n • Metro\\n (Jan. 2017)\\n13,024,518\\n • Metro density\\t690/km2 (1,800/sq mi)\\nDemonym(s)\\nParisian(s) (en) Parisien(s) (masc.), Parisienne(s) (fem.) (fr), Parigot(s) (masc.), \\\"Parigote(s)\\\" (fem.) (fr, colloquial)\\nTime zone\\nUTC+01:00 (CET)\\r\\n • Summer (DST)\\nUTC+02:00 (CEST)\\r\\nINSEE/Postal code\\t75056 /75001-75020, 75116\\r\\nElevation\\t28–131 m (92–430 ft)\\n(avg. 78 m or 256 ft)\\nWebsite\\twww.paris.fr\\r\\n1 French Land Register data, which excludes lakes, ponds, glaciers > 1 km2 (0.386 sq mi or 247 acres) and river estuaries.\\n\"",
499 | "score": 0.62259054
500 | },
501 | {
502 | "source": "\" in Paris\\n\"",
503 | "score": 0.6152092
504 | },
505 | {
506 | "source": "\"The Parisii, a sub-tribe of the Celtic Senones, inhabited the Paris area from around the middle of the 3rd century BC. One of the area's major north–south trade routes crossed the Seine on the île de la Cité, which gradually became an important trading centre. The Parisii traded with many river towns (some as far away as the Iberian Peninsula) and minted their own coins.\\n\"",
507 | "score": 0.5720232
508 | }
509 | ],
510 | "limit": 5,
511 | "score_threshold": 0.4
512 | }
513 | ```
514 |
515 |
516 |
517 | ## Setup
518 |
519 | The LlamaEdge-RAG API server runs on the WasmEdge Runtime. Choose the installation command for your operating system:
520 |
521 | For macOS (apple silicon)
522 |
523 | ```console
524 | # install WasmEdge with the wasi-nn-ggml plugin
525 | curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s
526 |
527 | # Assuming you use zsh (the default shell on macOS), run the following command to activate the environment
528 | source $HOME/.zshenv
529 | ```
530 |
531 |
532 |
533 | For Ubuntu (>= 20.04)
534 |
535 | ```console
536 | # install libopenblas-dev
537 | apt update && apt install -y libopenblas-dev
538 |
539 | # install WasmEdge with the wasi-nn-ggml plugin
540 | curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s
541 |
542 | # Assuming you use bash (the default shell on Ubuntu), run the following command to activate the environment
543 | source $HOME/.bashrc
544 | ```
545 |
546 |
547 |
548 | For General Linux
549 |
550 | ```console
551 | # install WasmEdge with the wasi-nn-ggml plugin
552 | curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s
553 |
554 | # Assuming you use bash (the default shell on most Linux distributions), run the following command to activate the environment
555 | source $HOME/.bashrc
556 | ```
557 |
558 |
559 |
560 | ## Build
561 |
562 | ```bash
563 | # Clone the repository
564 | git clone https://github.com/LlamaEdge/rag-api-server.git
565 |
566 | # Change the working directory
567 | cd rag-api-server
568 |
569 | # (Optional) Add the `wasm32-wasip1` target to the Rust toolchain
570 | rustup target add wasm32-wasip1
571 |
572 | # Build `rag-api-server.wasm` with the `http` support only, or
573 | cargo build --target wasm32-wasip1 --release
574 |
575 | # Build `rag-api-server.wasm` with both `http` and `https` support
576 | cargo build --target wasm32-wasip1 --release --features full
577 |
578 | # Copy the `rag-api-server.wasm` to the root directory
579 | cp target/wasm32-wasip1/release/rag-api-server.wasm .
580 | ```
581 |
582 | To check the CLI options of the `rag-api-server` wasm app, you can run the following command:
583 |
584 | ```bash
585 | $ wasmedge rag-api-server.wasm -h
586 |
587 | LlamaEdge-RAG API Server
588 |
589 | Usage: rag-api-server.wasm [OPTIONS] --model-name --prompt-template
590 |
591 | Options:
592 | -m, --model-name
593 | Sets names for chat and embedding models. The names are separated by comma without space, for example, '--model-name Llama-2-7b,all-minilm'
594 |
595 | -a, --model-alias
596 | Model aliases for chat and embedding models
597 |
598 | [default: default,embedding]
599 |
600 | -c, --ctx-size
601 | Sets context sizes for chat and embedding models, respectively. The sizes are separated by comma without space, for example, '--ctx-size 4096,384'. The first value is for the chat model, and the second is for the embedding model
602 |
603 | [default: 4096,384]
604 |
605 | -p, --prompt-template
606 | Sets prompt templates for chat and embedding models, respectively. The prompt templates are separated by comma without space, for example, '--prompt-template llama-2-chat,embedding'. The first value is for the chat model, and the second is for the embedding model
607 |
608 | [possible values: llama-2-chat, llama-3-chat, llama-3-tool, mistral-instruct, mistral-tool, mistrallite, mistral-small-chat, mistral-small-tool, openchat, codellama-instruct, codellama-super-instruct, human-assistant, vicuna-1.0-chat, vicuna-1.1-chat, vicuna-llava, chatml, chatml-tool, internlm-2-tool, baichuan-2, wizard-coder, zephyr, stablelm-zephyr, intel-neural, deepseek-chat, deepseek-coder, deepseek-chat-2, deepseek-chat-25, deepseek-chat-3, solar-instruct, phi-2-chat, phi-2-instruct, phi-3-chat, phi-3-instruct, phi-4-chat, gemma-instruct, gemma-3, octopus, glm-4-chat, groq-llama3-tool, mediatek-breeze, nemotron-chat, nemotron-tool, functionary-32, functionary-31, minicpmv, moxin-chat, falcon3, megrez, qwen2-vision, exaone-deep-chat, exaone-chat, embedding, tts, none]
609 |
610 | -r, --reverse-prompt
611 | Halt generation at PROMPT, return control
612 |
613 | -n, --n-predict
614 | Number of tokens to predict, -1 = infinity, -2 = until context filled
615 |
616 | [default: -1]
617 |
618 | -g, --n-gpu-layers
619 | Number of layers to run on the GPU
620 |
621 | [default: 100]
622 |
623 | --split-mode
624 | Split the model across multiple GPUs. Possible values: `none` (use one GPU only), `layer` (split layers and KV across GPUs, default), `row` (split rows across GPUs)
625 |
626 | [default: layer]
627 |
628 | --main-gpu
629 | The main GPU to use
630 |
631 | --tensor-split
632 | How split tensors should be distributed across GPUs. If None the model is not split; otherwise, a comma-separated list of non-negative values, e.g., "3,2" presents 60% of the data to GPU 0 and 40% to GPU 1
633 |
634 | --threads
635 | Number of threads to use during computation
636 |
637 | [default: 2]
638 |
639 | --grammar
640 | BNF-like grammar to constrain generations (see samples in grammars/ dir)
641 |
642 | [default: ]
643 |
644 | --json-schema
645 | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object. For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead
646 |
647 | -b, --batch-size
648 | Sets batch sizes for chat and embedding models, respectively. The sizes are separated by comma without space, for example, '--batch-size 128,64'. The first value is for the chat model, and the second is for the embedding model
649 |
650 | [default: 512,512]
651 |
652 | -u, --ubatch-size
653 | Sets physical maximum batch sizes for chat and/or embedding models. To run both chat and embedding models, the sizes should be separated by comma without space, for example, '--ubatch-size 512,512'. The first value is for the chat model, and the second for the embedding model
654 |
655 | [default: 512,512]
656 |
657 | --rag-prompt
658 | Custom rag prompt
659 |
660 | --rag-policy
661 | Strategy for merging RAG context into chat messages
662 |
663 | [default: system-message]
664 |
665 | Possible values:
666 | - system-message: Merge RAG context into the system message
667 | - last-user-message: Merge RAG context into the last user message
668 |
669 | --qdrant-url
670 | URL of Qdrant REST Service
671 |
672 | [default: http://127.0.0.1:6333]
673 |
674 | --qdrant-collection-name
675 | Name of Qdrant collection
676 |
677 | [default: default]
678 |
679 | --qdrant-limit
680 | Max number of retrieved results (no less than 1)
681 |
682 | [default: 5]
683 |
684 | --qdrant-score-threshold
685 | Minimal score threshold for the search result
686 |
687 | [default: 0.4]
688 |
689 | --chunk-capacity
690 | Maximum number of tokens each chunk contains
691 |
692 | [default: 100]
693 |
694 | --context-window
695 | Maximum number of user messages used in the retrieval
696 |
697 | [default: 1]
698 |
699 | --kw-search-url
700 | URL of the keyword search service
701 |
702 | --include-usage
703 | Whether to include usage in the stream response. Defaults to false
704 |
705 | --socket-addr
706 | Socket address of LlamaEdge-RAG API Server instance. For example, `0.0.0.0:8080`
707 |
708 | --port
709 | Port number
710 |
711 | [default: 8080]
712 |
713 | --web-ui
714 | Root path for the Web UI files
715 |
716 | [default: chatbot-ui]
717 |
718 | --log-prompts
719 | Deprecated. Print prompt strings to stdout
720 |
721 | --log-stat
722 | Deprecated. Print statistics to stdout
723 |
724 | --log-all
725 | Deprecated. Print all log information to stdout
726 |
727 | -h, --help
728 | Print help (see a summary with '-h')
729 |
730 | -V, --version
731 | Print version
732 | ```
733 |
734 |
735 |
736 | ## Execute
737 |
738 | LlamaEdge-RAG API server requires two types of models: chat and embedding. The chat model is used for generating responses to user queries, while the embedding model is used for computing embeddings for user queries or file chunks.
739 |
740 | Execution also requires the presence of a running [Qdrant](https://qdrant.tech/) service.
741 |
742 | For the purpose of demonstration, we use the [Llama-2-7b-chat-hf-Q5_K_M.gguf](https://huggingface.co/second-state/Llama-2-7B-Chat-GGUF/resolve/main/Llama-2-7b-chat-hf-Q5_K_M.gguf) and [all-MiniLM-L6-v2-ggml-model-f16.gguf](https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/resolve/main/all-MiniLM-L6-v2-ggml-model-f16.gguf) models as examples. Download these models and place them in the root directory of the repository.
743 |
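For example, the two model files can be downloaded with `curl`:

```bash
# Download the chat model
curl -LO https://huggingface.co/second-state/Llama-2-7B-Chat-GGUF/resolve/main/Llama-2-7b-chat-hf-Q5_K_M.gguf

# Download the embedding model
curl -LO https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/resolve/main/all-MiniLM-L6-v2-ggml-model-f16.gguf
```
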
744 | - Ensure the Qdrant service is running
745 |
746 | ```bash
747 | # Pull the Qdrant docker image
748 | docker pull qdrant/qdrant
749 |
750 | # Create a directory to store Qdrant data
751 | mkdir qdrant_storage
752 |
753 | # Run Qdrant service
754 | docker run -p 6333:6333 -p 6334:6334 -v $(pwd)/qdrant_storage:/qdrant/storage:z qdrant/qdrant
755 | ```
756 |
757 | - Start an instance of LlamaEdge-RAG API server
758 |
759 | ```bash
760 | wasmedge --dir .:. --nn-preload default:GGML:AUTO:Llama-2-7b-chat-hf-Q5_K_M.gguf \
761 | --nn-preload embedding:GGML:AUTO:all-MiniLM-L6-v2-ggml-model-f16.gguf \
762 | rag-api-server.wasm \
763 | --model-name Llama-2-7b-chat-hf-Q5_K_M,all-MiniLM-L6-v2-ggml-model-f16 \
764 | --ctx-size 4096,384 \
765 | --prompt-template llama-2-chat,embedding \
766 | --rag-policy system-message \
767 | --qdrant-collection-name default \
768 | --qdrant-limit 3 \
769 | --qdrant-score-threshold 0.5 \
770 | --rag-prompt "Use the following pieces of context to answer the user's question.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n" \
771 | --port 8080
772 | ```
773 |
774 | ## Usage Example
775 |
776 | - [Execute](#execute) the server
777 |
778 | - Generate embeddings for [paris.txt](https://huggingface.co/datasets/gaianet/paris/raw/main/paris.txt) via the `/v1/create/rag` endpoint
779 |
780 | ```bash
781 | curl -X POST http://127.0.0.1:8080/v1/create/rag -F "file=@paris.txt"
782 | ```
783 |
784 | - Ask a question
785 |
786 | ```bash
787 | curl -X POST http://localhost:8080/v1/chat/completions \
788 | -H 'accept:application/json' \
789 | -H 'Content-Type: application/json' \
790 | -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."}, {"role":"user", "content": "What is the location of Paris, France along the Seine River?"}], "model":"Llama-2-7b-chat-hf-Q5_K_M"}'
791 | ```
792 |
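- Ask a question with streaming enabled. A minimal sketch: setting `"stream": true` in the request asks the server to return the completion incrementally rather than as a single JSON body.

```bash
curl -X POST http://localhost:8080/v1/chat/completions \
  -H 'accept:application/json' \
  -H 'Content-Type: application/json' \
  -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."}, {"role":"user", "content": "What is the location of Paris, France along the Seine River?"}], "model":"Llama-2-7b-chat-hf-Q5_K_M", "stream":true}'
```
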
793 | ## Set Log Level
794 |
795 | You can set the log level of the API server by setting the `LLAMA_LOG` environment variable. For example, to set the log level to `debug`, you can run the following command:
796 |
797 | ```bash
798 | wasmedge --dir .:. --env LLAMA_LOG=debug \
799 | --nn-preload default:GGML:AUTO:Llama-2-7b-chat-hf-Q5_K_M.gguf \
800 | --nn-preload embedding:GGML:AUTO:all-MiniLM-L6-v2-ggml-model-f16.gguf \
801 | rag-api-server.wasm \
802 | --model-name Llama-2-7b-chat-hf-Q5_K_M,all-MiniLM-L6-v2-ggml-model-f16 \
803 | --ctx-size 4096,384 \
804 | --prompt-template llama-2-chat,embedding \
805 | --rag-prompt "Use the following pieces of context to answer the user's question.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n"
806 | ```
807 |
808 | The log level can be one of the following values: `trace`, `debug`, `info`, `warn`, `error`. The default log level is `info`.
809 |
--------------------------------------------------------------------------------
/check_code_before_commit.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Find unused dependencies in Cargo.toml
4 | cargo +nightly udeps
5 |
6 | # Sort dependencies in Cargo.toml alphabetically
7 | cargo sort
8 |
9 | # Format code
10 | cargo +nightly fmt --all -- --check
11 |
12 | # Clippy
13 | cargo +nightly clippy --target wasm32-wasip1 --all-features -- -D warnings
14 |
--------------------------------------------------------------------------------
/docs/assets/kw_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LlamaEdge/rag-api-server/bd32425ce91b6ec72916d4a0e3d8922f3a4576f9/docs/assets/kw_search.png
--------------------------------------------------------------------------------
/docs/keyword_search.md:
--------------------------------------------------------------------------------
1 | # Integration with Keyword Search
2 |
3 | Since `LlamaEdge-RAG v0.13.0`, the keyword search feature is supported. This feature is powered by [kw-search-server](https://github.com/LlamaEdge/kw-search-server), which is a standalone server that provides keyword search services.
4 |
5 | The following diagram shows the interactions between `rag-api-server` and `kw-search-server` while performing a chat completion.
6 |
7 |
8 | ![Interaction between rag-api-server and kw-search-server](assets/kw_search.png)
9 |
10 |
11 | ## Usage
12 |
13 | ### Enable keyword search
14 |
15 | Assume that a keyword search server is running on `http://localhost:9069`. There are two ways to enable keyword search in `rag-api-server`:
16 |
17 | - Set the `--kw-search-url` CLI option of `rag-api-server` while starting the rag-api-server (see the example command after this list).
18 |
19 | - Set the `kw_search_url` field of the chat completion request (see [Perform chat completion with keyword search](#perform-chat-completion-with-keyword-search)) or the rag creation request (see [Create collection and index from a document](#create-collection-and-index-from-a-document)) sent to the rag-api-server.
20 |
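For the first approach, a minimal sketch of the startup command is shown below; the model files, model names, and prompt templates are placeholders for your own setup:

```bash
wasmedge --dir .:. --nn-preload default:GGML:AUTO:chat-model.gguf \
  --nn-preload embedding:GGML:AUTO:embedding-model.gguf \
  rag-api-server.wasm \
  --model-name chat-model,embedding-model \
  --prompt-template llama-3-chat,embedding \
  --kw-search-url http://localhost:9069 \
  --port 8080
```
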
21 | ### Create collection and index from a document
22 |
23 | The process of creating indexes for the target document is integrated into the creation of RAG. In other words, when creating RAG collections through the `/v1/create/rag` endpoint of the rag-api-server, indexes for the target document are created simultaneously.
24 |
25 | Assume that the keyword search server is running on `http://localhost:9069`.
26 |
27 | - If the rag-api-server is running on `http://localhost:8080` and using `--kw-search-url http://localhost:9069` to specify the keyword search server, you can create a RAG collection with the following command:
28 |
29 | ```bash
30 | curl --location 'http://localhost:8080/v1/create/rag' \
31 | --header 'Content-Type: multipart/form-data' \
32 | --form 'file=@"/Users/sam/workspace/demo/paris.txt"' \
33 | --form 'vdb_server_url="your_vdb_server_url"' \
34 | --form 'vdb_collection_name="your_vdb_collection_name"' \
35 | --form 'vdb_api_key="your_vdb_api_key"'
36 | ```
37 |
38 | - If the rag-api-server is running on `http://localhost:8080` without specifying the keyword search server, you can create a RAG collection with the following command:
39 |
40 | ```bash
41 | curl --location 'http://localhost:8080/v1/create/rag' \
42 | --header 'Content-Type: multipart/form-data' \
43 | --form 'file=@"/Users/sam/workspace/demo/paris.txt"' \
44 | --form 'vdb_server_url="your_vdb_server_url"' \
45 | --form 'vdb_collection_name="your_vdb_collection_name"' \
46 | --form 'vdb_api_key="your_vdb_api_key"' \
47 | --form 'kw_search_url="http://localhost:9069"'
48 | ```
49 |
50 | If the curl request above is handled successfully, a response body like the following is returned. It contains two parts: `index` for the keyword search index and `embeddings` for the embeddings.
51 |
52 | ```json
53 | {
54 | "index": {
55 | "results": [
56 | {
57 | "filename": "Unknown",
58 | "status": "indexed",
59 | "error": null
60 | },
61 | ...,
62 | {
63 | "filename": "Unknown",
64 | "status": "indexed",
65 | "error": null
66 | }
67 | ],
68 | "index_name": "index-2c70ccde-916e-45b1-99ef-97ac893fd438",
69 | "download_url": "http://localhost:9069/v1/index/download/index-2c70ccde-916e-45b1-99ef-97ac893fd438"
70 | },
71 | "embeddings": {
72 | "object": "list",
73 | "data": [
74 | {
75 | "index": 0,
76 | "object": "embedding",
77 | "embedding": []
78 | },
79 | ...,
80 | {
81 | "index": 326,
82 | "object": "embedding",
83 | "embedding": []
84 | }
85 | ],
86 | "model": "Nomic-embed-text-v1.5",
87 | "usage": {
88 | "prompt_tokens": 20355,
89 | "completion_tokens": 0,
90 | "total_tokens": 20355
91 | }
92 | }
93 | }
94 | ```
95 |
96 | ### Perform chat completion with keyword search
97 |
98 | The keyword search feature is integrated into the chat completion process. When performing a chat completion, the rag-api-server first performs a keyword search and an embedding search for the user query, then fuses the search results into the context, and finally builds a prompt from the user query and the context and feeds it to the model to generate the chat completion.
99 |
100 | Assume that the keyword search server is running on `http://localhost:9069`.
101 |
102 | - If the rag-api-server is running on `http://localhost:8080` and using `--kw-search-url http://localhost:9069` to specify the keyword search server, you can perform a chat completion with the following command:
103 |
104 | ```bash
105 | curl --location 'http://localhost:8080/v1/chat/completions' \
106 | --header 'Content-Type: application/json' \
107 | --data '{
108 | "messages": [
109 | {
110 | "role": "system",
111 | "content": "You are a helpful assistant. Answer questions as concisely as possible."
112 | },
113 | {
114 | "role": "user",
115 | "content": "What is the location of Paris, France along the Seine river?"
116 | }
117 | ],
118 |
119 | "vdb_server_url": "your_vdb_server_url",
120 | "vdb_collection_name": ["your_vdb_collection_name"],
121 | "limit": [5],
122 | "score_threshold": [0.5],
123 | "vdb_api_key": "your_vdb_api_key",
124 | "kw_index_name": "index-2c70ccde-916e-45b1-99ef-97ac893fd438",
125 | "kw_top_k": 5,
126 | "model": "Llama-3.2-3B-Instruct",
127 | "stream": false
128 | }'
129 | ```
130 |
131 | - If the rag-api-server is running on `http://localhost:8080` without specifying the keyword search server, you can perform a chat completion with the following command:
132 |
133 | ```bash
134 | curl --location 'http://localhost:8080/v1/chat/completions' \
135 | --header 'Content-Type: application/json' \
136 | --data '{
137 | "messages": [
138 | {
139 | "role": "system",
140 | "content": "You are a helpful assistant. Answer questions as concisely as possible."
141 | },
142 | {
143 | "role": "user",
144 | "content": "What is the location of Paris, France along the Seine river?"
145 | }
146 | ],
147 |
148 | "vdb_server_url": "your_vdb_server_url",
149 | "vdb_collection_name": ["your_vdb_collection_name"],
150 | "limit": [5],
151 | "score_threshold": [0.5],
152 | "vdb_api_key": "your_vdb_api_key",
153 | "kw_search_url": "http://localhost:9069",
154 | "kw_index_name": "index-2c70ccde-916e-45b1-99ef-97ac893fd438",
155 | "kw_top_k": 5,
156 | "model": "Llama-3.2-3B-Instruct",
157 | "stream": false
158 | }'
159 | ```
160 |
161 | If the curl request above is handled successfully, a response body like the following is returned:
162 |
163 | ```json
164 | {
165 | "id": "chatcmpl-72d9b542-4ee6-4a38-b9f6-75677765eef3",
166 | "object": "chat.completion",
167 | "created": 1737531879,
168 | "model": "Llama-3.2-3B-Instruct",
169 | "choices": [
170 | {
171 | "index": 0,
172 | "message": {
173 | "content": "Paris, France is located on the banks of the Seine River, with two islands, Île Saint-Louis and Île de la Cité, within the city.",
174 | "role": "assistant"
175 | },
176 | "finish_reason": "stop",
177 | "logprobs": null
178 | }
179 | ],
180 | "usage": {
181 | "prompt_tokens": 209,
182 | "completion_tokens": 37,
183 | "total_tokens": 246
184 | }
185 | }
186 | ```
187 |
--------------------------------------------------------------------------------
/docs/vectordb.md:
--------------------------------------------------------------------------------
1 | # Interaction with VectorDB
2 |
3 | LlamaEdge-RAG interacts with an external VectorDB through two approaches: via the CLI options of rag-api-server, or via fields in the requests. The following two sections discuss these two approaches separately, using Qdrant as the example VectorDB.
4 |
5 | > [!NOTE]
6 | > Since the v0.11.0 release, LlamaEdge-RAG has supported the VectorDB integration described below.
7 |
8 | ## Via CLI options
9 |
10 | This approach is recommended when retrieving information from a fixed VectorDB. The startup command of rag-api-server provides four command-line options:
11 |
12 | - `--qdrant-url` specifies the URL of the VectorDB REST service
13 | - `--qdrant-collection-name` specifies one or multiple names of VectorDB collections
14 | - `--qdrant-limit` specifies the max number of retrieved results (no less than 1) from each collection specified in the `--qdrant-collection-name` option
15 | - `--qdrant-score-threshold` specifies the minimal score threshold for the search results from each collection specified in the `--qdrant-collection-name` option
16 |
17 | Setting these four options in the startup command avoids repeatedly providing the same parameters in every retrieval request, such as a chat completion request. The following is an example of the startup command:
18 |
19 | ```bash
20 | wasmedge --dir .:. \
21 | --env VDB_API_KEY=your-vdb-api-key \
22 | ...
23 | --qdrant-url https://651ca7e5-e1d1-4851-abba-xxxxxxxxxxxx.europe-west3-0.gcp.cloud.qdrant.io:6333 \
24 | --qdrant-collection-name paris1,paris2 \
25 | --qdrant-limit 3,5 \
26 | --qdrant-score-threshold 0.5,0.7
27 | ```
28 |
29 | **Note** that `--env VDB_API_KEY=your-vdb-api-key` is required if the VectorDB requires an API key for access.
30 |
31 | ## Via request fields
32 |
33 | This approach is recommended when different requests retrieve information from different VectorDBs or collections. The chat completion and RAG creation requests each provide fields for specifying the VectorDB information.
34 |
35 | ### VectorDB related fields in requests to the `/v1/create/rag` endpoint
36 |
37 | The request to the `/v1/create/rag` endpoint also provides the fields for specifying the VectorDB information, which are
38 |
39 | - `vdb_server_url` specifies the URL of VectorDB REST Service
40 | - `vdb_collection_name` specifies one or multiple names of VectorDB collections
41 | - `vdb_api_key` specifies the API key for accessing the VectorDB
42 |
43 | The following is an example of the request:
44 |
45 | ```bash
46 | curl --location 'http://localhost:8080/v1/create/rag' \
47 | --header 'Content-Type: multipart/form-data' \
48 | --form 'file=@"paris.txt"' \
49 | --form 'vdb_server_url="https://651ca7e5-e1d1-4851-abba-xxxxxxxxxxxx.europe-west3-0.gcp.cloud.qdrant.io:6333"' \
50 | --form 'vdb_collection_name="paris"' \
51 | --form 'vdb_api_key="your-vdb-api-key"'
52 | ```
53 |
54 | ### VectorDB related fields in the request to the `/v1/chat/completions` endpoint
55 |
56 | The chat completion request to the `/v1/chat/completions` endpoint defines five VectorDB related fields for specifying the VectorDB information, which are
57 |
58 | - `vdb_server_url` specifies the URL of VectorDB REST Service
59 | - `vdb_collection_name` specifies one or multiple names of VectorDB collections
60 | - `limit` specifies the max number of retrieved results (no less than 1) from each collection specified in the `vdb_collection_name` field
61 | - `score_threshold` specifies the minimal score threshold for the search results from each collection specified in the `vdb_collection_name` field
62 | - `vdb_api_key` specifies the API key for accessing the VectorDB
63 |
64 | The following is an example of the chat completion request:
65 |
66 | ```bash
67 | curl --location 'http://localhost:8080/v1/chat/completions' \
68 | --header 'Content-Type: application/json' \
69 | --data '{
70 | "messages": [
71 | {
72 | "role": "system",
73 | "content": "You are a helpful assistant. Answer questions as concisely as possible."
74 | },
75 | {
76 | "role": "user",
77 | "content": "What is the location of Paris, France along the Seine river?"
78 | }
79 | ],
80 | "vdb_server_url": "https://651ca7e5-e1d1-4851-abba-xxxxxxxxxxxx.europe-west3-0.gcp.cloud.qdrant.io:6333",
81 | "vdb_collection_name": ["paris1","paris2"],
82 | "limit": [3,5],
83 | "score_threshold": [0.5,0.7],
84 | "vdb_api_key": "your-vdb-api-key",
85 | "model": "Llama-3.2-3B-Instruct",
86 | "stream": false
87 | }'
88 | ```
89 |
90 | **Note** that the `limit` and `score_threshold` fields are required in the chat completion request if `vdb_server_url` and `vdb_collection_name` are present. The `vdb_api_key` field is required only if the VectorDB requires an API key for access.
91 |
92 | ### VectorDB related fields in the request to the `/v1/retrieve` endpoint
93 |
94 | Similarly, the request to the `/v1/retrieve` endpoint defines five VectorDB related fields for specifying the VectorDB information, which are
95 |
96 | - `vdb_server_url` specifies the URL of VectorDB REST Service
97 | - `vdb_collection_name` specifies one or multiple names of VectorDB collections
98 | - `limit` specifies the max number of retrieved results (no less than 1) from each collection specified in the `vdb_collection_name` field
99 | - `score_threshold` specifies the minimal score threshold for the search results from each collection specified in the `vdb_collection_name` field
100 | - `vdb_api_key` specifies the API key for accessing the VectorDB
101 |
102 | The following is an example of the retrieval request:
103 |
104 | ```bash
105 | curl --location 'http://localhost:8080/v1/retrieve' \
106 | --header 'Content-Type: application/json' \
107 | --data '{
108 | "messages": [
109 | {
110 | "role": "system",
111 | "content": "You are a helpful assistant. Answer questions as concisely as possible."
112 | },
113 | {
114 | "role": "user",
115 | "content": "What is the location of Paris, France along the Seine river?"
116 | }
117 | ],
118 | "vdb_server_url": "https://651ca7e5-e1d1-4851-abba-xxxxxxxxxxxx.europe-west3-0.gcp.cloud.qdrant.io:6333",
119 | "vdb_collection_name": ["paris1","paris2"],
120 | "limit": [3,5],
121 | "score_threshold": [0.5,0.7],
122 | "vdb_api_key": "your-vdb-api-key",
123 | "model": "Llama-3.2-3B-Instruct",
124 | "stream": false
125 | }'
126 | ```
127 |
128 | **Note** that the `limit` and `score_threshold` fields are required in the retrieval request if `vdb_server_url` and `vdb_collection_name` are present. The `vdb_api_key` field is required only if the VectorDB requires an API key for access.
129 |
--------------------------------------------------------------------------------
/src/backend/mod.rs:
--------------------------------------------------------------------------------
1 | pub(crate) mod ggml;
2 |
3 | use crate::error;
4 | use hyper::{Body, Request, Response};
5 |
6 | pub(crate) async fn handle_llama_request(
7 | req: Request<Body>,
8 | chunk_capacity: usize,
9 | ) -> Response<Body> {
10 | match req.uri().path() {
11 | "/v1/chat/completions" => ggml::rag_query_handler(req).await,
12 | "/v1/models" => ggml::models_handler().await,
13 | "/v1/embeddings" => ggml::embeddings_handler(req).await,
14 | "/v1/files" => ggml::files_handler(req).await,
15 | "/v1/chunks" => ggml::chunks_handler(req).await,
16 | "/v1/retrieve" => ggml::retrieve_handler(req).await,
17 | "/v1/create/rag" => ggml::create_rag_handler(req, chunk_capacity).await,
18 | "/v1/info" => ggml::server_info_handler().await,
19 | path => {
20 | if path.starts_with("/v1/files/") {
21 | ggml::files_handler(req).await
22 | } else {
23 | error::invalid_endpoint(path)
24 | }
25 | }
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/src/error.rs:
--------------------------------------------------------------------------------
1 | use hyper::{Body, Response};
2 | use thiserror::Error;
3 |
4 | #[allow(dead_code)]
5 | pub(crate) fn not_implemented() -> Response<Body> {
6 | // log error
7 | error!(target: "stdout", "501 Not Implemented");
8 |
9 | Response::builder()
10 | .header("Access-Control-Allow-Origin", "*")
11 | .header("Access-Control-Allow-Methods", "*")
12 | .header("Access-Control-Allow-Headers", "*")
13 | .status(hyper::StatusCode::NOT_IMPLEMENTED)
14 | .body(Body::from("501 Not Implemented"))
15 | .unwrap()
16 | }
17 |
18 | pub(crate) fn internal_server_error(msg: impl AsRef<str>) -> Response<Body> {
19 | let err_msg = match msg.as_ref().is_empty() {
20 | true => "500 Internal Server Error".to_string(),
21 | false => format!("500 Internal Server Error: {}", msg.as_ref()),
22 | };
23 |
24 | // log error
25 | error!(target: "stdout", "{}", &err_msg);
26 |
27 | Response::builder()
28 | .header("Access-Control-Allow-Origin", "*")
29 | .header("Access-Control-Allow-Methods", "*")
30 | .header("Access-Control-Allow-Headers", "*")
31 | .status(hyper::StatusCode::INTERNAL_SERVER_ERROR)
32 | .body(Body::from(err_msg))
33 | .unwrap()
34 | }
35 |
36 | pub(crate) fn bad_request(msg: impl AsRef<str>) -> Response<Body> {
37 | let err_msg = match msg.as_ref().is_empty() {
38 | true => "400 Bad Request".to_string(),
39 | false => format!("400 Bad Request: {}", msg.as_ref()),
40 | };
41 |
42 | // log error
43 | error!(target: "stdout", "{}", &err_msg);
44 |
45 | Response::builder()
46 | .header("Access-Control-Allow-Origin", "*")
47 | .header("Access-Control-Allow-Methods", "*")
48 | .header("Access-Control-Allow-Headers", "*")
49 | .status(hyper::StatusCode::BAD_REQUEST)
50 | .body(Body::from(err_msg))
51 | .unwrap()
52 | }
53 |
54 | pub(crate) fn unauthorized(msg: impl AsRef<str>) -> Response<Body> {
55 | let err_msg = match msg.as_ref().is_empty() {
56 | true => "401 Unauthorized".to_string(),
57 | false => format!("401 Unauthorized: {}", msg.as_ref()),
58 | };
59 |
60 | // log error
61 | error!(target: "stdout", "{}", &err_msg);
62 |
63 | Response::builder()
64 | .header("Access-Control-Allow-Origin", "*")
65 | .header("Access-Control-Allow-Methods", "*")
66 | .header("Access-Control-Allow-Headers", "*")
67 | .status(hyper::StatusCode::UNAUTHORIZED)
68 | .body(Body::from(err_msg))
69 | .unwrap()
70 | }
71 |
72 | pub(crate) fn invalid_endpoint(msg: impl AsRef<str>) -> Response<Body> {
73 | let err_msg = match msg.as_ref().is_empty() {
74 | true => "404 The requested service endpoint is not found".to_string(),
75 | false => format!(
76 | "404 The requested service endpoint is not found: {}",
77 | msg.as_ref()
78 | ),
79 | };
80 |
81 | // log error
82 | error!(target: "stdout", "{}", &err_msg);
83 |
84 | Response::builder()
85 | .header("Access-Control-Allow-Origin", "*")
86 | .header("Access-Control-Allow-Methods", "*")
87 | .header("Access-Control-Allow-Headers", "*")
88 | .status(hyper::StatusCode::NOT_FOUND)
89 | .body(Body::from(err_msg))
90 | .unwrap()
91 | }
92 |
93 | #[derive(Error, Clone, Debug, PartialEq, Eq)]
94 | pub enum ServerError {
95 | /// Error returned when parsing CLI options fails
96 | #[error("{0}")]
97 | ArgumentError(String),
98 | #[error("{0}")]
99 | Operation(String),
100 | }
101 |
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
1 | #[macro_use]
2 | extern crate log;
3 |
4 | mod backend;
5 | mod error;
6 | mod utils;
7 |
8 | use anyhow::Result;
9 | use chat_prompts::{MergeRagContextPolicy, PromptTemplateType};
10 | use clap::{ArgGroup, Parser};
11 | use error::ServerError;
12 | use hyper::{
13 | body::HttpBody,
14 | header,
15 | server::conn::AddrStream,
16 | service::{make_service_fn, service_fn},
17 | Body, Request, Response, Server, StatusCode,
18 | };
19 | use llama_core::metadata::ggml::GgmlMetadataBuilder;
20 | use once_cell::sync::OnceCell;
21 | use serde::{Deserialize, Serialize};
22 | use std::{collections::HashMap, fmt, net::SocketAddr, path::PathBuf};
23 | use tokio::{net::TcpListener, sync::RwLock};
24 | use utils::{is_valid_url, LogLevel};
25 |
26 | type Error = Box<dyn std::error::Error + Send + Sync + 'static>;
27 |
28 | // global system prompt
29 | pub(crate) static GLOBAL_RAG_PROMPT: OnceCell<String> = OnceCell::new();
30 | // server info
31 | pub(crate) static SERVER_INFO: OnceCell<RwLock<ServerInfo>> = OnceCell::new();
32 | // API key
33 | pub(crate) static LLAMA_API_KEY: OnceCell<String> = OnceCell::new();
34 | // Global context window used for setting the max number of user messages for the retrieval
35 | pub(crate) static CONTEXT_WINDOW: OnceCell<u64> = OnceCell::new();
36 | // Global keyword search configuration
37 | pub(crate) static KW_SEARCH_CONFIG: OnceCell<KeywordSearchConfig> = OnceCell::new();
38 |
39 | // default port
40 | const DEFAULT_PORT: &str = "8080";
41 |
42 | #[derive(Clone, Debug)]
43 | pub struct AppState {
44 | pub state_thing: String,
45 | }
46 |
47 | #[derive(Debug, Parser)]
48 | #[command(name = "LlamaEdge-RAG API Server", version = env!("CARGO_PKG_VERSION"), author = env!("CARGO_PKG_AUTHORS"), about = "LlamaEdge-RAG API Server")]
49 | #[command(group = ArgGroup::new("socket_address_group").multiple(false).args(&["socket_addr", "port"]))]
50 | struct Cli {
51 | /// Sets names for chat and embedding models. The names are separated by comma without space, for example, '--model-name Llama-2-7b,all-minilm'.
52 | #[arg(short, long, value_delimiter = ',', required = true)]
53 | model_name: Vec<String>,
54 | /// Model aliases for chat and embedding models
55 | #[arg(
56 | short = 'a',
57 | long,
58 | value_delimiter = ',',
59 | default_value = "default,embedding"
60 | )]
61 | model_alias: Vec<String>,
62 | /// Sets context sizes for chat and embedding models, respectively. The sizes are separated by comma without space, for example, '--ctx-size 4096,384'. The first value is for the chat model, and the second is for the embedding model.
63 | #[arg(
64 | short = 'c',
65 | long,
66 | value_delimiter = ',',
67 | default_value = "4096,384",
68 | value_parser = clap::value_parser!(u64)
69 | )]
70 | ctx_size: Vec<u64>,
71 | /// Sets prompt templates for chat and embedding models, respectively. The prompt templates are separated by comma without space, for example, '--prompt-template llama-2-chat,embedding'. The first value is for the chat model, and the second is for the embedding model.
72 | #[arg(short, long, value_delimiter = ',', value_parser = clap::value_parser!(PromptTemplateType), required = true)]
73 | prompt_template: Vec<PromptTemplateType>,
74 | /// Halt generation at PROMPT, return control.
75 | #[arg(short, long)]
76 | reverse_prompt: Option<String>,
77 | /// Number of tokens to predict, -1 = infinity, -2 = until context filled.
78 | #[arg(short, long, default_value = "-1")]
79 | n_predict: i32,
80 | /// Number of layers to run on the GPU
81 | #[arg(short = 'g', long, default_value = "100")]
82 | n_gpu_layers: u64,
83 | /// Split the model across multiple GPUs. Possible values: `none` (use one GPU only), `layer` (split layers and KV across GPUs, default), `row` (split rows across GPUs)
84 | #[arg(long, default_value = "layer")]
85 | split_mode: String,
86 | /// The main GPU to use.
87 | #[arg(long)]
88 | main_gpu: Option<u64>,
89 | /// How split tensors should be distributed across GPUs. If None the model is not split; otherwise, a comma-separated list of non-negative values, e.g., "3,2" presents 60% of the data to GPU 0 and 40% to GPU 1.
90 | #[arg(long)]
91 | tensor_split: Option<String>,
92 | /// Number of threads to use during computation
93 | #[arg(long, default_value = "2")]
94 | threads: u64,
95 | /// BNF-like grammar to constrain generations (see samples in grammars/ dir).
96 | #[arg(long, default_value = "")]
97 | pub grammar: String,
98 | /// JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object. For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead.
99 | #[arg(long)]
100 | pub json_schema: Option<String>,
101 | /// Sets batch sizes for chat and embedding models, respectively. The sizes are separated by comma without space, for example, '--batch-size 128,64'. The first value is for the chat model, and the second is for the embedding model.
102 | #[arg(short, long, value_delimiter = ',', default_value = "512,512", value_parser = clap::value_parser!(u64))]
103 | batch_size: Vec<u64>,
104 | /// Sets physical maximum batch sizes for chat and/or embedding models. To run both chat and embedding models, the sizes should be separated by comma without space, for example, '--ubatch-size 512,512'. The first value is for the chat model, and the second for the embedding model.
105 | #[arg(short, long, value_delimiter = ',', default_value = "512,512", value_parser = clap::value_parser!(u64))]
106 | ubatch_size: Vec<u64>,
107 | /// Custom rag prompt.
108 | #[arg(long)]
109 | rag_prompt: Option<String>,
110 | /// Strategy for merging RAG context into chat messages.
111 | #[arg(long = "rag-policy", default_value_t, value_enum)]
112 | policy: MergeRagContextPolicy,
113 | /// URL of Qdrant REST Service
114 | #[arg(long, default_value = "http://127.0.0.1:6333")]
115 | qdrant_url: String,
116 | /// Name of Qdrant collection
117 | #[arg(long, default_value = "default", value_delimiter = ',')]
118 | qdrant_collection_name: Vec<String>,
119 | /// Max number of retrieved results (no less than 1)
120 | #[arg(long, default_value = "5", value_delimiter = ',', value_parser = clap::value_parser!(u64))]
121 | qdrant_limit: Vec<u64>,
122 | /// Minimal score threshold for the search result
123 | #[arg(long, default_value = "0.4", value_delimiter = ',', value_parser = clap::value_parser!(f32))]
124 | qdrant_score_threshold: Vec<f32>,
125 | /// Maximum number of tokens each chunk contains
126 | #[arg(long, default_value = "100", value_parser = clap::value_parser!(usize))]
127 | chunk_capacity: usize,
128 | /// Maximum number of user messages used in the retrieval
129 | #[arg(long, default_value = "1", value_parser = clap::value_parser!(u64))]
130 | context_window: u64,
131 | /// URL of the keyword search service
132 | #[arg(long)]
133 | kw_search_url: Option<String>,
134 | /// Whether to include usage in the stream response. Defaults to false.
135 | #[arg(long, default_value = "false")]
136 | include_usage: bool,
137 | /// Socket address of LlamaEdge-RAG API Server instance. For example, `0.0.0.0:8080`.
138 | #[arg(long, default_value = None, value_parser = clap::value_parser!(SocketAddr), group = "socket_address_group")]
139 | socket_addr: Option<SocketAddr>,
140 | /// Port number
141 | #[arg(long, default_value = DEFAULT_PORT, value_parser = clap::value_parser!(u16), group = "socket_address_group")]
142 | port: u16,
143 | /// Root path for the Web UI files
144 | #[arg(long, default_value = "chatbot-ui")]
145 | web_ui: PathBuf,
146 | /// Deprecated. Print prompt strings to stdout
147 | #[arg(long)]
148 | log_prompts: bool,
149 | /// Deprecated. Print statistics to stdout
150 | #[arg(long)]
151 | log_stat: bool,
152 | /// Deprecated. Print all log information to stdout
153 | #[arg(long)]
154 | log_all: bool,
155 | }
156 |
157 | #[allow(clippy::needless_return)]
158 | #[tokio::main(flavor = "current_thread")]
159 | async fn main() -> Result<(), ServerError> {
160 | let mut plugin_debug = false;
161 |
162 | // get the environment variable `LLAMA_LOG`
163 | let log_level: LogLevel = std::env::var("LLAMA_LOG")
164 | .unwrap_or("info".to_string())
165 | .parse()
166 | .unwrap_or(LogLevel::Info);
167 |
168 | if log_level == LogLevel::Debug || log_level == LogLevel::Trace {
169 | plugin_debug = true;
170 | }
171 | // set global logger
172 | wasi_logger::Logger::install().expect("failed to install wasi_logger::Logger");
173 | log::set_max_level(log_level.into());
174 |
175 | if let Ok(api_key) = std::env::var("API_KEY") {
176 | // define a const variable for the API key
177 | if let Err(e) = LLAMA_API_KEY.set(api_key) {
178 | let err_msg = format!("Failed to set API key. {}", e);
179 |
180 | error!(target: "stdout", "{}", err_msg);
181 |
182 | return Err(ServerError::Operation(err_msg));
183 | }
184 | }
185 |
186 | // parse the command line arguments
187 | let cli = Cli::parse();
188 |
189 | info!(target: "stdout", "log_level: {}", log_level);
190 |
191 | // log the version of the server
192 | info!(target: "stdout", "server_version: {}", env!("CARGO_PKG_VERSION"));
193 |
194 | // log model name
195 | if cli.model_name.len() != 2 {
196 | return Err(ServerError::ArgumentError(
197 | "LlamaEdge RAG API server requires a chat model and an embedding model.".to_owned(),
198 | ));
199 | }
200 | info!(target: "stdout", "model_name: {}", cli.model_name.join(","));
201 |
202 | // log model alias
203 | if cli.model_alias.len() != 2 {
204 | return Err(ServerError::ArgumentError(
205 | "LlamaEdge RAG API server requires two model aliases: one for chat model, one for embedding model.".to_owned(),
206 | ));
207 | }
208 | info!(target: "stdout", "model_alias: {}", cli.model_alias.join(","));
209 |
210 | // log context size
211 | if cli.ctx_size.len() != 2 {
212 | return Err(ServerError::ArgumentError(
213 | "LlamaEdge RAG API server requires two context sizes: one for chat model, one for embedding model.".to_owned(),
214 | ));
215 | }
216 | let ctx_sizes_str: String = cli
217 | .ctx_size
218 | .iter()
219 | .map(|n| n.to_string())
220 | .collect::<Vec<String>>()
221 | .join(",");
222 | info!(target: "stdout", "ctx_size: {}", ctx_sizes_str);
223 |
224 | // log batch size
225 | if cli.batch_size.len() != 2 {
226 | return Err(ServerError::ArgumentError(
227 | "LlamaEdge RAG API server requires two batch sizes: one for chat model, one for embedding model.".to_owned(),
228 | ));
229 | }
230 | let batch_sizes_str: String = cli
231 | .batch_size
232 | .iter()
233 | .map(|n| n.to_string())
234 | .collect::<Vec<String>>()
235 | .join(",");
236 | info!(target: "stdout", "batch_size: {}", batch_sizes_str);
237 |
238 | // log ubatch size
239 | if cli.ubatch_size.len() != 2 {
240 | return Err(ServerError::ArgumentError(
241 | "LlamaEdge RAG API server requires two ubatch sizes: one for chat model, one for embedding model.".to_owned(),
242 | ));
243 | }
244 | let ubatch_sizes_str: String = cli
245 | .ubatch_size
246 | .iter()
247 | .map(|n| n.to_string())
248 | .collect::<Vec<String>>()
249 | .join(",");
250 | info!(target: "stdout", "ubatch_size: {}", ubatch_sizes_str);
251 |
252 | // log prompt template
253 | if cli.prompt_template.len() != 2 {
254 | return Err(ServerError::ArgumentError(
255 | "LlamaEdge RAG API server requires two prompt templates: one for chat model, one for embedding model.".to_owned(),
256 | ));
257 | }
258 | let prompt_template_str: String = cli
259 | .prompt_template
260 | .iter()
261 | .map(|n| n.to_string())
262 | .collect::<Vec<String>>()
263 | .join(",");
264 | info!(target: "stdout", "prompt_template: {}", prompt_template_str);
265 |
266 | // log reverse prompt
267 | if let Some(reverse_prompt) = &cli.reverse_prompt {
268 | info!(target: "stdout", "reverse_prompt: {}", reverse_prompt);
269 | }
270 |
271 | // log n_predict
272 | info!(target: "stdout", "n_predict: {}", &cli.n_predict);
273 |
274 | // log n_gpu_layers
275 | info!(target: "stdout", "n_gpu_layers: {}", &cli.n_gpu_layers);
276 |
277 | // log split_mode
278 | info!(target: "stdout", "split_mode: {}", cli.split_mode);
279 |
280 | // log main GPU
281 | if let Some(main_gpu) = &cli.main_gpu {
282 | info!(target: "stdout", "main_gpu: {}", main_gpu);
283 | }
284 |
285 | // log tensor split
286 | if let Some(tensor_split) = &cli.tensor_split {
287 | info!(target: "stdout", "tensor_split: {}", tensor_split);
288 | }
289 |
290 | // log threads
291 | info!(target: "stdout", "threads: {}", cli.threads);
292 |
293 | // log grammar
294 | if !cli.grammar.is_empty() {
295 | info!(target: "stdout", "grammar: {}", &cli.grammar);
296 | }
297 |
298 | // log json schema
299 | if let Some(json_schema) = &cli.json_schema {
300 | info!(target: "stdout", "json_schema: {}", json_schema);
301 | }
302 |
303 | // log rag prompt
304 | if let Some(rag_prompt) = &cli.rag_prompt {
305 | info!(target: "stdout", "rag_prompt: {}", rag_prompt);
306 |
307 | GLOBAL_RAG_PROMPT.set(rag_prompt.clone()).map_err(|_| {
308 | ServerError::Operation("Failed to set `GLOBAL_RAG_PROMPT`.".to_string())
309 | })?;
310 | }
311 |
312 | // log qdrant url
313 | if !is_valid_url(&cli.qdrant_url) {
314 | let err_msg = format!(
315 | "The URL of Qdrant REST API is invalid: {}.",
316 | &cli.qdrant_url
317 | );
318 |
319 | // log
320 | {
321 | error!(target: "stdout", "{}", err_msg);
322 | }
323 |
324 | return Err(ServerError::ArgumentError(err_msg));
325 | }
326 | info!(target: "stdout", "qdrant_url: {}", &cli.qdrant_url);
327 |
328 | if cli.qdrant_collection_name.len() != cli.qdrant_limit.len()
329 | && cli.qdrant_limit.len() > 1
330 | {
332 | return Err(ServerError::ArgumentError(
333 | "LlamaEdge RAG API server requires the same number of Qdrant collection names and limits; or the limit is only one value for all collections.".to_owned(),
334 | ));
335 | }
336 |
337 | if cli.qdrant_collection_name.len() != cli.qdrant_score_threshold.len()
338 | && cli.qdrant_score_threshold.len() > 1
339 | {
341 | return Err(ServerError::ArgumentError(
342 | "LlamaEdge RAG API server requires the same number of Qdrant collection names and score thresholds; or the score threshold is only one value for all collections.".to_owned(),
343 | ));
344 | }
345 |
346 | // log qdrant collection name
347 | let qdrant_collection_name_str: String = cli
348 | .qdrant_collection_name
349 | .iter()
350 | .map(|n| n.to_string())
351 | .collect::<Vec<String>>()
352 | .join(",");
353 | info!(target: "stdout", "qdrant_collection_name: {}", qdrant_collection_name_str);
354 |
355 | // log qdrant limit
356 | let qdrant_limit_str: String = cli
357 | .qdrant_limit
358 | .iter()
359 | .map(|n| n.to_string())
360 | .collect::<Vec<String>>()
361 | .join(",");
362 | info!(target: "stdout", "qdrant_limit: {}", qdrant_limit_str);
363 |
364 | // log qdrant score threshold
365 | let qdrant_score_threshold_str: String = cli
366 | .qdrant_score_threshold
367 | .iter()
368 | .map(|n| n.to_string())
369 | .collect::<Vec<String>>()
370 | .join(",");
371 | info!(target: "stdout", "qdrant_score_threshold: {}", qdrant_score_threshold_str);
372 |
373 | // create qdrant config
374 | let mut qdrant_config_vec: Vec<QdrantConfig> = Vec::new();
375 | for (idx, col_name) in cli.qdrant_collection_name.iter().enumerate() {
376 | let limit = if cli.qdrant_limit.len() == 1 {
377 | cli.qdrant_limit[0]
378 | } else {
379 | cli.qdrant_limit[idx]
380 | };
381 |
382 | let score_threshold = if cli.qdrant_score_threshold.len() == 1 {
383 | cli.qdrant_score_threshold[0]
384 | } else {
385 | cli.qdrant_score_threshold[idx]
386 | };
387 |
388 | let qdrant_config = QdrantConfig {
389 | url: cli.qdrant_url.clone(),
390 | collection_name: col_name.clone(),
391 | limit,
392 | score_threshold,
393 | };
394 |
395 | qdrant_config_vec.push(qdrant_config);
396 | }
397 |
398 | // log chunk capacity
399 | info!(target: "stdout", "chunk_capacity: {}", &cli.chunk_capacity);
400 |
401 | // log context window
402 | info!(target: "stdout", "context_window: {}", &cli.context_window);
403 | CONTEXT_WINDOW
404 | .set(cli.context_window)
405 | .map_err(|e| ServerError::Operation(format!("Failed to set `CONTEXT_WINDOW`. {}", e)))?;
406 |
407 | // RAG policy
408 | info!(target: "stdout", "rag_policy: {}", &cli.policy);
409 |
410 | let mut policy = cli.policy;
411 | if policy == MergeRagContextPolicy::SystemMessage && !cli.prompt_template[0].has_system_prompt()
412 | {
413 | warn!(target: "server_config", "{}", format!("The chat model does not support system message, while the '--policy' option sets to \"{}\". Update the RAG policy to {}.", cli.policy, MergeRagContextPolicy::LastUserMessage));
414 |
415 | policy = MergeRagContextPolicy::LastUserMessage;
416 | }
417 |
418 | // keyword search configuration
419 | if let Some(kw_search_url) = &cli.kw_search_url {
420 | let kw_search_config = KeywordSearchConfig {
421 | url: kw_search_url.clone(),
422 | };
423 | KW_SEARCH_CONFIG.set(kw_search_config).unwrap();
424 | }
425 |
426 | // log include_usage
427 | info!(target: "stdout", "include_usage: {}", cli.include_usage);
428 |
429 | // create metadata for chat model
430 | let chat_metadata = GgmlMetadataBuilder::new(
431 | cli.model_name[0].clone(),
432 | cli.model_alias[0].clone(),
433 | cli.prompt_template[0],
434 | )
435 | .with_ctx_size(cli.ctx_size[0])
436 | .with_reverse_prompt(cli.reverse_prompt)
437 | .with_batch_size(cli.batch_size[0])
438 | .with_ubatch_size(cli.ubatch_size[0])
439 | .with_n_predict(cli.n_predict)
440 | .with_n_gpu_layers(cli.n_gpu_layers)
441 | .with_split_mode(cli.split_mode.clone())
442 | .with_main_gpu(cli.main_gpu)
443 | .with_tensor_split(cli.tensor_split.clone())
444 | .with_threads(cli.threads)
445 | .with_grammar(cli.grammar)
446 | .with_json_schema(cli.json_schema)
447 | .enable_plugin_log(true)
448 | .enable_debug_log(plugin_debug)
449 | .include_usage(cli.include_usage)
450 | .build();
451 |
452 | let chat_model_info = ModelConfig {
453 | name: chat_metadata.model_name.clone(),
454 | ty: "chat".to_string(),
455 | prompt_template: chat_metadata.prompt_template,
456 | n_predict: chat_metadata.n_predict,
457 | reverse_prompt: chat_metadata.reverse_prompt.clone(),
458 | n_gpu_layers: chat_metadata.n_gpu_layers,
459 | ctx_size: chat_metadata.ctx_size,
460 | batch_size: chat_metadata.batch_size,
461 | ubatch_size: chat_metadata.ubatch_size,
462 | temperature: chat_metadata.temperature,
463 | top_p: chat_metadata.top_p,
464 | repeat_penalty: chat_metadata.repeat_penalty,
465 | presence_penalty: chat_metadata.presence_penalty,
466 | frequency_penalty: chat_metadata.frequency_penalty,
467 | split_mode: chat_metadata.split_mode.clone(),
468 | main_gpu: chat_metadata.main_gpu,
469 | tensor_split: chat_metadata.tensor_split.clone(),
470 | };
471 |
472 | // chat model
473 | let chat_models = [chat_metadata];
474 |
475 | // create metadata for embedding model
476 | let embedding_metadata = GgmlMetadataBuilder::new(
477 | cli.model_name[1].clone(),
478 | cli.model_alias[1].clone(),
479 | cli.prompt_template[1],
480 | )
481 | .with_ctx_size(cli.ctx_size[1])
482 | .with_batch_size(cli.batch_size[1])
483 | .with_ubatch_size(cli.ubatch_size[1])
484 | .with_split_mode(cli.split_mode)
485 | .with_main_gpu(cli.main_gpu)
486 | .with_tensor_split(cli.tensor_split)
487 | .with_threads(cli.threads)
488 | .enable_plugin_log(true)
489 | .enable_debug_log(plugin_debug)
490 | .build();
491 |
492 | let embedding_model_info = ModelConfig {
493 | name: embedding_metadata.model_name.clone(),
494 | ty: "embedding".to_string(),
495 | ctx_size: embedding_metadata.ctx_size,
496 | batch_size: embedding_metadata.batch_size,
497 | ubatch_size: embedding_metadata.ubatch_size,
498 | prompt_template: embedding_metadata.prompt_template,
499 | n_predict: embedding_metadata.n_predict,
500 | reverse_prompt: embedding_metadata.reverse_prompt.clone(),
501 | n_gpu_layers: embedding_metadata.n_gpu_layers,
502 | temperature: embedding_metadata.temperature,
503 | top_p: embedding_metadata.top_p,
504 | repeat_penalty: embedding_metadata.repeat_penalty,
505 | presence_penalty: embedding_metadata.presence_penalty,
506 | frequency_penalty: embedding_metadata.frequency_penalty,
507 | split_mode: embedding_metadata.split_mode.clone(),
508 | main_gpu: embedding_metadata.main_gpu,
509 | tensor_split: embedding_metadata.tensor_split.clone(),
510 | };
511 |
512 | // embedding model
513 | let embedding_models = [embedding_metadata];
514 |
515 | // create rag config
516 | let rag_config = RagConfig {
517 | chat_model: chat_model_info,
518 | embedding_model: embedding_model_info,
519 | policy,
520 | };
521 |
522 | // initialize the core context
523 | llama_core::init_ggml_rag_context(&chat_models[..], &embedding_models[..]).map_err(|e| {
524 | let err_msg = format!("Failed to initialize the core context. {}", e);
525 |
526 | // log
527 | error!(target: "stdout", "{}", &err_msg);
528 |
529 | ServerError::Operation(err_msg)
530 | })?;
531 |
532 | // get the plugin version info
533 | let plugin_info =
534 | llama_core::get_plugin_info().map_err(|e| ServerError::Operation(e.to_string()))?;
535 | let plugin_version = format!(
536 | "b{build_number} (commit {commit_id})",
537 | build_number = plugin_info.build_number,
538 | commit_id = plugin_info.commit_id,
539 | );
540 |
541 | // log plugin version
542 | info!(target: "stdout", "plugin_ggml_version: {}", &plugin_version);
543 |
544 | // socket address
545 | let addr = match cli.socket_addr {
546 | Some(addr) => addr,
547 | None => SocketAddr::from(([0, 0, 0, 0], cli.port)),
548 | };
549 | let port = addr.port().to_string();
550 |
551 | // get the environment variable `NODE_VERSION`
552 | // Note that this is for satisfying the requirement of `gaianet-node` project.
553 | let node = std::env::var("NODE_VERSION").ok();
554 | if node.is_some() {
555 | // log node version
556 | info!(target: "stdout", "gaianet_node_version: {}", node.as_ref().unwrap());
557 | }
558 |
559 | // create server info
560 | let server_info = ServerInfo {
561 | node,
562 | server: ApiServer {
563 | ty: "rag".to_string(),
564 | version: env!("CARGO_PKG_VERSION").to_string(),
565 | plugin_version,
566 | port,
567 | },
568 | rag_config,
569 | qdrant_config: qdrant_config_vec,
570 | extras: HashMap::new(),
571 | };
572 | SERVER_INFO
573 | .set(RwLock::new(server_info))
574 | .map_err(|_| ServerError::Operation("Failed to set `SERVER_INFO`.".to_string()))?;
575 |
576 | let new_service = make_service_fn(move |conn: &AddrStream| {
577 | // log socket address
578 | info!(target: "stdout", "remote_addr: {}, local_addr: {}", conn.remote_addr().to_string(), conn.local_addr().to_string());
579 |
580 | let web_ui = cli.web_ui.to_string_lossy().to_string();
581 | let chunk_capacity = cli.chunk_capacity;
582 |
583 | async move {
584 | Ok::<_, Error>(service_fn(move |req| {
585 | handle_request(req, chunk_capacity, web_ui.clone())
586 | }))
587 | }
588 | });
589 |
590 | let tcp_listener = TcpListener::bind(addr).await.unwrap();
591 | info!(target: "stdout", "Listening on {}", addr);
592 |
593 | let server = Server::from_tcp(tcp_listener.into_std().unwrap())
594 | .unwrap()
595 | .serve(new_service);
596 |
597 | match server.await {
598 | Ok(_) => Ok(()),
599 | Err(e) => Err(ServerError::Operation(e.to_string())),
600 | }
601 | }
602 |
603 | async fn handle_request(
604 | req: Request<Body>,
605 | chunk_capacity: usize,
606 | web_ui: String,
607 | ) -> Result<Response<Body>, hyper::Error> {
608 | let path_str = req.uri().path();
609 | let path_buf = PathBuf::from(path_str);
610 | let mut path_iter = path_buf.iter();
611 | path_iter.next(); // Must be Some(OsStr::new(&path::MAIN_SEPARATOR.to_string()))
612 | let root_path = path_iter.next().unwrap_or_default();
613 | let root_path = "/".to_owned() + root_path.to_str().unwrap_or_default();
614 |
615 | // check if the API key is valid
616 | if let Some(auth_header) = req.headers().get("authorization") {
617 | if !auth_header.is_empty() {
618 | let auth_header = match auth_header.to_str() {
619 | Ok(auth_header) => auth_header,
620 | Err(e) => {
621 | let err_msg = format!("Failed to get authorization header: {}", e);
622 | return Ok(error::unauthorized(err_msg));
623 | }
624 | };
625 |
626 | let api_key = auth_header.split(" ").nth(1).unwrap_or_default();
627 | info!(target: "stdout", "API Key: {}", api_key);
628 |
629 | if let Some(stored_api_key) = LLAMA_API_KEY.get() {
630 | if api_key != stored_api_key {
631 | let err_msg = "Invalid API key.";
632 | return Ok(error::unauthorized(err_msg));
633 | }
634 | }
635 | }
636 | }
637 |
638 | // log request
639 | {
640 | let method = hyper::http::Method::as_str(req.method()).to_string();
641 | let path = req.uri().path().to_string();
642 | let version = format!("{:?}", req.version());
643 | if req.method() == hyper::http::Method::POST {
644 | let size: u64 = match req.headers().get("content-length") {
645 | Some(content_length) => content_length.to_str().unwrap().parse().unwrap(),
646 | None => 0,
647 | };
648 |
649 | info!(target: "stdout", "method: {}, http_version: {}, content-length: {}", method, version, size);
650 | info!(target: "stdout", "endpoint: {}", path);
651 | } else {
652 | info!(target: "stdout", "method: {}, http_version: {}", method, version);
653 | info!(target: "stdout", "endpoint: {}", path);
654 | }
655 | }
656 |
657 | let response = match root_path.as_str() {
658 | "/echo" => Response::new(Body::from("echo test")),
659 | "/v1" => backend::handle_llama_request(req, chunk_capacity).await,
660 | _ => static_response(path_str, web_ui),
661 | };
662 |
663 | // log response
664 | {
665 | let status_code = response.status();
666 | if status_code.as_u16() < 400 {
667 | // log response
668 | let response_version = format!("{:?}", response.version());
669 | info!(target: "stdout", "response_version: {}", response_version);
670 | let response_body_size: u64 = response.body().size_hint().lower();
671 | info!(target: "stdout", "response_body_size: {}", response_body_size);
672 | let response_status = status_code.as_u16();
673 | info!(target: "stdout", "response_status: {}", response_status);
674 | let response_is_success = status_code.is_success();
675 | info!(target: "stdout", "response_is_success: {}", response_is_success);
676 | } else {
677 | let response_version = format!("{:?}", response.version());
678 | error!(target: "stdout", "response_version: {}", response_version);
679 | let response_body_size: u64 = response.body().size_hint().lower();
680 | error!(target: "stdout", "response_body_size: {}", response_body_size);
681 | let response_status = status_code.as_u16();
682 | error!(target: "stdout", "response_status: {}", response_status);
683 | let response_is_success = status_code.is_success();
684 | error!(target: "stdout", "response_is_success: {}", response_is_success);
685 | let response_is_client_error = status_code.is_client_error();
686 | error!(target: "stdout", "response_is_client_error: {}", response_is_client_error);
687 | let response_is_server_error = status_code.is_server_error();
688 | error!(target: "stdout", "response_is_server_error: {}", response_is_server_error);
689 | }
690 | }
691 |
692 | Ok(response)
693 | }
694 |
695 | fn static_response(path_str: &str, root: String) -> Response<Body> {
696 | let path = match path_str {
697 | "/" => "/index.html",
698 | _ => path_str,
699 | };
700 |
701 | let mime = mime_guess::from_path(path);
702 |
703 | match std::fs::read(format!("{root}/{path}")) {
704 | Ok(content) => Response::builder()
705 | .status(StatusCode::OK)
706 | .header(header::CONTENT_TYPE, mime.first_or_text_plain().to_string())
707 | .body(Body::from(content))
708 | .unwrap(),
709 | Err(_) => {
710 | let body = Body::from(std::fs::read(format!("{root}/404.html")).unwrap_or_default());
711 | Response::builder()
712 | .status(StatusCode::NOT_FOUND)
713 | .header(header::CONTENT_TYPE, "text/html")
714 | .body(body)
715 | .unwrap()
716 | }
717 | }
718 | }
719 |
720 | #[derive(Debug, Clone, Default, Serialize, Deserialize)]
721 | pub(crate) struct QdrantConfig {
722 | pub(crate) url: String,
723 | pub(crate) collection_name: String,
724 | pub(crate) limit: u64,
725 | pub(crate) score_threshold: f32,
726 | }
727 | impl fmt::Display for QdrantConfig {
728 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
729 | write!(
730 | f,
731 | "url: {}, collection_name: {}, limit: {}, score_threshold: {}",
732 | self.url, self.collection_name, self.limit, self.score_threshold
733 | )
734 | }
735 | }
736 |
737 | #[derive(Debug, Serialize, Deserialize)]
738 | pub(crate) struct ModelConfig {
739 | // model name
740 | name: String,
741 | // type: chat or embedding
742 | #[serde(rename = "type")]
743 | ty: String,
744 | pub ctx_size: u64,
745 | pub batch_size: u64,
746 | pub ubatch_size: u64,
747 | pub prompt_template: PromptTemplateType,
748 | pub n_predict: i32,
749 | #[serde(skip_serializing_if = "Option::is_none")]
750 | pub reverse_prompt: Option<String>,
751 | pub n_gpu_layers: u64,
752 | pub temperature: f64,
753 | pub top_p: f64,
754 | pub repeat_penalty: f64,
755 | pub presence_penalty: f64,
756 | pub frequency_penalty: f64,
757 | pub split_mode: String,
758 | #[serde(skip_serializing_if = "Option::is_none")]
759 | pub main_gpu: Option<u64>,
760 | #[serde(skip_serializing_if = "Option::is_none")]
761 | pub tensor_split: Option<String>,
762 | }
763 |
764 | #[derive(Debug, Serialize, Deserialize)]
765 | pub(crate) struct ServerInfo {
766 | #[serde(skip_serializing_if = "Option::is_none")]
767 | #[serde(rename = "node_version")]
768 | node: Option<String>,
769 | #[serde(rename = "api_server")]
770 | server: ApiServer,
771 | #[serde(flatten)]
772 | rag_config: RagConfig,
773 | qdrant_config: Vec,
774 | extras: HashMap<String, String>,
775 | }
776 |
777 | #[derive(Debug, Serialize, Deserialize)]
778 | pub(crate) struct ApiServer {
779 | #[serde(rename = "type")]
780 | ty: String,
781 | version: String,
782 | #[serde(rename = "ggml_plugin_version")]
783 | plugin_version: String,
784 | port: String,
785 | }
786 |
787 | #[derive(Debug, Serialize, Deserialize)]
788 | pub(crate) struct RagConfig {
789 | pub chat_model: ModelConfig,
790 | pub embedding_model: ModelConfig,
791 | #[serde(rename = "rag_policy")]
792 | pub policy: MergeRagContextPolicy,
793 | }
794 |
795 | #[derive(Debug, Clone, Default)]
796 | pub(crate) struct KeywordSearchConfig {
797 | pub url: String,
798 | }
799 |
--------------------------------------------------------------------------------
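The configuration types above are plain `serde` structs, so they serialize and deserialize directly. The snippet below is a minimal in-crate sketch (not part of the source tree) that exercises `QdrantConfig` with placeholder values, assuming `serde_json` is available to the crate:

```rust
// Illustrative sketch only, not part of src/main.rs: constructs a QdrantConfig
// with placeholder values, prints it via the Display impl above, and
// round-trips it through JSON with serde_json.
fn qdrant_config_roundtrip() {
    let config = QdrantConfig {
        url: "http://localhost:6333".to_string(),
        collection_name: "default".to_string(),
        limit: 5,
        score_threshold: 0.5,
    };

    // Prints: url: http://localhost:6333, collection_name: default, limit: 5, score_threshold: 0.5
    println!("{config}");

    let json = serde_json::to_string(&config).expect("serialize QdrantConfig");
    let parsed: QdrantConfig = serde_json::from_str(&json).expect("deserialize QdrantConfig");
    assert_eq!(parsed.collection_name, config.collection_name);
}
```

Because the fields are `pub(crate)`, constructing the struct literally like this only compiles inside the crate itself.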
/src/utils.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 | use url::Url;
3 |
4 | pub(crate) fn is_valid_url(url: &str) -> bool {
5 | Url::parse(url).is_ok()
6 | }
7 |
8 | pub(crate) fn gen_chat_id() -> String {
9 | format!("chatcmpl-{}", uuid::Uuid::new_v4())
10 | }
11 |
12 | #[derive(
13 | Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, clap::ValueEnum, Serialize, Deserialize,
14 | )]
15 | #[serde(rename_all = "lowercase")]
16 | pub(crate) enum LogLevel {
17 | /// Describes messages about the values of variables and the flow of
18 | /// control within a program.
19 | Trace,
20 |
21 | /// Describes messages likely to be of interest to someone debugging a
22 | /// program.
23 | Debug,
24 |
25 | /// Describes messages likely to be of interest to someone monitoring a
26 | /// program.
27 | Info,
28 |
29 | /// Describes messages indicating hazardous situations.
30 | Warn,
31 |
32 | /// Describes messages indicating serious errors.
33 | Error,
34 |
35 | /// Describes messages indicating fatal errors.
36 | Critical,
37 | }
38 | impl From<LogLevel> for log::LevelFilter {
39 | fn from(level: LogLevel) -> Self {
40 | match level {
41 | LogLevel::Trace => log::LevelFilter::Trace,
42 | LogLevel::Debug => log::LevelFilter::Debug,
43 | LogLevel::Info => log::LevelFilter::Info,
44 | LogLevel::Warn => log::LevelFilter::Warn,
45 | LogLevel::Error => log::LevelFilter::Error,
46 | LogLevel::Critical => log::LevelFilter::Error,
47 | }
48 | }
49 | }
50 | impl std::fmt::Display for LogLevel {
51 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
52 | match self {
53 | LogLevel::Trace => write!(f, "trace"),
54 | LogLevel::Debug => write!(f, "debug"),
55 | LogLevel::Info => write!(f, "info"),
56 | LogLevel::Warn => write!(f, "warn"),
57 | LogLevel::Error => write!(f, "error"),
58 | LogLevel::Critical => write!(f, "critical"),
59 | }
60 | }
61 | }
62 | impl std::str::FromStr for LogLevel {
63 | type Err = String;
64 |     fn from_str(s: &str) -> Result<Self, Self::Err> {
65 | match s.to_lowercase().as_str() {
66 | "trace" => Ok(LogLevel::Trace),
67 | "debug" => Ok(LogLevel::Debug),
68 | "info" => Ok(LogLevel::Info),
69 | "warn" => Ok(LogLevel::Warn),
70 | "error" => Ok(LogLevel::Error),
71 | "critical" => Ok(LogLevel::Critical),
72 | _ => Err(format!("Invalid log level: {}", s)),
73 | }
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
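As a quick illustration of how `LogLevel` is meant to behave (this sketch is not part of src/utils.rs): parsing is case-insensitive, `Display` lower-cases the variant name, and both `Error` and `Critical` collapse into `log::LevelFilter::Error`.

```rust
// Illustrative sketch only, not part of src/utils.rs.
fn log_level_example() {
    use std::str::FromStr;

    // FromStr lower-cases its input, so mixed case is accepted.
    let level = LogLevel::from_str("Critical").expect("valid log level");
    assert_eq!(level.to_string(), "critical");

    // From<LogLevel> maps Critical (and Error) to LevelFilter::Error.
    let filter: log::LevelFilter = level.into();
    assert_eq!(filter, log::LevelFilter::Error);

    // Anything else is rejected with an "Invalid log level" message.
    assert!(LogLevel::from_str("verbose").is_err());
}
```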
/tests/test_chat.hurl:
--------------------------------------------------------------------------------
1 | # test /v1/models endpoint
2 | GET http://localhost:8080/v1/models
3 | Accept: application/json
4 | HTTP 200
5 | [Asserts]
6 | jsonpath "$.data[0].id" == "Qwen2-1.5B-Instruct"
7 |
8 | # test /v1/chat/completions endpoint
9 | POST http://localhost:8080/v1/chat/completions
10 | Accept: application/json
11 | Content-Type: application/json
12 | ```json
13 | {
14 | "messages": [
15 | {
16 | "role": "user",
17 | "content": "What is the capital of France?"
18 | }
19 | ],
20 | "model": "Qwen2-1.5B-Instruct",
21 | "stream": false
22 | }
23 | ```
24 | HTTP 200
25 | [Asserts]
26 | jsonpath "$.model" == "Qwen2-1.5B-Instruct"
27 | jsonpath "$.choices[0].message.content" contains "Paris"
28 |
29 |
30 | # test /v1/chat/completions endpoint
31 | # Test purpose: The model name is incorrect
32 | POST http://localhost:8080/v1/chat/completions
33 | Accept: application/json
34 | Content-Type: application/json
35 | ```json
36 | {
37 | "messages": [
38 | {
39 | "role": "user",
40 | "content": "What is the capital of France?"
41 | }
42 | ],
43 | "model": "Qwen2-1.5B-Instruct-invalid",
44 | "stream": false
45 | }
46 | ```
47 | HTTP 200
48 | [Asserts]
49 | jsonpath "$.model" == "Qwen2-1.5B-Instruct"
50 | jsonpath "$.choices[0].message.content" contains "Paris"
--------------------------------------------------------------------------------
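Note that the second request above deliberately passes an invalid model name yet still asserts HTTP 200 with the real model id in the response: the server answers with whichever chat model is actually loaded, and the test pins that behaviour down.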
/tests/test_embeddings.hurl:
--------------------------------------------------------------------------------
1 |
2 | # test /v1/embeddings endpoint
3 | POST http://localhost:8080/v1/embeddings
4 | Accept: application/json
5 | Content-Type: application/json
6 | ```json
7 | {
8 | "model": "nomic-embed-text-v1.5",
9 | "input": [
10 | "Gaianet is revolutionizing the AI landscape with a distributed AI infrastructure that seeks to decentralize the dominance of major players such as OpenAI, Google, and Anthropic. By leveraging a network of edge-computing nodes owned by individuals around the world, Gaianet enables hosting of both open-source and finely-tuned models. This infrastructure is designed to cater to diverse AI demands, offering a scalable alternative to traditional centralized servers.",
11 | "The inception of Gaianet is driven by the necessity to address key issues in the current AI industry: censorship and bias in AI outputs, lack of privacy for user data, and the high costs associated with accessing and developing on centralized AI models. These challenges have restricted the dissemination of unbiased information, compromised data security, and erected barriers to innovation and broader application of AI technologies.",
12 | "In an effort to mitigate these issues, Gaianet proposes a model that is inherently more inclusive and secure. It offers a platform where individuals can contribute their computing resources, expertise, and proprietary data to foster a collaborative advancement of AI technology. This approach not only democratizes access to AI but also aligns with principles of privacy and equitable distribution of resources.",
13 | "The Gaianet ecosystem comprises four key components:\nComponent 1, The capability for individuals to run an Agent style API service on private devices, utilizing proprietary knowledge and finely-tuned models.\nComponent 2, Compatibility with the OpenAI API to ensure seamless integration with existing toolchains.\nComponent 3, A decentralized marketplace for AI services and components.\nComponent 4, Open-source development tools for creating LLM/AI applications.",
14 | "Features of Gaia nodes are extensive and include integrated data, knowledge, and computing resources; support for open-source and OpenAI models; customization with fine-tuned LLMs; and the ability to chain multiple specialized models for enhanced functionality. Additionally, nodes offer knowledge bases for LLM enhancements, memorized chat history for extended conversations, and the capability for actions and function calls to external systems.",
15 | "Gaianet also introduces a suite of ancillary offerings aimed at developers, including tools for fine-tuning LLMs, marketplaces for fine-tuned models and embeddings, and SDKs for various integrations.",
16 | "The team behind Gaianet is spearheaded by CEO Matt Wright, previously Director of Fellowship Accelerator at Consensys and Director of DAOs. Chief Scientist Allen Yang brings his expertise from the University of California at Berkeley, while COO Shashank Sripada contributes his entrepreneurial experience. Advisors Michael Yuan and Brian Shin add further depth to the project's foundation.",
17 | "Gaianet's official website can be accessed at [https://www.gaianet.ai/](https://www.gaianet.ai/), where you can find comprehensive information and resources about their initiatives.\n\nFor community engagement and discussions, Gaianet has a Discord server available at [https://discord.com/invite/gaianet-ai](https://discord.com/invite/gaianet-ai), welcoming members to join.",
18 | "To stay updated with Gaianet's latest news and insights, you can follow their Telegram channel at [https://t.me/Gaianet_AI](https://t.me/Gaianet_AI).\n\nInsightful articles and updates from Gaianet are regularly published on their Medium blog at [https://medium.com/@Gaianet.ai](https://medium.com/@Gaianet.ai).",
19 | "For the latest announcements and engagements, follow Gaianet on Twitter at [https://twitter.com/Gaianet_AI](https://twitter.com/Gaianet_AI).\n\nDevelopers and contributors can explore Gaianet's GitHub repository at [https://github.com/GaiaNet-AI/](https://github.com/GaiaNet-AI/)."
20 | ]
21 | }
22 | ```
23 | HTTP 200
24 | [Asserts]
25 | jsonpath "$.model" == "nomic-embed-text-v1.5"
26 | jsonpath "$.data" count > 0
27 |
28 | # test /v1/embeddings endpoint
29 | # Test purpose: The model name is incorrect
30 | POST http://localhost:8080/v1/embeddings
31 | Accept: application/json
32 | Content-Type: application/json
33 | ```json
34 | {
35 | "model": "nomic-embed-text-v1.5-invalid",
36 | "input": [
37 | "Gaianet is revolutionizing the AI landscape with a distributed AI infrastructure that seeks to decentralize the dominance of major players such as OpenAI, Google, and Anthropic. By leveraging a network of edge-computing nodes owned by individuals around the world, Gaianet enables hosting of both open-source and finely-tuned models. This infrastructure is designed to cater to diverse AI demands, offering a scalable alternative to traditional centralized servers.",
38 | "The inception of Gaianet is driven by the necessity to address key issues in the current AI industry: censorship and bias in AI outputs, lack of privacy for user data, and the high costs associated with accessing and developing on centralized AI models. These challenges have restricted the dissemination of unbiased information, compromised data security, and erected barriers to innovation and broader application of AI technologies.",
39 | "In an effort to mitigate these issues, Gaianet proposes a model that is inherently more inclusive and secure. It offers a platform where individuals can contribute their computing resources, expertise, and proprietary data to foster a collaborative advancement of AI technology. This approach not only democratizes access to AI but also aligns with principles of privacy and equitable distribution of resources.",
40 | "The Gaianet ecosystem comprises four key components:\nComponent 1, The capability for individuals to run an Agent style API service on private devices, utilizing proprietary knowledge and finely-tuned models.\nComponent 2, Compatibility with the OpenAI API to ensure seamless integration with existing toolchains.\nComponent 3, A decentralized marketplace for AI services and components.\nComponent 4, Open-source development tools for creating LLM/AI applications.",
41 | "Features of Gaia nodes are extensive and include integrated data, knowledge, and computing resources; support for open-source and OpenAI models; customization with fine-tuned LLMs; and the ability to chain multiple specialized models for enhanced functionality. Additionally, nodes offer knowledge bases for LLM enhancements, memorized chat history for extended conversations, and the capability for actions and function calls to external systems.",
42 | "Gaianet also introduces a suite of ancillary offerings aimed at developers, including tools for fine-tuning LLMs, marketplaces for fine-tuned models and embeddings, and SDKs for various integrations.",
43 | "The team behind Gaianet is spearheaded by CEO Matt Wright, previously Director of Fellowship Accelerator at Consensys and Director of DAOs. Chief Scientist Allen Yang brings his expertise from the University of California at Berkeley, while COO Shashank Sripada contributes his entrepreneurial experience. Advisors Michael Yuan and Brian Shin add further depth to the project's foundation.",
44 | "Gaianet's official website can be accessed at [https://www.gaianet.ai/](https://www.gaianet.ai/), where you can find comprehensive information and resources about their initiatives.\n\nFor community engagement and discussions, Gaianet has a Discord server available at [https://discord.com/invite/gaianet-ai](https://discord.com/invite/gaianet-ai), welcoming members to join.",
45 | "To stay updated with Gaianet's latest news and insights, you can follow their Telegram channel at [https://t.me/Gaianet_AI](https://t.me/Gaianet_AI).\n\nInsightful articles and updates from Gaianet are regularly published on their Medium blog at [https://medium.com/@Gaianet.ai](https://medium.com/@Gaianet.ai).",
46 | "For the latest announcements and engagements, follow Gaianet on Twitter at [https://twitter.com/Gaianet_AI](https://twitter.com/Gaianet_AI).\n\nDevelopers and contributors can explore Gaianet's GitHub repository at [https://github.com/GaiaNet-AI/](https://github.com/GaiaNet-AI/)."
47 | ]
48 | }
49 | ```
50 | HTTP 200
51 | [Asserts]
52 | jsonpath "$.model" == "nomic-embed-text-v1.5"
53 | jsonpath "$.data" count > 0
--------------------------------------------------------------------------------
/tests/test_rag.hurl:
--------------------------------------------------------------------------------
1 |
2 | # test /v1/chat/completions endpoint
3 | POST http://localhost:8080/v1/chat/completions
4 | Accept: application/json
5 | Content-Type: application/json
6 | ```json
7 | {
8 | "messages": [
9 | {
10 | "role": "user",
11 | "content": "What is the location of Paris, France along the Siene River?"
12 | }
13 | ],
14 | "model": "Qwen2-1.5B-Instruct",
15 | "stream": false
16 | }
17 | ```
18 | HTTP 200
19 | [Asserts]
20 | jsonpath "$.model" == "Qwen2-1.5B-Instruct"
21 | jsonpath "$.choices[0].message.content" contains "Paris"
22 |
--------------------------------------------------------------------------------
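The `.hurl` files above are driven by the test_api_server.yml workflow and assume a server already listening on `localhost:8080` with the `Qwen2-1.5B-Instruct` chat model and the `nomic-embed-text-v1.5` embedding model loaded. Locally they can typically be exercised with the hurl CLI, e.g. `hurl --test tests/test_chat.hurl`, though the exact invocation used in CI may differ.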