├── .cargo └── config.toml ├── .github └── workflows │ ├── build.yml │ ├── release.yml │ └── test_api_server.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── check_code_before_commit.sh ├── docs ├── assets │ └── kw_search.png ├── keyword_search.md └── vectordb.md ├── src ├── backend │ ├── ggml.rs │ └── mod.rs ├── error.rs ├── main.rs └── utils.rs └── tests ├── test_chat.hurl ├── test_embeddings.hurl └── test_rag.hurl /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | target = "wasm32-wasip1" 3 | rustflags = ["--cfg", "wasmedge", "--cfg", "tokio_unstable"] 4 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | branches: 6 | - dev 7 | - main 8 | - release-* 9 | - feat-* 10 | - ci-* 11 | - refactor-* 12 | - fix-* 13 | - test-* 14 | paths: 15 | - '.github/workflows/build.yml' 16 | - '**/Cargo.toml' 17 | - '**/Cargo.lock' 18 | - '**/*.rs' 19 | - '**/*.sh' 20 | pull_request: 21 | branches: 22 | - dev 23 | - main 24 | types: [opened, synchronize, reopened] 25 | paths: 26 | - '.github/workflows/**' 27 | - '**/Cargo.toml' 28 | - '**/*.rs' 29 | - '**/*.sh' 30 | 31 | jobs: 32 | build-wasm: 33 | runs-on: ${{ matrix.os }} 34 | strategy: 35 | matrix: 36 | os: [ubuntu-22.04, macos-13, macos-14, macos-15] 37 | steps: 38 | - name: Clone project 39 | id: checkout 40 | uses: actions/checkout@v3 41 | 42 | - name: Install Rust-nightly 43 | uses: actions-rust-lang/setup-rust-toolchain@v1 44 | with: 45 | toolchain: nightly 46 | target: wasm32-wasip1 47 | components: rustfmt, clippy 48 | 49 | - name: Install Rust-stable 50 | uses: actions-rust-lang/setup-rust-toolchain@v1 51 | with: 52 | target: wasm32-wasip1 53 | 54 | - name: Download wasi-sdk for x86_64-macos 55 | if: matrix.os == 'macos-13' 56 | run: | 57 | curl -LO https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-24/wasi-sdk-24.0-x86_64-macos.tar.gz 58 | tar -xzvf wasi-sdk-24.0-x86_64-macos.tar.gz 59 | mv wasi-sdk-24.0-x86_64-macos wasi-sdk-24.0 60 | 61 | - name: Download wasi-sdk for arm64-macos 62 | if: matrix.os == 'macos-14' || matrix.os == 'macos-15' 63 | run: | 64 | curl -LO https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-24/wasi-sdk-24.0-arm64-macos.tar.gz 65 | tar -xzvf wasi-sdk-24.0-arm64-macos.tar.gz 66 | mv wasi-sdk-24.0-arm64-macos wasi-sdk-24.0 67 | 68 | - name: Build api-server for linux 69 | id: build_api_server_linux 70 | if: startsWith(matrix.os, 'ubuntu') 71 | env: 72 | RUSTFLAGS: "--cfg wasmedge --cfg tokio_unstable" 73 | run: | 74 | cargo +nightly fmt --all -- --check 75 | cargo +nightly clippy --target wasm32-wasip1 -- -D warnings 76 | cargo build --target wasm32-wasip1 --release 77 | 78 | - name: Build api-server for macos 79 | id: build_api_server_macos 80 | if: startsWith(matrix.os, 'macos') 81 | env: 82 | WASI_SDK_PATH: /Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0 83 | CC: "/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/bin/clang --sysroot=/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/share/wasi-sysroot" 84 | RUSTFLAGS: "--cfg wasmedge --cfg tokio_unstable" 85 | run: | 86 | cargo +nightly fmt --all -- --check 87 | cargo +nightly clippy --target wasm32-wasip1 -- -D warnings 88 | cargo build --target wasm32-wasip1 --release 89 | 
-------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | workflow_dispatch: # manual trigger release 5 | inputs: 6 | create_release: 7 | description: 'Create new release' 8 | required: true 9 | type: boolean 10 | release_version: 11 | description: "Version (e.g. 1.0.0)" 12 | required: true 13 | type: string 14 | 15 | jobs: 16 | build-and-release: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Clone project 20 | id: checkout 21 | uses: actions/checkout@v3 22 | 23 | - name: Setup rustup 24 | id: rustup 25 | uses: actions-rust-lang/setup-rust-toolchain@v1 26 | with: 27 | target: wasm32-wasip1 28 | 29 | - name: Build rag-api-server 30 | id: build_rag_api_server 31 | env: 32 | RUSTFLAGS: "--cfg wasmedge --cfg tokio_unstable" 33 | run: | 34 | cargo clean 35 | cargo build --target wasm32-wasip1 --release 36 | cp target/wasm32-wasip1/release/rag-api-server.wasm rag-api-server.wasm 37 | 38 | - name: Calculate checksum 39 | id: checksum 40 | run: | 41 | sha256sum *.wasm > SHA256SUM 42 | 43 | echo "Debug info(SHA256SUM):" 44 | cat SHA256SUM 45 | 46 | - name: Tag and release names 47 | id: tag_and_release_names 48 | run: | 49 | echo "tag_name=${{ github.event.inputs.release_version }}" >> $GITHUB_OUTPUT 50 | echo "release_name=LlamaEdge-RAG ${{ github.event.inputs.release_version }}" >> $GITHUB_OUTPUT 51 | 52 | - name: Create Release and Upload Release Asset 53 | if: ${{ github.event.inputs.create_release == 'true' && github.ref == 'refs/heads/main'}} 54 | uses: softprops/action-gh-release@v1 55 | with: 56 | name: ${{ steps.tag_and_release_names.outputs.release_name }} 57 | tag_name: ${{ steps.tag_and_release_names.outputs.tag_name }} 58 | body: TODO New Release. 
59 | draft: true 60 | prerelease: true 61 | files: | 62 | rag-api-server.wasm 63 | SHA256SUM 64 | -------------------------------------------------------------------------------- /.github/workflows/test_api_server.yml: -------------------------------------------------------------------------------- 1 | name: Test API Server 2 | 3 | on: 4 | push: 5 | branches: 6 | - dev 7 | - main 8 | - release-* 9 | - feat-* 10 | - ci-* 11 | - refactor-* 12 | - fix-* 13 | - test-* 14 | paths: 15 | - '.github/workflows/test_api_server.yml' 16 | - '**/Cargo.toml' 17 | - '**/Cargo.lock' 18 | - '**/*.rs' 19 | - '**/*.sh' 20 | - '**/.cargo/config.toml' 21 | - 'tests/*.hurl' 22 | pull_request: 23 | branches: 24 | - dev 25 | - main 26 | types: [opened, synchronize, reopened] 27 | paths: 28 | - '.github/workflows/**' 29 | - '**/Cargo.toml' 30 | - '**/*.rs' 31 | - '**/*.sh' 32 | - 'tests/*.hurl' 33 | 34 | jobs: 35 | test-api-server-ubuntu: 36 | runs-on: ubuntu-latest 37 | strategy: 38 | matrix: 39 | wasmedge_version: [0.14.1] 40 | ggml_version: [b5074] 41 | steps: 42 | - name: Clone project 43 | id: checkout 44 | uses: actions/checkout@v3 45 | 46 | - name: Install Rust-nightly 47 | uses: actions-rust-lang/setup-rust-toolchain@v1 48 | with: 49 | toolchain: nightly 50 | target: wasm32-wasip1 51 | components: rustfmt, clippy 52 | 53 | - name: Install Rust-stable 54 | uses: actions-rust-lang/setup-rust-toolchain@v1 55 | with: 56 | target: wasm32-wasip1 57 | 58 | - name: Install WasmEdge 59 | run: | 60 | curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s -- -v ${{ matrix.wasmedge_version }} --ggmlbn=${{ matrix.ggml_version }} 61 | ls -al $HOME/.wasmedge/bin 62 | 63 | - name: Install Hurl 64 | run: | 65 | curl --location --remote-name https://github.com/Orange-OpenSource/hurl/releases/download/5.0.1/hurl_5.0.1_amd64.deb 66 | sudo apt update && sudo apt install ./hurl_5.0.1_amd64.deb 67 | 68 | - name: Install Qdrant and download snapshot 69 | run: | 70 | # Download Qdrant 71 | curl -LO https://github.com/qdrant/qdrant/releases/download/v1.11.4/qdrant-x86_64-unknown-linux-musl.tar.gz 72 | tar -xvf qdrant-x86_64-unknown-linux-musl.tar.gz 73 | rm qdrant-x86_64-unknown-linux-musl.tar.gz 74 | 75 | # Download snapshot 76 | curl -LO https://huggingface.co/datasets/gaianet/paris/resolve/main/paris_768_nomic-embed-text-v1.5-f16.snapshot 77 | mv paris_768_nomic-embed-text-v1.5-f16.snapshot default.snapshot 78 | 79 | ls -al 80 | 81 | - name: Build rag-api-server on linux 82 | env: 83 | RUSTFLAGS: "--cfg wasmedge --cfg tokio_unstable" 84 | run: | 85 | cargo build -p rag-api-server --release 86 | cp target/wasm32-wasip1/release/rag-api-server.wasm ./rag-api-server.wasm 87 | 88 | - name: Download models 89 | run: | 90 | curl -LO https://huggingface.co/second-state/Qwen2-1.5B-Instruct-GGUF/resolve/main/Qwen2-1.5B-Instruct-Q3_K_M.gguf 91 | curl -LO https://huggingface.co/second-state/Nomic-embed-text-v1.5-Embedding-GGUF/resolve/main/nomic-embed-text-v1.5-f16.gguf 92 | 93 | - name: Start Qdrant 94 | run: | 95 | nohup ./qdrant > ./start-qdrant.log 2>&1 & 96 | sleep 5 97 | cat start-qdrant.log 98 | 99 | - name: Import the default.snapshot file to Qdrant 100 | run: | 101 | curl -s -X POST http://localhost:6333/collections/default/snapshots/upload?priority=snapshot -H 'Content-Type:multipart/form-data' -F 'snapshot=@default.snapshot' 102 | 103 | - name: Start rag-api-server for testing chat completions 104 | run: | 105 | nohup $HOME/.wasmedge/bin/wasmedge --dir .:. 
--nn-preload default:GGML:AUTO:Qwen2-1.5B-Instruct-Q3_K_M.gguf --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5-f16.gguf rag-api-server.wasm --model-name Qwen2-1.5B-Instruct,nomic-embed-text-v1.5 --ctx-size 4096,512 --batch-size 16,512 --prompt-template chatml,embedding --rag-policy last-user-message --socket-addr 0.0.0.0:8080 > ./start-llamaedge.log 2>&1 & 106 | sleep 30 107 | cat start-llamaedge.log 108 | 109 | # - name: Run test_chat.hurl 110 | # run: | 111 | # hurl --test --jobs 1 ./tests/test_chat.hurl 112 | 113 | - name: Run test_embeddings.hurl 114 | run: | 115 | hurl --test --jobs 1 ./tests/test_embeddings.hurl 116 | 117 | # - name: Run test_rag.hurl 118 | # run: | 119 | # hurl --test --jobs 1 ./tests/test_rag.hurl 120 | 121 | - name: Stop rag-api-server for testing chat completions 122 | run: | 123 | pkill -f wasmedge 124 | 125 | - name: Stop Qdrant 126 | run: | 127 | pkill -f qdrant 128 | 129 | test-api-server-macos-13: 130 | runs-on: macos-13 131 | needs: test-api-server-ubuntu 132 | strategy: 133 | matrix: 134 | wasmedge_version: [0.14.1] 135 | ggml_version: [b5074] 136 | steps: 137 | - name: Clone project 138 | id: checkout 139 | uses: actions/checkout@v3 140 | 141 | - name: Install Rust-nightly 142 | uses: actions-rust-lang/setup-rust-toolchain@v1 143 | with: 144 | toolchain: nightly 145 | target: wasm32-wasip1 146 | components: rustfmt, clippy 147 | 148 | - name: Install Rust-stable 149 | uses: actions-rust-lang/setup-rust-toolchain@v1 150 | with: 151 | target: wasm32-wasip1 152 | 153 | - name: Download wasi-sdk for x86_64-macos 154 | run: | 155 | curl -LO https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-24/wasi-sdk-24.0-x86_64-macos.tar.gz 156 | tar -xzvf wasi-sdk-24.0-x86_64-macos.tar.gz 157 | mv wasi-sdk-24.0-x86_64-macos wasi-sdk-24.0 158 | 159 | - name: Install WasmEdge 160 | run: | 161 | curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s -- -v ${{ matrix.wasmedge_version }} --ggmlbn=${{ matrix.ggml_version }} 162 | ls -al $HOME/.wasmedge/bin 163 | 164 | - name: Install Hurl 165 | run: | 166 | brew install hurl 167 | 168 | - name: Install Qdrant and download snapshot 169 | run: | 170 | # Download Qdrant 171 | curl -LO https://github.com/qdrant/qdrant/releases/download/v1.11.4/qdrant-x86_64-apple-darwin.tar.gz 172 | tar -xzvf qdrant-x86_64-apple-darwin.tar.gz 173 | rm qdrant-x86_64-apple-darwin.tar.gz 174 | 175 | # Download snapshot 176 | curl -LO https://huggingface.co/datasets/gaianet/paris/resolve/main/paris_768_nomic-embed-text-v1.5-f16.snapshot 177 | mv paris_768_nomic-embed-text-v1.5-f16.snapshot default.snapshot 178 | 179 | ls -al 180 | 181 | - name: Build rag-api-server on macos-13 182 | env: 183 | WASI_SDK_PATH: /Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0 184 | CC: "/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/bin/clang --sysroot=/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/share/wasi-sysroot" 185 | RUSTFLAGS: "--cfg wasmedge --cfg tokio_unstable" 186 | run: | 187 | cargo build -p rag-api-server --release 188 | cp target/wasm32-wasip1/release/rag-api-server.wasm ./rag-api-server.wasm 189 | 190 | - name: Download models 191 | run: | 192 | curl -LO https://huggingface.co/second-state/Qwen2-1.5B-Instruct-GGUF/resolve/main/Qwen2-1.5B-Instruct-Q3_K_M.gguf 193 | curl -LO https://huggingface.co/second-state/Nomic-embed-text-v1.5-Embedding-GGUF/resolve/main/nomic-embed-text-v1.5-f16.gguf 194 | 195 | - name: Start Qdrant 196 | run: | 197 | nohup 
./qdrant > ./start-qdrant.log 2>&1 & 198 | sleep 5 199 | cat start-qdrant.log 200 | 201 | - name: Import the default.snapshot file to Qdrant 202 | run: | 203 | curl -s -X POST http://localhost:6333/collections/default/snapshots/upload?priority=snapshot -H 'Content-Type:multipart/form-data' -F 'snapshot=@default.snapshot' 204 | 205 | - name: Start rag-api-server for testing chat completions 206 | run: | 207 | nohup $HOME/.wasmedge/bin/wasmedge --dir .:. --nn-preload default:GGML:AUTO:Qwen2-1.5B-Instruct-Q3_K_M.gguf --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5-f16.gguf rag-api-server.wasm --model-name Qwen2-1.5B-Instruct,nomic-embed-text-v1.5 --ctx-size 4096,512 --batch-size 16,512 --prompt-template chatml,embedding --rag-policy last-user-message --socket-addr 0.0.0.0:8080 > ./start-llamaedge.log 2>&1 & 208 | sleep 30 209 | cat start-llamaedge.log 210 | 211 | # - name: Run test_chat.hurl 212 | # run: | 213 | # hurl --test --jobs 1 ./tests/test_chat.hurl 214 | 215 | - name: Run test_embeddings.hurl 216 | run: | 217 | hurl --test --jobs 1 ./tests/test_embeddings.hurl 218 | 219 | # - name: Run test_rag.hurl 220 | # run: | 221 | # hurl --test --jobs 1 ./tests/test_rag.hurl 222 | 223 | - name: Stop rag-api-server for testing chat completions 224 | run: | 225 | pkill -f wasmedge 226 | 227 | - name: Stop Qdrant 228 | run: | 229 | pkill -f qdrant 230 | 231 | test-api-server-macos-14: 232 | runs-on: macos-14 233 | needs: test-api-server-macos-13 234 | strategy: 235 | matrix: 236 | wasmedge_version: [0.14.1] 237 | ggml_version: [b5074] 238 | steps: 239 | - name: Clone project 240 | id: checkout 241 | uses: actions/checkout@v3 242 | 243 | - name: Install Rust-nightly 244 | uses: actions-rust-lang/setup-rust-toolchain@v1 245 | with: 246 | toolchain: nightly 247 | target: wasm32-wasip1 248 | components: rustfmt, clippy 249 | 250 | - name: Install Rust-stable 251 | uses: actions-rust-lang/setup-rust-toolchain@v1 252 | with: 253 | target: wasm32-wasip1 254 | 255 | - name: Download wasi-sdk for arm64-macos 256 | run: | 257 | curl -LO https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-24/wasi-sdk-24.0-arm64-macos.tar.gz 258 | tar -xzvf wasi-sdk-24.0-arm64-macos.tar.gz 259 | mv wasi-sdk-24.0-arm64-macos wasi-sdk-24.0 260 | 261 | - name: Install WasmEdge 262 | run: | 263 | curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s -- -v ${{ matrix.wasmedge_version }} --ggmlbn=${{ matrix.ggml_version }} 264 | ls -al $HOME/.wasmedge/bin 265 | 266 | - name: Install Hurl 267 | run: | 268 | brew install hurl 269 | 270 | - name: Install Qdrant and download snapshot 271 | run: | 272 | # Download Qdrant 273 | curl -LO https://github.com/qdrant/qdrant/releases/download/v1.11.4/qdrant-aarch64-apple-darwin.tar.gz 274 | tar -xzvf qdrant-aarch64-apple-darwin.tar.gz 275 | rm qdrant-aarch64-apple-darwin.tar.gz 276 | 277 | # Download snapshot 278 | curl -LO https://huggingface.co/datasets/gaianet/paris/resolve/main/paris_768_nomic-embed-text-v1.5-f16.snapshot 279 | mv paris_768_nomic-embed-text-v1.5-f16.snapshot default.snapshot 280 | 281 | ls -al 282 | 283 | - name: Build rag-api-server on macos-14 284 | env: 285 | WASI_SDK_PATH: /Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0 286 | CC: "/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/bin/clang --sysroot=/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/share/wasi-sysroot" 287 | RUSTFLAGS: "--cfg wasmedge --cfg tokio_unstable" 288 | run: | 289 | cargo build -p 
rag-api-server --release 290 | cp target/wasm32-wasip1/release/rag-api-server.wasm ./rag-api-server.wasm 291 | 292 | - name: Download models 293 | run: | 294 | curl -LO https://huggingface.co/second-state/Qwen2-1.5B-Instruct-GGUF/resolve/main/Qwen2-1.5B-Instruct-Q3_K_M.gguf 295 | curl -LO https://huggingface.co/second-state/Nomic-embed-text-v1.5-Embedding-GGUF/resolve/main/nomic-embed-text-v1.5-f16.gguf 296 | 297 | - name: Start Qdrant 298 | run: | 299 | nohup ./qdrant > ./start-qdrant.log 2>&1 & 300 | sleep 5 301 | cat start-qdrant.log 302 | 303 | - name: Import the default.snapshot file to Qdrant 304 | run: | 305 | curl -s -X POST http://localhost:6333/collections/default/snapshots/upload?priority=snapshot -H 'Content-Type:multipart/form-data' -F 'snapshot=@default.snapshot' 306 | 307 | - name: Start rag-api-server for testing chat completions 308 | run: | 309 | nohup $HOME/.wasmedge/bin/wasmedge --dir .:. --nn-preload default:GGML:AUTO:Qwen2-1.5B-Instruct-Q3_K_M.gguf --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5-f16.gguf rag-api-server.wasm --model-name Qwen2-1.5B-Instruct,nomic-embed-text-v1.5 --ctx-size 4096,512 --batch-size 16,512 --prompt-template chatml,embedding --rag-policy last-user-message --socket-addr 0.0.0.0:8080 > ./start-llamaedge.log 2>&1 & 310 | sleep 30 311 | cat start-llamaedge.log 312 | 313 | # - name: Run test_chat.hurl 314 | # run: | 315 | # hurl --test --jobs 1 ./tests/test_chat.hurl 316 | 317 | - name: Run test_embeddings.hurl 318 | run: | 319 | hurl --test --jobs 1 ./tests/test_embeddings.hurl 320 | 321 | # - name: Run test_rag.hurl 322 | # run: | 323 | # hurl --test --jobs 1 ./tests/test_rag.hurl 324 | 325 | - name: Stop rag-api-server for testing chat completions 326 | run: | 327 | pkill -f wasmedge 328 | 329 | - name: Stop Qdrant 330 | run: | 331 | pkill -f qdrant 332 | 333 | test-api-server-macos-15: 334 | runs-on: macos-15 335 | needs: test-api-server-macos-14 336 | strategy: 337 | matrix: 338 | wasmedge_version: [0.14.1] 339 | ggml_version: [b5074] 340 | steps: 341 | - name: Clone project 342 | id: checkout 343 | uses: actions/checkout@v3 344 | 345 | - name: Install Rust-nightly 346 | uses: actions-rust-lang/setup-rust-toolchain@v1 347 | with: 348 | toolchain: nightly 349 | target: wasm32-wasip1 350 | components: rustfmt, clippy 351 | 352 | - name: Install Rust-stable 353 | uses: actions-rust-lang/setup-rust-toolchain@v1 354 | with: 355 | target: wasm32-wasip1 356 | 357 | - name: Download wasi-sdk for arm64-macos 358 | run: | 359 | curl -LO https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-24/wasi-sdk-24.0-arm64-macos.tar.gz 360 | tar -xzvf wasi-sdk-24.0-arm64-macos.tar.gz 361 | mv wasi-sdk-24.0-arm64-macos wasi-sdk-24.0 362 | 363 | - name: Install WasmEdge 364 | run: | 365 | curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s -- -v ${{ matrix.wasmedge_version }} --ggmlbn=${{ matrix.ggml_version }} 366 | ls -al $HOME/.wasmedge/bin 367 | 368 | - name: Install Hurl 369 | run: | 370 | brew install hurl 371 | 372 | - name: Install Qdrant and download snapshot 373 | run: | 374 | # Download Qdrant 375 | curl -LO https://github.com/qdrant/qdrant/releases/download/v1.11.4/qdrant-aarch64-apple-darwin.tar.gz 376 | tar -xzvf qdrant-aarch64-apple-darwin.tar.gz 377 | rm qdrant-aarch64-apple-darwin.tar.gz 378 | 379 | # Download snapshot 380 | curl -LO https://huggingface.co/datasets/gaianet/paris/resolve/main/paris_768_nomic-embed-text-v1.5-f16.snapshot 381 | mv 
paris_768_nomic-embed-text-v1.5-f16.snapshot default.snapshot 382 | 383 | ls -al 384 | 385 | - name: Build rag-api-server on macos-14 386 | env: 387 | WASI_SDK_PATH: /Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0 388 | CC: "/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/bin/clang --sysroot=/Users/runner/work/rag-api-server/rag-api-server/wasi-sdk-24.0/share/wasi-sysroot" 389 | RUSTFLAGS: "--cfg wasmedge --cfg tokio_unstable" 390 | run: | 391 | cargo build -p rag-api-server --release 392 | cp target/wasm32-wasip1/release/rag-api-server.wasm ./rag-api-server.wasm 393 | 394 | - name: Download models 395 | run: | 396 | curl -LO https://huggingface.co/second-state/Qwen2-1.5B-Instruct-GGUF/resolve/main/Qwen2-1.5B-Instruct-Q3_K_M.gguf 397 | curl -LO https://huggingface.co/second-state/Nomic-embed-text-v1.5-Embedding-GGUF/resolve/main/nomic-embed-text-v1.5-f16.gguf 398 | 399 | - name: Start Qdrant 400 | run: | 401 | nohup ./qdrant > ./start-qdrant.log 2>&1 & 402 | sleep 5 403 | cat start-qdrant.log 404 | 405 | - name: Import the default.snapshot file to Qdrant 406 | run: | 407 | curl -s -X POST http://localhost:6333/collections/default/snapshots/upload?priority=snapshot -H 'Content-Type:multipart/form-data' -F 'snapshot=@default.snapshot' 408 | 409 | - name: Start rag-api-server for testing chat completions 410 | run: | 411 | nohup $HOME/.wasmedge/bin/wasmedge --dir .:. --nn-preload default:GGML:AUTO:Qwen2-1.5B-Instruct-Q3_K_M.gguf --nn-preload embedding:GGML:AUTO:nomic-embed-text-v1.5-f16.gguf rag-api-server.wasm --model-name Qwen2-1.5B-Instruct,nomic-embed-text-v1.5 --ctx-size 4096,512 --batch-size 16,512 --prompt-template chatml,embedding --rag-policy last-user-message --socket-addr 0.0.0.0:8080 > ./start-llamaedge.log 2>&1 & 412 | sleep 30 413 | cat start-llamaedge.log 414 | 415 | # - name: Run test_chat.hurl 416 | # run: | 417 | # hurl --test --jobs 1 ./tests/test_chat.hurl 418 | 419 | - name: Run test_embeddings.hurl 420 | run: | 421 | hurl --test --jobs 1 ./tests/test_embeddings.hurl 422 | 423 | # - name: Run test_rag.hurl 424 | # run: | 425 | # hurl --test --jobs 1 ./tests/test_rag.hurl 426 | 427 | - name: Stop rag-api-server for testing chat completions 428 | run: | 429 | pkill -f wasmedge 430 | 431 | - name: Stop Qdrant 432 | run: | 433 | pkill -f qdrant 434 | 435 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rag-api-server" 3 | version = "0.13.15" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | anyhow = "1" 8 | chat-prompts = { version = "=0.26.1" } 9 | chrono = "0.4.38" 10 | clap = { version = "4.4.6", features = ["cargo"] } 11 | either = "1.12.0" 12 | endpoints = { version = "=0.25.1", features = ["rag", "index"] } 13 | futures = { version = "0.3.6", default-features = false, features = ["async-await", "std"] } 14 | futures-util = "0.3" 15 | hyper = { version = "0.14", features = ["full"] } 16 | llama-core = { version = "=0.30.0", features = ["logging", "rag", "index"] } 17 | log = { version = "0.4.21", features = ["std", "kv", "kv_serde"] } 18 | mime_guess = "2.0.4" 19 | multipart-2021 = "0.19.0" 20 | once_cell = "1.18" 21 | reqwest = { version = "0.11", default-features = false, features = ["json", "stream", 
"rustls-tls"] } 22 | serde = { version = "1.0", features = ["derive"] } 23 | serde_json = "1.0" 24 | thiserror = "1" 25 | tokio = { version = "^1.36", features = ["io-util", "fs", "net", "time", "rt", "macros"] } 26 | url = "^2.5" 27 | uuid = { version = "1.4", features = ["v4", "fast-rng", "macro-diagnostics"] } 28 | walkdir = "2.5.0" 29 | wasi-logger = { version = "0.1.2", features = ["kv"] } 30 | 31 | [patch.crates-io] 32 | socket2 = { git = "https://github.com/second-state/socket2.git", branch = "v0.5.x" } 33 | reqwest = { git = "https://github.com/second-state/wasi_reqwest.git", branch = "0.11.x" } 34 | hyper = { git = "https://github.com/second-state/wasi_hyper.git", branch = "v0.14.x" } 35 | tokio = { git = "https://github.com/second-state/wasi_tokio.git", branch = "v1.36.x" } 36 | 37 | [features] 38 | default = [] 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LlamaEdge-RAG API Server 2 | 3 | 4 | 5 | 6 | 7 | - [LlamaEdge-RAG API Server](#llamaedge-rag-api-server) 8 | - [Introduction](#introduction) 9 | - [Endpoints](#endpoints) 10 | - [List models](#list-models) 11 | - [Chat completions](#chat-completions) 12 | - [Upload a file](#upload-a-file) 13 | - [List all files](#list-all-files) 14 | - [Retrieve information about a specific file](#retrieve-information-about-a-specific-file) 15 | - [Retrieve the content of a specific file](#retrieve-the-content-of-a-specific-file) 16 | - [Download a specific file](#download-a-specific-file) 17 | - [Delete a file](#delete-a-file) 18 | - [Segment a file to chunks](#segment-a-file-to-chunks) 19 | - [Compute embeddings for user query or file chunks](#compute-embeddings-for-user-query-or-file-chunks) 20 | - [Generate embeddings from a file](#generate-embeddings-from-a-file) 21 | - [Get server information](#get-server-information) 22 | - [Retrieve context](#retrieve-context) 23 | - [Setup](#setup) 24 | - [Build](#build) 25 | - [Execute](#execute) 26 | - [Usage Example](#usage-example) 27 | - [Set Log Level](#set-log-level) 28 | 29 | 30 | 31 | ## Introduction 32 | 33 | LlamaEdge-RAG API server provides a group of OpenAI-compatible web APIs for the Retrieval-Augmented Generation (RAG) applications. The server is implemented in WebAssembly (Wasm) and runs on [WasmEdge Runtime](https://github.com/WasmEdge/WasmEdge). 34 | 35 | ### Endpoints 36 | 37 | #### List models 38 | 39 | `rag-api-server` provides a POST API `/v1/models` to list currently available models. 40 | 41 |
Example 42 | 43 | You can use `curl` to test it in a new terminal: 44 | 45 | ```bash 46 | curl -X POST http://localhost:8080/v1/models -H 'accept:application/json' 47 | ``` 48 | 49 | If the command runs successfully, you should see output similar to the following in your terminal: 50 | 51 | ```json 52 | { 53 | "object":"list", 54 | "data":[ 55 | { 56 | "id":"llama-2-chat", 57 | "created":1697084821, 58 | "object":"model", 59 | "owned_by":"Not specified" 60 | } 61 | ] 62 | } 63 | ``` 64 | 65 |
66 | 67 | #### Chat completions 68 | 69 | Ask a question using OpenAI's JSON message format. 70 | 71 |
Example 72 | 73 | ```bash 74 | curl -X POST http://localhost:8080/v1/chat/completions \ 75 | -H 'accept:application/json' \ 76 | -H 'Content-Type: application/json' \ 77 | -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."}, {"role":"user", "content": "Who is Robert Oppenheimer?"}], "model":"llama-2-chat"}' 78 | ``` 79 | 80 | Here is the response. 81 | 82 | ```json 83 | { 84 | "id":"", 85 | "object":"chat.completion", 86 | "created":1697092593, 87 | "model":"llama-2-chat", 88 | "choices":[ 89 | { 90 | "index":0, 91 | "message":{ 92 | "role":"assistant", 93 | "content":"Robert Oppenheimer was an American theoretical physicist and director of the Manhattan Project, which developed the atomic bomb during World War II. He is widely regarded as one of the most important physicists of the 20th century and is known for his contributions to the development of quantum mechanics and the theory of the atomic nucleus. Oppenheimer was also a prominent figure in the post-war nuclear weapons debate, advocating for international control and regulation of nuclear weapons." 94 | }, 95 | "finish_reason":"stop" 96 | } 97 | ], 98 | "usage":{ 99 | "prompt_tokens":9, 100 | "completion_tokens":12, 101 | "total_tokens":21 102 | } 103 | } 104 | ``` 105 | 106 |
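Since the endpoint follows OpenAI's chat completions schema, a streaming response can also be requested. The sketch below assumes the standard OpenAI `"stream": true` request field (the server's `--include-usage` option, listed in the CLI help further down, controls whether usage statistics are included in the streamed chunks):

```bash
# Request a streamed chat completion; -N disables curl's output buffering
curl -N -X POST http://localhost:8080/v1/chat/completions \
  -H 'accept:application/json' \
  -H 'Content-Type: application/json' \
  -d '{"messages":[{"role":"user", "content": "Who is Robert Oppenheimer?"}], "model":"llama-2-chat", "stream": true}'
```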
107 | 108 | #### Upload a file 109 | 110 | In RAG applications, uploading files is the necessary first step: the uploaded documents are later segmented into chunks and converted into embeddings for retrieval. 111 | 112 |
Example: Upload a file 113 | 114 | The following command uploads a text file [paris.txt](https://huggingface.co/datasets/gaianet/paris/raw/main/paris.txt) to the API server via the `/v1/files` endpoint: 115 | 116 | ```bash 117 | curl -X POST http://127.0.0.1:8080/v1/files -F "file=@paris.txt" 118 | ``` 119 | 120 | If the command is successful, you should see output similar to the following in your terminal: 121 | 122 | ```json 123 | { 124 | "id": "file_4bc24593-2a57-4646-af16-028855e7802e", 125 | "bytes": 2161, 126 | "created_at": 1711611801, 127 | "filename": "paris.txt", 128 | "object": "file", 129 | "purpose": "assistants" 130 | } 131 | ``` 132 | 133 | The `id` and `filename` fields are needed for the next step, for example, segmenting the uploaded file into chunks for computing embeddings. 134 | 135 |
136 | 137 | #### List all files 138 | 139 | The `GET /v1/files` endpoint lists all files on the server. 140 | 141 |
Example: List files 142 | 143 | The following command lists all files on the server via the `/v1/files` endpoint: 144 | 145 | ```bash 146 | curl -X GET http://127.0.0.1:8080/v1/files 147 | ``` 148 | 149 | If the command is successful, you should see output similar to the following in your terminal: 150 | 151 | ```json 152 | { 153 | "object": "list", 154 | "data": [ 155 | { 156 | "id": "file_33d9188d-5060-4141-8c52-ae148fd15f6a", 157 | "bytes": 17039, 158 | "created_at": 1718296362, 159 | "filename": "test-123.m4a", 160 | "object": "file", 161 | "purpose": "assistants" 162 | }, 163 | { 164 | "id": "file_8c6439da-df59-4b9a-bb5e-dba4b2f23c04", 165 | "bytes": 17039, 166 | "created_at": 1718294169, 167 | "filename": "test-123.m4a", 168 | "object": "file", 169 | "purpose": "assistants" 170 | } 171 | ] 172 | } 173 | ``` 174 | 175 |
176 | 177 | #### Retrieve information about a specific file 178 | 179 | The `GET /v1/files/{file_id}` endpoint retrieves information about a specific file on the server. 180 | 181 |
Example: Retrieve information about a specific file 182 | 183 | The following command retrieves information about a specific file on the server via the `/v1/files/{file_id}` endpoint: 184 | 185 | ```bash 186 | curl -X GET http://localhost:10086/v1/files/file_b892bc81-35e9-44a6-8c01-ae915c1d3832 187 | ``` 188 | 189 | If the command is successful, you should see output similar to the following in your terminal: 190 | 191 | ```json 192 | { 193 | "id": "file_b892bc81-35e9-44a6-8c01-ae915c1d3832", 194 | "bytes": 2161, 195 | "created_at": 1715832065, 196 | "filename": "paris.txt", 197 | "object": "file", 198 | "purpose": "assistants" 199 | } 200 | ``` 201 | 202 |
203 | 204 | #### Retrieve the content of a specific file 205 | 206 | The `GET /v1/files/{file_id}/content` endpoint retrieves the content of a specific file on the server. 207 | 208 |
Example: Retrieve the content of a specific file 209 | 210 | The following command retrieves the content of a specific file on the server via the `/v1/files/{file_id}/content` endpoint: 211 | 212 | ```bash 213 | curl -X GET http://localhost:10086/v1/files/file_b892bc81-35e9-44a6-8c01-ae915c1d3832/content 214 | ``` 215 | 216 |
217 | 218 | #### Download a specific file 219 | 220 | The `GET /v1/files/download/{file_id}` endpoint downloads a specific file from the server. 221 | 222 |
Example: Download a specific file 223 | 224 | The following command downloads a specific file on the server via the `/v1/files/download/{file_id}` endpoint: 225 | 226 | ```bash 227 | curl -X GET http://localhost:10086/v1/files/download/file_b892bc81-35e9-44a6-8c01-ae915c1d3832 228 | ``` 229 | 230 |
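By default, `curl` writes the downloaded content to stdout. To save it to a local file instead, add curl's `-o` option; the output file name below is only an illustration:

```bash
# Download the file and write it to downloaded-paris.txt
curl -X GET http://localhost:10086/v1/files/download/file_b892bc81-35e9-44a6-8c01-ae915c1d3832 \
  -o downloaded-paris.txt
```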
231 | 232 | #### Delete a file 233 | 234 | The `DELETE /v1/files/{file_id}` endpoint deletes a specific file from the server. 235 | 236 |
Example: Delete a specific file 237 | 238 | The following command deletes a specific file on the server via the `/v1/files/{file_id}` endpoint: 239 | 240 | ```bash 241 | curl -X DELETE http://localhost:10086/v1/files/file_6a6d8046-fd98-410a-b70e-0a0142ec9a39 242 | ``` 243 | 244 | If the command is successful, you should see output similar to the following in your terminal: 245 | 246 | ```json 247 | { 248 | "id": "file_6a6d8046-fd98-410a-b70e-0a0142ec9a39", 249 | "object": "file", 250 | "deleted": true 251 | } 252 | ``` 253 | 254 |
255 | 256 | #### Segment a file to chunks 257 | 258 | To segment the uploaded file into chunks for computing embeddings, use the `/v1/chunks` API. 259 | 260 |
Example 261 | 262 | The following command sends the uploaded file ID and filename to the API server and gets the chunks: 263 | 264 | ```bash 265 | curl -X POST http://localhost:8080/v1/chunks \ 266 | -H 'accept:application/json' \ 267 | -H 'Content-Type: application/json' \ 268 | -d '{"id":"file_4bc24593-2a57-4646-af16-028855e7802e", "filename":"paris.txt"}' 269 | ``` 270 | 271 | The following is an example return with the generated chunks: 272 | 273 | ```json 274 | { 275 | "id": "file_4bc24593-2a57-4646-af16-028855e7802e", 276 | "filename": "paris.txt", 277 | "chunks": [ 278 | "Paris, city and capital of France, ..., for Paris has retained its importance as a centre for education and intellectual pursuits.", 279 | "Paris’s site at a crossroads ..., drawing to itself much of the talent and vitality of the provinces." 280 | ] 281 | } 282 | ``` 283 | 284 |
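The number of chunks returned depends on the server's chunking configuration (the `--chunk-capacity` option described in the CLI help caps the number of tokens per chunk, `100` by default). If `jq` is installed, a quick way to count the chunks is:

```bash
# Count the chunks produced for the uploaded file
curl -s -X POST http://localhost:8080/v1/chunks \
  -H 'accept:application/json' \
  -H 'Content-Type: application/json' \
  -d '{"id":"file_4bc24593-2a57-4646-af16-028855e7802e", "filename":"paris.txt"}' \
  | jq '.chunks | length'
```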
285 | 286 | #### Compute embeddings for user query or file chunks 287 | 288 | To compute embeddings for a user query or file chunks, use the `/v1/embeddings` API. 289 | 290 |
Example 291 | 292 | The following command sends a query to the API server and gets the embeddings as return: 293 | 294 | ```bash 295 | curl -X POST http://localhost:8080/v1/embeddings \ 296 | -H 'accept:application/json' \ 297 | -H 'Content-Type: application/json' \ 298 | -d '{"model": "e5-mistral-7b-instruct-Q5_K_M", "input":["Paris, city and capital of France, ..., for Paris has retained its importance as a centre for education and intellectual pursuits.", "Paris’s site at a crossroads ..., drawing to itself much of the talent and vitality of the provinces."]}' 299 | ``` 300 | 301 | The embeddings returned are like below: 302 | 303 | ```json 304 | { 305 | "object": "list", 306 | "data": [ 307 | { 308 | "index": 0, 309 | "object": "embedding", 310 | "embedding": [ 311 | 0.1428378969, 312 | -0.0447309874, 313 | 0.007660218049, 314 | ... 315 | -0.0128974719, 316 | -0.03543198109, 317 | 0.03974733502, 318 | 0.00946635101, 319 | -0.01531364303 320 | ] 321 | }, 322 | { 323 | "index": 1, 324 | "object": "embedding", 325 | "embedding": [ 326 | 0.0697753951, 327 | -0.0001159032545, 328 | 0.02073983476, 329 | ... 330 | 0.03565846011, 331 | -0.04550019652, 332 | 0.02691745944, 333 | 0.02498772368, 334 | -0.003226313973 335 | ] 336 | } 337 | ], 338 | "model": "e5-mistral-7b-instruct-Q5_K_M", 339 | "usage": { 340 | "prompt_tokens": 491, 341 | "completion_tokens": 0, 342 | "total_tokens": 491 343 | } 344 | } 345 | ``` 346 | 347 |
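The length of each returned vector is determined by the embedding model and must match the vector dimension of the Qdrant collection that stores the embeddings. If `jq` is installed, a quick way to inspect it is:

```bash
# Print the dimension of the first returned embedding vector
curl -s -X POST http://localhost:8080/v1/embeddings \
  -H 'accept:application/json' \
  -H 'Content-Type: application/json' \
  -d '{"model": "e5-mistral-7b-instruct-Q5_K_M", "input":["What is the location of Paris, France?"]}' \
  | jq '.data[0].embedding | length'
```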
348 | 349 | #### Generate embeddings from a file 350 | 351 | The `/v1/create/rag` endpoint provides a one-click way to convert a text or markdown file directly into embeddings. Its effect is equivalent to running `/v1/files` + `/v1/chunks` + `/v1/embeddings` sequentially. Note that the `--chunk-capacity` CLI option is required for this endpoint; its default value is `100`, and you can set a different value when starting the LlamaEdge-RAG API server. 352 | 353 |
Example 354 | 355 | The following command uploads a text file [paris.txt](https://huggingface.co/datasets/gaianet/paris/raw/main/paris.txt) to the API server via the `/v1/create/rag` endpoint: 356 | 357 | ```bash 358 | curl -X POST http://127.0.0.1:8080/v1/create/rag -F "file=@paris.txt" 359 | ``` 360 | 361 | The embeddings returned are like below: 362 | 363 | ```json 364 | { 365 | "object": "list", 366 | "data": [ 367 | { 368 | "index": 0, 369 | "object": "embedding", 370 | "embedding": [ 371 | 0.1428378969, 372 | -0.0447309874, 373 | 0.007660218049, 374 | ... 375 | -0.0128974719, 376 | -0.03543198109, 377 | 0.03974733502, 378 | 0.00946635101, 379 | -0.01531364303 380 | ] 381 | }, 382 | { 383 | "index": 1, 384 | "object": "embedding", 385 | "embedding": [ 386 | 0.0697753951, 387 | -0.0001159032545, 388 | 0.02073983476, 389 | ... 390 | 0.03565846011, 391 | -0.04550019652, 392 | 0.02691745944, 393 | 0.02498772368, 394 | -0.003226313973 395 | ] 396 | } 397 | ], 398 | "model": "e5-mistral-7b-instruct-Q5_K_M", 399 | "usage": { 400 | "prompt_tokens": 491, 401 | "completion_tokens": 0, 402 | "total_tokens": 491 403 | } 404 | } 405 | ``` 406 | 407 |
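If the server persists the computed embeddings to the configured Qdrant instance (as the usage example at the end of this README implies), you can inspect the target collection with Qdrant's REST API. The command below assumes Qdrant's default REST port (`6333`) and the default collection name:

```bash
# Show collection info, including the number of stored points
curl -s http://localhost:6333/collections/default
```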
408 | 409 | #### Get server information 410 | 411 | The `/v1/info` endpoint provides information about the API server, including the server version, the parameters of the loaded models, and more. 412 | 413 |
Example 414 | 415 | You can use `curl` to test it on a new terminal: 416 | 417 | ```bash 418 | curl -X POST http://localhost:8080/v1/info -H 'accept:application/json' 419 | ``` 420 | 421 | If the command runs successfully, you should see the similar output as below in your terminal: 422 | 423 | ```json 424 | { 425 | "version": "0.3.4", 426 | "plugin_version": "b2694 (commit 0d56246f)", 427 | "port": "8080", 428 | "models": [ 429 | { 430 | "name": "Llama-2-7b-chat-hf-Q5_K_M", 431 | "type": "chat", 432 | "prompt_template": "Llama2Chat", 433 | "n_predict": 1024, 434 | "n_gpu_layers": 100, 435 | "ctx_size": 4096, 436 | "batch_size": 512, 437 | "temperature": 1.0, 438 | "top_p": 1.0, 439 | "repeat_penalty": 1.1, 440 | "presence_penalty": 0.0, 441 | "frequency_penalty": 0.0 442 | }, 443 | { 444 | "name": "all-MiniLM-L6-v2-ggml-model-f16", 445 | "type": "embedding", 446 | "prompt_template": "Llama2Chat", 447 | "n_predict": 1024, 448 | "n_gpu_layers": 100, 449 | "ctx_size": 384, 450 | "batch_size": 512, 451 | "temperature": 1.0, 452 | "top_p": 1.0, 453 | "repeat_penalty": 1.1, 454 | "presence_penalty": 0.0, 455 | "frequency_penalty": 0.0 456 | } 457 | ], 458 | "qdrant_config": { 459 | "url": "http://localhost:6333", 460 | "collection_name": "default", 461 | "limit": 5, 462 | "score_threshold": 0.4 463 | } 464 | } 465 | ``` 466 | 467 |
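Assuming `jq` is installed, a convenient way to pull out just the name and type of each loaded model from the response:

```bash
# List the name and type of every model reported by the server
curl -s -X POST http://localhost:8080/v1/info -H 'accept:application/json' \
  | jq '.models[] | {name, type}'
```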
468 | 469 | #### Retrieve context 470 | 471 | The `/v1/retrieve` endpoint accepts a query and returns the retrieval results. 472 | 473 |
Example 474 | 475 | You can use `curl` to test it on a new terminal: 476 | 477 | ```bash 478 | curl -X POST http://localhost:8080/v1/retrieve \ 479 | -H 'accept:application/json' \ 480 | -H 'Content-Type: application/json' \ 481 | -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."}, {"role":"user", "content": "What is the location of Paris, France along the Seine River?"}], "model":"llama-2-chat"}' 482 | ``` 483 | 484 | If the command runs successfully, you should see the similar output as below in your terminal: 485 | 486 | ```json 487 | { 488 | "points": [ 489 | { 490 | "source": "\"Paris is located in northern central France, in a north-bending arc of the river Seine whose crest includes two islands, the Île Saint-Louis and the larger Île de la Cité, which form the oldest part of the city. The river's mouth on the English Channel is about 233 mi downstream from the city. The city is spread widely on both banks of the river. Overall, the city is relatively flat, and the lowest point is 35 m above sea level. Paris has several prominent hills, the highest of which is Montmartre at 130 m.\\n\"", 491 | "score": 0.74011195 492 | }, 493 | { 494 | "source": "\"The Paris region is the most active water transport area in France, with most of the cargo handled by Ports of Paris in facilities located around Paris. The rivers Loire, Rhine, Rhône, Me\\n\"", 495 | "score": 0.63990676 496 | }, 497 | { 498 | "source": "\"Paris\\nCountry\\tFrance\\nRegion\\nÎle-de-France\\r\\nDepartment\\nParis\\nIntercommunality\\nMétropole du Grand Paris\\nSubdivisions\\n20 arrondissements\\nGovernment\\n • Mayor (2020–2026)\\tAnne Hidalgo (PS)\\r\\nArea\\n1\\t105.4 km2 (40.7 sq mi)\\n • Urban\\n (2020)\\t2,853.5 km2 (1,101.7 sq mi)\\n • Metro\\n (2020)\\t18,940.7 km2 (7,313.0 sq mi)\\nPopulation\\n (2023)\\n2,102,650\\n • Rank\\t9th in Europe\\n1st in France\\r\\n • Density\\t20,000/km2 (52,000/sq mi)\\n • Urban\\n (2019)\\n10,858,852\\n • Urban density\\t3,800/km2 (9,900/sq mi)\\n • Metro\\n (Jan. 2017)\\n13,024,518\\n • Metro density\\t690/km2 (1,800/sq mi)\\nDemonym(s)\\nParisian(s) (en) Parisien(s) (masc.), Parisienne(s) (fem.) (fr), Parigot(s) (masc.), \\\"Parigote(s)\\\" (fem.) (fr, colloquial)\\nTime zone\\nUTC+01:00 (CET)\\r\\n • Summer (DST)\\nUTC+02:00 (CEST)\\r\\nINSEE/Postal code\\t75056 /75001-75020, 75116\\r\\nElevation\\t28–131 m (92–430 ft)\\n(avg. 78 m or 256 ft)\\nWebsite\\twww.paris.fr\\r\\n1 French Land Register data, which excludes lakes, ponds, glaciers > 1 km2 (0.386 sq mi or 247 acres) and river estuaries.\\n\"", 499 | "score": 0.62259054 500 | }, 501 | { 502 | "source": "\" in Paris\\n\"", 503 | "score": 0.6152092 504 | }, 505 | { 506 | "source": "\"The Parisii, a sub-tribe of the Celtic Senones, inhabited the Paris area from around the middle of the 3rd century BC. One of the area's major north–south trade routes crossed the Seine on the île de la Cité, which gradually became an important trading centre. The Parisii traded with many river towns (some as far away as the Iberian Peninsula) and minted their own coins.\\n\"", 507 | "score": 0.5720232 508 | } 509 | ], 510 | "limit": 5, 511 | "score_threshold": 0.4 512 | } 513 | ``` 514 | 515 |
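In the response, the `limit` and `score_threshold` fields correspond to the server's `--qdrant-limit` and `--qdrant-score-threshold` options (their default values, `5` and `0.4`, are what appear in the example above). If `jq` is installed, the similarity scores of the retrieved points can be listed directly:

```bash
# Print only the similarity scores of the retrieved points
curl -s -X POST http://localhost:8080/v1/retrieve \
  -H 'accept:application/json' \
  -H 'Content-Type: application/json' \
  -d '{"messages":[{"role":"user", "content": "What is the location of Paris, France along the Seine River?"}], "model":"llama-2-chat"}' \
  | jq '.points[].score'
```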
516 | 517 | ## Setup 518 | 519 | The LlamaEdge-RAG API server runs on the WasmEdge Runtime. Choose the installation command according to the operating system you are using: 520 | 521 |
For macOS (apple silicon) 522 | 523 | ```console 524 | # install WasmEdge-0.13.4 with wasi-nn-ggml plugin 525 | curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s 526 | 527 | # Assuming you use zsh (the default shell on macOS), run the following command to activate the environment 528 | source $HOME/.zshenv 529 | ``` 530 | 531 |
532 | 533 |
For Ubuntu (>= 20.04) 534 | 535 | ```console 536 | # install libopenblas-dev 537 | apt update && apt install -y libopenblas-dev 538 | 539 | # install WasmEdge-0.13.4 with wasi-nn-ggml plugin 540 | curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s 541 | 542 | # Assuming you use bash (the default shell on Ubuntu), run the following command to activate the environment 543 | source $HOME/.bashrc 544 | ``` 545 | 546 |
547 | 548 |
For General Linux 549 | 550 | ```console 551 | # install WasmEdge-0.13.4 with wasi-nn-ggml plugin 552 | curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s 553 | 554 | # Assuming you use bash (the default shell on most Linux distributions), run the following command to activate the environment 555 | source $HOME/.bashrc 556 | ``` 557 | 558 |
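Whichever installer you used, a quick sanity check before building is to confirm that the `wasmedge` binary is available (the installer places it under `$HOME/.wasmedge/bin`, and the `source` commands above put it on your `PATH`):

```bash
# Verify the WasmEdge installation
wasmedge --version
ls -al $HOME/.wasmedge/bin
```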
559 | 560 | ## Build 561 | 562 | ```bash 563 | # Clone the repository 564 | git clone https://github.com/LlamaEdge/rag-api-server.git 565 | 566 | # Change the working directory 567 | cd rag-api-server 568 | 569 | # (Optional) Add the `wasm32-wasip1` target to the Rust toolchain 570 | rustup target add wasm32-wasip1 571 | 572 | # Build `rag-api-server.wasm` with the `http` support only, or 573 | cargo build --target wasm32-wasip1 --release 574 | 575 | # Build `rag-api-server.wasm` with both `http` and `https` support 576 | cargo build --target wasm32-wasip1 --release --features full 577 | 578 | # Copy the `rag-api-server.wasm` to the root directory 579 | cp target/wasm32-wasip1/release/rag-api-server.wasm . 580 | ``` 581 | 582 | To check the CLI options of the `rag-api-server` wasm app, you can run the following command: 583 | 584 | ```bash 585 | $ wasmedge rag-api-server.wasm -h 586 | 587 | LlamaEdge-RAG API Server 588 | 589 | Usage: rag-api-server.wasm [OPTIONS] --model-name --prompt-template 590 | 591 | Options: 592 | -m, --model-name 593 | Sets names for chat and embedding models. The names are separated by comma without space, for example, '--model-name Llama-2-7b,all-minilm' 594 | 595 | -a, --model-alias 596 | Model aliases for chat and embedding models 597 | 598 | [default: default,embedding] 599 | 600 | -c, --ctx-size 601 | Sets context sizes for chat and embedding models, respectively. The sizes are separated by comma without space, for example, '--ctx-size 4096,384'. The first value is for the chat model, and the second is for the embedding model 602 | 603 | [default: 4096,384] 604 | 605 | -p, --prompt-template 606 | Sets prompt templates for chat and embedding models, respectively. The prompt templates are separated by comma without space, for example, '--prompt-template llama-2-chat,embedding'. The first value is for the chat model, and the second is for the embedding model 607 | 608 | [possible values: llama-2-chat, llama-3-chat, llama-3-tool, mistral-instruct, mistral-tool, mistrallite, mistral-small-chat, mistral-small-tool, openchat, codellama-instruct, codellama-super-instruct, human-assistant, vicuna-1.0-chat, vicuna-1.1-chat, vicuna-llava, chatml, chatml-tool, internlm-2-tool, baichuan-2, wizard-coder, zephyr, stablelm-zephyr, intel-neural, deepseek-chat, deepseek-coder, deepseek-chat-2, deepseek-chat-25, deepseek-chat-3, solar-instruct, phi-2-chat, phi-2-instruct, phi-3-chat, phi-3-instruct, phi-4-chat, gemma-instruct, gemma-3, octopus, glm-4-chat, groq-llama3-tool, mediatek-breeze, nemotron-chat, nemotron-tool, functionary-32, functionary-31, minicpmv, moxin-chat, falcon3, megrez, qwen2-vision, exaone-deep-chat, exaone-chat, embedding, tts, none] 609 | 610 | -r, --reverse-prompt 611 | Halt generation at PROMPT, return control 612 | 613 | -n, --n-predict 614 | Number of tokens to predict, -1 = infinity, -2 = until context filled 615 | 616 | [default: -1] 617 | 618 | -g, --n-gpu-layers 619 | Number of layers to run on the GPU 620 | 621 | [default: 100] 622 | 623 | --split-mode 624 | Split the model across multiple GPUs. Possible values: `none` (use one GPU only), `layer` (split layers and KV across GPUs, default), `row` (split rows across GPUs) 625 | 626 | [default: layer] 627 | 628 | --main-gpu 629 | The main GPU to use 630 | 631 | --tensor-split 632 | How split tensors should be distributed accross GPUs. 
If None the model is not split; otherwise, a comma-separated list of non-negative values, e.g., "3,2" presents 60% of the data to GPU 0 and 40% to GPU 1 633 | 634 | --threads 635 | Number of threads to use during computation 636 | 637 | [default: 2] 638 | 639 | --grammar 640 | BNF-like grammar to constrain generations (see samples in grammars/ dir) 641 | 642 | [default: ] 643 | 644 | --json-schema 645 | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object. For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead 646 | 647 | -b, --batch-size 648 | Sets batch sizes for chat and embedding models, respectively. The sizes are separated by comma without space, for example, '--batch-size 128,64'. The first value is for the chat model, and the second is for the embedding model 649 | 650 | [default: 512,512] 651 | 652 | -u, --ubatch-size 653 | Sets physical maximum batch sizes for chat and/or embedding models. To run both chat and embedding models, the sizes should be separated by comma without space, for example, '--ubatch-size 512,512'. The first value is for the chat model, and the second for the embedding model 654 | 655 | [default: 512,512] 656 | 657 | --rag-prompt 658 | Custom rag prompt 659 | 660 | --rag-policy 661 | Strategy for merging RAG context into chat messages 662 | 663 | [default: system-message] 664 | 665 | Possible values: 666 | - system-message: Merge RAG context into the system message 667 | - last-user-message: Merge RAG context into the last user message 668 | 669 | --qdrant-url 670 | URL of Qdrant REST Service 671 | 672 | [default: http://127.0.0.1:6333] 673 | 674 | --qdrant-collection-name 675 | Name of Qdrant collection 676 | 677 | [default: default] 678 | 679 | --qdrant-limit 680 | Max number of retrieved result (no less than 1) 681 | 682 | [default: 5] 683 | 684 | --qdrant-score-threshold 685 | Minimal score threshold for the search result 686 | 687 | [default: 0.4] 688 | 689 | --chunk-capacity 690 | Maximum number of tokens each chunk contains 691 | 692 | [default: 100] 693 | 694 | --context-window 695 | Maximum number of user messages used in the retrieval 696 | 697 | [default: 1] 698 | 699 | --kw-search-url 700 | URL of the keyword search service 701 | 702 | --include-usage 703 | Whether to include usage in the stream response. Defaults to false 704 | 705 | --socket-addr 706 | Socket address of LlamaEdge-RAG API Server instance. For example, `0.0.0.0:8080` 707 | 708 | --port 709 | Port number 710 | 711 | [default: 8080] 712 | 713 | --web-ui 714 | Root path for the Web UI files 715 | 716 | [default: chatbot-ui] 717 | 718 | --log-prompts 719 | Deprecated. Print prompt strings to stdout 720 | 721 | --log-stat 722 | Deprecated. Print statistics to stdout 723 | 724 | --log-all 725 | Deprecated. Print all log information to stdout 726 | 727 | -h, --help 728 | Print help (see a summary with '-h') 729 | 730 | -V, --version 731 | Print version 732 | ``` 733 | 734 | 735 | 736 | ## Execute 737 | 738 | LlamaEdge-RAG API server requires two types of models: chat and embedding. The chat model is used for generating responses to user queries, while the embedding model is used for computing embeddings for user queries or file chunks. 739 | 740 | Execution also requires the presence of a running [Qdrant](https://qdrant.tech/) service. 
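If a Qdrant instance is already running, you can confirm that its REST endpoint is reachable before starting the API server. The check below is a minimal sketch assuming Qdrant's default REST port `6333`:

```bash
# Lists the existing collections; an empty list is fine for a fresh instance
curl http://127.0.0.1:6333/collections
```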
741 | 742 | For the purpose of demonstration, we use the [Llama-2-7b-chat-hf-Q5_K_M.gguf](https://huggingface.co/second-state/Llama-2-7B-Chat-GGUF/resolve/main/Llama-2-7b-chat-hf-Q5_K_M.gguf) and [all-MiniLM-L6-v2-ggml-model-f16.gguf](https://huggingface.co/second-state/All-MiniLM-L6-v2-Embedding-GGUF/resolve/main/all-MiniLM-L6-v2-ggml-model-f16.gguf) models as examples. Download these models and place them in the root directory of the repository. 743 | 744 | - Ensure the Qdrant service is running 745 | 746 | ```bash 747 | # Pull the Qdrant docker image 748 | docker pull qdrant/qdrant 749 | 750 | # Create a directory to store Qdrant data 751 | mkdir qdrant_storage 752 | 753 | # Run Qdrant service 754 | docker run -p 6333:6333 -p 6334:6334 -v $(pwd)/qdrant_storage:/qdrant/storage:z qdrant/qdrant 755 | ``` 756 | 757 | - Start an instance of LlamaEdge-RAG API server 758 | 759 | ```bash 760 | wasmedge --dir .:. --nn-preload default:GGML:AUTO:Llama-2-7b-chat-hf-Q5_K_M.gguf \ 761 | --nn-preload embedding:GGML:AUTO:all-MiniLM-L6-v2-ggml-model-f16.gguf \ 762 | rag-api-server.wasm \ 763 | --model-name Llama-2-7b-chat-hf-Q5_K_M,all-MiniLM-L6-v2-ggml-model-f16 \ 764 | --ctx-size 4096,384 \ 765 | --prompt-template llama-2-chat,embedding \ 766 | --rag-policy system-message \ 767 | --qdrant-collection-name default \ 768 | --qdrant-limit 3 \ 769 | --qdrant-score-threshold 0.5 \ 770 | --rag-prompt "Use the following pieces of context to answer the user's question.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n" \ 771 | --port 8080 772 | ``` 773 | 774 | ## Usage Example 775 | 776 | - [Execute](#execute) the server 777 | 778 | - Generate embeddings for [paris.txt](https://huggingface.co/datasets/gaianet/paris/raw/main/paris.txt) via the `/v1/create/rag` endpoint 779 | 780 | ```bash 781 | curl -X POST http://127.0.0.1:8080/v1/create/rag -F "file=@paris.txt" 782 | ``` 783 | 784 | - Ask a question 785 | 786 | ```bash 787 | curl -X POST http://localhost:8080/v1/chat/completions \ 788 | -H 'accept:application/json' \ 789 | -H 'Content-Type: application/json' \ 790 | -d '{"messages":[{"role":"system", "content": "You are a helpful assistant."}, {"role":"user", "content": "What is the location of Paris, France along the Seine River?"}], "model":"Llama-2-7b-chat-hf-Q5_K_M"}' 791 | ``` 792 | 793 | ## Set Log Level 794 | 795 | You can set the log level of the API server by setting the `LLAMA_LOG` environment variable. For example, to set the log level to `debug`, you can run the following command: 796 | 797 | ```bash 798 | wasmedge --dir .:. --env LLAMA_LOG=debug \ 799 | --nn-preload default:GGML:AUTO:Llama-2-7b-chat-hf-Q5_K_M.gguf \ 800 | --nn-preload embedding:GGML:AUTO:all-MiniLM-L6-v2-ggml-model-f16.gguf \ 801 | rag-api-server.wasm \ 802 | --model-name Llama-2-7b-chat-hf-Q5_K_M,all-MiniLM-L6-v2-ggml-model-f16 \ 803 | --ctx-size 4096,384 \ 804 | --prompt-template llama-2-chat,embedding \ 805 | --rag-prompt "Use the following pieces of context to answer the user's question.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n" 806 | ``` 807 | 808 | The log level can be one of the following values: `trace`, `debug`, `info`, `warn`, `error`. The default log level is `info`. 
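Once the server is running, you can also inspect its effective configuration, including the chat and embedding model settings and the Qdrant options, via the `/v1/info` endpoint:

```bash
curl http://127.0.0.1:8080/v1/info
```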
809 | -------------------------------------------------------------------------------- /check_code_before_commit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Find unused dependencies in Cargo.toml 4 | cargo +nightly udeps 5 | 6 | # Sort dependencies in Cargo.toml alphabetically 7 | cargo sort 8 | 9 | # Format code 10 | cargo +nightly fmt --all -- --check 11 | 12 | # Clippy 13 | cargo +nightly clippy --target wasm32-wasip1 --all-features -- -D warnings 14 | -------------------------------------------------------------------------------- /docs/assets/kw_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LlamaEdge/rag-api-server/bd32425ce91b6ec72916d4a0e3d8922f3a4576f9/docs/assets/kw_search.png -------------------------------------------------------------------------------- /docs/keyword_search.md: -------------------------------------------------------------------------------- 1 | # Integration with Keyword Search 2 | 3 | Since `LlamaEdge-RAG v0.13.0`, the keyword search feature is supported. This feature is powered by [kw-search-server](https://github.com/LlamaEdge/kw-search-server), which is a standalone server that provides keyword search services. 4 | 5 | The following diagram shows the interactions between `rag-api-server` and `kw-search-server` while performing a chat completion. 6 | 7 |
8 | Integration with LlamaEdge-RAG 9 |
10 | 11 | ## Usage 12 | 13 | ### Enable keyword search 14 | 15 | Assume that a keyword search server is running on `http://localhost:9069`. There are two ways to enable keyword search in `rag-api-server`: 16 | 17 | - Set the `--kw-search-url` CLI option of `rag-api-server` while starting the rag-api-server. 18 | 19 | - Set the `kw_search_url` field of the chat completion request(see [Perform chat completion with keyword search](#perform-chat-completion-with-keyword-search)) or rag creation request (see [Create collection and index from a document](#create-collection-and-index-from-a-document)) to the rag-api-server. 20 | 21 | ### Create collection and index from a document 22 | 23 | The process of creating indexes for the target document is integrated into the creation of RAG. In other words, when creating RAG collections through the `/v1/create/rag` endpoint of the rag-api-server, indexes for the target document are created simultaneously. 24 | 25 | Assume that the keyword search server is running on `http://localhost:9069`. 26 | 27 | - If the rag-api-server is running on `http://localhost:8080` and using `--kw-search-url http://localhost:9069` to specify the keyword search server, you can create a RAG collection with the following command: 28 | 29 | ```bash 30 | curl --location 'http://localhost:8080/v1/create/rag' \ 31 | --header 'Content-Type: multipart/form-data' \ 32 | --form 'file=@"/Users/sam/workspace/demo/paris.txt"' \ 33 | --form 'vdb_server_url="your_vdb_server_url"' \ 34 | --form 'vdb_collection_name="your_vdb_collection_name"' \ 35 | --form 'vdb_api_key="your_vdb_api_key"' \ 36 | ``` 37 | 38 | - If the rag-api-server is running on `http://localhost:8080` without specifying the keyword search server, you can create a RAG collection with the following command: 39 | 40 | ```bash 41 | curl --location 'http://localhost:8080/v1/create/rag' \ 42 | --header 'Content-Type: multipart/form-data' \ 43 | --form 'file=@"/Users/sam/workspace/demo/paris.txt"' \ 44 | --form 'vdb_server_url="your_vdb_server_url"' \ 45 | --form 'vdb_collection_name="your_vdb_collection_name"' \ 46 | --form 'vdb_api_key="your_vdb_api_key"' \ 47 | --form 'kw_search_url="http://localhost:9069"' 48 | ``` 49 | 50 | If the curl request above is handled successfully, the following response body will be returned as shown below. The body contains two parts: `index` for the keyword search index and `embeddings` for the embeddings. 51 | 52 | ```json 53 | { 54 | "index": { 55 | "results": [ 56 | { 57 | "filename": "Unknown", 58 | "status": "indexed", 59 | "error": null 60 | }, 61 | ..., 62 | { 63 | "filename": "Unknown", 64 | "status": "indexed", 65 | "error": null 66 | } 67 | ], 68 | "index_name": "index-2c70ccde-916e-45b1-99ef-97ac893fd438", 69 | "download_url": "http://localhost:9069/v1/index/download/index-2c70ccde-916e-45b1-99ef-97ac893fd438" 70 | }, 71 | "embeddings": { 72 | "object": "list", 73 | "data": [ 74 | { 75 | "index": 0, 76 | "object": "embedding", 77 | "embedding": [] 78 | }, 79 | ..., 80 | { 81 | "index": 326, 82 | "object": "embedding", 83 | "embedding": [] 84 | } 85 | ], 86 | "model": "Nomic-embed-text-v1.5", 87 | "usage": { 88 | "prompt_tokens": 20355, 89 | "completion_tokens": 0, 90 | "total_tokens": 20355 91 | } 92 | } 93 | } 94 | ``` 95 | 96 | ### Perform chat completion with keyword search 97 | 98 | The keyword search feature is integrated into the chat completion process. 
When performing a chat completion, the rag-api-server will first perform a keyword search and an embedding search for the user query, then fuse the search results into the context, and finally build a prompt from the user query and the context, which is fed to the model to generate the chat completion.
99 | 
100 | Assume that the keyword search server is running on `http://localhost:9069`.
101 | 
102 | - If the rag-api-server is running on `http://localhost:8080` and using `--kw-search-url http://localhost:9069` to specify the keyword search server, you can perform a chat completion with the following command:
103 | 
104 | ```bash
105 | curl --location 'http://localhost:8080/v1/chat/completions' \
106 | --header 'Content-Type: application/json' \
107 | --data '{
108 |     "messages": [
109 |         {
110 |             "role": "system",
111 |             "content": "You are a helpful assistant. Answer questions as concisely as possible."
112 |         },
113 |         {
114 |             "role": "user",
115 |             "content": "What is the location of Paris, France along the Seine river?"
116 |         }
117 |     ],
118 | 
119 |     "vdb_server_url": "your_vdb_server_url",
120 |     "vdb_collection_name": ["your_vdb_collection_name"],
121 |     "limit": [5],
122 |     "score_threshold": [0.5],
123 |     "vdb_api_key": "your_vdb_api_key",
124 |     "kw_index_name": "index-2c70ccde-916e-45b1-99ef-97ac893fd438",
125 |     "kw_top_k": 5,
126 |     "model": "Llama-3.2-3B-Instruct",
127 |     "stream": false
128 | }'
129 | ```
130 | 
131 | - If the rag-api-server is running on `http://localhost:8080` without specifying the keyword search server, you can perform a chat completion with the following command:
132 | 
133 | ```bash
134 | curl --location 'http://localhost:8080/v1/chat/completions' \
135 | --header 'Content-Type: application/json' \
136 | --data '{
137 |     "messages": [
138 |         {
139 |             "role": "system",
140 |             "content": "You are a helpful assistant. Answer questions as concisely as possible."
141 |         },
142 |         {
143 |             "role": "user",
144 |             "content": "What is the location of Paris, France along the Seine river?"
145 | } 146 | ], 147 | 148 | "vdb_server_url": "your_vdb_server_url", 149 | "vdb_collection_name": ["your_vdb_collection_name"], 150 | "limit": [5], 151 | "score_threshold": [0.5], 152 | "vdb_api_key": "your_vdb_api_key", 153 | "kw_search_url": "http://localhost:9069", 154 | "kw_index_name": "index-2c70ccde-916e-45b1-99ef-97ac893fd438", 155 | "kw_top_k": 5, 156 | "model": "Llama-3.2-3B-Instruct", 157 | "stream": false 158 | }' 159 | ``` 160 | 161 | If the curl request above is handled successfully, the following response body will be returned as shown below: 162 | 163 | ```json 164 | { 165 | "id": "chatcmpl-72d9b542-4ee6-4a38-b9f6-75677765eef3", 166 | "object": "chat.completion", 167 | "created": 1737531879, 168 | "model": "Llama-3.2-3B-Instruct", 169 | "choices": [ 170 | { 171 | "index": 0, 172 | "message": { 173 | "content": "Paris, France is located on the banks of the Seine River, with two islands, Île Saint-Louis and Île de la Cité, within the city.", 174 | "role": "assistant" 175 | }, 176 | "finish_reason": "stop", 177 | "logprobs": null 178 | } 179 | ], 180 | "usage": { 181 | "prompt_tokens": 209, 182 | "completion_tokens": 37, 183 | "total_tokens": 246 184 | } 185 | } 186 | ``` 187 | -------------------------------------------------------------------------------- /docs/vectordb.md: -------------------------------------------------------------------------------- 1 | # Interaction with VectorDB 2 | 3 | LlamaEdge-RAG interacts with external VectorDB through two approaches: one is via the CLI options of rag-api-server, and the other is through the request fields. In the following two sections, these two approaches are discussed separately. For the convenience of the following discussion, Qdrant is used as the example VectorDB. 4 | 5 | > [!NOTE] 6 | > Sinece v0.11.0 release, the VectorDB support addressed below is supported by LlamaEdge-RAG. 7 | 8 | ## Via CLI options 9 | 10 | If retrieving information from a fixed VectorDB, this method is recommended. The startup command of rag-api-server provides four command-line options, which are: 11 | 12 | - `--qdrant-url ` specifies the URL of VectorDB REST Service 13 | - `--qdrant-collection-name ` specifies one or multiple names of VectorDB collections 14 | - `--qdrant-limit ` specifies the max number of retrieved result (no less than 1) from each collection specified in the `--qdrant-collection-name` option 15 | - `--qdrant-score-threshold ` specifies the minimal score threshold for the search results from each collection specified in the `--qdrant-collection-name` option 16 | 17 | By setting the above four options in the startup command when starting rag-api-server, it helps avoid repeatedly providing these parameters in every retrieval request, such as chat completion request. The following is an example of the startup command: 18 | 19 | ```bash 20 | wasmedge --dir .:. \ 21 | --env VDB_API_KEY=your-vdb-api-key \ 22 | ... 23 | --qdrant-url https://651ca7e5-e1d1-4851-abba-xxxxxxxxxxxx.europe-west3-0.gcp.cloud.qdrant.io:6333 \ 24 | --qdrant-collection-name paris1,paris2 \ 25 | --qdrant-limit 3,5 \ 26 | --qdrant-score-threshold 0.5,0.7 27 | ``` 28 | 29 | **Note** that `--env VDB_API_KEY=your-vdb-api-key` is required if the VectorDB requires an API key for access. 30 | 31 | ## Via request fields 32 | 33 | For the cases where retrieving information from different VectorDBs or collections in different requests, this method is recommended. 
The requests for chat completions and rag creation tasks provide the fields respectively for specifying the VectorDB information. 34 | 35 | ### VectorDB related fields in requests to the `/v1/create/rag` endpoint 36 | 37 | The request to the `/v1/create/rag` endpoint also provides the fields for specifying the VectorDB information, which are 38 | 39 | - `vdb_server_url` specifies the URL of VectorDB REST Service 40 | - `vdb_collection_name` specifies one or multiple names of VectorDB collections 41 | - `vdb_api_key` specifies the API key for accessing the VectorDB 42 | 43 | The following is an example of the request: 44 | 45 | ```bash 46 | curl --location 'http://localhost:8080/v1/create/rag' \ 47 | --header 'Content-Type: multipart/form-data' \ 48 | --form 'file=@"paris.txt"' \ 49 | --form 'vdb_server_url="https://651ca7e5-e1d1-4851-abba-xxxxxxxxxxxx.europe-west3-0.gcp.cloud.qdrant.io:6333"' \ 50 | --form 'vdb_collection_name="paris"' \ 51 | --form 'vdb_api_key="your-vdb-api-key"' 52 | ``` 53 | 54 | ### VectorDB related fields in the request to the `/v1/chat/completion` endpoint 55 | 56 | The chat completion request to the `/v1/chat/completion` endpoint defines five VectorDB related fields for specifying the VectorDB information, which are 57 | 58 | - `vdb_server_url` specifies the URL of VectorDB REST Service 59 | - `vdb_collection_name` specifies one or multiple names of VectorDB collections 60 | - `limit` specifies the max number of retrieved result (no less than 1) from each collection specified in the `vdb_collection_name` field 61 | - `score_threshold` specifies the minimal score threshold for the search results from each collection specified in the `vdb_collection_name` field 62 | - `vdb_api_key` specifies the API key for accessing the VectorDB 63 | 64 | The following is an example of the chat completion request: 65 | 66 | ```bash 67 | curl --location 'http://localhost:8080/v1/chat/completions' \ 68 | --header 'Content-Type: application/json' \ 69 | --data '{ 70 | "messages": [ 71 | { 72 | "role": "system", 73 | "content": "You are a helpful assistant. Answer questions as concisely as possible." 74 | }, 75 | { 76 | "role": "user", 77 | "content": "What is the location of Paris, France along the Seine river?" 78 | } 79 | ], 80 | "vdb_server_url": "https://651ca7e5-e1d1-4851-abba-xxxxxxxxxxxx.europe-west3-0.gcp.cloud.qdrant.io:6333", 81 | "vdb_collection_name": ["paris1","paris2"], 82 | "limit": [3,5], 83 | "score_threshold": [0.5,0.7], 84 | "vdb_api_key": "your-vdb-api-key", 85 | "model": "Llama-3.2-3B-Instruct", 86 | "stream": false 87 | }' 88 | ``` 89 | 90 | **Note** that the `limit`, and `score_threshold` fields are required in the chat completion request if `vdb_server_url` and `vdb_collection_name` are present. The `vdb_api_key` field is required only if the VectorDB requires an API key for access. 
91 | 
92 | ### VectorDB related fields in the request to the `/v1/retrieve` endpoint
93 | 
94 | Similarly, the request to the `/v1/retrieve` endpoint defines five VectorDB related fields for specifying the VectorDB information, which are
95 | 
96 | - `vdb_server_url` specifies the URL of VectorDB REST Service
97 | - `vdb_collection_name` specifies one or multiple names of VectorDB collections
98 | - `limit` specifies the max number of retrieved results (no less than 1) from each collection specified in the `vdb_collection_name` field
99 | - `score_threshold` specifies the minimal score threshold for the search results from each collection specified in the `vdb_collection_name` field
100 | - `vdb_api_key` specifies the API key for accessing the VectorDB
101 | 
102 | The following is an example of the retrieval request:
103 | 
104 | ```bash
105 | curl --location 'http://localhost:8080/v1/retrieve' \
106 | --header 'Content-Type: application/json' \
107 | --data '{
108 |     "messages": [
109 |         {
110 |             "role": "system",
111 |             "content": "You are a helpful assistant. Answer questions as concisely as possible."
112 |         },
113 |         {
114 |             "role": "user",
115 |             "content": "What is the location of Paris, France along the Seine river?"
116 |         }
117 |     ],
118 |     "vdb_server_url": "https://651ca7e5-e1d1-4851-abba-xxxxxxxxxxxx.europe-west3-0.gcp.cloud.qdrant.io:6333",
119 |     "vdb_collection_name": ["paris1","paris2"],
120 |     "limit": [3,5],
121 |     "score_threshold": [0.5,0.7],
122 |     "vdb_api_key": "your-vdb-api-key",
123 |     "model": "Llama-3.2-3B-Instruct",
124 |     "stream": false
125 | }'
126 | ```
127 | 
128 | **Note** that the `limit` and `score_threshold` fields are required in the retrieval request if `vdb_server_url` and `vdb_collection_name` are present. The `vdb_api_key` field is required only if the VectorDB requires an API key for access.
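If the Qdrant settings were already supplied through the CLI options described earlier (`--qdrant-url`, `--qdrant-collection-name`, `--qdrant-limit`, `--qdrant-score-threshold`), the VectorDB fields can be left out of the request body and the values configured at startup are used instead. A minimal sketch of such a retrieval request, assuming the server was started with those options:

```bash
curl --location 'http://localhost:8080/v1/retrieve' \
--header 'Content-Type: application/json' \
--data '{
    "messages": [
        {
            "role": "user",
            "content": "What is the location of Paris, France along the Seine river?"
        }
    ],
    "model": "Llama-3.2-3B-Instruct"
}'
```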
129 | -------------------------------------------------------------------------------- /src/backend/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod ggml; 2 | 3 | use crate::error; 4 | use hyper::{Body, Request, Response}; 5 | 6 | pub(crate) async fn handle_llama_request( 7 | req: Request, 8 | chunk_capacity: usize, 9 | ) -> Response { 10 | match req.uri().path() { 11 | "/v1/chat/completions" => ggml::rag_query_handler(req).await, 12 | "/v1/models" => ggml::models_handler().await, 13 | "/v1/embeddings" => ggml::embeddings_handler(req).await, 14 | "/v1/files" => ggml::files_handler(req).await, 15 | "/v1/chunks" => ggml::chunks_handler(req).await, 16 | "/v1/retrieve" => ggml::retrieve_handler(req).await, 17 | "/v1/create/rag" => ggml::create_rag_handler(req, chunk_capacity).await, 18 | "/v1/info" => ggml::server_info_handler().await, 19 | path => { 20 | if path.starts_with("/v1/files/") { 21 | ggml::files_handler(req).await 22 | } else { 23 | error::invalid_endpoint(path) 24 | } 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use hyper::{Body, Response}; 2 | use thiserror::Error; 3 | 4 | #[allow(dead_code)] 5 | pub(crate) fn not_implemented() -> Response { 6 | // log error 7 | error!(target: "stdout", "501 Not Implemented"); 8 | 9 | Response::builder() 10 | .header("Access-Control-Allow-Origin", "*") 11 | .header("Access-Control-Allow-Methods", "*") 12 | .header("Access-Control-Allow-Headers", "*") 13 | .status(hyper::StatusCode::NOT_IMPLEMENTED) 14 | .body(Body::from("501 Not Implemented")) 15 | .unwrap() 16 | } 17 | 18 | pub(crate) fn internal_server_error(msg: impl AsRef) -> Response { 19 | let err_msg = match msg.as_ref().is_empty() { 20 | true => "500 Internal Server Error".to_string(), 21 | false => format!("500 Internal Server Error: {}", msg.as_ref()), 22 | }; 23 | 24 | // log error 25 | error!(target: "stdout", "{}", &err_msg); 26 | 27 | Response::builder() 28 | .header("Access-Control-Allow-Origin", "*") 29 | .header("Access-Control-Allow-Methods", "*") 30 | .header("Access-Control-Allow-Headers", "*") 31 | .status(hyper::StatusCode::INTERNAL_SERVER_ERROR) 32 | .body(Body::from(err_msg)) 33 | .unwrap() 34 | } 35 | 36 | pub(crate) fn bad_request(msg: impl AsRef) -> Response { 37 | let err_msg = match msg.as_ref().is_empty() { 38 | true => "400 Bad Request".to_string(), 39 | false => format!("400 Bad Request: {}", msg.as_ref()), 40 | }; 41 | 42 | // log error 43 | error!(target: "stdout", "{}", &err_msg); 44 | 45 | Response::builder() 46 | .header("Access-Control-Allow-Origin", "*") 47 | .header("Access-Control-Allow-Methods", "*") 48 | .header("Access-Control-Allow-Headers", "*") 49 | .status(hyper::StatusCode::BAD_REQUEST) 50 | .body(Body::from(err_msg)) 51 | .unwrap() 52 | } 53 | 54 | pub(crate) fn unauthorized(msg: impl AsRef) -> Response { 55 | let err_msg = match msg.as_ref().is_empty() { 56 | true => "401 Unauthorized".to_string(), 57 | false => format!("401 Unauthorized: {}", msg.as_ref()), 58 | }; 59 | 60 | // log error 61 | error!(target: "stdout", "{}", &err_msg); 62 | 63 | Response::builder() 64 | .header("Access-Control-Allow-Origin", "*") 65 | .header("Access-Control-Allow-Methods", "*") 66 | .header("Access-Control-Allow-Headers", "*") 67 | .status(hyper::StatusCode::UNAUTHORIZED) 68 | .body(Body::from(err_msg)) 69 | .unwrap() 70 | } 71 | 72 | pub(crate) 
fn invalid_endpoint(msg: impl AsRef) -> Response { 73 | let err_msg = match msg.as_ref().is_empty() { 74 | true => "404 The requested service endpoint is not found".to_string(), 75 | false => format!( 76 | "404 The requested service endpoint is not found: {}", 77 | msg.as_ref() 78 | ), 79 | }; 80 | 81 | // log error 82 | error!(target: "stdout", "{}", &err_msg); 83 | 84 | Response::builder() 85 | .header("Access-Control-Allow-Origin", "*") 86 | .header("Access-Control-Allow-Methods", "*") 87 | .header("Access-Control-Allow-Headers", "*") 88 | .status(hyper::StatusCode::NOT_FOUND) 89 | .body(Body::from(err_msg)) 90 | .unwrap() 91 | } 92 | 93 | #[derive(Error, Clone, Debug, PartialEq, Eq)] 94 | pub enum ServerError { 95 | /// Error returned while parsing CLI options failed 96 | #[error("{0}")] 97 | ArgumentError(String), 98 | #[error("{0}")] 99 | Operation(String), 100 | } 101 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate log; 3 | 4 | mod backend; 5 | mod error; 6 | mod utils; 7 | 8 | use anyhow::Result; 9 | use chat_prompts::{MergeRagContextPolicy, PromptTemplateType}; 10 | use clap::{ArgGroup, Parser}; 11 | use error::ServerError; 12 | use hyper::{ 13 | body::HttpBody, 14 | header, 15 | server::conn::AddrStream, 16 | service::{make_service_fn, service_fn}, 17 | Body, Request, Response, Server, StatusCode, 18 | }; 19 | use llama_core::metadata::ggml::GgmlMetadataBuilder; 20 | use once_cell::sync::OnceCell; 21 | use serde::{Deserialize, Serialize}; 22 | use std::{collections::HashMap, fmt, net::SocketAddr, path::PathBuf}; 23 | use tokio::{net::TcpListener, sync::RwLock}; 24 | use utils::{is_valid_url, LogLevel}; 25 | 26 | type Error = Box; 27 | 28 | // global system prompt 29 | pub(crate) static GLOBAL_RAG_PROMPT: OnceCell = OnceCell::new(); 30 | // server info 31 | pub(crate) static SERVER_INFO: OnceCell> = OnceCell::new(); 32 | // API key 33 | pub(crate) static LLAMA_API_KEY: OnceCell = OnceCell::new(); 34 | // Global context window used for setting the max number of user messages for the retrieval 35 | pub(crate) static CONTEXT_WINDOW: OnceCell = OnceCell::new(); 36 | // Global keyword search configuration 37 | pub(crate) static KW_SEARCH_CONFIG: OnceCell = OnceCell::new(); 38 | 39 | // default port 40 | const DEFAULT_PORT: &str = "8080"; 41 | 42 | #[derive(Clone, Debug)] 43 | pub struct AppState { 44 | pub state_thing: String, 45 | } 46 | 47 | #[derive(Debug, Parser)] 48 | #[command(name = "LlamaEdge-RAG API Server", version = env!("CARGO_PKG_VERSION"), author = env!("CARGO_PKG_AUTHORS"), about = "LlamaEdge-RAG API Server")] 49 | #[command(group = ArgGroup::new("socket_address_group").multiple(false).args(&["socket_addr", "port"]))] 50 | struct Cli { 51 | /// Sets names for chat and embedding models. The names are separated by comma without space, for example, '--model-name Llama-2-7b,all-minilm'. 52 | #[arg(short, long, value_delimiter = ',', required = true)] 53 | model_name: Vec, 54 | /// Model aliases for chat and embedding models 55 | #[arg( 56 | short = 'a', 57 | long, 58 | value_delimiter = ',', 59 | default_value = "default,embedding" 60 | )] 61 | model_alias: Vec, 62 | /// Sets context sizes for chat and embedding models, respectively. The sizes are separated by comma without space, for example, '--ctx-size 4096,384'. The first value is for the chat model, and the second is for the embedding model. 
63 | #[arg( 64 | short = 'c', 65 | long, 66 | value_delimiter = ',', 67 | default_value = "4096,384", 68 | value_parser = clap::value_parser!(u64) 69 | )] 70 | ctx_size: Vec, 71 | /// Sets prompt templates for chat and embedding models, respectively. The prompt templates are separated by comma without space, for example, '--prompt-template llama-2-chat,embedding'. The first value is for the chat model, and the second is for the embedding model. 72 | #[arg(short, long, value_delimiter = ',', value_parser = clap::value_parser!(PromptTemplateType), required = true)] 73 | prompt_template: Vec, 74 | /// Halt generation at PROMPT, return control. 75 | #[arg(short, long)] 76 | reverse_prompt: Option, 77 | /// Number of tokens to predict, -1 = infinity, -2 = until context filled. 78 | #[arg(short, long, default_value = "-1")] 79 | n_predict: i32, 80 | /// Number of layers to run on the GPU 81 | #[arg(short = 'g', long, default_value = "100")] 82 | n_gpu_layers: u64, 83 | /// Split the model across multiple GPUs. Possible values: `none` (use one GPU only), `layer` (split layers and KV across GPUs, default), `row` (split rows across GPUs) 84 | #[arg(long, default_value = "layer")] 85 | split_mode: String, 86 | /// The main GPU to use. 87 | #[arg(long)] 88 | main_gpu: Option, 89 | /// How split tensors should be distributed accross GPUs. If None the model is not split; otherwise, a comma-separated list of non-negative values, e.g., "3,2" presents 60% of the data to GPU 0 and 40% to GPU 1. 90 | #[arg(long)] 91 | tensor_split: Option, 92 | /// Number of threads to use during computation 93 | #[arg(long, default_value = "2")] 94 | threads: u64, 95 | /// BNF-like grammar to constrain generations (see samples in grammars/ dir). 96 | #[arg(long, default_value = "")] 97 | pub grammar: String, 98 | /// JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object. For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead. 99 | #[arg(long)] 100 | pub json_schema: Option, 101 | /// Sets batch sizes for chat and embedding models, respectively. The sizes are separated by comma without space, for example, '--batch-size 128,64'. The first value is for the chat model, and the second is for the embedding model. 102 | #[arg(short, long, value_delimiter = ',', default_value = "512,512", value_parser = clap::value_parser!(u64))] 103 | batch_size: Vec, 104 | /// Sets physical maximum batch sizes for chat and/or embedding models. To run both chat and embedding models, the sizes should be separated by comma without space, for example, '--ubatch-size 512,512'. The first value is for the chat model, and the second for the embedding model. 105 | #[arg(short, long, value_delimiter = ',', default_value = "512,512", value_parser = clap::value_parser!(u64))] 106 | ubatch_size: Vec, 107 | /// Custom rag prompt. 108 | #[arg(long)] 109 | rag_prompt: Option, 110 | /// Strategy for merging RAG context into chat messages. 
111 | #[arg(long = "rag-policy", default_value_t, value_enum)] 112 | policy: MergeRagContextPolicy, 113 | /// URL of Qdrant REST Service 114 | #[arg(long, default_value = "http://127.0.0.1:6333")] 115 | qdrant_url: String, 116 | /// Name of Qdrant collection 117 | #[arg(long, default_value = "default", value_delimiter = ',')] 118 | qdrant_collection_name: Vec, 119 | /// Max number of retrieved result (no less than 1) 120 | #[arg(long, default_value = "5", value_delimiter = ',', value_parser = clap::value_parser!(u64))] 121 | qdrant_limit: Vec, 122 | /// Minimal score threshold for the search result 123 | #[arg(long, default_value = "0.4", value_delimiter = ',', value_parser = clap::value_parser!(f32))] 124 | qdrant_score_threshold: Vec, 125 | /// Maximum number of tokens each chunk contains 126 | #[arg(long, default_value = "100", value_parser = clap::value_parser!(usize))] 127 | chunk_capacity: usize, 128 | /// Maximum number of user messages used in the retrieval 129 | #[arg(long, default_value = "1", value_parser = clap::value_parser!(u64))] 130 | context_window: u64, 131 | /// URL of the keyword search service 132 | #[arg(long)] 133 | kw_search_url: Option, 134 | /// Whether to include usage in the stream response. Defaults to false. 135 | #[arg(long, default_value = "false")] 136 | include_usage: bool, 137 | /// Socket address of LlamaEdge-RAG API Server instance. For example, `0.0.0.0:8080`. 138 | #[arg(long, default_value = None, value_parser = clap::value_parser!(SocketAddr), group = "socket_address_group")] 139 | socket_addr: Option, 140 | /// Port number 141 | #[arg(long, default_value = DEFAULT_PORT, value_parser = clap::value_parser!(u16), group = "socket_address_group")] 142 | port: u16, 143 | /// Root path for the Web UI files 144 | #[arg(long, default_value = "chatbot-ui")] 145 | web_ui: PathBuf, 146 | /// Deprecated. Print prompt strings to stdout 147 | #[arg(long)] 148 | log_prompts: bool, 149 | /// Deprecated. Print statistics to stdout 150 | #[arg(long)] 151 | log_stat: bool, 152 | /// Deprecated. Print all log information to stdout 153 | #[arg(long)] 154 | log_all: bool, 155 | } 156 | 157 | #[allow(clippy::needless_return)] 158 | #[tokio::main(flavor = "current_thread")] 159 | async fn main() -> Result<(), ServerError> { 160 | let mut plugin_debug = false; 161 | 162 | // get the environment variable `LLAMA_LOG` 163 | let log_level: LogLevel = std::env::var("LLAMA_LOG") 164 | .unwrap_or("info".to_string()) 165 | .parse() 166 | .unwrap_or(LogLevel::Info); 167 | 168 | if log_level == LogLevel::Debug || log_level == LogLevel::Trace { 169 | plugin_debug = true; 170 | } 171 | // set global logger 172 | wasi_logger::Logger::install().expect("failed to install wasi_logger::Logger"); 173 | log::set_max_level(log_level.into()); 174 | 175 | if let Ok(api_key) = std::env::var("API_KEY") { 176 | // define a const variable for the API key 177 | if let Err(e) = LLAMA_API_KEY.set(api_key) { 178 | let err_msg = format!("Failed to set API key. 
{}", e); 179 | 180 | error!(target: "stdout", "{}", err_msg); 181 | 182 | return Err(ServerError::Operation(err_msg)); 183 | } 184 | } 185 | 186 | // parse the command line arguments 187 | let cli = Cli::parse(); 188 | 189 | info!(target: "stdout", "log_level: {}", log_level); 190 | 191 | // log the version of the server 192 | info!(target: "stdout", "server_version: {}", env!("CARGO_PKG_VERSION")); 193 | 194 | // log model name 195 | if cli.model_name.len() != 2 { 196 | return Err(ServerError::ArgumentError( 197 | "LlamaEdge RAG API server requires a chat model and an embedding model.".to_owned(), 198 | )); 199 | } 200 | info!(target: "stdout", "model_name: {}", cli.model_name.join(",")); 201 | 202 | // log model alias 203 | if cli.model_alias.len() != 2 { 204 | return Err(ServerError::ArgumentError( 205 | "LlamaEdge RAG API server requires two model aliases: one for chat model, one for embedding model.".to_owned(), 206 | )); 207 | } 208 | info!(target: "stdout", "model_alias: {}", cli.model_alias.join(",")); 209 | 210 | // log context size 211 | if cli.ctx_size.len() != 2 { 212 | return Err(ServerError::ArgumentError( 213 | "LlamaEdge RAG API server requires two context sizes: one for chat model, one for embedding model.".to_owned(), 214 | )); 215 | } 216 | let ctx_sizes_str: String = cli 217 | .ctx_size 218 | .iter() 219 | .map(|n| n.to_string()) 220 | .collect::>() 221 | .join(","); 222 | info!(target: "stdout", "ctx_size: {}", ctx_sizes_str); 223 | 224 | // log batch size 225 | if cli.batch_size.len() != 2 { 226 | return Err(ServerError::ArgumentError( 227 | "LlamaEdge RAG API server requires two batch sizes: one for chat model, one for embedding model.".to_owned(), 228 | )); 229 | } 230 | let batch_sizes_str: String = cli 231 | .batch_size 232 | .iter() 233 | .map(|n| n.to_string()) 234 | .collect::>() 235 | .join(","); 236 | info!(target: "stdout", "batch_size: {}", batch_sizes_str); 237 | 238 | // log ubatch size 239 | if cli.ubatch_size.len() != 2 { 240 | return Err(ServerError::ArgumentError( 241 | "LlamaEdge RAG API server requires two ubatch sizes: one for chat model, one for embedding model.".to_owned(), 242 | )); 243 | } 244 | let ubatch_sizes_str: String = cli 245 | .ubatch_size 246 | .iter() 247 | .map(|n| n.to_string()) 248 | .collect::>() 249 | .join(","); 250 | info!(target: "stdout", "ubatch_size: {}", ubatch_sizes_str); 251 | 252 | // log prompt template 253 | if cli.prompt_template.len() != 2 { 254 | return Err(ServerError::ArgumentError( 255 | "LlamaEdge RAG API server requires two prompt templates: one for chat model, one for embedding model.".to_owned(), 256 | )); 257 | } 258 | let prompt_template_str: String = cli 259 | .prompt_template 260 | .iter() 261 | .map(|n| n.to_string()) 262 | .collect::>() 263 | .join(","); 264 | info!(target: "stdout", "prompt_template: {}", prompt_template_str); 265 | 266 | // log reverse prompt 267 | if let Some(reverse_prompt) = &cli.reverse_prompt { 268 | info!(target: "stdout", "reverse_prompt: {}", reverse_prompt); 269 | } 270 | 271 | // log n_predict 272 | info!(target: "stdout", "n_predict: {}", &cli.n_predict); 273 | 274 | // log n_gpu_layers 275 | info!(target: "stdout", "n_gpu_layers: {}", &cli.n_gpu_layers); 276 | 277 | // log split_mode 278 | info!(target: "stdout", "split_mode: {}", cli.split_mode); 279 | 280 | // log main GPU 281 | if let Some(main_gpu) = &cli.main_gpu { 282 | info!(target: "stdout", "main_gpu: {}", main_gpu); 283 | } 284 | 285 | // log tensor split 286 | if let Some(tensor_split) = &cli.tensor_split { 
287 | info!(target: "stdout", "tensor_split: {}", tensor_split); 288 | } 289 | 290 | // log threads 291 | info!(target: "stdout", "threads: {}", cli.threads); 292 | 293 | // log grammar 294 | if !cli.grammar.is_empty() { 295 | info!(target: "stdout", "grammar: {}", &cli.grammar); 296 | } 297 | 298 | // log json schema 299 | if let Some(json_schema) = &cli.json_schema { 300 | info!(target: "stdout", "json_schema: {}", json_schema); 301 | } 302 | 303 | // log rag prompt 304 | if let Some(rag_prompt) = &cli.rag_prompt { 305 | info!(target: "stdout", "rag_prompt: {}", rag_prompt); 306 | 307 | GLOBAL_RAG_PROMPT.set(rag_prompt.clone()).map_err(|_| { 308 | ServerError::Operation("Failed to set `GLOBAL_RAG_PROMPT`.".to_string()) 309 | })?; 310 | } 311 | 312 | // log qdrant url 313 | if !is_valid_url(&cli.qdrant_url) { 314 | let err_msg = format!( 315 | "The URL of Qdrant REST API is invalid: {}.", 316 | &cli.qdrant_url 317 | ); 318 | 319 | // log 320 | { 321 | error!(target: "stdout", "rag_prompt: {}", err_msg); 322 | } 323 | 324 | return Err(ServerError::ArgumentError(err_msg)); 325 | } 326 | info!(target: "stdout", "qdrant_url: {}", &cli.qdrant_url); 327 | 328 | if cli.qdrant_collection_name.len() != cli.qdrant_limit.len() 329 | && cli.qdrant_limit.len() > 1 330 | && cli.qdrant_score_threshold.len() > 1 331 | { 332 | return Err(ServerError::ArgumentError( 333 | "LlamaEdge RAG API server requires the same number of Qdrant collection names and limits; or the limit is only one value for all collections.".to_owned(), 334 | )); 335 | } 336 | 337 | if cli.qdrant_collection_name.len() != cli.qdrant_score_threshold.len() 338 | && cli.qdrant_score_threshold.len() > 1 339 | && cli.qdrant_score_threshold.len() > 1 340 | { 341 | return Err(ServerError::ArgumentError( 342 | "LlamaEdge RAG API server requires the same number of Qdrant collection names and score thresholds; or the score threshold is only one value for all collections.".to_owned(), 343 | )); 344 | } 345 | 346 | // log qdrant collection name 347 | let qdrant_collection_name_str: String = cli 348 | .qdrant_collection_name 349 | .iter() 350 | .map(|n| n.to_string()) 351 | .collect::>() 352 | .join(","); 353 | info!(target: "stdout", "qdrant_collection_name: {}", qdrant_collection_name_str); 354 | 355 | // log qdrant limit 356 | let qdrant_limit_str: String = cli 357 | .qdrant_limit 358 | .iter() 359 | .map(|n| n.to_string()) 360 | .collect::>() 361 | .join(","); 362 | info!(target: "stdout", "qdrant_limit: {}", qdrant_limit_str); 363 | 364 | // log qdrant score threshold 365 | let qdrant_score_threshold_str: String = cli 366 | .qdrant_score_threshold 367 | .iter() 368 | .map(|n| n.to_string()) 369 | .collect::>() 370 | .join(","); 371 | info!(target: "stdout", "qdrant_score_threshold: {}", qdrant_score_threshold_str); 372 | 373 | // create qdrant config 374 | let mut qdrant_config_vec: Vec = Vec::new(); 375 | for (idx, col_name) in cli.qdrant_collection_name.iter().enumerate() { 376 | let limit = if cli.qdrant_limit.len() == 1 { 377 | cli.qdrant_limit[0] 378 | } else { 379 | cli.qdrant_limit[idx] 380 | }; 381 | 382 | let score_threshold = if cli.qdrant_score_threshold.len() == 1 { 383 | cli.qdrant_score_threshold[0] 384 | } else { 385 | cli.qdrant_score_threshold[idx] 386 | }; 387 | 388 | let qdrant_config = QdrantConfig { 389 | url: cli.qdrant_url.clone(), 390 | collection_name: col_name.clone(), 391 | limit, 392 | score_threshold, 393 | }; 394 | 395 | qdrant_config_vec.push(qdrant_config); 396 | } 397 | 398 | // log chunk capacity 399 | 
info!(target: "stdout", "chunk_capacity: {}", &cli.chunk_capacity); 400 | 401 | // log context window 402 | info!(target: "stdout", "context_window: {}", &cli.context_window); 403 | CONTEXT_WINDOW 404 | .set(cli.context_window) 405 | .map_err(|e| ServerError::Operation(format!("Failed to set `CONTEXT_WINDOW`. {}", e)))?; 406 | 407 | // RAG policy 408 | info!(target: "stdout", "rag_policy: {}", &cli.policy); 409 | 410 | let mut policy = cli.policy; 411 | if policy == MergeRagContextPolicy::SystemMessage && !cli.prompt_template[0].has_system_prompt() 412 | { 413 | warn!(target: "server_config", "{}", format!("The chat model does not support system message, while the '--policy' option sets to \"{}\". Update the RAG policy to {}.", cli.policy, MergeRagContextPolicy::LastUserMessage)); 414 | 415 | policy = MergeRagContextPolicy::LastUserMessage; 416 | } 417 | 418 | // keyword search configuration 419 | if let Some(kw_search_url) = &cli.kw_search_url { 420 | let kw_search_config = KeywordSearchConfig { 421 | url: kw_search_url.clone(), 422 | }; 423 | KW_SEARCH_CONFIG.set(kw_search_config).unwrap(); 424 | } 425 | 426 | // log include_usage 427 | info!(target: "stdout", "include_usage: {}", cli.include_usage); 428 | 429 | // create metadata for chat model 430 | let chat_metadata = GgmlMetadataBuilder::new( 431 | cli.model_name[0].clone(), 432 | cli.model_alias[0].clone(), 433 | cli.prompt_template[0], 434 | ) 435 | .with_ctx_size(cli.ctx_size[0]) 436 | .with_reverse_prompt(cli.reverse_prompt) 437 | .with_batch_size(cli.batch_size[0]) 438 | .with_ubatch_size(cli.ubatch_size[0]) 439 | .with_n_predict(cli.n_predict) 440 | .with_n_gpu_layers(cli.n_gpu_layers) 441 | .with_split_mode(cli.split_mode.clone()) 442 | .with_main_gpu(cli.main_gpu) 443 | .with_tensor_split(cli.tensor_split.clone()) 444 | .with_threads(cli.threads) 445 | .with_grammar(cli.grammar) 446 | .with_json_schema(cli.json_schema) 447 | .enable_plugin_log(true) 448 | .enable_debug_log(plugin_debug) 449 | .include_usage(cli.include_usage) 450 | .build(); 451 | 452 | let chat_model_info = ModelConfig { 453 | name: chat_metadata.model_name.clone(), 454 | ty: "chat".to_string(), 455 | prompt_template: chat_metadata.prompt_template, 456 | n_predict: chat_metadata.n_predict, 457 | reverse_prompt: chat_metadata.reverse_prompt.clone(), 458 | n_gpu_layers: chat_metadata.n_gpu_layers, 459 | ctx_size: chat_metadata.ctx_size, 460 | batch_size: chat_metadata.batch_size, 461 | ubatch_size: chat_metadata.ubatch_size, 462 | temperature: chat_metadata.temperature, 463 | top_p: chat_metadata.top_p, 464 | repeat_penalty: chat_metadata.repeat_penalty, 465 | presence_penalty: chat_metadata.presence_penalty, 466 | frequency_penalty: chat_metadata.frequency_penalty, 467 | split_mode: chat_metadata.split_mode.clone(), 468 | main_gpu: chat_metadata.main_gpu, 469 | tensor_split: chat_metadata.tensor_split.clone(), 470 | }; 471 | 472 | // chat model 473 | let chat_models = [chat_metadata]; 474 | 475 | // create metadata for embedding model 476 | let embedding_metadata = GgmlMetadataBuilder::new( 477 | cli.model_name[1].clone(), 478 | cli.model_alias[1].clone(), 479 | cli.prompt_template[1], 480 | ) 481 | .with_ctx_size(cli.ctx_size[1]) 482 | .with_batch_size(cli.batch_size[1]) 483 | .with_ubatch_size(cli.ubatch_size[1]) 484 | .with_split_mode(cli.split_mode) 485 | .with_main_gpu(cli.main_gpu) 486 | .with_tensor_split(cli.tensor_split) 487 | .with_threads(cli.threads) 488 | .enable_plugin_log(true) 489 | .enable_debug_log(plugin_debug) 490 | .build(); 491 | 492 | 
let embedding_model_info = ModelConfig { 493 | name: embedding_metadata.model_name.clone(), 494 | ty: "embedding".to_string(), 495 | ctx_size: embedding_metadata.ctx_size, 496 | batch_size: embedding_metadata.batch_size, 497 | ubatch_size: embedding_metadata.ubatch_size, 498 | prompt_template: embedding_metadata.prompt_template, 499 | n_predict: embedding_metadata.n_predict, 500 | reverse_prompt: embedding_metadata.reverse_prompt.clone(), 501 | n_gpu_layers: embedding_metadata.n_gpu_layers, 502 | temperature: embedding_metadata.temperature, 503 | top_p: embedding_metadata.top_p, 504 | repeat_penalty: embedding_metadata.repeat_penalty, 505 | presence_penalty: embedding_metadata.presence_penalty, 506 | frequency_penalty: embedding_metadata.frequency_penalty, 507 | split_mode: embedding_metadata.split_mode.clone(), 508 | main_gpu: embedding_metadata.main_gpu, 509 | tensor_split: embedding_metadata.tensor_split.clone(), 510 | }; 511 | 512 | // embedding model 513 | let embedding_models = [embedding_metadata]; 514 | 515 | // create rag config 516 | let rag_config = RagConfig { 517 | chat_model: chat_model_info, 518 | embedding_model: embedding_model_info, 519 | policy, 520 | }; 521 | 522 | // initialize the core context 523 | llama_core::init_ggml_rag_context(&chat_models[..], &embedding_models[..]).map_err(|e| { 524 | let err_msg = format!("Failed to initialize the core context. {}", e); 525 | 526 | // log 527 | error!(target: "stdout", "{}", &err_msg); 528 | 529 | ServerError::Operation(err_msg) 530 | })?; 531 | 532 | // get the plugin version info 533 | let plugin_info = 534 | llama_core::get_plugin_info().map_err(|e| ServerError::Operation(e.to_string()))?; 535 | let plugin_version = format!( 536 | "b{build_number} (commit {commit_id})", 537 | build_number = plugin_info.build_number, 538 | commit_id = plugin_info.commit_id, 539 | ); 540 | 541 | // log plugin version 542 | info!(target: "stdout", "plugin_ggml_version: {}", &plugin_version); 543 | 544 | // socket address 545 | let addr = match cli.socket_addr { 546 | Some(addr) => addr, 547 | None => SocketAddr::from(([0, 0, 0, 0], cli.port)), 548 | }; 549 | let port = addr.port().to_string(); 550 | 551 | // get the environment variable `NODE_VERSION` 552 | // Note that this is for satisfying the requirement of `gaianet-node` project. 
553 | let node = std::env::var("NODE_VERSION").ok(); 554 | if node.is_some() { 555 | // log node version 556 | info!(target: "stdout", "gaianet_node_version: {}", node.as_ref().unwrap()); 557 | } 558 | 559 | // create server info 560 | let server_info = ServerInfo { 561 | node, 562 | server: ApiServer { 563 | ty: "rag".to_string(), 564 | version: env!("CARGO_PKG_VERSION").to_string(), 565 | plugin_version, 566 | port, 567 | }, 568 | rag_config, 569 | qdrant_config: qdrant_config_vec, 570 | extras: HashMap::new(), 571 | }; 572 | SERVER_INFO 573 | .set(RwLock::new(server_info)) 574 | .map_err(|_| ServerError::Operation("Failed to set `SERVER_INFO`.".to_string()))?; 575 | 576 | let new_service = make_service_fn(move |conn: &AddrStream| { 577 | // log socket address 578 | info!(target: "stdout", "remote_addr: {}, local_addr: {}", conn.remote_addr().to_string(), conn.local_addr().to_string()); 579 | 580 | let web_ui = cli.web_ui.to_string_lossy().to_string(); 581 | let chunk_capacity = cli.chunk_capacity; 582 | 583 | async move { 584 | Ok::<_, Error>(service_fn(move |req| { 585 | handle_request(req, chunk_capacity, web_ui.clone()) 586 | })) 587 | } 588 | }); 589 | 590 | let tcp_listener = TcpListener::bind(addr).await.unwrap(); 591 | info!(target: "stdout", "Listening on {}", addr); 592 | 593 | let server = Server::from_tcp(tcp_listener.into_std().unwrap()) 594 | .unwrap() 595 | .serve(new_service); 596 | 597 | match server.await { 598 | Ok(_) => Ok(()), 599 | Err(e) => Err(ServerError::Operation(e.to_string())), 600 | } 601 | } 602 | 603 | async fn handle_request( 604 | req: Request, 605 | chunk_capacity: usize, 606 | web_ui: String, 607 | ) -> Result, hyper::Error> { 608 | let path_str = req.uri().path(); 609 | let path_buf = PathBuf::from(path_str); 610 | let mut path_iter = path_buf.iter(); 611 | path_iter.next(); // Must be Some(OsStr::new(&path::MAIN_SEPARATOR.to_string())) 612 | let root_path = path_iter.next().unwrap_or_default(); 613 | let root_path = "/".to_owned() + root_path.to_str().unwrap_or_default(); 614 | 615 | // check if the API key is valid 616 | if let Some(auth_header) = req.headers().get("authorization") { 617 | if !auth_header.is_empty() { 618 | let auth_header = match auth_header.to_str() { 619 | Ok(auth_header) => auth_header, 620 | Err(e) => { 621 | let err_msg = format!("Failed to get authorization header: {}", e); 622 | return Ok(error::unauthorized(err_msg)); 623 | } 624 | }; 625 | 626 | let api_key = auth_header.split(" ").nth(1).unwrap_or_default(); 627 | info!(target: "stdout", "API Key: {}", api_key); 628 | 629 | if let Some(stored_api_key) = LLAMA_API_KEY.get() { 630 | if api_key != stored_api_key { 631 | let err_msg = "Invalid API key."; 632 | return Ok(error::unauthorized(err_msg)); 633 | } 634 | } 635 | } 636 | } 637 | 638 | // log request 639 | { 640 | let method = hyper::http::Method::as_str(req.method()).to_string(); 641 | let path = req.uri().path().to_string(); 642 | let version = format!("{:?}", req.version()); 643 | if req.method() == hyper::http::Method::POST { 644 | let size: u64 = match req.headers().get("content-length") { 645 | Some(content_length) => content_length.to_str().unwrap().parse().unwrap(), 646 | None => 0, 647 | }; 648 | 649 | info!(target: "stdout", "method: {}, http_version: {}, content-length: {}", method, version, size); 650 | info!(target: "stdout", "endpoint: {}", path); 651 | } else { 652 | info!(target: "stdout", "method: {}, http_version: {}", method, version); 653 | info!(target: "stdout", "endpoint: {}", path); 654 | } 655 
| } 656 | 657 | let response = match root_path.as_str() { 658 | "/echo" => Response::new(Body::from("echo test")), 659 | "/v1" => backend::handle_llama_request(req, chunk_capacity).await, 660 | _ => static_response(path_str, web_ui), 661 | }; 662 | 663 | // log response 664 | { 665 | let status_code = response.status(); 666 | if status_code.as_u16() < 400 { 667 | // log response 668 | let response_version = format!("{:?}", response.version()); 669 | info!(target: "stdout", "response_version: {}", response_version); 670 | let response_body_size: u64 = response.body().size_hint().lower(); 671 | info!(target: "stdout", "response_body_size: {}", response_body_size); 672 | let response_status = status_code.as_u16(); 673 | info!(target: "stdout", "response_status: {}", response_status); 674 | let response_is_success = status_code.is_success(); 675 | info!(target: "stdout", "response_is_success: {}", response_is_success); 676 | } else { 677 | let response_version = format!("{:?}", response.version()); 678 | error!(target: "stdout", "response_version: {}", response_version); 679 | let response_body_size: u64 = response.body().size_hint().lower(); 680 | error!(target: "stdout", "response_body_size: {}", response_body_size); 681 | let response_status = status_code.as_u16(); 682 | error!(target: "stdout", "response_status: {}", response_status); 683 | let response_is_success = status_code.is_success(); 684 | error!(target: "stdout", "response_is_success: {}", response_is_success); 685 | let response_is_client_error = status_code.is_client_error(); 686 | error!(target: "stdout", "response_is_client_error: {}", response_is_client_error); 687 | let response_is_server_error = status_code.is_server_error(); 688 | error!(target: "stdout", "response_is_server_error: {}", response_is_server_error); 689 | } 690 | } 691 | 692 | Ok(response) 693 | } 694 | 695 | fn static_response(path_str: &str, root: String) -> Response { 696 | let path = match path_str { 697 | "/" => "/index.html", 698 | _ => path_str, 699 | }; 700 | 701 | let mime = mime_guess::from_path(path); 702 | 703 | match std::fs::read(format!("{root}/{path}")) { 704 | Ok(content) => Response::builder() 705 | .status(StatusCode::OK) 706 | .header(header::CONTENT_TYPE, mime.first_or_text_plain().to_string()) 707 | .body(Body::from(content)) 708 | .unwrap(), 709 | Err(_) => { 710 | let body = Body::from(std::fs::read(format!("{root}/404.html")).unwrap_or_default()); 711 | Response::builder() 712 | .status(StatusCode::NOT_FOUND) 713 | .header(header::CONTENT_TYPE, "text/html") 714 | .body(body) 715 | .unwrap() 716 | } 717 | } 718 | } 719 | 720 | #[derive(Debug, Clone, Default, Serialize, Deserialize)] 721 | pub(crate) struct QdrantConfig { 722 | pub(crate) url: String, 723 | pub(crate) collection_name: String, 724 | pub(crate) limit: u64, 725 | pub(crate) score_threshold: f32, 726 | } 727 | impl fmt::Display for QdrantConfig { 728 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 729 | write!( 730 | f, 731 | "url: {}, collection_name: {}, limit: {}, score_threshold: {}", 732 | self.url, self.collection_name, self.limit, self.score_threshold 733 | ) 734 | } 735 | } 736 | 737 | #[derive(Debug, Serialize, Deserialize)] 738 | pub(crate) struct ModelConfig { 739 | // model name 740 | name: String, 741 | // type: chat or embedding 742 | #[serde(rename = "type")] 743 | ty: String, 744 | pub ctx_size: u64, 745 | pub batch_size: u64, 746 | pub ubatch_size: u64, 747 | pub prompt_template: PromptTemplateType, 748 | pub n_predict: i32, 749 | 
#[serde(skip_serializing_if = "Option::is_none")] 750 | pub reverse_prompt: Option, 751 | pub n_gpu_layers: u64, 752 | pub temperature: f64, 753 | pub top_p: f64, 754 | pub repeat_penalty: f64, 755 | pub presence_penalty: f64, 756 | pub frequency_penalty: f64, 757 | pub split_mode: String, 758 | #[serde(skip_serializing_if = "Option::is_none")] 759 | pub main_gpu: Option, 760 | #[serde(skip_serializing_if = "Option::is_none")] 761 | pub tensor_split: Option, 762 | } 763 | 764 | #[derive(Debug, Serialize, Deserialize)] 765 | pub(crate) struct ServerInfo { 766 | #[serde(skip_serializing_if = "Option::is_none")] 767 | #[serde(rename = "node_version")] 768 | node: Option, 769 | #[serde(rename = "api_server")] 770 | server: ApiServer, 771 | #[serde(flatten)] 772 | rag_config: RagConfig, 773 | qdrant_config: Vec, 774 | extras: HashMap, 775 | } 776 | 777 | #[derive(Debug, Serialize, Deserialize)] 778 | pub(crate) struct ApiServer { 779 | #[serde(rename = "type")] 780 | ty: String, 781 | version: String, 782 | #[serde(rename = "ggml_plugin_version")] 783 | plugin_version: String, 784 | port: String, 785 | } 786 | 787 | #[derive(Debug, Serialize, Deserialize)] 788 | pub(crate) struct RagConfig { 789 | pub chat_model: ModelConfig, 790 | pub embedding_model: ModelConfig, 791 | #[serde(rename = "rag_policy")] 792 | pub policy: MergeRagContextPolicy, 793 | } 794 | 795 | #[derive(Debug, Clone, Default)] 796 | pub(crate) struct KeywordSearchConfig { 797 | pub url: String, 798 | } 799 | -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | use url::Url; 3 | 4 | pub(crate) fn is_valid_url(url: &str) -> bool { 5 | Url::parse(url).is_ok() 6 | } 7 | 8 | pub(crate) fn gen_chat_id() -> String { 9 | format!("chatcmpl-{}", uuid::Uuid::new_v4()) 10 | } 11 | 12 | #[derive( 13 | Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, clap::ValueEnum, Serialize, Deserialize, 14 | )] 15 | #[serde(rename_all = "lowercase")] 16 | pub(crate) enum LogLevel { 17 | /// Describes messages about the values of variables and the flow of 18 | /// control within a program. 19 | Trace, 20 | 21 | /// Describes messages likely to be of interest to someone debugging a 22 | /// program. 23 | Debug, 24 | 25 | /// Describes messages likely to be of interest to someone monitoring a 26 | /// program. 27 | Info, 28 | 29 | /// Describes messages indicating hazardous situations. 30 | Warn, 31 | 32 | /// Describes messages indicating serious errors. 33 | Error, 34 | 35 | /// Describes messages indicating fatal errors. 
36 |     Critical,
37 | }
38 | impl From<LogLevel> for log::LevelFilter {
39 |     fn from(level: LogLevel) -> Self {
40 |         match level {
41 |             LogLevel::Trace => log::LevelFilter::Trace,
42 |             LogLevel::Debug => log::LevelFilter::Debug,
43 |             LogLevel::Info => log::LevelFilter::Info,
44 |             LogLevel::Warn => log::LevelFilter::Warn,
45 |             LogLevel::Error => log::LevelFilter::Error,
46 |             LogLevel::Critical => log::LevelFilter::Error,
47 |         }
48 |     }
49 | }
50 | impl std::fmt::Display for LogLevel {
51 |     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
52 |         match self {
53 |             LogLevel::Trace => write!(f, "trace"),
54 |             LogLevel::Debug => write!(f, "debug"),
55 |             LogLevel::Info => write!(f, "info"),
56 |             LogLevel::Warn => write!(f, "warn"),
57 |             LogLevel::Error => write!(f, "error"),
58 |             LogLevel::Critical => write!(f, "critical"),
59 |         }
60 |     }
61 | }
62 | impl std::str::FromStr for LogLevel {
63 |     type Err = String;
64 |     fn from_str(s: &str) -> Result<Self, Self::Err> {
65 |         match s.to_lowercase().as_str() {
66 |             "trace" => Ok(LogLevel::Trace),
67 |             "debug" => Ok(LogLevel::Debug),
68 |             "info" => Ok(LogLevel::Info),
69 |             "warn" => Ok(LogLevel::Warn),
70 |             "error" => Ok(LogLevel::Error),
71 |             "critical" => Ok(LogLevel::Critical),
72 |             _ => Err(format!("Invalid log level: {}", s)),
73 |         }
74 |     }
75 | }
76 | 
--------------------------------------------------------------------------------
/tests/test_chat.hurl:
--------------------------------------------------------------------------------
1 | # test /v1/models endpoint
2 | GET http://localhost:8080/v1/models
3 | screencapability: low
4 | HTTP 200
5 | [Asserts]
6 | jsonpath "$.data[0].id" == "Qwen2-1.5B-Instruct"
7 | 
8 | # test /v1/chat/completions endpoint
9 | POST http://localhost:8080/v1/chat/completions
10 | Accept: application/json
11 | Content-Type: application/json
12 | ```json
13 | {
14 |     "messages": [
15 |         {
16 |             "role": "user",
17 |             "content": "What is the capital of France?"
18 |         }
19 |     ],
20 |     "model": "Qwen2-1.5B-Instruct",
21 |     "stream": false
22 | }
23 | ```
24 | HTTP 200
25 | [Asserts]
26 | jsonpath "$.model" == "Qwen2-1.5B-Instruct"
27 | jsonpath "$.choices[0].message.content" contains "Paris"
28 | 
29 | 
30 | # test /v1/chat/completions endpoint
31 | # Test purpose: The model name is incorrect
32 | POST http://localhost:8080/v1/chat/completions
33 | Accept: application/json
34 | Content-Type: application/json
35 | ```json
36 | {
37 |     "messages": [
38 |         {
39 |             "role": "user",
40 |             "content": "What is the capital of France?"
41 |         }
42 |     ],
43 |     "model": "Qwen2-1.5B-Instruct-invalid",
44 |     "stream": false
45 | }
46 | ```
47 | HTTP 200
48 | [Asserts]
49 | jsonpath "$.model" == "Qwen2-1.5B-Instruct"
50 | jsonpath "$.choices[0].message.content" contains "Paris"
--------------------------------------------------------------------------------
/tests/test_embeddings.hurl:
--------------------------------------------------------------------------------
1 | 
2 | # test /v1/embeddings endpoint
3 | POST http://localhost:8080/v1/embeddings
4 | Accept: application/json
5 | Content-Type: application/json
6 | ```json
7 | {
8 |     "model": "nomic-embed-text-v1.5",
9 |     "input": [
10 |         "Gaianet is revolutionizing the AI landscape with a distributed AI infrastructure that seeks to decentralize the dominance of major players such as OpenAI, Google, and Anthropic. By leveraging a network of edge-computing nodes owned by individuals around the world, Gaianet enables hosting of both open-source and finely-tuned models. This infrastructure is designed to cater to diverse AI demands, offering a scalable alternative to traditional centralized servers.",
11 |         "The inception of Gaianet is driven by the necessity to address key issues in the current AI industry: censorship and bias in AI outputs, lack of privacy for user data, and the high costs associated with accessing and developing on centralized AI models. These challenges have restricted the dissemination of unbiased information, compromised data security, and erected barriers to innovation and broader application of AI technologies.",
12 |         "In an effort to mitigate these issues, Gaianet proposes a model that is inherently more inclusive and secure. It offers a platform where individuals can contribute their computing resources, expertise, and proprietary data to foster a collaborative advancement of AI technology. This approach not only democratizes access to AI but also aligns with principles of privacy and equitable distribution of resources.",
13 |         "The Gaianet ecosystem comprises four key components:\nComponent 1, The capability for individuals to run an Agent style API service on private devices, utilizing proprietary knowledge and finely-tuned models.\nComponent 2, Compatibility with the OpenAI API to ensure seamless integration with existing toolchains.\nComponent 3, A decentralized marketplace for AI services and components.\nComponent 4, Open-source development tools for creating LLM/AI applications.",
14 |         "Features of Gaia nodes are extensive and include integrated data, knowledge, and computing resources; support for open-source and OpenAI models; customization with fine-tuned LLMs; and the ability to chain multiple specialized models for enhanced functionality. Additionally, nodes offer knowledge bases for LLM enhancements, memorized chat history for extended conversations, and the capability for actions and function calls to external systems.",
15 |         "Gaianet also introduces a suite of ancillary offerings aimed at developers, including tools for fine-tuning LLMs, marketplaces for fine-tuned models and embeddings, and SDKs for various integrations.",
16 |         "The team behind Gaianet is spearheaded by CEO Matt Wright, previously Director of Fellowship Accelerator at Consensys and Director of DAOs. Chief Scientist Allen Yang brings his expertise from the University of California at Berkeley, while COO Shashank Sripada contributes his entrepreneurial experience. Advisors Michael Yuan and Brian Shin add further depth to the project's foundation.",
17 |         "Gaianet's official website can be accessed at [https://www.gaianet.ai/](https://www.gaianet.ai/), where you can find comprehensive information and resources about their initiatives.\n\nFor community engagement and discussions, Gaianet has a Discord server available at [https://discord.com/invite/gaianet-ai](https://discord.com/invite/gaianet-ai), welcoming members to join.",
18 |         "To stay updated with Gaianet's latest news and insights, you can follow their Telegram channel at [https://t.me/Gaianet_AI](https://t.me/Gaianet_AI).\n\nInsightful articles and updates from Gaianet are regularly published on their Medium blog at [https://medium.com/@Gaianet.ai](https://medium.com/@Gaianet.ai).",
19 |         "For the latest announcements and engagements, follow Gaianet on Twitter at [https://twitter.com/Gaianet_AI](https://twitter.com/Gaianet_AI).\n\nDevelopers and contributors can explore Gaianet's GitHub repository at [https://github.com/GaiaNet-AI/](https://github.com/GaiaNet-AI/)."
20 |     ]
21 | }
22 | ```
23 | HTTP 200
24 | [Asserts]
25 | jsonpath "$.model" == "nomic-embed-text-v1.5"
26 | jsonpath "$.data" count > 0
27 | 
28 | # test /v1/embeddings endpoint
29 | # Test purpose: The model name is incorrect
30 | POST http://localhost:8080/v1/embeddings
31 | Accept: application/json
32 | Content-Type: application/json
33 | ```json
34 | {
35 |     "model": "nomic-embed-text-v1.5-invalid",
36 |     "input": [
37 |         "Gaianet is revolutionizing the AI landscape with a distributed AI infrastructure that seeks to decentralize the dominance of major players such as OpenAI, Google, and Anthropic. By leveraging a network of edge-computing nodes owned by individuals around the world, Gaianet enables hosting of both open-source and finely-tuned models. This infrastructure is designed to cater to diverse AI demands, offering a scalable alternative to traditional centralized servers.",
38 |         "The inception of Gaianet is driven by the necessity to address key issues in the current AI industry: censorship and bias in AI outputs, lack of privacy for user data, and the high costs associated with accessing and developing on centralized AI models. These challenges have restricted the dissemination of unbiased information, compromised data security, and erected barriers to innovation and broader application of AI technologies.",
39 |         "In an effort to mitigate these issues, Gaianet proposes a model that is inherently more inclusive and secure. It offers a platform where individuals can contribute their computing resources, expertise, and proprietary data to foster a collaborative advancement of AI technology. This approach not only democratizes access to AI but also aligns with principles of privacy and equitable distribution of resources.",
40 |         "The Gaianet ecosystem comprises four key components:\nComponent 1, The capability for individuals to run an Agent style API service on private devices, utilizing proprietary knowledge and finely-tuned models.\nComponent 2, Compatibility with the OpenAI API to ensure seamless integration with existing toolchains.\nComponent 3, A decentralized marketplace for AI services and components.\nComponent 4, Open-source development tools for creating LLM/AI applications.",
41 |         "Features of Gaia nodes are extensive and include integrated data, knowledge, and computing resources; support for open-source and OpenAI models; customization with fine-tuned LLMs; and the ability to chain multiple specialized models for enhanced functionality. Additionally, nodes offer knowledge bases for LLM enhancements, memorized chat history for extended conversations, and the capability for actions and function calls to external systems.",
42 |         "Gaianet also introduces a suite of ancillary offerings aimed at developers, including tools for fine-tuning LLMs, marketplaces for fine-tuned models and embeddings, and SDKs for various integrations.",
43 |         "The team behind Gaianet is spearheaded by CEO Matt Wright, previously Director of Fellowship Accelerator at Consensys and Director of DAOs. Chief Scientist Allen Yang brings his expertise from the University of California at Berkeley, while COO Shashank Sripada contributes his entrepreneurial experience. Advisors Michael Yuan and Brian Shin add further depth to the project's foundation.",
44 |         "Gaianet's official website can be accessed at [https://www.gaianet.ai/](https://www.gaianet.ai/), where you can find comprehensive information and resources about their initiatives.\n\nFor community engagement and discussions, Gaianet has a Discord server available at [https://discord.com/invite/gaianet-ai](https://discord.com/invite/gaianet-ai), welcoming members to join.",
45 |         "To stay updated with Gaianet's latest news and insights, you can follow their Telegram channel at [https://t.me/Gaianet_AI](https://t.me/Gaianet_AI).\n\nInsightful articles and updates from Gaianet are regularly published on their Medium blog at [https://medium.com/@Gaianet.ai](https://medium.com/@Gaianet.ai).",
46 |         "For the latest announcements and engagements, follow Gaianet on Twitter at [https://twitter.com/Gaianet_AI](https://twitter.com/Gaianet_AI).\n\nDevelopers and contributors can explore Gaianet's GitHub repository at [https://github.com/GaiaNet-AI/](https://github.com/GaiaNet-AI/)."
47 |     ]
48 | }
49 | ```
50 | HTTP 200
51 | [Asserts]
52 | jsonpath "$.model" == "nomic-embed-text-v1.5"
53 | jsonpath "$.data" count > 0
--------------------------------------------------------------------------------
/tests/test_rag.hurl:
--------------------------------------------------------------------------------
1 | 
2 | # test /v1/chat/completions endpoint
3 | POST http://localhost:8080/v1/chat/completions
4 | Accept: application/json
5 | Content-Type: application/json
6 | ```json
7 | {
8 |     "messages": [
9 |         {
10 |             "role": "user",
11 |             "content": "What is the location of Paris, France along the Seine River?"
12 |         }
13 |     ],
14 |     "model": "Qwen2-1.5B-Instruct",
15 |     "stream": false
16 | }
17 | ```
18 | HTTP 200
19 | [Asserts]
20 | jsonpath "$.model" == "Qwen2-1.5B-Instruct"
21 | jsonpath "$.choices[0].message.content" contains "Paris"
22 | 
--------------------------------------------------------------------------------
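A minimal sketch of how the Hurl files above might be exercised locally. It assumes a rag-api-server instance is already running and listening on http://localhost:8080 with the Qwen2-1.5B-Instruct chat model and the nomic-embed-text-v1.5 embedding model loaded (the port and model names are the ones hard-coded in the tests), and that the Hurl CLI is installed; the server start-up command itself is not shown here.

```bash
# Run each test file in Hurl's test mode against the already-running server.
# Assumes: server on localhost:8080, models named as in the .hurl files above.
hurl --test tests/test_chat.hurl
hurl --test tests/test_embeddings.hurl
hurl --test tests/test_rag.hurl
```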