├── .dockerignore ├── .github ├── dependabot.yaml └── workflows │ ├── build-push-kubeai.yml │ ├── build-push-model-loader.yml │ ├── create-gh-release.yml │ ├── docs-lint.yml │ ├── helm-lint.yml │ ├── publish-docs.yml │ ├── release-helm-chart.yml │ └── tests.yml ├── .gitignore ├── .golangci.yml ├── Dockerfile ├── LICENSE ├── Makefile ├── PROJECT ├── api ├── k8s │ └── v1 │ │ ├── groupversion_info.go │ │ ├── metadata.go │ │ ├── model_types.go │ │ └── zz_generated.deepcopy.go └── openai │ └── v1 │ ├── README.md │ ├── chat_completions.go │ ├── chat_completions_test.go │ ├── completions.go │ ├── completions_test.go │ ├── embeddings.go │ ├── embeddings_test.go │ ├── reference │ ├── chat_completions.openai.openapi.yaml │ ├── completions.openai.openapi.yaml │ ├── embeddings.openai.openapi.yaml │ ├── example-requests.ollama.output │ ├── example-requests.openai.output │ ├── example-requests.sh │ └── example-requests.vllm.output │ ├── usage.go │ ├── utils.go │ └── utils_test.go ├── benchmarks ├── chat-py │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── backend_request_func.py │ ├── benchmark_serving.py │ ├── job.yaml │ ├── requirements.txt │ └── vllm-direct-service.yaml ├── multi-turn-chat-go │ ├── .dockerignore │ ├── .gitignore │ ├── .python-version │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ ├── benchmark │ │ ├── runner.go │ │ └── runner_test.go │ ├── dashs │ │ └── vLLM-1740366828970.json │ ├── data │ │ └── prepare-input-threads.py │ ├── go.mod │ ├── go.sum │ ├── hack │ │ ├── Dockerfile │ │ ├── bench-pod.yaml │ │ ├── chat-template.jinja │ │ ├── kubeai-config.json │ │ ├── llama-3.1-8b-instruct-fp8-l4.yaml │ │ ├── model.yaml │ │ ├── ollama-config.json │ │ ├── openai-config.json │ │ ├── pod.opt-125m.yaml │ │ ├── pod.qwen.yaml │ │ ├── pod.yaml │ │ ├── podmonitor.yaml │ │ ├── tokenizer │ │ │ ├── tokenizer.go │ │ │ └── tokens.py │ │ └── vllm.Dockerfile │ ├── main.go │ ├── pyproject.toml │ ├── run.ipynb │ ├── runs │ │ └── llama-3.1-8x-l4 │ │ │ ├── itl.png │ │ │ ├── run.ipynb │ │ │ ├── throughput.png │ │ │ └── ttft.png │ └── uv.lock └── multi-turn-chat-k6 │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── Makefile │ ├── data │ └── prepare-message-threads.py │ ├── k6.js │ └── scenarios │ ├── least-load-vs-prefix-hash-70b-8r │ ├── README.md │ ├── base-request.json │ ├── k6.json │ ├── model.yaml │ └── pod.yaml │ └── least-load-vs-prefix-hash │ ├── README.md │ ├── base-request.json │ ├── k6.json │ ├── model.yaml │ └── pod.yaml ├── charts ├── .gitignore ├── kubeai │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ │ ├── _helpers.tpl │ │ ├── autoscalerstateconfigmap.yaml │ │ ├── aws-secret.yaml │ │ ├── configmap.yaml │ │ ├── crds │ │ │ └── kubeai.org_models.yaml │ │ ├── deployment.yaml │ │ ├── huggingface-secret.yaml │ │ ├── ingress.yaml │ │ ├── role.yaml │ │ ├── rolebinding.yaml │ │ ├── securityContextConstraints.yaml │ │ ├── service.yaml │ │ ├── serviceaccount.yaml │ │ └── vllm-pod-monitor.yaml │ ├── values-amd-gpu-device-plugin.yaml │ ├── values-eks.yaml │ ├── values-gke.yaml │ ├── values-nvidia-k8s-device-plugin.yaml │ └── values.yaml └── models │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ └── models.yaml │ └── values.yaml ├── cmd └── main.go ├── components └── model-loader │ ├── Dockerfile │ └── load.sh ├── docs ├── CNAME ├── README.md ├── benchmarks │ ├── llama-3.2-11b-vision.md │ ├── prefix-aware-load-balancing-mean-ttft.png │ ├── prefix-aware-load-balancing-throughput.png │ └── prefix-aware-load-balancing.md ├── blog │ ├── .authors.yml │ ├── index.md │ └── posts │ │ └── 
llm-load-balancing-at-scale-chwbl.md ├── concepts │ ├── autoscaling.md │ ├── backend-servers.md │ ├── load-balancing.md │ ├── lora-adapters.md │ ├── resource-profiles.md │ └── storage-caching.md ├── contributing │ ├── development-environment.md │ ├── development-guide.md │ ├── documentation.md │ └── release-process.md ├── diagrams │ ├── arch.excalidraw.png │ ├── autoscaling.excalidraw.png │ ├── caching-shared-filesystem.excalidraw.png │ ├── chwbl.excalidraw.png │ ├── lora-direct-loading.excalidraw.png │ ├── multi-threaded-shared-context.excalidraw.png │ ├── multi-turn-clients.excalidraw.png │ ├── multitenancy-labels.excalidraw.png │ ├── private-deep-chat.excalidraw.png │ └── random-vs-consistent-hash.excalidraw.png ├── graphs │ ├── throughput-benchmark.png │ └── ttft-benchmark.png ├── how-to │ ├── architect-for-multitenancy.md │ ├── authenticate-to-model-repos.md │ ├── build-models-into-containers.md │ ├── cache-models-with-aws-efs.md │ ├── cache-models-with-gcp-filestore.md │ ├── configure-autoscaling.md │ ├── configure-embedding-models.md │ ├── configure-resource-profiles.md │ ├── configure-speech-to-text.md │ ├── configure-text-generation-models.md │ ├── install-models.md │ ├── load-models-from-pvc.md │ ├── observability-with-prometheus-stack.md │ └── serve-lora-adapters.md ├── index.yaml ├── installation │ ├── aks.md │ ├── any.md │ ├── eks.md │ └── gke.md ├── overrides │ └── partials │ │ └── integrations │ │ └── analytics │ │ └── custom.html ├── reference │ ├── .kubernetes-api │ │ └── config.yaml │ ├── kubernetes-api.md │ └── openai-api-compatibility.md ├── requirements.txt ├── screenshots │ ├── gcp-cpus-all-regions.png │ ├── gcp-gpus-all-regions.png │ ├── gcp-quota-preemptible-nvidia-l4-gpus-regional.png │ ├── gcp-quota-premium-storage-gb-per-region.png │ ├── gcp-tpu-preemptible-v5e-quota.png │ ├── langtrace.png │ └── private-deep-chat.png └── tutorials │ ├── langchain.md │ ├── langtrace.md │ ├── private-deep-chat.md │ └── weaviate.md ├── examples ├── k8s-api-clients │ └── python │ │ ├── .gitignore │ │ ├── example.py │ │ └── requirements.txt ├── observability │ └── vllm-grafana-dashboard.json ├── ollama-builtin │ ├── Dockerfile │ └── download.sh ├── ollama-pvc │ ├── job.yaml │ └── pvc.yaml ├── priority-examples │ ├── README.md │ ├── background-research-model.yaml │ ├── critical-service-model.yaml │ ├── hello-world-llm.yaml │ └── priority-classes.yaml ├── private-deep-chat │ ├── Dockerfile │ ├── go.mod │ ├── main.go │ ├── manifests │ │ ├── deployment.yaml │ │ ├── models.yaml │ │ └── service.yaml │ └── static │ │ └── index.html └── storage-classes │ └── gcp-filestore.yaml ├── go.mod ├── go.sum ├── hack ├── apply-model.sh ├── boilerplate.go.txt ├── create-dev-gke-cluster.sh ├── dev-configs │ ├── gke.yaml │ └── kind.yaml ├── dev-gke-helm-values.yaml ├── dev-load │ ├── k6.js │ ├── pod.yaml │ └── run.sh ├── dev-models │ ├── kind-cpu-adapters.yaml │ ├── kind-cpu.yaml │ ├── kind-vllm-cpu.yaml │ ├── vllm-chat.yaml │ ├── vllm-gs-url.yaml │ ├── vllm-s3-url.yaml │ └── vllm-with-adapters.yaml ├── filter-openapi-components.py ├── pvs │ └── preprov-filestore.yaml ├── vllm-mock-metrics │ ├── main.go │ └── metrics.txt └── volume-debug-pod.yaml ├── internal ├── apiutils │ ├── model.go │ ├── model_test.go │ ├── request.go │ └── request_test.go ├── config │ ├── system.go │ └── system_test.go ├── k8sutils │ ├── apply.go │ ├── client_options.go │ ├── gvk.go │ ├── jobs.go │ ├── meta.go │ ├── meta_test.go │ └── pods.go ├── leader │ └── election.go ├── loadbalancer │ ├── balance_chwbl.go │ ├── 
balance_least_load.go │ ├── group.go │ ├── group_bench_test.go │ ├── group_test.go │ ├── load_balancer.go │ └── load_balancer_test.go ├── manager │ ├── configure.go │ ├── otel.go │ └── run.go ├── messenger │ └── messenger.go ├── metrics │ ├── metrics.go │ └── metricstest │ │ └── metricstest.go ├── modelautoscaler │ ├── autoscaler.go │ ├── metrics.go │ └── state.go ├── modelclient │ ├── client.go │ └── scale.go ├── modelcontroller │ ├── adapters.go │ ├── cache.go │ ├── engine_fasterwhisper.go │ ├── engine_infinity.go │ ├── engine_ollama.go │ ├── engine_ollama_test.go │ ├── engine_vllm.go │ ├── files.go │ ├── files_test.go │ ├── model_controller.go │ ├── model_controller_test.go │ ├── model_source.go │ ├── model_source_test.go │ ├── patch.go │ ├── patch_test.go │ ├── pod_plan.go │ ├── pod_plan_test.go │ └── pod_utils.go ├── modelproxy │ ├── handler.go │ ├── handler_test.go │ └── request.go ├── movingaverage │ ├── simple.go │ └── simple_test.go ├── openaiserver │ ├── handler.go │ └── models.go └── vllmclient │ └── client.go ├── manifests └── models │ ├── bge-embed-text-cpu.yaml │ ├── deepseek-r1-1.5b-cpu.yaml │ ├── deepseek-r1-70b-gh200-fp8.yaml │ ├── deepseek-r1-70b-gh200.yaml │ ├── deepseek-r1-distill-llama-8b-l4.yaml │ ├── deepseek-r1-distill-qwen-1.5b-rtx4070.yaml │ ├── deepseek-r1-mi300x.yaml │ ├── e5-mistral-7b-instruct-cpu.yaml │ ├── faster-whisper-medium-en-cpu.yaml │ ├── gemma-2-9b-it-fp8-l4.yaml │ ├── gemma-27b-ollama-l4.yaml │ ├── gemma-2b-it-tpu.yaml │ ├── gemma-3-12b-ollama-l4.yaml │ ├── gemma-3-27b-ollama-l4.yaml │ ├── gemma-9b-ollama-l4.yaml │ ├── gemma2-2b-cpu.yaml │ ├── granite-3.1-dense-ollama-l4.yaml │ ├── llama-3.1-405b-instruct-fp8-a100-80b.yaml │ ├── llama-3.1-405b-instruct-fp8-h100.yaml │ ├── llama-3.1-405b-instruct-fp8-mi300x.yaml │ ├── llama-3.1-70b-instruct-awq-int4-gh200.yaml │ ├── llama-3.1-70b-instruct-fp8-1-h100.yaml │ ├── llama-3.1-70b-instruct-fp8-gh200.yaml │ ├── llama-3.1-70b-instruct-fp8-h100.yaml │ ├── llama-3.1-70b-instruct-fp8-l4.yaml │ ├── llama-3.1-70b-instruct-fp8-mi300x.yaml │ ├── llama-3.1-8b-instruct-cpu.yaml │ ├── llama-3.1-8b-instruct-fp8-l4.yaml │ ├── llama-3.1-8b-instruct-tpu.yaml │ ├── llama-3.1-supernova-lite-l4.yaml │ ├── llama-3.1-tulu-3-8b-l4.yaml │ ├── llama-3.2-11b-vision-instruct-l4.yaml │ ├── llama-3.3-70b-instruct-bf16-gh200.yaml │ ├── llama-3.3-70b-ollama-l4.yaml │ ├── llama-4-maverick-430k-h100.yaml │ ├── mistral-small-24b-instruct-h100.yaml │ ├── mistral-small-3.1-24b-instruct-h100.yaml │ ├── nomic-embed-text-cpu.yaml │ ├── opt-125m-cpu.yaml │ ├── opt-125m-l4.yaml │ ├── phi-4-bnb-4bit-l4.yaml │ ├── phi-4-ollama-l4.yaml │ ├── qwen2-500m-cpu.yaml │ ├── qwen2.5-7b-cpu.yaml │ ├── qwen2.5-7b-instruct-l4.yaml │ ├── qwen2.5-coder-1.5b-cpu.yaml │ └── qwen2.5-coder-1.5b-rtx4070-8gb.yaml ├── mkdocs.yml ├── proposals ├── diagrams │ ├── auth-with-label-selector.excalidraw.png │ ├── cache-optimized-routing.excalidraw.png │ ├── lora-direct-loading.excalidraw.png │ ├── lora.excalidraw.png │ ├── model-mgmt-buckets.excalidraw.png │ └── model-mgmt-volumes.excalidraw.png ├── lora-adapters.md ├── model-storage.md └── multitenancy.md ├── skaffold-build.json ├── skaffold-tags.json ├── skaffold.yaml └── test ├── e2e-manual ├── gke-vllm-adapters │ ├── model.yaml │ └── run.sh └── gke-vllm-gpu-tpu │ └── run.sh ├── e2e ├── autoscaler-restart-no-load │ ├── k6-pod.yaml │ ├── k6.js │ ├── model.yaml │ ├── skaffold.yaml │ ├── test.sh │ └── values.yaml ├── autoscaler-restart-under-load │ ├── k6-pod.yaml │ ├── k6.js │ ├── model.yaml │ ├── skaffold.yaml │ ├── test.sh 
│ └── values.yaml ├── cache-shared-filesystem │ ├── cache-mount-pod.yaml │ └── test.sh ├── common-manifests.yaml ├── common.sh ├── engine-fasterwhisper │ └── test.sh ├── engine-infinity │ └── test.sh ├── engine-ollama-pvc │ ├── ollama-hydrate-job.yaml │ ├── pv.yaml │ ├── pvc.yaml │ └── test.sh ├── engine-vllm-pvc │ ├── pv.yaml │ ├── pvc.yaml │ └── test.sh ├── model-files │ └── test.sh ├── openai-python-client │ ├── .gitignore │ ├── requirements.txt │ ├── test.py │ └── test.sh ├── quickstart │ └── test.sh ├── rollouts │ └── test.sh ├── run.sh ├── s3-model │ ├── model.yaml │ ├── pv.yaml │ ├── pvc.yaml │ ├── s3-instance.yaml │ ├── skaffold.yaml │ ├── test.sh │ ├── upload-model-to-s3.yaml │ └── values.yaml ├── skaffold.default.yaml └── values.default.yaml ├── integration ├── adapter_test.go ├── autoscaler_state_test.go ├── autoscaling_ha_test.go ├── cache_shared_filesystem_test.go ├── main_test.go ├── messenger_test.go ├── model_default_test.go ├── model_files_test.go ├── model_pod_recovery_test.go ├── model_pod_update_rollout_test.go ├── model_priority_test.go ├── model_profiles_test.go ├── model_scaling_bounds_test.go ├── model_validation_test.go ├── proxy_test.go ├── selector_test.go └── utils_test.go └── utils └── utils.go /.dockerignore: -------------------------------------------------------------------------------- 1 | # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file 2 | # Ignore build and test binaries. 3 | bin/ 4 | benchmarks/ 5 | charts/ 6 | components/ 7 | docs/ 8 | examples/ 9 | manifests/ 10 | proposals/ 11 | test/ 12 | tmp/ -------------------------------------------------------------------------------- /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "gomod" 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "01:00" 8 | 9 | - package-ecosystem: "docker" 10 | directory: "/" 11 | schedule: 12 | interval: daily 13 | time: "01:00" 14 | 15 | - package-ecosystem: "github-actions" 16 | directory: "/" 17 | schedule: 18 | interval: daily 19 | time: "01:00" 20 | groups: 21 | actions-all: 22 | patterns: 23 | - "*" 24 | 25 | - package-ecosystem: "pip" 26 | directory: "/docs" 27 | schedule: 28 | interval: daily 29 | time: "01:00" 30 | 31 | - package-ecosystem: "docker" 32 | directory: "/components/model-loader" 33 | schedule: 34 | interval: daily 35 | time: "01:00" 36 | 37 | - package-ecosystem: "docker" 38 | directory: "/examples/ollama-builtin" 39 | schedule: 40 | interval: daily 41 | time: "01:00" 42 | 43 | - package-ecosystem: "gomod" 44 | directory: "/examples/private-deep-chat" 45 | schedule: 46 | interval: daily 47 | time: "01:00" 48 | 49 | - package-ecosystem: "docker" 50 | directory: "/examples/private-deep-chat" 51 | schedule: 52 | interval: daily 53 | time: "01:00" 54 | -------------------------------------------------------------------------------- /.github/workflows/create-gh-release.yml: -------------------------------------------------------------------------------- 1 | # Create a GitHub release on tag push 2 | # source: https://stackoverflow.com/a/75679739/376445 3 | name: Create GitHub Release 4 | 5 | on: 6 | push: 7 | tags: 8 | - "v*.*.*" 9 | 10 | permissions: 11 | contents: write 12 | 13 | jobs: 14 | release: 15 | name: Release pushed tag 16 | runs-on: ubuntu-22.04 17 | steps: 18 | - name: Create release 19 | env: 20 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 21 | tag: ${{ github.ref_name }} 22 | run: | 23 | gh release 
create "$tag" \ 24 | --repo="$GITHUB_REPOSITORY" \ 25 | --title="${GITHUB_REPOSITORY#*/} ${tag#v}" \ 26 | --generate-notes 27 | -------------------------------------------------------------------------------- /.github/workflows/docs-lint.yml: -------------------------------------------------------------------------------- 1 | name: Doc linter 2 | run-name: Run doc linter by @${{ github.actor }} 3 | 4 | on: 5 | push: 6 | branches: 7 | - main 8 | pull_request: 9 | 10 | jobs: 11 | mkdocs-build-strict: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout code 15 | uses: actions/checkout@v4 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: 3.x 19 | - name: Install dependencies 20 | run: pip install -r docs/requirements.txt 21 | - name: Run mkdocs in strict mode 22 | run: mkdocs build --strict 23 | -------------------------------------------------------------------------------- /.github/workflows/helm-lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint and Test Charts 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | lint-test: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout 11 | uses: actions/checkout@v4 12 | with: 13 | fetch-depth: 0 14 | 15 | - name: Set up Helm 16 | uses: azure/setup-helm@v4.2.0 17 | with: 18 | version: v3.14.4 19 | 20 | - uses: actions/setup-python@v5 21 | with: 22 | python-version: '3.x' 23 | check-latest: true 24 | 25 | - name: Set up chart-testing 26 | uses: helm/chart-testing-action@v2.6.1 27 | 28 | - name: Run chart-testing (list-changed) 29 | id: list-changed 30 | run: | 31 | changed=$(ct list-changed --target-branch ${{ github.event.repository.default_branch }}) 32 | if [[ -n "$changed" ]]; then 33 | echo "changed=true" >> "$GITHUB_OUTPUT" 34 | fi 35 | 36 | - name: Run chart-testing (lint) 37 | if: steps.list-changed.outputs.changed == 'true' 38 | run: ct lint --check-version-increment=false --target-branch ${{ github.event.repository.default_branch }} 39 | -------------------------------------------------------------------------------- /.github/workflows/publish-docs.yml: -------------------------------------------------------------------------------- 1 | name: Publish docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | publish-docs: 10 | permissions: 11 | contents: write 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: Configure Git 16 | run: | 17 | git config user.name "$GITHUB_ACTOR" 18 | git config user.email "$GITHUB_ACTOR@users.noreply.github.com" 19 | - uses: actions/setup-python@v5 20 | with: 21 | python-version: 3.x 22 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 23 | - uses: actions/cache@v4 24 | with: 25 | key: mkdocs-material-${{ env.cache_id }} 26 | path: .cache 27 | restore-keys: | 28 | mkdocs-material- 29 | - run: | 30 | git fetch origin 31 | # This is needed because otherwise mkdocs removes the index.yaml file. 32 | # Get the latest index.yaml from gh-pages branch. 
33 | git checkout gh-pages 34 | git pull origin gh-pages 35 | cp index.yaml /tmp/index.yaml 36 | git checkout main 37 | git pull origin main 38 | cp /tmp/index.yaml docs/index.yaml 39 | pip install -r docs/requirements.txt 40 | - run: make generate-kubernetes-api-reference 41 | - run: mkdocs gh-deploy -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | bin/* 8 | Dockerfile.cross 9 | 10 | # Test binary, built with `go test -c` 11 | *.test 12 | 13 | # Output of the go coverage tool, specifically when used with LiteIDE 14 | *.out 15 | 16 | # Go workspace file 17 | go.work 18 | 19 | # Kubernetes Generated files - skip generated files, except for vendored files 20 | !vendor/**/zz_generated.* 21 | 22 | # editor and IDE paraphernalia 23 | .idea 24 | .vscode 25 | *.swp 26 | *.swo 27 | *~ 28 | 29 | # Files that might be committed from running guides 30 | /kubeai.yaml 31 | /kubeai-models.yaml 32 | /helm-values.yaml 33 | /model-helm-values.yaml 34 | Chart.lock 35 | 36 | # Ignore python virtual env 37 | .venv 38 | *__pycache__ 39 | site 40 | 41 | /tmp 42 | 43 | ./charts/kubeai/charts/*.tgz 44 | 45 | .cache/ -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | run: 2 | timeout: 5m 3 | allow-parallel-runners: true 4 | 5 | issues: 6 | # don't skip warning about doc comments 7 | # don't exclude the default set of lint 8 | exclude-use-default: false 9 | # restore some of the defaults 10 | # (fill in the rest as needed) 11 | exclude-rules: 12 | - path: "api/*" 13 | linters: 14 | - lll 15 | - path: "internal/*" 16 | linters: 17 | - dupl 18 | - lll 19 | linters: 20 | disable-all: true 21 | enable: 22 | - dupl 23 | - errcheck 24 | - exportloopref 25 | - ginkgolinter 26 | - goconst 27 | - gocyclo 28 | - gofmt 29 | - goimports 30 | - gosimple 31 | - govet 32 | - ineffassign 33 | - lll 34 | - misspell 35 | - nakedret 36 | - prealloc 37 | - revive 38 | - staticcheck 39 | - typecheck 40 | - unconvert 41 | - unparam 42 | - unused 43 | 44 | linters-settings: 45 | revive: 46 | rules: 47 | - name: comment-spacings 48 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build the manager binary 2 | FROM golang:1.24.1 AS builder 3 | ARG TARGETOS 4 | ARG TARGETARCH 5 | 6 | WORKDIR /workspace 7 | # Copy the Go Modules manifests 8 | COPY go.mod go.mod 9 | COPY go.sum go.sum 10 | # cache deps before building and copying source so that we don't need to re-download as much 11 | # and so that source changes don't invalidate our downloaded layer 12 | RUN go mod download 13 | 14 | # Copy the go source 15 | COPY cmd/main.go cmd/main.go 16 | COPY api/ api/ 17 | COPY internal/ internal/ 18 | 19 | # Build 20 | # the GOARCH has not a default value to allow the binary be built according to the host where the command 21 | # was called. For example, if we call make docker-build in a local env which has the Apple Silicon M1 SO 22 | # the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore, 23 | # by leaving it empty we can ensure that the container and binary shipped on it will have the same platform. 
24 | RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o manager cmd/main.go 25 | 26 | # Use distroless as minimal base image to package the manager binary 27 | # Refer to https://github.com/GoogleContainerTools/distroless for more details 28 | FROM gcr.io/distroless/static:nonroot 29 | WORKDIR /app 30 | COPY --from=builder /workspace/manager /app/ 31 | USER 65532:65532 32 | 33 | ENTRYPOINT ["/app/manager"] 34 | -------------------------------------------------------------------------------- /PROJECT: -------------------------------------------------------------------------------- 1 | # Code generated by tool. DO NOT EDIT. 2 | # This file is used to track the info used to scaffold your project 3 | # and allow the plugins properly work. 4 | # More info: https://book.kubebuilder.io/reference/project-config.html 5 | domain: substratus.ai 6 | layout: 7 | - go.kubebuilder.io/v4 8 | projectName: kubeai 9 | repo: github.com/substratusai/kubeai 10 | resources: 11 | - api: 12 | crdVersion: v1 13 | namespaced: true 14 | controller: true 15 | domain: substratus.ai 16 | group: kubeai 17 | kind: Model 18 | path: github.com/substratusai/kubeai/api/k8s/v1 19 | version: v1 20 | version: "3" 21 | -------------------------------------------------------------------------------- /api/k8s/v1/groupversion_info.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package v1 contains API Schema definitions for the kubeai v1 API group 18 | // +kubebuilder:object:generate=true 19 | // +groupName=kubeai.org 20 | package v1 21 | 22 | import ( 23 | "k8s.io/apimachinery/pkg/runtime/schema" 24 | "sigs.k8s.io/controller-runtime/pkg/scheme" 25 | ) 26 | 27 | var ( 28 | // GroupVersion is group version used to register these objects 29 | GroupVersion = schema.GroupVersion{Group: "kubeai.org", Version: "v1"} 30 | 31 | // SchemeBuilder is used to add go types to the GroupVersionKind scheme 32 | SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} 33 | 34 | // AddToScheme adds the types in this group-version to the given scheme. 35 | AddToScheme = SchemeBuilder.AddToScheme 36 | ) 37 | -------------------------------------------------------------------------------- /api/k8s/v1/metadata.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | const ( 4 | PodModelLabel = "model" 5 | // PodHashLabel is a label key used to store the hash of the Pod spec 6 | // that was used to create the Pod. This is used to determine if a Pod 7 | // needs to be recreated. 8 | PodHashLabel = "pod-hash" 9 | 10 | ModelFeatureLabelDomain = "features.kubeai.org" 11 | 12 | // ModelPodIPAnnotation is the annotation key used to specify an IP 13 | // to use for the model Pod instead of the IP address in the status of the Pod. 14 | // Use in conjunction with --allow-pod-address-override for development purposes. 
15 | ModelPodIPAnnotation = "model-pod-ip" 16 | ModelPodPortAnnotation = "model-pod-port" 17 | 18 | ModelCacheEvictionFinalizer = "kubeai.org/cache-eviction" 19 | ) 20 | 21 | func PVCModelAnnotation(modelName string) string { 22 | return "models.kubeai.org/" + modelName 23 | } 24 | 25 | const ( 26 | PodAdapterLabelPrefix = "adapter.kubeai.org/" 27 | ) 28 | 29 | func PodAdapterLabel(adapterID string) string { 30 | return PodAdapterLabelPrefix + adapterID 31 | } 32 | -------------------------------------------------------------------------------- /api/openai/v1/utils.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | // firstNChars returns the first n characters of a string. 4 | // This function is needed because Go's string indexing is based on bytes, not runes. 5 | func firstNChars(s string, n int) string { 6 | runes := []rune(s) 7 | return string(runes[:min(n, len(runes))]) 8 | } 9 | 10 | // Ptr is a helper function for creating an inline pointer to a constant. 11 | func Ptr[T any](v T) *T { 12 | return &v 13 | } 14 | -------------------------------------------------------------------------------- /api/openai/v1/utils_test.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/require" 8 | ) 9 | 10 | func Test_firstNChars(t *testing.T) { 11 | cases := []struct { 12 | input string 13 | n int 14 | exp string 15 | }{ 16 | {"", 0, ""}, 17 | {"", 1, ""}, 18 | {"abc", 0, ""}, 19 | {"abc", 1, "a"}, 20 | {"abc", 2, "ab"}, 21 | {"abc", 3, "abc"}, 22 | {"abc", 4, "abc"}, 23 | {"世界", 1, "世"}, 24 | {"世界", 2, "世界"}, 25 | {"世界", 3, "世界"}, 26 | } 27 | for _, c := range cases { 28 | t.Run(fmt.Sprintf("%q %d", c.input, c.n), func(t *testing.T) { 29 | require.Equal(t, c.exp, firstNChars(c.input, c.n)) 30 | }) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /benchmarks/chat-py/.gitignore: -------------------------------------------------------------------------------- 1 | sharegpt_16_messages_or_more.json 2 | -------------------------------------------------------------------------------- /benchmarks/chat-py/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use a lightweight Python base image 2 | FROM python:3.10 3 | 4 | # Set the working directory 5 | WORKDIR /app 6 | 7 | # Copy requirements first to leverage Docker cache 8 | COPY requirements.txt . 9 | 10 | # Install Python dependencies 11 | RUN pip install --no-cache-dir -r requirements.txt 12 | 13 | # Copy the benchmark serving script 14 | COPY backend_request_func.py . 15 | COPY benchmark_serving.py . 16 | RUN curl -O -L https://huggingface.co/datasets/samos123/share-gpt-long-convos/resolve/main/sharegpt_16_messages_or_more.json 17 | 18 | # Set environment variables 19 | ENV PYTHONPATH=/app 20 | 21 | # Define the entrypoint command 22 | ENTRYPOINT ["python", "benchmark_serving.py"] 23 | 24 | CMD ["--dataset-name=sharegpt", "--dataset-path=sharegpt_16_messages_or_more.json"] 25 | -------------------------------------------------------------------------------- /benchmarks/chat-py/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking Text Generation 2 | 3 | This script was adopted from the vLLM code base. The main differences are: 4 | - Load the whole conversation as prompts. 
5 | - Limit the maximum number of conversations and re-use the same conversations if needed. 6 | 7 | This allows us to verify whether prefix-aware load balancing provides a performance 8 | boost under heavy production traffic with ongoing chat conversations. 9 | 10 | ## Running 11 | 12 | Adjust the parameters in the `job.yaml` file and run the job using the following command: 13 | ``` 14 | kubectl apply -f job.yaml 15 | ``` 16 | 17 | -------------------------------------------------------------------------------- /benchmarks/chat-py/job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: benchmark-serving 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: benchmark-serving 10 | image: substratusai/benchmark_serving:latest 11 | args: 12 | - --base-url=http://kubeai/openai 13 | - --dataset-name=sharegpt 14 | - --dataset-path=/app/sharegpt_16_messages_or_more.json 15 | - --model=llama-3.1-8b-instruct-fp8-l4 16 | - --seed=12345 17 | - --tokenizer=neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 18 | - --request-rate=800 19 | - --max-concurrency=800 20 | - --num-prompts=8000 21 | - --max-conversations=800 22 | restartPolicy: Never -------------------------------------------------------------------------------- /benchmarks/chat-py/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | huggingface_hub 3 | aiohttp 4 | transformers 5 | datasets 6 | pillow -------------------------------------------------------------------------------- /benchmarks/chat-py/vllm-direct-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: vllm-direct 5 | labels: 6 | app: vllm-direct 7 | spec: 8 | selector: 9 | app.kubernetes.io/name: vllm 10 | ports: 11 | - name: http 12 | protocol: TCP 13 | port: 80 # The port exposed by the Service. 14 | targetPort: 8000 # The container port that your pods are listening on. 15 | type: ClusterIP 16 | 17 | -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/.dockerignore: -------------------------------------------------------------------------------- 1 | /data/raw/ 2 | .venv 3 | __pycache__ -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/.gitignore: -------------------------------------------------------------------------------- 1 | data/raw/*.json 2 | data/*.json 3 | /values-gke.yaml -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/.python-version: -------------------------------------------------------------------------------- 1 | 3.13 2 | -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.23.5 2 | 3 | WORKDIR /work 4 | 5 | COPY ./go.mod . 6 | COPY ./go.sum . 7 | 8 | RUN go mod download 9 | 10 | COPY ./main.go . 
11 | COPY ./benchmark ./benchmark 12 | 13 | RUN mkdir bin 14 | ENV PATH="/work/bin:$PATH" 15 | RUN go build -o bin/bench ./main.go 16 | 17 | COPY ./data ./data 18 | COPY ./example ./example 19 | 20 | ENTRYPOINT [ "bench" ] -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | 3 | ## E2E Run 4 | 5 | Build the docker image. 6 | 7 | ```bash 8 | make data 9 | make build-docker-image 10 | make push-docker-image 11 | ``` 12 | 13 | Run `run.ipynb`. 14 | 15 | 16 | ## Run with Docker 17 | 18 | ### Example: Ollama (with config flags) 19 | 20 | Make sure the Ollama server is running on your machine. 21 | 22 | ```bash 23 | docker run --network=host -e OPENAI_BASE_URL=http://host.docker.internal:11434/v1 $BENCH_IMAGE \ 24 | --threads ./data/tiny.json \ 25 | --thread-count 4 \ 26 | --request-model qwen2:0.5b \ 27 | --max-concurrent-threads 2 \ 28 | --max-completion-tokens 10 \ 29 | --request-timeout 30s 30 | ``` 31 | 32 | ### Example: OpenAI (with config file) 33 | 34 | Make sure you have set `OPENAI_API_KEY`. 35 | 36 | ```bash 37 | docker run --network=host -e OPENAI_API_KEY=$OPENAI_API_KEY -e OPENAI_BASE_URL=https://api.openai.com/v1 $BENCH_IMAGE --config ./hack/openai-config.json --threads ./data/tiny.json 38 | ``` 39 | 40 | 41 | ## Run with Go 42 | 43 | Run the benchmark (against a local ollama instance). 44 | 45 | ```bash 46 | OPENAI_BASE_URL=http://localhost:11434/v1 go run . --config ./hack/ollama-config.json --threads ./data/tiny.json 47 | ``` -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/go.mod: -------------------------------------------------------------------------------- 1 | module multi-turn-chat-go 2 | 3 | go 1.23.5 4 | 5 | require ( 6 | github.com/davecgh/go-spew v1.1.1 // indirect 7 | github.com/pmezard/go-difflib v1.0.0 // indirect 8 | github.com/sashabaranov/go-openai v1.37.0 // indirect 9 | github.com/stretchr/testify v1.10.0 // indirect 10 | gopkg.in/yaml.v3 v3.0.1 // indirect 11 | ) 12 | -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 2 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 3 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 4 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 5 | github.com/sashabaranov/go-openai v1.37.0 h1:hQQowgYm4OXJ1Z/wTrE+XZaO20BYsL0R3uRPSpfNZkY= 6 | github.com/sashabaranov/go-openai v1.37.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg= 7 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 8 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 9 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 10 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 11 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 12 | -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/hack/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM ubuntu:24.04 2 | RUN apt-get update -y && \ 3 | apt-get install -y python3 python3-pip golang 4 | RUN apt-get install -y python3.12-venv 5 | 6 | WORKDIR /work 7 | RUN python3 -m venv venv 8 | ENV PATH="/work/venv/bin:$PATH" 9 | RUN pip install pydantic fastapi 'uvicorn[standard]' transformers 10 | 11 | COPY ./go.mod . 12 | COPY ./go.sum . 13 | 14 | RUN go mod download 15 | 16 | COPY ./main.go . 17 | COPY ./benchmark ./benchmark 18 | COPY ./tokenizer ./tokenizer 19 | 20 | RUN mkdir bin 21 | ENV PATH="/work/bin:$PATH" 22 | RUN go build -o bin/bench ./main.go 23 | 24 | COPY ./data ./data 25 | COPY ./example ./example 26 | 27 | ENTRYPOINT [ "bench" ] -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/hack/bench-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: bench 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: bench 9 | image: us-central1-docker.pkg.dev/substratus-dev/default/benchmark-multi-turn-chat-go:v0.1.1 10 | imagePullPolicy: Always 11 | command: ["sleep", "infinity"] 12 | env: 13 | - name: OPENAI_BASE_URL 14 | value: http://kubeai/openai/v1 15 | resources: 16 | requests: 17 | cpu: 2 18 | memory: 2G 19 | limits: 20 | cpu: 2 21 | memory: 2G -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/hack/chat-template.jinja: -------------------------------------------------------------------------------- 1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} 2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/hack/kubeai-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "request_model": "deepseek-r1-1.5b-cpu", 3 | "tokenizer_model": "deepseek-ai/DeepSeek-R1", 4 | "max_concurrent_threads": 2, 5 | "max_completion_tokens": 10, 6 | "request_timeout": "180s" 7 | } -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/hack/llama-3.1-8b-instruct-fp8-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-8b-instruct-fp8-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 9 | engine: VLLM 10 | args: 11 | - --max-model-len=16384 12 | - --max-num-batched-token=16384 13 | - --gpu-memory-utilization=0.9 14 | - --disable-log-requests 15 | resourceProfile: nvidia-gpu-l4:1 16 | minReplicas: 2 17 | maxReplicas: 2 18 | -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/hack/model.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeai.org/v1 2 | kind: Model 3 | metadata: 4 | name: qwen2-0-5b 5 | spec: 6 | features: [TextGeneration] 7 | url: ollama://qwen2:0.5b 8 | engine: OLlama 9 | resourceProfile: cpu:2 10 | minReplicas: 8 11 | maxReplicas: 
8 12 | -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/hack/ollama-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "request_model": "qwen2:0.5b", 3 | "max_concurrent_threads": 2, 4 | "thread_count": 4, 5 | "max_completion_tokens": 10, 6 | "request_timeout": "30s" 7 | } -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/hack/openai-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "request_model": "gpt-4o-mini", 3 | "max_concurrent_threads": 2, 4 | "thread_count": 4, 5 | "max_completion_tokens": 10, 6 | "request_timeout": "30s" 7 | } -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/hack/pod.opt-125m.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: bench 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: bench 9 | image: substratusai/multi-turn-chat-go:v0.0.2 10 | imagePullPolicy: Always 11 | args: 12 | - --threads=./data/small.json 13 | - --thread-count=40 14 | - --max-concurrent-threads=10 15 | - --request-model=opt-125m-cpu 16 | - --max-completion-tokens=10 17 | - --request-timeout=6m 18 | - --no-shuffle 19 | env: 20 | - name: OPENAI_BASE_URL 21 | value: http://kubeai/openai/v1 22 | resources: 23 | requests: 24 | cpu: 4 25 | memory: 4G 26 | limits: 27 | cpu: 4 28 | memory: 4G -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/hack/pod.qwen.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: bench 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: bench 9 | image: substratusai/multi-turn-chat-go:v0.0.2 10 | imagePullPolicy: Always 11 | args: 12 | - --threads=./data/small.json 13 | - --thread-count=30 14 | - --max-concurrent-threads=15 15 | - --request-model=qwen2.5-coder-1.5b-cpu 16 | - --max-completion-tokens=4 17 | - --request-timeout=6m 18 | - --no-shuffle 19 | env: 20 | - name: OPENAI_BASE_URL 21 | value: http://kubeai/openai/v1 22 | - name: HUGGING_FACE_HUB_TOKEN 23 | valueFrom: 24 | secretKeyRef: 25 | name: kubeai-huggingface 26 | key: token 27 | resources: 28 | requests: 29 | cpu: 4 30 | memory: 4G 31 | limits: 32 | cpu: 4 33 | memory: 4G -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/hack/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: bench 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: bench 9 | image: substratusai/multi-turn-chat-go:v0.0.2 10 | args: 11 | - --threads=./data/large.json 12 | - --thread-count=2000 13 | - --max-concurrent-threads=400 14 | - --request-model=llama-3.1-8b-instruct-fp8-l4 15 | - --max-completion-tokens=40 16 | - --request-timeout=2m 17 | env: 18 | - name: OPENAI_BASE_URL 19 | value: http://kubeai/openai/v1 20 | - name: HUGGING_FACE_HUB_TOKEN 21 | valueFrom: 22 | secretKeyRef: 23 | name: kubeai-huggingface 24 | key: token 25 | resources: 26 | requests: 27 | cpu: 4 28 | memory: 4G 29 | limits: 30 | cpu: 4 31 | memory: 4G 32 | -------------------------------------------------------------------------------- 
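The hack/ configs and bench pods above all point the benchmark at the same OpenAI-compatible endpoint: they set `OPENAI_BASE_URL=http://kubeai/openai/v1` and pass a `--request-model` plus a small completion-token budget. Purely as an illustration (this is not a file from the repo), a single equivalent request could be issued with the `go-openai` client already present in this module's `go.mod`; the model name is taken from `hack/model.yaml`, the 10-token limit from `hack/kubeai-config.json`, and the prompt text is invented:

```go
package main

import (
	"context"
	"fmt"
	"log"
	"os"

	openai "github.com/sashabaranov/go-openai"
)

func main() {
	// Target the same endpoint the bench pods use:
	// OPENAI_BASE_URL=http://kubeai/openai/v1 (see the pod manifests above).
	cfg := openai.DefaultConfig(os.Getenv("OPENAI_API_KEY")) // key only needed when pointing at the real OpenAI API
	cfg.BaseURL = os.Getenv("OPENAI_BASE_URL")
	client := openai.NewClientWithConfig(cfg)

	// Model name from hack/model.yaml; 10-token budget mirrors hack/kubeai-config.json.
	resp, err := client.CreateChatCompletion(context.Background(), openai.ChatCompletionRequest{
		Model:     "qwen2-0-5b",
		MaxTokens: 10,
		Messages: []openai.ChatCompletionMessage{
			{Role: openai.ChatMessageRoleUser, Content: "Say hello in one short sentence."},
		},
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(resp.Choices[0].Message.Content)
}
```

Run inside the cluster, this is roughly one iteration of what the bench pods drive at much higher concurrency.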
/benchmarks/multi-turn-chat-go/hack/podmonitor.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PodMonitor 3 | metadata: 4 | name: kubeai-vllm 5 | spec: 6 | selector: 7 | matchLabels: 8 | app.kubernetes.io/name: vllm 9 | podMetricsEndpoints: 10 | - port: http 11 | interval: 2s 12 | -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/hack/tokenizer/tokens.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | from pydantic import BaseModel 3 | from transformers import AutoTokenizer 4 | import os 5 | 6 | app = FastAPI() 7 | tokenizer_model = os.environ["TOKENIZER_MODEL"] 8 | print("Tokenizer model:", tokenizer_model) 9 | # TODO: Account for model_max_length 10 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_model) 11 | 12 | print(len(tokenizer("Your code appears to be a web application built using").input_ids)) 13 | 14 | 15 | class TextInput(BaseModel): 16 | text: str 17 | 18 | 19 | @app.get("/healthz") 20 | def healthz(): 21 | return {"status": "ok"} 22 | 23 | 24 | @app.post("/tokens") 25 | def count_tokens(data: TextInput): 26 | # Tokenize text 27 | input_ids = tokenizer(data.text).input_ids 28 | # Count the number of tokens 29 | num_tokens = len(input_ids) 30 | return {"num_tokens": num_tokens} 31 | -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/hack/vllm.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/substratusai/vllm:v0.6.3.post1-cpu 2 | COPY ./example/chat-template.jinja /tmp -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "multi-turn-chat" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.13" 7 | dependencies = [ 8 | "kubernetes>=32.0.1", 9 | "matplotlib>=3.10.0", 10 | ] 11 | 12 | [dependency-groups] 13 | dev = [ 14 | "ipykernel>=6.29.5", 15 | "jupyterlab>=4.3.5", 16 | ] 17 | -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/runs/llama-3.1-8x-l4/itl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/benchmarks/multi-turn-chat-go/runs/llama-3.1-8x-l4/itl.png -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/runs/llama-3.1-8x-l4/throughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/benchmarks/multi-turn-chat-go/runs/llama-3.1-8x-l4/throughput.png -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-go/runs/llama-3.1-8x-l4/ttft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/benchmarks/multi-turn-chat-go/runs/llama-3.1-8x-l4/ttft.png -------------------------------------------------------------------------------- 
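For context on the tokenizer helper above: `hack/tokenizer/tokens.py` is a small FastAPI sidecar exposing `GET /healthz` and `POST /tokens`, which accepts `{"text": ...}` and returns `{"num_tokens": ...}` computed by the Hugging Face tokenizer named in `TOKENIZER_MODEL`. The Go half of that helper (`hack/tokenizer/tokenizer.go`) is not reproduced in this listing, so the snippet below is only a minimal sketch of a caller for that HTTP contract; the `http://localhost:8000` base URL is an assumed local uvicorn default, not something defined in the repo.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// countTokens asks the tokens.py sidecar how many tokens a string encodes to.
// The request/response shapes match the FastAPI handlers above; the base URL
// is an assumption for local testing.
func countTokens(baseURL, text string) (int, error) {
	body, err := json.Marshal(map[string]string{"text": text})
	if err != nil {
		return 0, err
	}
	resp, err := http.Post(baseURL+"/tokens", "application/json", bytes.NewReader(body))
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return 0, fmt.Errorf("unexpected status: %s", resp.Status)
	}
	var out struct {
		NumTokens int `json:"num_tokens"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		return 0, err
	}
	return out.NumTokens, nil
}

func main() {
	n, err := countTokens("http://localhost:8000", "Your code appears to be a web application built using")
	if err != nil {
		panic(err)
	}
	fmt.Println("num_tokens:", n)
}
```

Shipping the tokenizer as a Python sidecar lets the Go runner report token counts without re-implementing each model's tokenizer in Go, which appears to be why it sits alongside the benchmark.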
/benchmarks/multi-turn-chat-k6/.dockerignore: -------------------------------------------------------------------------------- 1 | data/ShareGPT_V3_unfiltered_cleaned_split.json -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-k6/.gitignore: -------------------------------------------------------------------------------- 1 | data/*.json -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-k6/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | RUN apt-get update && apt-get install -y build-essential make python3 wget vim 4 | 5 | # Install k6 binary. 6 | ENV K6_VERSION=v0.55.0 7 | RUN wget https://github.com/grafana/k6/releases/download/${K6_VERSION}/k6-${K6_VERSION}-linux-amd64.tar.gz && tar -zxvf k6-${K6_VERSION}-linux-amd64.tar.gz && mv k6-${K6_VERSION}-linux-amd64/k6 /usr/local/bin && rm k6-${K6_VERSION}-linux-amd64.tar.gz 8 | 9 | WORKDIR /work 10 | 11 | COPY ./k6.js . 12 | COPY ./Makefile . 13 | COPY ./data ./data 14 | COPY ./scenarios ./scenarios -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-k6/Makefile: -------------------------------------------------------------------------------- 1 | data/ShareGPT_V3_unfiltered_cleaned_split.json: 2 | cd data && wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 3 | 4 | .PHONY: data 5 | data: data/ShareGPT_V3_unfiltered_cleaned_split.json 6 | cd data && python prepare-message-threads.py 7 | 8 | run: 9 | ls scenarios/${SCENARIO} 10 | CONFIG_DIR=scenarios/${SCENARIO} DATA_DIR=data MODEL_ADDR=kubeai/openai k6 run ./k6.js -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-k6/data/prepare-message-threads.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def main(): 5 | with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f: 6 | data = json.load(f) 7 | 8 | # Select a subset of the first conversations that start with a human. 9 | max = 2000 10 | output = [] 11 | for entry in data: 12 | conv = entry.get("conversations") 13 | if conv and conv[0]["from"] == "human" and len(conv[0]["value"]) != 0: 14 | # Filter the conversation to only include messages from a human using a for loop. 
15 | # entry["userMessages"] = [c["value"] for c in conv if c["from"] == "human"] 16 | totalContentLength = 0 17 | userMessages = [] 18 | for c in conv: 19 | if c["from"] == "human": 20 | content = c["value"] 21 | userMessages.append(content) 22 | totalContentLength += len(content) 23 | 24 | if totalContentLength < 2500: 25 | continue 26 | 27 | if len(userMessages) < 5: 28 | continue 29 | 30 | # Delete the original conversation 31 | entry["userMessages"] = userMessages 32 | del entry["conversations"] 33 | output.append(entry) 34 | 35 | if len(output) >= max: 36 | break 37 | 38 | with open("./message-threads.json", "w") as f: 39 | data = json.dump(output, f, indent=4) 40 | 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash-70b-8r/base-request.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "llama-3.1-70b-instruct-fp8-h100", 3 | "max_tokens": 10, 4 | "temperature": 0, 5 | "messages": [] 6 | } -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash-70b-8r/k6.json: -------------------------------------------------------------------------------- 1 | { 2 | "thresholds": { 3 | "http_req_failed": [ 4 | "rate==0" 5 | ] 6 | }, 7 | "scenarios": { 8 | "chat": { 9 | "executor": "shared-iterations", 10 | "vus": 320, 11 | "iterations": 1000, 12 | "maxDuration": "600s" 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash-70b-8r/model.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeai.org/v1 2 | kind: Model 3 | metadata: 4 | name: llama-3.1-70b-instruct-fp8-h100 5 | spec: 6 | features: [TextGeneration] 7 | url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 8 | engine: VLLM 9 | args: 10 | - --enable-prefix-caching 11 | - --max-model-len=16384 12 | - --max-num-batched-token=16384 13 | - --gpu-memory-utilization=0.95 14 | - --disable-log-requests 15 | - --kv-cache-dtype=fp8 16 | resourceProfile: nvidia-gpu-h100:1 17 | minReplicas: 8 18 | maxReplicas: 8 19 | -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: chat-benchmark 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: bench 9 | image: $IMG 10 | command: ["sleep", "infinity"] 11 | resources: 12 | requests: 13 | cpu: 6 14 | ephemeral-storage: 10Gi 15 | memory: 24Gi 16 | limits: 17 | cpu: 6 18 | ephemeral-storage: 10Gi 19 | memory: 24Gi -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash/base-request.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "llama-3.1-8b-instruct-fp8-l4", 3 | "max_tokens": 10, 4 | "temperature": 0, 5 | "messages": [] 6 | } -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash/k6.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "thresholds": { 3 | "http_req_failed": [ 4 | "rate==0" 5 | ] 6 | }, 7 | "scenarios": { 8 | "chat": { 9 | "executor": "shared-iterations", 10 | "vus": 80, 11 | "iterations": 1000, 12 | "maxDuration": "600s" 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash/model.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeai.org/v1 2 | kind: Model 3 | metadata: 4 | name: llama-3.1-8b-instruct-fp8-l4 5 | spec: 6 | features: [TextGeneration] 7 | url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 8 | engine: VLLM 9 | args: 10 | - --enable-prefix-caching 11 | - --max-model-len=16384 12 | - --max-num-batched-token=16384 13 | - --gpu-memory-utilization=0.6 14 | - --disable-log-requests 15 | resourceProfile: nvidia-gpu-l4:1 16 | minReplicas: 2 17 | maxReplicas: 2 18 | -------------------------------------------------------------------------------- /benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: chat-benchmark 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: bench 9 | image: us-central1-docker.pkg.dev/substratus-dev/default/kubeai-benchmark-chat:v0.0.2 10 | command: ["sleep", "infinity"] 11 | resources: 12 | requests: 13 | cpu: 6 14 | ephemeral-storage: 10Gi 15 | memory: 24Gi 16 | limits: 17 | cpu: 6 18 | ephemeral-storage: 10Gi 19 | memory: 24Gi -------------------------------------------------------------------------------- /charts/.gitignore: -------------------------------------------------------------------------------- 1 | charts/ 2 | -------------------------------------------------------------------------------- /charts/kubeai/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /charts/kubeai/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: kubeai 3 | description: Private Open AI Platform for Kubernetes. 4 | 5 | type: application 6 | 7 | # This is the chart version. This version number should be incremented each time you make changes 8 | # to the chart and its templates, including the app version. 9 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 10 | version: 0.21.0 11 | 12 | # This is the version number of the application being deployed. This version number should be 13 | # incremented each time you make changes to the application. Versions are not expected to 14 | # follow Semantic Versioning. They should reflect the version the application is using. 15 | # It is recommended to use it with quotes. 
16 | appVersion: "v0.21.0" 17 | 18 | dependencies: 19 | # Open Web UI is an open source ChatGPT-like user interface. 20 | # https://docs.openwebui.com/ 21 | - name: open-webui 22 | condition: open-webui.enabled 23 | repository: https://helm.openwebui.com/ 24 | version: 6.4.0 25 | 26 | keywords: ["LLM", "AI"] 27 | 28 | # TODO replace with kubeai.org once live 29 | home: https://www.substratus.ai 30 | 31 | maintainers: 32 | - name: nstogner 33 | url: https://www.linkedin.com/in/nstogner/ 34 | - name: samos123 35 | email: sammiestoel@gmail.com 36 | url: https://www.linkedin.com/in/samstoelinga/ 37 | -------------------------------------------------------------------------------- /charts/kubeai/templates/autoscalerstateconfigmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "models.autoscalerStateConfigMapName" . }} -------------------------------------------------------------------------------- /charts/kubeai/templates/aws-secret.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.secrets.aws.create (and (not (empty .Values.secrets.aws.accessKeyID)) (not (empty .Values.secrets.aws.secretAccessKey))) }} 2 | apiVersion: v1 3 | kind: Secret 4 | metadata: 5 | name: {{ include "kubeai.awsSecretName" . }} 6 | labels: 7 | {{- include "kubeai.labels" . | nindent 4 }} 8 | data: 9 | accessKeyID: {{ .Values.secrets.aws.accessKeyID | b64enc }} 10 | secretAccessKey: {{ .Values.secrets.aws.secretAccessKey | b64enc }} 11 | {{- end }} 12 | -------------------------------------------------------------------------------- /charts/kubeai/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "kubeai.fullname" . }}-config 5 | labels: 6 | {{- include "kubeai.labels" . | nindent 4 }} 7 | data: 8 | system.yaml: | 9 | secretNames: 10 | alibaba: {{ include "kubeai.alibabaSecretName" . }} 11 | aws: {{ include "kubeai.awsSecretName" . }} 12 | gcp: {{ include "kubeai.gcpSecretName" . }} 13 | huggingface: {{ include "kubeai.huggingfaceSecretName" . }} 14 | resourceProfiles: 15 | {{- .Values.resourceProfiles | toYaml | nindent 6 }} 16 | cacheProfiles: 17 | {{- .Values.cacheProfiles | toYaml | nindent 6 }} 18 | modelServers: 19 | {{- .Values.modelServers | toYaml | nindent 6 }} 20 | modelLoading: 21 | {{- .Values.modelLoading | toYaml | nindent 6 }} 22 | modelRollouts: 23 | {{- .Values.modelRollouts | toYaml | nindent 6 }} 24 | modelServerPods: 25 | {{- if .Values.modelServerPods }} 26 | {{- if .Values.modelServerPods.podSecurityContext }} 27 | podSecurityContext: 28 | {{- .Values.modelServerPods.podSecurityContext | toYaml | nindent 8}} 29 | {{- end}} 30 | {{- if .Values.modelServerPods.jsonPatches }} 31 | jsonPatches: 32 | {{- .Values.modelServerPods.jsonPatches | toYaml | nindent 8}} 33 | {{- end}} 34 | {{- if .Values.modelServerPods.securityContext }} 35 | securityContext: 36 | {{- .Values.modelServerPods.securityContext | toYaml | nindent 8}} 37 | {{- end}} 38 | {{- if .Values.imagePullSecrets }} 39 | imagePullSecrets: 40 | {{- toYaml .Values.imagePullSecrets | nindent 8}} 41 | {{- end}} 42 | {{- end}} 43 | serviceAccountName: {{ include "models.serviceAccountName" . 
}} 44 | modelAutoscaling: 45 | interval: {{ .Values.modelAutoscaling.interval }} 46 | timeWindow: {{ .Values.modelAutoscaling.timeWindow }} 47 | stateConfigMapName: {{ include "models.autoscalerStateConfigMapName" . }} 48 | messaging: 49 | {{- .Values.messaging | toYaml | nindent 6 }} 50 | -------------------------------------------------------------------------------- /charts/kubeai/templates/huggingface-secret.yaml: -------------------------------------------------------------------------------- 1 | # Only create the secret if the token is not empty. 2 | # See: https://github.com/substratusai/kubeai/issues/232 3 | {{- if and .Values.secrets.huggingface.create (not (empty .Values.secrets.huggingface.token)) }} 4 | apiVersion: v1 5 | kind: Secret 6 | metadata: 7 | name: {{ include "kubeai.huggingfaceSecretName" . }} 8 | labels: 9 | {{- include "kubeai.labels" . | nindent 4 }} 10 | data: 11 | token: {{ .Values.secrets.huggingface.token | b64enc }} 12 | {{- end }} -------------------------------------------------------------------------------- /charts/kubeai/templates/rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: {{ include "kubeai.fullname" . }} 5 | labels: 6 | {{- include "kubeai.labels" . | nindent 4 }} 7 | roleRef: 8 | apiGroup: rbac.authorization.k8s.io 9 | kind: Role 10 | name: {{ include "kubeai.fullname" . }} 11 | subjects: 12 | - kind: ServiceAccount 13 | name: {{ include "kubeai.serviceAccountName" . }} -------------------------------------------------------------------------------- /charts/kubeai/templates/securityContextConstraints.yaml: -------------------------------------------------------------------------------- 1 | # Create securityContextConstraints for the model pods if running on OpenShift. 2 | # This is needed because pods in OpenShift run with the restricted-v2 SCC by 3 | # default which do not allow a container to start with uid=0 4 | # (The model pod images run as the root user) 5 | {{- if .Capabilities.APIVersions.Has "security.openshift.io/v1" }} 6 | apiVersion: security.openshift.io/v1 7 | kind: SecurityContextConstraints 8 | metadata: 9 | name: {{ include "kubeai.fullname" . }}-models 10 | allowPrivilegeEscalation: false 11 | readOnlyRootFilesystem: false 12 | runAsUser: 13 | type: RunAsAny 14 | seLinuxContext: 15 | type: MustRunAs 16 | seccompProfiles: 17 | - runtime/default 18 | requiredDropCapabilities: 19 | - ALL 20 | users: 21 | - system:serviceaccount:{{ .Release.Namespace }}:{{ include "models.serviceAccountName" . }} 22 | {{- end }} 23 | -------------------------------------------------------------------------------- /charts/kubeai/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "kubeai.fullname" . }} 5 | labels: 6 | {{- include "kubeai.labels" . | nindent 4 }} 7 | {{- with .Values.service.annotations }} 8 | annotations: 9 | {{- toYaml . | nindent 4 }} 10 | {{- end }} 11 | spec: 12 | type: {{ .Values.service.type }} 13 | ports: 14 | - name: http 15 | port: {{ .Values.service.port }} 16 | targetPort: http 17 | protocol: TCP 18 | {{- with .Values.service.nodePort }} 19 | nodePort: {{ . }} 20 | {{- end }} 21 | selector: 22 | {{- include "kubeai.selectorLabels" . 
| nindent 4 }} 23 | -------------------------------------------------------------------------------- /charts/kubeai/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "kubeai.serviceAccountName" . }} 6 | labels: 7 | {{- include "kubeai.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }} 13 | {{- end }} 14 | {{- if .Values.modelServiceAccount.create }} 15 | --- 16 | apiVersion: v1 17 | kind: ServiceAccount 18 | metadata: 19 | name: {{ include "models.serviceAccountName" . }} 20 | labels: 21 | {{- include "kubeai.labels" . | nindent 4 }} 22 | {{- with .Values.modelServiceAccount.annotations }} 23 | annotations: 24 | {{- toYaml . | nindent 4 }} 25 | {{- end }} 26 | automountServiceAccountToken: {{ .Values.modelServiceAccount.automount }} 27 | {{- end }} 28 | -------------------------------------------------------------------------------- /charts/kubeai/templates/vllm-pod-monitor.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.metrics.prometheusOperator.vLLMPodMonitor.enabled }} 2 | apiVersion: {{ .Values.metrics.prometheusOperator.vLLMPodMonitor.apiVersion }} 3 | kind: PodMonitor 4 | metadata: 5 | name: {{ include "kubeai.fullname" . }}-vllm 6 | labels: 7 | {{- include "kubeai.labels" . | nindent 4 }} 8 | {{- with .Values.metrics.prometheusOperator.vLLMPodMonitor.labels }} 9 | {{- toYaml . | nindent 4 }} 10 | {{- end }} 11 | spec: 12 | selector: 13 | matchLabels: 14 | app.kubernetes.io/name: vllm 15 | podMetricsEndpoints: 16 | - port: http 17 | {{- end }} 18 | -------------------------------------------------------------------------------- /charts/kubeai/values-amd-gpu-device-plugin.yaml: -------------------------------------------------------------------------------- 1 | resourceProfiles: 2 | amd-gpu-mi300x: 3 | nodeSelector: 4 | # Source: https://gitlab.freedesktop.org/mesa/drm/-/blob/main/data/amdgpu.ids#L569 5 | amd.com/gpu.device-id: 74a1 6 | amd.com/gpu.vram: "192G" 7 | amd.com/gpu.family: "AI" 8 | -------------------------------------------------------------------------------- /charts/kubeai/values-eks.yaml: -------------------------------------------------------------------------------- 1 | resourceProfiles: 2 | nvidia-gpu-l4: 3 | nodeSelector: 4 | karpenter.k8s.aws/instance-gpu-name: "l4" 5 | nvidia-gpu-l40s: 6 | nodeSelector: 7 | karpenter.k8s.aws/instance-gpu-name: "l40s" 8 | nvidia-gpu-h100: 9 | nodeSelector: 10 | karpenter.k8s.aws/instance-gpu-name: "h100" 11 | nvidia-gpu-a100-80gb: 12 | nodeSelector: 13 | karpenter.k8s.aws/instance-gpu-name: "a100" 14 | karpenter.k8s.aws/instance-gpu-memory: "81920" 15 | nvidia-gpu-a100-40gb: 16 | nodeSelector: 17 | karpenter.k8s.aws/instance-gpu-name: "a100" 18 | karpenter.k8s.aws/instance-gpu-memory: "40960" 19 | 20 | cacheProfiles: 21 | efs-dynamic: 22 | sharedFilesystem: 23 | storageClassName: "efs-sc" 24 | efs-static: 25 | sharedFilesystem: 26 | persistentVolumeName: "efs-pv" -------------------------------------------------------------------------------- /charts/kubeai/values-gke.yaml: -------------------------------------------------------------------------------- 1 | resourceProfiles: 2 | nvidia-gpu-l4: 3 | nodeSelector: 4 | 
cloud.google.com/gke-accelerator: "nvidia-l4" 5 | cloud.google.com/gke-spot: "true" 6 | nvidia-gpu-h100: 7 | nodeSelector: 8 | cloud.google.com/gke-accelerator: "nvidia-h100-80gb" 9 | cloud.google.com/gke-spot: "true" 10 | nvidia-gpu-a100-80gb: 11 | nodeSelector: 12 | cloud.google.com/gke-accelerator: "nvidia-a100-80gb" 13 | cloud.google.com/gke-spot: "true" 14 | nvidia-gpu-a100-40gb: 15 | nodeSelector: 16 | cloud.google.com/gke-accelerator: "nvidia-tesla-a100" 17 | cloud.google.com/gke-spot: "true" 18 | google-tpu-v5e-1x1: 19 | imageName: google-tpu 20 | limits: 21 | google.com/tpu: 1 22 | nodeSelector: 23 | cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice 24 | cloud.google.com/gke-tpu-topology: "1x1" 25 | cloud.google.com/gke-spot: "true" 26 | google-tpu-v5e-2x2: 27 | imageName: google-tpu 28 | limits: 29 | google.com/tpu: 1 30 | nodeSelector: 31 | cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice 32 | cloud.google.com/gke-tpu-topology: "2x2" 33 | cloud.google.com/gke-spot: "true" 34 | google-tpu-v5e-2x4: 35 | imageName: google-tpu 36 | limits: 37 | google.com/tpu: 1 38 | nodeSelector: 39 | cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice 40 | cloud.google.com/gke-tpu-topology: "2x4" 41 | cloud.google.com/gke-spot: "true" 42 | 43 | cacheProfiles: 44 | standard-filestore: 45 | sharedFilesystem: 46 | storageClassName: "standard-rwx" 47 | premium-filestore: 48 | sharedFilesystem: 49 | storageClassName: "premium-rwx" -------------------------------------------------------------------------------- /charts/kubeai/values-nvidia-k8s-device-plugin.yaml: -------------------------------------------------------------------------------- 1 | resourceProfiles: 2 | nvidia-gpu-a16: 3 | nodeSelector: 4 | nvidia.com/gpu.family: "ampere" 5 | nvidia.com/gpu.memory: "16384" 6 | nvidia-gpu-l4: 7 | nodeSelector: 8 | nvidia.com/gpu.family: "ada-lovelace" 9 | nvidia.com/gpu.memory: "23034" 10 | nvidia-gpu-h100: 11 | nodeSelector: 12 | nvidia.com/gpu.family: "hopper" 13 | nvidia.com/gpu.memory: "81920" 14 | nvidia-gpu-gh200: 15 | nodeSelector: 16 | nvidia.com/gpu.family: "hopper" 17 | nvidia.com/gpu.memory: "97871" 18 | nvidia-gpu-a100-80gb: 19 | nodeSelector: 20 | nvidia.com/gpu.family: "ampere" 21 | nvidia.com/gpu.memory: "81920" 22 | nvidia-gpu-a100-40gb: 23 | nodeSelector: 24 | nvidia.com/gpu.family: "ampere" 25 | nvidia.com/gpu.memory: "40960" 26 | nvidia-gpu-rtx4070-8gb: 27 | nodeSelector: 28 | nvidia.com/gpu.family: "ampere" 29 | nvidia.com/gpu.memory: "8188" 30 | nvidia-gpu-rtx4090-24gb: 31 | nodeSelector: 32 | nvidia.com/gpu.family: "ampere" 33 | nvidia.com/gpu.memory: "24564" 34 | -------------------------------------------------------------------------------- /charts/models/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /charts/models/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: models 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 0.21.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "1.16.0" 25 | 26 | maintainers: 27 | - name: nstogner 28 | url: https://www.linkedin.com/in/nstogner/ 29 | - name: samos123 30 | email: sammiestoel@gmail.com 31 | url: https://www.linkedin.com/in/samstoelinga/ 32 | -------------------------------------------------------------------------------- /charts/models/templates/models.yaml: -------------------------------------------------------------------------------- 1 | {{- range $name, $model := .Values.catalog}} 2 | {{- if or $model.enabled $.Values.all.enabled }} 3 | --- 4 | apiVersion: kubeai.org/v1 5 | kind: Model 6 | metadata: 7 | name: {{ $name }} 8 | {{- with $model.labels }} 9 | labels: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | spec: 13 | features: {{ $model.features }} 14 | {{- with $model.owner }} 15 | owner: {{ . }} 16 | {{- end }} 17 | url: {{ $model.url }} 18 | {{- with $model.adapters }} 19 | adapters: 20 | {{- toYaml . | nindent 4 }} 21 | {{- end }} 22 | {{- with $model.engine }} 23 | engine: {{ . }} 24 | {{- end }} 25 | {{- with $model.args }} 26 | args: 27 | {{- toYaml . | nindent 4 }} 28 | {{- end }} 29 | {{- with $model.env }} 30 | env: 31 | {{- toYaml . | nindent 4 }} 32 | {{- end }} 33 | minReplicas: {{ default 0 $model.minReplicas }} 34 | {{- with $model.maxReplicas }} 35 | maxReplicas: {{ . }} 36 | {{- end}} 37 | {{- with $model.targetRequests }} 38 | targetRequests: {{ . }} 39 | {{- end}} 40 | {{- with $model.scaleDownDelaySeconds }} 41 | scaleDownDelaySeconds: {{ . }} 42 | {{- end}} 43 | {{- with $model.resourceProfile }} 44 | resourceProfile: {{ . }} 45 | {{- end}} 46 | {{- with $model.cacheProfile }} 47 | cacheProfile: {{ . }} 48 | {{- end}} 49 | {{- with $model.files }} 50 | files: 51 | {{- toYaml . 
| nindent 4 }} 52 | {{- end }} 53 | {{- end}} 54 | {{- end}} -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "flag" 21 | "os" 22 | 23 | "github.com/substratusai/kubeai/internal/manager" 24 | ctrl "sigs.k8s.io/controller-runtime" 25 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 26 | ) 27 | 28 | func main() { 29 | // Flag parsing can cause a panic if done inside of command.Run() and called in a goroutine (as in tests). 30 | // So we parse flags here. 31 | opts := zap.Options{ 32 | Development: true, 33 | } 34 | opts.BindFlags(flag.CommandLine) 35 | flag.Parse() 36 | ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) 37 | 38 | configPath := os.Getenv("CONFIG_PATH") 39 | if configPath == "" { 40 | configPath = "./config.yaml" 41 | } 42 | 43 | sysCfg, err := manager.LoadConfigFile(configPath) 44 | if err != nil { 45 | manager.Log.Error(err, "failed to load config file", "path", configPath) 46 | os.Exit(1) 47 | } 48 | 49 | if err := manager.Run(ctrl.SetupSignalHandler(), ctrl.GetConfigOrDie(), sysCfg); err != nil { 50 | manager.Log.Error(err, "failed to run command") 51 | os.Exit(1) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /components/model-loader/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3.20 2 | 3 | # Common 4 | # * unzip (full version) needed for installing ossutil 5 | RUN apk add --no-cache curl unzip tar 6 | 7 | # Python 8 | ENV PYTHONUNBUFFERED=1 9 | RUN apk add --no-cache python3 py3-pip pipx bash && rm -rf ~/.cache/* /usr/local/share/man /tmp/* 10 | # Location where pipx installs executables: 11 | ENV PATH="/root/.local/bin:$PATH" 12 | 13 | # Hugging Face ("hf://") 14 | RUN pipx install huggingface_hub 15 | RUN huggingface-cli version 16 | 17 | # AWS S3 ("s3://") 18 | RUN pipx install awscli 19 | RUN aws --version 20 | 21 | # Determine architecture. 
22 | RUN if [ `uname -m` = 'x86_64' ]; then echo -n "x86_64" > /tmp/arch; else echo -n "arm" > /tmp/arch; fi; 23 | 24 | # Google Cloud Storage ("gs://") 25 | RUN ARCH=`cat /tmp/arch` && curl -OL https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-$ARCH.tar.gz 26 | RUN ls 27 | RUN ARCH=`cat /tmp/arch` && tar -xf google-cloud-cli-linux-$ARCH.tar.gz 28 | RUN ./google-cloud-sdk/install.sh --usage-reporting=false 29 | ENV PATH="/google-cloud-sdk/bin:$PATH" 30 | RUN gcloud config set component_manager/disable_update_check true 31 | RUN gcloud --version 32 | 33 | # Alibaba Object Storage Service ("oss://") 34 | RUN wget -O - https://gosspublic.alicdn.com/ossutil/install.sh | bash 35 | RUN ossutil --version 36 | 37 | # Loader script 38 | COPY ./load.sh /bin/load 39 | RUN chmod +x /bin/load 40 | ENTRYPOINT ["/bin/load"] -------------------------------------------------------------------------------- /components/model-loader/load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euxo pipefail 4 | 5 | src=$1 6 | dest=$2 7 | 8 | # If dest is a local directory, download the model to that directory. 9 | # Otherwise, download to a temporary directory and upload from there. 10 | dest_type="" 11 | if [[ $dest == *"://"* ]]; then 12 | dir=$(mktemp -d) 13 | dest_type="url" 14 | else 15 | dir=$dest 16 | dest_type="dir" 17 | mkdir -p $dir 18 | fi 19 | 20 | # Download 21 | case $src in 22 | "hf://"*) 23 | repo=${src#hf://} 24 | huggingface-cli download --local-dir $dir $repo 25 | rm -rf $dir/.cache 26 | ;; 27 | "s3://"*) 28 | aws s3 sync $src $dir 29 | ;; 30 | "gs://"*) 31 | gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS 32 | gcloud storage rsync $src $dir 33 | ;; 34 | "oss://"*) 35 | ossutil sync $src $dir 36 | ;; 37 | *) 38 | echo "Unsupported source url: $src" 39 | exit 1 40 | ;; 41 | esac 42 | 43 | # Upload 44 | if [[ $dest_type == "url" ]]; then 45 | case $dest in 46 | "hf://"*) 47 | repo=${dest#hf://} 48 | huggingface-cli upload $repo $dir 49 | ;; 50 | "s3://"*) 51 | aws s3 sync $dir $dest 52 | ;; 53 | "gs://"*) 54 | gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS 55 | gcloud storage rsync $dir $dest 56 | ;; 57 | "oss://"*) 58 | ossutil sync $dir $dest 59 | ;; 60 | *) 61 | echo "Unsupported destination url: $dest" 62 | exit 1 63 | ;; 64 | esac 65 | fi -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | www.kubeai.org 2 | -------------------------------------------------------------------------------- /docs/benchmarks/llama-3.2-11b-vision.md: -------------------------------------------------------------------------------- 1 | # Llama 3.2 11B Vision Instruct vLLM Benchmarks 2 | 3 | 4 | Single L4 GPU vLLM 0.6.2 5 | ``` 6 | python3 benchmark_serving.py --backend openai \ 7 | --base-url http://localhost:8000/openai \ 8 | --dataset-name=sharegpt --dataset-path=ShareGPT_V3_unfiltered_cleaned_split.json \ 9 | --model meta-llama-3.2-11b-vision-instruct \ 10 | --seed 12345 --tokenizer neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic 11 | ============ Serving Benchmark Result ============ 12 | Successful requests: 1000 13 | Benchmark duration (s): 681.93 14 | Total input tokens: 230969 15 | Total generated tokens: 194523 16 | Request throughput (req/s): 1.47 17 | Output token throughput (tok/s): 285.25 18 | Total Token throughput 
(tok/s): 623.95 19 | ---------------Time to First Token---------------- 20 | Mean TTFT (ms): 319146.12 21 | Median TTFT (ms): 322707.98 22 | P99 TTFT (ms): 642512.79 23 | -----Time per Output Token (excl. 1st token)------ 24 | Mean TPOT (ms): 54.84 25 | Median TPOT (ms): 53.66 26 | P99 TPOT (ms): 83.75 27 | ---------------Inter-token Latency---------------- 28 | Mean ITL (ms): 54.09 29 | Median ITL (ms): 47.44 30 | P99 ITL (ms): 216.77 31 | ================================================== 32 | ``` -------------------------------------------------------------------------------- /docs/benchmarks/prefix-aware-load-balancing-mean-ttft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/benchmarks/prefix-aware-load-balancing-mean-ttft.png -------------------------------------------------------------------------------- /docs/benchmarks/prefix-aware-load-balancing-throughput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/benchmarks/prefix-aware-load-balancing-throughput.png -------------------------------------------------------------------------------- /docs/blog/.authors.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | nstogner: 3 | name: Nick Stogner 4 | description: Creator 5 | url: https://www.linkedin.com/in/nstogner/ 6 | avatar: https://avatars.githubusercontent.com/u/10274189 7 | samstoelinga: 8 | name: Sam Stoelinga 9 | description: Creator 10 | url: https://www.linkedin.com/in/samstoelinga/ 11 | avatar: https://avatars.githubusercontent.com/u/388784 -------------------------------------------------------------------------------- /docs/blog/index.md: -------------------------------------------------------------------------------- 1 | # Recent 2 | 3 | -------------------------------------------------------------------------------- /docs/concepts/autoscaling.md: -------------------------------------------------------------------------------- 1 | # Autoscaling 2 | 3 | KubeAI proxies HTTP and messaging (i.e. Kafka, etc) requests and messages to models. It will adjust the number of Pods serving a given model based on the average number of active requests. If no Pods are running when a request comes in, KubeAI will hold the request, scale up a Pod and forward the request when the Pod is ready. This process happens in a manner that is transparent to the end client (other than the added delay from a cold-start). 4 | 5 |
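For illustration, autoscaling is tuned per Model via fields on the Model resource. A minimal sketch (the model name and values below are arbitrary examples):

```yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: my-model
spec:
  # ...
  minReplicas: 0       # Allow scale-to-zero when the model is idle.
  maxReplicas: 3       # Upper bound for scale-up.
  targetRequests: 100  # Average number of active requests to aim for per replica.
```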
6 | 7 | 8 | ## Next 9 | 10 | Read about [how to configure autoscaling](../how-to/configure-autoscaling.md). -------------------------------------------------------------------------------- /docs/concepts/backend-servers.md: -------------------------------------------------------------------------------- 1 | # Backend Servers 2 | 3 | KubeAI serves ML models by launching Pods on Kubernetes. The configuration and lifecycle of these Pods are managed by the KubeAI controller. Every model server Pod loads exactly one model on startup. 4 | 5 | In a Model manifest you can define what server to use for inference (`VLLM`, `OLlama`). Any model-specific settings can be passed to the server process via the `args` and `env` fields. 6 | 7 | ## Next 8 | 9 | Read about [how to install models](../how-to/install-models.md). -------------------------------------------------------------------------------- /docs/concepts/load-balancing.md: -------------------------------------------------------------------------------- 1 | # Load Balancing 2 | 3 | To optimize inference performance and resource utilization, KubeAI supports load balancing strategies specifically tailored for model inference servers such as vLLM. This document explains two primary load balancing strategies available in KubeAI: Least Load and Prefix Hash. 4 | 5 | ## Least Load 6 | 7 | The Least Load strategy distributes inference requests to the model replica that has the least number of in-flight requests. This strategy aims to balance the inference workload evenly across available replicas, reducing the risk of overloading any single server. 8 | 9 | ## Prefix Hash 10 | 11 | The Prefix Hash strategy leverages the Consistent Hashing With Bounded Loads (CHWBL) algorithm to optimize the performance of engines such as vLLM that support prefix caching. This strategy increases the likelihood of KV cache hits for common prefixes. See the vLLM prefix caching docs for more info. 12 | 13 | With this strategy, KubeAI hashes incoming requests based on their prefixes (in addition to a requested LoRA adapter name - if present). Requests with the same hash value are routed to the same replica, except when that replica's in-flight requests exceed the overall average by a configurable percentage. 14 | 15 | This strategy has the most benefit for use cases such as chat completion. This is because the entire chat thread is sent in each successive chat request. 16 | 17 | KubeAI supports this strategy for the following endpoints: 18 | 19 | ``` 20 | /openai/v1/completions 21 | /openai/v1/chat/completions 22 | ``` 23 | 24 | ## Next 25 | 26 | See the [Kubernetes API docs](../reference/kubernetes-api.md) to view how to configure Model load balancing. -------------------------------------------------------------------------------- /docs/concepts/lora-adapters.md: -------------------------------------------------------------------------------- 1 | # LoRA Adapters 2 | 3 | KubeAI orchestrates the loading of LoRA adapters into model serving containers. New LoRA adapters can be swapped in and out without needing to restart the container that is serving the base model. 4 | 5 | 6 | 7 | ## Next 8 | 9 | Read about [how to serve lora adapters](../how-to/serve-lora-adapters.md). -------------------------------------------------------------------------------- /docs/concepts/resource-profiles.md: -------------------------------------------------------------------------------- 1 | # Resource Profiles 2 | 3 | A resource profile maps a type of compute resource (i.e.
NVIDIA L4 GPU) to a collection of Kubernetes settings that are configured on inference server Pods. These profiles are defined in the KubeAI `config.yaml` file (via a ConfigMap). Each model specifies the resource profile that it requires. 4 | 5 | Kubernetes Model resources specify a resource profile and the count of that resource that they require (for example `resourceProfile: nvidia-gpu-l4:2` - 2x L4 GPUs). 6 | 7 | A given profile might need to contain slightly different settings based on the cluster/cloud that KubeAI is deployed in. 8 | 9 | Example: A resource profile named `nvidia-gpu-l4` might contain the following node selectors when installing KubeAI on a GKE Kubernetes cluster: 10 | 11 | ```yaml 12 | cloud.google.com/gke-accelerator: "nvidia-l4" 13 | cloud.google.com/gke-spot: "true" 14 | ``` 15 | 16 | and add the following resource requests to the model server Pods: 17 | 18 | ```yaml 19 | nvidia.com/gpu: "1" 20 | ``` 21 | 22 | In addition to node selectors and resource requirements, a resource profile may optionally specify an image name. This name maps to the container image that will be selected when serving a model on that resource. 23 | 24 | ## Next 25 | 26 | Read about [how to configure resource profiles](../how-to/configure-resource-profiles.md). -------------------------------------------------------------------------------- /docs/concepts/storage-caching.md: -------------------------------------------------------------------------------- 1 | # Storage / Caching 2 | 3 | With "Large" in the name, caching is a critical part of serving LLMs. 4 | 5 | The best caching technique may vary depending on your environment: 6 | 7 | * What cloud features are available? 8 | * Is your cluster deployed in an air-gapped environment? 9 | 10 | ## A. Model built into container 11 | 12 | **Status:** Supported 13 | 14 | Building a model into a container image can provide a simple way to take advantage of image-related optimizations built into Kubernetes: 15 | 16 | * Relaunching a model server on the same Node that it ran on before will [likely](https://kubernetes.io/docs/concepts/architecture/garbage-collection/#container-image-lifecycle) be able to reuse the previously pulled image. 17 | 18 | * [Secondary boot disks on GKE](https://cloud.google.com/kubernetes-engine/docs/how-to/data-container-image-preloading) can be used to avoid needing to pull images. 19 | 20 | * [Image streaming on GKE](https://cloud.google.com/blog/products/containers-kubernetes/introducing-container-image-streaming-in-gke) can allow containers to start up before the entire image is present on the Node. 21 | 22 | * Container images can be pre-installed on Nodes in air-gapped environments (example: [k3s airgap installation](https://docs.k3s.io/installation/airgap)). 23 | 24 | 25 | **Guides:** 26 | 27 | * [How to build models into container images](../how-to/build-models-into-containers.md) 28 | 29 | ## B. Model on shared filesystem (read-write-many) 30 | 31 | KubeAI can manage model caches on a shared filesystem (i.e. AWS [EFS](https://aws.amazon.com/efs/), GCP [Filestore](https://cloud.google.com/filestore/docs/overview), NFS). It manages the full lifecycle of a cached model: loading, serving, and cache eviction (on deletion of the Model). 32 | 33 |
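As a sketch, a Model opts into a shared-filesystem cache by referencing a cache profile (the `standard-filestore` profile below is the one defined in the bundled GKE values file; any configured cache profile works the same way):

```yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: llama-3.1-8b-instruct-fp8-l4
spec:
  features: [TextGeneration]
  url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8
  engine: VLLM
  # Load the model once into the shared filesystem referenced by this cache
  # profile and serve all replicas from that cache.
  cacheProfile: standard-filestore
  resourceProfile: nvidia-gpu-l4:1
```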
34 | 35 | 36 | ## C. Model on read-only-many disk 37 | 38 | **Status:** [Planned](https://github.com/substratusai/kubeai/blob/main/proposals/model-storage.md). 39 | 40 | Examples: [GCP Hyperdisk ML](https://cloud.google.com/compute/docs/disks/hyperdisks) 41 | -------------------------------------------------------------------------------- /docs/contributing/development-guide.md: -------------------------------------------------------------------------------- 1 | # KubeAI Development Guide 2 | 3 | ## OpenAI API 4 | - Types: See `./api/openai/v1/README.md` 5 | 6 | ## Build and Run Commands 7 | - Build: `make build` (manager binary) 8 | - Docker: `make docker-build` 9 | - Run locally: `make run` 10 | - Generate go code (for `./api/*`): `make generate` 11 | - Generate manifests: `make manifests` 12 | 13 | ## Testing Commands 14 | - Unit tests: `make test-unit` 15 | * Single unit test (does not work for integration tests): `go test -v ./path/to/package -run TestNamePattern` 16 | - Integration tests: `make test-integration RUN=SpecificTestToRun` 17 | - E2E tests: `make test-e2e-*` (various test suites) 18 | * Must be run with an active `kind` cluster (Run `kind create cluster` if `kubectl config current-context` does not report a cluster as existing). 19 | 20 | ## Code Style 21 | - Format: `make fmt` (standard Go formatting) 22 | - Lint: `make lint` (golangci-lint v1.59.1) 23 | - Vet: `make vet` (standard Go vetting) 24 | 25 | ## Conventions 26 | - Standard Go project layout (cmd/, internal/, api/, test/) 27 | - Table-driven tests with descriptive names 28 | - Use testify for assertions 29 | - Integration tests use require.EventuallyWithT for async verification 30 | - Follow Kubernetes controller patterns (kubebuilder / controller-runtime) -------------------------------------------------------------------------------- /docs/contributing/documentation.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | We are grateful for anyone who takes the time to improve KubeAI documentation! In order to keep our docs clear and consistent we ask that you first read about the approach to documentation that we have standardized on... 4 | 5 | ## Read before writing! 6 | 7 | The KubeAI approach to documentation is loosely inspired by the [Diataxis](https://diataxis.fr/) method. 8 | 9 | TLDR on how KubeAI docs are organized: 10 | 11 | * **Installation**: How-to guides specific to installing KubeAI. 12 | * **How To**: Directions that guide the reader through a problem or towards a result. How-to guides are goal-oriented. They assume the user is familiar with general concepts, tools, and has already installed KubeAI. 13 | * **Concepts**: A reflective explanation of KubeAI topics with a focus on giving the reader an understanding of the why. 14 | * **Tutorials**: Learning oriented experiences. Lessons that often guide a user from beginning to end. The goal is to help the reader *learn* something (compared to a how-to guide that is focused on helping the reader *do* something). 15 | * **Contributing**: The docs in here differ from the rest of the docs by audience: these docs are for anyone who will be contributing code or docs to the KubeAI project. 
16 | 17 | ## How to serve kubeai.org locally 18 | 19 | Make sure you have python3 installed and run: 20 | 21 | ```bash 22 | make docs 23 | ``` -------------------------------------------------------------------------------- /docs/diagrams/arch.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/arch.excalidraw.png -------------------------------------------------------------------------------- /docs/diagrams/autoscaling.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/autoscaling.excalidraw.png -------------------------------------------------------------------------------- /docs/diagrams/caching-shared-filesystem.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/caching-shared-filesystem.excalidraw.png -------------------------------------------------------------------------------- /docs/diagrams/chwbl.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/chwbl.excalidraw.png -------------------------------------------------------------------------------- /docs/diagrams/lora-direct-loading.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/lora-direct-loading.excalidraw.png -------------------------------------------------------------------------------- /docs/diagrams/multi-threaded-shared-context.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/multi-threaded-shared-context.excalidraw.png -------------------------------------------------------------------------------- /docs/diagrams/multi-turn-clients.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/multi-turn-clients.excalidraw.png -------------------------------------------------------------------------------- /docs/diagrams/multitenancy-labels.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/multitenancy-labels.excalidraw.png -------------------------------------------------------------------------------- /docs/diagrams/private-deep-chat.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/private-deep-chat.excalidraw.png -------------------------------------------------------------------------------- /docs/diagrams/random-vs-consistent-hash.excalidraw.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/random-vs-consistent-hash.excalidraw.png -------------------------------------------------------------------------------- /docs/graphs/throughput-benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/graphs/throughput-benchmark.png -------------------------------------------------------------------------------- /docs/graphs/ttft-benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/graphs/ttft-benchmark.png -------------------------------------------------------------------------------- /docs/how-to/build-models-into-containers.md: -------------------------------------------------------------------------------- 1 | # Build models into containers 2 | 3 | In this guide we will preload an LLM into a custom-built Ollama serving image. You can follow the same steps for other models and other serving engines. 4 | 5 | Define some values: 6 | ```bash 7 | export MODEL_URL=ollama://qwen2:0.5b 8 | 9 | # Customize with your own image repo. 10 | export IMAGE=us-central1-docker.pkg.dev/substratus-dev/default/ollama-builtin-qwen2-05b:latest 11 | ``` 12 | 13 | Build and push the image. Note: building (downloading the base image & model) and pushing (uploading the image & model) can take a while depending on the size of the model. 14 | 15 | ```bash 16 | git clone https://github.com/substratusai/kubeai 17 | cd ./kubeai/examples/ollama-builtin 18 | 19 | docker build --build-arg MODEL_URL=$MODEL_URL -t $IMAGE . 20 | docker push $IMAGE 21 | ``` 22 | 23 | Create a model manifest & apply it into a cluster with KubeAI installed. NOTE: The only difference between a built-in model image and any other model is the addition of the `image:` field. 24 | 25 | ```bash 26 | kubectl apply -f - << EOF 27 | apiVersion: kubeai.org/v1 28 | kind: Model 29 | metadata: 30 | name: builtin-model-example 31 | spec: 32 | features: ["TextGeneration"] 33 | owner: alibaba 34 | image: $IMAGE # <-- The image with model built-in 35 | url: "$MODEL_URL" 36 | engine: OLlama 37 | resourceProfile: cpu:1 38 | EOF 39 | ``` 40 | -------------------------------------------------------------------------------- /docs/how-to/configure-autoscaling.md: -------------------------------------------------------------------------------- 1 | # Configure autoscaling 2 | 3 | This guide will cover how to configure KubeAI [autoscaling](../concepts/autoscaling.md) parameters. 4 | 5 | ## System Settings 6 | 7 | KubeAI administrators can define system-wide autoscaling settings by setting the following Helm values (for the `kubeai/kubeai` chart): 8 | 9 | Example: 10 | 11 | ```yaml 12 | # helm-values.yaml 13 | modelAutoscaling: 14 | interval: 15s 15 | timeWindow: 10m 16 | # ... 17 | ``` 18 | 19 | ## Model Settings 20 | 21 | The following settings can be configured on a model-by-model basis. 22 | 23 | ### Model settings: helm 24 | 25 | If you are managing models via the `kubeai/models` Helm chart, you can use: 26 | 27 | ```yaml 28 | # helm-values.yaml 29 | catalog: 30 | model-a: 31 | # ... 32 | minReplicas: 1 33 | maxReplicas: 9 34 | targetRequests: 250 35 | scaleDownDelaySeconds: 45 36 | model-b: 37 | # ... 38 | disableAutoscaling: true 39 | # ...
40 | ``` 41 | 42 | Re-running `helm upgrade` with these additional parameters will update model settings in the cluster. 43 | 44 | ### Model settings: kubectl 45 | 46 | You can also specify the autoscaling profile directly via the Models custom resource in the Kubernetes API: 47 | 48 | ```yaml 49 | apiVersion: kubeai.org/v1 50 | kind: Model 51 | metadata: 52 | name: my-model 53 | spec: 54 | # ... 55 | minReplicas: 1 56 | maxReplicas: 9 57 | targetRequests: 250 58 | scaleDownDelaySeconds: 45 59 | ``` 60 | 61 | If you are already managing models using Model manifest files, you can make the update to your file and reapply it using `kubectl apply -f .yaml`. 62 | -------------------------------------------------------------------------------- /docs/how-to/configure-embedding-models.md: -------------------------------------------------------------------------------- 1 | # Configure embedding models 2 | 3 | KubeAI supports the following engines for text embedding models: 4 | 5 | - Infinity 6 | - vLLM 7 | - Ollama 8 | 9 | Infinity supports any HuggingFace models listed as text-embedding. See the [models, reranking or clip models on huggingface](https://huggingface.co/models?other=text-embeddings-inference&sort=trending) for reference. 10 | 11 | 12 | ## Install BAAI/bge-small-en-v1.5 model using Infinity 13 | 14 | Create a file named `kubeai-models.yaml` with the following content: 15 | 16 | ```yaml 17 | catalog: 18 | bge-embed-text-cpu: 19 | enabled: true 20 | features: ["TextEmbedding"] 21 | owner: baai 22 | url: "hf://BAAI/bge-small-en-v1.5" 23 | engine: Infinity 24 | resourceProfile: cpu:1 25 | minReplicas: 1 26 | ``` 27 | 28 | Apply the kubeai-models helm chart: 29 | 30 | ```bash 31 | helm install kubeai-models kubeai/models -f ./kubeai-models.yaml 32 | ``` 33 | 34 | Once the pod is ready, you can use the OpenAI Python SDK to interact with the model: 35 | 36 | ```python 37 | from openai import OpenAI 38 | # Assumes port-forward of kubeai service to localhost:8000. 39 | client = OpenAI(api_key="ignored", base_url="http://localhost:8000/openai/v1") 40 | response = client.embeddings.create( 41 | input="Your text goes here.", 42 | model="bge-embed-text-cpu" 43 | ) 44 | ``` 45 | -------------------------------------------------------------------------------- /docs/how-to/configure-speech-to-text.md: -------------------------------------------------------------------------------- 1 | # Configure speech-to-text 2 | 3 | KubeAI provides a Speech to Text endpoint that can be used to transcribe audio files. This guide will walk you through the steps to enable this feature. 4 | 5 | ## Enable Speech to Text model 6 | You can create new models by creating a Model CRD object or by enabling a model from the model catalog. 7 | 8 | ### Enable from model catalog 9 | KubeAI provides predefined models in the `kubeai/models` Helm chart. To enable the Speech to Text model, you can set the `enabled` flag to `true` in your values file. 10 | 11 | ```yaml 12 | # models-helm-values.yaml 13 | catalog: 14 | faster-whisper-medium-en-cpu: 15 | enabled: true 16 | minReplicas: 1 17 | ``` 18 | 19 | ### Enable by creating Model 20 | You can also create a Model object to enable the Speech to Text model. 
For example: 21 | 22 | ```yaml 23 | apiVersion: kubeai.org/v1 24 | kind: Model 25 | metadata: 26 | name: faster-whisper-medium-en-cpu 27 | spec: 28 | features: [SpeechToText] 29 | owner: Systran 30 | url: hf://Systran/faster-whisper-medium.en 31 | engine: FasterWhisper 32 | resourceProfile: cpu:1 33 | ``` 34 | 35 | ## Usage 36 | The Speech to Text endpoint is available at `/openai/v1/transcriptions`. 37 | 38 | Example usage using curl: 39 | 40 | ```bash 41 | curl -L -o kubeai.mp4 https://github.com/user-attachments/assets/711d1279-6af9-4c6c-a052-e59e7730b757 42 | curl http://localhost:8000/openai/v1/audio/transcriptions \ 43 | -F "file=@kubeai.mp4" \ 44 | -F "language=en" \ 45 | -F "model=faster-whisper-medium-en-cpu" 46 | ``` 47 | -------------------------------------------------------------------------------- /docs/how-to/load-models-from-pvc.md: -------------------------------------------------------------------------------- 1 | # Load Models from PVC 2 | 3 | You can store your models in a Persistent Volume Claim (PVC) and let KubeAI use them for serving. 4 | Both vLLM and Ollama engines support loading models from PVCs. 5 | 6 | You must ensure the model files are already present in the PVC before creating the Model resource. 7 | Alternatively you can use KubeAI's native caching mechanism which downloads the model for you: 8 | 9 | - [Cache Models with GCP Filestore](./cache-models-with-gcp-filestore.md) 10 | - [Cache Models with EFS](./cache-models-with-aws-efs.md) 11 | 12 | 13 | ## vLLM 14 | 15 | For vLLM, use the following URL format: 16 | ```yaml 17 | url: pvc://$PVC_NAME # Loads the model from the PVC named $PVC_NAME 18 | url: pvc://$PVC_NAME/$PATH # Loads from a specific path within the PVC 19 | ``` 20 | 21 | ### PVC requirements 22 | 23 | vLLM supports both ReadWriteMany and ReadOnlyMany access modes. `Many` is used in order to support more than 1 vLLM replica. 24 | 25 | 26 | ## Ollama 27 | 28 | For Ollama, use the following URL formats: 29 | ```yaml 30 | url: pvc://$PVC_NAME?model=$MODEL_NAME # Loads the model named $MODEL_NAME that's loaded on the disk 31 | url: pvc://$PVC_NAME/$PATH?model=$MODEL_NAME 32 | ``` 33 | 34 | ### PVC Requirements 35 | Ollama requires using ReadWriteMany access mode because the rename operation `ollama cp` needs to write to the PVC. 36 | 37 | ### Example: Loading Qwen 0.5b from PVC 38 | 39 | 1. Create a PVC with ReadWriteMany named `model-pvc`. See [example](https://github.com/substratusai/kubeai/blob/main/examples/ollama-pvc/pvc.yaml). 40 | 2. Create a K8s Job to load the model onto `model-pvc. See [example](https://github.com/substratusai/kubeai/blob/main/examples/ollama-pvc/job.yaml) 41 | 42 | The PVC should now have a `blobs/` and `manifests/` directory after the loader completes. 43 | 44 | 45 | 3. Create a Model to load from PVC: 46 | 47 | ```yaml 48 | url: pvc://model-pvc?model=qwen:0.5b 49 | ``` 50 | -------------------------------------------------------------------------------- /docs/overrides/partials/integrations/analytics/custom.html: -------------------------------------------------------------------------------- 1 | 3 | -------------------------------------------------------------------------------- /docs/reference/.kubernetes-api/config.yaml: -------------------------------------------------------------------------------- 1 | processor: 2 | # RE2 regular expressions describing types that should be excluded from the generated documentation. 
3 | ignoreTypes: 4 | - "List" 5 | # RE2 regular expressions describing type fields that should be excluded from the generated documentation. 6 | ignoreFields: 7 | - "TypeMeta" 8 | 9 | render: 10 | # Version of Kubernetes to use when generating links to Kubernetes API documentation. 11 | kubernetesVersion: 1.31 12 | -------------------------------------------------------------------------------- /docs/reference/openai-api-compatibility.md: -------------------------------------------------------------------------------- 1 | # OpenAI API Compatibility 2 | 3 | KubeAI provides an OpenAI API compatibility layer. 4 | 5 | ## General: 6 | 7 | ### Models 8 | 9 | ``` 10 | GET /v1/models 11 | ``` 12 | 13 | * Lists all `kind: Model` objects installed in the Kubernetes API Server. 14 | 15 | 16 | ## Inference 17 | 18 | ### Text Generation 19 | 20 | ``` 21 | POST /v1/chat/completions 22 | POST /v1/completions 23 | ``` 24 | 25 | * Supported for Models with `.spec.features: ["TextGeneration"]`. 26 | 27 | ### Embeddings 28 | 29 | ``` 30 | POST /v1/embeddings 31 | ``` 32 | 33 | * Supported for Models with `.spec.features: ["TextEmbedding"]`. 34 | 35 | ### Speech-to-Text 36 | 37 | ``` 38 | POST /v1/audio/transcriptions 39 | ``` 40 | 41 | * Supported for Models with `.spec.features: ["SpeechToText"]`. 42 | 43 | ## OpenAI Client libraries 44 | You can use the official OpenAI client libraries by setting the 45 | `base_url` to the KubeAI endpoint. 46 | 47 | For example, you can use the Python client like this: 48 | ```python 49 | from openai import OpenAI 50 | client = OpenAI(api_key="ignored", 51 | base_url="http://kubeai/openai/v1") 52 | response = client.chat.completions.create( 53 | model="gemma2-2b-cpu", 54 | messages=[ 55 | {"role": "system", "content": "You are a helpful assistant."}, 56 | {"role": "user", "content": "Who won the world series in 2020?"}, 57 | {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."}, 58 | {"role": "user", "content": "Where was it played?"} 59 | ] 60 | ) 61 | ``` 62 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocs-material 3 | mkdocs-awesome-pages-plugin 4 | mkdocs-material[imaging] -------------------------------------------------------------------------------- /docs/screenshots/gcp-cpus-all-regions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/screenshots/gcp-cpus-all-regions.png -------------------------------------------------------------------------------- /docs/screenshots/gcp-gpus-all-regions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/screenshots/gcp-gpus-all-regions.png -------------------------------------------------------------------------------- /docs/screenshots/gcp-quota-preemptible-nvidia-l4-gpus-regional.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/screenshots/gcp-quota-preemptible-nvidia-l4-gpus-regional.png -------------------------------------------------------------------------------- /docs/screenshots/gcp-quota-premium-storage-gb-per-region.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/screenshots/gcp-quota-premium-storage-gb-per-region.png -------------------------------------------------------------------------------- /docs/screenshots/gcp-tpu-preemptible-v5e-quota.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/screenshots/gcp-tpu-preemptible-v5e-quota.png -------------------------------------------------------------------------------- /docs/screenshots/langtrace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/screenshots/langtrace.png -------------------------------------------------------------------------------- /docs/screenshots/private-deep-chat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/screenshots/private-deep-chat.png -------------------------------------------------------------------------------- /examples/k8s-api-clients/python/example.py: -------------------------------------------------------------------------------- 1 | from kubernetes import config, dynamic 2 | from kubernetes.client import api_client 3 | 4 | k8s_client = dynamic.DynamicClient( 5 | api_client.ApiClient(configuration=config.load_kube_config()) 6 | ) 7 | 8 | models_client = k8s_client.resources.get(api_version="kubeai.org/v1", kind="Model") 9 | 10 | model = { 11 | "apiVersion": "kubeai.org/v1", 12 | "kind": "Model", 13 | "metadata": { 14 | "name": "facebook-opt-125m", 15 | "namespace": "default", 16 | }, 17 | "spec": { 18 | "features": ["TextGeneration"], 19 | "owner": "facebook", 20 | "url": "hf://facebook/opt-125m", 21 | "engine": "VLLM", 22 | "resourceProfile": "cpu:1", 23 | }, 24 | } 25 | 26 | 27 | models_client.create(body=model) 28 | 29 | # Alternative: Use "server-side apply" (i.e. kubectl apply) to upsert the Model. 30 | # models_client.patch( 31 | # body=model, 32 | # content_type="application/apply-patch+yaml", 33 | # field_manager="my-example-app", # Set a field manager to track ownership of fields. 34 | # ) 35 | 36 | created_model = models_client.get(name="facebook-opt-125m", namespace="default") 37 | print(created_model) 38 | 39 | # Optionally delete the Model. 40 | # models_client.delete(name="facebook-opt-125m", namespace="default") 41 | -------------------------------------------------------------------------------- /examples/k8s-api-clients/python/requirements.txt: -------------------------------------------------------------------------------- 1 | kubernetes==31.0.0 2 | -------------------------------------------------------------------------------- /examples/ollama-builtin/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ollama/ollama:latest 2 | 3 | # Model to be downloaded. 4 | ARG MODEL_URL 5 | 6 | # MODEL_URL is a required argument. 7 | RUN test -n "${MODEL_URL}" 8 | 9 | # Set the model to be downloaded. 
10 | ENV MODEL_URL=${MODEL_URL} 11 | 12 | COPY ./download.sh /download.sh 13 | RUN ./download.sh -------------------------------------------------------------------------------- /examples/ollama-builtin/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu 4 | 5 | # Exit if the model URL is not set. 6 | : "$MODEL_URL" 7 | 8 | # Check if the model URL is in the correct format - matching 9 | # the format used in .spec.url in the Model Custom Resource. 10 | if [[ $MODEL_URL != ollama://* ]] ; 11 | then 12 | echo "MODEL_URL must use the \"ollama://\" format" 13 | exit 1 14 | fi 15 | 16 | ollama_model_name=${MODEL_URL#ollama://} 17 | 18 | # Run Ollama server in the background. 19 | /bin/ollama serve & 20 | pid=$! 21 | 22 | # TODO: Wait for the server to start using something more exact. 23 | sleep 5 24 | 25 | /bin/ollama pull $ollama_model_name 26 | 27 | # Send SIGTERM to the server to allow it to gracefully exit. 28 | kill -SIGTERM "$pid" 29 | 30 | # Wait for the server to exit. 31 | wait "$pid" 32 | -------------------------------------------------------------------------------- /examples/ollama-pvc/job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: ollama-load-model-to-pvc 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: ollama 10 | image: ollama/ollama:latest 11 | env: 12 | - name: OLLAMA_MODELS 13 | value: /model 14 | command: 15 | - /bin/sh 16 | - -c 17 | - | 18 | /bin/ollama serve & 19 | echo "Waiting for Ollama server to start..." 20 | sleep 10 21 | 22 | # Pull the model and ensure it downloads successfully 23 | echo "Pulling model qwen:0.5b..." 24 | if ! /bin/ollama pull qwen:0.5b; then 25 | echo "Failed to pull model" 26 | exit 1 27 | fi 28 | 29 | # Verify the model files exist 30 | echo "Verifying model files..." 31 | ls -R /model 32 | if [ ! -d "/model/blobs" ] || [ ! 
-d "/model/manifests" ]; then 33 | echo "Model directories not found" 34 | exit 1 35 | fi 36 | 37 | echo "Model setup completed successfully" 38 | ls -la /model/manifests/registry.ollama.ai/library/qwen/0.5b 39 | volumeMounts: 40 | - name: models-volume 41 | mountPath: /model 42 | volumes: 43 | - name: models-volume 44 | persistentVolumeClaim: 45 | claimName: model-pvc 46 | readOnly: false 47 | restartPolicy: OnFailure 48 | -------------------------------------------------------------------------------- /examples/ollama-pvc/pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: model-pvc 5 | spec: 6 | storageClassName: premium-rwx # replace with your actual storage class 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 10Gi -------------------------------------------------------------------------------- /examples/priority-examples/background-research-model.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeai.org/v1 2 | kind: Model 3 | metadata: 4 | name: background-research-model 5 | spec: 6 | features: [TextGeneration] 7 | url: ollama://gemma2:2b 8 | engine: OLlama 9 | # Background tasks with low priority will be preempted when resources are needed for higher priority models 10 | priorityClassName: low-priority 11 | resourceProfile: cpu:2 -------------------------------------------------------------------------------- /examples/priority-examples/critical-service-model.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeai.org/v1 2 | kind: Model 3 | metadata: 4 | name: critical-service-model 5 | spec: 6 | features: [TextGeneration] 7 | url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 8 | engine: VLLM 9 | # Critical service model gets high priority to preempt other models when resources are limited 10 | priorityClassName: high-priority 11 | resourceProfile: nvidia-gpu-l4:1 -------------------------------------------------------------------------------- /examples/priority-examples/hello-world-llm.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeai.org/v1 2 | kind: Model 3 | metadata: 4 | name: hello-world-llm 5 | spec: 6 | features: [TextGeneration] 7 | url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 8 | engine: VLLM 9 | # Uncomment to set the priority class for this model 10 | # priorityClassName: high-priority 11 | resourceProfile: nvidia-gpu-l4:1 -------------------------------------------------------------------------------- /examples/priority-examples/priority-classes.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: scheduling.k8s.io/v1 2 | kind: PriorityClass 3 | metadata: 4 | name: high-priority 5 | value: 1000000 # Higher value means higher priority 6 | globalDefault: false 7 | description: "This priority class should be used for critical inference models only." 8 | --- 9 | apiVersion: scheduling.k8s.io/v1 10 | kind: PriorityClass 11 | metadata: 12 | name: medium-priority 13 | value: 100000 14 | globalDefault: false 15 | description: "This priority class should be used for medium priority inference models." 
16 | --- 17 | apiVersion: scheduling.k8s.io/v1 18 | kind: PriorityClass 19 | metadata: 20 | name: low-priority 21 | value: 10000 22 | globalDefault: false 23 | description: "This priority class should be used for low priority inference models." -------------------------------------------------------------------------------- /examples/private-deep-chat/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.23 AS builder 2 | 3 | WORKDIR /workspace 4 | COPY go.* . 5 | 6 | RUN go mod download 7 | 8 | COPY main.go main.go 9 | RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -o server ./main.go 10 | 11 | FROM gcr.io/distroless/static:nonroot 12 | 13 | WORKDIR /app 14 | COPY --from=builder /workspace/server /app/ 15 | COPY ./static /app/static 16 | USER 65532:65532 17 | 18 | ENTRYPOINT ["/app/server"] 19 | -------------------------------------------------------------------------------- /examples/private-deep-chat/go.mod: -------------------------------------------------------------------------------- 1 | module private-chat 2 | 3 | go 1.22.0 4 | -------------------------------------------------------------------------------- /examples/private-deep-chat/manifests/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: private-deep-chat 5 | labels: 6 | app: private-deep-chat 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: private-deep-chat 12 | template: 13 | metadata: 14 | labels: 15 | app: private-deep-chat 16 | spec: 17 | containers: 18 | - name: server 19 | image: private-deep-chat:latest 20 | imagePullPolicy: IfNotPresent 21 | ports: 22 | - containerPort: 8000 23 | env: 24 | - name: LISTEN_ADDR 25 | value: ":8000" 26 | - name: KUBEAI_ADDR 27 | value: "http://kubeai" 28 | 29 | -------------------------------------------------------------------------------- /examples/private-deep-chat/manifests/models.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeai.org/v1 2 | kind: Model 3 | metadata: 4 | name: gemma2-a 5 | labels: 6 | tenancy: group-a 7 | spec: 8 | features: [TextGeneration] 9 | owner: google 10 | url: ollama://gemma2:2b 11 | engine: OLlama 12 | resourceProfile: cpu:2 13 | --- 14 | apiVersion: kubeai.org/v1 15 | kind: Model 16 | metadata: 17 | name: gemma2-b 18 | labels: 19 | tenancy: group-b 20 | spec: 21 | features: [TextGeneration] 22 | owner: google 23 | url: ollama://gemma2:2b 24 | engine: OLlama 25 | resourceProfile: cpu:2 -------------------------------------------------------------------------------- /examples/private-deep-chat/manifests/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: private-deep-chat 5 | labels: 6 | app: private-deep-chat 7 | spec: 8 | ports: 9 | - port: 80 10 | protocol: TCP 11 | targetPort: 8000 12 | selector: 13 | app: private-deep-chat 14 | -------------------------------------------------------------------------------- /examples/storage-classes/gcp-filestore.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: storage.k8s.io/v1 2 | kind: StorageClass 3 | metadata: 4 | name: gcp-filestore 5 | provisioner: filestore.csi.storage.gke.io 6 | volumeBindingMode: Immediate 7 | allowVolumeExpansion: true 8 | parameters: 9 | tier: standard 10 | network: default 
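# Illustrative usage (hypothetical claim name and size, shown only as a sketch):
# a ReadWriteMany PersistentVolumeClaim can reference this class by name, e.g.:
#
#   apiVersion: v1
#   kind: PersistentVolumeClaim
#   metadata:
#     name: shared-model-cache
#   spec:
#     storageClassName: gcp-filestore
#     accessModes: [ReadWriteMany]
#     resources:
#       requests:
#         storage: 1Ti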
-------------------------------------------------------------------------------- /hack/apply-model.sh: -------------------------------------------------------------------------------- 1 | model='opt-125m-cpu' 2 | helm template ./charts/models --set "catalog.$model.enabled=true" --set "catalog.$model.minReplicas=1" | kubectl apply -f - -------------------------------------------------------------------------------- /hack/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ -------------------------------------------------------------------------------- /hack/create-dev-gke-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cluster_name="kubeai-dev-1" 4 | 5 | gcloud container clusters create $cluster_name \ 6 | --zone us-central1-a \ 7 | --node-locations us-central1-a --num-nodes 1 --machine-type e2-medium 8 | 9 | gcloud container node-pools create n2s4 \ 10 | --cluster=$cluster_name \ 11 | --zone us-central1-a \ 12 | --machine-type=n2-standard-4 \ 13 | --enable-autoscaling \ 14 | --num-nodes=0 \ 15 | --min-nodes=0 \ 16 | --max-nodes=3 17 | 18 | gcloud container node-pools create n2s8 \ 19 | --cluster=$cluster_name \ 20 | --zone us-central1-a \ 21 | --machine-type=n2-standard-8 \ 22 | --enable-autoscaling \ 23 | --num-nodes=0 \ 24 | --min-nodes=0 \ 25 | --max-nodes=3 26 | 27 | gcloud container node-pools create n2s16 \ 28 | --cluster=$cluster_name \ 29 | --zone us-central1-a \ 30 | --machine-type=n2-standard-16 \ 31 | --enable-autoscaling \ 32 | --num-nodes=0 \ 33 | --min-nodes=0 \ 34 | --max-nodes=3 35 | 36 | gcloud container node-pools create g2s8 \ 37 | --cluster=$cluster_name \ 38 | --zone us-central1-a \ 39 | --accelerator=type=nvidia-l4,count=1,gpu-driver-version=default \ 40 | --machine-type=g2-standard-8 \ 41 | --enable-autoscaling \ 42 | --num-nodes=0 \ 43 | --min-nodes=0 \ 44 | --max-nodes=3 45 | 46 | -------------------------------------------------------------------------------- /hack/dev-configs/gke.yaml: -------------------------------------------------------------------------------- 1 | secretNames: 2 | huggingface: huggingface 3 | 4 | modelServers: 5 | VLLM: 6 | images: 7 | default: "vllm/vllm-openai:v0.6.3.post1" 8 | cpu: "substratusai/vllm:v0.6.3.post1-cpu" 9 | google-tpu: "substratusai/vllm:v0.6.3.post1-tpu" 10 | OLlama: 11 | images: 12 | default: "ollama/ollama:latest" 13 | FasterWhisper: 14 | images: 15 | default: "fedirz/faster-whisper-server:latest-cpu" 16 | nvidia-gpu: "fedirz/faster-whisper-server:latest-cuda" 17 | Infinity: 18 | images: 19 | default: "michaelf34/infinity:latest" 20 | 21 | modelLoading: 22 | image: us-central1-docker.pkg.dev/substratus-dev/default/kubeai-model-loader 23 | 24 | modelRollouts: 25 | surge: 0 26 | messaging: 27 | errorMaxBackoff: 30s 28 | streams: [] 29 | #- requestsURL: 
gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub 30 | # responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses 31 | # maxHandlers: 1 32 | resourceProfiles: 33 | cpu: 34 | imageName: "cpu" 35 | requests: 36 | # Kind 37 | #cpu: 0.5 38 | #memory: 1Gi 39 | # GKE 40 | cpu: 3 41 | memory: 12Gi 42 | limits: 43 | cpu: 3 44 | memory: 12Gi 45 | nvidia-gpu-l4: 46 | limits: 47 | nvidia.com/gpu: "1" 48 | requests: 49 | nvidia.com/gpu: "1" 50 | cpu: "6" 51 | memory: "24Gi" 52 | nodeSelector: 53 | cloud.google.com/gke-accelerator: "nvidia-l4" 54 | cloud.google.com/gke-spot: "true" 55 | 56 | cacheProfiles: 57 | fstore: 58 | sharedFilesystem: 59 | #storageClassName: "kubeai-filestore" 60 | persistentVolumeName: "preprov1" 61 | 62 | # Dev-only configuration. 63 | allowPodAddressOverride: true 64 | fixedSelfMetricAddrs: ["127.0.0.1:"] 65 | 66 | modelAutoscaling: 67 | interval: 10s 68 | timeWindow: 60s 69 | stateConfigMapName: kubeai-autoscaler-state -------------------------------------------------------------------------------- /hack/dev-configs/kind.yaml: -------------------------------------------------------------------------------- 1 | secretNames: 2 | huggingface: huggingface 3 | 4 | modelServers: 5 | VLLM: 6 | images: 7 | # The key is the image name (referenced from resourceProfiles) and the value is the image. 8 | # The "default" image should always be specified. 9 | # "default" is used when no imageName is specified or if a specific image is not found. 10 | default: "vllm/vllm-openai:v0.6.2" 11 | cpu: "substratusai/vllm:v0.6.1-cpu" 12 | nvidia-gpu: "vllm/vllm-openai:v0.6.2" 13 | google-tpu: "substratusai/vllm:v0.6.1-tpu" 14 | OLlama: 15 | images: 16 | default: "ollama/ollama:latest" 17 | FasterWhisper: 18 | images: 19 | default: "fedirz/faster-whisper-server:latest-cpu" 20 | nvidia-gpu: "fedirz/faster-whisper-server:latest-cuda" 21 | Infinity: 22 | images: 23 | default: "michaelf34/infinity:latest" 24 | 25 | modelLoading: 26 | image: kubeai-model-loader:latest 27 | 28 | modelRollouts: 29 | surge: 0 30 | messaging: 31 | errorMaxBackoff: 30s 32 | streams: [] 33 | #- requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub 34 | # responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses 35 | # maxHandlers: 1 36 | resourceProfiles: 37 | cpu: 38 | imageName: "cpu" 39 | requests: 40 | cpu: 0.5 41 | memory: 1Gi 42 | limits: 43 | cpu: 3 44 | memory: 12Gi 45 | nvidia-gpu-l4: 46 | limits: 47 | nvidia.com/gpu: "1" 48 | requests: 49 | nvidia.com/gpu: "1" 50 | cpu: "6" 51 | memory: "24Gi" 52 | 53 | cacheProfiles: 54 | fstore: 55 | sharedFilesystem: 56 | #storageClassName: "kubeai-filestore" 57 | persistentVolumeName: "preprov1" 58 | 59 | # Dev-only configuration. 
60 | allowPodAddressOverride: true 61 | fixedSelfMetricAddrs: ["127.0.0.1:8080"] 62 | 63 | modelAutoscaling: 64 | interval: 10s 65 | timeWindow: 60s 66 | stateConfigMapName: kubeai-autoscaler-state -------------------------------------------------------------------------------- /hack/dev-gke-helm-values.yaml: -------------------------------------------------------------------------------- 1 | models: 2 | catalog: 3 | llama-3.1-8b-instruct-fp8-l4: 4 | enabled: true 5 | 6 | resourceProfiles: 7 | nvidia-gpu-l4: 8 | nodeSelector: 9 | cloud.google.com/gke-accelerator: "nvidia-l4" 10 | cloud.google.com/gke-spot: "true" -------------------------------------------------------------------------------- /hack/dev-load/k6.js: -------------------------------------------------------------------------------- 1 | import http from 'k6/http'; 2 | import { sleep } from 'k6'; 3 | 4 | export const options = { 5 | stages: [ 6 | { duration: '15s', target: 1 }, 7 | { duration: '15s', target: 9 }, 8 | { duration: '1m', target: 9 }, 9 | { duration: '15s', target: 0 }, 10 | { duration: '15s', target: 0 }, 11 | ], 12 | }; 13 | 14 | export default function () { 15 | const url = 'http://kubeai/openai/v1/completions'; 16 | 17 | let data = { 18 | "prompt": "Your text string goes here", 19 | "model": "dev" 20 | }; 21 | 22 | let res = http.post(url, JSON.stringify(data), { 23 | headers: { 'Content-Type': 'application/json' }, 24 | }); 25 | 26 | sleep(1); 27 | } 28 | -------------------------------------------------------------------------------- /hack/dev-load/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: dev-load 5 | spec: 6 | restartPolicy: Never 7 | containers: 8 | - name: k6 9 | image: grafana/k6 10 | args: ["run", "/config/k6.js"] #, "--http-debug"] 11 | volumeMounts: 12 | - name: config 13 | mountPath: /config 14 | volumes: 15 | - name: config 16 | configMap: 17 | name: dev-load -------------------------------------------------------------------------------- /hack/dev-load/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -u 5 | set -x 6 | 7 | this_dir=$(dirname "$0") 8 | 9 | kubectl create configmap dev-load --from-file $this_dir/k6.js --dry-run=client -oyaml | kubectl apply -f - 10 | 11 | kubectl create -f $this_dir/pod.yaml -------------------------------------------------------------------------------- /hack/dev-models/kind-cpu-adapters.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeai.org/v1 2 | kind: Model 3 | metadata: 4 | name: dev 5 | annotations: 6 | # Have the controller send requests to localhost to allow for 7 | # running the controller locally (assuming a port-forward is in place). 
8 | model-pod-ip: "127.0.0.1" 9 | model-pod-port: "7000" 10 | spec: 11 | features: ["TextGeneration"] 12 | owner: alibaba 13 | url: "ollama://qwen2:0.5b" 14 | engine: OLlama 15 | resourceProfile: cpu:1 16 | minReplicas: 1 17 | maxReplicas: 3 18 | adapters: 19 | - name: abc 20 | url: hf://facebook/opt-125m 21 | --- 22 | # Service for port-forwarding to the model: 23 | # 24 | # while true; do kubectl port-forward service/dev-model 7000:7000; done 25 | # 26 | apiVersion: v1 27 | kind: Service 28 | metadata: 29 | name: dev-model 30 | spec: 31 | selector: 32 | model: dev 33 | ports: 34 | - protocol: TCP 35 | port: 7000 36 | targetPort: 8000 -------------------------------------------------------------------------------- /hack/dev-models/kind-cpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeai.org/v1 2 | kind: Model 3 | metadata: 4 | name: dev 5 | annotations: 6 | # Have the controller send requests to localhost to allow for 7 | # running the controller locally (assuming a port-forward is in place). 8 | model-pod-ip: "127.0.0.1" 9 | model-pod-port: "7000" 10 | spec: 11 | features: ["TextGeneration"] 12 | owner: alibaba 13 | url: "ollama://qwen2:0.5b" 14 | engine: OLlama 15 | #url: hf://facebook/opt-125m 16 | #engine: VLLM 17 | resourceProfile: cpu:1 18 | #cacheProfile: fstore 19 | minReplicas: 1 20 | maxReplicas: 3 21 | #url: hf://meta-llama/Meta-Llama-3.1-8B-Instruct 22 | #args: 23 | # - --max-model-len=32768 24 | # - --max-num-batched-token=32768 25 | --- 26 | # Service for port-forwarding to the model: 27 | # 28 | # while true; do kubectl port-forward service/dev-model 7000:7000; done 29 | # 30 | apiVersion: v1 31 | kind: Service 32 | metadata: 33 | name: dev-model 34 | spec: 35 | selector: 36 | model: dev 37 | ports: 38 | - protocol: TCP 39 | port: 7000 40 | targetPort: 8000 -------------------------------------------------------------------------------- /hack/dev-models/kind-vllm-cpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeai.org/v1 2 | kind: Model 3 | metadata: 4 | name: dev 5 | annotations: 6 | # Have the controller send requests to localhost to allow for 7 | # running the controller locally (assuming a port-forward is in place). 8 | model-pod-ip: "127.0.0.1" 9 | model-pod-port: "7000" 10 | spec: 11 | features: ["TextGeneration"] 12 | owner: facebook 13 | url: hf://facebook/opt-125m 14 | engine: VLLM 15 | resourceProfile: cpu:1 16 | minReplicas: 1 17 | maxReplicas: 3 18 | args: 19 | # This revision does not contain its own chat template.
20 | - --revision=27dcfa74d334bc871f3234de431e71c6eeba5dd6 21 | - --chat-template=/config/chat-template.jinja 22 | - --swap-space=1 23 | env: 24 | VLLM_CPU_KVCACHE_SPACE: "2" 25 | files: 26 | - path: "/config/chat-template.jinja" 27 | content: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n' + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}" 28 | - path: "/config/prompt.txt" 29 | content: "prompt content" 30 | --- 31 | # Service for port-forwarding to the model: 32 | # 33 | # while true; do kubectl port-forward service/dev-model 7000:7000; done 34 | # 35 | apiVersion: v1 36 | kind: Service 37 | metadata: 38 | name: dev-model 39 | spec: 40 | selector: 41 | model: dev 42 | ports: 43 | - protocol: TCP 44 | port: 7000 45 | targetPort: 8000 -------------------------------------------------------------------------------- /hack/dev-models/vllm-chat.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeai.org/v1 2 | kind: Model 3 | metadata: 4 | name: tinyllama-chat 5 | spec: 6 | features: [TextGeneration] 7 | owner: meta-llama 8 | url: hf://TinyLlama/TinyLlama-1.1B-Chat-v1.0 9 | #adapters: 10 | #- name: foo 11 | # url: hf://jashing/tinyllama-colorist-lora 12 | #- name: bar 13 | # url: s3://substratus-ai-test-0/adapters/jashing/tinyllama-colorist-lora 14 | #- name: baz 15 | # url: gs://substratus-ai-test-0/adapters/jashing/tinyllama-colorist-lora 16 | engine: VLLM 17 | resourceProfile: nvidia-gpu-l4:1 18 | minReplicas: 1 -------------------------------------------------------------------------------- /hack/dev-models/vllm-gs-url.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: gs-opt-125m-cpu 6 | spec: 7 | features: [TextGeneration] 8 | owner: facebook 9 | url: gs://substratus-ai-test-0/models/facebook/opt-125m 10 | cacheProfile: standard-filestore 11 | engine: VLLM 12 | resourceProfile: cpu:4 13 | minReplicas: 1 14 | -------------------------------------------------------------------------------- /hack/dev-models/vllm-s3-url.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: s3-opt-125m-cpu 6 | spec: 7 | features: [TextGeneration] 8 | owner: facebook 9 | url: s3://substratus-ai-test-0/models/facebook/opt-125m 10 | cacheProfile: standard-filestore 11 | engine: VLLM 12 | resourceProfile: cpu:4 13 | minReplicas: 1 14 | -------------------------------------------------------------------------------- /hack/dev-models/vllm-with-adapters.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeai.org/v1 2 | kind: Model 3 | metadata: 4 | name: tinyllama-chat-adapters 5 | spec: 6 | features: [TextGeneration] 7 | owner: meta-llama 8 | url: hf://TinyLlama/TinyLlama-1.1B-Chat-v0.3 9 | adapters: 10 | - name: foo 11 | url: hf://jashing/tinyllama-colorist-lora 12 | - name: bar 13 | url: s3://substratus-ai-test-0/adapters/jashing/tinyllama-colorist-lora 14 | #- name: baz 15 | # url:
gs://substratus-ai-test-0/adapters/jashing/tinyllama-colorist-lora 16 | engine: VLLM 17 | resourceProfile: nvidia-gpu-l4:1 18 | minReplicas: 1 -------------------------------------------------------------------------------- /hack/pvs/preprov-filestore.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: preprov1 5 | spec: 6 | storageClassName: "" 7 | capacity: 8 | storage: 1Ti 9 | accessModes: 10 | - ReadWriteMany 11 | persistentVolumeReclaimPolicy: Retain 12 | volumeMode: Filesystem 13 | csi: 14 | driver: filestore.csi.storage.gke.io 15 | volumeHandle: "modeInstance/us-central1-f/preprov1/vol1" 16 | volumeAttributes: 17 | # Replace with IP from created Filestore instance: 18 | ip: "10.100.234.50" 19 | volume: vol1 -------------------------------------------------------------------------------- /hack/vllm-mock-metrics/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "net/http" 6 | "os" 7 | ) 8 | 9 | func main() { 10 | // Serve metrics.txt at /metrics 11 | metrics, err := os.ReadFile("metrics.txt") 12 | if err != nil { 13 | log.Fatal(err) 14 | } 15 | log.Println("starting") 16 | log.Fatal(http.ListenAndServe(":8888", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 17 | log.Println("serving") 18 | w.Write(metrics) 19 | }))) 20 | } 21 | -------------------------------------------------------------------------------- /hack/volume-debug-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: volume-debug-pod 5 | spec: 6 | containers: 7 | - name: main 8 | image: ubuntu 9 | command: ["sleep", "10000"] 10 | volumeMounts: 11 | - name: models 12 | mountPath: /my-mnt 13 | volumes: 14 | - name: models 15 | persistentVolumeClaim: 16 | claimName: shared-model-cache-fstore -------------------------------------------------------------------------------- /internal/apiutils/model.go: -------------------------------------------------------------------------------- 1 | package apiutils 2 | 3 | import "strings" 4 | 5 | const ( 6 | // adapterSeparator is the separator used to split model and adapter names 7 | // in API requests. 8 | // 9 | // Alternatives considered: 10 | // 11 | // "-" (hyphen): This is a common separator in Kubernetes resource names. 12 | // "." (dot): This is a common separator in model versions "llama-3.2". 13 | // "/" (slash): This would be incompatible with specifying model names in between slashes in URL paths (i.e. "/some-api/models//details"). 14 | // ":" (colon): This might cause problems when specifying model names before colons in URL paths (see example below). 15 | // 16 | // See example of a path used in the Gemini API (https://ai.google.dev/gemini-api/docs/text-generation?lang=rest): 17 | // "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key=$GOOGLE_API_KEY" 18 | adapterSeparator = "_" 19 | ) 20 | 21 | // SplitModelAdapter splits a requested model name into KubeAI 22 | // Model.metadata.name and Model.spec.adapters[].name. 23 | func SplitModelAdapter(s string) (model, adapter string) { 24 | parts := strings.SplitN(s, adapterSeparator, 2) 25 | if len(parts) == 1 { 26 | return parts[0], "" 27 | } 28 | return parts[0], parts[1] 29 | } 30 | 31 | // MergeModelAdapter merges a model and adapter name into a single string.
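// Illustrative example (the model and adapter names below are hypothetical):
// MergeModelAdapter("llama-3.1-8b", "my-lora") returns "llama-3.1-8b_my-lora",
// and SplitModelAdapter("llama-3.1-8b_my-lora") returns ("llama-3.1-8b", "my-lora").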
32 | func MergeModelAdapter(model, adapter string) string { 33 | if adapter == "" { 34 | return model 35 | } 36 | return model + adapterSeparator + adapter 37 | } 38 | -------------------------------------------------------------------------------- /internal/apiutils/model_test.go: -------------------------------------------------------------------------------- 1 | package apiutils_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/require" 7 | "github.com/substratusai/kubeai/internal/apiutils" 8 | ) 9 | 10 | func TestSplitModelAdapter(t *testing.T) { 11 | t.Parallel() 12 | 13 | cases := map[string]struct { 14 | input string 15 | expModel, expAdapter string 16 | }{ 17 | "empty input": { 18 | input: "", 19 | expModel: "", 20 | expAdapter: "", 21 | }, 22 | "model only": { 23 | input: "my-model", 24 | expModel: "my-model", 25 | }, 26 | "model and adapter": { 27 | input: "my-model_my-adapter", 28 | expModel: "my-model", 29 | expAdapter: "my-adapter", 30 | }, 31 | "too many separators": { 32 | input: "my-model_my-adapter_extra", 33 | expModel: "my-model", 34 | expAdapter: "my-adapter_extra", 35 | }, 36 | "trailing": { 37 | input: "my-model_", 38 | expModel: "my-model", 39 | expAdapter: "", 40 | }, 41 | } 42 | 43 | for name, spec := range cases { 44 | t.Run(name, func(t *testing.T) { 45 | t.Parallel() 46 | model, adapter := apiutils.SplitModelAdapter(spec.input) 47 | require.Equal(t, spec.expModel, model, "model") 48 | require.Equal(t, spec.expAdapter, adapter, "adapter") 49 | }) 50 | } 51 | } 52 | 53 | func TestMergeModelAdapter(t *testing.T) { 54 | t.Parallel() 55 | 56 | cases := map[string]struct { 57 | model, adapter, exp string 58 | }{ 59 | "model only": { 60 | model: "my-model", 61 | exp: "my-model", 62 | }, 63 | "model and adapter": { 64 | model: "my-model", 65 | adapter: "my-adapter", 66 | exp: "my-model_my-adapter", 67 | }, 68 | } 69 | 70 | for name, spec := range cases { 71 | t.Run(name, func(t *testing.T) { 72 | t.Parallel() 73 | merged := apiutils.MergeModelAdapter(spec.model, spec.adapter) 74 | require.Equal(t, spec.exp, merged) 75 | }) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /internal/config/system_test.go: -------------------------------------------------------------------------------- 1 | package config_test 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/stretchr/testify/require" 8 | "github.com/substratusai/kubeai/internal/config" 9 | ) 10 | 11 | func TestAutoscalingConfig(t *testing.T) { 12 | cases := []struct { 13 | name string 14 | cfg config.ModelAutoscaling 15 | scaleDownDelaySeconds int64 16 | expectedRequiredConsecutiveScaleDowns int 17 | expectedAverageWindowCount int 18 | }{ 19 | { 20 | name: "default", 21 | cfg: config.ModelAutoscaling{ 22 | Interval: config.Duration{Duration: 10 * time.Second}, 23 | TimeWindow: config.Duration{Duration: 10 * time.Minute}, 24 | }, 25 | scaleDownDelaySeconds: 30, 26 | expectedRequiredConsecutiveScaleDowns: 3, 27 | // 10 * 60 / 10 28 | expectedAverageWindowCount: 60, 29 | }, 30 | { 31 | name: "even", 32 | cfg: config.ModelAutoscaling{ 33 | Interval: config.Duration{Duration: 1 * time.Second}, 34 | TimeWindow: config.Duration{Duration: 10 * time.Second}, 35 | }, 36 | scaleDownDelaySeconds: 10, 37 | expectedRequiredConsecutiveScaleDowns: 10, 38 | expectedAverageWindowCount: 10, 39 | }, 40 | { 41 | name: "with-remainder", 42 | cfg: config.ModelAutoscaling{ 43 | Interval: config.Duration{Duration: 2 * time.Second}, 44 | TimeWindow: 
config.Duration{Duration: 5 * time.Second}, 45 | }, 46 | scaleDownDelaySeconds: 3, 47 | expectedRequiredConsecutiveScaleDowns: 2, 48 | expectedAverageWindowCount: 3, 49 | }, 50 | } 51 | 52 | for _, c := range cases { 53 | t.Run(c.name, func(t *testing.T) { 54 | require.Equal(t, c.expectedRequiredConsecutiveScaleDowns, c.cfg.RequiredConsecutiveScaleDowns(c.scaleDownDelaySeconds)) 55 | }) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /internal/k8sutils/apply.go: -------------------------------------------------------------------------------- 1 | package k8sutils 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "sigs.k8s.io/controller-runtime/pkg/client" 8 | ) 9 | 10 | func ServerSideApply(ctx context.Context, cl client.Client, obj client.Object, controllerName string) error { 11 | gvk, err := ObjectToGroupVersionKind(cl.Scheme(), obj) 12 | if err != nil { 13 | return fmt.Errorf("getting group version kind: %w", err) 14 | } 15 | obj.GetObjectKind().SetGroupVersionKind(gvk) 16 | return cl.Patch(ctx, obj, client.Apply, client.FieldOwner(controllerName), client.ForceOwnership) 17 | } 18 | -------------------------------------------------------------------------------- /internal/k8sutils/client_options.go: -------------------------------------------------------------------------------- 1 | package k8sutils 2 | 3 | import "sigs.k8s.io/controller-runtime/pkg/client" 4 | 5 | const ManagerName = "kubeai-manager" 6 | 7 | func DefaultUpdateOptions() *client.UpdateOptions { 8 | return &client.UpdateOptions{ 9 | FieldManager: ManagerName, 10 | } 11 | } 12 | 13 | func DefaultSubResourceUpdateOptions() *client.UpdateOptions { 14 | return &client.UpdateOptions{ 15 | FieldManager: ManagerName, 16 | } 17 | } 18 | 19 | func DefaultCreateOptions() *client.CreateOptions { 20 | return &client.CreateOptions{ 21 | FieldManager: ManagerName, 22 | } 23 | } 24 | 25 | func DefaultPatchOptions() *client.PatchOptions { 26 | return &client.PatchOptions{ 27 | FieldManager: ManagerName, 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /internal/k8sutils/gvk.go: -------------------------------------------------------------------------------- 1 | package k8sutils 2 | 3 | import ( 4 | "fmt" 5 | 6 | "k8s.io/apimachinery/pkg/runtime" 7 | "k8s.io/apimachinery/pkg/runtime/schema" 8 | "sigs.k8s.io/controller-runtime/pkg/client" 9 | ) 10 | 11 | func ObjectToGroupKind(s *runtime.Scheme, o client.Object) (schema.GroupKind, error) { 12 | gvks, _, err := s.ObjectKinds(o) 13 | if err != nil { 14 | return schema.GroupKind{}, err 15 | } 16 | if len(gvks) == 0 { 17 | return schema.GroupKind{}, fmt.Errorf("no group kind for object") 18 | } 19 | return schema.GroupKind{ 20 | Group: gvks[0].Group, 21 | Kind: gvks[0].Kind, 22 | }, nil 23 | } 24 | 25 | func ObjectToGroupVersionKind(s *runtime.Scheme, o client.Object) (schema.GroupVersionKind, error) { 26 | gvks, _, err := s.ObjectKinds(o) 27 | if err != nil { 28 | return schema.GroupVersionKind{}, err 29 | } 30 | if len(gvks) == 0 { 31 | return schema.GroupVersionKind{}, fmt.Errorf("no group version kind for object") 32 | } 33 | return schema.GroupVersionKind{ 34 | Group: gvks[0].Group, 35 | Version: gvks[0].Version, 36 | Kind: gvks[0].Kind, 37 | }, nil 38 | } 39 | -------------------------------------------------------------------------------- /internal/k8sutils/jobs.go: -------------------------------------------------------------------------------- 1 | package k8sutils 2 | 3 | import ( 4 | 
batchv1 "k8s.io/api/batch/v1" 5 | corev1 "k8s.io/api/core/v1" 6 | ) 7 | 8 | func IsJobCompleted(job *batchv1.Job) bool { 9 | for _, cond := range job.Status.Conditions { 10 | if cond.Type == batchv1.JobComplete && cond.Status == corev1.ConditionTrue { 11 | return true 12 | } 13 | } 14 | return false 15 | } 16 | -------------------------------------------------------------------------------- /internal/k8sutils/meta.go: -------------------------------------------------------------------------------- 1 | package k8sutils 2 | 3 | import "sigs.k8s.io/controller-runtime/pkg/client" 4 | 5 | func SetLabel(obj client.Object, key, value string) { 6 | labels := obj.GetLabels() 7 | if labels == nil { 8 | labels = make(map[string]string) 9 | obj.SetLabels(labels) 10 | } 11 | labels[key] = value 12 | } 13 | 14 | func SetAnnotation(obj client.Object, key, value string) { 15 | annotations := obj.GetAnnotations() 16 | if annotations == nil { 17 | annotations = make(map[string]string) 18 | obj.SetAnnotations(annotations) 19 | } 20 | annotations[key] = value 21 | } 22 | 23 | func GetLabel(obj client.Object, key string) string { 24 | labels := obj.GetLabels() 25 | if labels == nil { 26 | return "" 27 | } 28 | return labels[key] 29 | } 30 | 31 | func GetAnnotation(obj client.Object, key string) string { 32 | annotations := obj.GetAnnotations() 33 | if annotations == nil { 34 | return "" 35 | } 36 | return annotations[key] 37 | } 38 | -------------------------------------------------------------------------------- /internal/k8sutils/pods.go: -------------------------------------------------------------------------------- 1 | package k8sutils 2 | 3 | import ( 4 | "fmt" 5 | "hash" 6 | "hash/fnv" 7 | 8 | corev1 "k8s.io/api/core/v1" 9 | "k8s.io/apimachinery/pkg/util/dump" 10 | "k8s.io/apimachinery/pkg/util/rand" 11 | ) 12 | 13 | func PodIsScheduled(pod *corev1.Pod) bool { 14 | return pod.Spec.NodeName != "" 15 | } 16 | 17 | func PodIsReady(pod *corev1.Pod) bool { 18 | for _, cond := range pod.Status.Conditions { 19 | if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue { 20 | return true 21 | } 22 | } 23 | return false 24 | } 25 | 26 | // PodHash returns a hash value calculated from Pod spec. 27 | // Inspired by k8s.io/kubernetes/pkg/controller.ComputeHash() 28 | func PodHash(podSpec corev1.PodSpec) string { 29 | podTemplateSpecHasher := fnv.New32a() 30 | DeepHashObject(podTemplateSpecHasher, podSpec) 31 | 32 | // TODO: Implement collision detection if needed. 33 | //// Add collisionCount in the hash if it exists. 34 | //if collisionCount != nil { 35 | // collisionCountBytes := make([]byte, 8) 36 | // binary.LittleEndian.PutUint32(collisionCountBytes, uint32(*collisionCount)) 37 | // podTemplateSpecHasher.Write(collisionCountBytes) 38 | //} 39 | 40 | return rand.SafeEncodeString(fmt.Sprint(podTemplateSpecHasher.Sum32())) 41 | } 42 | 43 | // StringHash returns a hash value calculated from the input string. 44 | func StringHash(s string) string { 45 | h := fnv.New32a() 46 | h.Write([]byte(s)) 47 | return rand.SafeEncodeString(fmt.Sprint(h.Sum32())) 48 | } 49 | 50 | // DeepHashObject writes specified object to hash using the spew library 51 | // which follows pointers and prints actual values of the nested objects 52 | // ensuring the hash does not change when a pointer changes. 53 | // Copied from k8s.io/kubernetes/pkg/util/hash to avoid dependency on k8s.io/kubernetes. 
54 | func DeepHashObject(hasher hash.Hash, objectToWrite interface{}) { 55 | hasher.Reset() 56 | fmt.Fprintf(hasher, "%v", dump.ForHash(objectToWrite)) 57 | } 58 | 59 | func ContainerIsReady(pod *corev1.Pod, containerName string) bool { 60 | for _, status := range pod.Status.ContainerStatuses { 61 | if status.Name == containerName { 62 | return status.Ready 63 | } 64 | } 65 | return false 66 | } 67 | -------------------------------------------------------------------------------- /internal/loadbalancer/balance_least_load.go: -------------------------------------------------------------------------------- 1 | package loadbalancer 2 | 3 | func (g *group) getAddrLeastLoad(adapter string) (endpoint, bool) { 4 | var bestEp endpoint 5 | var found bool 6 | var minInFlight int 7 | for _, ep := range g.endpoints { 8 | if adapter != "" { 9 | // Skip endpoints that don't have the requested adapter. 10 | if _, ok := ep.adapters[adapter]; !ok { 11 | continue 12 | } 13 | } 14 | inFlight := int(ep.inFlight.Load()) 15 | if !found || inFlight < minInFlight { 16 | bestEp = ep 17 | found = true 18 | minInFlight = inFlight 19 | } 20 | } 21 | 22 | return bestEp, found 23 | } 24 | -------------------------------------------------------------------------------- /internal/loadbalancer/group_bench_test.go: -------------------------------------------------------------------------------- 1 | package loadbalancer 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | 7 | v1 "github.com/substratusai/kubeai/api/k8s/v1" 8 | "github.com/substratusai/kubeai/internal/apiutils" 9 | ) 10 | 11 | func BenchmarkEndpointGroup(b *testing.B) { 12 | e := newEndpointGroup(v1.LoadBalancing{PrefixHash: v1.PrefixHash{Replication: 100}}) 13 | e.reconcileEndpoints(map[string]endpoint{"pod1": {address: "10.0.0.1:8000"}}) 14 | b.ResetTimer() 15 | b.RunParallel(func(pb *testing.PB) { 16 | for pb.Next() { 17 | _, f, err := e.getBestAddr(context.Background(), &apiutils.Request{}, false) 18 | if err != nil { 19 | b.Fatal(err) 20 | } 21 | f() 22 | } 23 | }) 24 | } 25 | -------------------------------------------------------------------------------- /internal/manager/configure.go: -------------------------------------------------------------------------------- 1 | package manager 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/substratusai/kubeai/internal/config" 7 | "sigs.k8s.io/yaml" 8 | ) 9 | 10 | func LoadConfigFile(path string) (config.System, error) { 11 | contents, err := os.ReadFile(path) 12 | if err != nil { 13 | return config.System{}, err 14 | } 15 | var cfg config.System 16 | if err := yaml.Unmarshal(contents, &cfg); err != nil { 17 | return config.System{}, err 18 | } 19 | 20 | return cfg, nil 21 | } 22 | -------------------------------------------------------------------------------- /internal/modelautoscaler/state.go: -------------------------------------------------------------------------------- 1 | package modelautoscaler 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "log" 8 | "time" 9 | 10 | corev1 "k8s.io/api/core/v1" 11 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 12 | "k8s.io/apimachinery/pkg/types" 13 | "sigs.k8s.io/controller-runtime/pkg/client" 14 | ) 15 | 16 | func newTotalModelState() totalModelState { 17 | return totalModelState{ 18 | Models: make(map[string]modelState), 19 | LastCalculationTime: time.Now(), 20 | } 21 | } 22 | 23 | type totalModelState struct { 24 | Models map[string]modelState `json:"models"` 25 | LastCalculationTime time.Time `json:"lastCalculationTime"` 26 | } 27 | 28 | type 
modelState struct { 29 | AverageActiveRequests float64 `json:"averageActiveRequests"` 30 | } 31 | 32 | func (a *Autoscaler) loadLastTotalModelState(ctx context.Context) (totalModelState, error) { 33 | cm := &corev1.ConfigMap{} 34 | if err := a.k8sClient.Get(ctx, a.stateConfigMapRef, cm); err != nil { 35 | return totalModelState{}, fmt.Errorf("get ConfigMap %q: %w", a.stateConfigMapRef, err) 36 | } 37 | const key = "models" 38 | jsonState, ok := cm.Data[key] 39 | if !ok { 40 | log.Printf("Autoscaler state ConfigMap %q has no key %q, state not loaded", a.stateConfigMapRef, key) 41 | return totalModelState{}, nil 42 | } 43 | tms := totalModelState{} 44 | if err := json.Unmarshal([]byte(jsonState), &tms); err != nil { 45 | return totalModelState{}, fmt.Errorf("unmarshalling state: %w", err) 46 | } 47 | return tms, nil 48 | } 49 | 50 | func (a *Autoscaler) saveTotalModelState(ctx context.Context, state totalModelState) error { 51 | jsonState, err := json.Marshal(state) 52 | if err != nil { 53 | return fmt.Errorf("marshalling state: %w", err) 54 | } 55 | patch := fmt.Sprintf(`{"data":{"models":%q}}`, string(jsonState)) 56 | if err := a.k8sClient.Patch(ctx, &corev1.ConfigMap{ 57 | ObjectMeta: metav1.ObjectMeta{ 58 | Namespace: a.stateConfigMapRef.Namespace, 59 | Name: a.stateConfigMapRef.Name, 60 | }, 61 | }, client.RawPatch(types.StrategicMergePatchType, []byte(patch))); err != nil { 62 | return fmt.Errorf("patching ConfigMap %q: %w", a.stateConfigMapRef, err) 63 | } 64 | return nil 65 | } 66 | -------------------------------------------------------------------------------- /internal/modelclient/client.go: -------------------------------------------------------------------------------- 1 | package modelclient 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "sync" 7 | 8 | kubeaiv1 "github.com/substratusai/kubeai/api/k8s/v1" 9 | apierrors "k8s.io/apimachinery/pkg/api/errors" 10 | "k8s.io/apimachinery/pkg/labels" 11 | "k8s.io/apimachinery/pkg/types" 12 | "sigs.k8s.io/controller-runtime/pkg/client" 13 | ) 14 | 15 | type ModelClient struct { 16 | client client.Client 17 | namespace string 18 | consecutiveScaleDownsMtx sync.RWMutex 19 | consecutiveScaleDowns map[string]int 20 | } 21 | 22 | func NewModelClient(client client.Client, namespace string) *ModelClient { 23 | return &ModelClient{client: client, namespace: namespace, consecutiveScaleDowns: map[string]int{}} 24 | } 25 | 26 | // LookupModel checks if a model exists and matches the given label selectors.
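// It returns (nil, nil) when the Model does not exist, when any selector does
// not match the Model's labels, or when a requested adapter is not declared on
// the Model. Illustrative example (hypothetical selector): with labelSelectors
// of ["tenancy=group-a"], only Models labeled tenancy: group-a are returned.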
27 | func (c *ModelClient) LookupModel(ctx context.Context, model, adapter string, labelSelectors []string) (*kubeaiv1.Model, error) { 28 | m := &kubeaiv1.Model{} 29 | if err := c.client.Get(ctx, types.NamespacedName{Name: model, Namespace: c.namespace}, m); err != nil { 30 | if apierrors.IsNotFound(err) { 31 | return nil, nil 32 | } 33 | return nil, err 34 | } 35 | 36 | modelLabels := m.GetLabels() 37 | if modelLabels == nil { 38 | modelLabels = map[string]string{} 39 | } 40 | for _, sel := range labelSelectors { 41 | parsedSel, err := labels.Parse(sel) 42 | if err != nil { 43 | return nil, fmt.Errorf("parse label selector: %w", err) 44 | } 45 | if !parsedSel.Matches(labels.Set(modelLabels)) { 46 | return nil, nil 47 | } 48 | } 49 | 50 | if adapter != "" { 51 | adapterFound := false 52 | for _, a := range m.Spec.Adapters { 53 | if a.Name == adapter { 54 | adapterFound = true 55 | break 56 | } 57 | } 58 | if !adapterFound { 59 | return nil, nil 60 | } 61 | } 62 | 63 | return m, nil 64 | } 65 | 66 | func (s *ModelClient) ListAllModels(ctx context.Context) ([]kubeaiv1.Model, error) { 67 | models := &kubeaiv1.ModelList{} 68 | if err := s.client.List(ctx, models, client.InNamespace(s.namespace)); err != nil { 69 | return nil, fmt.Errorf("list models: %w", err) 70 | } 71 | 72 | return models.Items, nil 73 | } 74 | -------------------------------------------------------------------------------- /internal/modelcontroller/patch.go: -------------------------------------------------------------------------------- 1 | package modelcontroller 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | 7 | "github.com/substratusai/kubeai/internal/config" 8 | jsonpatch "gopkg.in/evanphx/json-patch.v4" 9 | corev1 "k8s.io/api/core/v1" 10 | ) 11 | 12 | func applyJSONPatchToPod(patches []config.JSONPatch, pod *corev1.Pod) error { 13 | if len(patches) == 0 { 14 | return nil 15 | } 16 | 17 | pb, err := json.Marshal(patches) 18 | if err != nil { 19 | return fmt.Errorf("marshal pod patch: %w", err) 20 | } 21 | 22 | patch, err := jsonpatch.DecodePatch(pb) 23 | if err != nil { 24 | return fmt.Errorf("decode pod patch: %w", err) 25 | } 26 | 27 | podJson, err := json.Marshal(pod) 28 | if err != nil { 29 | return fmt.Errorf("marshal pod: %w", err) 30 | } 31 | 32 | patchedPodJson, err := patch.Apply(podJson) 33 | if err != nil { 34 | return fmt.Errorf("apply pod patch: %w", err) 35 | } 36 | 37 | patchedPod := &corev1.Pod{} 38 | if err := json.Unmarshal(patchedPodJson, patchedPod); err != nil { 39 | return fmt.Errorf("unmarshal patched pod: %w", err) 40 | } 41 | *pod = *patchedPod 42 | return nil 43 | } 44 | -------------------------------------------------------------------------------- /internal/modelcontroller/pod_utils.go: -------------------------------------------------------------------------------- 1 | package modelcontroller 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | 8 | corev1 "k8s.io/api/core/v1" 9 | "k8s.io/apimachinery/pkg/runtime" 10 | "k8s.io/client-go/tools/remotecommand" 11 | ) 12 | 13 | func (r *ModelReconciler) execPod(ctx context.Context, pod *corev1.Pod, container string, command []string) error { 14 | execReq := r.PodRESTClient. 15 | Post(). 16 | Namespace(pod.Namespace). 17 | Resource("pods"). 18 | Name(pod.Name). 19 | SubResource("exec"). 
20 | VersionedParams(&corev1.PodExecOptions{ 21 | Container: container, 22 | Command: command, 23 | Stdin: true, 24 | Stdout: true, 25 | Stderr: true, 26 | }, runtime.NewParameterCodec(r.Scheme)) 27 | 28 | exec, err := remotecommand.NewSPDYExecutor(r.RESTConfig, "POST", execReq.URL()) 29 | if err != nil { 30 | return fmt.Errorf("creating remote command executor: %w", err) 31 | } 32 | 33 | if err := exec.StreamWithContext(ctx, remotecommand.StreamOptions{ 34 | Stdin: os.Stdin, 35 | Stdout: os.Stdout, 36 | Stderr: os.Stderr, 37 | Tty: false, 38 | }); err != nil { 39 | return fmt.Errorf("streaming: %w", err) 40 | } 41 | 42 | return nil 43 | } 44 | 45 | func (r *ModelReconciler) updatePodRemoveLabel(ctx context.Context, pod *corev1.Pod, key string) error { 46 | if pod.Labels == nil { 47 | return nil 48 | } 49 | delete(pod.Labels, key) 50 | if err := r.Client.Update(ctx, pod); err != nil { 51 | return fmt.Errorf("update pod labels: %w", err) 52 | } 53 | return nil 54 | } 55 | 56 | func (r *ModelReconciler) updatePodAddLabel(ctx context.Context, pod *corev1.Pod, key, value string) error { 57 | if pod.Labels == nil { 58 | pod.Labels = make(map[string]string) 59 | } 60 | pod.Labels[key] = value 61 | if err := r.Client.Update(ctx, pod); err != nil { 62 | return fmt.Errorf("update pod labels: %w", err) 63 | } 64 | return nil 65 | } 66 | -------------------------------------------------------------------------------- /internal/movingaverage/simple.go: -------------------------------------------------------------------------------- 1 | package movingaverage 2 | 3 | import ( 4 | "sync" 5 | ) 6 | 7 | // Simple keeps track of a history of measurements and returns the average. 8 | // One important feature of this implementation is that the average can go to zero. 9 | // All methods are thread safe. 
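//
// Illustrative example (window size and values are hypothetical): with a
// 3-sample window seeded with zeros, observing 1, 2, 3 yields an average of 2;
// observing three more zeros brings the average back to 0.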
10 | // 11 | // Alternative: consider exponential moving average where near-zero values are treated 12 | // as zero (for scale to zero): 13 | // 14 | // func MovingExpAvg(value, oldValue, fdtime, ftime float64) float64 { 15 | // alpha := 1.0 - math.Exp(-fdtime/ftime) 16 | // r := alpha * value + (1.0 - alpha) * oldValue 17 | // return r 18 | // } 19 | type Simple struct { 20 | mtx sync.Mutex 21 | history []float64 22 | index int 23 | } 24 | 25 | func NewSimple(seed []float64) *Simple { 26 | return &Simple{ 27 | history: seed, 28 | } 29 | } 30 | 31 | func (a *Simple) Next(next float64) { 32 | a.mtx.Lock() 33 | a.history[a.index] = next 34 | a.index++ 35 | if a.index == len(a.history) { 36 | a.index = 0 37 | } 38 | a.mtx.Unlock() 39 | } 40 | 41 | func (a *Simple) History() []float64 { 42 | a.mtx.Lock() 43 | result := make([]float64, len(a.history)) 44 | copy(result, a.history) 45 | a.mtx.Unlock() 46 | 47 | return result 48 | } 49 | 50 | func (a *Simple) Calculate() (result float64) { 51 | a.mtx.Lock() 52 | for _, p := range a.history { 53 | result += p 54 | } 55 | result /= float64(len(a.history)) 56 | a.mtx.Unlock() 57 | 58 | return result 59 | } 60 | -------------------------------------------------------------------------------- /internal/movingaverage/simple_test.go: -------------------------------------------------------------------------------- 1 | package movingaverage_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/substratusai/kubeai/internal/movingaverage" 7 | ) 8 | 9 | func TestSimple(t *testing.T) { 10 | cases := []struct { 11 | name string 12 | seed []float64 13 | values []float64 14 | want float64 15 | }{ 16 | { 17 | name: "1-2-3", 18 | seed: []float64{0, 0, 0}, 19 | values: []float64{1, 2, 3}, 20 | want: 2, 21 | }, 22 | { 23 | name: "3-2-1", 24 | seed: make([]float64, 3), 25 | values: []float64{3, 2, 1}, 26 | want: 2, 27 | }, 28 | { 29 | name: "3-2-1-1-1-1", 30 | seed: make([]float64, 3), 31 | values: []float64{3, 2, 1, 1, 1, 1}, 32 | want: 1, 33 | }, 34 | { 35 | name: "2-3", 36 | seed: make([]float64, 2), 37 | values: []float64{2, 3}, 38 | want: 2.5, 39 | }, 40 | { 41 | name: "2-2-2", 42 | seed: []float64{0, 0, 0}, 43 | values: []float64{2, 2, 2}, 44 | want: 2, 45 | }, 46 | } 47 | for _, tc := range cases { 48 | t.Run(tc.name, func(t *testing.T) { 49 | a := movingaverage.NewSimple(tc.seed) 50 | for _, v := range tc.values { 51 | a.Next(v) 52 | } 53 | got := a.Calculate() 54 | if got != tc.want { 55 | t.Errorf("got %v; want %v", got, tc.want) 56 | } 57 | }) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /manifests/models/bge-embed-text-cpu.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: bge-embed-text-cpu 6 | spec: 7 | features: [TextEmbedding] 8 | url: hf://BAAI/bge-small-en-v1.5 9 | engine: Infinity 10 | minReplicas: 0 11 | resourceProfile: cpu:1 12 | -------------------------------------------------------------------------------- /manifests/models/deepseek-r1-1.5b-cpu.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: deepseek-r1-1.5b-cpu 6 | spec: 7 | features: [TextGeneration] 8 | url: ollama://deepseek-r1:1.5b 9 | engine: OLlama 10 | minReplicas: 0 11 | resourceProfile: cpu:1 12 | 
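# Illustrative request (a sketch; assumes the in-cluster "kubeai" Service
# exposing the OpenAI-compatible API, as used elsewhere in this repo):
#
#   curl http://kubeai/openai/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "deepseek-r1-1.5b-cpu", "prompt": "Hello"}'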
-------------------------------------------------------------------------------- /manifests/models/deepseek-r1-70b-gh200-fp8.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: deepseek-r1-70b-gh200-fp8 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic 9 | engine: VLLM 10 | args: 11 | - --max-model-len=32768 12 | - --max-num-batched-token=32768 13 | - --gpu-memory-utilization=0.95 14 | - --kv-cache-dtype=fp8 15 | - --enable-prefix-caching 16 | - --disable-log-requests 17 | minReplicas: 0 18 | resourceProfile: nvidia-gpu-gh200:1 19 | -------------------------------------------------------------------------------- /manifests/models/deepseek-r1-70b-gh200.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: deepseek-r1-70b-gh200 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://deepseek-ai/DeepSeek-R1-Distill-Llama-70B 9 | engine: VLLM 10 | args: 11 | - --max-model-len=32768 12 | - --max-num-batched-token=32768 13 | - --gpu-memory-utilization=0.95 14 | - --kv-cache-dtype=fp8 15 | - --cpu-offload-gb=120 16 | - --enable-prefix-caching 17 | - --disable-log-requests 18 | env: 19 | VLLM_ATTENTION_BACKEND: FLASHINFER 20 | minReplicas: 0 21 | resourceProfile: nvidia-gpu-gh200:1 22 | -------------------------------------------------------------------------------- /manifests/models/deepseek-r1-distill-llama-8b-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: deepseek-r1-distill-llama-8b-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://deepseek-ai/DeepSeek-R1-Distill-Llama-8B 9 | engine: VLLM 10 | args: 11 | - --max-model-len=8192 12 | - --max-num-batched-token=8192 13 | - --max-num-seqs=256 14 | - --gpu-memory-utilization=0.95 15 | - --kv-cache-dtype=fp8 16 | - --disable-log-requests 17 | - --quantization=fp8 18 | - --enforce-eager 19 | env: 20 | VLLM_ATTENTION_BACKEND: FLASHINFER 21 | minReplicas: 0 22 | resourceProfile: nvidia-gpu-l4:1 23 | -------------------------------------------------------------------------------- /manifests/models/deepseek-r1-distill-qwen-1.5b-rtx4070.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: deepseek-r1-distill-qwen-1.5b-rtx4070 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 9 | engine: VLLM 10 | args: 11 | - --max-model-len=2048 12 | - --max-num-batched-token=2048 13 | - --max-num-seqs=8 14 | - --kv-cache-dtype=fp8 15 | env: 16 | VLLM_USE_V1: "1" 17 | minReplicas: 0 18 | resourceProfile: nvidia-gpu-rtx4070-8gb:1 19 | -------------------------------------------------------------------------------- /manifests/models/deepseek-r1-mi300x.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: deepseek-r1-mi300x 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://deepseek-ai/DeepSeek-R1 9 | engine: VLLM 10 | args: 11 | - 
--trust-remote-code 12 | - --max-model-len=32768 13 | - --max-num-batched-token=32768 14 | - --max-num-seqs=1024 15 | - --num-scheduler-steps=10 16 | - --tensor-parallel-size=8 17 | - --gpu-memory-utilization=0.90 18 | - --disable-log-requests 19 | - --enable-chunked-prefill=false 20 | - --max-seq-len-to-capture=16384 21 | - --kv-cache-dtype=fp8 22 | env: 23 | HIP_FORCE_DEV_KERNARG: "1" 24 | NCCL_MIN_NCHANNELS: "112" 25 | TORCH_BLAS_PREFER_HIPBLASLT: "1" 26 | VLLM_FP8_PADDING: "0" 27 | VLLM_USE_TRITON_FLASH_ATTN: "0" 28 | minReplicas: 0 29 | targetRequests: 1024 30 | resourceProfile: amd-gpu-mi300x:8 31 | -------------------------------------------------------------------------------- /manifests/models/e5-mistral-7b-instruct-cpu.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: e5-mistral-7b-instruct-cpu 6 | spec: 7 | features: [TextEmbedding] 8 | url: hf://intfloat/e5-mistral-7b-instruct 9 | engine: VLLM 10 | args: 11 | - --gpu-memory-utilization=0.9 12 | minReplicas: 0 13 | resourceProfile: cpu:1 14 | -------------------------------------------------------------------------------- /manifests/models/faster-whisper-medium-en-cpu.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: faster-whisper-medium-en-cpu 6 | spec: 7 | features: [SpeechToText] 8 | url: hf://Systran/faster-whisper-medium.en 9 | engine: FasterWhisper 10 | minReplicas: 0 11 | resourceProfile: cpu:1 12 | -------------------------------------------------------------------------------- /manifests/models/gemma-2-9b-it-fp8-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: gemma-2-9b-it-fp8-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://neuralmagic/gemma-2-9b-it-FP8 9 | engine: VLLM 10 | args: 11 | - --max-model-len=4096 12 | - --max-num-batched-token=4096 13 | - --max-num-seqs=256 14 | - --gpu-memory-utilization=0.95 15 | - --kv-cache-dtype=fp8 16 | env: 17 | VLLM_USE_V1: "1" 18 | minReplicas: 0 19 | resourceProfile: nvidia-gpu-l4:1 20 | -------------------------------------------------------------------------------- /manifests/models/gemma-27b-ollama-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: gemma-27b-ollama-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: ollama://gemma2:27b 9 | engine: OLlama 10 | minReplicas: 0 11 | resourceProfile: nvidia-gpu-l4:1 12 | -------------------------------------------------------------------------------- /manifests/models/gemma-2b-it-tpu.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: gemma-2b-it-tpu 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://google/gemma-2b-it 9 | engine: VLLM 10 | args: 11 | - --disable-log-requests 12 | minReplicas: 0 13 | resourceProfile: google-tpu-v5e-1x1:1 14 | -------------------------------------------------------------------------------- /manifests/models/gemma-3-12b-ollama-l4.yaml: 
-------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: gemma-3-12b-ollama-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: ollama://gemma3:12b 9 | engine: OLlama 10 | minReplicas: 0 11 | resourceProfile: nvidia-gpu-l4:1 12 | -------------------------------------------------------------------------------- /manifests/models/gemma-3-27b-ollama-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: gemma-3-27b-ollama-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: ollama://gemma3:27b 9 | engine: OLlama 10 | minReplicas: 0 11 | resourceProfile: nvidia-gpu-l4:1 12 | -------------------------------------------------------------------------------- /manifests/models/gemma-9b-ollama-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: gemma-9b-ollama-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: ollama://gemma2:9b 9 | engine: OLlama 10 | minReplicas: 0 11 | resourceProfile: nvidia-gpu-l4:1 12 | -------------------------------------------------------------------------------- /manifests/models/gemma2-2b-cpu.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: gemma2-2b-cpu 6 | spec: 7 | features: [TextGeneration] 8 | url: ollama://gemma2:2b 9 | engine: OLlama 10 | minReplicas: 0 11 | resourceProfile: cpu:2 12 | -------------------------------------------------------------------------------- /manifests/models/granite-3.1-dense-ollama-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: granite-3.1-dense-ollama-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: ollama://granite3.1-dense 9 | engine: OLlama 10 | minReplicas: 0 11 | resourceProfile: nvidia-gpu-l4:1 12 | -------------------------------------------------------------------------------- /manifests/models/llama-3.1-405b-instruct-fp8-a100-80b.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-405b-instruct-fp8-a100-80b 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://neuralmagic/Meta-Llama-3.1-405B-Instruct-FP8 9 | engine: VLLM 10 | args: 11 | - --max-model-len=65536 12 | - --max-num-batched-token=65536 13 | - --gpu-memory-utilization=0.98 14 | - --tensor-parallel-size=8 15 | - --enable-prefix-caching 16 | - --disable-log-requests 17 | - --max-num-seqs=128 18 | - --kv-cache-dtype=fp8 19 | - --enforce-eager 20 | - --enable-chunked-prefill=false 21 | - --num-scheduler-steps=8 22 | env: 23 | VLLM_ATTENTION_BACKEND: FLASHINFER 24 | minReplicas: 0 25 | targetRequests: 128 26 | resourceProfile: nvidia-gpu-a100-80gb:8 27 | -------------------------------------------------------------------------------- /manifests/models/llama-3.1-405b-instruct-fp8-h100.yaml: -------------------------------------------------------------------------------- 1 | # Source: 
models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-405b-instruct-fp8-h100 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://neuralmagic/Meta-Llama-3.1-405B-Instruct-FP8 9 | engine: VLLM 10 | args: 11 | - --max-model-len=65536 12 | - --max-num-batched-token=65536 13 | - --gpu-memory-utilization=0.9 14 | - --tensor-parallel-size=8 15 | - --enable-prefix-caching 16 | - --disable-log-requests 17 | - --max-num-seqs=1024 18 | - --kv-cache-dtype=fp8 19 | minReplicas: 0 20 | targetRequests: 500 21 | resourceProfile: nvidia-gpu-h100:8 22 | -------------------------------------------------------------------------------- /manifests/models/llama-3.1-405b-instruct-fp8-mi300x.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-405b-instruct-fp8-mi300x 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://amd/Llama-3.1-405B-Instruct-FP8-KV 9 | engine: VLLM 10 | args: 11 | - --max-model-len=120000 12 | - --max-num-batched-token=120000 13 | - --max-num-seqs=1024 14 | - --num-scheduler-steps=15 15 | - --tensor-parallel-size=8 16 | - --gpu-memory-utilization=0.90 17 | - --disable-log-requests 18 | - --kv-cache-dtype=fp8 19 | - --enable-chunked-prefill=false 20 | - --max-seq-len-to-capture=16384 21 | env: 22 | HIP_FORCE_DEV_KERNARG: "1" 23 | NCCL_MIN_NCHANNELS: "112" 24 | TORCH_BLAS_PREFER_HIPBLASLT: "1" 25 | VLLM_USE_TRITON_FLASH_ATTN: "0" 26 | minReplicas: 0 27 | targetRequests: 1024 28 | resourceProfile: amd-gpu-mi300x:8 29 | -------------------------------------------------------------------------------- /manifests/models/llama-3.1-70b-instruct-awq-int4-gh200.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-70b-instruct-awq-int4-gh200 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4 9 | engine: VLLM 10 | args: 11 | - --max-model-len=16384 12 | - --max-num-batched-token=16384 13 | - --enable-prefix-caching 14 | - --disable-log-requests 15 | minReplicas: 0 16 | targetRequests: 50 17 | resourceProfile: nvidia-gpu-gh200:1 18 | -------------------------------------------------------------------------------- /manifests/models/llama-3.1-70b-instruct-fp8-1-h100.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-70b-instruct-fp8-1-h100 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 9 | engine: VLLM 10 | args: 11 | - --enable-prefix-caching 12 | - --max-model-len=16384 13 | - --max-num-batched-token=16384 14 | - --gpu-memory-utilization=0.95 15 | - --disable-log-requests 16 | - --kv-cache-dtype=fp8 17 | minReplicas: 0 18 | resourceProfile: nvidia-gpu-h100:1 19 | -------------------------------------------------------------------------------- /manifests/models/llama-3.1-70b-instruct-fp8-gh200.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-70b-instruct-fp8-gh200 6 | spec: 7 | features: [TextGeneration] 8 | url: 
hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 9 | engine: VLLM 10 | args: 11 | - --max-model-len=32768 12 | - --max-num-batched-token=32768 13 | - --max-num-seqs=1024 14 | - --gpu-memory-utilization=0.9 15 | - --enable-prefix-caching 16 | - --enable-chunked-prefill=false 17 | - --disable-log-requests 18 | - --kv-cache-dtype=fp8 19 | - --enforce-eager 20 | env: 21 | VLLM_ATTENTION_BACKEND: FLASHINFER 22 | minReplicas: 0 23 | targetRequests: 1024 24 | resourceProfile: nvidia-gpu-gh200:1 25 | -------------------------------------------------------------------------------- /manifests/models/llama-3.1-70b-instruct-fp8-h100.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-70b-instruct-fp8-h100 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 9 | engine: VLLM 10 | args: 11 | - --max-model-len=65536 12 | - --max-num-batched-token=65536 13 | - --max-num-seqs=1024 14 | - --gpu-memory-utilization=0.9 15 | - --tensor-parallel-size=2 16 | - --enable-prefix-caching 17 | - --disable-log-requests 18 | minReplicas: 0 19 | targetRequests: 500 20 | resourceProfile: nvidia-gpu-h100:2 21 | -------------------------------------------------------------------------------- /manifests/models/llama-3.1-70b-instruct-fp8-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-70b-instruct-fp8-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 9 | engine: VLLM 10 | args: 11 | - --max-model-len=32768 12 | - --max-num-batched-token=32768 13 | - --max-num-seqs=512 14 | - --gpu-memory-utilization=0.9 15 | - --pipeline-parallel-size=4 16 | - --tensor-parallel-size=2 17 | - --enable-prefix-caching 18 | - --enable-chunked-prefill=false 19 | - --disable-log-requests 20 | - --kv-cache-dtype=fp8 21 | - --enforce-eager 22 | env: 23 | VLLM_ATTENTION_BACKEND: FLASHINFER 24 | minReplicas: 0 25 | targetRequests: 500 26 | resourceProfile: nvidia-gpu-l4:8 27 | -------------------------------------------------------------------------------- /manifests/models/llama-3.1-70b-instruct-fp8-mi300x.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-70b-instruct-fp8-mi300x 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://amd/Llama-3.1-70B-Instruct-FP8-KV 9 | engine: VLLM 10 | args: 11 | - --max-model-len=120000 12 | - --max-num-batched-token=120000 13 | - --max-num-seqs=1024 14 | - --num-scheduler-steps=15 15 | - --gpu-memory-utilization=0.9 16 | - --disable-log-requests 17 | - --kv-cache-dtype=fp8 18 | - --enable-chunked-prefill=false 19 | - --max-seq-len-to-capture=16384 20 | env: 21 | HIP_FORCE_DEV_KERNARG: "1" 22 | NCCL_MIN_NCHANNELS: "112" 23 | TORCH_BLAS_PREFER_HIPBLASLT: "1" 24 | VLLM_USE_TRITON_FLASH_ATTN: "0" 25 | minReplicas: 0 26 | targetRequests: 1024 27 | resourceProfile: amd-gpu-mi300x:1 28 | -------------------------------------------------------------------------------- /manifests/models/llama-3.1-8b-instruct-cpu.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | 
apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-8b-instruct-cpu 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://meta-llama/Meta-Llama-3.1-8B-Instruct 9 | engine: VLLM 10 | args: 11 | - --max-model-len=32768 12 | - --max-num-batched-token=32768 13 | env: 14 | VLLM_CPU_KVCACHE_SPACE: "4" 15 | minReplicas: 0 16 | resourceProfile: cpu:6 17 | -------------------------------------------------------------------------------- /manifests/models/llama-3.1-8b-instruct-fp8-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-8b-instruct-fp8-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 9 | engine: VLLM 10 | args: 11 | - --max-model-len=16384 12 | - --max-num-batched-token=16384 13 | - --gpu-memory-utilization=0.9 14 | - --disable-log-requests 15 | minReplicas: 0 16 | resourceProfile: nvidia-gpu-l4:1 17 | -------------------------------------------------------------------------------- /manifests/models/llama-3.1-8b-instruct-tpu.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-8b-instruct-tpu 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://meta-llama/Meta-Llama-3.1-8B-Instruct 9 | engine: VLLM 10 | args: 11 | - --disable-log-requests 12 | - --swap-space=8 13 | - --tensor-parallel-size=4 14 | - --num-scheduler-steps=4 15 | - --max-model-len=8192 16 | - --distributed-executor-backend=ray 17 | minReplicas: 0 18 | resourceProfile: google-tpu-v5e-2x2:4 19 | -------------------------------------------------------------------------------- /manifests/models/llama-3.1-supernova-lite-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-supernova-lite-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://arcee-ai/Llama-3.1-SuperNova-Lite 9 | engine: VLLM 10 | args: 11 | - --max-model-len=2048 12 | - --max-num-batched-token=2048 13 | - --max-num-seqs=1 14 | - --gpu-memory-utilization=0.95 15 | - --kv-cache-dtype=fp8 16 | - --disable-log-requests 17 | - --quantization=fp8 18 | - --enforce-eager 19 | env: 20 | VLLM_ATTENTION_BACKEND: FLASHINFER 21 | minReplicas: 0 22 | resourceProfile: nvidia-gpu-l4:1 23 | -------------------------------------------------------------------------------- /manifests/models/llama-3.1-tulu-3-8b-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.1-tulu-3-8b-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://allenai/Llama-3.1-Tulu-3-8B 9 | engine: VLLM 10 | args: 11 | - --max-model-len=8192 12 | - --max-num-batched-token=8192 13 | - --max-num-seqs=256 14 | - --gpu-memory-utilization=0.95 15 | - --kv-cache-dtype=fp8 16 | env: 17 | VLLM_ATTENTION_BACKEND: FLASHINFER 18 | minReplicas: 0 19 | resourceProfile: nvidia-gpu-l4:1 20 | -------------------------------------------------------------------------------- /manifests/models/llama-3.2-11b-vision-instruct-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: 
models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.2-11b-vision-instruct-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic 9 | engine: VLLM 10 | args: 11 | - --max-model-len=8192 12 | - --max-num-batched-token=8192 13 | - --gpu-memory-utilization=0.99 14 | - --enforce-eager 15 | - --disable-log-requests 16 | - --max-num-seqs=16 17 | env: 18 | VLLM_WORKER_MULTIPROC_METHOD: spawn 19 | minReplicas: 1 20 | maxReplicas: 1 21 | targetRequests: 32 22 | resourceProfile: nvidia-gpu-l4:1 23 | -------------------------------------------------------------------------------- /manifests/models/llama-3.3-70b-instruct-bf16-gh200.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.3-70b-instruct-bf16-gh200 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://meta-llama/Llama-3.3-70B-Instruct 9 | engine: VLLM 10 | args: 11 | - --max-model-len=32768 12 | - --max-num-batched-token=32768 13 | - --gpu-memory-utilization=0.98 14 | - --kv-cache-dtype=fp8 15 | - --cpu-offload-gb=60 16 | - --enable-prefix-caching 17 | - --disable-log-requests 18 | env: 19 | VLLM_ATTENTION_BACKEND: FLASHINFER 20 | minReplicas: 0 21 | targetRequests: 200 22 | resourceProfile: nvidia-gpu-gh200:1 23 | -------------------------------------------------------------------------------- /manifests/models/llama-3.3-70b-ollama-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-3.3-70b-ollama-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: ollama://llama3.3:70b 9 | engine: OLlama 10 | minReplicas: 0 11 | resourceProfile: nvidia-gpu-l4:1 12 | -------------------------------------------------------------------------------- /manifests/models/llama-4-maverick-430k-h100.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: llama-4-maverick-430k-h100 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 9 | engine: VLLM 10 | args: 11 | - --max-model-len=430000 12 | - --tensor-parallel-size=8 13 | - --enable-prefix-caching 14 | - --disable-log-requests 15 | env: 16 | VLLM_DISABLE_COMPILE_CACHE: "1" 17 | minReplicas: 0 18 | resourceProfile: nvidia-gpu-h100:8 19 | -------------------------------------------------------------------------------- /manifests/models/mistral-small-24b-instruct-h100.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: mistral-small-24b-instruct-h100 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://mistralai/Mistral-Small-24B-Instruct-2501 9 | engine: VLLM 10 | args: 11 | - --kv-cache-dtype=fp8 12 | - --max-num-batched-token=65536 13 | - --gpu-memory-utilization=0.9 14 | - --enable-prefix-caching 15 | - --disable-log-requests 16 | env: 17 | VLLM_ATTENTION_BACKEND: FLASHINFER 18 | minReplicas: 0 19 | resourceProfile: nvidia-gpu-h100:1 20 | -------------------------------------------------------------------------------- 
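As an illustrative aside between manifest entries (not a file in this repository): once one of the Model manifests above is applied, it can be exercised through KubeAI's OpenAI-compatible API, the same way the curl-based e2e scripts later in this dump do. The sketch below is a minimal Go client under stated assumptions: `kubectl port-forward svc/kubeai 8000:80` is running, the `llama-3.1-8b-instruct-fp8-l4` Model shown earlier has been applied, and the model name, prompt, and token limit are placeholders to adapt.

```go
// kubeai-smoke-check is a minimal sketch (not part of the repository): it sends a
// single completion request through KubeAI's OpenAI-compatible endpoint for one of
// the Model manifests defined above.
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
)

func main() {
	// Assumptions: `kubectl port-forward svc/kubeai 8000:80` is running and the
	// llama-3.1-8b-instruct-fp8-l4 Model manifest shown earlier has been applied.
	reqBody, err := json.Marshal(map[string]any{
		"model":      "llama-3.1-8b-instruct-fp8-l4",
		"prompt":     "Who was the first president of the United States?",
		"max_tokens": 40,
	})
	if err != nil {
		log.Fatal(err)
	}

	resp, err := http.Post(
		"http://localhost:8000/openai/v1/completions",
		"application/json",
		bytes.NewReader(reqBody),
	)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	out, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(resp.Status)
	fmt.Println(string(out))
}
```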
/manifests/models/mistral-small-3.1-24b-instruct-h100.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: mistral-small-3.1-24b-instruct-h100 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://mistralai/Mistral-Small-3.1-24B-Instruct-2503 9 | engine: VLLM 10 | args: 11 | - --kv-cache-dtype=fp8 12 | - --max-model-len=65536 13 | - --gpu-memory-utilization=0.9 14 | - --disable-log-requests 15 | - --tokenizer-mode=mistral 16 | - --load-format=mistral 17 | - --config-format=mistral 18 | env: 19 | VLLM_ATTENTION_BACKEND: FLASHINFER 20 | minReplicas: 0 21 | resourceProfile: nvidia-gpu-h100:1 22 | -------------------------------------------------------------------------------- /manifests/models/nomic-embed-text-cpu.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: nomic-embed-text-cpu 6 | spec: 7 | features: [TextEmbedding] 8 | url: ollama://nomic-embed-text 9 | engine: OLlama 10 | minReplicas: 0 11 | resourceProfile: cpu:1 12 | -------------------------------------------------------------------------------- /manifests/models/opt-125m-cpu.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: opt-125m-cpu 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://facebook/opt-125m 9 | engine: VLLM 10 | args: 11 | - --chat-template=/config/chat-template.jinja 12 | minReplicas: 0 13 | resourceProfile: cpu:1 14 | files: 15 | - content: |- 16 | {% for message in messages %} 17 | {% if message['role'] == 'user' %} 18 | {{ 'Question: 19 | ' + message['content'] + ' 20 | 21 | ' }}{% elif message['role'] == 'system' %} 22 | {{ 'System: 23 | ' + message['content'] + ' 24 | 25 | ' }}{% elif message['role'] == 'assistant' %}{{ 'Answer: 26 | ' + message['content'] + ' 27 | 28 | ' }}{% endif %} 29 | {% if loop.last and add_generation_prompt %} 30 | {{ 'Answer: 31 | ' }}{% endif %}{% endfor %} 32 | path: /config/chat-template.jinja 33 | -------------------------------------------------------------------------------- /manifests/models/opt-125m-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: opt-125m-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://facebook/opt-125m 9 | engine: VLLM 10 | minReplicas: 0 11 | resourceProfile: nvidia-gpu-l4:1 12 | -------------------------------------------------------------------------------- /manifests/models/phi-4-bnb-4bit-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: phi-4-bnb-4bit-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://unsloth/phi-4-bnb-4bit 9 | engine: VLLM 10 | args: 11 | - --max-model-len=8192 12 | - --max-num-batched-token=8192 13 | - --max-num-seqs=1 14 | - --gpu-memory-utilization=0.95 15 | - --disable-log-requests 16 | - --enforce-eager 17 | - --quantization=bitsandbytes 18 | - --load_format=bitsandbytes 19 | env: 20 | VLLM_ATTENTION_BACKEND: FLASHINFER 21 | minReplicas: 0 22 | resourceProfile: 
nvidia-gpu-l4:1 23 | -------------------------------------------------------------------------------- /manifests/models/phi-4-ollama-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: phi-4-ollama-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: ollama://phi4 9 | engine: OLlama 10 | minReplicas: 0 11 | resourceProfile: nvidia-gpu-l4:1 12 | -------------------------------------------------------------------------------- /manifests/models/qwen2-500m-cpu.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: qwen2-500m-cpu 6 | spec: 7 | features: [TextGeneration] 8 | url: ollama://qwen2:0.5b 9 | engine: OLlama 10 | minReplicas: 0 11 | resourceProfile: cpu:1 12 | -------------------------------------------------------------------------------- /manifests/models/qwen2.5-7b-cpu.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: qwen2.5-7b-cpu 6 | spec: 7 | features: [TextGeneration] 8 | url: ollama://qwen2.5:7b 9 | engine: OLlama 10 | minReplicas: 0 11 | resourceProfile: cpu:2 12 | -------------------------------------------------------------------------------- /manifests/models/qwen2.5-7b-instruct-l4.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: qwen2.5-7b-instruct-l4 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://Qwen/Qwen2.5-7B-Instruct 9 | engine: VLLM 10 | args: 11 | - --max-model-len=8192 12 | - --max-num-batched-token=8192 13 | - --max-num-seqs=256 14 | - --gpu-memory-utilization=0.95 15 | - --kv-cache-dtype=fp8 16 | - --enable-prefix-caching 17 | env: 18 | VLLM_ATTENTION_BACKEND: FLASHINFER 19 | minReplicas: 0 20 | resourceProfile: nvidia-gpu-l4:1 21 | -------------------------------------------------------------------------------- /manifests/models/qwen2.5-coder-1.5b-cpu.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: qwen2.5-coder-1.5b-cpu 6 | spec: 7 | features: [TextGeneration] 8 | url: ollama://qwen2.5-coder:1.5b 9 | engine: OLlama 10 | minReplicas: 0 11 | resourceProfile: cpu:1 12 | -------------------------------------------------------------------------------- /manifests/models/qwen2.5-coder-1.5b-rtx4070-8gb.yaml: -------------------------------------------------------------------------------- 1 | # Source: models/templates/models.yaml 2 | apiVersion: kubeai.org/v1 3 | kind: Model 4 | metadata: 5 | name: qwen2.5-coder-1.5b-rtx4070-8gb 6 | spec: 7 | features: [TextGeneration] 8 | url: hf://Qwen/Qwen2.5-Coder-1.5B-Instruct 9 | engine: VLLM 10 | args: 11 | - --max-model-len=2048 12 | - --max-num-seqs=16 13 | - --quantization=fp8 14 | - --kv-cache-dtype=fp8 15 | env: 16 | VLLM_ATTENTION_BACKEND: FLASHINFER 17 | minReplicas: 1 18 | resourceProfile: nvidia-gpu-rtx4070-8gb:1 19 | -------------------------------------------------------------------------------- /mkdocs.yml: 
-------------------------------------------------------------------------------- 1 | site_name: KubeAI 2 | site_url: https://www.kubeai.org 3 | repo_url: https://github.com/substratusai/kubeai 4 | 5 | theme: 6 | name: material 7 | custom_dir: docs/overrides 8 | palette: 9 | primary: white # Defaults to indigo. 10 | accent: blue # Defaults to indigo. 11 | 12 | nav: 13 | - Home: README.md 14 | - ... | installation/*.md 15 | - ... | how-to/*.md 16 | - ... | concepts/*.md 17 | - ... | tutorials/*.md 18 | - ... | contributing/*.md 19 | - ... | reference/*.md 20 | - ... 21 | plugins: 22 | - search 23 | - awesome-pages 24 | - blog 25 | - social 26 | markdown_extensions: 27 | # Python Markdown 28 | - abbr 29 | - admonition 30 | - attr_list 31 | - def_list 32 | - footnotes 33 | - md_in_html 34 | - toc: 35 | permalink: true 36 | 37 | # Python Markdown Extensions 38 | - pymdownx.arithmatex: 39 | generic: true 40 | - pymdownx.betterem: 41 | smart_enable: all 42 | - pymdownx.caret 43 | - pymdownx.details 44 | - pymdownx.emoji: 45 | emoji_index: !!python/name:material.extensions.emoji.twemoji 46 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 47 | - pymdownx.highlight 48 | - pymdownx.inlinehilite 49 | - pymdownx.keys 50 | - pymdownx.mark 51 | - pymdownx.smartsymbols 52 | - pymdownx.superfences 53 | - pymdownx.tabbed: 54 | alternate_style: true 55 | - pymdownx.tasklist: 56 | custom_checkbox: true 57 | - pymdownx.tilde 58 | 59 | # Analytics tracking with GoatCounter 60 | extra: 61 | analytics: 62 | provider: custom 63 | -------------------------------------------------------------------------------- /proposals/diagrams/auth-with-label-selector.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/proposals/diagrams/auth-with-label-selector.excalidraw.png -------------------------------------------------------------------------------- /proposals/diagrams/cache-optimized-routing.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/proposals/diagrams/cache-optimized-routing.excalidraw.png -------------------------------------------------------------------------------- /proposals/diagrams/lora-direct-loading.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/proposals/diagrams/lora-direct-loading.excalidraw.png -------------------------------------------------------------------------------- /proposals/diagrams/lora.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/proposals/diagrams/lora.excalidraw.png -------------------------------------------------------------------------------- /proposals/diagrams/model-mgmt-buckets.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/proposals/diagrams/model-mgmt-buckets.excalidraw.png -------------------------------------------------------------------------------- /proposals/diagrams/model-mgmt-volumes.excalidraw.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/proposals/diagrams/model-mgmt-volumes.excalidraw.png -------------------------------------------------------------------------------- /proposals/lora-adapters.md: -------------------------------------------------------------------------------- 1 | # LoRA Adapters 2 | 3 | ## Overview 4 | 5 | ![LoRA Adapters](./diagrams/lora.excalidraw.png) 6 | 7 | ## Direct Loading Implementation 8 | 9 | ![Direct Loading](./diagrams/lora-direct-loading.excalidraw.png) -------------------------------------------------------------------------------- /proposals/multitenancy.md: -------------------------------------------------------------------------------- 1 | # Multitenancy 2 | 3 | The goal of this proposal is to allow KubeAI to be used in a multitenant environment where 4 | some users only have access to some models. 5 | 6 | ## Implementation Option 1: Auth Labels 7 | 8 | In this implementation, KubeAI has well-known labels that correspond to groups that are allowed to access models. 9 | 10 | The KubeAI system is configured to trust a specified header. 11 | 12 | ```yaml 13 | auth: 14 | http: 15 | trustedHeader: X-Auth-Groups 16 | # Possibly in future: configure Model roles. 17 | # modelRoles: 18 | # user: ["list", "describe", "infer"] 19 | ``` 20 | 21 | The groups associated with a request are passed in a trusted header. 22 | 23 | ```bash 24 | curl http://localhost:8000/openai/v1/completions \ 25 | -H "X-Auth-Groups: grp-a, grp-b" 26 | ``` 27 | 28 | The groups that are allowed to access a given model are configured as labels on the Model. 29 | 30 | ```yaml 31 | kind: Model 32 | metadata: 33 | name: llama-3.2 34 | labels: 35 | auth.kubeai.org/grp-a: 36 | auth.kubeai.org/grp-c: 37 | ``` 38 | 39 | ## Implementation Option 2: General Label Selector 40 | 41 | **CURRENT PREFERENCE** (Unless there is a reason to introduce auth-specific configuration.) 42 | 43 | In this implementation, label selectors are used to filter models. The decision of which labels to use is up to the architects of the system that KubeAI is a part of. These label selectors could be enforced by a server that is an intermediary between KubeAI and the end users. 44 | 45 | ![Auth with Label Selector](./diagrams/auth-with-label-selector.excalidraw.png) 46 | 47 | ```bash 48 | curl http://localhost:8000/openai/v1/completions \ 49 | -H "X-Label-Selector: key1=value1" 50 | 51 | curl http://localhost:8000/openai/v1/models \ 52 | -H "X-Label-Selector: key1=value1" 53 | ``` 54 | 55 | Models just need to have the labels set.
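To make the "intermediary server" idea above concrete, here is a hypothetical sketch, not part of the proposal's design: a small reverse proxy that fronts KubeAI and injects an `X-Label-Selector` header derived from a group resolved by some upstream auth layer. The incoming `X-User-Group` header, the `tenant=<group>` label convention, and the in-cluster KubeAI address are all assumptions for illustration. (The Model manifest below, from the proposal, shows the kind of labels such a selector would match.)

```go
// label-selector-proxy is a hypothetical sketch of the intermediary mentioned above:
// it fronts KubeAI and injects an X-Label-Selector header derived from a group that
// some upstream auth layer has already resolved.
package main

import (
	"log"
	"net/http"
	"net/http/httputil"
	"net/url"
)

func main() {
	// Assumption: KubeAI is reachable in-cluster at this address.
	target, err := url.Parse("http://kubeai.default.svc.cluster.local")
	if err != nil {
		log.Fatal(err)
	}

	proxy := httputil.NewSingleHostReverseProxy(target)
	baseDirector := proxy.Director
	proxy.Director = func(r *http.Request) {
		baseDirector(r)

		// Never forward a selector chosen by the client itself.
		r.Header.Del("X-Label-Selector")

		// Hypothetical contract: an upstream auth layer has validated the caller and
		// set X-User-Group; translate that group into a label selector matching the
		// labels placed on Model resources (e.g. tenant=<group>).
		if group := r.Header.Get("X-User-Group"); group != "" {
			r.Header.Set("X-Label-Selector", "tenant="+group)
		}
	}

	log.Fatal(http.ListenAndServe(":8080", proxy))
}
```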
56 | 57 | ```yaml 58 | kind: Model 59 | metadata: 60 | name: llama-3.2 61 | labels: 62 | key1: value1 63 | ``` 64 | -------------------------------------------------------------------------------- /skaffold-build.json: -------------------------------------------------------------------------------- 1 | {"builds":null} -------------------------------------------------------------------------------- /skaffold-tags.json: -------------------------------------------------------------------------------- 1 | {"builds":null} -------------------------------------------------------------------------------- /skaffold.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: skaffold/v4beta11 2 | kind: Config 3 | metadata: 4 | name: kubeai-project 5 | build: 6 | artifacts: 7 | - image: substratusai/kubeai 8 | local: 9 | push: false 10 | deploy: 11 | helm: 12 | releases: 13 | - name: kubeai 14 | chartPath: ./charts/kubeai 15 | setValueTemplates: 16 | open-webui.enabled: "false" 17 | skipBuildDependencies: true 18 | portForward: 19 | - resourceType: service 20 | resourceName: kubeai 21 | namespace: default 22 | port: 80 23 | localPort: 8000 24 | profiles: 25 | - name: kubeai-only 26 | deploy: 27 | helm: 28 | releases: 29 | - name: kubeai 30 | chartPath: ./charts/kubeai 31 | setValueTemplates: 32 | open-webui.enabled: "false" 33 | skipBuildDependencies: true 34 | - name: kubeai-only-gke 35 | build: 36 | local: 37 | push: true 38 | deploy: 39 | helm: 40 | releases: 41 | - name: kubeai 42 | chartPath: ./charts/kubeai 43 | valuesFiles: 44 | - ./charts/kubeai/values-gke.yaml 45 | setValueTemplates: 46 | open-webui.enabled: "false" 47 | skipBuildDependencies: true -------------------------------------------------------------------------------- /test/e2e-manual/gke-vllm-adapters/model.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeai.org/v1 2 | kind: Model 3 | metadata: 4 | name: tiny-llama 5 | spec: 6 | features: [TextGeneration] 7 | owner: meta-llama 8 | url: hf://TinyLlama/TinyLlama-1.1B-Chat-v0.3 9 | adapters: 10 | - id: colorist 11 | url: hf://jashing/tinyllama-colorist-lora 12 | engine: VLLM 13 | resourceProfile: nvidia-gpu-l4:1 14 | minReplicas: 1 -------------------------------------------------------------------------------- /test/e2e-manual/gke-vllm-adapters/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ex 4 | 5 | skaffold run -f ./skaffold.yaml --tail --port-forward --profile kubeai-only-gke --default-repo us-central1-docker.pkg.dev/substratus-dev 6 | 7 | kubectl apply -f ./model.yaml 8 | 9 | kubectl port-forward svc/kubeai 8000:80 & 10 | 11 | # raw model 12 | curl -v http://localhost:8000/openai/v1/completions \ 13 | -H "Content-Type: application/json" \ 14 | -d '{"model": "tiny-llama", "prompt": "Who was the first president of the United States?", "max_tokens": 40}' 15 | 16 | # with adapter 17 | curl -v http://localhost:8000/openai/v1/completions \ 18 | -H "Content-Type: application/json" \ 19 | -d '{"model": "tiny-llama/colorist", "prompt": "Who was the first president of the United States?", "max_tokens": 40}' 20 | -------------------------------------------------------------------------------- /test/e2e-manual/gke-vllm-gpu-tpu/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -ex 4 | 5 | # Spin up latest release and run test GPU and TPU on 
GKE autopilot. 6 | 7 | helm install kubeai ./charts/kubeai \ 8 | -f ./charts/kubeai/values-gke.yaml \ 9 | -f - < $replica_log & 19 | kubectl_watch_pid=$! 20 | 21 | sleep 60 22 | kubectl delete pods -l app.kubernetes.io/name=kubeai 23 | sleep 120 24 | 25 | kill $kubectl_watch_pid 26 | 27 | echo "Replica log:" 28 | cat $replica_log 29 | replicas_over_time=$(cat $replica_log | sort | uniq) 30 | 31 | # Replicas should have remained at 3 32 | if [ "$replicas_over_time" != "3" ]; then 33 | echo "TEST FAILURE: Replicas changed during autoscaler restart." 34 | cat $replica_log 35 | exit 1 36 | fi -------------------------------------------------------------------------------- /test/e2e/autoscaler-restart-under-load/values.yaml: -------------------------------------------------------------------------------- 1 | modelAutoscaling: 2 | interval: 1s 3 | timeWindow: 30s 4 | open-webui: 5 | enabled: false 6 | -------------------------------------------------------------------------------- /test/e2e/cache-shared-filesystem/cache-mount-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: cache-mount-pod 5 | spec: 6 | containers: 7 | - name: main 8 | image: ubuntu 9 | command: ["sleep", "10000"] 10 | volumeMounts: 11 | - name: models 12 | mountPath: /test-mount 13 | volumes: 14 | - name: models 15 | persistentVolumeClaim: 16 | claimName: shared-model-cache-e2e-test-kind-pv -------------------------------------------------------------------------------- /test/e2e/cache-shared-filesystem/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source $REPO_DIR/test/e2e/common.sh 4 | 5 | models_release="kubeai-models" 6 | 7 | helm install $models_release $REPO_DIR/charts/models -f - < $transcription_file 17 | 18 | result_contains_kubernetes=$(cat $transcription_file | jq '.text | ascii_downcase | contains("kubernetes")') 19 | if [ "$result_contains_kubernetes" = "true" ]; then 20 | echo "The transcript contains 'kubernetes'." 
21 | else 22 | echo "The text does not contain 'kubernetes':" 23 | cat $transcription_file 24 | exit 1 25 | fi 26 | -------------------------------------------------------------------------------- /test/e2e/engine-infinity/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source $REPO_DIR/test/e2e/common.sh 4 | 5 | model=bge-embed-text-cpu 6 | 7 | apply_model $model 8 | 9 | # Test embedding generation 10 | response_file=$TMP_DIR/embedding.json 11 | curl http://localhost:8000/openai/v1/embeddings \ 12 | -H "Content-Type: application/json" \ 13 | -d '{ 14 | "input": "Hello world", 15 | "model": "'$model'" 16 | }' > $response_file 17 | 18 | # Verify response structure and content 19 | embedding_length=$(cat $response_file | jq '.data[0].embedding | length') 20 | if [ "$embedding_length" -ne 384 ]; then 21 | echo "Unexpected embedding dimension: got $embedding_length, expected 384" 22 | cat $response_file 23 | exit 1 24 | fi 25 | 26 | echo "Successfully generated embedding with $embedding_length dimensions" 27 | -------------------------------------------------------------------------------- /test/e2e/engine-ollama-pvc/ollama-hydrate-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: ollama-pvc-hydrate 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: ollama 10 | image: ollama/ollama:latest 11 | env: 12 | - name: OLLAMA_MODELS 13 | value: /model 14 | command: 15 | - /bin/sh 16 | - -c 17 | - | 18 | /bin/ollama serve & 19 | echo "Waiting for Ollama server to start..." 20 | sleep 10 21 | 22 | # Pull the model and ensure it downloads successfully 23 | echo "Pulling model qwen:0.5b..." 24 | if ! /bin/ollama pull qwen:0.5b; then 25 | echo "Failed to pull model" 26 | exit 1 27 | fi 28 | 29 | # Verify the model files exist 30 | echo "Verifying model files..." 31 | ls -R /model 32 | if [ ! -d "/model/blobs" ] || [ ! 
-d "/model/manifests" ]; then 33 | echo "Model directories not found" 34 | exit 1 35 | fi 36 | 37 | echo "Model setup completed successfully" 38 | ls -la /model/manifests/registry.ollama.ai/library/qwen/0.5b 39 | volumeMounts: 40 | - name: models-volume 41 | mountPath: /model 42 | volumes: 43 | - name: models-volume 44 | persistentVolumeClaim: 45 | claimName: model-pvc 46 | readOnly: false 47 | restartPolicy: OnFailure 48 | -------------------------------------------------------------------------------- /test/e2e/engine-ollama-pvc/pv.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: kind-model-hostpath 5 | spec: 6 | storageClassName: manual 7 | capacity: 8 | storage: 25Gi 9 | accessModes: 10 | - ReadWriteMany 11 | - ReadOnlyMany 12 | - ReadWriteOnce 13 | hostPath: 14 | path: $PV_HOST_PATH 15 | type: DirectoryOrCreate 16 | persistentVolumeReclaimPolicy: Retain 17 | -------------------------------------------------------------------------------- /test/e2e/engine-ollama-pvc/pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: model-pvc 5 | spec: 6 | storageClassName: manual 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 10Gi 12 | volumeName: kind-model-hostpath -------------------------------------------------------------------------------- /test/e2e/engine-ollama-pvc/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source $REPO_DIR/test/e2e/common.sh 4 | 5 | models_release="kubeai-models" 6 | 7 | # Create PV_HOST_PATH inside the kind container 8 | kind_container=$(docker ps --filter "name=kind-control-plane" --format "{{.ID}}") 9 | export PV_HOST_PATH="/mnt/models" 10 | docker exec -i $kind_container mkdir -p $PV_HOST_PATH 11 | echo "PV_HOST_PATH: $PV_HOST_PATH" 12 | 13 | 14 | envsubst < $REPO_DIR/test/e2e/engine-ollama-pvc/pv.yaml | kubectl apply -f - 15 | kubectl apply -f $REPO_DIR/test/e2e/engine-ollama-pvc/pvc.yaml 16 | 17 | # Apply the Ollama hydrate job 18 | kubectl apply -f $REPO_DIR/test/e2e/engine-ollama-pvc/ollama-hydrate-job.yaml 19 | 20 | # Wait for job completion with timeout 21 | echo "Waiting for Ollama hydrate job to complete..." 22 | if ! kubectl wait --for=condition=complete --timeout=600s job/ollama-pvc-hydrate; then 23 | echo "Ollama hydrate job failed or timed out" 24 | kubectl logs job/ollama-pvc-hydrate 25 | exit 1 26 | fi 27 | 28 | 29 | helm install $models_release $REPO_DIR/charts/models -f - < 0 28 | 29 | 30 | def test_completion(): 31 | response = client.completions.create( 32 | model=model, prompt="How are you?", max_tokens=50 33 | ) 34 | 35 | print(response) 36 | # Assert that the response contains at least one "choices" 37 | assert len(response.choices) > 0 38 | -------------------------------------------------------------------------------- /test/e2e/openai-python-client/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source $REPO_DIR/test/e2e/common.sh 4 | 5 | kubectl apply -f $REPO_DIR/manifests/models/opt-125m-cpu.yaml 6 | 7 | python -m venv $TEST_DIR/venv 8 | 9 | source $TEST_DIR/venv/bin/activate 10 | 11 | which pip 12 | pip install -r $TEST_DIR/requirements.txt 13 | 14 | # Wait for models to sync. 
15 | sleep 3 16 | 17 | pytest $TEST_DIR/test.py 18 | -------------------------------------------------------------------------------- /test/e2e/quickstart/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source $REPO_DIR/test/e2e/common.sh 4 | 5 | models_release="kubeai-models" 6 | 7 | helm install $models_release $REPO_DIR/charts/models -f - <