├── .dockerignore
├── .github
│   ├── dependabot.yaml
│   └── workflows
│       ├── build-push-kubeai.yml
│       ├── build-push-model-loader.yml
│       ├── create-gh-release.yml
│       ├── docs-lint.yml
│       ├── helm-lint.yml
│       ├── publish-docs.yml
│       ├── release-helm-chart.yml
│       └── tests.yml
├── .gitignore
├── .golangci.yml
├── Dockerfile
├── LICENSE
├── Makefile
├── PROJECT
├── api
│   ├── k8s
│   │   └── v1
│   │       ├── groupversion_info.go
│   │       ├── metadata.go
│   │       ├── model_types.go
│   │       └── zz_generated.deepcopy.go
│   └── openai
│       └── v1
│           ├── README.md
│           ├── chat_completions.go
│           ├── chat_completions_test.go
│           ├── completions.go
│           ├── completions_test.go
│           ├── embeddings.go
│           ├── embeddings_test.go
│           ├── reference
│           │   ├── chat_completions.openai.openapi.yaml
│           │   ├── completions.openai.openapi.yaml
│           │   ├── embeddings.openai.openapi.yaml
│           │   ├── example-requests.ollama.output
│           │   ├── example-requests.openai.output
│           │   ├── example-requests.sh
│           │   └── example-requests.vllm.output
│           ├── usage.go
│           ├── utils.go
│           └── utils_test.go
├── benchmarks
│   ├── chat-py
│   │   ├── .gitignore
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   ├── backend_request_func.py
│   │   ├── benchmark_serving.py
│   │   ├── job.yaml
│   │   ├── requirements.txt
│   │   └── vllm-direct-service.yaml
│   ├── multi-turn-chat-go
│   │   ├── .dockerignore
│   │   ├── .gitignore
│   │   ├── .python-version
│   │   ├── Dockerfile
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── benchmark
│   │   │   ├── runner.go
│   │   │   └── runner_test.go
│   │   ├── dashs
│   │   │   └── vLLM-1740366828970.json
│   │   ├── data
│   │   │   └── prepare-input-threads.py
│   │   ├── go.mod
│   │   ├── go.sum
│   │   ├── hack
│   │   │   ├── Dockerfile
│   │   │   ├── bench-pod.yaml
│   │   │   ├── chat-template.jinja
│   │   │   ├── kubeai-config.json
│   │   │   ├── llama-3.1-8b-instruct-fp8-l4.yaml
│   │   │   ├── model.yaml
│   │   │   ├── ollama-config.json
│   │   │   ├── openai-config.json
│   │   │   ├── pod.opt-125m.yaml
│   │   │   ├── pod.qwen.yaml
│   │   │   ├── pod.yaml
│   │   │   ├── podmonitor.yaml
│   │   │   ├── tokenizer
│   │   │   │   ├── tokenizer.go
│   │   │   │   └── tokens.py
│   │   │   └── vllm.Dockerfile
│   │   ├── main.go
│   │   ├── pyproject.toml
│   │   ├── run.ipynb
│   │   ├── runs
│   │   │   └── llama-3.1-8x-l4
│   │   │       ├── itl.png
│   │   │       ├── run.ipynb
│   │   │       ├── throughput.png
│   │   │       └── ttft.png
│   │   └── uv.lock
│   └── multi-turn-chat-k6
│       ├── .dockerignore
│       ├── .gitignore
│       ├── Dockerfile
│       ├── Makefile
│       ├── data
│       │   └── prepare-message-threads.py
│       ├── k6.js
│       └── scenarios
│           ├── least-load-vs-prefix-hash-70b-8r
│           │   ├── README.md
│           │   ├── base-request.json
│           │   ├── k6.json
│           │   ├── model.yaml
│           │   └── pod.yaml
│           └── least-load-vs-prefix-hash
│               ├── README.md
│               ├── base-request.json
│               ├── k6.json
│               ├── model.yaml
│               └── pod.yaml
├── charts
│   ├── .gitignore
│   ├── kubeai
│   │   ├── .helmignore
│   │   ├── Chart.yaml
│   │   ├── templates
│   │   │   ├── _helpers.tpl
│   │   │   ├── autoscalerstateconfigmap.yaml
│   │   │   ├── aws-secret.yaml
│   │   │   ├── configmap.yaml
│   │   │   ├── crds
│   │   │   │   └── kubeai.org_models.yaml
│   │   │   ├── deployment.yaml
│   │   │   ├── huggingface-secret.yaml
│   │   │   ├── ingress.yaml
│   │   │   ├── role.yaml
│   │   │   ├── rolebinding.yaml
│   │   │   ├── securityContextConstraints.yaml
│   │   │   ├── service.yaml
│   │   │   ├── serviceaccount.yaml
│   │   │   └── vllm-pod-monitor.yaml
│   │   ├── values-amd-gpu-device-plugin.yaml
│   │   ├── values-eks.yaml
│   │   ├── values-gke.yaml
│   │   ├── values-nvidia-k8s-device-plugin.yaml
│   │   └── values.yaml
│   └── models
│       ├── .helmignore
│       ├── Chart.yaml
│       ├── templates
│       │   └── models.yaml
│       └── values.yaml
├── cmd
│   └── main.go
├── components
│   └── model-loader
│       ├── Dockerfile
│       └── load.sh
├── docs
│   ├── CNAME
│   ├── README.md
│   ├── benchmarks
│   │   ├── llama-3.2-11b-vision.md
│   │   ├── prefix-aware-load-balancing-mean-ttft.png
│   │   ├── prefix-aware-load-balancing-throughput.png
│   │   └── prefix-aware-load-balancing.md
│   ├── blog
│   │   ├── .authors.yml
│   │   ├── index.md
│   │   └── posts
│   │       └── llm-load-balancing-at-scale-chwbl.md
│   ├── concepts
│   │   ├── autoscaling.md
│   │   ├── backend-servers.md
│   │   ├── load-balancing.md
│   │   ├── lora-adapters.md
│   │   ├── resource-profiles.md
│   │   └── storage-caching.md
│   ├── contributing
│   │   ├── development-environment.md
│   │   ├── development-guide.md
│   │   ├── documentation.md
│   │   └── release-process.md
│   ├── diagrams
│   │   ├── arch.excalidraw.png
│   │   ├── autoscaling.excalidraw.png
│   │   ├── caching-shared-filesystem.excalidraw.png
│   │   ├── chwbl.excalidraw.png
│   │   ├── lora-direct-loading.excalidraw.png
│   │   ├── multi-threaded-shared-context.excalidraw.png
│   │   ├── multi-turn-clients.excalidraw.png
│   │   ├── multitenancy-labels.excalidraw.png
│   │   ├── private-deep-chat.excalidraw.png
│   │   └── random-vs-consistent-hash.excalidraw.png
│   ├── graphs
│   │   ├── throughput-benchmark.png
│   │   └── ttft-benchmark.png
│   ├── how-to
│   │   ├── architect-for-multitenancy.md
│   │   ├── authenticate-to-model-repos.md
│   │   ├── build-models-into-containers.md
│   │   ├── cache-models-with-aws-efs.md
│   │   ├── cache-models-with-gcp-filestore.md
│   │   ├── configure-autoscaling.md
│   │   ├── configure-embedding-models.md
│   │   ├── configure-resource-profiles.md
│   │   ├── configure-speech-to-text.md
│   │   ├── configure-text-generation-models.md
│   │   ├── install-models.md
│   │   ├── load-models-from-pvc.md
│   │   ├── observability-with-prometheus-stack.md
│   │   └── serve-lora-adapters.md
│   ├── index.yaml
│   ├── installation
│   │   ├── aks.md
│   │   ├── any.md
│   │   ├── eks.md
│   │   └── gke.md
│   ├── overrides
│   │   └── partials
│   │       └── integrations
│   │           └── analytics
│   │               └── custom.html
│   ├── reference
│   │   ├── .kubernetes-api
│   │   │   └── config.yaml
│   │   ├── kubernetes-api.md
│   │   └── openai-api-compatibility.md
│   ├── requirements.txt
│   ├── screenshots
│   │   ├── gcp-cpus-all-regions.png
│   │   ├── gcp-gpus-all-regions.png
│   │   ├── gcp-quota-preemptible-nvidia-l4-gpus-regional.png
│   │   ├── gcp-quota-premium-storage-gb-per-region.png
│   │   ├── gcp-tpu-preemptible-v5e-quota.png
│   │   ├── langtrace.png
│   │   └── private-deep-chat.png
│   └── tutorials
│       ├── langchain.md
│       ├── langtrace.md
│       ├── private-deep-chat.md
│       └── weaviate.md
├── examples
│   ├── k8s-api-clients
│   │   └── python
│   │       ├── .gitignore
│   │       ├── example.py
│   │       └── requirements.txt
│   ├── observability
│   │   └── vllm-grafana-dashboard.json
│   ├── ollama-builtin
│   │   ├── Dockerfile
│   │   └── download.sh
│   ├── ollama-pvc
│   │   ├── job.yaml
│   │   └── pvc.yaml
│   ├── priority-examples
│   │   ├── README.md
│   │   ├── background-research-model.yaml
│   │   ├── critical-service-model.yaml
│   │   ├── hello-world-llm.yaml
│   │   └── priority-classes.yaml
│   ├── private-deep-chat
│   │   ├── Dockerfile
│   │   ├── go.mod
│   │   ├── main.go
│   │   ├── manifests
│   │   │   ├── deployment.yaml
│   │   │   ├── models.yaml
│   │   │   └── service.yaml
│   │   └── static
│   │       └── index.html
│   └── storage-classes
│       └── gcp-filestore.yaml
├── go.mod
├── go.sum
├── hack
│   ├── apply-model.sh
│   ├── boilerplate.go.txt
│   ├── create-dev-gke-cluster.sh
│   ├── dev-configs
│   │   ├── gke.yaml
│   │   └── kind.yaml
│   ├── dev-gke-helm-values.yaml
│   ├── dev-load
│   │   ├── k6.js
│   │   ├── pod.yaml
│   │   └── run.sh
│   ├── dev-models
│   │   ├── kind-cpu-adapters.yaml
│   │   ├── kind-cpu.yaml
│   │   ├── kind-vllm-cpu.yaml
│   │   ├── vllm-chat.yaml
│   │   ├── vllm-gs-url.yaml
│   │   ├── vllm-s3-url.yaml
│   │   └── vllm-with-adapters.yaml
│   ├── filter-openapi-components.py
│   ├── pvs
│   │   └── preprov-filestore.yaml
│   ├── vllm-mock-metrics
│   │   ├── main.go
│   │   └── metrics.txt
│   └── volume-debug-pod.yaml
├── internal
│   ├── apiutils
│   │   ├── model.go
│   │   ├── model_test.go
│   │   ├── request.go
│   │   └── request_test.go
│   ├── config
│   │   ├── system.go
│   │   └── system_test.go
│   ├── k8sutils
│   │   ├── apply.go
│   │   ├── client_options.go
│   │   ├── gvk.go
│   │   ├── jobs.go
│   │   ├── meta.go
│   │   ├── meta_test.go
│   │   └── pods.go
│   ├── leader
│   │   └── election.go
│   ├── loadbalancer
│   │   ├── balance_chwbl.go
│   │   ├── balance_least_load.go
│   │   ├── group.go
│   │   ├── group_bench_test.go
│   │   ├── group_test.go
│   │   ├── load_balancer.go
│   │   └── load_balancer_test.go
│   ├── manager
│   │   ├── configure.go
│   │   ├── otel.go
│   │   └── run.go
│   ├── messenger
│   │   └── messenger.go
│   ├── metrics
│   │   ├── metrics.go
│   │   └── metricstest
│   │       └── metricstest.go
│   ├── modelautoscaler
│   │   ├── autoscaler.go
│   │   ├── metrics.go
│   │   └── state.go
│   ├── modelclient
│   │   ├── client.go
│   │   └── scale.go
│   ├── modelcontroller
│   │   ├── adapters.go
│   │   ├── cache.go
│   │   ├── engine_fasterwhisper.go
│   │   ├── engine_infinity.go
│   │   ├── engine_ollama.go
│   │   ├── engine_ollama_test.go
│   │   ├── engine_vllm.go
│   │   ├── files.go
│   │   ├── files_test.go
│   │   ├── model_controller.go
│   │   ├── model_controller_test.go
│   │   ├── model_source.go
│   │   ├── model_source_test.go
│   │   ├── patch.go
│   │   ├── patch_test.go
│   │   ├── pod_plan.go
│   │   ├── pod_plan_test.go
│   │   └── pod_utils.go
│   ├── modelproxy
│   │   ├── handler.go
│   │   ├── handler_test.go
│   │   └── request.go
│   ├── movingaverage
│   │   ├── simple.go
│   │   └── simple_test.go
│   ├── openaiserver
│   │   ├── handler.go
│   │   └── models.go
│   └── vllmclient
│       └── client.go
├── manifests
│   └── models
│       ├── bge-embed-text-cpu.yaml
│       ├── deepseek-r1-1.5b-cpu.yaml
│       ├── deepseek-r1-70b-gh200-fp8.yaml
│       ├── deepseek-r1-70b-gh200.yaml
│       ├── deepseek-r1-distill-llama-8b-l4.yaml
│       ├── deepseek-r1-distill-qwen-1.5b-rtx4070.yaml
│       ├── deepseek-r1-mi300x.yaml
│       ├── e5-mistral-7b-instruct-cpu.yaml
│       ├── faster-whisper-medium-en-cpu.yaml
│       ├── gemma-2-9b-it-fp8-l4.yaml
│       ├── gemma-27b-ollama-l4.yaml
│       ├── gemma-2b-it-tpu.yaml
│       ├── gemma-3-12b-ollama-l4.yaml
│       ├── gemma-3-27b-ollama-l4.yaml
│       ├── gemma-9b-ollama-l4.yaml
│       ├── gemma2-2b-cpu.yaml
│       ├── granite-3.1-dense-ollama-l4.yaml
│       ├── llama-3.1-405b-instruct-fp8-a100-80b.yaml
│       ├── llama-3.1-405b-instruct-fp8-h100.yaml
│       ├── llama-3.1-405b-instruct-fp8-mi300x.yaml
│       ├── llama-3.1-70b-instruct-awq-int4-gh200.yaml
│       ├── llama-3.1-70b-instruct-fp8-1-h100.yaml
│       ├── llama-3.1-70b-instruct-fp8-gh200.yaml
│       ├── llama-3.1-70b-instruct-fp8-h100.yaml
│       ├── llama-3.1-70b-instruct-fp8-l4.yaml
│       ├── llama-3.1-70b-instruct-fp8-mi300x.yaml
│       ├── llama-3.1-8b-instruct-cpu.yaml
│       ├── llama-3.1-8b-instruct-fp8-l4.yaml
│       ├── llama-3.1-8b-instruct-tpu.yaml
│       ├── llama-3.1-supernova-lite-l4.yaml
│       ├── llama-3.1-tulu-3-8b-l4.yaml
│       ├── llama-3.2-11b-vision-instruct-l4.yaml
│       ├── llama-3.3-70b-instruct-bf16-gh200.yaml
│       ├── llama-3.3-70b-ollama-l4.yaml
│       ├── llama-4-maverick-430k-h100.yaml
│       ├── mistral-small-24b-instruct-h100.yaml
│       ├── mistral-small-3.1-24b-instruct-h100.yaml
│       ├── nomic-embed-text-cpu.yaml
│       ├── opt-125m-cpu.yaml
│       ├── opt-125m-l4.yaml
│       ├── phi-4-bnb-4bit-l4.yaml
│       ├── phi-4-ollama-l4.yaml
│       ├── qwen2-500m-cpu.yaml
│       ├── qwen2.5-7b-cpu.yaml
│       ├── qwen2.5-7b-instruct-l4.yaml
│       ├── qwen2.5-coder-1.5b-cpu.yaml
│       └── qwen2.5-coder-1.5b-rtx4070-8gb.yaml
├── mkdocs.yml
├── proposals
│   ├── diagrams
│   │   ├── auth-with-label-selector.excalidraw.png
│   │   ├── cache-optimized-routing.excalidraw.png
│   │   ├── lora-direct-loading.excalidraw.png
│   │   ├── lora.excalidraw.png
│   │   ├── model-mgmt-buckets.excalidraw.png
│   │   └── model-mgmt-volumes.excalidraw.png
│   ├── lora-adapters.md
│   ├── model-storage.md
│   └── multitenancy.md
├── skaffold-build.json
├── skaffold-tags.json
├── skaffold.yaml
└── test
    ├── e2e-manual
    │   ├── gke-vllm-adapters
    │   │   ├── model.yaml
    │   │   └── run.sh
    │   └── gke-vllm-gpu-tpu
    │       └── run.sh
    ├── e2e
    │   ├── autoscaler-restart-no-load
    │   │   ├── k6-pod.yaml
    │   │   ├── k6.js
    │   │   ├── model.yaml
    │   │   ├── skaffold.yaml
    │   │   ├── test.sh
    │   │   └── values.yaml
    │   ├── autoscaler-restart-under-load
    │   │   ├── k6-pod.yaml
    │   │   ├── k6.js
    │   │   ├── model.yaml
    │   │   ├── skaffold.yaml
    │   │   ├── test.sh
    │   │   └── values.yaml
    │   ├── cache-shared-filesystem
    │   │   ├── cache-mount-pod.yaml
    │   │   └── test.sh
    │   ├── common-manifests.yaml
    │   ├── common.sh
    │   ├── engine-fasterwhisper
    │   │   └── test.sh
    │   ├── engine-infinity
    │   │   └── test.sh
    │   ├── engine-ollama-pvc
    │   │   ├── ollama-hydrate-job.yaml
    │   │   ├── pv.yaml
    │   │   ├── pvc.yaml
    │   │   └── test.sh
    │   ├── engine-vllm-pvc
    │   │   ├── pv.yaml
    │   │   ├── pvc.yaml
    │   │   └── test.sh
    │   ├── model-files
    │   │   └── test.sh
    │   ├── openai-python-client
    │   │   ├── .gitignore
    │   │   ├── requirements.txt
    │   │   ├── test.py
    │   │   └── test.sh
    │   ├── quickstart
    │   │   └── test.sh
    │   ├── rollouts
    │   │   └── test.sh
    │   ├── run.sh
    │   ├── s3-model
    │   │   ├── model.yaml
    │   │   ├── pv.yaml
    │   │   ├── pvc.yaml
    │   │   ├── s3-instance.yaml
    │   │   ├── skaffold.yaml
    │   │   ├── test.sh
    │   │   ├── upload-model-to-s3.yaml
    │   │   └── values.yaml
    │   ├── skaffold.default.yaml
    │   └── values.default.yaml
    ├── integration
    │   ├── adapter_test.go
    │   ├── autoscaler_state_test.go
    │   ├── autoscaling_ha_test.go
    │   ├── cache_shared_filesystem_test.go
    │   ├── main_test.go
    │   ├── messenger_test.go
    │   ├── model_default_test.go
    │   ├── model_files_test.go
    │   ├── model_pod_recovery_test.go
    │   ├── model_pod_update_rollout_test.go
    │   ├── model_priority_test.go
    │   ├── model_profiles_test.go
    │   ├── model_scaling_bounds_test.go
    │   ├── model_validation_test.go
    │   ├── proxy_test.go
    │   ├── selector_test.go
    │   └── utils_test.go
    └── utils
        └── utils.go
/.dockerignore:
--------------------------------------------------------------------------------
1 | # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file
2 | # Ignore build and test binaries.
3 | bin/
4 | benchmarks/
5 | charts/
6 | components/
7 | docs/
8 | examples/
9 | manifests/
10 | proposals/
11 | test/
12 | tmp/
--------------------------------------------------------------------------------
/.github/dependabot.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: "gomod"
4 | directory: "/"
5 | schedule:
6 | interval: daily
7 | time: "01:00"
8 |
9 | - package-ecosystem: "docker"
10 | directory: "/"
11 | schedule:
12 | interval: daily
13 | time: "01:00"
14 |
15 | - package-ecosystem: "github-actions"
16 | directory: "/"
17 | schedule:
18 | interval: daily
19 | time: "01:00"
20 | groups:
21 | actions-all:
22 | patterns:
23 | - "*"
24 |
25 | - package-ecosystem: "pip"
26 | directory: "/docs"
27 | schedule:
28 | interval: daily
29 | time: "01:00"
30 |
31 | - package-ecosystem: "docker"
32 | directory: "/components/model-loader"
33 | schedule:
34 | interval: daily
35 | time: "01:00"
36 |
37 | - package-ecosystem: "docker"
38 | directory: "/examples/ollama-builtin"
39 | schedule:
40 | interval: daily
41 | time: "01:00"
42 |
43 | - package-ecosystem: "gomod"
44 | directory: "/examples/private-deep-chat"
45 | schedule:
46 | interval: daily
47 | time: "01:00"
48 |
49 | - package-ecosystem: "docker"
50 | directory: "/examples/private-deep-chat"
51 | schedule:
52 | interval: daily
53 | time: "01:00"
54 |
--------------------------------------------------------------------------------
/.github/workflows/create-gh-release.yml:
--------------------------------------------------------------------------------
1 | # Create a GitHub release on tag push
2 | # source: https://stackoverflow.com/a/75679739/376445
3 | name: Create GitHub Release
4 |
5 | on:
6 | push:
7 | tags:
8 | - "v*.*.*"
9 |
10 | permissions:
11 | contents: write
12 |
13 | jobs:
14 | release:
15 | name: Release pushed tag
16 | runs-on: ubuntu-22.04
17 | steps:
18 | - name: Create release
19 | env:
20 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
21 | tag: ${{ github.ref_name }}
22 | run: |
23 | gh release create "$tag" \
24 | --repo="$GITHUB_REPOSITORY" \
25 | --title="${GITHUB_REPOSITORY#*/} ${tag#v}" \
26 | --generate-notes
27 |
--------------------------------------------------------------------------------
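This workflow fires on semver tags; for reference, a release is cut by pushing such a tag (the version below is illustrative):

```bash
# Tag the release commit and push the tag to trigger the release workflow.
git tag v0.0.1
git push origin v0.0.1
```
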
/.github/workflows/docs-lint.yml:
--------------------------------------------------------------------------------
1 | name: Doc linter
2 | run-name: Run doc linter by @${{ github.actor }}
3 |
4 | on:
5 | push:
6 | branches:
7 | - main
8 | pull_request:
9 |
10 | jobs:
11 | mkdocs-build-strict:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - name: Checkout code
15 | uses: actions/checkout@v4
16 | - uses: actions/setup-python@v5
17 | with:
18 | python-version: 3.x
19 | - name: Install dependencies
20 | run: pip install -r docs/requirements.txt
21 | - name: Run mkdocs in strict mode
22 | run: mkdocs build --strict
23 |
--------------------------------------------------------------------------------
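The same strict build can be reproduced locally before pushing; a sketch, run from the repo root with Python available:

```bash
# Mirror the docs-lint workflow locally.
pip install -r docs/requirements.txt
mkdocs build --strict
```
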
/.github/workflows/helm-lint.yml:
--------------------------------------------------------------------------------
1 | name: Lint and Test Charts
2 |
3 | on:
4 | pull_request:
5 |
6 | jobs:
7 | lint-test:
8 | runs-on: ubuntu-latest
9 | steps:
10 | - name: Checkout
11 | uses: actions/checkout@v4
12 | with:
13 | fetch-depth: 0
14 |
15 | - name: Set up Helm
16 | uses: azure/setup-helm@v4.2.0
17 | with:
18 | version: v3.14.4
19 |
20 | - uses: actions/setup-python@v5
21 | with:
22 | python-version: '3.x'
23 | check-latest: true
24 |
25 | - name: Set up chart-testing
26 | uses: helm/chart-testing-action@v2.6.1
27 |
28 | - name: Run chart-testing (list-changed)
29 | id: list-changed
30 | run: |
31 | changed=$(ct list-changed --target-branch ${{ github.event.repository.default_branch }})
32 | if [[ -n "$changed" ]]; then
33 | echo "changed=true" >> "$GITHUB_OUTPUT"
34 | fi
35 |
36 | - name: Run chart-testing (lint)
37 | if: steps.list-changed.outputs.changed == 'true'
38 | run: ct lint --check-version-increment=false --target-branch ${{ github.event.repository.default_branch }}
39 |
--------------------------------------------------------------------------------
/.github/workflows/publish-docs.yml:
--------------------------------------------------------------------------------
1 | name: Publish docs
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 |
8 | jobs:
9 | publish-docs:
10 | permissions:
11 | contents: write
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v4
15 | - name: Configure Git
16 | run: |
17 | git config user.name "$GITHUB_ACTOR"
18 | git config user.email "$GITHUB_ACTOR@users.noreply.github.com"
19 | - uses: actions/setup-python@v5
20 | with:
21 | python-version: 3.x
22 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
23 | - uses: actions/cache@v4
24 | with:
25 | key: mkdocs-material-${{ env.cache_id }}
26 | path: .cache
27 | restore-keys: |
28 | mkdocs-material-
29 | - run: |
30 | git fetch origin
31 | # This is needed because otherwise mkdocs removes the index.yaml file.
32 | # Get the latest index.yaml from gh-pages branch.
33 | git checkout gh-pages
34 | git pull origin gh-pages
35 | cp index.yaml /tmp/index.yaml
36 | git checkout main
37 | git pull origin main
38 | cp /tmp/index.yaml docs/index.yaml
39 | pip install -r docs/requirements.txt
40 | - run: make generate-kubernetes-api-reference
41 | - run: mkdocs gh-deploy
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | *.exe
3 | *.exe~
4 | *.dll
5 | *.so
6 | *.dylib
7 | bin/*
8 | Dockerfile.cross
9 |
10 | # Test binary, built with `go test -c`
11 | *.test
12 |
13 | # Output of the go coverage tool, specifically when used with LiteIDE
14 | *.out
15 |
16 | # Go workspace file
17 | go.work
18 |
19 | # Kubernetes Generated files - skip generated files, except for vendored files
20 | !vendor/**/zz_generated.*
21 |
22 | # editor and IDE paraphernalia
23 | .idea
24 | .vscode
25 | *.swp
26 | *.swo
27 | *~
28 |
29 | # Files that might be committed from running guides
30 | /kubeai.yaml
31 | /kubeai-models.yaml
32 | /helm-values.yaml
33 | /model-helm-values.yaml
34 | Chart.lock
35 |
36 | # Ignore python virtual env
37 | .venv
38 | *__pycache__
39 | site
40 |
41 | /tmp
42 |
43 | ./charts/kubeai/charts/*.tgz
44 |
45 | .cache/
--------------------------------------------------------------------------------
/.golangci.yml:
--------------------------------------------------------------------------------
1 | run:
2 | timeout: 5m
3 | allow-parallel-runners: true
4 |
5 | issues:
6 | # don't skip warning about doc comments
7 | # don't exclude the default set of lint
8 | exclude-use-default: false
9 | # restore some of the defaults
10 | # (fill in the rest as needed)
11 | exclude-rules:
12 | - path: "api/*"
13 | linters:
14 | - lll
15 | - path: "internal/*"
16 | linters:
17 | - dupl
18 | - lll
19 | linters:
20 | disable-all: true
21 | enable:
22 | - dupl
23 | - errcheck
24 | - exportloopref
25 | - ginkgolinter
26 | - goconst
27 | - gocyclo
28 | - gofmt
29 | - goimports
30 | - gosimple
31 | - govet
32 | - ineffassign
33 | - lll
34 | - misspell
35 | - nakedret
36 | - prealloc
37 | - revive
38 | - staticcheck
39 | - typecheck
40 | - unconvert
41 | - unparam
42 | - unused
43 |
44 | linters-settings:
45 | revive:
46 | rules:
47 | - name: comment-spacings
48 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Build the manager binary
2 | FROM golang:1.24.1 AS builder
3 | ARG TARGETOS
4 | ARG TARGETARCH
5 |
6 | WORKDIR /workspace
7 | # Copy the Go Modules manifests
8 | COPY go.mod go.mod
9 | COPY go.sum go.sum
10 | # cache deps before building and copying source so that we don't need to re-download as much
11 | # and so that source changes don't invalidate our downloaded layer
12 | RUN go mod download
13 |
14 | # Copy the go source
15 | COPY cmd/main.go cmd/main.go
16 | COPY api/ api/
17 | COPY internal/ internal/
18 |
19 | # Build
20 | # GOARCH is left without a default value so that the binary is built according to the host where the command
21 | # was called. For example, if we call make docker-build in a local env on an Apple Silicon (M1) machine,
22 | # the docker BUILDPLATFORM arg will be linux/arm64, while for Apple x86 it will be linux/amd64. Therefore,
23 | # by leaving it empty we ensure that the container and the binary shipped in it have the same platform.
24 | RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o manager cmd/main.go
25 |
26 | # Use distroless as minimal base image to package the manager binary
27 | # Refer to https://github.com/GoogleContainerTools/distroless for more details
28 | FROM gcr.io/distroless/static:nonroot
29 | WORKDIR /app
30 | COPY --from=builder /workspace/manager /app/
31 | USER 65532:65532
32 |
33 | ENTRYPOINT ["/app/manager"]
34 |
--------------------------------------------------------------------------------
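For context, a hedged example of building this image as a multi-platform image; buildx supplies TARGETOS/TARGETARCH automatically, and the registry/tag below are placeholders, not values from the repo:

```bash
# Cross-build the manager image from the repo root (requires a configured buildx builder and push access).
docker buildx build \
  --platform linux/amd64,linux/arm64 \
  -t example.registry/kubeai:dev \
  --push .
```
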
/PROJECT:
--------------------------------------------------------------------------------
1 | # Code generated by tool. DO NOT EDIT.
2 | # This file is used to track the info used to scaffold your project
3 | # and allow the plugins properly work.
4 | # More info: https://book.kubebuilder.io/reference/project-config.html
5 | domain: substratus.ai
6 | layout:
7 | - go.kubebuilder.io/v4
8 | projectName: kubeai
9 | repo: github.com/substratusai/kubeai
10 | resources:
11 | - api:
12 | crdVersion: v1
13 | namespaced: true
14 | controller: true
15 | domain: substratus.ai
16 | group: kubeai
17 | kind: Model
18 | path: github.com/substratusai/kubeai/api/k8s/v1
19 | version: v1
20 | version: "3"
21 |
--------------------------------------------------------------------------------
/api/k8s/v1/groupversion_info.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2024.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | // Package v1 contains API Schema definitions for the kubeai v1 API group
18 | // +kubebuilder:object:generate=true
19 | // +groupName=kubeai.org
20 | package v1
21 |
22 | import (
23 | "k8s.io/apimachinery/pkg/runtime/schema"
24 | "sigs.k8s.io/controller-runtime/pkg/scheme"
25 | )
26 |
27 | var (
28 | // GroupVersion is group version used to register these objects
29 | GroupVersion = schema.GroupVersion{Group: "kubeai.org", Version: "v1"}
30 |
31 | // SchemeBuilder is used to add go types to the GroupVersionKind scheme
32 | SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}
33 |
34 | // AddToScheme adds the types in this group-version to the given scheme.
35 | AddToScheme = SchemeBuilder.AddToScheme
36 | )
37 |
--------------------------------------------------------------------------------
/api/k8s/v1/metadata.go:
--------------------------------------------------------------------------------
1 | package v1
2 |
3 | const (
4 | PodModelLabel = "model"
5 | // PodHashLabel is a label key used to store the hash of the Pod spec
6 | // that was used to create the Pod. This is used to determine if a Pod
7 | // needs to be recreated.
8 | PodHashLabel = "pod-hash"
9 |
10 | ModelFeatureLabelDomain = "features.kubeai.org"
11 |
12 | // ModelPodIPAnnotation is the annotation key used to specify an IP
13 | // to use for the model Pod instead of the IP address in the status of the Pod.
14 | // Use in conjunction with --allow-pod-address-override for development purposes.
15 | ModelPodIPAnnotation = "model-pod-ip"
16 | ModelPodPortAnnotation = "model-pod-port"
17 |
18 | ModelCacheEvictionFinalizer = "kubeai.org/cache-eviction"
19 | )
20 |
21 | func PVCModelAnnotation(modelName string) string {
22 | return "models.kubeai.org/" + modelName
23 | }
24 |
25 | const (
26 | PodAdapterLabelPrefix = "adapter.kubeai.org/"
27 | )
28 |
29 | func PodAdapterLabel(adapterID string) string {
30 | return PodAdapterLabelPrefix + adapterID
31 | }
32 |
--------------------------------------------------------------------------------
/api/openai/v1/utils.go:
--------------------------------------------------------------------------------
1 | package v1
2 |
3 | // firstNChars returns the first n characters of a string.
4 | // This function is needed because Go's string indexing is based on bytes, not runes.
5 | func firstNChars(s string, n int) string {
6 | runes := []rune(s)
7 | return string(runes[:min(n, len(runes))])
8 | }
9 |
10 | // Ptr is a helper function for creating an inline pointer to a constant.
11 | func Ptr[T any](v T) *T {
12 | return &v
13 | }
14 |
--------------------------------------------------------------------------------
/api/openai/v1/utils_test.go:
--------------------------------------------------------------------------------
1 | package v1
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 |
7 | "github.com/stretchr/testify/require"
8 | )
9 |
10 | func Test_firstNChars(t *testing.T) {
11 | cases := []struct {
12 | input string
13 | n int
14 | exp string
15 | }{
16 | {"", 0, ""},
17 | {"", 1, ""},
18 | {"abc", 0, ""},
19 | {"abc", 1, "a"},
20 | {"abc", 2, "ab"},
21 | {"abc", 3, "abc"},
22 | {"abc", 4, "abc"},
23 | {"世界", 1, "世"},
24 | {"世界", 2, "世界"},
25 | {"世界", 3, "世界"},
26 | }
27 | for _, c := range cases {
28 | t.Run(fmt.Sprintf("%q %d", c.input, c.n), func(t *testing.T) {
29 | require.Equal(t, c.exp, firstNChars(c.input, c.n))
30 | })
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/benchmarks/chat-py/.gitignore:
--------------------------------------------------------------------------------
1 | sharegpt_16_messages_or_more.json
2 |
--------------------------------------------------------------------------------
/benchmarks/chat-py/Dockerfile:
--------------------------------------------------------------------------------
1 | # Use a Python base image
2 | FROM python:3.10
3 |
4 | # Set the working directory
5 | WORKDIR /app
6 |
7 | # Copy requirements first to leverage Docker cache
8 | COPY requirements.txt .
9 |
10 | # Install Python dependencies
11 | RUN pip install --no-cache-dir -r requirements.txt
12 |
13 | # Copy the benchmark serving script
14 | COPY backend_request_func.py .
15 | COPY benchmark_serving.py .
16 | RUN curl -O -L https://huggingface.co/datasets/samos123/share-gpt-long-convos/resolve/main/sharegpt_16_messages_or_more.json
17 |
18 | # Set environment variables
19 | ENV PYTHONPATH=/app
20 |
21 | # Define the entrypoint command
22 | ENTRYPOINT ["python", "benchmark_serving.py"]
23 |
24 | CMD ["--dataset-name=sharegpt", "--dataset-path=sharegpt_16_messages_or_more.json"]
25 |
--------------------------------------------------------------------------------
/benchmarks/chat-py/README.md:
--------------------------------------------------------------------------------
1 | # Benchmarking Text Generation
2 |
3 | This script was adapted from the vLLM code base. The main differences are:
4 | - Load the whole conversation as prompts.
5 | - Limit the maximum number of conversations and re-use the same conversation if needed.
6 | 
7 | This allows us to verify whether prefix-aware load balancing provides a performance
8 | boost under heavy production traffic with ongoing chat conversations.
9 |
10 | ## Running
11 |
12 | Adjust the parameters in the `job.yaml` file and run the job using the following command:
13 | ```
14 | kubectl apply -f job.yaml
15 | ```
16 |
17 |
--------------------------------------------------------------------------------
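Once the Job defined below is applied, one way to follow it and collect the benchmark output (standard kubectl commands; the Job name `benchmark-serving` comes from `job.yaml`):

```bash
# Wait for the benchmark Job to finish, then read the results it prints to stdout.
kubectl wait --for=condition=complete job/benchmark-serving --timeout=2h
kubectl logs job/benchmark-serving
```
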
/benchmarks/chat-py/job.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: batch/v1
2 | kind: Job
3 | metadata:
4 | name: benchmark-serving
5 | spec:
6 | template:
7 | spec:
8 | containers:
9 | - name: benchmark-serving
10 | image: substratusai/benchmark_serving:latest
11 | args:
12 | - --base-url=http://kubeai/openai
13 | - --dataset-name=sharegpt
14 | - --dataset-path=/app/sharegpt_16_messages_or_more.json
15 | - --model=llama-3.1-8b-instruct-fp8-l4
16 | - --seed=12345
17 | - --tokenizer=neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8
18 | - --request-rate=800
19 | - --max-concurrency=800
20 | - --num-prompts=8000
21 | - --max-conversations=800
22 | restartPolicy: Never
--------------------------------------------------------------------------------
/benchmarks/chat-py/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | huggingface_hub
3 | aiohttp
4 | transformers
5 | datasets
6 | pillow
--------------------------------------------------------------------------------
/benchmarks/chat-py/vllm-direct-service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: vllm-direct
5 | labels:
6 | app: vllm-direct
7 | spec:
8 | selector:
9 | app.kubernetes.io/name: vllm
10 | ports:
11 | - name: http
12 | protocol: TCP
13 | port: 80 # The port exposed by the Service.
14 | targetPort: 8000 # The container port that your pods are listening on.
15 | type: ClusterIP
16 |
17 |
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/.dockerignore:
--------------------------------------------------------------------------------
1 | /data/raw/
2 | .venv
3 | __pycache__
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/.gitignore:
--------------------------------------------------------------------------------
1 | data/raw/*.json
2 | data/*.json
3 | /values-gke.yaml
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/.python-version:
--------------------------------------------------------------------------------
1 | 3.13
2 |
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM golang:1.23.5
2 |
3 | WORKDIR /work
4 |
5 | COPY ./go.mod .
6 | COPY ./go.sum .
7 |
8 | RUN go mod download
9 |
10 | COPY ./main.go .
11 | COPY ./benchmark ./benchmark
12 |
13 | RUN mkdir bin
14 | ENV PATH="/work/bin:$PATH"
15 | RUN go build -o bin/bench ./main.go
16 |
17 | COPY ./data ./data
18 | COPY ./example ./example
19 |
20 | ENTRYPOINT [ "bench" ]
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark
2 |
3 | ## E2E Run
4 |
5 | Build the docker image.
6 |
7 | ```bash
8 | make data
9 | make build-docker-image
10 | make push-docker-image
11 | ```
12 |
13 | Run `run.ipynb`.
14 |
15 |
16 | ## Run with Docker
17 |
18 | ### Example: Ollama (with config flags)
19 |
20 | Make sure the Ollama server is running on your machine.
21 |
22 | ```bash
23 | docker run --network=host -e OPENAI_BASE_URL=http://host.docker.internal:11434/v1 $BENCH_IMAGE \
24 | --threads ./data/tiny.json \
25 | --thread-count 4 \
26 | --request-model qwen2:0.5b \
27 | --max-concurrent-threads 2 \
28 | --max-completion-tokens 10 \
29 | --request-timeout 30s
30 | ```
31 |
32 | ### Example: OpenAI (with config file)
33 |
34 | Make sure you have set `OPENAI_API_KEY`.
35 |
36 | ```bash
37 | docker run --network=host -e OPENAI_API_KEY=$OPENAI_API_KEY -e OPENAI_BASE_URL=https://api.openai.com/v1 $BENCH_IMAGE --config ./hack/openai-config.json --threads ./data/tiny.json
38 | ```
39 |
40 |
41 | ## Run with Go
42 |
43 | Run the benchmark (against a local ollama instance).
44 |
45 | ```bash
46 | OPENAI_BASE_URL=http://localhost:11434/v1 go run . --config ./hack/ollama-config.json --threads ./data/tiny.json
47 | ```
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/go.mod:
--------------------------------------------------------------------------------
1 | module multi-turn-chat-go
2 |
3 | go 1.23.5
4 |
5 | require (
6 | github.com/davecgh/go-spew v1.1.1 // indirect
7 | github.com/pmezard/go-difflib v1.0.0 // indirect
8 | github.com/sashabaranov/go-openai v1.37.0 // indirect
9 | github.com/stretchr/testify v1.10.0 // indirect
10 | gopkg.in/yaml.v3 v3.0.1 // indirect
11 | )
12 |
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/go.sum:
--------------------------------------------------------------------------------
1 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
2 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
3 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
4 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
5 | github.com/sashabaranov/go-openai v1.37.0 h1:hQQowgYm4OXJ1Z/wTrE+XZaO20BYsL0R3uRPSpfNZkY=
6 | github.com/sashabaranov/go-openai v1.37.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
7 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
8 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
9 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
10 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
11 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
12 |
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/hack/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:24.04
2 | RUN apt-get update -y && \
3 | apt-get install -y python3 python3-pip golang
4 | RUN apt-get install -y python3.12-venv
5 |
6 | WORKDIR /work
7 | RUN python3 -m venv venv
8 | ENV PATH="/work/venv/bin:$PATH"
9 | RUN pip install pydantic fastapi 'uvicorn[standard]' transformers
10 |
11 | COPY ./go.mod .
12 | COPY ./go.sum .
13 |
14 | RUN go mod download
15 |
16 | COPY ./main.go .
17 | COPY ./benchmark ./benchmark
18 | COPY ./tokenizer ./tokenizer
19 |
20 | RUN mkdir bin
21 | ENV PATH="/work/bin:$PATH"
22 | RUN go build -o bin/bench ./main.go
23 |
24 | COPY ./data ./data
25 | COPY ./example ./example
26 |
27 | ENTRYPOINT [ "bench" ]
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/hack/bench-pod.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: bench
5 | spec:
6 | restartPolicy: Never
7 | containers:
8 | - name: bench
9 | image: us-central1-docker.pkg.dev/substratus-dev/default/benchmark-multi-turn-chat-go:v0.1.1
10 | imagePullPolicy: Always
11 | command: ["sleep", "infinity"]
12 | env:
13 | - name: OPENAI_BASE_URL
14 | value: http://kubeai/openai/v1
15 | resources:
16 | requests:
17 | cpu: 2
18 | memory: 2G
19 | limits:
20 | cpu: 2
21 | memory: 2G
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/hack/chat-template.jinja:
--------------------------------------------------------------------------------
1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
--------------------------------------------------------------------------------
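To illustrate the template above: hand-tracing it for a single user message "Hi" with `add_generation_prompt=true` yields roughly the following (a traced example, not captured output; incidental whitespace from the template's own line break is ignored):

```
<|im_start|>user
Hi<|im_end|>
<|im_start|>assistant
```
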
/benchmarks/multi-turn-chat-go/hack/kubeai-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "request_model": "deepseek-r1-1.5b-cpu",
3 | "tokenizer_model": "deepseek-ai/DeepSeek-R1",
4 | "max_concurrent_threads": 2,
5 | "max_completion_tokens": 10,
6 | "request_timeout": "180s"
7 | }
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/hack/llama-3.1-8b-instruct-fp8-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-8b-instruct-fp8-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8
9 | engine: VLLM
10 | args:
11 | - --max-model-len=16384
12 | - --max-num-batched-token=16384
13 | - --gpu-memory-utilization=0.9
14 | - --disable-log-requests
15 | resourceProfile: nvidia-gpu-l4:1
16 | minReplicas: 2
17 | maxReplicas: 2
18 |
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/hack/model.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeai.org/v1
2 | kind: Model
3 | metadata:
4 | name: qwen2-0-5b
5 | spec:
6 | features: [TextGeneration]
7 | url: ollama://qwen2:0.5b
8 | engine: OLlama
9 | resourceProfile: cpu:2
10 | minReplicas: 8
11 | maxReplicas: 8
12 |
--------------------------------------------------------------------------------
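A hedged way to exercise this Model once applied: the in-cluster base URL `http://kubeai/openai/v1` used by the bench pods in this directory implies a Service named `kubeai` on port 80, so it can be port-forwarded and queried with an OpenAI-style request; the local port is arbitrary:

```bash
kubectl apply -f model.yaml
kubectl port-forward svc/kubeai 8000:80 &

curl http://localhost:8000/openai/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen2-0-5b", "messages": [{"role": "user", "content": "Hello"}]}'
```
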
/benchmarks/multi-turn-chat-go/hack/ollama-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "request_model": "qwen2:0.5b",
3 | "max_concurrent_threads": 2,
4 | "thread_count": 4,
5 | "max_completion_tokens": 10,
6 | "request_timeout": "30s"
7 | }
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/hack/openai-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "request_model": "gpt-4o-mini",
3 | "max_concurrent_threads": 2,
4 | "thread_count": 4,
5 | "max_completion_tokens": 10,
6 | "request_timeout": "30s"
7 | }
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/hack/pod.opt-125m.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: bench
5 | spec:
6 | restartPolicy: Never
7 | containers:
8 | - name: bench
9 | image: substratusai/multi-turn-chat-go:v0.0.2
10 | imagePullPolicy: Always
11 | args:
12 | - --threads=./data/small.json
13 | - --thread-count=40
14 | - --max-concurrent-threads=10
15 | - --request-model=opt-125m-cpu
16 | - --max-completion-tokens=10
17 | - --request-timeout=6m
18 | - --no-shuffle
19 | env:
20 | - name: OPENAI_BASE_URL
21 | value: http://kubeai/openai/v1
22 | resources:
23 | requests:
24 | cpu: 4
25 | memory: 4G
26 | limits:
27 | cpu: 4
28 | memory: 4G
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/hack/pod.qwen.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: bench
5 | spec:
6 | restartPolicy: Never
7 | containers:
8 | - name: bench
9 | image: substratusai/multi-turn-chat-go:v0.0.2
10 | imagePullPolicy: Always
11 | args:
12 | - --threads=./data/small.json
13 | - --thread-count=30
14 | - --max-concurrent-threads=15
15 | - --request-model=qwen2.5-coder-1.5b-cpu
16 | - --max-completion-tokens=4
17 | - --request-timeout=6m
18 | - --no-shuffle
19 | env:
20 | - name: OPENAI_BASE_URL
21 | value: http://kubeai/openai/v1
22 | - name: HUGGING_FACE_HUB_TOKEN
23 | valueFrom:
24 | secretKeyRef:
25 | name: kubeai-huggingface
26 | key: token
27 | resources:
28 | requests:
29 | cpu: 4
30 | memory: 4G
31 | limits:
32 | cpu: 4
33 | memory: 4G
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/hack/pod.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: bench
5 | spec:
6 | restartPolicy: Never
7 | containers:
8 | - name: bench
9 | image: substratusai/multi-turn-chat-go:v0.0.2
10 | args:
11 | - --threads=./data/large.json
12 | - --thread-count=2000
13 | - --max-concurrent-threads=400
14 | - --request-model=llama-3.1-8b-instruct-fp8-l4
15 | - --max-completion-tokens=40
16 | - --request-timeout=2m
17 | env:
18 | - name: OPENAI_BASE_URL
19 | value: http://kubeai/openai/v1
20 | - name: HUGGING_FACE_HUB_TOKEN
21 | valueFrom:
22 | secretKeyRef:
23 | name: kubeai-huggingface
24 | key: token
25 | resources:
26 | requests:
27 | cpu: 4
28 | memory: 4G
29 | limits:
30 | cpu: 4
31 | memory: 4G
32 |
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/hack/podmonitor.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: monitoring.coreos.com/v1
2 | kind: PodMonitor
3 | metadata:
4 | name: kubeai-vllm
5 | spec:
6 | selector:
7 | matchLabels:
8 | app.kubernetes.io/name: vllm
9 | podMetricsEndpoints:
10 | - port: http
11 | interval: 2s
12 |
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/hack/tokenizer/tokens.py:
--------------------------------------------------------------------------------
1 | from fastapi import FastAPI
2 | from pydantic import BaseModel
3 | from transformers import AutoTokenizer
4 | import os
5 |
6 | app = FastAPI()
7 | tokenizer_model = os.environ["TOKENIZER_MODEL"]
8 | print("Tokenizer model:", tokenizer_model)
9 | # TODO: Account for model_max_length
10 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
11 |
12 | print(len(tokenizer("Your code appears to be a web application built using").input_ids))
13 |
14 |
15 | class TextInput(BaseModel):
16 | text: str
17 |
18 |
19 | @app.get("/healthz")
20 | def healthz():
21 | return {"status": "ok"}
22 |
23 |
24 | @app.post("/tokens")
25 | def count_tokens(data: TextInput):
26 | # Tokenize text
27 | input_ids = tokenizer(data.text).input_ids
28 | # Count the number of tokens
29 | num_tokens = len(input_ids)
30 | return {"num_tokens": num_tokens}
31 |
--------------------------------------------------------------------------------
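A minimal local smoke test of this tokenizer sidecar, assuming uvicorn is installed; the model name and port are illustrative choices, not taken from the repo:

```bash
# Serve the FastAPI app defined in tokens.py.
TOKENIZER_MODEL=Qwen/Qwen2-0.5B-Instruct uvicorn tokens:app --port 8000 &

# Count tokens for a string via the /tokens endpoint.
curl -s http://localhost:8000/tokens \
  -H "Content-Type: application/json" \
  -d '{"text": "Your code appears to be a web application built using"}'
# Expected shape: {"num_tokens": <count>}
```
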
/benchmarks/multi-turn-chat-go/hack/vllm.Dockerfile:
--------------------------------------------------------------------------------
1 | FROM docker.io/substratusai/vllm:v0.6.3.post1-cpu
2 | COPY ./example/chat-template.jinja /tmp
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "multi-turn-chat"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.13"
7 | dependencies = [
8 | "kubernetes>=32.0.1",
9 | "matplotlib>=3.10.0",
10 | ]
11 |
12 | [dependency-groups]
13 | dev = [
14 | "ipykernel>=6.29.5",
15 | "jupyterlab>=4.3.5",
16 | ]
17 |
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/runs/llama-3.1-8x-l4/itl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/benchmarks/multi-turn-chat-go/runs/llama-3.1-8x-l4/itl.png
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/runs/llama-3.1-8x-l4/throughput.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/benchmarks/multi-turn-chat-go/runs/llama-3.1-8x-l4/throughput.png
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-go/runs/llama-3.1-8x-l4/ttft.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/benchmarks/multi-turn-chat-go/runs/llama-3.1-8x-l4/ttft.png
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-k6/.dockerignore:
--------------------------------------------------------------------------------
1 | data/ShareGPT_V3_unfiltered_cleaned_split.json
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-k6/.gitignore:
--------------------------------------------------------------------------------
1 | data/*.json
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-k6/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:20.04
2 |
3 | RUN apt-get update && apt-get install -y build-essential make python3 wget vim
4 |
5 | # Install k6 binary.
6 | ENV K6_VERSION=v0.55.0
7 | RUN wget https://github.com/grafana/k6/releases/download/${K6_VERSION}/k6-${K6_VERSION}-linux-amd64.tar.gz && tar -zxvf k6-${K6_VERSION}-linux-amd64.tar.gz && mv k6-${K6_VERSION}-linux-amd64/k6 /usr/local/bin && rm k6-${K6_VERSION}-linux-amd64.tar.gz
8 |
9 | WORKDIR /work
10 |
11 | COPY ./k6.js .
12 | COPY ./Makefile .
13 | COPY ./data ./data
14 | COPY ./scenarios ./scenarios
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-k6/Makefile:
--------------------------------------------------------------------------------
1 | data/ShareGPT_V3_unfiltered_cleaned_split.json:
2 | cd data && wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
3 |
4 | .PHONY: data
5 | data: data/ShareGPT_V3_unfiltered_cleaned_split.json
6 | cd data && python prepare-message-threads.py
7 |
8 | run:
9 | ls scenarios/${SCENARIO}
10 | CONFIG_DIR=scenarios/${SCENARIO} DATA_DIR=data MODEL_ADDR=kubeai/openai k6 run ./k6.js
--------------------------------------------------------------------------------
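As a usage sketch, a run against one of the scenario directories listed in the tree above could look like this; note the `run` target points MODEL_ADDR at the in-cluster `kubeai` Service, so it is normally executed from inside the benchmark pod:

```bash
# Download the ShareGPT dataset and build message-threads.json.
make data

# Execute the k6 benchmark for a specific scenario.
make run SCENARIO=least-load-vs-prefix-hash
```
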
/benchmarks/multi-turn-chat-k6/data/prepare-message-threads.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 |
4 | def main():
5 | with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
6 | data = json.load(f)
7 |
8 |     # Select a subset of the first conversations that start with a human.
9 | max = 2000
10 | output = []
11 | for entry in data:
12 | conv = entry.get("conversations")
13 | if conv and conv[0]["from"] == "human" and len(conv[0]["value"]) != 0:
14 | # Filter the conversation to only include messages from a human using a for loop.
15 | # entry["userMessages"] = [c["value"] for c in conv if c["from"] == "human"]
16 | totalContentLength = 0
17 | userMessages = []
18 | for c in conv:
19 | if c["from"] == "human":
20 | content = c["value"]
21 | userMessages.append(content)
22 | totalContentLength += len(content)
23 |
24 | if totalContentLength < 2500:
25 | continue
26 |
27 | if len(userMessages) < 5:
28 | continue
29 |
30 | # Delete the original conversation
31 | entry["userMessages"] = userMessages
32 | del entry["conversations"]
33 | output.append(entry)
34 |
35 | if len(output) >= max:
36 | break
37 |
38 | with open("./message-threads.json", "w") as f:
39 | data = json.dump(output, f, indent=4)
40 |
41 |
42 | if __name__ == "__main__":
43 | main()
44 |
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash-70b-8r/base-request.json:
--------------------------------------------------------------------------------
1 | {
2 | "model": "llama-3.1-70b-instruct-fp8-h100",
3 | "max_tokens": 10,
4 | "temperature": 0,
5 | "messages": []
6 | }
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash-70b-8r/k6.json:
--------------------------------------------------------------------------------
1 | {
2 | "thresholds": {
3 | "http_req_failed": [
4 | "rate==0"
5 | ]
6 | },
7 | "scenarios": {
8 | "chat": {
9 | "executor": "shared-iterations",
10 | "vus": 320,
11 | "iterations": 1000,
12 | "maxDuration": "600s"
13 | }
14 | }
15 | }
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash-70b-8r/model.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeai.org/v1
2 | kind: Model
3 | metadata:
4 | name: llama-3.1-70b-instruct-fp8-h100
5 | spec:
6 | features: [TextGeneration]
7 | url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
8 | engine: VLLM
9 | args:
10 | - --enable-prefix-caching
11 | - --max-model-len=16384
12 | - --max-num-batched-token=16384
13 | - --gpu-memory-utilization=0.95
14 | - --disable-log-requests
15 | - --kv-cache-dtype=fp8
16 | resourceProfile: nvidia-gpu-h100:1
17 | minReplicas: 8
18 | maxReplicas: 8
19 |
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash-70b-8r/pod.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: chat-benchmark
5 | spec:
6 | restartPolicy: Never
7 | containers:
8 | - name: bench
9 | image: $IMG
10 | command: ["sleep", "infinity"]
11 | resources:
12 | requests:
13 | cpu: 6
14 | ephemeral-storage: 10Gi
15 | memory: 24Gi
16 | limits:
17 | cpu: 6
18 | ephemeral-storage: 10Gi
19 | memory: 24Gi
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash/base-request.json:
--------------------------------------------------------------------------------
1 | {
2 | "model": "llama-3.1-8b-instruct-fp8-l4",
3 | "max_tokens": 10,
4 | "temperature": 0,
5 | "messages": []
6 | }
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash/k6.json:
--------------------------------------------------------------------------------
1 | {
2 | "thresholds": {
3 | "http_req_failed": [
4 | "rate==0"
5 | ]
6 | },
7 | "scenarios": {
8 | "chat": {
9 | "executor": "shared-iterations",
10 | "vus": 80,
11 | "iterations": 1000,
12 | "maxDuration": "600s"
13 | }
14 | }
15 | }
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash/model.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeai.org/v1
2 | kind: Model
3 | metadata:
4 | name: llama-3.1-8b-instruct-fp8-l4
5 | spec:
6 | features: [TextGeneration]
7 | url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8
8 | engine: VLLM
9 | args:
10 | - --enable-prefix-caching
11 | - --max-model-len=16384
12 | - --max-num-batched-token=16384
13 | - --gpu-memory-utilization=0.6
14 | - --disable-log-requests
15 | resourceProfile: nvidia-gpu-l4:1
16 | minReplicas: 2
17 | maxReplicas: 2
18 |
--------------------------------------------------------------------------------
/benchmarks/multi-turn-chat-k6/scenarios/least-load-vs-prefix-hash/pod.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: chat-benchmark
5 | spec:
6 | restartPolicy: Never
7 | containers:
8 | - name: bench
9 | image: us-central1-docker.pkg.dev/substratus-dev/default/kubeai-benchmark-chat:v0.0.2
10 | command: ["sleep", "infinity"]
11 | resources:
12 | requests:
13 | cpu: 6
14 | ephemeral-storage: 10Gi
15 | memory: 24Gi
16 | limits:
17 | cpu: 6
18 | ephemeral-storage: 10Gi
19 | memory: 24Gi
--------------------------------------------------------------------------------
/charts/.gitignore:
--------------------------------------------------------------------------------
1 | charts/
2 |
--------------------------------------------------------------------------------
/charts/kubeai/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 |
--------------------------------------------------------------------------------
/charts/kubeai/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: kubeai
3 | description: Private Open AI Platform for Kubernetes.
4 |
5 | type: application
6 |
7 | # This is the chart version. This version number should be incremented each time you make changes
8 | # to the chart and its templates, including the app version.
9 | # Versions are expected to follow Semantic Versioning (https://semver.org/)
10 | version: 0.21.0
11 |
12 | # This is the version number of the application being deployed. This version number should be
13 | # incremented each time you make changes to the application. Versions are not expected to
14 | # follow Semantic Versioning. They should reflect the version the application is using.
15 | # It is recommended to use it with quotes.
16 | appVersion: "v0.21.0"
17 |
18 | dependencies:
19 | # Open Web UI is an open source ChatGPT-like user interface.
20 | # https://docs.openwebui.com/
21 | - name: open-webui
22 | condition: open-webui.enabled
23 | repository: https://helm.openwebui.com/
24 | version: 6.4.0
25 |
26 | keywords: ["LLM", "AI"]
27 |
28 | # TODO replace with kubeai.org once live
29 | home: https://www.substratus.ai
30 |
31 | maintainers:
32 | - name: nstogner
33 | url: https://www.linkedin.com/in/nstogner/
34 | - name: samos123
35 | email: sammiestoel@gmail.com
36 | url: https://www.linkedin.com/in/samstoelinga/
37 |
--------------------------------------------------------------------------------
/charts/kubeai/templates/autoscalerstateconfigmap.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 | name: {{ include "models.autoscalerStateConfigMapName" . }}
--------------------------------------------------------------------------------
/charts/kubeai/templates/aws-secret.yaml:
--------------------------------------------------------------------------------
1 | {{- if and .Values.secrets.aws.create (and (not (empty .Values.secrets.aws.accessKeyID)) (not (empty .Values.secrets.aws.secretAccessKey))) }}
2 | apiVersion: v1
3 | kind: Secret
4 | metadata:
5 | name: {{ include "kubeai.awsSecretName" . }}
6 | labels:
7 | {{- include "kubeai.labels" . | nindent 4 }}
8 | data:
9 | accessKeyID: {{ .Values.secrets.aws.accessKeyID | b64enc }}
10 | secretAccessKey: {{ .Values.secrets.aws.secretAccessKey | b64enc }}
11 | {{- end }}
12 |
--------------------------------------------------------------------------------
/charts/kubeai/templates/configmap.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 | name: {{ include "kubeai.fullname" . }}-config
5 | labels:
6 | {{- include "kubeai.labels" . | nindent 4 }}
7 | data:
8 | system.yaml: |
9 | secretNames:
10 | alibaba: {{ include "kubeai.alibabaSecretName" . }}
11 | aws: {{ include "kubeai.awsSecretName" . }}
12 | gcp: {{ include "kubeai.gcpSecretName" . }}
13 | huggingface: {{ include "kubeai.huggingfaceSecretName" . }}
14 | resourceProfiles:
15 | {{- .Values.resourceProfiles | toYaml | nindent 6 }}
16 | cacheProfiles:
17 | {{- .Values.cacheProfiles | toYaml | nindent 6 }}
18 | modelServers:
19 | {{- .Values.modelServers | toYaml | nindent 6 }}
20 | modelLoading:
21 | {{- .Values.modelLoading | toYaml | nindent 6 }}
22 | modelRollouts:
23 | {{- .Values.modelRollouts | toYaml | nindent 6 }}
24 | modelServerPods:
25 | {{- if .Values.modelServerPods }}
26 | {{- if .Values.modelServerPods.podSecurityContext }}
27 | podSecurityContext:
28 | {{- .Values.modelServerPods.podSecurityContext | toYaml | nindent 8}}
29 | {{- end}}
30 | {{- if .Values.modelServerPods.jsonPatches }}
31 | jsonPatches:
32 | {{- .Values.modelServerPods.jsonPatches | toYaml | nindent 8}}
33 | {{- end}}
34 | {{- if .Values.modelServerPods.securityContext }}
35 | securityContext:
36 | {{- .Values.modelServerPods.securityContext | toYaml | nindent 8}}
37 | {{- end}}
38 | {{- if .Values.imagePullSecrets }}
39 | imagePullSecrets:
40 | {{- toYaml .Values.imagePullSecrets | nindent 8}}
41 | {{- end}}
42 | {{- end}}
43 | serviceAccountName: {{ include "models.serviceAccountName" . }}
44 | modelAutoscaling:
45 | interval: {{ .Values.modelAutoscaling.interval }}
46 | timeWindow: {{ .Values.modelAutoscaling.timeWindow }}
47 | stateConfigMapName: {{ include "models.autoscalerStateConfigMapName" . }}
48 | messaging:
49 | {{- .Values.messaging | toYaml | nindent 6 }}
50 |
--------------------------------------------------------------------------------
/charts/kubeai/templates/huggingface-secret.yaml:
--------------------------------------------------------------------------------
1 | # Only create the secret if the token is not empty.
2 | # See: https://github.com/substratusai/kubeai/issues/232
3 | {{- if and .Values.secrets.huggingface.create (not (empty .Values.secrets.huggingface.token)) }}
4 | apiVersion: v1
5 | kind: Secret
6 | metadata:
7 | name: {{ include "kubeai.huggingfaceSecretName" . }}
8 | labels:
9 | {{- include "kubeai.labels" . | nindent 4 }}
10 | data:
11 | token: {{ .Values.secrets.huggingface.token | b64enc }}
12 | {{- end }}
--------------------------------------------------------------------------------
/charts/kubeai/templates/rolebinding.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: RoleBinding
3 | metadata:
4 | name: {{ include "kubeai.fullname" . }}
5 | labels:
6 | {{- include "kubeai.labels" . | nindent 4 }}
7 | roleRef:
8 | apiGroup: rbac.authorization.k8s.io
9 | kind: Role
10 | name: {{ include "kubeai.fullname" . }}
11 | subjects:
12 | - kind: ServiceAccount
13 | name: {{ include "kubeai.serviceAccountName" . }}
--------------------------------------------------------------------------------
/charts/kubeai/templates/securityContextConstraints.yaml:
--------------------------------------------------------------------------------
1 | # Create securityContextConstraints for the model pods if running on OpenShift.
2 | # This is needed because pods in OpenShift run with the restricted-v2 SCC by
3 | # default, which does not allow a container to start with uid=0
4 | # (The model pod images run as the root user)
5 | {{- if .Capabilities.APIVersions.Has "security.openshift.io/v1" }}
6 | apiVersion: security.openshift.io/v1
7 | kind: SecurityContextConstraints
8 | metadata:
9 | name: {{ include "kubeai.fullname" . }}-models
10 | allowPrivilegeEscalation: false
11 | readOnlyRootFilesystem: false
12 | runAsUser:
13 | type: RunAsAny
14 | seLinuxContext:
15 | type: MustRunAs
16 | seccompProfiles:
17 | - runtime/default
18 | requiredDropCapabilities:
19 | - ALL
20 | users:
21 | - system:serviceaccount:{{ .Release.Namespace }}:{{ include "models.serviceAccountName" . }}
22 | {{- end }}
23 |
--------------------------------------------------------------------------------
/charts/kubeai/templates/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: {{ include "kubeai.fullname" . }}
5 | labels:
6 | {{- include "kubeai.labels" . | nindent 4 }}
7 | {{- with .Values.service.annotations }}
8 | annotations:
9 | {{- toYaml . | nindent 4 }}
10 | {{- end }}
11 | spec:
12 | type: {{ .Values.service.type }}
13 | ports:
14 | - name: http
15 | port: {{ .Values.service.port }}
16 | targetPort: http
17 | protocol: TCP
18 | {{- with .Values.service.nodePort }}
19 | nodePort: {{ . }}
20 | {{- end }}
21 | selector:
22 | {{- include "kubeai.selectorLabels" . | nindent 4 }}
23 |
--------------------------------------------------------------------------------
/charts/kubeai/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.serviceAccount.create -}}
2 | apiVersion: v1
3 | kind: ServiceAccount
4 | metadata:
5 | name: {{ include "kubeai.serviceAccountName" . }}
6 | labels:
7 | {{- include "kubeai.labels" . | nindent 4 }}
8 | {{- with .Values.serviceAccount.annotations }}
9 | annotations:
10 | {{- toYaml . | nindent 4 }}
11 | {{- end }}
12 | automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
13 | {{- end }}
14 | {{- if .Values.modelServiceAccount.create }}
15 | ---
16 | apiVersion: v1
17 | kind: ServiceAccount
18 | metadata:
19 | name: {{ include "models.serviceAccountName" . }}
20 | labels:
21 | {{- include "kubeai.labels" . | nindent 4 }}
22 | {{- with .Values.modelServiceAccount.annotations }}
23 | annotations:
24 | {{- toYaml . | nindent 4 }}
25 | {{- end }}
26 | automountServiceAccountToken: {{ .Values.modelServiceAccount.automount }}
27 | {{- end }}
28 |
--------------------------------------------------------------------------------
/charts/kubeai/templates/vllm-pod-monitor.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.metrics.prometheusOperator.vLLMPodMonitor.enabled }}
2 | apiVersion: {{ .Values.metrics.prometheusOperator.vLLMPodMonitor.apiVersion }}
3 | kind: PodMonitor
4 | metadata:
5 | name: {{ include "kubeai.fullname" . }}-vllm
6 | labels:
7 | {{- include "kubeai.labels" . | nindent 4 }}
8 | {{- with .Values.metrics.prometheusOperator.vLLMPodMonitor.labels }}
9 | {{- toYaml . | nindent 4 }}
10 | {{- end }}
11 | spec:
12 | selector:
13 | matchLabels:
14 | app.kubernetes.io/name: vllm
15 | podMetricsEndpoints:
16 | - port: http
17 | {{- end }}
18 |
--------------------------------------------------------------------------------
/charts/kubeai/values-amd-gpu-device-plugin.yaml:
--------------------------------------------------------------------------------
1 | resourceProfiles:
2 | amd-gpu-mi300x:
3 | nodeSelector:
4 | # Source: https://gitlab.freedesktop.org/mesa/drm/-/blob/main/data/amdgpu.ids#L569
5 | amd.com/gpu.device-id: 74a1
6 | amd.com/gpu.vram: "192G"
7 | amd.com/gpu.family: "AI"
8 |
--------------------------------------------------------------------------------
/charts/kubeai/values-eks.yaml:
--------------------------------------------------------------------------------
1 | resourceProfiles:
2 | nvidia-gpu-l4:
3 | nodeSelector:
4 | karpenter.k8s.aws/instance-gpu-name: "l4"
5 | nvidia-gpu-l40s:
6 | nodeSelector:
7 | karpenter.k8s.aws/instance-gpu-name: "l40s"
8 | nvidia-gpu-h100:
9 | nodeSelector:
10 | karpenter.k8s.aws/instance-gpu-name: "h100"
11 | nvidia-gpu-a100-80gb:
12 | nodeSelector:
13 | karpenter.k8s.aws/instance-gpu-name: "a100"
14 | karpenter.k8s.aws/instance-gpu-memory: "81920"
15 | nvidia-gpu-a100-40gb:
16 | nodeSelector:
17 | karpenter.k8s.aws/instance-gpu-name: "a100"
18 | karpenter.k8s.aws/instance-gpu-memory: "40960"
19 |
20 | cacheProfiles:
21 | efs-dynamic:
22 | sharedFilesystem:
23 | storageClassName: "efs-sc"
24 | efs-static:
25 | sharedFilesystem:
26 | persistentVolumeName: "efs-pv"
--------------------------------------------------------------------------------
/charts/kubeai/values-gke.yaml:
--------------------------------------------------------------------------------
1 | resourceProfiles:
2 | nvidia-gpu-l4:
3 | nodeSelector:
4 | cloud.google.com/gke-accelerator: "nvidia-l4"
5 | cloud.google.com/gke-spot: "true"
6 | nvidia-gpu-h100:
7 | nodeSelector:
8 | cloud.google.com/gke-accelerator: "nvidia-h100-80gb"
9 | cloud.google.com/gke-spot: "true"
10 | nvidia-gpu-a100-80gb:
11 | nodeSelector:
12 | cloud.google.com/gke-accelerator: "nvidia-a100-80gb"
13 | cloud.google.com/gke-spot: "true"
14 | nvidia-gpu-a100-40gb:
15 | nodeSelector:
16 | cloud.google.com/gke-accelerator: "nvidia-tesla-a100"
17 | cloud.google.com/gke-spot: "true"
18 | google-tpu-v5e-1x1:
19 | imageName: google-tpu
20 | limits:
21 | google.com/tpu: 1
22 | nodeSelector:
23 | cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
24 | cloud.google.com/gke-tpu-topology: "1x1"
25 | cloud.google.com/gke-spot: "true"
26 | google-tpu-v5e-2x2:
27 | imageName: google-tpu
28 | limits:
29 | google.com/tpu: 1
30 | nodeSelector:
31 | cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
32 | cloud.google.com/gke-tpu-topology: "2x2"
33 | cloud.google.com/gke-spot: "true"
34 | google-tpu-v5e-2x4:
35 | imageName: google-tpu
36 | limits:
37 | google.com/tpu: 1
38 | nodeSelector:
39 | cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
40 | cloud.google.com/gke-tpu-topology: "2x4"
41 | cloud.google.com/gke-spot: "true"
42 |
43 | cacheProfiles:
44 | standard-filestore:
45 | sharedFilesystem:
46 | storageClassName: "standard-rwx"
47 | premium-filestore:
48 | sharedFilesystem:
49 | storageClassName: "premium-rwx"
--------------------------------------------------------------------------------
/charts/kubeai/values-nvidia-k8s-device-plugin.yaml:
--------------------------------------------------------------------------------
1 | resourceProfiles:
2 | nvidia-gpu-a16:
3 | nodeSelector:
4 | nvidia.com/gpu.family: "ampere"
5 | nvidia.com/gpu.memory: "16384"
6 | nvidia-gpu-l4:
7 | nodeSelector:
8 | nvidia.com/gpu.family: "ada-lovelace"
9 | nvidia.com/gpu.memory: "23034"
10 | nvidia-gpu-h100:
11 | nodeSelector:
12 | nvidia.com/gpu.family: "hopper"
13 | nvidia.com/gpu.memory: "81920"
14 | nvidia-gpu-gh200:
15 | nodeSelector:
16 | nvidia.com/gpu.family: "hopper"
17 | nvidia.com/gpu.memory: "97871"
18 | nvidia-gpu-a100-80gb:
19 | nodeSelector:
20 | nvidia.com/gpu.family: "ampere"
21 | nvidia.com/gpu.memory: "81920"
22 | nvidia-gpu-a100-40gb:
23 | nodeSelector:
24 | nvidia.com/gpu.family: "ampere"
25 | nvidia.com/gpu.memory: "40960"
26 | nvidia-gpu-rtx4070-8gb:
27 | nodeSelector:
28 | nvidia.com/gpu.family: "ampere"
29 | nvidia.com/gpu.memory: "8188"
30 | nvidia-gpu-rtx4090-24gb:
31 | nodeSelector:
32 | nvidia.com/gpu.family: "ampere"
33 | nvidia.com/gpu.memory: "24564"
34 |
--------------------------------------------------------------------------------
/charts/models/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 |
--------------------------------------------------------------------------------
/charts/models/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: models
3 | description: A Helm chart for Kubernetes
4 |
5 | # A chart can be either an 'application' or a 'library' chart.
6 | #
7 | # Application charts are a collection of templates that can be packaged into versioned archives
8 | # to be deployed.
9 | #
10 | # Library charts provide useful utilities or functions for the chart developer. They're included as
11 | # a dependency of application charts to inject those utilities and functions into the rendering
12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed.
13 | type: application
14 |
15 | # This is the chart version. This version number should be incremented each time you make changes
16 | # to the chart and its templates, including the app version.
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/)
18 | version: 0.21.0
19 |
20 | # This is the version number of the application being deployed. This version number should be
21 | # incremented each time you make changes to the application. Versions are not expected to
22 | # follow Semantic Versioning. They should reflect the version the application is using.
23 | # It is recommended to use it with quotes.
24 | appVersion: "1.16.0"
25 |
26 | maintainers:
27 | - name: nstogner
28 | url: https://www.linkedin.com/in/nstogner/
29 | - name: samos123
30 | email: sammiestoel@gmail.com
31 | url: https://www.linkedin.com/in/samstoelinga/
32 |
--------------------------------------------------------------------------------
/charts/models/templates/models.yaml:
--------------------------------------------------------------------------------
1 | {{- range $name, $model := .Values.catalog}}
2 | {{- if or $model.enabled $.Values.all.enabled }}
3 | ---
4 | apiVersion: kubeai.org/v1
5 | kind: Model
6 | metadata:
7 | name: {{ $name }}
8 | {{- with $model.labels }}
9 | labels:
10 | {{- toYaml . | nindent 4 }}
11 | {{- end }}
12 | spec:
13 | features: {{ $model.features }}
14 | {{- with $model.owner }}
15 | owner: {{ . }}
16 | {{- end }}
17 | url: {{ $model.url }}
18 | {{- with $model.adapters }}
19 | adapters:
20 | {{- toYaml . | nindent 4 }}
21 | {{- end }}
22 | {{- with $model.engine }}
23 | engine: {{ . }}
24 | {{- end }}
25 | {{- with $model.args }}
26 | args:
27 | {{- toYaml . | nindent 4 }}
28 | {{- end }}
29 | {{- with $model.env }}
30 | env:
31 | {{- toYaml . | nindent 4 }}
32 | {{- end }}
33 | minReplicas: {{ default 0 $model.minReplicas }}
34 | {{- with $model.maxReplicas }}
35 | maxReplicas: {{ . }}
36 | {{- end}}
37 | {{- with $model.targetRequests }}
38 | targetRequests: {{ . }}
39 | {{- end}}
40 | {{- with $model.scaleDownDelaySeconds }}
41 | scaleDownDelaySeconds: {{ . }}
42 | {{- end}}
43 | {{- with $model.resourceProfile }}
44 | resourceProfile: {{ . }}
45 | {{- end}}
46 | {{- with $model.cacheProfile }}
47 | cacheProfile: {{ . }}
48 | {{- end}}
49 | {{- with $model.files }}
50 | files:
51 | {{- toYaml . | nindent 4 }}
52 | {{- end }}
53 | {{- end}}
54 | {{- end}}
--------------------------------------------------------------------------------
/cmd/main.go:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2024.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | package main
18 |
19 | import (
20 | "flag"
21 | "os"
22 |
23 | "github.com/substratusai/kubeai/internal/manager"
24 | ctrl "sigs.k8s.io/controller-runtime"
25 | "sigs.k8s.io/controller-runtime/pkg/log/zap"
26 | )
27 |
28 | func main() {
29 | // Flag parsing can cause a panic if done inside of command.Run() and called in a goroutine (as in tests).
30 | // So we parse flags here.
31 | opts := zap.Options{
32 | Development: true,
33 | }
34 | opts.BindFlags(flag.CommandLine)
35 | flag.Parse()
36 | ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
37 |
38 | configPath := os.Getenv("CONFIG_PATH")
39 | if configPath == "" {
40 | configPath = "./config.yaml"
41 | }
42 |
43 | sysCfg, err := manager.LoadConfigFile(configPath)
44 | if err != nil {
45 | manager.Log.Error(err, "failed to load config file", "path", configPath)
46 | os.Exit(1)
47 | }
48 |
49 | if err := manager.Run(ctrl.SetupSignalHandler(), ctrl.GetConfigOrDie(), sysCfg); err != nil {
50 | manager.Log.Error(err, "failed to run command")
51 | os.Exit(1)
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/components/model-loader/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM alpine:3.20
2 |
3 | # Common
4 | # * unzip (full version) needed for installing ossutil
5 | RUN apk add --no-cache curl unzip tar
6 |
7 | # Python
8 | ENV PYTHONUNBUFFERED=1
9 | RUN apk add --no-cache python3 py3-pip pipx bash && rm -rf ~/.cache/* /usr/local/share/man /tmp/*
10 | # Location where pipx installs executables:
11 | ENV PATH="/root/.local/bin:$PATH"
12 |
13 | # Hugging Face ("hf://")
14 | RUN pipx install huggingface_hub
15 | RUN huggingface-cli version
16 |
17 | # AWS S3 ("s3://")
18 | RUN pipx install awscli
19 | RUN aws --version
20 |
21 | # Determine architecture.
22 | RUN if [ `uname -m` = 'x86_64' ]; then echo -n "x86_64" > /tmp/arch; else echo -n "arm" > /tmp/arch; fi;
23 |
24 | # Google Cloud Storage ("gs://")
25 | RUN ARCH=`cat /tmp/arch` && curl -OL https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-$ARCH.tar.gz
26 | RUN ls
27 | RUN ARCH=`cat /tmp/arch` && tar -xf google-cloud-cli-linux-$ARCH.tar.gz
28 | RUN ./google-cloud-sdk/install.sh --usage-reporting=false
29 | ENV PATH="/google-cloud-sdk/bin:$PATH"
30 | RUN gcloud config set component_manager/disable_update_check true
31 | RUN gcloud --version
32 |
33 | # Alibaba Object Storage Service ("oss://")
34 | RUN wget -O - https://gosspublic.alicdn.com/ossutil/install.sh | bash
35 | RUN ossutil --version
36 |
37 | # Loader script
38 | COPY ./load.sh /bin/load
39 | RUN chmod +x /bin/load
40 | ENTRYPOINT ["/bin/load"]
--------------------------------------------------------------------------------
/components/model-loader/load.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -euxo pipefail
4 |
5 | src=$1
6 | dest=$2
7 |
8 | # If dest is a local directory, download the model to that directory.
9 | # Otherwise, download to a temporary directory and upload from there.
10 | dest_type=""
11 | if [[ $dest == *"://"* ]]; then
12 | dir=$(mktemp -d)
13 | dest_type="url"
14 | else
15 | dir=$dest
16 | dest_type="dir"
17 | mkdir -p $dir
18 | fi
19 |
20 | # Download
21 | case $src in
22 | "hf://"*)
23 | repo=${src#hf://}
24 | huggingface-cli download --local-dir $dir $repo
25 | rm -rf $dir/.cache
26 | ;;
27 | "s3://"*)
28 | aws s3 sync $src $dir
29 | ;;
30 | "gs://"*)
31 | gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS
32 | gcloud storage rsync $src $dir
33 | ;;
34 | "oss://"*)
35 | ossutil sync $src $dir
36 | ;;
37 | *)
38 | echo "Unsupported source url: $src"
39 | exit 1
40 | ;;
41 | esac
42 |
43 | # Upload
44 | if [[ $dest_type == "url" ]]; then
45 | case $dest in
46 | "hf://"*)
47 | repo=${dest#hf://}
48 | huggingface-cli upload $repo $dir
49 | ;;
50 | "s3://"*)
51 | aws s3 sync $dir $dest
52 | ;;
53 | "gs://"*)
54 | gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS
55 | gcloud storage rsync $dir $dest
56 | ;;
57 | "oss://"*)
58 | ossutil sync $dir $dest
59 | ;;
60 | *)
61 | echo "Unsupported destination url: $dest"
62 | exit 1
63 | ;;
64 | esac
65 | fi
--------------------------------------------------------------------------------
/docs/CNAME:
--------------------------------------------------------------------------------
1 | www.kubeai.org
2 |
--------------------------------------------------------------------------------
/docs/benchmarks/llama-3.2-11b-vision.md:
--------------------------------------------------------------------------------
1 | # Llama 3.2 11B Vision Instruct vLLM Benchmarks
2 |
3 |
4 | Single L4 GPU vLLM 0.6.2
5 | ```
6 | python3 benchmark_serving.py --backend openai \
7 | --base-url http://localhost:8000/openai \
8 | --dataset-name=sharegpt --dataset-path=ShareGPT_V3_unfiltered_cleaned_split.json \
9 | --model meta-llama-3.2-11b-vision-instruct \
10 | --seed 12345 --tokenizer neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic
11 | ============ Serving Benchmark Result ============
12 | Successful requests: 1000
13 | Benchmark duration (s): 681.93
14 | Total input tokens: 230969
15 | Total generated tokens: 194523
16 | Request throughput (req/s): 1.47
17 | Output token throughput (tok/s): 285.25
18 | Total Token throughput (tok/s): 623.95
19 | ---------------Time to First Token----------------
20 | Mean TTFT (ms): 319146.12
21 | Median TTFT (ms): 322707.98
22 | P99 TTFT (ms): 642512.79
23 | -----Time per Output Token (excl. 1st token)------
24 | Mean TPOT (ms): 54.84
25 | Median TPOT (ms): 53.66
26 | P99 TPOT (ms): 83.75
27 | ---------------Inter-token Latency----------------
28 | Mean ITL (ms): 54.09
29 | Median ITL (ms): 47.44
30 | P99 ITL (ms): 216.77
31 | ==================================================
32 | ```
--------------------------------------------------------------------------------
/docs/benchmarks/prefix-aware-load-balancing-mean-ttft.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/benchmarks/prefix-aware-load-balancing-mean-ttft.png
--------------------------------------------------------------------------------
/docs/benchmarks/prefix-aware-load-balancing-throughput.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/benchmarks/prefix-aware-load-balancing-throughput.png
--------------------------------------------------------------------------------
/docs/blog/.authors.yml:
--------------------------------------------------------------------------------
1 | authors:
2 | nstogner:
3 | name: Nick Stogner
4 | description: Creator
5 | url: https://www.linkedin.com/in/nstogner/
6 | avatar: https://avatars.githubusercontent.com/u/10274189
7 | samstoelinga:
8 | name: Sam Stoelinga
9 | description: Creator
10 | url: https://www.linkedin.com/in/samstoelinga/
11 | avatar: https://avatars.githubusercontent.com/u/388784
--------------------------------------------------------------------------------
/docs/blog/index.md:
--------------------------------------------------------------------------------
1 | # Recent
2 |
3 |
--------------------------------------------------------------------------------
/docs/concepts/autoscaling.md:
--------------------------------------------------------------------------------
1 | # Autoscaling
2 |
3 | KubeAI proxies HTTP and messaging (e.g. Kafka) requests and messages to models. It will adjust the number of Pods serving a given model based on the average number of active requests. If no Pods are running when a request comes in, KubeAI will hold the request, scale up a Pod, and forward the request when the Pod is ready. This process happens in a manner that is transparent to the end client (other than the added delay from a cold-start).
4 |
5 |
6 |
7 |
8 | ## Next
9 |
10 | Read about [how to configure autoscaling](../how-to/configure-autoscaling.md).
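11 |
12 | For a quick preview of the fields that guide covers, here is a minimal sketch of a Model's autoscaling settings (the model name and values are illustrative, not recommendations):
13 |
14 | ```yaml
15 | apiVersion: kubeai.org/v1
16 | kind: Model
17 | metadata:
18 |   name: example-model           # hypothetical name
19 | spec:
20 |   # ...
21 |   minReplicas: 0                # scale to zero when idle; requests are held during cold-start
22 |   maxReplicas: 3
23 |   targetRequests: 100           # desired number of active requests per replica
24 |   scaleDownDelaySeconds: 30
25 | ```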
--------------------------------------------------------------------------------
/docs/concepts/backend-servers.md:
--------------------------------------------------------------------------------
1 | # Backend Servers
2 |
3 | KubeAI serves ML models by launching Pods on Kubernetes. The configuration and lifecycle of these Pods are managed by the KubeAI controller. Every model server Pod loads exactly one model on startup.
4 |
5 | In a Model manifest you can define what server to use for inference (`VLLM`, `OLlama`). Any model-specific settings can be passed to the server process via the `args` and `env` fields.
6 |
7 | ## Next
8 |
9 | Read about [how to install models](../how-to/install-models.md).
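10 |
11 | For a quick preview, here is a hedged sketch of how the engine, `args`, and `env` fields fit into a Model manifest (the model name, URL, flag, and variable are illustrative; the `env` schema shown is an assumption, so check the API reference):
12 |
13 | ```yaml
14 | apiVersion: kubeai.org/v1
15 | kind: Model
16 | metadata:
17 |   name: example-model                   # hypothetical name
18 | spec:
19 |   features: [TextGeneration]
20 |   url: hf://example-org/example-model   # placeholder URL
21 |   engine: VLLM                          # or OLlama
22 |   args:                                 # extra flags passed to the server process
23 |     - --max-model-len=8192              # example vLLM flag
24 |   env:                                  # assumed key/value form
25 |     EXAMPLE_VAR: "1"
26 |   resourceProfile: nvidia-gpu-l4:1
27 | ```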
--------------------------------------------------------------------------------
/docs/concepts/load-balancing.md:
--------------------------------------------------------------------------------
1 | # Load Balancing
2 |
3 | To optimize inference performance and resource utilization, KubeAI supports load balancing strategies specifically tailored for model inference servers such as vLLM. This document explains two primary load balancing strategies available in KubeAI: Least Load and Prefix Hash.
4 |
5 | ## Least Load
6 |
7 | The Least Load strategy distributes inference requests to the model replica that has the least number of in-flight requests. This strategy aims to balance the inference workload evenly across available replicas, reducing the risk of overloading any single server.
8 |
9 | ## Prefix Hash
10 |
11 | The Prefix Hash strategy leverages the Consistent Hashing with Bounded Loads (CHWBL) algorithm to optimize the performance of engines such as vLLM that support prefix caching. This strategy increases the likelihood of KV cache hits for common prefixes. See the vLLM prefix caching docs for more info.
12 |
13 | With this strategy, KubeAI hashes incoming requests based on their prefixes (in addition to a requested LoRA adapter name - if present). Requests with the same hash value are routed to the same replica, except when that replica's in-flight requests exceed the overall average by a configurable percentage.
14 |
15 | This strategy has the most benefit for use cases such as chat completion. This is because the entire chat thread is sent in each successive chat request.
16 |
17 | KubeAI supports this strategy for the following endpoints:
18 |
19 | ```
20 | /openai/v1/completions
21 | /openai/v1/chat/completions
22 | ```
23 |
24 | ## Next
25 |
26 | See the [Kubernetes API docs](../reference/kubernetes-api.md) to view how to configure Model load balancing.
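27 |
28 | For orientation, here is a sketch of what that per-Model configuration might look like. The field names below are assumptions on our part, so confirm them against the API reference:
29 |
30 | ```yaml
31 | apiVersion: kubeai.org/v1
32 | kind: Model
33 | metadata:
34 |   name: example-model           # hypothetical name
35 | spec:
36 |   # ...
37 |   loadBalancing:                # assumed field names
38 |     strategy: PrefixHash        # alternative: LeastLoad
39 | ```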
--------------------------------------------------------------------------------
/docs/concepts/lora-adapters.md:
--------------------------------------------------------------------------------
1 | # LoRA Adapters
2 |
3 | KubeAI orchestrates the loading of LoRA adapters into model serving containers. New LoRA adapters can be swapped in and out without needing to restart the container that is serving the base model.
4 |
5 |
6 |
7 | ## Next
8 |
9 | Read about [how to serve lora adapters](../how-to/serve-lora-adapters.md).
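10 |
11 | As a rough sketch, adapters are declared alongside the base model in the Model spec. The adapter entry fields shown here are assumptions; the how-to guide above has the exact schema:
12 |
13 | ```yaml
14 | apiVersion: kubeai.org/v1
15 | kind: Model
16 | metadata:
17 |   name: base-model-example                  # hypothetical name
18 | spec:
19 |   # ... base model settings ...
20 |   adapters:
21 |     - name: example-adapter                 # assumed field: identifier clients use to request the adapter
22 |       url: hf://example-org/example-adapter # assumed field: placeholder URL
23 | ```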
--------------------------------------------------------------------------------
/docs/concepts/resource-profiles.md:
--------------------------------------------------------------------------------
1 | # Resource Profiles
2 |
3 | A resource profile maps a type of compute resource (e.g. an NVIDIA L4 GPU) to a collection of Kubernetes settings that are configured on inference server Pods. These profiles are defined in the KubeAI `config.yaml` file (via a ConfigMap). Each model specifies the resource profile that it requires.
4 |
5 | Kubernetes Model resources specify a resource profile and the count of that resource that they require (for example `resourceProfile: nvidia-gpu-l4:2` - 2x L4 GPUs).
6 |
7 | A given profile might need to contain slightly different settings based on the cluster/cloud that KubeAI is deployed in.
8 |
9 | Example: A resource profile named `nvidia-gpu-l4` might contain the following node selectors when installing KubeAI on a GKE Kubernetes cluster:
10 |
11 | ```yaml
12 | cloud.google.com/gke-accelerator: "nvidia-l4"
13 | cloud.google.com/gke-spot: "true"
14 | ```
15 |
16 | and add the following resource requests to the model server Pods:
17 |
18 | ```yaml
19 | nvidia.com/gpu: "1"
20 | ```
21 |
22 | In addition to node selectors and resource requirements, a resource profile may optionally specify an image name. This name maps to the container image that will be selected when serving a model on that resource.
23 |
24 | ## Next
25 |
26 | Read about [how to configure resource profiles](../how-to/configure-resource-profiles.md).
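27 |
28 | For a quick mental model before diving into that guide, here is a hedged sketch of a profile definition (Helm values) together with a Model that requests two of that resource. The profile contents mirror the GKE values shipped with the chart; the model name is hypothetical:
29 |
30 | ```yaml
31 | # helm-values.yaml (kubeai/kubeai chart)
32 | resourceProfiles:
33 |   nvidia-gpu-l4:
34 |     limits:
35 |       nvidia.com/gpu: "1"
36 |     nodeSelector:
37 |       cloud.google.com/gke-accelerator: "nvidia-l4"
38 | ---
39 | # Model manifest referencing the profile with a count of 2.
40 | apiVersion: kubeai.org/v1
41 | kind: Model
42 | metadata:
43 |   name: example-model           # hypothetical name
44 | spec:
45 |   # ...
46 |   resourceProfile: nvidia-gpu-l4:2
47 | ```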
--------------------------------------------------------------------------------
/docs/concepts/storage-caching.md:
--------------------------------------------------------------------------------
1 | # Storage / Caching
2 |
3 | With "Large" in the name, caching is a critical part of serving LLMs.
4 |
5 | The best caching technique may vary depending on your environment:
6 |
7 | * What cloud features are available?
8 | * Is your cluster deployed in an air-gapped environment?
9 |
10 | ## A. Model built into container
11 |
12 | **Status:** Supported
13 |
14 | Building a model into a container image can provide a simple way to take advantage of image-related optimizations built into Kubernetes:
15 |
16 | * Relaunching a model server on the same Node that it ran on before will [likely](https://kubernetes.io/docs/concepts/architecture/garbage-collection/#container-image-lifecycle) be able to reuse the previously pulled image.
17 |
18 | * [Secondary boot disks on GKE](https://cloud.google.com/kubernetes-engine/docs/how-to/data-container-image-preloading) can be used to avoid needing to pull images.
19 |
20 | * [Image streaming on GKE](https://cloud.google.com/blog/products/containers-kubernetes/introducing-container-image-streaming-in-gke) can allow for containers to startup before the entire image is present on the Node.
21 |
22 | * Container images can be pre-installed on Nodes in air-gapped environments (example: [k3s airgap installation](https://docs.k3s.io/installation/airgap)).
23 |
24 |
25 | **Guides:**
26 |
27 | * [How to build models into container images](../how-to/build-models-into-containers.md)
28 |
29 | ## B. Model on shared filesystem (read-write-many)
30 |
31 | KubeAI can manage model caches on a shared filesystem (e.g. AWS [EFS](https://aws.amazon.com/efs/), GCP [Filestore](https://cloud.google.com/filestore/docs/overview), NFS). It manages the full lifecycle of a cached model: loading, serving, and cache eviction (on deletion of the Model).
32 |
33 |
34 |
35 |
36 | ## C. Model on read-only-many disk
37 |
38 | **Status:** [Planned](https://github.com/substratusai/kubeai/blob/main/proposals/model-storage.md).
39 |
40 | Examples: [GCP Hyperdisk ML](https://cloud.google.com/compute/docs/disks/hyperdisks)
41 |
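42 | Returning to approach B: from the Model's perspective, using a managed cache is a single field that references a cache profile defined in the KubeAI configuration (Helm values). A hedged sketch, using the `efs-dynamic` profile from the EKS values as an example:
43 |
44 | ```yaml
45 | apiVersion: kubeai.org/v1
46 | kind: Model
47 | metadata:
48 |   name: example-model           # hypothetical name
49 | spec:
50 |   # ...
51 |   cacheProfile: efs-dynamic     # must match a cacheProfiles entry in the KubeAI config
52 | ```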
--------------------------------------------------------------------------------
/docs/contributing/development-guide.md:
--------------------------------------------------------------------------------
1 | # KubeAI Development Guide
2 |
3 | ## OpenAI API
4 | - Types: See `./api/openai/v1/README.md`
5 |
6 | ## Build and Run Commands
7 | - Build: `make build` (manager binary)
8 | - Docker: `make docker-build`
9 | - Run locally: `make run`
10 | - Generate go code (for `./api/*`): `make generate`
11 | - Generate manifests: `make manifests`
12 |
13 | ## Testing Commands
14 | - Unit tests: `make test-unit`
15 | * Single unit test (does not work for integration tests): `go test -v ./path/to/package -run TestNamePattern`
16 | - Integration tests: `make test-integration RUN=SpecificTestToRun`
17 | - E2E tests: `make test-e2e-*` (various test suites)
18 | * Must be run with an active `kind` cluster (Run `kind create cluster` if `kubectl config current-context` does not report a cluster as existing).
19 |
20 | ## Code Style
21 | - Format: `make fmt` (standard Go formatting)
22 | - Lint: `make lint` (golangci-lint v1.59.1)
23 | - Vet: `make vet` (standard Go vetting)
24 |
25 | ## Conventions
26 | - Standard Go project layout (cmd/, internal/, api/, test/)
27 | - Table-driven tests with descriptive names
28 | - Use testify for assertions
29 | - Integration tests use require.EventuallyWithT for async verification
30 | - Follow Kubernetes controller patterns (kubebuilder / controller-runtime)
--------------------------------------------------------------------------------
/docs/contributing/documentation.md:
--------------------------------------------------------------------------------
1 | # Documentation
2 |
3 | We are grateful for anyone who takes the time to improve KubeAI documentation! In order to keep our docs clear and consistent we ask that you first read about the approach to documentation that we have standardized on...
4 |
5 | ## Read before writing!
6 |
7 | The KubeAI approach to documentation is loosely inspired by the [Diataxis](https://diataxis.fr/) method.
8 |
9 | TLDR on how KubeAI docs are organized:
10 |
11 | * **Installation**: How-to guides specific to installing KubeAI.
12 | * **How To**: Directions that guide the reader through a problem or towards a result. How-to guides are goal-oriented. They assume the user is familiar with general concepts, tools, and has already installed KubeAI.
13 | * **Concepts**: A reflective explanation of KubeAI topics with a focus on giving the reader an understanding of the why.
14 | * **Tutorials**: Learning oriented experiences. Lessons that often guide a user from beginning to end. The goal is to help the reader *learn* something (compared to a how-to guide that is focused on helping the reader *do* something).
15 | * **Contributing**: The docs in here differ from the rest of the docs by audience: these docs are for anyone who will be contributing code or docs to the KubeAI project.
16 |
17 | ## How to serve kubeai.org locally
18 |
19 | Make sure you have python3 installed and run:
20 |
21 | ```bash
22 | make docs
23 | ```
--------------------------------------------------------------------------------
/docs/diagrams/arch.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/arch.excalidraw.png
--------------------------------------------------------------------------------
/docs/diagrams/autoscaling.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/autoscaling.excalidraw.png
--------------------------------------------------------------------------------
/docs/diagrams/caching-shared-filesystem.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/caching-shared-filesystem.excalidraw.png
--------------------------------------------------------------------------------
/docs/diagrams/chwbl.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/chwbl.excalidraw.png
--------------------------------------------------------------------------------
/docs/diagrams/lora-direct-loading.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/lora-direct-loading.excalidraw.png
--------------------------------------------------------------------------------
/docs/diagrams/multi-threaded-shared-context.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/multi-threaded-shared-context.excalidraw.png
--------------------------------------------------------------------------------
/docs/diagrams/multi-turn-clients.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/multi-turn-clients.excalidraw.png
--------------------------------------------------------------------------------
/docs/diagrams/multitenancy-labels.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/multitenancy-labels.excalidraw.png
--------------------------------------------------------------------------------
/docs/diagrams/private-deep-chat.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/private-deep-chat.excalidraw.png
--------------------------------------------------------------------------------
/docs/diagrams/random-vs-consistent-hash.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/diagrams/random-vs-consistent-hash.excalidraw.png
--------------------------------------------------------------------------------
/docs/graphs/throughput-benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/graphs/throughput-benchmark.png
--------------------------------------------------------------------------------
/docs/graphs/ttft-benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/graphs/ttft-benchmark.png
--------------------------------------------------------------------------------
/docs/how-to/build-models-into-containers.md:
--------------------------------------------------------------------------------
1 | # Build models into containers
2 |
3 | In this guide we will preload an LLM into a custom-built Ollama serving image. You can follow the same steps for other models and other serving engines.
4 |
5 | Define some values
6 | ```bash
7 | export MODEL_URL=ollama://qwen2:0.5b
8 |
9 | # Customize with your own image repo.
10 | export IMAGE=us-central1-docker.pkg.dev/substratus-dev/default/ollama-builtin-qwen2-05b:latest
11 | ```
12 |
13 | Build and push the image. Note: building (downloading the base image & model) and pushing (uploading the image & model) can take a while depending on the size of the model.
14 |
15 | ```bash
16 | git clone https://github.com/substratusai/kubeai
17 | cd ./kubeai/examples/ollama-builtin
18 |
19 | docker build --build-arg MODEL_URL=$MODEL_URL -t $IMAGE .
20 | docker push $IMAGE
21 | ```
22 |
23 | Create a Model manifest and apply it to a cluster with KubeAI installed. NOTE: The only difference between a Model that uses a built-in image and one that does not is the addition of the `image:` field.
24 |
25 | ```bash
26 | kubectl apply -f - << EOF
27 | apiVersion: kubeai.org/v1
28 | kind: Model
29 | metadata:
30 | name: builtin-model-example
31 | spec:
32 | features: ["TextGeneration"]
33 | owner: alibaba
34 | image: $IMAGE # <-- The image with model built-in
35 | url: "$MODEL_URL"
36 | engine: OLlama
37 | resourceProfile: cpu:1
38 | EOF
39 | ```
40 |
--------------------------------------------------------------------------------
/docs/how-to/configure-autoscaling.md:
--------------------------------------------------------------------------------
1 | # Configure autoscaling
2 |
3 | This guide will cover how to configure KubeAI [autoscaling](../concepts/autoscaling.md) parameters.
4 |
5 | ## System Settings
6 |
7 | KubeAI administrators can define system-wide autoscaling settings by setting the following Helm values (for the `kubeai/kubeai` chart):
8 |
9 | Example:
10 |
11 | ```yaml
12 | # helm-values.yaml
13 | modelAutoscaling:
14 | interval: 15s
15 | timeWindow: 10m
16 | # ...
17 | ```
18 |
19 | ## Model Settings
20 |
21 | The following settings can be configured on a model-by-model basis.
22 |
23 | ### Model settings: helm
24 |
25 | If you are managing models via the `kubeai/models` Helm chart, you can use:
26 |
27 | ```yaml
28 | # helm-values.yaml
29 | catalog:
30 | model-a:
31 | # ...
32 | minReplicas: 1
33 | maxReplicas: 9
34 | targetRequests: 250
35 | scaleDownDelaySeconds: 45
36 | model-b:
37 | # ...
38 | disableAutoscaling: true
39 | # ...
40 | ```
41 |
42 | Re-running `helm upgrade` with these additional parameters will update model settings in the cluster.
43 |
44 | ### Model settings: kubectl
45 |
46 | You can also specify the autoscaling profile directly via the Models custom resource in the Kubernetes API:
47 |
48 | ```yaml
49 | apiVersion: kubeai.org/v1
50 | kind: Model
51 | metadata:
52 | name: my-model
53 | spec:
54 | # ...
55 | minReplicas: 1
56 | maxReplicas: 9
57 | targetRequests: 250
58 | scaleDownDelaySeconds: 45
59 | ```
60 |
61 | If you are already managing models using Model manifest files, you can make the update to your file and reapply it using `kubectl apply -f <filename>.yaml`.
62 |
--------------------------------------------------------------------------------
/docs/how-to/configure-embedding-models.md:
--------------------------------------------------------------------------------
1 | # Configure embedding models
2 |
3 | KubeAI supports the following engines for text embedding models:
4 |
5 | - Infinity
6 | - vLLM
7 | - Ollama
8 |
9 | Infinity supports any Hugging Face model tagged for text-embeddings-inference. See the [models, reranking or CLIP models on Hugging Face](https://huggingface.co/models?other=text-embeddings-inference&sort=trending) for reference.
10 |
11 |
12 | ## Install BAAI/bge-small-en-v1.5 model using Infinity
13 |
14 | Create a file named `kubeai-models.yaml` with the following content:
15 |
16 | ```yaml
17 | catalog:
18 | bge-embed-text-cpu:
19 | enabled: true
20 | features: ["TextEmbedding"]
21 | owner: baai
22 | url: "hf://BAAI/bge-small-en-v1.5"
23 | engine: Infinity
24 | resourceProfile: cpu:1
25 | minReplicas: 1
26 | ```
27 |
28 | Apply the kubeai-models helm chart:
29 |
30 | ```bash
31 | helm install kubeai-models kubeai/models -f ./kubeai-models.yaml
32 | ```
33 |
34 | Once the pod is ready, you can use the OpenAI Python SDK to interact with the model:
35 |
36 | ```python
37 | from openai import OpenAI
38 | # Assumes port-forward of kubeai service to localhost:8000.
39 | client = OpenAI(api_key="ignored", base_url="http://localhost:8000/openai/v1")
40 | response = client.embeddings.create(
41 | input="Your text goes here.",
42 | model="bge-embed-text-cpu"
43 | )
44 | ```
45 |
--------------------------------------------------------------------------------
/docs/how-to/configure-speech-to-text.md:
--------------------------------------------------------------------------------
1 | # Configure speech-to-text
2 |
3 | KubeAI provides a Speech to Text endpoint that can be used to transcribe audio files. This guide will walk you through the steps to enable this feature.
4 |
5 | ## Enable Speech to Text model
6 | You can create new models by creating a Model CRD object or by enabling a model from the model catalog.
7 |
8 | ### Enable from model catalog
9 | KubeAI provides predefined models in the `kubeai/models` Helm chart. To enable the Speech to Text model, you can set the `enabled` flag to `true` in your values file.
10 |
11 | ```yaml
12 | # models-helm-values.yaml
13 | catalog:
14 | faster-whisper-medium-en-cpu:
15 | enabled: true
16 | minReplicas: 1
17 | ```
18 |
19 | ### Enable by creating Model
20 | You can also create a Model object to enable the Speech to Text model. For example:
21 |
22 | ```yaml
23 | apiVersion: kubeai.org/v1
24 | kind: Model
25 | metadata:
26 | name: faster-whisper-medium-en-cpu
27 | spec:
28 | features: [SpeechToText]
29 | owner: Systran
30 | url: hf://Systran/faster-whisper-medium.en
31 | engine: FasterWhisper
32 | resourceProfile: cpu:1
33 | ```
34 |
35 | ## Usage
36 | The Speech to Text endpoint is available at `/openai/v1/transcriptions`.
37 |
38 | Example usage using curl:
39 |
40 | ```bash
41 | curl -L -o kubeai.mp4 https://github.com/user-attachments/assets/711d1279-6af9-4c6c-a052-e59e7730b757
42 | curl http://localhost:8000/openai/v1/audio/transcriptions \
43 | -F "file=@kubeai.mp4" \
44 | -F "language=en" \
45 | -F "model=faster-whisper-medium-en-cpu"
46 | ```
47 |
--------------------------------------------------------------------------------
/docs/how-to/load-models-from-pvc.md:
--------------------------------------------------------------------------------
1 | # Load Models from PVC
2 |
3 | You can store your models in a Persistent Volume Claim (PVC) and let KubeAI use them for serving.
4 | Both vLLM and Ollama engines support loading models from PVCs.
5 |
6 | You must ensure the model files are already present in the PVC before creating the Model resource.
7 | Alternatively you can use KubeAI's native caching mechanism which downloads the model for you:
8 |
9 | - [Cache Models with GCP Filestore](./cache-models-with-gcp-filestore.md)
10 | - [Cache Models with EFS](./cache-models-with-aws-efs.md)
11 |
12 |
13 | ## vLLM
14 |
15 | For vLLM, use the following URL format:
16 | ```yaml
17 | url: pvc://$PVC_NAME # Loads the model from the PVC named $PVC_NAME
18 | url: pvc://$PVC_NAME/$PATH # Loads from a specific path within the PVC
19 | ```
20 |
21 | ### PVC requirements
22 |
23 | vLLM supports both ReadWriteMany and ReadOnlyMany access modes. `Many` is used in order to support more than 1 vLLM replica.
24 |
25 |
26 | ## Ollama
27 |
28 | For Ollama, use the following URL formats:
29 | ```yaml
30 | url: pvc://$PVC_NAME?model=$MODEL_NAME # Loads the model named $MODEL_NAME that's loaded on the disk
31 | url: pvc://$PVC_NAME/$PATH?model=$MODEL_NAME
32 | ```
33 |
34 | ### PVC Requirements
35 | Ollama requires using ReadWriteMany access mode because the rename operation `ollama cp` needs to write to the PVC.
36 |
37 | ### Example: Loading Qwen 0.5b from PVC
38 |
39 | 1. Create a PVC with ReadWriteMany named `model-pvc`. See [example](https://github.com/substratusai/kubeai/blob/main/examples/ollama-pvc/pvc.yaml).
40 | 2. Create a K8s Job to load the model onto `model-pvc`. See [example](https://github.com/substratusai/kubeai/blob/main/examples/ollama-pvc/job.yaml).
41 |
42 | The PVC should now have a `blobs/` and `manifests/` directory after the loader completes.
43 |
44 |
45 | 3. Create a Model to load from PVC:
46 |
47 | ```yaml
48 | url: pvc://model-pvc?model=qwen:0.5b
49 | ```
50 |
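51 | For completeness, a hedged sketch of the full Model manifest for step 3 (the name is hypothetical; the remaining fields follow the standard Ollama examples):
52 |
53 | ```yaml
54 | apiVersion: kubeai.org/v1
55 | kind: Model
56 | metadata:
57 |   name: qwen-05b-pvc            # hypothetical name
58 | spec:
59 |   features: [TextGeneration]
60 |   url: pvc://model-pvc?model=qwen:0.5b
61 |   engine: OLlama
62 |   resourceProfile: cpu:1
63 | ```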
--------------------------------------------------------------------------------
/docs/overrides/partials/integrations/analytics/custom.html:
--------------------------------------------------------------------------------
1 |
3 |
--------------------------------------------------------------------------------
/docs/reference/.kubernetes-api/config.yaml:
--------------------------------------------------------------------------------
1 | processor:
2 | # RE2 regular expressions describing types that should be excluded from the generated documentation.
3 | ignoreTypes:
4 | - "List"
5 | # RE2 regular expressions describing type fields that should be excluded from the generated documentation.
6 | ignoreFields:
7 | - "TypeMeta"
8 |
9 | render:
10 | # Version of Kubernetes to use when generating links to Kubernetes API documentation.
11 | kubernetesVersion: 1.31
12 |
--------------------------------------------------------------------------------
/docs/reference/openai-api-compatibility.md:
--------------------------------------------------------------------------------
1 | # OpenAI API Compatibility
2 |
3 | KubeAI provides an OpenAI API compatibility layer.
4 |
5 | ## General
6 |
7 | ### Models
8 |
9 | ```
10 | GET /v1/models
11 | ```
12 |
13 | * Lists all `kind: Model` objects installed in the Kubernetes API Server.
14 |
15 |
16 | ## Inference
17 |
18 | ### Text Generation
19 |
20 | ```
21 | POST /v1/chat/completions
22 | POST /v1/completions
23 | ```
24 |
25 | * Supported for Models with `.spec.features: ["TextGeneration"]`.
26 |
27 | ### Embeddings
28 |
29 | ```
30 | POST /v1/embeddings
31 | ```
32 |
33 | * Supported for Models with `.spec.features: ["TextEmbedding"]`.
34 |
35 | ### Speech-to-Text
36 |
37 | ```
38 | POST /v1/audio/transcriptions
39 | ```
40 |
41 | * Supported for Models with `.spec.features: ["SpeechToText"]`.
42 |
43 | ## OpenAI Client libraries
44 | You can use the official OpenAI client libraries by setting the
45 | `base_url` to the KubeAI endpoint.
46 |
47 | For example, you can use the Python client like this:
48 | ```python
49 | from openai import OpenAI
50 | client = OpenAI(api_key="ignored",
51 | base_url="http://kubeai/openai/v1")
52 | response = client.chat.completions.create(
53 | model="gemma2-2b-cpu",
54 | messages=[
55 | {"role": "system", "content": "You are a helpful assistant."},
56 | {"role": "user", "content": "Who won the world series in 2020?"},
57 | {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
58 | {"role": "user", "content": "Where was it played?"}
59 | ]
60 | )
61 | ```
62 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | mkdocs
2 | mkdocs-material
3 | mkdocs-awesome-pages-plugin
4 | mkdocs-material[imaging]
--------------------------------------------------------------------------------
/docs/screenshots/gcp-cpus-all-regions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/screenshots/gcp-cpus-all-regions.png
--------------------------------------------------------------------------------
/docs/screenshots/gcp-gpus-all-regions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/screenshots/gcp-gpus-all-regions.png
--------------------------------------------------------------------------------
/docs/screenshots/gcp-quota-preemptible-nvidia-l4-gpus-regional.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/screenshots/gcp-quota-preemptible-nvidia-l4-gpus-regional.png
--------------------------------------------------------------------------------
/docs/screenshots/gcp-quota-premium-storage-gb-per-region.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/screenshots/gcp-quota-premium-storage-gb-per-region.png
--------------------------------------------------------------------------------
/docs/screenshots/gcp-tpu-preemptible-v5e-quota.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/screenshots/gcp-tpu-preemptible-v5e-quota.png
--------------------------------------------------------------------------------
/docs/screenshots/langtrace.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/screenshots/langtrace.png
--------------------------------------------------------------------------------
/docs/screenshots/private-deep-chat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/docs/screenshots/private-deep-chat.png
--------------------------------------------------------------------------------
/examples/k8s-api-clients/python/example.py:
--------------------------------------------------------------------------------
1 | from kubernetes import config, dynamic
2 | from kubernetes.client import api_client
3 |
4 | k8s_client = dynamic.DynamicClient(
5 | api_client.ApiClient(configuration=config.load_kube_config())
6 | )
7 |
8 | models_client = k8s_client.resources.get(api_version="kubeai.org/v1", kind="Model")
9 |
10 | model = {
11 | "apiVersion": "kubeai.org/v1",
12 | "kind": "Model",
13 | "metadata": {
14 | "name": "facebook-opt-125m",
15 | "namespace": "default",
16 | },
17 | "spec": {
18 | "features": ["TextGeneration"],
19 | "owner": "facebook",
20 | "url": "hf://facebook/opt-125m",
21 | "engine": "VLLM",
22 | "resourceProfile": "cpu:1",
23 | },
24 | }
25 |
26 |
27 | models_client.create(body=model)
28 |
29 | # Alternative: Use "server-side apply" (i.e. kubectl apply) to upsert the Model.
30 | # models_client.patch(
31 | # body=model,
32 | # content_type="application/apply-patch+yaml",
33 | # field_manager="my-example-app", # Set a field manager to track ownership of fields.
34 | # )
35 |
36 | created_model = models_client.get(name="facebook-opt-125m", namespace="default")
37 | print(created_model)
38 |
39 | # Optionally delete the Model.
40 | # models_client.delete(name="facebook-opt-125m", namespace="default")
41 |
--------------------------------------------------------------------------------
/examples/k8s-api-clients/python/requirements.txt:
--------------------------------------------------------------------------------
1 | kubernetes==31.0.0
2 |
--------------------------------------------------------------------------------
/examples/ollama-builtin/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ollama/ollama:latest
2 |
3 | # Model to be downloaded.
4 | ARG MODEL_URL
5 |
6 | # MODEL_URL is a required argument.
7 | RUN test -n "${MODEL_URL}"
8 |
9 | # Set the model to be downloaded.
10 | ENV MODEL_URL=${MODEL_URL}
11 |
12 | COPY ./download.sh /download.sh
13 | RUN ./download.sh
--------------------------------------------------------------------------------
/examples/ollama-builtin/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -eu
4 |
5 | # Exit if the model URL is not set.
6 | : "$MODEL_URL"
7 |
8 | # Check if the model URL is in the correct format - matching
9 | # the format used in .spec.url in the Model Custom Resource.
10 | if [[ $MODEL_URL != ollama://* ]] ;
11 | then
12 | echo "MODEL_URL must use the \"ollama://\" format"
13 | exit 1
14 | fi
15 |
16 | ollama_model_name=${MODEL_URL#ollama://}
17 |
18 | # Run Ollama server in the background.
19 | /bin/ollama serve &
20 | pid=$!
21 |
22 | # TODO: Wait for the server to start using something more exact.
23 | sleep 5
24 |
25 | /bin/ollama pull $ollama_model_name
26 |
27 | # Send SIGTERM to the server to allow it to gracefully exit.
28 | kill -SIGTERM "$pid"
29 |
30 | # Wait for the server to exit.
31 | wait "$pid"
32 |
--------------------------------------------------------------------------------
/examples/ollama-pvc/job.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: batch/v1
2 | kind: Job
3 | metadata:
4 | name: ollama-load-model-to-pvc
5 | spec:
6 | template:
7 | spec:
8 | containers:
9 | - name: ollama
10 | image: ollama/ollama:latest
11 | env:
12 | - name: OLLAMA_MODELS
13 | value: /model
14 | command:
15 | - /bin/sh
16 | - -c
17 | - |
18 | /bin/ollama serve &
19 | echo "Waiting for Ollama server to start..."
20 | sleep 10
21 |
22 | # Pull the model and ensure it downloads successfully
23 | echo "Pulling model qwen:0.5b..."
24 | if ! /bin/ollama pull qwen:0.5b; then
25 | echo "Failed to pull model"
26 | exit 1
27 | fi
28 |
29 | # Verify the model files exist
30 | echo "Verifying model files..."
31 | ls -R /model
32 | if [ ! -d "/model/blobs" ] || [ ! -d "/model/manifests" ]; then
33 | echo "Model directories not found"
34 | exit 1
35 | fi
36 |
37 | echo "Model setup completed successfully"
38 | ls -la /model/manifests/registry.ollama.ai/library/qwen/0.5b
39 | volumeMounts:
40 | - name: models-volume
41 | mountPath: /model
42 | volumes:
43 | - name: models-volume
44 | persistentVolumeClaim:
45 | claimName: model-pvc
46 | readOnly: false
47 | restartPolicy: OnFailure
48 |
--------------------------------------------------------------------------------
/examples/ollama-pvc/pvc.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: model-pvc
5 | spec:
6 | storageClassName: premium-rwx # replace with your actual storage class
7 | accessModes:
8 | - ReadWriteMany
9 | resources:
10 | requests:
11 | storage: 10Gi
--------------------------------------------------------------------------------
/examples/priority-examples/background-research-model.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeai.org/v1
2 | kind: Model
3 | metadata:
4 | name: background-research-model
5 | spec:
6 | features: [TextGeneration]
7 | url: ollama://gemma2:2b
8 | engine: OLlama
9 | # Background tasks with low priority will be preempted when resources are needed for higher priority models
10 | priorityClassName: low-priority
11 | resourceProfile: cpu:2
--------------------------------------------------------------------------------
/examples/priority-examples/critical-service-model.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeai.org/v1
2 | kind: Model
3 | metadata:
4 | name: critical-service-model
5 | spec:
6 | features: [TextGeneration]
7 | url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8
8 | engine: VLLM
9 | # Critical service model gets high priority to preempt other models when resources are limited
10 | priorityClassName: high-priority
11 | resourceProfile: nvidia-gpu-l4:1
--------------------------------------------------------------------------------
/examples/priority-examples/hello-world-llm.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeai.org/v1
2 | kind: Model
3 | metadata:
4 | name: hello-world-llm
5 | spec:
6 | features: [TextGeneration]
7 | url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8
8 | engine: VLLM
9 | # Uncomment to set the priority class for this model
10 | # priorityClassName: high-priority
11 | resourceProfile: nvidia-gpu-l4:1
--------------------------------------------------------------------------------
/examples/priority-examples/priority-classes.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: scheduling.k8s.io/v1
2 | kind: PriorityClass
3 | metadata:
4 | name: high-priority
5 | value: 1000000 # Higher value means higher priority
6 | globalDefault: false
7 | description: "This priority class should be used for critical inference models only."
8 | ---
9 | apiVersion: scheduling.k8s.io/v1
10 | kind: PriorityClass
11 | metadata:
12 | name: medium-priority
13 | value: 100000
14 | globalDefault: false
15 | description: "This priority class should be used for medium priority inference models."
16 | ---
17 | apiVersion: scheduling.k8s.io/v1
18 | kind: PriorityClass
19 | metadata:
20 | name: low-priority
21 | value: 10000
22 | globalDefault: false
23 | description: "This priority class should be used for low priority inference models."
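24 | # The example Models in this directory reference these classes via spec.priorityClassName
25 | # (high-priority, low-priority), so apply this file before the Model manifests, e.g.:
26 | #   kubectl apply -f priority-classes.yaml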
--------------------------------------------------------------------------------
/examples/private-deep-chat/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM golang:1.23 AS builder
2 |
3 | WORKDIR /workspace
4 | COPY go.* .
5 |
6 | RUN go mod download
7 |
8 | COPY main.go main.go
9 | RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -o server ./main.go
10 |
11 | FROM gcr.io/distroless/static:nonroot
12 |
13 | WORKDIR /app
14 | COPY --from=builder /workspace/server /app/
15 | COPY ./static /app/static
16 | USER 65532:65532
17 |
18 | ENTRYPOINT ["/app/server"]
19 |
--------------------------------------------------------------------------------
/examples/private-deep-chat/go.mod:
--------------------------------------------------------------------------------
1 | module private-chat
2 |
3 | go 1.22.0
4 |
--------------------------------------------------------------------------------
/examples/private-deep-chat/manifests/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: private-deep-chat
5 | labels:
6 | app: private-deep-chat
7 | spec:
8 | replicas: 1
9 | selector:
10 | matchLabels:
11 | app: private-deep-chat
12 | template:
13 | metadata:
14 | labels:
15 | app: private-deep-chat
16 | spec:
17 | containers:
18 | - name: server
19 | image: private-deep-chat:latest
20 | imagePullPolicy: IfNotPresent
21 | ports:
22 | - containerPort: 8000
23 | env:
24 | - name: LISTEN_ADDR
25 | value: ":8000"
26 | - name: KUBEAI_ADDR
27 | value: "http://kubeai"
28 |
29 |
--------------------------------------------------------------------------------
/examples/private-deep-chat/manifests/models.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeai.org/v1
2 | kind: Model
3 | metadata:
4 | name: gemma2-a
5 | labels:
6 | tenancy: group-a
7 | spec:
8 | features: [TextGeneration]
9 | owner: google
10 | url: ollama://gemma2:2b
11 | engine: OLlama
12 | resourceProfile: cpu:2
13 | ---
14 | apiVersion: kubeai.org/v1
15 | kind: Model
16 | metadata:
17 | name: gemma2-b
18 | labels:
19 | tenancy: group-b
20 | spec:
21 | features: [TextGeneration]
22 | owner: google
23 | url: ollama://gemma2:2b
24 | engine: OLlama
25 | resourceProfile: cpu:2
--------------------------------------------------------------------------------
/examples/private-deep-chat/manifests/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | name: private-deep-chat
5 | labels:
6 | app: private-deep-chat
7 | spec:
8 | ports:
9 | - port: 80
10 | protocol: TCP
11 | targetPort: 8000
12 | selector:
13 | app: private-deep-chat
14 |
--------------------------------------------------------------------------------
/examples/storage-classes/gcp-filestore.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: storage.k8s.io/v1
2 | kind: StorageClass
3 | metadata:
4 | name: gcp-filestore
5 | provisioner: filestore.csi.storage.gke.io
6 | volumeBindingMode: Immediate
7 | allowVolumeExpansion: true
8 | parameters:
9 | tier: standard
10 | network: default
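11 | # This class can be referenced from a PVC's storageClassName (see examples/ollama-pvc/pvc.yaml)
12 | # or from a cacheProfile's sharedFilesystem.storageClassName.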
--------------------------------------------------------------------------------
/hack/apply-model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Render a single model from the models chart catalog and apply it to the cluster.
4 | model='opt-125m-cpu'
5 | helm template ./charts/models --set "catalog.$model.enabled=true" --set "catalog.$model.minReplicas=1" | kubectl apply -f -
--------------------------------------------------------------------------------
/hack/boilerplate.go.txt:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2024.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
--------------------------------------------------------------------------------
/hack/create-dev-gke-cluster.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | cluster_name="kubeai-dev-1"
4 |
5 | gcloud container clusters create $cluster_name \
6 | --zone us-central1-a \
7 | --node-locations us-central1-a --num-nodes 1 --machine-type e2-medium
8 |
9 | gcloud container node-pools create n2s4 \
10 | --cluster=$cluster_name \
11 | --zone us-central1-a \
12 | --machine-type=n2-standard-4 \
13 | --enable-autoscaling \
14 | --num-nodes=0 \
15 | --min-nodes=0 \
16 | --max-nodes=3
17 |
18 | gcloud container node-pools create n2s8 \
19 | --cluster=$cluster_name \
20 | --zone us-central1-a \
21 | --machine-type=n2-standard-8 \
22 | --enable-autoscaling \
23 | --num-nodes=0 \
24 | --min-nodes=0 \
25 | --max-nodes=3
26 |
27 | gcloud container node-pools create n2s16 \
28 | --cluster=$cluster_name \
29 | --zone us-central1-a \
30 | --machine-type=n2-standard-16 \
31 | --enable-autoscaling \
32 | --num-nodes=0 \
33 | --min-nodes=0 \
34 | --max-nodes=3
35 |
36 | gcloud container node-pools create g2s8 \
37 | --cluster=$cluster_name \
38 | --zone us-central1-a \
39 | --accelerator=type=nvidia-l4,count=1,gpu-driver-version=default \
40 | --machine-type=g2-standard-8 \
41 | --enable-autoscaling \
42 | --num-nodes=0 \
43 | --min-nodes=0 \
44 | --max-nodes=3
45 |
46 |
--------------------------------------------------------------------------------
/hack/dev-configs/gke.yaml:
--------------------------------------------------------------------------------
1 | secretNames:
2 | huggingface: huggingface
3 |
4 | modelServers:
5 | VLLM:
6 | images:
7 | default: "vllm/vllm-openai:v0.6.3.post1"
8 | cpu: "substratusai/vllm:v0.6.3.post1-cpu"
9 | google-tpu: "substratusai/vllm:v0.6.3.post1-tpu"
10 | OLlama:
11 | images:
12 | default: "ollama/ollama:latest"
13 | FasterWhisper:
14 | images:
15 | default: "fedirz/faster-whisper-server:latest-cpu"
16 | nvidia-gpu: "fedirz/faster-whisper-server:latest-cuda"
17 | Infinity:
18 | images:
19 | default: "michaelf34/infinity:latest"
20 |
21 | modelLoading:
22 | image: us-central1-docker.pkg.dev/substratus-dev/default/kubeai-model-loader
23 |
24 | modelRollouts:
25 | surge: 0
26 | messaging:
27 | errorMaxBackoff: 30s
28 | streams: []
29 | #- requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
30 | # responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
31 | # maxHandlers: 1
32 | resourceProfiles:
33 | cpu:
34 | imageName: "cpu"
35 | requests:
36 | # Kind
37 | #cpu: 0.5
38 | #memory: 1Gi
39 | # GKE
40 | cpu: 3
41 | memory: 12Gi
42 | limits:
43 | cpu: 3
44 | memory: 12Gi
45 | nvidia-gpu-l4:
46 | limits:
47 | nvidia.com/gpu: "1"
48 | requests:
49 | nvidia.com/gpu: "1"
50 | cpu: "6"
51 | memory: "24Gi"
52 | nodeSelector:
53 | cloud.google.com/gke-accelerator: "nvidia-l4"
54 | cloud.google.com/gke-spot: "true"
55 |
56 | cacheProfiles:
57 | fstore:
58 | sharedFilesystem:
59 | #storageClassName: "kubeai-filestore"
60 | persistentVolumeName: "preprov1"
61 |
62 | # Dev-only configuration.
63 | allowPodAddressOverride: true
64 | fixedSelfMetricAddrs: ["127.0.0.1:"]
65 |
66 | modelAutoscaling:
67 | interval: 10s
68 | timeWindow: 60s
69 | stateConfigMapName: kubeai-autoscaler-state
--------------------------------------------------------------------------------
/hack/dev-configs/kind.yaml:
--------------------------------------------------------------------------------
1 | secretNames:
2 | huggingface: huggingface
3 |
4 | modelServers:
5 | VLLM:
6 | images:
7 | # The key is the image name (referenced from resourceProfiles) and the value is the image.
8 | # The "default" image should always be specified.
9 | # "default" is used when no imageName is specified or if a specific image is not found.
10 | default: "vllm/vllm-openai:v0.6.2"
11 | cpu: "substratusai/vllm:v0.6.1-cpu"
12 | nvidia-gpu: "vllm/vllm-openai:v0.6.2"
13 | google-tpu: "substratusai/vllm:v0.6.1-tpu"
14 | OLlama:
15 | images:
16 | default: "ollama/ollama:latest"
17 | FasterWhisper:
18 | images:
19 | default: "fedirz/faster-whisper-server:latest-cpu"
20 | nvidia-gpu: "fedirz/faster-whisper-server:latest-cuda"
21 | Infinity:
22 | images:
23 | default: "michaelf34/infinity:latest"
24 |
25 | modelLoading:
26 | image: kubeai-model-loader:latest
27 |
28 | modelRollouts:
29 | surge: 0
30 | messaging:
31 | errorMaxBackoff: 30s
32 | streams: []
33 | #- requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
34 | # responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
35 | # maxHandlers: 1
36 | resourceProfiles:
37 | cpu:
38 | imageName: "cpu"
39 | requests:
40 | cpu: 0.5
41 | memory: 1Gi
42 | limits:
43 | cpu: 3
44 | memory: 12Gi
45 | nvidia-gpu-l4:
46 | limits:
47 | nvidia.com/gpu: "1"
48 | requests:
49 | nvidia.com/gpu: "1"
50 | cpu: "6"
51 | memory: "24Gi"
52 |
53 | cacheProfiles:
54 | fstore:
55 | sharedFilesystem:
56 | #storageClassName: "kubeai-filestore"
57 | persistentVolumeName: "preprov1"
58 |
59 | # Dev-only configuration.
60 | allowPodAddressOverride: true
61 | fixedSelfMetricAddrs: ["127.0.0.1:8080"]
62 |
63 | modelAutoscaling:
64 | interval: 10s
65 | timeWindow: 60s
66 | stateConfigMapName: kubeai-autoscaler-state
--------------------------------------------------------------------------------
/hack/dev-gke-helm-values.yaml:
--------------------------------------------------------------------------------
1 | models:
2 | catalog:
3 | llama-3.1-8b-instruct-fp8-l4:
4 | enabled: true
5 |
6 | resourceProfiles:
7 | nvidia-gpu-l4:
8 | nodeSelector:
9 | cloud.google.com/gke-accelerator: "nvidia-l4"
10 | cloud.google.com/gke-spot: "true"
--------------------------------------------------------------------------------
/hack/dev-load/k6.js:
--------------------------------------------------------------------------------
1 | import http from 'k6/http';
2 | import { sleep } from 'k6';
3 |
4 | export const options = {
5 | stages: [
6 | { duration: '15s', target: 1 },
7 | { duration: '15s', target: 9 },
8 | { duration: '1m', target: 9 },
9 | { duration: '15s', target: 0 },
10 | { duration: '15s', target: 0 },
11 | ],
12 | };
13 |
14 | export default function () {
15 | const url = 'http://kubeai/openai/v1/completions';
16 |
17 | let data = {
18 | "prompt": "Your text string goes here",
19 | "model": "dev"
20 | };
21 |
22 | let res = http.post(url, JSON.stringify(data), {
23 | headers: { 'Content-Type': 'application/json' },
24 | });
25 |
26 | sleep(1);
27 | }
28 |
--------------------------------------------------------------------------------
/hack/dev-load/pod.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: dev-load
5 | spec:
6 | restartPolicy: Never
7 | containers:
8 | - name: k6
9 | image: grafana/k6
10 | args: ["run", "/config/k6.js"] #, "--http-debug"]
11 | volumeMounts:
12 | - name: config
13 | mountPath: /config
14 | volumes:
15 | - name: config
16 | configMap:
17 | name: dev-load
--------------------------------------------------------------------------------
/hack/dev-load/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 | set -u
5 | set -x
6 |
7 | this_dir=$(dirname "$0")
8 |
9 | kubectl create configmap dev-load --from-file $this_dir/k6.js --dry-run=client -oyaml | kubectl apply -f -
10 |
11 | kubectl create -f $this_dir/pod.yaml
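12 |
13 | # Note: "kubectl create" fails if the pod already exists; to re-run the load test,
14 | # delete the previous pod first: kubectl delete pod dev-load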
--------------------------------------------------------------------------------
/hack/dev-models/kind-cpu-adapters.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeai.org/v1
2 | kind: Model
3 | metadata:
4 | name: dev
5 | annotations:
6 | # Have the controller send requests to localhost to allow for
7 | # running the controller locally (assuming a port-forward is in place).
8 | model-pod-ip: "127.0.0.1"
9 | model-pod-port: "7000"
10 | spec:
11 | features: ["TextGeneration"]
12 | owner: alibaba
13 | url: "ollama://qwen2:0.5b"
14 | engine: OLlama
15 | resourceProfile: cpu:1
16 | minReplicas: 1
17 | maxReplicas: 3
18 | adapters:
19 | - name: abc
20 | url: hf://facebook/opt-125m
21 | ---
22 | # Service for port-forwarding to the model:
23 | #
24 | # while true; do kubectl port-forward service/dev-model 7000:7000; done
25 | #
26 | apiVersion: v1
27 | kind: Service
28 | metadata:
29 | name: dev-model
30 | spec:
31 | selector:
32 | model: dev
33 | ports:
34 | - protocol: TCP
35 | port: 7000
36 | targetPort: 8000
--------------------------------------------------------------------------------
/hack/dev-models/kind-cpu.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeai.org/v1
2 | kind: Model
3 | metadata:
4 | name: dev
5 | annotations:
6 | # Have the controller send requests to localhost to allow for
7 | # running the controller locally (assuming a port-forward is in place).
8 | model-pod-ip: "127.0.0.1"
9 | model-pod-port: "7000"
10 | spec:
11 | features: ["TextGeneration"]
12 | owner: alibaba
13 | url: "ollama://qwen2:0.5b"
14 | engine: OLlama
15 | #url: hf://facebook/opt-125m
16 | #engine: VLLM
17 | resourceProfile: cpu:1
18 | #cacheProfile: fstore
19 | minReplicas: 1
20 | maxReplicas: 3
21 | #url: hf://meta-llama/Meta-Llama-3.1-8B-Instruct
22 | #args:
23 | # - --max-model-len=32768
24 | # - --max-num-batched-token=32768
25 | ---
26 | # Service for port-forwarding to the model:
27 | #
28 | # while true; do kubectl port-forward service/dev-model 7000:7000; done
29 | #
30 | apiVersion: v1
31 | kind: Service
32 | metadata:
33 | name: dev-model
34 | spec:
35 | selector:
36 | model: dev
37 | ports:
38 | - protocol: TCP
39 | port: 7000
40 | targetPort: 8000
--------------------------------------------------------------------------------
/hack/dev-models/kind-vllm-cpu.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeai.org/v1
2 | kind: Model
3 | metadata:
4 | name: dev
5 | annotations:
6 | # Have the controller send requests to localhost to allow for
7 | # running the controller locally (assuming a port-forward is in place).
8 | model-pod-ip: "127.0.0.1"
9 | model-pod-port: "7000"
10 | spec:
11 | features: ["TextGeneration"]
12 | owner: facebook
13 | url: hf://facebook/opt-125m
14 | engine: VLLM
15 | resourceProfile: cpu:1
16 | minReplicas: 1
17 | maxReplicas: 3
18 | args:
19 | # This revision does not contain its own chat template.
20 | - --revision=27dcfa74d334bc871f3234de431e71c6eeba5dd6
21 | - --chat-template=/config/chat-template.jinja
22 | - --swap-space=1
23 | env:
24 | VLLM_CPU_KVCACHE_SPACE: "2"
25 | files:
26 | - path: "/config/chat-template.jinja"
27 | content: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n' + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}"
28 | - path: "/config/prompt.txt"
29 | content: "prompt content"
30 | ---
31 | # Service for port-forwarding to the model:
32 | #
33 | # while true; do kubectl port-forward service/dev-model 7000:7000; done
34 | #
35 | apiVersion: v1
36 | kind: Service
37 | metadata:
38 | name: dev-model
39 | spec:
40 | selector:
41 | model: dev
42 | ports:
43 | - protocol: TCP
44 | port: 7000
45 | targetPort: 8000
--------------------------------------------------------------------------------
/hack/dev-models/vllm-chat.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeai.org/v1
2 | kind: Model
3 | metadata:
4 | name: tinyllama-chat
5 | spec:
6 | features: [TextGeneration]
7 | owner: meta-llama
8 | url: hf://TinyLlama/TinyLlama-1.1B-Chat-v1.0
9 | #adapters:
10 | #- name: foo
11 | # url: hf://jashing/tinyllama-colorist-lora
12 | #- name: bar
13 | # url: s3://substratus-ai-test-0/adapters/jashing/tinyllama-colorist-lora
14 | #- name: baz
15 | # url: gs://substratus-ai-test-0/adapters/jashing/tinyllama-colorist-lora
16 | engine: VLLM
17 | resourceProfile: nvidia-gpu-l4:1
18 | minReplicas: 1
--------------------------------------------------------------------------------
/hack/dev-models/vllm-gs-url.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: gs-opt-125m-cpu
6 | spec:
7 | features: [TextGeneration]
8 | owner: facebook
9 | url: gs://substratus-ai-test-0/models/facebook/opt-125m
10 | cacheProfile: standard-filestore
11 | engine: VLLM
12 | resourceProfile: cpu:4
13 | minReplicas: 1
14 |
--------------------------------------------------------------------------------
/hack/dev-models/vllm-s3-url.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: s3-opt-125m-cpu
6 | spec:
7 | features: [TextGeneration]
8 | owner: facebook
9 | url: s3://substratus-ai-test-0/models/facebook/opt-125m
10 | cacheProfile: standard-filestore
11 | engine: VLLM
12 | resourceProfile: cpu:4
13 | minReplicas: 1
14 |
--------------------------------------------------------------------------------
/hack/dev-models/vllm-with-adapters.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeai.org/v1
2 | kind: Model
3 | metadata:
4 | name: tinyllama-chat-adapters
5 | spec:
6 | features: [TextGeneration]
7 | owner: meta-llama
8 | url: hf://TinyLlama/TinyLlama-1.1B-Chat-v0.3
9 | adapters:
10 | - name: foo
11 | url: hf://jashing/tinyllama-colorist-lora
12 | - name: bar
13 | url: s3://substratus-ai-test-0/adapters/jashing/tinyllama-colorist-lora
14 | #- name: baz
15 | # url: gs://substratus-ai-test-0/adapters/jashing/tinyllama-colorist-lora
16 | engine: VLLM
17 | resourceProfile: nvidia-gpu-l4:1
18 | minReplicas: 1
--------------------------------------------------------------------------------
/hack/pvs/preprov-filestore.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolume
3 | metadata:
4 | name: preprov1
5 | spec:
6 | storageClassName: ""
7 | capacity:
8 | storage: 1Ti
9 | accessModes:
10 | - ReadWriteMany
11 | persistentVolumeReclaimPolicy: Retain
12 | volumeMode: Filesystem
13 | csi:
14 | driver: filestore.csi.storage.gke.io
15 | volumeHandle: "modeInstance/us-central1-f/preprov1/vol1"
16 | volumeAttributes:
17 | # Replace with IP from created Filestore instance:
18 | ip: "10.100.234.50"
19 | volume: vol1
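20 | # Referenced by cacheProfiles.fstore.sharedFilesystem.persistentVolumeName ("preprov1")
21 | # in hack/dev-configs/gke.yaml and hack/dev-configs/kind.yaml.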
--------------------------------------------------------------------------------
/hack/vllm-mock-metrics/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "log"
5 | "net/http"
6 | "os"
7 | )
8 |
9 | func main() {
10 | // Serve the contents of metrics.txt (read from the working directory) on :8888 for any request path, including /metrics.
11 | metrics, err := os.ReadFile("metrics.txt")
12 | if err != nil {
13 | log.Fatal(err)
14 | }
15 | log.Println("starting")
16 | log.Fatal(http.ListenAndServe(":8888", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
17 | log.Println("serving")
18 | w.Write(metrics)
19 | })))
20 | }
21 |
--------------------------------------------------------------------------------
/hack/volume-debug-pod.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: volume-debug-pod
5 | spec:
6 | containers:
7 | - name: main
8 | image: ubuntu
9 | command: ["sleep", "10000"]
10 | volumeMounts:
11 | - name: models
12 | mountPath: /my-mnt
13 | volumes:
14 | - name: models
15 | persistentVolumeClaim:
16 | claimName: shared-model-cache-fstore
--------------------------------------------------------------------------------
/internal/apiutils/model.go:
--------------------------------------------------------------------------------
1 | package apiutils
2 |
3 | import "strings"
4 |
5 | const (
6 | // adapterSeparator is the separator used to split model and adapter names
7 | // in API requests.
8 | //
9 | // Alternatives considered:
10 | //
11 | // "-" (hyphen): This is a common separator in Kubernetes resource names.
12 | // "." (dot): This is a common separator in model versions "llama-3.2".
13 | // "/" (slash): This would be incompatible with specifying model names inbetween slashes in URL paths (i.e. "/some-api/models//details").
14 | // ":" (colon): This might cause problems when specifying model names before colons in URL paths (see example below).
15 | //
16 | // See example of a path used in the Gemini API (https://ai.google.dev/gemini-api/docs/text-generation?lang=rest):
17 | // "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?key=$GOOGLE_API_KEY"
18 | adapterSeparator = "_"
19 | )
20 |
21 | // SplitModelAdapter splits a requested model name into KubeAI
22 | // Model.metadata.name and Model.spec.adapters[].name.
23 | func SplitModelAdapter(s string) (model, adapter string) {
24 | parts := strings.SplitN(s, adapterSeparator, 2)
25 | if len(parts) == 1 {
26 | return parts[0], ""
27 | }
28 | return parts[0], parts[1]
29 | }
30 |
31 | // MergeModelAdapter merges a model and adapter name into a single string.
32 | func MergeModelAdapter(model, adapter string) string {
33 | if adapter == "" {
34 | return model
35 | }
36 | return model + adapterSeparator + adapter
37 | }
38 |
--------------------------------------------------------------------------------
/internal/apiutils/model_test.go:
--------------------------------------------------------------------------------
1 | package apiutils_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/stretchr/testify/require"
7 | "github.com/substratusai/kubeai/internal/apiutils"
8 | )
9 |
10 | func TestSplitModelAdapter(t *testing.T) {
11 | t.Parallel()
12 |
13 | cases := map[string]struct {
14 | input string
15 | expModel, expAdapter string
16 | }{
17 | "empty input": {
18 | input: "",
19 | expModel: "",
20 | expAdapter: "",
21 | },
22 | "model only": {
23 | input: "my-model",
24 | expModel: "my-model",
25 | },
26 | "model and adapter": {
27 | input: "my-model_my-adapter",
28 | expModel: "my-model",
29 | expAdapter: "my-adapter",
30 | },
31 | "too many separators": {
32 | input: "my-model_my-adapter_extra",
33 | expModel: "my-model",
34 | expAdapter: "my-adapter_extra",
35 | },
36 | "trailing": {
37 | input: "my-model_",
38 | expModel: "my-model",
39 | expAdapter: "",
40 | },
41 | }
42 |
43 | for name, spec := range cases {
44 | t.Run(name, func(t *testing.T) {
45 | t.Parallel()
46 | model, adapter := apiutils.SplitModelAdapter(spec.input)
47 | require.Equal(t, spec.expModel, model, "model")
48 | require.Equal(t, spec.expAdapter, adapter, "adapter")
49 | })
50 | }
51 | }
52 |
53 | func TestMergeModelAdapter(t *testing.T) {
54 | t.Parallel()
55 |
56 | cases := map[string]struct {
57 | model, adapter, exp string
58 | }{
59 | "model only": {
60 | model: "my-model",
61 | exp: "my-model",
62 | },
63 | "model and adapter": {
64 | model: "my-model",
65 | adapter: "my-adapter",
66 | exp: "my-model_my-adapter",
67 | },
68 | }
69 |
70 | for name, spec := range cases {
71 | t.Run(name, func(t *testing.T) {
72 | t.Parallel()
73 | merged := apiutils.MergeModelAdapter(spec.model, spec.adapter)
74 | require.Equal(t, spec.exp, merged)
75 | })
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/internal/config/system_test.go:
--------------------------------------------------------------------------------
1 | package config_test
2 |
3 | import (
4 | "testing"
5 | "time"
6 |
7 | "github.com/stretchr/testify/require"
8 | "github.com/substratusai/kubeai/internal/config"
9 | )
10 |
11 | func TestAutoscalingConfig(t *testing.T) {
12 | cases := []struct {
13 | name string
14 | cfg config.ModelAutoscaling
15 | scaleDownDelaySeconds int64
16 | expectedRequiredConsecutiveScaleDowns int
17 | expectedAverageWindowCount int
18 | }{
19 | {
20 | name: "default",
21 | cfg: config.ModelAutoscaling{
22 | Interval: config.Duration{Duration: 10 * time.Second},
23 | TimeWindow: config.Duration{Duration: 10 * time.Minute},
24 | },
25 | scaleDownDelaySeconds: 30,
26 | expectedRequiredConsecutiveScaleDowns: 3,
27 | // 10 * 60 / 10
28 | expectedAverageWindowCount: 60,
29 | },
30 | {
31 | name: "even",
32 | cfg: config.ModelAutoscaling{
33 | Interval: config.Duration{Duration: 1 * time.Second},
34 | TimeWindow: config.Duration{Duration: 10 * time.Second},
35 | },
36 | scaleDownDelaySeconds: 10,
37 | expectedRequiredConsecutiveScaleDowns: 10,
38 | expectedAverageWindowCount: 10,
39 | },
40 | {
41 | name: "with-remainder",
42 | cfg: config.ModelAutoscaling{
43 | Interval: config.Duration{Duration: 2 * time.Second},
44 | TimeWindow: config.Duration{Duration: 5 * time.Second},
45 | },
46 | scaleDownDelaySeconds: 3,
47 | expectedRequiredConsecutiveScaleDowns: 2,
48 | expectedAverageWindowCount: 3,
49 | },
50 | }
51 |
52 | for _, c := range cases {
53 | t.Run(c.name, func(t *testing.T) {
54 | require.Equal(t, c.expectedRequiredConsecutiveScaleDowns, c.cfg.RequiredConsecutiveScaleDowns(c.scaleDownDelaySeconds))
55 | })
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/internal/k8sutils/apply.go:
--------------------------------------------------------------------------------
1 | package k8sutils
2 |
3 | import (
4 | "context"
5 | "fmt"
6 |
7 | "sigs.k8s.io/controller-runtime/pkg/client"
8 | )
9 |
10 | func ServerSideApply(ctx context.Context, cl client.Client, obj client.Object, controllerName string) error {
11 | gvk, err := ObjectToGroupVersionKind(cl.Scheme(), obj)
12 | if err != nil {
13 | return fmt.Errorf("getting group version kind: %w", err)
14 | }
15 | obj.GetObjectKind().SetGroupVersionKind(gvk)
16 | return cl.Patch(ctx, obj, client.Apply, client.FieldOwner(controllerName), client.ForceOwnership)
17 | }
18 |
--------------------------------------------------------------------------------
/internal/k8sutils/client_options.go:
--------------------------------------------------------------------------------
1 | package k8sutils
2 |
3 | import "sigs.k8s.io/controller-runtime/pkg/client"
4 |
5 | const ManagerName = "kubeai-manager"
6 |
7 | func DefaultUpdateOptions() *client.UpdateOptions {
8 | return &client.UpdateOptions{
9 | FieldManager: ManagerName,
10 | }
11 | }
12 |
13 | func DefaultSubResourceUpdateOptions() *client.UpdateOptions {
14 | return &client.UpdateOptions{
15 | FieldManager: ManagerName,
16 | }
17 | }
18 |
19 | func DefaultCreateOptions() *client.CreateOptions {
20 | return &client.CreateOptions{
21 | FieldManager: ManagerName,
22 | }
23 | }
24 |
25 | func DefaultPatchOptions() *client.PatchOptions {
26 | return &client.PatchOptions{
27 | FieldManager: ManagerName,
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/internal/k8sutils/gvk.go:
--------------------------------------------------------------------------------
1 | package k8sutils
2 |
3 | import (
4 | "fmt"
5 |
6 | "k8s.io/apimachinery/pkg/runtime"
7 | "k8s.io/apimachinery/pkg/runtime/schema"
8 | "sigs.k8s.io/controller-runtime/pkg/client"
9 | )
10 |
11 | func ObjectToGroupKind(s *runtime.Scheme, o client.Object) (schema.GroupKind, error) {
12 | gvks, _, err := s.ObjectKinds(o)
13 | if err != nil {
14 | return schema.GroupKind{}, err
15 | }
16 | if len(gvks) == 0 {
17 | return schema.GroupKind{}, fmt.Errorf("no group kind for object")
18 | }
19 | return schema.GroupKind{
20 | Group: gvks[0].Group,
21 | Kind: gvks[0].Kind,
22 | }, nil
23 | }
24 |
25 | func ObjectToGroupVersionKind(s *runtime.Scheme, o client.Object) (schema.GroupVersionKind, error) {
26 | gvks, _, err := s.ObjectKinds(o)
27 | if err != nil {
28 | return schema.GroupVersionKind{}, err
29 | }
30 | if len(gvks) == 0 {
31 | return schema.GroupVersionKind{}, fmt.Errorf("no group version kind for object")
32 | }
33 | return schema.GroupVersionKind{
34 | Group: gvks[0].Group,
35 | Version: gvks[0].Version,
36 | Kind: gvks[0].Kind,
37 | }, nil
38 | }
39 |
--------------------------------------------------------------------------------
/internal/k8sutils/jobs.go:
--------------------------------------------------------------------------------
1 | package k8sutils
2 |
3 | import (
4 | batchv1 "k8s.io/api/batch/v1"
5 | corev1 "k8s.io/api/core/v1"
6 | )
7 |
8 | func IsJobCompleted(job *batchv1.Job) bool {
9 | for _, cond := range job.Status.Conditions {
10 | if cond.Type == batchv1.JobComplete && cond.Status == corev1.ConditionTrue {
11 | return true
12 | }
13 | }
14 | return false
15 | }
16 |
--------------------------------------------------------------------------------
/internal/k8sutils/meta.go:
--------------------------------------------------------------------------------
1 | package k8sutils
2 |
3 | import "sigs.k8s.io/controller-runtime/pkg/client"
4 |
5 | func SetLabel(obj client.Object, key, value string) {
6 | labels := obj.GetLabels()
7 | if labels == nil {
8 | labels = make(map[string]string)
9 | obj.SetLabels(labels)
10 | }
11 | labels[key] = value
12 | }
13 |
14 | func SetAnnotation(obj client.Object, key, value string) {
15 | annotations := obj.GetAnnotations()
16 | if annotations == nil {
17 | annotations = make(map[string]string)
18 | obj.SetAnnotations(annotations)
19 | }
20 | annotations[key] = value
21 | }
22 |
23 | func GetLabel(obj client.Object, key string) string {
24 | labels := obj.GetLabels()
25 | if labels == nil {
26 | return ""
27 | }
28 | return labels[key]
29 | }
30 |
31 | func GetAnnotation(obj client.Object, key string) string {
32 | annotations := obj.GetAnnotations()
33 | if annotations == nil {
34 | return ""
35 | }
36 | return annotations[key]
37 | }
38 |
--------------------------------------------------------------------------------
/internal/k8sutils/pods.go:
--------------------------------------------------------------------------------
1 | package k8sutils
2 |
3 | import (
4 | "fmt"
5 | "hash"
6 | "hash/fnv"
7 |
8 | corev1 "k8s.io/api/core/v1"
9 | "k8s.io/apimachinery/pkg/util/dump"
10 | "k8s.io/apimachinery/pkg/util/rand"
11 | )
12 |
13 | func PodIsScheduled(pod *corev1.Pod) bool {
14 | return pod.Spec.NodeName != ""
15 | }
16 |
17 | func PodIsReady(pod *corev1.Pod) bool {
18 | for _, cond := range pod.Status.Conditions {
19 | if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue {
20 | return true
21 | }
22 | }
23 | return false
24 | }
25 |
26 | // PodHash returns a hash value calculated from Pod spec.
27 | // Inspired by k8s.io/kubernetes/pkg/controller.ComputeHash()
28 | func PodHash(podSpec corev1.PodSpec) string {
29 | podTemplateSpecHasher := fnv.New32a()
30 | DeepHashObject(podTemplateSpecHasher, podSpec)
31 |
32 | // TODO: Implement collision detection if needed.
33 | //// Add collisionCount in the hash if it exists.
34 | //if collisionCount != nil {
35 | // collisionCountBytes := make([]byte, 8)
36 | // binary.LittleEndian.PutUint32(collisionCountBytes, uint32(*collisionCount))
37 | // podTemplateSpecHasher.Write(collisionCountBytes)
38 | //}
39 |
40 | return rand.SafeEncodeString(fmt.Sprint(podTemplateSpecHasher.Sum32()))
41 | }
42 |
43 | // StringHash returns a hash value calculated from the input string.
44 | func StringHash(s string) string {
45 | h := fnv.New32a()
46 | h.Write([]byte(s))
47 | return rand.SafeEncodeString(fmt.Sprint(h.Sum32()))
48 | }
49 |
50 | // DeepHashObject writes specified object to hash using the spew library
51 | // which follows pointers and prints actual values of the nested objects
52 | // ensuring the hash does not change when a pointer changes.
53 | // Copied from k8s.io/kubernetes/pkg/util/hash to avoid dependency on k8s.io/kubernetes.
54 | func DeepHashObject(hasher hash.Hash, objectToWrite interface{}) {
55 | hasher.Reset()
56 | fmt.Fprintf(hasher, "%v", dump.ForHash(objectToWrite))
57 | }
58 |
59 | func ContainerIsReady(pod *corev1.Pod, containerName string) bool {
60 | for _, status := range pod.Status.ContainerStatuses {
61 | if status.Name == containerName {
62 | return status.Ready
63 | }
64 | }
65 | return false
66 | }
67 |
--------------------------------------------------------------------------------
/internal/loadbalancer/balance_least_load.go:
--------------------------------------------------------------------------------
1 | package loadbalancer
2 |
3 | func (g *group) getAddrLeastLoad(adapter string) (endpoint, bool) {
4 | var bestEp endpoint
5 | var found bool
6 | var minInFlight int
7 | for _, ep := range g.endpoints {
8 | if adapter != "" {
9 | // Skip endpoints that don't have the requested adapter.
10 | if _, ok := ep.adapters[adapter]; !ok {
11 | continue
12 | }
13 | }
14 | inFlight := int(ep.inFlight.Load())
15 | if !found || inFlight < minInFlight {
16 | bestEp = ep
17 | found = true
18 | minInFlight = inFlight
19 | }
20 | }
21 |
22 | return bestEp, found
23 | }
24 |
--------------------------------------------------------------------------------
/internal/loadbalancer/group_bench_test.go:
--------------------------------------------------------------------------------
1 | package loadbalancer
2 |
3 | import (
4 | "context"
5 | "testing"
6 |
7 | v1 "github.com/substratusai/kubeai/api/k8s/v1"
8 | "github.com/substratusai/kubeai/internal/apiutils"
9 | )
10 |
11 | func BenchmarkEndpointGroup(b *testing.B) {
12 | e := newEndpointGroup(v1.LoadBalancing{PrefixHash: v1.PrefixHash{Replication: 100}})
13 | e.reconcileEndpoints(map[string]endpoint{"pod1": {address: "10.0.0.1:8000"}})
14 | b.ResetTimer()
15 | b.RunParallel(func(pb *testing.PB) {
16 | for pb.Next() {
17 | _, f, err := e.getBestAddr(context.Background(), &apiutils.Request{}, false)
18 | if err != nil {
19 | b.Fatal(err)
20 | }
21 | f()
22 | }
23 | })
24 | }
25 |
--------------------------------------------------------------------------------
/internal/manager/configure.go:
--------------------------------------------------------------------------------
1 | package manager
2 |
3 | import (
4 | "os"
5 |
6 | "github.com/substratusai/kubeai/internal/config"
7 | "sigs.k8s.io/yaml"
8 | )
9 |
10 | func LoadConfigFile(path string) (config.System, error) {
11 | contents, err := os.ReadFile(path)
12 | if err != nil {
13 | return config.System{}, err
14 | }
15 | var cfg config.System
16 | if err := yaml.Unmarshal(contents, &cfg); err != nil {
17 | return config.System{}, err
18 | }
19 |
20 | return cfg, nil
21 | }
22 |
--------------------------------------------------------------------------------
/internal/modelautoscaler/state.go:
--------------------------------------------------------------------------------
1 | package modelautoscaler
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "fmt"
7 | "log"
8 | "time"
9 |
10 | corev1 "k8s.io/api/core/v1"
11 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
12 | "k8s.io/apimachinery/pkg/types"
13 | "sigs.k8s.io/controller-runtime/pkg/client"
14 | )
15 |
16 | func newTotalModelState() totalModelState {
17 | return totalModelState{
18 | Models: make(map[string]modelState),
19 | LastCalculationTime: time.Now(),
20 | }
21 | }
22 |
23 | type totalModelState struct {
24 | Models map[string]modelState `json:"models"`
25 | LastCalculationTime time.Time `json:"lastCalculationTime"`
26 | }
27 |
28 | type modelState struct {
29 | AverageActiveRequests float64 `json:"averageActiveRequests"`
30 | }
31 |
32 | func (a *Autoscaler) loadLastTotalModelState(ctx context.Context) (totalModelState, error) {
33 | cm := &corev1.ConfigMap{}
34 | if err := a.k8sClient.Get(ctx, a.stateConfigMapRef, cm); err != nil {
35 | return totalModelState{}, fmt.Errorf("get ConfigMap %q: %w", a.stateConfigMapRef, err)
36 | }
37 | const key = "models"
38 | jsonState, ok := cm.Data[key]
39 | if !ok {
40 | log.Printf("Autoscaler state ConfigMap %q has no key %q, state not loaded", key, a.stateConfigMapRef)
41 | return totalModelState{}, nil
42 | }
43 | tms := totalModelState{}
44 | if err := json.Unmarshal([]byte(jsonState), &tms); err != nil {
45 | return totalModelState{}, fmt.Errorf("unmarshalling state: %w", err)
46 | }
47 | return tms, nil
48 | }
49 |
50 | func (a *Autoscaler) saveTotalModelState(ctx context.Context, state totalModelState) error {
51 | jsonState, err := json.Marshal(state)
52 | if err != nil {
53 | return fmt.Errorf("marshalling state: %w", err)
54 | }
55 | patch := fmt.Sprintf(`{"data":{"models":%q}}`, string(jsonState))
56 | if err := a.k8sClient.Patch(ctx, &corev1.ConfigMap{
57 | ObjectMeta: metav1.ObjectMeta{
58 | Namespace: a.stateConfigMapRef.Namespace,
59 | Name: a.stateConfigMapRef.Name,
60 | },
61 | }, client.RawPatch(types.StrategicMergePatchType, []byte(patch))); err != nil {
62 | return fmt.Errorf("patching ConfigMap %q: %w", a.stateConfigMapRef, err)
63 | }
64 | return nil
65 | }
66 |
--------------------------------------------------------------------------------
/internal/modelclient/client.go:
--------------------------------------------------------------------------------
1 | package modelclient
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "sync"
7 |
8 | kubeaiv1 "github.com/substratusai/kubeai/api/k8s/v1"
9 | apierrors "k8s.io/apimachinery/pkg/api/errors"
10 | "k8s.io/apimachinery/pkg/labels"
11 | "k8s.io/apimachinery/pkg/types"
12 | "sigs.k8s.io/controller-runtime/pkg/client"
13 | )
14 |
15 | type ModelClient struct {
16 | client client.Client
17 | namespace string
18 | consecutiveScaleDownsMtx sync.RWMutex
19 | consecutiveScaleDowns map[string]int
20 | }
21 |
22 | func NewModelClient(client client.Client, namespace string) *ModelClient {
23 | return &ModelClient{client: client, namespace: namespace, consecutiveScaleDowns: map[string]int{}}
24 | }
25 |
26 | // LookupModel checks if a model exists and matches the given label selectors.
27 | func (c *ModelClient) LookupModel(ctx context.Context, model, adapter string, labelSelectors []string) (*kubeaiv1.Model, error) {
28 | m := &kubeaiv1.Model{}
29 | if err := c.client.Get(ctx, types.NamespacedName{Name: model, Namespace: c.namespace}, m); err != nil {
30 | if apierrors.IsNotFound(err) {
31 | return nil, nil
32 | }
33 | return nil, err
34 | }
35 |
36 | modelLabels := m.GetLabels()
37 | if modelLabels == nil {
38 | modelLabels = map[string]string{}
39 | }
40 | for _, sel := range labelSelectors {
41 | parsedSel, err := labels.Parse(sel)
42 | if err != nil {
43 | return nil, fmt.Errorf("parse label selector: %w", err)
44 | }
45 | if !parsedSel.Matches(labels.Set(modelLabels)) {
46 | return nil, nil
47 | }
48 | }
49 |
50 | if adapter != "" {
51 | adapterFound := false
52 | for _, a := range m.Spec.Adapters {
53 | if a.Name == adapter {
54 | adapterFound = true
55 | break
56 | }
57 | }
58 | if !adapterFound {
59 | return nil, nil
60 | }
61 | }
62 |
63 | return m, nil
64 | }
65 |
66 | func (c *ModelClient) ListAllModels(ctx context.Context) ([]kubeaiv1.Model, error) {
67 | models := &kubeaiv1.ModelList{}
68 | if err := c.client.List(ctx, models, client.InNamespace(c.namespace)); err != nil {
69 | return nil, fmt.Errorf("list models: %w", err)
70 | }
71 |
72 | return models.Items, nil
73 | }
74 |
--------------------------------------------------------------------------------
/internal/modelcontroller/patch.go:
--------------------------------------------------------------------------------
1 | package modelcontroller
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 |
7 | "github.com/substratusai/kubeai/internal/config"
8 | jsonpatch "gopkg.in/evanphx/json-patch.v4"
9 | corev1 "k8s.io/api/core/v1"
10 | )
11 |
12 | func applyJSONPatchToPod(patches []config.JSONPatch, pod *corev1.Pod) error {
13 | if len(patches) == 0 {
14 | return nil
15 | }
16 |
17 | pb, err := json.Marshal(patches)
18 | if err != nil {
19 | return fmt.Errorf("marshal pod patch: %w", err)
20 | }
21 |
22 | patch, err := jsonpatch.DecodePatch(pb)
23 | if err != nil {
24 | return fmt.Errorf("decode pod patch: %w", err)
25 | }
26 |
27 | podJson, err := json.Marshal(pod)
28 | if err != nil {
29 | return fmt.Errorf("marshal pod: %w", err)
30 | }
31 |
32 | patchedPodJson, err := patch.Apply(podJson)
33 | if err != nil {
34 | return fmt.Errorf("apply pod patch: %w", err)
35 | }
36 |
37 | patchedPod := &corev1.Pod{}
38 | if err := json.Unmarshal(patchedPodJson, patchedPod); err != nil {
39 | return fmt.Errorf("unmarshal patched pod: %w", err)
40 | }
41 | *pod = *patchedPod
42 | return nil
43 | }
44 |
--------------------------------------------------------------------------------
/internal/modelcontroller/pod_utils.go:
--------------------------------------------------------------------------------
1 | package modelcontroller
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "os"
7 |
8 | corev1 "k8s.io/api/core/v1"
9 | "k8s.io/apimachinery/pkg/runtime"
10 | "k8s.io/client-go/tools/remotecommand"
11 | )
12 |
13 | func (r *ModelReconciler) execPod(ctx context.Context, pod *corev1.Pod, container string, command []string) error {
14 | execReq := r.PodRESTClient.
15 | Post().
16 | Namespace(pod.Namespace).
17 | Resource("pods").
18 | Name(pod.Name).
19 | SubResource("exec").
20 | VersionedParams(&corev1.PodExecOptions{
21 | Container: container,
22 | Command: command,
23 | Stdin: true,
24 | Stdout: true,
25 | Stderr: true,
26 | }, runtime.NewParameterCodec(r.Scheme))
27 |
28 | exec, err := remotecommand.NewSPDYExecutor(r.RESTConfig, "POST", execReq.URL())
29 | if err != nil {
30 | return fmt.Errorf("creating remote command executor: %w", err)
31 | }
32 |
33 | if err := exec.StreamWithContext(ctx, remotecommand.StreamOptions{
34 | Stdin: os.Stdin,
35 | Stdout: os.Stdout,
36 | Stderr: os.Stderr,
37 | Tty: false,
38 | }); err != nil {
39 | return fmt.Errorf("streaming: %w", err)
40 | }
41 |
42 | return nil
43 | }
44 |
45 | func (r *ModelReconciler) updatePodRemoveLabel(ctx context.Context, pod *corev1.Pod, key string) error {
46 | if pod.Labels == nil {
47 | return nil
48 | }
49 | delete(pod.Labels, key)
50 | if err := r.Client.Update(ctx, pod); err != nil {
51 | return fmt.Errorf("update pod labels: %w", err)
52 | }
53 | return nil
54 | }
55 |
56 | func (r *ModelReconciler) updatePodAddLabel(ctx context.Context, pod *corev1.Pod, key, value string) error {
57 | if pod.Labels == nil {
58 | pod.Labels = make(map[string]string)
59 | }
60 | pod.Labels[key] = value
61 | if err := r.Client.Update(ctx, pod); err != nil {
62 | return fmt.Errorf("update pod labels: %w", err)
63 | }
64 | return nil
65 | }
66 |
--------------------------------------------------------------------------------
/internal/movingaverage/simple.go:
--------------------------------------------------------------------------------
1 | package movingaverage
2 |
3 | import (
4 | "sync"
5 | )
6 |
7 | // Simple keeps track of a history of measurements and returns the average.
8 | // One important feature of this implementation is that the average can go to zero.
9 | // All methods are thread safe.
10 | //
11 | // Alternative: consider exponential moving average where near-zero values are treated
12 | // as zero (for scale to zero):
13 | //
14 | // func MovingExpAvg(value, oldValue, fdtime, ftime float64) float64 {
15 | // alpha := 1.0 - math.Exp(-fdtime/ftime)
16 | // r := alpha * value + (1.0 - alpha) * oldValue
17 | // return r
18 | // }
19 | type Simple struct {
20 | mtx sync.Mutex
21 | history []float64
22 | index int
23 | }
24 |
25 | func NewSimple(seed []float64) *Simple {
26 | return &Simple{
27 | history: seed,
28 | }
29 | }
30 |
31 | func (a *Simple) Next(next float64) {
32 | a.mtx.Lock()
33 | a.history[a.index] = next
34 | a.index++
35 | if a.index == len(a.history) {
36 | a.index = 0
37 | }
38 | a.mtx.Unlock()
39 | }
40 |
41 | func (a *Simple) History() []float64 {
42 | a.mtx.Lock()
43 | result := make([]float64, len(a.history))
44 | copy(result, a.history)
45 | a.mtx.Unlock()
46 |
47 | return result
48 | }
49 |
50 | func (a *Simple) Calculate() (result float64) {
51 | a.mtx.Lock()
52 | for _, p := range a.history {
53 | result += p
54 | }
55 | result /= float64(len(a.history))
56 | a.mtx.Unlock()
57 |
58 | return result
59 | }
60 |
--------------------------------------------------------------------------------
/internal/movingaverage/simple_test.go:
--------------------------------------------------------------------------------
1 | package movingaverage_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/substratusai/kubeai/internal/movingaverage"
7 | )
8 |
9 | func TestSimple(t *testing.T) {
10 | cases := []struct {
11 | name string
12 | seed []float64
13 | values []float64
14 | want float64
15 | }{
16 | {
17 | name: "1-2-3",
18 | seed: []float64{0, 0, 0},
19 | values: []float64{1, 2, 3},
20 | want: 2,
21 | },
22 | {
23 | name: "3-2-1",
24 | seed: make([]float64, 3),
25 | values: []float64{3, 2, 1},
26 | want: 2,
27 | },
28 | {
29 | name: "3-2-1-1-1-1",
30 | seed: make([]float64, 3),
31 | values: []float64{3, 2, 1, 1, 1, 1},
32 | want: 1,
33 | },
34 | {
35 | name: "2-3",
36 | seed: make([]float64, 2),
37 | values: []float64{2, 3},
38 | want: 2.5,
39 | },
40 | {
41 | name: "2-2-2",
42 | seed: []float64{0, 0, 0},
43 | values: []float64{2, 2, 2},
44 | want: 2,
45 | },
46 | }
47 | for _, tc := range cases {
48 | t.Run(tc.name, func(t *testing.T) {
49 | a := movingaverage.NewSimple(tc.seed)
50 | for _, v := range tc.values {
51 | a.Next(v)
52 | }
53 | got := a.Calculate()
54 | if got != tc.want {
55 | t.Errorf("got %v; want %v", got, tc.want)
56 | }
57 | })
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/manifests/models/bge-embed-text-cpu.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: bge-embed-text-cpu
6 | spec:
7 | features: [TextEmbedding]
8 | url: hf://BAAI/bge-small-en-v1.5
9 | engine: Infinity
10 | minReplicas: 0
11 | resourceProfile: cpu:1
12 |
--------------------------------------------------------------------------------
/manifests/models/deepseek-r1-1.5b-cpu.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: deepseek-r1-1.5b-cpu
6 | spec:
7 | features: [TextGeneration]
8 | url: ollama://deepseek-r1:1.5b
9 | engine: OLlama
10 | minReplicas: 0
11 | resourceProfile: cpu:1
12 |
--------------------------------------------------------------------------------
/manifests/models/deepseek-r1-70b-gh200-fp8.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: deepseek-r1-70b-gh200-fp8
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
9 | engine: VLLM
10 | args:
11 | - --max-model-len=32768
12 | - --max-num-batched-token=32768
13 | - --gpu-memory-utilization=0.95
14 | - --kv-cache-dtype=fp8
15 | - --enable-prefix-caching
16 | - --disable-log-requests
17 | minReplicas: 0
18 | resourceProfile: nvidia-gpu-gh200:1
19 |
--------------------------------------------------------------------------------
/manifests/models/deepseek-r1-70b-gh200.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: deepseek-r1-70b-gh200
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://deepseek-ai/DeepSeek-R1-Distill-Llama-70B
9 | engine: VLLM
10 | args:
11 | - --max-model-len=32768
12 | - --max-num-batched-token=32768
13 | - --gpu-memory-utilization=0.95
14 | - --kv-cache-dtype=fp8
15 | - --cpu-offload-gb=120
16 | - --enable-prefix-caching
17 | - --disable-log-requests
18 | env:
19 | VLLM_ATTENTION_BACKEND: FLASHINFER
20 | minReplicas: 0
21 | resourceProfile: nvidia-gpu-gh200:1
22 |
--------------------------------------------------------------------------------
/manifests/models/deepseek-r1-distill-llama-8b-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: deepseek-r1-distill-llama-8b-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://deepseek-ai/DeepSeek-R1-Distill-Llama-8B
9 | engine: VLLM
10 | args:
11 | - --max-model-len=8192
12 | - --max-num-batched-token=8192
13 | - --max-num-seqs=256
14 | - --gpu-memory-utilization=0.95
15 | - --kv-cache-dtype=fp8
16 | - --disable-log-requests
17 | - --quantization=fp8
18 | - --enforce-eager
19 | env:
20 | VLLM_ATTENTION_BACKEND: FLASHINFER
21 | minReplicas: 0
22 | resourceProfile: nvidia-gpu-l4:1
23 |
--------------------------------------------------------------------------------
/manifests/models/deepseek-r1-distill-qwen-1.5b-rtx4070.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: deepseek-r1-distill-qwen-1.5b-rtx4070
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
9 | engine: VLLM
10 | args:
11 | - --max-model-len=2048
12 | - --max-num-batched-token=2048
13 | - --max-num-seqs=8
14 | - --kv-cache-dtype=fp8
15 | env:
16 | VLLM_USE_V1: "1"
17 | minReplicas: 0
18 | resourceProfile: nvidia-gpu-rtx4070-8gb:1
19 |
--------------------------------------------------------------------------------
/manifests/models/deepseek-r1-mi300x.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: deepseek-r1-mi300x
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://deepseek-ai/DeepSeek-R1
9 | engine: VLLM
10 | args:
11 | - --trust-remote-code
12 | - --max-model-len=32768
13 | - --max-num-batched-token=32768
14 | - --max-num-seqs=1024
15 | - --num-scheduler-steps=10
16 | - --tensor-parallel-size=8
17 | - --gpu-memory-utilization=0.90
18 | - --disable-log-requests
19 | - --enable-chunked-prefill=false
20 | - --max-seq-len-to-capture=16384
21 | - --kv-cache-dtype=fp8
22 | env:
23 | HIP_FORCE_DEV_KERNARG: "1"
24 | NCCL_MIN_NCHANNELS: "112"
25 | TORCH_BLAS_PREFER_HIPBLASLT: "1"
26 | VLLM_FP8_PADDING: "0"
27 | VLLM_USE_TRITON_FLASH_ATTN: "0"
28 | minReplicas: 0
29 | targetRequests: 1024
30 | resourceProfile: amd-gpu-mi300x:8
31 |
--------------------------------------------------------------------------------
/manifests/models/e5-mistral-7b-instruct-cpu.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: e5-mistral-7b-instruct-cpu
6 | spec:
7 | features: [TextEmbedding]
8 | url: hf://intfloat/e5-mistral-7b-instruct
9 | engine: VLLM
10 | args:
11 | - --gpu-memory-utilization=0.9
12 | minReplicas: 0
13 | resourceProfile: cpu:1
14 |
--------------------------------------------------------------------------------
/manifests/models/faster-whisper-medium-en-cpu.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: faster-whisper-medium-en-cpu
6 | spec:
7 | features: [SpeechToText]
8 | url: hf://Systran/faster-whisper-medium.en
9 | engine: FasterWhisper
10 | minReplicas: 0
11 | resourceProfile: cpu:1
12 |
--------------------------------------------------------------------------------
/manifests/models/gemma-2-9b-it-fp8-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: gemma-2-9b-it-fp8-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://neuralmagic/gemma-2-9b-it-FP8
9 | engine: VLLM
10 | args:
11 | - --max-model-len=4096
12 | - --max-num-batched-token=4096
13 | - --max-num-seqs=256
14 | - --gpu-memory-utilization=0.95
15 | - --kv-cache-dtype=fp8
16 | env:
17 | VLLM_USE_V1: "1"
18 | minReplicas: 0
19 | resourceProfile: nvidia-gpu-l4:1
20 |
--------------------------------------------------------------------------------
/manifests/models/gemma-27b-ollama-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: gemma-27b-ollama-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: ollama://gemma2:27b
9 | engine: OLlama
10 | minReplicas: 0
11 | resourceProfile: nvidia-gpu-l4:1
12 |
--------------------------------------------------------------------------------
/manifests/models/gemma-2b-it-tpu.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: gemma-2b-it-tpu
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://google/gemma-2b-it
9 | engine: VLLM
10 | args:
11 | - --disable-log-requests
12 | minReplicas: 0
13 | resourceProfile: google-tpu-v5e-1x1:1
14 |
--------------------------------------------------------------------------------
/manifests/models/gemma-3-12b-ollama-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: gemma-3-12b-ollama-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: ollama://gemma3:12b
9 | engine: OLlama
10 | minReplicas: 0
11 | resourceProfile: nvidia-gpu-l4:1
12 |
--------------------------------------------------------------------------------
/manifests/models/gemma-3-27b-ollama-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: gemma-3-27b-ollama-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: ollama://gemma3:27b
9 | engine: OLlama
10 | minReplicas: 0
11 | resourceProfile: nvidia-gpu-l4:1
12 |
--------------------------------------------------------------------------------
/manifests/models/gemma-9b-ollama-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: gemma-9b-ollama-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: ollama://gemma2:9b
9 | engine: OLlama
10 | minReplicas: 0
11 | resourceProfile: nvidia-gpu-l4:1
12 |
--------------------------------------------------------------------------------
/manifests/models/gemma2-2b-cpu.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: gemma2-2b-cpu
6 | spec:
7 | features: [TextGeneration]
8 | url: ollama://gemma2:2b
9 | engine: OLlama
10 | minReplicas: 0
11 | resourceProfile: cpu:2
12 |
--------------------------------------------------------------------------------
/manifests/models/granite-3.1-dense-ollama-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: granite-3.1-dense-ollama-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: ollama://granite3.1-dense
9 | engine: OLlama
10 | minReplicas: 0
11 | resourceProfile: nvidia-gpu-l4:1
12 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.1-405b-instruct-fp8-a100-80b.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-405b-instruct-fp8-a100-80b
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://neuralmagic/Meta-Llama-3.1-405B-Instruct-FP8
9 | engine: VLLM
10 | args:
11 | - --max-model-len=65536
12 | - --max-num-batched-token=65536
13 | - --gpu-memory-utilization=0.98
14 | - --tensor-parallel-size=8
15 | - --enable-prefix-caching
16 | - --disable-log-requests
17 | - --max-num-seqs=128
18 | - --kv-cache-dtype=fp8
19 | - --enforce-eager
20 | - --enable-chunked-prefill=false
21 | - --num-scheduler-steps=8
22 | env:
23 | VLLM_ATTENTION_BACKEND: FLASHINFER
24 | minReplicas: 0
25 | targetRequests: 128
26 | resourceProfile: nvidia-gpu-a100-80gb:8
27 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.1-405b-instruct-fp8-h100.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-405b-instruct-fp8-h100
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://neuralmagic/Meta-Llama-3.1-405B-Instruct-FP8
9 | engine: VLLM
10 | args:
11 | - --max-model-len=65536
12 | - --max-num-batched-token=65536
13 | - --gpu-memory-utilization=0.9
14 | - --tensor-parallel-size=8
15 | - --enable-prefix-caching
16 | - --disable-log-requests
17 | - --max-num-seqs=1024
18 | - --kv-cache-dtype=fp8
19 | minReplicas: 0
20 | targetRequests: 500
21 | resourceProfile: nvidia-gpu-h100:8
22 |
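As I read it, `targetRequests` is the per-replica level of in-flight requests the autoscaler aims for, so a rough capacity check is a ceiling division. This is a sizing sketch under that assumption, not a reproduction of the autoscaler's time-window averaging:

```bash
# Assumption: desired replicas ~= ceil(observed in-flight requests / targetRequests),
# then clamped by minReplicas/maxReplicas.
requests=1200; target=500
echo $(( (requests + target - 1) / target ))   # -> 3
```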
--------------------------------------------------------------------------------
/manifests/models/llama-3.1-405b-instruct-fp8-mi300x.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-405b-instruct-fp8-mi300x
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://amd/Llama-3.1-405B-Instruct-FP8-KV
9 | engine: VLLM
10 | args:
11 | - --max-model-len=120000
12 | - --max-num-batched-token=120000
13 | - --max-num-seqs=1024
14 | - --num-scheduler-steps=15
15 | - --tensor-parallel-size=8
16 | - --gpu-memory-utilization=0.90
17 | - --disable-log-requests
18 | - --kv-cache-dtype=fp8
19 | - --enable-chunked-prefill=false
20 | - --max-seq-len-to-capture=16384
21 | env:
22 | HIP_FORCE_DEV_KERNARG: "1"
23 | NCCL_MIN_NCHANNELS: "112"
24 | TORCH_BLAS_PREFER_HIPBLASLT: "1"
25 | VLLM_USE_TRITON_FLASH_ATTN: "0"
26 | minReplicas: 0
27 | targetRequests: 1024
28 | resourceProfile: amd-gpu-mi300x:8
29 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.1-70b-instruct-awq-int4-gh200.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-70b-instruct-awq-int4-gh200
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
9 | engine: VLLM
10 | args:
11 | - --max-model-len=16384
12 | - --max-num-batched-token=16384
13 | - --enable-prefix-caching
14 | - --disable-log-requests
15 | minReplicas: 0
16 | targetRequests: 50
17 | resourceProfile: nvidia-gpu-gh200:1
18 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.1-70b-instruct-fp8-1-h100.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-70b-instruct-fp8-1-h100
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
9 | engine: VLLM
10 | args:
11 | - --enable-prefix-caching
12 | - --max-model-len=16384
13 | - --max-num-batched-token=16384
14 | - --gpu-memory-utilization=0.95
15 | - --disable-log-requests
16 | - --kv-cache-dtype=fp8
17 | minReplicas: 0
18 | resourceProfile: nvidia-gpu-h100:1
19 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.1-70b-instruct-fp8-gh200.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-70b-instruct-fp8-gh200
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
9 | engine: VLLM
10 | args:
11 | - --max-model-len=32768
12 | - --max-num-batched-token=32768
13 | - --max-num-seqs=1024
14 | - --gpu-memory-utilization=0.9
15 | - --enable-prefix-caching
16 | - --enable-chunked-prefill=false
17 | - --disable-log-requests
18 | - --kv-cache-dtype=fp8
19 | - --enforce-eager
20 | env:
21 | VLLM_ATTENTION_BACKEND: FLASHINFER
22 | minReplicas: 0
23 | targetRequests: 1024
24 | resourceProfile: nvidia-gpu-gh200:1
25 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.1-70b-instruct-fp8-h100.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-70b-instruct-fp8-h100
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
9 | engine: VLLM
10 | args:
11 | - --max-model-len=65536
12 | - --max-num-batched-token=65536
13 | - --max-num-seqs=1024
14 | - --gpu-memory-utilization=0.9
15 | - --tensor-parallel-size=2
16 | - --enable-prefix-caching
17 | - --disable-log-requests
18 | minReplicas: 0
19 | targetRequests: 500
20 | resourceProfile: nvidia-gpu-h100:2
21 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.1-70b-instruct-fp8-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-70b-instruct-fp8-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8
9 | engine: VLLM
10 | args:
11 | - --max-model-len=32768
12 | - --max-num-batched-token=32768
13 | - --max-num-seqs=512
14 | - --gpu-memory-utilization=0.9
15 | - --pipeline-parallel-size=4
16 | - --tensor-parallel-size=2
17 | - --enable-prefix-caching
18 | - --enable-chunked-prefill=false
19 | - --disable-log-requests
20 | - --kv-cache-dtype=fp8
21 | - --enforce-eager
22 | env:
23 | VLLM_ATTENTION_BACKEND: FLASHINFER
24 | minReplicas: 0
25 | targetRequests: 500
26 | resourceProfile: nvidia-gpu-l4:8
27 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.1-70b-instruct-fp8-mi300x.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-70b-instruct-fp8-mi300x
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://amd/Llama-3.1-70B-Instruct-FP8-KV
9 | engine: VLLM
10 | args:
11 | - --max-model-len=120000
12 | - --max-num-batched-token=120000
13 | - --max-num-seqs=1024
14 | - --num-scheduler-steps=15
15 | - --gpu-memory-utilization=0.9
16 | - --disable-log-requests
17 | - --kv-cache-dtype=fp8
18 | - --enable-chunked-prefill=false
19 | - --max-seq-len-to-capture=16384
20 | env:
21 | HIP_FORCE_DEV_KERNARG: "1"
22 | NCCL_MIN_NCHANNELS: "112"
23 | TORCH_BLAS_PREFER_HIPBLASLT: "1"
24 | VLLM_USE_TRITON_FLASH_ATTN: "0"
25 | minReplicas: 0
26 | targetRequests: 1024
27 | resourceProfile: amd-gpu-mi300x:1
28 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.1-8b-instruct-cpu.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-8b-instruct-cpu
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://meta-llama/Meta-Llama-3.1-8B-Instruct
9 | engine: VLLM
10 | args:
11 | - --max-model-len=32768
12 | - --max-num-batched-token=32768
13 | env:
14 | VLLM_CPU_KVCACHE_SPACE: "4"
15 | minReplicas: 0
16 | resourceProfile: cpu:6
17 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.1-8b-instruct-fp8-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-8b-instruct-fp8-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8
9 | engine: VLLM
10 | args:
11 | - --max-model-len=16384
12 | - --max-num-batched-token=16384
13 | - --gpu-memory-utilization=0.9
14 | - --disable-log-requests
15 | minReplicas: 0
16 | resourceProfile: nvidia-gpu-l4:1
17 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.1-8b-instruct-tpu.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-8b-instruct-tpu
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://meta-llama/Meta-Llama-3.1-8B-Instruct
9 | engine: VLLM
10 | args:
11 | - --disable-log-requests
12 | - --swap-space=8
13 | - --tensor-parallel-size=4
14 | - --num-scheduler-steps=4
15 | - --max-model-len=8192
16 | - --distributed-executor-backend=ray
17 | minReplicas: 0
18 | resourceProfile: google-tpu-v5e-2x2:4
19 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.1-supernova-lite-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-supernova-lite-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://arcee-ai/Llama-3.1-SuperNova-Lite
9 | engine: VLLM
10 | args:
11 | - --max-model-len=2048
12 | - --max-num-batched-token=2048
13 | - --max-num-seqs=1
14 | - --gpu-memory-utilization=0.95
15 | - --kv-cache-dtype=fp8
16 | - --disable-log-requests
17 | - --quantization=fp8
18 | - --enforce-eager
19 | env:
20 | VLLM_ATTENTION_BACKEND: FLASHINFER
21 | minReplicas: 0
22 | resourceProfile: nvidia-gpu-l4:1
23 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.1-tulu-3-8b-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.1-tulu-3-8b-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://allenai/Llama-3.1-Tulu-3-8B
9 | engine: VLLM
10 | args:
11 | - --max-model-len=8192
12 | - --max-num-batched-token=8192
13 | - --max-num-seqs=256
14 | - --gpu-memory-utilization=0.95
15 | - --kv-cache-dtype=fp8
16 | env:
17 | VLLM_ATTENTION_BACKEND: FLASHINFER
18 | minReplicas: 0
19 | resourceProfile: nvidia-gpu-l4:1
20 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.2-11b-vision-instruct-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.2-11b-vision-instruct-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic
9 | engine: VLLM
10 | args:
11 | - --max-model-len=8192
12 | - --max-num-batched-token=8192
13 | - --gpu-memory-utilization=0.99
14 | - --enforce-eager
15 | - --disable-log-requests
16 | - --max-num-seqs=16
17 | env:
18 | VLLM_WORKER_MULTIPROC_METHOD: spawn
19 | minReplicas: 1
20 | maxReplicas: 1
21 | targetRequests: 32
22 | resourceProfile: nvidia-gpu-l4:1
23 |
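This manifest serves a vision checkpoint, so a request can carry image content using the OpenAI content-array format (assuming the engine accepts it; the image URL is a placeholder):

```bash
curl http://localhost:8000/openai/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama-3.2-11b-vision-instruct-l4",
    "messages": [{
      "role": "user",
      "content": [
        {"type": "text", "text": "What is in this image?"},
        {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}
      ]
    }],
    "max_tokens": 50
  }'
```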
--------------------------------------------------------------------------------
/manifests/models/llama-3.3-70b-instruct-bf16-gh200.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.3-70b-instruct-bf16-gh200
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://meta-llama/Llama-3.3-70B-Instruct
9 | engine: VLLM
10 | args:
11 | - --max-model-len=32768
12 | - --max-num-batched-token=32768
13 | - --gpu-memory-utilization=0.98
14 | - --kv-cache-dtype=fp8
15 | - --cpu-offload-gb=60
16 | - --enable-prefix-caching
17 | - --disable-log-requests
18 | env:
19 | VLLM_ATTENTION_BACKEND: FLASHINFER
20 | minReplicas: 0
21 | targetRequests: 200
22 | resourceProfile: nvidia-gpu-gh200:1
23 |
--------------------------------------------------------------------------------
/manifests/models/llama-3.3-70b-ollama-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-3.3-70b-ollama-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: ollama://llama3.3:70b
9 | engine: OLlama
10 | minReplicas: 0
11 | resourceProfile: nvidia-gpu-l4:1
12 |
--------------------------------------------------------------------------------
/manifests/models/llama-4-maverick-430k-h100.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: llama-4-maverick-430k-h100
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
9 | engine: VLLM
10 | args:
11 | - --max-model-len=430000
12 | - --tensor-parallel-size=8
13 | - --enable-prefix-caching
14 | - --disable-log-requests
15 | env:
16 | VLLM_DISABLE_COMPILE_CACHE: "1"
17 | minReplicas: 0
18 | resourceProfile: nvidia-gpu-h100:8
19 |
--------------------------------------------------------------------------------
/manifests/models/mistral-small-24b-instruct-h100.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: mistral-small-24b-instruct-h100
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://mistralai/Mistral-Small-24B-Instruct-2501
9 | engine: VLLM
10 | args:
11 | - --kv-cache-dtype=fp8
12 | - --max-num-batched-token=65536
13 | - --gpu-memory-utilization=0.9
14 | - --enable-prefix-caching
15 | - --disable-log-requests
16 | env:
17 | VLLM_ATTENTION_BACKEND: FLASHINFER
18 | minReplicas: 0
19 | resourceProfile: nvidia-gpu-h100:1
20 |
--------------------------------------------------------------------------------
/manifests/models/mistral-small-3.1-24b-instruct-h100.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: mistral-small-3.1-24b-instruct-h100
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://mistralai/Mistral-Small-3.1-24B-Instruct-2503
9 | engine: VLLM
10 | args:
11 | - --kv-cache-dtype=fp8
12 | - --max-model-len=65536
13 | - --gpu-memory-utilization=0.9
14 | - --disable-log-requests
15 | - --tokenizer-mode=mistral
16 | - --load-format=mistral
17 | - --config-format=mistral
18 | env:
19 | VLLM_ATTENTION_BACKEND: FLASHINFER
20 | minReplicas: 0
21 | resourceProfile: nvidia-gpu-h100:1
22 |
--------------------------------------------------------------------------------
/manifests/models/nomic-embed-text-cpu.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: nomic-embed-text-cpu
6 | spec:
7 | features: [TextEmbedding]
8 | url: ollama://nomic-embed-text
9 | engine: OLlama
10 | minReplicas: 0
11 | resourceProfile: cpu:1
12 |
--------------------------------------------------------------------------------
/manifests/models/opt-125m-cpu.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: opt-125m-cpu
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://facebook/opt-125m
9 | engine: VLLM
10 | args:
11 | - --chat-template=/config/chat-template.jinja
12 | minReplicas: 0
13 | resourceProfile: cpu:1
14 | files:
15 | - content: |-
16 | {% for message in messages %}
17 | {% if message['role'] == 'user' %}
18 | {{ 'Question:
19 | ' + message['content'] + '
20 |
21 | ' }}{% elif message['role'] == 'system' %}
22 | {{ 'System:
23 | ' + message['content'] + '
24 |
25 | ' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:
26 | ' + message['content'] + '
27 |
28 | ' }}{% endif %}
29 | {% if loop.last and add_generation_prompt %}
30 | {{ 'Answer:
31 | ' }}{% endif %}{% endfor %}
32 | path: /config/chat-template.jinja
33 |
--------------------------------------------------------------------------------
/manifests/models/opt-125m-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: opt-125m-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://facebook/opt-125m
9 | engine: VLLM
10 | minReplicas: 0
11 | resourceProfile: nvidia-gpu-l4:1
12 |
--------------------------------------------------------------------------------
/manifests/models/phi-4-bnb-4bit-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: phi-4-bnb-4bit-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://unsloth/phi-4-bnb-4bit
9 | engine: VLLM
10 | args:
11 | - --max-model-len=8192
12 | - --max-num-batched-token=8192
13 | - --max-num-seqs=1
14 | - --gpu-memory-utilization=0.95
15 | - --disable-log-requests
16 | - --enforce-eager
17 | - --quantization=bitsandbytes
18 | - --load_format=bitsandbytes
19 | env:
20 | VLLM_ATTENTION_BACKEND: FLASHINFER
21 | minReplicas: 0
22 | resourceProfile: nvidia-gpu-l4:1
23 |
--------------------------------------------------------------------------------
/manifests/models/phi-4-ollama-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: phi-4-ollama-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: ollama://phi4
9 | engine: OLlama
10 | minReplicas: 0
11 | resourceProfile: nvidia-gpu-l4:1
12 |
--------------------------------------------------------------------------------
/manifests/models/qwen2-500m-cpu.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: qwen2-500m-cpu
6 | spec:
7 | features: [TextGeneration]
8 | url: ollama://qwen2:0.5b
9 | engine: OLlama
10 | minReplicas: 0
11 | resourceProfile: cpu:1
12 |
--------------------------------------------------------------------------------
/manifests/models/qwen2.5-7b-cpu.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: qwen2.5-7b-cpu
6 | spec:
7 | features: [TextGeneration]
8 | url: ollama://qwen2.5:7b
9 | engine: OLlama
10 | minReplicas: 0
11 | resourceProfile: cpu:2
12 |
--------------------------------------------------------------------------------
/manifests/models/qwen2.5-7b-instruct-l4.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: qwen2.5-7b-instruct-l4
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://Qwen/Qwen2.5-7B-Instruct
9 | engine: VLLM
10 | args:
11 | - --max-model-len=8192
12 | - --max-num-batched-token=8192
13 | - --max-num-seqs=256
14 | - --gpu-memory-utilization=0.95
15 | - --kv-cache-dtype=fp8
16 | - --enable-prefix-caching
17 | env:
18 | VLLM_ATTENTION_BACKEND: FLASHINFER
19 | minReplicas: 0
20 | resourceProfile: nvidia-gpu-l4:1
21 |
--------------------------------------------------------------------------------
/manifests/models/qwen2.5-coder-1.5b-cpu.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: qwen2.5-coder-1.5b-cpu
6 | spec:
7 | features: [TextGeneration]
8 | url: ollama://qwen2.5-coder:1.5b
9 | engine: OLlama
10 | minReplicas: 0
11 | resourceProfile: cpu:1
12 |
--------------------------------------------------------------------------------
/manifests/models/qwen2.5-coder-1.5b-rtx4070-8gb.yaml:
--------------------------------------------------------------------------------
1 | # Source: models/templates/models.yaml
2 | apiVersion: kubeai.org/v1
3 | kind: Model
4 | metadata:
5 | name: qwen2.5-coder-1.5b-rtx4070-8gb
6 | spec:
7 | features: [TextGeneration]
8 | url: hf://Qwen/Qwen2.5-Coder-1.5B-Instruct
9 | engine: VLLM
10 | args:
11 | - --max-model-len=2048
12 | - --max-num-seqs=16
13 | - --quantization=fp8
14 | - --kv-cache-dtype=fp8
15 | env:
16 | VLLM_ATTENTION_BACKEND: FLASHINFER
17 | minReplicas: 1
18 | resourceProfile: nvidia-gpu-rtx4070-8gb:1
19 |
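Every manifest in this directory is rendered from the models chart (note the `# Source: models/templates/models.yaml` header on each file). Instead of applying the files one by one, individual catalog entries can be enabled through that chart; the `catalog.<name>.enabled` value path below is my reading of the chart's values layout, not something confirmed by this dump:

```bash
# Enable a single catalog entry via the models chart (value path assumed).
helm install kubeai-models ./charts/models \
  --set catalog.qwen2-500m-cpu.enabled=true
```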
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: KubeAI
2 | site_url: https://www.kubeai.org
3 | repo_url: https://github.com/substratusai/kubeai
4 |
5 | theme:
6 | name: material
7 | custom_dir: docs/overrides
8 | palette:
9 | primary: white # Defaults to indigo.
10 | accent: blue # Defaults to indigo.
11 |
12 | nav:
13 | - Home: README.md
14 | - ... | installation/*.md
15 | - ... | how-to/*.md
16 | - ... | concepts/*.md
17 | - ... | tutorials/*.md
18 | - ... | contributing/*.md
19 | - ... | reference/*.md
20 | - ...
21 | plugins:
22 | - search
23 | - awesome-pages
24 | - blog
25 | - social
26 | markdown_extensions:
27 | # Python Markdown
28 | - abbr
29 | - admonition
30 | - attr_list
31 | - def_list
32 | - footnotes
33 | - md_in_html
34 | - toc:
35 | permalink: true
36 |
37 | # Python Markdown Extensions
38 | - pymdownx.arithmatex:
39 | generic: true
40 | - pymdownx.betterem:
41 | smart_enable: all
42 | - pymdownx.caret
43 | - pymdownx.details
44 | - pymdownx.emoji:
45 | emoji_index: !!python/name:material.extensions.emoji.twemoji
46 | emoji_generator: !!python/name:material.extensions.emoji.to_svg
47 | - pymdownx.highlight
48 | - pymdownx.inlinehilite
49 | - pymdownx.keys
50 | - pymdownx.mark
51 | - pymdownx.smartsymbols
52 | - pymdownx.superfences
53 | - pymdownx.tabbed:
54 | alternate_style: true
55 | - pymdownx.tasklist:
56 | custom_checkbox: true
57 | - pymdownx.tilde
58 |
59 | # Analytics tracking with GoatCounter
60 | extra:
61 | analytics:
62 | provider: custom
63 |
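To preview this site locally, the Material theme plus the awesome-pages plugin need to be installed; a hedged sketch (the repo may pin exact versions elsewhere, and the `social` plugin additionally needs image libraries):

```bash
# Package names are assumptions about what the plugin list above requires.
pip install mkdocs-material mkdocs-awesome-pages-plugin pillow cairosvg
mkdocs serve   # serves the docs at http://127.0.0.1:8000 by default
```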
--------------------------------------------------------------------------------
/proposals/diagrams/auth-with-label-selector.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/proposals/diagrams/auth-with-label-selector.excalidraw.png
--------------------------------------------------------------------------------
/proposals/diagrams/cache-optimized-routing.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/proposals/diagrams/cache-optimized-routing.excalidraw.png
--------------------------------------------------------------------------------
/proposals/diagrams/lora-direct-loading.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/proposals/diagrams/lora-direct-loading.excalidraw.png
--------------------------------------------------------------------------------
/proposals/diagrams/lora.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/proposals/diagrams/lora.excalidraw.png
--------------------------------------------------------------------------------
/proposals/diagrams/model-mgmt-buckets.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/proposals/diagrams/model-mgmt-buckets.excalidraw.png
--------------------------------------------------------------------------------
/proposals/diagrams/model-mgmt-volumes.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/substratusai/kubeai/818a871ca6c1c0af774e467217de0a17b8bd9d7c/proposals/diagrams/model-mgmt-volumes.excalidraw.png
--------------------------------------------------------------------------------
/proposals/lora-adapters.md:
--------------------------------------------------------------------------------
1 | # LoRA Adapters
2 |
3 | ## Overview
4 |
5 | 
6 |
7 | ## Direct Loading Implementation
8 |
9 | 
--------------------------------------------------------------------------------
/proposals/multitenancy.md:
--------------------------------------------------------------------------------
1 | # Multitenancy
2 |
3 | The goal of this proposal is to allow KubeAI to be used in a multi-tenant environment where
4 | some users only have access to some models.
5 |
6 | ## Implementation Option 1: Auth Labels
7 |
8 | In this implementation, KubeAI has well-known labels that correspond to groups that are allowed to access models.
9 |
10 | The KubeAI system is configured to trust a specific header.
11 |
12 | ```yaml
13 | auth:
14 | http:
15 | trustedHeader: X-Auth-Groups
16 | # Possibly in future: configure Model roles.
17 | # modelRoles:
18 | # user: ["list", "describe", "infer"]
19 | ```
20 |
21 | The groups associated with a request are passed in a trusted header.
22 |
23 | ```bash
24 | curl http://localhost:8000/openai/v1/completions \
25 | -H "X-Auth-Groups: grp-a, grp-b"
26 | ```
27 |
28 | The groups that are allowed to access a given model are configured as labels on the Model.
29 |
30 | ```yaml
31 | kind: Model
32 | metadata:
33 | name: llama-3.2
34 | labels:
35 | auth.kubeai.org/grp-a:
36 | auth.kubeai.org/grp-c:
37 | ```
38 |
39 | ## Implementation Option 2: General Label Selector
40 |
41 | **CURRENT PREFERENCE** (Unless there is a reason to introduce auth-specific configuration.)
42 |
43 | In this implementation, label selectors are used to filter models. The decision of which labels to use is up to the architects of the system that KubeAI is part of. These label selectors could be enforced by a server that sits between KubeAI and the end users (see the sketch at the end of this proposal).
44 |
45 | 
46 |
47 | ```bash
48 | curl http://localhost:8000/openai/v1/completions \
49 | -H "X-Label-Selector: key1=value1"
50 |
51 | curl http://localhost:8000/openai/v1/models \
52 | -H "X-Label-Selector: key1=value1"
53 | ```
54 |
55 | Models just need to have the labels set.
56 |
57 | ```yaml
58 | kind: Model
59 | metadata:
60 | name: llama-3.2
61 | labels:
62 | key1: value1
63 | ```
64 |
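A minimal sketch of exercising Option 2 directly, assuming the intermediary simply injects `X-Label-Selector` on behalf of a tenant (the label key/value mirror the example above):

```bash
# Give an existing Model the label a tenant is allowed to see, then list
# models through the proxy with the matching selector header.
kubectl label model llama-3.2 key1=value1
curl http://localhost:8000/openai/v1/models \
  -H "X-Label-Selector: key1=value1"
```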
--------------------------------------------------------------------------------
/skaffold-build.json:
--------------------------------------------------------------------------------
1 | {"builds":null}
--------------------------------------------------------------------------------
/skaffold-tags.json:
--------------------------------------------------------------------------------
1 | {"builds":null}
--------------------------------------------------------------------------------
/skaffold.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: skaffold/v4beta11
2 | kind: Config
3 | metadata:
4 | name: kubeai-project
5 | build:
6 | artifacts:
7 | - image: substratusai/kubeai
8 | local:
9 | push: false
10 | deploy:
11 | helm:
12 | releases:
13 | - name: kubeai
14 | chartPath: ./charts/kubeai
15 | setValueTemplates:
16 | open-webui.enabled: "false"
17 | skipBuildDependencies: true
18 | portForward:
19 | - resourceType: service
20 | resourceName: kubeai
21 | namespace: default
22 | port: 80
23 | localPort: 8000
24 | profiles:
25 | - name: kubeai-only
26 | deploy:
27 | helm:
28 | releases:
29 | - name: kubeai
30 | chartPath: ./charts/kubeai
31 | setValueTemplates:
32 | open-webui.enabled: "false"
33 | skipBuildDependencies: true
34 | - name: kubeai-only-gke
35 | build:
36 | local:
37 | push: true
38 | deploy:
39 | helm:
40 | releases:
41 | - name: kubeai
42 | chartPath: ./charts/kubeai
43 | valuesFiles:
44 | - ./charts/kubeai/values-gke.yaml
45 | setValueTemplates:
46 | open-webui.enabled: "false"
47 | skipBuildDependencies: true
--------------------------------------------------------------------------------
/test/e2e-manual/gke-vllm-adapters/model.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: kubeai.org/v1
2 | kind: Model
3 | metadata:
4 | name: tiny-llama
5 | spec:
6 | features: [TextGeneration]
7 | owner: meta-llama
8 | url: hf://TinyLlama/TinyLlama-1.1B-Chat-v0.3
9 | adapters:
10 | - id: colorist
11 | url: hf://jashing/tinyllama-colorist-lora
12 | engine: VLLM
13 | resourceProfile: nvidia-gpu-l4:1
14 | minReplicas: 1
--------------------------------------------------------------------------------
/test/e2e-manual/gke-vllm-adapters/run.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -ex
4 |
5 | skaffold run -f ./skaffold.yaml --tail --port-forward --profile kubeai-only-gke --default-repo us-central1-docker.pkg.dev/substratus-dev
6 |
7 | kubectl apply -f ./model.yaml
8 |
9 | kubectl port-forward svc/kubeai 8000:80 &
10 |
11 | # raw model
12 | curl -v http://localhost:8000/openai/v1/completions \
13 | -H "Content-Type: application/json" \
14 | -d '{"model": "tiny-llama", "prompt": "Who was the first president of the United States?", "max_tokens": 40}'
15 |
16 | # with adapter
17 | curl -v http://localhost:8000/openai/v1/completions \
18 | -H "Content-Type: application/json" \
19 | -d '{"model": "tiny-llama/colorist", "prompt": "Who was the first president of the United States?", "max_tokens": 40}'
20 |
--------------------------------------------------------------------------------
/test/e2e-manual/gke-vllm-gpu-tpu/run.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -ex
4 |
5 | # Spin up latest release and run test GPU and TPU on GKE autopilot.
6 |
7 | helm install kubeai ./charts/kubeai \
8 | -f ./charts/kubeai/values-gke.yaml \
9 | -f - < $replica_log &
19 | kubectl_watch_pid=$!
20 |
21 | sleep 60
22 | kubectl delete pods -l app.kubernetes.io/name=kubeai
23 | sleep 120
24 |
25 | kill $kubectl_watch_pid
26 |
27 | echo "Replica log:"
28 | cat $replica_log
29 | replicas_over_time=$(cat $replica_log | sort | uniq)
30 |
31 | # Replicas should have remained at 3
32 | if [ "$replicas_over_time" != "3" ]; then
33 | echo "TEST FAILURE: Replicas changed during autoscaler restart."
34 | cat $replica_log
35 | exit 1
36 | fi
--------------------------------------------------------------------------------
/test/e2e/autoscaler-restart-under-load/values.yaml:
--------------------------------------------------------------------------------
1 | modelAutoscaling:
2 | interval: 1s
3 | timeWindow: 30s
4 | open-webui:
5 | enabled: false
6 |
--------------------------------------------------------------------------------
/test/e2e/cache-shared-filesystem/cache-mount-pod.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: cache-mount-pod
5 | spec:
6 | containers:
7 | - name: main
8 | image: ubuntu
9 | command: ["sleep", "10000"]
10 | volumeMounts:
11 | - name: models
12 | mountPath: /test-mount
13 | volumes:
14 | - name: models
15 | persistentVolumeClaim:
16 | claimName: shared-model-cache-e2e-test-kind-pv
--------------------------------------------------------------------------------
/test/e2e/cache-shared-filesystem/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | source $REPO_DIR/test/e2e/common.sh
4 |
5 | models_release="kubeai-models"
6 |
7 | helm install $models_release $REPO_DIR/charts/models -f - < $transcription_file
17 |
18 | result_contains_kubernetes=$(cat $transcription_file | jq '.text | ascii_downcase | contains("kubernetes")')
19 | if [ "$result_contains_kubernetes" = "true" ]; then
20 | echo "The transcript contains 'kubernetes'."
21 | else
22 | echo "The text does not contain 'kubernetes':"
23 | cat $transcription_file
24 | exit 1
25 | fi
26 |
--------------------------------------------------------------------------------
/test/e2e/engine-infinity/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | source $REPO_DIR/test/e2e/common.sh
4 |
5 | model=bge-embed-text-cpu
6 |
7 | apply_model $model
8 |
9 | # Test embedding generation
10 | response_file=$TMP_DIR/embedding.json
11 | curl http://localhost:8000/openai/v1/embeddings \
12 | -H "Content-Type: application/json" \
13 | -d '{
14 | "input": "Hello world",
15 | "model": "'$model'"
16 | }' > $response_file
17 |
18 | # Verify response structure and content
19 | embedding_length=$(cat $response_file | jq '.data[0].embedding | length')
20 | if [ "$embedding_length" -ne 384 ]; then
21 | echo "Unexpected embedding dimension: got $embedding_length, expected 384"
22 | cat $response_file
23 | exit 1
24 | fi
25 |
26 | echo "Successfully generated embedding with $embedding_length dimensions"
27 |
--------------------------------------------------------------------------------
/test/e2e/engine-ollama-pvc/ollama-hydrate-job.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: batch/v1
2 | kind: Job
3 | metadata:
4 | name: ollama-pvc-hydrate
5 | spec:
6 | template:
7 | spec:
8 | containers:
9 | - name: ollama
10 | image: ollama/ollama:latest
11 | env:
12 | - name: OLLAMA_MODELS
13 | value: /model
14 | command:
15 | - /bin/sh
16 | - -c
17 | - |
18 | /bin/ollama serve &
19 | echo "Waiting for Ollama server to start..."
20 | sleep 10
21 |
22 | # Pull the model and ensure it downloads successfully
23 | echo "Pulling model qwen:0.5b..."
24 | if ! /bin/ollama pull qwen:0.5b; then
25 | echo "Failed to pull model"
26 | exit 1
27 | fi
28 |
29 | # Verify the model files exist
30 | echo "Verifying model files..."
31 | ls -R /model
32 | if [ ! -d "/model/blobs" ] || [ ! -d "/model/manifests" ]; then
33 | echo "Model directories not found"
34 | exit 1
35 | fi
36 |
37 | echo "Model setup completed successfully"
38 | ls -la /model/manifests/registry.ollama.ai/library/qwen/0.5b
39 | volumeMounts:
40 | - name: models-volume
41 | mountPath: /model
42 | volumes:
43 | - name: models-volume
44 | persistentVolumeClaim:
45 | claimName: model-pvc
46 | readOnly: false
47 | restartPolicy: OnFailure
48 |
--------------------------------------------------------------------------------
/test/e2e/engine-ollama-pvc/pv.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolume
3 | metadata:
4 | name: kind-model-hostpath
5 | spec:
6 | storageClassName: manual
7 | capacity:
8 | storage: 25Gi
9 | accessModes:
10 | - ReadWriteMany
11 | - ReadOnlyMany
12 | - ReadWriteOnce
13 | hostPath:
14 | path: $PV_HOST_PATH
15 | type: DirectoryOrCreate
16 | persistentVolumeReclaimPolicy: Retain
17 |
--------------------------------------------------------------------------------
/test/e2e/engine-ollama-pvc/pvc.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolumeClaim
3 | metadata:
4 | name: model-pvc
5 | spec:
6 | storageClassName: manual
7 | accessModes:
8 | - ReadWriteMany
9 | resources:
10 | requests:
11 | storage: 10Gi
12 | volumeName: kind-model-hostpath
--------------------------------------------------------------------------------
/test/e2e/engine-ollama-pvc/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | source $REPO_DIR/test/e2e/common.sh
4 |
5 | models_release="kubeai-models"
6 |
7 | # Create PV_HOST_PATH inside the kind container
8 | kind_container=$(docker ps --filter "name=kind-control-plane" --format "{{.ID}}")
9 | export PV_HOST_PATH="/mnt/models"
10 | docker exec -i $kind_container mkdir -p $PV_HOST_PATH
11 | echo "PV_HOST_PATH: $PV_HOST_PATH"
12 |
13 |
14 | envsubst < $REPO_DIR/test/e2e/engine-ollama-pvc/pv.yaml | kubectl apply -f -
15 | kubectl apply -f $REPO_DIR/test/e2e/engine-ollama-pvc/pvc.yaml
16 |
17 | # Apply the Ollama hydrate job
18 | kubectl apply -f $REPO_DIR/test/e2e/engine-ollama-pvc/ollama-hydrate-job.yaml
19 |
20 | # Wait for job completion with timeout
21 | echo "Waiting for Ollama hydrate job to complete..."
22 | if ! kubectl wait --for=condition=complete --timeout=600s job/ollama-pvc-hydrate; then
23 | echo "Ollama hydrate job failed or timed out"
24 | kubectl logs job/ollama-pvc-hydrate
25 | exit 1
26 | fi
27 |
28 |
29 | helm install $models_release $REPO_DIR/charts/models -f - < 0
28 |
29 |
30 | def test_completion():
31 | response = client.completions.create(
32 | model=model, prompt="How are you?", max_tokens=50
33 | )
34 |
35 | print(response)
36 | # Assert that the response contains at least one "choices"
37 | assert len(response.choices) > 0
38 |
--------------------------------------------------------------------------------
/test/e2e/openai-python-client/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | source $REPO_DIR/test/e2e/common.sh
4 |
5 | kubectl apply -f $REPO_DIR/manifests/models/opt-125m-cpu.yaml
6 |
7 | python -m venv $TEST_DIR/venv
8 |
9 | source $TEST_DIR/venv/bin/activate
10 |
11 | which pip
12 | pip install -r $TEST_DIR/requirements.txt
13 |
14 | # Wait for models to sync.
15 | sleep 3
16 |
17 | pytest $TEST_DIR/test.py
18 |
--------------------------------------------------------------------------------
/test/e2e/quickstart/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | source $REPO_DIR/test/e2e/common.sh
4 |
5 | models_release="kubeai-models"
6 |
7 | helm install $models_release $REPO_DIR/charts/models -f - <