├── .github ├── actions │ ├── docker-build-and-push │ │ └── action.yml │ └── trivy-scan │ │ └── action.yml ├── dependabot.yml └── workflows │ ├── ci-pr-checks.yaml │ ├── ci-release.yaml │ └── md-link-check.yml ├── .gitignore ├── .golangci.yml ├── .lychee.toml ├── DEVELOPMENT.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── cmd └── epp │ ├── health.go │ └── main.go ├── deploy ├── components │ ├── crds-gateway-api │ │ └── kustomization.yaml │ ├── crds-gie │ │ └── kustomization.yaml │ ├── crds-istio │ │ ├── istio.yaml │ │ └── kustomization.yaml │ ├── inference-gateway │ │ ├── deployments.yaml │ │ ├── gateways.yaml │ │ ├── httproutes.yaml │ │ ├── inference-models.yaml │ │ ├── inference-pools.yaml │ │ ├── kustomization.yaml │ │ ├── rbac.yaml │ │ ├── service-accounts.yaml │ │ └── services.yaml │ ├── istio-control-plane │ │ ├── configmaps.yaml │ │ ├── deployments.yaml │ │ ├── hpa.yaml │ │ ├── kustomization.yaml │ │ ├── namespaces.yaml │ │ ├── policies.yaml │ │ ├── rbac.yaml │ │ ├── service-accounts.yaml │ │ ├── services.yaml │ │ ├── telemetry.yaml │ │ └── webhooks.yaml │ ├── vllm-sim-pd │ │ ├── deployments.yaml │ │ └── kustomization.yaml │ └── vllm-sim │ │ ├── deployments.yaml │ │ └── kustomization.yaml └── environments │ ├── dev │ ├── base-kind-istio │ │ ├── destination-rules.yaml │ │ ├── kustomization.yaml │ │ ├── patch-deployments.yaml │ │ ├── patch-gateways.yaml │ │ └── services.yaml │ ├── kind-istio-pd │ │ └── kustomization.yaml │ └── kind-istio │ │ └── kustomization.yaml │ └── openshift-base │ ├── common │ ├── patch-service.yaml │ ├── patch-statefulset.yaml │ ├── service.yaml │ └── statefulset.yaml │ ├── kustomization.yaml │ ├── openshift │ ├── patch-route.yaml │ └── route.yaml │ └── rbac │ ├── exec-rbac-role.yaml │ ├── exec-rbac-rolebinding.yaml │ ├── patch-rbac-role.yaml │ └── patch-rbac-rolebinding.yaml ├── docs ├── architecture.md ├── create_new_filter.md ├── dp.md └── images │ ├── architecture.png │ ├── dp_architecture.png │ └── plugability.png ├── 
go.mod ├── go.sum ├── hooks └── pre-commit ├── internal └── controller │ ├── runnable │ ├── grpc.go │ └── leader_election.go │ └── tls │ └── tls.go ├── pkg ├── config │ └── config.go └── scheduling │ ├── dual │ └── scheduler.go │ ├── pd │ ├── doc.go │ ├── scheduler.go │ └── scheduler_test.go │ └── plugins │ ├── filter │ ├── by_labels.go │ ├── passthrough.go │ ├── pd_role_filter.go │ └── random.go │ └── scorer │ ├── doc.go │ ├── kvcache-aware.go │ ├── load_aware_scorer.go │ ├── load_aware_scorer_test.go │ ├── passthrough.go │ ├── prefix_aware.go │ ├── prefix_aware_test.go │ ├── prefix_store.go │ ├── prefix_store_test.go │ ├── random.go │ ├── session_affinity.go │ └── utils.go ├── scripts ├── istio │ ├── generate-cp.sh │ ├── istio-cp.yaml │ └── manifest-splitter.py └── kind-dev-env.sh └── test └── integration ├── epp_test.go └── suite_test.go /.github/actions/docker-build-and-push/action.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build - ghcr 2 | description: Build image using buildx 3 | inputs: 4 | image-name: 5 | required: true 6 | description: Image name 7 | tag: 8 | required: true 9 | description: Image tag 10 | github-token: 11 | required: true 12 | description: GitHub token for login 13 | registry: 14 | required: true 15 | description: Container registry (e.g., ghcr.io/llm-d) 16 | runs: 17 | using: "composite" 18 | steps: 19 | - name: Set up Docker Buildx 20 | uses: docker/setup-buildx-action@v3 21 | 22 | - name: Login to GitHub Container Registry 23 | run: echo "${{ inputs.github-token }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin 24 | shell: bash 25 | 26 | - name: Print image info 27 | run: | 28 | echo "Image name: ${{ inputs.image-name }}" 29 | echo "Tag: ${{ inputs.tag }}" 30 | echo "Registry: ${{ inputs.registry }}" 31 | shell: bash 32 | 33 | - name: Build image and push 34 | run: | 35 | docker buildx build \ 36 | --platform linux/amd64 \ 37 | -t ${{ inputs.registry }}/${{ 
inputs.image-name }}:${{ inputs.tag }} \ 38 | --push . 39 | shell: bash 40 | -------------------------------------------------------------------------------- /.github/actions/trivy-scan/action.yml: -------------------------------------------------------------------------------- 1 | name: Trivy Scan 2 | description: Scan container image with Trivy 3 | inputs: 4 | image: 5 | required: true 6 | runs: 7 | using: "composite" 8 | steps: 9 | - name: Install Trivy 10 | run: | 11 | wget https://github.com/aquasecurity/trivy/releases/download/v0.44.1/trivy_0.44.1_Linux-64bit.deb 12 | sudo dpkg -i trivy_0.44.1_Linux-64bit.deb 13 | shell: bash 14 | 15 | 16 | - name: Scan image 17 | run: | 18 | trivy image --severity HIGH,CRITICAL --no-progress ${{ inputs.image }} 19 | shell: bash 20 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | 4 | # 1. Go module updates 5 | - package-ecosystem: "gomod" 6 | directory: "/" 7 | schedule: 8 | interval: "weekly" 9 | open-pull-requests-limit: 10 10 | commit-message: 11 | prefix: "deps(go)" 12 | labels: 13 | - "dependencies" 14 | - "release-note-none" 15 | groups: 16 | go-dependencies: 17 | patterns: 18 | - "*" 19 | kubernetes: 20 | patterns: 21 | - "k8s.io/*" 22 | - "sigs.k8s.io/*" 23 | 24 | # 2. GitHub Actions dependencies 25 | - package-ecosystem: "github-actions" 26 | directory: "/" 27 | schedule: 28 | interval: "weekly" 29 | labels: 30 | - "ci" 31 | - "dependencies" 32 | commit-message: 33 | prefix: "deps(actions)" 34 | 35 | # 3. 
Docker base image updates (e.g., for Dockerfile FROM lines) 36 | - package-ecosystem: "docker" 37 | directory: "/" 38 | schedule: 39 | interval: "weekly" 40 | labels: 41 | - "dependencies" 42 | - "docker" 43 | commit-message: 44 | prefix: "deps(docker)" 45 | -------------------------------------------------------------------------------- /.github/workflows/ci-pr-checks.yaml: -------------------------------------------------------------------------------- 1 | name: CI - PR Checks 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - dev 7 | - main 8 | 9 | jobs: 10 | lint-and-test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout source 14 | uses: actions/checkout@v4 15 | 16 | - name: Sanity check repo contents 17 | run: ls -la 18 | 19 | - name: Extract Go version from go.mod 20 | run: sed -En 's/^go (.*)$/GO_VERSION=\1/p' go.mod >> $GITHUB_ENV 21 | 22 | - name: Set up Go with cache 23 | uses: actions/setup-go@v5 24 | with: 25 | go-version: "${{ env.GO_VERSION }}" 26 | cache-dependency-path: ./go.sum 27 | 28 | - name: go mod tidy 29 | run: go mod tidy 30 | 31 | - name: Run lint checks 32 | uses: golangci/golangci-lint-action@v8 33 | with: 34 | version: 'v2.1.6' 35 | args: "--config=./.golangci.yml" 36 | 37 | - name: Run make test 38 | shell: bash 39 | run: | 40 | make test 41 | 42 | - name: Run make build 43 | shell: bash 44 | run: | 45 | make build 46 | -------------------------------------------------------------------------------- /.github/workflows/ci-release.yaml: -------------------------------------------------------------------------------- 1 | name: CI - Release - Docker Container Image 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' # Runs when a tag like v0.1.0 is pushed 7 | release: 8 | types: [published] # Also runs when a GitHub release is published 9 | 10 | jobs: 11 | docker-build-and-push: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout source 15 | uses: actions/checkout@v4 16 | 17 | - name: Set project name from repository 18 | id: 
version 19 | run: | 20 | repo="${GITHUB_REPOSITORY##*/}" 21 | echo "project_name=$repo" >> "$GITHUB_OUTPUT" 22 | 23 | - name: Print project name 24 | run: echo "Project is ${{ steps.version.outputs.project_name }}" 25 | 26 | - name: Determine tag name 27 | id: tag 28 | run: | 29 | if [[ "${GITHUB_EVENT_NAME}" == "release" ]]; then 30 | echo "tag=${GITHUB_REF##refs/tags/}" >> "$GITHUB_OUTPUT" 31 | elif [[ "${GITHUB_REF}" == refs/tags/* ]]; then 32 | echo "tag=${GITHUB_REF##refs/tags/}" >> "$GITHUB_OUTPUT" 33 | else 34 | echo "tag=latest" >> "$GITHUB_OUTPUT" 35 | fi 36 | shell: bash 37 | 38 | - name: Build and push image 39 | uses: ./.github/actions/docker-build-and-push 40 | with: 41 | tag: ${{ steps.tag.outputs.tag }} 42 | image-name: ${{ steps.version.outputs.project_name }} 43 | registry: ghcr.io/llm-d 44 | github-token: ${{ secrets.GHCR_TOKEN }} 45 | 46 | - name: Run Trivy scan 47 | uses: ./.github/actions/trivy-scan 48 | with: 49 | image: ghcr.io/llm-d/${{ steps.version.outputs.project_name }}:${{ steps.tag.outputs.tag }} 50 | -------------------------------------------------------------------------------- /.github/workflows/md-link-check.yml: -------------------------------------------------------------------------------- 1 | name: Markdown Link Checker 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | workflow_dispatch: 9 | 10 | jobs: 11 | lychee: 12 | name: Check Markdown Links 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v4 18 | 19 | - name: Install lychee v0.18.1 20 | run: | 21 | curl -Ls https://github.com/lycheeverse/lychee/releases/download/lychee-v0.18.1/lychee-x86_64-unknown-linux-gnu.tar.gz | tar xz 22 | sudo mv lychee /usr/local/bin 23 | 24 | - name: Run lychee on Markdown files with config 25 | run: | 26 | find . 
-name "*.md" -print0 | xargs -0 lychee --config .lychee.toml --verbose --no-progress 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | *.a 11 | main 12 | bin/ 13 | 14 | # Test binary, built with `go test -c` 15 | *.test 16 | 17 | # Output of the go coverage tool, specifically when used with LiteIDE 18 | *.out 19 | 20 | # Dependency directories (remove the comment below to include it) 21 | # vendor/ 22 | 23 | # Go workspace file 24 | go.work 25 | go.work.sum 26 | 27 | # Environment Files 28 | .DS_Store 29 | .env 30 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | 3 | run: 4 | timeout: 5m 5 | allow-parallel-runners: true 6 | 7 | formatters: 8 | enable: 9 | - goimports 10 | - gofmt 11 | 12 | linters: 13 | enable: 14 | - copyloopvar 15 | - dupword 16 | - durationcheck 17 | - fatcontext 18 | - ginkgolinter 19 | - gocritic 20 | - govet 21 | - loggercheck 22 | - misspell 23 | - perfsprint 24 | - revive 25 | - unconvert 26 | - makezero 27 | - errcheck 28 | - goconst 29 | - ineffassign 30 | - nakedret 31 | - prealloc 32 | - unparam 33 | - unused 34 | -------------------------------------------------------------------------------- /.lychee.toml: -------------------------------------------------------------------------------- 1 | # Ignore transient failures on gnu.org (it sometimes refuses connections) 2 | exclude = [ 3 | "^https://www.gnu.org/software/make/?$" 4 | ] 5 | 6 | # Timeout in seconds 7 | timeout = 20 8 | 
9 | # Retry failed links (helpful for flaky sites) 10 | retry_count = 3 11 | 12 | # Accept non-200 status codes (429: rate limits) 13 | accept = [200, 429] 14 | -------------------------------------------------------------------------------- /DEVELOPMENT.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | Documentation for developing the inference scheduler. 4 | 5 | ## Requirements 6 | 7 | - [Make] `v4`+ 8 | - [Golang] `v1.24`+ 9 | - [Docker] (or [Podman]) 10 | - [Kubernetes in Docker (KIND)] 11 | 12 | [Make]:https://www.gnu.org/software/make/ 13 | [Golang]:https://go.dev/ 14 | [Docker]:https://www.docker.com/ 15 | [Podman]:https://podman.io/ 16 | [Kubernetes in Docker (KIND)]:https://github.com/kubernetes-sigs/kind 17 | 18 | ## Kind Development Environment 19 | 20 | > **WARNING**: This currently requires you to have manually built the vllm 21 | > simulator separately on your local system. In a future iteration this will 22 | > be handled automatically and will not be required. The tag for the simulator 23 | > currently needs to be `0.0.4`. 24 | 25 | You can deploy the current scheduler with a Gateway API implementation into a 26 | [Kubernetes in Docker (KIND)] cluster locally with the following: 27 | 28 | ```console 29 | make env-dev-kind 30 | ``` 31 | 32 | This will create a `kind` cluster (or re-use an existing one) using the system's 33 | local container runtime and deploy the development stack into the `default` 34 | namespace. 35 | 36 | There are several ways to access the gateway: 37 | 38 | **Port forward**: 39 | 40 | ```console 41 | $ kubectl --context llm-d-inference-scheduler-dev port-forward service/inference-gateway 8080:80 42 | ``` 43 | 44 | **NodePort** 45 | 46 | ```console 47 | # Determine the k8s node address 48 | $ kubectl --context llm-d-inference-scheduler-dev get node -o yaml | grep address 49 | # The service is accessible over port 80 of the worker IP address. 
50 | ``` 51 | 52 | **LoadBalancer** 53 | 54 | ```console 55 | # Install and run cloud-provider-kind: 56 | $ go install sigs.k8s.io/cloud-provider-kind@latest && cloud-provider-kind & 57 | $ kubectl --context llm-d-inference-scheduler-dev get service inference-gateway 58 | # Wait for the LoadBalancer External-IP to become available. The service is accessible over port 80. 59 | ``` 60 | 61 | You can now make requests matching the IP:port of one of the access modes above: 62 | 63 | ```console 64 | $ curl -s -w '\n' http:///v1/completions -H 'Content-Type: application/json' -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq 65 | ``` 66 | 67 | By default the created inference gateway can be accessed on port 30080. This can 68 | be overridden to any free port in the range of 30000 to 32767, by running the above 69 | command as follows: 70 | 71 | ```console 72 | KIND_GATEWAY_HOST_PORT= make env-dev-kind 73 | ``` 74 | 75 | **Where:** <selected-port> is the port on your local machine you want to use to 76 | access the inference gateway. 77 | 78 | > **NOTE**: If you require significant customization of this environment beyond 79 | > what the standard deployment provides, you can use the `deploy/components` 80 | > with `kustomize` to build your own highly customized environment. You can use 81 | > the `deploy/environments/kind` deployment as a reference for your own. 82 | 83 | [Kubernetes in Docker (KIND)]:https://github.com/kubernetes-sigs/kind 84 | 85 | ### Development Cycle 86 | 87 | To test your changes to `llm-d-inference-scheduler` in this environment, make your changes locally 88 | and then re-run the deployment: 89 | 90 | ```console 91 | make env-dev-kind 92 | ``` 93 | 94 | This will build images with your recent changes and load the new images to the 95 | cluster. By default the image tag will be `dev`. It will also load llm-d-inference-sim, using a tag of `dev` by default. 
96 | 97 | **NOTE:** The built image tag can be specified via the `EPP_TAG` environment variable so it is used in the deployment. For example: 98 | 99 | ```console 100 | EPP_TAG=0.0.4 make env-dev-kind 101 | ``` 102 | 103 | **NOTE:** If you want to load a different tag of llm-d-inference-sim, you can use the environment variable `VLLM_SIMULATOR_TAG` to specify it. 104 | 105 | **NOTE**: If you are working on macOS with Apple Silicon, it is required to add 106 | the environment variable `GOOS=linux`. 107 | 108 | Then do a rollout of the EPP `Deployment` so that your recent changes are 109 | reflected: 110 | 111 | ```console 112 | kubectl rollout restart deployment endpoint-picker 113 | ``` 114 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build Stage: using Go 1.24.1 image 2 | FROM quay.io/projectquay/golang:1.24 AS builder 3 | ARG TARGETOS 4 | ARG TARGETARCH 5 | 6 | # Install build tools 7 | RUN dnf install -y gcc-c++ libstdc++ libstdc++-devel clang && dnf clean all 8 | 9 | WORKDIR /workspace 10 | 11 | # Copy the Go Modules manifests 12 | COPY go.mod go.mod 13 | COPY go.sum go.sum 14 | 15 | # Copy the go source 16 | COPY cmd/ cmd/ 17 | COPY pkg/ pkg/ 18 | COPY internal/ internal/ 19 | 20 | # HuggingFace tokenizer bindings 21 | RUN mkdir -p lib 22 | RUN curl -L https://github.com/daulet/tokenizers/releases/download/v1.20.2/libtokenizers.${TARGETOS}-${TARGETARCH}.tar.gz | tar -xz -C lib 23 | RUN ranlib lib/*.a 24 | 25 | # Build 26 | # the GOARCH does not have a default value to allow the binary to be built according to the host where the command 27 | # was called. For example, if we call make image-build in a local env which has the Apple Silicon M1 SO 28 | # the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. 
Therefore, 29 | # by leaving it empty we can ensure that the container and binary shipped on it will have the same platform. 30 | ENV CGO_ENABLED=1 31 | ENV GOOS=${TARGETOS:-linux} 32 | ENV GOARCH=${TARGETARCH} 33 | RUN go build -a -o bin/epp -ldflags="-extldflags '-L$(pwd)/lib'" cmd/epp/main.go cmd/epp/health.go 34 | 35 | # Use distroless as minimal base image to package the manager binary 36 | # Refer to https://github.com/GoogleContainerTools/distroless for more details 37 | FROM registry.access.redhat.com/ubi9/ubi:latest 38 | WORKDIR / 39 | COPY --from=builder /workspace/bin/epp /app/epp 40 | USER 65532:65532 41 | 42 | # expose gRPC, health and metrics ports 43 | EXPOSE 9002 44 | EXPOSE 9003 45 | EXPOSE 9090 46 | 47 | ENTRYPOINT ["/app/epp"] 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /usr/bin/env bash 2 | 3 | # Defaults 4 | TARGETOS ?= $(shell go env GOOS) 5 | TARGETARCH ?= $(shell go env GOARCH) 6 | PROJECT_NAME ?= llm-d-inference-scheduler 7 | IMAGE_REGISTRY ?= ghcr.io/llm-d 8 | IMAGE_TAG_BASE ?= $(IMAGE_REGISTRY)/$(PROJECT_NAME) 9 | EPP_TAG ?= dev 10 | IMG = $(IMAGE_TAG_BASE):$(EPP_TAG) 11 | NAMESPACE ?= hc4ai-operator 12 | 13 | CONTAINER_TOOL := $(shell { command -v docker >/dev/null 2>&1 && echo docker; } || { command -v podman >/dev/null 2>&1 && echo podman; } || echo "") 14 | BUILDER := $(shell command -v buildah >/dev/null 2>&1 && echo buildah || echo $(CONTAINER_TOOL)) 15 | PLATFORMS ?= linux/amd64 # linux/arm64 # linux/s390x,linux/ppc64le 16 | 17 | # go source files 18 | SRC = $(shell find . 
-type f -name '*.go') 19 | 20 | .PHONY: help 21 | help: ## Print help 22 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 23 | 24 | ##@ Tokenizer & Linking 25 | 26 | LDFLAGS ?= -extldflags '-L$(shell pwd)/lib' 27 | CGO_ENABLED=1 28 | TOKENIZER_LIB = lib/libtokenizers.a 29 | 30 | .PHONY: download-tokenizer 31 | download-tokenizer: $(TOKENIZER_LIB) 32 | $(TOKENIZER_LIB): 33 | ## Download the HuggingFace tokenizer bindings. 34 | @echo "Downloading HuggingFace tokenizer bindings..." 35 | mkdir -p lib 36 | curl -L https://github.com/daulet/tokenizers/releases/download/v1.20.2/libtokenizers.$(TARGETOS)-$(TARGETARCH).tar.gz | tar -xz -C lib 37 | ranlib lib/*.a 38 | 39 | ##@ Development 40 | 41 | .PHONY: clean 42 | clean: 43 | go clean -testcache -cache 44 | rm -f $(TOKENIZER_LIB) 45 | rmdir lib 46 | 47 | .PHONY: format 48 | format: ## Format Go source files 49 | @printf "\033[33;1m==== Running gofmt ====\033[0m\n" 50 | @gofmt -l -w $(SRC) 51 | 52 | .PHONY: test 53 | test: test-unit 54 | 55 | .PHONY: test-unit 56 | test-unit: download-tokenizer 57 | @printf "\033[33;1m==== Running Unit Tests ====\033[0m\n" 58 | go test -ldflags="$(LDFLAGS)" -v ./... 59 | 60 | .PHONY: test-integration 61 | test-integration: download-tokenizer 62 | @printf "\033[33;1m==== Running Integration Tests ====\033[0m\n" 63 | go test -ldflags="$(LDFLAGS)" -v -tags=integration_tests ./test/integration/ 64 | 65 | .PHONY: post-deploy-test 66 | post-deploy-test: ## Run post deployment tests 67 | echo Success! 68 | @echo "Post-deployment tests passed." 
69 | 70 | .PHONY: lint 71 | lint: check-golangci-lint ## Run lint 72 | @printf "\033[33;1m==== Running linting ====\033[0m\n" 73 | golangci-lint run 74 | 75 | ##@ Build 76 | 77 | .PHONY: build 78 | build: check-go download-tokenizer ## Build the EPP binary 79 | @printf "\033[33;1m==== Building ====\033[0m\n" 80 | go build -ldflags="$(LDFLAGS)" -o bin/epp cmd/epp/main.go cmd/epp/health.go 81 | 82 | ##@ Container Build/Push 83 | 84 | .PHONY: image-build 85 | image-build: check-container-tool ## Build Docker image using $(CONTAINER_TOOL) 86 | @printf "\033[33;1m==== Building Docker image $(IMG) ====\033[0m\n" 87 | $(CONTAINER_TOOL) build \ 88 | --platform $(TARGETOS)/$(TARGETARCH) \ 89 | --build-arg TARGETOS=$(TARGETOS) \ 90 | --build-arg TARGETARCH=$(TARGETARCH) \ 91 | -t $(IMG) . 92 | 93 | .PHONY: image-push 94 | image-push: check-container-tool ## Push Docker image $(IMG) to registry 95 | @printf "\033[33;1m==== Pushing Docker image $(IMG) ====\033[0m\n" 96 | $(CONTAINER_TOOL) push $(IMG) 97 | 98 | ##@ Install/Uninstall Targets 99 | 100 | # Default install/uninstall (Docker) 101 | install: install-docker ## Default install using Docker 102 | @echo "Default Docker install complete." 103 | 104 | uninstall: uninstall-docker ## Default uninstall using Docker 105 | @echo "Default Docker uninstall complete." 106 | 107 | ### Docker Targets 108 | 109 | .PHONY: install-docker 110 | install-docker: check-container-tool ## Install app using $(CONTAINER_TOOL) 111 | @echo "Starting container with $(CONTAINER_TOOL)..." 112 | $(CONTAINER_TOOL) run -d --name $(PROJECT_NAME)-container $(IMG) 113 | @echo "$(CONTAINER_TOOL) installation complete."
114 | @echo "To use $(PROJECT_NAME), run:" 115 | @echo "alias $(PROJECT_NAME)='$(CONTAINER_TOOL) exec -it $(PROJECT_NAME)-container /app/$(PROJECT_NAME)'" 116 | 117 | .PHONY: uninstall-docker 118 | uninstall-docker: check-container-tool ## Uninstall app from $(CONTAINER_TOOL) 119 | @echo "Stopping and removing container in $(CONTAINER_TOOL)..." 120 | -$(CONTAINER_TOOL) stop $(PROJECT_NAME)-container && $(CONTAINER_TOOL) rm $(PROJECT_NAME)-container 121 | @echo "$(CONTAINER_TOOL) uninstallation complete. Remove alias if set: unalias $(PROJECT_NAME)" 122 | 123 | ### Kubernetes Targets (kubectl) 124 | 125 | .PHONY: install-k8s 126 | install-k8s: check-kubectl check-kustomize check-envsubst ## Install on Kubernetes 127 | export PROJECT_NAME=${PROJECT_NAME} 128 | export NAMESPACE=${NAMESPACE} 129 | @echo "Creating namespace (if needed) and setting context to $(NAMESPACE)..." 130 | kubectl create namespace $(NAMESPACE) 2>/dev/null || true 131 | kubectl config set-context --current --namespace=$(NAMESPACE) 132 | @echo "Deploying resources from deploy/ ..." 133 | # Build the kustomization from deploy, substitute variables, and apply the YAML 134 | kustomize build deploy/environments/openshift-base | envsubst | kubectl apply -f - 135 | @echo "Waiting for pod to become ready..." 136 | sleep 5 137 | @POD=$$(kubectl get pod -l app=$(PROJECT_NAME)-statefulset -o jsonpath='{.items[0].metadata.name}'); \ 138 | echo "Kubernetes installation complete."; \ 139 | echo "To use the app, run:"; \ 140 | echo "alias $(PROJECT_NAME)='kubectl exec -n $(NAMESPACE) -it $$POD -- /app/$(PROJECT_NAME)'" 141 | 142 | .PHONY: uninstall-k8s 143 | uninstall-k8s: check-kubectl check-kustomize check-envsubst ## Uninstall from Kubernetes 144 | export PROJECT_NAME=${PROJECT_NAME} 145 | export NAMESPACE=${NAMESPACE} 146 | @echo "Removing resources from Kubernetes..." 
147 | kustomize build deploy/environments/openshift-base | envsubst | kubectl delete --force -f - || true 148 | POD=$$(kubectl get pod -l app=$(PROJECT_NAME)-statefulset -o jsonpath='{.items[0].metadata.name}'); \ 149 | echo "Deleting pod: $$POD"; \ 150 | kubectl delete pod "$$POD" --force --grace-period=0 || true; \ 151 | echo "Kubernetes uninstallation complete. Remove alias if set: unalias $(PROJECT_NAME)" 152 | 153 | ### OpenShift Targets (oc) 154 | 155 | .PHONY: install-openshift 156 | install-openshift: check-kubectl check-kustomize check-envsubst ## Install on OpenShift 157 | @echo $$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION 158 | @echo "Creating namespace $(NAMESPACE)..." 159 | kubectl create namespace $(NAMESPACE) 2>/dev/null || true 160 | @echo "Deploying common resources from deploy/ ..." 161 | # Build and substitute the base manifests from deploy, then apply them 162 | kustomize build deploy/environments/openshift-base | envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' | kubectl apply -n $(NAMESPACE) -f - 163 | @echo "Waiting for pod to become ready..." 164 | sleep 5 165 | @POD=$$(kubectl get pod -l app=$(PROJECT_NAME)-statefulset -n $(NAMESPACE) -o jsonpath='{.items[0].metadata.name}'); \ 166 | echo "OpenShift installation complete."; \ 167 | echo "To use the app, run:"; \ 168 | echo "alias $(PROJECT_NAME)='kubectl exec -n $(NAMESPACE) -it $$POD -- /app/$(PROJECT_NAME)'" 169 | 170 | .PHONY: uninstall-openshift 171 | uninstall-openshift: check-kubectl check-kustomize check-envsubst ## Uninstall from OpenShift 172 | @echo "Removing resources from OpenShift..." 
173 | kustomize build deploy/environments/openshift-base | envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' | kubectl delete --force -f - || true 174 | # @if kubectl api-resources --api-group=route.openshift.io | grep -q Route; then \ 175 | # envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' < deploy/openshift/route.yaml | kubectl delete --force -f - || true; \ 176 | # fi 177 | @POD=$$(kubectl get pod -l app=$(PROJECT_NAME)-statefulset -n $(NAMESPACE) -o jsonpath='{.items[0].metadata.name}'); \ 178 | echo "Deleting pod: $$POD"; \ 179 | kubectl delete pod "$$POD" --force --grace-period=0 || true; \ 180 | echo "OpenShift uninstallation complete. Remove alias if set: unalias $(PROJECT_NAME)" 181 | 182 | ### RBAC Targets (using kustomize and envsubst) 183 | 184 | .PHONY: install-rbac 185 | install-rbac: check-kubectl check-kustomize check-envsubst ## Install RBAC 186 | @echo "Applying RBAC configuration from deploy/environments/openshift-base/rbac..." 187 | kustomize build deploy/environments/openshift-base/rbac | envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' | kubectl apply -f - 188 | 189 | .PHONY: uninstall-rbac 190 | uninstall-rbac: check-kubectl check-kustomize check-envsubst ## Uninstall RBAC 191 | @echo "Removing RBAC configuration from deploy/environments/openshift-base/rbac..."
192 | kustomize build deploy/environments/openshift-base/rbac | envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' | kubectl delete -f - || true 193 | 194 | 195 | ##@ Version Extraction 196 | .PHONY: version extract-version-info 197 | 198 | .PHONY: env 199 | env: ## Print environment variables 200 | @echo "IMAGE_TAG_BASE=$(IMAGE_TAG_BASE)" 201 | @echo "IMG=$(IMG)" 202 | @echo "CONTAINER_TOOL=$(CONTAINER_TOOL)" 203 | 204 | 205 | ##@ Tools 206 | 207 | .PHONY: check-tools 208 | check-tools: \ 209 | check-go \ 210 | check-ginkgo \ 211 | check-golangci-lint \ 212 | check-kustomize \ 213 | check-envsubst \ 214 | check-container-tool \ 215 | check-kubectl \ 216 | check-buildah 217 | @echo "✅ All required tools are installed." 218 | 219 | .PHONY: check-go 220 | check-go: 221 | @command -v go >/dev/null 2>&1 || { \ 222 | echo "❌ Go is not installed. Install it from https://golang.org/dl/"; exit 1; } 223 | 224 | .PHONY: check-ginkgo 225 | check-ginkgo: 226 | @command -v ginkgo >/dev/null 2>&1 || { \ 227 | echo "❌ ginkgo is not installed. Install with: go install github.com/onsi/ginkgo/v2/ginkgo@latest"; exit 1; } 228 | 229 | .PHONY: check-golangci-lint 230 | check-golangci-lint: 231 | @command -v golangci-lint >/dev/null 2>&1 || { \ 232 | echo "❌ golangci-lint is not installed. Install from https://golangci-lint.run/usage/install/"; exit 1; } 233 | 234 | .PHONY: check-kustomize 235 | check-kustomize: 236 | @command -v kustomize >/dev/null 2>&1 || { \ 237 | echo "❌ kustomize is not installed. Install it from https://kubectl.docs.kubernetes.io/installation/kustomize/"; exit 1; } 238 | 239 | .PHONY: check-envsubst 240 | check-envsubst: 241 | @command -v envsubst >/dev/null 2>&1 || { \ 242 | echo "❌ envsubst is not installed. 
It is part of gettext."; \ 243 | echo "🔧 Try: sudo apt install gettext OR brew install gettext"; exit 1; } 244 | 245 | .PHONY: check-container-tool 246 | check-container-tool: 247 | @command -v $(CONTAINER_TOOL) >/dev/null 2>&1 || { \ 248 | echo "❌ $(CONTAINER_TOOL) is not installed."; \ 249 | echo "🔧 Try: sudo apt install $(CONTAINER_TOOL) OR brew install $(CONTAINER_TOOL)"; exit 1; } 250 | 251 | .PHONY: check-kubectl 252 | check-kubectl: 253 | @command -v kubectl >/dev/null 2>&1 || { \ 254 | echo "❌ kubectl is not installed. Install it from https://kubernetes.io/docs/tasks/tools/"; exit 1; } 255 | 256 | .PHONY: check-builder 257 | check-builder: 258 | @if [ -z "$(BUILDER)" ]; then \ 259 | echo "❌ No container builder tool (buildah, docker, or podman) found."; \ 260 | exit 1; \ 261 | else \ 262 | echo "✅ Using builder: $(BUILDER)"; \ 263 | fi 264 | 265 | ##@ Alias checking 266 | .PHONY: check-alias 267 | check-alias: check-container-tool 268 | @echo "🔍 Checking alias functionality for container '$(PROJECT_NAME)-container'..." 269 | @if ! 
$(CONTAINER_TOOL) exec $(PROJECT_NAME)-container /app/$(PROJECT_NAME) --help >/dev/null 2>&1; then \ 270 | echo "⚠️ The container '$(PROJECT_NAME)-container' is running, but the alias might not work."; \ 271 | echo "🔧 Try: $(CONTAINER_TOOL) exec -it $(PROJECT_NAME)-container /app/$(PROJECT_NAME)"; \ 272 | else \ 273 | echo "✅ Alias is likely to work: alias $(PROJECT_NAME)='$(CONTAINER_TOOL) exec -it $(PROJECT_NAME)-container /app/$(PROJECT_NAME)'"; \ 274 | fi 275 | 276 | .PHONY: print-namespace 277 | print-namespace: ## Print the current namespace 278 | @echo "$(NAMESPACE)" 279 | 280 | .PHONY: print-project-name 281 | print-project-name: ## Print the current project name 282 | @echo "$(PROJECT_NAME)" 283 | 284 | .PHONY: install-hooks 285 | install-hooks: ## Install git hooks 286 | git config core.hooksPath hooks 287 | 288 | ##@ Dev Environments 289 | 290 | KIND_CLUSTER_NAME ?= llm-d-inference-scheduler-dev 291 | KIND_GATEWAY_HOST_PORT ?= 30080 292 | 293 | .PHONY: env-dev-kind 294 | env-dev-kind: image-build 295 | CLUSTER_NAME=$(KIND_CLUSTER_NAME) \ 296 | GATEWAY_HOST_PORT=$(KIND_GATEWAY_HOST_PORT) \ 297 | IMAGE_REGISTRY=$(IMAGE_REGISTRY) \ 298 | EPP_TAG=$(EPP_TAG) \ 299 | ./scripts/kind-dev-env.sh 300 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Inference Scheduler 2 | 3 | This scheduler makes optimized routing decisions for inference requests to 4 | the llm-d inference framework. 5 | 6 | ## About 7 | 8 | This provides an "Endpoint Picker (EPP)" component to the llm-d inference 9 | framework which schedules incoming inference requests to the platform via a 10 | [Kubernetes] Gateway according to scheduler plugins (for more 11 | details, see the [Architecture Documentation]). 12 | 13 | The EPP extends the [Gateway API Inference Extension (GIE)] project, 14 | which provides the API resources and machinery for scheduling. 
We add some 15 | custom features that are specific to llm-d here, such as [P/D Disaggregation]. 16 | 17 | A compatible [Gateway API] implementation is used as the Gateway. The Gateway 18 | API implementation must utilize [Envoy] and support [ext-proc], as this is the 19 | callback mechanism the EPP relies on to make routing decisions to model serving 20 | workloads currently. 21 | 22 | [Kubernetes]:https://kubernetes.io 23 | [Architecture Documentation]:docs/architecture.md 24 | [Gateway API Inference Extension (GIE)]:https://github.com/kubernetes-sigs/gateway-api-inference-extension 25 | [P/D Disaggregation]:docs/dp.md 26 | [Gateway API]:https://github.com/kubernetes-sigs/gateway-api 27 | [Envoy]:https://github.com/envoyproxy/envoy 28 | [ext-proc]:https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter 29 | 30 | ## Contributing 31 | 32 | Contributions are welcome! 33 | 34 | For large changes please [create an issue] first describing the change so the 35 | maintainers can do an assessment, and work on the details with you. See 36 | [DEVELOPMENT.md](DEVELOPMENT.md) for details on how to work with the codebase. 37 | 38 | Note that in general features should go to the upstream [Gateway API Inference 39 | Extension (GIE)] project _first_ if applicable. The GIE is a major dependency of 40 | ours, and where most _general purpose_ inference features live. If you have 41 | something that you feel is of general purpose or use, it probably should go to the 42 | GIE. If you have something that's _llm-d specific_ then it should go here. If 43 | you're not sure whether your feature belongs here or in the GIE, feel free to 44 | create a [discussion] or ask on [Slack].
45 | 46 | [create an issue]:https://github.com/llm-d/llm-d-inference-scheduler/issues/new 47 | [Gateway API Inference Extension (GIE)]:https://github.com/kubernetes-sigs/gateway-api-inference-extension 48 | [discussion]:https://github.com/llm-d/llm-d-inference-scheduler/discussions/new?category=q-a 49 | [Slack]:https://llm-d.slack.com/ 50 | -------------------------------------------------------------------------------- /cmd/epp/health.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | /** 18 | * This file is adapted from Gateway API Inference Extension 19 | * Original source: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/cmd/epp/health.go 20 | * Licensed under the Apache License, Version 2.0 21 | */ 22 | 23 | package main 24 | 25 | import ( 26 | "context" 27 | 28 | extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" 29 | "github.com/go-logr/logr" 30 | "google.golang.org/grpc/codes" 31 | healthPb "google.golang.org/grpc/health/grpc_health_v1" 32 | "google.golang.org/grpc/status" 33 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" 34 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 35 | ) 36 | 37 | type healthServer struct { 38 | logger logr.Logger 39 | datastore datastore.Datastore 40 | } 41 | 42 | func (s *healthServer) Check(_ context.Context, in *healthPb.HealthCheckRequest) (*healthPb.HealthCheckResponse, error) { 43 | // TODO: we're accepting ANY service name for now as a temporary hack in alignment with 44 | // upstream issues. 
See https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/788 45 | // if in.Service != extProcPb.ExternalProcessor_ServiceDesc.ServiceName { 46 | // s.logger.V(logutil.DEFAULT).Info("gRPC health check requested unknown service", "available-services", []string{extProcPb.ExternalProcessor_ServiceDesc.ServiceName}, "requested-service", in.Service) 47 | // return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_SERVICE_UNKNOWN}, nil 48 | // } 49 | 50 | if !s.datastore.PoolHasSynced() { 51 | s.logger.V(logutil.DEFAULT).Info("gRPC health check not serving", "service", in.Service) 52 | return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_NOT_SERVING}, nil 53 | } 54 | 55 | s.logger.V(logutil.TRACE).Info("gRPC health check serving", "service", in.Service) 56 | return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_SERVING}, nil 57 | } 58 | 59 | func (s *healthServer) List(ctx context.Context, _ *healthPb.HealthListRequest) (*healthPb.HealthListResponse, error) { 60 | // currently only the ext_proc service is provided 61 | serviceHealthResponse, err := s.Check(ctx, &healthPb.HealthCheckRequest{Service: extProcPb.ExternalProcessor_ServiceDesc.ServiceName}) 62 | if err != nil { 63 | return nil, err 64 | } 65 | 66 | return &healthPb.HealthListResponse{ 67 | Statuses: map[string]*healthPb.HealthCheckResponse{ 68 | extProcPb.ExternalProcessor_ServiceDesc.ServiceName: serviceHealthResponse, 69 | }, 70 | }, nil 71 | } 72 | 73 | func (s *healthServer) Watch(_ *healthPb.HealthCheckRequest, _ healthPb.Health_WatchServer) error { 74 | return status.Error(codes.Unimplemented, "Watch is not implemented") 75 | } 76 | -------------------------------------------------------------------------------- /cmd/epp/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | /** 18 | * This file is adapted from Gateway API Inference Extension 19 | * Original source: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/cmd/epp/main.go 20 | * Licensed under the Apache License, Version 2.0 21 | */ 22 | 23 | // Package main contains the "Endpoint Picker (EPP)" program for scheduling 24 | // inference requests. 25 | package main 26 | 27 | import ( 28 | "flag" 29 | "fmt" 30 | "net" 31 | "net/http" 32 | "os" 33 | "strconv" 34 | 35 | "github.com/go-logr/logr" 36 | "github.com/prometheus/client_golang/prometheus/promhttp" 37 | uberzap "go.uber.org/zap" 38 | "go.uber.org/zap/zapcore" 39 | "google.golang.org/grpc" 40 | healthPb "google.golang.org/grpc/health/grpc_health_v1" 41 | "k8s.io/apimachinery/pkg/types" 42 | "k8s.io/client-go/rest" 43 | "k8s.io/component-base/metrics/legacyregistry" 44 | ctrl "sigs.k8s.io/controller-runtime" 45 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 46 | "sigs.k8s.io/controller-runtime/pkg/manager" 47 | "sigs.k8s.io/controller-runtime/pkg/metrics/filters" 48 | backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" 49 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" 50 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" 51 | runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" 52 | 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 53 | 54 | "github.com/llm-d/llm-d-inference-scheduler/internal/controller/runnable" 55 | "github.com/llm-d/llm-d-inference-scheduler/pkg/config" 56 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/pd" 57 | ) 58 | 59 | const ( 60 | defaultMetricsEndpoint = "/metrics" 61 | ) 62 | 63 | var ( 64 | grpcPort = flag.Int( 65 | "grpcPort", 66 | runserver.DefaultGrpcPort, 67 | "The gRPC port used for communicating with Envoy proxy") 68 | grpcHealthPort = flag.Int( 69 | "grpcHealthPort", 70 | 9003, 71 | "The port used for gRPC liveness and readiness probes") 72 | metricsPort = flag.Int( 73 | "metricsPort", 9090, "The metrics port") 74 | destinationEndpointHintKey = flag.String( 75 | "destinationEndpointHintKey", 76 | runserver.DefaultDestinationEndpointHintKey, 77 | "Header and response metadata key used by Envoy to route to the appropriate pod. This must match Envoy configuration.") 78 | destinationEndpointHintMetadataNamespace = flag.String( 79 | "DestinationEndpointHintMetadataNamespace", 80 | runserver.DefaultDestinationEndpointHintMetadataNamespace, 81 | "The key for the outer namespace struct in the metadata field of the extproc response that is used to wrap the"+ 82 | "target endpoint. 
If not set, then an outer namespace struct should not be created.") 83 | poolName = flag.String( 84 | "poolName", 85 | runserver.DefaultPoolName, 86 | "Name of the InferencePool this Endpoint Picker is associated with.") 87 | poolNamespace = flag.String( 88 | "poolNamespace", 89 | runserver.DefaultPoolNamespace, 90 | "Namespace of the InferencePool this Endpoint Picker is associated with.") 91 | refreshMetricsInterval = flag.Duration( 92 | "refreshMetricsInterval", 93 | runserver.DefaultRefreshMetricsInterval, 94 | "interval to refresh metrics") 95 | refreshPrometheusMetricsInterval = flag.Duration( 96 | "refreshPrometheusMetricsInterval", 97 | runserver.DefaultRefreshPrometheusMetricsInterval, 98 | "interval to flush prometheus metrics") 99 | logVerbosity = flag.Int("v", logging.DEFAULT, "number for the log level verbosity") 100 | secureServing = flag.Bool( 101 | "secureServing", runserver.DefaultSecureServing, "Enables secure serving. Defaults to true.") 102 | certPath = flag.String( 103 | "certPath", "", "The path to the certificate for secure serving. The certificate and private key files "+ 104 | "are assumed to be named tls.crt and tls.key, respectively. 
If not set, and secureServing is enabled, "+ 105 | "then a self-signed certificate is used.") 106 | // metric flags 107 | totalQueuedRequestsMetric = flag.String("totalQueuedRequestsMetric", 108 | "vllm:num_requests_waiting", 109 | "Prometheus metric for the number of queued requests.") 110 | kvCacheUsagePercentageMetric = flag.String("kvCacheUsagePercentageMetric", 111 | "vllm:gpu_cache_usage_perc", 112 | "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).") 113 | // LoRA metrics 114 | loraInfoMetric = flag.String("loraInfoMetric", 115 | "vllm:lora_requests_info", 116 | "Prometheus metric for the LoRA info metrics (must be in vLLM label format).") 117 | 118 | setupLog = ctrl.Log.WithName("setup") 119 | ) 120 | 121 | func main() { 122 | if err := run(); err != nil { 123 | os.Exit(1) 124 | } 125 | } 126 | 127 | func run() error { 128 | opts := zap.Options{ 129 | Development: true, 130 | } 131 | opts.BindFlags(flag.CommandLine) 132 | flag.Parse() 133 | initLogging(&opts) 134 | 135 | // Validate flags 136 | if err := validateFlags(); err != nil { 137 | setupLog.Error(err, "Failed to validate flags") 138 | return err 139 | } 140 | 141 | // Print all flag values 142 | flags := make(map[string]any) 143 | flag.VisitAll(func(f *flag.Flag) { 144 | flags[f.Name] = f.Value 145 | }) 146 | setupLog.Info("Flags processed", "flags", flags) 147 | 148 | // Init runtime. 149 | cfg, err := ctrl.GetConfig() 150 | if err != nil { 151 | setupLog.Error(err, "Failed to get rest config") 152 | return err 153 | } 154 | 155 | poolNamespacedName := types.NamespacedName{ 156 | Name: *poolName, 157 | Namespace: *poolNamespace, 158 | } 159 | mgr, err := runserver.NewDefaultManager(poolNamespacedName, cfg) 160 | if err != nil { 161 | setupLog.Error(err, "Failed to create controller manager") 162 | return err 163 | } 164 | 165 | // Set up mapper for metric scraping. 
166 | mapping, err := backendmetrics.NewMetricMapping( 167 | *totalQueuedRequestsMetric, 168 | *kvCacheUsagePercentageMetric, 169 | *loraInfoMetric, 170 | ) 171 | if err != nil { 172 | setupLog.Error(err, "Failed to create metric mapping from flags.") 173 | return err 174 | } 175 | verifyMetricMapping(*mapping, setupLog) 176 | 177 | pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{MetricMapping: mapping}, *refreshMetricsInterval) 178 | // Setup runner. 179 | ctx := ctrl.SetupSignalHandler() 180 | 181 | schedCfg := config.NewConfig(setupLog) 182 | schedCfg.LoadConfig() 183 | 184 | datastore := datastore.NewDatastore(ctx, pmf) 185 | scheduler, err := pd.NewScheduler(ctx, schedCfg, datastore) 186 | if err != nil { 187 | setupLog.Error(err, "Failed to create PD scheduler") 188 | return err 189 | } 190 | 191 | serverRunner := &runserver.ExtProcServerRunner{ 192 | GrpcPort: *grpcPort, 193 | DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace, 194 | DestinationEndpointHintKey: *destinationEndpointHintKey, 195 | PoolNamespacedName: poolNamespacedName, 196 | Datastore: datastore, 197 | SecureServing: *secureServing, 198 | CertPath: *certPath, 199 | RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, 200 | Scheduler: scheduler, 201 | } 202 | if err := serverRunner.SetupWithManager(ctx, mgr); err != nil { 203 | setupLog.Error(err, "Failed to setup ext-proc controllers") 204 | return err 205 | } 206 | 207 | // Register health server. 208 | if err := registerHealthServer(mgr, ctrl.Log.WithName("health"), datastore, *grpcHealthPort); err != nil { 209 | return err 210 | } 211 | 212 | // Register ext-proc server. 213 | if err := mgr.Add(serverRunner.AsRunnable(ctrl.Log.WithName("ext-proc"))); err != nil { 214 | setupLog.Error(err, "Failed to register ext-proc gRPC server") 215 | return err 216 | } 217 | 218 | // Register metrics handler. 
219 | if err := registerMetricsHandler(mgr, *metricsPort, cfg); err != nil { 220 | return err 221 | } 222 | 223 | // Start the manager. This blocks until a signal is received. 224 | setupLog.Info("Controller manager starting") 225 | if err := mgr.Start(ctx); err != nil { 226 | setupLog.Error(err, "Error starting controller manager") 227 | return err 228 | } 229 | setupLog.Info("Controller manager terminated") 230 | return nil 231 | } 232 | 233 | func initLogging(opts *zap.Options) { 234 | // Unless -zap-log-level is explicitly set, use -v 235 | useV := true 236 | flag.Visit(func(f *flag.Flag) { 237 | if f.Name == "zap-log-level" { 238 | useV = false 239 | } 240 | }) 241 | if useV { 242 | // See https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/log/zap#Options.Level 243 | lvl := -1 * (*logVerbosity) 244 | opts.Level = uberzap.NewAtomicLevelAt(zapcore.Level(int8(lvl))) 245 | } 246 | 247 | logger := zap.New(zap.UseFlagOptions(opts), zap.RawZapOpts(uberzap.AddCaller())) 248 | ctrl.SetLogger(logger) 249 | } 250 | 251 | // registerHealthServer adds the Health gRPC server as a Runnable to the given manager. 252 | func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds datastore.Datastore, port int) error { 253 | srv := grpc.NewServer() 254 | healthPb.RegisterHealthServer(srv, &healthServer{ 255 | logger: logger, 256 | datastore: ds, 257 | }) 258 | if err := mgr.Add( 259 | runnable.NoLeaderElection(runnable.GRPCServer("health", srv, port))); err != nil { 260 | setupLog.Error(err, "Failed to register health server") 261 | return err 262 | } 263 | return nil 264 | } 265 | 266 | // registerMetricsHandler adds the metrics HTTP handler as a Runnable to the given manager. 267 | func registerMetricsHandler(mgr manager.Manager, port int, cfg *rest.Config) error { 268 | metrics.Register() 269 | 270 | // Init HTTP server. 
271 | h, err := metricsHandlerWithAuthenticationAndAuthorization(cfg) 272 | if err != nil { 273 | return err 274 | } 275 | 276 | mux := http.NewServeMux() 277 | mux.Handle(defaultMetricsEndpoint, h) 278 | 279 | srv := &http.Server{ 280 | Addr: net.JoinHostPort("", strconv.Itoa(port)), 281 | Handler: mux, 282 | } 283 | 284 | if err := mgr.Add(&manager.Server{ 285 | Name: "metrics", 286 | Server: srv, 287 | }); err != nil { 288 | setupLog.Error(err, "Failed to register metrics HTTP handler") 289 | return err 290 | } 291 | return nil 292 | } 293 | 294 | func metricsHandlerWithAuthenticationAndAuthorization(cfg *rest.Config) (http.Handler, error) { 295 | h := promhttp.HandlerFor( 296 | legacyregistry.DefaultGatherer, 297 | promhttp.HandlerOpts{}, 298 | ) 299 | httpClient, err := rest.HTTPClientFor(cfg) 300 | if err != nil { 301 | setupLog.Error(err, "Failed to create http client for metrics auth") 302 | return nil, err 303 | } 304 | 305 | filter, err := filters.WithAuthenticationAndAuthorization(cfg, httpClient) 306 | if err != nil { 307 | setupLog.Error(err, "Failed to create metrics filter for auth") 308 | return nil, err 309 | } 310 | metricsLogger := ctrl.Log.WithName("metrics").WithValues("path", defaultMetricsEndpoint) 311 | metricsAuthHandler, err := filter(metricsLogger, h) 312 | if err != nil { 313 | setupLog.Error(err, "Failed to create metrics auth handler") 314 | return nil, err 315 | } 316 | return metricsAuthHandler, nil 317 | } 318 | 319 | func validateFlags() error { 320 | if *poolName == "" { 321 | return fmt.Errorf("required %q flag not set", "poolName") 322 | } 323 | 324 | return nil 325 | } 326 | 327 | func verifyMetricMapping(mapping backendmetrics.MetricMapping, logger logr.Logger) { 328 | if mapping.TotalQueuedRequests == nil { 329 | logger.Info("Not scraping metric: TotalQueuedRequests") 330 | } 331 | if mapping.KVCacheUtilization == nil { 332 | logger.Info("Not scraping metric: KVCacheUtilization") 333 | } 334 | if mapping.LoraRequestInfo == 
nil { 335 | logger.Info("Not scraping metric: LoraRequestInfo") 336 | } 337 | 338 | } 339 | -------------------------------------------------------------------------------- /deploy/components/crds-gateway-api/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Custom Resource Definitions (CRDs) for Gateway API 3 | # 4 | # **Warning**: CRDs are cluster-level, so in a shared development environment 5 | # this needs to be done in a controlled and communicated manner. 6 | # ------------------------------------------------------------------------------ 7 | apiVersion: kustomize.config.k8s.io/v1beta1 8 | kind: Kustomization 9 | 10 | resources: 11 | - https://github.com/kubernetes-sigs/gateway-api/config/crd?ref=v1.3.0 12 | -------------------------------------------------------------------------------- /deploy/components/crds-gie/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Custom Resource Definitions (CRDs) for Gateway API Inference Extension (GIE) 3 | # 4 | # This deploys the GIE CRDs from the local directory. 5 | # 6 | # **Warning**: CRDs are cluster-level, so in a shared development environment 7 | # this needs to be done in a controlled and communicated manner. 
8 | # ------------------------------------------------------------------------------ 9 | apiVersion: kustomize.config.k8s.io/v1beta1 10 | kind: Kustomization 11 | 12 | resources: 13 | - https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd?ref=v0.3.0 -------------------------------------------------------------------------------- /deploy/components/crds-istio/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Custom Resource Definitions (CRDs) for Istio 3 | # 4 | # **Warning**: CRDs are cluster-level, so in a shared development environment 5 | # this needs to be done in a controlled and communicated manner. 6 | # ------------------------------------------------------------------------------ 7 | apiVersion: kustomize.config.k8s.io/v1beta1 8 | kind: Kustomization 9 | 10 | resources: 11 | - istio.yaml 12 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/deployments.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: endpoint-picker 5 | labels: 6 | app: endpoint-picker 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: endpoint-picker 12 | template: 13 | metadata: 14 | labels: 15 | app: endpoint-picker 16 | spec: 17 | serviceAccountName: endpoint-picker 18 | terminationGracePeriodSeconds: 130 19 | containers: 20 | - name: epp 21 | image: ghcr.io/llm-d/llm-d-inference-scheduler:latest 22 | imagePullPolicy: IfNotPresent 23 | args: 24 | - -poolName 25 | - "${POOL_NAME}" 26 | - -v 27 | - "4" 28 | - --zap-encoder 29 | - "json" 30 | - -grpcPort 31 | - "9002" 32 | - -grpcHealthPort 33 | - "9003" 34 | env: 35 | - name: PD_ENABLED 36 | value: '${PD_ENABLED}' 37 | - name: PD_PROMPT_LEN_THRESHOLD 38 | value: 
'${PD_PROMPT_LEN_THRESHOLD}' 39 | ports: 40 | - containerPort: 9002 41 | - containerPort: 9003 42 | - name: metrics 43 | containerPort: 9090 44 | livenessProbe: 45 | grpc: 46 | port: 9003 47 | service: envoy.service.ext_proc.v3.ExternalProcessor 48 | initialDelaySeconds: 5 49 | periodSeconds: 10 50 | readinessProbe: 51 | grpc: 52 | port: 9003 53 | service: envoy.service.ext_proc.v3.ExternalProcessor 54 | initialDelaySeconds: 5 55 | periodSeconds: 10 56 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/gateways.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: Gateway 3 | metadata: 4 | name: inference-gateway 5 | spec: 6 | listeners: 7 | - name: default 8 | port: 80 9 | protocol: HTTP 10 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/httproutes.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: HTTPRoute 3 | metadata: 4 | name: inference-route 5 | spec: 6 | parentRefs: 7 | - name: inference-gateway 8 | rules: 9 | - matches: 10 | - path: 11 | type: PathPrefix 12 | value: / 13 | backendRefs: 14 | - group: inference.networking.x-k8s.io 15 | kind: InferencePool 16 | name: ${POOL_NAME} 17 | port: 8000 18 | timeouts: 19 | request: 30s 20 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/inference-models.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: inference.networking.x-k8s.io/v1alpha2 2 | kind: InferenceModel 3 | metadata: 4 | name: food-review 5 | spec: 6 | modelName: food-review 7 | criticality: Critical 8 | poolRef: 9 | name: ${POOL_NAME} 10 | targetModels: 11 | - name: food-review 12 | weight: 100 13 | --- 14 | 
apiVersion: inference.networking.x-k8s.io/v1alpha2 15 | kind: InferenceModel 16 | metadata: 17 | name: base-model 18 | spec: 19 | modelName: ${MODEL_NAME} 20 | criticality: Critical 21 | poolRef: 22 | name: ${POOL_NAME} 23 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/inference-pools.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: inference.networking.x-k8s.io/v1alpha2 2 | kind: InferencePool 3 | metadata: 4 | name: ${POOL_NAME} 5 | spec: 6 | targetPortNumber: 8000 7 | selector: 8 | app: ${POOL_NAME} 9 | extensionRef: 10 | name: endpoint-picker 11 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Inference Gateway 3 | # 4 | # This provides a working stack for an inference Gateway, including the Gateway 5 | # itself, the Endpoint Picker (EPP) attached to it, and the Inference Pools and 6 | # Inference Models to collect pods from a model serving framework (e.g. VLLM, 7 | # or even just the VLLM Simulator). 
8 | # 9 | # ------------------------------------------------------------------------------ 10 | apiVersion: kustomize.config.k8s.io/v1beta1 11 | kind: Kustomization 12 | 13 | resources: 14 | - service-accounts.yaml 15 | - rbac.yaml 16 | - inference-pools.yaml 17 | - inference-models.yaml 18 | - services.yaml 19 | - deployments.yaml 20 | - gateways.yaml 21 | - httproutes.yaml 22 | 23 | images: 24 | - name: ghcr.io/llm-d/llm-d-inference-scheduler 25 | newTag: ${EPP_TAG} 26 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/rbac.yaml: -------------------------------------------------------------------------------- 1 | kind: Role 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: endpoint-picker 5 | rules: 6 | - apiGroups: 7 | - "inference.networking.x-k8s.io" 8 | resources: 9 | - "inferencepools" 10 | - "inferencemodels" 11 | verbs: 12 | - "get" 13 | - "watch" 14 | - "list" 15 | - apiGroups: 16 | - "" 17 | resources: 18 | - "pods" 19 | verbs: 20 | - "get" 21 | - "watch" 22 | - "list" 23 | - apiGroups: 24 | - "discovery.k8s.io" 25 | resources: 26 | - "endpointslices" 27 | verbs: 28 | - "get" 29 | - "watch" 30 | - "list" 31 | - apiGroups: 32 | - "authentication.k8s.io" 33 | resources: 34 | - "tokenreviews" 35 | verbs: 36 | - "create" 37 | - apiGroups: 38 | - "authorization.k8s.io" 39 | resources: 40 | - "subjectaccessreviews" 41 | verbs: 42 | - "create" 43 | --- 44 | apiVersion: rbac.authorization.k8s.io/v1 45 | kind: RoleBinding 46 | metadata: 47 | name: endpoint-picker-binding 48 | subjects: 49 | - kind: ServiceAccount 50 | name: endpoint-picker 51 | roleRef: 52 | apiGroup: rbac.authorization.k8s.io 53 | kind: Role 54 | name: endpoint-picker 55 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/service-accounts.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 
2 | kind: ServiceAccount 3 | metadata: 4 | name: endpoint-picker 5 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/services.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: endpoint-picker 5 | spec: 6 | selector: 7 | app: endpoint-picker 8 | ports: 9 | - protocol: TCP 10 | port: 9002 11 | targetPort: 9002 12 | appProtocol: http2 13 | type: ClusterIP 14 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/deployments.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | labels: 5 | app: istiod 6 | app.kubernetes.io/instance: istio 7 | app.kubernetes.io/managed-by: Helm 8 | app.kubernetes.io/name: istiod 9 | app.kubernetes.io/part-of: istio 10 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 11 | helm.sh/chart: istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 12 | install.operator.istio.io/owning-resource: unknown 13 | istio: pilot 14 | istio.io/rev: llm-d-gateway 15 | operator.istio.io/component: Pilot 16 | release: istio 17 | name: istiod-llm-d-gateway 18 | namespace: llm-d-istio-system 19 | spec: 20 | selector: 21 | matchLabels: 22 | app: istiod 23 | istio.io/rev: llm-d-gateway 24 | strategy: 25 | rollingUpdate: 26 | maxSurge: 100% 27 | maxUnavailable: 25% 28 | template: 29 | metadata: 30 | annotations: 31 | prometheus.io/port: "15014" 32 | prometheus.io/scrape: "true" 33 | sidecar.istio.io/inject: "false" 34 | labels: 35 | app: istiod 36 | app.kubernetes.io/instance: istio 37 | app.kubernetes.io/managed-by: Helm 38 | app.kubernetes.io/name: istiod 39 | app.kubernetes.io/part-of: istio 40 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 41 | helm.sh/chart: 
istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 42 | install.operator.istio.io/owning-resource: unknown 43 | istio: istiod 44 | istio.io/dataplane-mode: none 45 | istio.io/rev: llm-d-gateway 46 | operator.istio.io/component: Pilot 47 | sidecar.istio.io/inject: "false" 48 | spec: 49 | containers: 50 | - args: 51 | - discovery 52 | - --monitoringAddr=:15014 53 | - --log_output_level=default:info 54 | - --domain 55 | - cluster.local 56 | - --keepaliveMaxServerConnectionAge 57 | - 30m 58 | env: 59 | - name: REVISION 60 | value: llm-d-gateway 61 | - name: PILOT_CERT_PROVIDER 62 | value: istiod 63 | - name: POD_NAME 64 | valueFrom: 65 | fieldRef: 66 | apiVersion: v1 67 | fieldPath: metadata.name 68 | - name: POD_NAMESPACE 69 | valueFrom: 70 | fieldRef: 71 | apiVersion: v1 72 | fieldPath: metadata.namespace 73 | - name: SERVICE_ACCOUNT 74 | valueFrom: 75 | fieldRef: 76 | apiVersion: v1 77 | fieldPath: spec.serviceAccountName 78 | - name: KUBECONFIG 79 | value: /var/run/secrets/remote/config 80 | - name: CA_TRUSTED_NODE_ACCOUNTS 81 | value: llm-d-istio-system/ztunnel 82 | - name: PILOT_TRACE_SAMPLING 83 | value: "1" 84 | - name: PILOT_ENABLE_ANALYSIS 85 | value: "false" 86 | - name: CLUSTER_ID 87 | value: Kubernetes 88 | - name: GOMEMLIMIT 89 | valueFrom: 90 | resourceFieldRef: 91 | resource: limits.memory 92 | - name: GOMAXPROCS 93 | valueFrom: 94 | resourceFieldRef: 95 | divisor: "1" 96 | resource: limits.cpu 97 | - name: PLATFORM 98 | value: "" 99 | image: quay.io/rh-ee-sutt/istio-testing/pilot:1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 100 | name: discovery 101 | ports: 102 | - containerPort: 8080 103 | name: http-debug 104 | protocol: TCP 105 | - containerPort: 15010 106 | name: grpc-xds 107 | protocol: TCP 108 | - containerPort: 15012 109 | name: tls-xds 110 | protocol: TCP 111 | - containerPort: 15017 112 | name: https-webhooks 113 | protocol: TCP 114 | - containerPort: 15014 115 | name: http-monitoring 116 | protocol: TCP 117 | readinessProbe: 
118 | httpGet: 119 | path: /ready 120 | port: 8080 121 | initialDelaySeconds: 1 122 | periodSeconds: 3 123 | timeoutSeconds: 5 124 | resources: 125 | requests: 126 | cpu: 500m 127 | memory: 1024Mi 128 | securityContext: 129 | allowPrivilegeEscalation: false 130 | capabilities: 131 | drop: 132 | - ALL 133 | readOnlyRootFilesystem: true 134 | runAsNonRoot: true 135 | volumeMounts: 136 | - mountPath: /var/run/secrets/tokens 137 | name: istio-token 138 | readOnly: true 139 | - mountPath: /var/run/secrets/istio-dns 140 | name: local-certs 141 | - mountPath: /etc/cacerts 142 | name: cacerts 143 | readOnly: true 144 | - mountPath: /var/run/secrets/remote 145 | name: istio-kubeconfig 146 | readOnly: true 147 | - mountPath: /var/run/secrets/istiod/tls 148 | name: istio-csr-dns-cert 149 | readOnly: true 150 | - mountPath: /var/run/secrets/istiod/ca 151 | name: istio-csr-ca-configmap 152 | readOnly: true 153 | serviceAccountName: istiod-llm-d-gateway 154 | tolerations: 155 | - key: cni.istio.io/not-ready 156 | operator: Exists 157 | volumes: 158 | - emptyDir: 159 | medium: Memory 160 | name: local-certs 161 | - name: istio-token 162 | projected: 163 | sources: 164 | - serviceAccountToken: 165 | audience: istio-ca 166 | expirationSeconds: 43200 167 | path: istio-token 168 | - name: cacerts 169 | secret: 170 | optional: true 171 | secretName: cacerts 172 | - name: istio-kubeconfig 173 | secret: 174 | optional: true 175 | secretName: istio-kubeconfig 176 | - name: istio-csr-dns-cert 177 | secret: 178 | optional: true 179 | secretName: istiod-tls 180 | - configMap: 181 | defaultMode: 420 182 | name: istio-ca-root-cert 183 | optional: true 184 | name: istio-csr-ca-configmap 185 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/hpa.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: autoscaling/v2 2 | kind: HorizontalPodAutoscaler 3 | metadata: 4 | labels: 5 | 
app: istiod 6 | app.kubernetes.io/instance: istio 7 | app.kubernetes.io/managed-by: Helm 8 | app.kubernetes.io/name: istiod 9 | app.kubernetes.io/part-of: istio 10 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 11 | helm.sh/chart: istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 12 | install.operator.istio.io/owning-resource: unknown 13 | istio.io/rev: llm-d-gateway 14 | operator.istio.io/component: Pilot 15 | release: istio 16 | name: istiod-llm-d-gateway 17 | namespace: llm-d-istio-system 18 | spec: 19 | maxReplicas: 5 20 | metrics: 21 | - resource: 22 | name: cpu 23 | target: 24 | averageUtilization: 80 25 | type: Utilization 26 | type: Resource 27 | minReplicas: 1 28 | scaleTargetRef: 29 | apiVersion: apps/v1 30 | kind: Deployment 31 | name: istiod-llm-d-gateway 32 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | resources: 4 | - namespaces.yaml 5 | - configmaps.yaml 6 | - deployments.yaml 7 | - hpa.yaml 8 | - policies.yaml 9 | - rbac.yaml 10 | - service-accounts.yaml 11 | - services.yaml 12 | - webhooks.yaml 13 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/namespaces.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: llm-d-istio-system 5 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/policies.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: policy/v1 2 | kind: PodDisruptionBudget 3 | metadata: 4 | labels: 5 | app: istiod 6 | app.kubernetes.io/instance: istio 7 | 
app.kubernetes.io/managed-by: Helm 8 | app.kubernetes.io/name: istiod 9 | app.kubernetes.io/part-of: istio 10 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 11 | helm.sh/chart: istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 12 | install.operator.istio.io/owning-resource: unknown 13 | istio: pilot 14 | istio.io/rev: llm-d-gateway 15 | operator.istio.io/component: Pilot 16 | release: istio 17 | name: istiod-llm-d-gateway 18 | namespace: llm-d-istio-system 19 | spec: 20 | minAvailable: 1 21 | selector: 22 | matchLabels: 23 | app: istiod 24 | istio.io/rev: llm-d-gateway 25 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/service-accounts.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app: istio-reader 6 | app.kubernetes.io/instance: istio 7 | app.kubernetes.io/managed-by: Helm 8 | app.kubernetes.io/name: istio-reader 9 | app.kubernetes.io/part-of: istio 10 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 11 | helm.sh/chart: base-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 12 | release: istio 13 | name: istio-reader-service-account 14 | namespace: llm-d-istio-system 15 | --- 16 | apiVersion: v1 17 | kind: ServiceAccount 18 | metadata: 19 | labels: 20 | app: istiod 21 | app.kubernetes.io/instance: istio 22 | app.kubernetes.io/managed-by: Helm 23 | app.kubernetes.io/name: istiod 24 | app.kubernetes.io/part-of: istio 25 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 26 | helm.sh/chart: istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 27 | release: istio 28 | name: istiod-llm-d-gateway 29 | namespace: llm-d-istio-system 30 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/services.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | app: istiod 6 | app.kubernetes.io/instance: istio 7 | app.kubernetes.io/managed-by: Helm 8 | app.kubernetes.io/name: istiod 9 | app.kubernetes.io/part-of: istio 10 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 11 | helm.sh/chart: istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 12 | install.operator.istio.io/owning-resource: unknown 13 | istio: pilot 14 | istio.io/rev: llm-d-gateway 15 | operator.istio.io/component: Pilot 16 | release: istio 17 | name: istiod-llm-d-gateway 18 | namespace: llm-d-istio-system 19 | spec: 20 | ports: 21 | - name: grpc-xds 22 | port: 15010 23 | protocol: TCP 24 | - name: https-dns 25 | port: 15012 26 | protocol: TCP 27 | - name: https-webhook 28 | port: 443 29 | protocol: TCP 30 | targetPort: 15017 31 | - name: http-monitoring 32 | port: 15014 33 | protocol: TCP 34 | selector: 35 | app: istiod 36 | istio.io/rev: llm-d-gateway 37 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/telemetry.yaml: -------------------------------------------------------------------------------- 1 | # Enables debug logging for Gateways 2 | apiVersion: telemetry.istio.io/v1 3 | kind: Telemetry 4 | metadata: 5 | name: mesh-default 6 | namespace: istio-gateway 7 | spec: 8 | accessLogging: 9 | - providers: 10 | - name: envoy 11 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/webhooks.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingWebhookConfiguration 3 | metadata: 4 | labels: 5 | app: istiod 6 | app.kubernetes.io/instance: istio 7 | app.kubernetes.io/managed-by: Helm 8 | app.kubernetes.io/name: istiod 9 | 
app.kubernetes.io/part-of: istio 10 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 11 | helm.sh/chart: istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 12 | istio: istiod 13 | istio.io/rev: llm-d-gateway 14 | release: istio 15 | name: istio-validator-llm-d-gateway-llm-d-istio-system 16 | webhooks: 17 | - admissionReviewVersions: 18 | - v1 19 | clientConfig: 20 | service: 21 | name: istiod-llm-d-gateway 22 | namespace: llm-d-istio-system 23 | path: /validate 24 | failurePolicy: Ignore 25 | name: rev.validation.istio.io 26 | objectSelector: 27 | matchExpressions: 28 | - key: istio.io/rev 29 | operator: In 30 | values: 31 | - llm-d-gateway 32 | rules: 33 | - apiGroups: 34 | - security.istio.io 35 | - networking.istio.io 36 | - telemetry.istio.io 37 | - extensions.istio.io 38 | apiVersions: 39 | - '*' 40 | operations: 41 | - CREATE 42 | - UPDATE 43 | resources: 44 | - '*' 45 | sideEffects: None 46 | --- 47 | apiVersion: admissionregistration.k8s.io/v1 48 | kind: MutatingWebhookConfiguration 49 | metadata: 50 | labels: 51 | app: sidecar-injector 52 | app.kubernetes.io/instance: istio 53 | app.kubernetes.io/managed-by: Helm 54 | app.kubernetes.io/name: istiod 55 | app.kubernetes.io/part-of: istio 56 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 57 | helm.sh/chart: istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 58 | install.operator.istio.io/owning-resource: unknown 59 | istio.io/rev: llm-d-gateway 60 | operator.istio.io/component: Pilot 61 | release: istio 62 | name: istio-sidecar-injector-llm-d-gateway-llm-d-istio-system 63 | webhooks: 64 | - admissionReviewVersions: 65 | - v1 66 | clientConfig: 67 | service: 68 | name: istiod-llm-d-gateway 69 | namespace: llm-d-istio-system 70 | path: /inject 71 | port: 443 72 | failurePolicy: Fail 73 | name: rev.namespace.sidecar-injector.istio.io 74 | namespaceSelector: 75 | matchExpressions: 76 | - key: istio.io/rev 77 | operator: In 78 | 
values: 79 | - llm-d-gateway 80 | - key: istio-injection 81 | operator: DoesNotExist 82 | objectSelector: 83 | matchExpressions: 84 | - key: sidecar.istio.io/inject 85 | operator: NotIn 86 | values: 87 | - "false" 88 | reinvocationPolicy: Never 89 | rules: 90 | - apiGroups: 91 | - "" 92 | apiVersions: 93 | - v1 94 | operations: 95 | - CREATE 96 | resources: 97 | - pods 98 | sideEffects: None 99 | - admissionReviewVersions: 100 | - v1 101 | clientConfig: 102 | service: 103 | name: istiod-llm-d-gateway 104 | namespace: llm-d-istio-system 105 | path: /inject 106 | port: 443 107 | failurePolicy: Fail 108 | name: rev.object.sidecar-injector.istio.io 109 | namespaceSelector: 110 | matchExpressions: 111 | - key: istio.io/rev 112 | operator: DoesNotExist 113 | - key: istio-injection 114 | operator: DoesNotExist 115 | objectSelector: 116 | matchExpressions: 117 | - key: sidecar.istio.io/inject 118 | operator: NotIn 119 | values: 120 | - "false" 121 | - key: istio.io/rev 122 | operator: In 123 | values: 124 | - llm-d-gateway 125 | reinvocationPolicy: Never 126 | rules: 127 | - apiGroups: 128 | - "" 129 | apiVersions: 130 | - v1 131 | operations: 132 | - CREATE 133 | resources: 134 | - pods 135 | sideEffects: None 136 | -------------------------------------------------------------------------------- /deploy/components/vllm-sim-pd/deployments.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: vllm-sim-p 5 | labels: 6 | app: ${POOL_NAME} 7 | spec: 8 | replicas: ${VLLM_REPLICA_COUNT_P} 9 | selector: 10 | matchLabels: 11 | app: ${POOL_NAME} 12 | template: 13 | metadata: 14 | labels: 15 | app: ${POOL_NAME} 16 | llm-d.ai/role: prefill 17 | spec: 18 | containers: 19 | - name: vllm 20 | image: ghcr.io/llm-d/llm-d-inference-sim:latest 21 | imagePullPolicy: IfNotPresent 22 | args: 23 | - "--port=8000" 24 | - "--model=food-review" 25 | ports: 26 | - name: http 27 | containerPort: 
8000 28 | protocol: TCP 29 | env: 30 | - name: PORT 31 | value: "8000" 32 | --- 33 | apiVersion: apps/v1 34 | kind: Deployment 35 | metadata: 36 | name: vllm-sim-d 37 | labels: 38 | app: ${POOL_NAME} 39 | spec: 40 | replicas: ${VLLM_REPLICA_COUNT_D} 41 | selector: 42 | matchLabels: 43 | app: ${POOL_NAME} 44 | template: 45 | metadata: 46 | labels: 47 | app: ${POOL_NAME} 48 | llm-d.ai/role: decode 49 | spec: 50 | initContainers: 51 | - name: routing-sidecar 52 | image: ghcr.io/llm-d/llm-d-routing-sidecar:latest 53 | imagePullPolicy: IfNotPresent 54 | args: 55 | - "--port=8000" 56 | - "--vllm-port=8200" 57 | - "--connector=lmcache" 58 | ports: 59 | - containerPort: 8000 60 | protocol: TCP 61 | restartPolicy: Always 62 | containers: 63 | - name: vllm 64 | image: ghcr.io/llm-d/llm-d-inference-sim:latest 65 | imagePullPolicy: IfNotPresent 66 | args: 67 | - "--port=8200" 68 | - "--model=food-review" 69 | ports: 70 | - name: http 71 | containerPort: 8200 72 | protocol: TCP 73 | env: 74 | - name: PORT 75 | value: "8200" 76 | -------------------------------------------------------------------------------- /deploy/components/vllm-sim-pd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # VLLM Simulator 3 | # 4 | # This deploys a VLLM simulator which can be used to simulate inference for 5 | # small environments (e.g. Kubernetes In Docker (KIND) clusters), or for when 6 | # all that is needed is some basic functionality. 
7 | # ------------------------------------------------------------------------------ 8 | apiVersion: kustomize.config.k8s.io/v1beta1 9 | kind: Kustomization 10 | 11 | resources: 12 | - deployments.yaml 13 | 14 | images: 15 | - name: ghcr.io/llm-d/llm-d-inference-sim 16 | newTag: ${VLLM_SIMULATOR_TAG} 17 | - name: ghcr.io/llm-d/llm-d-routing-sidecar 18 | newTag: ${ROUTING_SIDECAR_TAG} 19 | -------------------------------------------------------------------------------- /deploy/components/vllm-sim/deployments.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: vllm-sim 5 | labels: 6 | app: ${POOL_NAME} 7 | spec: 8 | replicas: ${VLLM_REPLICA_COUNT} 9 | selector: 10 | matchLabels: 11 | app: ${POOL_NAME} 12 | template: 13 | metadata: 14 | labels: 15 | app: ${POOL_NAME} 16 | spec: 17 | containers: 18 | - name: vllm 19 | image: ghcr.io/llm-d/llm-d-inference-sim:latest 20 | imagePullPolicy: IfNotPresent 21 | args: 22 | - "--port=8000" 23 | - "--model=food-review" 24 | ports: 25 | - name: http 26 | containerPort: 8000 27 | protocol: TCP 28 | env: 29 | - name: PORT 30 | value: "8000" 31 | -------------------------------------------------------------------------------- /deploy/components/vllm-sim/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # VLLM Simulator 3 | # 4 | # This deploys a VLLM simulator which can be used to simulate inference for 5 | # small environments (e.g. Kubernetes In Docker (KIND) clusters), or for when 6 | # all that is needed is some basic functionality. 
7 | # ------------------------------------------------------------------------------ 8 | apiVersion: kustomize.config.k8s.io/v1beta1 9 | kind: Kustomization 10 | 11 | resources: 12 | - deployments.yaml 13 | 14 | images: 15 | - name: ghcr.io/llm-d/llm-d-inference-sim 16 | newTag: ${VLLM_SIMULATOR_TAG} 17 | -------------------------------------------------------------------------------- /deploy/environments/dev/base-kind-istio/destination-rules.yaml: -------------------------------------------------------------------------------- 1 | # **WARNING** Only use in testing scenarios 2 | apiVersion: networking.istio.io/v1 3 | kind: DestinationRule 4 | metadata: 5 | name: endpoint-picker-insecure-tls 6 | spec: 7 | host: endpoint-picker 8 | trafficPolicy: 9 | tls: 10 | mode: SIMPLE 11 | insecureSkipVerify: true 12 | -------------------------------------------------------------------------------- /deploy/environments/dev/base-kind-istio/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Kubernetes In Docker (KIND) Environment 3 | # 4 | # This will deploy the base development stack on a KIND cluster: 5 | # 6 | # * Istio Control Plane 7 | # * Inference Gateway 8 | # 9 | # This will expose the VLLM simulator via InferencePool and an HTTPRoute. 
10 | # 11 | # The vLLM simulator is deployed by a kustomization directory that includes this directory 12 | # ------------------------------------------------------------------------------ 13 | apiVersion: kustomize.config.k8s.io/v1beta1 14 | kind: Kustomization 15 | 16 | resources: 17 | - destination-rules.yaml 18 | - services.yaml 19 | - ../../../components/istio-control-plane/ 20 | - ../../../components/inference-gateway/ 21 | 22 | patches: 23 | - path: patch-deployments.yaml 24 | - path: patch-gateways.yaml 25 | -------------------------------------------------------------------------------- /deploy/environments/dev/base-kind-istio/patch-deployments.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: endpoint-picker 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: epp 10 | args: 11 | - -poolName 12 | - ${POOL_NAME} 13 | - -poolNamespace 14 | - "default" 15 | - -v 16 | - "4" 17 | - --zap-encoder 18 | - "json" 19 | - -grpcPort 20 | - "9002" 21 | - -grpcHealthPort 22 | - "9003" 23 | -------------------------------------------------------------------------------- /deploy/environments/dev/base-kind-istio/patch-gateways.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: Gateway 3 | metadata: 4 | name: inference-gateway 5 | labels: 6 | istio.io/enable-inference-extproc: "true" 7 | istio.io/rev: llm-d-gateway 8 | annotations: 9 | networking.istio.io/service-type: ClusterIP 10 | spec: 11 | gatewayClassName: istio 12 | -------------------------------------------------------------------------------- /deploy/environments/dev/base-kind-istio/services.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | annotations: 5 | networking.istio.io/service-type: NodePort 6 | labels: 7 | 
gateway.istio.io/managed: istio.io-gateway-controller 8 | gateway.networking.k8s.io/gateway-name: inference-gateway 9 | istio.io/enable-inference-extproc: "true" 10 | name: inference-gateway-istio-nodeport 11 | spec: 12 | type: NodePort 13 | selector: 14 | gateway.networking.k8s.io/gateway-name: inference-gateway 15 | ports: 16 | - appProtocol: tcp 17 | name: status-port 18 | port: 15021 19 | protocol: TCP 20 | targetPort: 15021 21 | nodePort: 32021 22 | - appProtocol: http 23 | name: default 24 | port: 80 25 | protocol: TCP 26 | targetPort: 80 27 | nodePort: 30080 28 | -------------------------------------------------------------------------------- /deploy/environments/dev/kind-istio-pd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Kubernetes In Docker (KIND) Environment 3 | # 4 | # This will deploy the full development stack on a KIND cluster: 5 | # 6 | # * Istio Control Plane 7 | # * VLLM Simulator 8 | # * Inference Gateway 9 | # 10 | # This will expose the VLLM simulator via InferencePool and an HTTPRoute. 
11 | # ------------------------------------------------------------------------------ 12 | apiVersion: kustomize.config.k8s.io/v1beta1 13 | kind: Kustomization 14 | 15 | resources: 16 | - ../base-kind-istio/ 17 | - ../../../components/vllm-sim-pd/ 18 | -------------------------------------------------------------------------------- /deploy/environments/dev/kind-istio/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Kubernetes In Docker (KIND) Environment 3 | # 4 | # This will deploy the full development stack on a KIND cluster: 5 | # 6 | # * Istio Control Plane 7 | # * VLLM Simulator 8 | # * Inference Gateway 9 | # 10 | # This will expose the VLLM simulator via InferencePool and an HTTPRoute. 11 | # ------------------------------------------------------------------------------ 12 | apiVersion: kustomize.config.k8s.io/v1beta1 13 | kind: Kustomization 14 | 15 | resources: 16 | - ../base-kind-istio/ 17 | - ../../../components/vllm-sim/ 18 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/common/patch-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: service 5 | spec: 6 | selector: 7 | app: ${PROJECT_NAME}-service 8 | ports: 9 | - protocol: TCP 10 | port: 8080 11 | targetPort: 8080 12 | type: ClusterIP 13 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/common/patch-statefulset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: StatefulSet 3 | metadata: 4 | name: 0 5 | spec: 6 | serviceName: ${PROJECT_NAME}-service 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | app: ${PROJECT_NAME}-statefulset 11 | template: 12 | 
metadata: 13 | labels: 14 | app: ${PROJECT_NAME}-statefulset 15 | spec: 16 | serviceAccountName: operator-controller-manager 17 | containers: 18 | - name: cmd 19 | image: ${IMAGE_TAG_BASE}:${VERSION} 20 | imagePullPolicy: Always 21 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/common/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: service 5 | spec: 6 | selector: 7 | app: placeholder 8 | ports: 9 | - protocol: TCP 10 | port: 8080 11 | targetPort: 8080 12 | type: ClusterIP 13 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/common/statefulset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: StatefulSet 3 | metadata: 4 | name: "0" 5 | spec: 6 | serviceName: placeholder 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | app: placeholder 11 | template: 12 | metadata: 13 | labels: 14 | app: placeholder 15 | spec: 16 | serviceAccountName: operator-controller-manager 17 | containers: 18 | - name: cmd 19 | image: ghcr.io/llm-d/placeholder:placeholder 20 | imagePullPolicy: Always 21 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | # Set the namespace for all resources using a placeholder. 5 | namespace: ${NAMESPACE} 6 | 7 | # Use a prefix for all object names. You can substitute the PROJECT_NAME variable. 8 | namePrefix: ${PROJECT_NAME}- 9 | 10 | # List all the resources (manifests) you want to deploy. 
11 | resources: 12 | - common/statefulset.yaml 13 | - common/service.yaml 14 | - openshift/route.yaml 15 | - rbac/exec-rbac-role.yaml 16 | - rbac/exec-rbac-rolebinding.yaml 17 | 18 | # Generate the ConfigMap with a variable name. 19 | configMapGenerator: 20 | - name: config 21 | options: 22 | disableNameSuffixHash: true 23 | 24 | # Include patches to update the Service, StatefulSet, Route, and RBAC resources. 25 | 26 | # Define the image to be updated. 27 | # images: 28 | # - name: ghcr.io/llm-d/placeholder 29 | # newName: ghcr.io/llm-d/${IMAGE_TAG_BASE} 30 | # newTag: ${VERSION} 31 | patches: 32 | - path: common/patch-service.yaml 33 | - path: common/patch-statefulset.yaml 34 | - path: openshift/patch-route.yaml 35 | - path: rbac/patch-rbac-role.yaml 36 | - path: rbac/patch-rbac-rolebinding.yaml 37 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/openshift/patch-route.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: route.openshift.io/v1 2 | kind: Route 3 | metadata: 4 | name: route 5 | spec: 6 | to: 7 | name: "${PROJECT_NAME}-service" 8 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/openshift/route.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: route.openshift.io/v1 2 | kind: Route 3 | metadata: 4 | name: route 5 | spec: 6 | to: 7 | kind: Service 8 | name: placeholder 9 | port: 10 | targetPort: 8080 11 | tls: 12 | termination: edge 13 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/rbac/exec-rbac-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: exec-role 5 | rules: 6 | - apiGroups: [""] 7 | resources: ["pods/exec"] 8 | 
resourceNames: ["placeholder-0-0"] 9 | verbs: ["create"] 10 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/rbac/exec-rbac-rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: exec-rolebinding 5 | subjects: 6 | - kind: Group 7 | name: system:authenticated 8 | apiGroup: rbac.authorization.k8s.io 9 | roleRef: 10 | kind: Role 11 | name: exec-role 12 | apiGroup: rbac.authorization.k8s.io 13 | 14 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/rbac/patch-rbac-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: exec-role 5 | rules: 6 | - apiGroups: [""] 7 | resources: ["pods/exec"] 8 | resourceNames: 9 | - "${PROJECT_NAME}-0-0" 10 | verbs: ["create"] 11 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/rbac/patch-rbac-rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: exec-rolebinding 5 | subjects: 6 | - kind: Group 7 | name: system:authenticated 8 | apiGroup: rbac.authorization.k8s.io 9 | roleRef: 10 | kind: Role 11 | name: ${PROJECT_NAME}-exec-role 12 | apiGroup: rbac.authorization.k8s.io 13 | -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # llm-d Inference Router Architecture 2 | 3 | ## Overview 4 | 5 | **llm-d** is an extensible architecture designed to route inference requests efficiently across model-serving pods. 
A central component of this architecture is the **Inference Gateway**, which builds on the Kubernetes-native **Gateway API Inference Extension (GIE)** to enable scalable, flexible, and pluggable routing of requests. 6 | 7 | The design enables: 8 | - Support for **multiple base models** and **LoRA adapters** within a shared cluster [Not supported in Phase1] 9 | - Efficient routing based on **KV cache locality**, **prefix**, **session affinity**, **load**, and **model metadata** 10 | - Disaggregated **Prefill/Decode (P/D)** execution 11 | - Pluggable **filters**, **scorers**, and **scrapers** for extensible routing 12 | 13 | --- 14 | 15 | ## Core Goals 16 | 17 | - Route inference requests to optimal pods based on: 18 | - Base model compatibility 19 | - KV cache reuse 20 | - Load balancing 21 | - Support multi-model deployments on heterogeneous hardware 22 | - Enable runtime extensibility with pluggable logic (filters, scorers, scrapers) 23 | - Community-aligned implementation using GIE and Envoy + External Processing (EPP) 24 | 25 | --- 26 | 27 | ## Architecture Design 28 | 29 | ![Inference Gateway Architecture](./images/architecture.png) 30 | 31 | The inference scheduler is built on top of: 32 | - **Envoy** as a programmable data plane 33 | - **EPP (External Processing Plugin)** using **GIE** 34 | 35 | ### Pluggability 36 | 37 | ![Pluggability Architecture](./images/plugability.png) 38 | 39 | Routing decisions are governed by dynamic components: 40 | - **Filters**: Exclude pods based on static or dynamic criteria 41 | - **Scorers**: Assign scores to candidate pods 42 | - **Scrapers**: Collect pod metadata and metrics for scorers 43 | 44 | These components are maintained in the `llm-d-inference-scheduler` repository and can evolve independently. 
45 | 46 | --- 47 | 48 | ## Filters, Scorers, and Scrapers 49 | 50 | ### Core Design Principles 51 | 52 | - **Pluggability**: No core changes are needed to add new scorers or filters 53 | - **Isolation**: Each component operates independently 54 | 55 | 56 | ### Routing Flow 57 | 58 | 1. **Filtering** 59 | - Pods in an `InferencePool` go through a sequential chain of filters 60 | - Pods may be excluded based on criteria like model compatibility, resource usage, or custom logic 61 | 62 | 2. **Scoring** 63 | - Filtered pods are scored using a weighted set of scorers 64 | - Scorers currently run sequentially (future: parallel execution) 65 | - Scorers access a shared datastore populated by scrapers 66 | 67 | 3. **Pod Selection** 68 | - The highest-scored pod is selected 69 | - If multiple pods share the same score, one is selected at random 70 | 71 | ### Lifecycle Hooks 72 | - `Pre-call` 73 | - `Scoring` 74 | - `Post-choice` 75 | - `After-response` 76 | 77 | --- 78 | 79 | ## Scorers & Configuration 80 | 81 | | Scorer | Description | Env Vars | 82 | |------------------|--------------------------------------------|----------| 83 | | Session-aware | Prefers pods from same session | `ENABLE_SESSION_AWARE_SCORER`, `SESSION_AWARE_SCORER_WEIGHT`, `PREFILL_ENABLE_SESSION_AWARE_SCORER`, `PREFILL_SESSION_AWARE_SCORER_WEIGHT` | 84 | | Prefix-aware | Matches prompt prefix | `ENABLE_PREFIX_AWARE_SCORER`, `PREFIX_AWARE_SCORER_WEIGHT`, `PREFILL_ENABLE_PREFIX_AWARE_SCORER`, `PREFILL_PREFIX_AWARE_SCORER_WEIGHT`, `PREFIX_SCORER_BLOCK_SIZE`| 85 | | KVCache-aware | Optimizes for KV reuse | `ENABLE_KVCACHE_AWARE_SCORER`, `KVCACHE_INDEXER_REDIS_ADDR`, `PREFILL_ENABLE_KVCACHE_AWARE_SCORER`, `PREFILL_KVCACHE_INDEXER_REDIS_ADDR`, `HF_TOKEN`, `KVCACHE_INDEXER_REDIS_ADDR` | 86 | | Load-aware | Avoids busy pods | `ENABLE_LOAD_AWARE_SCORER`, `LOAD_AWARE_SCORER_WEIGHT`, `PREFILL_ENABLE_LOAD_AWARE_SCORER`, `PREFILL_LOAD_AWARE_SCORER_WEIGHT` | 87 | 88 | ### Prefill / Decode Configuration 89 | 90 | In 
case Disaggregated Prefill is enabled, you should also define the following environment variables. 91 | 92 | - Toggle P/D mode: `PD_ENABLED=true` 93 | - Threshold: `PD_PROMPT_LEN_THRESHOLD=` 94 | 95 | #### Prefill Scorers: 96 | ```bash 97 | export PREFILL_ENABLE_SESSION_AWARE_SCORER=true 98 | export PREFILL_SESSION_AWARE_SCORER_WEIGHT=1 99 | export PREFILL_ENABLE_KVCACHE_AWARE_SCORER=true 100 | export PREFILL_KVCACHE_AWARE_SCORER_WEIGHT=1 101 | export PREFILL_ENABLE_LOAD_AWARE_SCORER=true 102 | export PREFILL_LOAD_AWARE_SCORER_WEIGHT=1 103 | export PREFILL_ENABLE_PREFIX_AWARE_SCORER=true 104 | export PREFILL_PREFIX_AWARE_SCORER_WEIGHT=1 105 | ``` 106 | 107 | 108 | --- 109 | 110 | ## Metric Scraping 111 | 112 | - Scrapers collect metrics (e.g., memory usage, active adapters) 113 | - Data is injected into the shared datastore for scorers 114 | - Scoring can rely on numerical metrics or metadata (model ID, adapter tags) 115 | 116 | --- 117 | 118 | ## Disaggregated Prefill/Decode (P/D) 119 | 120 | When enabled, the router: 121 | - Selects one pod for **Prefill** (prompt processing) 122 | - Selects another pod for **Decode** (token generation) 123 | 124 | The **vLLM sidecar** handles orchestration between Prefill and Decode stages. 
It allows: 125 | - Queuing 126 | - Local memory management 127 | - Experimental protocol compatibility 128 | 129 | > **Note**: The detailed P/D design is available in this document: [Disaggregated Prefill/Decode in llm-d](./dp.md) 130 | --- 131 | 132 | ## InferencePool & InferenceModel Design 133 | 134 | ### Current Assumptions 135 | - Single `InferencePool` and single `EPP` due to Envoy limitations 136 | - Model-based filtering can be handled within EPP 137 | - Currently only one base model is supported 138 | 139 | --- 140 | 141 | ## References 142 | - [GIE Spec](https://gateway-api-inference-extension.sigs.k8s.io/) 143 | - [Envoy External Processing](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter) 144 | 145 | 146 | -------------------------------------------------------------------------------- /docs/create_new_filter.md: -------------------------------------------------------------------------------- 1 | # Extending llm-d-inference-scheduler with a custom filter 2 | 3 | ## Goal 4 | 5 | This tutorial outlines the steps needed for creating and hooking a new filter 6 | for the llm-d-inference-scheduler. 7 | 8 | The tutorial demonstrates the coding of a new filter, which selects inference 9 | serving Pods based on their labels. All relevant code is contained in the 10 | [`by_labels.go`](https://github.com/llm-d/llm-d-inference-scheduler/blob/main/pkg/scheduling/plugins/filter/by_labels.go) file. 11 | 12 | ## Introduction to filtering 13 | 14 | Plugins are used to modify llm-d-inference-scheduler's default behavior. Filter plugins 15 | are provided with a list of candidate inference serving Pods and filter out the 16 | Pods which do not match the filtering criteria. Several filtering plugins can 17 | run in succession to produce the final candidate list which is then evaluated, 18 | through the process of _scoring_, to select the most appropriate target Pods. 
19 | While llm-d-inference-scheduler comes with several existing filters and 20 | more are available in the upstream [Gateway API Inference Extension](https://sigs.k8s.io/gateway-api-inference-extension), 21 | in some cases it may be desirable to create and deploy custom filtering code to 22 | match your specific requirements. 23 | 24 | The filters' main operating function is 25 | 26 | ```go 27 | func Filter(*types.SchedulingContext, []types.Pod) []types.Pod 28 | ``` 29 | 30 | The `Filter` function accepts a `SchedulingContext` (e.g., containing the 31 | incoming LLM request) and an array of `Pod` objects as potential targets. Each `Pod` 32 | entry includes relevant inference metrics and attributes which can be used 33 | to make scheduling decisions. The function returns a (possibly smaller) array 34 | of `Pod`s which satisfy the filtering criteria. 35 | 36 | ## Code walkthrough 37 | 38 | The top of the file has the expected Go package and import statements: 39 | 40 | ```go 41 | package filter 42 | 43 | import ( 44 | "errors" 45 | 46 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 47 | "k8s.io/apimachinery/pkg/labels" 48 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 49 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 50 | ) 51 | ``` 52 | 53 | Specifically, we import the Kubernetes `meta/v1` and `labels` packages to allow 54 | defining and using `label.Selector` objects, and the Gateway API Inference 55 | Extension's `plugin` (defining the plugin interfaces) and `types` (defining 56 | scheduling related objects) packages. 57 | 58 | Next we define the `ByLabels` struct type, along with the relevant fields, 59 | and a constructor function. 
60 | 61 | ```go 62 | // ByLabels filters out pods that do not match its label selector criteria 63 | type ByLabels struct { 64 | name string 65 | selector labels.Selector 66 | } 67 | 68 | var _ plugins.Filter = &ByLabels{} // validate interface conformance 69 | 70 | // NewByLabel returns a new filter instance, configured with the provided 71 | // name and label selector. 72 | func NewByLabel(name string, selector *metav1.LabelSelector) (plugins.Filter, error) { 73 | if name == "" { 74 | return nil, errors.New("ByLabels: missing filter name") 75 | } 76 | labelSelector, err := metav1.LabelSelectorAsSelector(selector) 77 | if err != nil { 78 | return nil, err 79 | } 80 | 81 | return &ByLabels{ 82 | name: name, 83 | selector: labelSelector, 84 | }, nil 85 | } 86 | ``` 87 | 88 | > Note that, since Go supports "duck typing", the`plugin` package is 89 | not strictly required. We use it to validate `ByLabels` interface conformance 90 | (a pattern known as "interface implementation assertion" or "compile-time 91 | interface" check). The statement asserts at compile time that `ByLabels` 92 | implements the `plugins.Filter` interface and is useful for catching errors 93 | early, especially when refactoring (e.g. interface methods or signatures change). 
94 | 95 | Next, we define the required `plugins.Filter` interface methods: 96 | 97 | ```go 98 | // Name returns the name of the filter 99 | func (blf *ByLabels) Name() string { 100 | return blf.name 101 | } 102 | 103 | // Filter filters out all pods that do not satisfy the label selector 104 | func (blf *ByLabels) Filter(_ *types.SchedulingContext, pods []types.Pod) []types.Pod { 105 | filtered := []types.Pod{} 106 | 107 | for _, pod := range pods { 108 | labels := labels.Set(pod.GetPod().Labels) 109 | if blf.selector.Matches(labels) { 110 | filtered = append(filtered, pod) 111 | } 112 | } 113 | return filtered 114 | } 115 | ``` 116 | 117 | Since the filter is only matching on candidate `types.Pod` labels, 118 | we leave the `types.SchedullingContext` parameter unnamed. Filters 119 | that need access to LLM request information (e.g., filtering based 120 | on prompt length) may use it. 121 | 122 | ## Hooking the filter into the scheduling flow 123 | 124 | Once a filter is defined, it can be used to modify llm-d-inference-scheduler 125 | configuration. This would typically be done by modifying the 126 | `pkg/config/config.go` file to 127 | 128 | - Add the relevant import path (if defined outside this repository); 129 | - Add any desired configuration knobs (e.g., environment variables); and 130 | - Listing the new filter in the `LoadConfig()` function's `cfg.loadPluginInfo` 131 | list of available plugins. 132 | 133 | In the case of the llm-d-inference-scheduler, filters can be hooked into the 134 | `Prefill` and/or `Decode` scheduling cycles. For example, the following snippet 135 | adds the `ByLabels` filter to the list of plugins available to the `Decode` 136 | scheduler (assuming a `ByLabelFilterName` constant is defined along with other 137 | environment variables): 138 | 139 | ```go 140 | func (c *Config) LoadConfig() { 141 | c.loadPluginInfo(c.DecodeSchedulerPlugins, false, 142 | KVCacheScorerName, ..., ByLabelFilterName, ... 
) 143 | c.loadPluginInfo(c.PrefillSchedulerPlugins, true, ... ) 144 | // ... 145 | } 146 | ``` 147 | 148 | > Note: a real filter would require unit tests, etc. These are left out to 149 | keep the tutorial short and focused. 150 | 151 | ## Next steps 152 | 153 | If you have an idea for a new `Filter` (or other) plugin - we'd love to hear 154 | from you! Please open an [issue](https://github.com/llm-d/llm-d-inference-scheduler/issues/new/choose), 155 | describing your use case and requirements, and we'll reach out to refine 156 | and collaborate. 157 | -------------------------------------------------------------------------------- /docs/dp.md: -------------------------------------------------------------------------------- 1 | # Disaggregated Prefill/Decode Inference Serving in llm-d 2 | 3 | ## Overview 4 | 5 | This document describes the architecture and request lifecycle for enabling **disaggregated prefill and decode (P/D)** inference execution in the llm-d router. The architecture aims to improve flexibility, scalability, and performance by enabling separation of prefill and decode stages onto different workers. 6 | 7 | This evolved version removes the requirement for sidecars on the **prefill node**, simplifying deployment while maintaining orchestration from the **decode node**. 
8 | 9 | --- 10 | 11 | ## Goals 12 | 13 | - Enable routing of prefill and decode to different pods 14 | - Maintain low latency and high throughput 15 | - Improve resource utilization by specializing pods for prefill or decode 16 | - Align with GIE-compatible architectures for potential upstreaming 17 | 18 | --- 19 | 20 | ## Key Components 21 | 22 | | Component | Role | 23 | |----------------------|----------------------------------------------------------------------| 24 | | **Prefill Worker** | Handles only prefill stage using vLLM engine | 25 | | **Decode Worker** | Handles decode stage and contains the sidecar for coordination | 26 | | **Sidecar (Decode)** | Orchestrates communication with prefill worker and manages lifecycle | 27 | | **Envoy Proxy** | Accepts OpenAI-style requests and forwards them to EPP | 28 | | **EPP** | End Point Picker, makes scheduling decisions | 29 | 30 | --- 31 | 32 | ## Request Lifecycle 33 | 34 | 1. **User Request** 35 | - Sent via OpenAI API to the Envoy Proxy 36 | 37 | 2. **EPP Scheduling Decision** 38 | - EPP evaluates: 39 | - Prompt length 40 | - KV cache hit probability 41 | - System and pod load 42 | - Selects either: 43 | - **Single node** path (decode handles all) 44 | - **Split node** path (distinct prefill and decode workers) 45 | - Returns Decode Worker (always), and optionally Prefill Worker URL 46 | 47 | 3. **Execution** 48 | - Request lands on Decode Worker (as selected by EPP) 49 | - Decode sidecar coordinates: 50 | - If `prefill_worker_id == nil`, runs both stages locally by passing request to local vllm 51 | - If split: 52 | - Sends prefill job to Prefill Worker with a special header `do_remote_decode=true` 53 | - Upon receiving response from Prefill Worker runs decode stage 54 | 55 | 4. 
**Response Flow** 56 | - Response flows from decode sidecar → Envoy → EPP → User 57 | 58 | --- 59 | 60 | ## Architectural Details 61 | 62 | ### Sidecar Responsibilities (Decode Only) 63 | 64 | - Receives EPP metadata (decode pod, optional prefill pod) 65 | - Sends request to prefill 66 | - Waits and validates result 67 | - Launches local decode job 68 | - Sends final response 69 | 70 | > **Note**: No sidecar or coordination logic is needed on the prefill node. 71 | 72 | --- 73 | 74 | ## Worker Selection Logic 75 | 76 | - **Decode Worker**: 77 | - Prefer longest prefix match / KV cache utilization (depends on available scorers) 78 | 79 | - **Prefill Worker**: 80 | - High prefix-cache hit rate 81 | - Low load 82 | 83 | > **Skip prefill worker** when: 84 | > - Prefix match/kv cache hit is high 85 | > - Prompt is very short 86 | 87 | --- 88 | 89 | ## vLLM and LMCache Integration 90 | 91 | - **vLLM changes** (or wrapper APIs): 92 | - `save()`, `load()` APIs 93 | - `done_sending`, `done_receiving` 94 | - Connector API supporting async transfer 95 | 96 | --- 97 | 98 | ## Drawbacks & Limitations 99 | 100 | - Slight increase in TTFT for split P/D 101 | - Possibility of stranded memory on prefill crash 102 | - Need for timeout and retry logic 103 | 104 | --- 105 | 106 | ## Design Benefits 107 | 108 | - **Flexibility**: Enables per-request specialization and resource balancing 109 | - **Scalability**: Clean separation of concerns for easier ops and tuning 110 | - **Upstream-ready**: Follows GIE-compatible request handling 111 | - **Minimal Changes**: Only decode node includes orchestration sidecar 112 | 113 | --- 114 | 115 | ## Future Considerations 116 | 117 | - Cache coordination 118 | - Pre-allocation of KV blocks in decode node, push cache from prefill to decode worker during calculation 119 | 120 | --- 121 | 122 | ## Diagram 123 | 124 | ![Disaggregated Prefill/Decode Architecture](./images/dp_architecture.png) 125 | 126 | --- 127 | 128 | ## References 129 | 
-------------------------------------------------------------------------------- /docs/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llm-d/llm-d-inference-scheduler/f6c57c520c3fb9fc89735da8469ddad847074273/docs/images/architecture.png -------------------------------------------------------------------------------- /docs/images/dp_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llm-d/llm-d-inference-scheduler/f6c57c520c3fb9fc89735da8469ddad847074273/docs/images/dp_architecture.png -------------------------------------------------------------------------------- /docs/images/plugability.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llm-d/llm-d-inference-scheduler/f6c57c520c3fb9fc89735da8469ddad847074273/docs/images/plugability.png -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/llm-d/llm-d-inference-scheduler 2 | 3 | go 1.24.1 4 | 5 | toolchain go1.24.2 6 | 7 | require ( 8 | github.com/cespare/xxhash/v2 v2.3.0 9 | github.com/envoyproxy/go-control-plane/envoy v1.32.4 10 | github.com/go-logr/logr v1.4.2 11 | github.com/google/go-cmp v0.7.0 12 | github.com/hashicorp/golang-lru/v2 v2.0.7 13 | github.com/llm-d/llm-d-kv-cache-manager v0.1.0 14 | github.com/prometheus/client_golang v1.22.0 15 | github.com/stretchr/testify v1.10.0 16 | go.uber.org/zap v1.27.0 17 | google.golang.org/grpc v1.72.0 18 | k8s.io/apimachinery v0.33.1 19 | k8s.io/client-go v0.32.5 20 | k8s.io/component-base v0.32.5 21 | sigs.k8s.io/controller-runtime v0.20.4 22 | sigs.k8s.io/gateway-api v1.3.0 23 | sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250521193836-a5bf0acd13cc 24 | ) 25 | 26 | require ( 27 | 
cel.dev/expr v0.20.0 // indirect 28 | github.com/antlr4-go/antlr/v4 v4.13.0 // indirect 29 | github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect 30 | github.com/beorn7/perks v1.0.1 // indirect 31 | github.com/blang/semver/v4 v4.0.0 // indirect 32 | github.com/cenkalti/backoff/v4 v4.3.0 // indirect 33 | github.com/cncf/xds/go v0.0.0-20250121191232-2f005788dc42 // indirect 34 | github.com/daulet/tokenizers v1.20.2 // indirect 35 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 36 | github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect 37 | github.com/emicklei/go-restful/v3 v3.12.0 // indirect 38 | github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect 39 | github.com/evanphx/json-patch/v5 v5.9.11 // indirect 40 | github.com/felixge/httpsnoop v1.0.4 // indirect 41 | github.com/fsnotify/fsnotify v1.7.0 // indirect 42 | github.com/fxamacker/cbor/v2 v2.7.0 // indirect 43 | github.com/go-logr/stdr v1.2.2 // indirect 44 | github.com/go-logr/zapr v1.3.0 // indirect 45 | github.com/go-openapi/jsonpointer v0.21.0 // indirect 46 | github.com/go-openapi/jsonreference v0.21.0 // indirect 47 | github.com/go-openapi/swag v0.23.0 // indirect 48 | github.com/gogo/protobuf v1.3.2 // indirect 49 | github.com/golang/protobuf v1.5.4 // indirect 50 | github.com/google/btree v1.1.3 // indirect 51 | github.com/google/cel-go v0.22.0 // indirect 52 | github.com/google/gnostic-models v0.6.9 // indirect 53 | github.com/google/uuid v1.6.0 // indirect 54 | github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 // indirect 55 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 56 | github.com/josharian/intern v1.0.0 // indirect 57 | github.com/json-iterator/go v1.1.12 // indirect 58 | github.com/mailru/easyjson v0.7.7 // indirect 59 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 60 | github.com/modern-go/reflect2 v1.0.2 // indirect 61 | github.com/munnerz/goautoneg 
v0.0.0-20191010083416-a7dc8b61c822 // indirect 62 | github.com/pkg/errors v0.9.1 // indirect 63 | github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect 64 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect 65 | github.com/prometheus/client_model v0.6.2 // indirect 66 | github.com/prometheus/common v0.64.0 // indirect 67 | github.com/prometheus/procfs v0.15.1 // indirect 68 | github.com/redis/go-redis/v9 v9.7.3 // indirect 69 | github.com/spf13/cobra v1.9.1 // indirect 70 | github.com/spf13/pflag v1.0.6 // indirect 71 | github.com/stoewer/go-strcase v1.3.0 // indirect 72 | github.com/x448/float16 v0.8.4 // indirect 73 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect 74 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect 75 | go.opentelemetry.io/otel v1.34.0 // indirect 76 | go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect 77 | go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 // indirect 78 | go.opentelemetry.io/otel/metric v1.34.0 // indirect 79 | go.opentelemetry.io/otel/sdk v1.34.0 // indirect 80 | go.opentelemetry.io/otel/trace v1.34.0 // indirect 81 | go.opentelemetry.io/proto/otlp v1.4.0 // indirect 82 | go.uber.org/multierr v1.11.0 // indirect 83 | golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect 84 | golang.org/x/net v0.40.0 // indirect 85 | golang.org/x/oauth2 v0.30.0 // indirect 86 | golang.org/x/sync v0.14.0 // indirect 87 | golang.org/x/sys v0.33.0 // indirect 88 | golang.org/x/term v0.32.0 // indirect 89 | golang.org/x/text v0.25.0 // indirect 90 | golang.org/x/time v0.9.0 // indirect 91 | gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect 92 | google.golang.org/genproto/googleapis/api v0.0.0-20250218202821-56aae31c358a // indirect 93 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250428153025-10db94c68c34 // indirect 94 | google.golang.org/protobuf v1.36.6 // indirect 95 | gopkg.in/evanphx/json-patch.v4 
v4.12.0 // indirect 96 | gopkg.in/inf.v0 v0.9.1 // indirect 97 | gopkg.in/yaml.v3 v3.0.1 // indirect 98 | k8s.io/api v0.32.5 // indirect 99 | k8s.io/apiextensions-apiserver v0.32.5 // indirect 100 | k8s.io/apiserver v0.32.5 // indirect 101 | k8s.io/klog/v2 v2.130.1 // indirect 102 | k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect 103 | k8s.io/utils v0.0.0-20241210054802-24370beab758 // indirect 104 | sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 // indirect 105 | sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect 106 | sigs.k8s.io/randfill v1.0.0 // indirect 107 | sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect 108 | sigs.k8s.io/yaml v1.4.0 // indirect 109 | ) 110 | -------------------------------------------------------------------------------- /hooks/pre-commit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | echo "▶️ Running lint…" 5 | make lint 6 | 7 | echo "▶️ Running tests…" 8 | make test 9 | 10 | echo "✔️ All checks passed!" 11 | -------------------------------------------------------------------------------- /internal/controller/runnable/grpc.go: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is adapted from Gateway API Inference Extension 3 | * Original source: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/internal/runnable/grpc.go 4 | * Licensed under the Apache License, Version 2.0 5 | */ 6 | 7 | // Package runnable contains tooling to manage and convert manager.Runnable 8 | // objects for controllers. 9 | package runnable 10 | 11 | import ( 12 | "context" 13 | "fmt" 14 | "net" 15 | 16 | "google.golang.org/grpc" 17 | 18 | ctrl "sigs.k8s.io/controller-runtime" 19 | "sigs.k8s.io/controller-runtime/pkg/manager" 20 | ) 21 | 22 | // GRPCServer promotes the provided grpc.Server to a manager.Runnable. 
23 | func GRPCServer(name string, srv *grpc.Server, port int) manager.Runnable { 24 | return manager.RunnableFunc(func(ctx context.Context) error { 25 | log := ctrl.Log.WithValues("name", name) 26 | log.Info("gRPC server starting") 27 | 28 | listener, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) 29 | if err != nil { 30 | log.Error(err, "gRPC server failed to listen", "port", port) 31 | return err 32 | } 33 | 34 | log.Info("gRPC server listening", "port", port) 35 | 36 | doneCh := make(chan struct{}) 37 | defer close(doneCh) 38 | go func() { 39 | select { 40 | case <-ctx.Done(): 41 | log.Info("gRPC server shutting down") 42 | srv.GracefulStop() 43 | case <-doneCh: 44 | } 45 | }() 46 | 47 | if err := srv.Serve(listener); err != nil && err != grpc.ErrServerStopped { 48 | log.Error(err, "gRPC server failed") 49 | return err 50 | } 51 | 52 | log.Info("gRPC server terminated") 53 | 54 | return nil 55 | }) 56 | } 57 | -------------------------------------------------------------------------------- /internal/controller/runnable/leader_election.go: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is adapted from Gateway API Inference Extension 3 | * Original source: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/internal/runnable/leader_election.go 4 | * Licensed under the Apache License, Version 2.0 5 | */ 6 | 7 | package runnable 8 | 9 | import ( 10 | "sigs.k8s.io/controller-runtime/pkg/manager" 11 | ) 12 | 13 | // LeaderElection enables or disables leader election for the provided manager.Runnable. 14 | func LeaderElection(runnable manager.Runnable, needsLeaderElection bool) manager.Runnable { 15 | return &leaderElection{ 16 | Runnable: runnable, 17 | needsLeaderElection: needsLeaderElection, 18 | } 19 | } 20 | 21 | // RequireLeaderElection enables leader election for the provided manager.Runnable. 
22 | func RequireLeaderElection(runnable manager.Runnable) manager.Runnable { 23 | return LeaderElection(runnable, true) 24 | } 25 | 26 | // NoLeaderElection disables leader election for the provided manager.Runnable. 27 | func NoLeaderElection(runnable manager.Runnable) manager.Runnable { 28 | return LeaderElection(runnable, false) 29 | } 30 | 31 | // leaderElection is a wrapped manager.Runnable with configuration for enabling 32 | // or disabling leader election. 33 | type leaderElection struct { 34 | manager.Runnable 35 | needsLeaderElection bool 36 | } 37 | 38 | // NeedLeaderElection indicates whether or not leader election is enabled. 39 | func (r *leaderElection) NeedLeaderElection() bool { 40 | return r.needsLeaderElection 41 | } 42 | -------------------------------------------------------------------------------- /internal/controller/tls/tls.go: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is adapted from Gateway API Inference Extension 3 | * Original source: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/internal/tls/tls.go 4 | * Licensed under the Apache License, Version 2.0 5 | */ 6 | 7 | // Package tls includes tooling for handling TLS certificates for controllers. 8 | package tls 9 | 10 | import ( 11 | "crypto/rand" 12 | "crypto/rsa" 13 | "crypto/tls" 14 | "crypto/x509" 15 | "crypto/x509/pkix" 16 | "encoding/pem" 17 | "fmt" 18 | "math/big" 19 | "time" 20 | ) 21 | 22 | // CreateSelfSignedTLSCertificate generates a self-signed certificate. 
// CreateSelfSignedTLSCertificate generates a self-signed RSA (4096-bit)
// certificate, valid for one year from the time of the call and marked for
// server authentication, and returns it as a ready-to-use tls.Certificate.
func CreateSelfSignedTLSCertificate() (tls.Certificate, error) {
	// Random 128-bit serial number.
	serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128)
	serialNumber, err := rand.Int(rand.Reader, serialNumberLimit)
	if err != nil {
		// Use %w (not %v) so callers can unwrap with errors.Is/errors.As.
		return tls.Certificate{}, fmt.Errorf("error creating serial number: %w", err)
	}

	now := time.Now()
	template := x509.Certificate{
		SerialNumber: serialNumber,
		Subject: pkix.Name{
			Organization: []string{"Inference Ext"},
		},
		NotBefore:             now.UTC(),
		NotAfter:              now.Add(time.Hour * 24 * 365).UTC(), // one year
		KeyUsage:              x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
		ExtKeyUsage:           []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
		BasicConstraintsValid: true,
	}

	priv, err := rsa.GenerateKey(rand.Reader, 4096)
	if err != nil {
		return tls.Certificate{}, fmt.Errorf("error generating key: %w", err)
	}

	// Self-signed: the template acts as both subject and issuer.
	derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &priv.PublicKey, priv)
	if err != nil {
		return tls.Certificate{}, fmt.Errorf("error creating certificate: %w", err)
	}

	certBytes := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes})

	privBytes, err := x509.MarshalPKCS8PrivateKey(priv)
	if err != nil {
		return tls.Certificate{}, fmt.Errorf("error marshalling private key: %w", err)
	}
	keyBytes := pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: privBytes})

	return tls.X509KeyPair(certBytes, keyBytes)
}
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" 10 | ) 11 | 12 | const ( 13 | // For every plugin named below, there are four environment variables. They are: 14 | // - "ENABLE_" + pluginName Enables the named plugin for decode processing 15 | // - pluginName + "_WEIGHT" The weight for a scorer in decode processing 16 | // - "PREFILL_ENABLE_" + pluginName Enables the named plugin for prefill processing 17 | // - "PREFILL_" + pluginName + "_WEIGHT" The weight for a scorer in prefill processing 18 | 19 | // KVCacheScorerName name of the kv-cache scorer in configuration 20 | KVCacheScorerName = "KVCACHE_AWARE_SCORER" 21 | // LoadAwareScorerName name of the load aware scorer in configuration 22 | LoadAwareScorerName = "LOAD_AWARE_SCORER" 23 | // PrefixScorerName name of the prefix scorer in configuration 24 | PrefixScorerName = "PREFIX_AWARE_SCORER" 25 | // SessionAwareScorerName name of the session aware scorer in configuration 26 | SessionAwareScorerName = "SESSION_AWARE_SCORER" 27 | 28 | prefillPrefix = "PREFILL_" 29 | enablePrefix = "ENABLE_" 30 | weightSuffix = "_WEIGHT" 31 | 32 | // Plugins from Upstream 33 | 34 | // GIELeastKVCacheFilterName name of the GIE least kv-cache filter in configuration 35 | GIELeastKVCacheFilterName = "GIE_LEAST_KVCACHE_FILTER" 36 | // GIELeastQueueFilterName name of the GIE least queue filter in configuration 37 | GIELeastQueueFilterName = "GIE_LEAST_QUEUE_FILTER" 38 | // GIELoraAffinityFilterName name of the GIE LoRA affinity filter in configuration 39 | GIELoraAffinityFilterName = "GIE_LORA_AFFINITY_FILTER" 40 | // GIELowQueueFilterName name of the GIE low queue filter in configuration 41 | GIELowQueueFilterName = "GIE_LOW_QUEUE_FILTER" 42 | // GIESheddableCapacityFilterName name of the GIE sheddable capacity filter in configuration 43 | GIESheddableCapacityFilterName = "GIE_SHEDDABLE_CAPACITY_FILTER" 44 | // GIEKVCacheUtilizationScorerName name of the GIE kv-cache utilization scorer in configuration 45 | 
// Config contains scheduler configuration; currently the configuration is
// loaded from environment variables (see LoadConfig).
type Config struct {
	logger logr.Logger
	// DecodeSchedulerPlugins maps each enabled plugin name to its weight for
	// decode scheduling.
	DecodeSchedulerPlugins map[string]int
	// PrefillSchedulerPlugins maps each enabled plugin name to its weight for
	// prefill scheduling.
	PrefillSchedulerPlugins map[string]int

	// PDEnabled reports whether disaggregated prefill/decode is enabled.
	PDEnabled bool
	// PDThreshold is the prompt-length threshold used to decide whether a
	// separate prefill step is warranted.
	PDThreshold int
	// PrefixBlockSize is the block size used by the prefix scorer.
	PrefixBlockSize int
}

// NewConfig creates a new instance of Config with defaults: no plugins
// enabled, P/D disabled, and an effectively-infinite P/D threshold.
func NewConfig(logger logr.Logger) *Config {
	return &Config{
		logger:                  logger,
		DecodeSchedulerPlugins:  map[string]int{},
		PrefillSchedulerPlugins: map[string]int{},
		PDEnabled:               false,
		PDThreshold:             math.MaxInt,
		PrefixBlockSize:         prefixScorerBlockSizeDefault,
	}
}
GIESheddableCapacityFilterName, 94 | GIEKVCacheUtilizationScorerName, GIEQueueScorerName, GIEPrefixScorerName) 95 | 96 | c.PDEnabled = env.GetEnvString(pdEnabledEnvKey, "false", c.logger) == "true" 97 | c.PDThreshold = env.GetEnvInt(pdPromptLenThresholdEnvKey, pdPromptLenThresholdDefault, c.logger) 98 | c.PrefixBlockSize = env.GetEnvInt(prefixScorerBlockSizeEnvKey, prefixScorerBlockSizeDefault, c.logger) 99 | } 100 | 101 | func (c *Config) loadPluginInfo(plugins map[string]int, prefill bool, pluginNames ...string) { 102 | for _, pluginName := range pluginNames { 103 | var enablementKey string 104 | var weightKey string 105 | if prefill { 106 | enablementKey = prefillPrefix + enablePrefix + pluginName 107 | weightKey = prefillPrefix + pluginName + weightSuffix 108 | } else { 109 | enablementKey = enablePrefix + pluginName 110 | weightKey = pluginName + weightSuffix 111 | } 112 | 113 | if env.GetEnvString(enablementKey, "false", c.logger) != "true" { 114 | c.logger.Info("Skipping plugin creation as it is not enabled", "name", pluginName) 115 | } else { 116 | weight := env.GetEnvInt(weightKey, 1, c.logger) 117 | 118 | plugins[pluginName] = weight 119 | c.logger.Info("Initialized plugin", "plugin", pluginName, "weight", weight) 120 | } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /pkg/scheduling/dual/scheduler.go: -------------------------------------------------------------------------------- 1 | // Package dual provides a sample Scheduler that internally uses 2 | // a dual scheduler construct (primary and secondary). 
3 | package dual 4 | 5 | import ( 6 | "context" 7 | "fmt" 8 | "math/rand" 9 | "time" 10 | 11 | "sigs.k8s.io/controller-runtime/pkg/log" 12 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" 13 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" 14 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol" 15 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" 16 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" 17 | giescorer "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" 18 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 19 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 20 | 21 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/filter" 22 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/scorer" 23 | ) 24 | 25 | // Scheduler implements the dual scheduler concept, along with a threshold 26 | // determining when each is invoked. 27 | type Scheduler struct { 28 | threshold float32 29 | store datastore.Datastore 30 | primary requestcontrol.Scheduler 31 | secondary requestcontrol.Scheduler 32 | } 33 | 34 | // NewScheduler create a new scheduler with the given datastore and threshold 35 | func NewScheduler(threshold float32, datastore datastore.Datastore) *Scheduler { 36 | scheduler := &Scheduler{ 37 | threshold: threshold, 38 | store: datastore, 39 | } 40 | 41 | scheduler.primary = scheduling.NewSchedulerWithConfig(datastore, scheduling.NewSchedulerConfig(). 42 | WithFilters(&filter.Passthrough{}). 43 | WithScorers(giescorer.NewWeightedScorer(&scorer.Passthrough{}, 10)). 44 | WithPicker(picker.NewMaxScorePicker())) 45 | 46 | scheduler.secondary = scheduling.NewSchedulerWithConfig(datastore, scheduling.NewSchedulerConfig(). 47 | WithFilters(&filter.Random{}). 48 | WithScorers(giescorer.NewWeightedScorer(&scorer.Random{}, 10)). 
49 | WithPicker(picker.NewRandomPicker())) 50 | 51 | return scheduler 52 | } 53 | 54 | // Schedule selects a Pod for the given request and context 55 | func (s *Scheduler) Schedule(ctx context.Context, req *types.LLMRequest) (*types.Result, error) { 56 | logger := log.FromContext(ctx).WithName("PD-scheduler").WithValues("request", req) 57 | debugLog := logger.V(logutil.DEBUG) 58 | 59 | scheduleStart := time.Now() 60 | defer func() { 61 | metrics.RecordSchedulerE2ELatency(time.Since(scheduleStart)) 62 | }() 63 | 64 | if rand.Float32() > s.threshold { // choose a primary only 65 | return s.primary.Schedule(ctx, req) 66 | } 67 | 68 | primary, err := s.primary.Schedule(ctx, req) 69 | if err != nil { 70 | return nil, err 71 | } 72 | debugLog.Info(fmt.Sprintf("Primary scheduler selected %+v", primary)) 73 | 74 | // TODO: this is demo behavior we need to replace once we know what we want. 75 | if rand.Float32() < s.threshold { // choose a secondary as well 76 | secondary, err := s.secondary.Schedule(ctx, req) 77 | if err != nil { 78 | debugLog.Info(fmt.Sprintf("Secondary scheduler failed %+v, returning primary", err)) 79 | } 80 | debugLog.Info(fmt.Sprintf("Secondary scheduler selected %+v", secondary)) 81 | if rand.Float32() < s.threshold { // lucky again: return the secondary 82 | return secondary, nil 83 | } 84 | } 85 | return primary, nil 86 | } 87 | -------------------------------------------------------------------------------- /pkg/scheduling/pd/doc.go: -------------------------------------------------------------------------------- 1 | // Package pd implements disaggregated Prefill/Decode scheduling 2 | package pd 3 | -------------------------------------------------------------------------------- /pkg/scheduling/pd/scheduler.go: -------------------------------------------------------------------------------- 1 | package pd 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "time" 8 | 9 | "sigs.k8s.io/controller-runtime/pkg/log" 10 | 
"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" 11 | backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" 12 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" 13 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol" 14 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" 15 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 16 | giefilter "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/filter" 17 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/multi/prefix" 18 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" 19 | giescorer "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" 20 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 21 | envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" 22 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 23 | 24 | "github.com/llm-d/llm-d-inference-scheduler/pkg/config" 25 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/filter" 26 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/scorer" 27 | ) 28 | 29 | const ( 30 | // PrefillPodHeader is the HTTP header name used to indicate Prefill worker 31 | PrefillPodHeader = "x-prefiller-url" 32 | ) 33 | 34 | // Scheduler implements the disaggreagted P/D scheduling logic 35 | type Scheduler struct { 36 | threshold int 37 | pdEnabled bool 38 | store Datastore 39 | prefill requestcontrol.Scheduler 40 | decode requestcontrol.Scheduler 41 | 42 | // prefixScorer is a prefix scorer which will be used for decission if prefill step is required 43 | // if pd is enabled, prefix scorers should be the same instance in all: 44 | // prefill scheduler, decode scheduler and prefixScorer 45 | prefixScorer *scorer.PrefixAwareScorer 46 | } 47 | 48 | var _ requestcontrol.Scheduler = 
&Scheduler{} // validate interface conformance 49 | 50 | // Datastore portion used by scheduler 51 | type Datastore interface { 52 | // InferencePool operations 53 | PoolGet() (*v1alpha2.InferencePool, error) 54 | // PodMetrics operations 55 | PodGetAll() []backendmetrics.PodMetrics 56 | } 57 | 58 | // NewScheduler returns a new disaggregated Prefill/Decode filter, using the 59 | // provided configuration. 60 | func NewScheduler(ctx context.Context, schedCfg *config.Config, ds Datastore) (*Scheduler, error) { 61 | prefixConfig := scorer.DefaultPrefixStoreConfig() 62 | prefixConfig.BlockSize = schedCfg.PrefixBlockSize 63 | 64 | scheduler := &Scheduler{ 65 | threshold: schedCfg.PDThreshold, 66 | pdEnabled: schedCfg.PDEnabled, 67 | store: ds, 68 | prefixScorer: scorer.NewPrefixAwareScorer(ctx, prefixConfig), 69 | } 70 | 71 | scheduler.prefill = scheduling.NewSchedulerWithConfig( 72 | ds, 73 | scheduler.generateSchedulerConfig(ctx, schedCfg.PrefillSchedulerPlugins, 74 | &filter.PrefillFilter{}), 75 | ) 76 | 77 | scheduler.decode = scheduling.NewSchedulerWithConfig( 78 | ds, 79 | scheduler.generateSchedulerConfig(ctx, schedCfg.DecodeSchedulerPlugins, 80 | &filter.DecodeFilter{}), 81 | ) 82 | 83 | return scheduler, nil 84 | } 85 | 86 | // Schedule uses (up to) two internal schedulers to process requests. 87 | // If the request prompt is short (as defined by the configured threshold) 88 | // the scheduler use the default behavior ("Decode scheduler"). 89 | // If the request prompt is long enough to warrant disaggregated prefill-decode, 90 | // both the Prefill and Decode schedulers are invoked. In the case of the 91 | // Prefill scheduler, the selected Pod's URL is saved in a header 92 | // and communicated back to the inference gateway. 
93 | func (s *Scheduler) Schedule(ctx context.Context, req *types.LLMRequest) (*types.Result, error) { 94 | logger := log.FromContext(ctx).WithName("PD").WithValues("request", req) 95 | debugLog := logger.V(logutil.DEBUG) 96 | 97 | scheduleStart := time.Now() 98 | defer func() { 99 | metrics.RecordSchedulerE2ELatency(time.Since(scheduleStart)) 100 | }() 101 | 102 | if !s.pdEnabled { 103 | debugLog.Info("Disagregated prefill/decode disabled - scheduling to decode worker only") 104 | return s.decode.Schedule(ctx, req) 105 | } 106 | 107 | // find the best pod for decode 108 | // assumes that prefix scorer was activated 109 | decodeRes, err := s.decode.Schedule(ctx, req) 110 | 111 | if decodeRes == nil || decodeRes.TargetPod == nil { 112 | logger.Info("No decode pod found, skipping scheduling") 113 | return nil, errors.New("no decode pod found") 114 | } 115 | 116 | // if the request is short enough, use the default scheduler 117 | hitPercentage := s.prefixScorer.GetCachedPercentage(decodeRes.TargetPod.GetPod().NamespacedName.String(), req.Prompt) 118 | if (1.0-hitPercentage)*float64(len(req.Prompt)) < float64(s.threshold) { 119 | logger.Info("Non-cached suffix is smaller than threshold, using decode scheduler", 120 | "hitPercentage", hitPercentage) 121 | return decodeRes, err 122 | } 123 | 124 | logger.Info("Non-cached suffix is larger than threshold, using PD scheduler", 125 | "hitPercentage", hitPercentage) 126 | prefillRes, prefillErr := s.prefill.Schedule(ctx, req) 127 | 128 | if prefillErr == nil && prefillRes.TargetPod != nil { // record the prefill worker 129 | pool, err := s.store.PoolGet() 130 | if err != nil { 131 | debugLog.Error(err, "Get inference pool failed - scheduling to decode worker only") 132 | return s.decode.Schedule(ctx, req) 133 | } 134 | 135 | // TODO: should the scheme be conifgurable (e.g., https://)? 
136 | prefillURL := fmt.Sprintf("http://%s:%d", prefillRes.TargetPod.GetPod().Address, pool.Spec.TargetPortNumber) 137 | if req.Headers == nil { // TODO should always be populated? 138 | req.Headers = make(map[string]string) 139 | } 140 | req.Headers[PrefillPodHeader] = prefillURL 141 | } 142 | 143 | debugLog.Info("Scheduling to separate Prefill and Decode workers") 144 | 145 | return decodeRes, nil // decode pod 146 | } 147 | 148 | // OnResponse normally processes all LLMResponses - forwards all responses to the decode scheduler 149 | func (s *Scheduler) OnResponse(ctx context.Context, resp *types.LLMResponse, targetPodName string) { 150 | // prefill scheduler will never get OnReponse, need to take care of plugin, issue #97 151 | s.decode.OnResponse(ctx, resp, targetPodName) 152 | } 153 | 154 | func (s *Scheduler) pluginsFromConfig(ctx context.Context, pluginsConfig map[string]int) map[plugins.Plugin]int { 155 | logger := log.FromContext(ctx) 156 | 157 | plugins := map[plugins.Plugin]int{} 158 | prefixWasAdded := false 159 | 160 | for pluginName, pluginWeight := range pluginsConfig { 161 | switch pluginName { 162 | case config.KVCacheScorerName: 163 | scorer, err := scorer.NewKVCacheAwareScorer(ctx) 164 | if err == nil { 165 | plugins[scorer] = pluginWeight 166 | } else { 167 | logger.Error(err, "KVCache scorer creation failed") 168 | } 169 | case config.LoadAwareScorerName: 170 | plugins[scorer.NewLoadAwareScorer(ctx)] = pluginWeight 171 | case config.PrefixScorerName: 172 | // TODO - create config? based on what? 
- issue #55 173 | // use the same instance 174 | plugins[s.prefixScorer] = pluginWeight 175 | prefixWasAdded = true 176 | case config.SessionAwareScorerName: 177 | plugins[scorer.NewSessionAffinity()] = pluginWeight 178 | 179 | // Plugins from upstream 180 | 181 | case config.GIELeastKVCacheFilterName: 182 | plugins[giefilter.NewLeastKVCacheFilter()] = pluginWeight 183 | case config.GIELeastQueueFilterName: 184 | plugins[giefilter.NewLeastQueueFilter()] = pluginWeight 185 | case config.GIELoraAffinityFilterName: 186 | plugins[giefilter.NewLoraAffinityFilter()] = pluginWeight 187 | case config.GIELowQueueFilterName: 188 | plugins[giefilter.NewLowQueueFilter()] = pluginWeight 189 | case config.GIESheddableCapacityFilterName: 190 | plugins[giefilter.NewSheddableCapacityFilter()] = pluginWeight 191 | case config.GIEKVCacheUtilizationScorerName: 192 | plugins[&giescorer.KVCacheScorer{}] = pluginWeight 193 | case config.GIEPrefixScorerName: 194 | // For now use the default configuration 195 | prefixConfig := prefix.Config{ 196 | HashBlockSize: envutil.GetEnvInt("PREFIX_CACHE_HASH_BLOCK_SIZE", prefix.DefaultHashBlockSize, logger), 197 | MaxPrefixBlocksToMatch: envutil.GetEnvInt("PREFIX_CACHE_MAX_PREFIX_BLOCKS", prefix.DefaultMaxPrefixBlocks, logger), 198 | LRUIndexerCapacity: envutil.GetEnvInt("PREFIX_CACHE_LRU_CAPACITY", prefix.DefaultLRUIndexerCapacity, logger), 199 | } 200 | plugins[prefix.New(prefixConfig)] = pluginWeight 201 | case config.GIEQueueScorerName: 202 | plugins[&giescorer.QueueScorer{}] = pluginWeight 203 | } 204 | } 205 | 206 | // only in case pd is enabled and prefix scorer was not enabled for decode scheduler 207 | // add prefix scorer to list of all scorers to collect information used for decision if PD should be acrivated 208 | if s.pdEnabled && !prefixWasAdded { 209 | plugins[s.prefixScorer] = 0.0 210 | } 211 | 212 | return plugins 213 | } 214 | 215 | func (s *Scheduler) generateSchedulerConfig(ctx context.Context, pluginsConfig map[string]int, 
extraFilters ...plugins.Filter) *scheduling.SchedulerConfig { 216 | thePlugins := s.pluginsFromConfig(ctx, pluginsConfig) 217 | preSchedulePlugins := []plugins.PreSchedule{} 218 | filters := []plugins.Filter{} 219 | scorers := []*giescorer.WeightedScorer{} 220 | postSchedulePlugins := []plugins.PostSchedule{} 221 | postResponsePlugins := []plugins.PostResponse{} 222 | 223 | filters = append(filters, extraFilters...) 224 | 225 | for plugin, pluginWeight := range thePlugins { 226 | if preSchedule, ok := plugin.(plugins.PreSchedule); ok { 227 | preSchedulePlugins = append(preSchedulePlugins, preSchedule) 228 | } 229 | if filter, ok := plugin.(plugins.Filter); ok { 230 | filters = append(filters, filter) 231 | } 232 | if scorer, ok := plugin.(plugins.Scorer); ok { 233 | scorers = append(scorers, giescorer.NewWeightedScorer(scorer, pluginWeight)) 234 | } 235 | if postSchedule, ok := plugin.(plugins.PostSchedule); ok { 236 | postSchedulePlugins = append(postSchedulePlugins, postSchedule) 237 | } 238 | if postResponse, ok := plugin.(plugins.PostResponse); ok { 239 | postResponsePlugins = append(postResponsePlugins, postResponse) 240 | } 241 | } 242 | 243 | return scheduling.NewSchedulerConfig(). 244 | WithPreSchedulePlugins(preSchedulePlugins...). 245 | WithFilters(filters...). 246 | WithScorers(scorers...). 247 | WithPicker(picker.NewMaxScorePicker()). 248 | WithPostSchedulePlugins(postSchedulePlugins...). 249 | WithPostResponsePlugins(postResponsePlugins...) 
250 | } 251 | -------------------------------------------------------------------------------- /pkg/scheduling/pd/scheduler_test.go: -------------------------------------------------------------------------------- 1 | package pd_test 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | 7 | "github.com/go-logr/logr/testr" 8 | 9 | "github.com/google/go-cmp/cmp" 10 | k8stypes "k8s.io/apimachinery/pkg/types" 11 | "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" 12 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" 13 | backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" // Import config for thresholds 14 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 15 | 16 | "github.com/llm-d/llm-d-inference-scheduler/pkg/config" 17 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/pd" 18 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/filter" 19 | "sigs.k8s.io/controller-runtime/pkg/log" 20 | ) 21 | 22 | // Tests the default scheduler configuration and expected behavior. 
23 | func TestPDSchedule(t *testing.T) { 24 | pod1 := &backendmetrics.FakePodMetrics{ 25 | Pod: &backend.Pod{ 26 | NamespacedName: k8stypes.NamespacedName{Name: "pod1"}, 27 | Address: "1.2.3.4", 28 | Labels: map[string]string{filter.RoleLabel: filter.RolePrefill}, 29 | }, 30 | Metrics: &backendmetrics.MetricsState{}, 31 | } 32 | pod2 := &backendmetrics.FakePodMetrics{ 33 | Pod: &backend.Pod{ 34 | NamespacedName: k8stypes.NamespacedName{Name: "pod2"}, 35 | Address: "5.6.7.8", 36 | Labels: map[string]string{filter.RoleLabel: filter.RoleDecode}, 37 | }, 38 | Metrics: &backendmetrics.MetricsState{}, 39 | } 40 | wantPod2 := &types.PodMetrics{ 41 | Pod: &backend.Pod{ 42 | NamespacedName: k8stypes.NamespacedName{Name: "pod2"}, 43 | Address: "5.6.7.8", 44 | Labels: map[string]string{filter.RoleLabel: filter.RoleDecode}, 45 | }, 46 | MetricsState: &backendmetrics.MetricsState{ 47 | ActiveModels: map[string]int{}, 48 | WaitingModels: map[string]int{}, 49 | }, 50 | } 51 | 52 | tests := []struct { 53 | name string 54 | req *types.LLMRequest 55 | input []*backendmetrics.FakePodMetrics 56 | wantRes *types.Result 57 | wantHeaders map[string]string 58 | unwantedHeaders []string 59 | err bool 60 | }{ 61 | { 62 | name: "no pods in datastore", 63 | req: &types.LLMRequest{ 64 | TargetModel: "any-model", 65 | Critical: true, 66 | Prompt: "12345678901", 67 | }, 68 | input: []*backendmetrics.FakePodMetrics{}, 69 | err: true, 70 | }, 71 | { 72 | name: "one decode pod, long prompt", 73 | req: &types.LLMRequest{ 74 | TargetModel: "critical", 75 | Critical: true, 76 | Prompt: "12345678901", 77 | }, 78 | // pod2 will be picked because it is the only pod with Decode role 79 | input: []*backendmetrics.FakePodMetrics{pod2}, 80 | wantRes: &types.Result{ 81 | TargetPod: &types.ScoredPod{ 82 | Pod: wantPod2, 83 | }, 84 | }, 85 | unwantedHeaders: []string{"x-prefiller-url"}, 86 | }, 87 | { 88 | name: "one prefill pod, long prompt", 89 | req: &types.LLMRequest{ 90 | TargetModel: "critical", 91 | 
Critical: true, 92 | Prompt: "12345678901", 93 | }, 94 | // no Decode pod 95 | input: []*backendmetrics.FakePodMetrics{pod1}, 96 | err: true, 97 | }, 98 | { 99 | name: "1P1D", 100 | req: &types.LLMRequest{ 101 | TargetModel: "critical", 102 | Critical: true, 103 | Prompt: "12345678901", 104 | }, 105 | // pod2 will be picked because it is the decode pod, pod1 IP will be in the header 106 | input: []*backendmetrics.FakePodMetrics{pod1, pod2}, 107 | wantRes: &types.Result{ 108 | TargetPod: &types.ScoredPod{ 109 | Pod: wantPod2, 110 | Score: 0.0, 111 | }, 112 | }, 113 | wantHeaders: map[string]string{"x-prefiller-url": "http://1.2.3.4:80"}, 114 | }, 115 | { 116 | name: "1P1Dshort", 117 | req: &types.LLMRequest{ 118 | TargetModel: "critical", 119 | Critical: true, 120 | Prompt: "123", 121 | }, 122 | // pod2 will be picked because it is the decode pod, pod1 IP should no be in the header, 123 | // because the prompt is too short 124 | input: []*backendmetrics.FakePodMetrics{pod1, pod2}, 125 | wantRes: &types.Result{ 126 | TargetPod: &types.ScoredPod{ 127 | Pod: wantPod2, 128 | Score: 0.0, 129 | }, 130 | }, 131 | unwantedHeaders: []string{"x-prefiller-url"}, 132 | }, 133 | } 134 | 135 | ctx := context.Background() 136 | logger := testr.New(t) 137 | ctx = log.IntoContext(ctx, logger) 138 | 139 | schedCfg := config.NewConfig(logger) 140 | schedCfg.PDEnabled = true 141 | schedCfg.PDThreshold = 5 142 | 143 | for _, test := range tests { 144 | t.Run(test.name, func(t *testing.T) { 145 | scheduler, _ := pd.NewScheduler(ctx, schedCfg, &fakeDataStore{pods: test.input}) 146 | got, err := scheduler.Schedule(ctx, test.req) 147 | 148 | if test.err != (err != nil) { 149 | t.Errorf("Unexpected error, got %v, want %v", err, test.err) 150 | } 151 | 152 | if diff := cmp.Diff(test.wantRes, got); diff != "" { 153 | t.Errorf("Unexpected output (-want +got): %v", diff) 154 | } 155 | 156 | for header, value := range test.wantHeaders { 157 | gotValue, ok := test.req.Headers[header] 158 | if !ok 
{ 159 | t.Errorf("Missing header: %s", header) 160 | } else if gotValue != value { 161 | t.Errorf("Wrong header value for %s: want %s got %s)", header, value, gotValue) 162 | } 163 | } 164 | 165 | for _, header := range test.unwantedHeaders { 166 | if _, exists := test.req.Headers[header]; exists { 167 | t.Errorf("Unwanted header %s exists", header) 168 | } 169 | } 170 | }) 171 | } 172 | } 173 | 174 | // TODO: this is probably better in upstream (e.g., epp/scheduling or epp/scheduling/plugins) 175 | // currently duplicated from pkg/scheduling/plugins/ 176 | type fakeDataStore struct { 177 | pods []*backendmetrics.FakePodMetrics 178 | } 179 | 180 | // PodGetAll returns all pods in the store 181 | func (fds *fakeDataStore) PodGetAll() []backendmetrics.PodMetrics { 182 | pm := make([]backendmetrics.PodMetrics, 0, len(fds.pods)) 183 | for _, pod := range fds.pods { 184 | pm = append(pm, pod) 185 | } 186 | return pm 187 | } 188 | 189 | func (fds *fakeDataStore) PoolGet() (*v1alpha2.InferencePool, error) { 190 | return &v1alpha2.InferencePool{ 191 | Spec: v1alpha2.InferencePoolSpec{ 192 | TargetPortNumber: 80, 193 | }, 194 | }, nil 195 | } 196 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/filter/by_labels.go: -------------------------------------------------------------------------------- 1 | package filter 2 | 3 | import ( 4 | "errors" 5 | 6 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 7 | "k8s.io/apimachinery/pkg/labels" 8 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 9 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 10 | ) 11 | 12 | // ByLabels filters out pods that do not match its label selector criteria 13 | type ByLabels struct { 14 | name string 15 | selector labels.Selector 16 | } 17 | 18 | var _ plugins.Filter = &ByLabels{} // validate interface conformance 19 | 20 | // NewByLabel returns a new filter instance, configured with the provided 21 | 
// name and label selector. 22 | func NewByLabel(name string, selector *metav1.LabelSelector) (plugins.Filter, error) { 23 | if name == "" { 24 | return nil, errors.New("ByLabels: missing filter name") 25 | } 26 | labelSelector, err := metav1.LabelSelectorAsSelector(selector) 27 | if err != nil { 28 | return nil, err 29 | } 30 | 31 | return &ByLabels{ 32 | name: name, 33 | selector: labelSelector, 34 | }, nil 35 | } 36 | 37 | // Name returns the name of the filter 38 | func (blf *ByLabels) Name() string { 39 | return blf.name 40 | } 41 | 42 | // Filter filters out all pods that do not satisfy the label selector 43 | func (blf *ByLabels) Filter(_ *types.SchedulingContext, pods []types.Pod) []types.Pod { 44 | filtered := []types.Pod{} 45 | 46 | for _, pod := range pods { 47 | labels := labels.Set(pod.GetPod().Labels) 48 | if blf.selector.Matches(labels) { 49 | filtered = append(filtered, pod) 50 | } 51 | } 52 | return filtered 53 | } 54 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/filter/passthrough.go: -------------------------------------------------------------------------------- 1 | // Package filter provides filter plugins for the epp. 2 | package filter 3 | 4 | import ( 5 | "fmt" 6 | 7 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 8 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 9 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 10 | ) 11 | 12 | // Passthrough filter type 13 | type Passthrough struct{} 14 | 15 | var _ plugins.Filter = &Passthrough{} 16 | 17 | // Name returns the filter name 18 | func (p *Passthrough) Name() string { 19 | return "passthrough-filter" 20 | } 21 | 22 | // Filter defines the filtering function. 
In this case it is a passthrough 23 | func (p *Passthrough) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { 24 | ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("Passthrough filter called with %d candidates: %+v", 25 | len(pods), pods)) 26 | 27 | return pods 28 | } 29 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/filter/pd_role_filter.go: -------------------------------------------------------------------------------- 1 | package filter 2 | 3 | import ( 4 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 5 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 6 | ) 7 | 8 | const ( 9 | // RoleLabel name 10 | RoleLabel = "llm-d.ai/role" 11 | // RolePrefill set for designated prefill workers 12 | RolePrefill = "prefill" 13 | // RoleDecode set for designated decode workers 14 | RoleDecode = "decode" 15 | // RoleBoth set for workers that can act as both prefill and decode 16 | RoleBoth = "both" 17 | ) 18 | 19 | // PrefillFilter - filters out pods that are not marked with role Prefill 20 | type PrefillFilter struct{} 21 | 22 | var _ plugins.Filter = &PrefillFilter{} // validate interface conformance 23 | 24 | // Name returns the name of the filter 25 | func (pf *PrefillFilter) Name() string { 26 | return "prefill-filter" 27 | } 28 | 29 | // Filter filters out all pods that are not marked as "prefill" 30 | func (pf *PrefillFilter) Filter(_ *types.SchedulingContext, pods []types.Pod) []types.Pod { 31 | filteredPods := []types.Pod{} 32 | 33 | for _, pod := range pods { 34 | role := pod.GetPod().Labels[RoleLabel] 35 | if role == RolePrefill { // TODO: doesn't RoleBoth also imply Prefill? 
36 | filteredPods = append(filteredPods, pod) 37 | } 38 | } 39 | return filteredPods 40 | } 41 | 42 | // DecodeFilter - filters out pods that are not marked with role Decode or Both 43 | type DecodeFilter struct{} 44 | 45 | var _ plugins.Filter = &DecodeFilter{} // validate interface conformance 46 | 47 | // Name returns the name of the filter 48 | func (df *DecodeFilter) Name() string { 49 | return "decode-filter" 50 | } 51 | 52 | // Filter removes all pods that are not marked as "decode" or "both" 53 | func (df *DecodeFilter) Filter(_ *types.SchedulingContext, pods []types.Pod) []types.Pod { 54 | filteredPods := []types.Pod{} 55 | 56 | for _, pod := range pods { 57 | role, defined := pod.GetPod().Labels[RoleLabel] 58 | if !defined || role == RoleDecode || role == RoleBoth { 59 | filteredPods = append(filteredPods, pod) 60 | } 61 | } 62 | return filteredPods 63 | } 64 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/filter/random.go: -------------------------------------------------------------------------------- 1 | // Package filter provides filter plugins for the epp. 2 | package filter 3 | 4 | import ( 5 | "fmt" 6 | "math/rand/v2" 7 | 8 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 9 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 10 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 11 | ) 12 | 13 | // Random drop filter type 14 | type Random struct { 15 | probability float64 16 | } 17 | 18 | var _ plugins.Filter = &Random{} 19 | 20 | // Name returns the filter name 21 | func (r *Random) Name() string { 22 | return "random-drop-filter" 23 | } 24 | 25 | // Filter defines the filtering function. 
In this case it is a passthrough 26 | func (r *Random) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { 27 | ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("Random filter called with %d candidates: %+v", 28 | len(pods), pods)) 29 | filtered := []types.Pod{} 30 | 31 | for _, p := range pods { 32 | if rand.Float64() >= r.probability { 33 | filtered = append(filtered, p) 34 | } else { 35 | ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("%v dropped", p)) 36 | } 37 | } 38 | 39 | return filtered 40 | } 41 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/doc.go: -------------------------------------------------------------------------------- 1 | // Package scorer provides scorer plugins for the scheduler. 2 | package scorer 3 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/kvcache-aware.go: -------------------------------------------------------------------------------- 1 | package scorer 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | 8 | kvcache "github.com/llm-d/llm-d-kv-cache-manager/pkg/kv-cache" 9 | 10 | "sigs.k8s.io/controller-runtime/pkg/log" 11 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 12 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 13 | 14 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 15 | ) 16 | 17 | const ( 18 | kvCacheAwareScorerName = "kvcache-aware-scorer" 19 | 20 | kvCacheRedisEnvVar = "KVCACHE_INDEXER_REDIS_ADDR" 21 | huggingFaceTokenEnvVar = "HF_TOKEN" 22 | ) 23 | 24 | // KVCacheAwareScorer uses the KVCacheIndexer to score pods based on KVCache 25 | // awareness. 26 | type KVCacheAwareScorer struct { 27 | kvCacheIndexer *kvcache.Indexer 28 | } 29 | 30 | // NewKVCacheAwareScorer creates a new KVCacheAwareScorer instance. 31 | // It initializes the KVCacheIndexer from environment variables. 
32 | // 33 | // If the environment variables are not set, or if the indexer 34 | // fails to initialize, an error is returned. 35 | func NewKVCacheAwareScorer(ctx context.Context) (plugins.Scorer, error) { 36 | config := kvcache.NewDefaultConfig() 37 | 38 | redisAddr := os.Getenv(kvCacheRedisEnvVar) 39 | if redisAddr != "" { 40 | config.KVBlockIndexerConfig.RedisAddr = redisAddr 41 | } else { 42 | return nil, fmt.Errorf("environment variable %s is not set", kvCacheRedisEnvVar) 43 | } 44 | 45 | hfToken := os.Getenv(huggingFaceTokenEnvVar) 46 | if hfToken != "" { 47 | config.TokenizersPoolConfig.HuggingFaceToken = hfToken 48 | } else { 49 | return nil, fmt.Errorf("environment variable %s is not set", huggingFaceTokenEnvVar) 50 | } 51 | 52 | kvCacheIndexer, err := kvcache.NewKVCacheIndexer(config) 53 | if err != nil { 54 | return nil, fmt.Errorf("failed to create KVCacheIndexer: %w", err) 55 | } 56 | 57 | go kvCacheIndexer.Run(ctx) 58 | 59 | return &KVCacheAwareScorer{ 60 | kvCacheIndexer: kvCacheIndexer, 61 | }, nil 62 | } 63 | 64 | // Name returns the name of the scorer. 65 | func (s *KVCacheAwareScorer) Name() string { 66 | return kvCacheAwareScorerName 67 | } 68 | 69 | // Score scores the provided pod based on the KVCache index state. 70 | // The returned scores are normalized to a range of 0-1. 
71 | func (s *KVCacheAwareScorer) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { 72 | loggerDebug := log.FromContext(ctx).WithName(kvCacheAwareScorerName).V(logutil.DEBUG) 73 | if ctx.Req == nil { 74 | loggerDebug.Info("Request is nil, skipping scoring") 75 | return nil 76 | } 77 | 78 | scores, err := s.kvCacheIndexer.GetPodScores(ctx.Context, ctx.Req.Prompt, ctx.Req.TargetModel, nil) 79 | if err != nil { 80 | loggerDebug.Error(err, "Failed to get pod scores") 81 | return nil 82 | } 83 | loggerDebug.Info("Got pod scores", "scores", scores) 84 | 85 | podToKey := func(pod types.Pod) (string, bool) { 86 | metricsPod := pod.GetPod() 87 | if metricsPod == nil { 88 | return "", false 89 | } 90 | 91 | return metricsPod.Address, true 92 | } 93 | 94 | return indexedScoresToNormalizedScoredPods(pods, podToKey, scores) 95 | } 96 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/load_aware_scorer.go: -------------------------------------------------------------------------------- 1 | package scorer 2 | 3 | import ( 4 | "context" 5 | 6 | "sigs.k8s.io/controller-runtime/pkg/log" 7 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 8 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 9 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" 10 | ) 11 | 12 | const ( 13 | queueThresholdEnvName = "LOAD_AWARE_SCORER_QUEUE_THRESHOLD" 14 | queueThresholdDefault = 128 15 | ) 16 | 17 | // LoadAwareScorer scorer that is based on load 18 | type LoadAwareScorer struct { 19 | queueThreshold float64 20 | } 21 | 22 | var _ plugins.Scorer = &LoadAwareScorer{} // validate interface conformance 23 | 24 | // NewLoadAwareScorer creates a new load based scorer 25 | func NewLoadAwareScorer(ctx context.Context) plugins.Scorer { 26 | return &LoadAwareScorer{ 27 | queueThreshold: float64(env.GetEnvInt(queueThresholdEnvName, queueThresholdDefault, 
log.FromContext(ctx))), 28 | } 29 | } 30 | 31 | // Name returns the scorer's name 32 | func (s *LoadAwareScorer) Name() string { 33 | return "load-aware-scorer" 34 | } 35 | 36 | // Score scores the given pod in range of 0-1 37 | // Currently metrics contains number of requests waiting in the queue, there is no information about number of requests 38 | // that can be processed in the given pod immediately. 39 | // Pod with empty waiting requests queue is scored with 0.5 40 | // Pod with requests in the queue will get score between 0.5 and 0. 41 | // Score 0 will get pod with number of requests in the queue equal to the threshold used in load-based filter (QueueingThresholdLoRA) 42 | // In future pods with additional capacity will get score higher than 0.5 43 | func (s *LoadAwareScorer) Score(_ *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { 44 | scoredPods := make(map[types.Pod]float64) 45 | 46 | for _, pod := range pods { 47 | waitingRequests := float64(pod.GetMetrics().WaitingQueueSize) 48 | 49 | if waitingRequests == 0 { 50 | scoredPods[pod] = 0.5 51 | } else { 52 | if waitingRequests > s.queueThreshold { 53 | waitingRequests = s.queueThreshold 54 | } 55 | scoredPods[pod] = 0.5 * (1.0 - (waitingRequests / s.queueThreshold)) 56 | } 57 | } 58 | return scoredPods 59 | } 60 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/load_aware_scorer_test.go: -------------------------------------------------------------------------------- 1 | package scorer_test 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | 7 | "github.com/google/go-cmp/cmp" 8 | k8stypes "k8s.io/apimachinery/pkg/types" 9 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" 10 | backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" // Import config for thresholds 11 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" 12 | 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 13 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" 14 | giescorer "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" 15 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 16 | 17 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/scorer" 18 | ) 19 | 20 | func TestLoadBasedScorer(t *testing.T) { 21 | ctx := context.Background() 22 | tests := []struct { 23 | name string 24 | scorer plugins.Scorer 25 | req *types.LLMRequest 26 | input []*backendmetrics.FakePodMetrics 27 | wantRes *types.Result 28 | err bool 29 | }{ 30 | { 31 | name: "load based scorer", 32 | scorer: scorer.NewLoadAwareScorer(ctx), 33 | req: &types.LLMRequest{ 34 | TargetModel: "critical", 35 | Critical: true, 36 | }, 37 | // pod2 will be picked because it has the shortest queue 38 | input: []*backendmetrics.FakePodMetrics{ 39 | { 40 | Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, 41 | Metrics: &backendmetrics.MetricsState{ 42 | WaitingQueueSize: 2, 43 | KVCacheUsagePercent: 0.2, 44 | MaxActiveModels: 2, 45 | ActiveModels: map[string]int{ 46 | "foo": 1, 47 | "bar": 1, 48 | }, 49 | }, 50 | }, 51 | { 52 | Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, 53 | Metrics: &backendmetrics.MetricsState{ 54 | WaitingQueueSize: 0, 55 | KVCacheUsagePercent: 0.2, 56 | MaxActiveModels: 2, 57 | ActiveModels: map[string]int{ 58 | "foo": 1, 59 | "bar": 1, 60 | }, 61 | }, 62 | }, 63 | { 64 | Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}, 65 | Metrics: &backendmetrics.MetricsState{ 66 | WaitingQueueSize: 5, 67 | KVCacheUsagePercent: 0.2, 68 | MaxActiveModels: 2, 69 | ActiveModels: map[string]int{ 70 | "foo": 1, 71 | "bar": 1, 72 | }, 73 | }, 74 | }, 75 | }, 76 | wantRes: &types.Result{ 77 | TargetPod: &types.ScoredPod{ 78 | Pod: &types.PodMetrics{ 79 | Pod: 
&backend.Pod{ 80 | NamespacedName: k8stypes.NamespacedName{Name: "pod2"}, 81 | Labels: map[string]string{}, 82 | }, 83 | MetricsState: &backendmetrics.MetricsState{ 84 | WaitingQueueSize: 0, 85 | KVCacheUsagePercent: 0.2, 86 | MaxActiveModels: 2, 87 | ActiveModels: map[string]int{ 88 | "foo": 1, 89 | "bar": 1, 90 | }, 91 | WaitingModels: map[string]int{}, 92 | }, 93 | }, 94 | Score: 0.5, 95 | }, 96 | }, 97 | }, 98 | } 99 | 100 | for _, test := range tests { 101 | t.Run(test.name, func(t *testing.T) { 102 | datastore := &fakeDataStore{pods: test.input} 103 | 104 | scheduler := scheduling.NewSchedulerWithConfig(datastore, scheduling.NewSchedulerConfig(). 105 | WithScorers(giescorer.NewWeightedScorer(test.scorer, 1)). 106 | WithPicker(picker.NewMaxScorePicker())) 107 | 108 | got, err := scheduler.Schedule(context.Background(), test.req) 109 | if test.err != (err != nil) { 110 | t.Errorf("Unexpected error, got %v, want %v", err, test.err) 111 | } 112 | 113 | opt := cmp.AllowUnexported(types.PodMetrics{}) 114 | if diff := cmp.Diff(test.wantRes, got, opt); diff != "" { 115 | t.Errorf("Unexpected output (-want +got): %v", diff) 116 | } 117 | }) 118 | } 119 | } 120 | 121 | // TODO: this is probably better in upstream (e.g., epp/scheduling or epp/scheduling/plugins) 122 | type fakeDataStore struct { 123 | pods []*backendmetrics.FakePodMetrics 124 | } 125 | 126 | // PodGetAll returns all pods in the store 127 | func (fds *fakeDataStore) PodGetAll() []backendmetrics.PodMetrics { 128 | pm := make([]backendmetrics.PodMetrics, 0, len(fds.pods)) 129 | for _, pod := range fds.pods { 130 | pm = append(pm, pod) 131 | } 132 | return pm 133 | } 134 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/passthrough.go: -------------------------------------------------------------------------------- 1 | // Package scorer provides scorer plugins for the scheduler. 
2 | package scorer 3 | 4 | import ( 5 | "fmt" 6 | 7 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 8 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 9 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 10 | ) 11 | 12 | // Passthrough is an example scorer which processes the pods, but does not 13 | // give them any score. 14 | type Passthrough struct{} 15 | 16 | var _ plugins.Scorer = &Passthrough{} 17 | 18 | // Name provides the textual identifier for this scorer. 19 | func (p *Passthrough) Name() string { 20 | return "passthrough-scorer" 21 | } 22 | 23 | // Score accepts a list of []types.Pod and processes them for scoring. 24 | func (p *Passthrough) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { 25 | ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("Scoring pods passthrough was initialized %d candidates: %+v", len(pods), pods)) 26 | 27 | scoredPods := make(map[types.Pod]float64, len(pods)) 28 | for _, pod := range pods { 29 | scoredPods[pod] = 0.0 30 | } 31 | 32 | return scoredPods 33 | } 34 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/prefix_aware.go: -------------------------------------------------------------------------------- 1 | package scorer 2 | 3 | import ( 4 | "context" 5 | "sync" 6 | "time" 7 | 8 | "sigs.k8s.io/controller-runtime/pkg/log" 9 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 10 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 11 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 12 | ) 13 | 14 | const ( 15 | prefixAwareScorerName = "prefix-aware-scorer" 16 | prefixAwareKeepAliveTime = 60 * time.Minute // How long should an idle session be kept alive 17 | prefixAwareKeepAliveCheckFrequency = 15 * time.Minute // How often to check for overly idle sessions 18 | ) 19 | 20 | type promptHits 
struct { 21 | lastUpdate time.Time 22 | // hits map from string to int 23 | hits sync.Map 24 | } 25 | 26 | // PrefixAwareScorer is a routing scorer that scores pods based on the longest prefix match 27 | // between the request's prompt and stored prefixes. The score is normalized between 0 and 1, 28 | // where 1 represents the longest matching prefix. 29 | type PrefixAwareScorer struct { 30 | prefixStore *PrefixStore 31 | 32 | // podToPromptHits map from podID(string) to promptHits 33 | podToPromptHits sync.Map 34 | } 35 | 36 | var _ plugins.Scorer = &PrefixAwareScorer{} // validate interface conformance 37 | 38 | // NewPrefixAwareScorer creates a new PrefixAwareScorer with the given 39 | // PrefixStoreConfig. If the config is nil, default is used. 40 | func NewPrefixAwareScorer(ctx context.Context, config *PrefixStoreConfig) *PrefixAwareScorer { 41 | if config == nil { 42 | config = DefaultPrefixStoreConfig() 43 | } 44 | 45 | scorer := &PrefixAwareScorer{ 46 | prefixStore: NewPrefixStore(config), 47 | podToPromptHits: sync.Map{}, 48 | } 49 | 50 | go scorer.cleanup(ctx, prefixAwareKeepAliveCheckFrequency, prefixAwareKeepAliveTime) 51 | 52 | return scorer 53 | } 54 | 55 | // Name returns the scorer's name 56 | func (s *PrefixAwareScorer) Name() string { 57 | return "prefix-aware-scorer" 58 | } 59 | 60 | // Score scores the target pods based on the longest prefix match. 
61 | func (s *PrefixAwareScorer) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { 62 | loggerDebug := log.FromContext(ctx).WithName(prefixAwareScorerName).V(logutil.DEBUG) 63 | if ctx.Req == nil { 64 | loggerDebug.Info("Request is nil, skipping scoring") 65 | return nil 66 | } 67 | 68 | scores := s.prefixStore.FindMatchingPods(ctx.Req.Prompt, ctx.Req.TargetModel) 69 | loggerDebug.Info("Got pod scores", "scores", scores) 70 | 71 | if len(scores) == 0 { 72 | loggerDebug.Info("No scores found for pods") 73 | return nil 74 | } 75 | 76 | for pod, score := range scores { 77 | if pod == "" { 78 | continue 79 | } 80 | 81 | rawPromptHitsInfo, _ := s.podToPromptHits.LoadOrStore(pod, &promptHits{lastUpdate: time.Now()}) 82 | if promptHitsInfo, ok := rawPromptHitsInfo.(*promptHits); ok { 83 | promptHitsInfo.lastUpdate = time.Now() 84 | promptHitsInfo.hits.Store(ctx.Req.Prompt, score) 85 | } 86 | } 87 | 88 | podToKey := func(pod types.Pod) (string, bool) { 89 | if pod.GetPod() == nil { 90 | return "", false 91 | } 92 | 93 | return pod.GetPod().NamespacedName.String(), true 94 | } 95 | 96 | return indexedScoresToNormalizedScoredPods(pods, podToKey, scores) 97 | } 98 | 99 | // PostSchedule implements the PostSchedulePlugin interface. 100 | // It adds the prefix to the PrefixStore for the given pod. 101 | // TODO: switch to PostResponse. 
102 | func (s *PrefixAwareScorer) PostSchedule(ctx *types.SchedulingContext, res *types.Result) { 103 | pod := res.TargetPod 104 | 105 | debugLogger := log.FromContext(ctx).WithName(prefixAwareScorerName) 106 | debugLogger.Info("PostResponse called", "req", ctx.Req, "pod", pod) 107 | 108 | if ctx.Req == nil { 109 | debugLogger.Info("Request is nil, skipping PostResponse") 110 | return 111 | } 112 | 113 | if pod.GetPod() == nil { 114 | debugLogger.Info("Pod is nil, skipping PostResponse", "req", ctx.Req, "pod", pod) 115 | return 116 | } 117 | 118 | if err := s.prefixStore.AddEntry(ctx.Req.TargetModel, ctx.Req.Prompt, &pod.GetPod().NamespacedName); err != nil { 119 | debugLogger.Error(err, "Failed to add entry to prefix store", "req", ctx.Req, "pod", pod) 120 | return 121 | } 122 | } 123 | 124 | // GetPrefixStore returns the scorer's PrefixStore. 125 | func (s *PrefixAwareScorer) GetPrefixStore() *PrefixStore { 126 | return s.prefixStore 127 | } 128 | 129 | // GetCachedPercentage returns the percentage of the prompt that is cached for the given pod. 
130 | func (s *PrefixAwareScorer) GetCachedPercentage(pod, prompt string) float64 { 131 | rawHitsForPod, ok := s.podToPromptHits.Load(pod) 132 | if !ok { 133 | return 0.0 134 | } 135 | 136 | hitsForPod, ok := rawHitsForPod.(*promptHits) 137 | if !ok { 138 | return 0.0 139 | } 140 | 141 | rawVal, ok := hitsForPod.hits.Load(prompt) 142 | if !ok { 143 | return 0.0 144 | } 145 | 146 | intVal, _ := rawVal.(int) 147 | return float64(intVal*s.prefixStore.blockSize) / float64(len(prompt)) 148 | } 149 | 150 | // cleanup Cleans up hits map 151 | func (s *PrefixAwareScorer) cleanup(ctx context.Context, keepAliveCheckFrequency time.Duration, keepAliveDuration time.Duration) { 152 | logger := log.FromContext(ctx) 153 | 154 | logger.Info("Prefix aware scorer cleanup started") 155 | ticker := time.NewTicker(keepAliveCheckFrequency) 156 | defer ticker.Stop() 157 | 158 | for { 159 | select { 160 | case <-ctx.Done(): 161 | logger.Info("Prefix aware scorer cleanup stopped:") 162 | return 163 | case now := <-ticker.C: 164 | logger.Info("Prefix aware scorer cleanup") 165 | s.podToPromptHits.Range( 166 | func(podID any, rawPromptHit any) bool { 167 | if promptHitInfo, ok := rawPromptHit.(*promptHits); ok { 168 | if now.Sub(promptHitInfo.lastUpdate) > keepAliveDuration { 169 | // info is stale, remove it 170 | s.podToPromptHits.Delete(podID) 171 | } 172 | } else { 173 | // Value is not of the correct type, remove it 174 | s.podToPromptHits.Delete(podID) 175 | } 176 | return true 177 | }) 178 | } 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/prefix_aware_test.go: -------------------------------------------------------------------------------- 1 | package scorer_test 2 | 3 | import ( 4 | "context" 5 | "math/rand" 6 | "strconv" 7 | "testing" 8 | "time" 9 | 10 | "github.com/go-logr/logr" 11 | k8stypes "k8s.io/apimachinery/pkg/types" 12 | "sigs.k8s.io/controller-runtime/pkg/log" 13 | 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" 14 | backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" 15 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 16 | 17 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/scorer" 18 | ) 19 | 20 | func TestPrefixAwareScorer(t *testing.T) { 21 | // Create test pods 22 | pod1 := &types.PodMetrics{ 23 | Pod: &backend.Pod{ 24 | NamespacedName: k8stypes.NamespacedName{ 25 | Name: "pod1", 26 | Namespace: "default", 27 | }, 28 | }, 29 | MetricsState: &backendmetrics.MetricsState{}, 30 | } 31 | pod2 := &types.PodMetrics{ 32 | Pod: &backend.Pod{ 33 | NamespacedName: k8stypes.NamespacedName{ 34 | Name: "pod2", 35 | Namespace: "default", 36 | }, 37 | }, 38 | MetricsState: &backendmetrics.MetricsState{}, 39 | } 40 | 41 | tests := []struct { 42 | name string 43 | weight float64 44 | prompt string 45 | modelName string 46 | prefixToAdd string 47 | podToAdd k8stypes.NamespacedName 48 | prefixModel string // Model name to use when adding the prefix 49 | expectedScores map[types.Pod]float64 50 | }{ 51 | { 52 | name: "no prompt", 53 | weight: 1.0, 54 | prompt: "", 55 | modelName: "model1", 56 | prefixToAdd: "hello", 57 | podToAdd: pod1.NamespacedName, 58 | prefixModel: "model1", 59 | expectedScores: map[types.Pod]float64{}, // No prompt means zero scores 60 | }, 61 | { 62 | name: "exact prefix match", 63 | weight: 1.0, 64 | prompt: "hello world", 65 | modelName: "model1", 66 | prefixToAdd: "hello", 67 | podToAdd: pod1.NamespacedName, 68 | prefixModel: "model1", 69 | expectedScores: map[types.Pod]float64{ 70 | pod1: 1.0, 71 | pod2: 0.0, 72 | }, // pod1 matches, pod2 doesn't 73 | }, 74 | { 75 | name: "no prefix match", 76 | weight: 1.0, 77 | prompt: "goodbye", 78 | modelName: "model1", 79 | prefixToAdd: "hello", 80 | podToAdd: pod1.NamespacedName, 81 | prefixModel: "model1", 82 | expectedScores: map[types.Pod]float64{}, // No matching prefix 83 | }, 84 | 
{ 85 | name: "different model name", 86 | weight: 1.0, 87 | prompt: "hello world", 88 | modelName: "model2", // Try to find with model2 89 | prefixToAdd: "hello", 90 | podToAdd: pod1.NamespacedName, 91 | prefixModel: "model1", // But prefix was added with model1 92 | expectedScores: map[types.Pod]float64{}, // Model name mismatch should result in no match 93 | }, 94 | { 95 | name: "custom weight", 96 | weight: 0.5, 97 | prompt: "hello world", 98 | modelName: "model1", 99 | prefixToAdd: "hello", 100 | podToAdd: pod1.NamespacedName, 101 | prefixModel: "model1", 102 | expectedScores: map[types.Pod]float64{ 103 | pod1: 1.0, // Pod1 matches with weight 104 | pod2: 0.0, // Pod2 doesn't match 105 | }, // Weight affects score 106 | }, 107 | } 108 | 109 | ctx := context.TODO() 110 | _ = log.IntoContext(ctx, logr.New(log.NullLogSink{})) 111 | 112 | for _, tt := range tests { 113 | t.Run(tt.name, func(t *testing.T) { 114 | // Reset prefix store for each test 115 | config := scorer.DefaultPrefixStoreConfig() 116 | config.BlockSize = 5 // set small chunking for testing 117 | 118 | s := scorer.NewPrefixAwareScorer(ctx, config) 119 | 120 | // Add prefix if specified 121 | if tt.prefixToAdd != "" { 122 | err := s.GetPrefixStore().AddEntry(tt.prefixModel, 123 | tt.prefixToAdd, &tt.podToAdd) 124 | if err != nil { 125 | t.Fatalf("Failed to add prefix: %v", err) 126 | } 127 | } 128 | 129 | // Create test context 130 | sCtx := types.NewSchedulingContext(ctx, &types.LLMRequest{ 131 | Prompt: tt.prompt, 132 | TargetModel: tt.modelName, 133 | }, nil, []types.Pod{}) 134 | 135 | // Score pods 136 | pods := []types.Pod{pod1, pod2} 137 | scores := s.Score(sCtx, pods) 138 | 139 | for p, score := range scores { 140 | if score != tt.expectedScores[p] { 141 | t.Errorf("Pod %v: expected score %v, got %v", p, tt.expectedScores[p], score) 142 | } 143 | } 144 | }) 145 | } 146 | } 147 | 148 | func TestPrefixAwareScorerProfiling(t *testing.T) { 149 | const testName = "profiling_test" 150 | const 
modelName = "test1" // store contains single cache for this model 151 | const nPodsTotal = 200 152 | const nPodsInStore = 100 // number of chunks stored for pod is proportional to the pod number 153 | 154 | ctx := context.Background() 155 | logger := log.FromContext(ctx) 156 | ctx = log.IntoContext(ctx, logger) 157 | 158 | name2Pod := createPods(nPodsTotal) 159 | config := scorer.DefaultPrefixStoreConfig() 160 | text := generateNonRepeatingText(config.BlockSize * nPodsInStore) 161 | t.Run(testName, func(t *testing.T) { 162 | start := time.Now() // record start time 163 | config := scorer.DefaultPrefixStoreConfig() 164 | s := scorer.NewPrefixAwareScorer(ctx, config) 165 | for i := range nPodsInStore { 166 | prompt := text[0 : (i+1)*config.BlockSize-1] 167 | err := s.GetPrefixStore().AddEntry(modelName, prompt, &name2Pod["pod"+strconv.Itoa(i)].NamespacedName) 168 | if err != nil { 169 | t.Errorf("Failed to add entry to prefix store: %v", err) 170 | } 171 | } 172 | sCtx := types.NewSchedulingContext(ctx, &types.LLMRequest{ 173 | Prompt: text, 174 | TargetModel: modelName, 175 | }, nil, []types.Pod{}) 176 | 177 | // Score pods 178 | pods := make([]types.Pod, 0, len(name2Pod)) 179 | for _, v := range name2Pod { 180 | pods = append(pods, v) 181 | } 182 | 183 | scores := s.Score(sCtx, pods) 184 | 185 | highestScore := scores[name2Pod["pod"+strconv.Itoa(nPodsInStore-1)]] 186 | if highestScore < 0.99 { 187 | t.Error("Failed to calculate scores") 188 | } 189 | 190 | // use 'elapsed' time when built-in profiler is not suitable because of short time periods 191 | elapsed := time.Since(start) // calculate duration 192 | t.Log("Time spent in microsec: " + strconv.FormatInt(elapsed.Microseconds(), 10)) 193 | }) 194 | 195 | } 196 | 197 | func createPods(nPods int) map[string]*types.PodMetrics { 198 | res := map[string]*types.PodMetrics{} 199 | for i := range nPods { 200 | pShortName := "pod" + strconv.Itoa(i) 201 | pod := &types.PodMetrics{ 202 | Pod: &backend.Pod{ 203 | 
NamespacedName: k8stypes.NamespacedName{ 204 | Name: pShortName, 205 | Namespace: "default", 206 | }, 207 | }, 208 | MetricsState: &backendmetrics.MetricsState{}, 209 | } 210 | res[pShortName] = pod 211 | } 212 | return res 213 | } 214 | 215 | func generateNonRepeatingText(length int) string { 216 | r := rand.New(rand.NewSource(time.Now().UnixNano())) 217 | chars := []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,!?;:-_[]{}()<>|@#$%^&*+=") 218 | 219 | result := make([]rune, length) 220 | for i := range result { 221 | result[i] = chars[r.Intn(len(chars))] 222 | } 223 | return string(result) 224 | } 225 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/prefix_store.go: -------------------------------------------------------------------------------- 1 | package scorer 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "sync" 7 | "time" 8 | 9 | "k8s.io/apimachinery/pkg/types" 10 | 11 | "github.com/cespare/xxhash/v2" 12 | lru "github.com/hashicorp/golang-lru/v2" 13 | ) 14 | 15 | const ( 16 | // defaultMaxCacheSize sets the maximum number of blocks the LRU cache can store. 17 | defaultMaxCacheSize = 500000 18 | // defaultBlockSize defines how many runes each block contains in the prefix cache. 19 | defaultBlockSize = 256 20 | // defaultMaxBlockCacheSize sets the maximum number of pods a block can store. 21 | defaultMaxBlockCacheSize = 100 22 | ) 23 | 24 | // PrefixStoreConfig contains initialization configuration for PrefixStore. 25 | type PrefixStoreConfig struct { 26 | // CacheSize sets the maximum number of blocks the LRU cache can store. 27 | CacheSize int 28 | // BlockSize defines how many runes each block contains in the prefix cache. 29 | BlockSize int 30 | // BlockCacheSize sets the maximum number of pods a block can store. 31 | BlockCacheSize int 32 | } 33 | 34 | // DefaultPrefixStoreConfig returns an PrefixStoreConfig instance with default 35 | // configuration. 
36 | func DefaultPrefixStoreConfig() *PrefixStoreConfig { 37 | return &PrefixStoreConfig{ 38 | CacheSize: defaultMaxCacheSize, 39 | BlockSize: defaultBlockSize, 40 | BlockCacheSize: defaultMaxBlockCacheSize, 41 | } 42 | } 43 | 44 | // block holds the tokens contained in the block. 45 | type block struct { 46 | Pods *lru.Cache[types.NamespacedName, time.Time] //TODO: implement Pod eviction based on staleness 47 | } 48 | 49 | // PrefixStore is an in-memory prefix-to-block cache with xxhash keys and LRU 50 | // eviction. 51 | type PrefixStore struct { 52 | sync.RWMutex 53 | 54 | cacheSize int 55 | blockSize int 56 | blockCacheSize int 57 | 58 | store map[string]*lru.Cache[uint64, *block] 59 | } 60 | 61 | // NewPrefixStore initializes the PrefixStore with LRU cache. 62 | // If the configuration is nil, default is used. 63 | func NewPrefixStore(config *PrefixStoreConfig) *PrefixStore { 64 | if config == nil { 65 | config = DefaultPrefixStoreConfig() 66 | } 67 | 68 | return &PrefixStore{ 69 | cacheSize: config.CacheSize, 70 | blockSize: config.BlockSize, 71 | blockCacheSize: config.BlockCacheSize, 72 | store: make(map[string]*lru.Cache[uint64, *block]), 73 | } 74 | } 75 | 76 | // AddEntry adds a new entry to the prefix store. 
77 | func (s *PrefixStore) AddEntry(modelName string, prompt string, pod *types.NamespacedName) error { 78 | if prompt == "" || pod == nil || len(prompt) < s.blockSize /* skip if prompt is too short */ { 79 | return nil 80 | } 81 | 82 | s.Lock() 83 | // Get or create the LRU cache for the model 84 | cache, ok := s.store[modelName] 85 | if !ok { 86 | var err error 87 | cache, err = lru.New[uint64, *block](s.cacheSize) 88 | if err != nil { 89 | return fmt.Errorf("failed to create LRU cache for model %s: %w", modelName, err) 90 | } 91 | 92 | s.store[modelName] = cache 93 | } 94 | s.Unlock() 95 | 96 | promptBytes := []byte(prompt) 97 | previousHash := uint64(0) 98 | digest := xxhash.New() 99 | 100 | // Chunk the text into blocks and populate the cache 101 | for start := 0; start < len(promptBytes); start += s.blockSize { 102 | end := start + s.blockSize 103 | if end > len(promptBytes) { 104 | break // skip partial blocks 105 | } 106 | 107 | // Compute the hash for the current block 108 | digest.Reset() 109 | if err := binary.Write(digest, binary.LittleEndian, previousHash); err != nil { 110 | return fmt.Errorf("failed to write previous hash: %w", err) 111 | } 112 | if _, err := digest.Write(promptBytes[start:end]); err != nil { 113 | return fmt.Errorf("failed to write prompt bytes: %w", err) 114 | } 115 | 116 | blockHash := digest.Sum64() 117 | previousHash = blockHash 118 | 119 | b, ok := cache.Get(blockHash) 120 | if !ok { 121 | pods, err := lru.New[types.NamespacedName, time.Time](s.blockCacheSize) 122 | if err != nil { 123 | return fmt.Errorf("failed to create LRU cache for block: %w", err) 124 | } 125 | 126 | b = &block{Pods: pods} 127 | cache.Add(blockHash, b) 128 | } 129 | 130 | b.Pods.Add(*pod, time.Now()) // thread-safe 131 | } 132 | 133 | return nil 134 | } 135 | 136 | // FindMatchingPods finds all pods that match the given prompt and model name. 137 | // It returns a map of pods and the number of blocks they match. 
138 | func (s *PrefixStore) FindMatchingPods(prompt, modelName string) map[string]int { 139 | if prompt == "" || modelName == "" || len(prompt) < s.blockSize /* skip if prompt is too short */ { 140 | return nil 141 | } 142 | 143 | s.RLock() 144 | cache, ok := s.store[modelName] // cache is thread-safe 145 | s.RUnlock() 146 | 147 | if !ok { 148 | return nil 149 | } 150 | 151 | promptBytes := []byte(prompt) 152 | previousHash := uint64(0) 153 | digest := xxhash.New() 154 | 155 | matchedPods := make(map[string]int) 156 | for start := 0; start < len(promptBytes); start += s.blockSize { 157 | end := start + s.blockSize 158 | if end > len(promptBytes) { 159 | break // skip partial blocks 160 | } 161 | 162 | digest.Reset() 163 | if err := binary.Write(digest, binary.LittleEndian, previousHash); err != nil { 164 | break 165 | } 166 | if _, err := digest.Write(promptBytes[start:end]); err != nil { 167 | break 168 | } 169 | 170 | blockHash := digest.Sum64() 171 | previousHash = blockHash 172 | 173 | b, ok := cache.Get(blockHash) 174 | if !ok { 175 | break // match consecutive blocks 176 | } 177 | 178 | for _, pod := range b.Pods.Keys() { 179 | matchedPods[pod.String()]++ 180 | } 181 | } 182 | 183 | return matchedPods 184 | } 185 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/prefix_store_test.go: -------------------------------------------------------------------------------- 1 | package scorer_test 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | 7 | "github.com/go-logr/logr" 8 | k8stypes "k8s.io/apimachinery/pkg/types" 9 | "sigs.k8s.io/controller-runtime/pkg/log" 10 | 11 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/scorer" 12 | ) 13 | 14 | // TestBasicPrefixOperations tests the basic functionality of adding and finding prefixes 15 | func TestBasicPrefixOperations(t *testing.T) { 16 | ctx := context.TODO() 17 | _ = log.IntoContext(ctx, logr.New(log.NullLogSink{})) 18 | 19 | config := 
scorer.DefaultPrefixStoreConfig() 20 | config.BlockSize = 5 // set small chunking for testing 21 | store := scorer.NewPrefixStore(config) 22 | 23 | podName := k8stypes.NamespacedName{ 24 | Name: "pod1", 25 | Namespace: "default", 26 | } 27 | 28 | // Test adding a prefix 29 | err := store.AddEntry("model1", "hello", &podName) 30 | if err != nil { 31 | t.Errorf("Failed to add prefix: %v", err) 32 | } 33 | 34 | // Test finding the exact prefix 35 | scores := store.FindMatchingPods("hello", "model1") 36 | if _, ok := scores[podName.String()]; !ok { 37 | t.Errorf("Expected pod %v, scores %v", podName, scores) 38 | } 39 | 40 | // Test finding with a longer prefix 41 | scores = store.FindMatchingPods("hello world", "model1") 42 | if _, ok := scores[podName.String()]; !ok { 43 | t.Errorf("Expected pod %v, scores %v", podName, scores) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/random.go: -------------------------------------------------------------------------------- 1 | // Package scorer provides scorer plugins for the scheduler. 2 | package scorer 3 | 4 | import ( 5 | "fmt" 6 | "math/rand" 7 | 8 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 9 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 10 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 11 | ) 12 | 13 | // Random is an example scorer which processes the pods, giving each a random score. 14 | type Random struct{} 15 | 16 | var _ plugins.Scorer = &Random{} 17 | 18 | // Name provides the textual identifier for this scorer. 19 | func (r *Random) Name() string { 20 | return "random-scorer" 21 | } 22 | 23 | // Score accepts a list of []types.Pod and processes them for scoring. 
24 | func (r *Random) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { 25 | ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("Scoring pods randomly called with %d candidates: %+v", 26 | len(pods), pods)) 27 | 28 | scores := make(map[types.Pod]float64, len(pods)) 29 | for _, pod := range pods { 30 | scores[pod] = rand.Float64() 31 | } 32 | 33 | return scores 34 | } 35 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/session_affinity.go: -------------------------------------------------------------------------------- 1 | package scorer 2 | 3 | import ( 4 | "encoding/base64" 5 | "time" 6 | 7 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 8 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 9 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 10 | ) 11 | 12 | const ( 13 | sessionKeepAliveTime = 60 * time.Minute // How long should an idle session be kept alive 14 | sessionKeepAliveCheckFrequency = 15 * time.Minute // How often to check for overly idle sessions 15 | sessionTokenHeader = "x-session-token" // name of the session header in request 16 | ) 17 | 18 | // SessionAffinity is a routing scorer that routes subsequent 19 | // requests in a session to the same pod as the first request in the 20 | // session was sent to, by giving that pod the specified weight and assigning 21 | // zero score to the rest of the targets 22 | type SessionAffinity struct { 23 | } 24 | 25 | var _ plugins.Scorer = &SessionAffinity{} // validate interface conformance 26 | var _ plugins.PostResponse = &SessionAffinity{} // validate interface conformance 27 | 28 | // NewSessionAffinity returns a scorer 29 | func NewSessionAffinity() *SessionAffinity { 30 | return &SessionAffinity{} 31 | } 32 | 33 | // Name returns the scorer's name 34 | func (s *SessionAffinity) Name() string { 35 | return "session-affinity-scorer" 36 | } 37 | 38 
| // Score assign a high score to the pod used in previous requests and zero to others 39 | func (s *SessionAffinity) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { 40 | scoredPods := make(map[types.Pod]float64) 41 | sessionToken := ctx.Req.Headers[sessionTokenHeader] 42 | podName := "" 43 | 44 | if sessionToken != "" { 45 | decodedBytes, err := base64.StdEncoding.DecodeString(sessionToken) 46 | if err != nil { 47 | ctx.Logger.Error(err, "Error decoding session header") 48 | } else { 49 | podName = string(decodedBytes) 50 | } 51 | } 52 | for _, pod := range pods { 53 | scoredPods[pod] = 0.0 // initial value 54 | if pod.GetPod().NamespacedName.String() == podName { 55 | scoredPods[pod] = 1.0 56 | } 57 | } 58 | 59 | return scoredPods 60 | } 61 | 62 | // PostResponse sets the session header on the response sent to the client 63 | // TODO: this should be using a cookie and ensure not overriding any other 64 | // cookie values if present. 65 | // Tracked in https://github.com/llm-d/llm-d-inference-scheduler/issues/28 66 | func (s *SessionAffinity) PostResponse(ctx *types.SchedulingContext, pod types.Pod) { 67 | if ctx.Resp == nil || pod == nil || pod.GetPod() == nil { 68 | reqID := "undefined" 69 | if ctx.Resp != nil { 70 | reqID = ctx.Resp.RequestId 71 | } 72 | ctx.Logger.V(logutil.DEBUG).Info("Session affinity scorer - skip post response because one of ctx.Resp, pod, pod.GetPod is nil", "req id", reqID) 73 | return 74 | } 75 | 76 | if ctx.Resp.Headers == nil { // TODO should always be populated? 
77 | ctx.Resp.Headers = make(map[string]string) 78 | } 79 | 80 | ctx.Resp.Headers[sessionTokenHeader] = base64.StdEncoding.EncodeToString([]byte(pod.GetPod().NamespacedName.String())) 81 | } 82 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/utils.go: -------------------------------------------------------------------------------- 1 | package scorer 2 | 3 | import "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 4 | 5 | // podToKey is a function type that converts a Pod to a string key. 6 | // It returns the key and a boolean indicating success. 7 | type podToKeyFunc func(pod types.Pod) (string, bool) 8 | 9 | // indexedScoresToNormalizedScoredPods converts a map of pod scores to a map of 10 | // normalized scores. The function takes a list of pods, a function to convert 11 | // a pod to a key, and a map of scores indexed by those keys. It returns a map 12 | // of pods to their normalized scores. 13 | func indexedScoresToNormalizedScoredPods(pods []types.Pod, podToKey podToKeyFunc, 14 | scores map[string]int) map[types.Pod]float64 { 15 | scoredPods := make(map[types.Pod]float64) 16 | minScore, maxScore := getMinMax(scores) 17 | 18 | for _, pod := range pods { 19 | key, ok := podToKey(pod) 20 | if !ok { 21 | continue 22 | } 23 | 24 | if score, ok := scores[key]; ok { 25 | if minScore == maxScore { 26 | scoredPods[pod] = 1.0 27 | continue 28 | } 29 | 30 | scoredPods[pod] = float64(score-minScore) / float64(maxScore-minScore) 31 | } else { 32 | scoredPods[pod] = 0.0 33 | } 34 | } 35 | 36 | return scoredPods 37 | } 38 | 39 | func getMinMax(scores map[string]int) (int, int) { 40 | minScore := int(^uint(0) >> 1) // max int 41 | maxScore := -1 42 | 43 | for _, score := range scores { 44 | if score < minScore { 45 | minScore = score 46 | } 47 | if score > maxScore { 48 | maxScore = score 49 | } 50 | } 51 | 52 | return minScore, maxScore 53 | } 54 | 
-------------------------------------------------------------------------------- /scripts/istio/generate-cp.sh: --------------------------------------------------------------------------------
#!/bin/bash

# Generates the Istio control-plane manifests and CRDs for the repo's
# kustomize components.
#
# Prerequisites:
#   python3 & pip install ruamel.yaml
#   istioctl https://gcsweb.istio.io/gcs/istio-build/dev/1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d alpha pre release

# FIX: fail fast on errors/unset vars and propagate pipeline failures so a
# broken istioctl run can't silently produce partial output.
set -euo pipefail

GATEWAY_NAMESPACE=llm-d-istio-system

CRD_DIR=deploy/components/crds-istio/
CP_DIR=deploy/components/istio-control-plane/
ISTIO_CP="$(dirname "$0")/istio-cp.yaml"

# FIX: quote all expansions so paths with spaces don't word-split (SC2086).
istioctl manifest generate --dry-run --set "values.global.istioNamespace=${GATEWAY_NAMESPACE}" -f "${ISTIO_CP}" | scripts/istio/manifest-splitter.py -o "${CP_DIR}"
mv "${CP_DIR}/crds.yaml" "${CRD_DIR}/istio.yaml"
-------------------------------------------------------------------------------- /scripts/istio/istio-cp.yaml: --------------------------------------------------------------------------------
apiVersion: install.istio.io/v1alpha1
kind: IstioOperator
spec:
  profile: default
  revision: llm-d-gateway
  components:
    pilot:
      k8s:
        resources:
          requests:
            memory: 1024Mi
    ingressGateways:
    - name: istio-ingressgateway
      enabled: false
-------------------------------------------------------------------------------- /scripts/istio/manifest-splitter.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3

import sys
import os
import argparse  # Added for command-line arguments
from collections import defaultdict
from ruamel.yaml import YAML
from ruamel.yaml.constructor import DuplicateKeyError
from ruamel.yaml.error import YAMLError as RuamelYAMLError


# Define the mapping from Kubernetes Kind to output filename
# This can be customized as needed.
14 | KIND_TO_FILENAME_MAP = { 15 | "ConfigMap": "configmaps.yaml", 16 | "Deployment": "deployments.yaml", 17 | "HorizontalPodAutoscaler": "hpa.yaml", 18 | "Namespace": "namespaces.yaml", 19 | "ServiceAccount": "service-accounts.yaml", 20 | "Service": "services.yaml", 21 | "Telemetry": "telemetry.yaml", # Istio specific 22 | # RBAC Components 23 | "Role": "rbac.yaml", 24 | "ClusterRole": "rbac.yaml", 25 | "RoleBinding": "rbac.yaml", 26 | "ClusterRoleBinding": "rbac.yaml", 27 | # Webhook Configurations 28 | "MutatingWebhookConfiguration": "webhooks.yaml", 29 | "ValidatingWebhookConfiguration": "webhooks.yaml", 30 | # Istio "Policy-like" CRDs and Networking 31 | "AuthorizationPolicy": "policies.yaml", 32 | "PeerAuthentication": "policies.yaml", 33 | "RequestAuthentication": "policies.yaml", 34 | "Sidecar": "policies.yaml", 35 | "EnvoyFilter": "policies.yaml", 36 | "WasmPlugin": "policies.yaml", 37 | "Gateway": "policies.yaml", # Istio Gateway 38 | "VirtualService": "policies.yaml", 39 | "DestinationRule": "policies.yaml", 40 | "ServiceEntry": "policies.yaml", 41 | "WorkloadEntry": "policies.yaml", 42 | "WorkloadGroup": "policies.yaml", 43 | "PodDisruptionBudget": "policies.yaml", 44 | "Telemetry": "telemetry.yaml", 45 | "IstioOperator": "istiooperators.yaml", # Often part of istioctl output 46 | "CustomResourceDefinition": "crds.yaml", # For CRDs themselves 47 | # Add more kinds as needed 48 | } 49 | 50 | # Files requested by the user (for kustomization.yaml) 51 | REQUESTED_FILES_FOR_KUSTOMIZATION = [ 52 | "configmaps.yaml", 53 | "deployments.yaml", 54 | "hpa.yaml", 55 | "namespaces.yaml", 56 | "policies.yaml", 57 | "rbac.yaml", 58 | "service-accounts.yaml", 59 | "services.yaml", 60 | "telemetry.yaml", 61 | "webhooks.yaml", 62 | # Potentially useful additions if they appear 63 | #"crds.yaml", 64 | #"istiooperators.yaml", 65 | "others.yaml" # Catch-all for unmapped kinds 66 | ] 67 | 68 | def main(): 69 | parser = argparse.ArgumentParser( 70 | description="Split Istio 
manifests from stdin into categorized files in an output directory." 71 | ) 72 | parser.add_argument( 73 | "-o", "--output-dir", 74 | default="istio_manifests_output", # Default output directory name 75 | help="The directory where YAML files will be saved (default: istio_manifests_output)" 76 | ) 77 | args = parser.parse_args() 78 | output_dir = args.output_dir 79 | 80 | # Create the output directory if it doesn't exist 81 | try: 82 | os.makedirs(output_dir, exist_ok=True) 83 | print(f"Output directory: {os.path.abspath(output_dir)}") 84 | except OSError as e: 85 | print(f"Error: Could not create output directory '{output_dir}': {e}", file=sys.stderr) 86 | sys.exit(1) 87 | 88 | # Initialize ruamel.yaml instance for round-trip preservation 89 | yaml = YAML() 90 | yaml.preserve_quotes = True 91 | # yaml.indent(mapping=2, sequence=4, offset=2) # Optional: to enforce specific indent 92 | 93 | # Dictionary to store YAML content for each file 94 | output_files_content = defaultdict(list) 95 | # Set to keep track of which files actually get content (just filenames) 96 | created_filenames = set() 97 | 98 | try: 99 | yaml_documents = list(yaml.load_all(sys.stdin)) 100 | except RuamelYAMLError as e: 101 | print(f"Error parsing YAML input: {e}", file=sys.stderr) 102 | if hasattr(e, 'problem_mark') and e.problem_mark: 103 | print(f"Error found near line {e.problem_mark.line + 1}, column {e.problem_mark.column + 1}", file=sys.stderr) 104 | sys.exit(1) 105 | except Exception as e: 106 | print(f"An unexpected error occurred while reading/parsing stdin: {e}", file=sys.stderr) 107 | sys.exit(1) 108 | 109 | if not yaml_documents: 110 | print("No YAML input received from stdin.", file=sys.stderr) 111 | return 112 | 113 | for doc in yaml_documents: 114 | if doc is None: 115 | continue 116 | kind = doc.get("kind") 117 | filename = KIND_TO_FILENAME_MAP.get(kind, "others.yaml") # Just the filename, not path 118 | output_files_content[filename].append(doc) 119 | 
created_filenames.add(filename) 120 | 121 | # Write the collected YAML documents to their respective files in the output directory 122 | for filename, docs in output_files_content.items(): 123 | if not docs: 124 | continue 125 | 126 | output_filepath = os.path.join(output_dir, filename) 127 | try: 128 | with open(output_filepath, "w") as f: 129 | yaml.dump_all(docs, f) 130 | print(f"Written {len(docs)} resource(s) to {output_filepath}") 131 | except IOError as e: 132 | print(f"Error writing to file {output_filepath}: {e}", file=sys.stderr) 133 | except RuamelYAMLError as e: 134 | print(f"Error serializing YAML for {output_filepath}: {e}", file=sys.stderr) 135 | except Exception as e: 136 | print(f"An unexpected error occurred while writing {output_filepath}: {e}", file=sys.stderr) 137 | 138 | # Generate kustomization.yaml in the output directory 139 | kustomization_yaml = YAML() 140 | kustomization_yaml.indent(mapping=2, sequence=2, offset=0) # Common kustomize style 141 | 142 | kustomization_content = {"apiVersion": "kustomize.config.k8s.io/v1beta1", "kind": "Kustomization"} 143 | 144 | # Resources in kustomization.yaml are relative to kustomization.yaml itself 145 | kustomization_resources = sorted([ 146 | fname for fname in created_filenames 147 | if fname != "kustomization.yaml" and fname in REQUESTED_FILES_FOR_KUSTOMIZATION 148 | ]) 149 | 150 | if not kustomization_resources: 151 | print(f"No resources found to include in kustomization.yaml within {output_dir}.", file=sys.stderr) 152 | else: 153 | kustomization_content["resources"] = kustomization_resources 154 | kustomization_filepath = os.path.join(output_dir, "kustomization.yaml") 155 | try: 156 | with open(kustomization_filepath, "w") as f: 157 | kustomization_yaml.dump(kustomization_content, f) 158 | print(f"Written {kustomization_filepath}") 159 | except IOError as e: 160 | print(f"Error writing to {kustomization_filepath}: {e}", file=sys.stderr) 161 | except RuamelYAMLError as e: 162 | print(f"Error 
serializing YAML for {kustomization_filepath}: {e}", file=sys.stderr) 163 | except Exception as e: 164 | print(f"An unexpected error occurred while writing {kustomization_filepath}: {e}", file=sys.stderr) 165 | 166 | if __name__ == "__main__": 167 | main() -------------------------------------------------------------------------------- /scripts/kind-dev-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This shell script deploys a kind cluster with an Istio-based Gateway API 4 | # implementation fully configured. It deploys the vllm simulator, which it 5 | # exposes with a Gateway -> HTTPRoute -> InferencePool. The Gateway is 6 | # configured with the a filter for the ext_proc endpoint picker. 7 | 8 | set -eo pipefail 9 | 10 | # ------------------------------------------------------------------------------ 11 | # Variables 12 | # ------------------------------------------------------------------------------ 13 | 14 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 15 | 16 | # Set a default CLUSTER_NAME if not provided 17 | : "${CLUSTER_NAME:=llm-d-inference-scheduler-dev}" 18 | 19 | # Set the host port to map to the Gateway's inbound port (30080) 20 | : "${GATEWAY_HOST_PORT:=30080}" 21 | 22 | # Set the default IMAGE_REGISTRY if not provided 23 | : "${IMAGE_REGISTRY:=ghcr.io/llm-d}" 24 | 25 | # Set a default VLLM_SIMULATOR_IMAGE if not provided 26 | : "${VLLM_SIMULATOR_IMAGE:=llm-d-inference-sim}" 27 | 28 | # Set a default VLLM_SIMULATOR_TAG if not provided 29 | export VLLM_SIMULATOR_TAG="${VLLM_SIMULATOR_TAG:-dev}" 30 | 31 | # Set a default EPP_IMAGE if not provided 32 | : "${EPP_IMAGE:=llm-d-inference-scheduler}" 33 | 34 | # Set a default EPP_TAG if not provided 35 | export EPP_TAG="${EPP_TAG:-dev}" 36 | 37 | # Set the default routing side car image tag 38 | export ROUTING_SIDECAR_TAG="${ROUTING_SIDECAR_TAG:-0.0.6}" 39 | 40 | # Set the inference pool name for the deployment 41 | export 
POOL_NAME="${POOL_NAME:-vllm-llama3-8b-instruct}" 42 | 43 | # Set the model name to deploy 44 | export MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" 45 | 46 | # vLLM replica count (without PD) 47 | export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-1}" 48 | 49 | # By default we are not setting up for PD 50 | export PD_ENABLED="\"${PD_ENABLED:-false}\"" 51 | 52 | # By default the PD threshhold is ten tokens 53 | export PD_PROMPT_LEN_THRESHOLD="\"${PD_PROMPT_LEN_THRESHOLD:-10}\"" 54 | 55 | # Replica counts for P and D 56 | export VLLM_REPLICA_COUNT_P="${VLLM_REPLICA_COUNT_P:-1}" 57 | export VLLM_REPLICA_COUNT_D="${VLLM_REPLICA_COUNT_D:-2}" 58 | 59 | # ------------------------------------------------------------------------------ 60 | # Setup & Requirement Checks 61 | # ------------------------------------------------------------------------------ 62 | 63 | # Check for a supported container runtime if an explicit one was not set 64 | if [ -z "${CONTAINER_RUNTIME}" ]; then 65 | if command -v docker &> /dev/null; then 66 | CONTAINER_RUNTIME="docker" 67 | elif command -v podman &> /dev/null; then 68 | CONTAINER_RUNTIME="podman" 69 | else 70 | echo "Neither docker nor podman could be found in PATH" >&2 71 | exit 1 72 | fi 73 | fi 74 | 75 | set -u 76 | 77 | # Check for required programs 78 | for cmd in kind kubectl kustomize ${CONTAINER_RUNTIME}; do 79 | if ! command -v "$cmd" &> /dev/null; then 80 | echo "Error: $cmd is not installed or not in the PATH." 
81 | exit 1 82 | fi 83 | done 84 | 85 | # ------------------------------------------------------------------------------ 86 | # Cluster Deployment 87 | # ------------------------------------------------------------------------------ 88 | 89 | # Check if the cluster already exists 90 | if kind get clusters 2>/dev/null | grep -q "^${CLUSTER_NAME}$"; then 91 | echo "Cluster '${CLUSTER_NAME}' already exists, re-using" 92 | else 93 | kind create cluster --name "${CLUSTER_NAME}" --config - << EOF 94 | kind: Cluster 95 | apiVersion: kind.x-k8s.io/v1alpha4 96 | nodes: 97 | - role: control-plane 98 | extraPortMappings: 99 | - containerPort: 30080 100 | hostPort: ${GATEWAY_HOST_PORT} 101 | protocol: TCP 102 | EOF 103 | fi 104 | 105 | # Set the kubectl context to the kind cluster 106 | KUBE_CONTEXT="kind-${CLUSTER_NAME}" 107 | kubectl config set-context ${KUBE_CONTEXT} --namespace=default 108 | 109 | set -x 110 | 111 | # Hotfix for https://github.com/kubernetes-sigs/kind/issues/3880 112 | CONTAINER_NAME="${CLUSTER_NAME}-control-plane" 113 | ${CONTAINER_RUNTIME} exec -it ${CONTAINER_NAME} /bin/bash -c "sysctl net.ipv4.conf.all.arp_ignore=0" 114 | 115 | # Wait for all pods to be ready 116 | kubectl --context ${KUBE_CONTEXT} -n kube-system wait --for=condition=Ready --all pods --timeout=300s 117 | 118 | echo "Waiting for local-path-storage pods to be created..." 
119 | until kubectl --context ${KUBE_CONTEXT} -n local-path-storage get pods -o name | grep -q pod/; do 120 | sleep 2 121 | done 122 | kubectl --context ${KUBE_CONTEXT} -n local-path-storage wait --for=condition=Ready --all pods --timeout=300s 123 | 124 | # ------------------------------------------------------------------------------ 125 | # Load Container Images 126 | # ------------------------------------------------------------------------------ 127 | 128 | # Load the vllm simulator image into the cluster 129 | if [ "${CONTAINER_RUNTIME}" == "podman" ]; then 130 | podman save ${IMAGE_REGISTRY}/${VLLM_SIMULATOR_IMAGE}:${VLLM_SIMULATOR_TAG} -o /dev/stdout | kind --name ${CLUSTER_NAME} load image-archive /dev/stdin 131 | else 132 | kind --name ${CLUSTER_NAME} load docker-image ${IMAGE_REGISTRY}/${VLLM_SIMULATOR_IMAGE}:${VLLM_SIMULATOR_TAG} 133 | fi 134 | 135 | # Load the ext_proc endpoint-picker image into the cluster 136 | if [ "${CONTAINER_RUNTIME}" == "podman" ]; then 137 | podman save ${IMAGE_REGISTRY}/${EPP_IMAGE}:${EPP_TAG} -o /dev/stdout | kind --name ${CLUSTER_NAME} load image-archive /dev/stdin 138 | else 139 | kind --name ${CLUSTER_NAME} load docker-image ${IMAGE_REGISTRY}/${EPP_IMAGE}:${EPP_TAG} 140 | fi 141 | # ------------------------------------------------------------------------------ 142 | # CRD Deployment (Gateway API + GIE) 143 | # ------------------------------------------------------------------------------ 144 | 145 | kustomize build deploy/components/crds-gateway-api | 146 | kubectl --context ${KUBE_CONTEXT} apply --server-side --force-conflicts -f - 147 | 148 | kustomize build deploy/components/crds-gie | 149 | kubectl --context ${KUBE_CONTEXT} apply --server-side --force-conflicts -f - 150 | 151 | kustomize build --enable-helm deploy/components/crds-istio | 152 | kubectl --context ${KUBE_CONTEXT} apply --server-side --force-conflicts -f - 153 | 154 | # ------------------------------------------------------------------------------ 155 | # 
Development Environment 156 | # ------------------------------------------------------------------------------ 157 | 158 | # Deploy the environment to the "default" namespace 159 | if [ "${PD_ENABLED}" != "\"true\"" ]; then 160 | KUSTOMIZE_DIR="deploy/environments/dev/kind-istio" 161 | else 162 | KUSTOMIZE_DIR="deploy/environments/dev/kind-istio-pd" 163 | fi 164 | kustomize build --enable-helm ${KUSTOMIZE_DIR} \ 165 | | envsubst \${POOL_NAME} | envsubst \${EPP_TAG} | envsubst \${VLLM_SIMULATOR_TAG} \ 166 | | envsubst \${PD_ENABLED} | envsubst \${PD_PROMPT_LEN_THRESHOLD} \ 167 | | envsubst \${ROUTING_SIDECAR_TAG} | envsubst \${VLLM_REPLICA_COUNT} \ 168 | | envsubst \${VLLM_REPLICA_COUNT_P} | envsubst \${VLLM_REPLICA_COUNT_D} \ 169 | | kubectl --context ${KUBE_CONTEXT} apply -f - 170 | 171 | # ------------------------------------------------------------------------------ 172 | # Check & Verify 173 | # ------------------------------------------------------------------------------ 174 | 175 | # Wait for all control-plane deployments to be ready 176 | kubectl --context ${KUBE_CONTEXT} -n llm-d-istio-system wait --for=condition=available --timeout=300s deployment --all 177 | 178 | # Wait for all deployments to be ready 179 | kubectl --context ${KUBE_CONTEXT} -n default wait --for=condition=available --timeout=300s deployment --all 180 | 181 | # Wait for the gateway to be ready 182 | kubectl --context ${KUBE_CONTEXT} wait gateway/inference-gateway --for=condition=Programmed --timeout=300s 183 | 184 | cat < 0) 50 | assert.NotEmpty(t, infResp.Choices[0].Text) 51 | } 52 | -------------------------------------------------------------------------------- /test/integration/suite_test.go: -------------------------------------------------------------------------------- 1 | //go:build integration_tests 2 | // +build integration_tests 3 | 4 | package integration_test 5 | 6 | import ( 7 | "fmt" 8 | "net/http" 9 | "os" 10 | "testing" 11 | 12 | "k8s.io/client-go/kubernetes" 13 | 
"k8s.io/client-go/kubernetes/scheme" 14 | "k8s.io/client-go/tools/clientcmd" 15 | gwinfv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" 16 | gwapiv1 "sigs.k8s.io/gateway-api/apis/v1" 17 | ) 18 | 19 | const ( 20 | gatewayURL = "http://localhost:30080" // TODO: make configurable 21 | ) 22 | 23 | var ( 24 | kube *kubernetes.Clientset 25 | ) 26 | 27 | func TestMain(m *testing.M) { 28 | if err := initializeKubernetesClient(); err != nil { 29 | fmt.Fprintf(os.Stderr, "failed to initialize kubernetes client: %v\n", err) 30 | os.Exit(1) 31 | } 32 | 33 | if err := initializeGateway(); err != nil { 34 | fmt.Fprintf(os.Stderr, "failed to initialize gateway: %v\n", err) 35 | os.Exit(1) 36 | } 37 | 38 | code := m.Run() 39 | os.Exit(code) 40 | } 41 | 42 | func initializeKubernetesClient() error { 43 | kubeConfigPath := os.Getenv("KUBECONFIG") 44 | if kubeConfigPath == "" { 45 | return fmt.Errorf("no KUBECONFIG set") 46 | } 47 | 48 | kubeConfig, err := clientcmd.BuildConfigFromFlags("", kubeConfigPath) 49 | if err != nil { 50 | return err 51 | } 52 | 53 | if err := gwapiv1.Install(scheme.Scheme); err != nil { 54 | return err 55 | } 56 | 57 | if err := gwinfv1alpha2.Install(scheme.Scheme); err != nil { 58 | return err 59 | } 60 | 61 | kube, err = kubernetes.NewForConfig(kubeConfig) 62 | if err != nil { 63 | return err 64 | } 65 | 66 | _, err = kube.ServerVersion() 67 | if err != nil { 68 | return fmt.Errorf("request to kubernetes api failed: %w", err) 69 | } 70 | 71 | return nil 72 | } 73 | 74 | func initializeGateway() (err error) { 75 | resp, err := http.Get(gatewayURL) 76 | if err != nil { 77 | return err 78 | } 79 | 80 | if resp.StatusCode != http.StatusNotFound { 81 | return fmt.Errorf("expected gateway to return 404, found: %s", resp.Status) 82 | } 83 | 84 | serverHeader := resp.Header.Get("Server") 85 | if serverHeader == "" { 86 | return fmt.Errorf(`expected gateway to return "istio-envoy" server header, found no value`) 87 | } 88 | if serverHeader != 
"istio-envoy" { 89 | return fmt.Errorf(`expected gateway to return "istio-envoy" server header, found: %s`, serverHeader) 90 | } 91 | 92 | return nil 93 | } 94 | --------------------------------------------------------------------------------