├── .github ├── actions │ ├── docker-build-and-push │ │ └── action.yml │ └── trivy-scan │ │ └── action.yml ├── dependabot.yml └── workflows │ ├── ci-pr-checks.yaml │ ├── ci-release.yaml │ └── md-link-check.yml ├── .gitignore ├── .golangci.yml ├── .lychee.toml ├── DEVELOPMENT.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── cmd └── epp │ ├── health.go │ └── main.go ├── deploy ├── components │ ├── crds-gateway-api │ │ └── kustomization.yaml │ ├── crds-gie │ │ └── kustomization.yaml │ ├── crds-istio │ │ ├── istio.yaml │ │ └── kustomization.yaml │ ├── inference-gateway │ │ ├── deployments.yaml │ │ ├── gateways.yaml │ │ ├── httproutes.yaml │ │ ├── inference-models.yaml │ │ ├── inference-pools.yaml │ │ ├── kustomization.yaml │ │ ├── rbac.yaml │ │ ├── service-accounts.yaml │ │ └── services.yaml │ ├── istio-control-plane │ │ ├── configmaps.yaml │ │ ├── deployments.yaml │ │ ├── hpa.yaml │ │ ├── kustomization.yaml │ │ ├── namespaces.yaml │ │ ├── policies.yaml │ │ ├── rbac.yaml │ │ ├── service-accounts.yaml │ │ ├── services.yaml │ │ ├── telemetry.yaml │ │ └── webhooks.yaml │ ├── vllm-sim-pd │ │ ├── deployments.yaml │ │ └── kustomization.yaml │ └── vllm-sim │ │ ├── deployments.yaml │ │ └── kustomization.yaml └── environments │ ├── dev │ ├── base-kind-istio │ │ ├── destination-rules.yaml │ │ ├── kustomization.yaml │ │ ├── patch-deployments.yaml │ │ ├── patch-gateways.yaml │ │ └── services.yaml │ ├── kind-istio-pd │ │ └── kustomization.yaml │ └── kind-istio │ │ └── kustomization.yaml │ └── openshift-base │ ├── common │ ├── patch-service.yaml │ ├── patch-statefulset.yaml │ ├── service.yaml │ └── statefulset.yaml │ ├── kustomization.yaml │ ├── openshift │ ├── patch-route.yaml │ └── route.yaml │ └── rbac │ ├── exec-rbac-role.yaml │ ├── exec-rbac-rolebinding.yaml │ ├── patch-rbac-role.yaml │ └── patch-rbac-rolebinding.yaml ├── docs ├── architecture.md ├── create_new_filter.md ├── dp.md └── images │ ├── architecture.png │ ├── dp_architecture.png │ └── plugability.png ├── 
go.mod ├── go.sum ├── hooks └── pre-commit ├── internal └── controller │ ├── runnable │ ├── grpc.go │ └── leader_election.go │ └── tls │ └── tls.go ├── pkg ├── config │ └── config.go └── scheduling │ ├── dual │ └── scheduler.go │ ├── pd │ ├── doc.go │ ├── scheduler.go │ └── scheduler_test.go │ └── plugins │ ├── filter │ ├── by_labels.go │ ├── passthrough.go │ ├── pd_role_filter.go │ └── random.go │ └── scorer │ ├── doc.go │ ├── kvcache-aware.go │ ├── load_aware_scorer.go │ ├── load_aware_scorer_test.go │ ├── passthrough.go │ ├── prefix_aware.go │ ├── prefix_aware_test.go │ ├── prefix_store.go │ ├── prefix_store_test.go │ ├── random.go │ ├── session_affinity.go │ └── utils.go ├── scripts ├── istio │ ├── generate-cp.sh │ ├── istio-cp.yaml │ └── manifest-splitter.py └── kind-dev-env.sh └── test └── integration ├── epp_test.go └── suite_test.go /.github/actions/docker-build-and-push/action.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build - ghcr 2 | description: Build image using buildx 3 | inputs: 4 | image-name: 5 | required: true 6 | description: Image name 7 | tag: 8 | required: true 9 | description: Image tag 10 | github-token: 11 | required: true 12 | description: GitHub token for login 13 | registry: 14 | required: true 15 | description: Container registry (e.g., ghcr.io/llm-d) 16 | runs: 17 | using: "composite" 18 | steps: 19 | - name: Set up Docker Buildx 20 | uses: docker/setup-buildx-action@v3 21 | 22 | - name: Login to GitHub Container Registry 23 | run: echo "${{ inputs.github-token }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin 24 | shell: bash 25 | 26 | - name: Print image info 27 | run: | 28 | echo "Image name: ${{ inputs.image-name }}" 29 | echo "Tag: ${{ inputs.tag }}" 30 | echo "Registry: ${{ inputs.registry }}" 31 | shell: bash 32 | 33 | - name: Build image and push 34 | run: | 35 | docker buildx build \ 36 | --platform linux/amd64 \ 37 | -t ${{ inputs.registry }}/${{ 
inputs.image-name }}:${{ inputs.tag }} \ 38 | --push . 39 | shell: bash 40 | -------------------------------------------------------------------------------- /.github/actions/trivy-scan/action.yml: -------------------------------------------------------------------------------- 1 | name: Trivy Scan 2 | description: Scan container image with Trivy 3 | inputs: 4 | image: 5 | required: true 6 | runs: 7 | using: "composite" 8 | steps: 9 | - name: Install Trivy 10 | run: | 11 | wget https://github.com/aquasecurity/trivy/releases/download/v0.44.1/trivy_0.44.1_Linux-64bit.deb 12 | sudo dpkg -i trivy_0.44.1_Linux-64bit.deb 13 | shell: bash 14 | 15 | 16 | - name: Scan image 17 | run: | 18 | trivy image --severity HIGH,CRITICAL --no-progress ${{ inputs.image }} 19 | shell: bash 20 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | 4 | # 1. Go module updates 5 | - package-ecosystem: "gomod" 6 | directory: "/" 7 | schedule: 8 | interval: "weekly" 9 | open-pull-requests-limit: 10 10 | commit-message: 11 | prefix: "deps(go)" 12 | labels: 13 | - "dependencies" 14 | - "release-note-none" 15 | groups: 16 | go-dependencies: 17 | patterns: 18 | - "*" 19 | kubernetes: 20 | patterns: 21 | - "k8s.io/*" 22 | - "sigs.k8s.io/*" 23 | 24 | # 2. GitHub Actions dependencies 25 | - package-ecosystem: "github-actions" 26 | directory: "/" 27 | schedule: 28 | interval: "weekly" 29 | labels: 30 | - "ci" 31 | - "dependencies" 32 | commit-message: 33 | prefix: "deps(actions)" 34 | 35 | # 3. 
Docker base image updates (e.g., for Dockerfile FROM lines) 36 | - package-ecosystem: "docker" 37 | directory: "/" 38 | schedule: 39 | interval: "weekly" 40 | labels: 41 | - "dependencies" 42 | - "docker" 43 | commit-message: 44 | prefix: "deps(docker)" 45 | -------------------------------------------------------------------------------- /.github/workflows/ci-pr-checks.yaml: -------------------------------------------------------------------------------- 1 | name: CI - PR Checks 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - dev 7 | - main 8 | 9 | jobs: 10 | lint-and-test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout source 14 | uses: actions/checkout@v4 15 | 16 | - name: Sanity check repo contents 17 | run: ls -la 18 | 19 | - name: Extract Go version from go.mod 20 | run: sed -En 's/^go (.*)$/GO_VERSION=\1/p' go.mod >> $GITHUB_ENV 21 | 22 | - name: Set up Go with cache 23 | uses: actions/setup-go@v5 24 | with: 25 | go-version: "${{ env.GO_VERSION }}" 26 | cache-dependency-path: ./go.sum 27 | 28 | - name: go mod tidy 29 | run: go mod tidy 30 | 31 | - name: Run lint checks 32 | uses: golangci/golangci-lint-action@v8 33 | with: 34 | version: 'v2.1.6' 35 | args: "--config=./.golangci.yml" 36 | 37 | - name: Run make test 38 | shell: bash 39 | run: | 40 | make test 41 | 42 | - name: Run make build 43 | shell: bash 44 | run: | 45 | make build 46 | -------------------------------------------------------------------------------- /.github/workflows/ci-release.yaml: -------------------------------------------------------------------------------- 1 | name: CI - Release - Docker Container Image 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' # Runs when a tag like v0.1.0 is pushed 7 | release: 8 | types: [published] # Also runs when a GitHub release is published 9 | 10 | jobs: 11 | docker-build-and-push: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout source 15 | uses: actions/checkout@v4 16 | 17 | - name: Set project name from repository 18 | id: 
version 19 | run: | 20 | repo="${GITHUB_REPOSITORY##*/}" 21 | echo "project_name=$repo" >> "$GITHUB_OUTPUT" 22 | 23 | - name: Print project name 24 | run: echo "Project is ${{ steps.version.outputs.project_name }}" 25 | 26 | - name: Determine tag name 27 | id: tag 28 | run: | 29 | if [[ "${GITHUB_EVENT_NAME}" == "release" ]]; then 30 | echo "tag=${GITHUB_REF##refs/tags/}" >> "$GITHUB_OUTPUT" 31 | elif [[ "${GITHUB_REF}" == refs/tags/* ]]; then 32 | echo "tag=${GITHUB_REF##refs/tags/}" >> "$GITHUB_OUTPUT" 33 | else 34 | echo "tag=latest" >> "$GITHUB_OUTPUT" 35 | fi 36 | shell: bash 37 | 38 | - name: Build and push image 39 | uses: ./.github/actions/docker-build-and-push 40 | with: 41 | tag: ${{ steps.tag.outputs.tag }} 42 | image-name: ${{ steps.version.outputs.project_name }} 43 | registry: ghcr.io/llm-d 44 | github-token: ${{ secrets.GHCR_TOKEN }} 45 | 46 | - name: Run Trivy scan 47 | uses: ./.github/actions/trivy-scan 48 | with: 49 | image: ghcr.io/llm-d/${{ steps.version.outputs.project_name }}:${{ steps.tag.outputs.tag }} 50 | -------------------------------------------------------------------------------- /.github/workflows/md-link-check.yml: -------------------------------------------------------------------------------- 1 | name: Markdown Link Checker 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | workflow_dispatch: 9 | 10 | jobs: 11 | lychee: 12 | name: Check Markdown Links 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v4 18 | 19 | - name: Install lychee v0.18.1 20 | run: | 21 | curl -Ls https://github.com/lycheeverse/lychee/releases/download/lychee-v0.18.1/lychee-x86_64-unknown-linux-gnu.tar.gz | tar xz 22 | sudo mv lychee /usr/local/bin 23 | 24 | - name: Run lychee on Markdown files with config 25 | run: | 26 | find . 
-name "*.md" -print0 | xargs -0 lychee --config .lychee.toml --verbose --no-progress 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | *.a 11 | main 12 | bin/ 13 | 14 | # Test binary, built with `go test -c` 15 | *.test 16 | 17 | # Output of the go coverage tool, specifically when used with LiteIDE 18 | *.out 19 | 20 | # Dependency directories (remove the comment below to include it) 21 | # vendor/ 22 | 23 | # Go workspace file 24 | go.work 25 | go.work.sum 26 | 27 | # Environment Files 28 | .DS_Store 29 | .env 30 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | 3 | run: 4 | timeout: 5m 5 | allow-parallel-runners: true 6 | 7 | formatters: 8 | enable: 9 | - goimports 10 | - gofmt 11 | 12 | linters: 13 | enable: 14 | - copyloopvar 15 | - dupword 16 | - durationcheck 17 | - fatcontext 18 | - ginkgolinter 19 | - gocritic 20 | - govet 21 | - loggercheck 22 | - misspell 23 | - perfsprint 24 | - revive 25 | - unconvert 26 | - makezero 27 | - errcheck 28 | - goconst 29 | - ineffassign 30 | - nakedret 31 | - prealloc 32 | - unparam 33 | - unused 34 | -------------------------------------------------------------------------------- /.lychee.toml: -------------------------------------------------------------------------------- 1 | # Ignore transient failures on gnu.org (it sometimes refuses connections) 2 | exclude = [ 3 | "^https://www.gnu.org/software/make/?$" 4 | ] 5 | 6 | # Timeout in seconds 7 | timeout = 20 8 | 
9 | # Retry failed links (helpful for flaky sites) 10 | retry_count = 3 11 | 12 | # Accept non-200 status codes (429: rate limits) 13 | accept = [200, 429] 14 | -------------------------------------------------------------------------------- /DEVELOPMENT.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | Documentation for developing the inference scheduler. 4 | 5 | ## Requirements 6 | 7 | - [Make] `v4`+ 8 | - [Golang] `v1.24`+ 9 | - [Docker] (or [Podman]) 10 | - [Kubernetes in Docker (KIND)] 11 | 12 | [Make]:https://www.gnu.org/software/make/ 13 | [Golang]:https://go.dev/ 14 | [Docker]:https://www.docker.com/ 15 | [Podman]:https://podman.io/ 16 | [Kubernetes in Docker (KIND)]:https://github.com/kubernetes-sigs/kind 17 | 18 | ## Kind Development Environment 19 | 20 | > **WARNING**: This currently requires you to have manually built the vllm 21 | > simulator separately on your local system. In a future iteration this will 22 | > be handled automatically and will not be required. The tag for the simulator 23 | > currently needs to be `0.0.4`. 24 | 25 | You can deploy the current scheduler with a Gateway API implementation into a 26 | [Kubernetes in Docker (KIND)] cluster locally with the following: 27 | 28 | ```console 29 | make env-dev-kind 30 | ``` 31 | 32 | This will create a `kind` cluster (or re-use an existing one) using the system's 33 | local container runtime and deploy the development stack into the `default` 34 | namespace. 35 | 36 | There are several ways to access the gateway: 37 | 38 | **Port forward**: 39 | 40 | ```console 41 | $ kubectl --context llm-d-inference-scheduler-dev port-forward service/inference-gateway 8080:80 42 | ``` 43 | 44 | **NodePort** 45 | 46 | ```console 47 | # Determine the k8s node address 48 | $ kubectl --context llm-d-inference-scheduler-dev get node -o yaml | grep address 49 | # The service is accessible over port 80 of the worker IP address. 
50 | ``` 51 | 52 | **LoadBalancer** 53 | 54 | ```console 55 | # Install and run cloud-provider-kind: 56 | $ go install sigs.k8s.io/cloud-provider-kind@latest && cloud-provider-kind & 57 | $ kubectl --context llm-d-inference-scheduler-dev get service inference-gateway 58 | # Wait for the LoadBalancer External-IP to become available. The service is accessible over port 80. 59 | ``` 60 | 61 | You can now make requests matching the IP:port of one of the access modes above: 62 | 63 | ```console 64 | $ curl -s -w '\n' http:///v1/completions -H 'Content-Type: application/json' -d '{"model":"food-review","prompt":"hi","max_tokens":10,"temperature":0}' | jq 65 | ``` 66 | 67 | By default the created inference gateway can be accessed on port 30080. This can 68 | be overridden to any free port in the range of 30000 to 32767, by running the above 69 | command as follows: 70 | 71 | ```console 72 | KIND_GATEWAY_HOST_PORT= make env-dev-kind 73 | ``` 74 | 75 | **Where:** <selected-port> is the port on your local machine you want to use to 76 | access the inference gateway. 77 | 78 | > **NOTE**: If you require significant customization of this environment beyond 79 | > what the standard deployment provides, you can use the `deploy/components` 80 | > with `kustomize` to build your own highly customized environment. You can use 81 | > the `deploy/environments/kind` deployment as a reference for your own. 82 | 83 | [Kubernetes in Docker (KIND)]:https://github.com/kubernetes-sigs/kind 84 | 85 | ### Development Cycle 86 | 87 | To test your changes to `llm-d-inference-scheduler` in this environment, make your changes locally 88 | and then re-run the deployment: 89 | 90 | ```console 91 | make env-dev-kind 92 | ``` 93 | 94 | This will build images with your recent changes and load the new images to the 95 | cluster. By default the image tag will be `dev`. It will also load llm-d-inference-sim, using a tag of `dev` by default. 
96 | 97 | **NOTE:** The built image tag can be specified via the `EPP_TAG` environment variable so it is used in the deployment. For example: 98 | 99 | ```console 100 | EPP_TAG=0.0.4 make env-dev-kind 101 | ``` 102 | 103 | **NOTE:** If you want to load a different tag of llm-d-inference-sim, you can use the environment variable `VLLM_SIMULATOR_TAG` to specify it. 104 | 105 | **NOTE**: If you are working on macOS with Apple Silicon, it is required to add 106 | the environment variable `GOOS=linux`. 107 | 108 | Then do a rollout of the EPP `Deployment` so that your recent changes are 109 | reflected: 110 | 111 | ```console 112 | kubectl rollout restart deployment endpoint-picker 113 | ``` 114 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build Stage: using Go 1.24.1 image 2 | FROM quay.io/projectquay/golang:1.24 AS builder 3 | ARG TARGETOS 4 | ARG TARGETARCH 5 | 6 | # Install build tools 7 | RUN dnf install -y gcc-c++ libstdc++ libstdc++-devel clang && dnf clean all 8 | 9 | WORKDIR /workspace 10 | 11 | # Copy the Go Modules manifests 12 | COPY go.mod go.mod 13 | COPY go.sum go.sum 14 | 15 | # Copy the go source 16 | COPY cmd/ cmd/ 17 | COPY pkg/ pkg/ 18 | COPY internal/ internal/ 19 | 20 | # HuggingFace tokenizer bindings 21 | RUN mkdir -p lib 22 | RUN curl -L https://github.com/daulet/tokenizers/releases/download/v1.20.2/libtokenizers.${TARGETOS}-${TARGETARCH}.tar.gz | tar -xz -C lib 23 | RUN ranlib lib/*.a 24 | 25 | # Build 26 | # the GOARCH does not have a default value to allow the binary to be built according to the host where the command 27 | # was called. For example, if we call make image-build in a local env which has the Apple Silicon M1 SO 28 | # the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. 
Therefore, 29 | # by leaving it empty we can ensure that the container and binary shipped on it will have the same platform. 30 | ENV CGO_ENABLED=1 31 | ENV GOOS=${TARGETOS:-linux} 32 | ENV GOARCH=${TARGETARCH} 33 | RUN go build -a -o bin/epp -ldflags="-extldflags '-L$(pwd)/lib'" cmd/epp/main.go cmd/epp/health.go 34 | 35 | # Use distroless as minimal base image to package the manager binary 36 | # Refer to https://github.com/GoogleContainerTools/distroless for more details 37 | FROM registry.access.redhat.com/ubi9/ubi:latest 38 | WORKDIR / 39 | COPY --from=builder /workspace/bin/epp /app/epp 40 | USER 65532:65532 41 | 42 | # expose gRPC, health and metrics ports 43 | EXPOSE 9002 44 | EXPOSE 9003 45 | EXPOSE 9090 46 | 47 | ENTRYPOINT ["/app/epp"] 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /usr/bin/env bash 2 | 3 | # Defaults 4 | TARGETOS ?= $(shell go env GOOS) 5 | TARGETARCH ?= $(shell go env GOARCH) 6 | PROJECT_NAME ?= llm-d-inference-scheduler 7 | IMAGE_REGISTRY ?= ghcr.io/llm-d 8 | IMAGE_TAG_BASE ?= $(IMAGE_REGISTRY)/$(PROJECT_NAME) 9 | EPP_TAG ?= dev 10 | IMG = $(IMAGE_TAG_BASE):$(EPP_TAG) 11 | NAMESPACE ?= hc4ai-operator 12 | 13 | CONTAINER_TOOL := $(shell { command -v docker >/dev/null 2>&1 && echo docker; } || { command -v podman >/dev/null 2>&1 && echo podman; } || echo "") 14 | BUILDER := $(shell command -v buildah >/dev/null 2>&1 && echo buildah || echo $(CONTAINER_TOOL)) 15 | PLATFORMS ?= linux/amd64 # linux/arm64 # linux/s390x,linux/ppc64le 16 | 17 | # go source files 18 | SRC = $(shell find . 
-type f -name '*.go') 19 | 20 | .PHONY: help 21 | help: ## Print help 22 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 23 | 24 | ##@ Tokenizer & Linking 25 | 26 | LDFLAGS ?= -extldflags '-L$(shell pwd)/lib' 27 | CGO_ENABLED=1 28 | TOKENIZER_LIB = lib/libtokenizers.a 29 | 30 | .PHONY: download-tokenizer 31 | download-tokenizer: $(TOKENIZER_LIB) 32 | $(TOKENIZER_LIB): 33 | ## Download the HuggingFace tokenizer bindings. 34 | @echo "Downloading HuggingFace tokenizer bindings..." 35 | mkdir -p lib 36 | curl -L https://github.com/daulet/tokenizers/releases/download/v1.20.2/libtokenizers.$(TARGETOS)-$(TARGETARCH).tar.gz | tar -xz -C lib 37 | ranlib lib/*.a 38 | 39 | ##@ Development 40 | 41 | .PHONY: clean 42 | clean: 43 | go clean -testcache -cache 44 | rm -f $(TOKENIZER_LIB) 45 | rmdir lib 46 | 47 | .PHONY: format 48 | format: ## Format Go source files 49 | @printf "\033[33;1m==== Running gofmt ====\033[0m\n" 50 | @gofmt -l -w $(SRC) 51 | 52 | .PHONY: test 53 | test: test-unit 54 | 55 | .PHONY: test-unit 56 | test-unit: download-tokenizer 57 | @printf "\033[33;1m==== Running Unit Tests ====\033[0m\n" 58 | go test -ldflags="$(LDFLAGS)" -v ./... 59 | 60 | .PHONY: test-integration 61 | test-integration: download-tokenizer 62 | @printf "\033[33;1m==== Running Integration Tests ====\033[0m\n" 63 | go test -ldflags="$(LDFLAGS)" -v -tags=integration_tests ./test/integration/ 64 | 65 | .PHONY: post-deploy-test 66 | post-deploy-test: ## Run post deployment tests 67 | echo Success! 68 | @echo "Post-deployment tests passed." 
69 | 70 | .PHONY: lint 71 | lint: check-golangci-lint ## Run lint 72 | @printf "\033[33;1m==== Running linting ====\033[0m\n" 73 | golangci-lint run 74 | 75 | ##@ Build 76 | 77 | .PHONY: build 78 | build: check-go download-tokenizer ## Build the EPP binary 79 | @printf "\033[33;1m==== Building ====\033[0m\n" 80 | go build -ldflags="$(LDFLAGS)" -o bin/epp cmd/epp/main.go cmd/epp/health.go 81 | 82 | ##@ Container Build/Push 83 | 84 | .PHONY: image-build 85 | image-build: check-container-tool ## Build Docker image using $(CONTAINER_TOOL) 86 | @printf "\033[33;1m==== Building Docker image $(IMG) ====\033[0m\n" 87 | $(CONTAINER_TOOL) build \ 88 | --platform $(TARGETOS)/$(TARGETARCH) \ 89 | --build-arg TARGETOS=$(TARGETOS) \ 90 | --build-arg TARGETARCH=$(TARGETARCH) \ 91 | -t $(IMG) . 92 | 93 | .PHONY: image-push 94 | image-push: check-container-tool ## Push Docker image $(IMG) to registry 95 | @printf "\033[33;1m==== Pushing Docker image $(IMG) ====\033[0m\n" 96 | $(CONTAINER_TOOL) push $(IMG) 97 | 98 | ##@ Install/Uninstall Targets 99 | 100 | # Default install/uninstall (Docker) 101 | install: install-docker ## Default install using Docker 102 | @echo "Default Docker install complete." 103 | 104 | uninstall: uninstall-docker ## Default uninstall using Docker 105 | @echo "Default Docker uninstall complete." 106 | 107 | ### Docker Targets 108 | 109 | .PHONY: install-docker 110 | install-docker: check-container-tool ## Install app using $(CONTAINER_TOOL) 111 | @echo "Starting container with $(CONTAINER_TOOL)..." 112 | $(CONTAINER_TOOL) run -d --name $(PROJECT_NAME)-container $(IMG) 113 | @echo "$(CONTAINER_TOOL) installation complete."
114 | @echo "To use $(PROJECT_NAME), run:" 115 | @echo "alias $(PROJECT_NAME)='$(CONTAINER_TOOL) exec -it $(PROJECT_NAME)-container /app/$(PROJECT_NAME)'" 116 | 117 | .PHONY: uninstall-docker 118 | uninstall-docker: check-container-tool ## Uninstall app from $(CONTAINER_TOOL) 119 | @echo "Stopping and removing container in $(CONTAINER_TOOL)..." 120 | -$(CONTAINER_TOOL) stop $(PROJECT_NAME)-container && $(CONTAINER_TOOL) rm $(PROJECT_NAME)-container 121 | @echo "$(CONTAINER_TOOL) uninstallation complete. Remove alias if set: unalias $(PROJECT_NAME)" 122 | 123 | ### Kubernetes Targets (kubectl) 124 | 125 | .PHONY: install-k8s 126 | install-k8s: check-kubectl check-kustomize check-envsubst ## Install on Kubernetes 127 | export PROJECT_NAME=${PROJECT_NAME} 128 | export NAMESPACE=${NAMESPACE} 129 | @echo "Creating namespace (if needed) and setting context to $(NAMESPACE)..." 130 | kubectl create namespace $(NAMESPACE) 2>/dev/null || true 131 | kubectl config set-context --current --namespace=$(NAMESPACE) 132 | @echo "Deploying resources from deploy/ ..." 133 | # Build the kustomization from deploy, substitute variables, and apply the YAML 134 | kustomize build deploy/environments/openshift-base | envsubst | kubectl apply -f - 135 | @echo "Waiting for pod to become ready..." 136 | sleep 5 137 | @POD=$$(kubectl get pod -l app=$(PROJECT_NAME)-statefulset -o jsonpath='{.items[0].metadata.name}'); \ 138 | echo "Kubernetes installation complete."; \ 139 | echo "To use the app, run:"; \ 140 | echo "alias $(PROJECT_NAME)='kubectl exec -n $(NAMESPACE) -it $$POD -- /app/$(PROJECT_NAME)'" 141 | 142 | .PHONY: uninstall-k8s 143 | uninstall-k8s: check-kubectl check-kustomize check-envsubst ## Uninstall from Kubernetes 144 | export PROJECT_NAME=${PROJECT_NAME} 145 | export NAMESPACE=${NAMESPACE} 146 | @echo "Removing resources from Kubernetes..." 
147 | kustomize build deploy/environments/openshift-base | envsubst | kubectl delete --force -f - || true 148 | POD=$$(kubectl get pod -l app=$(PROJECT_NAME)-statefulset -o jsonpath='{.items[0].metadata.name}'); \ 149 | echo "Deleting pod: $$POD"; \ 150 | kubectl delete pod "$$POD" --force --grace-period=0 || true; \ 151 | echo "Kubernetes uninstallation complete. Remove alias if set: unalias $(PROJECT_NAME)" 152 | 153 | ### OpenShift Targets (oc) 154 | 155 | .PHONY: install-openshift 156 | install-openshift: check-kubectl check-kustomize check-envsubst ## Install on OpenShift 157 | @echo $$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION 158 | @echo "Creating namespace $(NAMESPACE)..." 159 | kubectl create namespace $(NAMESPACE) 2>/dev/null || true 160 | @echo "Deploying common resources from deploy/ ..." 161 | # Build and substitute the base manifests from deploy, then apply them 162 | kustomize build deploy/environments/openshift-base | envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' | kubectl apply -n $(NAMESPACE) -f - 163 | @echo "Waiting for pod to become ready..." 164 | sleep 5 165 | @POD=$$(kubectl get pod -l app=$(PROJECT_NAME)-statefulset -n $(NAMESPACE) -o jsonpath='{.items[0].metadata.name}'); \ 166 | echo "OpenShift installation complete."; \ 167 | echo "To use the app, run:"; \ 168 | echo "alias $(PROJECT_NAME)='kubectl exec -n $(NAMESPACE) -it $$POD -- /app/$(PROJECT_NAME)'" 169 | 170 | .PHONY: uninstall-openshift 171 | uninstall-openshift: check-kubectl check-kustomize check-envsubst ## Uninstall from OpenShift 172 | @echo "Removing resources from OpenShift..." 
173 | kustomize build deploy/environments/openshift-base | envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' | kubectl delete --force -f - || true 174 | # @if kubectl api-resources --api-group=route.openshift.io | grep -q Route; then \ 175 | # envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' < deploy/openshift/route.yaml | kubectl delete --force -f - || true; \ 176 | # fi 177 | @POD=$$(kubectl get pod -l app=$(PROJECT_NAME)-statefulset -n $(NAMESPACE) -o jsonpath='{.items[0].metadata.name}'); \ 178 | echo "Deleting pod: $$POD"; \ 179 | kubectl delete pod "$$POD" --force --grace-period=0 || true; \ 180 | echo "OpenShift uninstallation complete. Remove alias if set: unalias $(PROJECT_NAME)" 181 | 182 | ### RBAC Targets (using kustomize and envsubst) 183 | 184 | .PHONY: install-rbac 185 | install-rbac: check-kubectl check-kustomize check-envsubst ## Install RBAC 186 | @echo "Applying RBAC configuration from deploy/environments/openshift-base/rbac..." 187 | kustomize build deploy/environments/openshift-base/rbac | envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' | kubectl apply -f - 188 | 189 | .PHONY: uninstall-rbac 190 | uninstall-rbac: check-kubectl check-kustomize check-envsubst ## Uninstall RBAC 191 | @echo "Removing RBAC configuration from deploy/environments/openshift-base/rbac..."
192 | kustomize build deploy/environments/openshift-base/rbac | envsubst '$$PROJECT_NAME $$NAMESPACE $$IMAGE_TAG_BASE $$VERSION' | kubectl delete -f - || true 193 | 194 | 195 | ##@ Version Extraction 196 | .PHONY: version extract-version-info 197 | 198 | .PHONY: env 199 | env: ## Print environment variables 200 | @echo "IMAGE_TAG_BASE=$(IMAGE_TAG_BASE)" 201 | @echo "IMG=$(IMG)" 202 | @echo "CONTAINER_TOOL=$(CONTAINER_TOOL)" 203 | 204 | 205 | ##@ Tools 206 | 207 | .PHONY: check-tools 208 | check-tools: \ 209 | check-go \ 210 | check-ginkgo \ 211 | check-golangci-lint \ 212 | check-kustomize \ 213 | check-envsubst \ 214 | check-container-tool \ 215 | check-kubectl \ 216 | check-buildah 217 | @echo "✅ All required tools are installed." 218 | 219 | .PHONY: check-go 220 | check-go: 221 | @command -v go >/dev/null 2>&1 || { \ 222 | echo "❌ Go is not installed. Install it from https://golang.org/dl/"; exit 1; } 223 | 224 | .PHONY: check-ginkgo 225 | check-ginkgo: 226 | @command -v ginkgo >/dev/null 2>&1 || { \ 227 | echo "❌ ginkgo is not installed. Install with: go install github.com/onsi/ginkgo/v2/ginkgo@latest"; exit 1; } 228 | 229 | .PHONY: check-golangci-lint 230 | check-golangci-lint: 231 | @command -v golangci-lint >/dev/null 2>&1 || { \ 232 | echo "❌ golangci-lint is not installed. Install from https://golangci-lint.run/usage/install/"; exit 1; } 233 | 234 | .PHONY: check-kustomize 235 | check-kustomize: 236 | @command -v kustomize >/dev/null 2>&1 || { \ 237 | echo "❌ kustomize is not installed. Install it from https://kubectl.docs.kubernetes.io/installation/kustomize/"; exit 1; } 238 | 239 | .PHONY: check-envsubst 240 | check-envsubst: 241 | @command -v envsubst >/dev/null 2>&1 || { \ 242 | echo "❌ envsubst is not installed. 
It is part of gettext."; \ 243 | echo "🔧 Try: sudo apt install gettext OR brew install gettext"; exit 1; } 244 | 245 | .PHONY: check-container-tool 246 | check-container-tool: 247 | @command -v $(CONTAINER_TOOL) >/dev/null 2>&1 || { \ 248 | echo "❌ $(CONTAINER_TOOL) is not installed."; \ 249 | echo "🔧 Try: sudo apt install $(CONTAINER_TOOL) OR brew install $(CONTAINER_TOOL)"; exit 1; } 250 | 251 | .PHONY: check-kubectl 252 | check-kubectl: 253 | @command -v kubectl >/dev/null 2>&1 || { \ 254 | echo "❌ kubectl is not installed. Install it from https://kubernetes.io/docs/tasks/tools/"; exit 1; } 255 | 256 | .PHONY: check-builder 257 | check-builder: 258 | @if [ -z "$(BUILDER)" ]; then \ 259 | echo "❌ No container builder tool (buildah, docker, or podman) found."; \ 260 | exit 1; \ 261 | else \ 262 | echo "✅ Using builder: $(BUILDER)"; \ 263 | fi 264 | 265 | ##@ Alias checking 266 | .PHONY: check-alias 267 | check-alias: check-container-tool 268 | @echo "🔍 Checking alias functionality for container '$(PROJECT_NAME)-container'..." 269 | @if ! 
$(CONTAINER_TOOL) exec $(PROJECT_NAME)-container /app/$(PROJECT_NAME) --help >/dev/null 2>&1; then \ 270 | echo "⚠️ The container '$(PROJECT_NAME)-container' is running, but the alias might not work."; \ 271 | echo "🔧 Try: $(CONTAINER_TOOL) exec -it $(PROJECT_NAME)-container /app/$(PROJECT_NAME)"; \ 272 | else \ 273 | echo "✅ Alias is likely to work: alias $(PROJECT_NAME)='$(CONTAINER_TOOL) exec -it $(PROJECT_NAME)-container /app/$(PROJECT_NAME)'"; \ 274 | fi 275 | 276 | .PHONY: print-namespace 277 | print-namespace: ## Print the current namespace 278 | @echo "$(NAMESPACE)" 279 | 280 | .PHONY: print-project-name 281 | print-project-name: ## Print the current project name 282 | @echo "$(PROJECT_NAME)" 283 | 284 | .PHONY: install-hooks 285 | install-hooks: ## Install git hooks 286 | git config core.hooksPath hooks 287 | 288 | ##@ Dev Environments 289 | 290 | KIND_CLUSTER_NAME ?= llm-d-inference-scheduler-dev 291 | KIND_GATEWAY_HOST_PORT ?= 30080 292 | 293 | .PHONY: env-dev-kind 294 | env-dev-kind: image-build 295 | CLUSTER_NAME=$(KIND_CLUSTER_NAME) \ 296 | GATEWAY_HOST_PORT=$(KIND_GATEWAY_HOST_PORT) \ 297 | IMAGE_REGISTRY=$(IMAGE_REGISTRY) \ 298 | EPP_TAG=$(EPP_TAG) \ 299 | ./scripts/kind-dev-env.sh 300 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Inference Scheduler 2 | 3 | This scheduler makes optimized routing decisions for inference requests to 4 | the llm-d inference framework. 5 | 6 | ## About 7 | 8 | This provides an "Endpoint Picker (EPP)" component to the llm-d inference 9 | framework which schedules incoming inference requests to the platform via a 10 | [Kubernetes] Gateway according to scheduler plugins (for more 11 | details, see the [Architecture Documentation]). 12 | 13 | The EPP extends the [Gateway API Inference Extension (GIE)] project, 14 | which provides the API resources and machinery for scheduling. 
We add some 15 | custom features that are specific to llm-d here, such as [P/D Disaggregation]. 16 | 17 | A compatible [Gateway API] implementation is used as the Gateway. The Gateway 18 | API implementation must utilize [Envoy] and support [ext-proc], as this is the 19 | callback mechanism the EPP relies on to make routing decisions to model serving 20 | workloads currently. 21 | 22 | [Kubernetes]:https://kubernetes.io 23 | [Architecture Documentation]:docs/architecture.md 24 | [Gateway API Inference Extension (GIE)]:https://github.com/kubernetes-sigs/gateway-api-inference-extension 25 | [P/D Disaggregation]:docs/dp.md 26 | [Gateway API]:https://github.com/kubernetes-sigs/gateway-api 27 | [Envoy]:https://github.com/envoyproxy/envoy 28 | [ext-proc]:https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter 29 | 30 | ## Contributing 31 | 32 | Contributions are welcome! 33 | 34 | For large changes please [create an issue] first describing the change so the 35 | maintainers can do an assessment, and work on the details with you. See 36 | [DEVELOPMENT.md](DEVELOPMENT.md) for details on how to work with the codebase. 37 | 38 | Note that in general features should go to the upstream [Gateway API Inference 39 | Extension (GIE)] project _first_ if applicable. The GIE is a major dependency of 40 | ours, and where most _general purpose_ inference features live. If you have 41 | something that you feel is of general purpose or use, it probably should go to the 42 | GIE. If you have something that's _llm-d specific_ then it should go here. If 43 | you're not sure whether your feature belongs here or in the GIE, feel free to 44 | create a [discussion] or ask on [Slack].
45 | 46 | [create an issue]:https://github.com/llm-d/llm-d-inference-scheduler/issues/new 47 | [Gateway API Inference Extension (GIE)]:https://github.com/kubernetes-sigs/gateway-api-inference-extension 48 | [discussion]:https://github.com/llm-d/llm-d-inference-scheduler/discussions/new?category=q-a 49 | [Slack]:https://llm-d.slack.com/ 50 | -------------------------------------------------------------------------------- /cmd/epp/health.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | /** 18 | * This file is adapted from Gateway API Inference Extension 19 | * Original source: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/cmd/epp/health.go 20 | * Licensed under the Apache License, Version 2.0 21 | */ 22 | 23 | package main 24 | 25 | import ( 26 | "context" 27 | 28 | extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" 29 | "github.com/go-logr/logr" 30 | "google.golang.org/grpc/codes" 31 | healthPb "google.golang.org/grpc/health/grpc_health_v1" 32 | "google.golang.org/grpc/status" 33 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" 34 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 35 | ) 36 | 37 | type healthServer struct { 38 | logger logr.Logger 39 | datastore datastore.Datastore 40 | } 41 | 42 | func (s *healthServer) Check(_ context.Context, in *healthPb.HealthCheckRequest) (*healthPb.HealthCheckResponse, error) { 43 | // TODO: we're accepting ANY service name for now as a temporary hack in alignment with 44 | // upstream issues. 
See https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/788 45 | // if in.Service != extProcPb.ExternalProcessor_ServiceDesc.ServiceName { 46 | // s.logger.V(logutil.DEFAULT).Info("gRPC health check requested unknown service", "available-services", []string{extProcPb.ExternalProcessor_ServiceDesc.ServiceName}, "requested-service", in.Service) 47 | // return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_SERVICE_UNKNOWN}, nil 48 | // } 49 | 50 | if !s.datastore.PoolHasSynced() { 51 | s.logger.V(logutil.DEFAULT).Info("gRPC health check not serving", "service", in.Service) 52 | return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_NOT_SERVING}, nil 53 | } 54 | 55 | s.logger.V(logutil.TRACE).Info("gRPC health check serving", "service", in.Service) 56 | return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_SERVING}, nil 57 | } 58 | 59 | func (s *healthServer) List(ctx context.Context, _ *healthPb.HealthListRequest) (*healthPb.HealthListResponse, error) { 60 | // currently only the ext_proc service is provided 61 | serviceHealthResponse, err := s.Check(ctx, &healthPb.HealthCheckRequest{Service: extProcPb.ExternalProcessor_ServiceDesc.ServiceName}) 62 | if err != nil { 63 | return nil, err 64 | } 65 | 66 | return &healthPb.HealthListResponse{ 67 | Statuses: map[string]*healthPb.HealthCheckResponse{ 68 | extProcPb.ExternalProcessor_ServiceDesc.ServiceName: serviceHealthResponse, 69 | }, 70 | }, nil 71 | } 72 | 73 | func (s *healthServer) Watch(_ *healthPb.HealthCheckRequest, _ healthPb.Health_WatchServer) error { 74 | return status.Error(codes.Unimplemented, "Watch is not implemented") 75 | } 76 | -------------------------------------------------------------------------------- /cmd/epp/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | /** 18 | * This file is adapted from Gateway API Inference Extension 19 | * Original source: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/cmd/epp/main.go 20 | * Licensed under the Apache License, Version 2.0 21 | */ 22 | 23 | // Package main contains the "Endpoint Picker (EPP)" program for scheduling 24 | // inference requests. 25 | package main 26 | 27 | import ( 28 | "flag" 29 | "fmt" 30 | "net" 31 | "net/http" 32 | "os" 33 | "strconv" 34 | 35 | "github.com/go-logr/logr" 36 | "github.com/prometheus/client_golang/prometheus/promhttp" 37 | uberzap "go.uber.org/zap" 38 | "go.uber.org/zap/zapcore" 39 | "google.golang.org/grpc" 40 | healthPb "google.golang.org/grpc/health/grpc_health_v1" 41 | "k8s.io/apimachinery/pkg/types" 42 | "k8s.io/client-go/rest" 43 | "k8s.io/component-base/metrics/legacyregistry" 44 | ctrl "sigs.k8s.io/controller-runtime" 45 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 46 | "sigs.k8s.io/controller-runtime/pkg/manager" 47 | "sigs.k8s.io/controller-runtime/pkg/metrics/filters" 48 | backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" 49 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" 50 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" 51 | runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" 52 | 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 53 | 54 | "github.com/llm-d/llm-d-inference-scheduler/internal/controller/runnable" 55 | "github.com/llm-d/llm-d-inference-scheduler/pkg/config" 56 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/pd" 57 | ) 58 | 59 | const ( 60 | defaultMetricsEndpoint = "/metrics" 61 | ) 62 | 63 | var ( 64 | grpcPort = flag.Int( 65 | "grpcPort", 66 | runserver.DefaultGrpcPort, 67 | "The gRPC port used for communicating with Envoy proxy") 68 | grpcHealthPort = flag.Int( 69 | "grpcHealthPort", 70 | 9003, 71 | "The port used for gRPC liveness and readiness probes") 72 | metricsPort = flag.Int( 73 | "metricsPort", 9090, "The metrics port") 74 | destinationEndpointHintKey = flag.String( 75 | "destinationEndpointHintKey", 76 | runserver.DefaultDestinationEndpointHintKey, 77 | "Header and response metadata key used by Envoy to route to the appropriate pod. This must match Envoy configuration.") 78 | destinationEndpointHintMetadataNamespace = flag.String( 79 | "DestinationEndpointHintMetadataNamespace", 80 | runserver.DefaultDestinationEndpointHintMetadataNamespace, 81 | "The key for the outer namespace struct in the metadata field of the extproc response that is used to wrap the"+ 82 | "target endpoint. 
If not set, then an outer namespace struct should not be created.") 83 | poolName = flag.String( 84 | "poolName", 85 | runserver.DefaultPoolName, 86 | "Name of the InferencePool this Endpoint Picker is associated with.") 87 | poolNamespace = flag.String( 88 | "poolNamespace", 89 | runserver.DefaultPoolNamespace, 90 | "Namespace of the InferencePool this Endpoint Picker is associated with.") 91 | refreshMetricsInterval = flag.Duration( 92 | "refreshMetricsInterval", 93 | runserver.DefaultRefreshMetricsInterval, 94 | "interval to refresh metrics") 95 | refreshPrometheusMetricsInterval = flag.Duration( 96 | "refreshPrometheusMetricsInterval", 97 | runserver.DefaultRefreshPrometheusMetricsInterval, 98 | "interval to flush prometheus metrics") 99 | logVerbosity = flag.Int("v", logging.DEFAULT, "number for the log level verbosity") 100 | secureServing = flag.Bool( 101 | "secureServing", runserver.DefaultSecureServing, "Enables secure serving. Defaults to true.") 102 | certPath = flag.String( 103 | "certPath", "", "The path to the certificate for secure serving. The certificate and private key files "+ 104 | "are assumed to be named tls.crt and tls.key, respectively. 
If not set, and secureServing is enabled, "+ 105 | "then a self-signed certificate is used.") 106 | // metric flags 107 | totalQueuedRequestsMetric = flag.String("totalQueuedRequestsMetric", 108 | "vllm:num_requests_waiting", 109 | "Prometheus metric for the number of queued requests.") 110 | kvCacheUsagePercentageMetric = flag.String("kvCacheUsagePercentageMetric", 111 | "vllm:gpu_cache_usage_perc", 112 | "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).") 113 | // LoRA metrics 114 | loraInfoMetric = flag.String("loraInfoMetric", 115 | "vllm:lora_requests_info", 116 | "Prometheus metric for the LoRA info metrics (must be in vLLM label format).") 117 | 118 | setupLog = ctrl.Log.WithName("setup") 119 | ) 120 | 121 | func main() { 122 | if err := run(); err != nil { 123 | os.Exit(1) 124 | } 125 | } 126 | 127 | func run() error { 128 | opts := zap.Options{ 129 | Development: true, 130 | } 131 | opts.BindFlags(flag.CommandLine) 132 | flag.Parse() 133 | initLogging(&opts) 134 | 135 | // Validate flags 136 | if err := validateFlags(); err != nil { 137 | setupLog.Error(err, "Failed to validate flags") 138 | return err 139 | } 140 | 141 | // Print all flag values 142 | flags := make(map[string]any) 143 | flag.VisitAll(func(f *flag.Flag) { 144 | flags[f.Name] = f.Value 145 | }) 146 | setupLog.Info("Flags processed", "flags", flags) 147 | 148 | // Init runtime. 149 | cfg, err := ctrl.GetConfig() 150 | if err != nil { 151 | setupLog.Error(err, "Failed to get rest config") 152 | return err 153 | } 154 | 155 | poolNamespacedName := types.NamespacedName{ 156 | Name: *poolName, 157 | Namespace: *poolNamespace, 158 | } 159 | mgr, err := runserver.NewDefaultManager(poolNamespacedName, cfg) 160 | if err != nil { 161 | setupLog.Error(err, "Failed to create controller manager") 162 | return err 163 | } 164 | 165 | // Set up mapper for metric scraping. 
166 | mapping, err := backendmetrics.NewMetricMapping( 167 | *totalQueuedRequestsMetric, 168 | *kvCacheUsagePercentageMetric, 169 | *loraInfoMetric, 170 | ) 171 | if err != nil { 172 | setupLog.Error(err, "Failed to create metric mapping from flags.") 173 | return err 174 | } 175 | verifyMetricMapping(*mapping, setupLog) 176 | 177 | pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{MetricMapping: mapping}, *refreshMetricsInterval) 178 | // Setup runner. 179 | ctx := ctrl.SetupSignalHandler() 180 | 181 | schedCfg := config.NewConfig(setupLog) 182 | schedCfg.LoadConfig() 183 | 184 | datastore := datastore.NewDatastore(ctx, pmf) 185 | scheduler, err := pd.NewScheduler(ctx, schedCfg, datastore) 186 | if err != nil { 187 | setupLog.Error(err, "Failed to create PD scheduler") 188 | return err 189 | } 190 | 191 | serverRunner := &runserver.ExtProcServerRunner{ 192 | GrpcPort: *grpcPort, 193 | DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace, 194 | DestinationEndpointHintKey: *destinationEndpointHintKey, 195 | PoolNamespacedName: poolNamespacedName, 196 | Datastore: datastore, 197 | SecureServing: *secureServing, 198 | CertPath: *certPath, 199 | RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, 200 | Scheduler: scheduler, 201 | } 202 | if err := serverRunner.SetupWithManager(ctx, mgr); err != nil { 203 | setupLog.Error(err, "Failed to setup ext-proc controllers") 204 | return err 205 | } 206 | 207 | // Register health server. 208 | if err := registerHealthServer(mgr, ctrl.Log.WithName("health"), datastore, *grpcHealthPort); err != nil { 209 | return err 210 | } 211 | 212 | // Register ext-proc server. 213 | if err := mgr.Add(serverRunner.AsRunnable(ctrl.Log.WithName("ext-proc"))); err != nil { 214 | setupLog.Error(err, "Failed to register ext-proc gRPC server") 215 | return err 216 | } 217 | 218 | // Register metrics handler. 
219 | if err := registerMetricsHandler(mgr, *metricsPort, cfg); err != nil { 220 | return err 221 | } 222 | 223 | // Start the manager. This blocks until a signal is received. 224 | setupLog.Info("Controller manager starting") 225 | if err := mgr.Start(ctx); err != nil { 226 | setupLog.Error(err, "Error starting controller manager") 227 | return err 228 | } 229 | setupLog.Info("Controller manager terminated") 230 | return nil 231 | } 232 | 233 | func initLogging(opts *zap.Options) { 234 | // Unless -zap-log-level is explicitly set, use -v 235 | useV := true 236 | flag.Visit(func(f *flag.Flag) { 237 | if f.Name == "zap-log-level" { 238 | useV = false 239 | } 240 | }) 241 | if useV { 242 | // See https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/log/zap#Options.Level 243 | lvl := -1 * (*logVerbosity) 244 | opts.Level = uberzap.NewAtomicLevelAt(zapcore.Level(int8(lvl))) 245 | } 246 | 247 | logger := zap.New(zap.UseFlagOptions(opts), zap.RawZapOpts(uberzap.AddCaller())) 248 | ctrl.SetLogger(logger) 249 | } 250 | 251 | // registerHealthServer adds the Health gRPC server as a Runnable to the given manager. 252 | func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds datastore.Datastore, port int) error { 253 | srv := grpc.NewServer() 254 | healthPb.RegisterHealthServer(srv, &healthServer{ 255 | logger: logger, 256 | datastore: ds, 257 | }) 258 | if err := mgr.Add( 259 | runnable.NoLeaderElection(runnable.GRPCServer("health", srv, port))); err != nil { 260 | setupLog.Error(err, "Failed to register health server") 261 | return err 262 | } 263 | return nil 264 | } 265 | 266 | // registerMetricsHandler adds the metrics HTTP handler as a Runnable to the given manager. 267 | func registerMetricsHandler(mgr manager.Manager, port int, cfg *rest.Config) error { 268 | metrics.Register() 269 | 270 | // Init HTTP server. 
271 | h, err := metricsHandlerWithAuthenticationAndAuthorization(cfg) 272 | if err != nil { 273 | return err 274 | } 275 | 276 | mux := http.NewServeMux() 277 | mux.Handle(defaultMetricsEndpoint, h) 278 | 279 | srv := &http.Server{ 280 | Addr: net.JoinHostPort("", strconv.Itoa(port)), 281 | Handler: mux, 282 | } 283 | 284 | if err := mgr.Add(&manager.Server{ 285 | Name: "metrics", 286 | Server: srv, 287 | }); err != nil { 288 | setupLog.Error(err, "Failed to register metrics HTTP handler") 289 | return err 290 | } 291 | return nil 292 | } 293 | 294 | func metricsHandlerWithAuthenticationAndAuthorization(cfg *rest.Config) (http.Handler, error) { 295 | h := promhttp.HandlerFor( 296 | legacyregistry.DefaultGatherer, 297 | promhttp.HandlerOpts{}, 298 | ) 299 | httpClient, err := rest.HTTPClientFor(cfg) 300 | if err != nil { 301 | setupLog.Error(err, "Failed to create http client for metrics auth") 302 | return nil, err 303 | } 304 | 305 | filter, err := filters.WithAuthenticationAndAuthorization(cfg, httpClient) 306 | if err != nil { 307 | setupLog.Error(err, "Failed to create metrics filter for auth") 308 | return nil, err 309 | } 310 | metricsLogger := ctrl.Log.WithName("metrics").WithValues("path", defaultMetricsEndpoint) 311 | metricsAuthHandler, err := filter(metricsLogger, h) 312 | if err != nil { 313 | setupLog.Error(err, "Failed to create metrics auth handler") 314 | return nil, err 315 | } 316 | return metricsAuthHandler, nil 317 | } 318 | 319 | func validateFlags() error { 320 | if *poolName == "" { 321 | return fmt.Errorf("required %q flag not set", "poolName") 322 | } 323 | 324 | return nil 325 | } 326 | 327 | func verifyMetricMapping(mapping backendmetrics.MetricMapping, logger logr.Logger) { 328 | if mapping.TotalQueuedRequests == nil { 329 | logger.Info("Not scraping metric: TotalQueuedRequests") 330 | } 331 | if mapping.KVCacheUtilization == nil { 332 | logger.Info("Not scraping metric: KVCacheUtilization") 333 | } 334 | if mapping.LoraRequestInfo == 
nil { 335 | logger.Info("Not scraping metric: LoraRequestInfo") 336 | } 337 | 338 | } 339 | -------------------------------------------------------------------------------- /deploy/components/crds-gateway-api/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Custom Resource Definitions (CRDs) for Gateway API 3 | # 4 | # **Warning**: CRDs are cluster-level, so in a shared development environment 5 | # this needs to be done in a controlled and communicated manner. 6 | # ------------------------------------------------------------------------------ 7 | apiVersion: kustomize.config.k8s.io/v1beta1 8 | kind: Kustomization 9 | 10 | resources: 11 | - https://github.com/kubernetes-sigs/gateway-api/config/crd?ref=v1.3.0 12 | -------------------------------------------------------------------------------- /deploy/components/crds-gie/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Custom Resource Definitions (CRDs) for Gateway API Inference Extension (GIE) 3 | # 4 | # This deploys the GIE CRDs from the local directory. 5 | # 6 | # **Warning**: CRDs are cluster-level, so in a shared development environment 7 | # this needs to be done in a controlled and communicated manner. 
8 | # ------------------------------------------------------------------------------ 9 | apiVersion: kustomize.config.k8s.io/v1beta1 10 | kind: Kustomization 11 | 12 | resources: 13 | - https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd?ref=v0.3.0 -------------------------------------------------------------------------------- /deploy/components/crds-istio/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Custom Resource Definitions (CRDs) for Istio 3 | # 4 | # **Warning**: CRDs are cluster-level, so in a shared development environment 5 | # this needs to be done in a controlled and communicated manner. 6 | # ------------------------------------------------------------------------------ 7 | apiVersion: kustomize.config.k8s.io/v1beta1 8 | kind: Kustomization 9 | 10 | resources: 11 | - istio.yaml 12 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/deployments.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: endpoint-picker 5 | labels: 6 | app: endpoint-picker 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: endpoint-picker 12 | template: 13 | metadata: 14 | labels: 15 | app: endpoint-picker 16 | spec: 17 | serviceAccountName: endpoint-picker 18 | terminationGracePeriodSeconds: 130 19 | containers: 20 | - name: epp 21 | image: ghcr.io/llm-d/llm-d-inference-scheduler:latest 22 | imagePullPolicy: IfNotPresent 23 | args: 24 | - -poolName 25 | - "${POOL_NAME}" 26 | - -v 27 | - "4" 28 | - --zap-encoder 29 | - "json" 30 | - -grpcPort 31 | - "9002" 32 | - -grpcHealthPort 33 | - "9003" 34 | env: 35 | - name: PD_ENABLED 36 | value: '${PD_ENABLED}' 37 | - name: PD_PROMPT_LEN_THRESHOLD 38 | value: 
'${PD_PROMPT_LEN_THRESHOLD}' 39 | ports: 40 | - containerPort: 9002 41 | - containerPort: 9003 42 | - name: metrics 43 | containerPort: 9090 44 | livenessProbe: 45 | grpc: 46 | port: 9003 47 | service: envoy.service.ext_proc.v3.ExternalProcessor 48 | initialDelaySeconds: 5 49 | periodSeconds: 10 50 | readinessProbe: 51 | grpc: 52 | port: 9003 53 | service: envoy.service.ext_proc.v3.ExternalProcessor 54 | initialDelaySeconds: 5 55 | periodSeconds: 10 56 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/gateways.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: Gateway 3 | metadata: 4 | name: inference-gateway 5 | spec: 6 | listeners: 7 | - name: default 8 | port: 80 9 | protocol: HTTP 10 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/httproutes.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: HTTPRoute 3 | metadata: 4 | name: inference-route 5 | spec: 6 | parentRefs: 7 | - name: inference-gateway 8 | rules: 9 | - matches: 10 | - path: 11 | type: PathPrefix 12 | value: / 13 | backendRefs: 14 | - group: inference.networking.x-k8s.io 15 | kind: InferencePool 16 | name: ${POOL_NAME} 17 | port: 8000 18 | timeouts: 19 | request: 30s 20 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/inference-models.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: inference.networking.x-k8s.io/v1alpha2 2 | kind: InferenceModel 3 | metadata: 4 | name: food-review 5 | spec: 6 | modelName: food-review 7 | criticality: Critical 8 | poolRef: 9 | name: ${POOL_NAME} 10 | targetModels: 11 | - name: food-review 12 | weight: 100 13 | --- 14 | 
apiVersion: inference.networking.x-k8s.io/v1alpha2 15 | kind: InferenceModel 16 | metadata: 17 | name: base-model 18 | spec: 19 | modelName: ${MODEL_NAME} 20 | criticality: Critical 21 | poolRef: 22 | name: ${POOL_NAME} 23 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/inference-pools.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: inference.networking.x-k8s.io/v1alpha2 2 | kind: InferencePool 3 | metadata: 4 | name: ${POOL_NAME} 5 | spec: 6 | targetPortNumber: 8000 7 | selector: 8 | app: ${POOL_NAME} 9 | extensionRef: 10 | name: endpoint-picker 11 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Inference Gateway 3 | # 4 | # This provides a working stack for an inference Gateway, including the Gateway 5 | # itself, the Endpoint Picker (EPP) attached to it, and the Inference Pools and 6 | # Inference Models to collect pods from a model serving framework (e.g. VLLM, 7 | # or even just the VLLM Simulator). 
8 | # 9 | # ------------------------------------------------------------------------------ 10 | apiVersion: kustomize.config.k8s.io/v1beta1 11 | kind: Kustomization 12 | 13 | resources: 14 | - service-accounts.yaml 15 | - rbac.yaml 16 | - inference-pools.yaml 17 | - inference-models.yaml 18 | - services.yaml 19 | - deployments.yaml 20 | - gateways.yaml 21 | - httproutes.yaml 22 | 23 | images: 24 | - name: ghcr.io/llm-d/llm-d-inference-scheduler 25 | newTag: ${EPP_TAG} 26 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/rbac.yaml: -------------------------------------------------------------------------------- 1 | kind: Role 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: endpoint-picker 5 | rules: 6 | - apiGroups: 7 | - "inference.networking.x-k8s.io" 8 | resources: 9 | - "inferencepools" 10 | - "inferencemodels" 11 | verbs: 12 | - "get" 13 | - "watch" 14 | - "list" 15 | - apiGroups: 16 | - "" 17 | resources: 18 | - "pods" 19 | verbs: 20 | - "get" 21 | - "watch" 22 | - "list" 23 | - apiGroups: 24 | - "discovery.k8s.io" 25 | resources: 26 | - "endpointslices" 27 | verbs: 28 | - "get" 29 | - "watch" 30 | - "list" 31 | - apiGroups: 32 | - "authentication.k8s.io" 33 | resources: 34 | - "tokenreviews" 35 | verbs: 36 | - "create" 37 | - apiGroups: 38 | - "authorization.k8s.io" 39 | resources: 40 | - "subjectaccessreviews" 41 | verbs: 42 | - "create" 43 | --- 44 | apiVersion: rbac.authorization.k8s.io/v1 45 | kind: RoleBinding 46 | metadata: 47 | name: endpoint-picker-binding 48 | subjects: 49 | - kind: ServiceAccount 50 | name: endpoint-picker 51 | roleRef: 52 | apiGroup: rbac.authorization.k8s.io 53 | kind: Role 54 | name: endpoint-picker 55 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/service-accounts.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 
2 | kind: ServiceAccount 3 | metadata: 4 | name: endpoint-picker 5 | -------------------------------------------------------------------------------- /deploy/components/inference-gateway/services.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: endpoint-picker 5 | spec: 6 | selector: 7 | app: endpoint-picker 8 | ports: 9 | - protocol: TCP 10 | port: 9002 11 | targetPort: 9002 12 | appProtocol: http2 13 | type: ClusterIP 14 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/deployments.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | labels: 5 | app: istiod 6 | app.kubernetes.io/instance: istio 7 | app.kubernetes.io/managed-by: Helm 8 | app.kubernetes.io/name: istiod 9 | app.kubernetes.io/part-of: istio 10 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 11 | helm.sh/chart: istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 12 | install.operator.istio.io/owning-resource: unknown 13 | istio: pilot 14 | istio.io/rev: llm-d-gateway 15 | operator.istio.io/component: Pilot 16 | release: istio 17 | name: istiod-llm-d-gateway 18 | namespace: llm-d-istio-system 19 | spec: 20 | selector: 21 | matchLabels: 22 | app: istiod 23 | istio.io/rev: llm-d-gateway 24 | strategy: 25 | rollingUpdate: 26 | maxSurge: 100% 27 | maxUnavailable: 25% 28 | template: 29 | metadata: 30 | annotations: 31 | prometheus.io/port: "15014" 32 | prometheus.io/scrape: "true" 33 | sidecar.istio.io/inject: "false" 34 | labels: 35 | app: istiod 36 | app.kubernetes.io/instance: istio 37 | app.kubernetes.io/managed-by: Helm 38 | app.kubernetes.io/name: istiod 39 | app.kubernetes.io/part-of: istio 40 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 41 | helm.sh/chart: 
istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 42 | install.operator.istio.io/owning-resource: unknown 43 | istio: istiod 44 | istio.io/dataplane-mode: none 45 | istio.io/rev: llm-d-gateway 46 | operator.istio.io/component: Pilot 47 | sidecar.istio.io/inject: "false" 48 | spec: 49 | containers: 50 | - args: 51 | - discovery 52 | - --monitoringAddr=:15014 53 | - --log_output_level=default:info 54 | - --domain 55 | - cluster.local 56 | - --keepaliveMaxServerConnectionAge 57 | - 30m 58 | env: 59 | - name: REVISION 60 | value: llm-d-gateway 61 | - name: PILOT_CERT_PROVIDER 62 | value: istiod 63 | - name: POD_NAME 64 | valueFrom: 65 | fieldRef: 66 | apiVersion: v1 67 | fieldPath: metadata.name 68 | - name: POD_NAMESPACE 69 | valueFrom: 70 | fieldRef: 71 | apiVersion: v1 72 | fieldPath: metadata.namespace 73 | - name: SERVICE_ACCOUNT 74 | valueFrom: 75 | fieldRef: 76 | apiVersion: v1 77 | fieldPath: spec.serviceAccountName 78 | - name: KUBECONFIG 79 | value: /var/run/secrets/remote/config 80 | - name: CA_TRUSTED_NODE_ACCOUNTS 81 | value: llm-d-istio-system/ztunnel 82 | - name: PILOT_TRACE_SAMPLING 83 | value: "1" 84 | - name: PILOT_ENABLE_ANALYSIS 85 | value: "false" 86 | - name: CLUSTER_ID 87 | value: Kubernetes 88 | - name: GOMEMLIMIT 89 | valueFrom: 90 | resourceFieldRef: 91 | resource: limits.memory 92 | - name: GOMAXPROCS 93 | valueFrom: 94 | resourceFieldRef: 95 | divisor: "1" 96 | resource: limits.cpu 97 | - name: PLATFORM 98 | value: "" 99 | image: quay.io/rh-ee-sutt/istio-testing/pilot:1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 100 | name: discovery 101 | ports: 102 | - containerPort: 8080 103 | name: http-debug 104 | protocol: TCP 105 | - containerPort: 15010 106 | name: grpc-xds 107 | protocol: TCP 108 | - containerPort: 15012 109 | name: tls-xds 110 | protocol: TCP 111 | - containerPort: 15017 112 | name: https-webhooks 113 | protocol: TCP 114 | - containerPort: 15014 115 | name: http-monitoring 116 | protocol: TCP 117 | readinessProbe: 
118 | httpGet: 119 | path: /ready 120 | port: 8080 121 | initialDelaySeconds: 1 122 | periodSeconds: 3 123 | timeoutSeconds: 5 124 | resources: 125 | requests: 126 | cpu: 500m 127 | memory: 1024Mi 128 | securityContext: 129 | allowPrivilegeEscalation: false 130 | capabilities: 131 | drop: 132 | - ALL 133 | readOnlyRootFilesystem: true 134 | runAsNonRoot: true 135 | volumeMounts: 136 | - mountPath: /var/run/secrets/tokens 137 | name: istio-token 138 | readOnly: true 139 | - mountPath: /var/run/secrets/istio-dns 140 | name: local-certs 141 | - mountPath: /etc/cacerts 142 | name: cacerts 143 | readOnly: true 144 | - mountPath: /var/run/secrets/remote 145 | name: istio-kubeconfig 146 | readOnly: true 147 | - mountPath: /var/run/secrets/istiod/tls 148 | name: istio-csr-dns-cert 149 | readOnly: true 150 | - mountPath: /var/run/secrets/istiod/ca 151 | name: istio-csr-ca-configmap 152 | readOnly: true 153 | serviceAccountName: istiod-llm-d-gateway 154 | tolerations: 155 | - key: cni.istio.io/not-ready 156 | operator: Exists 157 | volumes: 158 | - emptyDir: 159 | medium: Memory 160 | name: local-certs 161 | - name: istio-token 162 | projected: 163 | sources: 164 | - serviceAccountToken: 165 | audience: istio-ca 166 | expirationSeconds: 43200 167 | path: istio-token 168 | - name: cacerts 169 | secret: 170 | optional: true 171 | secretName: cacerts 172 | - name: istio-kubeconfig 173 | secret: 174 | optional: true 175 | secretName: istio-kubeconfig 176 | - name: istio-csr-dns-cert 177 | secret: 178 | optional: true 179 | secretName: istiod-tls 180 | - configMap: 181 | defaultMode: 420 182 | name: istio-ca-root-cert 183 | optional: true 184 | name: istio-csr-ca-configmap 185 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/hpa.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: autoscaling/v2 2 | kind: HorizontalPodAutoscaler 3 | metadata: 4 | labels: 5 | 
app: istiod 6 | app.kubernetes.io/instance: istio 7 | app.kubernetes.io/managed-by: Helm 8 | app.kubernetes.io/name: istiod 9 | app.kubernetes.io/part-of: istio 10 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 11 | helm.sh/chart: istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 12 | install.operator.istio.io/owning-resource: unknown 13 | istio.io/rev: llm-d-gateway 14 | operator.istio.io/component: Pilot 15 | release: istio 16 | name: istiod-llm-d-gateway 17 | namespace: llm-d-istio-system 18 | spec: 19 | maxReplicas: 5 20 | metrics: 21 | - resource: 22 | name: cpu 23 | target: 24 | averageUtilization: 80 25 | type: Utilization 26 | type: Resource 27 | minReplicas: 1 28 | scaleTargetRef: 29 | apiVersion: apps/v1 30 | kind: Deployment 31 | name: istiod-llm-d-gateway 32 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | resources: 4 | - namespaces.yaml 5 | - configmaps.yaml 6 | - deployments.yaml 7 | - hpa.yaml 8 | - policies.yaml 9 | - rbac.yaml 10 | - service-accounts.yaml 11 | - services.yaml 12 | - webhooks.yaml 13 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/namespaces.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: llm-d-istio-system 5 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/policies.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: policy/v1 2 | kind: PodDisruptionBudget 3 | metadata: 4 | labels: 5 | app: istiod 6 | app.kubernetes.io/instance: istio 7 | 
app.kubernetes.io/managed-by: Helm 8 | app.kubernetes.io/name: istiod 9 | app.kubernetes.io/part-of: istio 10 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 11 | helm.sh/chart: istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 12 | install.operator.istio.io/owning-resource: unknown 13 | istio: pilot 14 | istio.io/rev: llm-d-gateway 15 | operator.istio.io/component: Pilot 16 | release: istio 17 | name: istiod-llm-d-gateway 18 | namespace: llm-d-istio-system 19 | spec: 20 | minAvailable: 1 21 | selector: 22 | matchLabels: 23 | app: istiod 24 | istio.io/rev: llm-d-gateway 25 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/service-accounts.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app: istio-reader 6 | app.kubernetes.io/instance: istio 7 | app.kubernetes.io/managed-by: Helm 8 | app.kubernetes.io/name: istio-reader 9 | app.kubernetes.io/part-of: istio 10 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 11 | helm.sh/chart: base-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 12 | release: istio 13 | name: istio-reader-service-account 14 | namespace: llm-d-istio-system 15 | --- 16 | apiVersion: v1 17 | kind: ServiceAccount 18 | metadata: 19 | labels: 20 | app: istiod 21 | app.kubernetes.io/instance: istio 22 | app.kubernetes.io/managed-by: Helm 23 | app.kubernetes.io/name: istiod 24 | app.kubernetes.io/part-of: istio 25 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 26 | helm.sh/chart: istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 27 | release: istio 28 | name: istiod-llm-d-gateway 29 | namespace: llm-d-istio-system 30 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/services.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | app: istiod 6 | app.kubernetes.io/instance: istio 7 | app.kubernetes.io/managed-by: Helm 8 | app.kubernetes.io/name: istiod 9 | app.kubernetes.io/part-of: istio 10 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 11 | helm.sh/chart: istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 12 | install.operator.istio.io/owning-resource: unknown 13 | istio: pilot 14 | istio.io/rev: llm-d-gateway 15 | operator.istio.io/component: Pilot 16 | release: istio 17 | name: istiod-llm-d-gateway 18 | namespace: llm-d-istio-system 19 | spec: 20 | ports: 21 | - name: grpc-xds 22 | port: 15010 23 | protocol: TCP 24 | - name: https-dns 25 | port: 15012 26 | protocol: TCP 27 | - name: https-webhook 28 | port: 443 29 | protocol: TCP 30 | targetPort: 15017 31 | - name: http-monitoring 32 | port: 15014 33 | protocol: TCP 34 | selector: 35 | app: istiod 36 | istio.io/rev: llm-d-gateway 37 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/telemetry.yaml: -------------------------------------------------------------------------------- 1 | # Enables debug logging for Gateways 2 | apiVersion: telemetry.istio.io/v1 3 | kind: Telemetry 4 | metadata: 5 | name: mesh-default 6 | namespace: istio-gateway 7 | spec: 8 | accessLogging: 9 | - providers: 10 | - name: envoy 11 | -------------------------------------------------------------------------------- /deploy/components/istio-control-plane/webhooks.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingWebhookConfiguration 3 | metadata: 4 | labels: 5 | app: istiod 6 | app.kubernetes.io/instance: istio 7 | app.kubernetes.io/managed-by: Helm 8 | app.kubernetes.io/name: istiod 9 | 
app.kubernetes.io/part-of: istio 10 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 11 | helm.sh/chart: istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 12 | istio: istiod 13 | istio.io/rev: llm-d-gateway 14 | release: istio 15 | name: istio-validator-llm-d-gateway-llm-d-istio-system 16 | webhooks: 17 | - admissionReviewVersions: 18 | - v1 19 | clientConfig: 20 | service: 21 | name: istiod-llm-d-gateway 22 | namespace: llm-d-istio-system 23 | path: /validate 24 | failurePolicy: Ignore 25 | name: rev.validation.istio.io 26 | objectSelector: 27 | matchExpressions: 28 | - key: istio.io/rev 29 | operator: In 30 | values: 31 | - llm-d-gateway 32 | rules: 33 | - apiGroups: 34 | - security.istio.io 35 | - networking.istio.io 36 | - telemetry.istio.io 37 | - extensions.istio.io 38 | apiVersions: 39 | - '*' 40 | operations: 41 | - CREATE 42 | - UPDATE 43 | resources: 44 | - '*' 45 | sideEffects: None 46 | --- 47 | apiVersion: admissionregistration.k8s.io/v1 48 | kind: MutatingWebhookConfiguration 49 | metadata: 50 | labels: 51 | app: sidecar-injector 52 | app.kubernetes.io/instance: istio 53 | app.kubernetes.io/managed-by: Helm 54 | app.kubernetes.io/name: istiod 55 | app.kubernetes.io/part-of: istio 56 | app.kubernetes.io/version: 1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 57 | helm.sh/chart: istiod-1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d 58 | install.operator.istio.io/owning-resource: unknown 59 | istio.io/rev: llm-d-gateway 60 | operator.istio.io/component: Pilot 61 | release: istio 62 | name: istio-sidecar-injector-llm-d-gateway-llm-d-istio-system 63 | webhooks: 64 | - admissionReviewVersions: 65 | - v1 66 | clientConfig: 67 | service: 68 | name: istiod-llm-d-gateway 69 | namespace: llm-d-istio-system 70 | path: /inject 71 | port: 443 72 | failurePolicy: Fail 73 | name: rev.namespace.sidecar-injector.istio.io 74 | namespaceSelector: 75 | matchExpressions: 76 | - key: istio.io/rev 77 | operator: In 78 | 
values: 79 | - llm-d-gateway 80 | - key: istio-injection 81 | operator: DoesNotExist 82 | objectSelector: 83 | matchExpressions: 84 | - key: sidecar.istio.io/inject 85 | operator: NotIn 86 | values: 87 | - "false" 88 | reinvocationPolicy: Never 89 | rules: 90 | - apiGroups: 91 | - "" 92 | apiVersions: 93 | - v1 94 | operations: 95 | - CREATE 96 | resources: 97 | - pods 98 | sideEffects: None 99 | - admissionReviewVersions: 100 | - v1 101 | clientConfig: 102 | service: 103 | name: istiod-llm-d-gateway 104 | namespace: llm-d-istio-system 105 | path: /inject 106 | port: 443 107 | failurePolicy: Fail 108 | name: rev.object.sidecar-injector.istio.io 109 | namespaceSelector: 110 | matchExpressions: 111 | - key: istio.io/rev 112 | operator: DoesNotExist 113 | - key: istio-injection 114 | operator: DoesNotExist 115 | objectSelector: 116 | matchExpressions: 117 | - key: sidecar.istio.io/inject 118 | operator: NotIn 119 | values: 120 | - "false" 121 | - key: istio.io/rev 122 | operator: In 123 | values: 124 | - llm-d-gateway 125 | reinvocationPolicy: Never 126 | rules: 127 | - apiGroups: 128 | - "" 129 | apiVersions: 130 | - v1 131 | operations: 132 | - CREATE 133 | resources: 134 | - pods 135 | sideEffects: None 136 | -------------------------------------------------------------------------------- /deploy/components/vllm-sim-pd/deployments.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: vllm-sim-p 5 | labels: 6 | app: ${POOL_NAME} 7 | spec: 8 | replicas: ${VLLM_REPLICA_COUNT_P} 9 | selector: 10 | matchLabels: 11 | app: ${POOL_NAME} 12 | template: 13 | metadata: 14 | labels: 15 | app: ${POOL_NAME} 16 | llm-d.ai/role: prefill 17 | spec: 18 | containers: 19 | - name: vllm 20 | image: ghcr.io/llm-d/llm-d-inference-sim:latest 21 | imagePullPolicy: IfNotPresent 22 | args: 23 | - "--port=8000" 24 | - "--model=food-review" 25 | ports: 26 | - name: http 27 | containerPort: 
8000 28 | protocol: TCP 29 | env: 30 | - name: PORT 31 | value: "8000" 32 | --- 33 | apiVersion: apps/v1 34 | kind: Deployment 35 | metadata: 36 | name: vllm-sim-d 37 | labels: 38 | app: ${POOL_NAME} 39 | spec: 40 | replicas: ${VLLM_REPLICA_COUNT_D} 41 | selector: 42 | matchLabels: 43 | app: ${POOL_NAME} 44 | template: 45 | metadata: 46 | labels: 47 | app: ${POOL_NAME} 48 | llm-d.ai/role: decode 49 | spec: 50 | initContainers: 51 | - name: routing-sidecar 52 | image: ghcr.io/llm-d/llm-d-routing-sidecar:latest 53 | imagePullPolicy: IfNotPresent 54 | args: 55 | - "--port=8000" 56 | - "--vllm-port=8200" 57 | - "--connector=lmcache" 58 | ports: 59 | - containerPort: 8000 60 | protocol: TCP 61 | restartPolicy: Always 62 | containers: 63 | - name: vllm 64 | image: ghcr.io/llm-d/llm-d-inference-sim:latest 65 | imagePullPolicy: IfNotPresent 66 | args: 67 | - "--port=8200" 68 | - "--model=food-review" 69 | ports: 70 | - name: http 71 | containerPort: 8200 72 | protocol: TCP 73 | env: 74 | - name: PORT 75 | value: "8200" 76 | -------------------------------------------------------------------------------- /deploy/components/vllm-sim-pd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # VLLM Simulator 3 | # 4 | # This deploys a VLLM simulator which can be used to simulate inference for 5 | # small environments (e.g. Kubernetes In Docker (KIND) clusters), or for when 6 | # all that is needed is some basic functionality. 
7 | # ------------------------------------------------------------------------------ 8 | apiVersion: kustomize.config.k8s.io/v1beta1 9 | kind: Kustomization 10 | 11 | resources: 12 | - deployments.yaml 13 | 14 | images: 15 | - name: ghcr.io/llm-d/llm-d-inference-sim 16 | newTag: ${VLLM_SIMULATOR_TAG} 17 | - name: ghcr.io/llm-d/llm-d-routing-sidecar 18 | newTag: ${ROUTING_SIDECAR_TAG} 19 | -------------------------------------------------------------------------------- /deploy/components/vllm-sim/deployments.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: vllm-sim 5 | labels: 6 | app: ${POOL_NAME} 7 | spec: 8 | replicas: ${VLLM_REPLICA_COUNT} 9 | selector: 10 | matchLabels: 11 | app: ${POOL_NAME} 12 | template: 13 | metadata: 14 | labels: 15 | app: ${POOL_NAME} 16 | spec: 17 | containers: 18 | - name: vllm 19 | image: ghcr.io/llm-d/llm-d-inference-sim:latest 20 | imagePullPolicy: IfNotPresent 21 | args: 22 | - "--port=8000" 23 | - "--model=food-review" 24 | ports: 25 | - name: http 26 | containerPort: 8000 27 | protocol: TCP 28 | env: 29 | - name: PORT 30 | value: "8000" 31 | -------------------------------------------------------------------------------- /deploy/components/vllm-sim/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # VLLM Simulator 3 | # 4 | # This deploys a VLLM simulator which can be used to simulate inference for 5 | # small environments (e.g. Kubernetes In Docker (KIND) clusters), or for when 6 | # all that is needed is some basic functionality. 
7 | # ------------------------------------------------------------------------------ 8 | apiVersion: kustomize.config.k8s.io/v1beta1 9 | kind: Kustomization 10 | 11 | resources: 12 | - deployments.yaml 13 | 14 | images: 15 | - name: ghcr.io/llm-d/llm-d-inference-sim 16 | newTag: ${VLLM_SIMULATOR_TAG} 17 | -------------------------------------------------------------------------------- /deploy/environments/dev/base-kind-istio/destination-rules.yaml: -------------------------------------------------------------------------------- 1 | # **WARNING** Only use in testing scenarios 2 | apiVersion: networking.istio.io/v1 3 | kind: DestinationRule 4 | metadata: 5 | name: endpoint-picker-insecure-tls 6 | spec: 7 | host: endpoint-picker 8 | trafficPolicy: 9 | tls: 10 | mode: SIMPLE 11 | insecureSkipVerify: true 12 | -------------------------------------------------------------------------------- /deploy/environments/dev/base-kind-istio/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Kubernetes In Docker (KIND) Environment 3 | # 4 | # This will deploy the base development stack on a KIND cluster: 5 | # 6 | # * Istio Control Plane 7 | # * Inference Gateway 8 | # 9 | # This will expose the VLLM simulator via InferencePool and an HTTPRoute. 
10 | # 11 | # The vLLM simulator is deployed by a kustomization directory that includes this directory 12 | # ------------------------------------------------------------------------------ 13 | apiVersion: kustomize.config.k8s.io/v1beta1 14 | kind: Kustomization 15 | 16 | resources: 17 | - destination-rules.yaml 18 | - services.yaml 19 | - ../../../components/istio-control-plane/ 20 | - ../../../components/inference-gateway/ 21 | 22 | patches: 23 | - path: patch-deployments.yaml 24 | - path: patch-gateways.yaml 25 | -------------------------------------------------------------------------------- /deploy/environments/dev/base-kind-istio/patch-deployments.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: endpoint-picker 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: epp 10 | args: 11 | - -poolName 12 | - ${POOL_NAME} 13 | - -poolNamespace 14 | - "default" 15 | - -v 16 | - "4" 17 | - --zap-encoder 18 | - "json" 19 | - -grpcPort 20 | - "9002" 21 | - -grpcHealthPort 22 | - "9003" 23 | -------------------------------------------------------------------------------- /deploy/environments/dev/base-kind-istio/patch-gateways.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: Gateway 3 | metadata: 4 | name: inference-gateway 5 | labels: 6 | istio.io/enable-inference-extproc: "true" 7 | istio.io/rev: llm-d-gateway 8 | annotations: 9 | networking.istio.io/service-type: ClusterIP 10 | spec: 11 | gatewayClassName: istio 12 | -------------------------------------------------------------------------------- /deploy/environments/dev/base-kind-istio/services.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | annotations: 5 | networking.istio.io/service-type: NodePort 6 | labels: 7 | 
gateway.istio.io/managed: istio.io-gateway-controller 8 | gateway.networking.k8s.io/gateway-name: inference-gateway 9 | istio.io/enable-inference-extproc: "true" 10 | name: inference-gateway-istio-nodeport 11 | spec: 12 | type: NodePort 13 | selector: 14 | gateway.networking.k8s.io/gateway-name: inference-gateway 15 | ports: 16 | - appProtocol: tcp 17 | name: status-port 18 | port: 15021 19 | protocol: TCP 20 | targetPort: 15021 21 | nodePort: 32021 22 | - appProtocol: http 23 | name: default 24 | port: 80 25 | protocol: TCP 26 | targetPort: 80 27 | nodePort: 30080 28 | -------------------------------------------------------------------------------- /deploy/environments/dev/kind-istio-pd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Kubernetes In Docker (KIND) Environment 3 | # 4 | # This will deploy the full development stack on a KIND cluster: 5 | # 6 | # * Istio Control Plane 7 | # * VLLM Simulator 8 | # * Inference Gateway 9 | # 10 | # This will expose the VLLM simulator via InferencePool and an HTTPRoute. 
11 | # ------------------------------------------------------------------------------ 12 | apiVersion: kustomize.config.k8s.io/v1beta1 13 | kind: Kustomization 14 | 15 | resources: 16 | - ../base-kind-istio/ 17 | - ../../../components/vllm-sim-pd/ 18 | -------------------------------------------------------------------------------- /deploy/environments/dev/kind-istio/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Kubernetes In Docker (KIND) Environment 3 | # 4 | # This will deploy the full development stack on a KIND cluster: 5 | # 6 | # * Istio Control Plane 7 | # * VLLM Simulator 8 | # * Inference Gateway 9 | # 10 | # This will expose the VLLM simulator via InferencePool and an HTTPRoute. 11 | # ------------------------------------------------------------------------------ 12 | apiVersion: kustomize.config.k8s.io/v1beta1 13 | kind: Kustomization 14 | 15 | resources: 16 | - ../base-kind-istio/ 17 | - ../../../components/vllm-sim/ 18 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/common/patch-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: service 5 | spec: 6 | selector: 7 | app: ${PROJECT_NAME}-service 8 | ports: 9 | - protocol: TCP 10 | port: 8080 11 | targetPort: 8080 12 | type: ClusterIP 13 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/common/patch-statefulset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: StatefulSet 3 | metadata: 4 | name: 0 5 | spec: 6 | serviceName: ${PROJECT_NAME}-service 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | app: ${PROJECT_NAME}-statefulset 11 | template: 12 | 
metadata: 13 | labels: 14 | app: ${PROJECT_NAME}-statefulset 15 | spec: 16 | serviceAccountName: operator-controller-manager 17 | containers: 18 | - name: cmd 19 | image: ${IMAGE_TAG_BASE}:${VERSION} 20 | imagePullPolicy: Always 21 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/common/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: service 5 | spec: 6 | selector: 7 | app: placeholder 8 | ports: 9 | - protocol: TCP 10 | port: 8080 11 | targetPort: 8080 12 | type: ClusterIP 13 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/common/statefulset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: StatefulSet 3 | metadata: 4 | name: "0" 5 | spec: 6 | serviceName: placeholder 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | app: placeholder 11 | template: 12 | metadata: 13 | labels: 14 | app: placeholder 15 | spec: 16 | serviceAccountName: operator-controller-manager 17 | containers: 18 | - name: cmd 19 | image: ghcr.io/llm-d/placeholder:placeholder 20 | imagePullPolicy: Always 21 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | # Set the namespace for all resources using a placeholder. 5 | namespace: ${NAMESPACE} 6 | 7 | # Use a prefix for all object names. You can substitute the PROJECT_NAME variable. 8 | namePrefix: ${PROJECT_NAME}- 9 | 10 | # List all the resources (manifests) you want to deploy. 
11 | resources: 12 | - common/statefulset.yaml 13 | - common/service.yaml 14 | - openshift/route.yaml 15 | - rbac/exec-rbac-role.yaml 16 | - rbac/exec-rbac-rolebinding.yaml 17 | 18 | # Generate the ConfigMap with a variable name. 19 | configMapGenerator: 20 | - name: config 21 | options: 22 | disableNameSuffixHash: true 23 | 24 | # Include patches to update the Service, StatefulSet, Route, and RBAC resources. 25 | 26 | # Define the image to be updated. 27 | # images: 28 | # - name: ghcr.io/llm-d/placeholder 29 | # newName: ghcr.io/llm-d/${IMAGE_TAG_BASE} 30 | # newTag: ${VERSION} 31 | patches: 32 | - path: common/patch-service.yaml 33 | - path: common/patch-statefulset.yaml 34 | - path: openshift/patch-route.yaml 35 | - path: rbac/patch-rbac-role.yaml 36 | - path: rbac/patch-rbac-rolebinding.yaml 37 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/openshift/patch-route.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: route.openshift.io/v1 2 | kind: Route 3 | metadata: 4 | name: route 5 | spec: 6 | to: 7 | name: "${PROJECT_NAME}-service" 8 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/openshift/route.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: route.openshift.io/v1 2 | kind: Route 3 | metadata: 4 | name: route 5 | spec: 6 | to: 7 | kind: Service 8 | name: placeholder 9 | port: 10 | targetPort: 8080 11 | tls: 12 | termination: edge 13 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/rbac/exec-rbac-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: exec-role 5 | rules: 6 | - apiGroups: [""] 7 | resources: ["pods/exec"] 8 | 
resourceNames: ["placeholder-0-0"] 9 | verbs: ["create"] 10 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/rbac/exec-rbac-rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: exec-rolebinding 5 | subjects: 6 | - kind: Group 7 | name: system:authenticated 8 | apiGroup: rbac.authorization.k8s.io 9 | roleRef: 10 | kind: Role 11 | name: exec-role 12 | apiGroup: rbac.authorization.k8s.io 13 | 14 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/rbac/patch-rbac-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: exec-role 5 | rules: 6 | - apiGroups: [""] 7 | resources: ["pods/exec"] 8 | resourceNames: 9 | - "${PROJECT_NAME}-0-0" 10 | verbs: ["create"] 11 | -------------------------------------------------------------------------------- /deploy/environments/openshift-base/rbac/patch-rbac-rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: exec-rolebinding 5 | subjects: 6 | - kind: Group 7 | name: system:authenticated 8 | apiGroup: rbac.authorization.k8s.io 9 | roleRef: 10 | kind: Role 11 | name: ${PROJECT_NAME}-exec-role 12 | apiGroup: rbac.authorization.k8s.io 13 | -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # llm-d Inference Router Architecture 2 | 3 | ## Overview 4 | 5 | **llm-d** is an extensible architecture designed to route inference requests efficiently across model-serving pods. 
A central component of this architecture is the **Inference Gateway**, which builds on the Kubernetes-native **Gateway API Inference Extension (GIE)** to enable scalable, flexible, and pluggable routing of requests. 6 | 7 | The design enables: 8 | - Support for **multiple base models** and **LoRA adapters** within a shared cluster [Not supported in Phase1] 9 | - Efficient routing based on **KV cache locality**, **prefix**, **session affinity**, **load**, and **model metadata** 10 | - Disaggregated **Prefill/Decode (P/D)** execution 11 | - Pluggable **filters**, **scorers**, and **scrapers** for extensible routing 12 | 13 | --- 14 | 15 | ## Core Goals 16 | 17 | - Route inference requests to optimal pods based on: 18 | - Base model compatibility 19 | - KV cache reuse 20 | - Load balancing 21 | - Support multi-model deployments on heterogeneous hardware 22 | - Enable runtime extensibility with pluggable logic (filters, scorers, scrapers) 23 | - Community-aligned implementation using GIE and Envoy + External Processing (EPP) 24 | 25 | --- 26 | 27 | ## Architecture Design 28 | 29 | ![Inference Gateway Architecture](./images/architecture.png) 30 | 31 | The inference scheduler is built on top of: 32 | - **Envoy** as a programmable data plane 33 | - **EPP (External Processing Plugin)** using **GIE** 34 | 35 | ### Pluggability 36 | 37 | ![Pluggability Architecture](./images/plugability.png) 38 | 39 | Routing decisions are governed by dynamic components: 40 | - **Filters**: Exclude pods based on static or dynamic criteria 41 | - **Scorers**: Assign scores to candidate pods 42 | - **Scrapers**: Collect pod metadata and metrics for scorers 43 | 44 | These components are maintained in the `llm-d-inference-scheduler` repository and can evolve independently. 
45 | 46 | --- 47 | 48 | ## Filters, Scorers, and Scrapers 49 | 50 | ### Core Design Principles 51 | 52 | - **Pluggability**: No core changes are needed to add new scorers or filters 53 | - **Isolation**: Each component operates independently 54 | 55 | 56 | ### Routing Flow 57 | 58 | 1. **Filtering** 59 | - Pods in an `InferencePool` go through a sequential chain of filters 60 | - Pods may be excluded based on criteria like model compatibility, resource usage, or custom logic 61 | 62 | 2. **Scoring** 63 | - Filtered pods are scored using a weighted set of scorers 64 | - Scorers currently run sequentially (future: parallel execution) 65 | - Scorers access a shared datastore populated by scrapers 66 | 67 | 3. **Pod Selection** 68 | - The highest-scored pod is selected 69 | - If multiple pods share the same score, one is selected at random 70 | 71 | ### Lifecycle Hooks 72 | - `Pre-call` 73 | - `Scoring` 74 | - `Post-choice` 75 | - `After-response` 76 | 77 | --- 78 | 79 | ## Scorers & Configuration 80 | 81 | | Scorer | Description | Env Vars | 82 | |------------------|--------------------------------------------|----------| 83 | | Session-aware | Prefers pods from same session | `ENABLE_SESSION_AWARE_SCORER`, `SESSION_AWARE_SCORER_WEIGHT`, `PREFILL_ENABLE_SESSION_AWARE_SCORER`, `PREFILL_SESSION_AWARE_SCORER_WEIGHT` | 84 | | Prefix-aware | Matches prompt prefix | `ENABLE_PREFIX_AWARE_SCORER`, `PREFIX_AWARE_SCORER_WEIGHT`, `PREFILL_ENABLE_PREFIX_AWARE_SCORER`, `PREFILL_PREFIX_AWARE_SCORER_WEIGHT`, `PREFIX_SCORER_BLOCK_SIZE`| 85 | | KVCache-aware | Optimizes for KV reuse | `ENABLE_KVCACHE_AWARE_SCORER`, `KVCACHE_INDEXER_REDIS_ADDR`, `PREFILL_ENABLE_KVCACHE_AWARE_SCORER`, `PREFILL_KVCACHE_INDEXER_REDIS_ADDR`, `HF_TOKEN`, `KVCACHE_INDEXER_REDIS_ADDR` | 86 | | Load-aware | Avoids busy pods | `ENABLE_LOAD_AWARE_SCORER`, `LOAD_AWARE_SCORER_WEIGHT`, `PREFILL_ENABLE_LOAD_AWARE_SCORER`, `PREFILL_LOAD_AWARE_SCORER_WEIGHT` | 87 | 88 | ### Prefill / Decode Configuration 89 | 90 | In 
case Disaggregated Prefill is enabled, you should also define the following environment variables. 91 | 92 | - Toggle P/D mode: `PD_ENABLED=true` 93 | - Threshold: `PD_PROMPT_LEN_THRESHOLD=` 94 | 95 | #### Prefill Scorers: 96 | ```bash 97 | export PREFILL_ENABLE_SESSION_AWARE_SCORER=true 98 | export PREFILL_SESSION_AWARE_SCORER_WEIGHT=1 99 | export PREFILL_ENABLE_KVCACHE_AWARE_SCORER=true 100 | export PREFILL_KVCACHE_AWARE_SCORER_WEIGHT=1 101 | export PREFILL_ENABLE_LOAD_AWARE_SCORER=true 102 | export PREFILL_LOAD_AWARE_SCORER_WEIGHT=1 103 | export PREFILL_ENABLE_PREFIX_AWARE_SCORER=true 104 | export PREFILL_PREFIX_AWARE_SCORER_WEIGHT=1 105 | ``` 106 | 107 | 108 | --- 109 | 110 | ## Metric Scraping 111 | 112 | - Scrapers collect metrics (e.g., memory usage, active adapters) 113 | - Data is injected into the shared datastore for scorers 114 | - Scoring can rely on numerical metrics or metadata (model ID, adapter tags) 115 | 116 | --- 117 | 118 | ## Disaggregated Prefill/Decode (P/D) 119 | 120 | When enabled, the router: 121 | - Selects one pod for **Prefill** (prompt processing) 122 | - Selects another pod for **Decode** (token generation) 123 | 124 | The **vLLM sidecar** handles orchestration between Prefill and Decode stages. 
It allows: 125 | - Queuing 126 | - Local memory management 127 | - Experimental protocol compatibility 128 | 129 | > **Note**: The detailed P/D design is available in this document: [Disaggregated Prefill/Decode in llm-d](./dp.md) 130 | --- 131 | 132 | ## InferencePool & InferenceModel Design 133 | 134 | ### Current Assumptions 135 | - Single `InferencePool` and single `EPP` due to Envoy limitations 136 | - Model-based filtering can be handled within EPP 137 | - Currently only one base model is supported 138 | 139 | --- 140 | 141 | ## References 142 | - [GIE Spec](https://gateway-api-inference-extension.sigs.k8s.io/) 143 | - [Envoy External Processing](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter) 144 | 145 | 146 | -------------------------------------------------------------------------------- /docs/create_new_filter.md: -------------------------------------------------------------------------------- 1 | # Extending llm-d-inference-scheduler with a custom filter 2 | 3 | ## Goal 4 | 5 | This tutorial outlines the steps needed for creating and hooking a new filter 6 | for the llm-d-inference-scheduler. 7 | 8 | The tutorial demonstrates the coding of a new filter, which selects inference 9 | serving Pods based on their labels. All relevant code is contained in the 10 | [`by_labels.go`](https://github.com/llm-d/llm-d-inference-scheduler/blob/main/pkg/scheduling/plugins/filter/by_labels.go) file. 11 | 12 | ## Introduction to filtering 13 | 14 | Plugins are used to modify llm-d-inference-scheduler's default behavior. Filter plugins 15 | are provided with a list of candidate inference serving Pods and filter out the 16 | Pods which do not match the filtering criteria. Several filtering plugins can 17 | run in succession to produce the final candidate list which is then evaluated, 18 | through the process of _scoring_, to select the most appropriate target Pods. 
19 | While llm-d-inference-scheduler comes with several existing filters and 20 | more are available in the upstream [Gateway API Inference Extension](https://sigs.k8s.io/gateway-api-inference-extension), 21 | in some cases it may be desirable to create and deploy custom filtering code to 22 | match your specific requirements. 23 | 24 | The filters' main operating function is 25 | 26 | ```go 27 | func Filter(*types.SchedulingContext, []types.Pod) []types.Pod 28 | ``` 29 | 30 | The `Filter` function accepts a `SchedulingContext` (e.g., containing the 31 | incoming LLM request) and an array of `Pod` objects as potential targets. Each `Pod` 32 | entry includes relevant inference metrics and attributes which can be used 33 | to make scheduling decisions. The function returns a (possibly smaller) array 34 | of `Pod`s which satisfy the filtering criteria. 35 | 36 | ## Code walkthrough 37 | 38 | The top of the file has the expected Go package and import statements: 39 | 40 | ```go 41 | package filter 42 | 43 | import ( 44 | "errors" 45 | 46 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 47 | "k8s.io/apimachinery/pkg/labels" 48 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 49 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 50 | ) 51 | ``` 52 | 53 | Specifically, we import the Kubernetes `meta/v1` and `labels` packages to allow 54 | defining and using `label.Selector` objects, and the Gateway API Inference 55 | Extension's `plugin` (defining the plugin interfaces) and `types` (defining 56 | scheduling related objects) packages. 57 | 58 | Next we define the `ByLabels` struct type, along with the relevant fields, 59 | and a constructor function. 
60 | 61 | ```go 62 | // ByLabels filters out pods that do not match its label selector criteria 63 | type ByLabels struct { 64 | name string 65 | selector labels.Selector 66 | } 67 | 68 | var _ plugins.Filter = &ByLabels{} // validate interface conformance 69 | 70 | // NewByLabel returns a new filter instance, configured with the provided 71 | // name and label selector. 72 | func NewByLabel(name string, selector *metav1.LabelSelector) (plugins.Filter, error) { 73 | if name == "" { 74 | return nil, errors.New("ByLabels: missing filter name") 75 | } 76 | labelSelector, err := metav1.LabelSelectorAsSelector(selector) 77 | if err != nil { 78 | return nil, err 79 | } 80 | 81 | return &ByLabels{ 82 | name: name, 83 | selector: labelSelector, 84 | }, nil 85 | } 86 | ``` 87 | 88 | > Note that, since Go supports "duck typing", the`plugin` package is 89 | not strictly required. We use it to validate `ByLabels` interface conformance 90 | (a pattern known as "interface implementation assertion" or "compile-time 91 | interface" check). The statement asserts at compile time that `ByLabels` 92 | implements the `plugins.Filter` interface and is useful for catching errors 93 | early, especially when refactoring (e.g. interface methods or signatures change). 
94 | 95 | Next, we define the required `plugins.Filter` interface methods: 96 | 97 | ```go 98 | // Name returns the name of the filter 99 | func (blf *ByLabels) Name() string { 100 | return blf.name 101 | } 102 | 103 | // Filter filters out all pods that do not satisfy the label selector 104 | func (blf *ByLabels) Filter(_ *types.SchedulingContext, pods []types.Pod) []types.Pod { 105 | filtered := []types.Pod{} 106 | 107 | for _, pod := range pods { 108 | labels := labels.Set(pod.GetPod().Labels) 109 | if blf.selector.Matches(labels) { 110 | filtered = append(filtered, pod) 111 | } 112 | } 113 | return filtered 114 | } 115 | ``` 116 | 117 | Since the filter is only matching on candidate `types.Pod` labels, 118 | we leave the `types.SchedullingContext` parameter unnamed. Filters 119 | that need access to LLM request information (e.g., filtering based 120 | on prompt length) may use it. 121 | 122 | ## Hooking the filter into the scheduling flow 123 | 124 | Once a filter is defined, it can be used to modify llm-d-inference-scheduler 125 | configuration. This would typically be done by modifying the 126 | `pkg/config/config.go` file to 127 | 128 | - Add the relevant import path (if defined outside this repository); 129 | - Add any desired configuration knobs (e.g., environment variables); and 130 | - Listing the new filter in the `LoadConfig()` function's `cfg.loadPluginInfo` 131 | list of available plugins. 132 | 133 | In the case of the llm-d-inference-scheduler, filters can be hooked into the 134 | `Prefill` and/or `Decode` scheduling cycles. For example, the following snippet 135 | adds the `ByLabels` filter to the list of plugins available to the `Decode` 136 | scheduler (assuming a `ByLabelFilterName` constant is defined along with other 137 | environment variables): 138 | 139 | ```go 140 | func (c *Config) LoadConfig() { 141 | c.loadPluginInfo(c.DecodeSchedulerPlugins, false, 142 | KVCacheScorerName, ..., ByLabelFilterName, ... 
) 143 | c.loadPluginInfo(c.PrefillSchedulerPlugins, true, ... ) 144 | // ... 145 | } 146 | ``` 147 | 148 | > Note: a real filter would require unit tests, etc. These are left out to 149 | keep the tutorial short and focused. 150 | 151 | ## Next steps 152 | 153 | If you have an idea for a new `Filter` (or other) plugin - we'd love to hear 154 | from you! Please open an [issue](https://github.com/llm-d/llm-d-inference-scheduler/issues/new/choose), 155 | describing your use case and requirements, and we'll reach out to refine 156 | and collaborate. 157 | -------------------------------------------------------------------------------- /docs/dp.md: -------------------------------------------------------------------------------- 1 | # Disaggregated Prefill/Decode Inference Serving in llm-d 2 | 3 | ## Overview 4 | 5 | This document describes the architecture and request lifecycle for enabling **disaggregated prefill and decode (P/D)** inference execution in the llm-d router. The architecture aims to improve flexibility, scalability, and performance by enabling separation of prefill and decode stages onto different workers. 6 | 7 | This evolved version removes the requirement for sidecars on the **prefill node**, simplifying deployment while maintaining orchestration from the **decode node**. 
8 | 9 | --- 10 | 11 | ## Goals 12 | 13 | - Enable routing of prefill and decode to different pods 14 | - Maintain low latency and high throughput 15 | - Improve resource utilization by specializing pods for prefill or decode 16 | - Align with GIE-compatible architectures for potential upstreaming 17 | 18 | --- 19 | 20 | ## Key Components 21 | 22 | | Component | Role | 23 | |----------------------|----------------------------------------------------------------------| 24 | | **Prefill Worker** | Handles only prefill stage using vLLM engine | 25 | | **Decode Worker** | Handles decode stage and contains the sidecar for coordination | 26 | | **Sidecar (Decode)** | Orchestrates communication with prefill worker and manages lifecycle | 27 | | **Envoy Proxy** | Accepts OpenAI-style requests and forwards them to EPP | 28 | | **EPP** | End Point Picker, makes scheduling decisions | 29 | 30 | --- 31 | 32 | ## Request Lifecycle 33 | 34 | 1. **User Request** 35 | - Sent via OpenAI API to the Envoy Proxy 36 | 37 | 2. **EPP Scheduling Decision** 38 | - EPP evaluates: 39 | - Prompt length 40 | - KV cache hit probability 41 | - System and pod load 42 | - Selects either: 43 | - **Single node** path (decode handles all) 44 | - **Split node** path (distinct prefill and decode workers) 45 | - Returns Decode Worker (always), and optionally Prefill Worker URL 46 | 47 | 3. **Execution** 48 | - Request lands on Decode Worker (as selected by EPP) 49 | - Decode sidecar coordinates: 50 | - If `prefill_worker_id == nil`, runs both stages locally by passing request to local vllm 51 | - If split: 52 | - Sends prefill job to Prefill Worker with a special header `do_remote_decode=true` 53 | - Upon receiving response from Prefill Worker runs decode stage 54 | 55 | 4. 
**Response Flow** 56 | - Response flows from decode sidecar → Envoy → EPP → User 57 | 58 | --- 59 | 60 | ## Architectural Details 61 | 62 | ### Sidecar Responsibilities (Decode Only) 63 | 64 | - Receives EPP metadata (decode pod, optional prefill pod) 65 | - Sends request to prefill 66 | - Waits and validates result 67 | - Launches local decode job 68 | - Sends final response 69 | 70 | > **Note**: No sidecar or coordination logic is needed on the prefill node. 71 | 72 | --- 73 | 74 | ## Worker Selection Logic 75 | 76 | - **Decode Worker**: 77 | - Prefer longest prefix match / KV cache utilization (depends on available scorers) 78 | 79 | - **Prefill Worker**: 80 | - High prefix-cache hit rate 81 | - Low load 82 | 83 | > **Skip prefill worker** when: 84 | > - Prefix match/kv cache hit is high 85 | > - Prompt is very short 86 | 87 | --- 88 | 89 | ## vLLM and LMCache Integration 90 | 91 | - **vLLM changes** (or wrapper APIs): 92 | - `save()`, `load()` APIs 93 | - `done_sending`, `done_receiving` 94 | - Connector API supporting async transfer 95 | 96 | --- 97 | 98 | ## Drawbacks & Limitations 99 | 100 | - Slight increase in TTFT for split P/D 101 | - Possibility of stranded memory on prefill crash 102 | - Need for timeout and retry logic 103 | 104 | --- 105 | 106 | ## Design Benefits 107 | 108 | - **Flexibility**: Enables per-request specialization and resource balancing 109 | - **Scalability**: Clean separation of concerns for easier ops and tuning 110 | - **Upstream-ready**: Follows GIE-compatible request handling 111 | - **Minimal Changes**: Only decode node includes orchestration sidecar 112 | 113 | --- 114 | 115 | ## Future Considerations 116 | 117 | - Cache coordination 118 | - Pre-allocation of KV blocks in decode node, push cache from prefill to decode worker during calculation 119 | 120 | --- 121 | 122 | ## Diagram 123 | 124 | ![Disaggregated Prefill/Decode Architecture](./images/dp_architecture.png) 125 | 126 | --- 127 | 128 | ## References 129 | 
-------------------------------------------------------------------------------- /docs/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llm-d/llm-d-inference-scheduler/f6c57c520c3fb9fc89735da8469ddad847074273/docs/images/architecture.png -------------------------------------------------------------------------------- /docs/images/dp_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llm-d/llm-d-inference-scheduler/f6c57c520c3fb9fc89735da8469ddad847074273/docs/images/dp_architecture.png -------------------------------------------------------------------------------- /docs/images/plugability.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llm-d/llm-d-inference-scheduler/f6c57c520c3fb9fc89735da8469ddad847074273/docs/images/plugability.png -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/llm-d/llm-d-inference-scheduler 2 | 3 | go 1.24.1 4 | 5 | toolchain go1.24.2 6 | 7 | require ( 8 | github.com/cespare/xxhash/v2 v2.3.0 9 | github.com/envoyproxy/go-control-plane/envoy v1.32.4 10 | github.com/go-logr/logr v1.4.2 11 | github.com/google/go-cmp v0.7.0 12 | github.com/hashicorp/golang-lru/v2 v2.0.7 13 | github.com/llm-d/llm-d-kv-cache-manager v0.1.0 14 | github.com/prometheus/client_golang v1.22.0 15 | github.com/stretchr/testify v1.10.0 16 | go.uber.org/zap v1.27.0 17 | google.golang.org/grpc v1.72.0 18 | k8s.io/apimachinery v0.33.1 19 | k8s.io/client-go v0.32.5 20 | k8s.io/component-base v0.32.5 21 | sigs.k8s.io/controller-runtime v0.20.4 22 | sigs.k8s.io/gateway-api v1.3.0 23 | sigs.k8s.io/gateway-api-inference-extension v0.0.0-20250521193836-a5bf0acd13cc 24 | ) 25 | 26 | require ( 27 | 
cel.dev/expr v0.20.0 // indirect 28 | github.com/antlr4-go/antlr/v4 v4.13.0 // indirect 29 | github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect 30 | github.com/beorn7/perks v1.0.1 // indirect 31 | github.com/blang/semver/v4 v4.0.0 // indirect 32 | github.com/cenkalti/backoff/v4 v4.3.0 // indirect 33 | github.com/cncf/xds/go v0.0.0-20250121191232-2f005788dc42 // indirect 34 | github.com/daulet/tokenizers v1.20.2 // indirect 35 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 36 | github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect 37 | github.com/emicklei/go-restful/v3 v3.12.0 // indirect 38 | github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect 39 | github.com/evanphx/json-patch/v5 v5.9.11 // indirect 40 | github.com/felixge/httpsnoop v1.0.4 // indirect 41 | github.com/fsnotify/fsnotify v1.7.0 // indirect 42 | github.com/fxamacker/cbor/v2 v2.7.0 // indirect 43 | github.com/go-logr/stdr v1.2.2 // indirect 44 | github.com/go-logr/zapr v1.3.0 // indirect 45 | github.com/go-openapi/jsonpointer v0.21.0 // indirect 46 | github.com/go-openapi/jsonreference v0.21.0 // indirect 47 | github.com/go-openapi/swag v0.23.0 // indirect 48 | github.com/gogo/protobuf v1.3.2 // indirect 49 | github.com/golang/protobuf v1.5.4 // indirect 50 | github.com/google/btree v1.1.3 // indirect 51 | github.com/google/cel-go v0.22.0 // indirect 52 | github.com/google/gnostic-models v0.6.9 // indirect 53 | github.com/google/uuid v1.6.0 // indirect 54 | github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 // indirect 55 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 56 | github.com/josharian/intern v1.0.0 // indirect 57 | github.com/json-iterator/go v1.1.12 // indirect 58 | github.com/mailru/easyjson v0.7.7 // indirect 59 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 60 | github.com/modern-go/reflect2 v1.0.2 // indirect 61 | github.com/munnerz/goautoneg 
v0.0.0-20191010083416-a7dc8b61c822 // indirect 62 | github.com/pkg/errors v0.9.1 // indirect 63 | github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect 64 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect 65 | github.com/prometheus/client_model v0.6.2 // indirect 66 | github.com/prometheus/common v0.64.0 // indirect 67 | github.com/prometheus/procfs v0.15.1 // indirect 68 | github.com/redis/go-redis/v9 v9.7.3 // indirect 69 | github.com/spf13/cobra v1.9.1 // indirect 70 | github.com/spf13/pflag v1.0.6 // indirect 71 | github.com/stoewer/go-strcase v1.3.0 // indirect 72 | github.com/x448/float16 v0.8.4 // indirect 73 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect 74 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect 75 | go.opentelemetry.io/otel v1.34.0 // indirect 76 | go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect 77 | go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 // indirect 78 | go.opentelemetry.io/otel/metric v1.34.0 // indirect 79 | go.opentelemetry.io/otel/sdk v1.34.0 // indirect 80 | go.opentelemetry.io/otel/trace v1.34.0 // indirect 81 | go.opentelemetry.io/proto/otlp v1.4.0 // indirect 82 | go.uber.org/multierr v1.11.0 // indirect 83 | golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect 84 | golang.org/x/net v0.40.0 // indirect 85 | golang.org/x/oauth2 v0.30.0 // indirect 86 | golang.org/x/sync v0.14.0 // indirect 87 | golang.org/x/sys v0.33.0 // indirect 88 | golang.org/x/term v0.32.0 // indirect 89 | golang.org/x/text v0.25.0 // indirect 90 | golang.org/x/time v0.9.0 // indirect 91 | gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect 92 | google.golang.org/genproto/googleapis/api v0.0.0-20250218202821-56aae31c358a // indirect 93 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250428153025-10db94c68c34 // indirect 94 | google.golang.org/protobuf v1.36.6 // indirect 95 | gopkg.in/evanphx/json-patch.v4 
v4.12.0 // indirect 96 | gopkg.in/inf.v0 v0.9.1 // indirect 97 | gopkg.in/yaml.v3 v3.0.1 // indirect 98 | k8s.io/api v0.32.5 // indirect 99 | k8s.io/apiextensions-apiserver v0.32.5 // indirect 100 | k8s.io/apiserver v0.32.5 // indirect 101 | k8s.io/klog/v2 v2.130.1 // indirect 102 | k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect 103 | k8s.io/utils v0.0.0-20241210054802-24370beab758 // indirect 104 | sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 // indirect 105 | sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect 106 | sigs.k8s.io/randfill v1.0.0 // indirect 107 | sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect 108 | sigs.k8s.io/yaml v1.4.0 // indirect 109 | ) 110 | -------------------------------------------------------------------------------- /hooks/pre-commit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | echo "▶️ Running lint…" 5 | make lint 6 | 7 | echo "▶️ Running tests…" 8 | make test 9 | 10 | echo "✔️ All checks passed!" 11 | -------------------------------------------------------------------------------- /internal/controller/runnable/grpc.go: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is adapted from Gateway API Inference Extension 3 | * Original source: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/internal/runnable/grpc.go 4 | * Licensed under the Apache License, Version 2.0 5 | */ 6 | 7 | // Package runnable contains tooling to manage and convert manager.Runnable 8 | // objects for controllers. 9 | package runnable 10 | 11 | import ( 12 | "context" 13 | "fmt" 14 | "net" 15 | 16 | "google.golang.org/grpc" 17 | 18 | ctrl "sigs.k8s.io/controller-runtime" 19 | "sigs.k8s.io/controller-runtime/pkg/manager" 20 | ) 21 | 22 | // GRPCServer promotes the provided grpc.Server to a manager.Runnable. 
23 | func GRPCServer(name string, srv *grpc.Server, port int) manager.Runnable { 24 | return manager.RunnableFunc(func(ctx context.Context) error { 25 | log := ctrl.Log.WithValues("name", name) 26 | log.Info("gRPC server starting") 27 | 28 | listener, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) 29 | if err != nil { 30 | log.Error(err, "gRPC server failed to listen", "port", port) 31 | return err 32 | } 33 | 34 | log.Info("gRPC server listening", "port", port) 35 | 36 | doneCh := make(chan struct{}) 37 | defer close(doneCh) 38 | go func() { 39 | select { 40 | case <-ctx.Done(): 41 | log.Info("gRPC server shutting down") 42 | srv.GracefulStop() 43 | case <-doneCh: 44 | } 45 | }() 46 | 47 | if err := srv.Serve(listener); err != nil && err != grpc.ErrServerStopped { 48 | log.Error(err, "gRPC server failed") 49 | return err 50 | } 51 | 52 | log.Info("gRPC server terminated") 53 | 54 | return nil 55 | }) 56 | } 57 | -------------------------------------------------------------------------------- /internal/controller/runnable/leader_election.go: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is adapted from Gateway API Inference Extension 3 | * Original source: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/internal/runnable/leader_election.go 4 | * Licensed under the Apache License, Version 2.0 5 | */ 6 | 7 | package runnable 8 | 9 | import ( 10 | "sigs.k8s.io/controller-runtime/pkg/manager" 11 | ) 12 | 13 | // LeaderElection enables or disables leader election for the provided manager.Runnable. 14 | func LeaderElection(runnable manager.Runnable, needsLeaderElection bool) manager.Runnable { 15 | return &leaderElection{ 16 | Runnable: runnable, 17 | needsLeaderElection: needsLeaderElection, 18 | } 19 | } 20 | 21 | // RequireLeaderElection enables leader election for the provided manager.Runnable. 
22 | func RequireLeaderElection(runnable manager.Runnable) manager.Runnable { 23 | return LeaderElection(runnable, true) 24 | } 25 | 26 | // NoLeaderElection disables leader election for the provided manager.Runnable. 27 | func NoLeaderElection(runnable manager.Runnable) manager.Runnable { 28 | return LeaderElection(runnable, false) 29 | } 30 | 31 | // leaderElection is a wrapped manager.Runnable with configuration for enabling 32 | // or disabling leader election. 33 | type leaderElection struct { 34 | manager.Runnable 35 | needsLeaderElection bool 36 | } 37 | 38 | // NeedLeaderElection indicates whether or not leader election is enabled. 39 | func (r *leaderElection) NeedLeaderElection() bool { 40 | return r.needsLeaderElection 41 | } 42 | -------------------------------------------------------------------------------- /internal/controller/tls/tls.go: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is adapted from Gateway API Inference Extension 3 | * Original source: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/internal/tls/tls.go 4 | * Licensed under the Apache License, Version 2.0 5 | */ 6 | 7 | // Package tls includes tooling for handling TLS certificates for controllers. 8 | package tls 9 | 10 | import ( 11 | "crypto/rand" 12 | "crypto/rsa" 13 | "crypto/tls" 14 | "crypto/x509" 15 | "crypto/x509/pkix" 16 | "encoding/pem" 17 | "fmt" 18 | "math/big" 19 | "time" 20 | ) 21 | 22 | // CreateSelfSignedTLSCertificate generates a self-signed certificate. 
// CreateSelfSignedTLSCertificate generates a self-signed RSA (4096-bit)
// certificate, valid for one year from the time of the call and marked for
// server authentication, and returns it as a ready-to-use tls.Certificate.
func CreateSelfSignedTLSCertificate() (tls.Certificate, error) {
	// Random 128-bit serial number.
	serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128)
	serialNumber, err := rand.Int(rand.Reader, serialNumberLimit)
	if err != nil {
		// Use %w (not %v) so callers can unwrap with errors.Is/errors.As.
		return tls.Certificate{}, fmt.Errorf("error creating serial number: %w", err)
	}

	now := time.Now()
	template := x509.Certificate{
		SerialNumber: serialNumber,
		Subject: pkix.Name{
			Organization: []string{"Inference Ext"},
		},
		NotBefore:             now.UTC(),
		NotAfter:              now.Add(time.Hour * 24 * 365).UTC(), // one year
		KeyUsage:              x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
		ExtKeyUsage:           []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
		BasicConstraintsValid: true,
	}

	priv, err := rsa.GenerateKey(rand.Reader, 4096)
	if err != nil {
		return tls.Certificate{}, fmt.Errorf("error generating key: %w", err)
	}

	// Self-signed: the template acts as both subject and issuer.
	derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &priv.PublicKey, priv)
	if err != nil {
		return tls.Certificate{}, fmt.Errorf("error creating certificate: %w", err)
	}

	certBytes := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes})

	privBytes, err := x509.MarshalPKCS8PrivateKey(priv)
	if err != nil {
		return tls.Certificate{}, fmt.Errorf("error marshalling private key: %w", err)
	}
	keyBytes := pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: privBytes})

	return tls.X509KeyPair(certBytes, keyBytes)
}
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" 10 | ) 11 | 12 | const ( 13 | // For every plugin named below, there are four environment variables. They are: 14 | // - "ENABLE_" + pluginName Enables the named plugin for decode processing 15 | // - pluginName + "_WEIGHT" The weight for a scorer in decode processing 16 | // - "PREFILL_ENABLE_" + pluginName Enables the named plugin for prefill processing 17 | // - "PREFILL_" + pluginName + "_WEIGHT" The weight for a scorer in prefill processing 18 | 19 | // KVCacheScorerName name of the kv-cache scorer in configuration 20 | KVCacheScorerName = "KVCACHE_AWARE_SCORER" 21 | // LoadAwareScorerName name of the load aware scorer in configuration 22 | LoadAwareScorerName = "LOAD_AWARE_SCORER" 23 | // PrefixScorerName name of the prefix scorer in configuration 24 | PrefixScorerName = "PREFIX_AWARE_SCORER" 25 | // SessionAwareScorerName name of the session aware scorer in configuration 26 | SessionAwareScorerName = "SESSION_AWARE_SCORER" 27 | 28 | prefillPrefix = "PREFILL_" 29 | enablePrefix = "ENABLE_" 30 | weightSuffix = "_WEIGHT" 31 | 32 | // Plugins from Upstream 33 | 34 | // GIELeastKVCacheFilterName name of the GIE least kv-cache filter in configuration 35 | GIELeastKVCacheFilterName = "GIE_LEAST_KVCACHE_FILTER" 36 | // GIELeastQueueFilterName name of the GIE least queue filter in configuration 37 | GIELeastQueueFilterName = "GIE_LEAST_QUEUE_FILTER" 38 | // GIELoraAffinityFilterName name of the GIE LoRA affinity filter in configuration 39 | GIELoraAffinityFilterName = "GIE_LORA_AFFINITY_FILTER" 40 | // GIELowQueueFilterName name of the GIE low queue filter in configuration 41 | GIELowQueueFilterName = "GIE_LOW_QUEUE_FILTER" 42 | // GIESheddableCapacityFilterName name of the GIE sheddable capacity filter in configuration 43 | GIESheddableCapacityFilterName = "GIE_SHEDDABLE_CAPACITY_FILTER" 44 | // GIEKVCacheUtilizationScorerName name of the GIE kv-cache utilization scorer in configuration 45 | 
// Config contains scheduler configuration; currently the configuration is
// loaded from environment variables (see LoadConfig).
type Config struct {
	logger logr.Logger
	// DecodeSchedulerPlugins maps each enabled plugin name to its weight for
	// decode scheduling.
	DecodeSchedulerPlugins map[string]int
	// PrefillSchedulerPlugins maps each enabled plugin name to its weight for
	// prefill scheduling.
	PrefillSchedulerPlugins map[string]int

	// PDEnabled reports whether disaggregated prefill/decode is enabled.
	PDEnabled bool
	// PDThreshold is the prompt-length threshold used to decide whether a
	// separate prefill step is warranted.
	PDThreshold int
	// PrefixBlockSize is the block size used by the prefix scorer.
	PrefixBlockSize int
}

// NewConfig creates a new instance of Config with defaults: no plugins
// enabled, P/D disabled, and an effectively-infinite P/D threshold.
func NewConfig(logger logr.Logger) *Config {
	return &Config{
		logger:                  logger,
		DecodeSchedulerPlugins:  map[string]int{},
		PrefillSchedulerPlugins: map[string]int{},
		PDEnabled:               false,
		PDThreshold:             math.MaxInt,
		PrefixBlockSize:         prefixScorerBlockSizeDefault,
	}
}
GIESheddableCapacityFilterName, 94 | GIEKVCacheUtilizationScorerName, GIEQueueScorerName, GIEPrefixScorerName) 95 | 96 | c.PDEnabled = env.GetEnvString(pdEnabledEnvKey, "false", c.logger) == "true" 97 | c.PDThreshold = env.GetEnvInt(pdPromptLenThresholdEnvKey, pdPromptLenThresholdDefault, c.logger) 98 | c.PrefixBlockSize = env.GetEnvInt(prefixScorerBlockSizeEnvKey, prefixScorerBlockSizeDefault, c.logger) 99 | } 100 | 101 | func (c *Config) loadPluginInfo(plugins map[string]int, prefill bool, pluginNames ...string) { 102 | for _, pluginName := range pluginNames { 103 | var enablementKey string 104 | var weightKey string 105 | if prefill { 106 | enablementKey = prefillPrefix + enablePrefix + pluginName 107 | weightKey = prefillPrefix + pluginName + weightSuffix 108 | } else { 109 | enablementKey = enablePrefix + pluginName 110 | weightKey = pluginName + weightSuffix 111 | } 112 | 113 | if env.GetEnvString(enablementKey, "false", c.logger) != "true" { 114 | c.logger.Info("Skipping plugin creation as it is not enabled", "name", pluginName) 115 | } else { 116 | weight := env.GetEnvInt(weightKey, 1, c.logger) 117 | 118 | plugins[pluginName] = weight 119 | c.logger.Info("Initialized plugin", "plugin", pluginName, "weight", weight) 120 | } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /pkg/scheduling/dual/scheduler.go: -------------------------------------------------------------------------------- 1 | // Package dual provides a sample Scheduler that internally uses 2 | // a dual scheduler construct (primary and secondary). 
3 | package dual 4 | 5 | import ( 6 | "context" 7 | "fmt" 8 | "math/rand" 9 | "time" 10 | 11 | "sigs.k8s.io/controller-runtime/pkg/log" 12 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" 13 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" 14 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol" 15 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" 16 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" 17 | giescorer "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" 18 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 19 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 20 | 21 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/filter" 22 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/scorer" 23 | ) 24 | 25 | // Scheduler implements the dual scheduler concept, along with a threshold 26 | // determining when each is invoked. 27 | type Scheduler struct { 28 | threshold float32 29 | store datastore.Datastore 30 | primary requestcontrol.Scheduler 31 | secondary requestcontrol.Scheduler 32 | } 33 | 34 | // NewScheduler create a new scheduler with the given datastore and threshold 35 | func NewScheduler(threshold float32, datastore datastore.Datastore) *Scheduler { 36 | scheduler := &Scheduler{ 37 | threshold: threshold, 38 | store: datastore, 39 | } 40 | 41 | scheduler.primary = scheduling.NewSchedulerWithConfig(datastore, scheduling.NewSchedulerConfig(). 42 | WithFilters(&filter.Passthrough{}). 43 | WithScorers(giescorer.NewWeightedScorer(&scorer.Passthrough{}, 10)). 44 | WithPicker(picker.NewMaxScorePicker())) 45 | 46 | scheduler.secondary = scheduling.NewSchedulerWithConfig(datastore, scheduling.NewSchedulerConfig(). 47 | WithFilters(&filter.Random{}). 48 | WithScorers(giescorer.NewWeightedScorer(&scorer.Random{}, 10)). 
49 | WithPicker(picker.NewRandomPicker())) 50 | 51 | return scheduler 52 | } 53 | 54 | // Schedule selects a Pod for the given request and context 55 | func (s *Scheduler) Schedule(ctx context.Context, req *types.LLMRequest) (*types.Result, error) { 56 | logger := log.FromContext(ctx).WithName("PD-scheduler").WithValues("request", req) 57 | debugLog := logger.V(logutil.DEBUG) 58 | 59 | scheduleStart := time.Now() 60 | defer func() { 61 | metrics.RecordSchedulerE2ELatency(time.Since(scheduleStart)) 62 | }() 63 | 64 | if rand.Float32() > s.threshold { // choose a primary only 65 | return s.primary.Schedule(ctx, req) 66 | } 67 | 68 | primary, err := s.primary.Schedule(ctx, req) 69 | if err != nil { 70 | return nil, err 71 | } 72 | debugLog.Info(fmt.Sprintf("Primary scheduler selected %+v", primary)) 73 | 74 | // TODO: this is demo behavior we need to replace once we know what we want. 75 | if rand.Float32() < s.threshold { // choose a secondary as well 76 | secondary, err := s.secondary.Schedule(ctx, req) 77 | if err != nil { 78 | debugLog.Info(fmt.Sprintf("Secondary scheduler failed %+v, returning primary", err)) 79 | } 80 | debugLog.Info(fmt.Sprintf("Secondary scheduler selected %+v", secondary)) 81 | if rand.Float32() < s.threshold { // lucky again: return the secondary 82 | return secondary, nil 83 | } 84 | } 85 | return primary, nil 86 | } 87 | -------------------------------------------------------------------------------- /pkg/scheduling/pd/doc.go: -------------------------------------------------------------------------------- 1 | // Package pd implements disaggregated Prefill/Decode scheduling 2 | package pd 3 | -------------------------------------------------------------------------------- /pkg/scheduling/pd/scheduler.go: -------------------------------------------------------------------------------- 1 | package pd 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "time" 8 | 9 | "sigs.k8s.io/controller-runtime/pkg/log" 10 | 
"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" 11 | backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" 12 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" 13 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol" 14 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" 15 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 16 | giefilter "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/filter" 17 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/multi/prefix" 18 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" 19 | giescorer "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" 20 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 21 | envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" 22 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 23 | 24 | "github.com/llm-d/llm-d-inference-scheduler/pkg/config" 25 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/filter" 26 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/scorer" 27 | ) 28 | 29 | const ( 30 | // PrefillPodHeader is the HTTP header name used to indicate Prefill worker 31 | PrefillPodHeader = "x-prefiller-url" 32 | ) 33 | 34 | // Scheduler implements the disaggreagted P/D scheduling logic 35 | type Scheduler struct { 36 | threshold int 37 | pdEnabled bool 38 | store Datastore 39 | prefill requestcontrol.Scheduler 40 | decode requestcontrol.Scheduler 41 | 42 | // prefixScorer is a prefix scorer which will be used for decission if prefill step is required 43 | // if pd is enabled, prefix scorers should be the same instance in all: 44 | // prefill scheduler, decode scheduler and prefixScorer 45 | prefixScorer *scorer.PrefixAwareScorer 46 | } 47 | 48 | var _ requestcontrol.Scheduler = 
&Scheduler{} // validate interface conformance 49 | 50 | // Datastore portion used by scheduler 51 | type Datastore interface { 52 | // InferencePool operations 53 | PoolGet() (*v1alpha2.InferencePool, error) 54 | // PodMetrics operations 55 | PodGetAll() []backendmetrics.PodMetrics 56 | } 57 | 58 | // NewScheduler returns a new disaggregated Prefill/Decode filter, using the 59 | // provided configuration. 60 | func NewScheduler(ctx context.Context, schedCfg *config.Config, ds Datastore) (*Scheduler, error) { 61 | prefixConfig := scorer.DefaultPrefixStoreConfig() 62 | prefixConfig.BlockSize = schedCfg.PrefixBlockSize 63 | 64 | scheduler := &Scheduler{ 65 | threshold: schedCfg.PDThreshold, 66 | pdEnabled: schedCfg.PDEnabled, 67 | store: ds, 68 | prefixScorer: scorer.NewPrefixAwareScorer(ctx, prefixConfig), 69 | } 70 | 71 | scheduler.prefill = scheduling.NewSchedulerWithConfig( 72 | ds, 73 | scheduler.generateSchedulerConfig(ctx, schedCfg.PrefillSchedulerPlugins, 74 | &filter.PrefillFilter{}), 75 | ) 76 | 77 | scheduler.decode = scheduling.NewSchedulerWithConfig( 78 | ds, 79 | scheduler.generateSchedulerConfig(ctx, schedCfg.DecodeSchedulerPlugins, 80 | &filter.DecodeFilter{}), 81 | ) 82 | 83 | return scheduler, nil 84 | } 85 | 86 | // Schedule uses (up to) two internal schedulers to process requests. 87 | // If the request prompt is short (as defined by the configured threshold) 88 | // the scheduler use the default behavior ("Decode scheduler"). 89 | // If the request prompt is long enough to warrant disaggregated prefill-decode, 90 | // both the Prefill and Decode schedulers are invoked. In the case of the 91 | // Prefill scheduler, the selected Pod's URL is saved in a header 92 | // and communicated back to the inference gateway. 
93 | func (s *Scheduler) Schedule(ctx context.Context, req *types.LLMRequest) (*types.Result, error) { 94 | logger := log.FromContext(ctx).WithName("PD").WithValues("request", req) 95 | debugLog := logger.V(logutil.DEBUG) 96 | 97 | scheduleStart := time.Now() 98 | defer func() { 99 | metrics.RecordSchedulerE2ELatency(time.Since(scheduleStart)) 100 | }() 101 | 102 | if !s.pdEnabled { 103 | debugLog.Info("Disagregated prefill/decode disabled - scheduling to decode worker only") 104 | return s.decode.Schedule(ctx, req) 105 | } 106 | 107 | // find the best pod for decode 108 | // assumes that prefix scorer was activated 109 | decodeRes, err := s.decode.Schedule(ctx, req) 110 | 111 | if decodeRes == nil || decodeRes.TargetPod == nil { 112 | logger.Info("No decode pod found, skipping scheduling") 113 | return nil, errors.New("no decode pod found") 114 | } 115 | 116 | // if the request is short enough, use the default scheduler 117 | hitPercentage := s.prefixScorer.GetCachedPercentage(decodeRes.TargetPod.GetPod().NamespacedName.String(), req.Prompt) 118 | if (1.0-hitPercentage)*float64(len(req.Prompt)) < float64(s.threshold) { 119 | logger.Info("Non-cached suffix is smaller than threshold, using decode scheduler", 120 | "hitPercentage", hitPercentage) 121 | return decodeRes, err 122 | } 123 | 124 | logger.Info("Non-cached suffix is larger than threshold, using PD scheduler", 125 | "hitPercentage", hitPercentage) 126 | prefillRes, prefillErr := s.prefill.Schedule(ctx, req) 127 | 128 | if prefillErr == nil && prefillRes.TargetPod != nil { // record the prefill worker 129 | pool, err := s.store.PoolGet() 130 | if err != nil { 131 | debugLog.Error(err, "Get inference pool failed - scheduling to decode worker only") 132 | return s.decode.Schedule(ctx, req) 133 | } 134 | 135 | // TODO: should the scheme be conifgurable (e.g., https://)? 
136 | prefillURL := fmt.Sprintf("http://%s:%d", prefillRes.TargetPod.GetPod().Address, pool.Spec.TargetPortNumber) 137 | if req.Headers == nil { // TODO should always be populated? 138 | req.Headers = make(map[string]string) 139 | } 140 | req.Headers[PrefillPodHeader] = prefillURL 141 | } 142 | 143 | debugLog.Info("Scheduling to separate Prefill and Decode workers") 144 | 145 | return decodeRes, nil // decode pod 146 | } 147 | 148 | // OnResponse normally processes all LLMResponses - forwards all responses to the decode scheduler 149 | func (s *Scheduler) OnResponse(ctx context.Context, resp *types.LLMResponse, targetPodName string) { 150 | // prefill scheduler will never get OnReponse, need to take care of plugin, issue #97 151 | s.decode.OnResponse(ctx, resp, targetPodName) 152 | } 153 | 154 | func (s *Scheduler) pluginsFromConfig(ctx context.Context, pluginsConfig map[string]int) map[plugins.Plugin]int { 155 | logger := log.FromContext(ctx) 156 | 157 | plugins := map[plugins.Plugin]int{} 158 | prefixWasAdded := false 159 | 160 | for pluginName, pluginWeight := range pluginsConfig { 161 | switch pluginName { 162 | case config.KVCacheScorerName: 163 | scorer, err := scorer.NewKVCacheAwareScorer(ctx) 164 | if err == nil { 165 | plugins[scorer] = pluginWeight 166 | } else { 167 | logger.Error(err, "KVCache scorer creation failed") 168 | } 169 | case config.LoadAwareScorerName: 170 | plugins[scorer.NewLoadAwareScorer(ctx)] = pluginWeight 171 | case config.PrefixScorerName: 172 | // TODO - create config? based on what? 
- issue #55 173 | // use the same instance 174 | plugins[s.prefixScorer] = pluginWeight 175 | prefixWasAdded = true 176 | case config.SessionAwareScorerName: 177 | plugins[scorer.NewSessionAffinity()] = pluginWeight 178 | 179 | // Plugins from upstream 180 | 181 | case config.GIELeastKVCacheFilterName: 182 | plugins[giefilter.NewLeastKVCacheFilter()] = pluginWeight 183 | case config.GIELeastQueueFilterName: 184 | plugins[giefilter.NewLeastQueueFilter()] = pluginWeight 185 | case config.GIELoraAffinityFilterName: 186 | plugins[giefilter.NewLoraAffinityFilter()] = pluginWeight 187 | case config.GIELowQueueFilterName: 188 | plugins[giefilter.NewLowQueueFilter()] = pluginWeight 189 | case config.GIESheddableCapacityFilterName: 190 | plugins[giefilter.NewSheddableCapacityFilter()] = pluginWeight 191 | case config.GIEKVCacheUtilizationScorerName: 192 | plugins[&giescorer.KVCacheScorer{}] = pluginWeight 193 | case config.GIEPrefixScorerName: 194 | // For now use the default configuration 195 | prefixConfig := prefix.Config{ 196 | HashBlockSize: envutil.GetEnvInt("PREFIX_CACHE_HASH_BLOCK_SIZE", prefix.DefaultHashBlockSize, logger), 197 | MaxPrefixBlocksToMatch: envutil.GetEnvInt("PREFIX_CACHE_MAX_PREFIX_BLOCKS", prefix.DefaultMaxPrefixBlocks, logger), 198 | LRUIndexerCapacity: envutil.GetEnvInt("PREFIX_CACHE_LRU_CAPACITY", prefix.DefaultLRUIndexerCapacity, logger), 199 | } 200 | plugins[prefix.New(prefixConfig)] = pluginWeight 201 | case config.GIEQueueScorerName: 202 | plugins[&giescorer.QueueScorer{}] = pluginWeight 203 | } 204 | } 205 | 206 | // only in case pd is enabled and prefix scorer was not enabled for decode scheduler 207 | // add prefix scorer to list of all scorers to collect information used for decision if PD should be acrivated 208 | if s.pdEnabled && !prefixWasAdded { 209 | plugins[s.prefixScorer] = 0.0 210 | } 211 | 212 | return plugins 213 | } 214 | 215 | func (s *Scheduler) generateSchedulerConfig(ctx context.Context, pluginsConfig map[string]int, 
extraFilters ...plugins.Filter) *scheduling.SchedulerConfig { 216 | thePlugins := s.pluginsFromConfig(ctx, pluginsConfig) 217 | preSchedulePlugins := []plugins.PreSchedule{} 218 | filters := []plugins.Filter{} 219 | scorers := []*giescorer.WeightedScorer{} 220 | postSchedulePlugins := []plugins.PostSchedule{} 221 | postResponsePlugins := []plugins.PostResponse{} 222 | 223 | filters = append(filters, extraFilters...) 224 | 225 | for plugin, pluginWeight := range thePlugins { 226 | if preSchedule, ok := plugin.(plugins.PreSchedule); ok { 227 | preSchedulePlugins = append(preSchedulePlugins, preSchedule) 228 | } 229 | if filter, ok := plugin.(plugins.Filter); ok { 230 | filters = append(filters, filter) 231 | } 232 | if scorer, ok := plugin.(plugins.Scorer); ok { 233 | scorers = append(scorers, giescorer.NewWeightedScorer(scorer, pluginWeight)) 234 | } 235 | if postSchedule, ok := plugin.(plugins.PostSchedule); ok { 236 | postSchedulePlugins = append(postSchedulePlugins, postSchedule) 237 | } 238 | if postResponse, ok := plugin.(plugins.PostResponse); ok { 239 | postResponsePlugins = append(postResponsePlugins, postResponse) 240 | } 241 | } 242 | 243 | return scheduling.NewSchedulerConfig(). 244 | WithPreSchedulePlugins(preSchedulePlugins...). 245 | WithFilters(filters...). 246 | WithScorers(scorers...). 247 | WithPicker(picker.NewMaxScorePicker()). 248 | WithPostSchedulePlugins(postSchedulePlugins...). 249 | WithPostResponsePlugins(postResponsePlugins...) 
250 | } 251 | -------------------------------------------------------------------------------- /pkg/scheduling/pd/scheduler_test.go: -------------------------------------------------------------------------------- 1 | package pd_test 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | 7 | "github.com/go-logr/logr/testr" 8 | 9 | "github.com/google/go-cmp/cmp" 10 | k8stypes "k8s.io/apimachinery/pkg/types" 11 | "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" 12 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" 13 | backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" // Import config for thresholds 14 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 15 | 16 | "github.com/llm-d/llm-d-inference-scheduler/pkg/config" 17 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/pd" 18 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/filter" 19 | "sigs.k8s.io/controller-runtime/pkg/log" 20 | ) 21 | 22 | // Tests the default scheduler configuration and expected behavior. 
23 | func TestPDSchedule(t *testing.T) { 24 | pod1 := &backendmetrics.FakePodMetrics{ 25 | Pod: &backend.Pod{ 26 | NamespacedName: k8stypes.NamespacedName{Name: "pod1"}, 27 | Address: "1.2.3.4", 28 | Labels: map[string]string{filter.RoleLabel: filter.RolePrefill}, 29 | }, 30 | Metrics: &backendmetrics.MetricsState{}, 31 | } 32 | pod2 := &backendmetrics.FakePodMetrics{ 33 | Pod: &backend.Pod{ 34 | NamespacedName: k8stypes.NamespacedName{Name: "pod2"}, 35 | Address: "5.6.7.8", 36 | Labels: map[string]string{filter.RoleLabel: filter.RoleDecode}, 37 | }, 38 | Metrics: &backendmetrics.MetricsState{}, 39 | } 40 | wantPod2 := &types.PodMetrics{ 41 | Pod: &backend.Pod{ 42 | NamespacedName: k8stypes.NamespacedName{Name: "pod2"}, 43 | Address: "5.6.7.8", 44 | Labels: map[string]string{filter.RoleLabel: filter.RoleDecode}, 45 | }, 46 | MetricsState: &backendmetrics.MetricsState{ 47 | ActiveModels: map[string]int{}, 48 | WaitingModels: map[string]int{}, 49 | }, 50 | } 51 | 52 | tests := []struct { 53 | name string 54 | req *types.LLMRequest 55 | input []*backendmetrics.FakePodMetrics 56 | wantRes *types.Result 57 | wantHeaders map[string]string 58 | unwantedHeaders []string 59 | err bool 60 | }{ 61 | { 62 | name: "no pods in datastore", 63 | req: &types.LLMRequest{ 64 | TargetModel: "any-model", 65 | Critical: true, 66 | Prompt: "12345678901", 67 | }, 68 | input: []*backendmetrics.FakePodMetrics{}, 69 | err: true, 70 | }, 71 | { 72 | name: "one decode pod, long prompt", 73 | req: &types.LLMRequest{ 74 | TargetModel: "critical", 75 | Critical: true, 76 | Prompt: "12345678901", 77 | }, 78 | // pod2 will be picked because it is the only pod with Decode role 79 | input: []*backendmetrics.FakePodMetrics{pod2}, 80 | wantRes: &types.Result{ 81 | TargetPod: &types.ScoredPod{ 82 | Pod: wantPod2, 83 | }, 84 | }, 85 | unwantedHeaders: []string{"x-prefiller-url"}, 86 | }, 87 | { 88 | name: "one prefill pod, long prompt", 89 | req: &types.LLMRequest{ 90 | TargetModel: "critical", 91 | 
Critical: true, 92 | Prompt: "12345678901", 93 | }, 94 | // no Decode pod 95 | input: []*backendmetrics.FakePodMetrics{pod1}, 96 | err: true, 97 | }, 98 | { 99 | name: "1P1D", 100 | req: &types.LLMRequest{ 101 | TargetModel: "critical", 102 | Critical: true, 103 | Prompt: "12345678901", 104 | }, 105 | // pod2 will be picked because it is the decode pod, pod1 IP will be in the header 106 | input: []*backendmetrics.FakePodMetrics{pod1, pod2}, 107 | wantRes: &types.Result{ 108 | TargetPod: &types.ScoredPod{ 109 | Pod: wantPod2, 110 | Score: 0.0, 111 | }, 112 | }, 113 | wantHeaders: map[string]string{"x-prefiller-url": "http://1.2.3.4:80"}, 114 | }, 115 | { 116 | name: "1P1Dshort", 117 | req: &types.LLMRequest{ 118 | TargetModel: "critical", 119 | Critical: true, 120 | Prompt: "123", 121 | }, 122 | // pod2 will be picked because it is the decode pod, pod1 IP should no be in the header, 123 | // because the prompt is too short 124 | input: []*backendmetrics.FakePodMetrics{pod1, pod2}, 125 | wantRes: &types.Result{ 126 | TargetPod: &types.ScoredPod{ 127 | Pod: wantPod2, 128 | Score: 0.0, 129 | }, 130 | }, 131 | unwantedHeaders: []string{"x-prefiller-url"}, 132 | }, 133 | } 134 | 135 | ctx := context.Background() 136 | logger := testr.New(t) 137 | ctx = log.IntoContext(ctx, logger) 138 | 139 | schedCfg := config.NewConfig(logger) 140 | schedCfg.PDEnabled = true 141 | schedCfg.PDThreshold = 5 142 | 143 | for _, test := range tests { 144 | t.Run(test.name, func(t *testing.T) { 145 | scheduler, _ := pd.NewScheduler(ctx, schedCfg, &fakeDataStore{pods: test.input}) 146 | got, err := scheduler.Schedule(ctx, test.req) 147 | 148 | if test.err != (err != nil) { 149 | t.Errorf("Unexpected error, got %v, want %v", err, test.err) 150 | } 151 | 152 | if diff := cmp.Diff(test.wantRes, got); diff != "" { 153 | t.Errorf("Unexpected output (-want +got): %v", diff) 154 | } 155 | 156 | for header, value := range test.wantHeaders { 157 | gotValue, ok := test.req.Headers[header] 158 | if !ok 
{ 159 | t.Errorf("Missing header: %s", header) 160 | } else if gotValue != value { 161 | t.Errorf("Wrong header value for %s: want %s got %s)", header, value, gotValue) 162 | } 163 | } 164 | 165 | for _, header := range test.unwantedHeaders { 166 | if _, exists := test.req.Headers[header]; exists { 167 | t.Errorf("Unwanted header %s exists", header) 168 | } 169 | } 170 | }) 171 | } 172 | } 173 | 174 | // TODO: this is probably better in upstream (e.g., epp/scheduling or epp/scheduling/plugins) 175 | // currently duplicated from pkg/scheduling/plugins/ 176 | type fakeDataStore struct { 177 | pods []*backendmetrics.FakePodMetrics 178 | } 179 | 180 | // PodGetAll returns all pods in the store 181 | func (fds *fakeDataStore) PodGetAll() []backendmetrics.PodMetrics { 182 | pm := make([]backendmetrics.PodMetrics, 0, len(fds.pods)) 183 | for _, pod := range fds.pods { 184 | pm = append(pm, pod) 185 | } 186 | return pm 187 | } 188 | 189 | func (fds *fakeDataStore) PoolGet() (*v1alpha2.InferencePool, error) { 190 | return &v1alpha2.InferencePool{ 191 | Spec: v1alpha2.InferencePoolSpec{ 192 | TargetPortNumber: 80, 193 | }, 194 | }, nil 195 | } 196 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/filter/by_labels.go: -------------------------------------------------------------------------------- 1 | package filter 2 | 3 | import ( 4 | "errors" 5 | 6 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 7 | "k8s.io/apimachinery/pkg/labels" 8 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 9 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 10 | ) 11 | 12 | // ByLabels filters out pods that do not match its label selector criteria 13 | type ByLabels struct { 14 | name string 15 | selector labels.Selector 16 | } 17 | 18 | var _ plugins.Filter = &ByLabels{} // validate interface conformance 19 | 20 | // NewByLabel returns a new filter instance, configured with the provided 21 | 
// name and label selector. 22 | func NewByLabel(name string, selector *metav1.LabelSelector) (plugins.Filter, error) { 23 | if name == "" { 24 | return nil, errors.New("ByLabels: missing filter name") 25 | } 26 | labelSelector, err := metav1.LabelSelectorAsSelector(selector) 27 | if err != nil { 28 | return nil, err 29 | } 30 | 31 | return &ByLabels{ 32 | name: name, 33 | selector: labelSelector, 34 | }, nil 35 | } 36 | 37 | // Name returns the name of the filter 38 | func (blf *ByLabels) Name() string { 39 | return blf.name 40 | } 41 | 42 | // Filter filters out all pods that do not satisfy the label selector 43 | func (blf *ByLabels) Filter(_ *types.SchedulingContext, pods []types.Pod) []types.Pod { 44 | filtered := []types.Pod{} 45 | 46 | for _, pod := range pods { 47 | labels := labels.Set(pod.GetPod().Labels) 48 | if blf.selector.Matches(labels) { 49 | filtered = append(filtered, pod) 50 | } 51 | } 52 | return filtered 53 | } 54 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/filter/passthrough.go: -------------------------------------------------------------------------------- 1 | // Package filter provides filter plugins for the epp. 2 | package filter 3 | 4 | import ( 5 | "fmt" 6 | 7 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 8 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 9 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 10 | ) 11 | 12 | // Passthrough filter type 13 | type Passthrough struct{} 14 | 15 | var _ plugins.Filter = &Passthrough{} 16 | 17 | // Name returns the filter name 18 | func (p *Passthrough) Name() string { 19 | return "passthrough-filter" 20 | } 21 | 22 | // Filter defines the filtering function. 
In this case it is a passthrough 23 | func (p *Passthrough) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { 24 | ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("Passthrough filter called with %d candidates: %+v", 25 | len(pods), pods)) 26 | 27 | return pods 28 | } 29 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/filter/pd_role_filter.go: -------------------------------------------------------------------------------- 1 | package filter 2 | 3 | import ( 4 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 5 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 6 | ) 7 | 8 | const ( 9 | // RoleLabel name 10 | RoleLabel = "llm-d.ai/role" 11 | // RolePrefill set for designated prefill workers 12 | RolePrefill = "prefill" 13 | // RoleDecode set for designated decode workers 14 | RoleDecode = "decode" 15 | // RoleBoth set for workers that can act as both prefill and decode 16 | RoleBoth = "both" 17 | ) 18 | 19 | // PrefillFilter - filters out pods that are not marked with role Prefill 20 | type PrefillFilter struct{} 21 | 22 | var _ plugins.Filter = &PrefillFilter{} // validate interface conformance 23 | 24 | // Name returns the name of the filter 25 | func (pf *PrefillFilter) Name() string { 26 | return "prefill-filter" 27 | } 28 | 29 | // Filter filters out all pods that are not marked as "prefill" 30 | func (pf *PrefillFilter) Filter(_ *types.SchedulingContext, pods []types.Pod) []types.Pod { 31 | filteredPods := []types.Pod{} 32 | 33 | for _, pod := range pods { 34 | role := pod.GetPod().Labels[RoleLabel] 35 | if role == RolePrefill { // TODO: doesn't RoleBoth also imply Prefill? 
36 | filteredPods = append(filteredPods, pod) 37 | } 38 | } 39 | return filteredPods 40 | } 41 | 42 | // DecodeFilter - filters out pods that are not marked with role Decode or Both 43 | type DecodeFilter struct{} 44 | 45 | var _ plugins.Filter = &DecodeFilter{} // validate interface conformance 46 | 47 | // Name returns the name of the filter 48 | func (df *DecodeFilter) Name() string { 49 | return "decode-filter" 50 | } 51 | 52 | // Filter removes all pods that are not marked as "decode" or "both" 53 | func (df *DecodeFilter) Filter(_ *types.SchedulingContext, pods []types.Pod) []types.Pod { 54 | filteredPods := []types.Pod{} 55 | 56 | for _, pod := range pods { 57 | role, defined := pod.GetPod().Labels[RoleLabel] 58 | if !defined || role == RoleDecode || role == RoleBoth { 59 | filteredPods = append(filteredPods, pod) 60 | } 61 | } 62 | return filteredPods 63 | } 64 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/filter/random.go: -------------------------------------------------------------------------------- 1 | // Package filter provides filter plugins for the epp. 2 | package filter 3 | 4 | import ( 5 | "fmt" 6 | "math/rand/v2" 7 | 8 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 9 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 10 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 11 | ) 12 | 13 | // Random drop filter type 14 | type Random struct { 15 | probability float64 16 | } 17 | 18 | var _ plugins.Filter = &Random{} 19 | 20 | // Name returns the filter name 21 | func (r *Random) Name() string { 22 | return "random-drop-filter" 23 | } 24 | 25 | // Filter defines the filtering function. 
In this case it is a passthrough 26 | func (r *Random) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { 27 | ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("Random filter called with %d candidates: %+v", 28 | len(pods), pods)) 29 | filtered := []types.Pod{} 30 | 31 | for _, p := range pods { 32 | if rand.Float64() >= r.probability { 33 | filtered = append(filtered, p) 34 | } else { 35 | ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("%v dropped", p)) 36 | } 37 | } 38 | 39 | return filtered 40 | } 41 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/doc.go: -------------------------------------------------------------------------------- 1 | // Package scorer provides scorer plugins for the scheduler. 2 | package scorer 3 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/kvcache-aware.go: -------------------------------------------------------------------------------- 1 | package scorer 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | 8 | kvcache "github.com/llm-d/llm-d-kv-cache-manager/pkg/kv-cache" 9 | 10 | "sigs.k8s.io/controller-runtime/pkg/log" 11 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 12 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 13 | 14 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 15 | ) 16 | 17 | const ( 18 | kvCacheAwareScorerName = "kvcache-aware-scorer" 19 | 20 | kvCacheRedisEnvVar = "KVCACHE_INDEXER_REDIS_ADDR" 21 | huggingFaceTokenEnvVar = "HF_TOKEN" 22 | ) 23 | 24 | // KVCacheAwareScorer uses the KVCacheIndexer to score pods based on KVCache 25 | // awareness. 26 | type KVCacheAwareScorer struct { 27 | kvCacheIndexer *kvcache.Indexer 28 | } 29 | 30 | // NewKVCacheAwareScorer creates a new KVCacheAwareScorer instance. 31 | // It initializes the KVCacheIndexer from environment variables. 
32 | // 33 | // If the environment variables are not set, or if the indexer 34 | // fails to initialize, an error is returned. 35 | func NewKVCacheAwareScorer(ctx context.Context) (plugins.Scorer, error) { 36 | config := kvcache.NewDefaultConfig() 37 | 38 | redisAddr := os.Getenv(kvCacheRedisEnvVar) 39 | if redisAddr != "" { 40 | config.KVBlockIndexerConfig.RedisAddr = redisAddr 41 | } else { 42 | return nil, fmt.Errorf("environment variable %s is not set", kvCacheRedisEnvVar) 43 | } 44 | 45 | hfToken := os.Getenv(huggingFaceTokenEnvVar) 46 | if hfToken != "" { 47 | config.TokenizersPoolConfig.HuggingFaceToken = hfToken 48 | } else { 49 | return nil, fmt.Errorf("environment variable %s is not set", huggingFaceTokenEnvVar) 50 | } 51 | 52 | kvCacheIndexer, err := kvcache.NewKVCacheIndexer(config) 53 | if err != nil { 54 | return nil, fmt.Errorf("failed to create KVCacheIndexer: %w", err) 55 | } 56 | 57 | go kvCacheIndexer.Run(ctx) 58 | 59 | return &KVCacheAwareScorer{ 60 | kvCacheIndexer: kvCacheIndexer, 61 | }, nil 62 | } 63 | 64 | // Name returns the name of the scorer. 65 | func (s *KVCacheAwareScorer) Name() string { 66 | return kvCacheAwareScorerName 67 | } 68 | 69 | // Score scores the provided pod based on the KVCache index state. 70 | // The returned scores are normalized to a range of 0-1. 
71 | func (s *KVCacheAwareScorer) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { 72 | loggerDebug := log.FromContext(ctx).WithName(kvCacheAwareScorerName).V(logutil.DEBUG) 73 | if ctx.Req == nil { 74 | loggerDebug.Info("Request is nil, skipping scoring") 75 | return nil 76 | } 77 | 78 | scores, err := s.kvCacheIndexer.GetPodScores(ctx.Context, ctx.Req.Prompt, ctx.Req.TargetModel, nil) 79 | if err != nil { 80 | loggerDebug.Error(err, "Failed to get pod scores") 81 | return nil 82 | } 83 | loggerDebug.Info("Got pod scores", "scores", scores) 84 | 85 | podToKey := func(pod types.Pod) (string, bool) { 86 | metricsPod := pod.GetPod() 87 | if metricsPod == nil { 88 | return "", false 89 | } 90 | 91 | return metricsPod.Address, true 92 | } 93 | 94 | return indexedScoresToNormalizedScoredPods(pods, podToKey, scores) 95 | } 96 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/load_aware_scorer.go: -------------------------------------------------------------------------------- 1 | package scorer 2 | 3 | import ( 4 | "context" 5 | 6 | "sigs.k8s.io/controller-runtime/pkg/log" 7 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 8 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 9 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" 10 | ) 11 | 12 | const ( 13 | queueThresholdEnvName = "LOAD_AWARE_SCORER_QUEUE_THRESHOLD" 14 | queueThresholdDefault = 128 15 | ) 16 | 17 | // LoadAwareScorer scorer that is based on load 18 | type LoadAwareScorer struct { 19 | queueThreshold float64 20 | } 21 | 22 | var _ plugins.Scorer = &LoadAwareScorer{} // validate interface conformance 23 | 24 | // NewLoadAwareScorer creates a new load based scorer 25 | func NewLoadAwareScorer(ctx context.Context) plugins.Scorer { 26 | return &LoadAwareScorer{ 27 | queueThreshold: float64(env.GetEnvInt(queueThresholdEnvName, queueThresholdDefault, 
log.FromContext(ctx))), 28 | } 29 | } 30 | 31 | // Name returns the scorer's name 32 | func (s *LoadAwareScorer) Name() string { 33 | return "load-aware-scorer" 34 | } 35 | 36 | // Score scores the given pod in range of 0-1 37 | // Currently metrics contains number of requests waiting in the queue, there is no information about number of requests 38 | // that can be processed in the given pod immediately. 39 | // Pod with empty waiting requests queue is scored with 0.5 40 | // Pod with requests in the queue will get score between 0.5 and 0. 41 | // Score 0 will get pod with number of requests in the queue equal to the threshold used in load-based filter (QueueingThresholdLoRA) 42 | // In future pods with additional capacity will get score higher than 0.5 43 | func (s *LoadAwareScorer) Score(_ *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { 44 | scoredPods := make(map[types.Pod]float64) 45 | 46 | for _, pod := range pods { 47 | waitingRequests := float64(pod.GetMetrics().WaitingQueueSize) 48 | 49 | if waitingRequests == 0 { 50 | scoredPods[pod] = 0.5 51 | } else { 52 | if waitingRequests > s.queueThreshold { 53 | waitingRequests = s.queueThreshold 54 | } 55 | scoredPods[pod] = 0.5 * (1.0 - (waitingRequests / s.queueThreshold)) 56 | } 57 | } 58 | return scoredPods 59 | } 60 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/load_aware_scorer_test.go: -------------------------------------------------------------------------------- 1 | package scorer_test 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | 7 | "github.com/google/go-cmp/cmp" 8 | k8stypes "k8s.io/apimachinery/pkg/types" 9 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" 10 | backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" // Import config for thresholds 11 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" 12 | 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 13 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" 14 | giescorer "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/scorer" 15 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 16 | 17 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/scorer" 18 | ) 19 | 20 | func TestLoadBasedScorer(t *testing.T) { 21 | ctx := context.Background() 22 | tests := []struct { 23 | name string 24 | scorer plugins.Scorer 25 | req *types.LLMRequest 26 | input []*backendmetrics.FakePodMetrics 27 | wantRes *types.Result 28 | err bool 29 | }{ 30 | { 31 | name: "load based scorer", 32 | scorer: scorer.NewLoadAwareScorer(ctx), 33 | req: &types.LLMRequest{ 34 | TargetModel: "critical", 35 | Critical: true, 36 | }, 37 | // pod2 will be picked because it has the shortest queue 38 | input: []*backendmetrics.FakePodMetrics{ 39 | { 40 | Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, 41 | Metrics: &backendmetrics.MetricsState{ 42 | WaitingQueueSize: 2, 43 | KVCacheUsagePercent: 0.2, 44 | MaxActiveModels: 2, 45 | ActiveModels: map[string]int{ 46 | "foo": 1, 47 | "bar": 1, 48 | }, 49 | }, 50 | }, 51 | { 52 | Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, 53 | Metrics: &backendmetrics.MetricsState{ 54 | WaitingQueueSize: 0, 55 | KVCacheUsagePercent: 0.2, 56 | MaxActiveModels: 2, 57 | ActiveModels: map[string]int{ 58 | "foo": 1, 59 | "bar": 1, 60 | }, 61 | }, 62 | }, 63 | { 64 | Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}, 65 | Metrics: &backendmetrics.MetricsState{ 66 | WaitingQueueSize: 5, 67 | KVCacheUsagePercent: 0.2, 68 | MaxActiveModels: 2, 69 | ActiveModels: map[string]int{ 70 | "foo": 1, 71 | "bar": 1, 72 | }, 73 | }, 74 | }, 75 | }, 76 | wantRes: &types.Result{ 77 | TargetPod: &types.ScoredPod{ 78 | Pod: &types.PodMetrics{ 79 | Pod: 
&backend.Pod{ 80 | NamespacedName: k8stypes.NamespacedName{Name: "pod2"}, 81 | Labels: map[string]string{}, 82 | }, 83 | MetricsState: &backendmetrics.MetricsState{ 84 | WaitingQueueSize: 0, 85 | KVCacheUsagePercent: 0.2, 86 | MaxActiveModels: 2, 87 | ActiveModels: map[string]int{ 88 | "foo": 1, 89 | "bar": 1, 90 | }, 91 | WaitingModels: map[string]int{}, 92 | }, 93 | }, 94 | Score: 0.5, 95 | }, 96 | }, 97 | }, 98 | } 99 | 100 | for _, test := range tests { 101 | t.Run(test.name, func(t *testing.T) { 102 | datastore := &fakeDataStore{pods: test.input} 103 | 104 | scheduler := scheduling.NewSchedulerWithConfig(datastore, scheduling.NewSchedulerConfig(). 105 | WithScorers(giescorer.NewWeightedScorer(test.scorer, 1)). 106 | WithPicker(picker.NewMaxScorePicker())) 107 | 108 | got, err := scheduler.Schedule(context.Background(), test.req) 109 | if test.err != (err != nil) { 110 | t.Errorf("Unexpected error, got %v, want %v", err, test.err) 111 | } 112 | 113 | opt := cmp.AllowUnexported(types.PodMetrics{}) 114 | if diff := cmp.Diff(test.wantRes, got, opt); diff != "" { 115 | t.Errorf("Unexpected output (-want +got): %v", diff) 116 | } 117 | }) 118 | } 119 | } 120 | 121 | // TODO: this is probably better in upstream (e.g., epp/scheduling or epp/scheduling/plugins) 122 | type fakeDataStore struct { 123 | pods []*backendmetrics.FakePodMetrics 124 | } 125 | 126 | // PodGetAll returns all pods in the store 127 | func (fds *fakeDataStore) PodGetAll() []backendmetrics.PodMetrics { 128 | pm := make([]backendmetrics.PodMetrics, 0, len(fds.pods)) 129 | for _, pod := range fds.pods { 130 | pm = append(pm, pod) 131 | } 132 | return pm 133 | } 134 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/passthrough.go: -------------------------------------------------------------------------------- 1 | // Package scorer provides scorer plugins for the scheduler. 
2 | package scorer 3 | 4 | import ( 5 | "fmt" 6 | 7 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 8 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 9 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 10 | ) 11 | 12 | // Passthrough is an example scorer which processes the pods, but does not 13 | // give them any score. 14 | type Passthrough struct{} 15 | 16 | var _ plugins.Scorer = &Passthrough{} 17 | 18 | // Name provides the textual identifier for this scorer. 19 | func (p *Passthrough) Name() string { 20 | return "passthrough-scorer" 21 | } 22 | 23 | // Score accepts a list of []types.Pod and processes them for scoring. 24 | func (p *Passthrough) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { 25 | ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("Scoring pods passthrough was initialized %d candidates: %+v", len(pods), pods)) 26 | 27 | scoredPods := make(map[types.Pod]float64, len(pods)) 28 | for _, pod := range pods { 29 | scoredPods[pod] = 0.0 30 | } 31 | 32 | return scoredPods 33 | } 34 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/prefix_aware.go: -------------------------------------------------------------------------------- 1 | package scorer 2 | 3 | import ( 4 | "context" 5 | "sync" 6 | "time" 7 | 8 | "sigs.k8s.io/controller-runtime/pkg/log" 9 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 10 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 11 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 12 | ) 13 | 14 | const ( 15 | prefixAwareScorerName = "prefix-aware-scorer" 16 | prefixAwareKeepAliveTime = 60 * time.Minute // How long should an idle session be kept alive 17 | prefixAwareKeepAliveCheckFrequency = 15 * time.Minute // How often to check for overly idle sessions 18 | ) 19 | 20 | type promptHits 
struct { 21 | lastUpdate time.Time 22 | // hits map from string to int 23 | hits sync.Map 24 | } 25 | 26 | // PrefixAwareScorer is a routing scorer that scores pods based on the longest prefix match 27 | // between the request's prompt and stored prefixes. The score is normalized between 0 and 1, 28 | // where 1 represents the longest matching prefix. 29 | type PrefixAwareScorer struct { 30 | prefixStore *PrefixStore 31 | 32 | // podToPromptHits map from podID(string) to promptHits 33 | podToPromptHits sync.Map 34 | } 35 | 36 | var _ plugins.Scorer = &PrefixAwareScorer{} // validate interface conformance 37 | 38 | // NewPrefixAwareScorer creates a new PrefixAwareScorer with the given 39 | // PrefixStoreConfig. If the config is nil, default is used. 40 | func NewPrefixAwareScorer(ctx context.Context, config *PrefixStoreConfig) *PrefixAwareScorer { 41 | if config == nil { 42 | config = DefaultPrefixStoreConfig() 43 | } 44 | 45 | scorer := &PrefixAwareScorer{ 46 | prefixStore: NewPrefixStore(config), 47 | podToPromptHits: sync.Map{}, 48 | } 49 | 50 | go scorer.cleanup(ctx, prefixAwareKeepAliveCheckFrequency, prefixAwareKeepAliveTime) 51 | 52 | return scorer 53 | } 54 | 55 | // Name returns the scorer's name 56 | func (s *PrefixAwareScorer) Name() string { 57 | return "prefix-aware-scorer" 58 | } 59 | 60 | // Score scores the target pods based on the longest prefix match. 
61 | func (s *PrefixAwareScorer) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { 62 | loggerDebug := log.FromContext(ctx).WithName(prefixAwareScorerName).V(logutil.DEBUG) 63 | if ctx.Req == nil { 64 | loggerDebug.Info("Request is nil, skipping scoring") 65 | return nil 66 | } 67 | 68 | scores := s.prefixStore.FindMatchingPods(ctx.Req.Prompt, ctx.Req.TargetModel) 69 | loggerDebug.Info("Got pod scores", "scores", scores) 70 | 71 | if len(scores) == 0 { 72 | loggerDebug.Info("No scores found for pods") 73 | return nil 74 | } 75 | 76 | for pod, score := range scores { 77 | if pod == "" { 78 | continue 79 | } 80 | 81 | rawPromptHitsInfo, _ := s.podToPromptHits.LoadOrStore(pod, &promptHits{lastUpdate: time.Now()}) 82 | if promptHitsInfo, ok := rawPromptHitsInfo.(*promptHits); ok { 83 | promptHitsInfo.lastUpdate = time.Now() 84 | promptHitsInfo.hits.Store(ctx.Req.Prompt, score) 85 | } 86 | } 87 | 88 | podToKey := func(pod types.Pod) (string, bool) { 89 | if pod.GetPod() == nil { 90 | return "", false 91 | } 92 | 93 | return pod.GetPod().NamespacedName.String(), true 94 | } 95 | 96 | return indexedScoresToNormalizedScoredPods(pods, podToKey, scores) 97 | } 98 | 99 | // PostSchedule implements the PostSchedulePlugin interface. 100 | // It adds the prefix to the PrefixStore for the given pod. 101 | // TODO: switch to PostResponse. 
102 | func (s *PrefixAwareScorer) PostSchedule(ctx *types.SchedulingContext, res *types.Result) { 103 | pod := res.TargetPod 104 | 105 | debugLogger := log.FromContext(ctx).WithName(prefixAwareScorerName) 106 | debugLogger.Info("PostResponse called", "req", ctx.Req, "pod", pod) 107 | 108 | if ctx.Req == nil { 109 | debugLogger.Info("Request is nil, skipping PostResponse") 110 | return 111 | } 112 | 113 | if pod.GetPod() == nil { 114 | debugLogger.Info("Pod is nil, skipping PostResponse", "req", ctx.Req, "pod", pod) 115 | return 116 | } 117 | 118 | if err := s.prefixStore.AddEntry(ctx.Req.TargetModel, ctx.Req.Prompt, &pod.GetPod().NamespacedName); err != nil { 119 | debugLogger.Error(err, "Failed to add entry to prefix store", "req", ctx.Req, "pod", pod) 120 | return 121 | } 122 | } 123 | 124 | // GetPrefixStore returns the scorer's PrefixStore. 125 | func (s *PrefixAwareScorer) GetPrefixStore() *PrefixStore { 126 | return s.prefixStore 127 | } 128 | 129 | // GetCachedPercentage returns the percentage of the prompt that is cached for the given pod. 
130 | func (s *PrefixAwareScorer) GetCachedPercentage(pod, prompt string) float64 { 131 | rawHitsForPod, ok := s.podToPromptHits.Load(pod) 132 | if !ok { 133 | return 0.0 134 | } 135 | 136 | hitsForPod, ok := rawHitsForPod.(*promptHits) 137 | if !ok { 138 | return 0.0 139 | } 140 | 141 | rawVal, ok := hitsForPod.hits.Load(prompt) 142 | if !ok { 143 | return 0.0 144 | } 145 | 146 | intVal, _ := rawVal.(int) 147 | return float64(intVal*s.prefixStore.blockSize) / float64(len(prompt)) 148 | } 149 | 150 | // cleanup Cleans up hits map 151 | func (s *PrefixAwareScorer) cleanup(ctx context.Context, keepAliveCheckFrequency time.Duration, keepAliveDuration time.Duration) { 152 | logger := log.FromContext(ctx) 153 | 154 | logger.Info("Prefix aware scorer cleanup started") 155 | ticker := time.NewTicker(keepAliveCheckFrequency) 156 | defer ticker.Stop() 157 | 158 | for { 159 | select { 160 | case <-ctx.Done(): 161 | logger.Info("Prefix aware scorer cleanup stopped:") 162 | return 163 | case now := <-ticker.C: 164 | logger.Info("Prefix aware scorer cleanup") 165 | s.podToPromptHits.Range( 166 | func(podID any, rawPromptHit any) bool { 167 | if promptHitInfo, ok := rawPromptHit.(*promptHits); ok { 168 | if now.Sub(promptHitInfo.lastUpdate) > keepAliveDuration { 169 | // info is stale, remove it 170 | s.podToPromptHits.Delete(podID) 171 | } 172 | } else { 173 | // Value is not of the correct type, remove it 174 | s.podToPromptHits.Delete(podID) 175 | } 176 | return true 177 | }) 178 | } 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/prefix_aware_test.go: -------------------------------------------------------------------------------- 1 | package scorer_test 2 | 3 | import ( 4 | "context" 5 | "math/rand" 6 | "strconv" 7 | "testing" 8 | "time" 9 | 10 | "github.com/go-logr/logr" 11 | k8stypes "k8s.io/apimachinery/pkg/types" 12 | "sigs.k8s.io/controller-runtime/pkg/log" 13 | 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" 14 | backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" 15 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 16 | 17 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/scorer" 18 | ) 19 | 20 | func TestPrefixAwareScorer(t *testing.T) { 21 | // Create test pods 22 | pod1 := &types.PodMetrics{ 23 | Pod: &backend.Pod{ 24 | NamespacedName: k8stypes.NamespacedName{ 25 | Name: "pod1", 26 | Namespace: "default", 27 | }, 28 | }, 29 | MetricsState: &backendmetrics.MetricsState{}, 30 | } 31 | pod2 := &types.PodMetrics{ 32 | Pod: &backend.Pod{ 33 | NamespacedName: k8stypes.NamespacedName{ 34 | Name: "pod2", 35 | Namespace: "default", 36 | }, 37 | }, 38 | MetricsState: &backendmetrics.MetricsState{}, 39 | } 40 | 41 | tests := []struct { 42 | name string 43 | weight float64 44 | prompt string 45 | modelName string 46 | prefixToAdd string 47 | podToAdd k8stypes.NamespacedName 48 | prefixModel string // Model name to use when adding the prefix 49 | expectedScores map[types.Pod]float64 50 | }{ 51 | { 52 | name: "no prompt", 53 | weight: 1.0, 54 | prompt: "", 55 | modelName: "model1", 56 | prefixToAdd: "hello", 57 | podToAdd: pod1.NamespacedName, 58 | prefixModel: "model1", 59 | expectedScores: map[types.Pod]float64{}, // No prompt means zero scores 60 | }, 61 | { 62 | name: "exact prefix match", 63 | weight: 1.0, 64 | prompt: "hello world", 65 | modelName: "model1", 66 | prefixToAdd: "hello", 67 | podToAdd: pod1.NamespacedName, 68 | prefixModel: "model1", 69 | expectedScores: map[types.Pod]float64{ 70 | pod1: 1.0, 71 | pod2: 0.0, 72 | }, // pod1 matches, pod2 doesn't 73 | }, 74 | { 75 | name: "no prefix match", 76 | weight: 1.0, 77 | prompt: "goodbye", 78 | modelName: "model1", 79 | prefixToAdd: "hello", 80 | podToAdd: pod1.NamespacedName, 81 | prefixModel: "model1", 82 | expectedScores: map[types.Pod]float64{}, // No matching prefix 83 | }, 84 | 
{ 85 | name: "different model name", 86 | weight: 1.0, 87 | prompt: "hello world", 88 | modelName: "model2", // Try to find with model2 89 | prefixToAdd: "hello", 90 | podToAdd: pod1.NamespacedName, 91 | prefixModel: "model1", // But prefix was added with model1 92 | expectedScores: map[types.Pod]float64{}, // Model name mismatch should result in no match 93 | }, 94 | { 95 | name: "custom weight", 96 | weight: 0.5, 97 | prompt: "hello world", 98 | modelName: "model1", 99 | prefixToAdd: "hello", 100 | podToAdd: pod1.NamespacedName, 101 | prefixModel: "model1", 102 | expectedScores: map[types.Pod]float64{ 103 | pod1: 1.0, // Pod1 matches with weight 104 | pod2: 0.0, // Pod2 doesn't match 105 | }, // Weight affects score 106 | }, 107 | } 108 | 109 | ctx := context.TODO() 110 | _ = log.IntoContext(ctx, logr.New(log.NullLogSink{})) 111 | 112 | for _, tt := range tests { 113 | t.Run(tt.name, func(t *testing.T) { 114 | // Reset prefix store for each test 115 | config := scorer.DefaultPrefixStoreConfig() 116 | config.BlockSize = 5 // set small chunking for testing 117 | 118 | s := scorer.NewPrefixAwareScorer(ctx, config) 119 | 120 | // Add prefix if specified 121 | if tt.prefixToAdd != "" { 122 | err := s.GetPrefixStore().AddEntry(tt.prefixModel, 123 | tt.prefixToAdd, &tt.podToAdd) 124 | if err != nil { 125 | t.Fatalf("Failed to add prefix: %v", err) 126 | } 127 | } 128 | 129 | // Create test context 130 | sCtx := types.NewSchedulingContext(ctx, &types.LLMRequest{ 131 | Prompt: tt.prompt, 132 | TargetModel: tt.modelName, 133 | }, nil, []types.Pod{}) 134 | 135 | // Score pods 136 | pods := []types.Pod{pod1, pod2} 137 | scores := s.Score(sCtx, pods) 138 | 139 | for p, score := range scores { 140 | if score != tt.expectedScores[p] { 141 | t.Errorf("Pod %v: expected score %v, got %v", p, tt.expectedScores[p], score) 142 | } 143 | } 144 | }) 145 | } 146 | } 147 | 148 | func TestPrefixAwareScorerProfiling(t *testing.T) { 149 | const testName = "profiling_test" 150 | const 
modelName = "test1" // store contains single cache for this model 151 | const nPodsTotal = 200 152 | const nPodsInStore = 100 // number of chunks stored for pod is proportional to the pod number 153 | 154 | ctx := context.Background() 155 | logger := log.FromContext(ctx) 156 | ctx = log.IntoContext(ctx, logger) 157 | 158 | name2Pod := createPods(nPodsTotal) 159 | config := scorer.DefaultPrefixStoreConfig() 160 | text := generateNonRepeatingText(config.BlockSize * nPodsInStore) 161 | t.Run(testName, func(t *testing.T) { 162 | start := time.Now() // record start time 163 | config := scorer.DefaultPrefixStoreConfig() 164 | s := scorer.NewPrefixAwareScorer(ctx, config) 165 | for i := range nPodsInStore { 166 | prompt := text[0 : (i+1)*config.BlockSize-1] 167 | err := s.GetPrefixStore().AddEntry(modelName, prompt, &name2Pod["pod"+strconv.Itoa(i)].NamespacedName) 168 | if err != nil { 169 | t.Errorf("Failed to add entry to prefix store: %v", err) 170 | } 171 | } 172 | sCtx := types.NewSchedulingContext(ctx, &types.LLMRequest{ 173 | Prompt: text, 174 | TargetModel: modelName, 175 | }, nil, []types.Pod{}) 176 | 177 | // Score pods 178 | pods := make([]types.Pod, 0, len(name2Pod)) 179 | for _, v := range name2Pod { 180 | pods = append(pods, v) 181 | } 182 | 183 | scores := s.Score(sCtx, pods) 184 | 185 | highestScore := scores[name2Pod["pod"+strconv.Itoa(nPodsInStore-1)]] 186 | if highestScore < 0.99 { 187 | t.Error("Failed to calculate scores") 188 | } 189 | 190 | // use 'elapsed' time when built-in profiler is not suitable because of short time periods 191 | elapsed := time.Since(start) // calculate duration 192 | t.Log("Time spent in microsec: " + strconv.FormatInt(elapsed.Microseconds(), 10)) 193 | }) 194 | 195 | } 196 | 197 | func createPods(nPods int) map[string]*types.PodMetrics { 198 | res := map[string]*types.PodMetrics{} 199 | for i := range nPods { 200 | pShortName := "pod" + strconv.Itoa(i) 201 | pod := &types.PodMetrics{ 202 | Pod: &backend.Pod{ 203 | 
NamespacedName: k8stypes.NamespacedName{ 204 | Name: pShortName, 205 | Namespace: "default", 206 | }, 207 | }, 208 | MetricsState: &backendmetrics.MetricsState{}, 209 | } 210 | res[pShortName] = pod 211 | } 212 | return res 213 | } 214 | 215 | func generateNonRepeatingText(length int) string { 216 | r := rand.New(rand.NewSource(time.Now().UnixNano())) 217 | chars := []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,!?;:-_[]{}()<>|@#$%^&*+=") 218 | 219 | result := make([]rune, length) 220 | for i := range result { 221 | result[i] = chars[r.Intn(len(chars))] 222 | } 223 | return string(result) 224 | } 225 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/prefix_store.go: -------------------------------------------------------------------------------- 1 | package scorer 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "sync" 7 | "time" 8 | 9 | "k8s.io/apimachinery/pkg/types" 10 | 11 | "github.com/cespare/xxhash/v2" 12 | lru "github.com/hashicorp/golang-lru/v2" 13 | ) 14 | 15 | const ( 16 | // defaultMaxCacheSize sets the maximum number of blocks the LRU cache can store. 17 | defaultMaxCacheSize = 500000 18 | // defaultBlockSize defines how many runes each block contains in the prefix cache. 19 | defaultBlockSize = 256 20 | // defaultMaxBlockCacheSize sets the maximum number of pods a block can store. 21 | defaultMaxBlockCacheSize = 100 22 | ) 23 | 24 | // PrefixStoreConfig contains initialization configuration for PrefixStore. 25 | type PrefixStoreConfig struct { 26 | // CacheSize sets the maximum number of blocks the LRU cache can store. 27 | CacheSize int 28 | // BlockSize defines how many runes each block contains in the prefix cache. 29 | BlockSize int 30 | // BlockCacheSize sets the maximum number of pods a block can store. 31 | BlockCacheSize int 32 | } 33 | 34 | // DefaultPrefixStoreConfig returns an PrefixStoreConfig instance with default 35 | // configuration. 
36 | func DefaultPrefixStoreConfig() *PrefixStoreConfig { 37 | return &PrefixStoreConfig{ 38 | CacheSize: defaultMaxCacheSize, 39 | BlockSize: defaultBlockSize, 40 | BlockCacheSize: defaultMaxBlockCacheSize, 41 | } 42 | } 43 | 44 | // block holds the tokens contained in the block. 45 | type block struct { 46 | Pods *lru.Cache[types.NamespacedName, time.Time] //TODO: implement Pod eviction based on staleness 47 | } 48 | 49 | // PrefixStore is an in-memory prefix-to-block cache with xxhash keys and LRU 50 | // eviction. 51 | type PrefixStore struct { 52 | sync.RWMutex 53 | 54 | cacheSize int 55 | blockSize int 56 | blockCacheSize int 57 | 58 | store map[string]*lru.Cache[uint64, *block] 59 | } 60 | 61 | // NewPrefixStore initializes the PrefixStore with LRU cache. 62 | // If the configuration is nil, default is used. 63 | func NewPrefixStore(config *PrefixStoreConfig) *PrefixStore { 64 | if config == nil { 65 | config = DefaultPrefixStoreConfig() 66 | } 67 | 68 | return &PrefixStore{ 69 | cacheSize: config.CacheSize, 70 | blockSize: config.BlockSize, 71 | blockCacheSize: config.BlockCacheSize, 72 | store: make(map[string]*lru.Cache[uint64, *block]), 73 | } 74 | } 75 | 76 | // AddEntry adds a new entry to the prefix store. 
77 | func (s *PrefixStore) AddEntry(modelName string, prompt string, pod *types.NamespacedName) error { 78 | if prompt == "" || pod == nil || len(prompt) < s.blockSize /* skip if prompt is too short */ { 79 | return nil 80 | } 81 | 82 | s.Lock() 83 | // Get or create the LRU cache for the model 84 | cache, ok := s.store[modelName] 85 | if !ok { 86 | var err error 87 | cache, err = lru.New[uint64, *block](s.cacheSize) 88 | if err != nil { 89 | return fmt.Errorf("failed to create LRU cache for model %s: %w", modelName, err) 90 | } 91 | 92 | s.store[modelName] = cache 93 | } 94 | s.Unlock() 95 | 96 | promptBytes := []byte(prompt) 97 | previousHash := uint64(0) 98 | digest := xxhash.New() 99 | 100 | // Chunk the text into blocks and populate the cache 101 | for start := 0; start < len(promptBytes); start += s.blockSize { 102 | end := start + s.blockSize 103 | if end > len(promptBytes) { 104 | break // skip partial blocks 105 | } 106 | 107 | // Compute the hash for the current block 108 | digest.Reset() 109 | if err := binary.Write(digest, binary.LittleEndian, previousHash); err != nil { 110 | return fmt.Errorf("failed to write previous hash: %w", err) 111 | } 112 | if _, err := digest.Write(promptBytes[start:end]); err != nil { 113 | return fmt.Errorf("failed to write prompt bytes: %w", err) 114 | } 115 | 116 | blockHash := digest.Sum64() 117 | previousHash = blockHash 118 | 119 | b, ok := cache.Get(blockHash) 120 | if !ok { 121 | pods, err := lru.New[types.NamespacedName, time.Time](s.blockCacheSize) 122 | if err != nil { 123 | return fmt.Errorf("failed to create LRU cache for block: %w", err) 124 | } 125 | 126 | b = &block{Pods: pods} 127 | cache.Add(blockHash, b) 128 | } 129 | 130 | b.Pods.Add(*pod, time.Now()) // thread-safe 131 | } 132 | 133 | return nil 134 | } 135 | 136 | // FindMatchingPods finds all pods that match the given prompt and model name. 137 | // It returns a map of pods and the number of blocks they match. 
138 | func (s *PrefixStore) FindMatchingPods(prompt, modelName string) map[string]int { 139 | if prompt == "" || modelName == "" || len(prompt) < s.blockSize /* skip if prompt is too short */ { 140 | return nil 141 | } 142 | 143 | s.RLock() 144 | cache, ok := s.store[modelName] // cache is thread-safe 145 | s.RUnlock() 146 | 147 | if !ok { 148 | return nil 149 | } 150 | 151 | promptBytes := []byte(prompt) 152 | previousHash := uint64(0) 153 | digest := xxhash.New() 154 | 155 | matchedPods := make(map[string]int) 156 | for start := 0; start < len(promptBytes); start += s.blockSize { 157 | end := start + s.blockSize 158 | if end > len(promptBytes) { 159 | break // skip partial blocks 160 | } 161 | 162 | digest.Reset() 163 | if err := binary.Write(digest, binary.LittleEndian, previousHash); err != nil { 164 | break 165 | } 166 | if _, err := digest.Write(promptBytes[start:end]); err != nil { 167 | break 168 | } 169 | 170 | blockHash := digest.Sum64() 171 | previousHash = blockHash 172 | 173 | b, ok := cache.Get(blockHash) 174 | if !ok { 175 | break // match consecutive blocks 176 | } 177 | 178 | for _, pod := range b.Pods.Keys() { 179 | matchedPods[pod.String()]++ 180 | } 181 | } 182 | 183 | return matchedPods 184 | } 185 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/prefix_store_test.go: -------------------------------------------------------------------------------- 1 | package scorer_test 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | 7 | "github.com/go-logr/logr" 8 | k8stypes "k8s.io/apimachinery/pkg/types" 9 | "sigs.k8s.io/controller-runtime/pkg/log" 10 | 11 | "github.com/llm-d/llm-d-inference-scheduler/pkg/scheduling/plugins/scorer" 12 | ) 13 | 14 | // TestBasicPrefixOperations tests the basic functionality of adding and finding prefixes 15 | func TestBasicPrefixOperations(t *testing.T) { 16 | ctx := context.TODO() 17 | _ = log.IntoContext(ctx, logr.New(log.NullLogSink{})) 18 | 19 | config := 
scorer.DefaultPrefixStoreConfig() 20 | config.BlockSize = 5 // set small chunking for testing 21 | store := scorer.NewPrefixStore(config) 22 | 23 | podName := k8stypes.NamespacedName{ 24 | Name: "pod1", 25 | Namespace: "default", 26 | } 27 | 28 | // Test adding a prefix 29 | err := store.AddEntry("model1", "hello", &podName) 30 | if err != nil { 31 | t.Errorf("Failed to add prefix: %v", err) 32 | } 33 | 34 | // Test finding the exact prefix 35 | scores := store.FindMatchingPods("hello", "model1") 36 | if _, ok := scores[podName.String()]; !ok { 37 | t.Errorf("Expected pod %v, scores %v", podName, scores) 38 | } 39 | 40 | // Test finding with a longer prefix 41 | scores = store.FindMatchingPods("hello world", "model1") 42 | if _, ok := scores[podName.String()]; !ok { 43 | t.Errorf("Expected pod %v, scores %v", podName, scores) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/random.go: -------------------------------------------------------------------------------- 1 | // Package scorer provides scorer plugins for the scheduler. 2 | package scorer 3 | 4 | import ( 5 | "fmt" 6 | "math/rand" 7 | 8 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 9 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 10 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 11 | ) 12 | 13 | // Random is an example scorer which processes the pods, giving each a random score. 14 | type Random struct{} 15 | 16 | var _ plugins.Scorer = &Random{} 17 | 18 | // Name provides the textual identifier for this scorer. 19 | func (r *Random) Name() string { 20 | return "random-scorer" 21 | } 22 | 23 | // Score accepts a list of []types.Pod and processes them for scoring. 
24 | func (r *Random) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { 25 | ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("Scoring pods randomly called with %d candidates: %+v", 26 | len(pods), pods)) 27 | 28 | scores := make(map[types.Pod]float64, len(pods)) 29 | for _, pod := range pods { 30 | scores[pod] = rand.Float64() 31 | } 32 | 33 | return scores 34 | } 35 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/session_affinity.go: -------------------------------------------------------------------------------- 1 | package scorer 2 | 3 | import ( 4 | "encoding/base64" 5 | "time" 6 | 7 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" 8 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 9 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 10 | ) 11 | 12 | const ( 13 | sessionKeepAliveTime = 60 * time.Minute // How long should an idle session be kept alive 14 | sessionKeepAliveCheckFrequency = 15 * time.Minute // How often to check for overly idle sessions 15 | sessionTokenHeader = "x-session-token" // name of the session header in request 16 | ) 17 | 18 | // SessionAffinity is a routing scorer that routes subsequent 19 | // requests in a session to the same pod as the first request in the 20 | // session was sent to, by giving that pod the specified weight and assigning 21 | // zero score to the rest of the targets 22 | type SessionAffinity struct { 23 | } 24 | 25 | var _ plugins.Scorer = &SessionAffinity{} // validate interface conformance 26 | var _ plugins.PostResponse = &SessionAffinity{} // validate interface conformance 27 | 28 | // NewSessionAffinity returns a scorer 29 | func NewSessionAffinity() *SessionAffinity { 30 | return &SessionAffinity{} 31 | } 32 | 33 | // Name returns the scorer's name 34 | func (s *SessionAffinity) Name() string { 35 | return "session-affinity-scorer" 36 | } 37 | 38 
| // Score assign a high score to the pod used in previous requests and zero to others 39 | func (s *SessionAffinity) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { 40 | scoredPods := make(map[types.Pod]float64) 41 | sessionToken := ctx.Req.Headers[sessionTokenHeader] 42 | podName := "" 43 | 44 | if sessionToken != "" { 45 | decodedBytes, err := base64.StdEncoding.DecodeString(sessionToken) 46 | if err != nil { 47 | ctx.Logger.Error(err, "Error decoding session header") 48 | } else { 49 | podName = string(decodedBytes) 50 | } 51 | } 52 | for _, pod := range pods { 53 | scoredPods[pod] = 0.0 // initial value 54 | if pod.GetPod().NamespacedName.String() == podName { 55 | scoredPods[pod] = 1.0 56 | } 57 | } 58 | 59 | return scoredPods 60 | } 61 | 62 | // PostResponse sets the session header on the response sent to the client 63 | // TODO: this should be using a cookie and ensure not overriding any other 64 | // cookie values if present. 65 | // Tracked in https://github.com/llm-d/llm-d-inference-scheduler/issues/28 66 | func (s *SessionAffinity) PostResponse(ctx *types.SchedulingContext, pod types.Pod) { 67 | if ctx.Resp == nil || pod == nil || pod.GetPod() == nil { 68 | reqID := "undefined" 69 | if ctx.Resp != nil { 70 | reqID = ctx.Resp.RequestId 71 | } 72 | ctx.Logger.V(logutil.DEBUG).Info("Session affinity scorer - skip post response because one of ctx.Resp, pod, pod.GetPod is nil", "req id", reqID) 73 | return 74 | } 75 | 76 | if ctx.Resp.Headers == nil { // TODO should always be populated? 
77 | ctx.Resp.Headers = make(map[string]string) 78 | } 79 | 80 | ctx.Resp.Headers[sessionTokenHeader] = base64.StdEncoding.EncodeToString([]byte(pod.GetPod().NamespacedName.String())) 81 | } 82 | -------------------------------------------------------------------------------- /pkg/scheduling/plugins/scorer/utils.go: -------------------------------------------------------------------------------- 1 | package scorer 2 | 3 | import "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 4 | 5 | // podToKey is a function type that converts a Pod to a string key. 6 | // It returns the key and a boolean indicating success. 7 | type podToKeyFunc func(pod types.Pod) (string, bool) 8 | 9 | // indexedScoresToNormalizedScoredPods converts a map of pod scores to a map of 10 | // normalized scores. The function takes a list of pods, a function to convert 11 | // a pod to a key, and a map of scores indexed by those keys. It returns a map 12 | // of pods to their normalized scores. 13 | func indexedScoresToNormalizedScoredPods(pods []types.Pod, podToKey podToKeyFunc, 14 | scores map[string]int) map[types.Pod]float64 { 15 | scoredPods := make(map[types.Pod]float64) 16 | minScore, maxScore := getMinMax(scores) 17 | 18 | for _, pod := range pods { 19 | key, ok := podToKey(pod) 20 | if !ok { 21 | continue 22 | } 23 | 24 | if score, ok := scores[key]; ok { 25 | if minScore == maxScore { 26 | scoredPods[pod] = 1.0 27 | continue 28 | } 29 | 30 | scoredPods[pod] = float64(score-minScore) / float64(maxScore-minScore) 31 | } else { 32 | scoredPods[pod] = 0.0 33 | } 34 | } 35 | 36 | return scoredPods 37 | } 38 | 39 | func getMinMax(scores map[string]int) (int, int) { 40 | minScore := int(^uint(0) >> 1) // max int 41 | maxScore := -1 42 | 43 | for _, score := range scores { 44 | if score < minScore { 45 | minScore = score 46 | } 47 | if score > maxScore { 48 | maxScore = score 49 | } 50 | } 51 | 52 | return minScore, maxScore 53 | } 54 | 
-------------------------------------------------------------------------------- /scripts/istio/generate-cp.sh: --------------------------------------------------------------------------------
#!/bin/bash

# Generates the Istio control-plane manifests and CRDs for the repo's
# kustomize components.
#
# Prerequisites:
#   python3 & pip install ruamel.yaml
#   istioctl https://gcsweb.istio.io/gcs/istio-build/dev/1.26-alpha.9befed2f1439d883120f8de70fd70d84ca0ebc3d alpha pre release

# FIX: fail fast on errors/unset vars and propagate pipeline failures so a
# broken istioctl run can't silently produce partial output.
set -euo pipefail

GATEWAY_NAMESPACE=llm-d-istio-system

CRD_DIR=deploy/components/crds-istio/
CP_DIR=deploy/components/istio-control-plane/
ISTIO_CP="$(dirname "$0")/istio-cp.yaml"

# FIX: quote all expansions so paths with spaces don't word-split (SC2086).
istioctl manifest generate --dry-run --set "values.global.istioNamespace=${GATEWAY_NAMESPACE}" -f "${ISTIO_CP}" | scripts/istio/manifest-splitter.py -o "${CP_DIR}"
mv "${CP_DIR}/crds.yaml" "${CRD_DIR}/istio.yaml"
-------------------------------------------------------------------------------- /scripts/istio/istio-cp.yaml: --------------------------------------------------------------------------------
apiVersion: install.istio.io/v1alpha1
kind: IstioOperator
spec:
  profile: default
  revision: llm-d-gateway
  components:
    pilot:
      k8s:
        resources:
          requests:
            memory: 1024Mi
    ingressGateways:
    - name: istio-ingressgateway
      enabled: false
-------------------------------------------------------------------------------- /scripts/istio/manifest-splitter.py: --------------------------------------------------------------------------------
#!/usr/bin/env python3

import sys
import os
import argparse  # Added for command-line arguments
from collections import defaultdict
from ruamel.yaml import YAML
from ruamel.yaml.constructor import DuplicateKeyError
from ruamel.yaml.error import YAMLError as RuamelYAMLError


# Define the mapping from Kubernetes Kind to output filename
# This can be customized as needed.
14 | KIND_TO_FILENAME_MAP = { 15 | "ConfigMap": "configmaps.yaml", 16 | "Deployment": "deployments.yaml", 17 | "HorizontalPodAutoscaler": "hpa.yaml", 18 | "Namespace": "namespaces.yaml", 19 | "ServiceAccount": "service-accounts.yaml", 20 | "Service": "services.yaml", 21 | "Telemetry": "telemetry.yaml", # Istio specific 22 | # RBAC Components 23 | "Role": "rbac.yaml", 24 | "ClusterRole": "rbac.yaml", 25 | "RoleBinding": "rbac.yaml", 26 | "ClusterRoleBinding": "rbac.yaml", 27 | # Webhook Configurations 28 | "MutatingWebhookConfiguration": "webhooks.yaml", 29 | "ValidatingWebhookConfiguration": "webhooks.yaml", 30 | # Istio "Policy-like" CRDs and Networking 31 | "AuthorizationPolicy": "policies.yaml", 32 | "PeerAuthentication": "policies.yaml", 33 | "RequestAuthentication": "policies.yaml", 34 | "Sidecar": "policies.yaml", 35 | "EnvoyFilter": "policies.yaml", 36 | "WasmPlugin": "policies.yaml", 37 | "Gateway": "policies.yaml", # Istio Gateway 38 | "VirtualService": "policies.yaml", 39 | "DestinationRule": "policies.yaml", 40 | "ServiceEntry": "policies.yaml", 41 | "WorkloadEntry": "policies.yaml", 42 | "WorkloadGroup": "policies.yaml", 43 | "PodDisruptionBudget": "policies.yaml", 44 | "Telemetry": "telemetry.yaml", 45 | "IstioOperator": "istiooperators.yaml", # Often part of istioctl output 46 | "CustomResourceDefinition": "crds.yaml", # For CRDs themselves 47 | # Add more kinds as needed 48 | } 49 | 50 | # Files requested by the user (for kustomization.yaml) 51 | REQUESTED_FILES_FOR_KUSTOMIZATION = [ 52 | "configmaps.yaml", 53 | "deployments.yaml", 54 | "hpa.yaml", 55 | "namespaces.yaml", 56 | "policies.yaml", 57 | "rbac.yaml", 58 | "service-accounts.yaml", 59 | "services.yaml", 60 | "telemetry.yaml", 61 | "webhooks.yaml", 62 | # Potentially useful additions if they appear 63 | #"crds.yaml", 64 | #"istiooperators.yaml", 65 | "others.yaml" # Catch-all for unmapped kinds 66 | ] 67 | 68 | def main(): 69 | parser = argparse.ArgumentParser( 70 | description="Split Istio 
manifests from stdin into categorized files in an output directory." 71 | ) 72 | parser.add_argument( 73 | "-o", "--output-dir", 74 | default="istio_manifests_output", # Default output directory name 75 | help="The directory where YAML files will be saved (default: istio_manifests_output)" 76 | ) 77 | args = parser.parse_args() 78 | output_dir = args.output_dir 79 | 80 | # Create the output directory if it doesn't exist 81 | try: 82 | os.makedirs(output_dir, exist_ok=True) 83 | print(f"Output directory: {os.path.abspath(output_dir)}") 84 | except OSError as e: 85 | print(f"Error: Could not create output directory '{output_dir}': {e}", file=sys.stderr) 86 | sys.exit(1) 87 | 88 | # Initialize ruamel.yaml instance for round-trip preservation 89 | yaml = YAML() 90 | yaml.preserve_quotes = True 91 | # yaml.indent(mapping=2, sequence=4, offset=2) # Optional: to enforce specific indent 92 | 93 | # Dictionary to store YAML content for each file 94 | output_files_content = defaultdict(list) 95 | # Set to keep track of which files actually get content (just filenames) 96 | created_filenames = set() 97 | 98 | try: 99 | yaml_documents = list(yaml.load_all(sys.stdin)) 100 | except RuamelYAMLError as e: 101 | print(f"Error parsing YAML input: {e}", file=sys.stderr) 102 | if hasattr(e, 'problem_mark') and e.problem_mark: 103 | print(f"Error found near line {e.problem_mark.line + 1}, column {e.problem_mark.column + 1}", file=sys.stderr) 104 | sys.exit(1) 105 | except Exception as e: 106 | print(f"An unexpected error occurred while reading/parsing stdin: {e}", file=sys.stderr) 107 | sys.exit(1) 108 | 109 | if not yaml_documents: 110 | print("No YAML input received from stdin.", file=sys.stderr) 111 | return 112 | 113 | for doc in yaml_documents: 114 | if doc is None: 115 | continue 116 | kind = doc.get("kind") 117 | filename = KIND_TO_FILENAME_MAP.get(kind, "others.yaml") # Just the filename, not path 118 | output_files_content[filename].append(doc) 119 | 
created_filenames.add(filename) 120 | 121 | # Write the collected YAML documents to their respective files in the output directory 122 | for filename, docs in output_files_content.items(): 123 | if not docs: 124 | continue 125 | 126 | output_filepath = os.path.join(output_dir, filename) 127 | try: 128 | with open(output_filepath, "w") as f: 129 | yaml.dump_all(docs, f) 130 | print(f"Written {len(docs)} resource(s) to {output_filepath}") 131 | except IOError as e: 132 | print(f"Error writing to file {output_filepath}: {e}", file=sys.stderr) 133 | except RuamelYAMLError as e: 134 | print(f"Error serializing YAML for {output_filepath}: {e}", file=sys.stderr) 135 | except Exception as e: 136 | print(f"An unexpected error occurred while writing {output_filepath}: {e}", file=sys.stderr) 137 | 138 | # Generate kustomization.yaml in the output directory 139 | kustomization_yaml = YAML() 140 | kustomization_yaml.indent(mapping=2, sequence=2, offset=0) # Common kustomize style 141 | 142 | kustomization_content = {"apiVersion": "kustomize.config.k8s.io/v1beta1", "kind": "Kustomization"} 143 | 144 | # Resources in kustomization.yaml are relative to kustomization.yaml itself 145 | kustomization_resources = sorted([ 146 | fname for fname in created_filenames 147 | if fname != "kustomization.yaml" and fname in REQUESTED_FILES_FOR_KUSTOMIZATION 148 | ]) 149 | 150 | if not kustomization_resources: 151 | print(f"No resources found to include in kustomization.yaml within {output_dir}.", file=sys.stderr) 152 | else: 153 | kustomization_content["resources"] = kustomization_resources 154 | kustomization_filepath = os.path.join(output_dir, "kustomization.yaml") 155 | try: 156 | with open(kustomization_filepath, "w") as f: 157 | kustomization_yaml.dump(kustomization_content, f) 158 | print(f"Written {kustomization_filepath}") 159 | except IOError as e: 160 | print(f"Error writing to {kustomization_filepath}: {e}", file=sys.stderr) 161 | except RuamelYAMLError as e: 162 | print(f"Error 
serializing YAML for {kustomization_filepath}: {e}", file=sys.stderr) 163 | except Exception as e: 164 | print(f"An unexpected error occurred while writing {kustomization_filepath}: {e}", file=sys.stderr) 165 | 166 | if __name__ == "__main__": 167 | main() -------------------------------------------------------------------------------- /scripts/kind-dev-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This shell script deploys a kind cluster with an Istio-based Gateway API 4 | # implementation fully configured. It deploys the vllm simulator, which it 5 | # exposes with a Gateway -> HTTPRoute -> InferencePool. The Gateway is 6 | # configured with the a filter for the ext_proc endpoint picker. 7 | 8 | set -eo pipefail 9 | 10 | # ------------------------------------------------------------------------------ 11 | # Variables 12 | # ------------------------------------------------------------------------------ 13 | 14 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 15 | 16 | # Set a default CLUSTER_NAME if not provided 17 | : "${CLUSTER_NAME:=llm-d-inference-scheduler-dev}" 18 | 19 | # Set the host port to map to the Gateway's inbound port (30080) 20 | : "${GATEWAY_HOST_PORT:=30080}" 21 | 22 | # Set the default IMAGE_REGISTRY if not provided 23 | : "${IMAGE_REGISTRY:=ghcr.io/llm-d}" 24 | 25 | # Set a default VLLM_SIMULATOR_IMAGE if not provided 26 | : "${VLLM_SIMULATOR_IMAGE:=llm-d-inference-sim}" 27 | 28 | # Set a default VLLM_SIMULATOR_TAG if not provided 29 | export VLLM_SIMULATOR_TAG="${VLLM_SIMULATOR_TAG:-dev}" 30 | 31 | # Set a default EPP_IMAGE if not provided 32 | : "${EPP_IMAGE:=llm-d-inference-scheduler}" 33 | 34 | # Set a default EPP_TAG if not provided 35 | export EPP_TAG="${EPP_TAG:-dev}" 36 | 37 | # Set the default routing side car image tag 38 | export ROUTING_SIDECAR_TAG="${ROUTING_SIDECAR_TAG:-0.0.6}" 39 | 40 | # Set the inference pool name for the deployment 41 | export 
POOL_NAME="${POOL_NAME:-vllm-llama3-8b-instruct}" 42 | 43 | # Set the model name to deploy 44 | export MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" 45 | 46 | # vLLM replica count (without PD) 47 | export VLLM_REPLICA_COUNT="${VLLM_REPLICA_COUNT:-1}" 48 | 49 | # By default we are not setting up for PD 50 | export PD_ENABLED="\"${PD_ENABLED:-false}\"" 51 | 52 | # By default the PD threshhold is ten tokens 53 | export PD_PROMPT_LEN_THRESHOLD="\"${PD_PROMPT_LEN_THRESHOLD:-10}\"" 54 | 55 | # Replica counts for P and D 56 | export VLLM_REPLICA_COUNT_P="${VLLM_REPLICA_COUNT_P:-1}" 57 | export VLLM_REPLICA_COUNT_D="${VLLM_REPLICA_COUNT_D:-2}" 58 | 59 | # ------------------------------------------------------------------------------ 60 | # Setup & Requirement Checks 61 | # ------------------------------------------------------------------------------ 62 | 63 | # Check for a supported container runtime if an explicit one was not set 64 | if [ -z "${CONTAINER_RUNTIME}" ]; then 65 | if command -v docker &> /dev/null; then 66 | CONTAINER_RUNTIME="docker" 67 | elif command -v podman &> /dev/null; then 68 | CONTAINER_RUNTIME="podman" 69 | else 70 | echo "Neither docker nor podman could be found in PATH" >&2 71 | exit 1 72 | fi 73 | fi 74 | 75 | set -u 76 | 77 | # Check for required programs 78 | for cmd in kind kubectl kustomize ${CONTAINER_RUNTIME}; do 79 | if ! command -v "$cmd" &> /dev/null; then 80 | echo "Error: $cmd is not installed or not in the PATH." 
81 | exit 1 82 | fi 83 | done 84 | 85 | # ------------------------------------------------------------------------------ 86 | # Cluster Deployment 87 | # ------------------------------------------------------------------------------ 88 | 89 | # Check if the cluster already exists 90 | if kind get clusters 2>/dev/null | grep -q "^${CLUSTER_NAME}$"; then 91 | echo "Cluster '${CLUSTER_NAME}' already exists, re-using" 92 | else 93 | kind create cluster --name "${CLUSTER_NAME}" --config - << EOF 94 | kind: Cluster 95 | apiVersion: kind.x-k8s.io/v1alpha4 96 | nodes: 97 | - role: control-plane 98 | extraPortMappings: 99 | - containerPort: 30080 100 | hostPort: ${GATEWAY_HOST_PORT} 101 | protocol: TCP 102 | EOF 103 | fi 104 | 105 | # Set the kubectl context to the kind cluster 106 | KUBE_CONTEXT="kind-${CLUSTER_NAME}" 107 | kubectl config set-context ${KUBE_CONTEXT} --namespace=default 108 | 109 | set -x 110 | 111 | # Hotfix for https://github.com/kubernetes-sigs/kind/issues/3880 112 | CONTAINER_NAME="${CLUSTER_NAME}-control-plane" 113 | ${CONTAINER_RUNTIME} exec -it ${CONTAINER_NAME} /bin/bash -c "sysctl net.ipv4.conf.all.arp_ignore=0" 114 | 115 | # Wait for all pods to be ready 116 | kubectl --context ${KUBE_CONTEXT} -n kube-system wait --for=condition=Ready --all pods --timeout=300s 117 | 118 | echo "Waiting for local-path-storage pods to be created..." 
119 | until kubectl --context ${KUBE_CONTEXT} -n local-path-storage get pods -o name | grep -q pod/; do 120 | sleep 2 121 | done 122 | kubectl --context ${KUBE_CONTEXT} -n local-path-storage wait --for=condition=Ready --all pods --timeout=300s 123 | 124 | # ------------------------------------------------------------------------------ 125 | # Load Container Images 126 | # ------------------------------------------------------------------------------ 127 | 128 | # Load the vllm simulator image into the cluster 129 | if [ "${CONTAINER_RUNTIME}" == "podman" ]; then 130 | podman save ${IMAGE_REGISTRY}/${VLLM_SIMULATOR_IMAGE}:${VLLM_SIMULATOR_TAG} -o /dev/stdout | kind --name ${CLUSTER_NAME} load image-archive /dev/stdin 131 | else 132 | kind --name ${CLUSTER_NAME} load docker-image ${IMAGE_REGISTRY}/${VLLM_SIMULATOR_IMAGE}:${VLLM_SIMULATOR_TAG} 133 | fi 134 | 135 | # Load the ext_proc endpoint-picker image into the cluster 136 | if [ "${CONTAINER_RUNTIME}" == "podman" ]; then 137 | podman save ${IMAGE_REGISTRY}/${EPP_IMAGE}:${EPP_TAG} -o /dev/stdout | kind --name ${CLUSTER_NAME} load image-archive /dev/stdin 138 | else 139 | kind --name ${CLUSTER_NAME} load docker-image ${IMAGE_REGISTRY}/${EPP_IMAGE}:${EPP_TAG} 140 | fi 141 | # ------------------------------------------------------------------------------ 142 | # CRD Deployment (Gateway API + GIE) 143 | # ------------------------------------------------------------------------------ 144 | 145 | kustomize build deploy/components/crds-gateway-api | 146 | kubectl --context ${KUBE_CONTEXT} apply --server-side --force-conflicts -f - 147 | 148 | kustomize build deploy/components/crds-gie | 149 | kubectl --context ${KUBE_CONTEXT} apply --server-side --force-conflicts -f - 150 | 151 | kustomize build --enable-helm deploy/components/crds-istio | 152 | kubectl --context ${KUBE_CONTEXT} apply --server-side --force-conflicts -f - 153 | 154 | # ------------------------------------------------------------------------------ 155 | # 
Development Environment 156 | # ------------------------------------------------------------------------------ 157 | 158 | # Deploy the environment to the "default" namespace 159 | if [ "${PD_ENABLED}" != "\"true\"" ]; then 160 | KUSTOMIZE_DIR="deploy/environments/dev/kind-istio" 161 | else 162 | KUSTOMIZE_DIR="deploy/environments/dev/kind-istio-pd" 163 | fi 164 | kustomize build --enable-helm ${KUSTOMIZE_DIR} \ 165 | | envsubst \${POOL_NAME} | envsubst \${EPP_TAG} | envsubst \${VLLM_SIMULATOR_TAG} \ 166 | | envsubst \${PD_ENABLED} | envsubst \${PD_PROMPT_LEN_THRESHOLD} \ 167 | | envsubst \${ROUTING_SIDECAR_TAG} | envsubst \${VLLM_REPLICA_COUNT} \ 168 | | envsubst \${VLLM_REPLICA_COUNT_P} | envsubst \${VLLM_REPLICA_COUNT_D} \ 169 | | kubectl --context ${KUBE_CONTEXT} apply -f - 170 | 171 | # ------------------------------------------------------------------------------ 172 | # Check & Verify 173 | # ------------------------------------------------------------------------------ 174 | 175 | # Wait for all control-plane deployments to be ready 176 | kubectl --context ${KUBE_CONTEXT} -n llm-d-istio-system wait --for=condition=available --timeout=300s deployment --all 177 | 178 | # Wait for all deployments to be ready 179 | kubectl --context ${KUBE_CONTEXT} -n default wait --for=condition=available --timeout=300s deployment --all 180 | 181 | # Wait for the gateway to be ready 182 | kubectl --context ${KUBE_CONTEXT} wait gateway/inference-gateway --for=condition=Programmed --timeout=300s 183 | 184 | cat < 0) 50 | assert.NotEmpty(t, infResp.Choices[0].Text) 51 | } 52 | -------------------------------------------------------------------------------- /test/integration/suite_test.go: -------------------------------------------------------------------------------- 1 | //go:build integration_tests 2 | // +build integration_tests 3 | 4 | package integration_test 5 | 6 | import ( 7 | "fmt" 8 | "net/http" 9 | "os" 10 | "testing" 11 | 12 | "k8s.io/client-go/kubernetes" 13 | 
"k8s.io/client-go/kubernetes/scheme" 14 | "k8s.io/client-go/tools/clientcmd" 15 | gwinfv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" 16 | gwapiv1 "sigs.k8s.io/gateway-api/apis/v1" 17 | ) 18 | 19 | const ( 20 | gatewayURL = "http://localhost:30080" // TODO: make configurable 21 | ) 22 | 23 | var ( 24 | kube *kubernetes.Clientset 25 | ) 26 | 27 | func TestMain(m *testing.M) { 28 | if err := initializeKubernetesClient(); err != nil { 29 | fmt.Fprintf(os.Stderr, "failed to initialize kubernetes client: %v\n", err) 30 | os.Exit(1) 31 | } 32 | 33 | if err := initializeGateway(); err != nil { 34 | fmt.Fprintf(os.Stderr, "failed to initialize gateway: %v\n", err) 35 | os.Exit(1) 36 | } 37 | 38 | code := m.Run() 39 | os.Exit(code) 40 | } 41 | 42 | func initializeKubernetesClient() error { 43 | kubeConfigPath := os.Getenv("KUBECONFIG") 44 | if kubeConfigPath == "" { 45 | return fmt.Errorf("no KUBECONFIG set") 46 | } 47 | 48 | kubeConfig, err := clientcmd.BuildConfigFromFlags("", kubeConfigPath) 49 | if err != nil { 50 | return err 51 | } 52 | 53 | if err := gwapiv1.Install(scheme.Scheme); err != nil { 54 | return err 55 | } 56 | 57 | if err := gwinfv1alpha2.Install(scheme.Scheme); err != nil { 58 | return err 59 | } 60 | 61 | kube, err = kubernetes.NewForConfig(kubeConfig) 62 | if err != nil { 63 | return err 64 | } 65 | 66 | _, err = kube.ServerVersion() 67 | if err != nil { 68 | return fmt.Errorf("request to kubernetes api failed: %w", err) 69 | } 70 | 71 | return nil 72 | } 73 | 74 | func initializeGateway() (err error) { 75 | resp, err := http.Get(gatewayURL) 76 | if err != nil { 77 | return err 78 | } 79 | 80 | if resp.StatusCode != http.StatusNotFound { 81 | return fmt.Errorf("expected gateway to return 404, found: %s", resp.Status) 82 | } 83 | 84 | serverHeader := resp.Header.Get("Server") 85 | if serverHeader == "" { 86 | return fmt.Errorf(`expected gateway to return "istio-envoy" server header, found no value`) 87 | } 88 | if serverHeader != 
"istio-envoy" { 89 | return fmt.Errorf(`expected gateway to return "istio-envoy" server header, found: %s`, serverHeader) 90 | } 91 | 92 | return nil 93 | } 94 | --------------------------------------------------------------------------------