├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── PULL_REQUEST_TEMPLATE
│   │   └── pull_request_template.md
│   ├── actions
│   │   ├── docker-build-and-push
│   │   │   └── action.yml
│   │   ├── markdown-link-checker
│   │   │   └── action.yaml
│   │   ├── push-image
│   │   │   └── action.yml
│   │   └── trivy-scan
│   │       └── action.yml
│   └── workflows
│       ├── ci-pr-checks.yaml
│       ├── ci-release.yaml
│       └── old
│           └── pipeline-run.orig
├── .gitignore
├── .golangci.yml
├── .version.json
├── Dockerfile
├── LICENSE
├── Makefile
├── PROJECT
├── README.md
├── api
│   └── v1alpha1
│       ├── groupversion_info.go
│       ├── modelservice_types.go
│       └── zz_generated.deepcopy.go
├── cmd
│   ├── generate.go
│   ├── generate_test.go
│   ├── root.go
│   ├── run.go
│   └── suite_test.go
├── config
│   ├── crd
│   │   ├── bases
│   │   │   └── llm-d.ai_modelservices.yaml
│   │   ├── kustomization.yaml
│   │   └── kustomizeconfig.yaml
│   ├── default
│   │   ├── cert_metrics_manager_patch.yaml
│   │   ├── kustomization.yaml
│   │   ├── manager_metrics_patch.yaml
│   │   └── metrics_service.yaml
│   ├── dev
│   │   ├── kustomization.yaml
│   │   └── manager_patch.yaml
│   ├── eppandinference
│   │   ├── inferencepool-e2e.yaml
│   │   └── kustomization.yaml
│   ├── externalcrds
│   │   ├── bases
│   │   │   └── inferencecrds.yaml
│   │   └── kustomization.yaml
│   ├── manager
│   │   ├── kustomization.yaml
│   │   └── manager.yaml
│   ├── network-policy
│   │   ├── allow-metrics-traffic.yaml
│   │   └── kustomization.yaml
│   ├── prometheus
│   │   ├── kustomization.yaml
│   │   ├── monitor.yaml
│   │   └── monitor_tls_patch.yaml
│   ├── rbac
│   │   ├── epp_role_binding.yaml
│   │   ├── kustomization.yaml
│   │   ├── leader_election_role.yaml
│   │   ├── leader_election_role_binding.yaml
│   │   ├── metrics_auth_role.yaml
│   │   ├── metrics_auth_role_binding.yaml
│   │   ├── metrics_reader_role.yaml
│   │   ├── modelservice_admin_role.yaml
│   │   ├── modelservice_editor_role.yaml
│   │   ├── modelservice_viewer_role.yaml
│   │   ├── role.yaml
│   │   ├── role_binding.yaml
│   │   └── service_account.yaml
│   ├── samples
│   │   ├── kustomization.yaml
│   │   └── vllmd_v1alpha1_modelservice.yaml
│   └── summitdemo
│       ├── kustomization.yaml
│       └── manager_patch.yaml
├── deploy
│   ├── common
│   │   ├── patch-service.yaml
│   │   ├── patch-statefulset.yaml
│   │   ├── service.yaml
│   │   └── statefulset.yaml
│   ├── kustomization.yaml
│   ├── openshift
│   │   ├── patch-route.yaml
│   │   └── route.yaml
│   └── rbac
│       ├── exec-rbac-role.yaml
│       ├── exec-rbac-rolebinding.yaml
│       ├── patch-rbac-role.yaml
│       └── patch-rbac-rolebinding.yaml
├── docs
│   ├── api_reference
│   │   ├── config.yaml
│   │   ├── out.asciidoc
│   │   └── out.html
│   ├── apireference.md
│   ├── developer.md
│   ├── install.md
│   ├── userguide.md
│   └── userguide
│       ├── core-concepts.md
│       ├── model-artifacts.md
│       └── model-name.md
├── go.mod
├── go.sum
├── hack
│   └── boilerplate.go.txt
├── hooks
│   └── pre-commit
├── internal
│   └── controller
│       ├── accelerator_types.go
│       ├── accelerator_types_test.go
│       ├── child_resources.go
│       ├── child_resources_test.go
│       ├── constants.go
│       ├── merge_transformers.go
│       ├── merge_transformers_test.go
│       ├── modelservice_controller.go
│       ├── modelservice_controller_test.go
│       ├── suite_test.go
│       ├── template.go
│       ├── template_test.go
│       ├── utils.go
│       └── utils_test.go
├── main.go
├── model-service-arch.excalidraw
├── model-service-arch.png
├── perf
│   └── create_modelservice.sh
├── samples
│   ├── README.md
│   ├── baseconfigs
│   │   ├── simple-baseconfig.yaml
│   │   ├── universal-baseconfig-pvc.yaml
│   │   └── universal-baseconfig.yaml
│   ├── msvcs
│   │   ├── facebook-nixl.yaml
│   │   ├── granite3.2.yaml
│   │   ├── llama4.yaml
│   │   └── xpyd.yaml
│   └── test
│       ├── README.md
│       ├── baseconfig.yaml
│       ├── msvc-hf.yaml
│       └── msvc.yaml
└── test
    ├── e2e
    │   ├── e2e_suite_test.go
    │   └── e2e_test.go
    ├── inferenceCRDs
    │   ├── httproute.yaml
    │   ├── inferencemodel.yaml
    │   └── inferencepool.yaml
    ├── modelservices
    │   ├── baseResources.yaml
    │   └── ms1.yaml
    └── utils
        └── utils.go

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: kind/bug
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Additional context**
Add any other context about the problem here.

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I have the following use-case [...]. For this use-case, I would like `modelservice` to [...]

**Describe the solution approach you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.

--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md:
--------------------------------------------------------------------------------
---
name: Pull request
about: Create a pull request
title: ''
labels: ''
assignees: ''

---

# Pull Request Template

## Description

Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change.

Fixes # (issue)

## Type of change

Please delete options that are not relevant.

- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
- [ ] This change requires a documentation update

## How Has This Been Tested?

Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce.
Please also list any relevant details for your test configuration.

- [ ] Test A
- [ ] Test B

**Test Configuration**:
* OS (if applicable)
* Kubernetes version (if applicable)

## Checklist:

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my own code
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged and published in downstream modules
- [ ] I have checked my code and corrected any misspellings

--------------------------------------------------------------------------------
/.github/actions/docker-build-and-push/action.yml:
--------------------------------------------------------------------------------
name: Docker Build - ghcr
description: Build and push image using buildx
inputs:
  image-name:
    required: true
    description: Image name
  tag:
    required: true
    description: Image tag
  github-token:
    required: true
    description: GitHub token for login
  registry:
    required: true
    description: Container registry (e.g., ghcr.io/llm-d)
runs:
  using: "composite"
  steps:
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    - name: Login to GitHub Container Registry
      run: echo "${{ inputs.github-token }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin
      shell: bash

    - name: Print image info
      run: |
        echo "Image name: ${{ inputs.image-name }}"
        echo "Tag: ${{ inputs.tag }}"
        echo "Registry: ${{ inputs.registry }}"
      shell: bash

    - name: Build and push image
      run: |
        docker buildx build \
          --platform linux/amd64 \
          -t ${{ inputs.registry }}/${{ inputs.image-name }}:${{ inputs.tag }} \
          --push .
      shell: bash

--------------------------------------------------------------------------------
/.github/actions/markdown-link-checker/action.yaml:
--------------------------------------------------------------------------------
name: Markdown Link Checker
description: Checks all Markdown files for broken links
inputs:
  github-token:
    description: GitHub token (not used, but kept for interface compatibility)
    required: false
  args:
    description: Arguments to pass to markdown-link-check
    required: false
    default: "--quiet --retry"

runs:
  using: "composite"
  steps:
    - name: Install markdown-link-check
      shell: bash
      run: npm install -g markdown-link-check

    - name: Run link check on all Markdown files
      shell: bash
      run: |
        set -euo pipefail
        echo "🔍 Scanning all Markdown files for broken links..."
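        # Walk every Markdown file in the repo (NUL-delimited so paths with
        # spaces survive) and tally the dead links that markdown-link-check
        # reports for each file.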
        failed=0
        total_dead_links=0

        while IFS= read -r -d '' file; do
          echo "------------------------------------------------------------"
          echo "📄 Checking: $file"
          # markdown-link-check exits non-zero when a file has dead links;
          # tolerate that here (|| true) so the counting logic below still
          # runs under `set -e` instead of aborting on the first bad file.
          output=$(markdown-link-check ${{ inputs.args }} "$file" 2>&1) || true
          echo "$output"

          if echo "$output" | grep -q '✖'; then
            num_file_dead_links=$(echo "$output" | grep '✖' | wc -l)
            echo "❌ $num_file_dead_links broken links in $file"
            total_dead_links=$((total_dead_links + num_file_dead_links))
            failed=1
          else
            echo "✅ No broken links in $file"
          fi
        done < <(find . -type f -name "*.md" -print0)

        echo "------------------------------------------------------------"
        if [ "$failed" -ne 0 ]; then
          echo "❌ Total broken links found: $total_dead_links"
          exit 1
        else
          echo "✅ All Markdown files passed link checks."
        fi

--------------------------------------------------------------------------------
/.github/actions/push-image/action.yml:
--------------------------------------------------------------------------------
name: Push Docker Image
description: Push built image to container registry
inputs:
  image-name:
    required: true
  tag:
    required: true
  registry:
    required: true
runs:
  using: "composite"
  steps:
    - name: Push image
      run: |
        docker push ${{ inputs.registry }}/${{ inputs.image-name }}:${{ inputs.tag }}
      shell: bash

--------------------------------------------------------------------------------
/.github/actions/trivy-scan/action.yml:
--------------------------------------------------------------------------------
name: Trivy Scan
description: Scan container image with Trivy
inputs:
  image:
    required: true
runs:
  using: "composite"
  steps:
    - name: Install Trivy
      run: |
        wget https://github.com/aquasecurity/trivy/releases/download/v0.44.1/trivy_0.44.1_Linux-64bit.deb
        sudo dpkg -i trivy_0.44.1_Linux-64bit.deb
      shell: bash

    - name: Scan image
      run: |
        trivy image --severity HIGH,CRITICAL --no-progress ${{ inputs.image }}
      shell: bash

--------------------------------------------------------------------------------
/.github/workflows/ci-pr-checks.yaml:
--------------------------------------------------------------------------------
name: CI - PR Checks

on:
  pull_request:
    branches:
      - main

jobs:
  lint-and-test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout source
        uses: actions/checkout@v4

      - name: Sanity check repo contents
        run: ls -la

      - name: Set up go with cache
        uses: actions/setup-go@v5
        with:
          go-version: '1.24.0'
          cache-dependency-path: ./go.sum

      - name: Run markdown link checker
        uses: ./.github/actions/markdown-link-checker
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          args: "--quiet --retry"

      - name: Run lint checks
        uses: golangci/golangci-lint-action@v8
        with:
          version: 'v2.1.6'
          args: "--config=./.golangci.yml"

      - name: Run go test
        shell: bash
        run: |
          make test

--------------------------------------------------------------------------------
/.github/workflows/ci-release.yaml:
--------------------------------------------------------------------------------
name: CI - Release - Docker Container Image

on:
  push:
    tags:
      - 'v*' # Runs when a tag like v0.1.0 is pushed
  release:
types: [published] # Also runs when a GitHub release is published 9 | 10 | jobs: 11 | docker-build-and-push: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout source 15 | uses: actions/checkout@v4 16 | 17 | - name: Set project name from repository 18 | id: version 19 | run: | 20 | repo="${GITHUB_REPOSITORY##*/}" 21 | echo "project_name=$repo" >> "$GITHUB_OUTPUT" 22 | 23 | - name: Print project name 24 | run: echo "Project is ${{ steps.version.outputs.project_name }}" 25 | 26 | - name: Determine tag name 27 | id: tag 28 | run: | 29 | if [[ "${GITHUB_EVENT_NAME}" == "release" ]]; then 30 | echo "tag=${GITHUB_REF##refs/tags/}" >> "$GITHUB_OUTPUT" 31 | elif [[ "${GITHUB_REF}" == refs/tags/* ]]; then 32 | echo "tag=${GITHUB_REF##refs/tags/}" >> "$GITHUB_OUTPUT" 33 | else 34 | echo "tag=latest" >> "$GITHUB_OUTPUT" 35 | fi 36 | shell: bash 37 | 38 | - name: Build and push image 39 | uses: ./.github/actions/docker-build-and-push 40 | with: 41 | tag: ${{ steps.tag.outputs.tag }} 42 | image-name: ${{ steps.version.outputs.project_name }} 43 | registry: ghcr.io/llm-d 44 | github-token: ${{ secrets.GHCR_TOKEN }} 45 | 46 | - name: Run Trivy scan 47 | uses: ./.github/actions/trivy-scan 48 | with: 49 | image: ghcr.io/llm-d/${{ steps.version.outputs.project_name }}:${{ steps.tag.outputs.tag }} 50 | -------------------------------------------------------------------------------- /.github/workflows/old/pipeline-run.orig: -------------------------------------------------------------------------------- 1 | name: CI PipelineRun 2 | 3 | on: 4 | push: 5 | branches: 6 | - dev 7 | - main 8 | pull_request: 9 | 10 | jobs: 11 | pipeline: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout source 15 | uses: actions/checkout@v4 16 | 17 | - name: Sanity check repo contents 18 | run: ls -la 19 | 20 | - name: Run lint checks 21 | uses: ./.github/actions/lint 22 | 23 | - name: Build container image 24 | uses: ./.github/actions/docker-build-and-push 25 | with: 26 | image-name: my-app 27 | tag: ${{ github.sha }} 28 | github-token: ${{ secrets.GHCR_TOKEN }} 29 | 30 | - name: Run Trivy scan 31 | uses: ./.github/actions/trivy-scan 32 | with: 33 | image: ghcr.io/llm-d/my-app:${{ github.sha }} 34 | 35 | # - name: Push image 36 | # if: github.ref == 'refs/heads/main' 37 | # uses: ./.github/actions/push-image 38 | # with: 39 | # image-name: my-app 40 | # tag: ${{ github.sha }} 41 | # registry: ghcr.io/llm-d 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | bin/* 8 | Dockerfile.cross 9 | **/.DS_Store 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Go workspace file 18 | go.work 19 | 20 | # Kubernetes Generated files - skip generated files, except for vendored files 21 | !vendor/**/zz_generated.* 22 | 23 | # editor and IDE paraphernalia 24 | .idea 25 | .vscode 26 | *.swp 27 | *.swo 28 | *~ 29 | 30 | # temp files 31 | eppandinference.yaml 32 | externalcrds.yaml 33 | 34 | 35 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | # Refer to golangci-lint's example config file for more options and information: 2 | # 
https://github.com/golangci/golangci-lint/blob/master/.golangci.reference.yml
version: "2"

run:
  timeout: 5m
  modules-download-mode: readonly

linters:
  enable:
    - errcheck
    - govet
    - staticcheck

issues:
  max-issues-per-linter: 0
  max-same-issues: 0

--------------------------------------------------------------------------------
/.version.json:
--------------------------------------------------------------------------------
{
  "dev-version": "0.0.12",
  "dev-registry": "ghcr.io/llm-d/llm-d-model-service-dev",
  "prod-version": "0.0.11",
  "prod-registry": "ghcr.io/llm-d/llm-d-model-service"
}

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Build the manager binary
FROM quay.io/projectquay/golang:1.24 AS builder
ARG TARGETOS
ARG TARGETARCH

WORKDIR /workspace
# Copy the Go Modules manifests
COPY go.mod go.mod
COPY go.sum go.sum
# cache deps before building and copying source so that we don't need to re-download as much
# and so that source changes don't invalidate our downloaded layer
RUN go mod download

# Copy the go source
COPY main.go main.go
COPY api/ api/
COPY internal/ internal/
COPY cmd/ cmd/

# Build
# GOARCH is left without a default so that the binary is built for the platform on which
# the command was invoked. For example, running `make docker-build` locally on an Apple
# Silicon (M1) machine yields a docker BUILDPLATFORM of linux/arm64, while on Apple x86 it
# is linux/amd64. Leaving it empty ensures the container and the binary shipped in it
# share the same platform.
RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o bin/manager main.go

# Use a minimal base image to package the manager binary
FROM registry.access.redhat.com/ubi9/ubi:latest
WORKDIR /
COPY --from=builder /workspace/bin/manager /manager
USER 65532:65532

ENTRYPOINT ["/manager"]

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022 The Kubernetes Authors 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /PROJECT: -------------------------------------------------------------------------------- 1 | # Code generated by tool. DO NOT EDIT. 2 | # This file is used to track the info used to scaffold your project 3 | # and allow the plugins properly work. 4 | # More info: https://book.kubebuilder.io/reference/project-config.html 5 | domain: llm-d.ai 6 | layout: 7 | - go.kubebuilder.io/v4 8 | projectName: modelservice 9 | repo: github.com/llm-d/llm-d-model-service 10 | resources: 11 | - api: 12 | crdVersion: v1 13 | namespaced: true 14 | controller: true 15 | domain: llm-d.ai 16 | group: llmd 17 | kind: ModelService 18 | path: github.com/llm-d/llm-d-model-service/api/v1alpha1 19 | version: v1alpha1 20 | version: "3" 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ModelService 2 | 3 | > *ModelService* declaratively provisions and maintains the Kubernetes resources needed to serve a base model for inference. 4 | 5 | A *ModelService* custom resource encapsulates the desired state of workloads and routing associated with a single base model. It automates the management of Kubernetes resources, including: 6 | 7 | * Prefill and decode deployments 8 | * Inference pool and model defined by [Gateway API Inference Extension](https://gateway-api-inference-extension.sigs.k8s.io) 9 | * [Endpoint picker (EPP) deployment and service](https://gateway-api-inference-extension.sigs.k8s.io/?h=endpoint#endpoint-selection-extension) 10 | * Relevant RBAC permissions 11 | 12 | A *ModelService* may optionally reference a **BaseConfig** — a Kubernetes ConfigMap that defines reusable, platform-managed presets for shared behavior across multiple base models. 13 | 14 | Typically, platform operators define a small set of *BaseConfig* presets, and base model owners reference them in their respective *ModelService* resources. 15 | 16 | The *ModelService* controller reconciles the cluster state to align with the configuration declared in the *ModelService* custom resource. This custom resource is the source of truth for resources it owns. 17 | 18 | > ⚠️ Important: Do not manually modify resources owned by a *ModelService*. If your use case is not yet supported, please file an issue in the *ModelService* repository. 19 | 20 | ## Features 21 | 22 | ✅ Supports disaggregated prefill and decode workloads 23 | 24 | 🌐 Integrates with Gateway API Inference Extension for request routing 25 | 26 | 📈 Enables auto-scaling via HPA or custom controllers 27 | 28 | 🔧 Allows independent scaling and node affinity for prefill and decode deployments 29 | 30 | 📦 Supports model loading from: 31 | 32 | * HuggingFace (public or private) 33 | * Kubernetes PVCs 34 | * OCI images 35 | 36 | 🧩 Supports value templating in both *BaseConfig* and *ModelService* resources 37 | 38 | ## How It Works 39 | 40 | When a *ModelService* resource is reconciled: 41 | 42 | 1. **Templating**: template variables in *BaseConfig* and *ModelService* are interpolated based on the *ModelService* spec. 43 | 44 | 2. **Merging**: a semantic merge overlays *ModelService* values on top of the selected *BaseConfig*. 45 | 46 | 3. 
**Orchestration**: the controller creates or updates the following resources:

* Inference workloads (prefill and decode deployments)
* Routing resources (e.g., EPP deployment)
* RBAC permissions

The result is a fully managed inference stack for the base model.

![model-service-arch](model-service-arch.png)

## Best Practices

* Use *BaseConfig* to capture platform-level defaults and shared configurations across multiple base models.
* Use *ModelService* to define behavior specific to a given base model, and override *BaseConfig* values only when necessary.
* Platform teams should install *BaseConfig* presets using the `llm-d` deployer.
* Base model owners should prefer these presets to streamline onboarding of base models, rather than creating their own *BaseConfigs*.

## Docs

### [Install](docs/install.md)

### [Samples](./samples/README.md)

### [User Guide](docs/userguide.md)

### [API Reference](docs/apireference.md)

### [Developer](docs/developer.md)

## Roadmap

`ModelService` roadmap features, in no particular order:

1. Multiple base models: create HTTPRoute and related routing configuration

2. LoRA adapters: create a LoRA controller that integrates with `ModelService`

3. Routing weights: allow a logical model to expose multiple model versions via routing weights

4. In-cluster model caching: download model artifacts once into the cluster and reuse them

5. Node-level model caching: pre-load model artifacts onto nodes for fast model loading

6. BaseConfig CRD: migrate `baseconfig` resources from ConfigMaps to a CRD

7. Prometheus metrics exporter: emit controller metrics

8. Enable multi-node inferencing: for instance, using LWS integration

--------------------------------------------------------------------------------
/api/v1alpha1/groupversion_info.go:
--------------------------------------------------------------------------------
// Package v1alpha1 contains API Schema definitions for the llmd v1alpha1 API group.
// +kubebuilder:object:generate=true
// +groupName=llm-d.ai
package v1alpha1

import (
	"k8s.io/apimachinery/pkg/runtime/schema"
	"sigs.k8s.io/controller-runtime/pkg/scheme"
)

var (
	// GroupVersion is group version used to register these objects.
	GroupVersion = schema.GroupVersion{Group: "llm-d.ai", Version: "v1alpha1"}

	// SchemeBuilder is used to add go types to the GroupVersionKind scheme.
	SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}

	// AddToScheme adds the types in this group-version to the given scheme.
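	// As a usage sketch (this mirrors the scheme wiring in cmd/run.go; the
	// variable names here are illustrative only):
	//
	//	scheme := runtime.NewScheme()
	//	utilruntime.Must(v1alpha1.AddToScheme(scheme))
	//	// clients and caches built with this scheme can now decode ModelService objects
	//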
19 | AddToScheme = SchemeBuilder.AddToScheme 20 | ) 21 | -------------------------------------------------------------------------------- /cmd/generate.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | 8 | "github.com/go-logr/logr" 9 | "github.com/spf13/cobra" 10 | zaplog "go.uber.org/zap" 11 | "go.uber.org/zap/zapcore" 12 | corev1 "k8s.io/api/core/v1" 13 | "k8s.io/client-go/kubernetes/scheme" 14 | "sigs.k8s.io/controller-runtime/pkg/log" 15 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 16 | "sigs.k8s.io/yaml" 17 | 18 | msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1" 19 | "github.com/llm-d/llm-d-model-service/internal/controller" 20 | giev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" 21 | gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" 22 | ) 23 | 24 | func readModelService(ctx context.Context, filename string, logger logr.Logger) (*msv1alpha1.ModelService, error) { 25 | var modelService msv1alpha1.ModelService 26 | data, err := os.ReadFile(filename) 27 | if err != nil { 28 | logger.Error(err, "unable to read ModelService from "+filename) 29 | return nil, err 30 | } 31 | 32 | err = yaml.Unmarshal(data, &modelService) 33 | if err != nil { 34 | logger.Error(err, "unable to unmarshal data") 35 | return nil, err 36 | } 37 | 38 | // interpolate MSVC 39 | return controller.InterpolateModelService(ctx, &modelService) 40 | } 41 | 42 | func getBaseChildResources(filename string, msvc *msv1alpha1.ModelService, logger logr.Logger) (*controller.BaseConfig, error) { 43 | var baseChildResourcesConfigMap *corev1.ConfigMap 44 | var baseChildResources *controller.BaseConfig 45 | 46 | if filename != "" { 47 | data, err := os.ReadFile(filename) 48 | if err != nil { 49 | if os.IsNotExist(err) { 50 | logger.Error(err, "unable to read base child resources from "+filename) 51 | return nil, err 52 | } 53 | data = []byte{} 54 | } 55 | 56 | err = yaml.Unmarshal(data, &baseChildResourcesConfigMap) 57 | if err != nil { 58 | logger.Error(err, "unable to unmarshal base child resources") 59 | return nil, err 60 | } 61 | } else { 62 | baseChildResourcesConfigMap = &corev1.ConfigMap{} 63 | } 64 | 65 | interpolated, err := controller.InterpolateBaseConfigMap(context.TODO(), baseChildResourcesConfigMap, msvc) 66 | if err != nil { 67 | logger.Error(err, "cannot interpolate base configmap") 68 | return nil, err 69 | } 70 | 71 | baseChildResources, err = controller.BaseConfigFromCM(interpolated) 72 | if err != nil { 73 | logger.Error(err, "unable to create base child resources from config map") 74 | return nil, err 75 | } 76 | 77 | return baseChildResources, nil 78 | } 79 | 80 | func generateManifests(ctx context.Context, manifestFile string, configFile string) (*string, error) { 81 | logger := log.FromContext(ctx) 82 | 83 | // get msvc from file and interpolate it 84 | msvc, err := readModelService(ctx, manifestFile, logger) 85 | if err != nil { 86 | logger.Error(err, "unable to read ModelService", "location", manifestFile) 87 | return nil, err 88 | } 89 | logger.V(1).Info("generateManifest", "modelService", msvc) 90 | 91 | // get base child resources from file 92 | config, err := getBaseChildResources(configFile, msvc, logger) 93 | if err != nil { 94 | logger.Error(err, "unable to read basic configuration", "location", configFile) 95 | return nil, err 96 | } 97 | logger.V(1).Info("generateManifest", "baseResources", config) 98 | 99 | // create scheme 100 | err = 
msv1alpha1.AddToScheme(scheme.Scheme)
	if err != nil {
		logger.Error(err, "unable to add ModelService types to scheme")
		return nil, err
	}
	err = gatewayv1.Install(scheme.Scheme)
	if err != nil {
		logger.Error(err, "unable to add Gateway API types to scheme")
		return nil, err
	}
	err = giev1alpha2.Install(scheme.Scheme)
	if err != nil {
		logger.Error(err, "unable to add Gateway API inference extension types to scheme")
		return nil, err
	}

	// update child resources
	cR := config.MergeChildResources(ctx, msvc, scheme.Scheme, &rbacOptions)
	logger.V(1).Info("generateManifest", "baseResources", cR)

	yamlStr := ""
	yamlBytes, err := yaml.Marshal(&cR)
	if err != nil {
		logger.Error(err, "unable to marshal object to YAML")
		return nil, err
	}

	yamlStr = string(yamlBytes)
	return &yamlStr, nil
}

var modelServiceManifest string
var baseConfigurationManifest string

var generateCmd = &cobra.Command{
	Use:   "generate",
	Short: "Generate manifest",
	Long:  `Generate manifest for objects created by the ModelService controller`,
	RunE: func(cmd *cobra.Command, args []string) error {
		ctx := context.Background()
		var opts = zap.Options{
			Development: false,
			TimeEncoder: zapcore.RFC3339NanoTimeEncoder,
			ZapOpts:     []zaplog.Option{zaplog.AddCaller()},
			Level:       parseZapLogLevel(logLevel),
		}
		logger := zap.New(zap.UseFlagOptions(&opts))
		log.SetLogger(logger)
		// keep the context returned by IntoContext; discarding it would leave
		// the logger out of the context passed to generateManifests
		ctx = log.IntoContext(ctx, logger)

		result, err := generateManifests(ctx, modelServiceManifest, baseConfigurationManifest)
		if err != nil {
			return err
		}

		fmt.Println(*result)
		return nil
	},
}

func init() {
	generateCmd.Flags().StringVarP(&modelServiceManifest, "modelservice", "m", "", "File containing the ModelService definition.")
	_ = generateCmd.MarkFlagRequired("modelservice")
	generateCmd.Flags().StringVarP(&baseConfigurationManifest, "baseconfig", "b", "", "File containing the base platform configuration.")
	rootCmd.AddCommand(generateCmd)
}

--------------------------------------------------------------------------------
/cmd/generate_test.go:
--------------------------------------------------------------------------------
package cmd

import (
	"context"
	"path/filepath"

	. "github.com/onsi/ginkgo/v2"
	.
"github.com/onsi/gomega" 9 | ) 10 | 11 | var _ = Describe("generate command", func() { 12 | 13 | var ctx context.Context 14 | 15 | BeforeEach(func() { 16 | ctx = context.Background() 17 | }) 18 | 19 | Context("simulate call", func() { 20 | modelServiceYaml := filepath.Join("..", "samples", "test", "msvc.yaml") 21 | baseConfigYaml := filepath.Join("..", "samples", "test", "baseconfig.yaml") 22 | rootCmd.SetArgs([]string{ 23 | "--epp-cluster-role=dummy", 24 | "generate", 25 | "-m", modelServiceYaml, 26 | "-b", baseConfigYaml, 27 | }) 28 | err := rootCmd.Execute() 29 | Expect(err).ToNot(HaveOccurred()) 30 | }) 31 | 32 | Context("call with valid inputs", func() { 33 | modelServiceYaml := filepath.Join("..", "samples", "test", "msvc.yaml") 34 | baseConfigYaml := filepath.Join("..", "samples", "test", "baseconfig.yaml") 35 | It("should generate manifests", func() { 36 | msvc, err := generateManifests(ctx, modelServiceYaml, baseConfigYaml) 37 | Expect(err).To(BeNil()) 38 | Expect(msvc).ToNot(BeNil()) 39 | }) 40 | }) 41 | 42 | Context("call with invalid modelService filename", func() { 43 | modelServiceYaml := filepath.Join(".", "invalid") 44 | baseConfigYaml := filepath.Join("..", "samples", "test", "baseconfig.yaml") 45 | It("should report an error", func() { 46 | _, err := generateManifests(ctx, modelServiceYaml, baseConfigYaml) 47 | Expect(err).ToNot(BeNil()) 48 | }) 49 | }) 50 | 51 | Context("call with invalid modelService content", func() { 52 | modelServiceYaml := filepath.Join("..", "test", "modelservices", "invalidyaml.yaml") 53 | baseConfigYaml := filepath.Join("..", "samples", "test", "baseconfig.yaml") 54 | It("should report an error", func() { 55 | _, err := generateManifests(ctx, modelServiceYaml, baseConfigYaml) 56 | Expect(err).ToNot(BeNil()) 57 | }) 58 | }) 59 | 60 | Context("call with empty baseConfiguration filename", func() { 61 | modelServiceYaml := filepath.Join("..", "samples", "test", "msvc.yaml") 62 | baseConfigYaml := "" 63 | It("should generate manifests", func() { 64 | msvc, err := generateManifests(ctx, modelServiceYaml, baseConfigYaml) 65 | Expect(err).To(BeNil()) 66 | Expect(msvc).ToNot(BeNil()) 67 | }) 68 | }) 69 | 70 | Context("call with invalid baseConfiguration filename", func() { 71 | modelServiceYaml := filepath.Join("..", "samples", "test", "msvc.yaml") 72 | baseConfigYaml := filepath.Join(".", "invalid") 73 | It("should report an error", func() { 74 | _, err := generateManifests(ctx, modelServiceYaml, baseConfigYaml) 75 | Expect(err).ToNot(BeNil()) 76 | }) 77 | }) 78 | 79 | Context("call with invalid baseConfiguration content", func() { 80 | modelServiceYaml := filepath.Join("..", "samples", "test", "msvc.yaml") 81 | baseConfigYaml := filepath.Join("..", "test", "modelservices", "invalidyaml.yaml") 82 | It("should report an error", func() { 83 | _, err := generateManifests(ctx, modelServiceYaml, baseConfigYaml) 84 | Expect(err).ToNot(BeNil()) 85 | }) 86 | }) 87 | }) 88 | -------------------------------------------------------------------------------- /cmd/root.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/llm-d/llm-d-model-service/internal/controller" 7 | "github.com/spf13/cobra" 8 | ) 9 | 10 | // rbac options 11 | var rbacOptions controller.RBACOptions 12 | 13 | // rootCmd represents the base command when called without any subcommands 14 | var rootCmd = &cobra.Command{ 15 | Use: "manager", 16 | Short: "ModelService controller CLI", 17 | Long: `ModelService 
controller CLI`,
}

// Execute adds all child commands to the root command and sets flags appropriately.
// This is called by main.main(). It only needs to happen once to the rootCmd.
func Execute() {
	if err := rootCmd.Execute(); err != nil {
		os.Exit(1)
	}
}

func init() {
	// secrets & cluster roles
	rootCmd.PersistentFlags().StringVar(&rbacOptions.EPPClusterRole, "epp-cluster-role", "", "Name of the epp cluster role")
	_ = rootCmd.MarkPersistentFlagRequired("epp-cluster-role")
	rootCmd.PersistentFlags().StringSliceVar(&rbacOptions.EPPPullSecrets, "epp-pull-secrets", []string{}, "List of pull secrets for configuring the epp deployment")
	rootCmd.PersistentFlags().StringSliceVar(&rbacOptions.PDPullSecrets, "pd-pull-secrets", []string{}, "List of pull secrets for configuring the prefill and decode deployments")
}

--------------------------------------------------------------------------------
/cmd/run.go:
--------------------------------------------------------------------------------
package cmd

import (
	"crypto/tls"
	"fmt"
	"os"
	"path/filepath"

	// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
	// to ensure that exec-entrypoint and run can make use of them.

	_ "k8s.io/client-go/plugin/pkg/client/auth"

	"github.com/spf13/cobra"
	"k8s.io/apimachinery/pkg/runtime"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/certwatcher"
	"sigs.k8s.io/controller-runtime/pkg/healthz"

	zaplog "go.uber.org/zap"
	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
	"sigs.k8s.io/controller-runtime/pkg/webhook"

	msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1"
	"github.com/llm-d/llm-d-model-service/internal/controller"
	"go.uber.org/zap/zapcore"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"
	giev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
	gatewayv1 "sigs.k8s.io/gateway-api/apis/v1"
	// +kubebuilder:scaffold:imports
)

var (
	setupLog = ctrl.Log.WithName("setup")
)

var metricsAddr string
var metricsCertPath, metricsCertName, metricsCertKey string
var webhookCertPath, webhookCertName, webhookCertKey string
var defaultsYAMLPath string
var enableLeaderElection bool
var probeAddr string
var secureMetrics bool
var enableHTTP2 bool
var tlsOpts []func(*tls.Config)

// Flags for zap logger
var logLevel string

func init() {
	// logger
	runCmd.PersistentFlags().StringVarP(&logLevel, "log-level", "l", "info", "Set the logging level (debug, info, warn, error, dpanic, panic, fatal)")

	// added by kubebuilder
	runCmd.Flags().BoolP("toggle", "t", false, "Help message for toggle")
	runCmd.Flags().StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
		"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
	runCmd.Flags().StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
	runCmd.Flags().BoolVar(&enableLeaderElection, "leader-elect", false,
		"Enable leader election for controller manager. 
"+ 64 | "Enabling this will ensure there is only one active controller manager.") 65 | runCmd.Flags().BoolVar(&secureMetrics, "metrics-secure", true, 66 | "If set, the metrics endpoint is served securely via HTTPS. Use --metrics-secure=false to use HTTP instead.") 67 | runCmd.Flags().StringVar(&webhookCertPath, "webhook-cert-path", "", "The directory that contains the webhook certificate.") 68 | runCmd.Flags().StringVar(&webhookCertName, "webhook-cert-name", "tls.crt", "The name of the webhook certificate file.") 69 | runCmd.Flags().StringVar(&webhookCertKey, "webhook-cert-key", "tls.key", "The name of the webhook key file.") 70 | runCmd.Flags().StringVar(&metricsCertPath, "metrics-cert-path", "", 71 | "The directory that contains the metrics server certificate.") 72 | runCmd.Flags().StringVar(&metricsCertName, "metrics-cert-name", "tls.crt", "The name of the metrics server certificate file.") 73 | runCmd.Flags().StringVar(&metricsCertKey, "metrics-cert-key", "tls.key", "The name of the metrics server key file.") 74 | runCmd.Flags().BoolVar(&enableHTTP2, "enable-http2", false, 75 | "If set, HTTP/2 will be enabled for the metrics and webhook servers") 76 | runCmd.Flags().StringVar(&defaultsYAMLPath, "defaults-yaml-path", "", "The YAML file containing the controller defaults.") 77 | 78 | rootCmd.AddCommand(runCmd) 79 | } 80 | 81 | // nolint:gocyclo 82 | func runController() { 83 | scheme := runtime.NewScheme() 84 | utilruntime.Must(clientgoscheme.AddToScheme(scheme)) 85 | 86 | utilruntime.Must(msv1alpha1.AddToScheme(scheme)) 87 | utilruntime.Must(gatewayv1.Install(scheme)) 88 | utilruntime.Must(giev1alpha2.Install(scheme)) 89 | var opts = zap.Options{ 90 | Development: false, 91 | TimeEncoder: zapcore.RFC3339NanoTimeEncoder, 92 | ZapOpts: []zaplog.Option{zaplog.AddCaller()}, 93 | Level: parseZapLogLevel(logLevel), 94 | } 95 | // +kubebuilder:scaffold:scheme 96 | ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) 97 | 98 | // if the enable-http2 flag is false (the default), http/2 should be disabled 99 | // due to its vulnerabilities. More specifically, disabling http/2 will 100 | // prevent from being vulnerable to the HTTP/2 Stream Cancellation and 101 | // Rapid Reset CVEs. 
For more information see: 102 | // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 103 | // - https://github.com/advisories/GHSA-4374-p667-p6c8 104 | disableHTTP2 := func(c *tls.Config) { 105 | setupLog.Info("disabling http/2") 106 | c.NextProtos = []string{"http/1.1"} 107 | } 108 | 109 | if !enableHTTP2 { 110 | tlsOpts = append(tlsOpts, disableHTTP2) 111 | } 112 | 113 | // Create watchers for metrics and webhooks certificates 114 | var metricsCertWatcher, webhookCertWatcher *certwatcher.CertWatcher 115 | 116 | // Initial webhook TLS options 117 | webhookTLSOpts := tlsOpts 118 | 119 | if len(webhookCertPath) > 0 { 120 | setupLog.Info("Initializing webhook certificate watcher using provided certificates", 121 | "webhook-cert-path", webhookCertPath, "webhook-cert-name", webhookCertName, "webhook-cert-key", webhookCertKey) 122 | 123 | var err error 124 | webhookCertWatcher, err = certwatcher.New( 125 | filepath.Join(webhookCertPath, webhookCertName), 126 | filepath.Join(webhookCertPath, webhookCertKey), 127 | ) 128 | if err != nil { 129 | setupLog.Error(err, "Failed to initialize webhook certificate watcher") 130 | os.Exit(1) 131 | } 132 | 133 | webhookTLSOpts = append(webhookTLSOpts, func(config *tls.Config) { 134 | config.GetCertificate = webhookCertWatcher.GetCertificate 135 | }) 136 | } 137 | 138 | webhookServer := webhook.NewServer(webhook.Options{ 139 | TLSOpts: webhookTLSOpts, 140 | }) 141 | 142 | // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server. 143 | // More info: 144 | // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.20.4/pkg/metrics/server 145 | // - https://book.kubebuilder.io/reference/metrics.html 146 | metricsServerOptions := metricsserver.Options{ 147 | BindAddress: metricsAddr, 148 | SecureServing: secureMetrics, 149 | TLSOpts: tlsOpts, 150 | } 151 | 152 | if secureMetrics { 153 | // FilterProvider is used to protect the metrics endpoint with authn/authz. 154 | // These configurations ensure that only authorized users and service accounts 155 | // can access the metrics endpoint. The RBAC are configured in 'config/rbac/kustomization.yaml'. More info: 156 | // https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.20.4/pkg/metrics/filters#WithAuthenticationAndAuthorization 157 | metricsServerOptions.FilterProvider = filters.WithAuthenticationAndAuthorization 158 | } 159 | 160 | // If the certificate is not specified, controller-runtime will automatically 161 | // generate self-signed certificates for the metrics server. While convenient for development and testing, 162 | // this setup is not recommended for production. 163 | // 164 | // TODO(user): If you enable certManager, uncomment the following lines: 165 | // - [METRICS-WITH-CERTS] at config/default/kustomization.yaml to generate and use certificates 166 | // managed by cert-manager for the metrics server. 167 | // - [PROMETHEUS-WITH-CERTS] at config/prometheus/kustomization.yaml for TLS certification. 
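	// For example (hypothetical local invocation; these flags are registered in
	// init above, and tls.crt/tls.key match the flag defaults):
	//
	//	./manager run --metrics-bind-address=:8443 --metrics-cert-path=/tmp/k8s-metrics-server/metrics-certs
	//
	// The watcher set up below then serves HTTPS metrics with those certificates
	// and reloads them when they are rotated.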
168 | if len(metricsCertPath) > 0 { 169 | setupLog.Info("Initializing metrics certificate watcher using provided certificates", 170 | "metrics-cert-path", metricsCertPath, "metrics-cert-name", metricsCertName, "metrics-cert-key", metricsCertKey) 171 | 172 | var err error 173 | metricsCertWatcher, err = certwatcher.New( 174 | filepath.Join(metricsCertPath, metricsCertName), 175 | filepath.Join(metricsCertPath, metricsCertKey), 176 | ) 177 | if err != nil { 178 | setupLog.Error(err, "to initialize metrics certificate watcher", "error", err) 179 | os.Exit(1) 180 | } 181 | 182 | metricsServerOptions.TLSOpts = append(metricsServerOptions.TLSOpts, func(config *tls.Config) { 183 | config.GetCertificate = metricsCertWatcher.GetCertificate 184 | }) 185 | } 186 | 187 | mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ 188 | Scheme: scheme, 189 | Metrics: metricsServerOptions, 190 | WebhookServer: webhookServer, 191 | HealthProbeBindAddress: probeAddr, 192 | LeaderElection: enableLeaderElection, 193 | LeaderElectionID: "f01a4b9d.llm-d.ai", 194 | // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily 195 | // when the Manager ends. This requires the binary to immediately end when the 196 | // Manager is stopped, otherwise, this setting is unsafe. Setting this significantly 197 | // speeds up voluntary leader transitions as the new leader don't have to wait 198 | // LeaseDuration time first. 199 | // 200 | // In the default scaffold provided, the program ends immediately after 201 | // the manager stops, so would be fine to enable this option. However, 202 | // if you are doing or is intended to do any operation such as perform cleanups 203 | // after the manager stops then its usage might be unsafe. 204 | // LeaderElectionReleaseOnCancel: true, 205 | }) 206 | if err != nil { 207 | setupLog.Error(err, "unable to start manager") 208 | os.Exit(1) 209 | } 210 | 211 | // Step 1: Read in the modelServiceDefaults struct 212 | // Pass that into Reconciler below 213 | 214 | if err = (&controller.ModelServiceReconciler{ 215 | Client: mgr.GetClient(), 216 | Scheme: mgr.GetScheme(), 217 | RBACOptions: rbacOptions, 218 | // Defaults: &modelServiceDefaults // from above 219 | }).SetupWithManager(mgr); err != nil { 220 | setupLog.Error(err, "unable to create controller", "controller", "ModelService") 221 | os.Exit(1) 222 | } 223 | // +kubebuilder:scaffold:builder 224 | 225 | if metricsCertWatcher != nil { 226 | setupLog.Info("Adding metrics certificate watcher to manager") 227 | if err := mgr.Add(metricsCertWatcher); err != nil { 228 | setupLog.Error(err, "unable to add metrics certificate watcher to manager") 229 | os.Exit(1) 230 | } 231 | } 232 | 233 | if webhookCertWatcher != nil { 234 | setupLog.Info("Adding webhook certificate watcher to manager") 235 | if err := mgr.Add(webhookCertWatcher); err != nil { 236 | setupLog.Error(err, "unable to add webhook certificate watcher to manager") 237 | os.Exit(1) 238 | } 239 | } 240 | 241 | if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { 242 | setupLog.Error(err, "unable to set up health check") 243 | os.Exit(1) 244 | } 245 | if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { 246 | setupLog.Error(err, "unable to set up ready check") 247 | os.Exit(1) 248 | } 249 | 250 | setupLog.Info("starting manager") 251 | if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { 252 | setupLog.Error(err, "problem running manager") 253 | os.Exit(1) 254 | } 255 | } 256 | 257 | // runCmd represents the base 
subcommand, which starts the ModelService controller 258 | var runCmd = &cobra.Command{ 259 | Use: "run", 260 | Short: "Run the ModelService controller", 261 | Long: `Run the ModelService controller`, 262 | PersistentPreRunE: func(cmd *cobra.Command, args []string) error { 263 | if len(rbacOptions.EPPClusterRole) < 1 { 264 | err := fmt.Errorf("valid EPP cluster role is required") 265 | return err 266 | } 267 | return nil 268 | }, 269 | Run: func(cmd *cobra.Command, args []string) { 270 | runController() 271 | }, 272 | } 273 | 274 | func parseZapLogLevel(levelStr string) zapcore.Level { 275 | switch levelStr { 276 | case "debug": 277 | return zapcore.DebugLevel 278 | case "info": 279 | return zapcore.InfoLevel 280 | case "warn": 281 | return zapcore.WarnLevel 282 | case "error": 283 | return zapcore.ErrorLevel 284 | case "dpanic": 285 | return zapcore.DPanicLevel 286 | case "panic": 287 | return zapcore.PanicLevel 288 | case "fatal": 289 | return zapcore.FatalLevel 290 | default: 291 | return zapcore.InfoLevel 292 | } 293 | } 294 | -------------------------------------------------------------------------------- /cmd/suite_test.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "testing" 5 | 6 | . "github.com/onsi/ginkgo/v2" 7 | . "github.com/onsi/gomega" 8 | 9 | logf "sigs.k8s.io/controller-runtime/pkg/log" 10 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 11 | ) 12 | 13 | // These tests use Ginkgo (BDD-style Go testing framework). Refer to 14 | // http://onsi.github.io/ginkgo/ to learn more about Ginkgo. 15 | 16 | func TestControllers(t *testing.T) { 17 | RegisterFailHandler(Fail) 18 | 19 | RunSpecs(t, "Controller Suite") 20 | } 21 | 22 | var _ = BeforeSuite(func() { 23 | logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) 24 | }) 25 | -------------------------------------------------------------------------------- /config/crd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # This kustomization.yaml is not intended to be run by itself, 2 | # since it depends on service name and namespace that are out of this kustomize package. 3 | # It should be run by config/default 4 | resources: 5 | - bases/llm-d.ai_modelservices.yaml 6 | # +kubebuilder:scaffold:crdkustomizeresource 7 | 8 | patches: [] 9 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. 10 | # patches here are for enabling the conversion webhook for each CRD 11 | # +kubebuilder:scaffold:crdkustomizewebhookpatch 12 | 13 | # [WEBHOOK] To enable webhook, uncomment the following section 14 | # the following config is for teaching kustomize how to do kustomization for CRDs. 
15 | #configurations: 16 | #- kustomizeconfig.yaml 17 | -------------------------------------------------------------------------------- /config/crd/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This file is for teaching kustomize how to substitute name and namespace reference in CRD 2 | nameReference: 3 | - kind: Service 4 | version: v1 5 | fieldSpecs: 6 | - kind: CustomResourceDefinition 7 | version: v1 8 | group: apiextensions.k8s.io 9 | path: spec/conversion/webhook/clientConfig/service/name 10 | 11 | namespace: 12 | - kind: CustomResourceDefinition 13 | version: v1 14 | group: apiextensions.k8s.io 15 | path: spec/conversion/webhook/clientConfig/service/namespace 16 | create: false 17 | 18 | varReference: 19 | - path: metadata/annotations 20 | -------------------------------------------------------------------------------- /config/default/cert_metrics_manager_patch.yaml: -------------------------------------------------------------------------------- 1 | # This patch adds the args, volumes, and ports to allow the manager to use the metrics-server certs. 2 | 3 | # Add the volumeMount for the metrics-server certs 4 | - op: add 5 | path: /spec/template/spec/containers/0/volumeMounts/- 6 | value: 7 | mountPath: /tmp/k8s-metrics-server/metrics-certs 8 | name: metrics-certs 9 | readOnly: true 10 | 11 | # Add the --metrics-cert-path argument for the metrics server 12 | - op: add 13 | path: /spec/template/spec/containers/0/args/- 14 | value: --metrics-cert-path=/tmp/k8s-metrics-server/metrics-certs 15 | 16 | # Add the metrics-server certs volume configuration 17 | - op: add 18 | path: /spec/template/spec/volumes/- 19 | value: 20 | name: metrics-certs 21 | secret: 22 | secretName: metrics-server-cert 23 | optional: false 24 | items: 25 | - key: ca.crt 26 | path: ca.crt 27 | - key: tls.crt 28 | path: tls.crt 29 | - key: tls.key 30 | path: tls.key 31 | -------------------------------------------------------------------------------- /config/default/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # Adds namespace to all resources. 2 | namespace: modelservice-system 3 | 4 | # Value of this field is prepended to the 5 | # names of all resources, e.g. a deployment named 6 | # "wordpress" becomes "alices-wordpress". 7 | # Note that it should also match with the prefix (text before '-') of the namespace 8 | # field above. 9 | namePrefix: modelservice- 10 | 11 | # Labels to add to all resources and selectors. 12 | #labels: 13 | #- includeSelectors: true 14 | # pairs: 15 | # someName: someValue 16 | 17 | resources: 18 | # - ../crd 19 | - ../rbac 20 | - ../manager 21 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in 22 | # crd/kustomization.yaml 23 | #- ../webhook 24 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. 25 | #- ../certmanager 26 | # [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. 27 | #- ../prometheus 28 | # [METRICS] Expose the controller manager metrics service. 29 | - metrics_service.yaml 30 | # [NETWORK POLICY] Protect the /metrics endpoint and Webhook Server with NetworkPolicy. 31 | # Only Pod(s) running a namespace labeled with 'metrics: enabled' will be able to gather the metrics. 
32 | # Only CR(s) which requires webhooks and are applied on namespaces labeled with 'webhooks: enabled' will 33 | # be able to communicate with the Webhook Server. 34 | #- ../network-policy 35 | 36 | # Uncomment the patches line if you enable Metrics 37 | patches: 38 | # [METRICS] The following patch will enable the metrics endpoint using HTTPS and the port :8443. 39 | # More info: https://book.kubebuilder.io/reference/metrics 40 | - path: manager_metrics_patch.yaml 41 | target: 42 | kind: Deployment 43 | 44 | # Uncomment the patches line if you enable Metrics and CertManager 45 | # [METRICS-WITH-CERTS] To enable metrics protected with certManager, uncomment the following line. 46 | # This patch will protect the metrics with certManager self-signed certs. 47 | #- path: cert_metrics_manager_patch.yaml 48 | # target: 49 | # kind: Deployment 50 | 51 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in 52 | # crd/kustomization.yaml 53 | #- path: manager_webhook_patch.yaml 54 | # target: 55 | # kind: Deployment 56 | 57 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. 58 | # Uncomment the following replacements to add the cert-manager CA injection annotations 59 | #replacements: 60 | # - source: # Uncomment the following block to enable certificates for metrics 61 | # kind: Service 62 | # version: v1 63 | # name: controller-manager-metrics-service 64 | # fieldPath: metadata.name 65 | # targets: 66 | # - select: 67 | # kind: Certificate 68 | # group: cert-manager.io 69 | # version: v1 70 | # name: metrics-certs 71 | # fieldPaths: 72 | # - spec.dnsNames.0 73 | # - spec.dnsNames.1 74 | # options: 75 | # delimiter: '.' 76 | # index: 0 77 | # create: true 78 | # - select: # Uncomment the following to set the Service name for TLS config in Prometheus ServiceMonitor 79 | # kind: ServiceMonitor 80 | # group: monitoring.coreos.com 81 | # version: v1 82 | # name: controller-manager-metrics-monitor 83 | # fieldPaths: 84 | # - spec.endpoints.0.tlsConfig.serverName 85 | # options: 86 | # delimiter: '.' 87 | # index: 0 88 | # create: true 89 | # 90 | # - source: 91 | # kind: Service 92 | # version: v1 93 | # name: controller-manager-metrics-service 94 | # fieldPath: metadata.namespace 95 | # targets: 96 | # - select: 97 | # kind: Certificate 98 | # group: cert-manager.io 99 | # version: v1 100 | # name: metrics-certs 101 | # fieldPaths: 102 | # - spec.dnsNames.0 103 | # - spec.dnsNames.1 104 | # options: 105 | # delimiter: '.' 106 | # index: 1 107 | # create: true 108 | # - select: # Uncomment the following to set the Service namespace for TLS in Prometheus ServiceMonitor 109 | # kind: ServiceMonitor 110 | # group: monitoring.coreos.com 111 | # version: v1 112 | # name: controller-manager-metrics-monitor 113 | # fieldPaths: 114 | # - spec.endpoints.0.tlsConfig.serverName 115 | # options: 116 | # delimiter: '.' 117 | # index: 1 118 | # create: true 119 | # 120 | # - source: # Uncomment the following block if you have any webhook 121 | # kind: Service 122 | # version: v1 123 | # name: webhook-service 124 | # fieldPath: .metadata.name # Name of the service 125 | # targets: 126 | # - select: 127 | # kind: Certificate 128 | # group: cert-manager.io 129 | # version: v1 130 | # name: serving-cert 131 | # fieldPaths: 132 | # - .spec.dnsNames.0 133 | # - .spec.dnsNames.1 134 | # options: 135 | # delimiter: '.' 
136 | # index: 0 137 | # create: true 138 | # - source: 139 | # kind: Service 140 | # version: v1 141 | # name: webhook-service 142 | # fieldPath: .metadata.namespace # Namespace of the service 143 | # targets: 144 | # - select: 145 | # kind: Certificate 146 | # group: cert-manager.io 147 | # version: v1 148 | # name: serving-cert 149 | # fieldPaths: 150 | # - .spec.dnsNames.0 151 | # - .spec.dnsNames.1 152 | # options: 153 | # delimiter: '.' 154 | # index: 1 155 | # create: true 156 | # 157 | # - source: # Uncomment the following block if you have a ValidatingWebhook (--programmatic-validation) 158 | # kind: Certificate 159 | # group: cert-manager.io 160 | # version: v1 161 | # name: serving-cert # This name should match the one in certificate.yaml 162 | # fieldPath: .metadata.namespace # Namespace of the certificate CR 163 | # targets: 164 | # - select: 165 | # kind: ValidatingWebhookConfiguration 166 | # fieldPaths: 167 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 168 | # options: 169 | # delimiter: '/' 170 | # index: 0 171 | # create: true 172 | # - source: 173 | # kind: Certificate 174 | # group: cert-manager.io 175 | # version: v1 176 | # name: serving-cert 177 | # fieldPath: .metadata.name 178 | # targets: 179 | # - select: 180 | # kind: ValidatingWebhookConfiguration 181 | # fieldPaths: 182 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 183 | # options: 184 | # delimiter: '/' 185 | # index: 1 186 | # create: true 187 | # 188 | # - source: # Uncomment the following block if you have a DefaultingWebhook (--defaulting ) 189 | # kind: Certificate 190 | # group: cert-manager.io 191 | # version: v1 192 | # name: serving-cert 193 | # fieldPath: .metadata.namespace # Namespace of the certificate CR 194 | # targets: 195 | # - select: 196 | # kind: MutatingWebhookConfiguration 197 | # fieldPaths: 198 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 199 | # options: 200 | # delimiter: '/' 201 | # index: 0 202 | # create: true 203 | # - source: 204 | # kind: Certificate 205 | # group: cert-manager.io 206 | # version: v1 207 | # name: serving-cert 208 | # fieldPath: .metadata.name 209 | # targets: 210 | # - select: 211 | # kind: MutatingWebhookConfiguration 212 | # fieldPaths: 213 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 214 | # options: 215 | # delimiter: '/' 216 | # index: 1 217 | # create: true 218 | # 219 | # - source: # Uncomment the following block if you have a ConversionWebhook (--conversion) 220 | # kind: Certificate 221 | # group: cert-manager.io 222 | # version: v1 223 | # name: serving-cert 224 | # fieldPath: .metadata.namespace # Namespace of the certificate CR 225 | # targets: # Do not remove or uncomment the following scaffold marker; required to generate code for target CRD. 226 | # +kubebuilder:scaffold:crdkustomizecainjectionns 227 | # - source: 228 | # kind: Certificate 229 | # group: cert-manager.io 230 | # version: v1 231 | # name: serving-cert 232 | # fieldPath: .metadata.name 233 | # targets: # Do not remove or uncomment the following scaffold marker; required to generate code for target CRD. 
234 | # +kubebuilder:scaffold:crdkustomizecainjectionname 235 | -------------------------------------------------------------------------------- /config/default/manager_metrics_patch.yaml: -------------------------------------------------------------------------------- 1 | # This patch adds the args to allow exposing the metrics endpoint using HTTPS 2 | - op: add 3 | path: /spec/template/spec/containers/0/args/0 4 | value: --metrics-bind-address=:8443 5 | -------------------------------------------------------------------------------- /config/default/metrics_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | control-plane: controller-manager 6 | app.kubernetes.io/name: modelservice 7 | app.kubernetes.io/managed-by: kustomize 8 | name: controller-manager-metrics-service 9 | namespace: system 10 | spec: 11 | ports: 12 | - name: https 13 | port: 8443 14 | protocol: TCP 15 | targetPort: 8443 16 | selector: 17 | control-plane: controller-manager 18 | app.kubernetes.io/name: modelservice 19 | -------------------------------------------------------------------------------- /config/dev/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - ../default 3 | 4 | patches: 5 | # [MANAGER] The following patch provides access to the image 6 | - target: 7 | kind: Deployment 8 | path: manager_patch.yaml 9 | 10 | -------------------------------------------------------------------------------- /config/dev/manager_patch.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Add image pull secret 3 | - op: add 4 | path: /spec/template/spec/imagePullSecrets 5 | value: 6 | - name: $IMAGE_PULL_SECRET 7 | 8 | # to configure pull secrets for pd and epp pods, uncomment and set environment variables 9 | - op: add 10 | path: /spec/template/spec/containers/0/args/- 11 | value: --pd-pull-secrets=$PD_PULL_SECRETS 12 | - op: add 13 | path: /spec/template/spec/containers/0/args/- 14 | value: --epp-pull-secrets=$EPP_PULL_SECRETS 15 | 16 | # Make image pull policy configurable -- e.g., Always, Never, IfNotPresent; dev images typically require Always 17 | - op: add 18 | path: /spec/template/spec/containers/0/imagePullPolicy 19 | value: ${IMAGE_PULL_POLICY} -------------------------------------------------------------------------------- /config/eppandinference/inferencepool-e2e.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: modelservice 6 | app.kubernetes.io/managed-by: kustomize 7 | name: inferenceandepp 8 | --- 9 | # The deployer has to create and deploy this 10 | # The ModelService operator is not creating this for the summit-demo 11 | 12 | apiVersion: inference.networking.x-k8s.io/v1alpha2 13 | kind: InferencePool 14 | metadata: 15 | name: vllm-llama-4-scout-17b-16e-instruct 16 | spec: 17 | # Need to match operatordefaults.yaml/vllmdProxyContainer.ports and look for the "proxy-port" name AND "port" in vllmdProxyContainer.args.args 18 | targetPortNumber: 8000 19 | selector: 20 | app: vllm-llama-4-scout-17b-16e-instruct 21 | extensionRef: 22 | name: vllm-llama-4-scout-17b-16e-instruct-epp 23 | # model-related artifacts will be in the same namespace 
24 | # namespace: $E2E_NS 25 | --- 26 | apiVersion: v1 27 | kind: Service 28 | metadata: 29 | name: vllm-llama-4-scout-17b-16e-instruct-epp 30 | # model-related artifacts will be in the same namespace 31 | # namespace: $E2E_NS 32 | spec: 33 | selector: 34 | app: vllm-llama-4-scout-17b-16e-instruct-epp 35 | ports: 36 | - protocol: TCP 37 | port: 9002 38 | targetPort: 9002 39 | appProtocol: http2 40 | type: ClusterIP 41 | --- 42 | apiVersion: apps/v1 43 | kind: Deployment 44 | metadata: 45 | name: vllm-llama-4-scout-17b-16e-instruct-epp 46 | # model-related artifacts will be in the same namespace 47 | # namespace: $E2E_NS 48 | labels: 49 | app: vllm-llama-4-scout-17b-16e-instruct-epp 50 | spec: 51 | replicas: 1 52 | selector: 53 | matchLabels: 54 | app: vllm-llama-4-scout-17b-16e-instruct-epp 55 | template: 56 | metadata: 57 | labels: 58 | app: vllm-llama-4-scout-17b-16e-instruct-epp 59 | spec: 60 | # Conservatively, this timeout should mirror the longest grace period of the pods within the pool 61 | terminationGracePeriodSeconds: 130 62 | serviceAccountName: inferenceandepp 63 | containers: 64 | - name: epp 65 | image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main 66 | imagePullPolicy: Always 67 | args: 68 | - -poolName 69 | - "vllm-llama-4-scout-17b-16e-instruct" 70 | # model-related artifacts will be in the same namespace 71 | # - -poolNamespace 72 | # - "$E2E_NS" 73 | - -v 74 | - "4" 75 | - --zap-encoder 76 | - "json" 77 | - -grpcPort 78 | - "9002" 79 | - -grpcHealthPort 80 | - "9003" 81 | env: 82 | - name: USE_STREAMING 83 | value: "true" 84 | ports: 85 | - containerPort: 9002 86 | - containerPort: 9003 87 | - name: metrics 88 | containerPort: 9090 89 | livenessProbe: 90 | grpc: 91 | port: 9003 92 | service: inference-extension 93 | initialDelaySeconds: 5 94 | periodSeconds: 10 95 | readinessProbe: 96 | grpc: 97 | port: 9003 98 | service: inference-extension 99 | initialDelaySeconds: 5 100 | periodSeconds: 10 101 | --- 102 | kind: ClusterRole 103 | apiVersion: rbac.authorization.k8s.io/v1 104 | metadata: 105 | name: pod-read 106 | rules: 107 | - apiGroups: ["inference.networking.x-k8s.io"] 108 | resources: ["inferencemodels"] 109 | verbs: ["get", "watch", "list"] 110 | - apiGroups: [""] 111 | resources: ["pods"] 112 | verbs: ["get", "watch", "list"] 113 | - apiGroups: ["inference.networking.x-k8s.io"] 114 | resources: ["inferencepools"] 115 | verbs: ["get", "watch", "list"] 116 | - apiGroups: ["discovery.k8s.io"] 117 | resources: ["endpointslices"] 118 | verbs: ["get", "watch", "list"] 119 | - apiGroups: 120 | - authentication.k8s.io 121 | resources: 122 | - tokenreviews 123 | verbs: 124 | - create 125 | - apiGroups: 126 | - authorization.k8s.io 127 | resources: 128 | - subjectaccessreviews 129 | verbs: 130 | - create 131 | --- 132 | kind: ClusterRoleBinding 133 | apiVersion: rbac.authorization.k8s.io/v1 134 | metadata: 135 | name: pod-read-binding 136 | subjects: 137 | - kind: ServiceAccount 138 | name: inferenceandepp 139 | # model-related artifacts will be in the same namespace 140 | # namespace: $E2E_NS 141 | roleRef: 142 | apiGroup: rbac.authorization.k8s.io 143 | kind: ClusterRole 144 | name: pod-read -------------------------------------------------------------------------------- /config/eppandinference/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # Adds namespace to all resources. 
2 | namespace: e2e-solution 3 | 4 | resources: 5 | - inferencepool-e2e.yaml -------------------------------------------------------------------------------- /config/externalcrds/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - bases/inferencecrds.yaml -------------------------------------------------------------------------------- /config/manager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manager.yaml 3 | apiVersion: kustomize.config.k8s.io/v1beta1 4 | kind: Kustomization 5 | images: 6 | - name: controller 7 | newName: ghcr.io/llm-d/llm-d-model-service 8 | newTag: 0.0.1 9 | -------------------------------------------------------------------------------- /config/manager/manager.yaml: -------------------------------------------------------------------------------- 1 | # apiVersion: v1 2 | # kind: Namespace 3 | # metadata: 4 | # labels: 5 | # control-plane: controller-manager 6 | # app.kubernetes.io/name: modelservice 7 | # app.kubernetes.io/managed-by: kustomize 8 | # name: system 9 | # --- 10 | apiVersion: apps/v1 11 | kind: Deployment 12 | metadata: 13 | name: controller-manager 14 | namespace: system 15 | labels: 16 | control-plane: controller-manager 17 | app.kubernetes.io/name: modelservice 18 | app.kubernetes.io/managed-by: kustomize 19 | spec: 20 | selector: 21 | matchLabels: 22 | control-plane: controller-manager 23 | app.kubernetes.io/name: modelservice 24 | replicas: 1 25 | template: 26 | metadata: 27 | annotations: 28 | kubectl.kubernetes.io/default-container: manager 29 | labels: 30 | control-plane: controller-manager 31 | app.kubernetes.io/name: modelservice 32 | spec: 33 | # TODO(user): Uncomment the following code to configure the nodeAffinity expression 34 | # according to the platforms which are supported by your solution. 35 | # It is considered best practice to support multiple architectures. You can 36 | # build your manager image using the makefile target docker-buildx. 37 | # affinity: 38 | # nodeAffinity: 39 | # requiredDuringSchedulingIgnoredDuringExecution: 40 | # nodeSelectorTerms: 41 | # - matchExpressions: 42 | # - key: kubernetes.io/arch 43 | # operator: In 44 | # values: 45 | # - amd64 46 | # - arm64 47 | # - ppc64le 48 | # - s390x 49 | # - key: kubernetes.io/os 50 | # operator: In 51 | # values: 52 | # - linux 53 | securityContext: 54 | # Projects are configured by default to adhere to the "restricted" Pod Security Standards. 55 | # This ensures that deployments meet the highest security requirements for Kubernetes. 
56 | # For more details, see: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted 57 | runAsNonRoot: true 58 | seccompProfile: 59 | type: RuntimeDefault 60 | containers: 61 | - command: 62 | - /manager 63 | - run 64 | args: 65 | - --leader-elect=false 66 | - --health-probe-bind-address=:8081 67 | - --log-level=$LOG_LEVEL 68 | - --epp-cluster-role=$EPP_CLUSTER_ROLE 69 | image: controller:latest 70 | imagePullPolicy: Always 71 | name: manager 72 | ports: [] 73 | securityContext: 74 | allowPrivilegeEscalation: false 75 | capabilities: 76 | drop: 77 | - "ALL" 78 | livenessProbe: 79 | httpGet: 80 | path: /healthz 81 | port: 8081 82 | initialDelaySeconds: 15 83 | periodSeconds: 20 84 | readinessProbe: 85 | httpGet: 86 | path: /readyz 87 | port: 8081 88 | initialDelaySeconds: 5 89 | periodSeconds: 10 90 | # TODO(user): Configure the resources accordingly based on the project requirements. 91 | # More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ 92 | resources: 93 | limits: 94 | cpu: 500m 95 | memory: 512Mi 96 | requests: 97 | cpu: 10m 98 | memory: 128Mi 99 | volumeMounts: [] 100 | volumes: [] 101 | serviceAccountName: controller-manager 102 | terminationGracePeriodSeconds: 10 103 | -------------------------------------------------------------------------------- /config/network-policy/allow-metrics-traffic.yaml: -------------------------------------------------------------------------------- 1 | # This NetworkPolicy allows ingress traffic 2 | # with Pods running on namespaces labeled with 'metrics: enabled'. Only Pods on those 3 | # namespaces are able to gather data from the metrics endpoint. 4 | apiVersion: networking.k8s.io/v1 5 | kind: NetworkPolicy 6 | metadata: 7 | labels: 8 | app.kubernetes.io/name: modelservice 9 | app.kubernetes.io/managed-by: kustomize 10 | name: allow-metrics-traffic 11 | namespace: system 12 | spec: 13 | podSelector: 14 | matchLabels: 15 | control-plane: controller-manager 16 | app.kubernetes.io/name: modelservice 17 | policyTypes: 18 | - Ingress 19 | ingress: 20 | # This allows ingress traffic from any namespace with the label metrics: enabled 21 | - from: 22 | - namespaceSelector: 23 | matchLabels: 24 | metrics: enabled # Only from namespaces with this label 25 | ports: 26 | - port: 8443 27 | protocol: TCP 28 | -------------------------------------------------------------------------------- /config/network-policy/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - allow-metrics-traffic.yaml 3 | -------------------------------------------------------------------------------- /config/prometheus/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - monitor.yaml 3 | 4 | # [PROMETHEUS-WITH-CERTS] The following patch configures the ServiceMonitor in ../prometheus 5 | # to securely reference certificates created and managed by cert-manager. 6 | # Additionally, ensure that you uncomment the [METRICS WITH CERTMANAGER] patch under config/default/kustomization.yaml 7 | # to mount the "metrics-server-cert" secret in the Manager Deployment. 
8 | #patches: 9 | # - path: monitor_tls_patch.yaml 10 | # target: 11 | # kind: ServiceMonitor 12 | -------------------------------------------------------------------------------- /config/prometheus/monitor.yaml: -------------------------------------------------------------------------------- 1 | # Prometheus Monitor Service (Metrics) 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: ServiceMonitor 4 | metadata: 5 | labels: 6 | control-plane: controller-manager 7 | app.kubernetes.io/name: modelservice 8 | app.kubernetes.io/managed-by: kustomize 9 | name: controller-manager-metrics-monitor 10 | namespace: system 11 | spec: 12 | endpoints: 13 | - path: /metrics 14 | port: https # Ensure this is the name of the port that exposes HTTPS metrics 15 | scheme: https 16 | bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 17 | tlsConfig: 18 | # TODO(user): The option insecureSkipVerify: true is not recommended for production since it disables 19 | # certificate verification, exposing the system to potential man-in-the-middle attacks. 20 | # For production environments, it is recommended to use cert-manager for automatic TLS certificate management. 21 | # To apply this configuration, enable cert-manager and use the patch located at config/prometheus/servicemonitor_tls_patch.yaml, 22 | # which securely references the certificate from the 'metrics-server-cert' secret. 23 | insecureSkipVerify: true 24 | selector: 25 | matchLabels: 26 | control-plane: controller-manager 27 | app.kubernetes.io/name: modelservice 28 | -------------------------------------------------------------------------------- /config/prometheus/monitor_tls_patch.yaml: -------------------------------------------------------------------------------- 1 | # Patch for Prometheus ServiceMonitor to enable secure TLS configuration 2 | # using certificates managed by cert-manager 3 | - op: replace 4 | path: /spec/endpoints/0/tlsConfig 5 | value: 6 | # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize 7 | serverName: SERVICE_NAME.SERVICE_NAMESPACE.svc 8 | insecureSkipVerify: false 9 | ca: 10 | secret: 11 | name: metrics-server-cert 12 | key: ca.crt 13 | cert: 14 | secret: 15 | name: metrics-server-cert 16 | key: tls.crt 17 | keySecret: 18 | name: metrics-server-cert 19 | key: tls.key 20 | -------------------------------------------------------------------------------- /config/rbac/epp_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: modelservice 6 | app.kubernetes.io/managed-by: kustomize 7 | name: manager-epp-rolebinding 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | name: $EPP_CLUSTER_ROLE 12 | subjects: 13 | - kind: ServiceAccount 14 | name: controller-manager 15 | namespace: system 16 | -------------------------------------------------------------------------------- /config/rbac/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | # All RBAC will be applied under this service account in 3 | # the deployment namespace. You may comment out this resource 4 | # if your manager will use a service account that exists at 5 | # runtime. Be sure to update RoleBinding and ClusterRoleBinding 6 | # subjects if changing service account names. 
7 | - service_account.yaml 8 | - role.yaml 9 | - role_binding.yaml 10 | - epp_role_binding.yaml 11 | - leader_election_role.yaml 12 | - leader_election_role_binding.yaml 13 | # The following RBAC configurations are used to protect 14 | # the metrics endpoint with authn/authz. These configurations 15 | # ensure that only authorized users and service accounts 16 | # can access the metrics endpoint. Comment the following 17 | # permissions if you want to disable this protection. 18 | # More info: https://book.kubebuilder.io/reference/metrics.html 19 | - metrics_auth_role.yaml 20 | - metrics_auth_role_binding.yaml 21 | - metrics_reader_role.yaml 22 | # For each CRD, "Admin", "Editor" and "Viewer" roles are scaffolded by 23 | # default, aiding admins in cluster management. Those roles are 24 | # not used by the {{ .ProjectName }} itself. You can comment the following lines 25 | # if you do not want those helpers be installed with your Project. 26 | - modelservice_admin_role.yaml 27 | - modelservice_editor_role.yaml 28 | - modelservice_viewer_role.yaml 29 | 30 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions to do leader election. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: modelservice 7 | app.kubernetes.io/managed-by: kustomize 8 | name: leader-election-role 9 | rules: 10 | - apiGroups: 11 | - "" 12 | resources: 13 | - configmaps 14 | verbs: 15 | - get 16 | - list 17 | - watch 18 | - create 19 | - update 20 | - patch 21 | - delete 22 | - apiGroups: 23 | - coordination.k8s.io 24 | resources: 25 | - leases 26 | verbs: 27 | - get 28 | - list 29 | - watch 30 | - create 31 | - update 32 | - patch 33 | - delete 34 | - apiGroups: 35 | - "" 36 | resources: 37 | - events 38 | verbs: 39 | - create 40 | - patch 41 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: modelservice 6 | app.kubernetes.io/managed-by: kustomize 7 | name: leader-election-rolebinding 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: Role 11 | name: leader-election-role 12 | subjects: 13 | - kind: ServiceAccount 14 | name: controller-manager 15 | namespace: system 16 | -------------------------------------------------------------------------------- /config/rbac/metrics_auth_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: metrics-auth-role 5 | rules: 6 | - apiGroups: 7 | - authentication.k8s.io 8 | resources: 9 | - tokenreviews 10 | verbs: 11 | - create 12 | - apiGroups: 13 | - authorization.k8s.io 14 | resources: 15 | - subjectaccessreviews 16 | verbs: 17 | - create 18 | -------------------------------------------------------------------------------- /config/rbac/metrics_auth_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: metrics-auth-rolebinding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 
| name: metrics-auth-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: controller-manager 12 | namespace: system 13 | -------------------------------------------------------------------------------- /config/rbac/metrics_reader_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: metrics-reader 5 | rules: 6 | - nonResourceURLs: 7 | - "/metrics" 8 | verbs: 9 | - get 10 | -------------------------------------------------------------------------------- /config/rbac/modelservice_admin_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project modelservice itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants full permissions ('*') over llm-d.ai. 5 | # This role is intended for users authorized to modify roles and bindings within the cluster, 6 | # enabling them to delegate specific permissions to other users or groups as needed. 7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: modelservice 13 | app.kubernetes.io/managed-by: kustomize 14 | name: modelservice-admin-role 15 | rules: 16 | - apiGroups: 17 | - llm-d.ai 18 | resources: 19 | - modelservices 20 | verbs: 21 | - '*' 22 | - apiGroups: 23 | - llm-d.ai 24 | resources: 25 | - modelservices/status 26 | verbs: 27 | - get 28 | -------------------------------------------------------------------------------- /config/rbac/modelservice_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project modelservice itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants permissions to create, update, and delete resources within the llm-d.ai. 5 | # This role is intended for users who need to manage these resources 6 | # but should not control RBAC or manage permissions for others. 7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: modelservice 13 | app.kubernetes.io/managed-by: kustomize 14 | name: modelservice-editor-role 15 | rules: 16 | - apiGroups: 17 | - llm-d.ai 18 | resources: 19 | - modelservices 20 | verbs: 21 | - create 22 | - delete 23 | - get 24 | - list 25 | - patch 26 | - update 27 | - watch 28 | - apiGroups: 29 | - llm-d.ai 30 | resources: 31 | - modelservices/status 32 | verbs: 33 | - get 34 | -------------------------------------------------------------------------------- /config/rbac/modelservice_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project modelservice itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants read-only access to llm-d.ai resources. 5 | # This role is intended for users who need visibility into these resources 6 | # without permissions to modify them. It is ideal for monitoring purposes and limited-access viewing. 
7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: modelservice 13 | app.kubernetes.io/managed-by: kustomize 14 | name: modelservice-viewer-role 15 | rules: 16 | - apiGroups: 17 | - llm-d.ai 18 | resources: 19 | - modelservices 20 | verbs: 21 | - get 22 | - list 23 | - watch 24 | - apiGroups: 25 | - llm-d.ai 26 | resources: 27 | - modelservices/status 28 | verbs: 29 | - get 30 | -------------------------------------------------------------------------------- /config/rbac/role.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: manager-role 6 | rules: 7 | - apiGroups: 8 | - "" 9 | resources: 10 | - configmaps 11 | - serviceaccounts 12 | verbs: 13 | - create 14 | - delete 15 | - get 16 | - list 17 | - patch 18 | - update 19 | - watch 20 | - apiGroups: 21 | - "" 22 | resources: 23 | - services 24 | verbs: 25 | - create 26 | - delete 27 | - list 28 | - patch 29 | - update 30 | - watch 31 | - apiGroups: 32 | - apps 33 | resources: 34 | - deployments 35 | verbs: 36 | - create 37 | - delete 38 | - get 39 | - list 40 | - patch 41 | - update 42 | - watch 43 | - apiGroups: 44 | - apps 45 | resources: 46 | - deployments/scale 47 | verbs: 48 | - patch 49 | - update 50 | - apiGroups: 51 | - gateway.networking.k8s.io 52 | resources: 53 | - httproutes 54 | verbs: 55 | - create 56 | - delete 57 | - get 58 | - list 59 | - patch 60 | - update 61 | - watch 62 | - apiGroups: 63 | - inference.networking.x-k8s.io 64 | resources: 65 | - inferencemodels 66 | - inferencepools 67 | verbs: 68 | - create 69 | - delete 70 | - get 71 | - list 72 | - patch 73 | - update 74 | - watch 75 | - apiGroups: 76 | - llm-d.ai 77 | resources: 78 | - modelservices 79 | verbs: 80 | - create 81 | - delete 82 | - get 83 | - list 84 | - patch 85 | - update 86 | - watch 87 | - apiGroups: 88 | - llm-d.ai 89 | resources: 90 | - modelservices/finalizers 91 | verbs: 92 | - update 93 | - apiGroups: 94 | - llm-d.ai 95 | resources: 96 | - modelservices/status 97 | verbs: 98 | - get 99 | - patch 100 | - update 101 | - apiGroups: 102 | - rbac.authorization.k8s.io 103 | resources: 104 | - rolebindings 105 | verbs: 106 | - create 107 | - delete 108 | - get 109 | - list 110 | - patch 111 | - update 112 | - watch 113 | -------------------------------------------------------------------------------- /config/rbac/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: modelservice 6 | app.kubernetes.io/managed-by: kustomize 7 | name: manager-rolebinding 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | name: manager-role 12 | subjects: 13 | - kind: ServiceAccount 14 | name: controller-manager 15 | namespace: system 16 | -------------------------------------------------------------------------------- /config/rbac/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: modelservice 6 | app.kubernetes.io/managed-by: kustomize 7 | name: controller-manager 8 | namespace: system 9 | -------------------------------------------------------------------------------- /config/samples/kustomization.yaml: 
-------------------------------------------------------------------------------- 1 | ## Append samples of your project ## 2 | resources: 3 | - vllmd_v1alpha1_modelservice.yaml 4 | # +kubebuilder:scaffold:manifestskustomizesamples 5 | -------------------------------------------------------------------------------- /config/samples/vllmd_v1alpha1_modelservice.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: modelservice 6 | app.kubernetes.io/managed-by: kustomize 7 | name: modelservice-sample 8 | spec: 9 | # TODO(user): Add fields here 10 | -------------------------------------------------------------------------------- /config/summitdemo/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # Adds namespace to all resources. 2 | namespace: e2e-solution 3 | 4 | resources: 5 | - ../default 6 | 7 | patches: 8 | - target: 9 | kind: Deployment 10 | path: manager_patch.yaml 11 | -------------------------------------------------------------------------------- /config/summitdemo/manager_patch.yaml: -------------------------------------------------------------------------------- 1 | # Make image pull policy Always if using a dev image 2 | - op: add 3 | path: /spec/template/spec/containers/0/imagePullPolicy 4 | value: Always 5 | 6 | # Add image pull secret 7 | - op: add 8 | path: /spec/template/spec/imagePullSecrets 9 | value: 10 | - name: $IMAGE_PULL_SECRET 11 | 12 | # to configure pull secrets for pd and epp pods, uncomment 13 | - op: add 14 | path: /spec/template/spec/containers/0/args/- 15 | value: --pd-pull-secrets=$PD_PULL_SECRETS 16 | - op: add 17 | path: /spec/template/spec/containers/0/args/- 18 | value: --epp-pull-secrets=$EPP_PULL_SECRETS 19 | -------------------------------------------------------------------------------- /deploy/common/patch-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: service 5 | spec: 6 | selector: 7 | app: ${PROJECT_NAME}-service 8 | ports: 9 | - protocol: TCP 10 | port: 8080 11 | targetPort: 8080 12 | type: ClusterIP 13 | -------------------------------------------------------------------------------- /deploy/common/patch-statefulset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: StatefulSet 3 | metadata: 4 | name: "0" 5 | spec: 6 | serviceName: ${PROJECT_NAME}-service 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | app: ${PROJECT_NAME}-statefulset 11 | template: 12 | metadata: 13 | labels: 14 | app: ${PROJECT_NAME}-statefulset 15 | spec: 16 | serviceAccountName: operator-controller-manager 17 | containers: 18 | - name: cmd 19 | image: ${IMAGE_TAG_BASE}:${VERSION} 20 | imagePullPolicy: Always 21 | -------------------------------------------------------------------------------- /deploy/common/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: service 5 | spec: 6 | selector: 7 | app: placeholder 8 | ports: 9 | - protocol: TCP 10 | port: 8080 11 | targetPort: 8080 12 | type: ClusterIP 13 | -------------------------------------------------------------------------------- /deploy/common/statefulset.yaml: -------------------------------------------------------------------------------- 1 | 
apiVersion: apps/v1 2 | kind: StatefulSet 3 | metadata: 4 | name: "0" 5 | spec: 6 | serviceName: placeholder 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | app: placeholder 11 | template: 12 | metadata: 13 | labels: 14 | app: placeholder 15 | spec: 16 | serviceAccountName: operator-controller-manager 17 | containers: 18 | - name: cmd 19 | image: ghcr.io/vllm-d/placeholder:placeholder 20 | imagePullPolicy: Always 21 | -------------------------------------------------------------------------------- /deploy/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | # Set the namespace for all resources using a placeholder. 5 | namespace: ${NAMESPACE} 6 | 7 | # Use a prefix for all object names. You can substitute the PROJECT_NAME variable. 8 | namePrefix: ${PROJECT_NAME}- 9 | 10 | # List all the resources (manifests) you want to deploy. 11 | resources: 12 | - common/statefulset.yaml 13 | - common/service.yaml 14 | - openshift/route.yaml 15 | - rbac/exec-rbac-role.yaml 16 | - rbac/exec-rbac-rolebinding.yaml 17 | 18 | # Generate the ConfigMap with a variable name. 19 | configMapGenerator: 20 | - name: config 21 | options: 22 | disableNameSuffixHash: true 23 | 24 | # Include patches to update the Service, StatefulSet, Route, and RBAC resources. 25 | 26 | # Define the image to be updated. 27 | # images: 28 | # - name: ghcr.io/vllm-d/placeholder 29 | # newName: ghcr.io/vllm-d/${IMAGE_TAG_BASE} 30 | # newTag: ${VERSION} 31 | patches: 32 | - path: common/patch-service.yaml 33 | - path: common/patch-statefulset.yaml 34 | - path: openshift/patch-route.yaml 35 | - path: rbac/patch-rbac-role.yaml 36 | - path: rbac/patch-rbac-rolebinding.yaml 37 | -------------------------------------------------------------------------------- /deploy/openshift/patch-route.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: route.openshift.io/v1 2 | kind: Route 3 | metadata: 4 | name: route 5 | spec: 6 | to: 7 | name: "${PROJECT_NAME}-service" 8 | -------------------------------------------------------------------------------- /deploy/openshift/route.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: route.openshift.io/v1 2 | kind: Route 3 | metadata: 4 | name: route 5 | spec: 6 | to: 7 | kind: Service 8 | name: placeholder 9 | port: 10 | targetPort: 8080 11 | tls: 12 | termination: edge 13 | -------------------------------------------------------------------------------- /deploy/rbac/exec-rbac-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: exec-role 5 | rules: 6 | - apiGroups: [""] 7 | resources: ["pods/exec"] 8 | resourceNames: ["placeholder-0-0"] 9 | verbs: ["create"] 10 | -------------------------------------------------------------------------------- /deploy/rbac/exec-rbac-rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: exec-rolebinding 5 | subjects: 6 | - kind: Group 7 | name: system:authenticated 8 | apiGroup: rbac.authorization.k8s.io 9 | roleRef: 10 | kind: Role 11 | name: exec-role 12 | apiGroup: rbac.authorization.k8s.io 13 | 14 | -------------------------------------------------------------------------------- 
/deploy/rbac/patch-rbac-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: exec-role 5 | rules: 6 | - apiGroups: [""] 7 | resources: ["pods/exec"] 8 | resourceNames: 9 | - "${PROJECT_NAME}-0-0" 10 | verbs: ["create"] 11 | -------------------------------------------------------------------------------- /deploy/rbac/patch-rbac-rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: exec-rolebinding 5 | subjects: 6 | - kind: Group 7 | name: system:authenticated 8 | apiGroup: rbac.authorization.k8s.io 9 | roleRef: 10 | kind: Role 11 | name: ${PROJECT_NAME}-exec-role 12 | apiGroup: rbac.authorization.k8s.io 13 | -------------------------------------------------------------------------------- /docs/api_reference/config.yaml: -------------------------------------------------------------------------------- 1 | title: "ModelService API Reference" 2 | version: "1.0" 3 | sources: 4 | - https://github.com/llm-d/llm-d-model-service.git -------------------------------------------------------------------------------- /docs/apireference.md: -------------------------------------------------------------------------------- 1 | # API Reference 2 | 3 | Refer to the [`api_reference`](./api_reference/out.html) folder. 4 | 5 | # Command to build API Reference 6 | 7 | - Generate asciidoc using [crd-ref-docs](https://github.com/elastic/crd-ref-docs) 8 | 9 | ```sh 10 | crd-ref-docs --source-path=./api/v1alpha1 --config=./docs/api_reference/config.yaml --output-path=./docs/api_reference 11 | ``` 12 | 13 | - Convert asciidoc to HTML by installing [asciidoctor](https://asciidoctor.org/) 14 | 15 | ```sh 16 | asciidoctor ./docs/api_reference/out.asciidoc 17 | ``` -------------------------------------------------------------------------------- /docs/developer.md: -------------------------------------------------------------------------------- 1 | # Developer Docs 2 | 3 | Clone the [ModelService GitHub repository](https://github.com/llm-d/llm-d-model-service) (or a fork of it) to take advantage of the `make` commands described below. All commands are from the project root directory. 4 | 5 | Execution of the ModelService controller requires access to a cluster. 6 | A local cluster, such as a `kind` cluster, suffices for basic execution and development testing. 7 | However, testing end-to-end with a large language model may not be possible if the cluster does not have sufficient resources or if the [inference gateway](https://gateway-api.sigs.k8s.io/) and [inference gateway extension](https://github.com/llm-d/gateway-api-inference-extension) are not fully configured. 8 | 9 | If a cluster is not available, you can do a dry-run to identify the Kubernetes resources that will be created for a given `ModelService CR`. See [ModelService Dry Run](#modelservice-dry-run) below. 
10 | 11 | ## Prerequisites 12 | 13 | ### Install Kubernetes Gateway API CRDs 14 | 15 | ```shell 16 | kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.3.0/standard-install.yaml 17 | ``` 18 | 19 | ### Install Kubernetes Gateway API Inference Extension CRDs 20 | 21 | ```shell 22 | VERSION=v0.3.0 23 | kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$VERSION/manifests.yaml 24 | ``` 25 | 26 | ### Define Cluster Role for Endpoint Picker (EPP) 27 | 28 | For the endpoint picker used in the [samples](https://github.com/llm-d/llm-d-model-service/tree/dev/samples), the `pod-read` cluster role defined [here](https://github.com/llm-d/gateway-api-inference-extension/blob/dev/config/manifests/inferencepool-resources.yaml#L84-L112) works. 29 | 30 | ### Install ModelService CRDs 31 | 32 | ```shell 33 | make install 34 | ``` 35 | 36 | If successful, you should see something like: 37 | 38 | ```shell 39 | % kubectl get crd | grep modelservice 40 | modelservices.llm-d.ai 2025-05-08T13:37:32Z 41 | ``` 42 | 43 | ## Local Execution 44 | 45 | You can run the ModelService controller locally, operating against the cluster defined by your current Kubernetes configuration. 46 | 47 | ```shell 48 | make run EPP_CLUSTERROLE=pod-read 49 | ``` 50 | 51 | You can now create `ModelService` objects. See [samples](https://github.com/llm-d/llm-d-model-service/tree/dev/samples) for details. 52 | 53 | To avoid long image and model downloads, you can create dummy model services such as those in [`samples/test`](https://github.com/llm-d/llm-d-model-service/tree/dev/samples/test). 54 | 55 | ## Running in a Cluster 56 | 57 | Deploy the controller to the cluster: 58 | 59 | 1. Create the target namespace `modelservice-system` 60 | 61 | By default, the ModelService controller is deployed to the `modelservice-system` namespace. To change the target namespace, create a kustomize overlay (see [`config/dev`](https://github.com/llm-d/llm-d-model-service/tree/dev/config/dev)). 62 | 63 | 2. Deploy the controller: 64 | 65 | ```shell 66 | make dev-deploy EPP_CLUSTERROLE=pod-read 67 | ``` 68 | 69 | You should see a `modelservice-controller-manager` pod start in the `modelservice-system` namespace. 70 | 71 | If an image pull secret is required, you can specify it with the environment variable `IMAGE_PULL_SECRET`. 72 | 73 | You can now create `ModelService` objects. See [samples](https://github.com/llm-d/llm-d-model-service/tree/dev/samples) for details. 74 | 75 | ## Uninstall 76 | 77 | The controller and `ModelService` CRDs can be removed: 78 | 79 | ```shell 80 | make uninstall && make undeploy 81 | ``` 82 | 83 | Supporting resources like the endpoint picker cluster role, the inference gateway, and the Kubernetes Gateway API Inference Extension CRDs can also be uninstalled. 84 | 85 | ## ModelService Dry-Run 86 | View the components that ModelService will create given a `ModelService` CR and a base config `ConfigMap`. This command does not require cluster access. 87 | 88 | In the `llm-d-model-service` project root directory: 89 | 90 | ```shell 91 | go run main.go generate \ 92 | --epp-cluster-role=<epp-cluster-role-name> \ 93 | --modelservice <path-to-modelservice-cr> \ 94 | --baseconfig <path-to-baseconfig> 95 | ``` 96 | 97 | Note that because no cluster access is required, it is not necessary to create an endpoint picker cluster role resource. 
98 | 99 | For example: 100 | 101 | ```shell 102 | go run main.go generate \ 103 | --epp-cluster-role=pod-read \ 104 | --modelservice samples/msvcs/granite3.2.yaml \ 105 | --baseconfig samples/baseconfigs/simple-baseconfig.yaml 106 | ``` 107 | 108 | will output the YAML manifest for the resources that ModelService will create in the cluster. Some fields that require cluster access to define, such as `metadata.namespace`, will not be included. 109 | 110 | This feature is purely for development purposes and is intended to provide a quick way of debugging without a cluster. -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | `ModelService` is one of the components installed during [llm-d installation](https://github.com/llm-d/llm-d-deployer/blob/main/quickstart/README.md). 4 | 5 | For local development and testing, refer to the [developer docs](developer.md). -------------------------------------------------------------------------------- /docs/userguide.md: -------------------------------------------------------------------------------- 1 | # User Guide 2 | 3 | This guide presents the core concepts and configuration patterns for serving base models using the `ModelService` Custom Resource Definition (CRD). It is intended for both platform operators and model owners. 4 | 5 | ## [Core Concepts](userguide/core-concepts.md) 6 | 7 | Understand how `ModelService` fits into the Kubernetes ecosystem, what resources it manages, and how its declarative workflow simplifies inference infrastructure. 8 | 9 | --- 10 | 11 | ## Topics 12 | 13 | 1. **[Model Name](userguide/model-name.md)** 14 | How inference clients refer to your model using OpenAI-compatible APIs. 15 | 16 | 2. **[Model Artifacts](userguide/model-artifacts.md)** 17 | Load models from Hugging Face, PVCs, or OCI images and mount them into serving containers. 18 | 19 | 20 | 3. **Templating Reference** 21 | Use Go templates in `ModelService` and `BaseConfig` to dynamically generate configurations for child resources. 22 | 23 | 24 | 4. **Decouple Scaling** 25 | Let HPA or custom controllers manage replica counts for prefill and decode deployments. 26 | 27 | 28 | 5. **Accelerator Types** 29 | Target specific GPU types using node labels to ensure models run on the right hardware. 30 | 31 | 32 | 6. **Semantic Merge** 33 | Learn how values in `ModelService` override or augment those defined in `BaseConfig`. 34 | 35 | 36 | 7. **Child Resources** 37 | Explore all Kubernetes resources owned and managed by a `ModelService`. 38 | 39 | --- 40 | 41 | For more details, see: 42 | 43 | 📄 [Install Guide](install.md) — how to install the ModelService controller 44 | 45 | 📘 [API Reference](apireference.md) — full CRD schema and field definitions 46 | -------------------------------------------------------------------------------- /docs/userguide/core-concepts.md: -------------------------------------------------------------------------------- 1 | # Core Concepts 2 | 3 | The ModelService custom resource provides a unified declarative API for serving a base model in Kubernetes. It supports inference workloads with prefill/decode disaggregation, reusable configuration presets, and seamless integration with the Gateway API Inference Extension (GIE). 4 | 5 | When a ModelService resource is reconciled, it creates and maintains the following resources. 
6 | 7 | ## Workload resources 8 | 9 | * Prefill deployment and service 10 | * Decode deployment and service 11 | * Configmaps for prefill/decode 12 | 13 | ## Routing 14 | 15 | * HTTPRoute 16 | * Inference pool 17 | * Inference model 18 | * Endpoint picker (EPP) deployment and service 19 | * Configmaps for EPP 20 | 21 | ## Access control 22 | 23 | * Service account for prefill/decode and EPP 24 | * RoleBinding for EPP 25 | 26 | These resources are optional and fully configurable. Their creation, omission, and configuration are controlled through BaseConfig and ModelService specifications. When the resources are created, the parent ModelService that triggered their creation is set as their owner; this facilitates correctness of the reconciliation logic, garbage collection, and status tracking. 27 | 28 | The following sample illustrates the core concepts in the ModelService spec. Further details are covered under individual topics below. 29 | 30 | ```yaml 31 | apiVersion: llm-d.ai/v1alpha1 32 | kind: ModelService 33 | metadata: 34 | name: facebook-opt-125m 35 | # `ModelService` is a namespace scoped resource 36 | namespace: my-ns 37 | spec: 38 | # `baseConfigMapRef.name` is the name of the Kubernetes configmap that provides default configurations for the resources spawned by this `ModelService`. 39 | 40 | # configuration derived from this `ModelService` will be semantically merged with the contents of the referenced `BaseConfig` to produce the final resource configuration. This allows model owners to override platform defaults only when necessary. 41 | 42 | # the contents of this `BaseConfig` configmap can be templated 43 | baseConfigMapRef: 44 | name: generic-base-config 45 | 46 | # `routing.modelName` is the name of the model used by OpenAI-compatible inference clients in their queries. 47 | routing: 48 | modelName: facebook/opt-125m 49 | 50 | # `modelArtifacts.uri` describes the source of the model. In this example, it is sourced from Hugging Face (as indicated by the hf:// prefix in the URI), the owner of the Hugging Face repo is `facebook`, and the model ID within is `opt-125m`. 51 | modelArtifacts: 52 | # if `uri` is prefixed with hf://, it will create an emptyDir volume in prefill/decode pods that can be mounted by the model serving container in the pod. 53 | uri: hf://facebook/opt-125m 54 | 55 | # `prefill` and `decode` sections enable disaggregated prefill architecture for model serving; these sections are optional; include both sections to enable disaggregation; omit prefill to disable it. 56 | decode: 57 | # number of decode pods 58 | replicas: 1 59 | # a list of containers 60 | containers: 61 | - name: vllm 62 | # Templated arguments. 63 | # .HFModelName expands to "facebook/opt-125m" 64 | # For all variables, see the templating reference. 65 | args: 66 | # hint: add quotes while using templating to avoid subtle yaml parsing issues 67 | - "{{ .HFModelName }}" 68 | # if `mountModelVolume` is set to true, the volume meant for model storage will be mounted by this container; in this example, this will be an emptyDir volume into which this container will download the model from Hugging Face. 69 | mountModelVolume: true 70 | ``` 71 | 72 | This minimal example demonstrates inference serving using a Hugging Face model. For more on routing, model sources, templating, merging, and advanced features, refer to the respective [topics](../userguide.md#topics). 
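After applying a spec like the one above, you can verify that reconciliation produced the expected child resources. A minimal sketch, assuming the sample is saved as `facebook-opt-125m.yaml` (a hypothetical file name) and the `generic-base-config` ConfigMap already exists in `my-ns`:

```shell
# Create the ModelService and confirm it was admitted
kubectl apply -f facebook-opt-125m.yaml
kubectl get modelservice facebook-opt-125m -n my-ns

# List the child resources owned by this ModelService
# (decode deployment/service, EPP resources, HTTPRoute, etc.)
kubectl get deployments,services,configmaps,httproutes -n my-ns
```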
73 | 
74 | 
--------------------------------------------------------------------------------
/docs/userguide/model-artifacts.md:
--------------------------------------------------------------------------------
1 | # Model Artifacts
2 | 
3 | The `modelArtifacts` section under the `spec` of a `ModelService` defines how model files, such as weights and metadata configurations, are retrieved and loaded into inference backends like vLLM. This abstraction simplifies the process by allowing users to specify the model source without needing to configure low-level details like environment variables, volumes, or volume mounts.
4 | 
5 | ## Purpose
6 | 
7 | Without `ModelService`, users must manually configure vLLM arguments, environment variables, and pod/container specifications. This requires a deep understanding of both vLLM and the composition of model artifacts. The `ModelService` controller automates these configurations, enabling users to focus solely on specifying the model source.
8 | 
9 | ## Model Artifact Sources and Behaviors
10 | 
11 | The `modelArtifacts.uri` field determines the source of the model artifacts. Each supported prefix results in specific behaviors in the prefill and decode deployments. The following sources are supported:
12 | 
13 | ### 1. Downloading a Model Directly from Hugging Face
14 | 
15 | If the `uri` begins with the `hf://` prefix, the model is downloaded directly from Hugging Face into an `emptyDir` volume.
16 | 
17 | #### URI Format
18 | 
19 | The repo and model IDs must exactly match the IDs found on the Hugging Face model registry, as required by vLLM.
20 | 
21 | `hf://<repo-id>/<model-id>`
22 | 
23 | Example: `hf://facebook/opt-125m`
24 | 
25 | #### Additional Fields
26 | 
27 | - **`authSecretName`**: Specifies the Kubernetes Secret containing the `HF_TOKEN` for gated models.
28 | - **`size`**: Defines the size of the `emptyDir` volume.
29 | 
30 | #### Behavior
31 | 
32 | - An `emptyDir` volume named `model-storage` is created.
33 | - Containers with `mountModelVolume: true` will have a `volumeMount` at `/model-cache`.
34 | - The `HF_HOME` environment variable is set to `/model-cache`.
35 | - If `authSecretName` is provided, the `HF_TOKEN` environment variable is created.
36 | 
37 | #### Example Deployment Snippet
38 | 
39 | ```yaml
40 | volumes:
41 | - name: model-storage
42 |   emptyDir: {}
43 | containers:
44 | - name: vllm
45 |   env:
46 |   - name: HF_HOME
47 |     value: /model-cache
48 |   - name: HF_TOKEN
49 |     valueFrom:
50 |       secretKeyRef:
51 |         name: hf-secret
52 |         key: HF_TOKEN
53 |   volumeMounts:
54 |   - mountPath: /model-cache
55 |     name: model-storage
56 | ```
57 | 
58 | #### Template variables
59 | 
60 | Various template variables are exposed as a result of using the `"hf://"` prefix, namely:
61 | 
62 | - `{{ .HFModelName }}`: this is `<repo-id>/<model-id>` in the URI, which might be useful for vLLM arguments. Note that this is different from `{{ .ModelName }}`, which is the `spec.routing.modelName` used for client requests
63 | - `{{ .MountedModelPath }}`: this is equal to `/model-cache`
64 | 
65 | ### 2. Loading a model directly from a PVC
66 | 
67 | Downloading large models from Hugging Face can take a significant amount of time. If a PVC containing the model files is already pre-populated, mounting that path and supplying it to vLLM can drastically shorten the engine's warm-up time.
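
How the PVC gets populated is outside the scope of `ModelService`. As one hypothetical approach, a one-off Kubernetes Job can download the model into the claim ahead of time; the sketch below is only illustrative, and the image, command, and names in it are assumptions rather than anything the controller creates for you.

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: populate-granite-pvc  # hypothetical name
spec:
  template:
    spec:
      restartPolicy: Never
      containers:
      - name: downloader
        # any image with the Hugging Face CLI works; this one is an assumption
        image: python:3.12-slim
        command: ["/bin/sh", "-c"]
        args:
        - |
          pip install -q "huggingface_hub[cli]" && \
          huggingface-cli download ibm-granite/granite-3.3-2b-instruct \
            --local-dir /models/path/to/granite
        volumeMounts:
        - name: model-storage
          mountPath: /models
      volumes:
      - name: model-storage
        persistentVolumeClaim:
          claimName: granite-pvc
```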
68 | 
69 | #### URI format
70 | 
71 | `"pvc://<pvc-name>/<path-to-model>"`
72 | 
73 | Example: `"pvc://granite-pvc/path/to/granite"`
74 | 
75 | #### Behavior
76 | 
77 | - A read-only PVC volume with the name `model-storage` is created for the deployment
78 | - A read-only `volumeMount` with `mountPath: /model-cache` is created for each container where `mountModelVolume: true`
79 | 
80 | 
81 | #### Example Deployment Snippet
82 | 
83 | ```yaml
84 | volumes:
85 | - name: model-storage
86 |   persistentVolumeClaim:
87 |     claimName: granite-pvc
88 |     readOnly: true
89 | containers:
90 | - name: vllm
91 |   volumeMounts:
92 |   - mountPath: /model-cache
93 |     name: model-storage
94 | ```
95 | 
96 | #### Template variables
97 | 
98 | Various template variables are exposed as a result of using the `"pvc://"` prefix, with `.MountedModelPath` being particularly useful if vLLM arguments require it.
99 | 
100 | - `{{ .MountedModelPath }}`: this is equal to `/model-cache/<path-to-model>`, where `<path-to-model>` comes from the URI. In the above example, `{{ .MountedModelPath }}` interpolates to `/model-cache/path/to/granite`
101 | 
102 | ### 3. Loading the model from an image volume
103 | 
104 | Not yet implemented.
--------------------------------------------------------------------------------
/docs/userguide/model-name.md:
--------------------------------------------------------------------------------
1 | # Model Name
2 | 
3 | The `modelName` field under the `routing` section of a `ModelService` specifies how clients refer to a model during inference. This name is used by OpenAI-compatible APIs and must be **globally unique** across all `ModelService` resources in the cluster, unless they target different gateways.
4 | 
5 | ## Purpose
6 | 
7 | This field acts as the public-facing identifier for the model. When an inference client sends a request, it includes this name in the "model" field of the API request body.
8 | 
9 | ### Client request
10 | 
11 | ```json
12 | {
13 |   "model": "facebook/opt-125m",
14 |   "prompt": "What is the capital of France?"
15 | }
16 | ```
17 | 
18 | ### ModelService configuration
19 | 
20 | ```yaml
21 | spec:
22 |   routing:
23 |     modelName: facebook/opt-125m
24 | ```
25 | 
26 | The gateway ensures that each model name maps to one and only one live base model across the cluster.
27 | 
28 | ## Conflict Resolution
29 | 
30 | If multiple `ModelService` resources attempt to register the same `modelName`, the controller selects a single owner by applying the following rules, falling back to an arbitrary choice if the rules are inconclusive.
31 | 
32 | * The oldest resource (based on creation timestamp) is retained as the valid owner.
33 | 
34 | * The newer conflicting resource will:
35 | 
36 |   * Have its inference model marked as not ready.
37 | 
38 |   * Emit an appropriate status error indicating the conflict.
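
For illustration, the two hypothetical resources below collide on `modelName`; assuming `team-a-opt` was created first, it is retained as the owner and `team-b-opt` surfaces the conflict in its status. Other required fields are omitted for brevity.

```yaml
# created first; retained as the valid owner
apiVersion: llm-d.ai/v1alpha1
kind: ModelService
metadata:
  name: team-a-opt
  namespace: team-a
spec:
  routing:
    modelName: facebook/opt-125m
---
# created later; its inference model is marked not ready and a
# conflict error is reported in its status
apiVersion: llm-d.ai/v1alpha1
kind: ModelService
metadata:
  name: team-b-opt
  namespace: team-b
spec:
  routing:
    modelName: facebook/opt-125m
```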
39 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/llm-d/llm-d-model-service 2 | 3 | go 1.24.0 4 | 5 | toolchain go1.24.2 6 | 7 | godebug default=go1.23 8 | 9 | require ( 10 | github.com/onsi/ginkgo/v2 v2.23.3 11 | github.com/onsi/gomega v1.37.0 12 | k8s.io/api v0.33.0 13 | k8s.io/apimachinery v0.33.0 14 | k8s.io/client-go v0.33.0 15 | sigs.k8s.io/controller-runtime v0.20.4 16 | ) 17 | 18 | require ( 19 | dario.cat/mergo v1.0.1 20 | github.com/Masterminds/sprig/v3 v3.3.0 21 | github.com/stretchr/testify v1.10.0 22 | sigs.k8s.io/gateway-api v1.3.0 23 | sigs.k8s.io/yaml v1.4.0 24 | ) 25 | 26 | require ( 27 | github.com/Masterminds/goutils v1.1.1 // indirect 28 | github.com/Masterminds/semver/v3 v3.3.0 // indirect 29 | github.com/huandu/xstrings v1.5.0 // indirect 30 | github.com/mitchellh/copystructure v1.2.0 // indirect 31 | github.com/mitchellh/reflectwalk v1.0.2 // indirect 32 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect 33 | github.com/shopspring/decimal v1.4.0 // indirect 34 | github.com/spf13/cast v1.7.0 // indirect 35 | golang.org/x/crypto v0.38.0 // indirect 36 | sigs.k8s.io/randfill v1.0.0 // indirect 37 | ) 38 | 39 | require ( 40 | cel.dev/expr v0.19.1 // indirect 41 | github.com/antlr4-go/antlr/v4 v4.13.0 // indirect 42 | github.com/beorn7/perks v1.0.1 // indirect 43 | github.com/blang/semver/v4 v4.0.0 // indirect 44 | github.com/cenkalti/backoff/v4 v4.3.0 // indirect 45 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 46 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 47 | github.com/emicklei/go-restful/v3 v3.12.0 // indirect 48 | github.com/evanphx/json-patch/v5 v5.9.11 // indirect 49 | github.com/felixge/httpsnoop v1.0.4 // indirect 50 | github.com/fsnotify/fsnotify v1.7.0 // indirect 51 | github.com/fxamacker/cbor/v2 v2.8.0 // indirect 52 | github.com/go-logr/logr v1.4.2 53 | github.com/go-logr/stdr v1.2.2 // indirect 54 | github.com/go-logr/zapr v1.3.0 // indirect 55 | github.com/go-openapi/jsonpointer v0.21.0 // indirect 56 | github.com/go-openapi/jsonreference v0.21.0 // indirect 57 | github.com/go-openapi/swag v0.23.0 // indirect 58 | github.com/go-task/slim-sprig/v3 v3.0.0 // indirect 59 | github.com/gogo/protobuf v1.3.2 // indirect 60 | github.com/google/btree v1.1.3 // indirect 61 | github.com/google/cel-go v0.23.2 // indirect 62 | github.com/google/gnostic-models v0.6.9 // indirect 63 | github.com/google/go-cmp v0.7.0 // indirect 64 | github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect 65 | github.com/google/uuid v1.6.0 // indirect 66 | github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 // indirect 67 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 68 | github.com/josharian/intern v1.0.0 // indirect 69 | github.com/json-iterator/go v1.1.12 // indirect 70 | github.com/mailru/easyjson v0.7.7 // indirect 71 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 72 | github.com/modern-go/reflect2 v1.0.2 // indirect 73 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 74 | github.com/pkg/errors v0.9.1 // indirect 75 | github.com/prometheus/client_golang v1.22.0 // indirect 76 | github.com/prometheus/client_model v0.6.1 // indirect 77 | github.com/prometheus/common v0.63.0 // indirect 78 | github.com/prometheus/procfs v0.15.1 // indirect 79 | github.com/spf13/cobra v1.9.1 80 | 
github.com/spf13/pflag v1.0.6 // indirect 81 | github.com/stoewer/go-strcase v1.3.0 // indirect 82 | github.com/x448/float16 v0.8.4 // indirect 83 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect 84 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect 85 | go.opentelemetry.io/otel v1.34.0 // indirect 86 | go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect 87 | go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 // indirect 88 | go.opentelemetry.io/otel/metric v1.34.0 // indirect 89 | go.opentelemetry.io/otel/sdk v1.34.0 // indirect 90 | go.opentelemetry.io/otel/trace v1.34.0 // indirect 91 | go.opentelemetry.io/proto/otlp v1.4.0 // indirect 92 | go.uber.org/multierr v1.11.0 // indirect 93 | go.uber.org/zap v1.27.0 94 | golang.org/x/exp v0.0.0-20250408133849-7e4ce0ab07d0 // indirect 95 | golang.org/x/net v0.40.0 // indirect 96 | golang.org/x/oauth2 v0.27.0 // indirect 97 | golang.org/x/sync v0.14.0 // indirect 98 | golang.org/x/sys v0.33.0 // indirect 99 | golang.org/x/term v0.32.0 // indirect 100 | golang.org/x/text v0.25.0 // indirect 101 | golang.org/x/time v0.9.0 // indirect 102 | golang.org/x/tools v0.33.0 // indirect 103 | gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect 104 | google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 // indirect 105 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f // indirect 106 | google.golang.org/grpc v1.71.1 // indirect 107 | google.golang.org/protobuf v1.36.6 // indirect 108 | gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect 109 | gopkg.in/inf.v0 v0.9.1 // indirect 110 | gopkg.in/yaml.v3 v3.0.1 // indirect 111 | k8s.io/apiextensions-apiserver v0.33.0 // indirect 112 | k8s.io/apiserver v0.33.0 // indirect 113 | k8s.io/component-base v0.33.0 // indirect 114 | k8s.io/klog/v2 v2.130.1 // indirect 115 | k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect 116 | k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 117 | sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect 118 | sigs.k8s.io/gateway-api-inference-extension v0.3.0 119 | sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect 120 | sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect 121 | ) 122 | -------------------------------------------------------------------------------- /hack/boilerplate.go.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llm-d/llm-d-model-service/55eb18c4f08116ed8c9211f643b356ec5e47b4b7/hack/boilerplate.go.txt -------------------------------------------------------------------------------- /hooks/pre-commit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # echo "▶️ Running lint…" 5 | # make lint 6 | 7 | # echo "▶️ Running tests…" 8 | # make test 9 | 10 | echo "✔️ All checks passed!" 11 | -------------------------------------------------------------------------------- /internal/controller/accelerator_types.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/llm-d/llm-d-model-service/api/v1alpha1" 7 | corev1 "k8s.io/api/core/v1" 8 | ) 9 | 10 | // ToNodeAffinity generates a NodeAffinity rule that requires nodes to match 11 | // the specified accelerator label key and one of the allowed values. 
12 | // 13 | // Returns an error if LabelKey is empty or LabelValues is empty. 14 | func AcceleratorTypesToNodeAffinity(a *v1alpha1.AcceleratorTypes) (*corev1.NodeAffinity, error) { 15 | if a == nil { 16 | return nil, nil 17 | } 18 | if a.LabelKey == "" { 19 | return nil, fmt.Errorf("LabelKey must not be empty") 20 | } 21 | if len(a.LabelValues) == 0 { 22 | return nil, fmt.Errorf("LabelValues must contain at least one value") 23 | } 24 | 25 | // Construct the node affinity rule 26 | nodeAffinity := &corev1.NodeAffinity{ 27 | RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ 28 | NodeSelectorTerms: []corev1.NodeSelectorTerm{ 29 | { 30 | MatchExpressions: []corev1.NodeSelectorRequirement{ 31 | { 32 | Key: a.LabelKey, 33 | Operator: corev1.NodeSelectorOpIn, 34 | Values: a.LabelValues, 35 | }, 36 | }, 37 | }, 38 | }, 39 | }, 40 | } 41 | 42 | return nodeAffinity, nil 43 | } 44 | -------------------------------------------------------------------------------- /internal/controller/accelerator_types_test.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/llm-d/llm-d-model-service/api/v1alpha1" 7 | "github.com/stretchr/testify/assert" 8 | 9 | corev1 "k8s.io/api/core/v1" 10 | ) 11 | 12 | func TestToNodeAffinity(t *testing.T) { 13 | tests := []struct { 14 | name string 15 | accelerator v1alpha1.AcceleratorTypes 16 | expectError bool 17 | }{ 18 | // valid label key and values 19 | { 20 | name: "valid accelerator", 21 | accelerator: v1alpha1.AcceleratorTypes{ 22 | LabelKey: "nvidia.com/gpu.product", 23 | LabelValues: []string{"A100", "H100"}, 24 | }, 25 | expectError: false, 26 | }, 27 | // missing LabelKey 28 | { 29 | name: "missing label key", 30 | accelerator: v1alpha1.AcceleratorTypes{ 31 | LabelKey: "", 32 | LabelValues: []string{"A100"}, 33 | }, 34 | expectError: true, 35 | }, 36 | // empty LabelValues slice 37 | { 38 | name: "empty label values", 39 | accelerator: v1alpha1.AcceleratorTypes{ 40 | LabelKey: "nvidia.com/gpu.product", 41 | LabelValues: []string{}, 42 | }, 43 | expectError: true, 44 | }, 45 | } 46 | 47 | for _, tt := range tests { 48 | t.Run(tt.name, func(t *testing.T) { 49 | nodeAffinity, err := AcceleratorTypesToNodeAffinity(&tt.accelerator) 50 | 51 | if tt.expectError { 52 | assert.Error(t, err, "expected error but got none") 53 | assert.Nil(t, nodeAffinity) 54 | } else { 55 | assert.NoError(t, err) 56 | assert.NotNil(t, nodeAffinity) 57 | assert.NotNil(t, nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution) 58 | 59 | // Validate NodeSelectorTerm with correct MatchExpression 60 | terms := nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms 61 | assert.Len(t, terms, 1) 62 | assert.Len(t, terms[0].MatchExpressions, 1) 63 | 64 | expr := terms[0].MatchExpressions[0] 65 | assert.Equal(t, corev1.NodeSelectorOpIn, expr.Operator) 66 | assert.Equal(t, tt.accelerator.LabelKey, expr.Key) 67 | assert.ElementsMatch(t, tt.accelerator.LabelValues, expr.Values) 68 | } 69 | }) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /internal/controller/child_resources_test.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "k8s.io/apimachinery/pkg/api/errors" 8 | 9 | msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1" 10 | . "github.com/onsi/ginkgo/v2" 11 | . 
"github.com/onsi/gomega" 12 | appsv1 "k8s.io/api/apps/v1" 13 | corev1 "k8s.io/api/core/v1" 14 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 15 | "sigs.k8s.io/yaml" 16 | ) 17 | 18 | // tests to check if base config reading works ok 19 | var _ = Describe("BaseConfig reader", func() { 20 | var ( 21 | ctx context.Context 22 | reconciler *ModelServiceReconciler 23 | msvc *msv1alpha1.ModelService 24 | cm *corev1.ConfigMap 25 | replicas = int32(1) 26 | ) 27 | 28 | BeforeEach(func() { 29 | ctx = context.Background() 30 | 31 | // Create test deployment YAML 32 | deployment := appsv1.Deployment{ 33 | Spec: appsv1.DeploymentSpec{ 34 | Replicas: &replicas, 35 | }, 36 | } 37 | deployYaml, err := yaml.Marshal(deployment) 38 | Expect(err).To(BeNil()) 39 | 40 | // Create ConfigMap with a deployment inside 41 | cm = &corev1.ConfigMap{ 42 | ObjectMeta: metav1.ObjectMeta{ 43 | Name: "test-base-config", 44 | Namespace: "default", 45 | }, 46 | Data: map[string]string{ 47 | "eppDeployment": string(deployYaml), 48 | }, 49 | } 50 | 51 | // Create ModelService referencing the ConfigMap 52 | msvc = &msv1alpha1.ModelService{ 53 | ObjectMeta: metav1.ObjectMeta{ 54 | Name: "test-modelservice", 55 | Namespace: "default", 56 | }, 57 | Spec: msv1alpha1.ModelServiceSpec{ 58 | BaseConfigMapRef: &corev1.ObjectReference{ 59 | Name: "test-base-config", 60 | }, 61 | ModelArtifacts: msv1alpha1.ModelArtifacts{ 62 | URI: "hf://facebook/opt-125m", 63 | }, 64 | }, 65 | } 66 | 67 | By("Creating the base config cm") 68 | Expect(k8sClient.Create(ctx, cm)).To(Succeed()) 69 | 70 | By("Creating the msvc") 71 | Expect(k8sClient.Create(ctx, msvc)).To(Succeed()) 72 | 73 | reconciler = &ModelServiceReconciler{ 74 | Client: k8sClient, 75 | Scheme: k8sClient.Scheme(), 76 | } 77 | }) 78 | 79 | It("should correctly deserialize the eppDeployment from ConfigMap", func() { 80 | bc, err := reconciler.getChildResourcesFromConfigMap(ctx, msvc) 81 | Expect(err).To(BeNil()) 82 | Expect(bc).ToNot(BeNil()) 83 | Expect(bc.EPPDeployment).ToNot(BeNil()) 84 | Expect(bc.EPPDeployment.Spec.Replicas).ToNot(BeNil()) 85 | Expect(*bc.EPPDeployment.Spec.Replicas).To(Equal(int32(1))) 86 | }) 87 | 88 | It("should continue to correctly deserialize the eppDeployment from ConfigMap with pvc prefix", func() { 89 | msvc.Spec.ModelArtifacts.URI = "pvc://my-pvc/path/to/opt-125m" 90 | bc, err := reconciler.getChildResourcesFromConfigMap(ctx, msvc) 91 | Expect(err).To(BeNil()) 92 | Expect(bc).ToNot(BeNil()) 93 | Expect(bc.EPPDeployment).ToNot(BeNil()) 94 | Expect(bc.EPPDeployment.Spec.Replicas).ToNot(BeNil()) 95 | Expect(*bc.EPPDeployment.Spec.Replicas).To(Equal(int32(1))) 96 | }) 97 | 98 | It("should return nil if configmap ref is missing", func() { 99 | msvc.Spec.BaseConfigMapRef = nil 100 | bc, err := reconciler.getChildResourcesFromConfigMap(ctx, msvc) 101 | Expect(err).To(BeNil()) 102 | Expect(bc.PrefillDeployment).To(BeNil()) 103 | Expect(bc.DecodeDeployment).To(BeNil()) 104 | Expect(bc.PrefillService).To(BeNil()) 105 | Expect(bc.DecodeService).To(BeNil()) 106 | Expect(bc.InferencePool).To(BeNil()) 107 | Expect(bc.InferenceModel).To(BeNil()) 108 | Expect(bc.EPPDeployment).To(BeNil()) 109 | Expect(bc.EPPService).To(BeNil()) 110 | }) 111 | 112 | It("should error if the ConfigMap is missing", func() { 113 | msvc.Spec.BaseConfigMapRef.Name = "doesnotexist" 114 | bc, err := reconciler.getChildResourcesFromConfigMap(ctx, msvc) 115 | Expect(err).To(HaveOccurred()) 116 | Expect(bc).To(BeNil()) 117 | }) 118 | 119 | AfterEach(func() { 120 | // Clean up resources after each test 
121 | err := k8sClient.Delete(ctx, msvc) 122 | if err != nil && !errors.IsNotFound(err) { 123 | Fail(fmt.Sprintf("Failed to delete ModelService: %v", err)) 124 | } 125 | 126 | err = k8sClient.Delete(ctx, cm) 127 | if err != nil && !errors.IsNotFound(err) { 128 | Fail(fmt.Sprintf("Failed to delete ConfigMap: %v", err)) 129 | } 130 | }) 131 | }) 132 | 133 | // tests to check if templating works ok 134 | var _ = Describe("BaseConfig reader", func() { 135 | var ( 136 | ctx context.Context 137 | reconciler *ModelServiceReconciler 138 | msvc *msv1alpha1.ModelService 139 | cm *corev1.ConfigMap 140 | ) 141 | 142 | BeforeEach(func() { 143 | ctx = context.Background() 144 | 145 | // Be careful that there are no TAB characters in the string 146 | deployYamlStr := `metadata: 147 | name: mvsc-prefill 148 | spec: 149 | template: 150 | spec: 151 | containers: 152 | - name: vllm 153 | command: 154 | - vllm 155 | - serve 156 | args: 157 | - '{{ .HFModelName }}' 158 | ports: 159 | - containerPort: {{ "portName" | getPort }} 160 | ` 161 | 162 | // Create ConfigMap with a deployment inside 163 | cm = &corev1.ConfigMap{ 164 | ObjectMeta: metav1.ObjectMeta{ 165 | Name: "test-base-config", 166 | Namespace: "default", 167 | }, 168 | Data: map[string]string{ 169 | "prefillDeployment": string(deployYamlStr), 170 | }, 171 | } 172 | 173 | // Create ModelService referencing the ConfigMap 174 | msvc = &msv1alpha1.ModelService{ 175 | ObjectMeta: metav1.ObjectMeta{ 176 | Name: "test-modelservice", 177 | Namespace: "default", 178 | }, 179 | Spec: msv1alpha1.ModelServiceSpec{ 180 | BaseConfigMapRef: &corev1.ObjectReference{ 181 | Name: "test-base-config", 182 | }, 183 | ModelArtifacts: msv1alpha1.ModelArtifacts{ 184 | URI: "hf://facebook/opt-125m", 185 | }, 186 | Routing: msv1alpha1.Routing{ 187 | Ports: []msv1alpha1.Port{ 188 | { 189 | Name: "portName", 190 | Port: 9999, 191 | }, 192 | }, 193 | }, 194 | }, 195 | } 196 | 197 | By("Creating the base config cm") 198 | Expect(k8sClient.Create(ctx, cm)).To(Succeed()) 199 | 200 | By("Creating the msvc") 201 | Expect(k8sClient.Create(ctx, msvc)).To(Succeed()) 202 | 203 | reconciler = &ModelServiceReconciler{ 204 | Client: k8sClient, 205 | Scheme: k8sClient.Scheme(), 206 | } 207 | }) 208 | 209 | It("should correctly interpolate container args", func() { 210 | bc, err := reconciler.getChildResourcesFromConfigMap(ctx, msvc) 211 | Expect(err).To(BeNil()) 212 | Expect(bc).ToNot(BeNil()) 213 | Expect(bc.PrefillDeployment).ToNot(BeNil()) 214 | Expect(bc.PrefillDeployment.Spec.Template.Spec.Containers).ToNot(BeNil()) 215 | c := bc.PrefillDeployment.Spec.Template.Spec.Containers[0] 216 | Expect(c.Args).ToNot(BeNil()) 217 | Expect(c.Args[0]).To(Equal("facebook/opt-125m")) 218 | }) 219 | 220 | It("should correctly interpolate containerPort", func() { 221 | bc, err := reconciler.getChildResourcesFromConfigMap(ctx, msvc) 222 | Expect(err).To(BeNil()) 223 | Expect(bc).ToNot(BeNil()) 224 | Expect(bc.PrefillDeployment).ToNot(BeNil()) 225 | Expect(bc.PrefillDeployment.Spec.Template.Spec.Containers).ToNot(BeNil()) 226 | c := bc.PrefillDeployment.Spec.Template.Spec.Containers[0] 227 | Expect(c.Ports).ToNot(BeEmpty()) 228 | Expect(c.Ports[0].ContainerPort).To(Equal(int32(9999))) 229 | }) 230 | 231 | AfterEach(func() { 232 | // Clean up resources after each test 233 | err := k8sClient.Delete(ctx, msvc) 234 | if err != nil && !errors.IsNotFound(err) { 235 | Fail(fmt.Sprintf("Failed to delete ModelService: %v", err)) 236 | } 237 | 238 | err = k8sClient.Delete(ctx, cm) 239 | if err != nil && 
!errors.IsNotFound(err) { 240 | Fail(fmt.Sprintf("Failed to delete ConfigMap: %v", err)) 241 | } 242 | }) 243 | }) 244 | -------------------------------------------------------------------------------- /internal/controller/constants.go: -------------------------------------------------------------------------------- 1 | /* 2 | Constants for utils 3 | */ 4 | 5 | package controller 6 | 7 | const modelStorageVolumeName = "model-storage" 8 | const modelStorageRoot = "/model-cache" 9 | const pathSep = "/" 10 | const DECODE_ROLE = "decode" 11 | const PREFILL_ROLE = "prefill" 12 | const MODEL_ARTIFACT_URI_PVC = "pvc" 13 | const MODEL_ARTIFACT_URI_HF = "hf" 14 | const MODEL_ARTIFACT_URI_OCI = "oci" 15 | const MODEL_ARTIFACT_URI_PVC_PREFIX = MODEL_ARTIFACT_URI_PVC + "://" 16 | const MODEL_ARTIFACT_URI_HF_PREFIX = MODEL_ARTIFACT_URI_HF + "://" 17 | const MODEL_ARTIFACT_URI_OCI_PREFIX = MODEL_ARTIFACT_URI_OCI + "://" 18 | const ENV_HF_HOME = "HF_HOME" 19 | const ENV_HF_TOKEN = "HF_TOKEN" 20 | 21 | type URIType string 22 | 23 | const ( 24 | PVC URIType = "pvc" 25 | HF URIType = "hf" 26 | OCI URIType = "oci" 27 | UnknownURI URIType = "unknown" 28 | ) 29 | -------------------------------------------------------------------------------- /internal/controller/merge_transformers.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "reflect" 5 | "slices" 6 | 7 | "dario.cat/mergo" 8 | corev1 "k8s.io/api/core/v1" 9 | gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" 10 | ) 11 | 12 | // convertToGenericSlice returns a slice where each item in the slice 13 | // is converted to a T object from each item in reflect.Value 14 | func convertToGenericSlice[T any](val reflect.Value) []T { 15 | if val.Kind() == reflect.Ptr { 16 | val = val.Elem() 17 | } 18 | if val.Kind() != reflect.Slice { 19 | return nil 20 | } 21 | 22 | var result []T 23 | for i := 0; i < val.Len(); i++ { 24 | item := val.Index(i).Interface() 25 | tItem, ok := item.(T) 26 | if !ok { 27 | return nil 28 | } 29 | result = append(result, tItem) 30 | } 31 | return result 32 | } 33 | 34 | // mergeKeyValue returns the value given the name of the field in that struct 35 | // for example, 36 | // myEnvVar := corev1.EnvVar{"Name": "env-var"} 37 | // mergeKeyValue(myEnvVar, "Name") returns "env-var" 38 | func mergeKeyValue[T any](obj T, fieldName string) string { 39 | return reflect.ValueOf(obj).FieldByName(fieldName).String() 40 | } 41 | 42 | // genericSliceTransformer merges two slices of the same type T 43 | // mergeFunc is the function that contains logic for merging two T objects 44 | // mergeKey is the name of the field in T, so that if dst.MergeKey == src.MergeKey, 45 | // the mergeFunc is called on those two objects. 
Otherwise, the src is appended
46 | // for now, only string fields are supported for mergeKey
47 | // (since we cannot guarantee equality for generic reflect.Value)
48 | // mergeFunc takes in
49 | // - dst (pointer): so that in-place merge can happen
50 | // - src: the src object to merge into dst
51 | func genericSliceTransformer[T any](
52 | 	typ reflect.Type,
53 | 	mergeFunc func(dst *T, src T) error,
54 | 	mergeKey string) func(dst, src reflect.Value) error {
55 | 
56 | 	if typ == reflect.TypeOf([]T{}) {
57 | 		return func(dst, src reflect.Value) error {
58 | 
59 | 			// Reject transforming anything other than slices
60 | 			if dst.Kind() != reflect.Slice || src.Kind() != reflect.Slice {
61 | 				return nil
62 | 			}
63 | 
64 | 			srcSlice := convertToGenericSlice[T](src)
65 | 			dstSlice := convertToGenericSlice[T](dst)
66 | 
67 | 			// keep track of the common mergeKeys among src and dst
68 | 			srcMergeKeyMap := map[string]T{}
69 | 			commonMergeKeys := []string{} // TODO: maybe mergeKey can be another generic type?
70 | 
71 | 			for _, srcObj := range srcSlice {
72 | 				mergeKeyValue := mergeKeyValue(srcObj, mergeKey)
73 | 				srcMergeKeyMap[mergeKeyValue] = srcObj
74 | 			}
75 | 
76 | 			for _, dstObj := range dstSlice {
77 | 				mergeKeyValue := mergeKeyValue(dstObj, mergeKey)
78 | 				if _, found := srcMergeKeyMap[mergeKeyValue]; found {
79 | 					commonMergeKeys = append(commonMergeKeys, mergeKeyValue)
80 | 				}
81 | 			}
82 | 
83 | 			// now loop over dstSlice and see if there is a srcObj with same mergeKey value in src
84 | 			for i, dstObj := range dstSlice {
85 | 
86 | 				dstMergeKeyValue := mergeKeyValue(dstObj, mergeKey)
87 | 
88 | 				// Found a matching srcObj with same mergeKey value
89 | 				if srcObj, found := srcMergeKeyMap[dstMergeKeyValue]; found {
90 | 
91 | 					// Call mergeFunc, which contains the logic for merging two T structs in the slice
92 | 					err := mergeFunc(&dstObj, srcObj)
93 | 
94 | 					if err != nil {
95 | 						return err
96 | 					}
97 | 
98 | 					// Update dstObj in dstSlice if merge was successful
99 | 					dstSlice[i] = dstObj
100 | 				}
101 | 			}
102 | 
103 | 			// Construct the mergedSlice combining both src and dst
104 | 			mergedSlice := []T{}
105 | 
106 | 			// mergedSlice contains everything already present in dst to begin with,
107 | 			// with the common T objects already merged from src
108 | 			mergedSlice = append(mergedSlice, dstSlice...)
109 | 110 | // append other src objects that weren't merged and skip the ones that are common 111 | for _, srcObj := range srcSlice { 112 | mergeKeyValue := mergeKeyValue(srcObj, mergeKey) 113 | if !slices.Contains(commonMergeKeys, mergeKeyValue) { 114 | mergedSlice = append(mergedSlice, srcObj) 115 | } 116 | } 117 | 118 | // Now rewrite dst with mergedSlice 119 | dst.Set(reflect.ValueOf(mergedSlice)) 120 | return nil 121 | } 122 | } 123 | return nil 124 | } 125 | 126 | // envVarSliceTransformer: transformer for merging two EnvVars 127 | type envVarSliceTransformer struct{} 128 | 129 | // Transformer for []corev1.Env 130 | func (e envVarSliceTransformer) Transformer(typ reflect.Type) func(dst, src reflect.Value) error { 131 | 132 | // mergeKey for merging two EnvVars is the Name of the EnvVar 133 | mergeKey := "Name" 134 | mergeFunc := func(dst *corev1.EnvVar, src corev1.EnvVar) error { 135 | return mergo.Merge(dst, src, mergo.WithOverride) 136 | } 137 | 138 | return genericSliceTransformer(typ, mergeFunc, mergeKey) 139 | } 140 | 141 | // stringSlicePrependTransformer: transformer for merging two string slices 142 | type stringSlicePrependTransformer struct{} 143 | 144 | // Transformer for []string, such as Container.Args so that src args get prepended, not appended 145 | func (stringSlicePrependTransformer) Transformer(t reflect.Type) func(dst, src reflect.Value) error { 146 | if t.Kind() == reflect.Slice && t.Elem().Kind() == reflect.String { 147 | return func(dst, src reflect.Value) error { 148 | // Ensure dst is settable 149 | if !dst.CanSet() { 150 | return nil 151 | } 152 | if src.IsNil() || src.Len() == 0 { 153 | return nil 154 | } 155 | 156 | // Combine: src first, then dst 157 | merged := reflect.AppendSlice(src, dst) 158 | dst.Set(merged) 159 | return nil 160 | } 161 | } 162 | return nil 163 | } 164 | 165 | // compositeTransformer is a list of transformers to apply in a single mergo.Merge call 166 | type compositeTransformer struct { 167 | transformers []mergo.Transformers 168 | } 169 | 170 | // Transformer takes in a list of Transformers and applies them one by one 171 | func (ct compositeTransformer) Transformer(t reflect.Type) func(dst, src reflect.Value) error { 172 | for _, tr := range ct.transformers { 173 | if fn := tr.Transformer(t); fn != nil { 174 | return fn 175 | } 176 | } 177 | return nil 178 | } 179 | 180 | // containerSliceTransformer: transformer for merging two Containers 181 | type containerSliceTransformer struct{} 182 | 183 | // Transformer merges two []corev1.Container based on their Name, 184 | // and applies transformers for each Container.Spec fields 185 | func (c containerSliceTransformer) Transformer(typ reflect.Type) func(dst, src reflect.Value) error { 186 | 187 | // mergeKey for merging two Containers is the Name of the Container 188 | mergeKey := "Name" 189 | 190 | // dstContainer (comes from baseconfig) 191 | // srcContainer (comes from msvc and controller logic) 192 | mergeFunc := func(dstContainer *corev1.Container, srcContainer corev1.Container) error { 193 | 194 | // Command should be completely overriden, not appended 195 | if len(srcContainer.Command) > 0 { 196 | dstContainer.Command = []string{} 197 | } 198 | 199 | err := mergo.Merge(dstContainer, 200 | srcContainer, 201 | mergo.WithAppendSlice, 202 | mergo.WithOverride, 203 | mergo.WithTransformers(compositeTransformer{ 204 | transformers: []mergo.Transformers{ 205 | envVarSliceTransformer{}, 206 | stringSlicePrependTransformer{}, 207 | }, 208 | }), 209 | ) 210 | 211 | if err != nil { 212 | 
return err
213 | 		}
214 | 
215 | 		return nil
216 | 	}
217 | 
218 | 	return genericSliceTransformer(typ, mergeFunc, mergeKey)
219 | }
220 | 
221 | // parentRefSliceTransformer: transformer for merging two ParentReference objects
222 | type parentRefSliceTransformer struct{}
223 | 
224 | // Transformer merges two []gatewayv1.ParentReference based on their Name,
225 | // overriding fields of the matching dst entry with those from src
226 | func (c parentRefSliceTransformer) Transformer(typ reflect.Type) func(dst, src reflect.Value) error {
227 | 
228 | 	// mergeKey for merging two ParentReference is the Name of the ParentReference
229 | 	mergeKey := "Name"
230 | 
231 | 	// dstParentReference (comes from baseconfig)
232 | 	// srcParentReference (comes from msvc and controller logic)
233 | 	mergeFunc := func(dstParentReference *gatewayv1.ParentReference, srcParentReference gatewayv1.ParentReference) error {
234 | 
235 | 		err := mergo.Merge(dstParentReference,
236 | 			srcParentReference,
237 | 			mergo.WithAppendSlice,
238 | 			mergo.WithOverride)
239 | 
240 | 		if err != nil {
241 | 			return err
242 | 		}
243 | 
244 | 		return nil
245 | 	}
246 | 
247 | 	return genericSliceTransformer(typ, mergeFunc, mergeKey)
248 | }
249 | 
250 | // backendRefTransformer: transformer for merging two BackendRef objects
251 | type backendRefTransformer struct{}
252 | 
253 | // Transformer merges two []gatewayv1.BackendRef based on their Name,
254 | // overriding fields of the matching dst entry with those from src
255 | func (c backendRefTransformer) Transformer(typ reflect.Type) func(dst, src reflect.Value) error {
256 | 
257 | 	// mergeKey for merging two BackendRef is the Name of the BackendRef
258 | 	mergeKey := "Name"
259 | 
260 | 	// dstBackendRef (comes from baseconfig)
261 | 	// srcBackendRef (comes from msvc and controller logic)
262 | 	mergeFunc := func(dstBackendRef *gatewayv1.BackendRef, srcBackendRef gatewayv1.BackendRef) error {
263 | 
264 | 		err := mergo.Merge(dstBackendRef,
265 | 			srcBackendRef,
266 | 			mergo.WithAppendSlice,
267 | 			mergo.WithOverride)
268 | 
269 | 		if err != nil {
270 | 			return err
271 | 		}
272 | 
273 | 		return nil
274 | 	}
275 | 
276 | 	return genericSliceTransformer(typ, mergeFunc, mergeKey)
277 | }
278 | 
279 | // MergeContainerSlices merges src slice into dest in place
280 | func MergeContainerSlices(dest, src []corev1.Container) ([]corev1.Container, error) {
281 | 	err := mergo.Merge(&dest, src, mergo.WithTransformers(containerSliceTransformer{}))
282 | 
283 | 	if err != nil {
284 | 		return []corev1.Container{}, err
285 | 	}
286 | 
287 | 	return dest, err
288 | }
289 | 
290 | // MergeGatewayRefSlices merges src slice containing gatewayv1.ParentRefs into dest in place
291 | func MergeGatewayRefSlices(dest, src []gatewayv1.ParentReference) ([]gatewayv1.ParentReference, error) {
292 | 	err := mergo.Merge(&dest, src, mergo.WithTransformers(parentRefSliceTransformer{}))
293 | 
294 | 	if err != nil {
295 | 		return []gatewayv1.ParentReference{}, err
296 | 	}
297 | 
298 | 	return dest, err
299 | }
300 | 
301 | // MergeBackendRefSlices merges src slice containing gatewayv1.BackendRefs into dest in place
302 | func MergeBackendRefSlices(dest, src []gatewayv1.BackendRef) ([]gatewayv1.BackendRef, error) {
303 | 	err := mergo.Merge(&dest, src, mergo.WithTransformers(backendRefTransformer{}))
304 | 
305 | 	if err != nil {
306 | 		return []gatewayv1.BackendRef{}, err
307 | 	}
308 | 
309 | 	return dest, err
310 | }
311 | 
--------------------------------------------------------------------------------
/internal/controller/suite_test.go:
-------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "context" 5 | "os" 6 | "path/filepath" 7 | "testing" 8 | 9 | . "github.com/onsi/ginkgo/v2" 10 | . "github.com/onsi/gomega" 11 | 12 | "k8s.io/client-go/kubernetes/scheme" 13 | "k8s.io/client-go/rest" 14 | "sigs.k8s.io/controller-runtime/pkg/client" 15 | "sigs.k8s.io/controller-runtime/pkg/envtest" 16 | logf "sigs.k8s.io/controller-runtime/pkg/log" 17 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 18 | 19 | msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1" 20 | appsv1 "k8s.io/api/apps/v1" 21 | corev1 "k8s.io/api/core/v1" 22 | giev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" 23 | gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" 24 | // +kubebuilder:scaffold:imports 25 | ) 26 | 27 | // These tests use Ginkgo (BDD-style Go testing framework). Refer to 28 | // http://onsi.github.io/ginkgo/ to learn more about Ginkgo. 29 | 30 | var ( 31 | ctx context.Context 32 | cancel context.CancelFunc 33 | testEnv *envtest.Environment 34 | cfg *rest.Config 35 | k8sClient client.Client 36 | ) 37 | 38 | func TestControllers(t *testing.T) { 39 | RegisterFailHandler(Fail) 40 | 41 | RunSpecs(t, "Controller Suite") 42 | } 43 | 44 | var _ = BeforeSuite(func() { 45 | logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) 46 | 47 | ctx, cancel = context.WithCancel(context.TODO()) 48 | 49 | var err error 50 | err = msv1alpha1.AddToScheme(scheme.Scheme) 51 | Expect(err).NotTo(HaveOccurred()) 52 | err = gatewayv1.Install(scheme.Scheme) 53 | Expect(err).NotTo(HaveOccurred()) 54 | err = giev1alpha2.Install(scheme.Scheme) 55 | Expect(err).NotTo(HaveOccurred()) 56 | err = corev1.AddToScheme(scheme.Scheme) 57 | Expect(err).NotTo(HaveOccurred()) 58 | err = appsv1.AddToScheme(scheme.Scheme) 59 | Expect(err).NotTo(HaveOccurred()) 60 | 61 | // +kubebuilder:scaffold:scheme 62 | 63 | By("bootstrapping test environment") 64 | testEnv = &envtest.Environment{ 65 | // TODO: This should be made robust, 66 | // if someone runs tests from a subfolder, these may not run 67 | CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases"), filepath.Join("..", "..", "test", "inferenceCRDs")}, 68 | ErrorIfCRDPathMissing: true, 69 | } 70 | 71 | // Retrieve the first found binary directory to allow running tests from IDEs 72 | if getFirstFoundEnvTestBinaryDir() != "" { 73 | testEnv.BinaryAssetsDirectory = getFirstFoundEnvTestBinaryDir() 74 | } 75 | 76 | // cfg is defined in this file globally. 77 | cfg, err = testEnv.Start() 78 | Expect(err).NotTo(HaveOccurred()) 79 | Expect(cfg).NotTo(BeNil()) 80 | 81 | k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) 82 | Expect(err).NotTo(HaveOccurred()) 83 | Expect(k8sClient).NotTo(BeNil()) 84 | }) 85 | 86 | var _ = AfterSuite(func() { 87 | By("tearing down the test environment") 88 | cancel() 89 | err := testEnv.Stop() 90 | Expect(err).NotTo(HaveOccurred()) 91 | }) 92 | 93 | // getFirstFoundEnvTestBinaryDir locates the first binary in the specified path. 94 | // ENVTEST-based tests depend on specific binaries, usually located in paths set by 95 | // controller-runtime. When running tests directly (e.g., via an IDE) without using 96 | // Makefile targets, the 'BinaryAssetsDirectory' must be explicitly configured. 97 | // 98 | // This function streamlines the process by finding the required binaries, similar to 99 | // setting the 'KUBEBUILDER_ASSETS' environment variable. 
To ensure the binaries are 100 | // properly set up, run 'make setup-envtest' beforehand. 101 | func getFirstFoundEnvTestBinaryDir() string { 102 | basePath := filepath.Join("..", "..", "bin", "k8s") 103 | entries, err := os.ReadDir(basePath) 104 | if err != nil { 105 | logf.Log.Error(err, "Failed to read directory", "path", basePath) 106 | return "" 107 | } 108 | for _, entry := range entries { 109 | if entry.IsDir() { 110 | return filepath.Join(basePath, entry.Name()) 111 | } 112 | } 113 | return "" 114 | } 115 | -------------------------------------------------------------------------------- /internal/controller/template.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "text/template" 7 | 8 | sprig "github.com/Masterminds/sprig/v3" 9 | ) 10 | 11 | // registerSprigFunctions to get a new template with sprig functions support 12 | func registerSprigFunctions(tmplStr string, functions *TemplateFuncs) (*template.Template, error) { 13 | // Create a new template and register Sprig functions 14 | tmpl, err := template.New("template"). 15 | Funcs(sprig.TxtFuncMap()). 16 | Funcs(functions.funcMap). 17 | Parse(tmplStr) 18 | if err != nil { 19 | return nil, fmt.Errorf("error parsing template: %w", err) 20 | } 21 | return tmpl, err 22 | } 23 | 24 | // renderTemplate using template vars 25 | func renderTemplate(tmplStr string, vars *TemplateVars, functions *TemplateFuncs) (string, error) { 26 | tmpl, err := registerSprigFunctions(tmplStr, functions) 27 | if err != nil { 28 | return "", err 29 | } 30 | 31 | // Execute the template with the provided struct 32 | var buf bytes.Buffer 33 | if err := tmpl.Execute(&buf, vars); err != nil { 34 | return "", fmt.Errorf("error executing template: %w", err) 35 | } 36 | 37 | return buf.String(), nil 38 | } 39 | -------------------------------------------------------------------------------- /internal/controller/template_test.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "testing" 7 | 8 | msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1" 9 | "github.com/stretchr/testify/assert" 10 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 11 | ) 12 | 13 | const msvcName = "msvc-test" 14 | const msvcNamespace = "default" 15 | const modelName = "modelName" 16 | const sanitizedModelName = "modelname" 17 | const pvcName = "pvc-name" 18 | const modelPath = "path/to/" + modelName 19 | const mountedModelPathInVolume = modelStorageRoot + pathSep + modelPath 20 | const pvcURI = "pvc://" + pvcName + "/" + modelPath 21 | const hfModelName = pvcName + "/" + modelName 22 | const hfURI = "hf://" + hfModelName 23 | const authSecretName = "hf-secret" 24 | 25 | var authSecretNameCopy = authSecretName 26 | var authSecretNamePtr = &authSecretNameCopy // ugly workaround ModelArtifacts.AuthSecretName is *strings 27 | 28 | // returns a minimal valid msvc 29 | func minimalMSVC() *msv1alpha1.ModelService { 30 | return &msv1alpha1.ModelService{ 31 | ObjectMeta: metav1.ObjectMeta{ 32 | Name: msvcName, 33 | Namespace: msvcNamespace, 34 | }, 35 | Spec: msv1alpha1.ModelServiceSpec{ 36 | Routing: msv1alpha1.Routing{ 37 | ModelName: modelName, 38 | }, 39 | ModelArtifacts: msv1alpha1.ModelArtifacts{ 40 | URI: pvcURI, 41 | AuthSecretName: authSecretNamePtr, 42 | }, 43 | }, 44 | } 45 | } 46 | 47 | // createMSVCWithPDSpec creates a minimal msvc with the appropriate decode 48 | func 
createMSVCWithDecode(decodeSpec *msv1alpha1.PDSpec) *msv1alpha1.ModelService { 49 | 50 | minimalMSVC := minimalMSVC() 51 | minimalMSVC.Spec.Decode = decodeSpec 52 | return minimalMSVC 53 | } 54 | 55 | func TestTemplateVars(t *testing.T) { 56 | // Test that each template var can be interpolated in MSVC 57 | 58 | tests := map[string]struct { 59 | expectedValue string 60 | uri string 61 | }{ 62 | "ModelServiceName": { 63 | expectedValue: msvcName, 64 | }, 65 | "ModelServiceNamespace": { 66 | expectedValue: msvcNamespace, 67 | }, 68 | "ModelName": { 69 | expectedValue: modelName, 70 | }, 71 | "HFModelName": { 72 | expectedValue: hfModelName, 73 | uri: hfURI, 74 | }, 75 | "SanitizedModelName": { 76 | expectedValue: sanitizedModelName, 77 | }, 78 | "ModelPath": { 79 | expectedValue: modelPath, 80 | }, 81 | "MountedModelPath": { 82 | expectedValue: mountedModelPathInVolume, 83 | }, 84 | "AuthSecretName": { 85 | expectedValue: authSecretName, 86 | }, 87 | "EPPServiceName": { 88 | expectedValue: msvcName + "-epp-service", 89 | }, 90 | "EPPDeploymentName": { 91 | expectedValue: msvcName + "-epp", 92 | }, 93 | "PrefillDeploymentName": { 94 | expectedValue: msvcName + "-prefill", 95 | }, 96 | "DecodeDeploymentName": { 97 | expectedValue: msvcName + "-decode", 98 | }, 99 | "PrefillServiceName": { 100 | expectedValue: msvcName + "-service-prefill", 101 | }, 102 | "DecodeServiceName": { 103 | expectedValue: msvcName + "-service-decode", 104 | }, 105 | "InferencePoolName": { 106 | expectedValue: msvcName + "-inference-pool", 107 | }, 108 | "InferenceModelName": { 109 | expectedValue: msvcName, 110 | }, 111 | } 112 | 113 | for templateVar, testCase := range tests { 114 | ctx := context.Background() 115 | 116 | minimalMSVC := createMSVCWithDecode(&msv1alpha1.PDSpec{ 117 | ModelServicePodSpec: msv1alpha1.ModelServicePodSpec{ 118 | Containers: []msv1alpha1.ContainerSpec{ 119 | { 120 | Args: []string{ 121 | // This becomes, for example, {{ .ModelService }} 122 | fmt.Sprintf("{{ .%s }}", templateVar), 123 | }, 124 | }, 125 | }, 126 | }, 127 | }) 128 | 129 | if testCase.uri != "" { 130 | minimalMSVC.Spec.ModelArtifacts.URI = testCase.uri 131 | } 132 | 133 | interpolatedMSVC, err := InterpolateModelService(ctx, minimalMSVC) 134 | assert.NoError(t, err, "got error but expected none") 135 | 136 | // Assert that the template var is interpolated and the expected values match 137 | // Check that Args[0] matches 138 | actualValue := interpolatedMSVC.Spec.Decode.Containers[0].Args[0] 139 | assert.Equal(t, testCase.expectedValue, actualValue, fmt.Sprintf("%s should be interpolated", templateVar)) 140 | } 141 | } 142 | 143 | func TestMSVCInterpolation(t *testing.T) { 144 | 145 | tests := []struct { 146 | name string 147 | originalMSVC *msv1alpha1.ModelService 148 | expectedMSVC *msv1alpha1.ModelService 149 | expectError bool 150 | }{ 151 | { 152 | name: "no interpolation required should pass", 153 | originalMSVC: minimalMSVC(), 154 | expectedMSVC: minimalMSVC(), 155 | expectError: false, 156 | }, 157 | { 158 | name: "one interpolation required in args should pass", 159 | originalMSVC: createMSVCWithDecode(&msv1alpha1.PDSpec{ 160 | ModelServicePodSpec: msv1alpha1.ModelServicePodSpec{ 161 | Containers: []msv1alpha1.ContainerSpec{ 162 | { 163 | Args: []string{ 164 | "{{ .ModelPath }}", 165 | }, 166 | }, 167 | }, 168 | }, 169 | }), 170 | expectedMSVC: createMSVCWithDecode(&msv1alpha1.PDSpec{ 171 | ModelServicePodSpec: msv1alpha1.ModelServicePodSpec{ 172 | Containers: []msv1alpha1.ContainerSpec{ 173 | { 174 | Args: []string{ 
175 | modelPath, 176 | }, 177 | }, 178 | }, 179 | }, 180 | }), 181 | expectError: false, 182 | }, 183 | { 184 | name: "1+ interpolation required in args should pass", 185 | originalMSVC: createMSVCWithDecode(&msv1alpha1.PDSpec{ 186 | ModelServicePodSpec: msv1alpha1.ModelServicePodSpec{ 187 | Containers: []msv1alpha1.ContainerSpec{ 188 | { 189 | Args: []string{ 190 | "{{ .ModelPath }}", 191 | "--arg2", 192 | "{{ .DecodeDeploymentName }}", 193 | }, 194 | }, 195 | }, 196 | }, 197 | }), 198 | expectedMSVC: createMSVCWithDecode(&msv1alpha1.PDSpec{ 199 | ModelServicePodSpec: msv1alpha1.ModelServicePodSpec{ 200 | Containers: []msv1alpha1.ContainerSpec{ 201 | { 202 | Args: []string{ 203 | modelPath, 204 | "--arg2", 205 | msvcName + "-decode", 206 | }, 207 | }, 208 | }, 209 | }, 210 | }), 211 | expectError: false, 212 | }, 213 | } 214 | 215 | for _, tt := range tests { 216 | t.Run(tt.name, func(t *testing.T) { 217 | ctx := context.Background() 218 | interpolatedMSVC, err := InterpolateModelService(ctx, tt.originalMSVC) 219 | 220 | if tt.expectError { 221 | assert.Error(t, err, "expected error but got none") 222 | } else { 223 | assert.NoError(t, err) 224 | 225 | // Assert that expected args matches interpolated args 226 | if tt.expectedMSVC.Spec.Decode != nil && interpolatedMSVC.Spec.Decode != nil { 227 | expectedContainers := tt.expectedMSVC.Spec.Decode.Containers 228 | interpolatedContainers := interpolatedMSVC.Spec.Decode.Containers 229 | 230 | assert.Equal(t, len(expectedContainers), len(interpolatedContainers), "container lengths don't match") 231 | 232 | for i := range len(expectedContainers) { 233 | expectedContainer := expectedContainers[i] 234 | interpolatedContainer := interpolatedContainers[i] 235 | 236 | // assert args match 237 | assertEqualSlices(t, expectedContainer.Args, interpolatedContainer.Args) 238 | } 239 | } else if tt.expectedMSVC.Spec.Decode == nil && interpolatedMSVC.Spec.Decode == nil { 240 | // both decode specs are nil, pass 241 | } else { 242 | assert.Fail(t, fmt.Sprintf("decode specs don't match\ngot: %v\nwant:%v", interpolatedMSVC.Spec.Decode, tt.expectedMSVC.Spec.Decode)) 243 | } 244 | 245 | } 246 | }) 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /internal/controller/utils_test.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "strings" 7 | 8 | . "github.com/onsi/ginkgo/v2" 9 | . 
"github.com/onsi/gomega" 10 | "k8s.io/apimachinery/pkg/api/resource" 11 | 12 | msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1" 13 | ) 14 | 15 | const PVC_NAME = "my-pvc" 16 | const MODEL_PATH = "path/to/model" 17 | const HF_REPO_ID = "ibm-granite" 18 | const HF_MODEL_ID = "granite-3.3-2b-instruct" 19 | 20 | var _ = Describe("Model Artifacts", func() { 21 | Context("Given a model artifact with an invalid URI prefix", func() { 22 | modelArtifact := msv1alpha1.ModelArtifacts{ 23 | URI: fmt.Sprintf("nothing://%s/%s", PVC_NAME, MODEL_PATH), 24 | } 25 | 26 | It("should parse correctly", func() { 27 | By("checking type of uri") 28 | Expect(isPVCURI(modelArtifact.URI)).To(BeFalse()) 29 | Expect(isHFURI(modelArtifact.URI)).To(BeFalse()) 30 | 31 | By("Parsing PVC uri should fail") 32 | _, err := parsePVCURI(&modelArtifact) 33 | Expect(err).NotTo(BeNil()) 34 | 35 | By("Parsing HF uri should fail") 36 | _, _, err = parseHFURI(&modelArtifact) 37 | Expect(err).NotTo(BeNil()) 38 | }) 39 | }) 40 | 41 | Context("Given an URI string", func() { 42 | tests := map[string]struct { 43 | expectedURIType URIType 44 | expectedModelMountPath string 45 | }{ 46 | "pvc://pvc-name/path/to/model": { 47 | expectedURIType: PVC, 48 | expectedModelMountPath: modelStorageRoot + pathSep + "path/to/model", 49 | }, 50 | "oci://repo-with-tag::path/to/model": { 51 | expectedURIType: OCI, 52 | expectedModelMountPath: "", // TODO 53 | }, 54 | "hf://repo-id/model-id": { 55 | expectedURIType: HF, 56 | expectedModelMountPath: modelStorageRoot, 57 | }, 58 | "pvc://pvc-name": { 59 | expectedURIType: PVC, 60 | expectedModelMountPath: "", 61 | }, 62 | "oci://": { 63 | expectedURIType: OCI, 64 | expectedModelMountPath: "", // TODO 65 | }, 66 | "hf://wrong": { 67 | expectedURIType: HF, 68 | expectedModelMountPath: modelStorageRoot, 69 | }, 70 | "random://": { 71 | expectedURIType: UnknownURI, 72 | expectedModelMountPath: "", 73 | }, 74 | "": { 75 | expectedURIType: UnknownURI, 76 | expectedModelMountPath: "", 77 | }, 78 | "PVC://": { 79 | expectedURIType: UnknownURI, 80 | expectedModelMountPath: "", 81 | }, 82 | "HF://": { 83 | expectedURIType: UnknownURI, 84 | expectedModelMountPath: "", 85 | }, 86 | "OCI://": { 87 | expectedURIType: UnknownURI, 88 | expectedModelMountPath: "", 89 | }, 90 | } 91 | 92 | It("should determine the type of the URI correctly", func() { 93 | for uri, answer := range tests { 94 | expectedURIType := answer.expectedURIType 95 | actualURIType := UriType(uri) 96 | Expect(actualURIType).To(Equal(expectedURIType)) 97 | } 98 | }) 99 | 100 | It("should compute the mounted model path correctly", func() { 101 | for uri, answer := range tests { 102 | expectedModelMountPath := answer.expectedModelMountPath 103 | 104 | actualModelMountPath, err := mountedModelPath(&msv1alpha1.ModelService{ 105 | Spec: msv1alpha1.ModelServiceSpec{ 106 | ModelArtifacts: msv1alpha1.ModelArtifacts{ 107 | URI: uri, 108 | }, 109 | }, 110 | }) 111 | 112 | // Expect error if uri type is unknown 113 | if answer.expectedURIType == UnknownURI { 114 | Expect(err).To(HaveOccurred()) 115 | } else { 116 | Expect(err).ToNot(HaveOccurred()) 117 | Expect(actualModelMountPath).To(Equal(expectedModelMountPath)) 118 | } 119 | 120 | } 121 | }) 122 | }) 123 | 124 | Context("Given a model artifact with a valid PVC URI", func() { 125 | ctx := context.Background() 126 | modelArtifact := msv1alpha1.ModelArtifacts{ 127 | URI: fmt.Sprintf("pvc://%s/%s", PVC_NAME, MODEL_PATH), 128 | } 129 | 130 | modelService := msv1alpha1.ModelService{ 131 | Spec: 
msv1alpha1.ModelServiceSpec{ 132 | ModelArtifacts: modelArtifact, 133 | }, 134 | } 135 | 136 | It("should parse correctly", func() { 137 | By("checking type of uri") 138 | Expect(isPVCURI(modelArtifact.URI)).To(BeTrue()) 139 | Expect(isHFURI(modelArtifact.URI)).To(BeFalse()) 140 | Expect(isOCIURI(modelArtifact.URI)).To(BeFalse()) 141 | 142 | By("Parsing uri parts should be successful") 143 | parts, err := parsePVCURI(&modelArtifact) 144 | Expect(err).To(BeNil()) 145 | Expect(len(parts) > 1).To(BeTrue()) 146 | Expect(parts[0]).To(Equal(PVC_NAME)) 147 | Expect(strings.Join(parts[1:], "/")).To(Equal(MODEL_PATH)) 148 | }) 149 | It("should produce a valid volumeMounts list", func() { 150 | volumeMounts := getVolumeMountsForContainer(ctx, &modelService) 151 | Expect(len(volumeMounts)).To(Equal(1)) 152 | firstVolumeMount := volumeMounts[0] 153 | 154 | Expect(firstVolumeMount.Name).To(Equal(modelStorageVolumeName)) 155 | Expect(firstVolumeMount.MountPath).To(Equal(modelStorageRoot)) 156 | Expect(firstVolumeMount.ReadOnly).To(BeTrue()) 157 | }) 158 | It("should produce a valid volumes list", func() { 159 | volumes := getVolumeForPDDeployment(ctx, &modelService) 160 | Expect(len(volumes)).To(Equal(1)) 161 | firstVolume := volumes[0] 162 | Expect(firstVolume.Name).To(Equal(modelStorageVolumeName)) 163 | Expect(firstVolume.PersistentVolumeClaim.ClaimName).To(Equal(PVC_NAME)) 164 | Expect(firstVolume.PersistentVolumeClaim.ReadOnly).To(BeTrue()) 165 | }) 166 | 167 | It("should produce a valid env list", func() { 168 | envs := getEnvsForContainer(ctx, &modelService) 169 | Expect(len(envs)).To(Equal(0)) 170 | }) 171 | }) 172 | 173 | Context("Given a model artifact with a valid HF URI", func() { 174 | 175 | ctx := context.Background() 176 | authSecretName := "auth-secret-key" 177 | sizeLimit := "5Gi" 178 | sizeLimitQuan := resource.MustParse(sizeLimit) 179 | 180 | modelArtifact := msv1alpha1.ModelArtifacts{ 181 | URI: fmt.Sprintf("hf://%s/%s", HF_REPO_ID, HF_MODEL_ID), 182 | AuthSecretName: &authSecretName, 183 | Size: &sizeLimitQuan, 184 | } 185 | 186 | modelService := msv1alpha1.ModelService{ 187 | Spec: msv1alpha1.ModelServiceSpec{ 188 | ModelArtifacts: modelArtifact, 189 | }, 190 | } 191 | 192 | It("should parse correctly", func() { 193 | By("checking type of uri") 194 | Expect(isPVCURI(modelArtifact.URI)).To(BeFalse()) 195 | Expect(isHFURI(modelArtifact.URI)).To(BeTrue()) 196 | Expect(isOCIURI(modelArtifact.URI)).To(BeFalse()) 197 | 198 | By("Parsing uri parts should be successful") 199 | repo, model, err := parseHFURI(&modelArtifact) 200 | Expect(err).To(BeNil()) 201 | Expect(repo).To(Equal(HF_REPO_ID)) 202 | Expect(model).To(Equal(HF_MODEL_ID)) 203 | }) 204 | 205 | It("should produce a valid volumeMounts list", func() { 206 | volumeMounts := getVolumeMountsForContainer(ctx, &modelService) 207 | Expect(len(volumeMounts)).To(Equal(1)) 208 | firstVolumeMount := volumeMounts[0] 209 | 210 | Expect(firstVolumeMount.Name).To(Equal(modelStorageVolumeName)) 211 | Expect(firstVolumeMount.MountPath).To(Equal(modelStorageRoot)) 212 | Expect(firstVolumeMount.ReadOnly).To(BeFalse()) 213 | }) 214 | 215 | It("should produce a valid volumes list", func() { 216 | volumes := getVolumeForPDDeployment(ctx, &modelService) 217 | Expect(len(volumes)).To(Equal(1)) 218 | firstVolume := volumes[0] 219 | Expect(firstVolume.Name).To(Equal(modelStorageVolumeName)) 220 | Expect(firstVolume.EmptyDir.SizeLimit.String()).To(Equal(sizeLimit)) 221 | }) 222 | 223 | It("should produce a valid env list", func() { 224 | envs := 
getEnvsForContainer(ctx, &modelService)
225 | 			Expect(len(envs)).To(Equal(2))
226 | 			hfTokenEnvVar := envs[0]
227 | 
228 | 			Expect(hfTokenEnvVar.Name).To(Equal(ENV_HF_TOKEN))
229 | 			Expect(hfTokenEnvVar.ValueFrom.SecretKeyRef.Name).To(Equal(authSecretName))
230 | 			Expect(hfTokenEnvVar.ValueFrom.SecretKeyRef.Key).To(Equal(ENV_HF_TOKEN))
231 | 
232 | 			hfHomeEnvVar := envs[1]
233 | 			Expect(hfHomeEnvVar.Name).To(Equal(ENV_HF_HOME))
234 | 			Expect(hfHomeEnvVar.Value).To(Equal(modelStorageRoot))
235 | 		})
236 | 	})
237 | })
238 | 
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import "github.com/llm-d/llm-d-model-service/cmd"
4 | 
5 | func main() {
6 | 	cmd.Execute()
7 | }
8 | 
--------------------------------------------------------------------------------
/model-service-arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llm-d/llm-d-model-service/55eb18c4f08116ed8c9211f643b356ec5e47b4b7/model-service-arch.png
--------------------------------------------------------------------------------
/perf/create_modelservice.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Base name for the ModelService
4 | BASE_NAME="perf-facebook-opt-125m-nixl"
5 | 
6 | echo "Please make sure universal base config is applied to the cluster"
7 | 
8 | # Loop to create and apply 1000 instances
9 | for i in $(seq 1 1000); do
10 |   NAME="${BASE_NAME}-${i}"
11 |   cat <
87 | curl http://localhost:8000/v1/completions \
88 |   -H "Content-Type: application/json" \
89 |   -d '{
90 |     "model": "facebook/opt-125m",
91 |     "prompt": "Author-contribution statements and acknowledgements in research papers should state clearly and specifically whether, and to what extent, the authors used AI technologies such as ChatGPT in the preparation of their manuscript and analysis. They should also indicate which LLMs were used. This will alert editors and reviewers to scrutinize manuscripts more carefully for potential biases, inaccuracies and improper source crediting. Likewise, scientific journals should be transparent about their use of LLMs, for example when selecting submitted manuscripts. Mention the large language model based product mentioned in the paragraph above:"
92 |   }'
93 | ```
94 | 
95 | ### Scenario 3: serving a model with xPyD disaggregation
96 | So far, we have looked at MSVCs that have just one replica each for the decode and prefill workloads. ModelService can help you achieve xPyD disaggregation; all that is required is using different `replicas` values in the prefill and decode specs, as in the sketch after this section.
97 | 
98 | Note that this scenario uses the same baseconfig as the last one, because the base configuration is identical between the two; only model-specific values such as the replica counts and model name differ.
99 | 
100 | - [msvcs/xpyd.yaml](./msvcs/xpyd.yaml)
101 | - [baseconfigs/universal-baseconfig.yaml](./baseconfigs/universal-baseconfig.yaml)
102 | 
103 | ```
104 | kubectl apply -f samples/msvcs/xpyd.yaml
105 | ```
106 | 
107 | and you should see the corresponding number of pods spin up for each deployment.
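
The snippet below sketches the relevant part of such an MSVC for a hypothetical 2P4D setup; the names and counts here are illustrative and may differ from what [msvcs/xpyd.yaml](./msvcs/xpyd.yaml) actually uses.

```yaml
apiVersion: llm-d.ai/v1alpha1
kind: ModelService
metadata:
  name: xpyd-example   # illustrative name
spec:
  baseConfigMapRef:
    name: universal-base-config   # assumed name of the universal baseconfig
  routing:
    modelName: facebook/opt-125m
  prefill:
    replicas: 2   # "x" prefill pods
    # containers: same pattern as in the other scenarios
  decode:
    replicas: 4   # "y" decode pods
    # containers: same pattern as in the other scenarios
```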
108 | 109 | ### Scenario 4: loading a large model from a PVC 110 | Downloading a model from Hugging Face takes a long time for large models like [`meta-llama/Llama-4-Scout-17B-16E`](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E). One way to circumvent the long container creation time is to download the model to a PVC ahead of time and mount the PVC in the vLLM container. We have provided a baseconfig with the volume mounts configured; all that is needed in the ModelService CR is to specify the path at which the model can be found. 111 | 112 | - [msvcs/llama4.yaml](./msvcs/llama4.yaml) 113 | - [baseconfigs/universal-baseconfig-pvc.yaml](./baseconfigs/universal-baseconfig-pvc.yaml) 114 | 115 | ``` 116 | kubectl apply -f samples/baseconfigs/universal-baseconfig-pvc.yaml 117 | kubectl apply -f samples/msvcs/llama4.yaml 118 | ``` 119 | 120 | This should drastically shorten the wait time for pod creation. -------------------------------------------------------------------------------- /samples/baseconfigs/simple-baseconfig.yaml: -------------------------------------------------------------------------------- 1 | # A simple baseconfig to serve a model downloaded from Hugging Face without a token on a pod 2 | # Make sure that the model fits within the specified sizeLimit 3 | # 4 | # Requirements: 5 | # Any consuming ModelService should define ports labeled: 6 | # - app_port - the external port number for the prefill and decode pods 7 | 8 | apiVersion: v1 9 | kind: ConfigMap 10 | metadata: 11 | name: simple-base-config 12 | immutable: true 13 | data: 14 | decodeDeployment: | 15 | apiVersion: apps/v1 16 | kind: Deployment 17 | spec: 18 | template: 19 | spec: 20 | containers: 21 | - name: vllm 22 | image: vllm/vllm-openai:v0.8.5 23 | command: 24 | - vllm 25 | - serve 26 | securityContext: 27 | allowPrivilegeEscalation: false 28 | args: 29 | - "--port" 30 | - "{{ "app_port" | getPort }}" 31 | env: 32 | - name: CUDA_VISIBLE_DEVICES 33 | value: "0" 34 | - name: UCX_TLS 35 | value: "cuda_ipc,cuda_copy,tcp" 36 | - name: HF_HUB_CACHE 37 | value: /cache 38 | volumeMounts: 39 | - name: model-cache 40 | mountPath: /cache 41 | resources: 42 | limits: 43 | nvidia.com/gpu: 1 44 | requests: 45 | cpu: "16" 46 | memory: 16Gi 47 | nvidia.com/gpu: 1 48 | volumes: 49 | - name: model-cache 50 | emptyDir: 51 | sizeLimit: 5Gi 52 | 53 | # A service for the deployment is optional 54 | decodeService: | 55 | apiVersion: v1 56 | kind: Service 57 | spec: 58 | clusterIP: None 59 | ports: 60 | - name: vllm 61 | port: {{ "app_port" | getPort }} 62 | protocol: TCP 63 | -------------------------------------------------------------------------------- /samples/baseconfigs/universal-baseconfig-pvc.yaml: -------------------------------------------------------------------------------- 1 | # A universal baseconfig for models stored on PVCs 2 | # 3 | # Requirements: 4 | # Any consuming ModelService should define ports labeled: 5 | # - app_port - the external port number for the prefill and decode pods 6 | # - internal_port - the port number used by the sidecar to communicate with a vllm container 7 | apiVersion: v1 8 | kind: ConfigMap 9 | metadata: 10 | name: universal-base-config-pvc 11 | immutable: true 12 | data: 13 | decodeDeployment: | 14 | apiVersion: apps/v1 15 | kind: Deployment 16 | spec: 17 | template: 18 | spec: 19 | initContainers: 20 | - name: routing-proxy 21 | image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 22 | securityContext: 23 | allowPrivilegeEscalation: false 24 | runAsNonRoot: true 25 | args: 26 | #
Note: this port has to match the prefill port 27 | - "--port={{ "app_port" | getPort }}" 28 | - "--vllm-port={{ "internal_port" | getPort }}" 29 | - "--connector=nixl" 30 | ports: 31 | - containerPort: {{ "app_port" | getPort }} 32 | protocol: TCP 33 | restartPolicy: Always 34 | containers: 35 | - name: vllm 36 | image: ghcr.io/llm-d/llm-d:0.0.8 37 | command: 38 | - vllm 39 | - serve 40 | securityContext: 41 | allowPrivilegeEscalation: false 42 | args: 43 | - "--port" 44 | - "{{ "internal_port" | getPort }}" 45 | - "--enforce-eager" 46 | - "--kv-transfer-config" 47 | - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' 48 | env: 49 | - name: CUDA_VISIBLE_DEVICES 50 | value: "0" 51 | - name: UCX_TLS 52 | value: "cuda_ipc,cuda_copy,tcp" 53 | - name: NIXL_ROLE 54 | value: RECVER 55 | - name: HF_HUB_CACHE 56 | value: /vllm-workspace/models 57 | ports: 58 | - containerPort: 55555 59 | protocol: TCP 60 | volumeMounts: 61 | - name: model-cache 62 | mountPath: /vllm-workspace/models 63 | resources: 64 | limits: 65 | nvidia.com/gpu: 1 66 | requests: 67 | cpu: "16" 68 | memory: 16Gi 69 | nvidia.com/gpu: 1 70 | volumes: 71 | # MSVC controller will add model-storage using the PVC name in volumes 72 | # The mount path is /cache 73 | - name: model-cache 74 | emptyDir: 75 | sizeLimit: 20Gi 76 | 77 | 78 | prefillDeployment: | 79 | apiVersion: apps/v1 80 | kind: Deployment 81 | spec: 82 | template: 83 | spec: 84 | containers: 85 | - name: vllm 86 | image: ghcr.io/llm-d/llm-d:0.0.8 87 | command: 88 | - vllm 89 | - serve 90 | securityContext: 91 | allowPrivilegeEscalation: false 92 | args: 93 | # Note: this port has to match the proxy --port arg 94 | - "--port" 95 | - "{{ "app_port" | getPort }}" 96 | - "--enforce-eager" 97 | - "--kv-transfer-config" 98 | - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' 99 | env: 100 | - name: CUDA_VISIBLE_DEVICES 101 | value: "0" 102 | - name: UCX_TLS 103 | value: "cuda_ipc,cuda_copy,tcp" 104 | - name: VLLM_NIXL_SIDE_CHANNEL_PORT 105 | value: "5557" 106 | - name: VLLM_NIXL_SIDE_CHANNEL_HOST 107 | valueFrom: 108 | fieldRef: 109 | fieldPath: status.podIP 110 | - name: VLLM_LOGGING_LEVEL 111 | value: DEBUG 112 | - name: HF_HUB_CACHE 113 | value: /vllm-workspace/models 114 | ports: 115 | - containerPort: {{ "app_port" | getPort }} 116 | protocol: TCP 117 | - containerPort: 5557 118 | protocol: TCP 119 | volumeMounts: 120 | - name: model-cache 121 | mountPath: /vllm-workspace/models 122 | resources: 123 | limits: 124 | nvidia.com/gpu: 1 125 | requests: 126 | cpu: "16" 127 | memory: 16Gi 128 | nvidia.com/gpu: 1 129 | volumes: 130 | # MSVC controller will add model-storage using the PVC name in volumes 131 | # The mount path is /cache 132 | - name: model-cache 133 | emptyDir: 134 | sizeLimit: 20Gi 135 | 136 | eppService: | 137 | apiVersion: v1 138 | kind: Service 139 | spec: 140 | ports: 141 | - port: 9002 # Needs to match the port of the eppDeployment 142 | protocol: TCP 143 | type: NodePort 144 | 145 | eppDeployment: | 146 | apiVersion: apps/v1 147 | kind: Deployment 148 | spec: 149 | template: 150 | spec: 151 | containers: 152 | - name: "epp" 153 | args: 154 | - -poolName 155 | - {{ .InferencePoolName }} 156 | - -poolNamespace 157 | - {{ .ModelServiceNamespace }} 158 | - -v 159 | - "4" 160 | - --zap-encoder 161 | - json 162 | - -grpcPort 163 | - "9002" 164 | - -grpcHealthPort 165 | - "9003" 166 | env: 167 | - name: PD_ENABLED 168 | value: "true" 169 | - name: PD_PROMPT_LEN_THRESHOLD 170 | value: "10" 171 | image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 172 |
imagePullPolicy: Always 173 | livenessProbe: 174 | failureThreshold: 3 175 | grpc: 176 | port: 9003 177 | service: {{ .EPPServiceName }} 178 | initialDelaySeconds: 5 179 | periodSeconds: 10 180 | successThreshold: 1 181 | timeoutSeconds: 1 182 | ports: 183 | - containerPort: 9002 184 | protocol: TCP 185 | - containerPort: 9003 186 | protocol: TCP 187 | - containerPort: 9090 188 | name: metrics 189 | protocol: TCP 190 | readinessProbe: 191 | failureThreshold: 3 192 | grpc: 193 | port: 9003 194 | service: {{ .EPPServiceName }} 195 | initialDelaySeconds: 5 196 | periodSeconds: 10 197 | successThreshold: 1 198 | timeoutSeconds: 1 199 | 200 | inferencePool: | 201 | apiVersion: inference.networking.x-k8s.io/v1alpha2 202 | kind: InferencePool 203 | spec: 204 | targetPortNumber: {{ "app_port" | getPort }} 205 | 206 | inferenceModel: | 207 | apiVersion: inference.networking.x-k8s.io/v1alpha2 208 | kind: InferenceModel 209 | -------------------------------------------------------------------------------- /samples/baseconfigs/universal-baseconfig.yaml: -------------------------------------------------------------------------------- 1 | # Based on: https://github.com/llm-d/llm-d-routing-sidecar/tree/dev/test/config/nixl 2 | # 3 | # Requirements: 4 | # Any consuming ModelService should define ports labeled: 5 | # - app_port - the external port number for the prefill and decode pods 6 | # - internal_port - the port number used by the sidecar to communicate with a vllm container 7 | apiVersion: v1 8 | kind: ConfigMap 9 | metadata: 10 | name: universal-base-config 11 | immutable: true 12 | data: 13 | decodeDeployment: | 14 | apiVersion: apps/v1 15 | kind: Deployment 16 | spec: 17 | template: 18 | spec: 19 | initContainers: 20 | - name: routing-proxy 21 | image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 22 | securityContext: 23 | allowPrivilegeEscalation: false 24 | runAsNonRoot: true 25 | args: 26 | # Note: this port has to match the prefill port 27 | - "--port={{ "app_port" | getPort }}" 28 | - "--vllm-port={{ "internal_port" | getPort }}" 29 | - "--connector=nixl" 30 | ports: 31 | - containerPort: {{ "app_port" | getPort }} 32 | protocol: TCP 33 | restartPolicy: Always 34 | containers: 35 | - name: vllm 36 | image: ghcr.io/llm-d/llm-d:0.0.8 37 | command: 38 | - vllm 39 | - serve 40 | securityContext: 41 | allowPrivilegeEscalation: false 42 | args: 43 | - "--port" 44 | - "{{ "internal_port" | getPort }}" 45 | - "--enforce-eager" 46 | - "--kv-transfer-config" 47 | - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' 48 | env: 49 | - name: CUDA_VISIBLE_DEVICES 50 | value: "0" 51 | - name: UCX_TLS 52 | value: "cuda_ipc,cuda_copy,tcp" 53 | - name: NIXL_ROLE 54 | value: RECVER 55 | - name: HF_HUB_CACHE 56 | value: /vllm-workspace/models 57 | ports: 58 | - containerPort: 55555 59 | protocol: TCP 60 | volumeMounts: 61 | - name: model-cache 62 | mountPath: /vllm-workspace/models 63 | resources: 64 | limits: 65 | nvidia.com/gpu: 1 66 | requests: 67 | cpu: "16" 68 | memory: 16Gi 69 | nvidia.com/gpu: 1 70 | volumes: 71 | - name: model-cache 72 | emptyDir: 73 | sizeLimit: 20Gi 74 | 75 | prefillDeployment: | 76 | apiVersion: apps/v1 77 | kind: Deployment 78 | spec: 79 | template: 80 | spec: 81 | containers: 82 | - name: vllm 83 | image: ghcr.io/llm-d/llm-d:0.0.8 84 | command: 85 | - vllm 86 | - serve 87 | securityContext: 88 | allowPrivilegeEscalation: false 89 | args: 90 | # Note: this port has to match the proxy --port arg 91 | - "--port" 92 | - "{{ "app_port" | getPort }}" 93 | - "--enforce-eager" 94 | -
"--kv-transfer-config" 95 | - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' 96 | env: 97 | - name: CUDA_VISIBLE_DEVICES 98 | value: "0" 99 | - name: UCX_TLS 100 | value: "cuda_ipc,cuda_copy,tcp" 101 | - name: VLLM_NIXL_SIDE_CHANNEL_PORT 102 | value: "5557" 103 | - name: VLLM_NIXL_SIDE_CHANNEL_HOST 104 | valueFrom: 105 | fieldRef: 106 | fieldPath: status.podIP 107 | - name: VLLM_LOGGING_LEVEL 108 | value: DEBUG 109 | - name: HF_HUB_CACHE 110 | value: /vllm-workspace/models 111 | ports: 112 | - containerPort: {{ "app_port" | getPort }} 113 | protocol: TCP 114 | - containerPort: 5557 115 | protocol: TCP 116 | volumeMounts: 117 | - name: model-cache 118 | mountPath: /vllm-workspace/models 119 | resources: 120 | limits: 121 | nvidia.com/gpu: 1 122 | requests: 123 | cpu: "16" 124 | memory: 16Gi 125 | nvidia.com/gpu: 1 126 | volumes: 127 | - name: model-cache 128 | emptyDir: 129 | sizeLimit: 20Gi 130 | 131 | eppService: | 132 | apiVersion: v1 133 | kind: Service 134 | spec: 135 | ports: 136 | - port: 9002 # Needs to match the port of the eppDeployment 137 | protocol: TCP 138 | type: NodePort 139 | 140 | eppDeployment: | 141 | apiVersion: apps/v1 142 | kind: Deployment 143 | spec: 144 | template: 145 | spec: 146 | containers: 147 | - name: "epp" 148 | args: 149 | - -poolName 150 | - {{ .InferencePoolName }} 151 | - -poolNamespace 152 | - {{ .ModelServiceNamespace }} 153 | - -v 154 | - "4" 155 | - --zap-encoder 156 | - json 157 | - -grpcPort 158 | - "9002" 159 | - -grpcHealthPort 160 | - "9003" 161 | env: 162 | - name: PD_ENABLED 163 | value: "true" 164 | - name: PD_PROMPT_LEN_THRESHOLD 165 | value: "10" 166 | image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 167 | imagePullPolicy: Always 168 | livenessProbe: 169 | failureThreshold: 3 170 | grpc: 171 | port: 9003 172 | service: {{ .EPPServiceName }} 173 | initialDelaySeconds: 5 174 | periodSeconds: 10 175 | successThreshold: 1 176 | timeoutSeconds: 1 177 | ports: 178 | - containerPort: 9002 179 | protocol: TCP 180 | - containerPort: 9003 181 | protocol: TCP 182 | - containerPort: 9090 183 | name: metrics 184 | protocol: TCP 185 | readinessProbe: 186 | failureThreshold: 3 187 | grpc: 188 | port: 9003 189 | service: {{ .EPPServiceName }} 190 | initialDelaySeconds: 5 191 | periodSeconds: 10 192 | successThreshold: 1 193 | timeoutSeconds: 1 194 | 195 | inferencePool: | 196 | apiVersion: inference.networking.x-k8s.io/v1alpha2 197 | kind: InferencePool 198 | spec: 199 | targetPortNumber: {{ "app_port" | getPort }} 200 | 201 | inferenceModel: | 202 | apiVersion: inference.networking.x-k8s.io/v1alpha2 203 | kind: InferenceModel 204 | -------------------------------------------------------------------------------- /samples/msvcs/facebook-nixl.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | name: facebook-opt-125m-nixl 5 | spec: 6 | decoupleScaling: false 7 | 8 | baseConfigMapRef: 9 | name: universal-base-config 10 | 11 | routing: 12 | # This is the model name for the OpenAI request 13 | modelName: facebook/opt-125m 14 | ports: 15 | - name: app_port 16 | port: 8000 17 | - name: internal_port 18 | port: 8200 19 | 20 | modelArtifacts: 21 | # When specfying the URI with `hf` prefix, the / string 22 | # is extracted and exposed as a template variable that can be used as {{ .HFModelName }} 23 | uri: hf://facebook/opt-125m 24 | 25 | # describe decode pods 26 | decode: 27 | replicas: 1 28 | acceleratorTypes: 29 | labelKey: 
nvidia.com/gpu.product 30 | labelValues: 31 | - NVIDIA-A100-SXM4-80GB 32 | containers: 33 | - name: "vllm" 34 | # The baseconfig image includes LMCache and multiconnector support 35 | args: 36 | - "{{ .HFModelName }}" 37 | 38 | # describe the prefill pods 39 | prefill: 40 | replicas: 1 41 | acceleratorTypes: 42 | labelKey: nvidia.com/gpu.product 43 | labelValues: 44 | - NVIDIA-A100-SXM4-80GB 45 | containers: 46 | - name: "vllm" 47 | args: 48 | - "{{ .HFModelName }}" 49 | 50 | -------------------------------------------------------------------------------- /samples/msvcs/granite3.2.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | name: granite-base-model 5 | spec: 6 | decoupleScaling: false 7 | 8 | baseConfigMapRef: 9 | name: simple-base-config 10 | 11 | routing: 12 | modelName: ibm-granite/granite-3.3-2b-base 13 | ports: 14 | - name: app_port 15 | port: 8000 16 | 17 | modelArtifacts: 18 | uri: hf://ibm-granite/granite-3.3-2b-base 19 | 20 | # describe decode pods 21 | decode: 22 | replicas: 1 23 | # acceleratorTypes: 24 | # labelKey: nvidia.com/gpu.product 25 | # labelValues: 26 | # - NVIDIA-A100-SXM4-80GB 27 | containers: 28 | - name: "vllm" 29 | args: 30 | - "{{ .HFModelName }}" 31 | 32 | -------------------------------------------------------------------------------- /samples/msvcs/llama4.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | name: meta-llama-4-scout-17b-16e 5 | spec: 6 | decoupleScaling: false 7 | 8 | baseConfigMapRef: 9 | name: universal-base-config-pvc 10 | 11 | routing: 12 | modelName: meta-llama/Llama-4-Scout-17B-16E 13 | ports: 14 | - name: app_port 15 | port: 8000 16 | - name: internal_port 17 | port: 8200 18 | 19 | modelArtifacts: 20 | # When specifying the URI with `pvc` prefix, the string after the pvc name (llama-pvc) 21 | # is extracted and exposed as a template variable that can be used as {{ .ModelPath }} 22 | uri: pvc://llama-pvc/path/to/llama4 23 | 24 | # describe decode pods 25 | decode: 26 | replicas: 1 27 | parallelism: 28 | tensor: 8 29 | containers: 30 | - name: "vllm" 31 | args: 32 | # Comes from model-storage volume, which is a PVC created by MSVC controller 33 | # The mountPath is /cache 34 | # {{ .ModelPath }} == path/to/llama4 35 | - '/cache/{{ .ModelPath }}' 36 | 37 | # Other args come from https://blog.vllm.ai/2025/04/05/llama4.html 38 | # This is for reference only 39 | # Modify the args as you wish 40 | - "--tensor-parallel-size" 41 | - "8" 42 | - "--max-model-len" 43 | - "1000000" 44 | - "--override-generation-config='{\"attn_temperature_tuning\": true}'" 45 | 46 | acceleratorTypes: 47 | labelKey: nvidia.com/gpu.product 48 | labelValues: 49 | # According to the blog, Scout requires H100s 50 | - NVIDIA-H100 51 | 52 | # describe the prefill pods 53 | prefill: 54 | replicas: 1 55 | parallelism: 56 | tensor: 8 57 | containers: 58 | - name: "vllm" 59 | args: 60 | - '/cache/{{ .ModelPath }}' 61 | - "--tensor-parallel-size" 62 | - "8" 63 | - "--max-model-len" 64 | - "1000000" 65 | - "--override-generation-config='{\"attn_temperature_tuning\": true}'" 66 | 67 | acceleratorTypes: 68 | labelKey: nvidia.com/gpu.product 69 | labelValues: 70 | - NVIDIA-H100 71 | -------------------------------------------------------------------------------- /samples/msvcs/xpyd.yaml:
-------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | name: facebook-opt-125m-xpyd 5 | spec: 6 | decoupleScaling: false 7 | 8 | baseConfigMapRef: 9 | name: universal-base-config 10 | 11 | routing: 12 | modelName: facebook/opt-125m 13 | ports: 14 | - name: app_port 15 | port: 8000 16 | - name: internal_port 17 | port: 8200 18 | 19 | modelArtifacts: 20 | uri: pvc://facebook-pvc/path/to/opt-125m 21 | 22 | # describe decode pods 23 | decode: 24 | # Note a different replica count from spec.prefill.replicas 25 | replicas: 2 26 | containers: 27 | - name: "vllm" 28 | args: 29 | # Comes from baseconfig's volume mounts path 30 | - '/stored/models/{{ .ModelPath }}' 31 | acceleratorTypes: 32 | labelKey: nvidia.com/gpu.product 33 | labelValues: 34 | - NVIDIA-A100-SXM4-80GB 35 | 36 | # describe the prefill pods 37 | prefill: 38 | replicas: 1 39 | containers: 40 | - name: "vllm" 41 | args: 42 | # Comes from baseconfig's volume mounts path 43 | - '/stored/models/{{ .ModelPath }}' 44 | acceleratorTypes: 45 | labelKey: nvidia.com/gpu.product 46 | labelValues: 47 | - NVIDIA-A100-SXM4-80GB 48 | -------------------------------------------------------------------------------- /samples/test/README.md: -------------------------------------------------------------------------------- 1 | The files here are used for local development only. -------------------------------------------------------------------------------- /samples/test/baseconfig.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: basic-basic-conf 5 | data: 6 | decodeDeployment: | 7 | spec: 8 | replicas: 2 9 | template: 10 | spec: 11 | containers: 12 | - name: llm 13 | command: 14 | - sleep 15 | 16 | # Note that this label is preserved and our labels are added 17 | decodeService: | 18 | spec: 19 | selector: 20 | app.kubernetes.io/name: decodeServiceLabelInBaseConfig 21 | ports: 22 | - protocol: TCP 23 | port: {{ "inport" | getPort }} 24 | targetPort: {{ "outport" | getPort }} 25 | 26 | # This service should not be created because prefill doesn't exist in basemsvc.yaml 27 | prefillService: | 28 | spec: 29 | selector: 30 | app.kubernetes.io/name: prefillServiceLabelInBaseConfig 31 | ports: 32 | - protocol: TCP 33 | port: {{ "inport" | getPort }} 34 | targetPort: {{ "outport" | getPort }} 35 | 36 | inferenceModel: | 37 | spec: 38 | criticality: Standard 39 | 40 | inferencePool: | 41 | spec: 42 | targetPortNumber: {{ "outport" | getPort }} 43 | eppDeployment: | 44 | apiVersion: apps/v1 45 | kind: Deployment 46 | metadata: 47 | name: epp 48 | namespace: default 49 | spec: 50 | replicas: 1 51 | template: 52 | spec: 53 | # Conservatively, this timeout should mirror the longest grace period of the pods within the pool 54 | terminationGracePeriodSeconds: 130 55 | containers: 56 | - name: epp 57 | image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main 58 | imagePullPolicy: Always 59 | args: 60 | - -poolName 61 | - my-pool-name 62 | - -poolNamespace 63 | - my-pool-namespace 64 | - -v 65 | - "4" 66 | - --zap-encoder 67 | - "json" 68 | - -grpcPort 69 | - "9002" 70 | - -grpcHealthPort 71 | - "9003" 72 | env: 73 | - name: USE_STREAMING 74 | value: "true" 75 | ports: 76 | - containerPort: 9002 77 | - containerPort: 9003 78 | - name: metrics 79 | containerPort: 9090 80 | livenessProbe: 81 | grpc: 82 | port: 9003 83 | service:
inference-extension 84 | initialDelaySeconds: 5 85 | periodSeconds: 10 86 | readinessProbe: 87 | grpc: 88 | port: 9003 89 | service: inference-extension 90 | initialDelaySeconds: 5 91 | periodSeconds: 10 92 | eppService: | 93 | apiVersion: v1 94 | kind: Service 95 | metadata: 96 | name: llm-llama3-8b-instruct-epp 97 | namespace: default 98 | spec: 99 | selector: 100 | app: llm-llama3-8b-instruct-epp 101 | ports: 102 | - protocol: TCP 103 | port: 9002 104 | targetPort: 9002 105 | appProtocol: http2 106 | type: ClusterIP 107 | 108 | httpRoute: | 109 | apiVersion: gateway.networking.k8s.io/v1 110 | kind: HTTPRoute 111 | spec: 112 | parentRefs: 113 | - name: inference-gateway-name 114 | port: 12345 115 | rules: 116 | - matches: 117 | - path: 118 | type: PathPrefix 119 | value: / 120 | - backendRefs: 121 | - group: inference.networking.x-k8s.io 122 | kind: InferencePool 123 | name: {{ .InferencePoolName }} 124 | port: {{ "outport" | getPort }} 125 | -------------------------------------------------------------------------------- /samples/test/msvc-hf.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | name: busybox 5 | spec: 6 | decoupleScaling: false 7 | 8 | baseConfigMapRef: 9 | name: basic-basic-conf 10 | 11 | routing: 12 | modelName: ibm-granite/granite-3.3-2b-instruct 13 | ports: 14 | - name: inport 15 | port: 80 16 | - name: outport 17 | port: 9376 18 | 19 | modelArtifacts: 20 | uri: hf://ibm-granite/granite-3.3-2b-instruct 21 | authSecretName: hf-secret 22 | size: 5Gi 23 | 24 | # describe decode pods 25 | decode: 26 | replicas: 1 27 | containers: 28 | - name: "sidecar" 29 | image: "nginx" 30 | - name: "llm" 31 | image: busybox 32 | args: 33 | - "{{ .HFModelName }}" 34 | - "{{ .MountedModelPath }}" 35 | mountModelVolume: true -------------------------------------------------------------------------------- /samples/test/msvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | name: busybox 5 | spec: 6 | decoupleScaling: false 7 | 8 | baseConfigMapRef: 9 | name: basic-basic-conf 10 | 11 | routing: 12 | modelName: llama-2075 13 | ports: 14 | - name: inport 15 | port: 80 16 | - name: outport 17 | port: 9376 18 | gatewayRefs: 19 | - name: inference-gateway-name 20 | port: 1112 21 | 22 | modelArtifacts: 23 | uri: pvc://llama-of-the-future/path/to/llama-2075 24 | 25 | # describe decode pods 26 | decode: 27 | replicas: 1 28 | initContainers: 29 | - name: "proxy" 30 | image: "busybox" 31 | args: 32 | - "{{ .ModelPath }}" 33 | mountModelVolume: true 34 | 35 | containers: 36 | - name: "llm" 37 | image: busybox 38 | args: 39 | - "{{ .ModelName }}" 40 | 41 | endpointPicker: 42 | containers: 43 | - name: "epp" 44 | env: 45 | - name: HF_TOKEN 46 | value: hello 47 | - name: USE_STREAMING 48 | value: "false" -------------------------------------------------------------------------------- /test/e2e/e2e_suite_test.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | "os/exec" 8 | "path/filepath" 9 | "testing" 10 | 11 | . "github.com/onsi/ginkgo/v2" 12 | . 
"github.com/onsi/gomega" 13 | "k8s.io/apimachinery/pkg/runtime" 14 | "k8s.io/client-go/rest" 15 | "k8s.io/client-go/tools/clientcmd" 16 | "sigs.k8s.io/controller-runtime/pkg/client" 17 | 18 | msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1" 19 | "github.com/llm-d/llm-d-model-service/test/utils" 20 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 21 | giev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" 22 | ) 23 | 24 | var ( 25 | // Optional Environment Variables: 26 | // - CERT_MANAGER_INSTALL_SKIP=true: Skips CertManager installation during test setup. 27 | // These variables are useful if CertManager is already installed, avoiding 28 | // re-installation and conflicts. 29 | skipCertManagerInstall = os.Getenv("CERT_MANAGER_INSTALL_SKIP") == "true" 30 | // isCertManagerAlreadyInstalled will be set true when CertManager CRDs be found on the cluster 31 | isCertManagerAlreadyInstalled = false 32 | 33 | // projectImage is the name of the image which will be build and loaded 34 | // with the code source changes to be tested. 35 | projectImage = "llm-d.ai/modelservice:v0.0.1" 36 | imageArchive = "/tmp/llm-d.ai-modelservice-v0.0.1.tar" 37 | testCluster = "kind-modelservice-test" 38 | kindImage = "kindest/node:v1.32.0@sha256:c48c62eac5da28cdadcf560d1d8616cfa6783b58f0d94cf63ad1bf49600cb027" 39 | ) 40 | 41 | // TestE2E runs the end-to-end (e2e) test suite for the project. These tests execute in an isolated, 42 | // temporary environment to validate project changes with the purposed to be used in CI jobs. 43 | // The default setup requires Kind, builds/loads the Manager Docker image locally, and installs 44 | // CertManager. 45 | func TestE2E(t *testing.T) { 46 | RegisterFailHandler(Fail) 47 | _, _ = fmt.Fprintf(GinkgoWriter, "Starting modelservice integration test suite\n") 48 | RunSpecs(t, "e2e suite") 49 | } 50 | 51 | var ( 52 | k8sClient client.Client 53 | cfg *rest.Config 54 | ctx = context.TODO() 55 | ) 56 | 57 | var _ = BeforeSuite(func() { 58 | By("deleting kind cluster if it exists") 59 | cmd := exec.Command("kind", "delete", "cluster", "--name", testCluster) 60 | _, _ = utils.Run(cmd) 61 | // ignore problems 62 | 63 | By("creating Kind cluster") 64 | cmd = exec.Command("kind", "create", "cluster", "--image", kindImage, "--name", testCluster) 65 | _, err := utils.Run(cmd) 66 | ExpectWithOffset(1, err).NotTo( 67 | HaveOccurred(), 68 | fmt.Sprintf("Failed to create Kind cluster %s", testCluster), 69 | ) 70 | 71 | var kubeconfig string 72 | if os.Getenv("KUBECONFIG") != "" { 73 | kubeconfig = os.Getenv("KUBECONFIG") 74 | } else { 75 | homeDir, _ := os.UserHomeDir() 76 | kubeconfig = filepath.Join(homeDir, ".kube", "config") 77 | } 78 | cfg, err = clientcmd.BuildConfigFromFlags("", kubeconfig) 79 | Expect(err).ToNot(HaveOccurred(), "Failed to build kubeconfig") 80 | var scheme = runtime.NewScheme() 81 | 82 | err = clientgoscheme.AddToScheme(scheme) 83 | Expect(err).NotTo(HaveOccurred()) 84 | 85 | Expect(msv1alpha1.AddToScheme(scheme)).To(Succeed()) 86 | Expect(giev1alpha2.Install(scheme)).To(Succeed()) 87 | k8sClient, err = client.New(cfg, client.Options{Scheme: scheme}) 88 | Expect(err).ToNot(HaveOccurred(), "Failed to create k8s client") 89 | 90 | By("building the manager(Operator) image") 91 | cmd = exec.Command("make", "docker-build", fmt.Sprintf("IMG=%s", projectImage)) 92 | _, err = utils.Run(cmd) 93 | ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to build the manager(Operator) image") 94 | 95 | By("archiving the image") 96 | cmd = 
exec.Command("make", "archive-image", fmt.Sprintf("IMG=%s", projectImage), fmt.Sprintf("IMG_ARCHIVE=%s", imageArchive)) 97 | _, err = utils.Run(cmd) 98 | ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to archive the manager(operator) image") 99 | 100 | // TODO(user): If you want to change the e2e test vendor from Kind, ensure the image is 101 | // built and available before running the tests. Also, remove the following block. 102 | By("loading the manager(Operator) image on Kind") 103 | err = utils.LoadImageToKindClusterWithName(imageArchive, testCluster) 104 | ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to load the manager(Operator) image into Kind") 105 | 106 | // The tests-e2e are intended to run on a temporary cluster that is created and destroyed for testing. 107 | // To prevent errors when tests run in environments with CertManager already installed, 108 | // we check for its presence before execution. 109 | // Setup CertManager before the suite if not skipped and if not already installed 110 | if !skipCertManagerInstall { 111 | By("checking if cert manager is installed already") 112 | isCertManagerAlreadyInstalled = utils.IsCertManagerCRDsInstalled() 113 | if !isCertManagerAlreadyInstalled { 114 | _, _ = fmt.Fprintf(GinkgoWriter, "Installing CertManager...\n") 115 | Expect(utils.InstallCertManager()).To(Succeed(), "Failed to install CertManager") 116 | } else { 117 | _, _ = fmt.Fprintf(GinkgoWriter, "WARNING: CertManager is already installed. Skipping installation...\n") 118 | } 119 | } 120 | }) 121 | 122 | var _ = AfterSuite(func() { 123 | // Teardown CertManager after the suite if not skipped and if it was not already installed 124 | if !skipCertManagerInstall && !isCertManagerAlreadyInstalled { 125 | _, _ = fmt.Fprintf(GinkgoWriter, "Uninstalling CertManager...\n") 126 | utils.UninstallCertManager() 127 | } 128 | 129 | // delete test cluster 130 | cmd := exec.Command("kind", "delete", "cluster", "--name", testCluster) 131 | _, _ = utils.Run(cmd) 132 | 133 | }) 134 | -------------------------------------------------------------------------------- /test/modelservices/baseResources.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: br1.yaml 5 | data: 6 | 7 | -------------------------------------------------------------------------------- /test/modelservices/ms1.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | name: test-modelservice 5 | namespace: test-namespace 6 | spec: 7 | routing: 8 | modelName: repo/model 9 | modelArtifacts: 10 | uri: "pvc://pvc-name/path/to/model" 11 | decoupleScaling: false 12 | decode: 13 | containers: 14 | - name: llm-proxy 15 | image: "ghcr.io/llm-d/llm-d-routingsidecar-dev:0.0.5" 16 | imagePullPolicy: "Always" 17 | - name: llm-container 18 | image: "ghcr.io/llm-d/llm-d-dev:0.0.2" 19 | 20 | -------------------------------------------------------------------------------- /test/utils/utils.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "fmt" 7 | "os" 8 | "os/exec" 9 | "strings" 10 | 11 | . 
"github.com/onsi/ginkgo/v2" //nolint:staticcheck,golint,revive 12 | ) 13 | 14 | const ( 15 | prometheusOperatorVersion = "v0.77.1" 16 | prometheusOperatorURL = "https://github.com/prometheus-operator/prometheus-operator/" + 17 | "releases/download/%s/bundle.yaml" 18 | 19 | certmanagerVersion = "v1.16.3" 20 | certmanagerURLTmpl = "https://github.com/cert-manager/cert-manager/releases/download/%s/cert-manager.yaml" 21 | ) 22 | 23 | func warnError(err error) { 24 | _, _ = fmt.Fprintf(GinkgoWriter, "warning: %v\n", err) 25 | } 26 | 27 | // Run executes the provided command within this context 28 | func Run(cmd *exec.Cmd) (string, error) { 29 | dir, _ := GetProjectDir() 30 | cmd.Dir = dir 31 | 32 | if err := os.Chdir(cmd.Dir); err != nil { 33 | _, _ = fmt.Fprintf(GinkgoWriter, "chdir dir: %s\n", err) 34 | } 35 | 36 | cmd.Env = append(os.Environ(), "GO111MODULE=on") 37 | command := strings.Join(cmd.Args, " ") 38 | _, _ = fmt.Fprintf(GinkgoWriter, "running: %s\n", command) 39 | output, err := cmd.CombinedOutput() 40 | if err != nil { 41 | return string(output), fmt.Errorf("%s failed with error: (%v) %s", command, err, string(output)) 42 | } 43 | 44 | return string(output), nil 45 | } 46 | 47 | // InstallPrometheusOperator installs the prometheus Operator to be used to export the enabled metrics. 48 | func InstallPrometheusOperator() error { 49 | url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) 50 | cmd := exec.Command("kubectl", "create", "-f", url) 51 | _, err := Run(cmd) 52 | return err 53 | } 54 | 55 | // UninstallPrometheusOperator uninstalls the prometheus 56 | func UninstallPrometheusOperator() { 57 | url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) 58 | cmd := exec.Command("kubectl", "delete", "-f", url) 59 | if _, err := Run(cmd); err != nil { 60 | warnError(err) 61 | } 62 | } 63 | 64 | // IsPrometheusCRDsInstalled checks if any Prometheus CRDs are installed 65 | // by verifying the existence of key CRDs related to Prometheus. 66 | func IsPrometheusCRDsInstalled() bool { 67 | // List of common Prometheus CRDs 68 | prometheusCRDs := []string{ 69 | "prometheuses.monitoring.coreos.com", 70 | "prometheusrules.monitoring.coreos.com", 71 | "prometheusagents.monitoring.coreos.com", 72 | } 73 | 74 | cmd := exec.Command("kubectl", "get", "crds", "-o", "custom-columns=NAME:.metadata.name") 75 | output, err := Run(cmd) 76 | if err != nil { 77 | return false 78 | } 79 | crdList := GetNonEmptyLines(output) 80 | for _, crd := range prometheusCRDs { 81 | for _, line := range crdList { 82 | if strings.Contains(line, crd) { 83 | return true 84 | } 85 | } 86 | } 87 | 88 | return false 89 | } 90 | 91 | // UninstallCertManager uninstalls the cert manager 92 | func UninstallCertManager() { 93 | url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) 94 | cmd := exec.Command("kubectl", "delete", "-f", url) 95 | if _, err := Run(cmd); err != nil { 96 | warnError(err) 97 | } 98 | } 99 | 100 | // InstallCertManager installs the cert manager bundle. 101 | func InstallCertManager() error { 102 | url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) 103 | cmd := exec.Command("kubectl", "apply", "-f", url) 104 | if _, err := Run(cmd); err != nil { 105 | return err 106 | } 107 | // Wait for cert-manager-webhook to be ready, which can take time if cert-manager 108 | // was re-installed after uninstalling on a cluster. 
109 | cmd = exec.Command("kubectl", "wait", "deployment.apps/cert-manager-webhook", 110 | "--for", "condition=Available", 111 | "--namespace", "cert-manager", 112 | "--timeout", "5m", 113 | ) 114 | 115 | _, err := Run(cmd) 116 | return err 117 | } 118 | 119 | // IsCertManagerCRDsInstalled checks if any Cert Manager CRDs are installed 120 | // by verifying the existence of key CRDs related to Cert Manager. 121 | func IsCertManagerCRDsInstalled() bool { 122 | // List of common Cert Manager CRDs 123 | certManagerCRDs := []string{ 124 | "certificates.cert-manager.io", 125 | "issuers.cert-manager.io", 126 | "clusterissuers.cert-manager.io", 127 | "certificaterequests.cert-manager.io", 128 | "orders.acme.cert-manager.io", 129 | "challenges.acme.cert-manager.io", 130 | } 131 | 132 | // Execute the kubectl command to get all CRDs 133 | cmd := exec.Command("kubectl", "get", "crds") 134 | output, err := Run(cmd) 135 | if err != nil { 136 | return false 137 | } 138 | 139 | // Check if any of the Cert Manager CRDs are present 140 | crdList := GetNonEmptyLines(output) 141 | for _, crd := range certManagerCRDs { 142 | for _, line := range crdList { 143 | if strings.Contains(line, crd) { 144 | return true 145 | } 146 | } 147 | } 148 | 149 | return false 150 | } 151 | 152 | // LoadImageToKindClusterWithName loads a local docker image to the kind cluster 153 | func LoadImageToKindClusterWithName(imageName string, cluster string) error { 154 | if v, ok := os.LookupEnv("KIND_CLUSTER"); ok { 155 | cluster = v 156 | } 157 | kindOptions := []string{"load", "image-archive", imageName, "--name", cluster} 158 | cmd := exec.Command("kind", kindOptions...) 159 | _, err := Run(cmd) 160 | return err 161 | } 162 | 163 | // GetNonEmptyLines splits the given command output string into individual lines 164 | // according to line breaks, and ignores any empty elements in it. 165 | func GetNonEmptyLines(output string) []string { 166 | var res []string 167 | elements := strings.Split(output, "\n") 168 | for _, element := range elements { 169 | if element != "" { 170 | res = append(res, element) 171 | } 172 | } 173 | 174 | return res 175 | } 176 | 177 | // GetProjectDir will return the directory where the project is 178 | func GetProjectDir() (string, error) { 179 | wd, err := os.Getwd() 180 | if err != nil { 181 | return wd, err 182 | } 183 | wd = strings.ReplaceAll(wd, "/test/e2e", "") 184 | return wd, nil 185 | } 186 | 187 | // UncommentCode searches for target in the file and removes the comment prefix 188 | // of the target content. The target content may span multiple lines. 189 | func UncommentCode(filename, target, prefix string) error { 190 | // false positive 191 | // nolint:gosec 192 | content, err := os.ReadFile(filename) 193 | if err != nil { 194 | return err 195 | } 196 | strContent := string(content) 197 | 198 | idx := strings.Index(strContent, target) 199 | if idx < 0 { 200 | return fmt.Errorf("unable to find the code %s to be uncommented", target) 201 | } 202 | 203 | out := new(bytes.Buffer) 204 | _, err = out.Write(content[:idx]) 205 | if err != nil { 206 | return err 207 | } 208 | 209 | scanner := bufio.NewScanner(bytes.NewBufferString(target)) 210 | if !scanner.Scan() { 211 | return nil 212 | } 213 | for { 214 | _, err := out.WriteString(strings.TrimPrefix(scanner.Text(), prefix)) 215 | if err != nil { 216 | return err 217 | } 218 | // Avoid writing a newline in case the previous line was the last in target.
219 | if !scanner.Scan() { 220 | break 221 | } 222 | if _, err := out.WriteString("\n"); err != nil { 223 | return err 224 | } 225 | } 226 | 227 | _, err = out.Write(content[idx+len(target):]) 228 | if err != nil { 229 | return err 230 | } 231 | // false positive 232 | // nolint:gosec 233 | return os.WriteFile(filename, out.Bytes(), 0644) 234 | } 235 | --------------------------------------------------------------------------------