├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── PULL_REQUEST_TEMPLATE
│   │   └── pull_request_template.md
│   ├── actions
│   │   ├── docker-build-and-push
│   │   │   └── action.yml
│   │   ├── markdown-link-checker
│   │   │   └── action.yaml
│   │   ├── push-image
│   │   │   └── action.yml
│   │   └── trivy-scan
│   │       └── action.yml
│   └── workflows
│       ├── ci-pr-checks.yaml
│       ├── ci-release.yaml
│       └── old
│           └── pipeline-run.orig
├── .gitignore
├── .golangci.yml
├── .version.json
├── Dockerfile
├── LICENSE
├── Makefile
├── PROJECT
├── README.md
├── api
│   └── v1alpha1
│       ├── groupversion_info.go
│       ├── modelservice_types.go
│       └── zz_generated.deepcopy.go
├── cmd
│   ├── generate.go
│   ├── generate_test.go
│   ├── root.go
│   ├── run.go
│   └── suite_test.go
├── config
│   ├── crd
│   │   ├── bases
│   │   │   └── llm-d.ai_modelservices.yaml
│   │   ├── kustomization.yaml
│   │   └── kustomizeconfig.yaml
│   ├── default
│   │   ├── cert_metrics_manager_patch.yaml
│   │   ├── kustomization.yaml
│   │   ├── manager_metrics_patch.yaml
│   │   └── metrics_service.yaml
│   ├── dev
│   │   ├── kustomization.yaml
│   │   └── manager_patch.yaml
│   ├── eppandinference
│   │   ├── inferencepool-e2e.yaml
│   │   └── kustomization.yaml
│   ├── externalcrds
│   │   ├── bases
│   │   │   └── inferencecrds.yaml
│   │   └── kustomization.yaml
│   ├── manager
│   │   ├── kustomization.yaml
│   │   └── manager.yaml
│   ├── network-policy
│   │   ├── allow-metrics-traffic.yaml
│   │   └── kustomization.yaml
│   ├── prometheus
│   │   ├── kustomization.yaml
│   │   ├── monitor.yaml
│   │   └── monitor_tls_patch.yaml
│   ├── rbac
│   │   ├── epp_role_binding.yaml
│   │   ├── kustomization.yaml
│   │   ├── leader_election_role.yaml
│   │   ├── leader_election_role_binding.yaml
│   │   ├── metrics_auth_role.yaml
│   │   ├── metrics_auth_role_binding.yaml
│   │   ├── metrics_reader_role.yaml
│   │   ├── modelservice_admin_role.yaml
│   │   ├── modelservice_editor_role.yaml
│   │   ├── modelservice_viewer_role.yaml
│   │   ├── role.yaml
│   │   ├── role_binding.yaml
│   │   └── service_account.yaml
│   ├── samples
│   │   ├── kustomization.yaml
│   │   └── vllmd_v1alpha1_modelservice.yaml
│   └── summitdemo
│       ├── kustomization.yaml
│       └── manager_patch.yaml
├── deploy
│   ├── common
│   │   ├── patch-service.yaml
│   │   ├── patch-statefulset.yaml
│   │   ├── service.yaml
│   │   └── statefulset.yaml
│   ├── kustomization.yaml
│   ├── openshift
│   │   ├── patch-route.yaml
│   │   └── route.yaml
│   └── rbac
│       ├── exec-rbac-role.yaml
│       ├── exec-rbac-rolebinding.yaml
│       ├── patch-rbac-role.yaml
│       └── patch-rbac-rolebinding.yaml
├── docs
│   ├── api_reference
│   │   ├── config.yaml
│   │   ├── out.asciidoc
│   │   └── out.html
│   ├── apireference.md
│   ├── developer.md
│   ├── install.md
│   ├── userguide.md
│   └── userguide
│       ├── core-concepts.md
│       ├── model-artifacts.md
│       └── model-name.md
├── go.mod
├── go.sum
├── hack
│   └── boilerplate.go.txt
├── hooks
│   └── pre-commit
├── internal
│   └── controller
│       ├── accelerator_types.go
│       ├── accelerator_types_test.go
│       ├── child_resources.go
│       ├── child_resources_test.go
│       ├── constants.go
│       ├── merge_transformers.go
│       ├── merge_transformers_test.go
│       ├── modelservice_controller.go
│       ├── modelservice_controller_test.go
│       ├── suite_test.go
│       ├── template.go
│       ├── template_test.go
│       ├── utils.go
│       └── utils_test.go
├── main.go
├── model-service-arch.excalidraw
├── model-service-arch.png
├── perf
│   └── create_modelservice.sh
├── samples
│   ├── README.md
│   ├── baseconfigs
│   │   ├── simple-baseconfig.yaml
│   │   ├── universal-baseconfig-pvc.yaml
│   │   └── universal-baseconfig.yaml
│   ├── msvcs
│   │   ├── facebook-nixl.yaml
│   │   ├── granite3.2.yaml
│   │   ├── llama4.yaml
│   │   └── xpyd.yaml
│   └── test
│       ├── README.md
│       ├── baseconfig.yaml
│       ├── msvc-hf.yaml
│       └── msvc.yaml
└── test
    ├── e2e
    │   ├── e2e_suite_test.go
    │   └── e2e_test.go
    ├── inferenceCRDs
    │   ├── httproute.yaml
    │   ├── inferencemodel.yaml
    │   └── inferencepool.yaml
    ├── modelservices
    │   ├── baseResources.yaml
    │   └── ms1.yaml
    └── utils
        └── utils.go

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: kind/bug
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Additional context**
Add any other context about the problem here.

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I have the following use-case [...]. For this use-case, I would like `modelservice` to [...]

**Describe the solution approach you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.

--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md:
--------------------------------------------------------------------------------
---
name: Pull request
about: Create a pull request
title: ''
labels: ''
assignees: ''

---

# Pull Request Template

## Description

Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change.

Fixes # (issue)

## Type of change

Please delete options that are not relevant.

- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
- [ ] This change requires a documentation update

## How Has This Been Tested?

Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce.
Please also list any relevant details for your test configuration.

- [ ] Test A
- [ ] Test B

**Test Configuration**:
* OS (if applicable)
* Kubernetes version (if applicable)

## Checklist:

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my own code
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged and published in downstream modules
- [ ] I have checked my code and corrected any misspellings

--------------------------------------------------------------------------------
/.github/actions/docker-build-and-push/action.yml:
--------------------------------------------------------------------------------
name: Docker Build - ghcr
description: Build and push image using buildx
inputs:
  image-name:
    required: true
    description: Image name
  tag:
    required: true
    description: Image tag
  github-token:
    required: true
    description: GitHub token for login
  registry:
    required: true
    description: Container registry (e.g., ghcr.io/llm-d)
runs:
  using: "composite"
  steps:
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    - name: Login to GitHub Container Registry
      run: echo "${{ inputs.github-token }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin
      shell: bash

    - name: Print image info
      run: |
        echo "Image name: ${{ inputs.image-name }}"
        echo "Tag: ${{ inputs.tag }}"
        echo "Registry: ${{ inputs.registry }}"
      shell: bash

    - name: Build and push image
      run: |
        docker buildx build \
          --platform linux/amd64 \
          -t ${{ inputs.registry }}/${{ inputs.image-name }}:${{ inputs.tag }} \
          --push .
      shell: bash

--------------------------------------------------------------------------------
/.github/actions/markdown-link-checker/action.yaml:
--------------------------------------------------------------------------------
name: Markdown Link Checker
description: Checks all Markdown files for broken links
inputs:
  github-token:
    description: GitHub token (not used, but kept for interface compatibility)
    required: false
  args:
    description: Arguments to pass to markdown-link-check
    required: false
    default: "--quiet --retry"

runs:
  using: "composite"
  steps:
    - name: Install markdown-link-check
      shell: bash
      run: npm install -g markdown-link-check

    - name: Run link check on all Markdown files
      shell: bash
      run: |
        set -euo pipefail
        echo "🔍 Scanning all Markdown files for broken links..."
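        # Walk every Markdown file in the repo (NUL-delimited so paths with
        # spaces survive) and tally the dead links that markdown-link-check
        # reports for each file.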
        failed=0
        total_dead_links=0

        while IFS= read -r -d '' file; do
          echo "------------------------------------------------------------"
          echo "📄 Checking: $file"
          # markdown-link-check exits non-zero when a file has dead links;
          # tolerate that here (|| true) so the counting logic below still
          # runs under `set -e` instead of aborting on the first bad file.
          output=$(markdown-link-check ${{ inputs.args }} "$file" 2>&1) || true
          echo "$output"

          if echo "$output" | grep -q '✖'; then
            num_file_dead_links=$(echo "$output" | grep '✖' | wc -l)
            echo "❌ $num_file_dead_links broken links in $file"
            total_dead_links=$((total_dead_links + num_file_dead_links))
            failed=1
          else
            echo "✅ No broken links in $file"
          fi
        done < <(find . -type f -name "*.md" -print0)

        echo "------------------------------------------------------------"
        if [ "$failed" -ne 0 ]; then
          echo "❌ Total broken links found: $total_dead_links"
          exit 1
        else
          echo "✅ All Markdown files passed link checks."
        fi

--------------------------------------------------------------------------------
/.github/actions/push-image/action.yml:
--------------------------------------------------------------------------------
name: Push Docker Image
description: Push built image to container registry
inputs:
  image-name:
    required: true
  tag:
    required: true
  registry:
    required: true
runs:
  using: "composite"
  steps:
    - name: Push image
      run: |
        docker push ${{ inputs.registry }}/${{ inputs.image-name }}:${{ inputs.tag }}
      shell: bash

--------------------------------------------------------------------------------
/.github/actions/trivy-scan/action.yml:
--------------------------------------------------------------------------------
name: Trivy Scan
description: Scan container image with Trivy
inputs:
  image:
    required: true
runs:
  using: "composite"
  steps:
    - name: Install Trivy
      run: |
        wget https://github.com/aquasecurity/trivy/releases/download/v0.44.1/trivy_0.44.1_Linux-64bit.deb
        sudo dpkg -i trivy_0.44.1_Linux-64bit.deb
      shell: bash

    - name: Scan image
      run: |
        trivy image --severity HIGH,CRITICAL --no-progress ${{ inputs.image }}
      shell: bash

--------------------------------------------------------------------------------
/.github/workflows/ci-pr-checks.yaml:
--------------------------------------------------------------------------------
name: CI - PR Checks

on:
  pull_request:
    branches:
      - main

jobs:
  lint-and-test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout source
        uses: actions/checkout@v4

      - name: Sanity check repo contents
        run: ls -la

      - name: Set up go with cache
        uses: actions/setup-go@v5
        with:
          go-version: '1.24.0'
          cache-dependency-path: ./go.sum

      - name: Run markdown link checker
        uses: ./.github/actions/markdown-link-checker
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          args: "--quiet --retry"

      - name: Run lint checks
        uses: golangci/golangci-lint-action@v8
        with:
          version: 'v2.1.6'
          args: "--config=./.golangci.yml"

      - name: Run go test
        shell: bash
        run: |
          make test

--------------------------------------------------------------------------------
/.github/workflows/ci-release.yaml:
--------------------------------------------------------------------------------
name: CI - Release - Docker Container Image

on:
  push:
    tags:
      - 'v*' # Runs when a tag like v0.1.0 is pushed
  release:
types: [published] # Also runs when a GitHub release is published 9 | 10 | jobs: 11 | docker-build-and-push: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout source 15 | uses: actions/checkout@v4 16 | 17 | - name: Set project name from repository 18 | id: version 19 | run: | 20 | repo="${GITHUB_REPOSITORY##*/}" 21 | echo "project_name=$repo" >> "$GITHUB_OUTPUT" 22 | 23 | - name: Print project name 24 | run: echo "Project is ${{ steps.version.outputs.project_name }}" 25 | 26 | - name: Determine tag name 27 | id: tag 28 | run: | 29 | if [[ "${GITHUB_EVENT_NAME}" == "release" ]]; then 30 | echo "tag=${GITHUB_REF##refs/tags/}" >> "$GITHUB_OUTPUT" 31 | elif [[ "${GITHUB_REF}" == refs/tags/* ]]; then 32 | echo "tag=${GITHUB_REF##refs/tags/}" >> "$GITHUB_OUTPUT" 33 | else 34 | echo "tag=latest" >> "$GITHUB_OUTPUT" 35 | fi 36 | shell: bash 37 | 38 | - name: Build and push image 39 | uses: ./.github/actions/docker-build-and-push 40 | with: 41 | tag: ${{ steps.tag.outputs.tag }} 42 | image-name: ${{ steps.version.outputs.project_name }} 43 | registry: ghcr.io/llm-d 44 | github-token: ${{ secrets.GHCR_TOKEN }} 45 | 46 | - name: Run Trivy scan 47 | uses: ./.github/actions/trivy-scan 48 | with: 49 | image: ghcr.io/llm-d/${{ steps.version.outputs.project_name }}:${{ steps.tag.outputs.tag }} 50 | -------------------------------------------------------------------------------- /.github/workflows/old/pipeline-run.orig: -------------------------------------------------------------------------------- 1 | name: CI PipelineRun 2 | 3 | on: 4 | push: 5 | branches: 6 | - dev 7 | - main 8 | pull_request: 9 | 10 | jobs: 11 | pipeline: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout source 15 | uses: actions/checkout@v4 16 | 17 | - name: Sanity check repo contents 18 | run: ls -la 19 | 20 | - name: Run lint checks 21 | uses: ./.github/actions/lint 22 | 23 | - name: Build container image 24 | uses: ./.github/actions/docker-build-and-push 25 | with: 26 | image-name: my-app 27 | tag: ${{ github.sha }} 28 | github-token: ${{ secrets.GHCR_TOKEN }} 29 | 30 | - name: Run Trivy scan 31 | uses: ./.github/actions/trivy-scan 32 | with: 33 | image: ghcr.io/llm-d/my-app:${{ github.sha }} 34 | 35 | # - name: Push image 36 | # if: github.ref == 'refs/heads/main' 37 | # uses: ./.github/actions/push-image 38 | # with: 39 | # image-name: my-app 40 | # tag: ${{ github.sha }} 41 | # registry: ghcr.io/llm-d 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | bin/* 8 | Dockerfile.cross 9 | **/.DS_Store 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Go workspace file 18 | go.work 19 | 20 | # Kubernetes Generated files - skip generated files, except for vendored files 21 | !vendor/**/zz_generated.* 22 | 23 | # editor and IDE paraphernalia 24 | .idea 25 | .vscode 26 | *.swp 27 | *.swo 28 | *~ 29 | 30 | # temp files 31 | eppandinference.yaml 32 | externalcrds.yaml 33 | 34 | 35 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | # Refer to golangci-lint's example config file for more options and information: 2 | # 
https://github.com/golangci/golangci-lint/blob/master/.golangci.reference.yml
version: "2"

run:
  timeout: 5m
  modules-download-mode: readonly

linters:
  enable:
    - errcheck
    - govet
    - staticcheck

issues:
  max-issues-per-linter: 0
  max-same-issues: 0

--------------------------------------------------------------------------------
/.version.json:
--------------------------------------------------------------------------------
{
  "dev-version": "0.0.12",
  "dev-registry": "ghcr.io/llm-d/llm-d-model-service-dev",
  "prod-version": "0.0.11",
  "prod-registry": "ghcr.io/llm-d/llm-d-model-service"
}

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Build the manager binary
FROM quay.io/projectquay/golang:1.24 AS builder
ARG TARGETOS
ARG TARGETARCH

WORKDIR /workspace
# Copy the Go Modules manifests
COPY go.mod go.mod
COPY go.sum go.sum
# cache deps before building and copying source so that we don't need to re-download as much
# and so that source changes don't invalidate our downloaded layer
RUN go mod download

# Copy the go source
COPY main.go main.go
COPY api/ api/
COPY internal/ internal/
COPY cmd/ cmd/

# Build
# GOARCH is left without a default so that the binary is built for the platform on which
# the command was invoked. For example, running `make docker-build` locally on an Apple
# Silicon (M1) machine yields a docker BUILDPLATFORM of linux/arm64, while on Apple x86 it
# is linux/amd64. Leaving it empty ensures the container and the binary shipped in it
# share the same platform.
RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o bin/manager main.go

# Use a minimal base image to package the manager binary
FROM registry.access.redhat.com/ubi9/ubi:latest
WORKDIR /
COPY --from=builder /workspace/bin/manager /manager
USER 65532:65532

ENTRYPOINT ["/manager"]

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022 The Kubernetes Authors 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /PROJECT: -------------------------------------------------------------------------------- 1 | # Code generated by tool. DO NOT EDIT. 2 | # This file is used to track the info used to scaffold your project 3 | # and allow the plugins properly work. 4 | # More info: https://book.kubebuilder.io/reference/project-config.html 5 | domain: llm-d.ai 6 | layout: 7 | - go.kubebuilder.io/v4 8 | projectName: modelservice 9 | repo: github.com/llm-d/llm-d-model-service 10 | resources: 11 | - api: 12 | crdVersion: v1 13 | namespaced: true 14 | controller: true 15 | domain: llm-d.ai 16 | group: llmd 17 | kind: ModelService 18 | path: github.com/llm-d/llm-d-model-service/api/v1alpha1 19 | version: v1alpha1 20 | version: "3" 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ModelService 2 | 3 | > *ModelService* declaratively provisions and maintains the Kubernetes resources needed to serve a base model for inference. 4 | 5 | A *ModelService* custom resource encapsulates the desired state of workloads and routing associated with a single base model. It automates the management of Kubernetes resources, including: 6 | 7 | * Prefill and decode deployments 8 | * Inference pool and model defined by [Gateway API Inference Extension](https://gateway-api-inference-extension.sigs.k8s.io) 9 | * [Endpoint picker (EPP) deployment and service](https://gateway-api-inference-extension.sigs.k8s.io/?h=endpoint#endpoint-selection-extension) 10 | * Relevant RBAC permissions 11 | 12 | A *ModelService* may optionally reference a **BaseConfig** — a Kubernetes ConfigMap that defines reusable, platform-managed presets for shared behavior across multiple base models. 13 | 14 | Typically, platform operators define a small set of *BaseConfig* presets, and base model owners reference them in their respective *ModelService* resources. 15 | 16 | The *ModelService* controller reconciles the cluster state to align with the configuration declared in the *ModelService* custom resource. This custom resource is the source of truth for resources it owns. 17 | 18 | > ⚠️ Important: Do not manually modify resources owned by a *ModelService*. If your use case is not yet supported, please file an issue in the *ModelService* repository. 19 | 20 | ## Features 21 | 22 | ✅ Supports disaggregated prefill and decode workloads 23 | 24 | 🌐 Integrates with Gateway API Inference Extension for request routing 25 | 26 | 📈 Enables auto-scaling via HPA or custom controllers 27 | 28 | 🔧 Allows independent scaling and node affinity for prefill and decode deployments 29 | 30 | 📦 Supports model loading from: 31 | 32 | * HuggingFace (public or private) 33 | * Kubernetes PVCs 34 | * OCI images 35 | 36 | 🧩 Supports value templating in both *BaseConfig* and *ModelService* resources 37 | 38 | ## How It Works 39 | 40 | When a *ModelService* resource is reconciled: 41 | 42 | 1. **Templating**: template variables in *BaseConfig* and *ModelService* are interpolated based on the *ModelService* spec. 43 | 44 | 2. **Merging**: a semantic merge overlays *ModelService* values on top of the selected *BaseConfig*. 45 | 46 | 3. 
**Orchestration**: the controller creates or updates the following resources:

* Inference workloads (prefill and decode deployments)
* Routing resources (e.g., EPP deployment)
* RBAC permissions

The result is a fully managed inference stack for the base model.

![model-service-arch](model-service-arch.png)

## Best Practices

* Use *BaseConfig* to capture platform-level defaults and shared configurations across multiple base models.
* Use *ModelService* to define behavior specific to a given base model, and override *BaseConfig* values only when necessary.
* Platform teams should install *BaseConfig* presets using the `llm-d` deployer.
* Base model owners should prefer these presets to streamline onboarding of base models, rather than creating their own *BaseConfigs*.

## Docs

### [Install](docs/install.md)

### [Samples](./samples/README.md)

### [User Guide](docs/userguide.md)

### [API Reference](docs/apireference.md)

### [Developer](docs/developer.md)

## Roadmap

`ModelService` roadmap features, in no particular order:

1. Multiple base models: create HTTPRoute and related routing configuration

2. LoRA adapters: create a LoRA controller that integrates with `ModelService`

3. Routing weights: allow a logical model to expose multiple model versions via routing weights

4. In-cluster model caching: download model artifacts once into the cluster and reuse them

5. Node-level model caching: pre-load model artifacts onto nodes for fast model loading

6. BaseConfig CRD: migrate `baseconfig` resources from ConfigMaps to a CRD

7. Prometheus metrics exporter: emit controller metrics

8. Enable multi-node inferencing: for instance, using LWS integration

--------------------------------------------------------------------------------
/api/v1alpha1/groupversion_info.go:
--------------------------------------------------------------------------------
// Package v1alpha1 contains API Schema definitions for the llmd v1alpha1 API group.
// +kubebuilder:object:generate=true
// +groupName=llm-d.ai
package v1alpha1

import (
	"k8s.io/apimachinery/pkg/runtime/schema"
	"sigs.k8s.io/controller-runtime/pkg/scheme"
)

var (
	// GroupVersion is group version used to register these objects.
	GroupVersion = schema.GroupVersion{Group: "llm-d.ai", Version: "v1alpha1"}

	// SchemeBuilder is used to add go types to the GroupVersionKind scheme.
	SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}

	// AddToScheme adds the types in this group-version to the given scheme.
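	// As a usage sketch (this mirrors the scheme wiring in cmd/run.go; the
	// variable names here are illustrative only):
	//
	//	scheme := runtime.NewScheme()
	//	utilruntime.Must(v1alpha1.AddToScheme(scheme))
	//	// clients and caches built with this scheme can now decode ModelService objects
	//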
19 | AddToScheme = SchemeBuilder.AddToScheme 20 | ) 21 | -------------------------------------------------------------------------------- /cmd/generate.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | 8 | "github.com/go-logr/logr" 9 | "github.com/spf13/cobra" 10 | zaplog "go.uber.org/zap" 11 | "go.uber.org/zap/zapcore" 12 | corev1 "k8s.io/api/core/v1" 13 | "k8s.io/client-go/kubernetes/scheme" 14 | "sigs.k8s.io/controller-runtime/pkg/log" 15 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 16 | "sigs.k8s.io/yaml" 17 | 18 | msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1" 19 | "github.com/llm-d/llm-d-model-service/internal/controller" 20 | giev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" 21 | gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" 22 | ) 23 | 24 | func readModelService(ctx context.Context, filename string, logger logr.Logger) (*msv1alpha1.ModelService, error) { 25 | var modelService msv1alpha1.ModelService 26 | data, err := os.ReadFile(filename) 27 | if err != nil { 28 | logger.Error(err, "unable to read ModelService from "+filename) 29 | return nil, err 30 | } 31 | 32 | err = yaml.Unmarshal(data, &modelService) 33 | if err != nil { 34 | logger.Error(err, "unable to unmarshal data") 35 | return nil, err 36 | } 37 | 38 | // interpolate MSVC 39 | return controller.InterpolateModelService(ctx, &modelService) 40 | } 41 | 42 | func getBaseChildResources(filename string, msvc *msv1alpha1.ModelService, logger logr.Logger) (*controller.BaseConfig, error) { 43 | var baseChildResourcesConfigMap *corev1.ConfigMap 44 | var baseChildResources *controller.BaseConfig 45 | 46 | if filename != "" { 47 | data, err := os.ReadFile(filename) 48 | if err != nil { 49 | if os.IsNotExist(err) { 50 | logger.Error(err, "unable to read base child resources from "+filename) 51 | return nil, err 52 | } 53 | data = []byte{} 54 | } 55 | 56 | err = yaml.Unmarshal(data, &baseChildResourcesConfigMap) 57 | if err != nil { 58 | logger.Error(err, "unable to unmarshal base child resources") 59 | return nil, err 60 | } 61 | } else { 62 | baseChildResourcesConfigMap = &corev1.ConfigMap{} 63 | } 64 | 65 | interpolated, err := controller.InterpolateBaseConfigMap(context.TODO(), baseChildResourcesConfigMap, msvc) 66 | if err != nil { 67 | logger.Error(err, "cannot interpolate base configmap") 68 | return nil, err 69 | } 70 | 71 | baseChildResources, err = controller.BaseConfigFromCM(interpolated) 72 | if err != nil { 73 | logger.Error(err, "unable to create base child resources from config map") 74 | return nil, err 75 | } 76 | 77 | return baseChildResources, nil 78 | } 79 | 80 | func generateManifests(ctx context.Context, manifestFile string, configFile string) (*string, error) { 81 | logger := log.FromContext(ctx) 82 | 83 | // get msvc from file and interpolate it 84 | msvc, err := readModelService(ctx, manifestFile, logger) 85 | if err != nil { 86 | logger.Error(err, "unable to read ModelService", "location", manifestFile) 87 | return nil, err 88 | } 89 | logger.V(1).Info("generateManifest", "modelService", msvc) 90 | 91 | // get base child resources from file 92 | config, err := getBaseChildResources(configFile, msvc, logger) 93 | if err != nil { 94 | logger.Error(err, "unable to read basic configuration", "location", configFile) 95 | return nil, err 96 | } 97 | logger.V(1).Info("generateManifest", "baseResources", config) 98 | 99 | // create scheme 100 | err = 
msv1alpha1.AddToScheme(scheme.Scheme)
	if err != nil {
		logger.Error(err, "unable to add ModelService types to scheme")
		return nil, err
	}
	err = gatewayv1.Install(scheme.Scheme)
	if err != nil {
		logger.Error(err, "unable to add Gateway API types to scheme")
		return nil, err
	}
	err = giev1alpha2.Install(scheme.Scheme)
	if err != nil {
		logger.Error(err, "unable to add Gateway API inference extension types to scheme")
		return nil, err
	}

	// update child resources
	cR := config.MergeChildResources(ctx, msvc, scheme.Scheme, &rbacOptions)
	logger.V(1).Info("generateManifest", "baseResources", cR)

	yamlStr := ""
	yamlBytes, err := yaml.Marshal(&cR)
	if err != nil {
		logger.Error(err, "unable to marshal object to YAML")
		return nil, err
	}

	yamlStr = string(yamlBytes)
	return &yamlStr, nil
}

var modelServiceManifest string
var baseConfigurationManifest string

var generateCmd = &cobra.Command{
	Use:   "generate",
	Short: "Generate manifest",
	Long:  `Generate manifest for objects created by the ModelService controller`,
	RunE: func(cmd *cobra.Command, args []string) error {
		ctx := context.Background()
		var opts = zap.Options{
			Development: false,
			TimeEncoder: zapcore.RFC3339NanoTimeEncoder,
			ZapOpts:     []zaplog.Option{zaplog.AddCaller()},
			Level:       parseZapLogLevel(logLevel),
		}
		logger := zap.New(zap.UseFlagOptions(&opts))
		log.SetLogger(logger)
		// keep the context returned by IntoContext; discarding it would leave
		// the logger out of the context passed to generateManifests
		ctx = log.IntoContext(ctx, logger)

		result, err := generateManifests(ctx, modelServiceManifest, baseConfigurationManifest)
		if err != nil {
			return err
		}

		fmt.Println(*result)
		return nil
	},
}

func init() {
	generateCmd.Flags().StringVarP(&modelServiceManifest, "modelservice", "m", "", "File containing the ModelService definition.")
	_ = generateCmd.MarkFlagRequired("modelservice")
	generateCmd.Flags().StringVarP(&baseConfigurationManifest, "baseconfig", "b", "", "File containing the base platform configuration.")
	rootCmd.AddCommand(generateCmd)
}

--------------------------------------------------------------------------------
/cmd/generate_test.go:
--------------------------------------------------------------------------------
package cmd

import (
	"context"
	"path/filepath"

	. "github.com/onsi/ginkgo/v2"
	.
"github.com/onsi/gomega" 9 | ) 10 | 11 | var _ = Describe("generate command", func() { 12 | 13 | var ctx context.Context 14 | 15 | BeforeEach(func() { 16 | ctx = context.Background() 17 | }) 18 | 19 | Context("simulate call", func() { 20 | modelServiceYaml := filepath.Join("..", "samples", "test", "msvc.yaml") 21 | baseConfigYaml := filepath.Join("..", "samples", "test", "baseconfig.yaml") 22 | rootCmd.SetArgs([]string{ 23 | "--epp-cluster-role=dummy", 24 | "generate", 25 | "-m", modelServiceYaml, 26 | "-b", baseConfigYaml, 27 | }) 28 | err := rootCmd.Execute() 29 | Expect(err).ToNot(HaveOccurred()) 30 | }) 31 | 32 | Context("call with valid inputs", func() { 33 | modelServiceYaml := filepath.Join("..", "samples", "test", "msvc.yaml") 34 | baseConfigYaml := filepath.Join("..", "samples", "test", "baseconfig.yaml") 35 | It("should generate manifests", func() { 36 | msvc, err := generateManifests(ctx, modelServiceYaml, baseConfigYaml) 37 | Expect(err).To(BeNil()) 38 | Expect(msvc).ToNot(BeNil()) 39 | }) 40 | }) 41 | 42 | Context("call with invalid modelService filename", func() { 43 | modelServiceYaml := filepath.Join(".", "invalid") 44 | baseConfigYaml := filepath.Join("..", "samples", "test", "baseconfig.yaml") 45 | It("should report an error", func() { 46 | _, err := generateManifests(ctx, modelServiceYaml, baseConfigYaml) 47 | Expect(err).ToNot(BeNil()) 48 | }) 49 | }) 50 | 51 | Context("call with invalid modelService content", func() { 52 | modelServiceYaml := filepath.Join("..", "test", "modelservices", "invalidyaml.yaml") 53 | baseConfigYaml := filepath.Join("..", "samples", "test", "baseconfig.yaml") 54 | It("should report an error", func() { 55 | _, err := generateManifests(ctx, modelServiceYaml, baseConfigYaml) 56 | Expect(err).ToNot(BeNil()) 57 | }) 58 | }) 59 | 60 | Context("call with empty baseConfiguration filename", func() { 61 | modelServiceYaml := filepath.Join("..", "samples", "test", "msvc.yaml") 62 | baseConfigYaml := "" 63 | It("should generate manifests", func() { 64 | msvc, err := generateManifests(ctx, modelServiceYaml, baseConfigYaml) 65 | Expect(err).To(BeNil()) 66 | Expect(msvc).ToNot(BeNil()) 67 | }) 68 | }) 69 | 70 | Context("call with invalid baseConfiguration filename", func() { 71 | modelServiceYaml := filepath.Join("..", "samples", "test", "msvc.yaml") 72 | baseConfigYaml := filepath.Join(".", "invalid") 73 | It("should report an error", func() { 74 | _, err := generateManifests(ctx, modelServiceYaml, baseConfigYaml) 75 | Expect(err).ToNot(BeNil()) 76 | }) 77 | }) 78 | 79 | Context("call with invalid baseConfiguration content", func() { 80 | modelServiceYaml := filepath.Join("..", "samples", "test", "msvc.yaml") 81 | baseConfigYaml := filepath.Join("..", "test", "modelservices", "invalidyaml.yaml") 82 | It("should report an error", func() { 83 | _, err := generateManifests(ctx, modelServiceYaml, baseConfigYaml) 84 | Expect(err).ToNot(BeNil()) 85 | }) 86 | }) 87 | }) 88 | -------------------------------------------------------------------------------- /cmd/root.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/llm-d/llm-d-model-service/internal/controller" 7 | "github.com/spf13/cobra" 8 | ) 9 | 10 | // rbac options 11 | var rbacOptions controller.RBACOptions 12 | 13 | // rootCmd represents the base command when called without any subcommands 14 | var rootCmd = &cobra.Command{ 15 | Use: "manager", 16 | Short: "ModelService controller CLI", 17 | Long: `ModelService 
controller CLI`,
}

// Execute adds all child commands to the root command and sets flags appropriately.
// This is called by main.main(). It only needs to happen once to the rootCmd.
func Execute() {
	if err := rootCmd.Execute(); err != nil {
		os.Exit(1)
	}
}

func init() {
	// secrets & cluster roles
	rootCmd.PersistentFlags().StringVar(&rbacOptions.EPPClusterRole, "epp-cluster-role", "", "Name of the epp cluster role")
	_ = rootCmd.MarkPersistentFlagRequired("epp-cluster-role")
	rootCmd.PersistentFlags().StringSliceVar(&rbacOptions.EPPPullSecrets, "epp-pull-secrets", []string{}, "List of pull secrets for configuring the epp deployment")
	rootCmd.PersistentFlags().StringSliceVar(&rbacOptions.PDPullSecrets, "pd-pull-secrets", []string{}, "List of pull secrets for configuring the prefill and decode deployments")
}

--------------------------------------------------------------------------------
/cmd/run.go:
--------------------------------------------------------------------------------
package cmd

import (
	"crypto/tls"
	"fmt"
	"os"
	"path/filepath"

	// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
	// to ensure that exec-entrypoint and run can make use of them.

	_ "k8s.io/client-go/plugin/pkg/client/auth"

	"github.com/spf13/cobra"
	"k8s.io/apimachinery/pkg/runtime"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/certwatcher"
	"sigs.k8s.io/controller-runtime/pkg/healthz"

	zaplog "go.uber.org/zap"
	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
	"sigs.k8s.io/controller-runtime/pkg/webhook"

	msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1"
	"github.com/llm-d/llm-d-model-service/internal/controller"
	"go.uber.org/zap/zapcore"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"
	giev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
	gatewayv1 "sigs.k8s.io/gateway-api/apis/v1"
	// +kubebuilder:scaffold:imports
)

var (
	setupLog = ctrl.Log.WithName("setup")
)

var metricsAddr string
var metricsCertPath, metricsCertName, metricsCertKey string
var webhookCertPath, webhookCertName, webhookCertKey string
var defaultsYAMLPath string
var enableLeaderElection bool
var probeAddr string
var secureMetrics bool
var enableHTTP2 bool
var tlsOpts []func(*tls.Config)

// Flags for zap logger
var logLevel string

func init() {
	// logger
	runCmd.PersistentFlags().StringVarP(&logLevel, "log-level", "l", "info", "Set the logging level (debug, info, warn, error, dpanic, panic, fatal)")

	// added by kubebuilder
	runCmd.Flags().BoolP("toggle", "t", false, "Help message for toggle")
	runCmd.Flags().StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
		"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
	runCmd.Flags().StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
	runCmd.Flags().BoolVar(&enableLeaderElection, "leader-elect", false,
		"Enable leader election for controller manager. 
"+ 64 | "Enabling this will ensure there is only one active controller manager.") 65 | runCmd.Flags().BoolVar(&secureMetrics, "metrics-secure", true, 66 | "If set, the metrics endpoint is served securely via HTTPS. Use --metrics-secure=false to use HTTP instead.") 67 | runCmd.Flags().StringVar(&webhookCertPath, "webhook-cert-path", "", "The directory that contains the webhook certificate.") 68 | runCmd.Flags().StringVar(&webhookCertName, "webhook-cert-name", "tls.crt", "The name of the webhook certificate file.") 69 | runCmd.Flags().StringVar(&webhookCertKey, "webhook-cert-key", "tls.key", "The name of the webhook key file.") 70 | runCmd.Flags().StringVar(&metricsCertPath, "metrics-cert-path", "", 71 | "The directory that contains the metrics server certificate.") 72 | runCmd.Flags().StringVar(&metricsCertName, "metrics-cert-name", "tls.crt", "The name of the metrics server certificate file.") 73 | runCmd.Flags().StringVar(&metricsCertKey, "metrics-cert-key", "tls.key", "The name of the metrics server key file.") 74 | runCmd.Flags().BoolVar(&enableHTTP2, "enable-http2", false, 75 | "If set, HTTP/2 will be enabled for the metrics and webhook servers") 76 | runCmd.Flags().StringVar(&defaultsYAMLPath, "defaults-yaml-path", "", "The YAML file containing the controller defaults.") 77 | 78 | rootCmd.AddCommand(runCmd) 79 | } 80 | 81 | // nolint:gocyclo 82 | func runController() { 83 | scheme := runtime.NewScheme() 84 | utilruntime.Must(clientgoscheme.AddToScheme(scheme)) 85 | 86 | utilruntime.Must(msv1alpha1.AddToScheme(scheme)) 87 | utilruntime.Must(gatewayv1.Install(scheme)) 88 | utilruntime.Must(giev1alpha2.Install(scheme)) 89 | var opts = zap.Options{ 90 | Development: false, 91 | TimeEncoder: zapcore.RFC3339NanoTimeEncoder, 92 | ZapOpts: []zaplog.Option{zaplog.AddCaller()}, 93 | Level: parseZapLogLevel(logLevel), 94 | } 95 | // +kubebuilder:scaffold:scheme 96 | ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) 97 | 98 | // if the enable-http2 flag is false (the default), http/2 should be disabled 99 | // due to its vulnerabilities. More specifically, disabling http/2 will 100 | // prevent from being vulnerable to the HTTP/2 Stream Cancellation and 101 | // Rapid Reset CVEs. 
For more information see: 102 | // - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 103 | // - https://github.com/advisories/GHSA-4374-p667-p6c8 104 | disableHTTP2 := func(c *tls.Config) { 105 | setupLog.Info("disabling http/2") 106 | c.NextProtos = []string{"http/1.1"} 107 | } 108 | 109 | if !enableHTTP2 { 110 | tlsOpts = append(tlsOpts, disableHTTP2) 111 | } 112 | 113 | // Create watchers for metrics and webhooks certificates 114 | var metricsCertWatcher, webhookCertWatcher *certwatcher.CertWatcher 115 | 116 | // Initial webhook TLS options 117 | webhookTLSOpts := tlsOpts 118 | 119 | if len(webhookCertPath) > 0 { 120 | setupLog.Info("Initializing webhook certificate watcher using provided certificates", 121 | "webhook-cert-path", webhookCertPath, "webhook-cert-name", webhookCertName, "webhook-cert-key", webhookCertKey) 122 | 123 | var err error 124 | webhookCertWatcher, err = certwatcher.New( 125 | filepath.Join(webhookCertPath, webhookCertName), 126 | filepath.Join(webhookCertPath, webhookCertKey), 127 | ) 128 | if err != nil { 129 | setupLog.Error(err, "Failed to initialize webhook certificate watcher") 130 | os.Exit(1) 131 | } 132 | 133 | webhookTLSOpts = append(webhookTLSOpts, func(config *tls.Config) { 134 | config.GetCertificate = webhookCertWatcher.GetCertificate 135 | }) 136 | } 137 | 138 | webhookServer := webhook.NewServer(webhook.Options{ 139 | TLSOpts: webhookTLSOpts, 140 | }) 141 | 142 | // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server. 143 | // More info: 144 | // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.20.4/pkg/metrics/server 145 | // - https://book.kubebuilder.io/reference/metrics.html 146 | metricsServerOptions := metricsserver.Options{ 147 | BindAddress: metricsAddr, 148 | SecureServing: secureMetrics, 149 | TLSOpts: tlsOpts, 150 | } 151 | 152 | if secureMetrics { 153 | // FilterProvider is used to protect the metrics endpoint with authn/authz. 154 | // These configurations ensure that only authorized users and service accounts 155 | // can access the metrics endpoint. The RBAC are configured in 'config/rbac/kustomization.yaml'. More info: 156 | // https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.20.4/pkg/metrics/filters#WithAuthenticationAndAuthorization 157 | metricsServerOptions.FilterProvider = filters.WithAuthenticationAndAuthorization 158 | } 159 | 160 | // If the certificate is not specified, controller-runtime will automatically 161 | // generate self-signed certificates for the metrics server. While convenient for development and testing, 162 | // this setup is not recommended for production. 163 | // 164 | // TODO(user): If you enable certManager, uncomment the following lines: 165 | // - [METRICS-WITH-CERTS] at config/default/kustomization.yaml to generate and use certificates 166 | // managed by cert-manager for the metrics server. 167 | // - [PROMETHEUS-WITH-CERTS] at config/prometheus/kustomization.yaml for TLS certification. 
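	// For example (hypothetical local invocation; these flags are registered in
	// init above, and tls.crt/tls.key match the flag defaults):
	//
	//	./manager run --metrics-bind-address=:8443 --metrics-cert-path=/tmp/k8s-metrics-server/metrics-certs
	//
	// The watcher set up below then serves HTTPS metrics with those certificates
	// and reloads them when they are rotated.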
168 | if len(metricsCertPath) > 0 { 169 | setupLog.Info("Initializing metrics certificate watcher using provided certificates", 170 | "metrics-cert-path", metricsCertPath, "metrics-cert-name", metricsCertName, "metrics-cert-key", metricsCertKey) 171 | 172 | var err error 173 | metricsCertWatcher, err = certwatcher.New( 174 | filepath.Join(metricsCertPath, metricsCertName), 175 | filepath.Join(metricsCertPath, metricsCertKey), 176 | ) 177 | if err != nil { 178 | setupLog.Error(err, "to initialize metrics certificate watcher", "error", err) 179 | os.Exit(1) 180 | } 181 | 182 | metricsServerOptions.TLSOpts = append(metricsServerOptions.TLSOpts, func(config *tls.Config) { 183 | config.GetCertificate = metricsCertWatcher.GetCertificate 184 | }) 185 | } 186 | 187 | mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ 188 | Scheme: scheme, 189 | Metrics: metricsServerOptions, 190 | WebhookServer: webhookServer, 191 | HealthProbeBindAddress: probeAddr, 192 | LeaderElection: enableLeaderElection, 193 | LeaderElectionID: "f01a4b9d.llm-d.ai", 194 | // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily 195 | // when the Manager ends. This requires the binary to immediately end when the 196 | // Manager is stopped, otherwise, this setting is unsafe. Setting this significantly 197 | // speeds up voluntary leader transitions as the new leader don't have to wait 198 | // LeaseDuration time first. 199 | // 200 | // In the default scaffold provided, the program ends immediately after 201 | // the manager stops, so would be fine to enable this option. However, 202 | // if you are doing or is intended to do any operation such as perform cleanups 203 | // after the manager stops then its usage might be unsafe. 204 | // LeaderElectionReleaseOnCancel: true, 205 | }) 206 | if err != nil { 207 | setupLog.Error(err, "unable to start manager") 208 | os.Exit(1) 209 | } 210 | 211 | // Step 1: Read in the modelServiceDefaults struct 212 | // Pass that into Reconciler below 213 | 214 | if err = (&controller.ModelServiceReconciler{ 215 | Client: mgr.GetClient(), 216 | Scheme: mgr.GetScheme(), 217 | RBACOptions: rbacOptions, 218 | // Defaults: &modelServiceDefaults // from above 219 | }).SetupWithManager(mgr); err != nil { 220 | setupLog.Error(err, "unable to create controller", "controller", "ModelService") 221 | os.Exit(1) 222 | } 223 | // +kubebuilder:scaffold:builder 224 | 225 | if metricsCertWatcher != nil { 226 | setupLog.Info("Adding metrics certificate watcher to manager") 227 | if err := mgr.Add(metricsCertWatcher); err != nil { 228 | setupLog.Error(err, "unable to add metrics certificate watcher to manager") 229 | os.Exit(1) 230 | } 231 | } 232 | 233 | if webhookCertWatcher != nil { 234 | setupLog.Info("Adding webhook certificate watcher to manager") 235 | if err := mgr.Add(webhookCertWatcher); err != nil { 236 | setupLog.Error(err, "unable to add webhook certificate watcher to manager") 237 | os.Exit(1) 238 | } 239 | } 240 | 241 | if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { 242 | setupLog.Error(err, "unable to set up health check") 243 | os.Exit(1) 244 | } 245 | if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { 246 | setupLog.Error(err, "unable to set up ready check") 247 | os.Exit(1) 248 | } 249 | 250 | setupLog.Info("starting manager") 251 | if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { 252 | setupLog.Error(err, "problem running manager") 253 | os.Exit(1) 254 | } 255 | } 256 | 257 | // runCmd represents the base 
subcommand, which starts the ModelService controller 258 | var runCmd = &cobra.Command{ 259 | Use: "run", 260 | Short: "Run the ModelService controller", 261 | Long: `Run the ModelService controller`, 262 | PersistentPreRunE: func(cmd *cobra.Command, args []string) error { 263 | if len(rbacOptions.EPPClusterRole) < 1 { 264 | err := fmt.Errorf("valid EPP cluster role is required") 265 | return err 266 | } 267 | return nil 268 | }, 269 | Run: func(cmd *cobra.Command, args []string) { 270 | runController() 271 | }, 272 | } 273 | 274 | func parseZapLogLevel(levelStr string) zapcore.Level { 275 | switch levelStr { 276 | case "debug": 277 | return zapcore.DebugLevel 278 | case "info": 279 | return zapcore.InfoLevel 280 | case "warn": 281 | return zapcore.WarnLevel 282 | case "error": 283 | return zapcore.ErrorLevel 284 | case "dpanic": 285 | return zapcore.DPanicLevel 286 | case "panic": 287 | return zapcore.PanicLevel 288 | case "fatal": 289 | return zapcore.FatalLevel 290 | default: 291 | return zapcore.InfoLevel 292 | } 293 | } 294 | -------------------------------------------------------------------------------- /cmd/suite_test.go: -------------------------------------------------------------------------------- 1 | package cmd 2 | 3 | import ( 4 | "testing" 5 | 6 | . "github.com/onsi/ginkgo/v2" 7 | . "github.com/onsi/gomega" 8 | 9 | logf "sigs.k8s.io/controller-runtime/pkg/log" 10 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 11 | ) 12 | 13 | // These tests use Ginkgo (BDD-style Go testing framework). Refer to 14 | // http://onsi.github.io/ginkgo/ to learn more about Ginkgo. 15 | 16 | func TestControllers(t *testing.T) { 17 | RegisterFailHandler(Fail) 18 | 19 | RunSpecs(t, "Controller Suite") 20 | } 21 | 22 | var _ = BeforeSuite(func() { 23 | logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) 24 | }) 25 | -------------------------------------------------------------------------------- /config/crd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # This kustomization.yaml is not intended to be run by itself, 2 | # since it depends on service name and namespace that are out of this kustomize package. 3 | # It should be run by config/default 4 | resources: 5 | - bases/llm-d.ai_modelservices.yaml 6 | # +kubebuilder:scaffold:crdkustomizeresource 7 | 8 | patches: [] 9 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. 10 | # patches here are for enabling the conversion webhook for each CRD 11 | # +kubebuilder:scaffold:crdkustomizewebhookpatch 12 | 13 | # [WEBHOOK] To enable webhook, uncomment the following section 14 | # the following config is for teaching kustomize how to do kustomization for CRDs. 
15 | #configurations: 16 | #- kustomizeconfig.yaml 17 | -------------------------------------------------------------------------------- /config/crd/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This file is for teaching kustomize how to substitute name and namespace reference in CRD 2 | nameReference: 3 | - kind: Service 4 | version: v1 5 | fieldSpecs: 6 | - kind: CustomResourceDefinition 7 | version: v1 8 | group: apiextensions.k8s.io 9 | path: spec/conversion/webhook/clientConfig/service/name 10 | 11 | namespace: 12 | - kind: CustomResourceDefinition 13 | version: v1 14 | group: apiextensions.k8s.io 15 | path: spec/conversion/webhook/clientConfig/service/namespace 16 | create: false 17 | 18 | varReference: 19 | - path: metadata/annotations 20 | -------------------------------------------------------------------------------- /config/default/cert_metrics_manager_patch.yaml: -------------------------------------------------------------------------------- 1 | # This patch adds the args, volumes, and ports to allow the manager to use the metrics-server certs. 2 | 3 | # Add the volumeMount for the metrics-server certs 4 | - op: add 5 | path: /spec/template/spec/containers/0/volumeMounts/- 6 | value: 7 | mountPath: /tmp/k8s-metrics-server/metrics-certs 8 | name: metrics-certs 9 | readOnly: true 10 | 11 | # Add the --metrics-cert-path argument for the metrics server 12 | - op: add 13 | path: /spec/template/spec/containers/0/args/- 14 | value: --metrics-cert-path=/tmp/k8s-metrics-server/metrics-certs 15 | 16 | # Add the metrics-server certs volume configuration 17 | - op: add 18 | path: /spec/template/spec/volumes/- 19 | value: 20 | name: metrics-certs 21 | secret: 22 | secretName: metrics-server-cert 23 | optional: false 24 | items: 25 | - key: ca.crt 26 | path: ca.crt 27 | - key: tls.crt 28 | path: tls.crt 29 | - key: tls.key 30 | path: tls.key 31 | -------------------------------------------------------------------------------- /config/default/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # Adds namespace to all resources. 2 | namespace: modelservice-system 3 | 4 | # Value of this field is prepended to the 5 | # names of all resources, e.g. a deployment named 6 | # "wordpress" becomes "alices-wordpress". 7 | # Note that it should also match with the prefix (text before '-') of the namespace 8 | # field above. 9 | namePrefix: modelservice- 10 | 11 | # Labels to add to all resources and selectors. 12 | #labels: 13 | #- includeSelectors: true 14 | # pairs: 15 | # someName: someValue 16 | 17 | resources: 18 | # - ../crd 19 | - ../rbac 20 | - ../manager 21 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in 22 | # crd/kustomization.yaml 23 | #- ../webhook 24 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. 25 | #- ../certmanager 26 | # [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. 27 | #- ../prometheus 28 | # [METRICS] Expose the controller manager metrics service. 29 | - metrics_service.yaml 30 | # [NETWORK POLICY] Protect the /metrics endpoint and Webhook Server with NetworkPolicy. 31 | # Only Pod(s) running a namespace labeled with 'metrics: enabled' will be able to gather the metrics. 
32 | # Only CR(s) which requires webhooks and are applied on namespaces labeled with 'webhooks: enabled' will 33 | # be able to communicate with the Webhook Server. 34 | #- ../network-policy 35 | 36 | # Uncomment the patches line if you enable Metrics 37 | patches: 38 | # [METRICS] The following patch will enable the metrics endpoint using HTTPS and the port :8443. 39 | # More info: https://book.kubebuilder.io/reference/metrics 40 | - path: manager_metrics_patch.yaml 41 | target: 42 | kind: Deployment 43 | 44 | # Uncomment the patches line if you enable Metrics and CertManager 45 | # [METRICS-WITH-CERTS] To enable metrics protected with certManager, uncomment the following line. 46 | # This patch will protect the metrics with certManager self-signed certs. 47 | #- path: cert_metrics_manager_patch.yaml 48 | # target: 49 | # kind: Deployment 50 | 51 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in 52 | # crd/kustomization.yaml 53 | #- path: manager_webhook_patch.yaml 54 | # target: 55 | # kind: Deployment 56 | 57 | # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. 58 | # Uncomment the following replacements to add the cert-manager CA injection annotations 59 | #replacements: 60 | # - source: # Uncomment the following block to enable certificates for metrics 61 | # kind: Service 62 | # version: v1 63 | # name: controller-manager-metrics-service 64 | # fieldPath: metadata.name 65 | # targets: 66 | # - select: 67 | # kind: Certificate 68 | # group: cert-manager.io 69 | # version: v1 70 | # name: metrics-certs 71 | # fieldPaths: 72 | # - spec.dnsNames.0 73 | # - spec.dnsNames.1 74 | # options: 75 | # delimiter: '.' 76 | # index: 0 77 | # create: true 78 | # - select: # Uncomment the following to set the Service name for TLS config in Prometheus ServiceMonitor 79 | # kind: ServiceMonitor 80 | # group: monitoring.coreos.com 81 | # version: v1 82 | # name: controller-manager-metrics-monitor 83 | # fieldPaths: 84 | # - spec.endpoints.0.tlsConfig.serverName 85 | # options: 86 | # delimiter: '.' 87 | # index: 0 88 | # create: true 89 | # 90 | # - source: 91 | # kind: Service 92 | # version: v1 93 | # name: controller-manager-metrics-service 94 | # fieldPath: metadata.namespace 95 | # targets: 96 | # - select: 97 | # kind: Certificate 98 | # group: cert-manager.io 99 | # version: v1 100 | # name: metrics-certs 101 | # fieldPaths: 102 | # - spec.dnsNames.0 103 | # - spec.dnsNames.1 104 | # options: 105 | # delimiter: '.' 106 | # index: 1 107 | # create: true 108 | # - select: # Uncomment the following to set the Service namespace for TLS in Prometheus ServiceMonitor 109 | # kind: ServiceMonitor 110 | # group: monitoring.coreos.com 111 | # version: v1 112 | # name: controller-manager-metrics-monitor 113 | # fieldPaths: 114 | # - spec.endpoints.0.tlsConfig.serverName 115 | # options: 116 | # delimiter: '.' 117 | # index: 1 118 | # create: true 119 | # 120 | # - source: # Uncomment the following block if you have any webhook 121 | # kind: Service 122 | # version: v1 123 | # name: webhook-service 124 | # fieldPath: .metadata.name # Name of the service 125 | # targets: 126 | # - select: 127 | # kind: Certificate 128 | # group: cert-manager.io 129 | # version: v1 130 | # name: serving-cert 131 | # fieldPaths: 132 | # - .spec.dnsNames.0 133 | # - .spec.dnsNames.1 134 | # options: 135 | # delimiter: '.' 
136 | # index: 0 137 | # create: true 138 | # - source: 139 | # kind: Service 140 | # version: v1 141 | # name: webhook-service 142 | # fieldPath: .metadata.namespace # Namespace of the service 143 | # targets: 144 | # - select: 145 | # kind: Certificate 146 | # group: cert-manager.io 147 | # version: v1 148 | # name: serving-cert 149 | # fieldPaths: 150 | # - .spec.dnsNames.0 151 | # - .spec.dnsNames.1 152 | # options: 153 | # delimiter: '.' 154 | # index: 1 155 | # create: true 156 | # 157 | # - source: # Uncomment the following block if you have a ValidatingWebhook (--programmatic-validation) 158 | # kind: Certificate 159 | # group: cert-manager.io 160 | # version: v1 161 | # name: serving-cert # This name should match the one in certificate.yaml 162 | # fieldPath: .metadata.namespace # Namespace of the certificate CR 163 | # targets: 164 | # - select: 165 | # kind: ValidatingWebhookConfiguration 166 | # fieldPaths: 167 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 168 | # options: 169 | # delimiter: '/' 170 | # index: 0 171 | # create: true 172 | # - source: 173 | # kind: Certificate 174 | # group: cert-manager.io 175 | # version: v1 176 | # name: serving-cert 177 | # fieldPath: .metadata.name 178 | # targets: 179 | # - select: 180 | # kind: ValidatingWebhookConfiguration 181 | # fieldPaths: 182 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 183 | # options: 184 | # delimiter: '/' 185 | # index: 1 186 | # create: true 187 | # 188 | # - source: # Uncomment the following block if you have a DefaultingWebhook (--defaulting ) 189 | # kind: Certificate 190 | # group: cert-manager.io 191 | # version: v1 192 | # name: serving-cert 193 | # fieldPath: .metadata.namespace # Namespace of the certificate CR 194 | # targets: 195 | # - select: 196 | # kind: MutatingWebhookConfiguration 197 | # fieldPaths: 198 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 199 | # options: 200 | # delimiter: '/' 201 | # index: 0 202 | # create: true 203 | # - source: 204 | # kind: Certificate 205 | # group: cert-manager.io 206 | # version: v1 207 | # name: serving-cert 208 | # fieldPath: .metadata.name 209 | # targets: 210 | # - select: 211 | # kind: MutatingWebhookConfiguration 212 | # fieldPaths: 213 | # - .metadata.annotations.[cert-manager.io/inject-ca-from] 214 | # options: 215 | # delimiter: '/' 216 | # index: 1 217 | # create: true 218 | # 219 | # - source: # Uncomment the following block if you have a ConversionWebhook (--conversion) 220 | # kind: Certificate 221 | # group: cert-manager.io 222 | # version: v1 223 | # name: serving-cert 224 | # fieldPath: .metadata.namespace # Namespace of the certificate CR 225 | # targets: # Do not remove or uncomment the following scaffold marker; required to generate code for target CRD. 226 | # +kubebuilder:scaffold:crdkustomizecainjectionns 227 | # - source: 228 | # kind: Certificate 229 | # group: cert-manager.io 230 | # version: v1 231 | # name: serving-cert 232 | # fieldPath: .metadata.name 233 | # targets: # Do not remove or uncomment the following scaffold marker; required to generate code for target CRD. 
234 | # +kubebuilder:scaffold:crdkustomizecainjectionname 235 | -------------------------------------------------------------------------------- /config/default/manager_metrics_patch.yaml: -------------------------------------------------------------------------------- 1 | # This patch adds the args to allow exposing the metrics endpoint using HTTPS 2 | - op: add 3 | path: /spec/template/spec/containers/0/args/0 4 | value: --metrics-bind-address=:8443 5 | -------------------------------------------------------------------------------- /config/default/metrics_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | control-plane: controller-manager 6 | app.kubernetes.io/name: modelservice 7 | app.kubernetes.io/managed-by: kustomize 8 | name: controller-manager-metrics-service 9 | namespace: system 10 | spec: 11 | ports: 12 | - name: https 13 | port: 8443 14 | protocol: TCP 15 | targetPort: 8443 16 | selector: 17 | control-plane: controller-manager 18 | app.kubernetes.io/name: modelservice 19 | -------------------------------------------------------------------------------- /config/dev/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - ../default 3 | 4 | patches: 5 | # [MANAGER] The following patch provides access to the image 6 | - target: 7 | kind: Deployment 8 | path: manager_patch.yaml 9 | 10 | -------------------------------------------------------------------------------- /config/dev/manager_patch.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Add image pull secret 3 | - op: add 4 | path: /spec/template/spec/imagePullSecrets 5 | value: 6 | - name: $IMAGE_PULL_SECRET 7 | 8 | # to configure pull secrets for pd and epp pods, uncomment and set environment variables 9 | - op: add 10 | path: /spec/template/spec/containers/0/args/- 11 | value: --pd-pull-secrets=$PD_PULL_SECRETS 12 | - op: add 13 | path: /spec/template/spec/containers/0/args/- 14 | value: --epp-pull-secrets=$EPP_PULL_SECRETS 15 | 16 | # Make image pull policy configurable -- e.g., Always, Never, IfNotPresent; dev images typically require Always 17 | - op: add 18 | path: /spec/template/spec/containers/0/imagePullPolicy 19 | value: ${IMAGE_PULL_POLICY} -------------------------------------------------------------------------------- /config/eppandinference/inferencepool-e2e.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: modelservice 6 | app.kubernetes.io/managed-by: kustomize 7 | name: inferenceandepp 8 | --- 9 | # The deployer has to create and deploy this 10 | # The ModelService operator is not creating this for the summit-demo 11 | 12 | apiVersion: inference.networking.x-k8s.io/v1alpha2 13 | kind: InferencePool 14 | metadata: 15 | name: vllm-llama-4-scout-17b-16e-instruct 16 | spec: 17 | # Need to match operatordefaults.yaml/vllmdProxyContainer.ports and look for the "proxy-port" name AND "port" in vllmdProxyContainer.args.args 18 | targetPortNumber: 8000 19 | selector: 20 | app: vllm-llama-4-scout-17b-16e-instruct 21 | extensionRef: 22 | name: vllm-llama-4-scout-17b-16e-instruct-epp 23 | # model-related artifacts will be in the same namespace 
24 | # namespace: $E2E_NS 25 | --- 26 | apiVersion: v1 27 | kind: Service 28 | metadata: 29 | name: vllm-llama-4-scout-17b-16e-instruct-epp 30 | # model-related artifacts will be in the same namespace 31 | # namespace: $E2E_NS 32 | spec: 33 | selector: 34 | app: vllm-llama-4-scout-17b-16e-instruct-epp 35 | ports: 36 | - protocol: TCP 37 | port: 9002 38 | targetPort: 9002 39 | appProtocol: http2 40 | type: ClusterIP 41 | --- 42 | apiVersion: apps/v1 43 | kind: Deployment 44 | metadata: 45 | name: vllm-llama-4-scout-17b-16e-instruct-epp 46 | # model-related artifacts will be in the same namespace 47 | # namespace: $E2E_NS 48 | labels: 49 | app: vllm-llama-4-scout-17b-16e-instruct-epp 50 | spec: 51 | replicas: 1 52 | selector: 53 | matchLabels: 54 | app: vllm-llama-4-scout-17b-16e-instruct-epp 55 | template: 56 | metadata: 57 | labels: 58 | app: vllm-llama-4-scout-17b-16e-instruct-epp 59 | spec: 60 | # Conservatively, this timeout should mirror the longest grace period of the pods within the pool 61 | terminationGracePeriodSeconds: 130 62 | serviceAccountName: inferenceandepp 63 | containers: 64 | - name: epp 65 | image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main 66 | imagePullPolicy: Always 67 | args: 68 | - -poolName 69 | - "vllm-llama-4-scout-17b-16e-instruct" 70 | # model-related artifacts will be in the same namespace 71 | # - -poolNamespace 72 | # - "$E2E_NS" 73 | - -v 74 | - "4" 75 | - --zap-encoder 76 | - "json" 77 | - -grpcPort 78 | - "9002" 79 | - -grpcHealthPort 80 | - "9003" 81 | env: 82 | - name: USE_STREAMING 83 | value: "true" 84 | ports: 85 | - containerPort: 9002 86 | - containerPort: 9003 87 | - name: metrics 88 | containerPort: 9090 89 | livenessProbe: 90 | grpc: 91 | port: 9003 92 | service: inference-extension 93 | initialDelaySeconds: 5 94 | periodSeconds: 10 95 | readinessProbe: 96 | grpc: 97 | port: 9003 98 | service: inference-extension 99 | initialDelaySeconds: 5 100 | periodSeconds: 10 101 | --- 102 | kind: ClusterRole 103 | apiVersion: rbac.authorization.k8s.io/v1 104 | metadata: 105 | name: pod-read 106 | rules: 107 | - apiGroups: ["inference.networking.x-k8s.io"] 108 | resources: ["inferencemodels"] 109 | verbs: ["get", "watch", "list"] 110 | - apiGroups: [""] 111 | resources: ["pods"] 112 | verbs: ["get", "watch", "list"] 113 | - apiGroups: ["inference.networking.x-k8s.io"] 114 | resources: ["inferencepools"] 115 | verbs: ["get", "watch", "list"] 116 | - apiGroups: ["discovery.k8s.io"] 117 | resources: ["endpointslices"] 118 | verbs: ["get", "watch", "list"] 119 | - apiGroups: 120 | - authentication.k8s.io 121 | resources: 122 | - tokenreviews 123 | verbs: 124 | - create 125 | - apiGroups: 126 | - authorization.k8s.io 127 | resources: 128 | - subjectaccessreviews 129 | verbs: 130 | - create 131 | --- 132 | kind: ClusterRoleBinding 133 | apiVersion: rbac.authorization.k8s.io/v1 134 | metadata: 135 | name: pod-read-binding 136 | subjects: 137 | - kind: ServiceAccount 138 | name: inferenceandepp 139 | # model-related artifacts will be in the same namespace 140 | # namespace: $E2E_NS 141 | roleRef: 142 | apiGroup: rbac.authorization.k8s.io 143 | kind: ClusterRole 144 | name: pod-read -------------------------------------------------------------------------------- /config/eppandinference/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # Adds namespace to all resources. 
2 | namespace: e2e-solution 3 | 4 | resources: 5 | - inferencepool-e2e.yaml -------------------------------------------------------------------------------- /config/externalcrds/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - bases/inferencecrds.yaml -------------------------------------------------------------------------------- /config/manager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manager.yaml 3 | apiVersion: kustomize.config.k8s.io/v1beta1 4 | kind: Kustomization 5 | images: 6 | - name: controller 7 | newName: ghcr.io/llm-d/llm-d-model-service 8 | newTag: 0.0.1 9 | -------------------------------------------------------------------------------- /config/manager/manager.yaml: -------------------------------------------------------------------------------- 1 | # apiVersion: v1 2 | # kind: Namespace 3 | # metadata: 4 | # labels: 5 | # control-plane: controller-manager 6 | # app.kubernetes.io/name: modelservice 7 | # app.kubernetes.io/managed-by: kustomize 8 | # name: system 9 | # --- 10 | apiVersion: apps/v1 11 | kind: Deployment 12 | metadata: 13 | name: controller-manager 14 | namespace: system 15 | labels: 16 | control-plane: controller-manager 17 | app.kubernetes.io/name: modelservice 18 | app.kubernetes.io/managed-by: kustomize 19 | spec: 20 | selector: 21 | matchLabels: 22 | control-plane: controller-manager 23 | app.kubernetes.io/name: modelservice 24 | replicas: 1 25 | template: 26 | metadata: 27 | annotations: 28 | kubectl.kubernetes.io/default-container: manager 29 | labels: 30 | control-plane: controller-manager 31 | app.kubernetes.io/name: modelservice 32 | spec: 33 | # TODO(user): Uncomment the following code to configure the nodeAffinity expression 34 | # according to the platforms which are supported by your solution. 35 | # It is considered best practice to support multiple architectures. You can 36 | # build your manager image using the makefile target docker-buildx. 37 | # affinity: 38 | # nodeAffinity: 39 | # requiredDuringSchedulingIgnoredDuringExecution: 40 | # nodeSelectorTerms: 41 | # - matchExpressions: 42 | # - key: kubernetes.io/arch 43 | # operator: In 44 | # values: 45 | # - amd64 46 | # - arm64 47 | # - ppc64le 48 | # - s390x 49 | # - key: kubernetes.io/os 50 | # operator: In 51 | # values: 52 | # - linux 53 | securityContext: 54 | # Projects are configured by default to adhere to the "restricted" Pod Security Standards. 55 | # This ensures that deployments meet the highest security requirements for Kubernetes. 
56 | # For more details, see: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted 57 | runAsNonRoot: true 58 | seccompProfile: 59 | type: RuntimeDefault 60 | containers: 61 | - command: 62 | - /manager 63 | - run 64 | args: 65 | - --leader-elect=false 66 | - --health-probe-bind-address=:8081 67 | - --log-level=$LOG_LEVEL 68 | - --epp-cluster-role=$EPP_CLUSTER_ROLE 69 | image: controller:latest 70 | imagePullPolicy: Always 71 | name: manager 72 | ports: [] 73 | securityContext: 74 | allowPrivilegeEscalation: false 75 | capabilities: 76 | drop: 77 | - "ALL" 78 | livenessProbe: 79 | httpGet: 80 | path: /healthz 81 | port: 8081 82 | initialDelaySeconds: 15 83 | periodSeconds: 20 84 | readinessProbe: 85 | httpGet: 86 | path: /readyz 87 | port: 8081 88 | initialDelaySeconds: 5 89 | periodSeconds: 10 90 | # TODO(user): Configure the resources accordingly based on the project requirements. 91 | # More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ 92 | resources: 93 | limits: 94 | cpu: 500m 95 | memory: 512Mi 96 | requests: 97 | cpu: 10m 98 | memory: 128Mi 99 | volumeMounts: [] 100 | volumes: [] 101 | serviceAccountName: controller-manager 102 | terminationGracePeriodSeconds: 10 103 | -------------------------------------------------------------------------------- /config/network-policy/allow-metrics-traffic.yaml: -------------------------------------------------------------------------------- 1 | # This NetworkPolicy allows ingress traffic 2 | # with Pods running on namespaces labeled with 'metrics: enabled'. Only Pods on those 3 | # namespaces are able to gather data from the metrics endpoint. 4 | apiVersion: networking.k8s.io/v1 5 | kind: NetworkPolicy 6 | metadata: 7 | labels: 8 | app.kubernetes.io/name: modelservice 9 | app.kubernetes.io/managed-by: kustomize 10 | name: allow-metrics-traffic 11 | namespace: system 12 | spec: 13 | podSelector: 14 | matchLabels: 15 | control-plane: controller-manager 16 | app.kubernetes.io/name: modelservice 17 | policyTypes: 18 | - Ingress 19 | ingress: 20 | # This allows ingress traffic from any namespace with the label metrics: enabled 21 | - from: 22 | - namespaceSelector: 23 | matchLabels: 24 | metrics: enabled # Only from namespaces with this label 25 | ports: 26 | - port: 8443 27 | protocol: TCP 28 | -------------------------------------------------------------------------------- /config/network-policy/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - allow-metrics-traffic.yaml 3 | -------------------------------------------------------------------------------- /config/prometheus/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - monitor.yaml 3 | 4 | # [PROMETHEUS-WITH-CERTS] The following patch configures the ServiceMonitor in ../prometheus 5 | # to securely reference certificates created and managed by cert-manager. 6 | # Additionally, ensure that you uncomment the [METRICS WITH CERTMANAGER] patch under config/default/kustomization.yaml 7 | # to mount the "metrics-server-cert" secret in the Manager Deployment. 
8 | #patches: 9 | # - path: monitor_tls_patch.yaml 10 | # target: 11 | # kind: ServiceMonitor 12 | -------------------------------------------------------------------------------- /config/prometheus/monitor.yaml: -------------------------------------------------------------------------------- 1 | # Prometheus Monitor Service (Metrics) 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: ServiceMonitor 4 | metadata: 5 | labels: 6 | control-plane: controller-manager 7 | app.kubernetes.io/name: modelservice 8 | app.kubernetes.io/managed-by: kustomize 9 | name: controller-manager-metrics-monitor 10 | namespace: system 11 | spec: 12 | endpoints: 13 | - path: /metrics 14 | port: https # Ensure this is the name of the port that exposes HTTPS metrics 15 | scheme: https 16 | bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 17 | tlsConfig: 18 | # TODO(user): The option insecureSkipVerify: true is not recommended for production since it disables 19 | # certificate verification, exposing the system to potential man-in-the-middle attacks. 20 | # For production environments, it is recommended to use cert-manager for automatic TLS certificate management. 21 | # To apply this configuration, enable cert-manager and use the patch located at config/prometheus/servicemonitor_tls_patch.yaml, 22 | # which securely references the certificate from the 'metrics-server-cert' secret. 23 | insecureSkipVerify: true 24 | selector: 25 | matchLabels: 26 | control-plane: controller-manager 27 | app.kubernetes.io/name: modelservice 28 | -------------------------------------------------------------------------------- /config/prometheus/monitor_tls_patch.yaml: -------------------------------------------------------------------------------- 1 | # Patch for Prometheus ServiceMonitor to enable secure TLS configuration 2 | # using certificates managed by cert-manager 3 | - op: replace 4 | path: /spec/endpoints/0/tlsConfig 5 | value: 6 | # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize 7 | serverName: SERVICE_NAME.SERVICE_NAMESPACE.svc 8 | insecureSkipVerify: false 9 | ca: 10 | secret: 11 | name: metrics-server-cert 12 | key: ca.crt 13 | cert: 14 | secret: 15 | name: metrics-server-cert 16 | key: tls.crt 17 | keySecret: 18 | name: metrics-server-cert 19 | key: tls.key 20 | -------------------------------------------------------------------------------- /config/rbac/epp_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: modelservice 6 | app.kubernetes.io/managed-by: kustomize 7 | name: manager-epp-rolebinding 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | name: $EPP_CLUSTER_ROLE 12 | subjects: 13 | - kind: ServiceAccount 14 | name: controller-manager 15 | namespace: system 16 | -------------------------------------------------------------------------------- /config/rbac/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | # All RBAC will be applied under this service account in 3 | # the deployment namespace. You may comment out this resource 4 | # if your manager will use a service account that exists at 5 | # runtime. Be sure to update RoleBinding and ClusterRoleBinding 6 | # subjects if changing service account names. 
7 | - service_account.yaml 8 | - role.yaml 9 | - role_binding.yaml 10 | - epp_role_binding.yaml 11 | - leader_election_role.yaml 12 | - leader_election_role_binding.yaml 13 | # The following RBAC configurations are used to protect 14 | # the metrics endpoint with authn/authz. These configurations 15 | # ensure that only authorized users and service accounts 16 | # can access the metrics endpoint. Comment the following 17 | # permissions if you want to disable this protection. 18 | # More info: https://book.kubebuilder.io/reference/metrics.html 19 | - metrics_auth_role.yaml 20 | - metrics_auth_role_binding.yaml 21 | - metrics_reader_role.yaml 22 | # For each CRD, "Admin", "Editor" and "Viewer" roles are scaffolded by 23 | # default, aiding admins in cluster management. Those roles are 24 | # not used by the {{ .ProjectName }} itself. You can comment the following lines 25 | # if you do not want those helpers be installed with your Project. 26 | - modelservice_admin_role.yaml 27 | - modelservice_editor_role.yaml 28 | - modelservice_viewer_role.yaml 29 | 30 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions to do leader election. 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: modelservice 7 | app.kubernetes.io/managed-by: kustomize 8 | name: leader-election-role 9 | rules: 10 | - apiGroups: 11 | - "" 12 | resources: 13 | - configmaps 14 | verbs: 15 | - get 16 | - list 17 | - watch 18 | - create 19 | - update 20 | - patch 21 | - delete 22 | - apiGroups: 23 | - coordination.k8s.io 24 | resources: 25 | - leases 26 | verbs: 27 | - get 28 | - list 29 | - watch 30 | - create 31 | - update 32 | - patch 33 | - delete 34 | - apiGroups: 35 | - "" 36 | resources: 37 | - events 38 | verbs: 39 | - create 40 | - patch 41 | -------------------------------------------------------------------------------- /config/rbac/leader_election_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: modelservice 6 | app.kubernetes.io/managed-by: kustomize 7 | name: leader-election-rolebinding 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: Role 11 | name: leader-election-role 12 | subjects: 13 | - kind: ServiceAccount 14 | name: controller-manager 15 | namespace: system 16 | -------------------------------------------------------------------------------- /config/rbac/metrics_auth_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: metrics-auth-role 5 | rules: 6 | - apiGroups: 7 | - authentication.k8s.io 8 | resources: 9 | - tokenreviews 10 | verbs: 11 | - create 12 | - apiGroups: 13 | - authorization.k8s.io 14 | resources: 15 | - subjectaccessreviews 16 | verbs: 17 | - create 18 | -------------------------------------------------------------------------------- /config/rbac/metrics_auth_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: metrics-auth-rolebinding 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 
| name: metrics-auth-role 9 | subjects: 10 | - kind: ServiceAccount 11 | name: controller-manager 12 | namespace: system 13 | -------------------------------------------------------------------------------- /config/rbac/metrics_reader_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: metrics-reader 5 | rules: 6 | - nonResourceURLs: 7 | - "/metrics" 8 | verbs: 9 | - get 10 | -------------------------------------------------------------------------------- /config/rbac/modelservice_admin_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project modelservice itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants full permissions ('*') over llm-d.ai. 5 | # This role is intended for users authorized to modify roles and bindings within the cluster, 6 | # enabling them to delegate specific permissions to other users or groups as needed. 7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: modelservice 13 | app.kubernetes.io/managed-by: kustomize 14 | name: modelservice-admin-role 15 | rules: 16 | - apiGroups: 17 | - llm-d.ai 18 | resources: 19 | - modelservices 20 | verbs: 21 | - '*' 22 | - apiGroups: 23 | - llm-d.ai 24 | resources: 25 | - modelservices/status 26 | verbs: 27 | - get 28 | -------------------------------------------------------------------------------- /config/rbac/modelservice_editor_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project modelservice itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants permissions to create, update, and delete resources within the llm-d.ai. 5 | # This role is intended for users who need to manage these resources 6 | # but should not control RBAC or manage permissions for others. 7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: modelservice 13 | app.kubernetes.io/managed-by: kustomize 14 | name: modelservice-editor-role 15 | rules: 16 | - apiGroups: 17 | - llm-d.ai 18 | resources: 19 | - modelservices 20 | verbs: 21 | - create 22 | - delete 23 | - get 24 | - list 25 | - patch 26 | - update 27 | - watch 28 | - apiGroups: 29 | - llm-d.ai 30 | resources: 31 | - modelservices/status 32 | verbs: 33 | - get 34 | -------------------------------------------------------------------------------- /config/rbac/modelservice_viewer_role.yaml: -------------------------------------------------------------------------------- 1 | # This rule is not used by the project modelservice itself. 2 | # It is provided to allow the cluster admin to help manage permissions for users. 3 | # 4 | # Grants read-only access to llm-d.ai resources. 5 | # This role is intended for users who need visibility into these resources 6 | # without permissions to modify them. It is ideal for monitoring purposes and limited-access viewing. 
7 | 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | labels: 12 | app.kubernetes.io/name: modelservice 13 | app.kubernetes.io/managed-by: kustomize 14 | name: modelservice-viewer-role 15 | rules: 16 | - apiGroups: 17 | - llm-d.ai 18 | resources: 19 | - modelservices 20 | verbs: 21 | - get 22 | - list 23 | - watch 24 | - apiGroups: 25 | - llm-d.ai 26 | resources: 27 | - modelservices/status 28 | verbs: 29 | - get 30 | -------------------------------------------------------------------------------- /config/rbac/role.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: manager-role 6 | rules: 7 | - apiGroups: 8 | - "" 9 | resources: 10 | - configmaps 11 | - serviceaccounts 12 | verbs: 13 | - create 14 | - delete 15 | - get 16 | - list 17 | - patch 18 | - update 19 | - watch 20 | - apiGroups: 21 | - "" 22 | resources: 23 | - services 24 | verbs: 25 | - create 26 | - delete 27 | - list 28 | - patch 29 | - update 30 | - watch 31 | - apiGroups: 32 | - apps 33 | resources: 34 | - deployments 35 | verbs: 36 | - create 37 | - delete 38 | - get 39 | - list 40 | - patch 41 | - update 42 | - watch 43 | - apiGroups: 44 | - apps 45 | resources: 46 | - deployments/scale 47 | verbs: 48 | - patch 49 | - update 50 | - apiGroups: 51 | - gateway.networking.k8s.io 52 | resources: 53 | - httproutes 54 | verbs: 55 | - create 56 | - delete 57 | - get 58 | - list 59 | - patch 60 | - update 61 | - watch 62 | - apiGroups: 63 | - inference.networking.x-k8s.io 64 | resources: 65 | - inferencemodels 66 | - inferencepools 67 | verbs: 68 | - create 69 | - delete 70 | - get 71 | - list 72 | - patch 73 | - update 74 | - watch 75 | - apiGroups: 76 | - llm-d.ai 77 | resources: 78 | - modelservices 79 | verbs: 80 | - create 81 | - delete 82 | - get 83 | - list 84 | - patch 85 | - update 86 | - watch 87 | - apiGroups: 88 | - llm-d.ai 89 | resources: 90 | - modelservices/finalizers 91 | verbs: 92 | - update 93 | - apiGroups: 94 | - llm-d.ai 95 | resources: 96 | - modelservices/status 97 | verbs: 98 | - get 99 | - patch 100 | - update 101 | - apiGroups: 102 | - rbac.authorization.k8s.io 103 | resources: 104 | - rolebindings 105 | verbs: 106 | - create 107 | - delete 108 | - get 109 | - list 110 | - patch 111 | - update 112 | - watch 113 | -------------------------------------------------------------------------------- /config/rbac/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: modelservice 6 | app.kubernetes.io/managed-by: kustomize 7 | name: manager-rolebinding 8 | roleRef: 9 | apiGroup: rbac.authorization.k8s.io 10 | kind: ClusterRole 11 | name: manager-role 12 | subjects: 13 | - kind: ServiceAccount 14 | name: controller-manager 15 | namespace: system 16 | -------------------------------------------------------------------------------- /config/rbac/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: modelservice 6 | app.kubernetes.io/managed-by: kustomize 7 | name: controller-manager 8 | namespace: system 9 | -------------------------------------------------------------------------------- /config/samples/kustomization.yaml: 
-------------------------------------------------------------------------------- 1 | ## Append samples of your project ## 2 | resources: 3 | - vllmd_v1alpha1_modelservice.yaml 4 | # +kubebuilder:scaffold:manifestskustomizesamples 5 | -------------------------------------------------------------------------------- /config/samples/vllmd_v1alpha1_modelservice.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: modelservice 6 | app.kubernetes.io/managed-by: kustomize 7 | name: modelservice-sample 8 | spec: 9 | # TODO(user): Add fields here 10 | -------------------------------------------------------------------------------- /config/summitdemo/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # Adds namespace to all resources. 2 | namespace: e2e-solution 3 | 4 | resources: 5 | - ../default 6 | 7 | patches: 8 | - target: 9 | kind: Deployment 10 | path: manager_patch.yaml 11 | -------------------------------------------------------------------------------- /config/summitdemo/manager_patch.yaml: -------------------------------------------------------------------------------- 1 | # Make image pull policy Always if using a dev image 2 | - op: add 3 | path: /spec/template/spec/containers/0/imagePullPolicy 4 | value: Always 5 | 6 | # Add image pull secret 7 | - op: add 8 | path: /spec/template/spec/imagePullSecrets 9 | value: 10 | - name: $IMAGE_PULL_SECRET 11 | 12 | # to configure pull secrets for pd and epp pods, uncomment 13 | - op: add 14 | path: /spec/template/spec/containers/0/args/- 15 | value: --pd-pull-secrets=$PD_PULL_SECRETS 16 | - op: add 17 | path: /spec/template/spec/containers/0/args/- 18 | value: --epp-pull-secrets=$EPP_PULL_SECRETS 19 | -------------------------------------------------------------------------------- /deploy/common/patch-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: service 5 | spec: 6 | selector: 7 | app: ${PROJECT_NAME}-service 8 | ports: 9 | - protocol: TCP 10 | port: 8080 11 | targetPort: 8080 12 | type: ClusterIP 13 | -------------------------------------------------------------------------------- /deploy/common/patch-statefulset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: StatefulSet 3 | metadata: 4 | name: "0" 5 | spec: 6 | serviceName: ${PROJECT_NAME}-service 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | app: ${PROJECT_NAME}-statefulset 11 | template: 12 | metadata: 13 | labels: 14 | app: ${PROJECT_NAME}-statefulset 15 | spec: 16 | serviceAccountName: operator-controller-manager 17 | containers: 18 | - name: cmd 19 | image: ${IMAGE_TAG_BASE}:${VERSION} 20 | imagePullPolicy: Always 21 | -------------------------------------------------------------------------------- /deploy/common/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: service 5 | spec: 6 | selector: 7 | app: placeholder 8 | ports: 9 | - protocol: TCP 10 | port: 8080 11 | targetPort: 8080 12 | type: ClusterIP 13 | -------------------------------------------------------------------------------- /deploy/common/statefulset.yaml: -------------------------------------------------------------------------------- 1 | 
apiVersion: apps/v1 2 | kind: StatefulSet 3 | metadata: 4 | name: "0" 5 | spec: 6 | serviceName: placeholder 7 | replicas: 1 8 | selector: 9 | matchLabels: 10 | app: placeholder 11 | template: 12 | metadata: 13 | labels: 14 | app: placeholder 15 | spec: 16 | serviceAccountName: operator-controller-manager 17 | containers: 18 | - name: cmd 19 | image: ghcr.io/vllm-d/placeholder:placeholder 20 | imagePullPolicy: Always 21 | -------------------------------------------------------------------------------- /deploy/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | 4 | # Set the namespace for all resources using a placeholder. 5 | namespace: ${NAMESPACE} 6 | 7 | # Use a prefix for all object names. You can substitute the PROJECT_NAME variable. 8 | namePrefix: ${PROJECT_NAME}- 9 | 10 | # List all the resources (manifests) you want to deploy. 11 | resources: 12 | - common/statefulset.yaml 13 | - common/service.yaml 14 | - openshift/route.yaml 15 | - rbac/exec-rbac-role.yaml 16 | - rbac/exec-rbac-rolebinding.yaml 17 | 18 | # Generate the ConfigMap with a variable name. 19 | configMapGenerator: 20 | - name: config 21 | options: 22 | disableNameSuffixHash: true 23 | 24 | # Include patches to update the Service, StatefulSet, Route, and RBAC resources. 25 | 26 | # Define the image to be updated. 27 | # images: 28 | # - name: ghcr.io/vllm-d/placeholder 29 | # newName: ghcr.io/vllm-d/${IMAGE_TAG_BASE} 30 | # newTag: ${VERSION} 31 | patches: 32 | - path: common/patch-service.yaml 33 | - path: common/patch-statefulset.yaml 34 | - path: openshift/patch-route.yaml 35 | - path: rbac/patch-rbac-role.yaml 36 | - path: rbac/patch-rbac-rolebinding.yaml 37 | -------------------------------------------------------------------------------- /deploy/openshift/patch-route.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: route.openshift.io/v1 2 | kind: Route 3 | metadata: 4 | name: route 5 | spec: 6 | to: 7 | name: "${PROJECT_NAME}-service" 8 | -------------------------------------------------------------------------------- /deploy/openshift/route.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: route.openshift.io/v1 2 | kind: Route 3 | metadata: 4 | name: route 5 | spec: 6 | to: 7 | kind: Service 8 | name: placeholder 9 | port: 10 | targetPort: 8080 11 | tls: 12 | termination: edge 13 | -------------------------------------------------------------------------------- /deploy/rbac/exec-rbac-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: exec-role 5 | rules: 6 | - apiGroups: [""] 7 | resources: ["pods/exec"] 8 | resourceNames: ["placeholder-0-0"] 9 | verbs: ["create"] 10 | -------------------------------------------------------------------------------- /deploy/rbac/exec-rbac-rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: exec-rolebinding 5 | subjects: 6 | - kind: Group 7 | name: system:authenticated 8 | apiGroup: rbac.authorization.k8s.io 9 | roleRef: 10 | kind: Role 11 | name: exec-role 12 | apiGroup: rbac.authorization.k8s.io 13 | 14 | -------------------------------------------------------------------------------- 
/deploy/rbac/patch-rbac-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: exec-role 5 | rules: 6 | - apiGroups: [""] 7 | resources: ["pods/exec"] 8 | resourceNames: 9 | - "${PROJECT_NAME}-0-0" 10 | verbs: ["create"] 11 | -------------------------------------------------------------------------------- /deploy/rbac/patch-rbac-rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: exec-rolebinding 5 | subjects: 6 | - kind: Group 7 | name: system:authenticated 8 | apiGroup: rbac.authorization.k8s.io 9 | roleRef: 10 | kind: Role 11 | name: ${PROJECT_NAME}-exec-role 12 | apiGroup: rbac.authorization.k8s.io 13 | -------------------------------------------------------------------------------- /docs/api_reference/config.yaml: -------------------------------------------------------------------------------- 1 | title: "ModelService API Reference" 2 | version: "1.0" 3 | sources: 4 | - https://github.com/llm-d/llm-d-model-service.git -------------------------------------------------------------------------------- /docs/apireference.md: -------------------------------------------------------------------------------- 1 | # API Reference 2 | 3 | Refer to the [`api_reference`](./api_reference/out.html) folder. 4 | 5 | # Command to build API Reference 6 | 7 | - Generate asciidoc using [crd-ref-docs](https://github.com/elastic/crd-ref-docs) 8 | 9 | ```sh 10 | crd-ref-docs --source-path=./api/v1alpha1 --config=./docs/api_reference/config.yaml --output-path=./docs/api_reference 11 | ``` 12 | 13 | - Convert asciidoc to HTML by installing [asciidoctor](https://asciidoctor.org/) 14 | 15 | ```sh 16 | asciidoctor ./docs/api_reference/out.asciidoc 17 | ``` -------------------------------------------------------------------------------- /docs/developer.md: -------------------------------------------------------------------------------- 1 | # Developer Docs 2 | 3 | Clone the [ModelService GitHub repository](https://github.com/llm-d/llm-d-model-service) (or a fork of it) to take advantage of the `make` commands described below. All commands are from the project root directory. 4 | 5 | Execution of the ModelService controller requires access to a cluster. 6 | A local cluster, such as a `kind` cluster, suffices for basic execution and development testing. 7 | However, testing end-to-end with a large language model may not be possible if the cluster does not have sufficient resources or if the [inference gateway](https://gateway-api.sigs.k8s.io/) and [inference gateway extension](https://github.com/llm-d/gateway-api-inference-extension) are not fully configured. 8 | 9 | If a cluster is not available, you can do a dry-run to identify the Kubernetes resources that will be created for a given `ModelService CR`. See [ModelService Dry Run](#modelservice-dry-run) below. 
10 | 11 | ## Prerequisites 12 | 13 | ### Install Kubernetes Gateway API CRDs 14 | 15 | ```shell 16 | kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.3.0/standard-install.yaml 17 | ``` 18 | 19 | ### Install Kubernetes Gateway API Inference Extension CRDs 20 | 21 | ```shell 22 | VERSION=v0.3.0 23 | kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$VERSION/manifests.yaml 24 | ``` 25 | 26 | ### Define Cluster Role for Endpoint Picker (EPP) 27 | 28 | For the endpoint picker used in the [samples](https://github.com/llm-d/llm-d-model-service/tree/dev/samples), the `pod-read` cluster role defined [here](https://github.com/llm-d/gateway-api-inference-extension/blob/dev/config/manifests/inferencepool-resources.yaml#L84-L112) works. 29 | 30 | ### Install ModelService CRDs 31 | 32 | ```shell 33 | make install 34 | ``` 35 | 36 | If successful, you should see something like: 37 | 38 | ```shell 39 | % kubectl get crd | grep modelservice 40 | modelservices.llm-d.ai 2025-05-08T13:37:32Z 41 | ``` 42 | 43 | ## Local Execution 44 | 45 | You can run the ModelService controller locally, operating against the cluster defined by your current Kubernetes configuration. 46 | 47 | ```shell 48 | make run EPP_CLUSTERROLE=pod-read 49 | ``` 50 | 51 | You can now create `ModelService` objects. See [samples](https://github.com/llm-d/llm-d-model-service/tree/dev/samples) for details. 52 | 53 | To avoid long image and model downloads, you can create dummy model services such as those in [`samples/test`](https://github.com/llm-d/llm-d-model-service/tree/dev/samples/test). 54 | 55 | ## Running in a Cluster 56 | 57 | Deploy the controller to the cluster: 58 | 59 | 1. Create the target namespace `modelservice-system` 60 | 61 | By default, the ModelService controller is deployed to the `modelservice-system` namespace. To change the target namespace, create a kustomize overlay (see [`config/dev`](https://github.com/llm-d/llm-d-model-service/tree/dev/config/dev)). 62 | 63 | 2. Deploy the controller: 64 | 65 | ```shell 66 | make dev-deploy EPP_CLUSTERROLE=pod-read 67 | ``` 68 | 69 | You should see a `modelservice-controller-manager` pod start in the `modelservice-system` namespace. 70 | 71 | If an image pull secret is required, you can specify it with the environment variable `IMAGE_PULL_SECRET`. 72 | 73 | You can now create `ModelService` objects. See [samples](https://github.com/llm-d/llm-d-model-service/tree/dev/samples) for details. 74 | 75 | ## Uninstall 76 | 77 | The controller and `ModelService` CRDs can be removed: 78 | 79 | ```shell 80 | make uninstall && make undeploy 81 | ``` 82 | 83 | Supporting resources like the endpoint picker cluster role, the inference gateway, and the Kubernetes Gateway API Inference Extension CRDs can also be uninstalled. 84 | 85 | ## ModelService Dry-Run 86 | View the components that ModelService will create given a `ModelService` CR and a base config `ConfigMap`. This command does not require cluster access. 87 | 88 | In the `llm-d-model-service` project root directory: 89 | 90 | ```shell 91 | go run main.go generate \ 92 | --epp-cluster-role=<epp-cluster-role-name> \ 93 | --modelservice <path-to-modelservice-cr> \ 94 | --baseconfig <path-to-baseconfig> 95 | ``` 96 | 97 | Note that because no cluster access is required, it is not necessary to create an endpoint picker cluster role resource. 
98 | 99 | For example: 100 | 101 | ```shell 102 | go run main.go generate \ 103 | --epp-cluster-role=pod-read \ 104 | --modelservice samples/msvcs/granite3.2.yaml \ 105 | --baseconfig samples/baseconfigs/simple-baseconfig.yaml 106 | ``` 107 | 108 | will output the YAML manifest for the resources that ModelService will create in the cluster. Some fields that require cluster access to define, such as `metadata.namespace`, will not be included. 109 | 110 | This feature is purely for development purposes and is intended to provide a quick way of debugging without a cluster. -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | `ModelService` is one of the components installed during [llm-d installation](https://github.com/llm-d/llm-d-deployer/blob/main/quickstart/README.md). 4 | 5 | For local development and testing, refer to the [developer docs](developer.md). -------------------------------------------------------------------------------- /docs/userguide.md: -------------------------------------------------------------------------------- 1 | # User Guide 2 | 3 | This guide presents the core concepts and configuration patterns for serving base models using the `ModelService` Custom Resource Definition (CRD). It is intended for both platform operators and model owners. 4 | 5 | ## [Core Concepts](userguide/core-concepts.md) 6 | 7 | Understand how `ModelService` fits into the Kubernetes ecosystem, what resources it manages, and how its declarative workflow simplifies inference infrastructure. 8 | 9 | --- 10 | 11 | ## Topics 12 | 13 | 1. **[Model Name](userguide/model-name.md)** 14 | How inference clients refer to your model using OpenAI-compatible APIs. 15 | 16 | 2. **[Model Artifacts](userguide/model-artifacts.md)** 17 | Load models from Hugging Face, PVCs, or OCI images and mount them into serving containers. 18 | 19 | 20 | 3. **Templating Reference** 21 | Use Go templates in `ModelService` and `BaseConfig` to dynamically generate configurations for child resources. 22 | 23 | 24 | 4. **Decouple Scaling** 25 | Let HPA or custom controllers manage replica counts for prefill and decode deployments. 26 | 27 | 28 | 5. **Accelerator Types** 29 | Target specific GPU types using node labels to ensure models run on the right hardware. 30 | 31 | 32 | 6. **Semantic Merge** 33 | Learn how values in `ModelService` override or augment those defined in `BaseConfig`. 34 | 35 | 36 | 7. **Child Resources** 37 | Explore all Kubernetes resources owned and managed by a `ModelService`. 38 | 39 | --- 40 | 41 | For more details, see: 42 | 43 | 📄 [Install Guide](install.md) — how to install the ModelService controller 44 | 45 | 📘 [API Reference](apireference.md) — full CRD schema and field definitions 46 | -------------------------------------------------------------------------------- /docs/userguide/core-concepts.md: -------------------------------------------------------------------------------- 1 | # Core Concepts 2 | 3 | The ModelService custom resource provides a unified declarative API for serving a base model in Kubernetes. It supports inference workloads with prefill/decode disaggregation, reusable configuration presets, and seamless integration with the Gateway API Inference Extension (GIE). 4 | 5 | When a ModelService resource is reconciled, it creates and maintains the following resources. 
6 | 7 | ## Workload resources 8 | 9 | * Prefill deployment and service 10 | * Decode deployment and service 11 | * Configmaps for prefill/decode 12 | 13 | ## Routing 14 | 15 | * HTTPRoute 16 | * Inference pool 17 | * Inference model 18 | * Endpoint picker (EPP) deployment and service 19 | * Configmaps for EPP 20 | 21 | ## Access control 22 | 23 | * Service account for prefill/decode and EPP 24 | * RoleBinding for EPP 25 | 26 | These resources are optional and fully configurable. Their creation, omission, and configuration are controlled through BaseConfig and ModelService specifications. When the resources are created, the parent ModelService that triggered their creation is set as their owner; this facilitates correctness of the reconciliation logic, garbage collection, and status tracking. 27 | 28 | The following sample illustrates the core concepts in the ModelService spec. Further details are covered under individual topics below. 29 | 30 | ```yaml 31 | apiVersion: llm-d.ai/v1alpha1 32 | kind: ModelService 33 | metadata: 34 | name: facebook-opt-125m 35 | # `ModelService` is a namespace scoped resource 36 | namespace: my-ns 37 | spec: 38 | # `baseConfigMapRef.name` is the name of the Kubernetes configmap that provides default configurations for the resources spawned by this `ModelService`. 39 | 40 | # configuration derived from this `ModelService` will be semantically merged with the contents of the referenced `BaseConfig` to produce the final resource configuration. This allows model owners to override platform defaults only when necessary. 41 | 42 | # the contents of this `BaseConfig` configmap can be templated 43 | baseConfigMapRef: 44 | name: generic-base-config 45 | 46 | # `routing.modelName` is the name of the model used by OpenAI-compatible inference clients in their queries. 47 | routing: 48 | modelName: facebook/opt-125m 49 | 50 | # `modelArtifacts.uri` describes the source of the model. In this example, it is sourced from Hugging Face (as indicated by the hf:// prefix in the URI), the owner of the Hugging Face repo is `facebook`, and the model ID within is `opt-125m`. 51 | modelArtifacts: 52 | # if `uri` is prefixed with hf://, it will create an emptyDir volume in prefill/decode pods that can be mounted by the model serving container in the pod. 53 | uri: hf://facebook/opt-125m 54 | 55 | # `prefill` and `decode` sections enable disaggregated prefill architecture for model serving; these sections are optional; include both sections to enable disaggregation; omit prefill to disable it. 56 | decode: 57 | # number of decode pods 58 | replicas: 1 59 | # a list of containers 60 | containers: 61 | - name: vllm 62 | # Templated arguments. 63 | # .HFModelName expands to "facebook/opt-125m" 64 | # For all variables, see the templating reference. 65 | args: 66 | # hint: add quotes while using templating to avoid subtle yaml parsing issues 67 | - "{{ .HFModelName }}" 68 | # if `mountModelVolume` is set to true, the volume meant for model storage will be mounted by this container; in this example, this will be an emptyDir volume into which this container will download the model from Hugging Face. 69 | mountModelVolume: true 70 | ``` 71 | 72 | This minimal example demonstrates inference serving using a Hugging Face model. For more on routing, model sources, templating, merging, and advanced features, refer to the respective [topics](../userguide.md#topics). 
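After applying a spec like the one above, you can verify that reconciliation produced the expected child resources. A minimal sketch, assuming the sample is saved as `facebook-opt-125m.yaml` (a hypothetical file name) and the `generic-base-config` ConfigMap already exists in `my-ns`:

```shell
# Create the ModelService and confirm it was admitted
kubectl apply -f facebook-opt-125m.yaml
kubectl get modelservice facebook-opt-125m -n my-ns

# List the child resources owned by this ModelService
# (decode deployment/service, EPP resources, HTTPRoute, etc.)
kubectl get deployments,services,configmaps,httproutes -n my-ns
```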
73 | 
74 | 
--------------------------------------------------------------------------------
/docs/userguide/model-artifacts.md:
--------------------------------------------------------------------------------
1 | # Model Artifacts
2 | 
3 | The `modelArtifacts` section under the `spec` of a `ModelService` defines how model files, such as weights and metadata configurations, are retrieved and loaded into inference backends like vLLM. This abstraction simplifies the process by allowing users to specify the model source without needing to configure low-level details like environment variables, volumes, or volume mounts.
4 | 
5 | ## Purpose
6 | 
7 | Without `ModelService`, users must manually configure vLLM arguments, environment variables, and pod/container specifications. This requires a deep understanding of both vLLM and the composition of model artifacts. The `ModelService` controller automates these configurations, enabling users to focus solely on specifying the model source.
8 | 
9 | ## Model Artifact Sources and Behaviors
10 | 
11 | The `modelArtifacts.uri` field determines the source of the model artifacts. Each supported prefix results in specific behaviors in the prefill and decode deployments. The following sources are supported:
12 | 
13 | ### 1. Downloading a Model Directly from Hugging Face
14 | 
15 | If the `uri` begins with the `hf://` prefix, the model is downloaded directly from Hugging Face into an `emptyDir` volume.
16 | 
17 | #### URI Format
18 | 
19 | The repo and model IDs must exactly match the IDs found on the Hugging Face model registry, as required by vLLM.
20 | 
21 | `hf://<repo-id>/<model-id>`
22 | 
23 | Example: `hf://facebook/opt-125m`
24 | 
25 | #### Additional Fields
26 | 
27 | - **`authSecretName`**: Specifies the Kubernetes Secret containing the `HF_TOKEN` for gated models.
28 | - **`size`**: Defines the size of the `emptyDir` volume.
29 | 
30 | #### Behavior
31 | 
32 | - An `emptyDir` volume named `model-storage` is created.
33 | - Containers with `mountModelVolume: true` will have a `volumeMount` at `/model-cache`.
34 | - The `HF_HOME` environment variable is set to `/model-cache`.
35 | - If `authSecretName` is provided, the `HF_TOKEN` environment variable is created.
36 | 
37 | #### Example Deployment Snippet
38 | 
39 | ```yaml
40 | volumes:
41 | - name: model-storage
42 |   emptyDir: {}
43 | containers:
44 | - name: vllm
45 |   env:
46 |   - name: HF_HOME
47 |     value: /model-cache
48 |   - name: HF_TOKEN
49 |     valueFrom:
50 |       secretKeyRef:
51 |         name: hf-secret
52 |         key: HF_TOKEN
53 |   volumeMounts:
54 |   - mountPath: /model-cache
55 |     name: model-storage
56 | ```
57 | 
58 | #### Template variables
59 | 
60 | Various template variables are exposed as a result of using the `"hf://"` prefix, namely:
61 | 
62 | - `{{ .HFModelName }}`: this is `<repo-id>/<model-id>` in the URI, which might be useful for vLLM arguments. Note that this is different from `{{ .ModelName }}`, which is the `spec.routing.modelName` used for client requests
63 | - `{{ .MountedModelPath }}`: this is equal to `/model-cache`
64 | 
65 | ### 2. Loading a model directly from a PVC
66 | 
67 | Downloading large models from Hugging Face can take a significant amount of time. If a PVC containing the model files is already pre-populated, mounting that path and supplying it to vLLM can drastically shorten the engine's warm-up time.
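
How the PVC gets populated is outside the scope of `ModelService`. As one hypothetical approach, a one-off Kubernetes Job can download the model into the claim ahead of time; the sketch below is only illustrative, and the image, command, and names in it are assumptions rather than anything the controller creates for you.

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: populate-granite-pvc  # hypothetical name
spec:
  template:
    spec:
      restartPolicy: Never
      containers:
      - name: downloader
        # any image with the Hugging Face CLI works; this one is an assumption
        image: python:3.12-slim
        command: ["/bin/sh", "-c"]
        args:
        - |
          pip install -q "huggingface_hub[cli]" && \
          huggingface-cli download ibm-granite/granite-3.3-2b-instruct \
            --local-dir /models/path/to/granite
        volumeMounts:
        - name: model-storage
          mountPath: /models
      volumes:
      - name: model-storage
        persistentVolumeClaim:
          claimName: granite-pvc
```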
68 | 
69 | #### URI format
70 | 
71 | `"pvc://<pvc-name>/<path-to-model>"`
72 | 
73 | Example: `"pvc://granite-pvc/path/to/granite"`
74 | 
75 | #### Behavior
76 | 
77 | - A read-only PVC volume with the name `model-storage` is created for the deployment
78 | - A read-only `volumeMount` with `mountPath: /model-cache` is created for each container where `mountModelVolume: true`
79 | 
80 | 
81 | #### Example Deployment Snippet
82 | 
83 | ```yaml
84 | volumes:
85 | - name: model-storage
86 |   persistentVolumeClaim:
87 |     claimName: granite-pvc
88 |     readOnly: true
89 | containers:
90 | - name: vllm
91 |   volumeMounts:
92 |   - mountPath: /model-cache
93 |     name: model-storage
94 | ```
95 | 
96 | #### Template variables
97 | 
98 | Various template variables are exposed as a result of using the `"pvc://"` prefix, with `.MountedModelPath` being particularly useful if vLLM arguments require it.
99 | 
100 | - `{{ .MountedModelPath }}`: this is equal to `/model-cache/<path-to-model>`, where `<path-to-model>` comes from the URI. In the above example, `{{ .MountedModelPath }}` interpolates to `/model-cache/path/to/granite`
101 | 
102 | ### 3. Loading the model from an image volume
103 | 
104 | Not yet implemented.
--------------------------------------------------------------------------------
/docs/userguide/model-name.md:
--------------------------------------------------------------------------------
1 | # Model Name
2 | 
3 | The `modelName` field under the `routing` section of a `ModelService` specifies how clients refer to a model during inference. This name is used by OpenAI-compatible APIs and must be **globally unique** across all `ModelService` resources in the cluster, unless they target different gateways.
4 | 
5 | ## Purpose
6 | 
7 | This field acts as the public-facing identifier for the model. When an inference client sends a request, it includes this name in the "model" field of the API request body.
8 | 
9 | ### Client request
10 | 
11 | ```json
12 | {
13 |   "model": "facebook/opt-125m",
14 |   "prompt": "What is the capital of France?"
15 | }
16 | ```
17 | 
18 | ### ModelService configuration
19 | 
20 | ```yaml
21 | spec:
22 |   routing:
23 |     modelName: facebook/opt-125m
24 | ```
25 | 
26 | The gateway ensures that each model name maps to one and only one live base model across the cluster.
27 | 
28 | ## Conflict Resolution
29 | 
30 | If multiple `ModelService` resources attempt to register the same `modelName`, the controller selects a single owner by applying the following rules, falling back to an arbitrary choice if the rules are inconclusive.
31 | 
32 | * The oldest resource (based on creation timestamp) is retained as the valid owner.
33 | 
34 | * The newer conflicting resource will:
35 | 
36 |   * Have its inference model marked as not ready.
37 | 
38 |   * Emit an appropriate status error indicating the conflict.
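
For illustration, the two hypothetical resources below collide on `modelName`; assuming `team-a-opt` was created first, it is retained as the owner and `team-b-opt` surfaces the conflict in its status. Other required fields are omitted for brevity.

```yaml
# created first; retained as the valid owner
apiVersion: llm-d.ai/v1alpha1
kind: ModelService
metadata:
  name: team-a-opt
  namespace: team-a
spec:
  routing:
    modelName: facebook/opt-125m
---
# created later; its inference model is marked not ready and a
# conflict error is reported in its status
apiVersion: llm-d.ai/v1alpha1
kind: ModelService
metadata:
  name: team-b-opt
  namespace: team-b
spec:
  routing:
    modelName: facebook/opt-125m
```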
39 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/llm-d/llm-d-model-service 2 | 3 | go 1.24.0 4 | 5 | toolchain go1.24.2 6 | 7 | godebug default=go1.23 8 | 9 | require ( 10 | github.com/onsi/ginkgo/v2 v2.23.3 11 | github.com/onsi/gomega v1.37.0 12 | k8s.io/api v0.33.0 13 | k8s.io/apimachinery v0.33.0 14 | k8s.io/client-go v0.33.0 15 | sigs.k8s.io/controller-runtime v0.20.4 16 | ) 17 | 18 | require ( 19 | dario.cat/mergo v1.0.1 20 | github.com/Masterminds/sprig/v3 v3.3.0 21 | github.com/stretchr/testify v1.10.0 22 | sigs.k8s.io/gateway-api v1.3.0 23 | sigs.k8s.io/yaml v1.4.0 24 | ) 25 | 26 | require ( 27 | github.com/Masterminds/goutils v1.1.1 // indirect 28 | github.com/Masterminds/semver/v3 v3.3.0 // indirect 29 | github.com/huandu/xstrings v1.5.0 // indirect 30 | github.com/mitchellh/copystructure v1.2.0 // indirect 31 | github.com/mitchellh/reflectwalk v1.0.2 // indirect 32 | github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect 33 | github.com/shopspring/decimal v1.4.0 // indirect 34 | github.com/spf13/cast v1.7.0 // indirect 35 | golang.org/x/crypto v0.38.0 // indirect 36 | sigs.k8s.io/randfill v1.0.0 // indirect 37 | ) 38 | 39 | require ( 40 | cel.dev/expr v0.19.1 // indirect 41 | github.com/antlr4-go/antlr/v4 v4.13.0 // indirect 42 | github.com/beorn7/perks v1.0.1 // indirect 43 | github.com/blang/semver/v4 v4.0.0 // indirect 44 | github.com/cenkalti/backoff/v4 v4.3.0 // indirect 45 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 46 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 47 | github.com/emicklei/go-restful/v3 v3.12.0 // indirect 48 | github.com/evanphx/json-patch/v5 v5.9.11 // indirect 49 | github.com/felixge/httpsnoop v1.0.4 // indirect 50 | github.com/fsnotify/fsnotify v1.7.0 // indirect 51 | github.com/fxamacker/cbor/v2 v2.8.0 // indirect 52 | github.com/go-logr/logr v1.4.2 53 | github.com/go-logr/stdr v1.2.2 // indirect 54 | github.com/go-logr/zapr v1.3.0 // indirect 55 | github.com/go-openapi/jsonpointer v0.21.0 // indirect 56 | github.com/go-openapi/jsonreference v0.21.0 // indirect 57 | github.com/go-openapi/swag v0.23.0 // indirect 58 | github.com/go-task/slim-sprig/v3 v3.0.0 // indirect 59 | github.com/gogo/protobuf v1.3.2 // indirect 60 | github.com/google/btree v1.1.3 // indirect 61 | github.com/google/cel-go v0.23.2 // indirect 62 | github.com/google/gnostic-models v0.6.9 // indirect 63 | github.com/google/go-cmp v0.7.0 // indirect 64 | github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect 65 | github.com/google/uuid v1.6.0 // indirect 66 | github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 // indirect 67 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 68 | github.com/josharian/intern v1.0.0 // indirect 69 | github.com/json-iterator/go v1.1.12 // indirect 70 | github.com/mailru/easyjson v0.7.7 // indirect 71 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 72 | github.com/modern-go/reflect2 v1.0.2 // indirect 73 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 74 | github.com/pkg/errors v0.9.1 // indirect 75 | github.com/prometheus/client_golang v1.22.0 // indirect 76 | github.com/prometheus/client_model v0.6.1 // indirect 77 | github.com/prometheus/common v0.63.0 // indirect 78 | github.com/prometheus/procfs v0.15.1 // indirect 79 | github.com/spf13/cobra v1.9.1 80 | 
github.com/spf13/pflag v1.0.6 // indirect 81 | github.com/stoewer/go-strcase v1.3.0 // indirect 82 | github.com/x448/float16 v0.8.4 // indirect 83 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect 84 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect 85 | go.opentelemetry.io/otel v1.34.0 // indirect 86 | go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect 87 | go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 // indirect 88 | go.opentelemetry.io/otel/metric v1.34.0 // indirect 89 | go.opentelemetry.io/otel/sdk v1.34.0 // indirect 90 | go.opentelemetry.io/otel/trace v1.34.0 // indirect 91 | go.opentelemetry.io/proto/otlp v1.4.0 // indirect 92 | go.uber.org/multierr v1.11.0 // indirect 93 | go.uber.org/zap v1.27.0 94 | golang.org/x/exp v0.0.0-20250408133849-7e4ce0ab07d0 // indirect 95 | golang.org/x/net v0.40.0 // indirect 96 | golang.org/x/oauth2 v0.27.0 // indirect 97 | golang.org/x/sync v0.14.0 // indirect 98 | golang.org/x/sys v0.33.0 // indirect 99 | golang.org/x/term v0.32.0 // indirect 100 | golang.org/x/text v0.25.0 // indirect 101 | golang.org/x/time v0.9.0 // indirect 102 | golang.org/x/tools v0.33.0 // indirect 103 | gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect 104 | google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 // indirect 105 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f // indirect 106 | google.golang.org/grpc v1.71.1 // indirect 107 | google.golang.org/protobuf v1.36.6 // indirect 108 | gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect 109 | gopkg.in/inf.v0 v0.9.1 // indirect 110 | gopkg.in/yaml.v3 v3.0.1 // indirect 111 | k8s.io/apiextensions-apiserver v0.33.0 // indirect 112 | k8s.io/apiserver v0.33.0 // indirect 113 | k8s.io/component-base v0.33.0 // indirect 114 | k8s.io/klog/v2 v2.130.1 // indirect 115 | k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect 116 | k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 117 | sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect 118 | sigs.k8s.io/gateway-api-inference-extension v0.3.0 119 | sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect 120 | sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect 121 | ) 122 | -------------------------------------------------------------------------------- /hack/boilerplate.go.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llm-d/llm-d-model-service/55eb18c4f08116ed8c9211f643b356ec5e47b4b7/hack/boilerplate.go.txt -------------------------------------------------------------------------------- /hooks/pre-commit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # echo "▶️ Running lint…" 5 | # make lint 6 | 7 | # echo "▶️ Running tests…" 8 | # make test 9 | 10 | echo "✔️ All checks passed!" 11 | -------------------------------------------------------------------------------- /internal/controller/accelerator_types.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/llm-d/llm-d-model-service/api/v1alpha1" 7 | corev1 "k8s.io/api/core/v1" 8 | ) 9 | 10 | // ToNodeAffinity generates a NodeAffinity rule that requires nodes to match 11 | // the specified accelerator label key and one of the allowed values. 
12 | // 13 | // Returns an error if LabelKey is empty or LabelValues is empty. 14 | func AcceleratorTypesToNodeAffinity(a *v1alpha1.AcceleratorTypes) (*corev1.NodeAffinity, error) { 15 | if a == nil { 16 | return nil, nil 17 | } 18 | if a.LabelKey == "" { 19 | return nil, fmt.Errorf("LabelKey must not be empty") 20 | } 21 | if len(a.LabelValues) == 0 { 22 | return nil, fmt.Errorf("LabelValues must contain at least one value") 23 | } 24 | 25 | // Construct the node affinity rule 26 | nodeAffinity := &corev1.NodeAffinity{ 27 | RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ 28 | NodeSelectorTerms: []corev1.NodeSelectorTerm{ 29 | { 30 | MatchExpressions: []corev1.NodeSelectorRequirement{ 31 | { 32 | Key: a.LabelKey, 33 | Operator: corev1.NodeSelectorOpIn, 34 | Values: a.LabelValues, 35 | }, 36 | }, 37 | }, 38 | }, 39 | }, 40 | } 41 | 42 | return nodeAffinity, nil 43 | } 44 | -------------------------------------------------------------------------------- /internal/controller/accelerator_types_test.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/llm-d/llm-d-model-service/api/v1alpha1" 7 | "github.com/stretchr/testify/assert" 8 | 9 | corev1 "k8s.io/api/core/v1" 10 | ) 11 | 12 | func TestToNodeAffinity(t *testing.T) { 13 | tests := []struct { 14 | name string 15 | accelerator v1alpha1.AcceleratorTypes 16 | expectError bool 17 | }{ 18 | // valid label key and values 19 | { 20 | name: "valid accelerator", 21 | accelerator: v1alpha1.AcceleratorTypes{ 22 | LabelKey: "nvidia.com/gpu.product", 23 | LabelValues: []string{"A100", "H100"}, 24 | }, 25 | expectError: false, 26 | }, 27 | // missing LabelKey 28 | { 29 | name: "missing label key", 30 | accelerator: v1alpha1.AcceleratorTypes{ 31 | LabelKey: "", 32 | LabelValues: []string{"A100"}, 33 | }, 34 | expectError: true, 35 | }, 36 | // empty LabelValues slice 37 | { 38 | name: "empty label values", 39 | accelerator: v1alpha1.AcceleratorTypes{ 40 | LabelKey: "nvidia.com/gpu.product", 41 | LabelValues: []string{}, 42 | }, 43 | expectError: true, 44 | }, 45 | } 46 | 47 | for _, tt := range tests { 48 | t.Run(tt.name, func(t *testing.T) { 49 | nodeAffinity, err := AcceleratorTypesToNodeAffinity(&tt.accelerator) 50 | 51 | if tt.expectError { 52 | assert.Error(t, err, "expected error but got none") 53 | assert.Nil(t, nodeAffinity) 54 | } else { 55 | assert.NoError(t, err) 56 | assert.NotNil(t, nodeAffinity) 57 | assert.NotNil(t, nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution) 58 | 59 | // Validate NodeSelectorTerm with correct MatchExpression 60 | terms := nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms 61 | assert.Len(t, terms, 1) 62 | assert.Len(t, terms[0].MatchExpressions, 1) 63 | 64 | expr := terms[0].MatchExpressions[0] 65 | assert.Equal(t, corev1.NodeSelectorOpIn, expr.Operator) 66 | assert.Equal(t, tt.accelerator.LabelKey, expr.Key) 67 | assert.ElementsMatch(t, tt.accelerator.LabelValues, expr.Values) 68 | } 69 | }) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /internal/controller/child_resources_test.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | "k8s.io/apimachinery/pkg/api/errors" 8 | 9 | msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1" 10 | . "github.com/onsi/ginkgo/v2" 11 | . 
"github.com/onsi/gomega" 12 | appsv1 "k8s.io/api/apps/v1" 13 | corev1 "k8s.io/api/core/v1" 14 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 15 | "sigs.k8s.io/yaml" 16 | ) 17 | 18 | // tests to check if base config reading works ok 19 | var _ = Describe("BaseConfig reader", func() { 20 | var ( 21 | ctx context.Context 22 | reconciler *ModelServiceReconciler 23 | msvc *msv1alpha1.ModelService 24 | cm *corev1.ConfigMap 25 | replicas = int32(1) 26 | ) 27 | 28 | BeforeEach(func() { 29 | ctx = context.Background() 30 | 31 | // Create test deployment YAML 32 | deployment := appsv1.Deployment{ 33 | Spec: appsv1.DeploymentSpec{ 34 | Replicas: &replicas, 35 | }, 36 | } 37 | deployYaml, err := yaml.Marshal(deployment) 38 | Expect(err).To(BeNil()) 39 | 40 | // Create ConfigMap with a deployment inside 41 | cm = &corev1.ConfigMap{ 42 | ObjectMeta: metav1.ObjectMeta{ 43 | Name: "test-base-config", 44 | Namespace: "default", 45 | }, 46 | Data: map[string]string{ 47 | "eppDeployment": string(deployYaml), 48 | }, 49 | } 50 | 51 | // Create ModelService referencing the ConfigMap 52 | msvc = &msv1alpha1.ModelService{ 53 | ObjectMeta: metav1.ObjectMeta{ 54 | Name: "test-modelservice", 55 | Namespace: "default", 56 | }, 57 | Spec: msv1alpha1.ModelServiceSpec{ 58 | BaseConfigMapRef: &corev1.ObjectReference{ 59 | Name: "test-base-config", 60 | }, 61 | ModelArtifacts: msv1alpha1.ModelArtifacts{ 62 | URI: "hf://facebook/opt-125m", 63 | }, 64 | }, 65 | } 66 | 67 | By("Creating the base config cm") 68 | Expect(k8sClient.Create(ctx, cm)).To(Succeed()) 69 | 70 | By("Creating the msvc") 71 | Expect(k8sClient.Create(ctx, msvc)).To(Succeed()) 72 | 73 | reconciler = &ModelServiceReconciler{ 74 | Client: k8sClient, 75 | Scheme: k8sClient.Scheme(), 76 | } 77 | }) 78 | 79 | It("should correctly deserialize the eppDeployment from ConfigMap", func() { 80 | bc, err := reconciler.getChildResourcesFromConfigMap(ctx, msvc) 81 | Expect(err).To(BeNil()) 82 | Expect(bc).ToNot(BeNil()) 83 | Expect(bc.EPPDeployment).ToNot(BeNil()) 84 | Expect(bc.EPPDeployment.Spec.Replicas).ToNot(BeNil()) 85 | Expect(*bc.EPPDeployment.Spec.Replicas).To(Equal(int32(1))) 86 | }) 87 | 88 | It("should continue to correctly deserialize the eppDeployment from ConfigMap with pvc prefix", func() { 89 | msvc.Spec.ModelArtifacts.URI = "pvc://my-pvc/path/to/opt-125m" 90 | bc, err := reconciler.getChildResourcesFromConfigMap(ctx, msvc) 91 | Expect(err).To(BeNil()) 92 | Expect(bc).ToNot(BeNil()) 93 | Expect(bc.EPPDeployment).ToNot(BeNil()) 94 | Expect(bc.EPPDeployment.Spec.Replicas).ToNot(BeNil()) 95 | Expect(*bc.EPPDeployment.Spec.Replicas).To(Equal(int32(1))) 96 | }) 97 | 98 | It("should return nil if configmap ref is missing", func() { 99 | msvc.Spec.BaseConfigMapRef = nil 100 | bc, err := reconciler.getChildResourcesFromConfigMap(ctx, msvc) 101 | Expect(err).To(BeNil()) 102 | Expect(bc.PrefillDeployment).To(BeNil()) 103 | Expect(bc.DecodeDeployment).To(BeNil()) 104 | Expect(bc.PrefillService).To(BeNil()) 105 | Expect(bc.DecodeService).To(BeNil()) 106 | Expect(bc.InferencePool).To(BeNil()) 107 | Expect(bc.InferenceModel).To(BeNil()) 108 | Expect(bc.EPPDeployment).To(BeNil()) 109 | Expect(bc.EPPService).To(BeNil()) 110 | }) 111 | 112 | It("should error if the ConfigMap is missing", func() { 113 | msvc.Spec.BaseConfigMapRef.Name = "doesnotexist" 114 | bc, err := reconciler.getChildResourcesFromConfigMap(ctx, msvc) 115 | Expect(err).To(HaveOccurred()) 116 | Expect(bc).To(BeNil()) 117 | }) 118 | 119 | AfterEach(func() { 120 | // Clean up resources after each test 
121 | err := k8sClient.Delete(ctx, msvc) 122 | if err != nil && !errors.IsNotFound(err) { 123 | Fail(fmt.Sprintf("Failed to delete ModelService: %v", err)) 124 | } 125 | 126 | err = k8sClient.Delete(ctx, cm) 127 | if err != nil && !errors.IsNotFound(err) { 128 | Fail(fmt.Sprintf("Failed to delete ConfigMap: %v", err)) 129 | } 130 | }) 131 | }) 132 | 133 | // tests to check if templating works ok 134 | var _ = Describe("BaseConfig reader", func() { 135 | var ( 136 | ctx context.Context 137 | reconciler *ModelServiceReconciler 138 | msvc *msv1alpha1.ModelService 139 | cm *corev1.ConfigMap 140 | ) 141 | 142 | BeforeEach(func() { 143 | ctx = context.Background() 144 | 145 | // Be careful that there are no TAB characters in the string 146 | deployYamlStr := `metadata: 147 | name: mvsc-prefill 148 | spec: 149 | template: 150 | spec: 151 | containers: 152 | - name: vllm 153 | command: 154 | - vllm 155 | - serve 156 | args: 157 | - '{{ .HFModelName }}' 158 | ports: 159 | - containerPort: {{ "portName" | getPort }} 160 | ` 161 | 162 | // Create ConfigMap with a deployment inside 163 | cm = &corev1.ConfigMap{ 164 | ObjectMeta: metav1.ObjectMeta{ 165 | Name: "test-base-config", 166 | Namespace: "default", 167 | }, 168 | Data: map[string]string{ 169 | "prefillDeployment": string(deployYamlStr), 170 | }, 171 | } 172 | 173 | // Create ModelService referencing the ConfigMap 174 | msvc = &msv1alpha1.ModelService{ 175 | ObjectMeta: metav1.ObjectMeta{ 176 | Name: "test-modelservice", 177 | Namespace: "default", 178 | }, 179 | Spec: msv1alpha1.ModelServiceSpec{ 180 | BaseConfigMapRef: &corev1.ObjectReference{ 181 | Name: "test-base-config", 182 | }, 183 | ModelArtifacts: msv1alpha1.ModelArtifacts{ 184 | URI: "hf://facebook/opt-125m", 185 | }, 186 | Routing: msv1alpha1.Routing{ 187 | Ports: []msv1alpha1.Port{ 188 | { 189 | Name: "portName", 190 | Port: 9999, 191 | }, 192 | }, 193 | }, 194 | }, 195 | } 196 | 197 | By("Creating the base config cm") 198 | Expect(k8sClient.Create(ctx, cm)).To(Succeed()) 199 | 200 | By("Creating the msvc") 201 | Expect(k8sClient.Create(ctx, msvc)).To(Succeed()) 202 | 203 | reconciler = &ModelServiceReconciler{ 204 | Client: k8sClient, 205 | Scheme: k8sClient.Scheme(), 206 | } 207 | }) 208 | 209 | It("should correctly interpolate container args", func() { 210 | bc, err := reconciler.getChildResourcesFromConfigMap(ctx, msvc) 211 | Expect(err).To(BeNil()) 212 | Expect(bc).ToNot(BeNil()) 213 | Expect(bc.PrefillDeployment).ToNot(BeNil()) 214 | Expect(bc.PrefillDeployment.Spec.Template.Spec.Containers).ToNot(BeNil()) 215 | c := bc.PrefillDeployment.Spec.Template.Spec.Containers[0] 216 | Expect(c.Args).ToNot(BeNil()) 217 | Expect(c.Args[0]).To(Equal("facebook/opt-125m")) 218 | }) 219 | 220 | It("should correctly interpolate containerPort", func() { 221 | bc, err := reconciler.getChildResourcesFromConfigMap(ctx, msvc) 222 | Expect(err).To(BeNil()) 223 | Expect(bc).ToNot(BeNil()) 224 | Expect(bc.PrefillDeployment).ToNot(BeNil()) 225 | Expect(bc.PrefillDeployment.Spec.Template.Spec.Containers).ToNot(BeNil()) 226 | c := bc.PrefillDeployment.Spec.Template.Spec.Containers[0] 227 | Expect(c.Ports).ToNot(BeEmpty()) 228 | Expect(c.Ports[0].ContainerPort).To(Equal(int32(9999))) 229 | }) 230 | 231 | AfterEach(func() { 232 | // Clean up resources after each test 233 | err := k8sClient.Delete(ctx, msvc) 234 | if err != nil && !errors.IsNotFound(err) { 235 | Fail(fmt.Sprintf("Failed to delete ModelService: %v", err)) 236 | } 237 | 238 | err = k8sClient.Delete(ctx, cm) 239 | if err != nil && 
!errors.IsNotFound(err) { 240 | Fail(fmt.Sprintf("Failed to delete ConfigMap: %v", err)) 241 | } 242 | }) 243 | }) 244 | -------------------------------------------------------------------------------- /internal/controller/constants.go: -------------------------------------------------------------------------------- 1 | /* 2 | Constants for utils 3 | */ 4 | 5 | package controller 6 | 7 | const modelStorageVolumeName = "model-storage" 8 | const modelStorageRoot = "/model-cache" 9 | const pathSep = "/" 10 | const DECODE_ROLE = "decode" 11 | const PREFILL_ROLE = "prefill" 12 | const MODEL_ARTIFACT_URI_PVC = "pvc" 13 | const MODEL_ARTIFACT_URI_HF = "hf" 14 | const MODEL_ARTIFACT_URI_OCI = "oci" 15 | const MODEL_ARTIFACT_URI_PVC_PREFIX = MODEL_ARTIFACT_URI_PVC + "://" 16 | const MODEL_ARTIFACT_URI_HF_PREFIX = MODEL_ARTIFACT_URI_HF + "://" 17 | const MODEL_ARTIFACT_URI_OCI_PREFIX = MODEL_ARTIFACT_URI_OCI + "://" 18 | const ENV_HF_HOME = "HF_HOME" 19 | const ENV_HF_TOKEN = "HF_TOKEN" 20 | 21 | type URIType string 22 | 23 | const ( 24 | PVC URIType = "pvc" 25 | HF URIType = "hf" 26 | OCI URIType = "oci" 27 | UnknownURI URIType = "unknown" 28 | ) 29 | -------------------------------------------------------------------------------- /internal/controller/merge_transformers.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "reflect" 5 | "slices" 6 | 7 | "dario.cat/mergo" 8 | corev1 "k8s.io/api/core/v1" 9 | gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" 10 | ) 11 | 12 | // convertToGenericSlice returns a slice where each item in the slice 13 | // is converted to a T object from each item in reflect.Value 14 | func convertToGenericSlice[T any](val reflect.Value) []T { 15 | if val.Kind() == reflect.Ptr { 16 | val = val.Elem() 17 | } 18 | if val.Kind() != reflect.Slice { 19 | return nil 20 | } 21 | 22 | var result []T 23 | for i := 0; i < val.Len(); i++ { 24 | item := val.Index(i).Interface() 25 | tItem, ok := item.(T) 26 | if !ok { 27 | return nil 28 | } 29 | result = append(result, tItem) 30 | } 31 | return result 32 | } 33 | 34 | // mergeKeyValue returns the value given the name of the field in that struct 35 | // for example, 36 | // myEnvVar := corev1.EnvVar{"Name": "env-var"} 37 | // mergeKeyValue(myEnvVar, "Name") returns "env-var" 38 | func mergeKeyValue[T any](obj T, fieldName string) string { 39 | return reflect.ValueOf(obj).FieldByName(fieldName).String() 40 | } 41 | 42 | // genericSliceTransformer merges two slices of the same type T 43 | // mergeFunc is the function that contains logic for merging two T objects 44 | // mergeKey is the name of the field in T, so that if dst.MergeKey == src.MergeKey, 45 | // the mergeFunc is called on those two objects. 
Otherwise, the src is appended
46 | // for now, only string fields are supported for mergeKey
47 | // (since we cannot guarantee equality for generic reflect.Value)
48 | // mergeFunc takes in
49 | // - dst (pointer): so that in-place merge can happen
50 | // - src: the src object to merge into dst
51 | func genericSliceTransformer[T any](
52 | 	typ reflect.Type,
53 | 	mergeFunc func(dst *T, src T) error,
54 | 	mergeKey string) func(dst, src reflect.Value) error {
55 | 
56 | 	if typ == reflect.TypeOf([]T{}) {
57 | 		return func(dst, src reflect.Value) error {
58 | 
59 | 			// Reject transforming anything other than slices
60 | 			if dst.Kind() != reflect.Slice || src.Kind() != reflect.Slice {
61 | 				return nil
62 | 			}
63 | 
64 | 			srcSlice := convertToGenericSlice[T](src)
65 | 			dstSlice := convertToGenericSlice[T](dst)
66 | 
67 | 			// keep track of the common mergeKeys among src and dst
68 | 			srcMergeKeyMap := map[string]T{}
69 | 			commonMergeKeys := []string{} // TODO: maybe mergeKey can be another generic type?
70 | 
71 | 			for _, srcObj := range srcSlice {
72 | 				mergeKeyValue := mergeKeyValue(srcObj, mergeKey)
73 | 				srcMergeKeyMap[mergeKeyValue] = srcObj
74 | 			}
75 | 
76 | 			for _, dstObj := range dstSlice {
77 | 				mergeKeyValue := mergeKeyValue(dstObj, mergeKey)
78 | 				if _, found := srcMergeKeyMap[mergeKeyValue]; found {
79 | 					commonMergeKeys = append(commonMergeKeys, mergeKeyValue)
80 | 				}
81 | 			}
82 | 
83 | 			// now loop over dstSlice and see if there is a srcObj with same mergeKey value in src
84 | 			for i, dstObj := range dstSlice {
85 | 
86 | 				dstMergeKeyValue := mergeKeyValue(dstObj, mergeKey)
87 | 
88 | 				// Found a matching srcObj with same mergeKey value
89 | 				if srcObj, found := srcMergeKeyMap[dstMergeKeyValue]; found {
90 | 
91 | 					// Call mergeFunc, which contains the logic for merging two T structs in the slice
92 | 					err := mergeFunc(&dstObj, srcObj)
93 | 
94 | 					if err != nil {
95 | 						return err
96 | 					}
97 | 
98 | 					// Update dstObj in dstSlice if merge was successful
99 | 					dstSlice[i] = dstObj
100 | 				}
101 | 			}
102 | 
103 | 			// Construct the mergedSlice combining both src and dst
104 | 			mergedSlice := []T{}
105 | 
106 | 			// mergedSlice contains everything already present in dst to begin with,
107 | 			// with the common T objects already merged from src
108 | 			mergedSlice = append(mergedSlice, dstSlice...)
109 | 110 | // append other src objects that weren't merged and skip the ones that are common 111 | for _, srcObj := range srcSlice { 112 | mergeKeyValue := mergeKeyValue(srcObj, mergeKey) 113 | if !slices.Contains(commonMergeKeys, mergeKeyValue) { 114 | mergedSlice = append(mergedSlice, srcObj) 115 | } 116 | } 117 | 118 | // Now rewrite dst with mergedSlice 119 | dst.Set(reflect.ValueOf(mergedSlice)) 120 | return nil 121 | } 122 | } 123 | return nil 124 | } 125 | 126 | // envVarSliceTransformer: transformer for merging two EnvVars 127 | type envVarSliceTransformer struct{} 128 | 129 | // Transformer for []corev1.Env 130 | func (e envVarSliceTransformer) Transformer(typ reflect.Type) func(dst, src reflect.Value) error { 131 | 132 | // mergeKey for merging two EnvVars is the Name of the EnvVar 133 | mergeKey := "Name" 134 | mergeFunc := func(dst *corev1.EnvVar, src corev1.EnvVar) error { 135 | return mergo.Merge(dst, src, mergo.WithOverride) 136 | } 137 | 138 | return genericSliceTransformer(typ, mergeFunc, mergeKey) 139 | } 140 | 141 | // stringSlicePrependTransformer: transformer for merging two string slices 142 | type stringSlicePrependTransformer struct{} 143 | 144 | // Transformer for []string, such as Container.Args so that src args get prepended, not appended 145 | func (stringSlicePrependTransformer) Transformer(t reflect.Type) func(dst, src reflect.Value) error { 146 | if t.Kind() == reflect.Slice && t.Elem().Kind() == reflect.String { 147 | return func(dst, src reflect.Value) error { 148 | // Ensure dst is settable 149 | if !dst.CanSet() { 150 | return nil 151 | } 152 | if src.IsNil() || src.Len() == 0 { 153 | return nil 154 | } 155 | 156 | // Combine: src first, then dst 157 | merged := reflect.AppendSlice(src, dst) 158 | dst.Set(merged) 159 | return nil 160 | } 161 | } 162 | return nil 163 | } 164 | 165 | // compositeTransformer is a list of transformers to apply in a single mergo.Merge call 166 | type compositeTransformer struct { 167 | transformers []mergo.Transformers 168 | } 169 | 170 | // Transformer takes in a list of Transformers and applies them one by one 171 | func (ct compositeTransformer) Transformer(t reflect.Type) func(dst, src reflect.Value) error { 172 | for _, tr := range ct.transformers { 173 | if fn := tr.Transformer(t); fn != nil { 174 | return fn 175 | } 176 | } 177 | return nil 178 | } 179 | 180 | // containerSliceTransformer: transformer for merging two Containers 181 | type containerSliceTransformer struct{} 182 | 183 | // Transformer merges two []corev1.Container based on their Name, 184 | // and applies transformers for each Container.Spec fields 185 | func (c containerSliceTransformer) Transformer(typ reflect.Type) func(dst, src reflect.Value) error { 186 | 187 | // mergeKey for merging two Containers is the Name of the Container 188 | mergeKey := "Name" 189 | 190 | // dstContainer (comes from baseconfig) 191 | // srcContainer (comes from msvc and controller logic) 192 | mergeFunc := func(dstContainer *corev1.Container, srcContainer corev1.Container) error { 193 | 194 | // Command should be completely overriden, not appended 195 | if len(srcContainer.Command) > 0 { 196 | dstContainer.Command = []string{} 197 | } 198 | 199 | err := mergo.Merge(dstContainer, 200 | srcContainer, 201 | mergo.WithAppendSlice, 202 | mergo.WithOverride, 203 | mergo.WithTransformers(compositeTransformer{ 204 | transformers: []mergo.Transformers{ 205 | envVarSliceTransformer{}, 206 | stringSlicePrependTransformer{}, 207 | }, 208 | }), 209 | ) 210 | 211 | if err != nil { 212 | 
return err
213 | 		}
214 | 
215 | 		return nil
216 | 	}
217 | 
218 | 	return genericSliceTransformer(typ, mergeFunc, mergeKey)
219 | }
220 | 
221 | // parentRefSliceTransformer: transformer for merging two ParentReference objects
222 | type parentRefSliceTransformer struct{}
223 | 
224 | // Transformer merges two []gatewayv1.ParentReference based on their Name,
225 | // overriding fields of the matching dst entry with those from src
226 | func (c parentRefSliceTransformer) Transformer(typ reflect.Type) func(dst, src reflect.Value) error {
227 | 
228 | 	// mergeKey for merging two ParentReference is the Name of the ParentReference
229 | 	mergeKey := "Name"
230 | 
231 | 	// dstParentReference (comes from baseconfig)
232 | 	// srcParentReference (comes from msvc and controller logic)
233 | 	mergeFunc := func(dstParentReference *gatewayv1.ParentReference, srcParentReference gatewayv1.ParentReference) error {
234 | 
235 | 		err := mergo.Merge(dstParentReference,
236 | 			srcParentReference,
237 | 			mergo.WithAppendSlice,
238 | 			mergo.WithOverride)
239 | 
240 | 		if err != nil {
241 | 			return err
242 | 		}
243 | 
244 | 		return nil
245 | 	}
246 | 
247 | 	return genericSliceTransformer(typ, mergeFunc, mergeKey)
248 | }
249 | 
250 | // backendRefTransformer: transformer for merging two BackendRef objects
251 | type backendRefTransformer struct{}
252 | 
253 | // Transformer merges two []gatewayv1.BackendRef based on their Name,
254 | // overriding fields of the matching dst entry with those from src
255 | func (c backendRefTransformer) Transformer(typ reflect.Type) func(dst, src reflect.Value) error {
256 | 
257 | 	// mergeKey for merging two BackendRef is the Name of the BackendRef
258 | 	mergeKey := "Name"
259 | 
260 | 	// dstBackendRef (comes from baseconfig)
261 | 	// srcBackendRef (comes from msvc and controller logic)
262 | 	mergeFunc := func(dstBackendRef *gatewayv1.BackendRef, srcBackendRef gatewayv1.BackendRef) error {
263 | 
264 | 		err := mergo.Merge(dstBackendRef,
265 | 			srcBackendRef,
266 | 			mergo.WithAppendSlice,
267 | 			mergo.WithOverride)
268 | 
269 | 		if err != nil {
270 | 			return err
271 | 		}
272 | 
273 | 		return nil
274 | 	}
275 | 
276 | 	return genericSliceTransformer(typ, mergeFunc, mergeKey)
277 | }
278 | 
279 | // MergeContainerSlices merges src slice into dest in place
280 | func MergeContainerSlices(dest, src []corev1.Container) ([]corev1.Container, error) {
281 | 	err := mergo.Merge(&dest, src, mergo.WithTransformers(containerSliceTransformer{}))
282 | 
283 | 	if err != nil {
284 | 		return []corev1.Container{}, err
285 | 	}
286 | 
287 | 	return dest, err
288 | }
289 | 
290 | // MergeGatewayRefSlices merges src slice containing gatewayv1.ParentRefs into dest in place
291 | func MergeGatewayRefSlices(dest, src []gatewayv1.ParentReference) ([]gatewayv1.ParentReference, error) {
292 | 	err := mergo.Merge(&dest, src, mergo.WithTransformers(parentRefSliceTransformer{}))
293 | 
294 | 	if err != nil {
295 | 		return []gatewayv1.ParentReference{}, err
296 | 	}
297 | 
298 | 	return dest, err
299 | }
300 | 
301 | // MergeBackendRefSlices merges src slice containing gatewayv1.BackendRefs into dest in place
302 | func MergeBackendRefSlices(dest, src []gatewayv1.BackendRef) ([]gatewayv1.BackendRef, error) {
303 | 	err := mergo.Merge(&dest, src, mergo.WithTransformers(backendRefTransformer{}))
304 | 
305 | 	if err != nil {
306 | 		return []gatewayv1.BackendRef{}, err
307 | 	}
308 | 
309 | 	return dest, err
310 | }
311 | 
--------------------------------------------------------------------------------
/internal/controller/suite_test.go:
-------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "context" 5 | "os" 6 | "path/filepath" 7 | "testing" 8 | 9 | . "github.com/onsi/ginkgo/v2" 10 | . "github.com/onsi/gomega" 11 | 12 | "k8s.io/client-go/kubernetes/scheme" 13 | "k8s.io/client-go/rest" 14 | "sigs.k8s.io/controller-runtime/pkg/client" 15 | "sigs.k8s.io/controller-runtime/pkg/envtest" 16 | logf "sigs.k8s.io/controller-runtime/pkg/log" 17 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 18 | 19 | msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1" 20 | appsv1 "k8s.io/api/apps/v1" 21 | corev1 "k8s.io/api/core/v1" 22 | giev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" 23 | gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" 24 | // +kubebuilder:scaffold:imports 25 | ) 26 | 27 | // These tests use Ginkgo (BDD-style Go testing framework). Refer to 28 | // http://onsi.github.io/ginkgo/ to learn more about Ginkgo. 29 | 30 | var ( 31 | ctx context.Context 32 | cancel context.CancelFunc 33 | testEnv *envtest.Environment 34 | cfg *rest.Config 35 | k8sClient client.Client 36 | ) 37 | 38 | func TestControllers(t *testing.T) { 39 | RegisterFailHandler(Fail) 40 | 41 | RunSpecs(t, "Controller Suite") 42 | } 43 | 44 | var _ = BeforeSuite(func() { 45 | logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) 46 | 47 | ctx, cancel = context.WithCancel(context.TODO()) 48 | 49 | var err error 50 | err = msv1alpha1.AddToScheme(scheme.Scheme) 51 | Expect(err).NotTo(HaveOccurred()) 52 | err = gatewayv1.Install(scheme.Scheme) 53 | Expect(err).NotTo(HaveOccurred()) 54 | err = giev1alpha2.Install(scheme.Scheme) 55 | Expect(err).NotTo(HaveOccurred()) 56 | err = corev1.AddToScheme(scheme.Scheme) 57 | Expect(err).NotTo(HaveOccurred()) 58 | err = appsv1.AddToScheme(scheme.Scheme) 59 | Expect(err).NotTo(HaveOccurred()) 60 | 61 | // +kubebuilder:scaffold:scheme 62 | 63 | By("bootstrapping test environment") 64 | testEnv = &envtest.Environment{ 65 | // TODO: This should be made robust, 66 | // if someone runs tests from a subfolder, these may not run 67 | CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases"), filepath.Join("..", "..", "test", "inferenceCRDs")}, 68 | ErrorIfCRDPathMissing: true, 69 | } 70 | 71 | // Retrieve the first found binary directory to allow running tests from IDEs 72 | if getFirstFoundEnvTestBinaryDir() != "" { 73 | testEnv.BinaryAssetsDirectory = getFirstFoundEnvTestBinaryDir() 74 | } 75 | 76 | // cfg is defined in this file globally. 77 | cfg, err = testEnv.Start() 78 | Expect(err).NotTo(HaveOccurred()) 79 | Expect(cfg).NotTo(BeNil()) 80 | 81 | k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) 82 | Expect(err).NotTo(HaveOccurred()) 83 | Expect(k8sClient).NotTo(BeNil()) 84 | }) 85 | 86 | var _ = AfterSuite(func() { 87 | By("tearing down the test environment") 88 | cancel() 89 | err := testEnv.Stop() 90 | Expect(err).NotTo(HaveOccurred()) 91 | }) 92 | 93 | // getFirstFoundEnvTestBinaryDir locates the first binary in the specified path. 94 | // ENVTEST-based tests depend on specific binaries, usually located in paths set by 95 | // controller-runtime. When running tests directly (e.g., via an IDE) without using 96 | // Makefile targets, the 'BinaryAssetsDirectory' must be explicitly configured. 97 | // 98 | // This function streamlines the process by finding the required binaries, similar to 99 | // setting the 'KUBEBUILDER_ASSETS' environment variable. 
To ensure the binaries are 100 | // properly set up, run 'make setup-envtest' beforehand. 101 | func getFirstFoundEnvTestBinaryDir() string { 102 | basePath := filepath.Join("..", "..", "bin", "k8s") 103 | entries, err := os.ReadDir(basePath) 104 | if err != nil { 105 | logf.Log.Error(err, "Failed to read directory", "path", basePath) 106 | return "" 107 | } 108 | for _, entry := range entries { 109 | if entry.IsDir() { 110 | return filepath.Join(basePath, entry.Name()) 111 | } 112 | } 113 | return "" 114 | } 115 | -------------------------------------------------------------------------------- /internal/controller/template.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "text/template" 7 | 8 | sprig "github.com/Masterminds/sprig/v3" 9 | ) 10 | 11 | // registerSprigFunctions to get a new template with sprig functions support 12 | func registerSprigFunctions(tmplStr string, functions *TemplateFuncs) (*template.Template, error) { 13 | // Create a new template and register Sprig functions 14 | tmpl, err := template.New("template"). 15 | Funcs(sprig.TxtFuncMap()). 16 | Funcs(functions.funcMap). 17 | Parse(tmplStr) 18 | if err != nil { 19 | return nil, fmt.Errorf("error parsing template: %w", err) 20 | } 21 | return tmpl, err 22 | } 23 | 24 | // renderTemplate using template vars 25 | func renderTemplate(tmplStr string, vars *TemplateVars, functions *TemplateFuncs) (string, error) { 26 | tmpl, err := registerSprigFunctions(tmplStr, functions) 27 | if err != nil { 28 | return "", err 29 | } 30 | 31 | // Execute the template with the provided struct 32 | var buf bytes.Buffer 33 | if err := tmpl.Execute(&buf, vars); err != nil { 34 | return "", fmt.Errorf("error executing template: %w", err) 35 | } 36 | 37 | return buf.String(), nil 38 | } 39 | -------------------------------------------------------------------------------- /internal/controller/template_test.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "testing" 7 | 8 | msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1" 9 | "github.com/stretchr/testify/assert" 10 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 11 | ) 12 | 13 | const msvcName = "msvc-test" 14 | const msvcNamespace = "default" 15 | const modelName = "modelName" 16 | const sanitizedModelName = "modelname" 17 | const pvcName = "pvc-name" 18 | const modelPath = "path/to/" + modelName 19 | const mountedModelPathInVolume = modelStorageRoot + pathSep + modelPath 20 | const pvcURI = "pvc://" + pvcName + "/" + modelPath 21 | const hfModelName = pvcName + "/" + modelName 22 | const hfURI = "hf://" + hfModelName 23 | const authSecretName = "hf-secret" 24 | 25 | var authSecretNameCopy = authSecretName 26 | var authSecretNamePtr = &authSecretNameCopy // ugly workaround ModelArtifacts.AuthSecretName is *strings 27 | 28 | // returns a minimal valid msvc 29 | func minimalMSVC() *msv1alpha1.ModelService { 30 | return &msv1alpha1.ModelService{ 31 | ObjectMeta: metav1.ObjectMeta{ 32 | Name: msvcName, 33 | Namespace: msvcNamespace, 34 | }, 35 | Spec: msv1alpha1.ModelServiceSpec{ 36 | Routing: msv1alpha1.Routing{ 37 | ModelName: modelName, 38 | }, 39 | ModelArtifacts: msv1alpha1.ModelArtifacts{ 40 | URI: pvcURI, 41 | AuthSecretName: authSecretNamePtr, 42 | }, 43 | }, 44 | } 45 | } 46 | 47 | // createMSVCWithPDSpec creates a minimal msvc with the appropriate decode 48 | func 
createMSVCWithDecode(decodeSpec *msv1alpha1.PDSpec) *msv1alpha1.ModelService { 49 | 50 | minimalMSVC := minimalMSVC() 51 | minimalMSVC.Spec.Decode = decodeSpec 52 | return minimalMSVC 53 | } 54 | 55 | func TestTemplateVars(t *testing.T) { 56 | // Test that each template var can be interpolated in MSVC 57 | 58 | tests := map[string]struct { 59 | expectedValue string 60 | uri string 61 | }{ 62 | "ModelServiceName": { 63 | expectedValue: msvcName, 64 | }, 65 | "ModelServiceNamespace": { 66 | expectedValue: msvcNamespace, 67 | }, 68 | "ModelName": { 69 | expectedValue: modelName, 70 | }, 71 | "HFModelName": { 72 | expectedValue: hfModelName, 73 | uri: hfURI, 74 | }, 75 | "SanitizedModelName": { 76 | expectedValue: sanitizedModelName, 77 | }, 78 | "ModelPath": { 79 | expectedValue: modelPath, 80 | }, 81 | "MountedModelPath": { 82 | expectedValue: mountedModelPathInVolume, 83 | }, 84 | "AuthSecretName": { 85 | expectedValue: authSecretName, 86 | }, 87 | "EPPServiceName": { 88 | expectedValue: msvcName + "-epp-service", 89 | }, 90 | "EPPDeploymentName": { 91 | expectedValue: msvcName + "-epp", 92 | }, 93 | "PrefillDeploymentName": { 94 | expectedValue: msvcName + "-prefill", 95 | }, 96 | "DecodeDeploymentName": { 97 | expectedValue: msvcName + "-decode", 98 | }, 99 | "PrefillServiceName": { 100 | expectedValue: msvcName + "-service-prefill", 101 | }, 102 | "DecodeServiceName": { 103 | expectedValue: msvcName + "-service-decode", 104 | }, 105 | "InferencePoolName": { 106 | expectedValue: msvcName + "-inference-pool", 107 | }, 108 | "InferenceModelName": { 109 | expectedValue: msvcName, 110 | }, 111 | } 112 | 113 | for templateVar, testCase := range tests { 114 | ctx := context.Background() 115 | 116 | minimalMSVC := createMSVCWithDecode(&msv1alpha1.PDSpec{ 117 | ModelServicePodSpec: msv1alpha1.ModelServicePodSpec{ 118 | Containers: []msv1alpha1.ContainerSpec{ 119 | { 120 | Args: []string{ 121 | // This becomes, for example, {{ .ModelService }} 122 | fmt.Sprintf("{{ .%s }}", templateVar), 123 | }, 124 | }, 125 | }, 126 | }, 127 | }) 128 | 129 | if testCase.uri != "" { 130 | minimalMSVC.Spec.ModelArtifacts.URI = testCase.uri 131 | } 132 | 133 | interpolatedMSVC, err := InterpolateModelService(ctx, minimalMSVC) 134 | assert.NoError(t, err, "got error but expected none") 135 | 136 | // Assert that the template var is interpolated and the expected values match 137 | // Check that Args[0] matches 138 | actualValue := interpolatedMSVC.Spec.Decode.Containers[0].Args[0] 139 | assert.Equal(t, testCase.expectedValue, actualValue, fmt.Sprintf("%s should be interpolated", templateVar)) 140 | } 141 | } 142 | 143 | func TestMSVCInterpolation(t *testing.T) { 144 | 145 | tests := []struct { 146 | name string 147 | originalMSVC *msv1alpha1.ModelService 148 | expectedMSVC *msv1alpha1.ModelService 149 | expectError bool 150 | }{ 151 | { 152 | name: "no interpolation required should pass", 153 | originalMSVC: minimalMSVC(), 154 | expectedMSVC: minimalMSVC(), 155 | expectError: false, 156 | }, 157 | { 158 | name: "one interpolation required in args should pass", 159 | originalMSVC: createMSVCWithDecode(&msv1alpha1.PDSpec{ 160 | ModelServicePodSpec: msv1alpha1.ModelServicePodSpec{ 161 | Containers: []msv1alpha1.ContainerSpec{ 162 | { 163 | Args: []string{ 164 | "{{ .ModelPath }}", 165 | }, 166 | }, 167 | }, 168 | }, 169 | }), 170 | expectedMSVC: createMSVCWithDecode(&msv1alpha1.PDSpec{ 171 | ModelServicePodSpec: msv1alpha1.ModelServicePodSpec{ 172 | Containers: []msv1alpha1.ContainerSpec{ 173 | { 174 | Args: []string{ 
175 | modelPath, 176 | }, 177 | }, 178 | }, 179 | }, 180 | }), 181 | expectError: false, 182 | }, 183 | { 184 | name: "1+ interpolation required in args should pass", 185 | originalMSVC: createMSVCWithDecode(&msv1alpha1.PDSpec{ 186 | ModelServicePodSpec: msv1alpha1.ModelServicePodSpec{ 187 | Containers: []msv1alpha1.ContainerSpec{ 188 | { 189 | Args: []string{ 190 | "{{ .ModelPath }}", 191 | "--arg2", 192 | "{{ .DecodeDeploymentName }}", 193 | }, 194 | }, 195 | }, 196 | }, 197 | }), 198 | expectedMSVC: createMSVCWithDecode(&msv1alpha1.PDSpec{ 199 | ModelServicePodSpec: msv1alpha1.ModelServicePodSpec{ 200 | Containers: []msv1alpha1.ContainerSpec{ 201 | { 202 | Args: []string{ 203 | modelPath, 204 | "--arg2", 205 | msvcName + "-decode", 206 | }, 207 | }, 208 | }, 209 | }, 210 | }), 211 | expectError: false, 212 | }, 213 | } 214 | 215 | for _, tt := range tests { 216 | t.Run(tt.name, func(t *testing.T) { 217 | ctx := context.Background() 218 | interpolatedMSVC, err := InterpolateModelService(ctx, tt.originalMSVC) 219 | 220 | if tt.expectError { 221 | assert.Error(t, err, "expected error but got none") 222 | } else { 223 | assert.NoError(t, err) 224 | 225 | // Assert that expected args matches interpolated args 226 | if tt.expectedMSVC.Spec.Decode != nil && interpolatedMSVC.Spec.Decode != nil { 227 | expectedContainers := tt.expectedMSVC.Spec.Decode.Containers 228 | interpolatedContainers := interpolatedMSVC.Spec.Decode.Containers 229 | 230 | assert.Equal(t, len(expectedContainers), len(interpolatedContainers), "container lengths don't match") 231 | 232 | for i := range len(expectedContainers) { 233 | expectedContainer := expectedContainers[i] 234 | interpolatedContainer := interpolatedContainers[i] 235 | 236 | // assert args match 237 | assertEqualSlices(t, expectedContainer.Args, interpolatedContainer.Args) 238 | } 239 | } else if tt.expectedMSVC.Spec.Decode == nil && interpolatedMSVC.Spec.Decode == nil { 240 | // both decode specs are nil, pass 241 | } else { 242 | assert.Fail(t, fmt.Sprintf("decode specs don't match\ngot: %v\nwant:%v", interpolatedMSVC.Spec.Decode, tt.expectedMSVC.Spec.Decode)) 243 | } 244 | 245 | } 246 | }) 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /internal/controller/utils_test.go: -------------------------------------------------------------------------------- 1 | package controller 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "strings" 7 | 8 | . "github.com/onsi/ginkgo/v2" 9 | . 
"github.com/onsi/gomega" 10 | "k8s.io/apimachinery/pkg/api/resource" 11 | 12 | msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1" 13 | ) 14 | 15 | const PVC_NAME = "my-pvc" 16 | const MODEL_PATH = "path/to/model" 17 | const HF_REPO_ID = "ibm-granite" 18 | const HF_MODEL_ID = "granite-3.3-2b-instruct" 19 | 20 | var _ = Describe("Model Artifacts", func() { 21 | Context("Given a model artifact with an invalid URI prefix", func() { 22 | modelArtifact := msv1alpha1.ModelArtifacts{ 23 | URI: fmt.Sprintf("nothing://%s/%s", PVC_NAME, MODEL_PATH), 24 | } 25 | 26 | It("should parse correctly", func() { 27 | By("checking type of uri") 28 | Expect(isPVCURI(modelArtifact.URI)).To(BeFalse()) 29 | Expect(isHFURI(modelArtifact.URI)).To(BeFalse()) 30 | 31 | By("Parsing PVC uri should fail") 32 | _, err := parsePVCURI(&modelArtifact) 33 | Expect(err).NotTo(BeNil()) 34 | 35 | By("Parsing HF uri should fail") 36 | _, _, err = parseHFURI(&modelArtifact) 37 | Expect(err).NotTo(BeNil()) 38 | }) 39 | }) 40 | 41 | Context("Given an URI string", func() { 42 | tests := map[string]struct { 43 | expectedURIType URIType 44 | expectedModelMountPath string 45 | }{ 46 | "pvc://pvc-name/path/to/model": { 47 | expectedURIType: PVC, 48 | expectedModelMountPath: modelStorageRoot + pathSep + "path/to/model", 49 | }, 50 | "oci://repo-with-tag::path/to/model": { 51 | expectedURIType: OCI, 52 | expectedModelMountPath: "", // TODO 53 | }, 54 | "hf://repo-id/model-id": { 55 | expectedURIType: HF, 56 | expectedModelMountPath: modelStorageRoot, 57 | }, 58 | "pvc://pvc-name": { 59 | expectedURIType: PVC, 60 | expectedModelMountPath: "", 61 | }, 62 | "oci://": { 63 | expectedURIType: OCI, 64 | expectedModelMountPath: "", // TODO 65 | }, 66 | "hf://wrong": { 67 | expectedURIType: HF, 68 | expectedModelMountPath: modelStorageRoot, 69 | }, 70 | "random://": { 71 | expectedURIType: UnknownURI, 72 | expectedModelMountPath: "", 73 | }, 74 | "": { 75 | expectedURIType: UnknownURI, 76 | expectedModelMountPath: "", 77 | }, 78 | "PVC://": { 79 | expectedURIType: UnknownURI, 80 | expectedModelMountPath: "", 81 | }, 82 | "HF://": { 83 | expectedURIType: UnknownURI, 84 | expectedModelMountPath: "", 85 | }, 86 | "OCI://": { 87 | expectedURIType: UnknownURI, 88 | expectedModelMountPath: "", 89 | }, 90 | } 91 | 92 | It("should determine the type of the URI correctly", func() { 93 | for uri, answer := range tests { 94 | expectedURIType := answer.expectedURIType 95 | actualURIType := UriType(uri) 96 | Expect(actualURIType).To(Equal(expectedURIType)) 97 | } 98 | }) 99 | 100 | It("should compute the mounted model path correctly", func() { 101 | for uri, answer := range tests { 102 | expectedModelMountPath := answer.expectedModelMountPath 103 | 104 | actualModelMountPath, err := mountedModelPath(&msv1alpha1.ModelService{ 105 | Spec: msv1alpha1.ModelServiceSpec{ 106 | ModelArtifacts: msv1alpha1.ModelArtifacts{ 107 | URI: uri, 108 | }, 109 | }, 110 | }) 111 | 112 | // Expect error if uri type is unknown 113 | if answer.expectedURIType == UnknownURI { 114 | Expect(err).To(HaveOccurred()) 115 | } else { 116 | Expect(err).ToNot(HaveOccurred()) 117 | Expect(actualModelMountPath).To(Equal(expectedModelMountPath)) 118 | } 119 | 120 | } 121 | }) 122 | }) 123 | 124 | Context("Given a model artifact with a valid PVC URI", func() { 125 | ctx := context.Background() 126 | modelArtifact := msv1alpha1.ModelArtifacts{ 127 | URI: fmt.Sprintf("pvc://%s/%s", PVC_NAME, MODEL_PATH), 128 | } 129 | 130 | modelService := msv1alpha1.ModelService{ 131 | Spec: 
msv1alpha1.ModelServiceSpec{ 132 | ModelArtifacts: modelArtifact, 133 | }, 134 | } 135 | 136 | It("should parse correctly", func() { 137 | By("checking type of uri") 138 | Expect(isPVCURI(modelArtifact.URI)).To(BeTrue()) 139 | Expect(isHFURI(modelArtifact.URI)).To(BeFalse()) 140 | Expect(isOCIURI(modelArtifact.URI)).To(BeFalse()) 141 | 142 | By("Parsing uri parts should be successful") 143 | parts, err := parsePVCURI(&modelArtifact) 144 | Expect(err).To(BeNil()) 145 | Expect(len(parts) > 1).To(BeTrue()) 146 | Expect(parts[0]).To(Equal(PVC_NAME)) 147 | Expect(strings.Join(parts[1:], "/")).To(Equal(MODEL_PATH)) 148 | }) 149 | It("should produce a valid volumeMounts list", func() { 150 | volumeMounts := getVolumeMountsForContainer(ctx, &modelService) 151 | Expect(len(volumeMounts)).To(Equal(1)) 152 | firstVolumeMount := volumeMounts[0] 153 | 154 | Expect(firstVolumeMount.Name).To(Equal(modelStorageVolumeName)) 155 | Expect(firstVolumeMount.MountPath).To(Equal(modelStorageRoot)) 156 | Expect(firstVolumeMount.ReadOnly).To(BeTrue()) 157 | }) 158 | It("should produce a valid volumes list", func() { 159 | volumes := getVolumeForPDDeployment(ctx, &modelService) 160 | Expect(len(volumes)).To(Equal(1)) 161 | firstVolume := volumes[0] 162 | Expect(firstVolume.Name).To(Equal(modelStorageVolumeName)) 163 | Expect(firstVolume.PersistentVolumeClaim.ClaimName).To(Equal(PVC_NAME)) 164 | Expect(firstVolume.PersistentVolumeClaim.ReadOnly).To(BeTrue()) 165 | }) 166 | 167 | It("should produce a valid env list", func() { 168 | envs := getEnvsForContainer(ctx, &modelService) 169 | Expect(len(envs)).To(Equal(0)) 170 | }) 171 | }) 172 | 173 | Context("Given a model artifact with a valid HF URI", func() { 174 | 175 | ctx := context.Background() 176 | authSecretName := "auth-secret-key" 177 | sizeLimit := "5Gi" 178 | sizeLimitQuan := resource.MustParse(sizeLimit) 179 | 180 | modelArtifact := msv1alpha1.ModelArtifacts{ 181 | URI: fmt.Sprintf("hf://%s/%s", HF_REPO_ID, HF_MODEL_ID), 182 | AuthSecretName: &authSecretName, 183 | Size: &sizeLimitQuan, 184 | } 185 | 186 | modelService := msv1alpha1.ModelService{ 187 | Spec: msv1alpha1.ModelServiceSpec{ 188 | ModelArtifacts: modelArtifact, 189 | }, 190 | } 191 | 192 | It("should parse correctly", func() { 193 | By("checking type of uri") 194 | Expect(isPVCURI(modelArtifact.URI)).To(BeFalse()) 195 | Expect(isHFURI(modelArtifact.URI)).To(BeTrue()) 196 | Expect(isOCIURI(modelArtifact.URI)).To(BeFalse()) 197 | 198 | By("Parsing uri parts should be successful") 199 | repo, model, err := parseHFURI(&modelArtifact) 200 | Expect(err).To(BeNil()) 201 | Expect(repo).To(Equal(HF_REPO_ID)) 202 | Expect(model).To(Equal(HF_MODEL_ID)) 203 | }) 204 | 205 | It("should produce a valid volumeMounts list", func() { 206 | volumeMounts := getVolumeMountsForContainer(ctx, &modelService) 207 | Expect(len(volumeMounts)).To(Equal(1)) 208 | firstVolumeMount := volumeMounts[0] 209 | 210 | Expect(firstVolumeMount.Name).To(Equal(modelStorageVolumeName)) 211 | Expect(firstVolumeMount.MountPath).To(Equal(modelStorageRoot)) 212 | Expect(firstVolumeMount.ReadOnly).To(BeFalse()) 213 | }) 214 | 215 | It("should produce a valid volumes list", func() { 216 | volumes := getVolumeForPDDeployment(ctx, &modelService) 217 | Expect(len(volumes)).To(Equal(1)) 218 | firstVolume := volumes[0] 219 | Expect(firstVolume.Name).To(Equal(modelStorageVolumeName)) 220 | Expect(firstVolume.EmptyDir.SizeLimit.String()).To(Equal(sizeLimit)) 221 | }) 222 | 223 | It("should produce a valid env list", func() { 224 | envs := 
getEnvsForContainer(ctx, &modelService)
225 | 			Expect(len(envs)).To(Equal(2))
226 | 			hfTokenEnvVar := envs[0]
227 | 
228 | 			Expect(hfTokenEnvVar.Name).To(Equal(ENV_HF_TOKEN))
229 | 			Expect(hfTokenEnvVar.ValueFrom.SecretKeyRef.Name).To(Equal(authSecretName))
230 | 			Expect(hfTokenEnvVar.ValueFrom.SecretKeyRef.Key).To(Equal(ENV_HF_TOKEN))
231 | 
232 | 			hfHomeEnvVar := envs[1]
233 | 			Expect(hfHomeEnvVar.Name).To(Equal(ENV_HF_HOME))
234 | 			Expect(hfHomeEnvVar.Value).To(Equal(modelStorageRoot))
235 | 		})
236 | 	})
237 | })
238 | 
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 | 
3 | import "github.com/llm-d/llm-d-model-service/cmd"
4 | 
5 | func main() {
6 | 	cmd.Execute()
7 | }
8 | 
--------------------------------------------------------------------------------
/model-service-arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/llm-d/llm-d-model-service/55eb18c4f08116ed8c9211f643b356ec5e47b4b7/model-service-arch.png
--------------------------------------------------------------------------------
/perf/create_modelservice.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Base name for the ModelService
4 | BASE_NAME="perf-facebook-opt-125m-nixl"
5 | 
6 | echo "Please make sure universal base config is applied to the cluster"
7 | 
8 | # Loop to create and apply 1000 instances
9 | for i in $(seq 1 1000); do
10 |   NAME="${BASE_NAME}-${i}"
11 |   cat <
87 | curl http://localhost:8000/v1/completions \
88 |   -H "Content-Type: application/json" \
89 |   -d '{
90 |     "model": "facebook/opt-125m",
91 |     "prompt": "Author-contribution statements and acknowledgements in research papers should state clearly and specifically whether, and to what extent, the authors used AI technologies such as ChatGPT in the preparation of their manuscript and analysis. They should also indicate which LLMs were used. This will alert editors and reviewers to scrutinize manuscripts more carefully for potential biases, inaccuracies and improper source crediting. Likewise, scientific journals should be transparent about their use of LLMs, for example when selecting submitted manuscripts. Mention the large language model based product mentioned in the paragraph above:"
92 |   }'
93 | ```
94 | 
95 | ### Scenario 3: serving a model with xPyD disaggregation
96 | So far, we have looked at MSVCs that have just one replica each for the decode and prefill workloads. ModelService can help you achieve xPyD disaggregation; all that is required is using different `replicas` values in the prefill and decode specs, as in the sketch after this section.
97 | 
98 | Note that this scenario uses the same baseconfig as the last one, because the base configuration is identical between the two; only model-specific values such as the replica counts and model name differ.
99 | 
100 | - [msvcs/xpyd.yaml](./msvcs/xpyd.yaml)
101 | - [baseconfigs/universal-baseconfig.yaml](./baseconfigs/universal-baseconfig.yaml)
102 | 
103 | ```
104 | kubectl apply -f samples/msvcs/xpyd.yaml
105 | ```
106 | 
107 | and you should see the corresponding number of pods spin up for each deployment.
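
The snippet below sketches the relevant part of such an MSVC for a hypothetical 2P4D setup; the names and counts here are illustrative and may differ from what [msvcs/xpyd.yaml](./msvcs/xpyd.yaml) actually uses.

```yaml
apiVersion: llm-d.ai/v1alpha1
kind: ModelService
metadata:
  name: xpyd-example   # illustrative name
spec:
  baseConfigMapRef:
    name: universal-base-config   # assumed name of the universal baseconfig
  routing:
    modelName: facebook/opt-125m
  prefill:
    replicas: 2   # "x" prefill pods
    # containers: same pattern as in the other scenarios
  decode:
    replicas: 4   # "y" decode pods
    # containers: same pattern as in the other scenarios
```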
108 | 109 | ### Scenario 4: loading a large model from a PVC 110 | Downloading a model from Hugging Face takes a long time for large models like [`meta-llama/Llama-4-Scout-17B-16E`](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E). One way to circumvent the long container creation time is to download the model to a PVC ahead of time and mount the PVC in the vLLM container. We have provided a baseconfig with the volume mounts configured; all that is needed in the ModelService CR is to specify the path at which the model can be found. 111 | 112 | - [msvcs/llama4.yaml](./msvcs/llama4.yaml) 113 | - [baseconfigs/universal-baseconfig-pvc.yaml](./baseconfigs/universal-baseconfig-pvc.yaml) 114 | 115 | ``` 116 | kubectl apply -f samples/baseconfigs/universal-baseconfig-pvc.yaml 117 | kubectl apply -f samples/msvcs/llama4.yaml 118 | ``` 119 | 120 | This should drastically shorten the wait time for pod creation. -------------------------------------------------------------------------------- /samples/baseconfigs/simple-baseconfig.yaml: -------------------------------------------------------------------------------- 1 | # A simple baseconfig to serve a model downloaded from Hugging Face without a token on a pod 2 | # Make sure that the model fits within the specified sizeLimit 3 | # 4 | # Requirements: 5 | # Any consuming ModelService should define ports labeled: 6 | # - app_port - the external port number for the prefill and decode pods 7 | 8 | apiVersion: v1 9 | kind: ConfigMap 10 | metadata: 11 | name: simple-base-config 12 | immutable: true 13 | data: 14 | decodeDeployment: | 15 | apiVersion: apps/v1 16 | kind: Deployment 17 | spec: 18 | template: 19 | spec: 20 | containers: 21 | - name: vllm 22 | image: vllm/vllm-openai:v0.8.5 23 | command: 24 | - vllm 25 | - serve 26 | securityContext: 27 | allowPrivilegeEscalation: false 28 | args: 29 | - "--port" 30 | - "{{ "app_port" | getPort }}" 31 | env: 32 | - name: CUDA_VISIBLE_DEVICES 33 | value: "0" 34 | - name: UCX_TLS 35 | value: "cuda_ipc,cuda_copy,tcp" 36 | - name: HF_HUB_CACHE 37 | value: /cache 38 | volumeMounts: 39 | - name: model-cache 40 | mountPath: /cache 41 | resources: 42 | limits: 43 | nvidia.com/gpu: 1 44 | requests: 45 | cpu: "16" 46 | memory: 16Gi 47 | nvidia.com/gpu: 1 48 | volumes: 49 | - name: model-cache 50 | emptyDir: 51 | sizeLimit: 5Gi 52 | 53 | # A service for the deployment is optional 54 | decodeService: | 55 | apiVersion: v1 56 | kind: Service 57 | spec: 58 | clusterIP: None 59 | ports: 60 | - name: vllm 61 | port: {{ "app_port" | getPort }} 62 | protocol: TCP 63 | -------------------------------------------------------------------------------- /samples/baseconfigs/universal-baseconfig-pvc.yaml: -------------------------------------------------------------------------------- 1 | # A universal baseconfig for models stored on PVCs 2 | # 3 | # Requirements: 4 | # Any consuming ModelService should define ports labeled: 5 | # - app_port - the external port number for the prefill and decode pods 6 | # - internal_port - the port number used by the sidecar to communicate with a vllm container 7 | apiVersion: v1 8 | kind: ConfigMap 9 | metadata: 10 | name: universal-base-config-pvc 11 | immutable: true 12 | data: 13 | decodeDeployment: | 14 | apiVersion: apps/v1 15 | kind: Deployment 16 | spec: 17 | template: 18 | spec: 19 | initContainers: 20 | - name: routing-proxy 21 | image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 22 | securityContext: 23 | allowPrivilegeEscalation: false 24 | runAsNonRoot: true 25 | args: 26 | #
Note: this port has to match the prefill port 27 | - "--port={{ "app_port" | getPort }}" 28 | - "--vllm-port={{ "internal_port" | getPort }}" 29 | - "--connector=nixl" 30 | ports: 31 | - containerPort: {{ "app_port" | getPort }} 32 | protocol: TCP 33 | restartPolicy: Always 34 | containers: 35 | - name: vllm 36 | image: ghcr.io/llm-d/llm-d:0.0.8 37 | command: 38 | - vllm 39 | - serve 40 | securityContext: 41 | allowPrivilegeEscalation: false 42 | args: 43 | - "--port" 44 | - "{{ "internal_port" | getPort }}" 45 | - "--enforce-eager" 46 | - "--kv-transfer-config" 47 | - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' 48 | env: 49 | - name: CUDA_VISIBLE_DEVICES 50 | value: "0" 51 | - name: UCX_TLS 52 | value: "cuda_ipc,cuda_copy,tcp" 53 | - name: NIXL_ROLE 54 | value: RECVER 55 | - name: HF_HUB_CACHE 56 | value: /vllm-workspace/models 57 | ports: 58 | - containerPort: 55555 59 | protocol: TCP 60 | volumeMounts: 61 | - name: model-cache 62 | mountPath: /vllm-workspace/models 63 | resources: 64 | limits: 65 | nvidia.com/gpu: 1 66 | requests: 67 | cpu: "16" 68 | memory: 16Gi 69 | nvidia.com/gpu: 1 70 | volumes: 71 | # MSVC controller will add model-storage using the PVC name in volumes 72 | # The mount path is /cache 73 | - name: model-cache 74 | emptyDir: 75 | sizeLimit: 20Gi 76 | 77 | 78 | prefillDeployment: | 79 | apiVersion: apps/v1 80 | kind: Deployment 81 | spec: 82 | template: 83 | spec: 84 | containers: 85 | - name: vllm 86 | image: ghcr.io/llm-d/llm-d:0.0.8 87 | command: 88 | - vllm 89 | - serve 90 | securityContext: 91 | allowPrivilegeEscalation: false 92 | args: 93 | # Note: this port has to match the proxy --port arg 94 | - "--port" 95 | - "{{ "app_port" | getPort }}" 96 | - "--enforce-eager" 97 | - "--kv-transfer-config" 98 | - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' 99 | env: 100 | - name: CUDA_VISIBLE_DEVICES 101 | value: "0" 102 | - name: UCX_TLS 103 | value: "cuda_ipc,cuda_copy,tcp" 104 | - name: VLLM_NIXL_SIDE_CHANNEL_PORT 105 | value: "5557" 106 | - name: VLLM_NIXL_SIDE_CHANNEL_HOST 107 | valueFrom: 108 | fieldRef: 109 | fieldPath: status.podIP 110 | - name: VLLM_LOGGING_LEVEL 111 | value: DEBUG 112 | - name: HF_HUB_CACHE 113 | value: /vllm-workspace/models 114 | ports: 115 | - containerPort: {{ "app_port" | getPort }} 116 | protocol: TCP 117 | - containerPort: 5557 118 | protocol: TCP 119 | volumeMounts: 120 | - name: model-cache 121 | mountPath: /vllm-workspace/models 122 | resources: 123 | limits: 124 | nvidia.com/gpu: 1 125 | requests: 126 | cpu: "16" 127 | memory: 16Gi 128 | nvidia.com/gpu: 1 129 | volumes: 130 | # MSVC controller will add model-storage using the PVC name in volumes 131 | # The mount path is /cache 132 | - name: model-cache 133 | emptyDir: 134 | sizeLimit: 20Gi 135 | 136 | eppService: | 137 | apiVersion: v1 138 | kind: Service 139 | spec: 140 | ports: 141 | - port: 9002 # Needs to match the port of the eppDeployment 142 | protocol: TCP 143 | type: NodePort 144 | 145 | eppDeployment: | 146 | apiVersion: apps/v1 147 | kind: Deployment 148 | spec: 149 | template: 150 | spec: 151 | containers: 152 | - name: "epp" 153 | args: 154 | - -poolName 155 | - {{ .InferencePoolName }} 156 | - -poolNamespace 157 | - {{ .ModelServiceNamespace }} 158 | - -v 159 | - "4" 160 | - --zap-encoder 161 | - json 162 | - -grpcPort 163 | - "9002" 164 | - -grpcHealthPort 165 | - "9003" 166 | env: 167 | - name: PD_ENABLED 168 | value: "true" 169 | - name: PD_PROMPT_LEN_THRESHOLD 170 | value: "10" 171 | image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 172 |
imagePullPolicy: Always 173 | livenessProbe: 174 | failureThreshold: 3 175 | grpc: 176 | port: 9003 177 | service: {{ .EPPServiceName }} 178 | initialDelaySeconds: 5 179 | periodSeconds: 10 180 | successThreshold: 1 181 | timeoutSeconds: 1 182 | ports: 183 | - containerPort: 9002 184 | protocol: TCP 185 | - containerPort: 9003 186 | protocol: TCP 187 | - containerPort: 9090 188 | name: metrics 189 | protocol: TCP 190 | readinessProbe: 191 | failureThreshold: 3 192 | grpc: 193 | port: 9003 194 | service: {{ .EPPServiceName }} 195 | initialDelaySeconds: 5 196 | periodSeconds: 10 197 | successThreshold: 1 198 | timeoutSeconds: 1 199 | 200 | inferencePool: | 201 | apiVersion: inference.networking.x-k8s.io/v1alpha2 202 | kind: InferencePool 203 | spec: 204 | targetPortNumber: {{ "app_port" | getPort }} 205 | 206 | inferenceModel: | 207 | apiVersion: inference.networking.x-k8s.io/v1alpha2 208 | kind: InferenceModel 209 | -------------------------------------------------------------------------------- /samples/baseconfigs/universal-baseconfig.yaml: -------------------------------------------------------------------------------- 1 | # Based on: https://github.com/llm-d/llm-d-routing-sidecar/tree/dev/test/config/nixl 2 | # 3 | # Requirements: 4 | # Any consuming ModelService should define ports labeled: 5 | # - app_port - the external port number for the prefill and decode pods 6 | # - internal_port - the port number used by the sidecar to communicate with a vllm container 7 | apiVersion: v1 8 | kind: ConfigMap 9 | metadata: 10 | name: universal-base-config 11 | immutable: true 12 | data: 13 | decodeDeployment: | 14 | apiVersion: apps/v1 15 | kind: Deployment 16 | spec: 17 | template: 18 | spec: 19 | initContainers: 20 | - name: routing-proxy 21 | image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 22 | securityContext: 23 | allowPrivilegeEscalation: false 24 | runAsNonRoot: true 25 | args: 26 | # Note: this port has to match the prefill port 27 | - "--port={{ "app_port" | getPort }}" 28 | - "--vllm-port={{ "internal_port" | getPort }}" 29 | - "--connector=nixl" 30 | ports: 31 | - containerPort: {{ "app_port" | getPort }} 32 | protocol: TCP 33 | restartPolicy: Always 34 | containers: 35 | - name: vllm 36 | image: ghcr.io/llm-d/llm-d:0.0.8 37 | command: 38 | - vllm 39 | - serve 40 | securityContext: 41 | allowPrivilegeEscalation: false 42 | args: 43 | - "--port" 44 | - "{{ "internal_port" | getPort }}" 45 | - "--enforce-eager" 46 | - "--kv-transfer-config" 47 | - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' 48 | env: 49 | - name: CUDA_VISIBLE_DEVICES 50 | value: "0" 51 | - name: UCX_TLS 52 | value: "cuda_ipc,cuda_copy,tcp" 53 | - name: NIXL_ROLE 54 | value: RECVER 55 | - name: HF_HUB_CACHE 56 | value: /vllm-workspace/models 57 | ports: 58 | - containerPort: 55555 59 | protocol: TCP 60 | volumeMounts: 61 | - name: model-cache 62 | mountPath: /vllm-workspace/models 63 | resources: 64 | limits: 65 | nvidia.com/gpu: 1 66 | requests: 67 | cpu: "16" 68 | memory: 16Gi 69 | nvidia.com/gpu: 1 70 | volumes: 71 | - name: model-cache 72 | emptyDir: 73 | sizeLimit: 20Gi 74 | 75 | prefillDeployment: | 76 | apiVersion: apps/v1 77 | kind: Deployment 78 | spec: 79 | template: 80 | spec: 81 | containers: 82 | - name: vllm 83 | image: ghcr.io/llm-d/llm-d:0.0.8 84 | command: 85 | - vllm 86 | - serve 87 | securityContext: 88 | allowPrivilegeEscalation: false 89 | args: 90 | # Note: this port has to match the proxy --port arg 91 | - "--port" 92 | - "{{ "app_port" | getPort }}" 93 | - "--enforce-eager" 94 | -
"--kv-transfer-config" 95 | - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' 96 | env: 97 | - name: CUDA_VISIBLE_DEVICES 98 | value: "0" 99 | - name: UCX_TLS 100 | value: "cuda_ipc,cuda_copy,tcp" 101 | - name: VLLM_NIXL_SIDE_CHANNEL_PORT 102 | value: "5557" 103 | - name: VLLM_NIXL_SIDE_CHANNEL_HOST 104 | valueFrom: 105 | fieldRef: 106 | fieldPath: status.podIP 107 | - name: VLLM_LOGGING_LEVEL 108 | value: DEBUG 109 | - name: HF_HUB_CACHE 110 | value: /vllm-workspace/models 111 | ports: 112 | - containerPort: {{ "app_port" | getPort }} 113 | protocol: TCP 114 | - containerPort: 5557 115 | protocol: TCP 116 | volumeMounts: 117 | - name: model-cache 118 | mountPath: /vllm-workspace/models 119 | resources: 120 | limits: 121 | nvidia.com/gpu: 1 122 | requests: 123 | cpu: "16" 124 | memory: 16Gi 125 | nvidia.com/gpu: 1 126 | volumes: 127 | - name: model-cache 128 | emptyDir: 129 | sizeLimit: 20Gi 130 | 131 | eppService: | 132 | apiVersion: v1 133 | kind: Service 134 | spec: 135 | ports: 136 | - port: 9002 # Needs to match the port of the eppDeployment 137 | protocol: TCP 138 | type: NodePort 139 | 140 | eppDeployment: | 141 | apiVersion: apps/v1 142 | kind: Deployment 143 | spec: 144 | template: 145 | spec: 146 | containers: 147 | - name: "epp" 148 | args: 149 | - -poolName 150 | - {{ .InferencePoolName }} 151 | - -poolNamespace 152 | - {{ .ModelServiceNamespace }} 153 | - -v 154 | - "4" 155 | - --zap-encoder 156 | - json 157 | - -grpcPort 158 | - "9002" 159 | - -grpcHealthPort 160 | - "9003" 161 | env: 162 | - name: PD_ENABLED 163 | value: "true" 164 | - name: PD_PROMPT_LEN_THRESHOLD 165 | value: "10" 166 | image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 167 | imagePullPolicy: Always 168 | livenessProbe: 169 | failureThreshold: 3 170 | grpc: 171 | port: 9003 172 | service: {{ .EPPServiceName }} 173 | initialDelaySeconds: 5 174 | periodSeconds: 10 175 | successThreshold: 1 176 | timeoutSeconds: 1 177 | ports: 178 | - containerPort: 9002 179 | protocol: TCP 180 | - containerPort: 9003 181 | protocol: TCP 182 | - containerPort: 9090 183 | name: metrics 184 | protocol: TCP 185 | readinessProbe: 186 | failureThreshold: 3 187 | grpc: 188 | port: 9003 189 | service: {{ .EPPServiceName }} 190 | initialDelaySeconds: 5 191 | periodSeconds: 10 192 | successThreshold: 1 193 | timeoutSeconds: 1 194 | 195 | inferencePool: | 196 | apiVersion: inference.networking.x-k8s.io/v1alpha2 197 | kind: InferencePool 198 | spec: 199 | targetPortNumber: {{ "app_port" | getPort }} 200 | 201 | inferenceModel: | 202 | apiVersion: inference.networking.x-k8s.io/v1alpha2 203 | kind: InferenceModel 204 | -------------------------------------------------------------------------------- /samples/msvcs/facebook-nixl.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | name: facebook-opt-125m-nixl 5 | spec: 6 | decoupleScaling: false 7 | 8 | baseConfigMapRef: 9 | name: universal-base-config 10 | 11 | routing: 12 | # This is the model name for the OpenAI request 13 | modelName: facebook/opt-125m 14 | ports: 15 | - name: app_port 16 | port: 8000 17 | - name: internal_port 18 | port: 8200 19 | 20 | modelArtifacts: 21 | # When specfying the URI with `hf` prefix, the / string 22 | # is extracted and exposed as a template variable that can be used as {{ .HFModelName }} 23 | uri: hf://facebook/opt-125m 24 | 25 | # describe decode pods 26 | decode: 27 | replicas: 1 28 | acceleratorTypes: 29 | labelKey: 
nvidia.com/gpu.product 30 | labelValues: 31 | - NVIDIA-A100-SXM4-80GB 32 | containers: 33 | - name: "vllm" 34 | # The baseconfig image includes LMCache and multiconnector support 35 | args: 36 | - "{{ .HFModelName }}" 37 | 38 | # describe the prefill pods 39 | prefill: 40 | replicas: 1 41 | acceleratorTypes: 42 | labelKey: nvidia.com/gpu.product 43 | labelValues: 44 | - NVIDIA-A100-SXM4-80GB 45 | containers: 46 | - name: "vllm" 47 | args: 48 | - "{{ .HFModelName }}" 49 | 50 | -------------------------------------------------------------------------------- /samples/msvcs/granite3.2.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | name: granite-base-model 5 | spec: 6 | decoupleScaling: false 7 | 8 | baseConfigMapRef: 9 | name: simple-base-config 10 | 11 | routing: 12 | modelName: ibm-granite/granite-3.3-2b-base 13 | ports: 14 | - name: app_port 15 | port: 8000 16 | 17 | modelArtifacts: 18 | uri: hf://ibm-granite/granite-3.3-2b-base 19 | 20 | # describe decode pods 21 | decode: 22 | replicas: 1 23 | # acceleratorTypes: 24 | # labelKey: nvidia.com/gpu.product 25 | # labelValues: 26 | # - NVIDIA-A100-SXM4-80GB 27 | containers: 28 | - name: "vllm" 29 | args: 30 | - "{{ .HFModelName }}" 31 | 32 | -------------------------------------------------------------------------------- /samples/msvcs/llama4.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | name: meta-llama-4-scout-17b-16e 5 | spec: 6 | decoupleScaling: false 7 | 8 | baseConfigMapRef: 9 | name: universal-base-config-pvc 10 | 11 | routing: 12 | modelName: meta-llama/Llama-4-Scout-17B-16E 13 | ports: 14 | - name: app_port 15 | port: 8000 16 | - name: internal_port 17 | port: 8200 18 | 19 | modelArtifacts: 20 | # When specifying the URI with `pvc` prefix, the string after the pvc name (llama-pvc) 21 | # is extracted and exposed as a template variable that can be used as {{ .ModelPath }} 22 | uri: pvc://llama-pvc/path/to/llama4 23 | 24 | # describe decode pods 25 | decode: 26 | replicas: 1 27 | parallelism: 28 | tensor: 8 29 | containers: 30 | - name: "vllm" 31 | args: 32 | # Comes from model-storage volume, which is a PVC created by MSVC controller 33 | # The mountPath is /cache 34 | # {{ .ModelPath }} == path/to/llama4 35 | - '/cache/{{ .ModelPath }}' 36 | 37 | # Other args come from https://blog.vllm.ai/2025/04/05/llama4.html 38 | # This is for reference only 39 | # Modify the args as you wish 40 | - "--tensor-parallel-size" 41 | - "8" 42 | - "--max-model-len" 43 | - "1000000" 44 | - "--override-generation-config='{\"attn_temperature_tuning\": true}'" 45 | 46 | acceleratorTypes: 47 | labelKey: nvidia.com/gpu.product 48 | labelValues: 49 | # According to the blog, Scout requires H100s 50 | - NVIDIA-H100 51 | 52 | # describe the prefill pods 53 | prefill: 54 | replicas: 1 55 | parallelism: 56 | tensor: 8 57 | containers: 58 | - name: "vllm" 59 | args: 60 | - '/cache/{{ .ModelPath }}' 61 | - "--tensor-parallel-size" 62 | - "8" 63 | - "--max-model-len" 64 | - "1000000" 65 | - "--override-generation-config='{\"attn_temperature_tuning\": true}'" 66 | 67 | acceleratorTypes: 68 | labelKey: nvidia.com/gpu.product 69 | labelValues: 70 | - NVIDIA-H100 71 | -------------------------------------------------------------------------------- /samples/msvcs/xpyd.yaml:
-------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | name: facebook-opt-125m-xpyd 5 | spec: 6 | decoupleScaling: false 7 | 8 | baseConfigMapRef: 9 | name: universal-base-config 10 | 11 | routing: 12 | modelName: facebook/opt-125m 13 | ports: 14 | - name: app_port 15 | port: 8000 16 | - name: internal_port 17 | port: 8200 18 | 19 | modelArtifacts: 20 | uri: pvc://facebook-pvc/path/to/opt-125m 21 | 22 | # describe decode pods 23 | decode: 24 | # Note a different replica count from spec.prefill.replicas 25 | replicas: 2 26 | containers: 27 | - name: "vllm" 28 | args: 29 | # Comes from baseconfig's volume mounts path 30 | - '/stored/models/{{ .ModelPath }}' 31 | acceleratorTypes: 32 | labelKey: nvidia.com/gpu.product 33 | labelValues: 34 | - NVIDIA-A100-SXM4-80GB 35 | 36 | # describe the prefill pods 37 | prefill: 38 | replicas: 1 39 | containers: 40 | - name: "vllm" 41 | args: 42 | # Comes from baseconfig's volume mounts path 43 | - '/stored/models/{{ .ModelPath }}' 44 | acceleratorTypes: 45 | labelKey: nvidia.com/gpu.product 46 | labelValues: 47 | - NVIDIA-A100-SXM4-80GB 48 | -------------------------------------------------------------------------------- /samples/test/README.md: -------------------------------------------------------------------------------- 1 | The files here are used for local development only. -------------------------------------------------------------------------------- /samples/test/baseconfig.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: basic-basic-conf 5 | data: 6 | decodeDeployment: | 7 | spec: 8 | replicas: 2 9 | template: 10 | spec: 11 | containers: 12 | - name: llm 13 | command: 14 | - sleep 15 | 16 | # Note that this label is preserved and our labels are added 17 | decodeService: | 18 | spec: 19 | selector: 20 | app.kubernetes.io/name: decodeServiceLabelInBaseConfig 21 | ports: 22 | - protocol: TCP 23 | port: {{ "inport" | getPort }} 24 | targetPort: {{ "outport" | getPort }} 25 | 26 | # This service should not be created because prefill doesn't exist in basemsvc.yaml 27 | prefillService: | 28 | spec: 29 | selector: 30 | app.kubernetes.io/name: prefillServiceLabelInBaseConfig 31 | ports: 32 | - protocol: TCP 33 | port: {{ "inport" | getPort }} 34 | targetPort: {{ "outport" | getPort }} 35 | 36 | inferenceModel: | 37 | spec: 38 | criticality: Standard 39 | 40 | inferencePool: | 41 | spec: 42 | targetPortNumber: {{ "outport" | getPort }} 43 | eppDeployment: | 44 | apiVersion: apps/v1 45 | kind: Deployment 46 | metadata: 47 | name: epp 48 | namespace: default 49 | spec: 50 | replicas: 1 51 | template: 52 | spec: 53 | # Conservatively, this timeout should mirror the longest grace period of the pods within the pool 54 | terminationGracePeriodSeconds: 130 55 | containers: 56 | - name: epp 57 | image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main 58 | imagePullPolicy: Always 59 | args: 60 | - -poolName 61 | - my-pool-name 62 | - -poolNamespace 63 | - my-pool-namespace 64 | - -v 65 | - "4" 66 | - --zap-encoder 67 | - "json" 68 | - -grpcPort 69 | - "9002" 70 | - -grpcHealthPort 71 | - "9003" 72 | env: 73 | - name: USE_STREAMING 74 | value: "true" 75 | ports: 76 | - containerPort: 9002 77 | - containerPort: 9003 78 | - name: metrics 79 | containerPort: 9090 80 | livenessProbe: 81 | grpc: 82 | port: 9003 83 | service:
inference-extension 84 | initialDelaySeconds: 5 85 | periodSeconds: 10 86 | readinessProbe: 87 | grpc: 88 | port: 9003 89 | service: inference-extension 90 | initialDelaySeconds: 5 91 | periodSeconds: 10 92 | eppService: | 93 | apiVersion: v1 94 | kind: Service 95 | metadata: 96 | name: llm-llama3-8b-instruct-epp 97 | namespace: default 98 | spec: 99 | selector: 100 | app: llm-llama3-8b-instruct-epp 101 | ports: 102 | - protocol: TCP 103 | port: 9002 104 | targetPort: 9002 105 | appProtocol: http2 106 | type: ClusterIP 107 | 108 | httpRoute: | 109 | apiVersion: gateway.networking.k8s.io/v1 110 | kind: HTTPRoute 111 | spec: 112 | parentRefs: 113 | - name: inference-gateway-name 114 | port: 12345 115 | rules: 116 | - matches: 117 | - path: 118 | type: PathPrefix 119 | value: / 120 | - backendRefs: 121 | - group: inference.networking.x-k8s.io 122 | kind: InferencePool 123 | name: {{ .InferencePoolName }} 124 | port: {{ "outport" | getPort }} 125 | -------------------------------------------------------------------------------- /samples/test/msvc-hf.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | name: busybox 5 | spec: 6 | decoupleScaling: false 7 | 8 | baseConfigMapRef: 9 | name: basic-basic-conf 10 | 11 | routing: 12 | modelName: ibm-granite/granite-3.3-2b-instruct 13 | ports: 14 | - name: inport 15 | port: 80 16 | - name: outport 17 | port: 9376 18 | 19 | modelArtifacts: 20 | uri: hf://ibm-granite/granite-3.3-2b-instruct 21 | authSecretName: hf-secret 22 | size: 5Gi 23 | 24 | # describe decode pods 25 | decode: 26 | replicas: 1 27 | containers: 28 | - name: "sidecar" 29 | image: "nginx" 30 | - name: "llm" 31 | image: busybox 32 | args: 33 | - "{{ .HFModelName }}" 34 | - "{{ .MountedModelPath }}" 35 | mountModelVolume: true -------------------------------------------------------------------------------- /samples/test/msvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | name: busybox 5 | spec: 6 | decoupleScaling: false 7 | 8 | baseConfigMapRef: 9 | name: basic-basic-conf 10 | 11 | routing: 12 | modelName: llama-2075 13 | ports: 14 | - name: inport 15 | port: 80 16 | - name: outport 17 | port: 9376 18 | gatewayRefs: 19 | - name: inference-gateway-name 20 | port: 1112 21 | 22 | modelArtifacts: 23 | uri: pvc://llama-of-the-future/path/to/llama-2075 24 | 25 | # describe decode pods 26 | decode: 27 | replicas: 1 28 | initContainers: 29 | - name: "proxy" 30 | image: "busybox" 31 | args: 32 | - "{{ .ModelPath }}" 33 | mountModelVolume: true 34 | 35 | containers: 36 | - name: "llm" 37 | image: busybox 38 | args: 39 | - "{{ .ModelName }}" 40 | 41 | endpointPicker: 42 | containers: 43 | - name: "epp" 44 | env: 45 | - name: HF_TOKEN 46 | value: hello 47 | - name: USE_STREAMING 48 | value: "false" -------------------------------------------------------------------------------- /test/e2e/e2e_suite_test.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | "os/exec" 8 | "path/filepath" 9 | "testing" 10 | 11 | . "github.com/onsi/ginkgo/v2" 12 | . 
"github.com/onsi/gomega" 13 | "k8s.io/apimachinery/pkg/runtime" 14 | "k8s.io/client-go/rest" 15 | "k8s.io/client-go/tools/clientcmd" 16 | "sigs.k8s.io/controller-runtime/pkg/client" 17 | 18 | msv1alpha1 "github.com/llm-d/llm-d-model-service/api/v1alpha1" 19 | "github.com/llm-d/llm-d-model-service/test/utils" 20 | clientgoscheme "k8s.io/client-go/kubernetes/scheme" 21 | giev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" 22 | ) 23 | 24 | var ( 25 | // Optional Environment Variables: 26 | // - CERT_MANAGER_INSTALL_SKIP=true: Skips CertManager installation during test setup. 27 | // These variables are useful if CertManager is already installed, avoiding 28 | // re-installation and conflicts. 29 | skipCertManagerInstall = os.Getenv("CERT_MANAGER_INSTALL_SKIP") == "true" 30 | // isCertManagerAlreadyInstalled will be set true when CertManager CRDs be found on the cluster 31 | isCertManagerAlreadyInstalled = false 32 | 33 | // projectImage is the name of the image which will be build and loaded 34 | // with the code source changes to be tested. 35 | projectImage = "llm-d.ai/modelservice:v0.0.1" 36 | imageArchive = "/tmp/llm-d.ai-modelservice-v0.0.1.tar" 37 | testCluster = "kind-modelservice-test" 38 | kindImage = "kindest/node:v1.32.0@sha256:c48c62eac5da28cdadcf560d1d8616cfa6783b58f0d94cf63ad1bf49600cb027" 39 | ) 40 | 41 | // TestE2E runs the end-to-end (e2e) test suite for the project. These tests execute in an isolated, 42 | // temporary environment to validate project changes with the purposed to be used in CI jobs. 43 | // The default setup requires Kind, builds/loads the Manager Docker image locally, and installs 44 | // CertManager. 45 | func TestE2E(t *testing.T) { 46 | RegisterFailHandler(Fail) 47 | _, _ = fmt.Fprintf(GinkgoWriter, "Starting modelservice integration test suite\n") 48 | RunSpecs(t, "e2e suite") 49 | } 50 | 51 | var ( 52 | k8sClient client.Client 53 | cfg *rest.Config 54 | ctx = context.TODO() 55 | ) 56 | 57 | var _ = BeforeSuite(func() { 58 | By("deleting kind cluster if it exists") 59 | cmd := exec.Command("kind", "delete", "cluster", "--name", testCluster) 60 | _, _ = utils.Run(cmd) 61 | // ignore problems 62 | 63 | By("creating Kind cluster") 64 | cmd = exec.Command("kind", "create", "cluster", "--image", kindImage, "--name", testCluster) 65 | _, err := utils.Run(cmd) 66 | ExpectWithOffset(1, err).NotTo( 67 | HaveOccurred(), 68 | fmt.Sprintf("Failed to create Kind cluster %s", testCluster), 69 | ) 70 | 71 | var kubeconfig string 72 | if os.Getenv("KUBECONFIG") != "" { 73 | kubeconfig = os.Getenv("KUBECONFIG") 74 | } else { 75 | homeDir, _ := os.UserHomeDir() 76 | kubeconfig = filepath.Join(homeDir, ".kube", "config") 77 | } 78 | cfg, err = clientcmd.BuildConfigFromFlags("", kubeconfig) 79 | Expect(err).ToNot(HaveOccurred(), "Failed to build kubeconfig") 80 | var scheme = runtime.NewScheme() 81 | 82 | err = clientgoscheme.AddToScheme(scheme) 83 | Expect(err).NotTo(HaveOccurred()) 84 | 85 | Expect(msv1alpha1.AddToScheme(scheme)).To(Succeed()) 86 | Expect(giev1alpha2.Install(scheme)).To(Succeed()) 87 | k8sClient, err = client.New(cfg, client.Options{Scheme: scheme}) 88 | Expect(err).ToNot(HaveOccurred(), "Failed to create k8s client") 89 | 90 | By("building the manager(Operator) image") 91 | cmd = exec.Command("make", "docker-build", fmt.Sprintf("IMG=%s", projectImage)) 92 | _, err = utils.Run(cmd) 93 | ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to build the manager(Operator) image") 94 | 95 | By("archiving the image") 96 | cmd = 
exec.Command("make", "archive-image", fmt.Sprintf("IMG=%s", projectImage), fmt.Sprintf("IMG_ARCHIVE=%s", imageArchive)) 97 | _, err = utils.Run(cmd) 98 | ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to archive the manager(operator) image") 99 | 100 | // TODO(user): If you want to change the e2e test vendor from Kind, ensure the image is 101 | // built and available before running the tests. Also, remove the following block. 102 | By("loading the manager(Operator) image on Kind") 103 | err = utils.LoadImageToKindClusterWithName(imageArchive, testCluster) 104 | ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to load the manager(Operator) image into Kind") 105 | 106 | // The tests-e2e are intended to run on a temporary cluster that is created and destroyed for testing. 107 | // To prevent errors when tests run in environments with CertManager already installed, 108 | // we check for its presence before execution. 109 | // Setup CertManager before the suite if not skipped and if not already installed 110 | if !skipCertManagerInstall { 111 | By("checking if cert manager is installed already") 112 | isCertManagerAlreadyInstalled = utils.IsCertManagerCRDsInstalled() 113 | if !isCertManagerAlreadyInstalled { 114 | _, _ = fmt.Fprintf(GinkgoWriter, "Installing CertManager...\n") 115 | Expect(utils.InstallCertManager()).To(Succeed(), "Failed to install CertManager") 116 | } else { 117 | _, _ = fmt.Fprintf(GinkgoWriter, "WARNING: CertManager is already installed. Skipping installation...\n") 118 | } 119 | } 120 | }) 121 | 122 | var _ = AfterSuite(func() { 123 | // Teardown CertManager after the suite if not skipped and if it was not already installed 124 | if !skipCertManagerInstall && !isCertManagerAlreadyInstalled { 125 | _, _ = fmt.Fprintf(GinkgoWriter, "Uninstalling CertManager...\n") 126 | utils.UninstallCertManager() 127 | } 128 | 129 | // delete test cluster 130 | cmd := exec.Command("kind", "delete", "cluster", "--name", testCluster) 131 | _, _ = utils.Run(cmd) 132 | 133 | }) 134 | -------------------------------------------------------------------------------- /test/modelservices/baseResources.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: br1.yaml 5 | data: 6 | 7 | -------------------------------------------------------------------------------- /test/modelservices/ms1.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: llm-d.ai/v1alpha1 2 | kind: ModelService 3 | metadata: 4 | name: test-modelservice 5 | namespace: test-namespace 6 | spec: 7 | routing: 8 | modelName: repo/model 9 | modelArtifacts: 10 | uri: "pvc://pvc-name/path/to/model" 11 | decoupleScaling: false 12 | decode: 13 | containers: 14 | - name: llm-proxy 15 | image: "ghcr.io/llm-d/llm-d-routingsidecar-dev:0.0.5" 16 | imagePullPolicy: "Always" 17 | - name: llm-container 18 | image: "ghcr.io/llm-d/llm-d-dev:0.0.2" 19 | 20 | -------------------------------------------------------------------------------- /test/utils/utils.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "fmt" 7 | "os" 8 | "os/exec" 9 | "strings" 10 | 11 | . 
"github.com/onsi/ginkgo/v2" //nolint:staticcheck,golint,revive 12 | ) 13 | 14 | const ( 15 | prometheusOperatorVersion = "v0.77.1" 16 | prometheusOperatorURL = "https://github.com/prometheus-operator/prometheus-operator/" + 17 | "releases/download/%s/bundle.yaml" 18 | 19 | certmanagerVersion = "v1.16.3" 20 | certmanagerURLTmpl = "https://github.com/cert-manager/cert-manager/releases/download/%s/cert-manager.yaml" 21 | ) 22 | 23 | func warnError(err error) { 24 | _, _ = fmt.Fprintf(GinkgoWriter, "warning: %v\n", err) 25 | } 26 | 27 | // Run executes the provided command within this context 28 | func Run(cmd *exec.Cmd) (string, error) { 29 | dir, _ := GetProjectDir() 30 | cmd.Dir = dir 31 | 32 | if err := os.Chdir(cmd.Dir); err != nil { 33 | _, _ = fmt.Fprintf(GinkgoWriter, "chdir dir: %s\n", err) 34 | } 35 | 36 | cmd.Env = append(os.Environ(), "GO111MODULE=on") 37 | command := strings.Join(cmd.Args, " ") 38 | _, _ = fmt.Fprintf(GinkgoWriter, "running: %s\n", command) 39 | output, err := cmd.CombinedOutput() 40 | if err != nil { 41 | return string(output), fmt.Errorf("%s failed with error: (%v) %s", command, err, string(output)) 42 | } 43 | 44 | return string(output), nil 45 | } 46 | 47 | // InstallPrometheusOperator installs the prometheus Operator to be used to export the enabled metrics. 48 | func InstallPrometheusOperator() error { 49 | url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) 50 | cmd := exec.Command("kubectl", "create", "-f", url) 51 | _, err := Run(cmd) 52 | return err 53 | } 54 | 55 | // UninstallPrometheusOperator uninstalls the prometheus 56 | func UninstallPrometheusOperator() { 57 | url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) 58 | cmd := exec.Command("kubectl", "delete", "-f", url) 59 | if _, err := Run(cmd); err != nil { 60 | warnError(err) 61 | } 62 | } 63 | 64 | // IsPrometheusCRDsInstalled checks if any Prometheus CRDs are installed 65 | // by verifying the existence of key CRDs related to Prometheus. 66 | func IsPrometheusCRDsInstalled() bool { 67 | // List of common Prometheus CRDs 68 | prometheusCRDs := []string{ 69 | "prometheuses.monitoring.coreos.com", 70 | "prometheusrules.monitoring.coreos.com", 71 | "prometheusagents.monitoring.coreos.com", 72 | } 73 | 74 | cmd := exec.Command("kubectl", "get", "crds", "-o", "custom-columns=NAME:.metadata.name") 75 | output, err := Run(cmd) 76 | if err != nil { 77 | return false 78 | } 79 | crdList := GetNonEmptyLines(output) 80 | for _, crd := range prometheusCRDs { 81 | for _, line := range crdList { 82 | if strings.Contains(line, crd) { 83 | return true 84 | } 85 | } 86 | } 87 | 88 | return false 89 | } 90 | 91 | // UninstallCertManager uninstalls the cert manager 92 | func UninstallCertManager() { 93 | url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) 94 | cmd := exec.Command("kubectl", "delete", "-f", url) 95 | if _, err := Run(cmd); err != nil { 96 | warnError(err) 97 | } 98 | } 99 | 100 | // InstallCertManager installs the cert manager bundle. 101 | func InstallCertManager() error { 102 | url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) 103 | cmd := exec.Command("kubectl", "apply", "-f", url) 104 | if _, err := Run(cmd); err != nil { 105 | return err 106 | } 107 | // Wait for cert-manager-webhook to be ready, which can take time if cert-manager 108 | // was re-installed after uninstalling on a cluster. 
109 | cmd = exec.Command("kubectl", "wait", "deployment.apps/cert-manager-webhook", 110 | "--for", "condition=Available", 111 | "--namespace", "cert-manager", 112 | "--timeout", "5m", 113 | ) 114 | 115 | _, err := Run(cmd) 116 | return err 117 | } 118 | 119 | // IsCertManagerCRDsInstalled checks if any Cert Manager CRDs are installed 120 | // by verifying the existence of key CRDs related to Cert Manager. 121 | func IsCertManagerCRDsInstalled() bool { 122 | // List of common Cert Manager CRDs 123 | certManagerCRDs := []string{ 124 | "certificates.cert-manager.io", 125 | "issuers.cert-manager.io", 126 | "clusterissuers.cert-manager.io", 127 | "certificaterequests.cert-manager.io", 128 | "orders.acme.cert-manager.io", 129 | "challenges.acme.cert-manager.io", 130 | } 131 | 132 | // Execute the kubectl command to get all CRDs 133 | cmd := exec.Command("kubectl", "get", "crds") 134 | output, err := Run(cmd) 135 | if err != nil { 136 | return false 137 | } 138 | 139 | // Check if any of the Cert Manager CRDs are present 140 | crdList := GetNonEmptyLines(output) 141 | for _, crd := range certManagerCRDs { 142 | for _, line := range crdList { 143 | if strings.Contains(line, crd) { 144 | return true 145 | } 146 | } 147 | } 148 | 149 | return false 150 | } 151 | 152 | // LoadImageToKindClusterWithName loads a local docker image to the kind cluster 153 | func LoadImageToKindClusterWithName(imageName string, cluster string) error { 154 | if v, ok := os.LookupEnv("KIND_CLUSTER"); ok { 155 | cluster = v 156 | } 157 | kindOptions := []string{"load", "image-archive", imageName, "--name", cluster} 158 | cmd := exec.Command("kind", kindOptions...) 159 | _, err := Run(cmd) 160 | return err 161 | } 162 | 163 | // GetNonEmptyLines splits the given command output string into individual lines 164 | // according to line breaks, and ignores any empty elements in it. 165 | func GetNonEmptyLines(output string) []string { 166 | var res []string 167 | elements := strings.Split(output, "\n") 168 | for _, element := range elements { 169 | if element != "" { 170 | res = append(res, element) 171 | } 172 | } 173 | 174 | return res 175 | } 176 | 177 | // GetProjectDir will return the directory where the project is 178 | func GetProjectDir() (string, error) { 179 | wd, err := os.Getwd() 180 | if err != nil { 181 | return wd, err 182 | } 183 | wd = strings.ReplaceAll(wd, "/test/e2e", "") 184 | return wd, nil 185 | } 186 | 187 | // UncommentCode searches for target in the file and removes the comment prefix 188 | // of the target content. The target content may span multiple lines. 189 | func UncommentCode(filename, target, prefix string) error { 190 | // false positive 191 | // nolint:gosec 192 | content, err := os.ReadFile(filename) 193 | if err != nil { 194 | return err 195 | } 196 | strContent := string(content) 197 | 198 | idx := strings.Index(strContent, target) 199 | if idx < 0 { 200 | return fmt.Errorf("unable to find the code %s to be uncommented", target) 201 | } 202 | 203 | out := new(bytes.Buffer) 204 | _, err = out.Write(content[:idx]) 205 | if err != nil { 206 | return err 207 | } 208 | 209 | scanner := bufio.NewScanner(bytes.NewBufferString(target)) 210 | if !scanner.Scan() { 211 | return nil 212 | } 213 | for { 214 | _, err := out.WriteString(strings.TrimPrefix(scanner.Text(), prefix)) 215 | if err != nil { 216 | return err 217 | } 218 | // Avoid writing a newline in case the previous line was the last in target.
219 | if !scanner.Scan() { 220 | break 221 | } 222 | if _, err := out.WriteString("\n"); err != nil { 223 | return err 224 | } 225 | } 226 | 227 | _, err = out.Write(content[idx+len(target):]) 228 | if err != nil { 229 | return err 230 | } 231 | // false positive 232 | // nolint:gosec 233 | return os.WriteFile(filename, out.Bytes(), 0644) 234 | } 235 | --------------------------------------------------------------------------------