├── tools
    ├── benchmark
    │   ├── requirements.txt
    │   ├── README.md
    │   └── download-benchmark-results.bash
    ├── dynamic-lora-sidecar
    │   ├── .gitignore
    │   ├── screenshots
    │   │   ├── vllm-logs.png
    │   │   └── lora-syncer-logs.png
    │   ├── requirements.txt
    │   ├── Dockerfile
    │   └── Makefile
    ├── dashboards
    │   ├── inference_gateway_dashboard_1.png
    │   ├── inference_gateway_dashboard_2.png
    │   ├── inference_gateway_dashboard_3.png
    │   └── README.md
    ├── simulations
    │   └── llm_ig_simulation
    │   │   └── src
    │   │       ├── __init__.py
    │   │       └── constants.py
    └── tools.go
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── config.yml
    │   ├── blank_issue.md
    │   ├── feature_request.md
    │   └── bug_request.md
    ├── workflows
    │   ├── non-main-gatekeeper.yml
    │   └── kal.yml
    └── dependabot.yml
├── config
    ├── charts
    │   ├── body-based-routing
    │   │   ├── templates
    │   │   │   ├── NOTES.txt
    │   │   │   ├── bbr.yaml
    │   │   │   ├── gke.yaml
    │   │   │   └── istio.yaml
    │   │   ├── Chart.yaml
    │   │   ├── values.yaml
    │   │   └── .helmignore
    │   └── inferencepool
    │   │   ├── templates
    │   │       ├── NOTES.txt
    │   │       ├── _validations.tpl
    │   │       ├── epp-sa-token-secret.yaml
    │   │       ├── epp-service.yaml
    │   │       ├── istio.yaml
    │   │       ├── _helpers.tpl
    │   │       ├── leader-election-rbac.yaml
    │   │       ├── epp-config.yaml
    │   │       └── epp-servicemonitor.yaml
    │   │   ├── Chart.yaml
    │   │   └── .helmignore
    ├── observability
    │   └── prometheus
    │   │   ├── rbac.yaml
    │   │   └── values.yaml
    ├── manifests
    │   ├── gateway
    │   │   ├── istio
    │   │   │   ├── gateway.yaml
    │   │   │   └── httproute.yaml
    │   │   ├── kgateway
    │   │   │   ├── gateway.yaml
    │   │   │   └── httproute.yaml
    │   │   ├── nginxgatewayfabric
    │   │   │   ├── gateway.yaml
    │   │   │   └── httproute.yaml
    │   │   ├── gke
    │   │   │   ├── gateway.yaml
    │   │   │   └── httproute.yaml
    │   │   └── envoyaigateway
    │   │   │   ├── gateway.yaml
    │   │   │   └── httproute.yaml
    │   ├── benchmark
    │   │   └── model-server-service.yaml
    │   ├── inferenceobjective.yaml
    │   ├── vllm
    │   │   └── sim-deployment.yaml
    │   └── bbr-example
    │   │   └── httproute_bbr.yaml
    └── crd
    │   ├── kustomizeconfig.yaml
    │   └── kustomization.yaml
├── pkg
    ├── epp
    │   ├── scheduling
    │   │   ├── framework
    │   │   │   ├── plugins
    │   │   │   │   ├── test
    │   │   │   │   │   ├── README.md
    │   │   │   │   │   └── consts.go
    │   │   │   │   ├── multi
    │   │   │   │   │   └── prefix
    │   │   │   │   │   │   └── OWNERS
    │   │   │   │   ├── README.md
    │   │   │   │   └── picker
    │   │   │   │   │   └── common.go
    │   │   │   └── weighted_scorer.go
    │   │   └── scheduler_config.go
    │   ├── metrics
    │   │   └── testdata
    │   │   │   ├── prefix_indexer_size_metric
    │   │   │   ├── queue_avg_size_metrics
    │   │   │   ├── kv_cache_avg_metrics
    │   │   │   ├── running_requests_metrics
    │   │   │   ├── request_total_metric
    │   │   │   ├── request_error_total_metric
    │   │   │   ├── prefix_indexer_hit_ratio_metric
    │   │   │   ├── scheduler_e2e_duration_seconds_metric
    │   │   │   └── prefix_indexer_hit_bytes_metric
    │   ├── util
    │   │   ├── logging
    │   │   │   ├── logging_const.go
    │   │   │   ├── fatal.go
    │   │   │   └── logger.go
    │   │   ├── request
    │   │   │   ├── sheddable.go
    │   │   │   ├── metadata.go
    │   │   │   └── headers.go
    │   │   ├── metrics
    │   │   │   └── metrics.go
    │   │   ├── pod
    │   │   │   └── pod.go
    │   │   └── error
    │   │   │   └── error.go
    │   ├── backend
    │   │   ├── pod.go
    │   │   └── metrics
    │   │   │   └── metrics_state.go
    │   ├── datalayer
    │   │   ├── metrics
    │   │   │   └── types.go
    │   │   └── mocks
    │   │   │   └── ticker.go
    │   ├── config
    │   │   └── config.go
    │   ├── plugins
    │   │   ├── shared_state.go
    │   │   ├── typedname.go
    │   │   ├── registry.go
    │   │   └── plugins.go
    │   ├── saturationdetector
    │   │   └── config.go
    │   ├── flowcontrol
    │   │   ├── contracts
    │   │   │   └── doc.go
    │   │   ├── framework
    │   │   │   └── doc.go
    │   │   └── registry
    │   │   │   └── connection.go
    │   ├── requestcontrol
    │   │   ├── types.go
    │   │   └── plugin_executor.go
    │   └── server
    │   │   └── runserver_test.go
    ├── README.md
    ├── bbr
    │   └── README.md
    └── common
    │   └── kubemeta.go
├── sidecars
    └── latencypredictorasync
    │   └── OWNERS
├── site-src
    ├── images
    │   ├── ga-stage.png
    │   ├── alpha-stage.png
    │   ├── favicon-64.png
    │   ├── request-flow.png
    │   ├── migration-stage.png
    │   ├── resource-model.png
    │   ├── running-example.png
    │   ├── logo
    │   │   ├── logo-text-xl-dark.png
    │   │   └── logo-text-large-horizontal-white.png
    │   ├── edit-environment-variables.png
    │   ├── inferencepool-vs-service.png
    │   └── modify-run-configuration.png
    ├── .mkdocs-exclude
    ├── performance
    │   └── benchmark
    │   │   └── example-bar-chart.png
    ├── _includes
    │   ├── infobj.md
    │   ├── intro.md
    │   ├── model-server-sim.md
    │   ├── test.md
    │   ├── model-server-gpu.md
    │   ├── prereqs.md
    │   ├── bbr.md
    │   ├── model-server.md
    │   ├── model-server-cpu.md
    │   ├── epp.md
    │   └── epp-latest.md
    ├── enhancements
    │   └── overview.md
    ├── concepts
    │   ├── roles-and-personas.md
    │   ├── conformance.md
    │   └── priority-and-capacity.md
    ├── api-types
    │   ├── inferenceobjective.md
    │   └── inferencepoolimport.md
    ├── stylesheets
    │   └── extra.css
    ├── guides
    │   └── epp-configuration
    │   │   └── flags.md
    └── contributing
    │   └── devguide.md
├── .dockerignore
├── test
    └── testdata
    │   ├── model-secret.yaml
    │   ├── client.yaml
    │   ├── configloader_1_test.yaml
    │   ├── metrics-rbac.yaml
    │   └── inferencepool-with-model-hermetic.yaml
├── code-of-conduct.md
├── latencypredictor
    ├── requirements.txt
    ├── Dockerfile-training
    ├── Dockerfile-prediction
    └── Dockerfile-test
├── benchmarking
    └── inference-perf
    │   ├── Chart.yaml
    │   ├── templates
    │       ├── configmap.yaml
    │       └── secret.yaml
    │   └── .helmignore
├── docs
    └── proposals
    │   ├── 0845-scheduler-architecture-proposal
    │       ├── images
    │       │   └── scheduler_cycle.png
    │       └── examples
    │       │   └── example.yaml
    │   └── README.md
├── .custom-gcl.yml
├── OWNERS
├── netlify.toml
├── OWNERS_ALIASES
├── RELEASE.md
├── crd-ref-docs.yaml
├── SECURITY_CONTACTS
├── api
    ├── doc.go
    └── v1
    │   └── doc.go
├── apix
    ├── doc.go
    ├── v1alpha1
    │   ├── doc.go
    │   └── shared_types.go
    ├── config
    │   └── v1alpha1
    │   │   └── doc.go
    └── v1alpha2
    │   └── doc.go
├── hack
    ├── boilerplate
    │   ├── boilerplate.go.txt
    │   ├── boilerplate.generatego.txt
    │   ├── boilerplate.py.txt
    │   └── boilerplate.sh.txt
    ├── mkdocs
    │   └── image
    │   │   ├── requirements.txt
    │   │   ├── entrypoint.sh
    │   │   └── Dockerfile
    ├── referencer.go
    ├── verify-boilerplate.sh
    └── update-codegen.sh
├── .gitignore
├── conformance
    ├── reports
    │   ├── v1.0.2
    │   │   └── gateway
    │   │   │   ├── nginx-nginx-gateway-fabric
    │   │   │       ├── inference-v2.2.0-report.yaml
    │   │   │       └── README.md
    │   │   │   └── kgateway
    │   │   │       └── inference-v2.1.0-report.yaml
    │   ├── v0.5.1
    │   │   └── gateway
    │   │   │   ├── agentgateway
    │   │   │       ├── inference-v0.7.2-report.yaml
    │   │   │       └── README.md
    │   │   │   ├── kgateway
    │   │   │       └── inference-v2.0.4-report.yaml
    │   │   │   ├── envoy-ai-gateway
    │   │   │       └── aigw-latest-report.yaml
    │   │   │   ├── kubvernor
    │   │   │       ├── kubvernor-inference-conformance-output-0.1.1.yaml
    │   │   │       └── README.md
    │   │   │   └── ack-gateway
    │   │   │       └── v1.4.0-apsara.3-gateway-report.yaml
    │   ├── v0.5.0
    │   │   └── gateway
    │   │   │   ├── gke-gateway
    │   │   │       └── standard-v1.32.4-rxlb-gateway-report.yaml
    │   │   │   └── istio
    │   │   │       ├── 1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml
    │   │   │       └── README.md
    │   └── v0.4.0
    │   │   └── gateway
    │   │       └── istio
    │   │           ├── 1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml
    │   │           └── README.md
    ├── tests
    │   ├── httproute_invalid_inferencepool_ref.yaml
    │   ├── gateway_following_epp_routing.yaml
    │   ├── gateway_following_epp_routing_dp.yaml
    │   ├── epp_unavailable_fail_open.yaml
    │   ├── inferencepool_accepted.yaml
    │   ├── inferencepool_multiple_rules_different_pools.yaml
    │   ├── gateway_weighted_two_pools.yaml
    │   ├── inferencepool_invalid_epp_service.yaml
    │   ├── httproute_multiple_gateways_different_pools.yaml
    │   ├── main.go
    │   └── inferencepool_resolvedrefs_condition.yaml
    ├── embed.go
    ├── conformance_test.go
    └── utils
    │   └── assertions.go
├── bbr.Dockerfile
├── client-go
    ├── clientset
    │   └── versioned
    │   │   ├── fake
    │   │       └── doc.go
    │   │   ├── typed
    │   │       ├── api
    │   │       │   └── v1
    │   │       │   │   ├── doc.go
    │   │       │   │   ├── fake
    │   │       │   │       ├── doc.go
    │   │       │   │       └── fake_api_client.go
    │   │       │   │   └── generated_expansion.go
    │   │       └── apix
    │   │       │   ├── v1alpha1
    │   │       │       ├── fake
    │   │       │       │   ├── doc.go
    │   │       │       │   └── fake_apix_client.go
    │   │       │       ├── doc.go
    │   │       │       └── generated_expansion.go
    │   │       │   └── v1alpha2
    │   │       │       ├── fake
    │   │       │           ├── doc.go
    │   │       │           └── fake_apix_client.go
    │   │       │       ├── doc.go
    │   │       │       └── generated_expansion.go
    │   │   └── scheme
    │   │       └── doc.go
    ├── listers
    │   ├── api
    │   │   └── v1
    │   │   │   └── expansion_generated.go
    │   └── apix
    │   │   └── v1alpha1
    │   │       └── expansion_generated.go
    ├── applyconfiguration
    │   ├── apix
    │   │   └── v1alpha2
    │   │   │   └── match.go
    │   ├── api
    │   │   └── v1
    │   │   │   └── port.go
    │   └── internal
    │   │   └── internal.go
    └── informers
    │   └── externalversions
    │       ├── internalinterfaces
    │           └── factory_interfaces.go
    │       └── api
    │           ├── interface.go
    │           └── v1
    │               └── interface.go
├── SECURITY.md
├── cmd
    ├── bbr
    │   └── main.go
    └── epp
    │   └── main.go
├── Dockerfile
├── .golangci.yml
├── PROJECT
├── internal
    └── runnable
    │   └── leader_election.go
└── version
    └── version.go


/tools/benchmark/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | numpy
3 | matplotlib


--------------------------------------------------------------------------------
/tools/dynamic-lora-sidecar/.gitignore:
--------------------------------------------------------------------------------
1 | sidecar/__pycache__/
2 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | 


--------------------------------------------------------------------------------
/config/charts/body-based-routing/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | Body-based routing extension deployed.
2 | 


--------------------------------------------------------------------------------
/config/charts/inferencepool/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | InferencePool {{ .Release.Name }} deployed.
2 | 


--------------------------------------------------------------------------------
/pkg/epp/scheduling/framework/plugins/test/README.md:
--------------------------------------------------------------------------------
1 | This package contains plugin implementations intended for testing purposes only.
2 | 


--------------------------------------------------------------------------------
/sidecars/latencypredictorasync/OWNERS:
--------------------------------------------------------------------------------
1 | # See the OWNERS docs at https://go.k8s.io/owners
2 | 
3 | approvers:
4 | - kaushikmitr
5 | 


--------------------------------------------------------------------------------
/site-src/images/ga-stage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/ga-stage.png


--------------------------------------------------------------------------------
/site-src/images/alpha-stage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/alpha-stage.png


--------------------------------------------------------------------------------
/site-src/images/favicon-64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/favicon-64.png


--------------------------------------------------------------------------------
/pkg/README.md:
--------------------------------------------------------------------------------
1 | ## Quickstart
2 | 
3 | Please refer to our Getting started guide here: https://gateway-api-inference-extension.sigs.k8s.io/guides/


--------------------------------------------------------------------------------
/site-src/.mkdocs-exclude:
--------------------------------------------------------------------------------
1 | .mkdocs-exclude
2 | .nojekyll
3 | .placeholder
4 | search/search_index.json
5 | sitemap.xml.gz
6 | sitemap.xml
7 | 


--------------------------------------------------------------------------------
/site-src/images/request-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/request-flow.png


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file
2 | # Ignore build and test binaries.
3 | bin/
4 | 


--------------------------------------------------------------------------------
/pkg/epp/scheduling/framework/plugins/multi/prefix/OWNERS:
--------------------------------------------------------------------------------
1 | # See the OWNERS docs at https://go.k8s.io/owners
2 | 
3 | approvers:
4 | - liu-cong
5 | 


--------------------------------------------------------------------------------
/site-src/images/migration-stage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/migration-stage.png


--------------------------------------------------------------------------------
/site-src/images/resource-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/resource-model.png


--------------------------------------------------------------------------------
/site-src/images/running-example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/running-example.png


--------------------------------------------------------------------------------
/site-src/images/logo/logo-text-xl-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/logo/logo-text-xl-dark.png


--------------------------------------------------------------------------------
/site-src/images/edit-environment-variables.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/edit-environment-variables.png


--------------------------------------------------------------------------------
/site-src/images/inferencepool-vs-service.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/inferencepool-vs-service.png


--------------------------------------------------------------------------------
/site-src/images/modify-run-configuration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/modify-run-configuration.png


--------------------------------------------------------------------------------
/test/testdata/model-secret.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Secret
3 | metadata:
4 |   name: hf-token
5 |   labels:
6 |     app: vllm
7 | stringData:
8 |   token: $HF_TOKEN
9 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/blank_issue.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Blank Issue
3 | about: Create a new issue from scratch
4 | title: ''
5 | labels: needs-triage
6 | assignees: ''
7 | 
8 | ---


--------------------------------------------------------------------------------
/code-of-conduct.md:
--------------------------------------------------------------------------------
1 | # Kubernetes Community Code of Conduct
2 | 
3 | Please refer to our [Kubernetes Community Code of Conduct](https://git.k8s.io/community/code-of-conduct.md)
4 | 


--------------------------------------------------------------------------------
/config/observability/prometheus/rbac.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 |   name: inference-gateway-sa-metrics-reader
5 |   namespace: monitoring
6 | 


--------------------------------------------------------------------------------
/tools/dashboards/inference_gateway_dashboard_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/tools/dashboards/inference_gateway_dashboard_1.png


--------------------------------------------------------------------------------
/tools/dashboards/inference_gateway_dashboard_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/tools/dashboards/inference_gateway_dashboard_2.png


--------------------------------------------------------------------------------
/tools/dashboards/inference_gateway_dashboard_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/tools/dashboards/inference_gateway_dashboard_3.png


--------------------------------------------------------------------------------
/site-src/performance/benchmark/example-bar-chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/performance/benchmark/example-bar-chart.png


--------------------------------------------------------------------------------
/tools/dynamic-lora-sidecar/screenshots/vllm-logs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/tools/dynamic-lora-sidecar/screenshots/vllm-logs.png


--------------------------------------------------------------------------------
/site-src/images/logo/logo-text-large-horizontal-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/logo/logo-text-large-horizontal-white.png


--------------------------------------------------------------------------------
/tools/benchmark/README.md:
--------------------------------------------------------------------------------
1 | This folder contains resources to run performance benchmarks. Please follow the benchmark guide: https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark


--------------------------------------------------------------------------------
/tools/dynamic-lora-sidecar/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.12.12
2 | jsonschema==4.24.0
3 | prometheus_client==0.22.1
4 | PyYAML==6.0.2
5 | requests==2.32.4
6 | watchfiles==1.0.5
7 | watchdog==6.0.0
8 | 


--------------------------------------------------------------------------------
/tools/dynamic-lora-sidecar/screenshots/lora-syncer-logs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/tools/dynamic-lora-sidecar/screenshots/lora-syncer-logs.png


--------------------------------------------------------------------------------
/latencypredictor/requirements.txt:
--------------------------------------------------------------------------------
 1 | fastapi
 2 | uvicorn[standard]
 3 | scikit-learn
 4 | numpy
 5 | pandas
 6 | joblib
 7 | river
 8 | pydantic
 9 | requests
10 | xgboost
11 | aiohttp
12 | lightgbm
13 | 


--------------------------------------------------------------------------------
/benchmarking/inference-perf/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: inference-perf
3 | description: A Helm chart for running inference-perf benchmarking tool
4 | type: application
5 | version: 0.2.0
6 | appVersion: "0.2.0"
7 | 


--------------------------------------------------------------------------------
/config/charts/inferencepool/Chart.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v2
 2 | name: inferencepool
 3 | description: A Helm chart for InferencePool
 4 | 
 5 | type: application
 6 | 
 7 | version: 0.0.0
 8 | 
 9 | appVersion: "0.0.0"
10 | 


--------------------------------------------------------------------------------
/docs/proposals/0845-scheduler-architecture-proposal/images/scheduler_cycle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/docs/proposals/0845-scheduler-architecture-proposal/images/scheduler_cycle.png


--------------------------------------------------------------------------------
/.custom-gcl.yml:
--------------------------------------------------------------------------------
1 | version: v2.3.1
2 | name: golangci-kube-api-linter
3 | destination: ./bin
4 | plugins:
5 | - module: 'sigs.k8s.io/kube-api-linter'
6 |   version: 'v0.0.0-20250808120943-48643eb2563d' # Pin to a commit while there's no tag
7 | 


--------------------------------------------------------------------------------
/OWNERS:
--------------------------------------------------------------------------------
1 | # See the OWNERS docs at https://go.k8s.io/owners
2 | 
3 | approvers:
4 | - gateway-api-inference-extension-maintainers
5 | 
6 | reviewers:
7 | - gateway-api-inference-extension-reviewers
8 | - gateway-api-inference-extension-maintainers
9 | 


--------------------------------------------------------------------------------
/config/charts/body-based-routing/Chart.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v2
 2 | name: body-based-routing
 3 | description: A Helm chart for the body-based routing extension
 4 | 
 5 | type: application
 6 | 
 7 | version: 0.1.0
 8 | 
 9 | appVersion: "0.2.0"
10 | 


--------------------------------------------------------------------------------
/pkg/epp/metrics/testdata/prefix_indexer_size_metric:
--------------------------------------------------------------------------------
1 | # HELP inference_extension_prefix_indexer_size [ALPHA] Size of the prefix indexer.
2 | # TYPE inference_extension_prefix_indexer_size gauge
3 | inference_extension_prefix_indexer_size{} 4096
4 | 


--------------------------------------------------------------------------------
/netlify.toml:
--------------------------------------------------------------------------------
1 | # netlify configuration
2 | [build]
3 | publish = "site"
4 | command = "make build-docs-netlify"
5 | # available here https://github.com/netlify/build-image/blob/focal/included_software.md#languages
6 | environment = { PYTHON_VERSION = "3.8" }


--------------------------------------------------------------------------------
/pkg/epp/metrics/testdata/queue_avg_size_metrics:
--------------------------------------------------------------------------------
1 | # HELP inference_pool_average_queue_size [ALPHA] The average number of requests pending in the model server queue.
2 | # TYPE inference_pool_average_queue_size gauge
3 | inference_pool_average_queue_size{name="p1"} 0.4
4 | 


--------------------------------------------------------------------------------
/config/manifests/gateway/istio/gateway.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | kind: Gateway
 3 | metadata:
 4 |   name: inference-gateway
 5 | spec:
 6 |   gatewayClassName: istio
 7 |   listeners:
 8 |   - name: http
 9 |     port: 80
10 |     protocol: HTTP
11 | 


--------------------------------------------------------------------------------
/config/manifests/gateway/kgateway/gateway.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | kind: Gateway
 3 | metadata:
 4 |   name: inference-gateway
 5 | spec:
 6 |   gatewayClassName: agentgateway
 7 |   listeners:
 8 |   - name: http
 9 |     port: 80
10 |     protocol: HTTP
11 | 


--------------------------------------------------------------------------------
/config/manifests/gateway/nginxgatewayfabric/gateway.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | kind: Gateway
 3 | metadata:
 4 |   name: inference-gateway
 5 | spec:
 6 |   gatewayClassName: nginx
 7 |   listeners:
 8 |   - name: http
 9 |     port: 80
10 |     protocol: HTTP
11 | 


--------------------------------------------------------------------------------
/config/manifests/benchmark/model-server-service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: my-pool-service
 5 | spec:
 6 |   ports:
 7 |   - port: 8081
 8 |     protocol: TCP
 9 |     targetPort: 8000
10 |   selector:
11 |     app: my-pool
12 |   type: LoadBalancer
13 | 


--------------------------------------------------------------------------------
/config/manifests/gateway/gke/gateway.yaml:
--------------------------------------------------------------------------------
 1 | kind: Gateway
 2 | apiVersion: gateway.networking.k8s.io/v1
 3 | metadata:
 4 |  name: inference-gateway
 5 | spec:
 6 |  gatewayClassName: gke-l7-regional-external-managed
 7 |  listeners:
 8 |  - name: http
 9 |    port: 80
10 |    protocol: HTTP
11 | 


--------------------------------------------------------------------------------
/pkg/epp/metrics/testdata/kv_cache_avg_metrics:
--------------------------------------------------------------------------------
1 | # HELP inference_pool_average_kv_cache_utilization [ALPHA] The average kv cache utilization for an inference server pool.
2 | # TYPE inference_pool_average_kv_cache_utilization gauge
3 | inference_pool_average_kv_cache_utilization{name="p1"} 0.3
4 | 


--------------------------------------------------------------------------------
/pkg/epp/metrics/testdata/running_requests_metrics:
--------------------------------------------------------------------------------
1 | # HELP inference_objective_running_requests [ALPHA] Inference objective number of running requests in each model.
2 | # TYPE inference_objective_running_requests gauge
3 | inference_objective_running_requests{model_name="m1"} 1
4 | inference_objective_running_requests{model_name="m2"} 1
5 | 


--------------------------------------------------------------------------------
/site-src/_includes/infobj.md:
--------------------------------------------------------------------------------
1 | ??? example "Experimental"
2 | 
3 |     This project is still in an alpha state and breaking changes may occur in the future.
4 | 
5 | This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get an Inference Gateway up and running!
6 | 


--------------------------------------------------------------------------------
/site-src/_includes/intro.md:
--------------------------------------------------------------------------------
1 | ??? example "Experimental"
2 | 
3 |     This project is still in an alpha state and breaking changes may occur in the future.
4 | 
5 | This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get an Inference Gateway up and running!
6 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: needs-triage
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | <!-- Please only use this template for submitting enhancement requests -->
11 | 
12 | **What would you like to be added**:
13 | 
14 | **Why is this needed**:
15 | 


--------------------------------------------------------------------------------
/benchmarking/inference-perf/templates/configmap.yaml:
--------------------------------------------------------------------------------
 1 | # inference-perf/templates/configmap.yaml
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: {{ include "inference-perf.fullname" . }}-config
 6 |   labels:
 7 |     {{- include "inference-perf.labels" . | nindent 4 }}
 8 | data:
 9 |   config.yml: |
10 |     {{- toYaml .Values.config | nindent 4 }}


--------------------------------------------------------------------------------
/OWNERS_ALIASES:
--------------------------------------------------------------------------------
 1 | # See the OWNERS docs: https://git.k8s.io/community/contributors/guide/owners.md
 2 | # This file should be kept in sync with k/org.
 3 | 
 4 | aliases:
 5 |   gateway-api-inference-extension-maintainers:
 6 |   - ahg-g
 7 |   - danehans
 8 |   - nirrozenbaum
 9 |   - kfswain
10 | 
11 |   gateway-api-inference-extension-reviewers:
12 |   - elevran
13 |   - liu-cong
14 |   - robscott
15 | 


--------------------------------------------------------------------------------
/config/charts/inferencepool/templates/_validations.tpl:
--------------------------------------------------------------------------------
1 | {{/*
2 | common validations
3 | */}}
4 | {{- define "gateway-api-inference-extension.validations.inferencepool.common" -}}
5 | {{- if or (empty $.Values.inferencePool.modelServers) (not $.Values.inferencePool.modelServers.matchLabels) }}
6 | {{- fail ".Values.inferencePool.modelServers.matchLabels is required" }}
7 | {{- end }}
8 | {{- end -}}
9 | 


--------------------------------------------------------------------------------
/test/testdata/client.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   annotations:
 5 |   labels:
 6 |     app: curl
 7 |   name: curl
 8 | spec:
 9 |   containers:
10 |   - command:
11 |     - tail
12 |     - -f
13 |     - /dev/null
14 |     image: curlimages/curl:7.83.1
15 |     imagePullPolicy: IfNotPresent
16 |     name: curl
17 |   restartPolicy: Never
18 |   schedulerName: default-scheduler
19 | 


--------------------------------------------------------------------------------
/site-src/_includes/model-server-sim.md:
--------------------------------------------------------------------------------
1 | === "vLLM Simulator Model Server"
2 | 
3 |     This option uses the [vLLM simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) to simulate a backend model server.
4 |     This setup uses the least amount of compute resources, does not require GPUs, and is ideal for test/dev environments.
5 | 
6 |     To deploy the vLLM simulator, run the following command.
7 | 


--------------------------------------------------------------------------------
/config/charts/body-based-routing/values.yaml:
--------------------------------------------------------------------------------
 1 | bbr:
 2 |   name: body-based-router
 3 |   replicas: 1
 4 |   image:
 5 |     name: bbr
 6 |     hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension
 7 |     tag: main
 8 |     pullPolicy: Always
 9 |   port: 9004
10 |   healthCheckPort: 9005
11 | 
12 | provider:
13 |   name: none
14 | 
15 | inferenceGateway:
16 |   name: inference-gateway
17 | 


--------------------------------------------------------------------------------
/benchmarking/inference-perf/templates/secret.yaml:
--------------------------------------------------------------------------------
 1 | # inference-perf/templates/secret.yaml
 2 | {{- if .Values.hfToken }}
 3 | apiVersion: v1
 4 | kind: Secret
 5 | metadata:
 6 |   name: {{ include "inference-perf.hfSecret" . }}
 7 |   labels:
 8 |     {{- include "inference-perf.labels" . | nindent 4 }}
 9 | type: Opaque
10 | stringData:
11 |   {{ include "inference-perf.hfKey" . }}: {{ .Values.hfToken | quote }}
12 | {{- end }}
13 | 


--------------------------------------------------------------------------------
/docs/proposals/README.md:
--------------------------------------------------------------------------------
1 | # Proposals Best Practices
2 | 
3 | 
4 | ## Naming
5 | The directory of the proposal should lead with a 4-digit PR number (this will move to 5, 6, ... digits should our PR count get that high), followed by a kebab-cased title. The PR number is not known until the PR is cut, so development can use a placeholder, e.g. `XXXX-my-proposal`. The PR number is used because it is unique and chronological, allowing the default ordering of proposals to follow the timeline of development.


--------------------------------------------------------------------------------
/pkg/epp/metrics/testdata/request_total_metric:
--------------------------------------------------------------------------------
1 | # HELP inference_objective_request_total [ALPHA] Counter of inference objective requests broken out for each model and target model.
2 | # TYPE inference_objective_request_total counter
3 | inference_objective_request_total{model_name="m10", target_model_name="t10"} 2
4 | inference_objective_request_total{model_name="m10", target_model_name="t11"} 1
5 | inference_objective_request_total{model_name="m20", target_model_name="t20"} 1
6 | 


--------------------------------------------------------------------------------
/site-src/_includes/test.md:
--------------------------------------------------------------------------------
 1 | ### Try it out
 2 | 
 3 |    Wait until the gateway is ready.
 4 | 
 5 |    ```bash
 6 |    IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
 7 |    PORT=80
 8 | 
 9 |    curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
10 |    "model": "food-review-1",
11 |    "prompt": "Write as if you were a critic: San Francisco",
12 |    "max_tokens": 100,
13 |    "temperature": 0
14 |    }'
15 |    ```
16 | 


--------------------------------------------------------------------------------
/config/manifests/gateway/nginxgatewayfabric/httproute.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | kind: HTTPRoute
 3 | metadata:
 4 |   name: llm-route
 5 |   namespace: default
 6 | spec:
 7 |   parentRefs:
 8 |   - name: inference-gateway
 9 |   rules:
10 |   - matches:
11 |     - path:
12 |         type: PathPrefix
13 |         value: /
14 |     backendRefs:
15 |     - group: inference.networking.k8s.io
16 |       kind: InferencePool
17 |       name: vllm-llama3-8b-instruct
18 | 
19 | 


--------------------------------------------------------------------------------
/benchmarking/inference-perf/.helmignore:
--------------------------------------------------------------------------------
 1 | # Patterns to ignore when building packages.
 2 | # This supports shell glob matching, relative path matching, and
 3 | # negation (prefixed with !). Only one pattern per line.
 4 | .DS_Store
 5 | # Common VCS dirs
 6 | .git/
 7 | .gitignore
 8 | .bzr/
 9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 | 


--------------------------------------------------------------------------------
/config/charts/inferencepool/.helmignore:
--------------------------------------------------------------------------------
 1 | # Patterns to ignore when building packages.
 2 | # This supports shell glob matching, relative path matching, and
 3 | # negation (prefixed with !). Only one pattern per line.
 4 | .DS_Store
 5 | # Common VCS dirs
 6 | .git/
 7 | .gitignore
 8 | .bzr/
 9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 | 


--------------------------------------------------------------------------------
/config/manifests/gateway/envoyaigateway/gateway.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | kind: GatewayClass
 3 | metadata:
 4 |   name: envoy-ai-gateway
 5 | spec:
 6 |   controllerName: gateway.envoyproxy.io/gatewayclass-controller
 7 | ---
 8 | apiVersion: gateway.networking.k8s.io/v1
 9 | kind: Gateway
10 | metadata:
11 |   name: inference-gateway
12 | spec:
13 |   gatewayClassName: envoy-ai-gateway
14 |   listeners:
15 |     - name: http
16 |       protocol: HTTP
17 |       port: 80
18 | 


--------------------------------------------------------------------------------
/config/charts/body-based-routing/.helmignore:
--------------------------------------------------------------------------------
 1 | # Patterns to ignore when building packages.
 2 | # This supports shell glob matching, relative path matching, and
 3 | # negation (prefixed with !). Only one pattern per line.
 4 | .DS_Store
 5 | # Common VCS dirs
 6 | .git/
 7 | .gitignore
 8 | .bzr/
 9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 | 


--------------------------------------------------------------------------------
/config/manifests/gateway/gke/httproute.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | kind: HTTPRoute
 3 | metadata:
 4 |   name: llm-route
 5 | spec:
 6 |   parentRefs:
 7 |   - group: gateway.networking.k8s.io
 8 |     kind: Gateway
 9 |     name: inference-gateway
10 |   rules:
11 |   - backendRefs:
12 |     - group: inference.networking.k8s.io
13 |       kind: InferencePool
14 |       name: vllm-llama3-8b-instruct
15 |     matches:
16 |     - path:
17 |         type: PathPrefix
18 |         value: /
19 | 


--------------------------------------------------------------------------------
/site-src/_includes/model-server-gpu.md:
--------------------------------------------------------------------------------
1 | === "GPU-Based Model Server"
2 | 
3 |     For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas as needed.
4 |     Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
5 |     Ensure that the token grants access to this model.
6 | 
7 |     Deploy a sample vLLM deployment with the proper protocol to work with the Inference Gateway.
8 | 


--------------------------------------------------------------------------------
/site-src/enhancements/overview.md:
--------------------------------------------------------------------------------
1 | # Inference Gateway Proposal process
2 | 
3 | Our current proposal process is intentionally lightweight. If you have a proposal you are interested in sharing, please follow these steps:
4 | 
5 | 1. Cut an issue or bring a topic to the weekly meeting!
6 | 2. Assuming positive signal, or if more context is needed, please add a proposal following the style and naming conventions shown here: https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals


--------------------------------------------------------------------------------
/config/manifests/gateway/istio/httproute.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | kind: HTTPRoute
 3 | metadata:
 4 |   name: llm-route
 5 | spec:
 6 |   parentRefs:
 7 |   - group: gateway.networking.k8s.io
 8 |     kind: Gateway
 9 |     name: inference-gateway
10 |   rules:
11 |   - backendRefs:
12 |     - group: inference.networking.k8s.io
13 |       kind: InferencePool
14 |       name: vllm-llama3-8b-instruct
15 |     matches:
16 |     - path:
17 |         type: PathPrefix
18 |         value: /
19 |     timeouts:
20 |       request: 300s
21 | 


--------------------------------------------------------------------------------
/config/manifests/gateway/kgateway/httproute.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | kind: HTTPRoute
 3 | metadata:
 4 |   name: llm-route
 5 | spec:
 6 |   parentRefs:
 7 |   - group: gateway.networking.k8s.io
 8 |     kind: Gateway
 9 |     name: inference-gateway
10 |   rules:
11 |   - backendRefs:
12 |     - group: inference.networking.k8s.io
13 |       kind: InferencePool
14 |       name: vllm-llama3-8b-instruct
15 |     matches:
16 |     - path:
17 |         type: PathPrefix
18 |         value: /
19 |     timeouts:
20 |       request: 300s
21 | 


--------------------------------------------------------------------------------
/config/manifests/gateway/envoyaigateway/httproute.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | kind: HTTPRoute
 3 | metadata:
 4 |   name: llm-route
 5 | spec:
 6 |   parentRefs:
 7 |     - group: gateway.networking.k8s.io
 8 |       kind: Gateway
 9 |       name: inference-gateway
10 |   rules:
11 |     - backendRefs:
12 |         - group: inference.networking.k8s.io
13 |           kind: InferencePool
14 |           name: vllm-llama3-8b-instruct
15 |       matches:
16 |         - path:
17 |             type: PathPrefix
18 |             value: /
19 |       timeouts:
20 |         request: 300s
21 | 


--------------------------------------------------------------------------------
/.github/workflows/non-main-gatekeeper.yml:
--------------------------------------------------------------------------------
 1 | name: Label non-main PRs
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     types: [opened, edited, synchronize, reopened]
 6 | 
 7 | jobs:
 8 |   add-label:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - name: Add labels when base branch is not main
12 |         if: github.event.pull_request.base.ref != 'main'
13 |         uses: actions-ecosystem/action-add-labels@v1
14 |         with:
15 |           github_token: ${{ secrets.GITHUB_TOKEN }}
16 |           labels: |
17 |             do-not-merge/hold
18 |             do-not-merge/cherry-pick-not-approved
19 | 


--------------------------------------------------------------------------------
/pkg/epp/metrics/testdata/request_error_total_metric:
--------------------------------------------------------------------------------
1 | # HELP inference_objective_request_error_total [ALPHA] Counter of inference objective requests errors broken out for each model and target model.
2 | # TYPE inference_objective_request_error_total counter
3 | inference_objective_request_error_total{error_code="Internal", model_name="m10",target_model_name="t10"} 2
4 | inference_objective_request_error_total{error_code="ModelServerError", model_name="m10",target_model_name="t11"} 1
5 | inference_objective_request_error_total{error_code="InferencePoolResourceExhausted", model_name="m20",target_model_name="t20"} 1
6 | 


--------------------------------------------------------------------------------
/pkg/epp/scheduling/framework/plugins/README.md:
--------------------------------------------------------------------------------
 1 | # Scheduling Plugins
 2 | 
 3 | This package contains the scheduling plugin implementations.
 4 | 
 5 | Plugins are organized by the following rule. Follow this rule when adding a new
 6 | plugin.
 7 | 
 8 | ```
 9 | plugins/
10 | |__ filter/ (Plugins that implement the Filter interface only.)
11 | |__ scorer/ (Plugins that implement the Scorer interface only.)
12 | |__ picker/ (Plugins that implement the Picker interface only.)
13 | |__ multi/ (Plugins that implement multiple plugin interfaces.)
14 | |____prefix/ (Prefix cache aware scheduling plugin.)
15 | ```
16 | 


--------------------------------------------------------------------------------
/RELEASE.md:
--------------------------------------------------------------------------------
 1 | # Release Process
 2 | 
 3 | The Gateway API Inference Extension project is released on an as-needed basis. The process is as follows:
 4 | 
 5 | 1. Update `version/version.go` with the new semver tag
 6 | 1. An issue is opened proposing a new release, with a changelog since the last release
 7 | 1. All [OWNERS](OWNERS) must LGTM this release
 8 | 1. An OWNER runs `git tag -s $VERSION` and inserts the changelog and pushes the tag with `git push $VERSION`
 9 | 1. The release issue is closed
10 | 1. An announcement email is sent to `dev@kubernetes.io` with the subject `[ANNOUNCE] gateway-api-inference-extension $VERSION is released`


--------------------------------------------------------------------------------
/crd-ref-docs.yaml:
--------------------------------------------------------------------------------
 1 | # This file contains configuration for our reference docs generation. For more
 2 | # information about the possible configuration, refer to
 3 | # https://github.com/elastic/crd-ref-docs.
 4 | 
 5 | processor:
 6 |   ignoreTypes:
 7 |     - "(InferencePool|InferenceObjective|InferencePoolImport)List$"
 8 |   # RE2 regular expressions describing type fields that should be excluded from the generated documentation.
 9 |   ignoreFields:
10 |     - "TypeMeta$"
11 | 
12 | render:
13 |   # Version of Kubernetes to use when generating links to Kubernetes API documentation.
14 |   kubernetesVersion: 1.31
15 | 


--------------------------------------------------------------------------------
/tools/dynamic-lora-sidecar/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.10-slim-buster AS test
 2 | 
 3 | WORKDIR /dynamic-lora-reconciler-test
 4 | COPY requirements.txt .
 5 | COPY sidecar/* ./ 
 6 | RUN pip install -r requirements.txt
 7 | RUN python -m unittest discover || exit 1  
 8 | 
 9 | FROM python:3.10-slim-buster
10 | 
11 | WORKDIR /dynamic-lora-reconciler
12 | 
13 | RUN python3 -m venv /opt/venv
14 | 
15 | ENV PATH="/opt/venv/bin:$PATH"
16 | 
17 | RUN pip install --upgrade pip
18 | COPY requirements.txt .
19 | RUN pip install --no-cache-dir -r requirements.txt
20 | 
21 | COPY sidecar/* ./
22 | 
23 | CMD ["python", "sidecar.py"]


--------------------------------------------------------------------------------
/config/crd/kustomizeconfig.yaml:
--------------------------------------------------------------------------------
 1 | # This file is for teaching kustomize how to substitute name and namespace references in CRDs
 2 | nameReference:
 3 | - kind: Service
 4 |   version: v1
 5 |   fieldSpecs:
 6 |   - kind: CustomResourceDefinition
 7 |     version: v1
 8 |     group: apiextensions.k8s.io
 9 |     path: spec/conversion/webhook/clientConfig/service/name
10 | 
11 | namespace:
12 | - kind: CustomResourceDefinition
13 |   version: v1
14 |   group: apiextensions.k8s.io
15 |   path: spec/conversion/webhook/clientConfig/service/namespace
16 |   create: false
17 | 
18 | varReference:
19 | - path: metadata/annotations
20 | 


--------------------------------------------------------------------------------
/site-src/_includes/prereqs.md:
--------------------------------------------------------------------------------
 1 | A cluster with:
 2 | 
 3 | - Support for one of the three most recent Kubernetes minor [releases](https://kubernetes.io/releases/).
 4 | - Support for services of type `LoadBalancer`. For kind clusters, follow [this guide](https://kind.sigs.k8s.io/docs/user/loadbalancer)
 5 |   to get services of type LoadBalancer working.
 6 | - Support for [sidecar containers](https://kubernetes.io/docs/concepts/workloads/pods/sidecar-containers/) (enabled by default since Kubernetes v1.29)
 7 |   to run the model server deployment.
 8 | 
 9 | Tooling:
10 | 
11 | - [Helm](https://helm.sh/docs/intro/install/) installed.
12 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 |   # Maintain dependencies for go
 4 |   - package-ecosystem: "gomod"
 5 |     directory: "/"
 6 |     schedule:
 7 |       interval: "weekly"
 8 |     labels:
 9 |       - "area/dependency"
10 |       - "ok-to-test"
11 |       - "release-note-none"
12 |     groups:
13 |       kubernetes:
14 |         patterns:
15 |           - "k8s.io/*"
16 |     ignore:
17 |       # Ignore major and minor versions for dependencies updates
18 |       # Allow patches and security updates.
19 |       - dependency-name: k8s.io/*
20 |         update-types: ["version-update:semver-major", "version-update:semver-minor"]
21 | 


--------------------------------------------------------------------------------
/test/testdata/configloader_1_test.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: inference.networking.x-k8s.io/v1alpha1
 2 | kind: EndpointPickerConfig
 3 | plugins:
 4 | - name: test1
 5 |   type: test-one
 6 |   parameters:
 7 |     threshold: 10
 8 | - name: profileHandler
 9 |   type: test-profile-handler
10 | - type: test-two
11 |   parameters:
12 |     blockSize: 32
13 | - name: testPicker
14 |   type: test-picker
15 | schedulingProfiles:
16 | - name: default
17 |   plugins:
18 |   - pluginRef: test1
19 |   - pluginRef: test-two
20 |     weight: 50
21 |   - pluginRef: testPicker
22 | featureGates:
23 | - dataLayer
24 | saturationDetector:
25 |   metricsStalenessThreshold: 150ms
26 | 


--------------------------------------------------------------------------------
/SECURITY_CONTACTS:
--------------------------------------------------------------------------------
 1 | # Defined below are the security contacts for this repo.
 2 | #
 3 | # They are the contact point for the Security Response Committee to reach out
 4 | # to for triaging and handling of incoming issues.
 5 | #
 6 | # The below names agree to abide by the
 7 | # [Embargo Policy](https://git.k8s.io/security/private-distributors-list.md#embargo-policy)
 8 | # and will be removed and replaced if they violate that agreement.
 9 | #
10 | # DO NOT REPORT SECURITY VULNERABILITIES DIRECTLY TO THESE NAMES, FOLLOW THE
11 | # INSTRUCTIONS AT https://kubernetes.io/security/
12 | 
13 | ArangoGutierrez
14 | Jeffwan
15 | SergeyKanzhelev
16 | terrytangyuan
17 | 


--------------------------------------------------------------------------------
/pkg/bbr/README.md:
--------------------------------------------------------------------------------
 1 | # Body-Based Routing
 2 | This package provides an extension that can be deployed to write the `model`
 3 | HTTP body parameter as a header (X-Gateway-Model-Name) so as to enable routing capabilities on the
 4 | model name.
 5 | 
 6 | As per the OpenAI spec, it is standard for the model name to be included in the
 7 | body of the HTTP request. However, most implementations do not support routing
 8 | based on the request body. This extension helps bridge that gap for clients.
 9 | This extension works by parsing the request body. If it finds a `model` parameter in the
10 | request body, it will copy the value of that parameter into a request header.
11 | 


--------------------------------------------------------------------------------
/api/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2024 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package api
18 | 


--------------------------------------------------------------------------------
/apix/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2024 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package apix
18 | 


--------------------------------------------------------------------------------
/hack/boilerplate/boilerplate.go.txt:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright YEAR The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 


--------------------------------------------------------------------------------
/hack/boilerplate/boilerplate.generatego.txt:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 


--------------------------------------------------------------------------------
/hack/boilerplate/boilerplate.py.txt:
--------------------------------------------------------------------------------
 1 | # Copyright YEAR The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/hack/boilerplate/boilerplate.sh.txt:
--------------------------------------------------------------------------------
 1 | # Copyright YEAR The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/config/charts/inferencepool/templates/epp-sa-token-secret.yaml:
--------------------------------------------------------------------------------
 1 | {{- if and .Values.inferenceExtension.monitoring.prometheus.enabled .Values.inferenceExtension.monitoring.prometheus.auth.enabled (ne (lower .Values.provider.name) "gke") }}
 2 | apiVersion: v1
 3 | kind: Secret
 4 | metadata:
 5 |   name: {{ .Values.inferenceExtension.monitoring.prometheus.auth.secretName }}
 6 |   namespace: {{ .Release.Namespace }}
 7 |   labels:
 8 |     {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
 9 |   annotations:
10 |     kubernetes.io/service-account.name: {{ include "gateway-api-inference-extension.name" . }}
11 | type: kubernetes.io/service-account-token
12 | {{- end }}


--------------------------------------------------------------------------------
/tools/simulations/llm_ig_simulation/src/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Binaries for programs and plugins
 2 | *.exe
 3 | *.exe~
 4 | *.dll
 5 | *.so
 6 | *.dylib
 7 | bin/*
 8 | Dockerfile.cross
 9 | artifacts
10 | latencypredictor/__pycache__
11 | 
12 | # Test binary, built with `go test -c`
13 | *.test
14 | 
15 | # Output of the go coverage tool, specifically when used with LiteIDE
16 | *.out
17 | 
18 | # Go workspace file
19 | go.work
20 | go.work.sum
21 | 
22 | # Kubernetes Generated files - skip generated files, except for vendored files
23 | !vendor/**/zz_generated.*
24 | 
25 | # editor and IDE paraphernalia
26 | .idea
27 | .vscode
28 | *.swp
29 | *.swo
30 | *~
31 | 
32 | # generated docs
33 | site
34 | 
35 | # MacOS generated files
36 | **/.DS_Store
37 | 


--------------------------------------------------------------------------------
/conformance/reports/v1.0.2/gateway/nginx-nginx-gateway-fabric/inference-v2.2.0-report.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | date: "2025-10-28T14:18:58Z"
 3 | gatewayAPIChannel: experimental
 4 | gatewayAPIVersion: v1.3.0
 5 | implementation:
 6 |   contact:
 7 |   - https://github.com/nginx/nginx-gateway-fabric/discussions/new/choose
 8 |   organization: nginx
 9 |   project: nginx-gateway-fabric
10 |   url: https://github.com/nginx/nginx-gateway-fabric
11 |   version: 2.2.0
12 | kind: ConformanceReport
13 | mode: default
14 | profiles:
15 | - core:
16 |     result: success
17 |     statistics:
18 |       Failed: 0
19 |       Passed: 9
20 |       Skipped: 0
21 |   name: Gateway
22 |   summary: Core tests succeeded.


--------------------------------------------------------------------------------
/conformance/tests/httproute_invalid_inferencepool_ref.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | kind: HTTPRoute
 3 | metadata:
 4 |   name: httproute-to-non-existent-pool
 5 |   namespace: inference-conformance-app-backend
 6 | spec:
 7 |   parentRefs:
 8 |   - group: gateway.networking.k8s.io
 9 |     kind: Gateway
10 |     name: conformance-primary
11 |     namespace: inference-conformance-infra
12 |     sectionName: http
13 |   rules:
14 |   - backendRefs:
15 |     - group: inference.networking.k8s.io
16 |       kind: InferencePool
17 |       name: non-existent-inference-pool # Intentionally Non-Existing
18 |     matches:
19 |     - path:
20 |         type: PathPrefix
21 |         value: /test-non-existent-pool
22 | 


--------------------------------------------------------------------------------
/hack/mkdocs/image/requirements.txt:
--------------------------------------------------------------------------------
 1 | # required for mkdocs-core
 2 | jinja2~=3.0
 3 | # mkdocs 2.4.1 requires Markdown < 3.4.0 (NOTE(review): appears stale; the pin below is markdown~=3.7, please confirm)
 4 | # https://github.com/kubernetes-sigs/gateway-api/pull/1671#issuecomment-1400586465
 5 | markdown~=3.7
 6 | mkdocs~=1.6
 7 | mkdocs-material-extensions~=1.3
 8 | pygments~=2.16
 9 | pymdown-extensions~=10.2
10 | 
11 | # Requirements for plugins
12 | babel~=2.10
13 | colorama~=0.4
14 | paginate~=0.5
15 | regex>=2022.4
16 | requests~=2.26
17 | 
18 | # mkdocs + mkdocs plugins
19 | mkdocs==1.6.1
20 | mkdocs-awesome-pages-plugin==2.9.3
21 | mkdocs-macros-plugin==1.2.0
22 | mkdocs-material==9.5.36
23 | mkdocs-material-extensions==1.3.1
24 | mkdocs-redirects==1.2.1
25 | mkdocs-mermaid2-plugin==1.1.1
26 | 


--------------------------------------------------------------------------------
/conformance/tests/gateway_following_epp_routing.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | kind: HTTPRoute
 3 | metadata:
 4 |   name: httproute-for-primary-gw
 5 |   namespace: inference-conformance-app-backend
 6 | spec:
 7 |   parentRefs:
 8 |   - group: gateway.networking.k8s.io
 9 |     kind: Gateway
10 |     name: conformance-primary
11 |     namespace: inference-conformance-infra
12 |     sectionName: http
13 |   hostnames:
14 |   - "primary.example.com"
15 |   rules:
16 |   - backendRefs:
17 |     - group: inference.networking.k8s.io
18 |       kind: InferencePool
19 |       name: primary-inference-pool
20 |     matches:
21 |     - path:
22 |         type: PathPrefix
23 |         value: /primary-gateway-test
24 | 


--------------------------------------------------------------------------------
/conformance/tests/gateway_following_epp_routing_dp.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | kind: HTTPRoute
 3 | metadata:
 4 |   name: httproute-for-primary-gw-dp
 5 |   namespace: inference-conformance-app-backend
 6 | spec:
 7 |   parentRefs:
 8 |   - group: gateway.networking.k8s.io
 9 |     kind: Gateway
10 |     name: conformance-primary
11 |     namespace: inference-conformance-infra
12 |     sectionName: http
13 |   hostnames:
14 |   - "primary.example.com"
15 |   rules:
16 |   - backendRefs:
17 |     - group: inference.networking.k8s.io
18 |       kind: InferencePool
19 |       name: dp-inference-pool
20 |     matches:
21 |     - path:
22 |         type: PathPrefix
23 |         value: /primary-gateway-dp-test
24 | 


--------------------------------------------------------------------------------
/conformance/reports/v0.5.1/gateway/agentgateway/inference-v0.7.2-report.yaml:
--------------------------------------------------------------------------------
 1 | GatewayAPIInferenceExtensionVersion: v0.5.1
 2 | apiVersion: gateway.networking.k8s.io/v1
 3 | date: "2025-08-06T17:50:20-07:00"
 4 | gatewayAPIChannel: experimental
 5 | gatewayAPIVersion: v1.3.0
 6 | implementation:
 7 |   contact:
 8 |   - github.com/agentgateway/agentgateway/issues/new/choose
 9 |   organization: agentgateway
10 |   project: agentgateway
11 |   url: http://agentgateway.dev/
12 |   version: v0.7.2
13 | kind: ConformanceReport
14 | mode: default
15 | profiles:
16 | - core:
17 |     result: success
18 |     statistics:
19 |       Failed: 0
20 |       Passed: 9
21 |       Skipped: 0
22 |   name: Gateway
23 |   summary: Core tests succeeded.
24 | 


--------------------------------------------------------------------------------
/conformance/reports/v0.5.1/gateway/kgateway/inference-v2.0.4-report.yaml:
--------------------------------------------------------------------------------
 1 | GatewayAPIInferenceExtensionVersion: v0.5.1
 2 | apiVersion: gateway.networking.k8s.io/v1
 3 | date: "2025-08-06T15:05:42-07:00"
 4 | gatewayAPIChannel: experimental
 5 | gatewayAPIVersion: v1.3.0
 6 | implementation:
 7 |   contact:
 8 |   - github.com/kgateway-dev/kgateway/issues/new/choose
 9 |   organization: kgateway-dev
10 |   project: kgateway
11 |   url: github.com/kgateway-dev/kgateway
12 |   version: v2.0.4
13 | kind: ConformanceReport
14 | mode: default
15 | profiles:
16 | - core:
17 |     result: success
18 |     statistics:
19 |       Failed: 0
20 |       Passed: 9
21 |       Skipped: 0
22 |   name: Gateway
23 |   summary: Core tests succeeded.
24 | 


--------------------------------------------------------------------------------
/conformance/reports/v1.0.2/gateway/kgateway/inference-v2.1.0-report.yaml:
--------------------------------------------------------------------------------
 1 | GatewayAPIInferenceExtensionVersion: v1.0.2
 2 | apiVersion: gateway.networking.k8s.io/v1
 3 | date: "2025-10-27T13:11:40-07:00"
 4 | gatewayAPIChannel: experimental
 5 | gatewayAPIVersion: v1.4.0
 6 | implementation:
 7 |   contact:
 8 |   - github.com/kgateway-dev/kgateway/issues/new/choose
 9 |   organization: kgateway-dev
10 |   project: kgateway
11 |   url: github.com/kgateway-dev/kgateway
12 |   version: v2.1.1
13 | kind: ConformanceReport
14 | mode: default
15 | profiles:
16 | - core:
17 |     result: success
18 |     statistics:
19 |       Failed: 0
20 |       Passed: 9
21 |       Skipped: 0
22 |   name: Gateway
23 |   summary: Core tests succeeded.
24 | 


--------------------------------------------------------------------------------
/conformance/reports/v0.5.1/gateway/envoy-ai-gateway/aigw-latest-report.yaml:
--------------------------------------------------------------------------------
 1 | GatewayAPIInferenceExtensionVersion: v0.5.1
 2 | apiVersion: gateway.networking.k8s.io/v1
 3 | date: "2025-08-15T14:10:31-09:00"
 4 | gatewayAPIChannel: experimental
 5 | gatewayAPIVersion: v1.3.0
 6 | implementation:
 7 |   contact:
 8 |   - github.com/envoyproxy/ai-gateway/issues/new/choose
 9 |   organization: envoyproxy
10 |   project: envoy-ai-gateway
11 |   url: github.com/envoyproxy/ai-gateway
12 |   version: latest
13 | kind: ConformanceReport
14 | mode: default
15 | profiles:
16 | - core:
17 |     result: success
18 |     statistics:
19 |       Failed: 0
20 |       Passed: 9
21 |       Skipped: 0
22 |   name: Gateway
23 |   summary: Core tests succeeded.
24 | 


--------------------------------------------------------------------------------
/.github/workflows/kal.yml:
--------------------------------------------------------------------------------
 1 | name: PR golangci-lint
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     types: [opened, edited, synchronize, reopened]
 6 | 
 7 | # Remove all permissions from GITHUB_TOKEN except metadata.
 8 | permissions: {}
 9 | 
10 | jobs:
11 |   golangci:
12 |     name: kube-api-lint
13 |     runs-on: ubuntu-latest
14 |     strategy:
15 |       fail-fast: false
16 |     steps:
17 |       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # tag=v4.2.2
18 |         name: Checkout code
19 |         with:
20 |           persist-credentials: false
21 |       - name: Set up Go
22 |         uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # tag=v5.5.0
23 |       - name: Run API Linter
24 |         run: make api-lint


--------------------------------------------------------------------------------
/conformance/reports/v0.5.0/gateway/gke-gateway/standard-v1.32.4-rxlb-gateway-report.yaml:
--------------------------------------------------------------------------------
 1 | GatewayAPIInferenceExtensionVersion: v0.5.0
 2 | apiVersion: gateway.networking.k8s.io/v1
 3 | date: "2025-07-21T22:52:10Z"
 4 | gatewayAPIChannel: standard
 5 | gatewayAPIVersion: v1.2.1
 6 | implementation:
 7 |   contact:
 8 |   - gke-gateway-dev@google.com
 9 |   organization: GKE
10 |   project: gke-gateway
11 |   url: https://cloud.google.com/kubernetes-engine/docs/concepts/gateway-api
12 |   version: 1.32.4-gke.1415000
13 | kind: ConformanceReport
14 | mode: default
15 | profiles:
16 | - core:
17 |     result: success
18 |     statistics:
19 |       Failed: 0
20 |       Passed: 9
21 |       Skipped: 0
22 |   name: Gateway
23 |   summary: Core tests succeeded.
24 | 


--------------------------------------------------------------------------------
/conformance/reports/v0.5.1/gateway/agentgateway/README.md:
--------------------------------------------------------------------------------
 1 | # Agentgateway (with kgateway)
 2 | 
 3 | ## Table of Contents
 4 | 
 5 | | Extension Version Tested | Profile Tested | Implementation Version | Mode    | Report                                                                     |
 6 | |--------------------------|----------------|------------------------|---------|----------------------------------------------------------------------------|
 7 | | v0.5.1                   | Gateway        | v0.7.2                 | default | [v0.7.2 report](./inference-v0.7.2-report.yaml)   |
 8 | 
 9 | ## Reproduce
10 | 
11 | From the [kgateway repository](https://github.com/kgateway-dev/kgateway/): `CONFORMANCE_GATEWAY_CLASS=agentgateway make gie-conformance`.
12 | 


--------------------------------------------------------------------------------
/conformance/reports/v0.5.1/gateway/kubvernor/kubvernor-inference-conformance-output-0.1.1.yaml:
--------------------------------------------------------------------------------
 1 | GatewayAPIInferenceExtensionVersion: v0.5.1
 2 | apiVersion: gateway.networking.k8s.io/v1
 3 | date: "2025-08-25T15:42:29+01:00"
 4 | gatewayAPIChannel: standard
 5 | gatewayAPIVersion: v1.2.1
 6 | implementation:
 7 |   contact:
 8 |     - https://github.com/kubvernor/kubvernor
 9 |   organization: kubvernor
10 |   project: kubvernor
11 |   url: https://github.com/kubvernor/kubvernor
12 |   version: 0.1.1
13 | kind: ConformanceReport
14 | mode: default
15 | profiles:
16 |   - core:
17 |       result: success
18 |       statistics:
19 |         Failed: 0
20 |         Passed: 9
21 |         Skipped: 0
22 |     name: Gateway
23 |     summary: Core tests succeeded.
24 | 


--------------------------------------------------------------------------------
/conformance/reports/v0.4.0/gateway/istio/1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml:
--------------------------------------------------------------------------------
 1 | GatewayAPIInferenceExtensionVersion: v0.4.0
 2 | apiVersion: gateway.networking.k8s.io/v1
 3 | date: "2025-07-23T14:20:45+02:00"
 4 | gatewayAPIChannel: standard
 5 | gatewayAPIVersion: v1.3.0
 6 | implementation:
 7 |   contact:
 8 |   - '@istio/maintainers'
 9 |   organization: istio
10 |   project: istio
11 |   url: https://istio.io
12 |   version: 1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91
13 | kind: ConformanceReport
14 | mode: default
15 | profiles:
16 | - core:
17 |     result: success
18 |     statistics:
19 |       Failed: 0
20 |       Passed: 9
21 |       Skipped: 0
22 |   name: Gateway
23 |   summary: Core tests succeeded.
24 | 


--------------------------------------------------------------------------------
/conformance/reports/v0.5.0/gateway/istio/1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml:
--------------------------------------------------------------------------------
 1 | GatewayAPIInferenceExtensionVersion: v0.5.0
 2 | apiVersion: gateway.networking.k8s.io/v1
 3 | date: "2025-07-23T14:31:41+02:00"
 4 | gatewayAPIChannel: standard
 5 | gatewayAPIVersion: v1.3.0
 6 | implementation:
 7 |   contact:
 8 |   - '@istio/maintainers'
 9 |   organization: istio
10 |   project: istio
11 |   url: https://istio.io
12 |   version: 1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91
13 | kind: ConformanceReport
14 | mode: default
15 | profiles:
16 | - core:
17 |     result: success
18 |     statistics:
19 |       Failed: 0
20 |       Passed: 9
21 |       Skipped: 0
22 |   name: Gateway
23 |   summary: Core tests succeeded.
24 | 


--------------------------------------------------------------------------------
/pkg/epp/util/logging/logging_const.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package logging
18 | 
19 | // Named numeric levels exported by the logging package, listed in
20 | // increasing order (DEFAULT=2 up to TRACE=5).
21 | // NOTE(review): presumably these are log verbosity levels (e.g. passed to a
22 | // logger's V() call); confirm against callers.
23 | const (
24 | 	DEFAULT = 2
25 | 	VERBOSE = 3
26 | 	DEBUG   = 4
27 | 	TRACE   = 5
28 | )
29 | 


--------------------------------------------------------------------------------
/bbr.Dockerfile:
--------------------------------------------------------------------------------
 1 | # Dockerfile has specific requirement to put this ARG at the beginning:
 2 | # https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
 3 | ARG BUILDER_IMAGE=golang:1.24
 4 | ARG BASE_IMAGE=gcr.io/distroless/static:nonroot
 5 | 
 6 | ## Multistage build
 7 | FROM ${BUILDER_IMAGE} AS builder
 8 | ENV CGO_ENABLED=0
 9 | ENV GOOS=linux
10 | ENV GOARCH=amd64
11 | 
12 | # Dependencies
13 | WORKDIR /src
14 | COPY go.mod go.sum ./
15 | RUN go mod download
16 | 
17 | # Sources
18 | COPY cmd/bbr ./cmd
19 | COPY pkg ./pkg
20 | COPY internal ./internal
21 | COPY api ./api
22 | WORKDIR /src/cmd
23 | RUN go build -o /bbr
24 | 
25 | ## Multistage deploy
26 | FROM ${BASE_IMAGE}
27 | 
28 | WORKDIR /
29 | COPY --from=builder /bbr /bbr
30 | 
31 | ENTRYPOINT ["/bbr"]
32 | 


--------------------------------------------------------------------------------
/conformance/tests/epp_unavailable_fail_open.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | kind: HTTPRoute
 3 | metadata:
 4 |   name: httproute-for-failopen-pool-gw
 5 |   namespace: inference-conformance-app-backend
 6 | spec:
 7 |   parentRefs:
 8 |   - group: gateway.networking.k8s.io
 9 |     kind: Gateway
10 |     name: conformance-secondary
11 |     namespace: inference-conformance-infra
12 |     sectionName: http
13 |   hostnames:
14 |   - "secondary.example.com"
15 |   rules:
16 |   - backendRefs:
17 |     - group: inference.networking.k8s.io
18 |       kind: InferencePool
19 |       name: secondary-inference-pool # Use secondary-inferencePool because it has failureMode set to failOpen
20 |     matches:
21 |     - path:
22 |         type: PathPrefix
23 |         value: /failopen-pool-test
24 | 


--------------------------------------------------------------------------------
/site-src/_includes/bbr.md:
--------------------------------------------------------------------------------
1 | ### Deploy the Body Based Router Extension (Optional)
2 | 
3 | This guide has shown how to get started with serving a single base model type per L7 URL path. If you wish to go further and exercise model-aware routing, so that more than one base model is served at the same L7 URL path, you need the (optional) Body Based Routing (BBR) extension, which is described in the [`Serving Multiple GenAI Models`](serve-multiple-genai-models.md) section of the documentation. To exercise that functionality, retain the setup you have deployed so far from this guide and continue with the additional steps described in [that guide](serve-multiple-genai-models.md); otherwise, move on to the following section to clean up your setup.
4 | 


--------------------------------------------------------------------------------
/conformance/reports/v0.5.1/gateway/kubvernor/README.md:
--------------------------------------------------------------------------------
 1 | # Kubvernor Rust Gateway
 2 | 
 3 | ## Table of Contents
 4 | 
 5 | | Extension Version Tested | Profile Tested | Implementation Version | Mode    | Report                                                                |
 6 | |--------------------------|----------------|------------------------|---------|-----------------------------------------------------------------------|
 7 | | v0.5.1                   | Gateway        | [0.1.1](https://github.com/kubvernor/kubvernor/releases/tag/0.1.1)                  | default | [Conformance report](./kubvernor-inference-conformance-output-0.1.1.yaml) |
 8 | 
 9 | ## Reproduce
10 | 
11 | To reproduce the Kubvernor conformance report, follow the [README](https://github.com/kubvernor/kubvernor/blob/0.1.1/README.md).
12 | 
13 | 


--------------------------------------------------------------------------------
/config/manifests/inferenceobjective.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: inference.networking.x-k8s.io/v1alpha2
 2 | kind: InferenceObjective
 3 | metadata:
 4 |   name: food-review
 5 | spec:
 6 |   priority: 1
 7 |   poolRef:
 8 |     group: inference.networking.k8s.io
 9 |     name: vllm-llama3-8b-instruct
10 | ---
11 | apiVersion: inference.networking.x-k8s.io/v1alpha2
12 | kind: InferenceObjective
13 | metadata:
14 |   name: base-model
15 | spec:
16 |   priority: 2
17 |   poolRef:
18 |     group: inference.networking.k8s.io
19 |     name: vllm-llama3-8b-instruct
20 | ---
21 | apiVersion: inference.networking.x-k8s.io/v1alpha2
22 | kind: InferenceObjective
23 | metadata:
24 |   name: base-model-cpu
25 | spec:
26 |   priority: 2
27 |   poolRef:
28 |     group: inference.networking.k8s.io
29 |     name: vllm-llama3-8b-instruct
30 | 


--------------------------------------------------------------------------------
/client-go/clientset/versioned/fake/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by client-gen. DO NOT EDIT.
18 | 
19 | // This package has the automatically generated fake clientset.
20 | package fake
21 | 


--------------------------------------------------------------------------------
/pkg/epp/backend/pod.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package backend
18 | 
19 | import (
20 | 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer"
21 | )
22 | 
23 | // Pod is an alias for datalayer.PodInfo.
24 | type Pod = datalayer.PodInfo
25 | 


--------------------------------------------------------------------------------
/client-go/clientset/versioned/typed/api/v1/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by client-gen. DO NOT EDIT.
18 | 
19 | // This package has the automatically generated typed clients.
20 | package v1
21 | 


--------------------------------------------------------------------------------
/client-go/clientset/versioned/typed/api/v1/fake/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by client-gen. DO NOT EDIT.
18 | 
19 | // Package fake has the automatically generated clients.
20 | package fake
21 | 


--------------------------------------------------------------------------------
/client-go/clientset/versioned/typed/api/v1/generated_expansion.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by client-gen. DO NOT EDIT.
18 | 
19 | package v1
20 | 
21 | type InferencePoolExpansion interface{}
22 | 


--------------------------------------------------------------------------------
/hack/referencer.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2024 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package internal
18 | 
19 | import (
20 | 	// Keep a reference to the code generators so they are not removed by `go mod tidy`
21 | 	_ "k8s.io/code-generator"
22 | )
23 | 


--------------------------------------------------------------------------------
/client-go/clientset/versioned/typed/apix/v1alpha1/fake/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by client-gen. DO NOT EDIT.
18 | 
19 | // Package fake has the automatically generated clients.
20 | package fake
21 | 


--------------------------------------------------------------------------------
/client-go/clientset/versioned/typed/apix/v1alpha2/fake/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by client-gen. DO NOT EDIT.
18 | 
19 | // Package fake has the automatically generated clients.
20 | package fake
21 | 


--------------------------------------------------------------------------------
/conformance/tests/inferencepool_accepted.yaml:
--------------------------------------------------------------------------------
 1 | # --- HTTPRoute Definition ---
 2 | apiVersion: gateway.networking.k8s.io/v1
 3 | kind: HTTPRoute
 4 | metadata:
 5 |   name: httproute-for-inferencepool-accepted
 6 |   namespace: inference-conformance-app-backend
 7 | spec:
 8 |   parentRefs:
 9 |   - group: gateway.networking.k8s.io
10 |     kind: Gateway
11 |     name: conformance-primary
12 |     namespace: inference-conformance-infra
13 |     sectionName: http
14 |   rules:
15 |   - backendRefs:
16 |     - group: inference.networking.k8s.io
17 |       kind: InferencePool
18 |       name: primary-inference-pool
19 |       # namespace: inference-conformance-app-backend - is omitted since it is in the same namespace as HTTPRoute
20 |     matches:
21 |     - path:
22 |         type: PathPrefix
23 |         value: /accepted-pool-test
24 | 


--------------------------------------------------------------------------------
/client-go/clientset/versioned/scheme/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by client-gen. DO NOT EDIT.
18 | 
19 | // This package contains the scheme of the automatically generated clientset.
20 | package scheme
21 | 


--------------------------------------------------------------------------------
/client-go/clientset/versioned/typed/apix/v1alpha1/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by client-gen. DO NOT EDIT.
18 | 
19 | // This package has the automatically generated typed clients.
20 | package v1alpha1
21 | 


--------------------------------------------------------------------------------
/client-go/clientset/versioned/typed/apix/v1alpha2/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by client-gen. DO NOT EDIT.
18 | 
19 | // This package has the automatically generated typed clients.
20 | package v1alpha2
21 | 


--------------------------------------------------------------------------------
/client-go/clientset/versioned/typed/apix/v1alpha1/generated_expansion.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by client-gen. DO NOT EDIT.
18 | 
19 | package v1alpha1
20 | 
21 | type InferencePoolImportExpansion interface{}
22 | 


--------------------------------------------------------------------------------
/config/charts/inferencepool/templates/epp-service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ include "gateway-api-inference-extension.name" . }}
 5 |   namespace: {{ .Release.Namespace }}
 6 |   labels:
 7 |     {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
 8 | spec:
 9 |   selector:
10 |     {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 4 }}
11 |   ports:
12 |     - name: grpc-ext-proc
13 |       protocol: TCP
14 |       port: {{ .Values.inferenceExtension.extProcPort | default 9002 }}
15 |     - name: http-metrics
16 |       protocol: TCP
17 |       port: {{ .Values.inferenceExtension.metricsPort | default 9090 }}
18 |     {{- with .Values.inferenceExtension.extraServicePorts }}
19 |     {{- toYaml . | nindent 4 }}
20 |     {{- end }}
21 |   type: ClusterIP
22 | 


--------------------------------------------------------------------------------
/conformance/reports/v0.5.1/gateway/ack-gateway/v1.4.0-apsara.3-gateway-report.yaml:
--------------------------------------------------------------------------------
 1 | GatewayAPIInferenceExtensionVersion: v0.5.1
 2 | apiVersion: gateway.networking.k8s.io/v1
 3 | date: "2025-08-18T18:15:11+08:00"
 4 | gatewayAPIChannel: experimental
 5 | gatewayAPIVersion: v1.3.0
 6 | implementation:
 7 |   contact:
 8 |   - https://smartservice.console.aliyun.com/service/create-ticket
 9 |   organization: AlibabaCloud
10 |   project: ack-gateway-with-inference-extension
11 |   url: https://www.alibabacloud.com/help/en/cs/user-guide/gateway-with-inference-extension-overview
12 |   version: v1.4.0-apsara.3
13 | kind: ConformanceReport
14 | mode: default
15 | profiles:
16 | - core:
17 |     result: success
18 |     statistics:
19 |       Failed: 0
20 |       Passed: 9
21 |       Skipped: 0
22 |   name: Gateway
23 |   summary: Core tests succeeded.


--------------------------------------------------------------------------------
/pkg/epp/util/request/sheddable.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package request
18 | 
// IsSheddable reports whether a request with the given priority may be shed.
// Only strictly negative priorities are sheddable; a priority of zero or
// above is not.
func IsSheddable(priority int) bool {
	// Priorities strictly below this threshold are sheddable.
	const sheddableBelow = 0
	return priority < sheddableBelow
}
23 | 


--------------------------------------------------------------------------------
/latencypredictor/Dockerfile-training:
--------------------------------------------------------------------------------
 1 | # Use an official Python runtime as a parent image
 2 | FROM python:3.11-slim
 3 | 
 4 | # Set the working directory in the container
 5 | WORKDIR /app
 6 | 
 7 | # Copy the requirements file and install dependencies
 8 | # (It's good practice to manage dependencies in a requirements.txt file)
 9 | 
10 | 
11 | RUN apt-get update && apt-get install -y \
12 |     libgomp1 \
13 |     && rm -rf /var/lib/apt/lists/*
14 | 
15 |     
16 | COPY requirements.txt .
17 | RUN pip install --no-cache-dir -r requirements.txt
18 | 
19 | # Copy the rest of the application code
20 | COPY . .
21 | 
22 | # Expose the port the app runs on
23 | EXPOSE 8000
24 | 
25 | # Command to run the application using uvicorn
26 | # We use 0.0.0.0 to bind to all network interfaces inside the container
27 | CMD ["uvicorn", "training_server:app", "--host", "0.0.0.0", "--port", "8000"]


--------------------------------------------------------------------------------
/latencypredictor/Dockerfile-prediction:
--------------------------------------------------------------------------------
 1 | # Use an official Python runtime as a parent image
 2 | FROM python:3.11-slim
 3 | 
 4 | # Set the working directory in the container
 5 | WORKDIR /app
 6 | 
 7 | # Copy the requirements file and install dependencies
 8 | # (It's good practice to manage dependencies in a requirements.txt file)
 9 | 
10 | 
11 | RUN apt-get update && apt-get install -y \
12 |     libgomp1 \
13 |     && rm -rf /var/lib/apt/lists/*
14 |     
15 | COPY requirements.txt .
16 | RUN pip install --no-cache-dir -r requirements.txt
17 | 
18 | # Copy the rest of the application code
19 | COPY . .
20 | 
21 | # Expose the port the app runs on
22 | EXPOSE 8001
23 | 
24 | # Command to run the application using uvicorn
25 | # We use 0.0.0.0 to bind to all network interfaces inside the container
26 | CMD ["uvicorn", "prediction_server:app", "--host", "0.0.0.0", "--port", "8001"]
27 | 


--------------------------------------------------------------------------------
/apix/v1alpha1/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Package v1alpha1 contains API Schema definitions for the
18 | // inference.networking.x-k8s.io API group.
19 | //
20 | // +kubebuilder:object:generate=true
21 | // +groupName=inference.networking.x-k8s.io
22 | package v1alpha1
23 | 


--------------------------------------------------------------------------------
/apix/config/v1alpha1/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Package v1alpha1 contains API Schema definitions for the
18 | // inference.networking.x-k8s.io API group.
19 | //
20 | // +kubebuilder:object:generate=true
21 | // +groupName=inference.networking.x-k8s.io
22 | package v1alpha1
23 | 


--------------------------------------------------------------------------------
/hack/mkdocs/image/entrypoint.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Copyright 2019 The Kubernetes Authors.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
set -o errexit
set -o pipefail

# The first positional argument selects the mode; it is empty when no
# argument is given.
CMD=${1:-}

# "build" renders the site once and exits successfully.
# The comparison uses "=", the string-equality operator defined by POSIX for
# test/[; "==" is an extension that not every /bin/sh implements.
if [ "$CMD" = "build" ]; then
  mkdocs build
  exit 0
fi

# Any other invocation starts the live-reloading server on port 3000, bound to
# all interfaces. exec replaces the shell with mkdocs so that signals
# delivered to this process reach the server directly.
exec mkdocs serve --dev-addr=0.0.0.0:3000 --livereload


--------------------------------------------------------------------------------
/conformance/tests/inferencepool_multiple_rules_different_pools.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | apiVersion: gateway.networking.k8s.io/v1
 3 | kind: HTTPRoute
 4 | metadata:
 5 |   name: httproute-multiple-rules-different-pools
 6 |   namespace: inference-conformance-app-backend
 7 | spec:
 8 |   parentRefs:
 9 |     - name: conformance-primary
10 |       namespace: inference-conformance-infra
11 |   rules:
12 |     - matches:
13 |         - path:
14 |             type: PathPrefix
15 |             value: /primary
16 |       backendRefs:
17 |         - name: primary-inference-pool
18 |           kind: InferencePool
19 |           group: inference.networking.k8s.io
20 |     - matches:
21 |         - path:
22 |             type: PathPrefix
23 |             value: /secondary
24 |       backendRefs:
25 |         - name: secondary-inference-pool
26 |           kind: InferencePool
27 |           group: inference.networking.k8s.io
28 | 


--------------------------------------------------------------------------------
/site-src/concepts/roles-and-personas.md:
--------------------------------------------------------------------------------
 1 | # Roles and Personas
 2 | 
 3 | Before diving into the details of the API, descriptions of the personas these APIs were designed for will help convey the thought process of the API design.
 4 | 
 5 | ## Inference Platform Admin
 6 | 
 7 | The Inference Platform Admin creates and manages the infrastructure necessary to run LLM workloads, including handling Ops for:
 8 | 
 9 |   - Hardware
10 |   - Model Server
11 |   - Base Model
12 |   - Resource Allocation for Workloads
13 |   - Gateway configuration
14 |   - etc
15 | 
16 | ## Inference Workload Owner
17 | 
18 | An Inference Workload Owner persona owns and manages one or many Generative AI Workloads (LLM focused *currently*). This includes:
19 | 
20 | - Defining priority
21 | - Managing fine-tunes
22 |   - LoRA Adapters
23 |   - System Prompts
24 |   - Prompt Cache
25 |   - etc.
26 | - Managing rollout of adapters
27 | 


--------------------------------------------------------------------------------
/hack/mkdocs/image/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Copyright 2019 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
FROM python:3.13-alpine

# Install the Python dependencies listed in requirements.txt.
# --no-cache-dir keeps pip's download cache out of the image layer.
COPY requirements.txt /requirements.txt
RUN pip install --no-cache-dir -r /requirements.txt

WORKDIR /docs

# Port used by the entrypoint's "mkdocs serve" mode.
EXPOSE 3000

COPY entrypoint.sh /

ENTRYPOINT ["/entrypoint.sh"]


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug Report
 3 | about: Report a bug you encountered
 4 | title: ''
 5 | labels: kind/bug, needs-triage
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | <!-- Please use this template while reporting a bug and provide as much info as possible. Not doing so may result in your bug not being addressed in a timely manner. Thanks!
11 | 
12 | If the matter is security related, please disclose it privately via https://kubernetes.io/security/
13 | -->
14 | 
15 | **What happened**:
16 | 
17 | **What you expected to happen**:
18 | 
19 | **How to reproduce it (as minimally and precisely as possible)**:
20 | 
21 | **Anything else we need to know?**:
22 | 
23 | **Environment**:
24 | - Kubernetes version (use `kubectl version`):
25 | - Inference extension version (use `git describe --tags --dirty --always`):
26 | - Cloud provider or hardware configuration:
27 | - Install tools:
28 | - Others:
29 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | # Security Policy
 2 | 
 3 | ## Security Announcements
 4 | 
 5 | Join the [kubernetes-security-announce] group for security and vulnerability announcements.
 6 | 
 7 | ## Reporting a Vulnerability
 8 | 
 9 | Instructions for reporting a vulnerability can be found on the
10 | [Kubernetes Security and Disclosure Information] page.
11 | 
12 | ## Supported Versions
13 | 
14 | Information about supported Kubernetes versions can be found on the
15 | [Kubernetes version and version skew support policy] page on the Kubernetes website.
16 | 
17 | [kubernetes-security-announce]: https://groups.google.com/forum/#!forum/kubernetes-security-announce
18 | [Kubernetes version and version skew support policy]: https://kubernetes.io/docs/setup/release/version-skew-policy/#supported-versions
19 | [Kubernetes Security and Disclosure Information]: https://kubernetes.io/docs/reference/issues-security/security/#report-a-vulnerability
20 | 


--------------------------------------------------------------------------------
/api/v1/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Package v1 contains API Schema definitions for the
18 | // inference.networking.k8s.io API group.
19 | //
20 | // +k8s:openapi-gen=true
21 | // +kubebuilder:object:generate=true
22 | // +groupName=inference.networking.k8s.io
23 | // +groupGoName=Inference
24 | package v1
25 | 


--------------------------------------------------------------------------------
/conformance/embed.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package conformance
18 | 
19 | import "embed"
20 | 
// Manifests embeds the resources directory and the entries matched by
// tests/* (both relative to this package's directory), making the files
// within them available to the test suite at runtime.
//
//go:embed resources tests/*
var Manifests embed.FS
26 | 


--------------------------------------------------------------------------------
/client-go/clientset/versioned/typed/apix/v1alpha2/generated_expansion.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by client-gen. DO NOT EDIT.
18 | 
19 | package v1alpha2
20 | 
// InferenceModelRewriteExpansion is an empty interface (it declares no methods).
// NOTE(review): presumably the client-gen extension point for hand-written
// methods on the InferenceModelRewrite typed client — confirm.
type InferenceModelRewriteExpansion interface{}
22 | 
// InferenceObjectiveExpansion is an empty interface (it declares no methods).
// NOTE(review): presumably the client-gen extension point for hand-written
// methods on the InferenceObjective typed client — confirm.
type InferenceObjectiveExpansion interface{}
24 | 
// InferencePoolExpansion is an empty interface (it declares no methods).
// NOTE(review): presumably the client-gen extension point for hand-written
// methods on the InferencePool typed client — confirm.
type InferencePoolExpansion interface{}
26 | 


--------------------------------------------------------------------------------
/pkg/epp/datalayer/metrics/types.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package metrics
18 | 
19 | import (
20 | 	"reflect"
21 | 
22 | 	dto "github.com/prometheus/client_model/go"
23 | )
24 | 
// PrometheusMetricMap is an alias for a map from string keys to
// *dto.MetricFamily values.
// NOTE(review): keys are presumably metric family names — confirm against
// the code that populates the map.
type PrometheusMetricMap = map[string]*dto.MetricFamily
26 | 
27 | var (
28 | 	PrometheusMetricType = reflect.TypeOf(PrometheusMetricMap{})
29 | )
30 | 


--------------------------------------------------------------------------------
/apix/v1alpha2/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Package v1alpha2 contains API Schema definitions for the
18 | // inference.networking.x-k8s.io API group.
19 | //
20 | // +k8s:openapi-gen=true
21 | // +kubebuilder:object:generate=true
22 | // +groupName=inference.networking.x-k8s.io
23 | // +groupGoName=XInference
24 | package v1alpha2
25 | 


--------------------------------------------------------------------------------
/config/observability/prometheus/values.yaml:
--------------------------------------------------------------------------------
 1 | serviceAccounts:
 2 |   server:
 3 |     create: false
 4 |     name: inference-gateway-sa-metrics-reader
 5 | 
 6 | extraScrapeConfigs: |
 7 |   - job_name: 'inference-extension-epp'
 8 |     authorization:
 9 |         credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
10 |     scrape_interval: 5s   
11 |     kubernetes_sd_configs:
12 |       - role: endpoints
13 |     relabel_configs:
14 |       - source_labels: [__meta_kubernetes_service_name]
15 |         action: keep
16 |         regex: .*-epp$
17 |       - source_labels: [__meta_kubernetes_pod_container_port_number]
18 |         action: keep
19 |         regex: "9090"
20 |   - job_name: vllm
21 |     scrape_interval: 5s   
22 |     kubernetes_sd_configs:
23 |       - role: pod
24 |     relabel_configs:
25 |       - source_labels: [__meta_kubernetes_pod_label_app]
26 |         action: keep
27 |         regex: vllm-llama3-8b-instruct
28 | 


--------------------------------------------------------------------------------
/cmd/bbr/main.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package main
18 | 
19 | import (
20 | 	"os"
21 | 
22 | 	ctrl "sigs.k8s.io/controller-runtime"
23 | 	"sigs.k8s.io/gateway-api-inference-extension/cmd/bbr/runner"
24 | )
25 | 
26 | func main() {
27 | 	if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil {
28 | 		os.Exit(1)
29 | 	}
30 | 
31 | }
32 | 


--------------------------------------------------------------------------------
/docs/proposals/0845-scheduler-architecture-proposal/examples/example.yaml:
--------------------------------------------------------------------------------
 1 | # Names are egregiously long, but they attempt to describe custom logic within a name
 2 | profileSelection: disagg-token-length
 3 | schedulingResult: log-shadowbox-label-pd-result 
 4 | profiles:
 5 |   prefill:
 6 |     preschedule:
 7 |       - decode-prefix-cache-check
 8 |     filter:
 9 |       - is-prefill
10 |       - has-required-accelerator
11 |     score:
12 |       - prefix-cache: 3
13 |       - latency-scorer: 2
14 |     selection:
15 |       - best-score
16 |     postschedule:
17 |       - log-full-scores
18 |   decode:
19 |     filter:
20 |       - is-decode
21 |     score:
22 |       - prefix-cache: 3
23 |       - kv-cache-util: 5
24 |     selection:
25 |       - random-top-3
26 |   shadowbox-decode:
27 |     filter:
28 |       - is-decode
29 |       - is-tpu
30 |     score:
31 |       - prefix-cache-v2: 4
32 |       - kv-cache-util: 1
33 |     selection:
34 |       - random-top-3
35 | 


--------------------------------------------------------------------------------
/latencypredictor/Dockerfile-test:
--------------------------------------------------------------------------------
# Dockerfile-test: image that runs test_dual_server_client.py with pytest.
# NOTE(review): this image is based on python:3.9-slim while the training and
# prediction images use python:3.11-slim — confirm the difference is intended.
FROM python:3.9-slim

# Install system dependencies (curl, wget, jq) and remove the apt package
# lists in the same layer.
RUN apt-get update && apt-get install -y \
    curl \
    wget \
    jq \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install additional testing dependencies
RUN pip install --no-cache-dir \
    pytest \
    pytest-asyncio \
    requests \
    httpx \
    aiohttp

# Copy test files (only the single test module is copied into the image)
COPY test_dual_server_client.py .


# Create test results directory
RUN mkdir -p /test-results

# Set environment variables:
# PYTHONPATH makes /app importable; PYTHONUNBUFFERED=1 disables output buffering.
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# Default command runs the specific test (-v verbose, -s disables output capture)
CMD ["pytest", "-v", "-s", "test_dual_server_client.py"]


--------------------------------------------------------------------------------
/tools/tools.go:
--------------------------------------------------------------------------------
 1 | //go:build tools
 2 | // +build tools
 3 | 
 4 | /*
 5 | Copyright 2025 The Kubernetes Authors.
 6 | 
 7 | Licensed under the Apache License, Version 2.0 (the "License");
 8 | you may not use this file except in compliance with the License.
 9 | You may obtain a copy of the License at
10 | 
11 |     http://www.apache.org/licenses/LICENSE-2.0
12 | 
13 | Unless required by applicable law or agreed to in writing, software
14 | distributed under the License is distributed on an "AS IS" BASIS,
15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | See the License for the specific language governing permissions and
17 | limitations under the License.
18 | */
19 | 
20 | // Package tools contains import references to packages required only for the
21 | // build process.
22 | // https://github.com/golang/go/wiki/Modules#how-can-i-track-tool-dependencies-for-a-module
23 | package tools
24 | 
25 | import (
26 | 	_ "github.com/elastic/crd-ref-docs"
27 | )
28 | 


--------------------------------------------------------------------------------
/conformance/tests/gateway_weighted_two_pools.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: gateway.networking.k8s.io/v1
 2 | kind: HTTPRoute
 3 | metadata:
 4 |   name: httproute-weighted-two-pools
 5 |   namespace: inference-conformance-app-backend
 6 | spec:
 7 |   parentRefs:
 8 |   - group: gateway.networking.k8s.io
 9 |     kind: Gateway
10 |     name: conformance-primary
11 |     namespace: inference-conformance-infra
12 |     sectionName: http
13 |   hostnames:
14 |   - "primary.example.com"
15 |   rules:
16 |   - matches:
17 |     - path:
18 |         type: PathPrefix
19 |         value: /weighted-two-pools-test
20 |     backendRefs:
21 |     # 70% of traffic goes to the primary pool
22 |     - group: inference.networking.k8s.io
23 |       kind: InferencePool
24 |       name: primary-inference-pool
25 |       weight: 70
26 |     # 30% of traffic goes to the secondary pool
27 |     - group: inference.networking.k8s.io
28 |       kind: InferencePool
29 |       name: secondary-inference-pool
30 |       weight: 30
31 | 


--------------------------------------------------------------------------------
/apix/v1alpha1/shared_types.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package v1alpha1
18 | 
// Annotation key and value used to export an InferencePool.
var (
	// ExportAnnotationKey is the key of the annotation that marks an
	// InferencePool for export.
	ExportAnnotationKey = "inference.networking.x-k8s.io/export"

	// ExportAnnotationVal is the value of the export annotation that exports
	// an InferencePool to all clusters.
	ExportAnnotationVal = "ClusterSet"
)
25 | 


--------------------------------------------------------------------------------
/config/charts/inferencepool/templates/istio.yaml:
--------------------------------------------------------------------------------
 1 | {{- if eq .Values.provider.name "istio" }}
 2 | {{- /* Prefer .Values.provider.istio, fallback to legacy .Values.istio, then {} */ -}}
 3 | {{- $provIstio := (index .Values "provider" "istio") -}}
 4 | {{- $legacyIstio := .Values.istio -}}
 5 | {{- $istio := coalesce $provIstio $legacyIstio (dict) -}}
 6 | {{- $dr := (index $istio "destinationRule") | default (dict) -}}
 7 | 
 8 | apiVersion: networking.istio.io/v1beta1
 9 | kind: DestinationRule
10 | metadata:
11 |   name: {{ include "gateway-api-inference-extension.name" . }}
12 | spec:
13 |   host: {{ (index $dr "host") | default (printf "%s.%s.svc.cluster.local" (include "gateway-api-inference-extension.name" .) .Release.Namespace) }}
14 |   trafficPolicy:
15 |     tls:
16 |       mode: SIMPLE
17 |       insecureSkipVerify: true
18 |     {{- with (index (index $dr "trafficPolicy") "connectionPool") }}
19 |     connectionPool:
20 |       {{- toYaml . | nindent 6 }}
21 |     {{- end }}
22 | {{- end }}
23 | 


--------------------------------------------------------------------------------
/conformance/tests/inferencepool_invalid_epp_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: inference.networking.k8s.io/v1
 2 | kind: InferencePool
 3 | metadata:
 4 |   name: pool-with-invalid-epp
 5 |   namespace: inference-conformance-app-backend
 6 | spec:
 7 |   selector:
 8 |     matchLabels:
 9 |       app: primary-inference-model-server
10 |   targetPorts:
11 |   - number: 3000
12 |   endpointPickerRef:
13 |     name: non-existent-epp-svc
14 |     kind: Service
15 |     port:
16 |       number: 9002
17 | ---
18 | apiVersion: gateway.networking.k8s.io/v1
19 | kind: HTTPRoute
20 | metadata:
21 |   name: httproute-for-invalid-epp-pool
22 |   namespace: inference-conformance-app-backend
23 | spec:
24 |   parentRefs:
25 |   - name: conformance-primary
26 |     namespace: inference-conformance-infra
27 |   rules:
28 |   - backendRefs:
29 |     - name: pool-with-invalid-epp
30 |       kind: InferencePool
31 |       group: inference.networking.k8s.io
32 |     matches:
33 |     - path:
34 |         type: PathPrefix
35 |         value: /invalid-epp-test
36 | 


--------------------------------------------------------------------------------
/site-src/api-types/inferenceobjective.md:
--------------------------------------------------------------------------------
 1 | # Inference Objective
 2 | 
 3 | ??? example "Alpha since v1.0.0"
 4 | 
 5 |     The `InferenceObjective` resource is alpha and may have breaking changes in
 6 |     future releases of the API.
 7 | 
 8 | ## Background
 9 | 
10 | The **InferenceObjective** API defines a set of serving objectives for the specific request it is associated with. This CRD currently houses only `Priority` but will be expanded to include fields such as SLO attainment.
11 | 
12 | ## Usage
13 | 
14 | To associate a request to the InferencePool with a specific InferenceObjective, the system uses the `x-gateway-inference-objective` header, whose value must be the metadata name of the InferenceObjective. The calling client must therefore set this header on each request to select an InferenceObjective. If no InferenceObjective is selected, default values are used.
15 | 
16 | ## Spec
17 | 
18 | The full spec of the InferenceObjective is defined [here](/reference/x-v1a2-spec/#inferenceobjective).
19 | 


--------------------------------------------------------------------------------
/site-src/stylesheets/extra.css:
--------------------------------------------------------------------------------
 1 | /* Hide title in favor of logo */
 2 | .md-header__topic {
 3 |     display: none;
 4 | }
 5 | 
 6 | /* Use Kubernetes color as primary */
 7 | :root {
 8 |     --md-primary-fg-color: #326ce5;
 9 | }
10 | 
11 | /* Increase size of logo */
12 | .md-header__button.md-logo img, .md-header__button.md-logo svg {
13 |     height: 1.8rem;
14 | }
15 | 
16 | /* Always show tabs, even on smaller screens */
17 | @media screen and (max-width: 76.234375em) {
18 |     .md-header__button.md-logo {
19 |         display: block;
20 |     }
21 |     .md-tabs {
22 |         display: block;
23 |     }
24 | }
25 | 
26 | /* Rounded search box + results */
27 | .md-search__form {
28 |     border-radius: .5rem;
29 | }
30 | 
31 | [data-md-toggle=search]:checked~.md-header .md-search__form {
32 |     border-radius: .5rem .5rem 0 0;
33 | }
34 | [dir=ltr] .md-search__output {
35 |     border-radius: 0 0 .5rem .5rem;
36 | }
37 | 
38 | /* Center images  */
39 | img.center {
40 |     display: block;
41 |     margin: 20px auto;
42 | }
43 | 


--------------------------------------------------------------------------------
/pkg/epp/util/metrics/metrics.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package metrics
18 | 
19 | import (
20 | 	"fmt"
21 | 
22 | 	compbasemetrics "k8s.io/component-base/metrics"
23 | )
24 | 
25 | // HelpMsgWithStability is a helper function to create a help message with stability level.
26 | func HelpMsgWithStability(msg string, stability compbasemetrics.StabilityLevel) string {
27 | 	return fmt.Sprintf("[%v] %v", stability, msg)
28 | }
29 | 


--------------------------------------------------------------------------------
/conformance/conformance_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package conformance
18 | 
19 | import (
20 | 	"testing"
21 | )
22 | 
23 | // TestConformance is the top-level function that runs the conformance tests.
24 | // It calls the RunConformance function which sets up the suite and executes
25 | // the registered tests.
26 | func TestConformance(t *testing.T) {
27 | 	// RunConformance is defined in conformance.go
28 | 	RunConformance(t)
29 | }
30 | 


--------------------------------------------------------------------------------
/pkg/epp/util/logging/fatal.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package logging
18 | 
19 | import (
20 | 	"os"
21 | 
22 | 	"github.com/go-logr/logr"
23 | )
24 | 
25 | // Fatal calls logger.Error followed by os.Exit(1).
26 | //
27 | // This is a utility function and should not be used in production code!
28 | func Fatal(logger logr.Logger, err error, msg string, keysAndValues ...any) {
29 | 	logger.Error(err, msg, keysAndValues...)
30 | 	os.Exit(1)
31 | }
32 | 


--------------------------------------------------------------------------------
/test/testdata/metrics-rbac.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRole
 3 | metadata:
 4 |   name: inference-gateway-metrics-reader
 5 | rules:
 6 | - nonResourceURLs:
 7 |   - /metrics
 8 |   verbs:
 9 |   - get
10 | ---
11 | apiVersion: v1
12 | kind: ServiceAccount
13 | metadata:
14 |   name: inference-gateway-sa-metrics-reader
15 |   namespace: $E2E_NS
16 | ---
17 | apiVersion: rbac.authorization.k8s.io/v1
18 | kind: ClusterRoleBinding
19 | metadata:
20 |   name: inference-gateway-sa-metrics-reader-role-binding
21 | subjects:
22 | - kind: ServiceAccount
23 |   name: inference-gateway-sa-metrics-reader
24 |   namespace: $E2E_NS
25 | roleRef:
26 |   kind: ClusterRole
27 |   name: inference-gateway-metrics-reader
28 |   apiGroup: rbac.authorization.k8s.io
29 | ---
30 | apiVersion: v1
31 | kind: Secret
32 | metadata:
33 |   name: inference-gateway-sa-metrics-reader-secret
34 |   namespace: $E2E_NS
35 |   annotations:
36 |     kubernetes.io/service-account.name: inference-gateway-sa-metrics-reader
37 | type: kubernetes.io/service-account-token


--------------------------------------------------------------------------------
/tools/dynamic-lora-sidecar/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile for dynamic-lora-sidecar
 2 | 
 3 | PYTHON_VERSION := 3.10
 4 | VENV_DIR := venv
 5 | PYTHON := $(VENV_DIR)/bin/python
 6 | PIP := $(VENV_DIR)/bin/pip
 7 | 
 8 | .PHONY: help venv install test clean
 9 | 
10 | help: ## Show available targets
11 | 	@echo "Available targets:"
12 | 	@echo "  venv     - Create virtual environment"
13 | 	@echo "  install  - Install dependencies"
14 | 	@echo "  test     - Run unit tests"
15 | 	@echo "  clean    - Clean up virtual environment"
16 | 
17 | venv: $(VENV_DIR)/bin/activate ## Create virtual environment
18 | 
19 | $(VENV_DIR)/bin/activate:
20 | 	python$(PYTHON_VERSION) -m venv $(VENV_DIR)
21 | 
22 | install: venv ## Install dependencies
23 | 	$(PIP) install --upgrade pip
24 | 	$(PIP) install -r requirements.txt
25 | 
26 | test: install ## Run unit tests
27 | 	$(PYTHON) -m unittest discover -v -s sidecar
28 | 
29 | clean: ## Clean up virtual environment
30 | 	rm -rf $(VENV_DIR)
31 | 	rm -rf .pytest_cache
32 | 	find . -name "*.pyc" -delete
33 | 	find . -name "__pycache__" -type d -exec rm -rf {} +
34 | 


--------------------------------------------------------------------------------
/client-go/listers/api/v1/expansion_generated.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by lister-gen. DO NOT EDIT.
18 | 
19 | package v1
20 | 
21 | // InferencePoolListerExpansion allows custom methods to be added to
22 | // InferencePoolLister.
23 | type InferencePoolListerExpansion interface{}
24 | 
25 | // InferencePoolNamespaceListerExpansion allows custom methods to be added to
26 | // InferencePoolNamespaceLister.
27 | type InferencePoolNamespaceListerExpansion interface{}
28 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Dockerfile has specific requirement to put this ARG at the beginning:
 2 | # https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
 3 | ARG BUILDER_IMAGE=golang:1.24
 4 | ARG BASE_IMAGE=gcr.io/distroless/static:nonroot
 5 | 
 6 | ## Multistage build
 7 | FROM ${BUILDER_IMAGE} AS builder
 8 | ENV CGO_ENABLED=0
 9 | ENV GOOS=linux
10 | ENV GOARCH=amd64
11 | ARG COMMIT_SHA=unknown
12 | ARG BUILD_REF
13 | 
14 | # Dependencies
15 | WORKDIR /src
16 | COPY go.mod go.sum ./
17 | RUN go mod download
18 | 
19 | # Sources
20 | COPY cmd/epp ./cmd/epp
21 | COPY pkg/common ./pkg/common
22 | COPY pkg/epp ./pkg/epp
23 | COPY internal ./internal
24 | COPY apix ./apix
25 | COPY api ./api
26 | COPY version ./version
27 | WORKDIR /src/cmd/epp
28 | RUN go build -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/version.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/version.BuildRef=${BUILD_REF}" -o /epp
29 | 
30 | ## Multistage deploy
31 | FROM ${BASE_IMAGE}
32 | 
33 | WORKDIR /
34 | COPY --from=builder /epp /epp
35 | 
36 | ENTRYPOINT ["/epp"]
37 | 


--------------------------------------------------------------------------------
/pkg/epp/backend/metrics/metrics_state.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package metrics
18 | 
19 | import (
20 | 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer"
21 | )
22 | 
23 | // NewMetricsState initializes a new MetricsState and returns its pointer.
24 | func NewMetricsState() *MetricsState {
25 | 	return datalayer.NewMetrics()
26 | }
27 | 
28 | // MetricsState holds the latest state of the metrics that were scraped from a pod.
29 | type MetricsState = datalayer.Metrics
30 | 


--------------------------------------------------------------------------------
/pkg/epp/util/pod/pod.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package pod
18 | 
19 | import (
20 | 	corev1 "k8s.io/api/core/v1"
21 | )
22 | 
23 | func IsPodReady(pod *corev1.Pod) bool {
24 | 	if !pod.DeletionTimestamp.IsZero() {
25 | 		return false
26 | 	}
27 | 	for _, condition := range pod.Status.Conditions {
28 | 		if condition.Type == corev1.PodReady {
29 | 			if condition.Status == corev1.ConditionTrue {
30 | 				return true
31 | 			}
32 | 			break
33 | 		}
34 | 	}
35 | 	return false
36 | }
37 | 


--------------------------------------------------------------------------------
/.golangci.yml:
--------------------------------------------------------------------------------
 1 | version: "2"
 2 | run:
 3 |   allow-parallel-runners: true
 4 | linters:
 5 |   default: none
 6 |   enable:
 7 |     - copyloopvar
 8 |     - dupword
 9 |     - durationcheck
10 |     - errcheck
11 |     - fatcontext
12 |     - ginkgolinter
13 |     - goconst
14 |     - gocritic
15 |     - govet
16 |     - ineffassign
17 |     - loggercheck
18 |     - makezero
19 |     - misspell
20 |     - nakedret
21 |     - perfsprint
22 |     - prealloc
23 |     - revive
24 |     - staticcheck
25 |     - unconvert
26 |     - unparam
27 |     - unused
28 |   settings:
29 |     revive:
30 |       rules:
31 |         - name: comment-spacings
32 |   exclusions:
33 |     generated: lax
34 |     presets:
35 |       - comments
36 |       - common-false-positives
37 |       - legacy
38 |       - std-error-handling
39 |     paths:
40 |       - bin
41 |       - third_party$
42 |       - builtin$
43 |       - examples$
44 | formatters:
45 |   enable:
46 |     - gofmt
47 |     - goimports
48 |   exclusions:
49 |     generated: lax
50 |     paths:
51 |       - bin
52 |       - third_party$
53 |       - builtin$
54 |       - examples$
55 | 


--------------------------------------------------------------------------------
/pkg/epp/config/config.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package config
18 | 
19 | import (
20 | 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector"
21 | 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
22 | )
23 | 
24 | // Config is the configuration loaded from the text based configuration
25 | type Config struct {
26 | 	SchedulerConfig          *scheduling.SchedulerConfig
27 | 	SaturationDetectorConfig *saturationdetector.Config
28 | }
29 | 
30 | type FeatureConfig map[string]bool
31 | 


--------------------------------------------------------------------------------
/PROJECT:
--------------------------------------------------------------------------------
 1 | # Code generated by tool. DO NOT EDIT.
 2 | # This file is used to track the info used to scaffold your project
 3 | # and allow the plugins properly work.
 4 | # More info: https://book.kubebuilder.io/reference/project-config.html
 5 | domain: x-k8s.io
 6 | layout:
 7 | - go.kubebuilder.io/v4
 8 | projectName: gateway-api-inference-extension
 9 | repo: sigs.k8s.io/gateway-api-inference-extension
10 | resources:
11 | - api:
12 |     crdVersion: v1
13 |     namespaced: true
14 |   domain: x-k8s.io
15 |   group: inference
16 |   kind: InferencePool
17 |   path: sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1
18 |   version: v1alpha1
19 | - api:
20 |     crdVersion: v1
21 |     namespaced: true
22 |   domain: x-k8s.io
23 |   group: inference
24 |   kind: InferenceObjective
25 |   path: sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1
26 |   version: v1alpha1
27 | - api:
28 |     crdVersion: v1
29 |     namespaced: true
30 |   domain: x-k8s.io
31 |   group: inference
32 |   kind: EndpointPickerConfig
33 |   path: sigs.k8s.io/gateway-api-inference-extension/api/config/v1alpha1
34 |   version: v1alpha1
35 | version: "3"
36 | 


--------------------------------------------------------------------------------
/client-go/listers/apix/v1alpha1/expansion_generated.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by lister-gen. DO NOT EDIT.
18 | 
19 | package v1alpha1
20 | 
21 | // InferencePoolImportListerExpansion allows custom methods to be added to
22 | // InferencePoolImportLister.
23 | type InferencePoolImportListerExpansion interface{}
24 | 
25 | // InferencePoolImportNamespaceListerExpansion allows custom methods to be added to
26 | // InferencePoolImportNamespaceLister.
27 | type InferencePoolImportNamespaceListerExpansion interface{}
28 | 


--------------------------------------------------------------------------------
/cmd/epp/main.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package main
18 | 
19 | import (
20 | 	"os"
21 | 
22 | 	ctrl "sigs.k8s.io/controller-runtime"
23 | 
24 | 	"sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner"
25 | )
26 | 
27 | func main() {
28 | 	// For adding out-of-tree plugins to the plugins registry, use the following:
29 | 	// plugins.Register(my-out-of-tree-plugin-name, my-out-of-tree-plugin-factory-function)
30 | 
31 | 	if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil {
32 | 		os.Exit(1)
33 | 	}
34 | }
35 | 


--------------------------------------------------------------------------------
/pkg/epp/plugins/shared_state.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package plugins
18 | 
19 | import (
20 | 	"errors"
21 | )
22 | 
// ErrNotFound is the not found error message.
var ErrNotFound = errors.New("not found")

type (
	// StateKey is the type of keys stored in PluginState.
	StateKey string

	// StateData is a generic type for arbitrary data stored in PluginState.
	StateData interface {
		// Clone returns a copy of the StateData.
		Clone() StateData
	}
)
36 | 


--------------------------------------------------------------------------------
/pkg/epp/plugins/typedname.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package plugins
18 | 
// separator is placed between the two parts of a rendered TypedName.
const separator = "/"

// TypedName is a utility struct providing a type and a name to plugins.
type TypedName struct {
	// Type is the type of a plugin.
	Type string
	// Name is the name of a plugin instance.
	Name string
}

// String renders the name followed by the type, as "<name>/<type>".
func (tn TypedName) String() string {
	rendered := tn.Name + separator
	rendered += tn.Type
	return rendered
}
35 | 


--------------------------------------------------------------------------------
/config/charts/inferencepool/templates/_helpers.tpl:
--------------------------------------------------------------------------------
 1 | {{/*
 2 | Common labels
 3 | */}}
 4 | {{- define "gateway-api-inference-extension.labels" -}}
 5 | app.kubernetes.io/name: {{ include "gateway-api-inference-extension.name" . }}
 6 | {{- if .Chart.AppVersion }}
 7 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
 8 | {{- end }}
 9 | {{- end }}
10 | 
11 | {{/*
12 | Inference extension name
13 | */}}
14 | {{- define "gateway-api-inference-extension.name" -}}
15 | {{- $base := .Release.Name | default "default-pool" | lower | trim | trunc 40 -}}
16 | {{ $base }}-epp
17 | {{- end -}}
18 | 
{{/*
Cluster RBAC unique name.
Rendered as a quoted "<release>-<namespace>-epp" string. The name is truncated
to 84 characters BEFORE quoting so that truncation can never cut off the
closing quote and produce invalid YAML.
*/}}
{{- define "gateway-api-inference-extension.cluster-rbac-name" -}}
{{- $base := .Release.Name | default "default-pool" | lower | trim | trunc 40 }}
{{- $ns := .Release.Namespace | default "default" | lower | trim | trunc 40 }}
{{- printf "%s-%s-epp" $base $ns | trunc 84 | quote }}
{{- end -}}
27 | 
28 | {{/*
29 | Selector labels
30 | */}}
31 | {{- define "gateway-api-inference-extension.selectorLabels" -}}
32 | inferencepool: {{ include "gateway-api-inference-extension.name" . }}
33 | {{- end -}}
34 | 


--------------------------------------------------------------------------------
/pkg/epp/util/request/metadata.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package request
18 | 
19 | import (
20 | 	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
21 | )
22 | 
23 | func ExtractMetadataValues(req *extProcPb.ProcessingRequest) map[string]any {
24 | 	metadata := make(map[string]any)
25 | 	if req != nil && req.MetadataContext != nil && req.MetadataContext.FilterMetadata != nil {
26 | 		for key, val := range req.MetadataContext.FilterMetadata {
27 | 			metadata[key] = val.AsMap()
28 | 		}
29 | 	}
30 | 	return metadata
31 | }
32 | 


--------------------------------------------------------------------------------
/internal/runnable/leader_election.go:
--------------------------------------------------------------------------------
 1 | package runnable
 2 | 
 3 | import "sigs.k8s.io/controller-runtime/pkg/manager"
 4 | 
 5 | type leaderElection struct {
 6 | 	manager.Runnable
 7 | 	needsLeaderElection bool
 8 | }
 9 | 
10 | // LeaderElection wraps the given runnable to implement manager.LeaderElectionRunnable.
11 | func LeaderElection(runnable manager.Runnable, needsLeaderElection bool) manager.Runnable {
12 | 	return &leaderElection{
13 | 		Runnable:            runnable,
14 | 		needsLeaderElection: needsLeaderElection,
15 | 	}
16 | }
17 | 
18 | // RequireLeaderElection wraps the given runnable, marking it as requiring leader election.
19 | func RequireLeaderElection(runnable manager.Runnable) manager.Runnable {
20 | 	return LeaderElection(runnable, true)
21 | }
22 | 
23 | // RequireLeaderElection wraps the given runnable, marking it as not requiring leader election.
24 | func NoLeaderElection(runnable manager.Runnable) manager.Runnable {
25 | 	return LeaderElection(runnable, false)
26 | }
27 | 
28 | // NeedLeaderElection implements manager.NeedLeaderElection interface.
29 | func (r *leaderElection) NeedLeaderElection() bool {
30 | 	return r.needsLeaderElection
31 | }
32 | 


--------------------------------------------------------------------------------
/config/charts/inferencepool/templates/leader-election-rbac.yaml:
--------------------------------------------------------------------------------
 1 | {{- if gt (.Values.inferenceExtension.replicas | int) 1 }}
 2 | ---
 3 | kind: Role
 4 | apiVersion: rbac.authorization.k8s.io/v1
 5 | metadata:
 6 |   name: {{ include "gateway-api-inference-extension.name" . }}-leader-election
 7 |   namespace: {{ .Release.Namespace }}
 8 |   labels:
 9 |     {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
10 | rules:
11 | - apiGroups: [ "coordination.k8s.io" ]
12 |   resources: [ "leases" ]
13 |   verbs: [ "get", "list", "watch", "create", "update", "patch", "delete" ]
14 | - apiGroups: [ "" ]
15 |   resources: [ "events" ]
16 |   verbs: [ "create", "patch" ]
17 | ---
18 | kind: RoleBinding
19 | apiVersion: rbac.authorization.k8s.io/v1
20 | metadata:
21 |   name: {{ include "gateway-api-inference-extension.name" . }}-leader-election-binding
22 |   namespace: {{ .Release.Namespace }}
23 | subjects:
24 | - kind: ServiceAccount
25 |   name: {{ include "gateway-api-inference-extension.name" . }}
26 | roleRef:
27 |   apiGroup: rbac.authorization.k8s.io
28 |   kind: Role
29 |   name: {{ include "gateway-api-inference-extension.name" . }}-leader-election
30 | {{- end }}
31 | 


--------------------------------------------------------------------------------
/pkg/epp/metrics/testdata/prefix_indexer_hit_ratio_metric:
--------------------------------------------------------------------------------
 1 | # HELP inference_extension_prefix_indexer_hit_ratio [ALPHA] Ratio of prefix length matched to total prefix length in the cache lookup.
 2 | # TYPE inference_extension_prefix_indexer_hit_ratio histogram
 3 | inference_extension_prefix_indexer_hit_ratio_bucket{le="0"} 2
 4 | inference_extension_prefix_indexer_hit_ratio_bucket{le="0.1"} 2
 5 | inference_extension_prefix_indexer_hit_ratio_bucket{le="0.2"} 2
 6 | inference_extension_prefix_indexer_hit_ratio_bucket{le="0.3"} 2
 7 | inference_extension_prefix_indexer_hit_ratio_bucket{le="0.4"} 2
 8 | inference_extension_prefix_indexer_hit_ratio_bucket{le="0.5"} 4
 9 | inference_extension_prefix_indexer_hit_ratio_bucket{le="0.6"} 4
10 | inference_extension_prefix_indexer_hit_ratio_bucket{le="0.7"} 5
11 | inference_extension_prefix_indexer_hit_ratio_bucket{le="0.8"} 5
12 | inference_extension_prefix_indexer_hit_ratio_bucket{le="0.9"} 5
13 | inference_extension_prefix_indexer_hit_ratio_bucket{le="1"} 6
14 | inference_extension_prefix_indexer_hit_ratio_bucket{le="+Inf"} 6
15 | inference_extension_prefix_indexer_hit_ratio_sum 2.7
16 | inference_extension_prefix_indexer_hit_ratio_count 6
17 | 


--------------------------------------------------------------------------------
/hack/verify-boilerplate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2025 The Kubernetes Authors.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
set -o errexit
set -o nounset
set -o pipefail

# Resolve the repository root relative to this script. BASH_SOURCE is an
# array; index it explicitly instead of relying on the implicit first element.
SCRIPT_ROOT=$(dirname "${BASH_SOURCE[0]}")/..

boilerDir="${SCRIPT_ROOT}/hack/boilerplate"
boiler="${boilerDir}/boilerplate.py"

# The checker prints the files whose header is wrong. Quote the checker path so
# that a checkout directory containing whitespace does not get word-split.
files_need_boilerplate=($("${boiler}" "$@"))

# Run boilerplate check
if [[ ${#files_need_boilerplate[@]} -gt 0 ]]; then
  for file in "${files_need_boilerplate[@]}"; do
    echo "Boilerplate header is wrong for: ${file}"
  done

  exit 1
fi
36 | 


--------------------------------------------------------------------------------
/site-src/concepts/conformance.md:
--------------------------------------------------------------------------------
 1 | # Conformance
 2 | 
 3 | Similar to Gateway API, this project will rely on conformance tests to ensure
 4 | compatibility across implementations. This will be focused on three different
 5 | layers:
 6 | 
 7 | ## 1. Gateway API Implementations
 8 | 
 9 | Conformance tests will verify that:
10 | 
11 | * InferencePool is supported as a backend type
12 | * Implementations forward requests to the configured extension for an
13 |   InferencePool following the specification defined by this project
14 | * Implementations honor the routing guidance provided by the extension
15 | * Implementations behave appropriately when an extension is either not present
16 |   or fails to respond
17 | 
18 | ## 2. Inference Routing Extensions
19 | 
20 | Conformance tests will verify that:
21 | 
22 | * Extensions accept requests that match the protocol specified by this project
23 | * Extensions respond with routing guidance that matches the protocol specified
24 |   by this project
25 | 
26 | ## 3. Model Server Frameworks
27 | 
28 | Conformance tests will verify that:
29 | 
30 | * Frameworks serve the expected set of metrics using a format and path specified
31 |   by this project
32 | 


--------------------------------------------------------------------------------
/config/manifests/vllm/sim-deployment.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment
 3 | metadata:
 4 |   name: vllm-llama3-8b-instruct
 5 | spec:
 6 |   replicas: 3
 7 |   selector:
 8 |     matchLabels:
 9 |       app: vllm-llama3-8b-instruct
10 |   template:
11 |     metadata:
12 |       labels:
13 |         app: vllm-llama3-8b-instruct
14 |     spec:
15 |       containers:
16 |       - name: vllm-sim
17 |         image: ghcr.io/llm-d/llm-d-inference-sim:v0.5.0
18 |         imagePullPolicy: Always
19 |         args:
20 |         - --model
21 |         - meta-llama/Llama-3.1-8B-Instruct
22 |         - --port
23 |         - "8000"
24 |         - --max-loras
25 |         - "2"
26 |         - --lora-modules
27 |         - '{"name": "food-review-1"}'
28 |         env:
29 |         - name: POD_NAME
30 |           valueFrom:
31 |             fieldRef:
32 |               fieldPath: metadata.name
33 |         - name: NAMESPACE
34 |           valueFrom:
35 |             fieldRef:
36 |               fieldPath: metadata.namespace
37 |         ports:
38 |         - containerPort: 8000
39 |           name: http
40 |           protocol: TCP
41 |         resources:
42 |           requests:
43 |             cpu: 10m
44 | 


--------------------------------------------------------------------------------
/pkg/epp/datalayer/mocks/ticker.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package mocks
18 | 
19 | import (
20 | 	"time"
21 | )
22 | 
23 | // Ticker is a mock time source: ticks are produced on demand by calling Tick.
24 | type Ticker struct {
25 | 	ch chan time.Time // channel on which Tick publishes timestamps
26 | }
27 | 
28 | func NewTicker() *Ticker {
29 | 	// The channel is buffered (capacity 10) so queued ticks can wait for a reader.
30 | 	ticks := make(chan time.Time, 10)
31 | 	return &Ticker{ch: ticks}
32 | }
33 | 
34 | func (t *Ticker) Channel() <-chan time.Time {
35 | 	return t.ch // handed out as receive-only via the return type
36 | }
37 | 
38 | func (t *Ticker) Tick() {
39 | 	select {
40 | 	case t.ch <- time.Now(): // publish the current time as one tick
41 | 	default: // buffer is full: drop this tick instead of blocking (a closed channel would panic, not land here)
42 | 	}
43 | }
44 | 
45 | func (t *Ticker) Stop() {} // Stop is a no-op in this mock.
46 | 


--------------------------------------------------------------------------------
/pkg/epp/metrics/testdata/scheduler_e2e_duration_seconds_metric:
--------------------------------------------------------------------------------
 1 | # HELP inference_extension_scheduler_e2e_duration_seconds [ALPHA] End-to-end scheduling latency distribution in seconds.
 2 | # TYPE inference_extension_scheduler_e2e_duration_seconds histogram
 3 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.0001"} 0
 4 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.0002"} 1
 5 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.0005"} 1
 6 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.001"} 2
 7 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.002"} 3
 8 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.005"} 4
 9 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.01"} 5
10 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.02"} 6
11 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.05"} 7
12 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.1"} 8
13 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="+Inf"} 9
14 | inference_extension_scheduler_e2e_duration_seconds_sum{} 0.2835
15 | inference_extension_scheduler_e2e_duration_seconds_count{} 9
16 | 


--------------------------------------------------------------------------------
/conformance/utils/assertions.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Package assertions contains custom assertion helper functions used within
18 | // the Gateway API Inference Extension conformance test suite.
19 | package assertions
20 | 
21 | // TODO: Implement custom assertion functions specific to Inference Extension testing.
22 | // Examples might include:
23 | // - Asserting specific fields or structures within an inference API response body.
24 | // - Asserting specific metrics reported by mock model servers or EPPs.
25 | // - Asserting specific conditions or status fields unique to InferencePool or InferenceObjective.
26 | 


--------------------------------------------------------------------------------
/pkg/epp/scheduling/framework/weighted_scorer.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package framework
18 | 
19 | // NewWeightedScorer builds a WeightedScorer that pairs scorer with weight and returns a pointer to it.
20 | func NewWeightedScorer(scorer Scorer, weight int) *WeightedScorer {
21 | 	ws := new(WeightedScorer)
22 | 	ws.Scorer = scorer
23 | 	ws.weight = weight
24 | 	return ws
25 | }
26 | 
27 | // WeightedScorer is a struct that couples a Scorer with the integer weight assigned to it.
28 | type WeightedScorer struct {
29 | 	Scorer     // embedded, so the Scorer's methods are promoted to WeightedScorer
30 | 	weight int // unexported; read through Weight
31 | }
32 | 
33 | // Weight returns the weight stored alongside the embedded scorer.
34 | func (s *WeightedScorer) Weight() int {
35 | 	return s.weight
36 | }
37 | 


--------------------------------------------------------------------------------
/config/charts/inferencepool/templates/epp-config.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: ConfigMap
 3 | metadata:
 4 |   name: {{ include "gateway-api-inference-extension.name" . }}
 5 |   namespace: {{ .Release.Namespace }}
 6 | data:
 7 |   default-plugins.yaml: |
 8 |     apiVersion: inference.networking.x-k8s.io/v1alpha1
 9 |     kind: EndpointPickerConfig
10 |     plugins:
11 |     - type: queue-scorer
12 |     - type: kv-cache-utilization-scorer
13 |     - type: prefix-cache-scorer
14 |     schedulingProfiles:
15 |     - name: default
16 |       plugins:
17 |       - pluginRef: queue-scorer
18 |         weight: 2
19 |       - pluginRef: kv-cache-utilization-scorer
20 |         weight: 2
21 |       - pluginRef: prefix-cache-scorer
22 |         weight: 3
23 |   {{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }}
24 |   {{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }}
25 |   {{- end }}
26 |   
27 | ---
28 | {{- if .Values.inferenceExtension.sidecar.enabled }}
29 | apiVersion: v1
30 | kind: ConfigMap
31 | metadata:
32 |   name: {{ .Values.inferenceExtension.sidecar.configMap.name }}
33 |   namespace: {{ .Release.Namespace }}
34 | data:
35 |   {{- .Values.inferenceExtension.sidecar.configMap.data | toYaml | nindent 2 }}
36 | {{- end }}
37 | 


--------------------------------------------------------------------------------
/config/charts/body-based-routing/templates/bbr.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment
 3 | metadata:
 4 |   name: {{ .Values.bbr.name }}
 5 |   namespace: {{ .Release.Namespace }}
 6 | spec:
 7 |   replicas: {{ .Values.bbr.replicas | default 1 }}
 8 |   selector:
 9 |     matchLabels:
10 |       app: {{ .Values.bbr.name }}
11 |   template:
12 |     metadata:
13 |       labels:
14 |         app: {{ .Values.bbr.name }}
15 |     spec:
16 |       containers:
17 |       - name: bbr
18 |         image: {{ .Values.bbr.image.hub }}/{{ .Values.bbr.image.name }}:{{ .Values.bbr.image.tag }}
19 |         imagePullPolicy: {{ .Values.bbr.image.pullPolicy | default "Always" }}
20 |         args:
21 |         - "--streaming"
22 |         - "--v"
23 |         - "3"
24 |         ports:
25 |         - containerPort: {{ .Values.bbr.port }}
26 |         # health check
27 |         - containerPort: {{ .Values.bbr.healthCheckPort }}
28 | ---
29 | apiVersion: v1
30 | kind: Service
31 | metadata:
32 |   name: {{ .Values.bbr.name }}
33 |   namespace: {{ .Release.Namespace }}
34 | spec:
35 |   selector:
36 |     app: {{ .Values.bbr.name }}
37 |   ports:
38 |   - protocol: TCP
39 |     port: {{ .Values.bbr.port }}
40 |     targetPort: {{ .Values.bbr.port }}
41 |     appProtocol: HTTP2
42 |   type: ClusterIP
43 | 


--------------------------------------------------------------------------------
/conformance/reports/v0.4.0/gateway/istio/README.md:
--------------------------------------------------------------------------------
 1 | # istio (gateway Profile Conformance) - v0.4.0
 2 | 
 3 | ## Test Results
 4 | 
 5 | This directory contains conformance test results for Gateway API Inference Extension v0.4.0 testing against istio implementations using the gateway profile.
 6 | 
 7 | | Extension Version Tested | Profile Tested | Implementation Version | Mode    | Report | Status |
 8 | |--------------------------|----------------|------------------------|---------|--------|--------|
 9 | | v1.3.0 | Gateway | 1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91 | default | [./1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml](././1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml) | PASS |
10 | ## Running the Tests
11 | 
12 | For instructions on how to reproduce these test results and run the conformance tests yourself, see the [istio Conformance Testing README](../../../../scripts/istio/README.md).
13 | 
14 | ## About This Version
15 | 
16 | - **Extension Version**: v0.4.0
17 | - **Profile**: gateway
18 | - **Implementation**: istio
19 | - **Test Mode**: Default
20 | 
21 | For detailed information about conformance testing, report generation, and requirements, see the [main conformance README](../../../../../README.md).
22 | 


--------------------------------------------------------------------------------
/conformance/reports/v0.5.0/gateway/istio/README.md:
--------------------------------------------------------------------------------
 1 | # istio (gateway Profile Conformance) - v0.5.0
 2 | 
 3 | ## Test Results
 4 | 
 5 | This directory contains conformance test results for Gateway API Inference Extension v0.5.0 testing against istio implementations using the gateway profile.
 6 | 
 7 | | Extension Version Tested | Profile Tested | Implementation Version | Mode    | Report | Status |
 8 | |--------------------------|----------------|------------------------|---------|--------|--------|
 9 | | v1.3.0 | Gateway | 1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91 | default | [./1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml](././1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml) | PASS |
10 | ## Running the Tests
11 | 
12 | For instructions on how to reproduce these test results and run the conformance tests yourself, see the [istio Conformance Testing README](../../../../scripts/istio/README.md).
13 | 
14 | ## About This Version
15 | 
16 | - **Extension Version**: v0.5.0
17 | - **Profile**: gateway
18 | - **Implementation**: istio
19 | - **Test Mode**: Default
20 | 
21 | For detailed information about conformance testing, report generation, and requirements, see the [main conformance README](../../../../../README.md).
22 | 


--------------------------------------------------------------------------------
/version/version.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package version
18 | 
19 | var (
20 | 	// CommitSHA is the git hash of the latest commit in the build.
21 | 	CommitSHA string
22 | 
23 | 	// BuildRef is the build ref taken from the _PULL_BASE_REF of the cloud build trigger.
24 | 	BuildRef string
25 | )
26 | 
27 | const (
28 | 	// BundleVersionAnnotation is the annotation key used in the Gateway API Inference Extension CRDs to specify
29 | 	// the installed Gateway API Inference Extension version.
30 | 	BundleVersionAnnotation = "inference.networking.k8s.io/bundle-version"
31 | 
32 | 	// BundleVersion is the value used for labeling the version of the gateway-api-inference-extension.
33 | 	BundleVersion = "main-dev"
34 | )
35 | 


--------------------------------------------------------------------------------
/config/charts/inferencepool/templates/epp-servicemonitor.yaml:
--------------------------------------------------------------------------------
 1 | {{- if and .Values.inferenceExtension.monitoring.prometheus.enabled (ne (lower .Values.provider.name) "gke") }}
 2 | apiVersion: monitoring.coreos.com/v1
 3 | kind: ServiceMonitor
 4 | metadata:
 5 |   name: {{ include "gateway-api-inference-extension.name" . }}-monitor
 6 |   namespace: {{ .Release.Namespace }}
 7 |   labels:
 8 |     {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
 9 |     {{- with .Values.inferenceExtension.monitoring.prometheus.extraLabels }}
10 |     {{- toYaml . | nindent 4 }}
11 |     {{- end }}
12 | spec:
13 |   endpoints:
14 |   - interval: {{ .Values.inferenceExtension.monitoring.interval }}
15 |     port: "http-metrics"
16 |     path: "/metrics"
17 |     {{- if .Values.inferenceExtension.monitoring.prometheus.auth.enabled }}
18 |     authorization:
19 |       credentials:
20 |         key: token
21 |         name: {{ .Values.inferenceExtension.monitoring.prometheus.auth.secretName }}
22 |     {{- end }}
23 |   jobLabel: {{ include "gateway-api-inference-extension.name" . }}
24 |   namespaceSelector:
25 |     matchNames:
26 |     - {{ .Release.Namespace }}
27 |   selector:
28 |     matchLabels:
29 |       {{- include "gateway-api-inference-extension.labels" . | nindent 6 }}
30 | {{- end }}
31 | 


--------------------------------------------------------------------------------
/conformance/tests/httproute_multiple_gateways_different_pools.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | apiVersion: gateway.networking.k8s.io/v1
 3 | kind: HTTPRoute
 4 | metadata:
 5 |   name: route-for-primary-gateway
 6 |   namespace: inference-conformance-app-backend
 7 | spec:
 8 |   parentRefs:
 9 |   - kind: Gateway
10 |     name: conformance-primary
11 |     namespace: inference-conformance-infra
12 |   hostnames:
13 |   - "primary.example.com"
14 |   rules:
15 |   - backendRefs:
16 |     - group: inference.networking.k8s.io
17 |       kind: InferencePool
18 |       name: primary-inference-pool
19 |     matches:
20 |     - path:
21 |         type: PathPrefix
22 |         value: /test-primary-gateway
23 | ---
24 | apiVersion: gateway.networking.k8s.io/v1
25 | kind: HTTPRoute
26 | metadata:
27 |   name: route-for-secondary-gateway
28 |   namespace: inference-conformance-app-backend
29 | spec:
30 |   parentRefs:
31 |   - kind: Gateway
32 |     name: conformance-secondary
33 |     namespace: inference-conformance-infra
34 |   hostnames:
35 |   - "secondary.example.com"
36 |   rules:
37 |   - backendRefs:
38 |     - group: inference.networking.k8s.io
39 |       kind: InferencePool
40 |       name: secondary-inference-pool
41 |     matches:
42 |     - path:
43 |         type: PathPrefix
44 |         value: /test-secondary-gateway
45 | 


--------------------------------------------------------------------------------
/pkg/epp/plugins/registry.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package plugins
18 | 
19 | import (
20 | 	"encoding/json"
21 | )
22 | 
23 | // FactoryFunc is the signature of the factory functions that are used to instantiate plugins
24 | // specified in a configuration.
25 | type FactoryFunc func(name string, parameters json.RawMessage, handle Handle) (Plugin, error)
26 | 
27 | // Register stores factory in Registry under pluginType, replacing any factory already registered for that type.
28 | func Register(pluginType string, factory FactoryFunc) {
29 | 	Registry[pluginType] = factory // plain map write: a later registration overwrites an earlier one
30 | }
31 | 
32 | // Registry maps a plugin type to the FactoryFunc registered for it.
33 | var Registry = map[string]FactoryFunc{}
34 | 


--------------------------------------------------------------------------------
/config/crd/kustomization.yaml:
--------------------------------------------------------------------------------
 1 | # This kustomization.yaml is not intended to be run by itself,
 2 | # since it depends on service name and namespace that are out of this kustomize package.
 3 | # It should be run by config/default
 4 | resources:
 5 |   - bases/inference.networking.x-k8s.io_inferencepools.yaml
 6 |   - bases/inference.networking.x-k8s.io_inferenceobjectives.yaml
 7 |   - bases/inference.networking.x-k8s.io_inferencepoolimports.yaml
 8 |   - bases/inference.networking.k8s.io_inferencepools.yaml
 9 | # +kubebuilder:scaffold:crdkustomizeresource
10 | 
11 | patches:
12 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix.
13 | # patches here are for enabling the conversion webhook for each CRD
14 | # +kubebuilder:scaffold:crdkustomizewebhookpatch
15 | 
16 | # [CERTMANAGER] To enable cert-manager, uncomment all the sections with [CERTMANAGER] prefix.
17 | # patches here are for enabling the CA injection for each CRD
18 | #- path: patches/cainjection_in_inferencepools.yaml
19 | #- path: patches/cainjection_in_inferenceobjectives.yaml
20 | # +kubebuilder:scaffold:crdkustomizecainjectionpatch
21 | 
22 | # [WEBHOOK] To enable webhook, uncomment the following section
23 | # the following config is for teaching kustomize how to do kustomization for CRDs.
24 | 
25 | #configurations:
26 | #- kustomizeconfig.yaml
27 | 


--------------------------------------------------------------------------------
/pkg/epp/saturationdetector/config.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 | 	http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | package saturationdetector
17 | 
18 | import (
19 | 	"time"
20 | )
21 | 
22 | // Default configuration values
23 | const (
24 | 	// DefaultQueueDepthThreshold is the default backend waiting queue size threshold.
25 | 	DefaultQueueDepthThreshold = 5
26 | 	// DefaultKVCacheUtilThreshold is the default KV cache utilization (0.0 to 1.0) threshold.
27 | 	DefaultKVCacheUtilThreshold = 0.8
28 | 	// DefaultMetricsStalenessThreshold defines how old metrics can be before they
29 | 	// are considered stale.
30 | 	// Given the pod metrics refresh interval is 50ms, a threshold of a few refresh
31 | 	// intervals (200ms, i.e. four of them) should be fine.
32 | 	DefaultMetricsStalenessThreshold = 200 * time.Millisecond
33 | )
34 | 


--------------------------------------------------------------------------------
/config/manifests/bbr-example/httproute_bbr.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | apiVersion: gateway.networking.k8s.io/v1
 3 | kind: HTTPRoute
 4 | metadata:
 5 |   name: llm-llama-route
 6 | spec:
 7 |   parentRefs:
 8 |   - group: gateway.networking.k8s.io
 9 |     kind: Gateway
10 |     name: inference-gateway
11 |   rules:
12 |   - backendRefs:
13 |     - group: inference.networking.k8s.io
14 |       kind: InferencePool
15 |       name: vllm-llama3-8b-instruct
16 |     matches:
17 |     - path:
18 |         type: PathPrefix
19 |         value: /
20 |       headers:
21 |         - type: Exact
22 |           name: X-Gateway-Model-Name
23 |           value: 'meta-llama/Llama-3.1-8B-Instruct'
24 |     timeouts:
25 |       request: 300s
26 | ---
27 | apiVersion: gateway.networking.k8s.io/v1
28 | kind: HTTPRoute
29 | metadata:
30 |   name: llm-phi4-route
31 | spec:
32 |   parentRefs:
33 |   - group: gateway.networking.k8s.io
34 |     kind: Gateway
35 |     name: inference-gateway
36 |   rules:
37 |   - backendRefs:
38 |     - group: inference.networking.k8s.io
39 |       kind: InferencePool
40 |       name: vllm-phi4-mini-instruct
41 |     matches:
42 |     - path:
43 |         type: PathPrefix
44 |         value: /
45 |       headers:
46 |         - type: Exact
47 |           name: X-Gateway-Model-Name
48 |           value: 'microsoft/Phi-4-mini-instruct'
49 |     timeouts:
50 |       request: 300s
51 | ---
52 | 


--------------------------------------------------------------------------------
/pkg/epp/flowcontrol/contracts/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Package contracts defines the boundaries and service interfaces for the Flow Control system.
18 | //
19 | // Adhering to a "Ports and Adapters" (Hexagonal) architectural style, these interfaces decouple the core
20 | // `controller.FlowController` engine from its dependencies. They establish the required behaviors and system invariants
21 | // that concrete implementations must uphold.
22 | //
23 | // The primary contracts are:
24 | //
25 | //   - `FlowRegistry`: The interface for the stateful control plane that manages the lifecycle of flows, queues, and
26 | //     policies.
27 | //
28 | //   - `SaturationDetector`: The interface for a component that provides real-time load signals.
29 | package contracts
30 | 


--------------------------------------------------------------------------------
/pkg/epp/metrics/testdata/prefix_indexer_hit_bytes_metric:
--------------------------------------------------------------------------------
 1 | # HELP inference_extension_prefix_indexer_hit_bytes [ALPHA] Length of the prefix match in number of bytes in the cache lookup.
 2 | # TYPE inference_extension_prefix_indexer_hit_bytes histogram
 3 | inference_extension_prefix_indexer_hit_bytes_bucket{le="0"} 2
 4 | inference_extension_prefix_indexer_hit_bytes_bucket{le="16"} 5
 5 | inference_extension_prefix_indexer_hit_bytes_bucket{le="32"} 5
 6 | inference_extension_prefix_indexer_hit_bytes_bucket{le="64"} 6
 7 | inference_extension_prefix_indexer_hit_bytes_bucket{le="128"} 6
 8 | inference_extension_prefix_indexer_hit_bytes_bucket{le="256"} 6
 9 | inference_extension_prefix_indexer_hit_bytes_bucket{le="512"} 6
10 | inference_extension_prefix_indexer_hit_bytes_bucket{le="1024"} 6
11 | inference_extension_prefix_indexer_hit_bytes_bucket{le="2048"} 6
12 | inference_extension_prefix_indexer_hit_bytes_bucket{le="4096"} 6
13 | inference_extension_prefix_indexer_hit_bytes_bucket{le="8192"} 6
14 | inference_extension_prefix_indexer_hit_bytes_bucket{le="16384"} 6
15 | inference_extension_prefix_indexer_hit_bytes_bucket{le="32768"} 6
16 | inference_extension_prefix_indexer_hit_bytes_bucket{le="65536"} 6
17 | inference_extension_prefix_indexer_hit_bytes_bucket{le="+Inf"} 6
18 | inference_extension_prefix_indexer_hit_bytes_sum 86
19 | inference_extension_prefix_indexer_hit_bytes_count 6
20 | 


--------------------------------------------------------------------------------
/pkg/epp/requestcontrol/types.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package requestcontrol
18 | 
19 | // Response carries information from a received response that is passed to the Response requestcontrol plugins.
20 | type Response struct {
21 | 	// RequestId is the Envoy generated Id for the request being processed
22 | 	RequestId string
23 | 	// Headers is a map of the response headers. Nil during body processing
24 | 	Headers map[string]string
25 | 	// Body is the body of the response; it is empty during header processing (a string cannot be nil)
26 | 	Body string
27 | 	// IsStreaming indicates whether or not the response is being streamed by the model
28 | 	IsStreaming bool
29 | 	// EndOfStream when true indicates that this invocation contains the last chunk of the response
30 | 	EndOfStream bool
31 | }
32 | 


--------------------------------------------------------------------------------
/pkg/epp/server/runserver_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package server_test
18 | 
19 | import (
20 | 	"testing"
21 | 
22 | 	"sigs.k8s.io/controller-runtime/pkg/manager"
23 | 
24 | 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server"
25 | 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
26 | )
27 | 
28 | func TestRunnable(t *testing.T) {
29 | 	// The runnable returned by AsRunnable() must opt out of leader election.
30 | 	runnable := server.NewDefaultExtProcServerRunner().AsRunnable(logutil.NewTestLogger())
31 | 	// Fatal aborts the test when the interface is missing; otherwise inspect the flag.
32 | 	switch elector, ok := runnable.(manager.LeaderElectionRunnable); {
33 | 	case !ok:
34 | 		t.Fatal("runner is not LeaderElectionRunnable")
35 | 	case elector.NeedLeaderElection():
36 | 		t.Error("runner returned NeedLeaderElection = true, expected false")
37 | 	}
38 | }
39 | 


--------------------------------------------------------------------------------
/pkg/epp/util/logging/logger.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package logging
18 | 
19 | import (
20 | 	"context"
21 | 
22 | 	"github.com/go-logr/logr"
23 | 	uberzap "go.uber.org/zap"
24 | 	"sigs.k8s.io/controller-runtime/pkg/log"
25 | 	"sigs.k8s.io/controller-runtime/pkg/log/zap"
26 | )
27 | 
28 | // NewTestLogger creates a new Zap logger using the dev mode, with the caller option (uberzap.AddCaller) enabled.
29 | func NewTestLogger() logr.Logger {
30 | 	return zap.New(zap.UseDevMode(true), zap.RawZapOpts(uberzap.AddCaller()))
31 | }
32 | 
33 | // NewTestLoggerIntoContext returns a copy of ctx that carries the dev-mode Zap logger built by NewTestLogger.
34 | func NewTestLoggerIntoContext(ctx context.Context) context.Context {
35 | 	return log.IntoContext(ctx, NewTestLogger()) // reuse the helper so both entry points stay configured identically
36 | }
37 | 


--------------------------------------------------------------------------------
/pkg/epp/util/request/headers.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package request
18 | 
19 | import (
20 | 	"strings"
21 | 
22 | 	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
23 | )
24 | 
25 | const (
26 | 	RequestIdHeaderKey = "x-request-id" // RequestIdHeaderKey is the header name for the request ID.
27 | )
28 | 
29 | func ExtractHeaderValue(req *extProcPb.ProcessingRequest_RequestHeaders, headerKey string) string {
30 | 	// Returns the raw value of the first header matching headerKey case-insensitively, or "" if none (or req is empty).
31 | 	if req == nil || req.RequestHeaders == nil || req.RequestHeaders.Headers == nil {
32 | 		return ""
33 | 	}
34 | 	for _, headerKv := range req.RequestHeaders.Headers.Headers {
35 | 		if strings.EqualFold(headerKv.Key, headerKey) { // no lowered copies allocated per header
36 | 			return string(headerKv.RawValue)
37 | 		}
38 | 	}
39 | 	return ""
40 | }
41 | 


--------------------------------------------------------------------------------
/tools/dashboards/README.md:
--------------------------------------------------------------------------------
 1 | # Documentation
 2 | 
 3 | This documentation provides instructions for setting up grafana dashboards to see metrics emitted from the inference extension and model servers.
 4 | 
 5 | ## Requirements
 6 | 
 7 | Please follow the [metrics](https://gateway-api-inference-extension.sigs.k8s.io/guides/metrics-and-observability/) page to configure the proxy to enable all metrics.
 8 | 
 9 | ## Load Inference Extension dashboard into Grafana
10 | 
11 | Please follow [grafana instructions](https://grafana.com/docs/grafana/latest/dashboards/build-dashboards/import-dashboards/) to load the dashboard json.
12 | 
13 | ## Configure Google Managed Prometheus as source for metrics
14 | 
15 | If you run the inference gateway with [Google Managed Prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus), please follow the [instructions](https://cloud.google.com/stackdriver/docs/managed-prometheus/query) to configure Google Managed Prometheus as data source for the grafana dashboard.
16 | 
17 | ## Troubleshooting
18 | 
19 | ### No data in graph
20 | 
21 | Please set the `scrape_interval` in your Prometheus configuration to a value lower than `15s`; the `rate` function returns no data if the samples are spaced too far apart. See https://www.robustperception.io/what-range-should-i-use-with-rate/ for more details.
22 | 
23 | Example:
24 | 
25 | ```
26 |     global:
27 |       scrape_interval: 5s
28 | ```
29 | 


--------------------------------------------------------------------------------
/config/charts/body-based-routing/templates/gke.yaml:
--------------------------------------------------------------------------------
 1 | {{- if eq .Values.provider.name "gke" }}
 2 | ---
 3 | kind: GCPRoutingExtension
 4 | apiVersion: networking.gke.io/v1
 5 | metadata:
 6 |   name: {{ .Values.bbr.name }}
 7 |   namespace: {{ .Release.Namespace }}
 8 | spec:
 9 |   targetRefs:
10 |   - group: "gateway.networking.k8s.io"
11 |     kind: Gateway
12 |     name: {{ .Values.inferenceGateway.name }}
13 |   extensionChains:
14 |   - name: chain1
15 |     extensions:
16 |     - name: ext1
17 |       authority: "myext.com"
18 |       timeout: 1s
19 |       supportedEvents:
20 |       - RequestHeaders
21 |       - RequestBody
22 |       - RequestTrailers
23 |       requestBodySendMode: "FullDuplexStreamed"
24 |       backendRef:
25 |         group: ""
26 |         kind: Service
27 |         name: {{ .Values.bbr.name }}
28 |         port: {{ .Values.bbr.port }}
29 | ---
30 | apiVersion: networking.gke.io/v1
31 | kind: HealthCheckPolicy
32 | metadata:
33 |   name: bbr-healthcheck
34 |   namespace: {{ .Release.Namespace }}
35 | spec:
36 |   default:
37 |     logConfig:
38 |       enabled: true
39 |     config:
40 |       type: "GRPC"
41 |       grpcHealthCheck:
42 |         portSpecification: "USE_FIXED_PORT"
43 |         port: {{ .Values.bbr.healthCheckPort }}
44 |   targetRef:
45 |     group: ""
46 |     kind: Service
47 |     name: {{ .Values.bbr.name }}
48 |     namespace: {{ .Release.Namespace }}
49 | {{- end }}
50 | 


--------------------------------------------------------------------------------
/client-go/clientset/versioned/typed/api/v1/fake/fake_api_client.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by client-gen. DO NOT EDIT.
18 | 
19 | package fake
20 | 
21 | import (
22 | 	rest "k8s.io/client-go/rest"
23 | 	testing "k8s.io/client-go/testing"
24 | 	v1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1"
25 | )
26 | 
// FakeInferenceV1 is a fake client that embeds *testing.Fake (from
// k8s.io/client-go/testing) and hands out fake typed clients such as the one
// returned by InferencePools.
type FakeInferenceV1 struct {
	*testing.Fake
}
30 | 
// InferencePools returns a v1.InferencePoolInterface for the given namespace,
// built by newFakeInferencePools on top of this fake client.
func (c *FakeInferenceV1) InferencePools(namespace string) v1.InferencePoolInterface {
	return newFakeInferencePools(c, namespace)
}
34 | 
// RESTClient returns a RESTClient that is used to communicate
// with API server by this client implementation.
//
// The fake never constructs a real client: the returned rest.Interface wraps
// a nil *rest.RESTClient.
func (c *FakeInferenceV1) RESTClient() rest.Interface {
	// ret is never assigned, so it stays a nil *rest.RESTClient.
	var ret *rest.RESTClient
	return ret
}
41 | 


--------------------------------------------------------------------------------
/site-src/_includes/model-server.md:
--------------------------------------------------------------------------------
 1 |    Three options are supported for running the model server:
 2 | 
 3 |    1. GPU-based model server.
 4 |       Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
 5 | 
 6 |    1. CPU-based model server (not using GPUs).
 7 |       The sample uses the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
 8 | 
 9 |    1. [vLLM Simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) model server (not using GPUs).
10 |       The sample is configured to simulate the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model.
11 | 
12 |    Choose one of these options and follow the steps below. Please do not deploy more than one, as the deployments have the same name and will override each other.
13 | 
14 | === "GPU-Based Model Server"
15 | 
16 |     For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/gpu-deployment.yaml` as needed.
17 |     Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). Ensure that the token grants access to this model.
18 | 
19 |     Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
20 | 


--------------------------------------------------------------------------------
/site-src/guides/epp-configuration/flags.md:
--------------------------------------------------------------------------------
 1 | # EPP Configuration Flags
 2 | 
 3 | This page documents selected configuration flags for the Endpoint Picker (EPP) binary. Most flags are self-explanatory via their `--help` descriptions; only flags with nuanced or non-obvious behavior are detailed here.
 4 | 
 5 | ## --pool-namespace
 6 | 
 7 | **Description:**
 8 | Specifies the namespace of the InferencePool this Endpoint Picker is associated with.
 9 | 
10 | **Resolution order:**
11 | 1. If `--pool-namespace` is set to a non-empty value, its value is used.
12 | 2. If the flag is not set (i.e., left empty), the `NAMESPACE` environment variable is checked. If set, its value is used.
13 | 3. If neither is set, the namespace defaults to `default`.
14 | 
15 | This allows the EPP to automatically use the namespace it is running in (when the `NAMESPACE` env var is set via Kubernetes Downward API), without requiring explicit configuration. If you want to force the use of the default namespace, explicitly set `--pool-namespace=default`. If you want to use the environment variable or fallback, leave the flag unset or set it to an empty string.
16 | 
17 | **Example manifest snippet to set the env var from pod metadata:**
18 | 
19 | ```yaml
20 | env:
21 |   - name: NAMESPACE
22 |     valueFrom:
23 |       fieldRef:
24 |         fieldPath: metadata.namespace
25 | ```
26 | 
27 | ---
28 | 
29 | For a full list of flags, run:
30 | 
31 | ```
32 | EPP_BINARY --help
33 | ```
34 | 


--------------------------------------------------------------------------------
/site-src/_includes/model-server-cpu.md:
--------------------------------------------------------------------------------
 1 | === "CPU-Based Model Server"
 2 | 
 3 |     ???+ warning
 4 | 
 5 |         CPU deployment can be unreliable, i.e. the pods may crash/restart because of resource constraints.
 6 | 
 7 |     This setup uses the official `vllm-cpu` image, which according to the documentation can run vLLM on x86 CPU platform.
 8 |     For this setup, we use approximately 9.5GB of memory and 12 CPUs for each replica.
 9 | 
10 |     While it is possible to deploy the model server with less resources, this is not recommended. For example, in our tests, loading the model using 8GB of memory and 1 CPU was possible but took almost 3.5 minutes and inference requests took unreasonable time. In general, there is a tradeoff between the memory and CPU we allocate to our pods and the performance. The more memory and CPU we allocate the better performance we can get.
11 | 
12 |     After running multiple configurations of these values we decided in this sample to use 9.5GB of memory and 12 CPUs for each replica, which gives reasonable response times. You can increase those numbers and potentially may even get better response times. For modifying the allocated resources, adjust the numbers in [cpu-deployment.yaml](https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml) as needed.
13 | 
14 |     Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
15 | 


--------------------------------------------------------------------------------
/site-src/api-types/inferencepoolimport.md:
--------------------------------------------------------------------------------
 1 | # Inference Pool Import
 2 | 
 3 | ??? example "Alpha since v1.1.0"
 4 | 
 5 |     The `InferencePoolImport` resource is alpha and may have breaking changes in
 6 |     future releases of the API.
 7 | 
 8 | ## Background
 9 | 
10 | The **InferencePoolImport** API is a cluster-local, controller-managed resource that represents an imported InferencePool.
11 | It primarily communicates a relationship between an exported InferencePool and the exporting cluster name. It is not
12 | user-authored; status carries the effective import. Inference Platform Owners can reference the InferencePoolImport,
13 | even if the local cluster does not have an InferencePool. In the context of Gateway API, it means that an HTTPRoute can
14 | be configured to reference an InferencePoolImport to route matching requests to endpoints of backing InferencePools.
15 | 
16 | Key ideas:
17 | 
18 | - Map an exported InferencePool to exporting controller and cluster.
19 | - Name/namespace sameness with the exported InferencePool (avoids extra indirection).
20 | - Conditions: Surface a controller-level status condition to indicate whether the InferencePoolImport is ready for use.
21 | - Conditions: Surface parent-level status conditions to indicate whether the InferencePoolImport is referenced by a parent,
22 |   e.g. Gateway.
23 | 
24 | ## Spec
25 | 
26 | The full spec of the InferencePoolImport is defined [here](/reference/x-v1a1-spec/#inferencepoolimport).
27 | 


--------------------------------------------------------------------------------
/client-go/clientset/versioned/typed/apix/v1alpha1/fake/fake_apix_client.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by client-gen. DO NOT EDIT.
18 | 
19 | package fake
20 | 
21 | import (
22 | 	rest "k8s.io/client-go/rest"
23 | 	testing "k8s.io/client-go/testing"
24 | 	v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/apix/v1alpha1"
25 | )
26 | 
// FakeInferenceV1alpha1 is a fake client that embeds *testing.Fake (from
// k8s.io/client-go/testing) and hands out fake typed clients such as the one
// returned by InferencePoolImports.
type FakeInferenceV1alpha1 struct {
	*testing.Fake
}
30 | 
// InferencePoolImports returns a v1alpha1.InferencePoolImportInterface for the
// given namespace, built by newFakeInferencePoolImports on top of this fake
// client.
func (c *FakeInferenceV1alpha1) InferencePoolImports(namespace string) v1alpha1.InferencePoolImportInterface {
	return newFakeInferencePoolImports(c, namespace)
}
34 | 
// RESTClient returns a RESTClient that is used to communicate
// with API server by this client implementation.
//
// The fake never constructs a real client: the returned rest.Interface wraps
// a nil *rest.RESTClient.
func (c *FakeInferenceV1alpha1) RESTClient() rest.Interface {
	// ret is never assigned, so it stays a nil *rest.RESTClient.
	var ret *rest.RESTClient
	return ret
}
41 | 


--------------------------------------------------------------------------------
/tools/simulations/llm_ig_simulation/src/constants.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
# Constants named for the prefill-latency model.
# NOTE(review): the _2/_1/_0 suffixes look like polynomial-term orders and _MIN
# like a lower bound; units are not stated here - confirm against the code
# that consumes these values.
PREFILL_LATENCY_CONST_2 = 0
PREFILL_LATENCY_CONST_1 = 0.00006769375513
PREFILL_LATENCY_CONST_0 = 0.01969
PREFILL_LATENCY_CONST_MIN = 0.04

# Constants named for the decode-latency model, plus a tokenization latency
# constant (currently 0).
# NOTE(review): presumably _BATCH scales with batch size and _1 with token
# count - confirm against the consumer.
DECODE_LATENCY_CONST_BATCH = 0.0001026494433
DECODE_LATENCY_CONST_1 = 0.0000005353485087
DECODE_LATENCY_CONST_0 = 0.014
TOKENIZE_LATENCY_CONST = 0

MAX_NUM_BATCH_TOKENS = 512 # in prefill

TOTAL_NUM_GPU_BLOCKS = 2810
NUMBER_OF_TOKENS_PER_BLOCK = 16
# Total block capacity in tokens (blocks * tokens per block) minus the prefill
# batch-token limit above.
MAX_NUM_TOKENS_ALLOWED = TOTAL_NUM_GPU_BLOCKS * NUMBER_OF_TOKENS_PER_BLOCK - MAX_NUM_BATCH_TOKENS # in kv cache
# NOTE(review): these two look like memory-usage fractions that trigger
# recompute, with a lower value for non-critical requests - confirm.
MAX_GPU_MEMORY_PERC_BEFORE_RECOMPUTE = 0.9
MAX_GPU_MEMORY_PERC_BEFORE_RECOMPUTE_NON_CRITICAL = 0.8
MAX_NUM_SEQ = 256

# size of each lora in units of KV Cache
LORA_DICT = {"tweet": 1600, "sql": 1600, "dummy-1": 0, "dummy-2": 0}
36 | 


--------------------------------------------------------------------------------
/conformance/reports/v1.0.2/gateway/nginx-nginx-gateway-fabric/README.md:
--------------------------------------------------------------------------------
 1 | # Nginx NGINX Gateway Fabric
 2 | 
 3 | ## Table of Contents
 4 | 
 5 | | Extension Version Tested | Profile Tested | Implementation Version | Mode    | Report                                                                     |
 6 | |--------------------------|----------------|------------------------|---------|----------------------------------------------------------------------------|
 7 | | v1.0.2                   | Gateway        | v2.2.0                 | default | [v2.2.0 report](./inference-v2.2.0-report.yaml)                            |
 8 | 
 9 | ## Reproduce
10 | 
11 | To reproduce results, clone the NGF repository:
12 | 
13 | ```shell
14 | git clone https://github.com/nginx/nginx-gateway-fabric.git && cd nginx-gateway-fabric/tests
15 | ```
16 | 
17 | Follow the steps in the [NGINX Gateway Fabric Testing](https://github.com/nginx/nginx-gateway-fabric/blob/main/tests/README.md#conformance-testing) document to run the conformance tests. If you are running tests on the `edge` version, then you don't need to build any images. Otherwise, you'll need to check out the specific release tag that you want to test, and then build and load the images onto your cluster, per the steps in the README.
18 | 
19 | Note: Enable this flag to install all CRDs and required resources:
20 | 
21 | ```shell
22 | export ENABLE_INFERENCE_EXTENSION=true
23 | ```
24 | 
25 | After running, see the conformance report:
26 | 
27 | ```shell
28 | cat conformance-profile-inference.yaml
29 | ```
30 | 


--------------------------------------------------------------------------------
/pkg/epp/scheduling/framework/plugins/picker/common.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package picker
18 | 
19 | import (
20 | 	"math/rand/v2"
21 | 	"time"
22 | 
23 | 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
24 | )
25 | 
const (
	// DefaultMaxNumOfEndpoints is the default shared by every picker in this
	// package.
	DefaultMaxNumOfEndpoints = 1 // common default to all pickers
)
29 | 
// pickerParameters defines the common parameters for all pickers
type pickerParameters struct {
	// MaxNumOfEndpoints is read from the "maxNumOfEndpoints" JSON key.
	// NOTE(review): presumably the maximum number of endpoints a picker may
	// return, with DefaultMaxNumOfEndpoints as its fallback - confirm at the
	// call sites.
	MaxNumOfEndpoints int `json:"maxNumOfEndpoints"`
}
34 | 
35 | func shuffleScoredPods(scoredPods []*types.ScoredPod) {
36 | 	// Rand package is not safe for concurrent use, so we create a new instance.
37 | 	// Source: https://pkg.go.dev/math/rand/v2#pkg-overview
38 | 	randomGenerator := rand.New(rand.NewPCG(uint64(time.Now().UnixNano()), 0))
39 | 
40 | 	// Shuffle in-place
41 | 	randomGenerator.Shuffle(len(scoredPods), func(i, j int) {
42 | 		scoredPods[i], scoredPods[j] = scoredPods[j], scoredPods[i]
43 | 	})
44 | }
45 | 


--------------------------------------------------------------------------------
/conformance/tests/main.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Package tests is the root package for all Gateway API Inference Extension
18 | // conformance test implementations.
19 | package tests
20 | 
21 | import (
22 | 	// Importing the suite package to access the ConformanceTest struct definition.
23 | 	// For initial version directly importing from the core gateway-api repo.
24 | 	// This may be adjusted in the future if we have need to create a copy of
25 | 	// the suite utilities.
26 | 	"sigs.k8s.io/gateway-api/conformance/utils/suite"
27 | 	// Do NOT add blank imports for specific test packages here.
28 | 	// They should be added to the main conformance package instead
29 | 	// to avoid import cycles.
30 | )
31 | 
// ConformanceTests holds all the conformance test definitions for the
// Gateway API Inference Extension suite. It is declared here without any
// entries; tests are registered from other packages using init() functions
// like the one in the basic package.
var ConformanceTests []suite.ConformanceTest
36 | 


--------------------------------------------------------------------------------
/site-src/contributing/devguide.md:
--------------------------------------------------------------------------------
 1 | # Developer Guide
 2 | 
 3 | ## Integration Tests Debug Guide
 4 | This document provides detailed instructions on how to run and debug integration tests locally in debug mode.
 5 | 
 6 | ### Prerequisites
 7 | 
 8 | #### 1. Install Required Tools
 9 | Ensure the envtest tool is installed:
10 | ```bash
11 | $ make envtest
12 | ```
13 | 
14 | #### 2. Verify Kubernetes Test Environment
15 | Run the following command to set up and verify the test environment:
16 | ```bash
17 | $ ./bin/setup-envtest use 1.31.0 --bin-dir ./bin -p path
18 | bin/k8s/1.31.0-darwin-arm64
19 | ```
20 | ### Run test in shell
21 | ```shell
22 | export KUBEBUILDER_ASSETS=<project-root-path>/bin/k8s/1.31.0-<platform-architecture> 
23 | go test sigs.k8s.io/gateway-api-inference-extension/test/integration/epp -run <test-function-name>
24 | ```
25 | 
26 | ### Configure and Run in GoLand
27 | 
28 | #### 1. Create Test Configuration
29 | Select the test case you want to debug:
30 | ![](../images/modify-run-configuration.png)
31 | 
32 | #### 2. Configure Environment Variables
33 | Set environment variables in the Run/Debug Configuration:
34 | 
35 | ![](../images/edit-environment-variables.png)
36 | 
37 | **Required environment variable:**
38 | 
39 | - **Name:** `KUBEBUILDER_ASSETS`
40 | - **Value:** `<project-root-path>/bin/k8s/1.31.0-<platform-architecture>`
41 | 
42 | **Example path:**
43 | ```
44 | /go/src/kubernetes.io/gateway-api-inference-extension/bin/k8s/1.31.0-darwin-arm64
45 | ```
46 | 
47 | #### 3. Set Breakpoints and Run
48 | 
49 | Example Output:
50 | 
51 | ![](../images/running-example.png)
52 | 


--------------------------------------------------------------------------------
/pkg/epp/scheduling/scheduler_config.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package scheduling
18 | 
19 | import (
20 | 	"fmt"
21 | 
22 | 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
23 | )
24 | 
25 | // NewSchedulerConfig creates a new SchedulerConfig object and returns its pointer.
26 | func NewSchedulerConfig(profileHandler framework.ProfileHandler, profiles map[string]*framework.SchedulerProfile) *SchedulerConfig {
27 | 	return &SchedulerConfig{
28 | 		profileHandler: profileHandler,
29 | 		profiles:       profiles,
30 | 	}
31 | }
32 | 
// SchedulerConfig provides a configuration for the scheduler which influences routing decisions.
// Instances are built with NewSchedulerConfig.
type SchedulerConfig struct {
	// profileHandler is the framework.ProfileHandler supplied to NewSchedulerConfig.
	profileHandler framework.ProfileHandler
	// profiles maps a string key to its *framework.SchedulerProfile.
	// NOTE(review): the key is presumably the profile name - confirm.
	profiles       map[string]*framework.SchedulerProfile
}
38 | 
39 | func (c *SchedulerConfig) String() string {
40 | 	return fmt.Sprintf(
41 | 		"{ProfileHandler: %s, Profiles: %v}",
42 | 		c.profileHandler.TypedName(),
43 | 		c.profiles,
44 | 	)
45 | }
46 | 


--------------------------------------------------------------------------------
/test/testdata/inferencepool-with-model-hermetic.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: inference.networking.k8s.io/v1
 2 | kind: InferencePool
 3 | metadata:
 4 |   name: vllm-llama3-8b-instruct-pool
 5 |   namespace: default
 6 | spec:
 7 |   targetPorts:
 8 |     - number: 8000
 9 |   selector:
10 |     matchLabels:
11 |       app: vllm-llama3-8b-instruct-pool
12 |   endpointPickerRef:
13 |     name: epp
14 |     kind: Service
15 |     port:
16 |       number: 9002
17 | ---
18 | apiVersion: inference.networking.x-k8s.io/v1alpha2
19 | kind: InferenceObjective
20 | metadata:
21 |   name: sql-lora
22 |   namespace: default
23 | spec:
24 |   priority: 2
25 |   poolRef:
26 |     name: vllm-llama3-8b-instruct-pool
27 |   targetModels:
28 |   - name: sql-lora-1fdg2
29 |     weight: 100
30 | ---
31 | apiVersion: inference.networking.x-k8s.io/v1alpha2
32 | kind: InferenceObjective
33 | metadata:
34 |   name: sql-lora-sheddable
35 |   namespace: default
36 | spec:
37 |   poolRef:
38 |     name: vllm-llama3-8b-instruct-pool
39 |   targetModels:
40 |   - name: sql-lora-1fdg3
41 |     weight: 100
42 | ---
43 | apiVersion: inference.networking.x-k8s.io/v1alpha2
44 | kind: InferenceObjective
45 | metadata:
46 |   name: my-model
47 |   namespace: default
48 | spec:
49 |   priority: 2
50 |   poolRef:
51 |     name: vllm-llama3-8b-instruct-pool
52 |   targetModels:
53 |   - name: my-model-12345
54 |     weight: 100    
55 | ---
56 | apiVersion: inference.networking.x-k8s.io/v1alpha2
57 | kind: InferenceObjective
58 | metadata:
59 |   name: direct-model
60 |   namespace: default
61 | spec:
62 |   priority: 2
63 |   poolRef:
64 |     name: vllm-llama3-8b-instruct-pool
65 | 


--------------------------------------------------------------------------------
/hack/update-codegen.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Copyright 2025 The Kubernetes Authors.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | set -o errexit
18 | set -o nounset
19 | set -o pipefail
20 | 
21 | echo "Generating CRDs"
22 | go run ./pkg/generator
23 | 
24 | SCRIPT_ROOT=$(dirname "${BASH_SOURCE[0]}")/..
25 | echo "$SCRIPT_ROOT script"
26 | CODEGEN_PKG=${2:-bin}
27 | echo $CODEGEN_PKG
28 | source "${CODEGEN_PKG}/kube_codegen.sh"
29 | THIS_PKG="sigs.k8s.io/gateway-api-inference-extension"
30 | 
31 | 
32 | kube::codegen::gen_helpers \
33 |     --boilerplate "${SCRIPT_ROOT}/hack/boilerplate/boilerplate.generatego.txt" \
34 |     "${SCRIPT_ROOT}"
35 | 
36 | kube::codegen::gen_register \
37 |     --boilerplate "${SCRIPT_ROOT}/hack/boilerplate/boilerplate.generatego.txt" \
38 |     "${SCRIPT_ROOT}"
39 | 
40 | kube::codegen::gen_client \
41 | --with-watch \
42 | --with-applyconfig \
43 | --output-dir "${SCRIPT_ROOT}/client-go" \
44 | --output-pkg "${THIS_PKG}/client-go" \
45 | --boilerplate "${SCRIPT_ROOT}/hack/boilerplate/boilerplate.generatego.txt" \
46 | "${SCRIPT_ROOT}"
47 | 


--------------------------------------------------------------------------------
/pkg/epp/util/error/error.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package error
18 | 
import (
	"errors"
	"fmt"
)
22 | 
// Error is an error struct for errors returned by the epp server.
type Error struct {
	// Code is the error code string, e.g. one of the constants declared below.
	Code string
	// Msg is the detail text included in the string produced by Error().
	Msg string
}

// Error code strings used in Error.Code. Unknown is what CanonicalCode
// returns when no Error can be found.
const (
	Unknown                        = "Unknown"
	BadRequest                     = "BadRequest"
	Internal                       = "Internal"
	ServiceUnavailable             = "ServiceUnavailable"
	ModelServerError               = "ModelServerError"
	BadConfiguration               = "BadConfiguration"
	InferencePoolResourceExhausted = "InferencePoolResourceExhausted"
)

// Error returns a string version of the error in the form
// "inference gateway: <Code> - <Msg>".
func (e Error) Error() string {
	return fmt.Sprintf("inference gateway: %s - %s", e.Code, e.Msg)
}

// CanonicalCode returns the Code of the first Error found in err's chain.
//
// Both value (Error) and non-nil pointer (*Error) forms are recognized, and
// errors wrapped with %w are unwrapped, so added context does not hide the
// code. Unknown is returned when err is nil or contains no Error.
func CanonicalCode(err error) string {
	var e Error
	if errors.As(err, &e) {
		return e.Code
	}
	// *Error also satisfies the error interface but is not assignable to
	// Error, so it needs its own lookup.
	var pe *Error
	if errors.As(err, &pe) && pe != nil {
		return pe.Code
	}
	return Unknown
}
52 | 


--------------------------------------------------------------------------------
/client-go/applyconfiguration/apix/v1alpha2/match.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by applyconfiguration-gen. DO NOT EDIT.
18 | 
19 | package v1alpha2
20 | 
// MatchApplyConfiguration represents a declarative configuration of the Match type for use
// with apply.
type MatchApplyConfiguration struct {
	// Model is serialized under the "model" JSON key and omitted when nil.
	Model *ModelMatchApplyConfiguration `json:"model,omitempty"`
}
26 | 
// Match constructs a declarative configuration of the Match type for use with
// apply. The returned MatchApplyConfiguration has no fields set.
func Match() *MatchApplyConfiguration {
	return &MatchApplyConfiguration{}
}
32 | 
// WithModel sets the Model field in the declarative configuration to the given value
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, the Model field is set to the value of the last call.
// The pointer is stored as given; the pointed-to value is not copied.
func (b *MatchApplyConfiguration) WithModel(value *ModelMatchApplyConfiguration) *MatchApplyConfiguration {
	b.Model = value
	return b
}
40 | 


--------------------------------------------------------------------------------
/pkg/epp/plugins/plugins.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package plugins
18 | 
// Plugin defines the base interface for a plugin.
// This interface should be embedded in all plugins across the code.
type Plugin interface {
	// TypedName returns the type and name tuple of this plugin instance.
	TypedName() TypedName
}
25 | 
// ConsumerPlugin defines the interface for a plugin that consumes data.
type ConsumerPlugin interface {
	Plugin
	// Consumes returns the data consumed by the plugin.
	// This is a map from the data key (string) consumed to
	// the data type of the key (represented as data with default value cast as any field).
	Consumes() map[string]any
}
34 | 
// ProducerPlugin defines the interface for a plugin that produces data.
type ProducerPlugin interface {
	Plugin
	// Produces returns the data produced by the plugin.
	// This is a map from the data key (string) produced to
	// the data type of the key (represented as data with default value cast as any field).
	Produces() map[string]any
}
43 | 


--------------------------------------------------------------------------------
/pkg/epp/flowcontrol/framework/doc.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Package framework defines the core plugin interfaces for extending the `controller.FlowController`.
18 | //
19 | // It establishes the contracts that custom logic, such as queueing disciplines and dispatching policies, must adhere
20 | // to. By building on these interfaces, the Flow Control system can be extended and customized without modifying the
21 | // core controller logic.
22 | //
23 | // The primary contracts are:
24 | //   - `SafeQueue`: An interface for concurrent-safe queue implementations.
25 | //   - `IntraFlowDispatchPolicy`: An interface for policies that decide which item to select from within a single flow's
26 | //     queue.
27 | //   - `ItemComparator`: An interface vended by policies to make their internal item-ordering logic explicit and
28 | //     available to other components.
29 | //
30 | // These components are linked by `QueueCapability`, which allows policies to declare their queue requirements (e.g.,
31 | // FIFO or priority-based ordering).
32 | package framework
33 | 


--------------------------------------------------------------------------------
/site-src/_includes/epp.md:
--------------------------------------------------------------------------------
 1 | === "GKE"
 2 | 
 3 |       ```bash
 4 |       export GATEWAY_PROVIDER=gke
 5 |       helm install vllm-llama3-8b-instruct \
 6 |       --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
 7 |       --set provider.name=$GATEWAY_PROVIDER \
 8 |       --version $IGW_CHART_VERSION \
 9 |       oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
10 |       ```
11 | 
12 | === "Istio"
13 | 
14 |       ```bash
15 |       export GATEWAY_PROVIDER=istio
16 |       helm install vllm-llama3-8b-instruct \
17 |       --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
18 |       --set provider.name=$GATEWAY_PROVIDER \
19 |       --version $IGW_CHART_VERSION \
20 |       oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
21 |       ```
22 | 
23 | === "Kgateway"
24 | 
25 |       ```bash
26 |       export GATEWAY_PROVIDER=none
27 |       helm install vllm-llama3-8b-instruct \
28 |       --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
29 |       --set provider.name=$GATEWAY_PROVIDER \
30 |       --version $IGW_CHART_VERSION \
31 |       oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
32 |       ```
33 | 
34 | === "NGINX Gateway Fabric"
35 | 
36 |       ```bash
37 |       export GATEWAY_PROVIDER=none
38 |       helm install vllm-llama3-8b-instruct \
39 |       --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
40 |       --set provider.name=$GATEWAY_PROVIDER \
41 |       --version $IGW_CHART_VERSION \
42 |       oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool
43 |       ```
44 | 


--------------------------------------------------------------------------------
/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by informer-gen. DO NOT EDIT.
18 | 
19 | package internalinterfaces
20 | 
21 | import (
22 | 	time "time"
23 | 
24 | 	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
25 | 	runtime "k8s.io/apimachinery/pkg/runtime"
26 | 	cache "k8s.io/client-go/tools/cache"
27 | 	versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned"
28 | )
29 | 
// NewInformerFunc takes versioned.Interface and time.Duration to return a SharedIndexInformer.
// NOTE(review): the time.Duration is presumably the informer's resync period — confirm against callers.
type NewInformerFunc func(versioned.Interface, time.Duration) cache.SharedIndexInformer
32 | 
// SharedInformerFactory is a small interface to allow for adding an informer without an import cycle.
type SharedInformerFactory interface {
	// Start receives a stop channel.
	// NOTE(review): presumably starts the factory's informers and runs them until stopCh is closed — confirm
	// against the implementing factory.
	Start(stopCh <-chan struct{})
	// InformerFor returns a SharedIndexInformer for obj; newFunc is the constructor supplied by the caller.
	// NOTE(review): presumably newFunc is only invoked when no informer for obj's type exists yet — confirm.
	InformerFor(obj runtime.Object, newFunc NewInformerFunc) cache.SharedIndexInformer
}
// TweakListOptionsFunc is a function that transforms a v1.ListOptions.
// It receives a pointer and returns nothing, so any change is made in place.
type TweakListOptionsFunc func(*v1.ListOptions)
41 | 


--------------------------------------------------------------------------------
/client-go/applyconfiguration/api/v1/port.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by applyconfiguration-gen. DO NOT EDIT.
18 | 
19 | package v1
20 | 
21 | import (
22 | 	apiv1 "sigs.k8s.io/gateway-api-inference-extension/api/v1"
23 | )
24 | 
// PortApplyConfiguration represents a declarative configuration of the Port type for use
// with apply.
type PortApplyConfiguration struct {
	// Number is the optional port number; it is omitted from the JSON output when nil.
	Number *apiv1.PortNumber `json:"number,omitempty"`
}
30 | 
31 | // PortApplyConfiguration constructs a declarative configuration of the Port type for use with
32 | // apply.
33 | func Port() *PortApplyConfiguration {
34 | 	return &PortApplyConfiguration{}
35 | }
36 | 
// WithNumber sets the Number field in the declarative configuration to the given value
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, the Number field is set to the value of the last call.
// The value is received by value, so the stored pointer refers to this call's own copy.
func (b *PortApplyConfiguration) WithNumber(value apiv1.PortNumber) *PortApplyConfiguration {
	b.Number = &value
	return b
}
44 | 


--------------------------------------------------------------------------------
/pkg/epp/scheduling/framework/plugins/test/consts.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package test
18 | 
const (
	// HeaderTestEppEndPointSelectionKey is the name of the request header used in tests to control
	// Endpoint Picker (EPP) behavior deterministically.
	//
	// The header value is a comma-separated list of endpoint identifiers. Each entry
	// may be in one of the following formats:
	//
	//   - "IP"          — selects all pods whose IP address matches the given value.
	//   - "IP:port"     — selects only pods whose IP and port both match exactly.
	//                     Ports correspond to data-parallel ranks or specific targetPorts.
	//
	// IPv6 addresses are supported, with or without brackets (e.g. "fd00::1" or "[fd00::1]:3002").
	// The returned order matches the order of endpoints specified in the header, and duplicates
	// are ignored.
	//
	// Examples:
	//   "test-epp-endpoint-selection": "10.0.0.7,10.0.0.8:3002"
	//   "test-epp-endpoint-selection": "[fd00::1]:3000,fd00::2"
	HeaderTestEppEndPointSelectionKey = "test-epp-endpoint-selection"
)
39 | 


--------------------------------------------------------------------------------
/config/charts/body-based-routing/templates/istio.yaml:
--------------------------------------------------------------------------------
 1 | {{- if eq .Values.provider.name "istio" }}
 2 | ---
 3 | apiVersion: networking.istio.io/v1alpha3
 4 | kind: EnvoyFilter
 5 | metadata:
 6 |   name: {{ .Values.bbr.name }}
 7 |   namespace: {{ .Release.Namespace }}
 8 | spec:
 9 |   configPatches:
10 |   - applyTo: HTTP_FILTER
11 |     match:
12 |       # context omitted so that this applies to both sidecars and gateways
13 |       listener:
14 |         filterChain:
15 |           filter:
16 |             name: "envoy.filters.network.http_connection_manager"
17 |     patch:
18 |       operation: INSERT_FIRST
19 |       value:
20 |         name: envoy.filters.http.ext_proc
21 |         typed_config:
22 |           "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor
23 |           failure_mode_allow: false
24 |           allow_mode_override: true
25 |           processing_mode:
26 |             request_header_mode: "SEND"
27 |             response_header_mode: "SKIP"
28 |             request_body_mode: "FULL_DUPLEX_STREAMED"
29 |             response_body_mode: "NONE"
30 |             request_trailer_mode: "SEND"
31 |             response_trailer_mode: "SKIP"
32 |           grpc_service:
33 |             envoy_grpc:
34 |               cluster_name: outbound|{{ .Values.bbr.port }}||{{ .Values.bbr.name }}.{{ .Release.Namespace }}.svc.cluster.local
35 | ---
36 | apiVersion: networking.istio.io/v1
37 | kind: DestinationRule
38 | metadata:
39 |   name: {{ .Values.bbr.name }}
40 |   namespace: {{ .Release.Namespace }}
41 | spec:
42 |   host: {{ .Values.bbr.name }}.{{ .Release.Namespace }}.svc.cluster.local
43 |   trafficPolicy:
44 |       tls:
45 |         mode: SIMPLE
46 |         insecureSkipVerify: true
47 | {{- end }}
48 | 


--------------------------------------------------------------------------------
/pkg/epp/flowcontrol/registry/connection.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package registry
18 | 
19 | import (
20 | 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts"
21 | 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types"
22 | )
23 | 
// connection is the concrete, un-exported implementation of the `contracts.ActiveFlowConnection` interface.
// It is a temporary handle created for the duration of a single `WithConnection` call.
type connection struct {
	// registry is the FlowRegistry whose active shards this connection exposes.
	registry *FlowRegistry
	// key is the flow key associated with this connection.
	// NOTE(review): presumably the flow the enclosing `WithConnection` call was made for — confirm at the call site.
	key types.FlowKey
}

// Compile-time assertion that *connection satisfies contracts.ActiveFlowConnection.
var _ contracts.ActiveFlowConnection = &connection{}
32 | 
// ActiveShards returns a snapshot of accessors for the registry's currently active shards.
// The registry's read lock is held while the snapshot is taken, and the returned slice is a fresh copy, so the
// caller may keep or modify it without affecting the registry's internal slice.
func (c *connection) ActiveShards() []contracts.RegistryShard {
	c.registry.mu.RLock()
	defer c.registry.mu.RUnlock()

	// Return a copy to ensure the caller cannot modify the registry's internal slice.
	shardsCopy := make([]contracts.RegistryShard, len(c.registry.activeShards))
	// NOTE(review): copied element by element, presumably because the registry stores a concrete shard type that
	// must be converted to the contracts.RegistryShard interface — confirm against FlowRegistry.
	for i, s := range c.registry.activeShards {
		shardsCopy[i] = s
	}
	return shardsCopy
}
45 | 


--------------------------------------------------------------------------------
/conformance/tests/inferencepool_resolvedrefs_condition.yaml:
--------------------------------------------------------------------------------
 1 | # conformance/tests/inferencepool_resolvedrefs_condition.yaml
 2 | 
 3 | # This manifest defines the initial resources for the
 4 | # inferencepool_resolvedrefs_condition.go conformance test.
 5 | 
 6 | # --- HTTPRoute for Primary Gateway (conformance-primary) ---
 7 | apiVersion: gateway.networking.k8s.io/v1
 8 | kind: HTTPRoute
 9 | metadata:
10 |   name: httproute-for-primary-gw
11 |   namespace: inference-conformance-app-backend
12 | spec:
13 |   parentRefs:
14 |   - group: gateway.networking.k8s.io
15 |     kind: Gateway
16 |     name: conformance-primary
17 |     namespace: inference-conformance-infra
18 |     sectionName: http
19 |   hostnames:
20 |   - "primary.example.com"
21 |   rules:
22 |   - backendRefs:
23 |     - group: inference.networking.k8s.io
24 |       kind: InferencePool
25 |       name: primary-inference-pool
26 |     matches:
27 |     - path:
28 |         type: PathPrefix
29 |         value: /primary-gateway-test
30 | ---
31 | # --- HTTPRoute for Secondary Gateway (conformance-secondary) ---
32 | apiVersion: gateway.networking.k8s.io/v1
33 | kind: HTTPRoute
34 | metadata:
35 |   name: httproute-for-secondary-gw
36 |   namespace: inference-conformance-app-backend
37 | spec:
38 |   parentRefs:
39 |   - group: gateway.networking.k8s.io
40 |     kind: Gateway
41 |     name: conformance-secondary
42 |     namespace: inference-conformance-infra
43 |     sectionName: http
44 |   hostnames:
45 |   - "secondary.example.com"
46 |   rules:
47 |   - backendRefs:
48 |     - group: inference.networking.k8s.io
49 |       kind: InferencePool
50 |       name: primary-inference-pool
51 |     matches:
52 |     - path:
53 |         type: PathPrefix
54 |         value: /secondary-gateway-test
55 | 


--------------------------------------------------------------------------------
/tools/benchmark/download-benchmark-results.bash:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Downloads the benchmark result files from the benchmark tool pod.
 4 | download_benchmark_results() {
 5 |   until echo $(kubectl logs deployment/benchmark-tool -n ${namespace}) | grep -q -m 1 "LPG_FINISHED"; do sleep 30 ; done;
 6 |       benchmark_pod=$(kubectl get pods -l app=benchmark-tool -n ${namespace} -o jsonpath="{.items[0].metadata.name}")
 7 |       echo "Downloading JSON results from pod ${benchmark_pod}"
 8 |       kubectl exec ${benchmark_pod} -n ${namespace} -- rm -f ShareGPT_V3_unfiltered_cleaned_split.json
 9 |       for f in $(kubectl exec ${benchmark_pod} -n ${namespace} -- /bin/sh -c ls -l | grep json); do
10 |         echo "Downloading json file ${f}"
11 |         kubectl cp -n ${namespace} ${benchmark_pod}:$f ${benchmark_output_dir}/results/json/$f; 
12 |       done
13 | }
14 | 
# Env vars to be passed when calling this script.
# The id of the benchmark. This is needed to identify what the benchmark is for.
# It decides the filepath to save the results, which later is used by the jupyter notebook to assign
# the benchmark_id as data labels for plotting.
benchmark_id=${benchmark_id:-"inference-extension"}
# run_id can be used to group different runs of the same benchmarks for comparison.
run_id=${run_id:-"default-run"}
# Kubernetes namespace in which the benchmark tool runs.
namespace=${namespace:-"default"}
# Output directory, relative to the directory containing this script.
output_dir=${output_dir:-'output'}

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
benchmark_output_dir="${SCRIPT_DIR}/${output_dir}/${run_id}/${benchmark_id}"

echo "Saving benchmark results to ${benchmark_output_dir}/results/json/"
download_benchmark_results
# Quoted so that a checkout path containing spaces is passed to kubectl as a single argument.
kubectl delete -f "${SCRIPT_DIR}/../../config/manifests/benchmark/benchmark.yaml"


--------------------------------------------------------------------------------
/pkg/epp/requestcontrol/plugin_executor.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | package requestcontrol
18 | 
import (
	"context"
	"errors"
	"fmt"
	"time"

	schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)
26 | 
27 | // prepareDataPluginsWithTimeout executes the PrepareRequestData plugins with retries and timeout.
28 | func prepareDataPluginsWithTimeout(timeout time.Duration, plugins []PrepareDataPlugin,
29 | 	ctx context.Context, request *schedulingtypes.LLMRequest, pods []schedulingtypes.Pod) error {
30 | 	errCh := make(chan error, 1)
31 | 	// Execute plugins sequentially in a separate goroutine
32 | 	go func() {
33 | 		for _, plugin := range plugins {
34 | 			err := plugin.PrepareRequestData(ctx, request, pods)
35 | 			if err != nil {
36 | 				errCh <- errors.New("prepare data plugin " + plugin.TypedName().String() + " failed: " + err.Error())
37 | 				return
38 | 			}
39 | 		}
40 | 		errCh <- nil
41 | 	}()
42 | 
43 | 	select {
44 | 	case <-ctx.Done():
45 | 		return ctx.Err()
46 | 	case err := <-errCh:
47 | 		return err
48 | 	case <-time.After(timeout):
49 | 		return errors.New("prepare data plugin timed out")
50 | 	}
51 | }
52 | 


--------------------------------------------------------------------------------
/site-src/_includes/epp-latest.md:
--------------------------------------------------------------------------------
 1 | === "GKE"
 2 | 
 3 |       ```bash
 4 |       export GATEWAY_PROVIDER=gke
 5 |       helm install vllm-llama3-8b-instruct \
 6 |       --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
 7 |       --set provider.name=$GATEWAY_PROVIDER \
 8 |       --version $IGW_CHART_VERSION \
 9 |       oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool
10 |       ```
11 | 
12 | === "Istio"
13 | 
14 |       ```bash
15 |       export GATEWAY_PROVIDER=istio
16 |       helm install vllm-llama3-8b-instruct \
17 |       --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
18 |       --set provider.name=$GATEWAY_PROVIDER \
19 |       --version $IGW_CHART_VERSION \
20 |       oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool
21 |       ```
22 | 
23 | === "Kgateway"
24 | 
25 |       ```bash
26 |       export GATEWAY_PROVIDER=none
27 |       helm install vllm-llama3-8b-instruct \
28 |       --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
29 |       --set provider.name=$GATEWAY_PROVIDER \
30 |       --version $IGW_CHART_VERSION \
31 |       oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool
32 |       ```
33 | 
34 | === "NGINX Gateway Fabric"
35 | 
36 |       ```bash
37 |       export GATEWAY_PROVIDER=none
38 |       helm install vllm-llama3-8b-instruct \
39 |       --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
40 |       --set provider.name=$GATEWAY_PROVIDER \
41 |       --version $IGW_CHART_VERSION \
42 |       oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool
43 |       ```


--------------------------------------------------------------------------------
/pkg/common/kubemeta.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright 2025 The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Package common defines structs for referring to fully qualified k8s resources.
18 | package common
19 | 
20 | import (
21 | 	"fmt"
22 | 	"strings"
23 | 
24 | 	"k8s.io/apimachinery/pkg/runtime/schema"
25 | 	"k8s.io/apimachinery/pkg/types"
26 | )
27 | 
// GKNN represents a fully qualified k8s resource: its group and kind together with its namespace and name.
// Both parts are embedded, so their fields (Group, Kind, Namespace, Name) are accessible directly on GKNN.
type GKNN struct {
	types.NamespacedName
	schema.GroupKind
}
33 | 
34 | // String implements Stringer.
35 | func (g *GKNN) String() string {
36 | 	return fmt.Sprintf("%s %s", g.GroupKind.String(), g.NamespacedName.String())
37 | }
38 | 
39 | // Compare returns the comparison of a and b where less than, equal, and greater than return -1, 0,
40 | // and 1 respectively.
41 | func Compare(a, b GKNN) int {
42 | 	if v := strings.Compare(a.Group, b.Group); v != 0 {
43 | 		return v
44 | 	}
45 | 	if v := strings.Compare(a.Kind, b.Kind); v != 0 {
46 | 		return v
47 | 	}
48 | 	if v := strings.Compare(a.Namespace, b.Namespace); v != 0 {
49 | 		return v
50 | 	}
51 | 	return strings.Compare(a.Name, b.Name)
52 | }
53 | 
// Less returns true if a is strictly less than b under the ordering defined by Compare.
func Less(a, b GKNN) bool {
	return Compare(a, b) < 0
}
58 | 


--------------------------------------------------------------------------------
/site-src/concepts/priority-and-capacity.md:
--------------------------------------------------------------------------------
 1 | # Priority and Capacity
 2 | 
 3 | The InferenceObjective defines `Priority`, which describes how requests interact with each other. Priority naturally interacts with total pool capacity, so properly understanding and configuring these behaviors is important for allowing a pool to handle requests of different priorities.
 4 | 
 5 | ## Priority (in flow control)
 6 | 
 7 | It should be noted that priority is currently only used in [Capacity](#capacity), and that the description below is how Priority will be consumed in the `Flow Control` model.
 8 | 
 9 | Priority is a simple stack rank; the higher the number, the higher the priority. Should no priority for a request be specified, the default value is zero. Requests of higher priority are _always_ selected first when requests are queued. Requests of equal priority currently operate on a FCFS basis. 
10 | 
11 | ## Capacity
12 | 
13 | The current capacity model uses configurable [thresholds](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/35b14a10a9830d1a9e3850913539066ebc8fb317/pkg/epp/saturationdetector/saturationdetector.go#L49) to determine if the entire pool is saturated. The calculation is to simply iterate through each endpoint in the pool, and if all are above all thresholds, the pool is considered `saturated`. In the event of saturation, all requests with a negative priority will be rejected, and other requests will be scheduled and queued on the model servers. 
14 | 
15 | ## Future work
16 | 
17 | The Flow Control system is nearing completion and will add more nuance to the Priority and Capacity model: proper priority enforcement, more articulate capacity tracking, queuing at the Inference Gateway level, etc. This documentation will be updated when the Flow Control has finished implementation.


--------------------------------------------------------------------------------
/client-go/applyconfiguration/internal/internal.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by applyconfiguration-gen. DO NOT EDIT.
18 | 
19 | package internal
20 | 
21 | import (
22 | 	fmt "fmt"
23 | 	sync "sync"
24 | 
25 | 	typed "sigs.k8s.io/structured-merge-diff/v6/typed"
26 | )
27 | 
// Parser returns the package-level typed.Parser for schemaYAML.
// The parser is built on the first call, guarded by parserOnce, and the same instance is returned afterwards.
// It panics if schemaYAML cannot be parsed.
func Parser() *typed.Parser {
	parserOnce.Do(func() {
		var err error
		parser, err = typed.NewParser(schemaYAML)
		if err != nil {
			panic(fmt.Sprintf("Failed to parse schema: %v", err))
		}
	})
	return parser
}
38 | 
// parserOnce guards the one-time construction of parser in Parser.
var parserOnce sync.Once

// parser caches the result of typed.NewParser(schemaYAML); it is assigned inside Parser.
var parser *typed.Parser

// schemaYAML is the schema handed to typed.NewParser. It declares the two types
// __untyped_atomic_ and __untyped_deduced_.
var schemaYAML = typed.YAMLObject(`types:
- name: __untyped_atomic_
  scalar: untyped
  list:
    elementType:
      namedType: __untyped_atomic_
    elementRelationship: atomic
  map:
    elementType:
      namedType: __untyped_atomic_
    elementRelationship: atomic
- name: __untyped_deduced_
  scalar: untyped
  list:
    elementType:
      namedType: __untyped_atomic_
    elementRelationship: atomic
  map:
    elementType:
      namedType: __untyped_deduced_
    elementRelationship: separable
`)
63 | 


--------------------------------------------------------------------------------
/client-go/informers/externalversions/api/interface.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by informer-gen. DO NOT EDIT.
18 | 
19 | package api
20 | 
21 | import (
22 | 	v1 "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api/v1"
23 | 	internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces"
24 | )
25 | 
// Interface provides access to each of this group's versions.
type Interface interface {
	// V1 provides access to shared informers for resources in V1.
	V1() v1.Interface
}
31 | 
// group is the Interface implementation returned by New. It only stores the arguments given to New
// and forwards them unchanged to the version-specific constructor in V1.
type group struct {
	// factory is the shared informer factory handed to the per-version interfaces.
	factory internalinterfaces.SharedInformerFactory
	// namespace is forwarded to v1.New.
	// NOTE(review): presumably restricts informers to that namespace — confirm in the v1 package.
	namespace string
	// tweakListOptions is forwarded to v1.New; it may adjust list options before use.
	tweakListOptions internalinterfaces.TweakListOptionsFunc
}
37 | 
38 | // New returns a new Interface.
39 | func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface {
40 | 	return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions}
41 | }
42 | 
// V1 returns a new v1.Interface built from the group's factory, namespace and list-option tweak.
// A fresh value is constructed via v1.New on every call.
func (g *group) V1() v1.Interface {
	return v1.New(g.factory, g.namespace, g.tweakListOptions)
}
47 | 


--------------------------------------------------------------------------------
/client-go/clientset/versioned/typed/apix/v1alpha2/fake/fake_apix_client.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by client-gen. DO NOT EDIT.
18 | 
19 | package fake
20 | 
21 | import (
22 | 	rest "k8s.io/client-go/rest"
23 | 	testing "k8s.io/client-go/testing"
24 | 	v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/apix/v1alpha2"
25 | )
26 | 
// FakeXInferenceV1alpha2 is a fake client for the v1alpha2 apix group.
// It embeds *testing.Fake (from k8s.io/client-go/testing), whose methods are promoted onto this type.
type FakeXInferenceV1alpha2 struct {
	*testing.Fake
}
30 | 
// InferenceModelRewrites returns the fake InferenceModelRewriteInterface created by
// newFakeInferenceModelRewrites for this client and the given namespace.
func (c *FakeXInferenceV1alpha2) InferenceModelRewrites(namespace string) v1alpha2.InferenceModelRewriteInterface {
	return newFakeInferenceModelRewrites(c, namespace)
}
34 | 
35 | func (c *FakeXInferenceV1alpha2) InferenceObjectives(namespace string) v1alpha2.InferenceObjectiveInterface {
36 | 	return newFakeInferenceObjectives(c, namespace)
37 | }
38 | 
39 | func (c *FakeXInferenceV1alpha2) InferencePools(namespace string) v1alpha2.InferencePoolInterface {
40 | 	return newFakeInferencePools(c, namespace)
41 | }
42 | 
43 | // RESTClient returns a RESTClient that is used to communicate
44 | // with API server by this client implementation.
45 | func (c *FakeXInferenceV1alpha2) RESTClient() rest.Interface {
46 | 	var ret *rest.RESTClient
47 | 	return ret
48 | }
49 | 


--------------------------------------------------------------------------------
/client-go/informers/externalversions/api/v1/interface.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Copyright The Kubernetes Authors.
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | 
17 | // Code generated by informer-gen. DO NOT EDIT.
18 | 
19 | package v1
20 | 
21 | import (
22 | 	internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces"
23 | )
24 | 
25 | // Interface lists the informers available in this group version.
26 | type Interface interface {
27 | 	// InferencePools returns an InferencePoolInformer.
28 | 	InferencePools() InferencePoolInformer
29 | }
30 | 
31 | type version struct { // concrete type that New returns as Interface
32 | 	factory          internalinterfaces.SharedInformerFactory // copied into the informer built by InferencePools
33 | 	namespace        string                                   // copied into the informer built by InferencePools
34 | 	tweakListOptions internalinterfaces.TweakListOptionsFunc  // copied into the informer built by InferencePools
35 | }
36 | 
37 | // New returns an Interface backed by a version that holds f, namespace, and tweakListOptions.
38 | func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface {
39 | 	return &version{namespace: namespace, tweakListOptions: tweakListOptions, factory: f}
40 | }
41 | 
42 | // InferencePools returns an InferencePoolInformer carrying this version's factory, namespace, and tweakListOptions.
43 | func (v *version) InferencePools() InferencePoolInformer {
44 | 	return &inferencePoolInformer{namespace: v.namespace, tweakListOptions: v.tweakListOptions, factory: v.factory}
45 | }
46 | 


--------------------------------------------------------------------------------