├── tools ├── benchmark │ ├── requirements.txt │ ├── README.md │ └── download-benchmark-results.bash ├── dynamic-lora-sidecar │ ├── .gitignore │ ├── screenshots │ │ ├── vllm-logs.png │ │ └── lora-syncer-logs.png │ ├── requirements.txt │ ├── Dockerfile │ └── Makefile ├── dashboards │ ├── inference_gateway_dashboard_1.png │ ├── inference_gateway_dashboard_2.png │ ├── inference_gateway_dashboard_3.png │ └── README.md ├── simulations │ └── llm_ig_simulation │ │ └── src │ │ ├── __init__.py │ │ └── constants.py └── tools.go ├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ ├── blank_issue.md │ ├── feature_request.md │ └── bug_request.md ├── workflows │ ├── non-main-gatekeeper.yml │ └── kal.yml └── dependabot.yml ├── config ├── charts │ ├── body-based-routing │ │ ├── templates │ │ │ ├── NOTES.txt │ │ │ ├── bbr.yaml │ │ │ ├── gke.yaml │ │ │ └── istio.yaml │ │ ├── Chart.yaml │ │ ├── values.yaml │ │ └── .helmignore │ └── inferencepool │ │ ├── templates │ │ ├── NOTES.txt │ │ ├── _validations.tpl │ │ ├── epp-sa-token-secret.yaml │ │ ├── epp-service.yaml │ │ ├── istio.yaml │ │ ├── _helpers.tpl │ │ ├── leader-election-rbac.yaml │ │ ├── epp-config.yaml │ │ └── epp-servicemonitor.yaml │ │ ├── Chart.yaml │ │ └── .helmignore ├── observability │ └── prometheus │ │ ├── rbac.yaml │ │ └── values.yaml ├── manifests │ ├── gateway │ │ ├── istio │ │ │ ├── gateway.yaml │ │ │ └── httproute.yaml │ │ ├── kgateway │ │ │ ├── gateway.yaml │ │ │ └── httproute.yaml │ │ ├── nginxgatewayfabric │ │ │ ├── gateway.yaml │ │ │ └── httproute.yaml │ │ ├── gke │ │ │ ├── gateway.yaml │ │ │ └── httproute.yaml │ │ └── envoyaigateway │ │ │ ├── gateway.yaml │ │ │ └── httproute.yaml │ ├── benchmark │ │ └── model-server-service.yaml │ ├── inferenceobjective.yaml │ ├── vllm │ │ └── sim-deployment.yaml │ └── bbr-example │ │ └── httproute_bbr.yaml └── crd │ ├── kustomizeconfig.yaml │ └── kustomization.yaml ├── pkg ├── epp │ ├── scheduling │ │ ├── framework │ │ │ ├── plugins │ │ │ │ ├── test │ │ │ │ │ ├── README.md │ │ 
│ │ │ └── consts.go │ │ │ │ ├── multi │ │ │ │ │ └── prefix │ │ │ │ │ │ └── OWNERS │ │ │ │ ├── README.md │ │ │ │ └── picker │ │ │ │ │ └── common.go │ │ │ └── weighted_scorer.go │ │ └── scheduler_config.go │ ├── metrics │ │ └── testdata │ │ │ ├── prefix_indexer_size_metric │ │ │ ├── queue_avg_size_metrics │ │ │ ├── kv_cache_avg_metrics │ │ │ ├── running_requests_metrics │ │ │ ├── request_total_metric │ │ │ ├── request_error_total_metric │ │ │ ├── prefix_indexer_hit_ratio_metric │ │ │ ├── scheduler_e2e_duration_seconds_metric │ │ │ └── prefix_indexer_hit_bytes_metric │ ├── util │ │ ├── logging │ │ │ ├── logging_const.go │ │ │ ├── fatal.go │ │ │ └── logger.go │ │ ├── request │ │ │ ├── sheddable.go │ │ │ ├── metadata.go │ │ │ └── headers.go │ │ ├── metrics │ │ │ └── metrics.go │ │ ├── pod │ │ │ └── pod.go │ │ └── error │ │ │ └── error.go │ ├── backend │ │ ├── pod.go │ │ └── metrics │ │ │ └── metrics_state.go │ ├── datalayer │ │ ├── metrics │ │ │ └── types.go │ │ └── mocks │ │ │ └── ticker.go │ ├── config │ │ └── config.go │ ├── plugins │ │ ├── shared_state.go │ │ ├── typedname.go │ │ ├── registry.go │ │ └── plugins.go │ ├── saturationdetector │ │ └── config.go │ ├── flowcontrol │ │ ├── contracts │ │ │ └── doc.go │ │ ├── framework │ │ │ └── doc.go │ │ └── registry │ │ │ └── connection.go │ ├── requestcontrol │ │ ├── types.go │ │ └── plugin_executor.go │ └── server │ │ └── runserver_test.go ├── README.md ├── bbr │ └── README.md └── common │ └── kubemeta.go ├── sidecars └── latencypredictorasync │ └── OWNERS ├── site-src ├── images │ ├── ga-stage.png │ ├── alpha-stage.png │ ├── favicon-64.png │ ├── request-flow.png │ ├── migration-stage.png │ ├── resource-model.png │ ├── running-example.png │ ├── logo │ │ ├── logo-text-xl-dark.png │ │ └── logo-text-large-horizontal-white.png │ ├── edit-environment-variables.png │ ├── inferencepool-vs-service.png │ └── modify-run-configuration.png ├── .mkdocs-exclude ├── performance │ └── benchmark │ │ └── example-bar-chart.png ├── 
_includes │ ├── infobj.md │ ├── intro.md │ ├── model-server-sim.md │ ├── test.md │ ├── model-server-gpu.md │ ├── prereqs.md │ ├── bbr.md │ ├── model-server.md │ ├── model-server-cpu.md │ ├── epp.md │ └── epp-latest.md ├── enhancements │ └── overview.md ├── concepts │ ├── roles-and-personas.md │ ├── conformance.md │ └── priority-and-capacity.md ├── api-types │ ├── inferenceobjective.md │ └── inferencepoolimport.md ├── stylesheets │ └── extra.css ├── guides │ └── epp-configuration │ │ └── flags.md └── contributing │ └── devguide.md ├── .dockerignore ├── test └── testdata │ ├── model-secret.yaml │ ├── client.yaml │ ├── configloader_1_test.yaml │ ├── metrics-rbac.yaml │ └── inferencepool-with-model-hermetic.yaml ├── code-of-conduct.md ├── latencypredictor ├── requirements.txt ├── Dockerfile-training ├── Dockerfile-prediction └── Dockerfile-test ├── benchmarking └── inference-perf │ ├── Chart.yaml │ ├── templates │ ├── configmap.yaml │ └── secret.yaml │ └── .helmignore ├── docs └── proposals │ ├── 0845-scheduler-architecture-proposal │ ├── images │ │ └── scheduler_cycle.png │ └── examples │ │ └── example.yaml │ └── README.md ├── .custom-gcl.yml ├── OWNERS ├── netlify.toml ├── OWNERS_ALIASES ├── RELEASE.md ├── crd-ref-docs.yaml ├── SECURITY_CONTACTS ├── api ├── doc.go └── v1 │ └── doc.go ├── apix ├── doc.go ├── v1alpha1 │ ├── doc.go │ └── shared_types.go ├── config │ └── v1alpha1 │ │ └── doc.go └── v1alpha2 │ └── doc.go ├── hack ├── boilerplate │ ├── boilerplate.go.txt │ ├── boilerplate.generatego.txt │ ├── boilerplate.py.txt │ └── boilerplate.sh.txt ├── mkdocs │ └── image │ │ ├── requirements.txt │ │ ├── entrypoint.sh │ │ └── Dockerfile ├── referencer.go ├── verify-boilerplate.sh └── update-codegen.sh ├── .gitignore ├── conformance ├── reports │ ├── v1.0.2 │ │ └── gateway │ │ │ ├── nginx-nginx-gateway-fabric │ │ │ ├── inference-v2.2.0-report.yaml │ │ │ └── README.md │ │ │ └── kgateway │ │ │ └── inference-v2.1.0-report.yaml │ ├── v0.5.1 │ │ └── gateway │ │ │ ├── 
agentgateway │ │ │ ├── inference-v0.7.2-report.yaml │ │ │ └── README.md │ │ │ ├── kgateway │ │ │ └── inference-v2.0.4-report.yaml │ │ │ ├── envoy-ai-gateway │ │ │ └── aigw-latest-report.yaml │ │ │ ├── kubvernor │ │ │ ├── kubvernor-inference-conformance-output-0.1.1.yaml │ │ │ └── README.md │ │ │ └── ack-gateway │ │ │ └── v1.4.0-apsara.3-gateway-report.yaml │ ├── v0.5.0 │ │ └── gateway │ │ │ ├── gke-gateway │ │ │ └── standard-v1.32.4-rxlb-gateway-report.yaml │ │ │ └── istio │ │ │ ├── 1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml │ │ │ └── README.md │ └── v0.4.0 │ │ └── gateway │ │ └── istio │ │ ├── 1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml │ │ └── README.md ├── tests │ ├── httproute_invalid_inferencepool_ref.yaml │ ├── gateway_following_epp_routing.yaml │ ├── gateway_following_epp_routing_dp.yaml │ ├── epp_unavailable_fail_open.yaml │ ├── inferencepool_accepted.yaml │ ├── inferencepool_multiple_rules_different_pools.yaml │ ├── gateway_weighted_two_pools.yaml │ ├── inferencepool_invalid_epp_service.yaml │ ├── httproute_multiple_gateways_different_pools.yaml │ ├── main.go │ └── inferencepool_resolvedrefs_condition.yaml ├── embed.go ├── conformance_test.go └── utils │ └── assertions.go ├── bbr.Dockerfile ├── client-go ├── clientset │ └── versioned │ │ ├── fake │ │ └── doc.go │ │ ├── typed │ │ ├── api │ │ │ └── v1 │ │ │ │ ├── doc.go │ │ │ │ ├── fake │ │ │ │ ├── doc.go │ │ │ │ └── fake_api_client.go │ │ │ │ └── generated_expansion.go │ │ └── apix │ │ │ ├── v1alpha1 │ │ │ ├── fake │ │ │ │ ├── doc.go │ │ │ │ └── fake_apix_client.go │ │ │ ├── doc.go │ │ │ └── generated_expansion.go │ │ │ └── v1alpha2 │ │ │ ├── fake │ │ │ ├── doc.go │ │ │ └── fake_apix_client.go │ │ │ ├── doc.go │ │ │ └── generated_expansion.go │ │ └── scheme │ │ └── doc.go ├── listers │ ├── api │ │ └── v1 │ │ │ └── expansion_generated.go │ └── apix │ │ └── v1alpha1 │ │ └── expansion_generated.go ├── applyconfiguration │ ├── apix │ │ └── 
v1alpha2 │ │ │ └── match.go │ ├── api │ │ └── v1 │ │ │ └── port.go │ └── internal │ │ └── internal.go └── informers │ └── externalversions │ ├── internalinterfaces │ └── factory_interfaces.go │ └── api │ ├── interface.go │ └── v1 │ └── interface.go ├── SECURITY.md ├── cmd ├── bbr │ └── main.go └── epp │ └── main.go ├── Dockerfile ├── .golangci.yml ├── PROJECT ├── internal └── runnable │ └── leader_election.go └── version └── version.go /tools/benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | matplotlib -------------------------------------------------------------------------------- /tools/dynamic-lora-sidecar/.gitignore: -------------------------------------------------------------------------------- 1 | sidecar/__pycache__/ 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /config/charts/body-based-routing/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | Body-based routing extension deployed. 2 | -------------------------------------------------------------------------------- /config/charts/inferencepool/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | InferencePool {{ .Release.Name }} deployed. 2 | -------------------------------------------------------------------------------- /pkg/epp/scheduling/framework/plugins/test/README.md: -------------------------------------------------------------------------------- 1 | This package contains plugins implementation for test purpose only. 
2 | -------------------------------------------------------------------------------- /sidecars/latencypredictorasync/OWNERS: -------------------------------------------------------------------------------- 1 | # See the OWNERS docs at https://go.k8s.io/owners 2 | 3 | approvers: 4 | - kaushikmitr 5 | -------------------------------------------------------------------------------- /site-src/images/ga-stage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/ga-stage.png -------------------------------------------------------------------------------- /site-src/images/alpha-stage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/alpha-stage.png -------------------------------------------------------------------------------- /site-src/images/favicon-64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/favicon-64.png -------------------------------------------------------------------------------- /pkg/README.md: -------------------------------------------------------------------------------- 1 | ## Quickstart 2 | 3 | Please refer to our Getting started guide here: https://gateway-api-inference-extension.sigs.k8s.io/guides/ -------------------------------------------------------------------------------- /site-src/.mkdocs-exclude: -------------------------------------------------------------------------------- 1 | .mkdocs-exclude 2 | .nojekyll 3 | .placeholder 4 | search/search_index.json 5 | sitemap.xml.gz 6 | sitemap.xml 7 | -------------------------------------------------------------------------------- /site-src/images/request-flow.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/request-flow.png -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file 2 | # Ignore build and test binaries. 3 | bin/ 4 | -------------------------------------------------------------------------------- /pkg/epp/scheduling/framework/plugins/multi/prefix/OWNERS: -------------------------------------------------------------------------------- 1 | # See the OWNERS docs at https://go.k8s.io/owners 2 | 3 | approvers: 4 | - liu-cong 5 | -------------------------------------------------------------------------------- /site-src/images/migration-stage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/migration-stage.png -------------------------------------------------------------------------------- /site-src/images/resource-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/resource-model.png -------------------------------------------------------------------------------- /site-src/images/running-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/running-example.png -------------------------------------------------------------------------------- /site-src/images/logo/logo-text-xl-dark.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/logo/logo-text-xl-dark.png -------------------------------------------------------------------------------- /site-src/images/edit-environment-variables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/edit-environment-variables.png -------------------------------------------------------------------------------- /site-src/images/inferencepool-vs-service.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/inferencepool-vs-service.png -------------------------------------------------------------------------------- /site-src/images/modify-run-configuration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/modify-run-configuration.png -------------------------------------------------------------------------------- /test/testdata/model-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: hf-token 5 | labels: 6 | app: vllm 7 | stringData: 8 | token: $HF_TOKEN 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/blank_issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Blank Issue 3 | about: Create a new issue from scratch 4 | title: '' 5 | labels: needs-triage 6 | assignees: '' 7 | 8 | --- -------------------------------------------------------------------------------- /code-of-conduct.md: -------------------------------------------------------------------------------- 1 | # 
Kubernetes Community Code of Conduct 2 | 3 | Please refer to our [Kubernetes Community Code of Conduct](https://git.k8s.io/community/code-of-conduct.md) 4 | -------------------------------------------------------------------------------- /config/observability/prometheus/rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: inference-gateway-sa-metrics-reader 5 | namespace: monitoring 6 | -------------------------------------------------------------------------------- /tools/dashboards/inference_gateway_dashboard_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/tools/dashboards/inference_gateway_dashboard_1.png -------------------------------------------------------------------------------- /tools/dashboards/inference_gateway_dashboard_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/tools/dashboards/inference_gateway_dashboard_2.png -------------------------------------------------------------------------------- /tools/dashboards/inference_gateway_dashboard_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/tools/dashboards/inference_gateway_dashboard_3.png -------------------------------------------------------------------------------- /site-src/performance/benchmark/example-bar-chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/performance/benchmark/example-bar-chart.png -------------------------------------------------------------------------------- 
/tools/dynamic-lora-sidecar/screenshots/vllm-logs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/tools/dynamic-lora-sidecar/screenshots/vllm-logs.png -------------------------------------------------------------------------------- /site-src/images/logo/logo-text-large-horizontal-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/site-src/images/logo/logo-text-large-horizontal-white.png -------------------------------------------------------------------------------- /tools/benchmark/README.md: -------------------------------------------------------------------------------- 1 | This folder contains resources to run performance benchmarks. Please follow the benchmark guide here: https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark. -------------------------------------------------------------------------------- /tools/dynamic-lora-sidecar/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.12.12 2 | jsonschema==4.24.0 3 | prometheus_client==0.22.1 4 | PyYAML==6.0.2 5 | requests==2.32.4 6 | watchfiles==1.0.5 7 | watchdog==6.0.0 8 | -------------------------------------------------------------------------------- /tools/dynamic-lora-sidecar/screenshots/lora-syncer-logs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/tools/dynamic-lora-sidecar/screenshots/lora-syncer-logs.png -------------------------------------------------------------------------------- /latencypredictor/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | uvicorn[standard] 3 | scikit-learn 4 | numpy 5 | pandas 6 
| joblib 7 | river 8 | pydantic 9 | requests 10 | xgboost 11 | aiohttp 12 | lightgbm 13 | -------------------------------------------------------------------------------- /benchmarking/inference-perf/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: inference-perf 3 | description: A Helm chart for running inference-perf benchmarking tool 4 | type: application 5 | version: 0.2.0 6 | appVersion: "0.2.0" 7 | -------------------------------------------------------------------------------- /config/charts/inferencepool/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: inferencepool 3 | description: A Helm chart for InferencePool 4 | 5 | type: application 6 | 7 | version: 0.0.0 8 | 9 | appVersion: "0.0.0" 10 | -------------------------------------------------------------------------------- /docs/proposals/0845-scheduler-architecture-proposal/images/scheduler_cycle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bexxmodd/gateway-api-inference-extension/main/docs/proposals/0845-scheduler-architecture-proposal/images/scheduler_cycle.png -------------------------------------------------------------------------------- /.custom-gcl.yml: -------------------------------------------------------------------------------- 1 | version: v2.3.1 2 | name: golangci-kube-api-linter 3 | destination: ./bin 4 | plugins: 5 | - module: 'sigs.k8s.io/kube-api-linter' 6 | version: 'v0.0.0-20250808120943-48643eb2563d' # Pin to a commit while there's no tag 7 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | # See the OWNERS docs at https://go.k8s.io/owners 2 | 3 | approvers: 4 | - gateway-api-inference-extension-maintainers 5 | 6 | reviewers: 7 | - 
gateway-api-inference-extension-reviewers 8 | - gateway-api-inference-extension-maintainers 9 | -------------------------------------------------------------------------------- /config/charts/body-based-routing/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: body-based-routing 3 | description: A Helm chart for the body-based routing extension 4 | 5 | type: application 6 | 7 | version: 0.1.0 8 | 9 | appVersion: "0.2.0" 10 | -------------------------------------------------------------------------------- /pkg/epp/metrics/testdata/prefix_indexer_size_metric: -------------------------------------------------------------------------------- 1 | # HELP inference_extension_prefix_indexer_size [ALPHA] Size of the prefix indexer. 2 | # TYPE inference_extension_prefix_indexer_size gauge 3 | inference_extension_prefix_indexer_size{} 4096 4 | -------------------------------------------------------------------------------- /netlify.toml: -------------------------------------------------------------------------------- 1 | # netlify configuration 2 | [build] 3 | publish = "site" 4 | command = "make build-docs-netlify" 5 | # available here https://github.com/netlify/build-image/blob/focal/included_software.md#languages 6 | environment = { PYTHON_VERSION = "3.8" } -------------------------------------------------------------------------------- /pkg/epp/metrics/testdata/queue_avg_size_metrics: -------------------------------------------------------------------------------- 1 | # HELP inference_pool_average_queue_size [ALPHA] The average number of requests pending in the model server queue. 
2 | # TYPE inference_pool_average_queue_size gauge 3 | inference_pool_average_queue_size{name="p1"} 0.4 4 | -------------------------------------------------------------------------------- /config/manifests/gateway/istio/gateway.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: Gateway 3 | metadata: 4 | name: inference-gateway 5 | spec: 6 | gatewayClassName: istio 7 | listeners: 8 | - name: http 9 | port: 80 10 | protocol: HTTP 11 | -------------------------------------------------------------------------------- /config/manifests/gateway/kgateway/gateway.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: Gateway 3 | metadata: 4 | name: inference-gateway 5 | spec: 6 | gatewayClassName: agentgateway 7 | listeners: 8 | - name: http 9 | port: 80 10 | protocol: HTTP 11 | -------------------------------------------------------------------------------- /config/manifests/gateway/nginxgatewayfabric/gateway.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: Gateway 3 | metadata: 4 | name: inference-gateway 5 | spec: 6 | gatewayClassName: nginx 7 | listeners: 8 | - name: http 9 | port: 80 10 | protocol: HTTP 11 | -------------------------------------------------------------------------------- /config/manifests/benchmark/model-server-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: my-pool-service 5 | spec: 6 | ports: 7 | - port: 8081 8 | protocol: TCP 9 | targetPort: 8000 10 | selector: 11 | app: my-pool 12 | type: LoadBalancer 13 | -------------------------------------------------------------------------------- /config/manifests/gateway/gke/gateway.yaml: 
-------------------------------------------------------------------------------- 1 | kind: Gateway 2 | apiVersion: gateway.networking.k8s.io/v1 3 | metadata: 4 | name: inference-gateway 5 | spec: 6 | gatewayClassName: gke-l7-regional-external-managed 7 | listeners: 8 | - name: http 9 | port: 80 10 | protocol: HTTP 11 | -------------------------------------------------------------------------------- /pkg/epp/metrics/testdata/kv_cache_avg_metrics: -------------------------------------------------------------------------------- 1 | # HELP inference_pool_average_kv_cache_utilization [ALPHA] The average kv cache utilization for an inference server pool. 2 | # TYPE inference_pool_average_kv_cache_utilization gauge 3 | inference_pool_average_kv_cache_utilization{name="p1"} 0.3 4 | -------------------------------------------------------------------------------- /pkg/epp/metrics/testdata/running_requests_metrics: -------------------------------------------------------------------------------- 1 | # HELP inference_objective_running_requests [ALPHA] Inference objective number of running requests in each model. 2 | # TYPE inference_objective_running_requests gauge 3 | inference_objective_running_requests{model_name="m1"} 1 4 | inference_objective_running_requests{model_name="m2"} 1 5 | -------------------------------------------------------------------------------- /site-src/_includes/infobj.md: -------------------------------------------------------------------------------- 1 | ??? example "Experimental" 2 | 3 | This project is still in an alpha state and breaking changes may occur in the future. 4 | 5 | This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get an Inference Gateway up and running! 
6 | -------------------------------------------------------------------------------- /site-src/_includes/intro.md: -------------------------------------------------------------------------------- 1 | ??? example "Experimental" 2 | 3 | This project is still in an alpha state and breaking changes may occur in the future. 4 | 5 | This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get an Inference Gateway up and running! 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: needs-triage 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | 12 | **What would you like to be added**: 13 | 14 | **Why is this needed**: 15 | -------------------------------------------------------------------------------- /benchmarking/inference-perf/templates/configmap.yaml: -------------------------------------------------------------------------------- 1 | # inference-perf/templates/configmap.yaml 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: {{ include "inference-perf.fullname" . }}-config 6 | labels: 7 | {{- include "inference-perf.labels" . | nindent 4 }} 8 | data: 9 | config.yml: | 10 | {{- toYaml .Values.config | nindent 4 }} -------------------------------------------------------------------------------- /OWNERS_ALIASES: -------------------------------------------------------------------------------- 1 | # See the OWNERS docs: https://git.k8s.io/community/contributors/guide/owners.md 2 | # This file should be kept in sync with k/org. 
3 | 4 | aliases: 5 | gateway-api-inference-extension-maintainers: 6 | - ahg-g 7 | - danehans 8 | - nirrozenbaum 9 | - kfswain 10 | 11 | gateway-api-inference-extension-reviewers: 12 | - elevran 13 | - liu-cong 14 | - robscott 15 | -------------------------------------------------------------------------------- /config/charts/inferencepool/templates/_validations.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | common validations 3 | */}} 4 | {{- define "gateway-api-inference-extension.validations.inferencepool.common" -}} 5 | {{- if or (empty $.Values.inferencePool.modelServers) (not $.Values.inferencePool.modelServers.matchLabels) }} 6 | {{- fail ".Values.inferencePool.modelServers.matchLabels is required" }} 7 | {{- end }} 8 | {{- end -}} 9 | -------------------------------------------------------------------------------- /test/testdata/client.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | annotations: 5 | labels: 6 | app: curl 7 | name: curl 8 | spec: 9 | containers: 10 | - command: 11 | - tail 12 | - -f 13 | - /dev/null 14 | image: curlimages/curl:7.83.1 15 | imagePullPolicy: IfNotPresent 16 | name: curl 17 | restartPolicy: Never 18 | schedulerName: default-scheduler 19 | -------------------------------------------------------------------------------- /site-src/_includes/model-server-sim.md: -------------------------------------------------------------------------------- 1 | === "vLLM Simulator Model Server" 2 | 3 | This option uses the [vLLM simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) to simulate a backend model server. 4 | This setup uses the least amount of compute resources, does not require GPUs, and is ideal for test/dev environments. 5 | 6 | To deploy the vLLM simulator, run the following command. 
7 | -------------------------------------------------------------------------------- /config/charts/body-based-routing/values.yaml: -------------------------------------------------------------------------------- 1 | bbr: 2 | name: body-based-router 3 | replicas: 1 4 | image: 5 | name: bbr 6 | hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension 7 | tag: main 8 | pullPolicy: Always 9 | port: 9004 10 | healthCheckPort: 9005 11 | 12 | provider: 13 | name: none 14 | 15 | inferenceGateway: 16 | name: inference-gateway 17 | -------------------------------------------------------------------------------- /benchmarking/inference-perf/templates/secret.yaml: -------------------------------------------------------------------------------- 1 | # inference-perf/templates/secret.yaml 2 | {{- if .Values.hfToken }} 3 | apiVersion: v1 4 | kind: Secret 5 | metadata: 6 | name: {{ include "inference-perf.hfSecret" . }} 7 | labels: 8 | {{- include "inference-perf.labels" . | nindent 4 }} 9 | type: Opaque 10 | stringData: 11 | {{ include "inference-perf.hfKey" . }}: {{ .Values.hfToken | quote }} 12 | {{- end }} 13 | -------------------------------------------------------------------------------- /docs/proposals/README.md: -------------------------------------------------------------------------------- 1 | # Proposals Best Practices 2 | 3 | 4 | ## Naming 5 | The directory of the proposal should lead with a 4-digit PR number (will move to 5,6,... should our PR count get that high), followed by kebab-cased title. The PR number is not known until the PR is cut, so development can use a placeholder, ex. XXXX-my-proposal. PR number is used b/c it is unique & chronological, allowing the default ordering of proposals to follow the timeline of development. 
-------------------------------------------------------------------------------- /pkg/epp/metrics/testdata/request_total_metric: -------------------------------------------------------------------------------- 1 | # HELP inference_objective_request_total [ALPHA] Counter of inference objective requests broken out for each model and target model. 2 | # TYPE inference_objective_request_total counter 3 | inference_objective_request_total{model_name="m10", target_model_name="t10"} 2 4 | inference_objective_request_total{model_name="m10", target_model_name="t11"} 1 5 | inference_objective_request_total{model_name="m20", target_model_name="t20"} 1 6 | -------------------------------------------------------------------------------- /site-src/_includes/test.md: -------------------------------------------------------------------------------- 1 | ### Try it out 2 | 3 | Wait until the gateway is ready. 4 | 5 | ```bash 6 | IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}') 7 | PORT=80 8 | 9 | curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ 10 | "model": "food-review-1", 11 | "prompt": "Write as if you were a critic: San Francisco", 12 | "max_tokens": 100, 13 | "temperature": 0 14 | }' 15 | ``` 16 | -------------------------------------------------------------------------------- /config/manifests/gateway/nginxgatewayfabric/httproute.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: HTTPRoute 3 | metadata: 4 | name: llm-route 5 | namespace: default 6 | spec: 7 | parentRefs: 8 | - name: inference-gateway 9 | rules: 10 | - matches: 11 | - path: 12 | type: PathPrefix 13 | value: / 14 | backendRefs: 15 | - group: inference.networking.k8s.io 16 | kind: InferencePool 17 | name: vllm-llama3-8b-instruct 18 | 19 | -------------------------------------------------------------------------------- 
/benchmarking/inference-perf/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /config/charts/inferencepool/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /config/manifests/gateway/envoyaigateway/gateway.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: GatewayClass 3 | metadata: 4 | name: envoy-ai-gateway 5 | spec: 6 | controllerName: gateway.envoyproxy.io/gatewayclass-controller 7 | --- 8 | apiVersion: gateway.networking.k8s.io/v1 9 | kind: Gateway 10 | metadata: 11 | name: inference-gateway 12 | spec: 13 | gatewayClassName: envoy-ai-gateway 14 | listeners: 15 | - name: http 16 | protocol: HTTP 17 | port: 80 18 | -------------------------------------------------------------------------------- 
/config/charts/body-based-routing/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /config/manifests/gateway/gke/httproute.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: HTTPRoute 3 | metadata: 4 | name: llm-route 5 | spec: 6 | parentRefs: 7 | - group: gateway.networking.k8s.io 8 | kind: Gateway 9 | name: inference-gateway 10 | rules: 11 | - backendRefs: 12 | - group: inference.networking.k8s.io 13 | kind: InferencePool 14 | name: vllm-llama3-8b-instruct 15 | matches: 16 | - path: 17 | type: PathPrefix 18 | value: / 19 | -------------------------------------------------------------------------------- /site-src/_includes/model-server-gpu.md: -------------------------------------------------------------------------------- 1 | === "GPU-Based Model Server" 2 | 3 | For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas as needed. 4 | Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). 5 | Ensure that the token grants access to this model. 6 | 7 | Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. 
8 | -------------------------------------------------------------------------------- /site-src/enhancements/overview.md: -------------------------------------------------------------------------------- 1 | # Inference Gateway Proposal process 2 | 3 | Our current proposal process is intentionally lightweight. If you have a proposal you are interested in sharing, please follow these steps: 4 | 5 | 1. Cut an issue or bring a topic to the weekly meeting! 6 | 2. Assuming a positive signal, or if more context is needed, please add a proposal, following the style and naming conventions shown here: https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals -------------------------------------------------------------------------------- /config/manifests/gateway/istio/httproute.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: HTTPRoute 3 | metadata: 4 | name: llm-route 5 | spec: 6 | parentRefs: 7 | - group: gateway.networking.k8s.io 8 | kind: Gateway 9 | name: inference-gateway 10 | rules: 11 | - backendRefs: 12 | - group: inference.networking.k8s.io 13 | kind: InferencePool 14 | name: vllm-llama3-8b-instruct 15 | matches: 16 | - path: 17 | type: PathPrefix 18 | value: / 19 | timeouts: 20 | request: 300s 21 | -------------------------------------------------------------------------------- /config/manifests/gateway/kgateway/httproute.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: HTTPRoute 3 | metadata: 4 | name: llm-route 5 | spec: 6 | parentRefs: 7 | - group: gateway.networking.k8s.io 8 | kind: Gateway 9 | name: inference-gateway 10 | rules: 11 | - backendRefs: 12 | - group: inference.networking.k8s.io 13 | kind: InferencePool 14 | name: vllm-llama3-8b-instruct 15 | matches: 16 | - path: 17 | type: PathPrefix 18 | value: / 19 | timeouts: 20 | request: 300s 
21 | -------------------------------------------------------------------------------- /config/manifests/gateway/envoyaigateway/httproute.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: HTTPRoute 3 | metadata: 4 | name: llm-route 5 | spec: 6 | parentRefs: 7 | - group: gateway.networking.k8s.io 8 | kind: Gateway 9 | name: inference-gateway 10 | rules: 11 | - backendRefs: 12 | - group: inference.networking.k8s.io 13 | kind: InferencePool 14 | name: vllm-llama3-8b-instruct 15 | matches: 16 | - path: 17 | type: PathPrefix 18 | value: / 19 | timeouts: 20 | request: 300s 21 | -------------------------------------------------------------------------------- /.github/workflows/non-main-gatekeeper.yml: -------------------------------------------------------------------------------- 1 | name: Label non-main PRs 2 | 3 | on: 4 | pull_request: 5 | types: [opened, edited, synchronize, reopened] 6 | 7 | jobs: 8 | add-label: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Add labels when base branch is not main 12 | if: github.event.pull_request.base.ref != 'main' 13 | uses: actions-ecosystem/action-add-labels@v1 14 | with: 15 | github_token: ${{ secrets.GITHUB_TOKEN }} 16 | labels: | 17 | do-not-merge/hold 18 | do-not-merge/cherry-pick-not-approved 19 | -------------------------------------------------------------------------------- /pkg/epp/metrics/testdata/request_error_total_metric: -------------------------------------------------------------------------------- 1 | # HELP inference_objective_request_error_total [ALPHA] Counter of inference objective requests errors broken out for each model and target model. 
2 | # TYPE inference_objective_request_error_total counter 3 | inference_objective_request_error_total{error_code="Internal", model_name="m10",target_model_name="t10"} 2 4 | inference_objective_request_error_total{error_code="ModelServerError", model_name="m10",target_model_name="t11"} 1 5 | inference_objective_request_error_total{error_code="InferencePoolResourceExhausted", model_name="m20",target_model_name="t20"} 1 6 | -------------------------------------------------------------------------------- /pkg/epp/scheduling/framework/plugins/README.md: -------------------------------------------------------------------------------- 1 | # Scheduling Plugins 2 | 3 | This package contains the scheduling plugin implementations. 4 | 5 | Plugins are organized by the following rule. Follow this rule when adding a new 6 | plugin. 7 | 8 | ``` 9 | plugins/ 10 | |__ filter/(Plugins that implement the Filter interface only.) 11 | |__ scorer/ (Plugins that implement the Scorer interface only.) 12 | |__ picker/(Plugins that implement the Picker interface only.) 13 | |__ multi/ (Plugins that implement multiple plugin interfaces.) 14 | |____prefix/ (Prefix cache aware scheduling plugin.) 15 | ``` 16 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # Release Process 2 | 3 | The Kubernetes Template Project is released on an as-needed basis. The process is as follows: 4 | 5 | 1. Update `version/version.go` with the new semver tag 6 | 1. An issue is proposing a new release with a changelog since the last release 7 | 1. All [OWNERS](OWNERS) must LGTM this release 8 | 1. An OWNER runs `git tag -s $VERSION` and inserts the changelog and pushes the tag with `git push $VERSION` 9 | 1. The release issue is closed 10 | 1. 
An announcement email is sent to `dev@kubernetes.io` with the subject `[ANNOUNCE] kubernetes-template-project $VERSION is released` -------------------------------------------------------------------------------- /crd-ref-docs.yaml: -------------------------------------------------------------------------------- 1 | # This file contains configuration for our reference docs generation. For more 2 | # information about the possible configuration, refer to 3 | # https://github.com/elastic/crd-ref-docs. 4 | 5 | processor: 6 | ignoreTypes: 7 | - "(InferencePool|InferenceObjective|InferencePoolImport)List$" 8 | # RE2 regular expressions describing type fields that should be excluded from the generated documentation. 9 | ignoreFields: 10 | - "TypeMeta$" 11 | 12 | render: 13 | # Version of Kubernetes to use when generating links to Kubernetes API documentation. 14 | kubernetesVersion: 1.31 15 | -------------------------------------------------------------------------------- /tools/dynamic-lora-sidecar/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim-buster AS test 2 | 3 | WORKDIR /dynamic-lora-reconciler-test 4 | COPY requirements.txt . 5 | COPY sidecar/* ./ 6 | RUN pip install -r requirements.txt 7 | RUN python -m unittest discover || exit 1 8 | 9 | FROM python:3.10-slim-buster 10 | 11 | WORKDIR /dynamic-lora-reconciler 12 | 13 | RUN python3 -m venv /opt/venv 14 | 15 | ENV PATH="/opt/venv/bin:$PATH" 16 | 17 | RUN pip install --upgrade pip 18 | COPY requirements.txt . 
19 | RUN pip install --no-cache-dir -r requirements.txt 20 | 21 | COPY sidecar/* ./ 22 | 23 | CMD ["python", "sidecar.py"] -------------------------------------------------------------------------------- /config/crd/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | # This file is for teaching kustomize how to substitute name and namespace reference in CRD 2 | nameReference: 3 | - kind: Service 4 | version: v1 5 | fieldSpecs: 6 | - kind: CustomResourceDefinition 7 | version: v1 8 | group: apiextensions.k8s.io 9 | path: spec/conversion/webhook/clientConfig/service/name 10 | 11 | namespace: 12 | - kind: CustomResourceDefinition 13 | version: v1 14 | group: apiextensions.k8s.io 15 | path: spec/conversion/webhook/clientConfig/service/namespace 16 | create: false 17 | 18 | varReference: 19 | - path: metadata/annotations 20 | -------------------------------------------------------------------------------- /site-src/_includes/prereqs.md: -------------------------------------------------------------------------------- 1 | A cluster with: 2 | 3 | - Support for one of the three most recent Kubernetes minor [releases](https://kubernetes.io/releases/). 4 | - Support for services of type `LoadBalancer`. For kind clusters, follow [this guide](https://kind.sigs.k8s.io/docs/user/loadbalancer) 5 | to get services of type LoadBalancer working. 6 | - Support for [sidecar containers](https://kubernetes.io/docs/concepts/workloads/pods/sidecar-containers/) (enabled by default since Kubernetes v1.29) 7 | to run the model server deployment. 8 | 9 | Tooling: 10 | 11 | - [Helm](https://helm.sh/docs/intro/install/) installed. 
12 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Maintain dependencies for go 4 | - package-ecosystem: "gomod" 5 | directory: "/" 6 | schedule: 7 | interval: "weekly" 8 | labels: 9 | - "area/dependency" 10 | - "ok-to-test" 11 | - "release-note-none" 12 | groups: 13 | kubernetes: 14 | patterns: 15 | - "k8s.io/*" 16 | ignore: 17 | # Ignore major and minor versions for dependencies updates 18 | # Allow patches and security updates. 19 | - dependency-name: k8s.io/* 20 | update-types: ["version-update:semver-major", "version-update:semver-minor"] 21 | -------------------------------------------------------------------------------- /test/testdata/configloader_1_test.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: inference.networking.x-k8s.io/v1alpha1 2 | kind: EndpointPickerConfig 3 | plugins: 4 | - name: test1 5 | type: test-one 6 | parameters: 7 | threshold: 10 8 | - name: profileHandler 9 | type: test-profile-handler 10 | - type: test-two 11 | parameters: 12 | blockSize: 32 13 | - name: testPicker 14 | type: test-picker 15 | schedulingProfiles: 16 | - name: default 17 | plugins: 18 | - pluginRef: test1 19 | - pluginRef: test-two 20 | weight: 50 21 | - pluginRef: testPicker 22 | featureGates: 23 | - dataLayer 24 | saturationDetector: 25 | metricsStalenessThreshold: 150ms 26 | -------------------------------------------------------------------------------- /SECURITY_CONTACTS: -------------------------------------------------------------------------------- 1 | # Defined below are the security contacts for this repo. 2 | # 3 | # They are the contact point for the Security Response Committee to reach out 4 | # to for triaging and handling of incoming issues. 
5 | # 6 | # The below names agree to abide by the 7 | # [Embargo Policy](https://git.k8s.io/security/private-distributors-list.md#embargo-policy) 8 | # and will be removed and replaced if they violate that agreement. 9 | # 10 | # DO NOT REPORT SECURITY VULNERABILITIES DIRECTLY TO THESE NAMES, FOLLOW THE 11 | # INSTRUCTIONS AT https://kubernetes.io/security/ 12 | 13 | ArangoGutierrez 14 | Jeffwan 15 | SergeyKanzhelev 16 | terrytangyuan 17 | -------------------------------------------------------------------------------- /pkg/bbr/README.md: -------------------------------------------------------------------------------- 1 | # Body-Based Routing 2 | This package provides an extension that can be deployed to write the `model` 3 | HTTP body parameter as a header (X-Gateway-Model-Name) so as to enable routing capabilities on the 4 | model name. 5 | 6 | As per OpenAI spec, it is standard for the model name to be included in the 7 | body of the HTTP request. However, most implementations do not support routing 8 | based on the request body. This extension helps bridge that gap for clients. 9 | This extension works by parsing the request body. If it finds a `model` parameter in the 10 | request body, it will copy the value of that parameter into a request header. 11 | -------------------------------------------------------------------------------- /api/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package api 18 | -------------------------------------------------------------------------------- /apix/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package apix 18 | -------------------------------------------------------------------------------- /hack/boilerplate/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright YEAR The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | -------------------------------------------------------------------------------- /hack/boilerplate/boilerplate.generatego.txt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | -------------------------------------------------------------------------------- /hack/boilerplate/boilerplate.py.txt: -------------------------------------------------------------------------------- 1 | # Copyright YEAR The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /hack/boilerplate/boilerplate.sh.txt: -------------------------------------------------------------------------------- 1 | # Copyright YEAR The Kubernetes Authors. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /config/charts/inferencepool/templates/epp-sa-token-secret.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.inferenceExtension.monitoring.prometheus.enabled .Values.inferenceExtension.monitoring.prometheus.auth.enabled (ne (lower .Values.provider.name) "gke") }} 2 | apiVersion: v1 3 | kind: Secret 4 | metadata: 5 | name: {{ .Values.inferenceExtension.monitoring.prometheus.auth.secretName }} 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} 9 | annotations: 10 | kubernetes.io/service-account.name: {{ include "gateway-api-inference-extension.name" . }} 11 | type: kubernetes.io/service-account-token 12 | {{- end }} -------------------------------------------------------------------------------- /tools/simulations/llm_ig_simulation/src/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | bin/* 8 | Dockerfile.cross 9 | artifacts 10 | latencypredictor/__pycache__ 11 | 12 | # Test binary, built with `go test -c` 13 | *.test 14 | 15 | # Output of the go coverage tool, specifically when used with LiteIDE 16 | *.out 17 | 18 | # Go workspace file 19 | go.work 20 | go.work.sum 21 | 22 | # Kubernetes Generated files - skip generated files, except for vendored files 23 | !vendor/**/zz_generated.* 24 | 25 | # editor and IDE paraphernalia 26 | .idea 27 | .vscode 28 | *.swp 29 | *.swo 30 | *~ 31 | 32 | # generated docs 33 | site 34 | 35 | # MacOS generated files 36 | **/.DS_Store 37 | -------------------------------------------------------------------------------- /conformance/reports/v1.0.2/gateway/nginx-nginx-gateway-fabric/inference-v2.2.0-report.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | date: "2025-10-28T14:18:58Z" 3 | gatewayAPIChannel: experimental 4 | gatewayAPIVersion: v1.3.0 5 | implementation: 6 | contact: 7 | - https://github.com/nginx/nginx-gateway-fabric/discussions/new/choose 8 | organization: nginx 9 | project: nginx-gateway-fabric 10 | url: https://github.com/nginx/nginx-gateway-fabric 11 | version: 2.2.0 12 | kind: ConformanceReport 13 | 
mode: default 14 | profiles: 15 | - core: 16 | result: success 17 | statistics: 18 | Failed: 0 19 | Passed: 9 20 | Skipped: 0 21 | name: Gateway 22 | summary: Core tests succeeded. -------------------------------------------------------------------------------- /conformance/tests/httproute_invalid_inferencepool_ref.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: HTTPRoute 3 | metadata: 4 | name: httproute-to-non-existent-pool 5 | namespace: inference-conformance-app-backend 6 | spec: 7 | parentRefs: 8 | - group: gateway.networking.k8s.io 9 | kind: Gateway 10 | name: conformance-primary 11 | namespace: inference-conformance-infra 12 | sectionName: http 13 | rules: 14 | - backendRefs: 15 | - group: inference.networking.k8s.io 16 | kind: InferencePool 17 | name: non-existent-inference-pool # Intentionally Non-Existing 18 | matches: 19 | - path: 20 | type: PathPrefix 21 | value: /test-non-existent-pool 22 | -------------------------------------------------------------------------------- /hack/mkdocs/image/requirements.txt: -------------------------------------------------------------------------------- 1 | # required for mkdocs-core 2 | jinja2~=3.0 3 | # mkdocs 2.4.1 requires Markdown < 3.4.0 4 | # https://github.com/kubernetes-sigs/gateway-api/pull/1671#issuecomment-1400586465 5 | markdown~=3.7 6 | mkdocs~=1.6 7 | mkdocs-material-extensions~=1.3 8 | pygments~=2.16 9 | pymdown-extensions~=10.2 10 | 11 | # Requirements for plugins 12 | babel~=2.10 13 | colorama~=0.4 14 | paginate~=0.5 15 | regex>=2022.4 16 | requests~=2.26 17 | 18 | # mkdocs + mkdocs plugins 19 | mkdocs==1.6.1 20 | mkdocs-awesome-pages-plugin==2.9.3 21 | mkdocs-macros-plugin==1.2.0 22 | mkdocs-material==9.5.36 23 | mkdocs-material-extensions==1.3.1 24 | mkdocs-redirects==1.2.1 25 | mkdocs-mermaid2-plugin==1.1.1 26 | -------------------------------------------------------------------------------- 
/conformance/tests/gateway_following_epp_routing.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: HTTPRoute 3 | metadata: 4 | name: httproute-for-primary-gw 5 | namespace: inference-conformance-app-backend 6 | spec: 7 | parentRefs: 8 | - group: gateway.networking.k8s.io 9 | kind: Gateway 10 | name: conformance-primary 11 | namespace: inference-conformance-infra 12 | sectionName: http 13 | hostnames: 14 | - "primary.example.com" 15 | rules: 16 | - backendRefs: 17 | - group: inference.networking.k8s.io 18 | kind: InferencePool 19 | name: primary-inference-pool 20 | matches: 21 | - path: 22 | type: PathPrefix 23 | value: /primary-gateway-test 24 | -------------------------------------------------------------------------------- /conformance/tests/gateway_following_epp_routing_dp.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: HTTPRoute 3 | metadata: 4 | name: httproute-for-primary-gw-dp 5 | namespace: inference-conformance-app-backend 6 | spec: 7 | parentRefs: 8 | - group: gateway.networking.k8s.io 9 | kind: Gateway 10 | name: conformance-primary 11 | namespace: inference-conformance-infra 12 | sectionName: http 13 | hostnames: 14 | - "primary.example.com" 15 | rules: 16 | - backendRefs: 17 | - group: inference.networking.k8s.io 18 | kind: InferencePool 19 | name: dp-inference-pool 20 | matches: 21 | - path: 22 | type: PathPrefix 23 | value: /primary-gateway-dp-test 24 | -------------------------------------------------------------------------------- /conformance/reports/v0.5.1/gateway/agentgateway/inference-v0.7.2-report.yaml: -------------------------------------------------------------------------------- 1 | GatewayAPIInferenceExtensionVersion: v0.5.1 2 | apiVersion: gateway.networking.k8s.io/v1 3 | date: "2025-08-06T17:50:20-07:00" 4 | gatewayAPIChannel: experimental 5 | 
gatewayAPIVersion: v1.3.0 6 | implementation: 7 | contact: 8 | - github.com/agentgateway/agentgateway/issues/new/choose 9 | organization: agentgateway 10 | project: agentgateway 11 | url: http://agentgateway.dev/ 12 | version: v0.7.2 13 | kind: ConformanceReport 14 | mode: default 15 | profiles: 16 | - core: 17 | result: success 18 | statistics: 19 | Failed: 0 20 | Passed: 9 21 | Skipped: 0 22 | name: Gateway 23 | summary: Core tests succeeded. 24 | -------------------------------------------------------------------------------- /conformance/reports/v0.5.1/gateway/kgateway/inference-v2.0.4-report.yaml: -------------------------------------------------------------------------------- 1 | GatewayAPIInferenceExtensionVersion: v0.5.1 2 | apiVersion: gateway.networking.k8s.io/v1 3 | date: "2025-08-06T15:05:42-07:00" 4 | gatewayAPIChannel: experimental 5 | gatewayAPIVersion: v1.3.0 6 | implementation: 7 | contact: 8 | - github.com/kgateway-dev/kgateway/issues/new/choose 9 | organization: kgateway-dev 10 | project: kgateway 11 | url: github.com/kgateway-dev/kgateway 12 | version: v2.0.4 13 | kind: ConformanceReport 14 | mode: default 15 | profiles: 16 | - core: 17 | result: success 18 | statistics: 19 | Failed: 0 20 | Passed: 9 21 | Skipped: 0 22 | name: Gateway 23 | summary: Core tests succeeded. 
24 | -------------------------------------------------------------------------------- /conformance/reports/v1.0.2/gateway/kgateway/inference-v2.1.0-report.yaml: -------------------------------------------------------------------------------- 1 | GatewayAPIInferenceExtensionVersion: v1.0.2 2 | apiVersion: gateway.networking.k8s.io/v1 3 | date: "2025-10-27T13:11:40-07:00" 4 | gatewayAPIChannel: experimental 5 | gatewayAPIVersion: v1.4.0 6 | implementation: 7 | contact: 8 | - github.com/kgateway-dev/kgateway/issues/new/choose 9 | organization: kgateway-dev 10 | project: kgateway 11 | url: github.com/kgateway-dev/kgateway 12 | version: v2.1.1 13 | kind: ConformanceReport 14 | mode: default 15 | profiles: 16 | - core: 17 | result: success 18 | statistics: 19 | Failed: 0 20 | Passed: 9 21 | Skipped: 0 22 | name: Gateway 23 | summary: Core tests succeeded. 24 | -------------------------------------------------------------------------------- /conformance/reports/v0.5.1/gateway/envoy-ai-gateway/aigw-latest-report.yaml: -------------------------------------------------------------------------------- 1 | GatewayAPIInferenceExtensionVersion: v0.5.1 2 | apiVersion: gateway.networking.k8s.io/v1 3 | date: "2025-08-15T14:10:31-09:00" 4 | gatewayAPIChannel: experimental 5 | gatewayAPIVersion: v1.3.0 6 | implementation: 7 | contact: 8 | - github.com/envoyproxy/ai-gateway/issues/new/choose 9 | organization: envoyproxy 10 | project: envoy-ai-gateway 11 | url: github.com/envoyproxy/ai-gateway 12 | version: latest 13 | kind: ConformanceReport 14 | mode: default 15 | profiles: 16 | - core: 17 | result: success 18 | statistics: 19 | Failed: 0 20 | Passed: 9 21 | Skipped: 0 22 | name: Gateway 23 | summary: Core tests succeeded. 
24 | -------------------------------------------------------------------------------- /.github/workflows/kal.yml: -------------------------------------------------------------------------------- 1 | name: PR golangci-lint 2 | 3 | on: 4 | pull_request: 5 | types: [opened, edited, synchronize, reopened] 6 | 7 | # Remove all permissions from GITHUB_TOKEN except metadata. 8 | permissions: {} 9 | 10 | jobs: 11 | golangci: 12 | name: kube-api-lint 13 | runs-on: ubuntu-latest 14 | strategy: 15 | fail-fast: false 16 | steps: 17 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # tag=v4.2.2 18 | name: Checkout code 19 | with: 20 | persist-credentials: false 21 | - name: Set up Go 22 | uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # tag=v5.5.0 23 | - name: Run API Linter 24 | run: make api-lint -------------------------------------------------------------------------------- /conformance/reports/v0.5.0/gateway/gke-gateway/standard-v1.32.4-rxlb-gateway-report.yaml: -------------------------------------------------------------------------------- 1 | GatewayAPIInferenceExtensionVersion: v0.5.0 2 | apiVersion: gateway.networking.k8s.io/v1 3 | date: "2025-07-21T22:52:10Z" 4 | gatewayAPIChannel: standard 5 | gatewayAPIVersion: v1.2.1 6 | implementation: 7 | contact: 8 | - gke-gateway-dev@google.com 9 | organization: GKE 10 | project: gke-gateway 11 | url: https://cloud.google.com/kubernetes-engine/docs/concepts/gateway-api 12 | version: 1.32.4-gke.1415000 13 | kind: ConformanceReport 14 | mode: default 15 | profiles: 16 | - core: 17 | result: success 18 | statistics: 19 | Failed: 0 20 | Passed: 9 21 | Skipped: 0 22 | name: Gateway 23 | summary: Core tests succeeded. 
24 | -------------------------------------------------------------------------------- /conformance/reports/v0.5.1/gateway/agentgateway/README.md: -------------------------------------------------------------------------------- 1 | # Agentgateway (with kgateway) 2 | 3 | ## Table of Contents 4 | 5 | | Extension Version Tested | Profile Tested | Implementation Version | Mode | Report | 6 | |--------------------------|----------------|------------------------|---------|----------------------------------------------------------------------------| 7 | | v0.5.1 | Gateway | v0.7.2 | default | [v0.7.2 report](./inference-v0.7.2-report.yaml) | 8 | 9 | ## Reproduce 10 | 11 | From the [kgateway repository](https://github.com/kgateway-dev/kgateway/): `CONFORMANCE_GATEWAY_CLASS=agentgateway make gie-conformance`. 12 | -------------------------------------------------------------------------------- /conformance/reports/v0.5.1/gateway/kubvernor/kubvernor-inference-conformance-output-0.1.1.yaml: -------------------------------------------------------------------------------- 1 | GatewayAPIInferenceExtensionVersion: v0.5.1 2 | apiVersion: gateway.networking.k8s.io/v1 3 | date: "2025-08-25T15:42:29+01:00" 4 | gatewayAPIChannel: standard 5 | gatewayAPIVersion: v1.2.1 6 | implementation: 7 | contact: 8 | - https://github.com/kubvernor/kubvernor 9 | organization: kubvernor 10 | project: kubvernor 11 | url: https://github.com/kubvernor/kubvernor 12 | version: 0.1.1 13 | kind: ConformanceReport 14 | mode: default 15 | profiles: 16 | - core: 17 | result: success 18 | statistics: 19 | Failed: 0 20 | Passed: 9 21 | Skipped: 0 22 | name: Gateway 23 | summary: Core tests succeeded. 
24 | -------------------------------------------------------------------------------- /conformance/reports/v0.4.0/gateway/istio/1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml: -------------------------------------------------------------------------------- 1 | GatewayAPIInferenceExtensionVersion: v0.4.0 2 | apiVersion: gateway.networking.k8s.io/v1 3 | date: "2025-07-23T14:20:45+02:00" 4 | gatewayAPIChannel: standard 5 | gatewayAPIVersion: v1.3.0 6 | implementation: 7 | contact: 8 | - '@istio/maintainers' 9 | organization: istio 10 | project: istio 11 | url: https://istio.io 12 | version: 1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91 13 | kind: ConformanceReport 14 | mode: default 15 | profiles: 16 | - core: 17 | result: success 18 | statistics: 19 | Failed: 0 20 | Passed: 9 21 | Skipped: 0 22 | name: Gateway 23 | summary: Core tests succeeded. 24 | -------------------------------------------------------------------------------- /conformance/reports/v0.5.0/gateway/istio/1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml: -------------------------------------------------------------------------------- 1 | GatewayAPIInferenceExtensionVersion: v0.5.0 2 | apiVersion: gateway.networking.k8s.io/v1 3 | date: "2025-07-23T14:31:41+02:00" 4 | gatewayAPIChannel: standard 5 | gatewayAPIVersion: v1.3.0 6 | implementation: 7 | contact: 8 | - '@istio/maintainers' 9 | organization: istio 10 | project: istio 11 | url: https://istio.io 12 | version: 1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91 13 | kind: ConformanceReport 14 | mode: default 15 | profiles: 16 | - core: 17 | result: success 18 | statistics: 19 | Failed: 0 20 | Passed: 9 21 | Skipped: 0 22 | name: Gateway 23 | summary: Core tests succeeded. 
24 | -------------------------------------------------------------------------------- /pkg/epp/util/logging/logging_const.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package logging 18 | 19 | const ( 20 | DEFAULT = 2 21 | VERBOSE = 3 22 | DEBUG = 4 23 | TRACE = 5 24 | ) 25 | -------------------------------------------------------------------------------- /bbr.Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile has specific requirement to put this ARG at the beginning: 2 | # https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact 3 | ARG BUILDER_IMAGE=golang:1.24 4 | ARG BASE_IMAGE=gcr.io/distroless/static:nonroot 5 | 6 | ## Multistage build 7 | FROM ${BUILDER_IMAGE} AS builder 8 | ENV CGO_ENABLED=0 9 | ENV GOOS=linux 10 | ENV GOARCH=amd64 11 | 12 | # Dependencies 13 | WORKDIR /src 14 | COPY go.mod go.sum ./ 15 | RUN go mod download 16 | 17 | # Sources 18 | COPY cmd/bbr ./cmd 19 | COPY pkg ./pkg 20 | COPY internal ./internal 21 | COPY api ./api 22 | WORKDIR /src/cmd 23 | RUN go build -o /bbr 24 | 25 | ## Multistage deploy 26 | FROM ${BASE_IMAGE} 27 | 28 | WORKDIR / 29 | COPY --from=builder /bbr /bbr 30 | 31 | ENTRYPOINT ["/bbr"] 32 | 
-------------------------------------------------------------------------------- /conformance/tests/epp_unavailable_fail_open.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: HTTPRoute 3 | metadata: 4 | name: httproute-for-failopen-pool-gw 5 | namespace: inference-conformance-app-backend 6 | spec: 7 | parentRefs: 8 | - group: gateway.networking.k8s.io 9 | kind: Gateway 10 | name: conformance-secondary 11 | namespace: inference-conformance-infra 12 | sectionName: http 13 | hostnames: 14 | - "secondary.example.com" 15 | rules: 16 | - backendRefs: 17 | - group: inference.networking.k8s.io 18 | kind: InferencePool 19 | name: secondary-inference-pool # Use secondary-inferencePool because it has failureMode set to failOpen 20 | matches: 21 | - path: 22 | type: PathPrefix 23 | value: /failopen-pool-test 24 | -------------------------------------------------------------------------------- /site-src/_includes/bbr.md: -------------------------------------------------------------------------------- 1 | ### Deploy the Body Based Router Extension (Optional) 2 | 3 | This guide has shown how to get started with serving a single base model type per L7 URL path. If, after this exercise, you wish to try model-aware routing, in which more than one base model is served at the same L7 URL path, you will need the (optional) Body Based Routing (BBR) extension, which is described in a separate section of the documentation: [`Serving Multiple GenAI Models`](serve-multiple-genai-models.md). To do so, retain the setup you have deployed so far from this guide and move on to the additional steps described in [that guide](serve-multiple-genai-models.md); otherwise, move on to the following section to clean up your setup. 
4 | -------------------------------------------------------------------------------- /conformance/reports/v0.5.1/gateway/kubvernor/README.md: -------------------------------------------------------------------------------- 1 | # Kubvernor Rust Gateway 2 | 3 | ## Table of Contents 4 | 5 | | Extension Version Tested | Profile Tested | Implementation Version | Mode | Report | 6 | |--------------------------|----------------|------------------------|---------|-----------------------------------------------------------------------| 7 | | v0.5.1 | Gateway | [0.1.1](https://github.com/kubvernor/kubvernor/releases/tag/0.1.1) | default | [Conformance report](./kubvernor-inference-conformance-output-0.1.1.yaml) | 8 | 9 | ## Reproduce 10 | 11 | To reproduce the Kubvernor conformance report, follow the [README](https://github.com/kubvernor/kubvernor/blob/0.1.1/README.md). 12 | 13 | -------------------------------------------------------------------------------- /config/manifests/inferenceobjective.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: inference.networking.x-k8s.io/v1alpha2 2 | kind: InferenceObjective 3 | metadata: 4 | name: food-review 5 | spec: 6 | priority: 1 7 | poolRef: 8 | group: inference.networking.k8s.io 9 | name: vllm-llama3-8b-instruct 10 | --- 11 | apiVersion: inference.networking.x-k8s.io/v1alpha2 12 | kind: InferenceObjective 13 | metadata: 14 | name: base-model 15 | spec: 16 | priority: 2 17 | poolRef: 18 | group: inference.networking.k8s.io 19 | name: vllm-llama3-8b-instruct 20 | --- 21 | apiVersion: inference.networking.x-k8s.io/v1alpha2 22 | kind: InferenceObjective 23 | metadata: 24 | name: base-model-cpu 25 | spec: 26 | priority: 2 27 | poolRef: 28 | group: inference.networking.k8s.io 29 | name: vllm-llama3-8b-instruct 30 | -------------------------------------------------------------------------------- /client-go/clientset/versioned/fake/doc.go: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by client-gen. DO NOT EDIT. 18 | 19 | // This package has the automatically generated fake clientset. 20 | package fake 21 | -------------------------------------------------------------------------------- /pkg/epp/backend/pod.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package backend 18 | 19 | import ( 20 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer" 21 | ) 22 | 23 | type Pod = datalayer.PodInfo 24 | -------------------------------------------------------------------------------- /client-go/clientset/versioned/typed/api/v1/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by client-gen. DO NOT EDIT. 18 | 19 | // This package has the automatically generated typed clients. 20 | package v1 21 | -------------------------------------------------------------------------------- /client-go/clientset/versioned/typed/api/v1/fake/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | // Code generated by client-gen. DO NOT EDIT. 18 | 19 | // Package fake has the automatically generated clients. 20 | package fake 21 | -------------------------------------------------------------------------------- /client-go/clientset/versioned/typed/api/v1/generated_expansion.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by client-gen. DO NOT EDIT. 18 | 19 | package v1 20 | 21 | type InferencePoolExpansion interface{} 22 | -------------------------------------------------------------------------------- /hack/referencer.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2024 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package internal 18 | 19 | import ( 20 | // Keep a reference to the code generators so they are not removed by `go mod tidy` 21 | _ "k8s.io/code-generator" 22 | ) 23 | -------------------------------------------------------------------------------- /client-go/clientset/versioned/typed/apix/v1alpha1/fake/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by client-gen. DO NOT EDIT. 18 | 19 | // Package fake has the automatically generated clients. 20 | package fake 21 | -------------------------------------------------------------------------------- /client-go/clientset/versioned/typed/apix/v1alpha2/fake/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by client-gen. DO NOT EDIT. 18 | 19 | // Package fake has the automatically generated clients. 20 | package fake 21 | -------------------------------------------------------------------------------- /conformance/tests/inferencepool_accepted.yaml: -------------------------------------------------------------------------------- 1 | # --- HTTPRoute Definition --- 2 | apiVersion: gateway.networking.k8s.io/v1 3 | kind: HTTPRoute 4 | metadata: 5 | name: httproute-for-inferencepool-accepted 6 | namespace: inference-conformance-app-backend 7 | spec: 8 | parentRefs: 9 | - group: gateway.networking.k8s.io 10 | kind: Gateway 11 | name: conformance-primary 12 | namespace: inference-conformance-infra 13 | sectionName: http 14 | rules: 15 | - backendRefs: 16 | - group: inference.networking.k8s.io 17 | kind: InferencePool 18 | name: primary-inference-pool 19 | # namespace: inference-conformance-app-backend - is omitted since it is in the same namespace as HTTPRoute 20 | matches: 21 | - path: 22 | type: PathPrefix 23 | value: /accepted-pool-test 24 | -------------------------------------------------------------------------------- /client-go/clientset/versioned/scheme/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by client-gen. DO NOT EDIT. 18 | 19 | // This package contains the scheme of the automatically generated clientset. 20 | package scheme 21 | -------------------------------------------------------------------------------- /client-go/clientset/versioned/typed/apix/v1alpha1/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by client-gen. DO NOT EDIT. 18 | 19 | // This package has the automatically generated typed clients. 20 | package v1alpha1 21 | -------------------------------------------------------------------------------- /client-go/clientset/versioned/typed/apix/v1alpha2/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by client-gen. DO NOT EDIT. 18 | 19 | // This package has the automatically generated typed clients. 20 | package v1alpha2 21 | -------------------------------------------------------------------------------- /client-go/clientset/versioned/typed/apix/v1alpha1/generated_expansion.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by client-gen. DO NOT EDIT. 18 | 19 | package v1alpha1 20 | 21 | type InferencePoolImportExpansion interface{} 22 | -------------------------------------------------------------------------------- /config/charts/inferencepool/templates/epp-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "gateway-api-inference-extension.name" . 
}} 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} 8 | spec: 9 | selector: 10 | {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 4 }} 11 | ports: 12 | - name: grpc-ext-proc 13 | protocol: TCP 14 | port: {{ .Values.inferenceExtension.extProcPort | default 9002 }} 15 | - name: http-metrics 16 | protocol: TCP 17 | port: {{ .Values.inferenceExtension.metricsPort | default 9090 }} 18 | {{- with .Values.inferenceExtension.extraServicePorts }} 19 | {{- toYaml . | nindent 4 }} 20 | {{- end }} 21 | type: ClusterIP 22 | -------------------------------------------------------------------------------- /conformance/reports/v0.5.1/gateway/ack-gateway/v1.4.0-apsara.3-gateway-report.yaml: -------------------------------------------------------------------------------- 1 | GatewayAPIInferenceExtensionVersion: v0.5.1 2 | apiVersion: gateway.networking.k8s.io/v1 3 | date: "2025-08-18T18:15:11+08:00" 4 | gatewayAPIChannel: experimental 5 | gatewayAPIVersion: v1.3.0 6 | implementation: 7 | contact: 8 | - https://smartservice.console.aliyun.com/service/create-ticket 9 | organization: AlibabaCloud 10 | project: ack-gateway-with-inference-extension 11 | url: https://www.alibabacloud.com/help/en/cs/user-guide/gateway-with-inference-extension-overview 12 | version: v1.4.0-apsara.3 13 | kind: ConformanceReport 14 | mode: default 15 | profiles: 16 | - core: 17 | result: success 18 | statistics: 19 | Failed: 0 20 | Passed: 9 21 | Skipped: 0 22 | name: Gateway 23 | summary: Core tests succeeded. -------------------------------------------------------------------------------- /pkg/epp/util/request/sheddable.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package request 18 | 19 | // IsSheddable determines if a request is considered sheddable based on its priority. 20 | func IsSheddable(priority int) bool { 21 | return priority < 0 22 | } 23 | -------------------------------------------------------------------------------- /latencypredictor/Dockerfile-training: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.11-slim 3 | 4 | # Set the working directory in the container 5 | WORKDIR /app 6 | 7 | # Copy the requirements file and install dependencies 8 | # (It's good practice to manage dependencies in a requirements.txt file) 9 | 10 | 11 | RUN apt-get update && apt-get install -y \ 12 | libgomp1 \ 13 | && rm -rf /var/lib/apt/lists/* 14 | 15 | 16 | COPY requirements.txt . 17 | RUN pip install --no-cache-dir -r requirements.txt 18 | 19 | # Copy the rest of the application code 20 | COPY . . 
21 | 22 | # Expose the port the app runs on 23 | EXPOSE 8000 24 | 25 | # Command to run the application using uvicorn 26 | # We use 0.0.0.0 to bind to all network interfaces inside the container 27 | CMD ["uvicorn", "training_server:app", "--host", "0.0.0.0", "--port", "8000"] -------------------------------------------------------------------------------- /latencypredictor/Dockerfile-prediction: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.11-slim 3 | 4 | # Set the working directory in the container 5 | WORKDIR /app 6 | 7 | # Copy the requirements file and install dependencies 8 | # (It's good practice to manage dependencies in a requirements.txt file) 9 | 10 | 11 | RUN apt-get update && apt-get install -y \ 12 | libgomp1 \ 13 | && rm -rf /var/lib/apt/lists/* 14 | 15 | COPY requirements.txt . 16 | RUN pip install --no-cache-dir -r requirements.txt 17 | 18 | # Copy the rest of the application code 19 | COPY . . 20 | 21 | # Expose the port the app runs on 22 | EXPOSE 8001 23 | 24 | # Command to run the application using uvicorn 25 | # We use 0.0.0.0 to bind to all network interfaces inside the container 26 | CMD ["uvicorn", "prediction_server:app", "--host", "0.0.0.0", "--port", "8001"] 27 | -------------------------------------------------------------------------------- /apix/v1alpha1/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package v1alpha1 contains API Schema definitions for the 18 | // inference.networking.x-k8s.io API group. 19 | // 20 | // +kubebuilder:object:generate=true 21 | // +groupName=inference.networking.x-k8s.io 22 | package v1alpha1 23 | -------------------------------------------------------------------------------- /apix/config/v1alpha1/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package v1alpha1 contains API Schema definitions for the 18 | // inference.networking.x-k8s.io API group. 
19 | // 20 | // +kubebuilder:object:generate=true 21 | // +groupName=inference.networking.x-k8s.io 22 | package v1alpha1 23 | -------------------------------------------------------------------------------- /hack/mkdocs/image/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copyright 2019 The Kubernetes Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -o errexit 18 | set -o pipefail 19 | 20 | CMD=$1 21 | 22 | if [ "$CMD" == "build" ]; 23 | then 24 | mkdocs build 25 | exit 0; 26 | fi 27 | 28 | mkdocs serve --dev-addr=0.0.0.0:3000 --livereload -------------------------------------------------------------------------------- /conformance/tests/inferencepool_multiple_rules_different_pools.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: gateway.networking.k8s.io/v1 3 | kind: HTTPRoute 4 | metadata: 5 | name: httproute-multiple-rules-different-pools 6 | namespace: inference-conformance-app-backend 7 | spec: 8 | parentRefs: 9 | - name: conformance-primary 10 | namespace: inference-conformance-infra 11 | rules: 12 | - matches: 13 | - path: 14 | type: PathPrefix 15 | value: /primary 16 | backendRefs: 17 | - name: primary-inference-pool 18 | kind: InferencePool 19 | group: inference.networking.k8s.io 20 | - matches: 21 | - path: 22 | type: PathPrefix 23 | value: /secondary 24 | 
backendRefs: 25 | - name: secondary-inference-pool 26 | kind: InferencePool 27 | group: inference.networking.k8s.io 28 | -------------------------------------------------------------------------------- /site-src/concepts/roles-and-personas.md: -------------------------------------------------------------------------------- 1 | # Roles and Personas 2 | 3 | Before diving into the details of the API, descriptions of the personas these APIs were designed for will help convey the thought process of the API design. 4 | 5 | ## Inference Platform Admin 6 | 7 | The Inference Platform Admin creates and manages the infrastructure necessary to run LLM workloads, including handling Ops for: 8 | 9 | - Hardware 10 | - Model Server 11 | - Base Model 12 | - Resource Allocation for Workloads 13 | - Gateway configuration 14 | - etc. 15 | 16 | ## Inference Workload Owner 17 | 18 | An Inference Workload Owner persona owns and manages one or many Generative AI Workloads (LLM-focused *currently*). This includes: 19 | 20 | - Defining priority 21 | - Managing fine-tunes 22 | - LoRA Adapters 23 | - System Prompts 24 | - Prompt Cache 25 | - etc. 26 | - Managing rollout of adapters 27 | -------------------------------------------------------------------------------- /hack/mkdocs/image/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM python:3.13-alpine 16 | 17 | COPY requirements.txt /requirements.txt 18 | RUN pip install -r /requirements.txt 19 | 20 | WORKDIR /docs 21 | 22 | EXPOSE 3000 23 | 24 | COPY entrypoint.sh / 25 | 26 | ENTRYPOINT ["/entrypoint.sh"] -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Report a bug you encountered 4 | title: '' 5 | labels: kind/bug, needs-triage 6 | assignees: '' 7 | 8 | --- 9 | 10 | 14 | 15 | **What happened**: 16 | 17 | **What you expected to happen**: 18 | 19 | **How to reproduce it (as minimally and precisely as possible)**: 20 | 21 | **Anything else we need to know?**: 22 | 23 | **Environment**: 24 | - Kubernetes version (use `kubectl version`): 25 | - Inference extension version (use `git describe --tags --dirty --always`): 26 | - Cloud provider or hardware configuration: 27 | - Install tools: 28 | - Others: 29 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Security Announcements 4 | 5 | Join the [kubernetes-security-announce] group for security and vulnerability announcements. 6 | 7 | ## Reporting a Vulnerability 8 | 9 | Instructions for reporting a vulnerability can be found on the 10 | [Kubernetes Security and Disclosure Information] page. 11 | 12 | ## Supported Versions 13 | 14 | Information about supported Kubernetes versions can be found on the 15 | [Kubernetes version and version skew support policy] page on the Kubernetes website. 
16 | 17 | [kubernetes-security-announce]: https://groups.google.com/forum/#!forum/kubernetes-security-announce 18 | [Kubernetes version and version skew support policy]: https://kubernetes.io/docs/setup/release/version-skew-policy/#supported-versions 19 | [Kubernetes Security and Disclosure Information]: https://kubernetes.io/docs/reference/issues-security/security/#report-a-vulnerability 20 | -------------------------------------------------------------------------------- /api/v1/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package v1 contains API Schema definitions for the 18 | // inference.networking.k8s.io API group. 19 | // 20 | // +k8s:openapi-gen=true 21 | // +kubebuilder:object:generate=true 22 | // +groupName=inference.networking.k8s.io 23 | // +groupGoName=Inference 24 | package v1 25 | -------------------------------------------------------------------------------- /conformance/embed.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package conformance 18 | 19 | import "embed" 20 | 21 | // Manifests embeds the contents of the conformance/resources directory making 22 | // the YAML files within them available to the test suite at runtime. 23 | // 24 | //go:embed resources tests/* 25 | var Manifests embed.FS 26 | -------------------------------------------------------------------------------- /client-go/clientset/versioned/typed/apix/v1alpha2/generated_expansion.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by client-gen. DO NOT EDIT. 
18 | 19 | package v1alpha2 20 | 21 | type InferenceModelRewriteExpansion interface{} 22 | 23 | type InferenceObjectiveExpansion interface{} 24 | 25 | type InferencePoolExpansion interface{} 26 | -------------------------------------------------------------------------------- /pkg/epp/datalayer/metrics/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package metrics 18 | 19 | import ( 20 | "reflect" 21 | 22 | dto "github.com/prometheus/client_model/go" 23 | ) 24 | 25 | type PrometheusMetricMap = map[string]*dto.MetricFamily 26 | 27 | var ( 28 | PrometheusMetricType = reflect.TypeOf(PrometheusMetricMap{}) 29 | ) 30 | -------------------------------------------------------------------------------- /apix/v1alpha2/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package v1alpha2 contains API Schema definitions for the 18 | // inference.networking.x-k8s.io API group. 19 | // 20 | // +k8s:openapi-gen=true 21 | // +kubebuilder:object:generate=true 22 | // +groupName=inference.networking.x-k8s.io 23 | // +groupGoName=XInference 24 | package v1alpha2 25 | -------------------------------------------------------------------------------- /config/observability/prometheus/values.yaml: -------------------------------------------------------------------------------- 1 | serviceAccounts: 2 | server: 3 | create: false 4 | name: inference-gateway-sa-metrics-reader 5 | 6 | extraScrapeConfigs: | 7 | - job_name: 'inference-extension-epp' 8 | authorization: 9 | credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token 10 | scrape_interval: 5s 11 | kubernetes_sd_configs: 12 | - role: endpoints 13 | relabel_configs: 14 | - source_labels: [__meta_kubernetes_service_name] 15 | action: keep 16 | regex: .*-epp$ 17 | - source_labels: [__meta_kubernetes_pod_container_port_number] 18 | action: keep 19 | regex: "9090" 20 | - job_name: vllm 21 | scrape_interval: 5s 22 | kubernetes_sd_configs: 23 | - role: pod 24 | relabel_configs: 25 | - source_labels: [__meta_kubernetes_pod_label_app] 26 | action: keep 27 | regex: vllm-llama3-8b-instruct 28 | -------------------------------------------------------------------------------- /cmd/bbr/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "os" 21 | 22 | ctrl "sigs.k8s.io/controller-runtime" 23 | "sigs.k8s.io/gateway-api-inference-extension/cmd/bbr/runner" 24 | ) 25 | 26 | func main() { 27 | if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil { 28 | os.Exit(1) 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /docs/proposals/0845-scheduler-architecture-proposal/examples/example.yaml: -------------------------------------------------------------------------------- 1 | #names are egregiously long, but attempting to describe custom logic within a name 2 | profileSelection: disagg-token-length 3 | schedulingResult: log-shadowbox-label-pd-result 4 | profiles: 5 | prefill: 6 | preschedule: 7 | - decode-prefix-cache-check 8 | filter: 9 | - is-prefill 10 | - has-required-accelerator 11 | score: 12 | - prefix-cache: 3 13 | - latency-scorer: 2 14 | selection: 15 | - best-score 16 | postschedule: 17 | - log-full-scores 18 | decode: 19 | filter: 20 | - is-decode 21 | score: 22 | - prefix-cache: 3 23 | - kv-cache-util: 5 24 | selection: 25 | - random-top-3 26 | shadowbox-decode: 27 | filter: 28 | - is-decode 29 | - is-tpu 30 | score: 31 | - prefix-cache-v2: 4 32 | - kv-cache-util: 1 33 | selection: 34 | - random-top-3 35 | -------------------------------------------------------------------------------- 
/latencypredictor/Dockerfile-test: -------------------------------------------------------------------------------- 1 | # Dockerfile-test 2 | FROM python:3.9-slim 3 | 4 | # Install system dependencies 5 | RUN apt-get update && apt-get install -y \ 6 | curl \ 7 | wget \ 8 | jq \ 9 | && rm -rf /var/lib/apt/lists/* 10 | 11 | # Set working directory 12 | WORKDIR /app 13 | 14 | # Copy requirements and install Python dependencies 15 | COPY requirements.txt . 16 | RUN pip install --no-cache-dir -r requirements.txt 17 | 18 | # Install additional testing dependencies 19 | RUN pip install --no-cache-dir \ 20 | pytest \ 21 | pytest-asyncio \ 22 | requests \ 23 | httpx \ 24 | aiohttp 25 | 26 | # Copy test files 27 | COPY test_dual_server_client.py . 28 | 29 | 30 | # Create test results directory 31 | RUN mkdir -p /test-results 32 | 33 | # Set environment variables 34 | ENV PYTHONPATH=/app 35 | ENV PYTHONUNBUFFERED=1 36 | 37 | # Default command runs the specific test 38 | CMD ["pytest", "-v", "-s", "test_dual_server_client.py"] -------------------------------------------------------------------------------- /tools/tools.go: -------------------------------------------------------------------------------- 1 | //go:build tools 2 | // +build tools 3 | 4 | /* 5 | Copyright 2025 The Kubernetes Authors. 6 | 7 | Licensed under the Apache License, Version 2.0 (the "License"); 8 | you may not use this file except in compliance with the License. 9 | You may obtain a copy of the License at 10 | 11 | http://www.apache.org/licenses/LICENSE-2.0 12 | 13 | Unless required by applicable law or agreed to in writing, software 14 | distributed under the License is distributed on an "AS IS" BASIS, 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | See the License for the specific language governing permissions and 17 | limitations under the License. 18 | */ 19 | 20 | // This package contains import references to packages required only for the 21 | // build process. 
22 | // https://github.com/golang/go/wiki/Modules#how-can-i-track-tool-dependencies-for-a-module 23 | package tools 24 | 25 | import ( 26 | _ "github.com/elastic/crd-ref-docs" 27 | ) 28 | -------------------------------------------------------------------------------- /conformance/tests/gateway_weighted_two_pools.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: gateway.networking.k8s.io/v1 2 | kind: HTTPRoute 3 | metadata: 4 | name: httproute-weighted-two-pools 5 | namespace: inference-conformance-app-backend 6 | spec: 7 | parentRefs: 8 | - group: gateway.networking.k8s.io 9 | kind: Gateway 10 | name: conformance-primary 11 | namespace: inference-conformance-infra 12 | sectionName: http 13 | hostnames: 14 | - "primary.example.com" 15 | rules: 16 | - matches: 17 | - path: 18 | type: PathPrefix 19 | value: /weighted-two-pools-test 20 | backendRefs: 21 | # 70% of traffic goes to the primary pool 22 | - group: inference.networking.k8s.io 23 | kind: InferencePool 24 | name: primary-inference-pool 25 | weight: 70 26 | # 30% of traffic goes to the secondary pool 27 | - group: inference.networking.k8s.io 28 | kind: InferencePool 29 | name: secondary-inference-pool 30 | weight: 30 31 | -------------------------------------------------------------------------------- /apix/v1alpha1/shared_types.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package v1alpha1 18 | 19 | // ExportAnnotationKey is the annotation key used to export an InferencePool. 20 | var ExportAnnotationKey = "inference.networking.x-k8s.io/export" 21 | 22 | // ExportAnnotationVal is the annotation value used to export an InferencePool 23 | // to all clusters. 24 | var ExportAnnotationVal = "ClusterSet" 25 | -------------------------------------------------------------------------------- /config/charts/inferencepool/templates/istio.yaml: -------------------------------------------------------------------------------- 1 | {{- if eq .Values.provider.name "istio" }} 2 | {{- /* Prefer .Values.provider.istio, fallback to legacy .Values.istio, then {} */ -}} 3 | {{- $provIstio := (index .Values "provider" "istio") -}} 4 | {{- $legacyIstio := .Values.istio -}} 5 | {{- $istio := coalesce $provIstio $legacyIstio (dict) -}} 6 | {{- $dr := (index $istio "destinationRule") | default (dict) -}} 7 | 8 | apiVersion: networking.istio.io/v1beta1 9 | kind: DestinationRule 10 | metadata: 11 | name: {{ include "gateway-api-inference-extension.name" . }} 12 | spec: 13 | host: {{ (index $dr "host") | default (printf "%s.%s.svc.cluster.local" (include "gateway-api-inference-extension.name" .) .Release.Namespace) }} 14 | trafficPolicy: 15 | tls: 16 | mode: SIMPLE 17 | insecureSkipVerify: true 18 | {{- with (index (index $dr "trafficPolicy") "connectionPool") }} 19 | connectionPool: 20 | {{- toYaml . 
| nindent 6 }} 21 | {{- end }} 22 | {{- end }} 23 | -------------------------------------------------------------------------------- /conformance/tests/inferencepool_invalid_epp_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: inference.networking.k8s.io/v1 2 | kind: InferencePool 3 | metadata: 4 | name: pool-with-invalid-epp 5 | namespace: inference-conformance-app-backend 6 | spec: 7 | selector: 8 | matchLabels: 9 | app: primary-inference-model-server 10 | targetPorts: 11 | - number: 3000 12 | endpointPickerRef: 13 | name: non-existent-epp-svc 14 | kind: Service 15 | port: 16 | number: 9002 17 | --- 18 | apiVersion: gateway.networking.k8s.io/v1 19 | kind: HTTPRoute 20 | metadata: 21 | name: httproute-for-invalid-epp-pool 22 | namespace: inference-conformance-app-backend 23 | spec: 24 | parentRefs: 25 | - name: conformance-primary 26 | namespace: inference-conformance-infra 27 | rules: 28 | - backendRefs: 29 | - name: pool-with-invalid-epp 30 | kind: InferencePool 31 | group: inference.networking.k8s.io 32 | matches: 33 | - path: 34 | type: PathPrefix 35 | value: /invalid-epp-test 36 | -------------------------------------------------------------------------------- /site-src/api-types/inferenceobjective.md: -------------------------------------------------------------------------------- 1 | # Inference Objective 2 | 3 | ??? example "Alpha since v1.0.0" 4 | 5 | The `InferenceObjective` resource is alpha and may have breaking changes in 6 | future releases of the API. 7 | 8 | ## Background 9 | 10 | The **InferenceObjective** API defines a set of serving objectives of the specific request it is associated with. This CRD currently houses only `Priority` but will be expanded to include fields such as SLO attainment. 
11 | 12 | ## Usage 13 | 14 | To associate a request to the InferencePool with a specific InferenceObjective, the system uses a specific header: `x-gateway-inference-objective` with the value of the header set to the InferenceObjective metadata name. So the calling client must set the header key/value on the request to associate the selected InferenceObjective. If no InferenceObjective is selected, default values are used. 15 | 16 | ## Spec 17 | 18 | The full spec of the InferenceObjective is defined [here](/reference/x-v1a2-spec/#inferenceobjective). 19 | -------------------------------------------------------------------------------- /site-src/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | /* Hide title in favor of logo */ 2 | .md-header__topic { 3 | display: none; 4 | } 5 | 6 | /* Use Kubernetes color as primary */ 7 | :root { 8 | --md-primary-fg-color: #326ce5; 9 | } 10 | 11 | /* Increase size of logo */ 12 | .md-header__button.md-logo img, .md-header__button.md-logo svg { 13 | height: 1.8rem; 14 | } 15 | 16 | /* Always show tabs, even on smaller screens */ 17 | @media screen and (max-width: 76.234375em) { 18 | .md-header__button.md-logo { 19 | display: block; 20 | } 21 | .md-tabs { 22 | display: block; 23 | } 24 | } 25 | 26 | /* Rounded search box + results */ 27 | .md-search__form { 28 | border-radius: .5rem; 29 | } 30 | 31 | [data-md-toggle=search]:checked~.md-header .md-search__form { 32 | border-radius: .5rem .5rem 0 0; 33 | } 34 | [dir=ltr] .md-search__output { 35 | border-radius: 0 0 .5rem .5rem; 36 | } 37 | 38 | /* Center images */ 39 | img.center { 40 | display: block; 41 | margin: 20px auto; 42 | } 43 | -------------------------------------------------------------------------------- /pkg/epp/util/metrics/metrics.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package metrics 18 | 19 | import ( 20 | "fmt" 21 | 22 | compbasemetrics "k8s.io/component-base/metrics" 23 | ) 24 | 25 | // HelpMsgWithStability is a helper function to create a help message with stability level. 26 | func HelpMsgWithStability(msg string, stability compbasemetrics.StabilityLevel) string { 27 | return fmt.Sprintf("[%v] %v", stability, msg) 28 | } 29 | -------------------------------------------------------------------------------- /conformance/conformance_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package conformance 18 | 19 | import ( 20 | "testing" 21 | ) 22 | 23 | // TestConformance is the top-level function that runs the conformance tests. 
24 | // It calls the RunConformance function which sets up the suite and executes 25 | // the registered tests. 26 | func TestConformance(t *testing.T) { 27 | // RunConformance is defined in conformance.go 28 | RunConformance(t) 29 | } 30 | -------------------------------------------------------------------------------- /pkg/epp/util/logging/fatal.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package logging 18 | 19 | import ( 20 | "os" 21 | 22 | "github.com/go-logr/logr" 23 | ) 24 | 25 | // Fatal calls logger.Error followed by os.Exit(1). 26 | // 27 | // This is a utility function and should not be used in production code! 28 | func Fatal(logger logr.Logger, err error, msg string, keysAndValues ...any) { 29 | logger.Error(err, msg, keysAndValues...) 
30 | os.Exit(1) 31 | } 32 | -------------------------------------------------------------------------------- /test/testdata/metrics-rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: inference-gateway-metrics-reader 5 | rules: 6 | - nonResourceURLs: 7 | - /metrics 8 | verbs: 9 | - get 10 | --- 11 | apiVersion: v1 12 | kind: ServiceAccount 13 | metadata: 14 | name: inference-gateway-sa-metrics-reader 15 | namespace: $E2E_NS 16 | --- 17 | apiVersion: rbac.authorization.k8s.io/v1 18 | kind: ClusterRoleBinding 19 | metadata: 20 | name: inference-gateway-sa-metrics-reader-role-binding 21 | subjects: 22 | - kind: ServiceAccount 23 | name: inference-gateway-sa-metrics-reader 24 | namespace: $E2E_NS 25 | roleRef: 26 | kind: ClusterRole 27 | name: inference-gateway-metrics-reader 28 | apiGroup: rbac.authorization.k8s.io 29 | --- 30 | apiVersion: v1 31 | kind: Secret 32 | metadata: 33 | name: inference-gateway-sa-metrics-reader-secret 34 | namespace: $E2E_NS 35 | annotations: 36 | kubernetes.io/service-account.name: inference-gateway-sa-metrics-reader 37 | type: kubernetes.io/service-account-token -------------------------------------------------------------------------------- /tools/dynamic-lora-sidecar/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for dynamic-lora-sidecar 2 | 3 | PYTHON_VERSION := 3.10 4 | VENV_DIR := venv 5 | PYTHON := $(VENV_DIR)/bin/python 6 | PIP := $(VENV_DIR)/bin/pip 7 | 8 | .PHONY: help venv install test clean 9 | 10 | help: ## Show available targets 11 | @echo "Available targets:" 12 | @echo " venv - Create virtual environment" 13 | @echo " install - Install dependencies" 14 | @echo " test - Run unit tests" 15 | @echo " clean - Clean up virtual environment" 16 | 17 | venv: $(VENV_DIR)/bin/activate ## Create virtual environment 18 | 19 | 
$(VENV_DIR)/bin/activate: 20 | python$(PYTHON_VERSION) -m venv $(VENV_DIR) 21 | 22 | install: venv ## Install dependencies 23 | $(PIP) install --upgrade pip 24 | $(PIP) install -r requirements.txt 25 | 26 | test: install ## Run unit tests 27 | $(PYTHON) -m unittest discover -v -s sidecar 28 | 29 | clean: ## Clean up virtual environment 30 | rm -rf $(VENV_DIR) 31 | rm -rf .pytest_cache 32 | find . -name "*.pyc" -delete 33 | find . -name "__pycache__" -type d -exec rm -rf {} + 34 | -------------------------------------------------------------------------------- /client-go/listers/api/v1/expansion_generated.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by lister-gen. DO NOT EDIT. 18 | 19 | package v1 20 | 21 | // InferencePoolListerExpansion allows custom methods to be added to 22 | // InferencePoolLister. 23 | type InferencePoolListerExpansion interface{} 24 | 25 | // InferencePoolNamespaceListerExpansion allows custom methods to be added to 26 | // InferencePoolNamespaceLister. 
27 | type InferencePoolNamespaceListerExpansion interface{} 28 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile has specific requirement to put this ARG at the beginning: 2 | # https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact 3 | ARG BUILDER_IMAGE=golang:1.24 4 | ARG BASE_IMAGE=gcr.io/distroless/static:nonroot 5 | 6 | ## Multistage build 7 | FROM ${BUILDER_IMAGE} AS builder 8 | ENV CGO_ENABLED=0 9 | ENV GOOS=linux 10 | ENV GOARCH=amd64 11 | ARG COMMIT_SHA=unknown 12 | ARG BUILD_REF 13 | 14 | # Dependencies 15 | WORKDIR /src 16 | COPY go.mod go.sum ./ 17 | RUN go mod download 18 | 19 | # Sources 20 | COPY cmd/epp ./cmd/epp 21 | COPY pkg/common ./pkg/common 22 | COPY pkg/epp ./pkg/epp 23 | COPY internal ./internal 24 | COPY apix ./apix 25 | COPY api ./api 26 | COPY version ./version 27 | WORKDIR /src/cmd/epp 28 | RUN go build -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/version.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/version.BuildRef=${BUILD_REF}" -o /epp 29 | 30 | ## Multistage deploy 31 | FROM ${BASE_IMAGE} 32 | 33 | WORKDIR / 34 | COPY --from=builder /epp /epp 35 | 36 | ENTRYPOINT ["/epp"] 37 | -------------------------------------------------------------------------------- /pkg/epp/backend/metrics/metrics_state.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package metrics 18 | 19 | import ( 20 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datalayer" 21 | ) 22 | 23 | // NewMetricsState initializes a new MetricsState and returns its pointer. 24 | func NewMetricsState() *MetricsState { 25 | return datalayer.NewMetrics() 26 | } 27 | 28 | // MetricsState holds the latest state of the metrics that were scraped from a pod. 29 | type MetricsState = datalayer.Metrics 30 | -------------------------------------------------------------------------------- /pkg/epp/util/pod/pod.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package pod 18 | 19 | import ( 20 | corev1 "k8s.io/api/core/v1" 21 | ) 22 | 23 | func IsPodReady(pod *corev1.Pod) bool { 24 | if !pod.DeletionTimestamp.IsZero() { 25 | return false 26 | } 27 | for _, condition := range pod.Status.Conditions { 28 | if condition.Type == corev1.PodReady { 29 | if condition.Status == corev1.ConditionTrue { 30 | return true 31 | } 32 | break 33 | } 34 | } 35 | return false 36 | } 37 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | run: 3 | allow-parallel-runners: true 4 | linters: 5 | default: none 6 | enable: 7 | - copyloopvar 8 | - dupword 9 | - durationcheck 10 | - errcheck 11 | - fatcontext 12 | - ginkgolinter 13 | - goconst 14 | - gocritic 15 | - govet 16 | - ineffassign 17 | - loggercheck 18 | - makezero 19 | - misspell 20 | - nakedret 21 | - perfsprint 22 | - prealloc 23 | - revive 24 | - staticcheck 25 | - unconvert 26 | - unparam 27 | - unused 28 | settings: 29 | revive: 30 | rules: 31 | - name: comment-spacings 32 | exclusions: 33 | generated: lax 34 | presets: 35 | - comments 36 | - common-false-positives 37 | - legacy 38 | - std-error-handling 39 | paths: 40 | - bin 41 | - third_party$ 42 | - builtin$ 43 | - examples$ 44 | formatters: 45 | enable: 46 | - gofmt 47 | - goimports 48 | exclusions: 49 | generated: lax 50 | paths: 51 | - bin 52 | - third_party$ 53 | - builtin$ 54 | - examples$ 55 | -------------------------------------------------------------------------------- /pkg/epp/config/config.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
// Config is the configuration loaded from the text based configuration.
// It bundles the per-component configuration objects listed below.
type Config struct {
	// SchedulerConfig is the configuration handed to the scheduling package.
	SchedulerConfig *scheduling.SchedulerConfig
	// SaturationDetectorConfig is the configuration handed to the saturationdetector package.
	SaturationDetectorConfig *saturationdetector.Config
}

// FeatureConfig maps a string key to a boolean value.
// NOTE(review): presumably the key is a feature name and the value says whether
// that feature is enabled - confirm against the callers.
type FeatureConfig map[string]bool
4 | # More info: https://book.kubebuilder.io/reference/project-config.html 5 | domain: x-k8s.io 6 | layout: 7 | - go.kubebuilder.io/v4 8 | projectName: gateway-api-inference-extension 9 | repo: sigs.k8s.io/gateway-api-inference-extension 10 | resources: 11 | - api: 12 | crdVersion: v1 13 | namespaced: true 14 | domain: x-k8s.io 15 | group: inference 16 | kind: InferencePool 17 | path: sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1 18 | version: v1alpha1 19 | - api: 20 | crdVersion: v1 21 | namespaced: true 22 | domain: x-k8s.io 23 | group: inference 24 | kind: InferenceObjective 25 | path: sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1 26 | version: v1alpha1 27 | - api: 28 | crdVersion: v1 29 | namespaced: true 30 | domain: x-k8s.io 31 | group: inference 32 | kind: EndpointPickerConfig 33 | path: sigs.k8s.io/gateway-api-inference-extension/api/config/v1alpha1 34 | version: v1alpha1 35 | version: "3" 36 | -------------------------------------------------------------------------------- /client-go/listers/apix/v1alpha1/expansion_generated.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by lister-gen. DO NOT EDIT. 18 | 19 | package v1alpha1 20 | 21 | // InferencePoolImportListerExpansion allows custom methods to be added to 22 | // InferencePoolImportLister. 
23 | type InferencePoolImportListerExpansion interface{} 24 | 25 | // InferencePoolImportNamespaceListerExpansion allows custom methods to be added to 26 | // InferencePoolImportNamespaceLister. 27 | type InferencePoolImportNamespaceListerExpansion interface{} 28 | -------------------------------------------------------------------------------- /cmd/epp/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "os" 21 | 22 | ctrl "sigs.k8s.io/controller-runtime" 23 | 24 | "sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner" 25 | ) 26 | 27 | func main() { 28 | // For adding out-of-tree plugins to the plugins registry, use the following: 29 | // plugins.Register(my-out-of-tree-plugin-name, my-out-of-tree-plugin-factory-function) 30 | 31 | if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil { 32 | os.Exit(1) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /pkg/epp/plugins/shared_state.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
// separator is placed between the two parts of a TypedName when it is rendered
// as a string.
const separator = "/"

// TypedName is a utility struct providing a type and a name to plugins.
type TypedName struct {
	// Type is the type of a plugin.
	Type string
	// Name is the name of a plugin instance.
	Name string
}

// String renders the TypedName as "<name>/<type>".
//
// NOTE(review): the name comes first and the type second; confirm this order is
// the intended one, since callers may log or compare the rendered value.
func (tn TypedName) String() string {
	name, typ := tn.Name, tn.Type
	return name + separator + typ
}
// leaderElection decorates a manager.Runnable with a fixed answer to
// NeedLeaderElection. The embedded Runnable supplies every other method.
type leaderElection struct {
	manager.Runnable
	// needsLeaderElection is the value reported by NeedLeaderElection.
	needsLeaderElection bool
}

// LeaderElection wraps the given runnable to implement manager.LeaderElectionRunnable.
// needsLeaderElection is the value the wrapper returns from NeedLeaderElection.
func LeaderElection(runnable manager.Runnable, needsLeaderElection bool) manager.Runnable {
	return &leaderElection{
		Runnable:            runnable,
		needsLeaderElection: needsLeaderElection,
	}
}

// RequireLeaderElection wraps the given runnable, marking it as requiring leader election.
func RequireLeaderElection(runnable manager.Runnable) manager.Runnable {
	return LeaderElection(runnable, true)
}

// NoLeaderElection wraps the given runnable, marking it as not requiring leader election.
func NoLeaderElection(runnable manager.Runnable) manager.Runnable {
	return LeaderElection(runnable, false)
}

// NeedLeaderElection implements the manager.LeaderElectionRunnable interface.
// It returns the flag that was supplied when the wrapper was created.
func (r *leaderElection) NeedLeaderElection() bool {
	return r.needsLeaderElection
}
#!/bin/bash

# Copyright 2025 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Runs hack/boilerplate/boilerplate.py with this script's arguments and fails
# (exit status 1) if it prints anything, reporting each printed entry as a file
# with a wrong boilerplate header.

# Abort on command failure, on use of unset variables, and on failures inside pipelines.
set -o errexit
set -o nounset
set -o pipefail

# Repository root, derived from this script's location (the parent of its directory).
SCRIPT_ROOT=$(dirname "${BASH_SOURCE}")/..

boilerDir="${SCRIPT_ROOT}/hack/boilerplate"
boiler="${boilerDir}/boilerplate.py"

# All script arguments are forwarded to the checker. Its output is split on
# whitespace into array elements, one element per reported file.
# NOTE(review): because the command substitution is unquoted, a path containing
# whitespace would be split into several entries - confirm this cannot occur.
files_need_boilerplate=($(${boiler} "$@"))

# Run boilerplate check: any reported file is an error.
if [[ ${#files_need_boilerplate[@]} -gt 0 ]]; then
  for file in "${files_need_boilerplate[@]}"; do
    echo "Boilerplate header is wrong for: ${file}"
  done

  exit 1
fi
Inference Routing Extensions 19 | 20 | Conformance tests will verify that: 21 | 22 | * Extensions accept requests that match the protocol specified by this project 23 | * Extensions respond with routing guidance that matches the protocol specified 24 | by this project 25 | 26 | ## 3. Model Server Frameworks 27 | 28 | Conformance tests will verify that: 29 | 30 | * Frameworks serve the expected set of metrics using a format and path specified 31 | by this project 32 | -------------------------------------------------------------------------------- /config/manifests/vllm/sim-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: vllm-llama3-8b-instruct 5 | spec: 6 | replicas: 3 7 | selector: 8 | matchLabels: 9 | app: vllm-llama3-8b-instruct 10 | template: 11 | metadata: 12 | labels: 13 | app: vllm-llama3-8b-instruct 14 | spec: 15 | containers: 16 | - name: vllm-sim 17 | image: ghcr.io/llm-d/llm-d-inference-sim:v0.5.0 18 | imagePullPolicy: Always 19 | args: 20 | - --model 21 | - meta-llama/Llama-3.1-8B-Instruct 22 | - --port 23 | - "8000" 24 | - --max-loras 25 | - "2" 26 | - --lora-modules 27 | - '{"name": "food-review-1"}' 28 | env: 29 | - name: POD_NAME 30 | valueFrom: 31 | fieldRef: 32 | fieldPath: metadata.name 33 | - name: NAMESPACE 34 | valueFrom: 35 | fieldRef: 36 | fieldPath: metadata.namespace 37 | ports: 38 | - containerPort: 8000 39 | name: http 40 | protocol: TCP 41 | resources: 42 | requests: 43 | cpu: 10m 44 | -------------------------------------------------------------------------------- /pkg/epp/datalayer/mocks/ticker.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
// Ticker is a mock time source. Ticks are produced only when Tick is called
// and are delivered on the channel returned by Channel.
type Ticker struct {
	ch chan time.Time
}

// NewTicker returns a Ticker whose channel can buffer up to 10 pending ticks.
func NewTicker() *Ticker {
	t := new(Ticker)
	t.ch = make(chan time.Time, 10)
	return t
}

// Channel returns the receive-only channel on which ticks are delivered.
func (t *Ticker) Channel() <-chan time.Time {
	return t.ch
}

// Tick queues the current time on the channel without blocking. If the buffer
// is already full the tick is silently dropped.
func (t *Ticker) Tick() {
	now := time.Now()
	select {
	case t.ch <- now:
		// Tick queued for the receiver.
	default:
		// Buffer is full: drop the tick rather than block the caller.
	}
}

// Stop is a no-op; in particular it does not close the channel, so Tick remains
// usable after Stop.
func (t *Ticker) Stop() {}
2 | # TYPE inference_extension_scheduler_e2e_duration_seconds histogram 3 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.0001"} 0 4 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.0002"} 1 5 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.0005"} 1 6 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.001"} 2 7 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.002"} 3 8 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.005"} 4 9 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.01"} 5 10 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.02"} 6 11 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.05"} 7 12 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.1"} 8 13 | inference_extension_scheduler_e2e_duration_seconds_bucket{le="+Inf"} 9 14 | inference_extension_scheduler_e2e_duration_seconds_sum{} 0.2835 15 | inference_extension_scheduler_e2e_duration_seconds_count{} 9 16 | -------------------------------------------------------------------------------- /conformance/utils/assertions.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | // Package assertions contains custom assertion helper functions used within 18 | // the Gateway API Inference Extension conformance test suite. 19 | package assertions 20 | 21 | // TODO: Implement custom assertion functions specific to Inference Extension testing. 22 | // Examples might include: 23 | // - Asserting specific fields or structures within an inference API response body. 24 | // - Asserting specific metrics reported by mock model servers or EPPs. 25 | // - Asserting specific conditions or status fields unique to InferencePool or InferenceObjective. 26 | -------------------------------------------------------------------------------- /pkg/epp/scheduling/framework/weighted_scorer.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package framework 18 | 19 | // NewWeightedScorer initializes a new WeightedScorer and returns its pointer. 20 | func NewWeightedScorer(scorer Scorer, weight int) *WeightedScorer { 21 | return &WeightedScorer{ 22 | Scorer: scorer, 23 | weight: weight, 24 | } 25 | } 26 | 27 | // WeightedScorer is a struct that encapsulates a scorer with its weight. 28 | type WeightedScorer struct { 29 | Scorer 30 | weight int 31 | } 32 | 33 | // Weight returns the weight of the scorer. 
34 | func (s *WeightedScorer) Weight() int { 35 | return s.weight 36 | } 37 | -------------------------------------------------------------------------------- /config/charts/inferencepool/templates/epp-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "gateway-api-inference-extension.name" . }} 5 | namespace: {{ .Release.Namespace }} 6 | data: 7 | default-plugins.yaml: | 8 | apiVersion: inference.networking.x-k8s.io/v1alpha1 9 | kind: EndpointPickerConfig 10 | plugins: 11 | - type: queue-scorer 12 | - type: kv-cache-utilization-scorer 13 | - type: prefix-cache-scorer 14 | schedulingProfiles: 15 | - name: default 16 | plugins: 17 | - pluginRef: queue-scorer 18 | weight: 2 19 | - pluginRef: kv-cache-utilization-scorer 20 | weight: 2 21 | - pluginRef: prefix-cache-scorer 22 | weight: 3 23 | {{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }} 24 | {{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }} 25 | {{- end }} 26 | 27 | --- 28 | {{- if .Values.inferenceExtension.sidecar.enabled }} 29 | apiVersion: v1 30 | kind: ConfigMap 31 | metadata: 32 | name: {{ .Values.inferenceExtension.sidecar.configMap.name }} 33 | namespace: {{ .Release.Namespace }} 34 | data: 35 | {{- .Values.inferenceExtension.sidecar.configMap.data | toYaml | nindent 2 }} 36 | {{- end }} 37 | -------------------------------------------------------------------------------- /config/charts/body-based-routing/templates/bbr.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ .Values.bbr.name }} 5 | namespace: {{ .Release.Namespace }} 6 | spec: 7 | replicas: {{ .Values.bbr.replicas | default 1 }} 8 | selector: 9 | matchLabels: 10 | app: {{ .Values.bbr.name }} 11 | template: 12 | metadata: 13 | labels: 14 | app: {{ .Values.bbr.name }} 15 | spec: 16 | 
containers: 17 | - name: bbr 18 | image: {{ .Values.bbr.image.hub }}/{{ .Values.bbr.image.name }}:{{ .Values.bbr.image.tag }} 19 | imagePullPolicy: {{ .Values.bbr.image.pullPolicy | default "Always" }} 20 | args: 21 | - "--streaming" 22 | - "--v" 23 | - "3" 24 | ports: 25 | - containerPort: {{ .Values.bbr.port }} 26 | # health check 27 | - containerPort: {{ .Values.bbr.healthCheckPort }} 28 | --- 29 | apiVersion: v1 30 | kind: Service 31 | metadata: 32 | name: {{ .Values.bbr.name }} 33 | namespace: {{ .Release.Namespace }} 34 | spec: 35 | selector: 36 | app: {{ .Values.bbr.name }} 37 | ports: 38 | - protocol: TCP 39 | port: {{ .Values.bbr.port }} 40 | targetPort: {{ .Values.bbr.port }} 41 | appProtocol: HTTP2 42 | type: ClusterIP 43 | -------------------------------------------------------------------------------- /conformance/reports/v0.4.0/gateway/istio/README.md: -------------------------------------------------------------------------------- 1 | # istio (gateway Profile Conformance) - v0.4.0 2 | 3 | ## Test Results 4 | 5 | This directory contains conformance test results for Gateway API Inference Extension v0.4.0 testing against istio implementations using the gateway profile. 6 | 7 | | Extension Version Tested | Profile Tested | Implementation Version | Mode | Report | Status | 8 | |--------------------------|----------------|------------------------|---------|--------|--------| 9 | | v1.3.0 | Gateway | 1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91 | default | [./1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml](././1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml) | PASS | 10 | ## Running the Tests 11 | 12 | For instructions on how to reproduce these test results and run the conformance tests yourself, see the [istio Conformance Testing README](../../../../scripts/istio/README.md). 
13 | 14 | ## About This Version 15 | 16 | - **Extension Version**: v0.4.0 17 | - **Profile**: gateway 18 | - **Implementation**: istio 19 | - **Test Mode**: Default 20 | 21 | For detailed information about conformance testing, report generation, and requirements, see the [main conformance README](../../../../../README.md). 22 | -------------------------------------------------------------------------------- /conformance/reports/v0.5.0/gateway/istio/README.md: -------------------------------------------------------------------------------- 1 | # istio (gateway Profile Conformance) - v0.5.0 2 | 3 | ## Test Results 4 | 5 | This directory contains conformance test results for Gateway API Inference Extension v0.5.0 testing against istio implementations using the gateway profile. 6 | 7 | | Extension Version Tested | Profile Tested | Implementation Version | Mode | Report | Status | 8 | |--------------------------|----------------|------------------------|---------|--------|--------| 9 | | v1.3.0 | Gateway | 1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91 | default | [./1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml](././1.28-alpha.32ca03082f566513ad9b860f31e7745b0f68dc91-default-gateway-report.yaml) | PASS | 10 | ## Running the Tests 11 | 12 | For instructions on how to reproduce these test results and run the conformance tests yourself, see the [istio Conformance Testing README](../../../../scripts/istio/README.md). 13 | 14 | ## About This Version 15 | 16 | - **Extension Version**: v0.5.0 17 | - **Profile**: gateway 18 | - **Implementation**: istio 19 | - **Test Mode**: Default 20 | 21 | For detailed information about conformance testing, report generation, and requirements, see the [main conformance README](../../../../../README.md). 
// CommitSHA is the git hash of the latest commit in the build.
// It is empty unless a value is injected when the binary is linked.
var CommitSHA string

// BuildRef is the build ref from the _PULL_BASE_REF from cloud build trigger.
// It is empty unless a value is injected when the binary is linked.
var BuildRef string

// BundleVersionAnnotation is the annotation key used in the Gateway API inference extension CRDs to specify
// the installed Gateway API inference extension version.
const BundleVersionAnnotation = "inference.networking.k8s.io/bundle-version"

// BundleVersion is the value used for labeling the version of the gateway-api-inference-extension.
const BundleVersion = "main-dev"
}}-monitor 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} 9 | {{- with .Values.inferenceExtension.monitoring.prometheus.extraLabels }} 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | spec: 13 | endpoints: 14 | - interval: {{ .Values.inferenceExtension.monitoring.interval }} 15 | port: "http-metrics" 16 | path: "/metrics" 17 | {{- if .Values.inferenceExtension.monitoring.prometheus.auth.enabled }} 18 | authorization: 19 | credentials: 20 | key: token 21 | name: {{ .Values.inferenceExtension.monitoring.prometheus.auth.secretName }} 22 | {{- end }} 23 | jobLabel: {{ include "gateway-api-inference-extension.name" . }} 24 | namespaceSelector: 25 | matchNames: 26 | - {{ .Release.Namespace }} 27 | selector: 28 | matchLabels: 29 | {{- include "gateway-api-inference-extension.labels" . | nindent 6 }} 30 | {{- end }} 31 | -------------------------------------------------------------------------------- /conformance/tests/httproute_multiple_gateways_different_pools.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: gateway.networking.k8s.io/v1 3 | kind: HTTPRoute 4 | metadata: 5 | name: route-for-primary-gateway 6 | namespace: inference-conformance-app-backend 7 | spec: 8 | parentRefs: 9 | - kind: Gateway 10 | name: conformance-primary 11 | namespace: inference-conformance-infra 12 | hostnames: 13 | - "primary.example.com" 14 | rules: 15 | - backendRefs: 16 | - group: inference.networking.k8s.io 17 | kind: InferencePool 18 | name: primary-inference-pool 19 | matches: 20 | - path: 21 | type: PathPrefix 22 | value: /test-primary-gateway 23 | --- 24 | apiVersion: gateway.networking.k8s.io/v1 25 | kind: HTTPRoute 26 | metadata: 27 | name: route-for-secondary-gateway 28 | namespace: inference-conformance-app-backend 29 | spec: 30 | parentRefs: 31 | - kind: Gateway 32 | name: conformance-secondary 33 | namespace: inference-conformance-infra 
34 | hostnames: 35 | - "secondary.example.com" 36 | rules: 37 | - backendRefs: 38 | - group: inference.networking.k8s.io 39 | kind: InferencePool 40 | name: secondary-inference-pool 41 | matches: 42 | - path: 43 | type: PathPrefix 44 | value: /test-secondary-gateway 45 | -------------------------------------------------------------------------------- /pkg/epp/plugins/registry.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package plugins 18 | 19 | import ( 20 | "encoding/json" 21 | ) 22 | 23 | // FactoryFunc is the definition of the factory functions that are used to instantiate plugins 24 | // specified in a configuration. 25 | type FactoryFunc func(name string, parameters json.RawMessage, handle Handle) (Plugin, error) 26 | 27 | // Register is a static function that can be called to register plugin factory functions.
28 | func Register(pluginType string, factory FactoryFunc) { 29 | Registry[pluginType] = factory 30 | } 31 | 32 | // Registry is a mapping from plugin type to the FactoryFunc registered for it 33 | var Registry map[string]FactoryFunc = map[string]FactoryFunc{} 34 | -------------------------------------------------------------------------------- /config/crd/kustomization.yaml: -------------------------------------------------------------------------------- 1 | # This kustomization.yaml is not intended to be run by itself, 2 | # since it depends on service name and namespace that are out of this kustomize package. 3 | # It should be run by config/default 4 | resources: 5 | - bases/inference.networking.x-k8s.io_inferencepools.yaml 6 | - bases/inference.networking.x-k8s.io_inferenceobjectives.yaml 7 | - bases/inference.networking.x-k8s.io_inferencepoolimports.yaml 8 | - bases/inference.networking.k8s.io_inferencepools.yaml 9 | # +kubebuilder:scaffold:crdkustomizeresource 10 | 11 | patches: 12 | # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. 13 | # patches here are for enabling the conversion webhook for each CRD 14 | # +kubebuilder:scaffold:crdkustomizewebhookpatch 15 | 16 | # [CERTMANAGER] To enable cert-manager, uncomment all the sections with [CERTMANAGER] prefix. 17 | # patches here are for enabling the CA injection for each CRD 18 | #- path: patches/cainjection_in_inferencepools.yaml 19 | #- path: patches/cainjection_in_inferenceobjectives.yaml 20 | # +kubebuilder:scaffold:crdkustomizecainjectionpatch 21 | 22 | # [WEBHOOK] To enable webhook, uncomment the following section 23 | # the following config is for teaching kustomize how to do kustomization for CRDs.
24 | 25 | #configurations: 26 | #- kustomizeconfig.yaml 27 | -------------------------------------------------------------------------------- /pkg/epp/saturationdetector/config.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package saturationdetector 17 | 18 | import ( 19 | "time" 20 | ) 21 | 22 | // Default configuration values 23 | const ( 24 | // DefaultQueueDepthThreshold is the default backend waiting queue size threshold. 25 | DefaultQueueDepthThreshold = 5 26 | // DefaultKVCacheUtilThreshold is the default KV cache utilization (0.0 to 1.0) threshold. 27 | DefaultKVCacheUtilThreshold = 0.8 28 | // DefaultMetricsStalenessThreshold defines how old metrics can be before they 29 | // are considered stale. 30 | // Given the pod metrics refresh interval is 50ms, a threshold slightly above 31 | // that should be fine. 
32 | DefaultMetricsStalenessThreshold = 200 * time.Millisecond 33 | ) 34 | -------------------------------------------------------------------------------- /config/manifests/bbr-example/httproute_bbr.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: gateway.networking.k8s.io/v1 3 | kind: HTTPRoute 4 | metadata: 5 | name: llm-llama-route 6 | spec: 7 | parentRefs: 8 | - group: gateway.networking.k8s.io 9 | kind: Gateway 10 | name: inference-gateway 11 | rules: 12 | - backendRefs: 13 | - group: inference.networking.k8s.io 14 | kind: InferencePool 15 | name: vllm-llama3-8b-instruct 16 | matches: 17 | - path: 18 | type: PathPrefix 19 | value: / 20 | headers: 21 | - type: Exact 22 | name: X-Gateway-Model-Name 23 | value: 'meta-llama/Llama-3.1-8B-Instruct' 24 | timeouts: 25 | request: 300s 26 | --- 27 | apiVersion: gateway.networking.k8s.io/v1 28 | kind: HTTPRoute 29 | metadata: 30 | name: llm-phi4-route 31 | spec: 32 | parentRefs: 33 | - group: gateway.networking.k8s.io 34 | kind: Gateway 35 | name: inference-gateway 36 | rules: 37 | - backendRefs: 38 | - group: inference.networking.k8s.io 39 | kind: InferencePool 40 | name: vllm-phi4-mini-instruct 41 | matches: 42 | - path: 43 | type: PathPrefix 44 | value: / 45 | headers: 46 | - type: Exact 47 | name: X-Gateway-Model-Name 48 | value: 'microsoft/Phi-4-mini-instruct' 49 | timeouts: 50 | request: 300s 51 | --- 52 | -------------------------------------------------------------------------------- /pkg/epp/flowcontrol/contracts/doc.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package contracts defines the boundaries and service interfaces for the Flow Control system. 18 | // 19 | // Adhering to a "Ports and Adapters" (Hexagonal) architectural style, these interfaces decouple the core 20 | // `controller.FlowController` engine from its dependencies. They establish the required behaviors and system invariants 21 | // that concrete implementations must uphold. 22 | // 23 | // The primary contracts are: 24 | // 25 | // - `FlowRegistry`: The interface for the stateful control plane that manages the lifecycle of flows, queues, and 26 | // policies. 27 | // 28 | // - `SaturationDetector`: The interface for a component that provides real-time load signals. 29 | package contracts 30 | -------------------------------------------------------------------------------- /pkg/epp/metrics/testdata/prefix_indexer_hit_bytes_metric: -------------------------------------------------------------------------------- 1 | # HELP inference_extension_prefix_indexer_hit_bytes [ALPHA] Length of the prefix match in number of bytes in the cache lookup. 
2 | # TYPE inference_extension_prefix_indexer_hit_bytes histogram 3 | inference_extension_prefix_indexer_hit_bytes_bucket{le="0"} 2 4 | inference_extension_prefix_indexer_hit_bytes_bucket{le="16"} 5 5 | inference_extension_prefix_indexer_hit_bytes_bucket{le="32"} 5 6 | inference_extension_prefix_indexer_hit_bytes_bucket{le="64"} 6 7 | inference_extension_prefix_indexer_hit_bytes_bucket{le="128"} 6 8 | inference_extension_prefix_indexer_hit_bytes_bucket{le="256"} 6 9 | inference_extension_prefix_indexer_hit_bytes_bucket{le="512"} 6 10 | inference_extension_prefix_indexer_hit_bytes_bucket{le="1024"} 6 11 | inference_extension_prefix_indexer_hit_bytes_bucket{le="2048"} 6 12 | inference_extension_prefix_indexer_hit_bytes_bucket{le="4096"} 6 13 | inference_extension_prefix_indexer_hit_bytes_bucket{le="8192"} 6 14 | inference_extension_prefix_indexer_hit_bytes_bucket{le="16384"} 6 15 | inference_extension_prefix_indexer_hit_bytes_bucket{le="32768"} 6 16 | inference_extension_prefix_indexer_hit_bytes_bucket{le="65536"} 6 17 | inference_extension_prefix_indexer_hit_bytes_bucket{le="+Inf"} 6 18 | inference_extension_prefix_indexer_hit_bytes_sum 86 19 | inference_extension_prefix_indexer_hit_bytes_count 6 20 | -------------------------------------------------------------------------------- /pkg/epp/requestcontrol/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package requestcontrol 18 | 19 | // Response contains information from the response received to be passed to the Response requestcontrol plugins 20 | type Response struct { 21 | // RequestId is the Envoy generated Id for the request being processed 22 | RequestId string 23 | // Headers is a map of the response headers. Nil during body processing 24 | Headers map[string]string 25 | // Body is the body of the response; empty during header processing 26 | Body string 27 | // IsStreaming indicates whether or not the response is being streamed by the model 28 | IsStreaming bool 29 | // EndOfStream when true indicates that this invocation contains the last chunk of the response 30 | EndOfStream bool 31 | } 32 | -------------------------------------------------------------------------------- /pkg/epp/server/runserver_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package server_test 18 | 19 | import ( 20 | "testing" 21 | 22 | "sigs.k8s.io/controller-runtime/pkg/manager" 23 | 24 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" 25 | logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" 26 | ) 27 | 28 | func TestRunnable(t *testing.T) { 29 | // Make sure AsRunnable() does not use leader election. 30 | runner := server.NewDefaultExtProcServerRunner().AsRunnable(logutil.NewTestLogger()) 31 | r, ok := runner.(manager.LeaderElectionRunnable) 32 | if !ok { 33 | t.Fatal("runner is not LeaderElectionRunnable") 34 | } 35 | if r.NeedLeaderElection() { 36 | t.Error("runner returned NeedLeaderElection = true, expected false") 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /pkg/epp/util/logging/logger.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package logging 18 | 19 | import ( 20 | "context" 21 | 22 | "github.com/go-logr/logr" 23 | uberzap "go.uber.org/zap" 24 | "sigs.k8s.io/controller-runtime/pkg/log" 25 | "sigs.k8s.io/controller-runtime/pkg/log/zap" 26 | ) 27 | 28 | // NewTestLogger creates a new Zap logger using the dev mode. 
29 | func NewTestLogger() logr.Logger { 30 | return zap.New(zap.UseDevMode(true), zap.RawZapOpts(uberzap.AddCaller())) 31 | } 32 | 33 | // NewTestLoggerIntoContext creates a new Zap logger using the dev mode and inserts it into the given context. 34 | func NewTestLoggerIntoContext(ctx context.Context) context.Context { 35 | return log.IntoContext(ctx, zap.New(zap.UseDevMode(true), zap.RawZapOpts(uberzap.AddCaller()))) 36 | } 37 | -------------------------------------------------------------------------------- /pkg/epp/util/request/headers.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package request 18 | 19 | import ( 20 | "strings" 21 | 22 | extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" 23 | ) 24 | 25 | const ( 26 | RequestIdHeaderKey = "x-request-id" 27 | ) 28 | 29 | func ExtractHeaderValue(req *extProcPb.ProcessingRequest_RequestHeaders, headerKey string) string { 30 | // header key should be case insensitive 31 | headerKeyInLower := strings.ToLower(headerKey) 32 | if req != nil && req.RequestHeaders != nil && req.RequestHeaders.Headers != nil { 33 | for _, headerKv := range req.RequestHeaders.Headers.Headers { 34 | if strings.ToLower(headerKv.Key) == headerKeyInLower { 35 | return string(headerKv.RawValue) 36 | } 37 | } 38 | } 39 | return "" 40 | } 41 | -------------------------------------------------------------------------------- /tools/dashboards/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | This documentation provides instructions for setting up grafana dashboards to see metrics emitted from the inference extension and model servers. 4 | 5 | ## Requirements 6 | 7 | Please follow [metrics](https://gateway-api-inference-extension.sigs.k8s.io/guides/metrics-and-observability/) page to configure the proxy to enable all metrics. 8 | 9 | ## Load Inference Extension dashboard into Grafana 10 | 11 | Please follow [grafana instructions](https://grafana.com/docs/grafana/latest/dashboards/build-dashboards/import-dashboards/) to load the dashboard json. 12 | 13 | ## Configure Google Managed Prometheus as source for metrics 14 | 15 | If you run the inference gateway with [Google Managed Prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus), please follow the [instructions](https://cloud.google.com/stackdriver/docs/managed-prometheus/query) to configure Google Managed Prometheus as data source for the grafana dashboard. 
16 | 17 | ## Troubleshooting 18 | 19 | ### No data in graph 20 | 21 | Please set the `scrape_interval` in your Prometheus configuration to a value lower than `15s`; the `rate` function returns an empty result if data points are too far apart. See https://www.robustperception.io/what-range-should-i-use-with-rate/ for more details. 22 | 23 | Example: 24 | 25 | ``` 26 | global: 27 | scrape_interval: 5s 28 | ``` 29 | -------------------------------------------------------------------------------- /config/charts/body-based-routing/templates/gke.yaml: -------------------------------------------------------------------------------- 1 | {{- if eq .Values.provider.name "gke" }} 2 | --- 3 | kind: GCPRoutingExtension 4 | apiVersion: networking.gke.io/v1 5 | metadata: 6 | name: {{ .Values.bbr.name }} 7 | namespace: {{ .Release.Namespace }} 8 | spec: 9 | targetRefs: 10 | - group: "gateway.networking.k8s.io" 11 | kind: Gateway 12 | name: {{ .Values.inferenceGateway.name }} 13 | extensionChains: 14 | - name: chain1 15 | extensions: 16 | - name: ext1 17 | authority: "myext.com" 18 | timeout: 1s 19 | supportedEvents: 20 | - RequestHeaders 21 | - RequestBody 22 | - RequestTrailers 23 | requestBodySendMode: "FullDuplexStreamed" 24 | backendRef: 25 | group: "" 26 | kind: Service 27 | name: {{ .Values.bbr.name }} 28 | port: {{ .Values.bbr.port }} 29 | --- 30 | apiVersion: networking.gke.io/v1 31 | kind: HealthCheckPolicy 32 | metadata: 33 | name: bbr-healthcheck 34 | namespace: {{ .Release.Namespace }} 35 | spec: 36 | default: 37 | logConfig: 38 | enabled: true 39 | config: 40 | type: "GRPC" 41 | grpcHealthCheck: 42 | portSpecification: "USE_FIXED_PORT" 43 | port: {{ .Values.bbr.healthCheckPort }} 44 | targetRef: 45 | group: "" 46 | kind: Service 47 | name: {{ .Values.bbr.name }} 48 | namespace: {{ .Release.Namespace }} 49 | {{- end }} 50 | -------------------------------------------------------------------------------- /client-go/clientset/versioned/typed/api/v1/fake/fake_api_client.go:
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by client-gen. DO NOT EDIT. 18 | 19 | package fake 20 | 21 | import ( 22 | rest "k8s.io/client-go/rest" 23 | testing "k8s.io/client-go/testing" 24 | v1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1" 25 | ) 26 | 27 | type FakeInferenceV1 struct { 28 | *testing.Fake 29 | } 30 | 31 | func (c *FakeInferenceV1) InferencePools(namespace string) v1.InferencePoolInterface { 32 | return newFakeInferencePools(c, namespace) 33 | } 34 | 35 | // RESTClient returns a RESTClient that is used to communicate 36 | // with API server by this client implementation. 37 | func (c *FakeInferenceV1) RESTClient() rest.Interface { 38 | var ret *rest.RESTClient 39 | return ret 40 | } 41 | -------------------------------------------------------------------------------- /site-src/_includes/model-server.md: -------------------------------------------------------------------------------- 1 | Three options are supported for running the model server: 2 | 3 | 1. GPU-based model server. 4 | Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). 5 | 6 | 1. CPU-based model server (not using GPUs). 
7 | The sample uses the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct). 8 | 9 | 1. [vLLM Simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) model server (not using GPUs). 10 | The sample is configured to simulate the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model. 11 | 12 | Choose one of these options and follow the steps below. Please do not deploy more than one, as the deployments have the same name and will override each other. 13 | 14 | === "GPU-Based Model Server" 15 | 16 | For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/gpu-deployment.yaml` as needed. 17 | Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). Ensure that the token grants access to this model. 18 | 19 | Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. 20 | -------------------------------------------------------------------------------- /site-src/guides/epp-configuration/flags.md: -------------------------------------------------------------------------------- 1 | # EPP Configuration Flags 2 | 3 | This page documents selected configuration flags for the Endpoint Picker (EPP) binary. Most flags are self-explanatory via their `--help` descriptions; only flags with nuanced or non-obvious behavior are detailed here. 4 | 5 | ## --pool-namespace 6 | 7 | **Description:** 8 | Specifies the namespace of the InferencePool this Endpoint Picker is associated with. 9 | 10 | **Resolution order:** 11 | 1. If `--pool-namespace` is set to a non-empty value, its value is used. 12 | 2. If the flag is not set (i.e., left empty), the `NAMESPACE` environment variable is checked. If set, its value is used. 13 | 3. If neither is set, the namespace defaults to `default`. 
14 | 15 | This allows the EPP to automatically use the namespace it is running in (when the `NAMESPACE` env var is set via Kubernetes Downward API), without requiring explicit configuration. If you want to force the use of the default namespace, explicitly set `--pool-namespace=default`. If you want to use the environment variable or fallback, leave the flag unset or set it to an empty string. 16 | 17 | **Example manifest snippet to set the env var from pod metadata:** 18 | 19 | ```yaml 20 | env: 21 | - name: NAMESPACE 22 | valueFrom: 23 | fieldRef: 24 | fieldPath: metadata.namespace 25 | ``` 26 | 27 | --- 28 | 29 | For a full list of flags, run: 30 | 31 | ``` 32 | EPP_BINARY --help 33 | ``` 34 | -------------------------------------------------------------------------------- /site-src/_includes/model-server-cpu.md: -------------------------------------------------------------------------------- 1 | === "CPU-Based Model Server" 2 | 3 | ???+ warning 4 | 5 | CPU deployment can be unreliable i.e. the pods may crash/restart because of resource constraints. 6 | 7 | This setup is using the formal `vllm-cpu` image, which according to the documentation can run vLLM on x86 CPU platform. 8 | For this setup, we use approximately 9.5GB of memory and 12 CPUs for each replica. 9 | 10 | While it is possible to deploy the model server with less resources, this is not recommended. For example, in our tests, loading the model using 8GB of memory and 1 CPU was possible but took almost 3.5 minutes and inference requests took unreasonable time. In general, there is a tradeoff between the memory and CPU we allocate to our pods and the performance. The more memory and CPU we allocate the better performance we can get. 11 | 12 | After running multiple configurations of these values we decided in this sample to use 9.5GB of memory and 12 CPUs for each replica, which gives reasonable response times. You can increase those numbers and potentially may even get better response times.
For modifying the allocated resources, adjust the numbers in [cpu-deployment.yaml](https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml) as needed. 13 | 14 | Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. 15 | -------------------------------------------------------------------------------- /site-src/api-types/inferencepoolimport.md: -------------------------------------------------------------------------------- 1 | # Inference Pool Import 2 | 3 | ??? example "Alpha since v1.1.0" 4 | 5 | The `InferencePoolImport` resource is alpha and may have breaking changes in 6 | future releases of the API. 7 | 8 | ## Background 9 | 10 | The **InferencePoolImport** API is a cluster-local, controller-managed resource that represents an imported InferencePool. 11 | It primarily communicates a relationship between an exported InferencePool and the exporting cluster name. It is not 12 | user-authored; status carries the effective import. Inference Platform Owners can reference the InferencePoolImport, 13 | even if the local cluster does not have an InferencePool. In the context of Gateway API, it means that an HTTPRoute can 14 | be configured to reference an InferencePoolImport to route matching requests to endpoints of backing InferencePools. 15 | 16 | Key ideas: 17 | 18 | - Map an exported InferencePool to exporting controller and cluster. 19 | - Name/namespace sameness with the exported InferencePool (avoids extra indirection). 20 | - Conditions: Surface a controller-level status condition to indicate whether the InferencePoolImport is ready for use. 21 | - Conditions: Surface parent-level status conditions to indicate whether the InferencePoolImport is referenced by a parent, 22 | e.g. Gateway. 23 | 24 | ## Spec 25 | 26 | The full spec of the InferencePoolImport is defined [here](/reference/x-v1a1-spec/#inferencepoolimport). 
27 | -------------------------------------------------------------------------------- /client-go/clientset/versioned/typed/apix/v1alpha1/fake/fake_apix_client.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by client-gen. DO NOT EDIT. 18 | 19 | package fake 20 | 21 | import ( 22 | rest "k8s.io/client-go/rest" 23 | testing "k8s.io/client-go/testing" 24 | v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/apix/v1alpha1" 25 | ) 26 | 27 | type FakeInferenceV1alpha1 struct { 28 | *testing.Fake 29 | } 30 | 31 | func (c *FakeInferenceV1alpha1) InferencePoolImports(namespace string) v1alpha1.InferencePoolImportInterface { 32 | return newFakeInferencePoolImports(c, namespace) 33 | } 34 | 35 | // RESTClient returns a RESTClient that is used to communicate 36 | // with API server by this client implementation. 37 | func (c *FakeInferenceV1alpha1) RESTClient() rest.Interface { 38 | var ret *rest.RESTClient 39 | return ret 40 | } 41 | -------------------------------------------------------------------------------- /tools/simulations/llm_ig_simulation/src/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | PREFILL_LATENCY_CONST_2 = 0 16 | PREFILL_LATENCY_CONST_1 = 0.00006769375513 17 | PREFILL_LATENCY_CONST_0 = 0.01969 18 | PREFILL_LATENCY_CONST_MIN = 0.04 19 | 20 | DECODE_LATENCY_CONST_BATCH = 0.0001026494433 21 | DECODE_LATENCY_CONST_1 = 0.0000005353485087 22 | DECODE_LATENCY_CONST_0 = 0.014 23 | TOKENIZE_LATENCY_CONST = 0 24 | 25 | MAX_NUM_BATCH_TOKENS = 512 # in prefill 26 | 27 | TOTAL_NUM_GPU_BLOCKS = 2810 28 | NUMBER_OF_TOKENS_PER_BLOCK = 16 29 | MAX_NUM_TOKENS_ALLOWED = TOTAL_NUM_GPU_BLOCKS * NUMBER_OF_TOKENS_PER_BLOCK - MAX_NUM_BATCH_TOKENS # in kv cache 30 | MAX_GPU_MEMORY_PERC_BEFORE_RECOMPUTE = 0.9 31 | MAX_GPU_MEMORY_PERC_BEFORE_RECOMPUTE_NON_CRITICAL = 0.8 32 | MAX_NUM_SEQ = 256 33 | 34 | # size of each lora in units of KV Cache 35 | LORA_DICT = {"tweet": 1600, "sql": 1600, "dummy-1": 0, "dummy-2": 0} 36 | -------------------------------------------------------------------------------- /conformance/reports/v1.0.2/gateway/nginx-nginx-gateway-fabric/README.md: -------------------------------------------------------------------------------- 1 | # Nginx NGINX Gateway Fabric 2 | 3 | ## Table of Contents 4 | 5 | | Extension Version Tested | Profile Tested | Implementation Version | Mode | Report | 6 | |--------------------------|----------------|------------------------|---------|----------------------------------------------------------------------------| 7 | | 
v1.0.2 | Gateway | v2.2.0 | default | [v2.2.0 report](./inference-v2.2.0-report.yaml) 8 | 9 | ## Reproduce 10 | 11 | To reproduce results, clone the NGF repository: 12 | 13 | ```shell 14 | git clone https://github.com/nginx/nginx-gateway-fabric.git && cd nginx-gateway-fabric/tests 15 | ``` 16 | 17 | Follow the steps in the [NGINX Gateway Fabric Testing](https://github.com/nginx/nginx-gateway-fabric/blob/main/tests/README.md#conformance-testing) document to run the conformance tests. If you are running tests on the `edge` version, then you don't need to build any images. Otherwise, you'll need to check out the specific release tag that you want to test, and then build and load the images onto your cluster, per the steps in the README. 18 | 19 | Note: Enable this flag to install all CRDs and required resources: 20 | 21 | ```shell 22 | export ENABLE_INFERENCE_EXTENSION=true 23 | ``` 24 | 25 | After running, see the conformance report: 26 | 27 | ```shell 28 | cat conformance-profile-inference.yaml 29 | ``` 30 | -------------------------------------------------------------------------------- /pkg/epp/scheduling/framework/plugins/picker/common.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package picker 18 | 19 | import ( 20 | "math/rand/v2" 21 | "time" 22 | 23 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 24 | ) 25 | 26 | const ( 27 | DefaultMaxNumOfEndpoints = 1 // common default to all pickers 28 | ) 29 | 30 | // pickerParameters defines the common parameters for all pickers 31 | type pickerParameters struct { 32 | MaxNumOfEndpoints int `json:"maxNumOfEndpoints"` 33 | } 34 | 35 | func shuffleScoredPods(scoredPods []*types.ScoredPod) { 36 | // Rand package is not safe for concurrent use, so we create a new instance. 37 | // Source: https://pkg.go.dev/math/rand/v2#pkg-overview 38 | randomGenerator := rand.New(rand.NewPCG(uint64(time.Now().UnixNano()), 0)) 39 | 40 | // Shuffle in-place 41 | randomGenerator.Shuffle(len(scoredPods), func(i, j int) { 42 | scoredPods[i], scoredPods[j] = scoredPods[j], scoredPods[i] 43 | }) 44 | } 45 | -------------------------------------------------------------------------------- /conformance/tests/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Package tests is the root package for all Gateway API Inference Extension 18 | // conformance test implementations. 19 | package tests 20 | 21 | import ( 22 | // Importing the suite package to access the ConformanceTest struct definition. 
23 | // For initial version directly importing from the core gateway-api repo. 24 | // This may be adjusted in the future if we have need to create a copy of 25 | // the suite utilities. 26 | "sigs.k8s.io/gateway-api/conformance/utils/suite" 27 | // Do NOT add blank imports for specific test packages here. 28 | // They should be added to the main conformance package instead 29 | // to avoid import cycles. 30 | ) 31 | 32 | // ConformanceTests holds all the conformance tests definitions for the 33 | // Gateway API Inference Extension suite. Tests are registered from other packages 34 | // using init() functions like the one in the basic package. 35 | var ConformanceTests []suite.ConformanceTest 36 | -------------------------------------------------------------------------------- /site-src/contributing/devguide.md: -------------------------------------------------------------------------------- 1 | # Developer Guide 2 | 3 | ## Integration Tests Debug Guide 4 | This document provides detailed instructions on how to run and debug integration tests locally in debug mode. 5 | 6 | ### Prerequisites 7 | 8 | #### 1. Install Required Tools 9 | Ensure the envtest tool is installed: 10 | ```bash 11 | $ make envtest 12 | ``` 13 | 14 | #### 2. Verify Kubernetes Test Environment 15 | Run the following command to set up and verify the test environment: 16 | ```bash 17 | $ ./bin/setup-envtest use 1.31.0 --bin-dir ./bin -p path 18 | bin/k8s/1.31.0-darwin-arm64 19 | ``` 20 | ### Run test in shell 21 | ```shell 22 | export KUBEBUILDER_ASSETS=/bin/k8s/1.31.0- 23 | go test sigs.k8s.io/gateway-api-inference-extension/test/integration/epp -run 24 | ``` 25 | 26 | ### Configure and Run in GoLand 27 | 28 | #### 1. Create Test Configuration 29 | Select the test case you want to debug: 30 | ![](../images/modify-run-configuration.png) 31 | 32 | #### 2. 
Configure Environment Variables 33 | Set environment variables in the Run/Debug Configuration: 34 | 35 | ![](../images/edit-environment-variables.png) 36 | 37 | **Required environment variable:** 38 | 39 | - **Name:** `KUBEBUILDER_ASSETS` 40 | - **Value:** `/bin/k8s/1.31.0-` 41 | 42 | **Example path:** 43 | ``` 44 | /go/src/kubernetes.io/gateway-api-inference-extension/bin/k8s/1.31.0-darwin-arm64 45 | ``` 46 | 47 | #### 3. Set Breakpoints and Run 48 | 49 | Example Output: 50 | 51 | ![](../images/running-example.png) 52 | -------------------------------------------------------------------------------- /pkg/epp/scheduling/scheduler_config.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package scheduling 18 | 19 | import ( 20 | "fmt" 21 | 22 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" 23 | ) 24 | 25 | // NewSchedulerConfig creates a new SchedulerConfig object and returns its pointer. 26 | func NewSchedulerConfig(profileHandler framework.ProfileHandler, profiles map[string]*framework.SchedulerProfile) *SchedulerConfig { 27 | return &SchedulerConfig{ 28 | profileHandler: profileHandler, 29 | profiles: profiles, 30 | } 31 | } 32 | 33 | // SchedulerConfig provides a configuration for the scheduler which influence routing decisions. 
34 | type SchedulerConfig struct { 35 | profileHandler framework.ProfileHandler 36 | profiles map[string]*framework.SchedulerProfile 37 | } 38 | 39 | func (c *SchedulerConfig) String() string { 40 | return fmt.Sprintf( 41 | "{ProfileHandler: %s, Profiles: %v}", 42 | c.profileHandler.TypedName(), 43 | c.profiles, 44 | ) 45 | } 46 | -------------------------------------------------------------------------------- /test/testdata/inferencepool-with-model-hermetic.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: inference.networking.k8s.io/v1 2 | kind: InferencePool 3 | metadata: 4 | name: vllm-llama3-8b-instruct-pool 5 | namespace: default 6 | spec: 7 | targetPorts: 8 | - number: 8000 9 | selector: 10 | matchLabels: 11 | app: vllm-llama3-8b-instruct-pool 12 | endpointPickerRef: 13 | name: epp 14 | kind: Service 15 | port: 16 | number: 9002 17 | --- 18 | apiVersion: inference.networking.x-k8s.io/v1alpha2 19 | kind: InferenceObjective 20 | metadata: 21 | name: sql-lora 22 | namespace: default 23 | spec: 24 | priority: 2 25 | poolRef: 26 | name: vllm-llama3-8b-instruct-pool 27 | targetModels: 28 | - name: sql-lora-1fdg2 29 | weight: 100 30 | --- 31 | apiVersion: inference.networking.x-k8s.io/v1alpha2 32 | kind: InferenceObjective 33 | metadata: 34 | name: sql-lora-sheddable 35 | namespace: default 36 | spec: 37 | poolRef: 38 | name: vllm-llama3-8b-instruct-pool 39 | targetModels: 40 | - name: sql-lora-1fdg3 41 | weight: 100 42 | --- 43 | apiVersion: inference.networking.x-k8s.io/v1alpha2 44 | kind: InferenceObjective 45 | metadata: 46 | name: my-model 47 | namespace: default 48 | spec: 49 | priority: 2 50 | poolRef: 51 | name: vllm-llama3-8b-instruct-pool 52 | targetModels: 53 | - name: my-model-12345 54 | weight: 100 55 | --- 56 | apiVersion: inference.networking.x-k8s.io/v1alpha2 57 | kind: InferenceObjective 58 | metadata: 59 | name: direct-model 60 | namespace: default 61 | spec: 62 | priority: 2 63 | poolRef: 64 | 
name: vllm-llama3-8b-instruct-pool 65 | -------------------------------------------------------------------------------- /hack/update-codegen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2025 The Kubernetes Authors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -o errexit 18 | set -o nounset 19 | set -o pipefail 20 | 21 | echo "Generating CRDs" 22 | go run ./pkg/generator 23 | 24 | SCRIPT_ROOT=$(dirname "${BASH_SOURCE[0]}")/.. 
import (
	"errors"
	"fmt"
)
// Error is an error struct for errors returned by the epp server.
// Code holds one of the canonical code constants below; Msg holds the detail text.
type Error struct {
	Code string
	Msg  string
}

// Canonical error codes carried in Error.Code.
const (
	Unknown                        = "Unknown"
	BadRequest                     = "BadRequest"
	Internal                       = "Internal"
	ServiceUnavailable             = "ServiceUnavailable"
	ModelServerError               = "ModelServerError"
	BadConfiguration               = "BadConfiguration"
	InferencePoolResourceExhausted = "InferencePoolResourceExhausted"
)

// Error returns a string version of the error.
func (e Error) Error() string {
	return fmt.Sprintf("inference gateway: %s - %s", e.Code, e.Msg)
}

// CanonicalCode returns the Code of the first Error found in err's chain.
//
// The chain is walked with errors.As, so an Error that has been wrapped
// (for example via fmt.Errorf with %w) is still recognised. Unknown is
// returned when err is nil or no Error is present in the chain.
func CanonicalCode(err error) string {
	var e Error
	if errors.As(err, &e) {
		return e.Code
	}
	return Unknown
}
23 | type MatchApplyConfiguration struct { 24 | Model *ModelMatchApplyConfiguration `json:"model,omitempty"` 25 | } 26 | 27 | // MatchApplyConfiguration constructs a declarative configuration of the Match type for use with 28 | // apply. 29 | func Match() *MatchApplyConfiguration { 30 | return &MatchApplyConfiguration{} 31 | } 32 | 33 | // WithModel sets the Model field in the declarative configuration to the given value 34 | // and returns the receiver, so that objects can be built by chaining "With" function invocations. 35 | // If called multiple times, the Model field is set to the value of the last call. 36 | func (b *MatchApplyConfiguration) WithModel(value *ModelMatchApplyConfiguration) *MatchApplyConfiguration { 37 | b.Model = value 38 | return b 39 | } 40 | -------------------------------------------------------------------------------- /pkg/epp/plugins/plugins.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package plugins 18 | 19 | // Plugin defines the interface for a plugin. 20 | // This interface should be embedded in all plugins across the code. 21 | type Plugin interface { 22 | // TypedName returns the type and name tuple of this plugin instance. 23 | TypedName() TypedName 24 | } 25 | 26 | // ConsumerPlugin defines the interface for a consumer. 
type ConsumerPlugin interface {
	Plugin
	// Consumes returns data consumed by the plugin.
	// This is a map from each consumed data key (string) to
	// the data type of the key (represented as data with default value cast as any field).
	Consumes() map[string]any
}

// ProducerPlugin defines the interface for a producer.
type ProducerPlugin interface {
	Plugin
	// Produces returns data produced by the producer.
	// This is a map from each produced data key (string) to
	// the data type of the key (represented as data with default value cast as any field).
	Produces() map[string]any
}
22 | // 23 | // The primary contracts are: 24 | // - `SafeQueue`: An interface for concurrent-safe queue implementations. 25 | // - `IntraFlowDispatchPolicy`: An interface for policies that decide which item to select from within a single flow's 26 | // queue. 27 | // - `ItemComparator`: An interface vended by policies to make their internal item-ordering logic explicit and 28 | // available to other components. 29 | // 30 | // These components are linked by `QueueCapability`, which allows policies to declare their queue requirements (e.g., 31 | // FIFO or priority-based ordering). 32 | package framework 33 | -------------------------------------------------------------------------------- /site-src/_includes/epp.md: -------------------------------------------------------------------------------- 1 | === "GKE" 2 | 3 | ```bash 4 | export GATEWAY_PROVIDER=gke 5 | helm install vllm-llama3-8b-instruct \ 6 | --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ 7 | --set provider.name=$GATEWAY_PROVIDER \ 8 | --version $IGW_CHART_VERSION \ 9 | oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool 10 | ``` 11 | 12 | === "Istio" 13 | 14 | ```bash 15 | export GATEWAY_PROVIDER=istio 16 | helm install vllm-llama3-8b-instruct \ 17 | --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ 18 | --set provider.name=$GATEWAY_PROVIDER \ 19 | --version $IGW_CHART_VERSION \ 20 | oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool 21 | ``` 22 | 23 | === "Kgateway" 24 | 25 | ```bash 26 | export GATEWAY_PROVIDER=none 27 | helm install vllm-llama3-8b-instruct \ 28 | --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ 29 | --set provider.name=$GATEWAY_PROVIDER \ 30 | --version $IGW_CHART_VERSION \ 31 | oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool 32 | ``` 33 | 34 | === "NGINX Gateway Fabric" 35 | 36 | ```bash 37 | export GATEWAY_PROVIDER=none 38 | helm 
install vllm-llama3-8b-instruct \ 39 | --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ 40 | --set provider.name=$GATEWAY_PROVIDER \ 41 | --version $IGW_CHART_VERSION \ 42 | oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool 43 | ``` 44 | -------------------------------------------------------------------------------- /client-go/informers/externalversions/internalinterfaces/factory_interfaces.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by informer-gen. DO NOT EDIT. 18 | 19 | package internalinterfaces 20 | 21 | import ( 22 | time "time" 23 | 24 | v1 "k8s.io/apimachinery/pkg/apis/meta/v1" 25 | runtime "k8s.io/apimachinery/pkg/runtime" 26 | cache "k8s.io/client-go/tools/cache" 27 | versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" 28 | ) 29 | 30 | // NewInformerFunc takes versioned.Interface and time.Duration to return a SharedIndexInformer. 
31 | type NewInformerFunc func(versioned.Interface, time.Duration) cache.SharedIndexInformer 32 | 33 | // SharedInformerFactory a small interface to allow for adding an informer without an import cycle 34 | type SharedInformerFactory interface { 35 | Start(stopCh <-chan struct{}) 36 | InformerFor(obj runtime.Object, newFunc NewInformerFunc) cache.SharedIndexInformer 37 | } 38 | 39 | // TweakListOptionsFunc is a function that transforms a v1.ListOptions. 40 | type TweakListOptionsFunc func(*v1.ListOptions) 41 | -------------------------------------------------------------------------------- /client-go/applyconfiguration/api/v1/port.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by applyconfiguration-gen. DO NOT EDIT. 18 | 19 | package v1 20 | 21 | import ( 22 | apiv1 "sigs.k8s.io/gateway-api-inference-extension/api/v1" 23 | ) 24 | 25 | // PortApplyConfiguration represents a declarative configuration of the Port type for use 26 | // with apply. 27 | type PortApplyConfiguration struct { 28 | Number *apiv1.PortNumber `json:"number,omitempty"` 29 | } 30 | 31 | // PortApplyConfiguration constructs a declarative configuration of the Port type for use with 32 | // apply. 
33 | func Port() *PortApplyConfiguration { 34 | return &PortApplyConfiguration{} 35 | } 36 | 37 | // WithNumber sets the Number field in the declarative configuration to the given value 38 | // and returns the receiver, so that objects can be built by chaining "With" function invocations. 39 | // If called multiple times, the Number field is set to the value of the last call. 40 | func (b *PortApplyConfiguration) WithNumber(value apiv1.PortNumber) *PortApplyConfiguration { 41 | b.Number = &value 42 | return b 43 | } 44 | -------------------------------------------------------------------------------- /pkg/epp/scheduling/framework/plugins/test/consts.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package test 18 | 19 | const ( 20 | // HeaderTestEppEndPointSelectionKey is the request header used in tests to control 21 | // Endpoint Picker (EPP) behavior deterministically. 22 | // 23 | // The header value is a comma-separated list of endpoint identifiers. Each entry 24 | // may be in one of the following formats: 25 | // 26 | // - "IP" — selects all pods whose IP address matches the given value. 27 | // - "IP:port" — selects only pods whose IP and port both match exactly. 28 | // Ports correspond to data-parallel ranks or specific targetPorts. 
29 | // 30 | // IPv6 addresses are supported, with or without brackets (e.g. "fd00::1" or "[fd00::1]:3002"). 31 | // The returned order matches the order of endpoints specified in the header, and duplicates 32 | // are ignored. 33 | // 34 | // Examples: 35 | // "test-epp-endpoint-selection": "10.0.0.7,10.0.0.8:3002" 36 | // "test-epp-endpoint-selection": "[fd00::1]:3000,fd00::2" 37 | HeaderTestEppEndPointSelectionKey = "test-epp-endpoint-selection" 38 | ) 39 | -------------------------------------------------------------------------------- /config/charts/body-based-routing/templates/istio.yaml: -------------------------------------------------------------------------------- 1 | {{- if eq .Values.provider.name "istio" }} 2 | --- 3 | apiVersion: networking.istio.io/v1alpha3 4 | kind: EnvoyFilter 5 | metadata: 6 | name: {{ .Values.bbr.name }} 7 | namespace: {{ .Release.Namespace }} 8 | spec: 9 | configPatches: 10 | - applyTo: HTTP_FILTER 11 | match: 12 | # context omitted so that this applies to both sidecars and gateways 13 | listener: 14 | filterChain: 15 | filter: 16 | name: "envoy.filters.network.http_connection_manager" 17 | patch: 18 | operation: INSERT_FIRST 19 | value: 20 | name: envoy.filters.http.ext_proc 21 | typed_config: 22 | "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor 23 | failure_mode_allow: false 24 | allow_mode_override: true 25 | processing_mode: 26 | request_header_mode: "SEND" 27 | response_header_mode: "SKIP" 28 | request_body_mode: "FULL_DUPLEX_STREAMED" 29 | response_body_mode: "NONE" 30 | request_trailer_mode: "SEND" 31 | response_trailer_mode: "SKIP" 32 | grpc_service: 33 | envoy_grpc: 34 | cluster_name: outbound|{{ .Values.bbr.port }}||{{ .Values.bbr.name }}.{{ .Release.Namespace }}.svc.cluster.local 35 | --- 36 | apiVersion: networking.istio.io/v1 37 | kind: DestinationRule 38 | metadata: 39 | name: {{ .Values.bbr.name }} 40 | namespace: {{ .Release.Namespace }} 41 | spec: 42 | host: {{ 
.Values.bbr.name }}.{{ .Release.Namespace }}.svc.cluster.local 43 | trafficPolicy: 44 | tls: 45 | mode: SIMPLE 46 | insecureSkipVerify: true 47 | {{- end }} 48 | -------------------------------------------------------------------------------- /pkg/epp/flowcontrol/registry/connection.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package registry 18 | 19 | import ( 20 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/contracts" 21 | "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types" 22 | ) 23 | 24 | // connection is the concrete, un-exported implementation of the `contracts.ActiveFlowConnection` interface. 25 | // It is a temporary handle created for the duration of a single `WithConnection` call. 26 | type connection struct { 27 | registry *FlowRegistry 28 | key types.FlowKey 29 | } 30 | 31 | var _ contracts.ActiveFlowConnection = &connection{} 32 | 33 | // Shards returns a stable snapshot of accessors for all internal state shards. 34 | func (c *connection) ActiveShards() []contracts.RegistryShard { 35 | c.registry.mu.RLock() 36 | defer c.registry.mu.RUnlock() 37 | 38 | // Return a copy to ensure the caller cannot modify the registry's internal slice. 
39 | shardsCopy := make([]contracts.RegistryShard, len(c.registry.activeShards)) 40 | for i, s := range c.registry.activeShards { 41 | shardsCopy[i] = s 42 | } 43 | return shardsCopy 44 | } 45 | -------------------------------------------------------------------------------- /conformance/tests/inferencepool_resolvedrefs_condition.yaml: -------------------------------------------------------------------------------- 1 | # conformance/tests/basic/inferencepool_resolvedrefs_condition.yaml 2 | 3 | # This manifest defines the initial resources for the 4 | # inferencepool_resolvedrefs_condition.go conformance test. 5 | 6 | # --- HTTPRoute for Primary Gateway (conformance-primary) --- 7 | apiVersion: gateway.networking.k8s.io/v1 8 | kind: HTTPRoute 9 | metadata: 10 | name: httproute-for-primary-gw 11 | namespace: inference-conformance-app-backend 12 | spec: 13 | parentRefs: 14 | - group: gateway.networking.k8s.io 15 | kind: Gateway 16 | name: conformance-primary 17 | namespace: inference-conformance-infra 18 | sectionName: http 19 | hostnames: 20 | - "primary.example.com" 21 | rules: 22 | - backendRefs: 23 | - group: inference.networking.k8s.io 24 | kind: InferencePool 25 | name: primary-inference-pool 26 | matches: 27 | - path: 28 | type: PathPrefix 29 | value: /primary-gateway-test 30 | --- 31 | # --- HTTPRoute for Secondary Gateway (conformance-secondary) --- 32 | apiVersion: gateway.networking.k8s.io/v1 33 | kind: HTTPRoute 34 | metadata: 35 | name: httproute-for-secondary-gw 36 | namespace: inference-conformance-app-backend 37 | spec: 38 | parentRefs: 39 | - group: gateway.networking.k8s.io 40 | kind: Gateway 41 | name: conformance-secondary 42 | namespace: inference-conformance-infra 43 | sectionName: http 44 | hostnames: 45 | - "secondary.example.com" 46 | rules: 47 | - backendRefs: 48 | - group: inference.networking.k8s.io 49 | kind: InferencePool 50 | name: primary-inference-pool 51 | matches: 52 | - path: 53 | type: PathPrefix 54 | value: /secondary-gateway-test 
#!/bin/bash

# Downloads the benchmark result files from the benchmark tool pod.
#
# Reads the globals `namespace` and `benchmark_output_dir`, which are set further
# down in this script before the function is called.
download_benchmark_results() {
  # Poll the benchmark tool's logs every 30s until the completion marker shows up.
  until grep -q -m 1 "LPG_FINISHED" <<<"$(kubectl logs deployment/benchmark-tool -n "${namespace}")"; do
    sleep 30
  done
  benchmark_pod=$(kubectl get pods -l app=benchmark-tool -n "${namespace}" -o jsonpath="{.items[0].metadata.name}")
  echo "Downloading JSON results from pod ${benchmark_pod}"
  # Remove the input dataset so it is not matched by the json filter below.
  kubectl exec "${benchmark_pod}" -n "${namespace}" -- rm -f ShareGPT_V3_unfiltered_cleaned_split.json
  # A plain `ls` (one name per line) is required here: the loop word-splits the output,
  # so a long listing would yield permission/size fields instead of file names.
  # The previous `/bin/sh -c ls -l` only worked because `-l` was consumed as $0 of the shell.
  for f in $(kubectl exec "${benchmark_pod}" -n "${namespace}" -- ls | grep json); do
    echo "Downloading json file ${f}"
    kubectl cp -n "${namespace}" "${benchmark_pod}:${f}" "${benchmark_output_dir}/results/json/${f}"
  done
}

# Env vars to be passed when calling this script.
# The id of the benchmark. This is needed to identify what the benchmark is for.
# It decides the filepath to save the results, which later is used by the jupyter notebook to assign
# the benchmark_id as data labels for plotting.
benchmark_id=${benchmark_id:-"inference-extension"}
# run_id can be used to group different runs of the same benchmarks for comparison.
21 | run_id=${run_id:-"default-run"} 22 | namespace=${namespace:-"default"} 23 | output_dir=${output_dir:-'output'} 24 | 25 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 26 | benchmark_output_dir=${SCRIPT_DIR}/${output_dir}/${run_id}/${benchmark_id} 27 | 28 | echo "Saving benchmark results to ${benchmark_output_dir}/results/json/" 29 | download_benchmark_results 30 | kubectl delete -f ${SCRIPT_DIR}/../../config/manifests/benchmark/benchmark.yaml -------------------------------------------------------------------------------- /pkg/epp/requestcontrol/plugin_executor.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package requestcontrol 18 | 19 | import ( 20 | "context" 21 | "errors" 22 | "time" 23 | 24 | schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" 25 | ) 26 | 27 | // prepareDataPluginsWithTimeout executes the PrepareRequestData plugins with retries and timeout. 
28 | func prepareDataPluginsWithTimeout(timeout time.Duration, plugins []PrepareDataPlugin, 29 | ctx context.Context, request *schedulingtypes.LLMRequest, pods []schedulingtypes.Pod) error { 30 | errCh := make(chan error, 1) 31 | // Execute plugins sequentially in a separate goroutine 32 | go func() { 33 | for _, plugin := range plugins { 34 | err := plugin.PrepareRequestData(ctx, request, pods) 35 | if err != nil { 36 | errCh <- errors.New("prepare data plugin " + plugin.TypedName().String() + " failed: " + err.Error()) 37 | return 38 | } 39 | } 40 | errCh <- nil 41 | }() 42 | 43 | select { 44 | case <-ctx.Done(): 45 | return ctx.Err() 46 | case err := <-errCh: 47 | return err 48 | case <-time.After(timeout): 49 | return errors.New("prepare data plugin timed out") 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /site-src/_includes/epp-latest.md: -------------------------------------------------------------------------------- 1 | === "GKE" 2 | 3 | ```bash 4 | export GATEWAY_PROVIDER=gke 5 | helm install vllm-llama3-8b-instruct \ 6 | --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ 7 | --set provider.name=$GATEWAY_PROVIDER \ 8 | --version $IGW_CHART_VERSION \ 9 | oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool 10 | ``` 11 | 12 | === "Istio" 13 | 14 | ```bash 15 | export GATEWAY_PROVIDER=istio 16 | helm install vllm-llama3-8b-instruct \ 17 | --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ 18 | --set provider.name=$GATEWAY_PROVIDER \ 19 | --version $IGW_CHART_VERSION \ 20 | oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool 21 | ``` 22 | 23 | === "Kgateway" 24 | 25 | ```bash 26 | export GATEWAY_PROVIDER=none 27 | helm install vllm-llama3-8b-instruct \ 28 | --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ 29 | --set 
// GKNN represents a fully qualified k8s resource.
//
// It embeds a types.NamespacedName and a schema.GroupKind, so the Namespace, Name, Group and
// Kind fields are promoted and can be read directly on a GKNN value (as Compare does).
type GKNN struct {
	types.NamespacedName
	schema.GroupKind
}
35 | func (g *GKNN) String() string { 36 | return fmt.Sprintf("%s %s", g.GroupKind.String(), g.NamespacedName.String()) 37 | } 38 | 39 | // Compare returns the comparison of a and b where less than, equal, and greater than return -1, 0, 40 | // and 1 respectively. 41 | func Compare(a, b GKNN) int { 42 | if v := strings.Compare(a.Group, b.Group); v != 0 { 43 | return v 44 | } 45 | if v := strings.Compare(a.Kind, b.Kind); v != 0 { 46 | return v 47 | } 48 | if v := strings.Compare(a.Namespace, b.Namespace); v != 0 { 49 | return v 50 | } 51 | return strings.Compare(a.Name, b.Name) 52 | } 53 | 54 | // Less returns true if a is less than b. 55 | func Less(a, b GKNN) bool { 56 | return Compare(a, b) < 0 57 | } 58 | -------------------------------------------------------------------------------- /site-src/concepts/priority-and-capacity.md: -------------------------------------------------------------------------------- 1 | # Priority and Capacity 2 | 3 | The InferenceObjective creates the definition of `Priority` which describes how requests interact with each other, this naturally interacts with total pool capacity, and properly understanding and configuring these behaviors is important in allowing a pool to handle requests of different priority. 4 | 5 | ## Priority (in flow control) 6 | 7 | It should be noted that priority is currently only used in [Capacity](#capacity), and that the description below is how Priority will be consumed in the `Flow Control` model. 8 | 9 | Priority is a simple stack rank; the higher the number, the higher the priority. Should no priority for a request be specified, the default value is zero. Requests of higher priority are _always_ selected first when requests are queued. Requests of equal priority currently operate on a FCFS basis. 
10 | 11 | ## Capacity 12 | 13 | The current capacity model uses configurable [thresholds](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/35b14a10a9830d1a9e3850913539066ebc8fb317/pkg/epp/saturationdetector/saturationdetector.go#L49) to determine if the entire pool is saturated. The calculation simply iterates through each endpoint in the pool; if every endpoint is above all thresholds, the pool is considered `saturated`. In the event of saturation, all requests with a negative priority will be rejected, and other requests will be scheduled and queued on the model servers. 14 | 15 | ## Future work 16 | 17 | The Flow Control system is nearing completion and will add more nuance to the Priority and Capacity model: proper priority enforcement, finer-grained capacity tracking, queuing at the Inference Gateway level, etc. This documentation will be updated once the Flow Control implementation is complete. -------------------------------------------------------------------------------- /client-go/applyconfiguration/internal/internal.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by applyconfiguration-gen. DO NOT EDIT. 
// Parser returns the package-level *typed.Parser built from schemaYAML.
//
// Construction happens inside parserOnce.Do, so typed.NewParser is invoked on the first call
// only and later calls return the cached value. If typed.NewParser reports an error the
// function panics with that error in the message.
func Parser() *typed.Parser {
	parserOnce.Do(func() {
		var err error
		parser, err = typed.NewParser(schemaYAML)
		if err != nil {
			panic(fmt.Sprintf("Failed to parse schema: %v", err))
		}
	})
	return parser
}

// parserOnce guards the one-time initialisation of parser in Parser.
var parserOnce sync.Once

// parser holds the value produced by typed.NewParser(schemaYAML); it is assigned in Parser.
var parser *typed.Parser

// schemaYAML is the schema text passed to typed.NewParser. It declares two named types,
// __untyped_atomic_ and __untyped_deduced_.
var schemaYAML = typed.YAMLObject(`types:
- name: __untyped_atomic_
  scalar: untyped
  list:
    elementType:
      namedType: __untyped_atomic_
    elementRelationship: atomic
  map:
    elementType:
      namedType: __untyped_atomic_
    elementRelationship: atomic
- name: __untyped_deduced_
  scalar: untyped
  list:
    elementType:
      namedType: __untyped_atomic_
    elementRelationship: atomic
  map:
    elementType:
      namedType: __untyped_deduced_
    elementRelationship: separable
`)
18 | 19 | package api 20 | 21 | import ( 22 | v1 "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api/v1" 23 | internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" 24 | ) 25 | 26 | // Interface provides access to each of this group's versions. 27 | type Interface interface { 28 | // V1 provides access to shared informers for resources in V1. 29 | V1() v1.Interface 30 | } 31 | 32 | type group struct { 33 | factory internalinterfaces.SharedInformerFactory 34 | namespace string 35 | tweakListOptions internalinterfaces.TweakListOptionsFunc 36 | } 37 | 38 | // New returns a new Interface. 39 | func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { 40 | return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} 41 | } 42 | 43 | // V1 returns a new v1.Interface. 44 | func (g *group) V1() v1.Interface { 45 | return v1.New(g.factory, g.namespace, g.tweakListOptions) 46 | } 47 | -------------------------------------------------------------------------------- /client-go/clientset/versioned/typed/apix/v1alpha2/fake/fake_apix_client.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by client-gen. 
DO NOT EDIT. 18 | 19 | package fake 20 | 21 | import ( 22 | rest "k8s.io/client-go/rest" 23 | testing "k8s.io/client-go/testing" 24 | v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/apix/v1alpha2" 25 | ) 26 | 27 | type FakeXInferenceV1alpha2 struct { 28 | *testing.Fake 29 | } 30 | 31 | func (c *FakeXInferenceV1alpha2) InferenceModelRewrites(namespace string) v1alpha2.InferenceModelRewriteInterface { 32 | return newFakeInferenceModelRewrites(c, namespace) 33 | } 34 | 35 | func (c *FakeXInferenceV1alpha2) InferenceObjectives(namespace string) v1alpha2.InferenceObjectiveInterface { 36 | return newFakeInferenceObjectives(c, namespace) 37 | } 38 | 39 | func (c *FakeXInferenceV1alpha2) InferencePools(namespace string) v1alpha2.InferencePoolInterface { 40 | return newFakeInferencePools(c, namespace) 41 | } 42 | 43 | // RESTClient returns a RESTClient that is used to communicate 44 | // with API server by this client implementation. 45 | func (c *FakeXInferenceV1alpha2) RESTClient() rest.Interface { 46 | var ret *rest.RESTClient 47 | return ret 48 | } 49 | -------------------------------------------------------------------------------- /client-go/informers/externalversions/api/v1/interface.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The Kubernetes Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // Code generated by informer-gen. 
DO NOT EDIT. 18 | 19 | package v1 20 | 21 | import ( 22 | internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" 23 | ) 24 | 25 | // Interface provides access to all the informers in this group version. 26 | type Interface interface { 27 | // InferencePools returns a InferencePoolInformer. 28 | InferencePools() InferencePoolInformer 29 | } 30 | 31 | type version struct { 32 | factory internalinterfaces.SharedInformerFactory 33 | namespace string 34 | tweakListOptions internalinterfaces.TweakListOptionsFunc 35 | } 36 | 37 | // New returns a new Interface. 38 | func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { 39 | return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} 40 | } 41 | 42 | // InferencePools returns a InferencePoolInformer. 43 | func (v *version) InferencePools() InferencePoolInformer { 44 | return &inferencePoolInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} 45 | } 46 | --------------------------------------------------------------------------------