├── .github └── workflows │ ├── build_push.yml │ ├── build_push_commit.yml │ ├── build_push_criu.yml │ ├── build_push_freezer.yml │ ├── ci.yml │ └── release.yml ├── .gitignore ├── LICENSE.md ├── Makefile ├── README.md ├── activator ├── activator.go ├── activator_test.go ├── bpf.go ├── bpf_bpfeb.go ├── bpf_bpfeb.o ├── bpf_bpfel.go ├── bpf_bpfel.o └── redirector.c ├── api ├── node │ └── v1 │ │ ├── meta.go │ │ ├── node.pb.go │ │ ├── node.proto │ │ └── node_ttrpc.pb.go ├── runtime │ ├── generate.go │ └── v1 │ │ ├── meta.go │ │ ├── register.go │ │ ├── types.go │ │ └── zz_generated.deepcopy.go └── shim │ └── v1 │ ├── shim.pb.go │ ├── shim.proto │ ├── shim_ttrpc.pb.go │ └── util.go ├── cmd ├── freezer │ ├── Dockerfile │ └── main.go ├── installer │ ├── Dockerfile │ ├── main.go │ └── main_test.go ├── manager │ ├── Dockerfile │ └── main.go └── shim │ └── main.go ├── config ├── base │ ├── kustomization.yaml │ ├── namespace.yaml │ ├── node-daemonset.yaml │ └── rbac.yaml ├── crds │ ├── kustomization.yaml │ └── runtime.zeropod.ctrox.dev_migrations.yaml ├── examples │ ├── live-migration.yaml │ ├── migration.yaml │ ├── nginx.yaml │ ├── pod.yaml │ ├── redmine.yaml │ ├── wildfly.yaml │ └── wordpress.yaml ├── gke │ ├── daemonset.yaml │ └── kustomization.yaml ├── in-place-scaling │ └── kustomization.yaml ├── k3s │ ├── k3s.yaml │ └── kustomization.yaml ├── kind │ └── kustomization.yaml ├── migration-manager │ ├── kustomization.yaml │ └── rbac.yaml ├── pod-updater │ ├── kustomization.yaml │ └── rbac.yaml ├── production │ └── kustomization.yaml ├── rke2 │ ├── kustomization.yaml │ └── rke2.yaml ├── status-labels │ └── kustomization.yaml └── uninstall │ └── kustomization.yaml ├── criu ├── Dockerfile ├── always-lazy.patch ├── rpc-ps-address.patch └── unix_sock.patch ├── e2e ├── Dockerfile ├── bench_test.go ├── e2e_test.go ├── kind.yaml ├── main_test.go ├── migration_test.go ├── portforward_test.go └── setup_test.go ├── go.mod ├── go.sum ├── hack └── boilerplate.go.txt ├── manager ├── metrics_collector.go ├── node │ ├── cert.go │ ├── criu_check.go │ ├── exec_logger.go │ ├── page_server_proxy.go │ ├── page_server_proxy_test.go │ ├── service.go │ └── service_test.go ├── pod_controller.go ├── pod_controller_test.go ├── pod_labeller.go ├── pod_labeller_test.go ├── pod_scaler.go ├── pod_scaler_test.go ├── redirector_attacher.go ├── status.go └── status_test.go ├── shim ├── checkpoint.go ├── config.go ├── config_test.go ├── container.go ├── evac.go ├── io │ ├── container_io.go │ ├── cri.go │ ├── doc.go │ ├── helpers.go │ ├── helpers_unix.go │ ├── logger.go │ └── logger_test.go ├── log.go ├── metrics.go ├── port.go ├── port_test.go ├── restore.go ├── task │ ├── plugin │ │ └── plugin_linux.go │ ├── register.go │ ├── service.go │ ├── service_zeropod.go │ └── shim.go └── util.go └── socket ├── Dockerfile ├── bpf_bpfeb.go ├── bpf_bpfeb.o ├── bpf_bpfel.go ├── bpf_bpfel.o ├── ebpf.go ├── host_resolver.go ├── kprobe.c ├── noop.go ├── tracker.go ├── tracker_test.go └── vmlinux.h.gz /.github/workflows/build_push.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish manager/installer 2 | 3 | on: 4 | push: 5 | tags: 6 | - "*" 7 | 8 | env: 9 | REGISTRY: ghcr.io 10 | INSTALLER_IMAGE_NAME: ${{ github.repository }}-installer 11 | MANAGER_IMAGE_NAME: ${{ github.repository }}-manager 12 | CRIU_IMAGE_NAME: ${{ github.repository }}-criu 13 | CRIU_VERSION: v4.1 14 | 15 | jobs: 16 | push_to_registry: 17 | permissions: write-all 18 | name: Push Docker images 19 | runs-on: ubuntu-latest 
20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v4 23 | 24 | - name: Log in to the Container registry 25 | uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 26 | with: 27 | registry: ${{ env.REGISTRY }} 28 | username: ${{ github.actor }} 29 | password: ${{ secrets.GITHUB_TOKEN }} 30 | 31 | - name: Set up Docker Buildx 32 | uses: docker/setup-buildx-action@v2 33 | 34 | - name: Extract metadata for installer 35 | id: meta_installer 36 | uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 37 | with: 38 | images: ${{ env.REGISTRY }}/${{ env.INSTALLER_IMAGE_NAME }} 39 | 40 | - name: Extract metadata for manager 41 | id: meta_manager 42 | uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 43 | with: 44 | images: ${{ env.REGISTRY }}/${{ env.MANAGER_IMAGE_NAME }} 45 | 46 | - name: Build and push installer 47 | uses: docker/build-push-action@v4 48 | with: 49 | context: . 50 | file: ./cmd/installer/Dockerfile 51 | platforms: linux/amd64,linux/arm64 52 | push: true 53 | tags: ${{ steps.meta_installer.outputs.tags }} 54 | labels: ${{ steps.meta_installer.outputs.labels }} 55 | 56 | - name: Build and push manager 57 | uses: docker/build-push-action@v4 58 | with: 59 | context: . 60 | file: ./cmd/manager/Dockerfile 61 | platforms: linux/amd64,linux/arm64 62 | push: true 63 | tags: ${{ steps.meta_manager.outputs.tags }} 64 | labels: ${{ steps.meta_manager.outputs.labels }} 65 | build-args: | 66 | CRIU_IMAGE_NAME=${{ env.REGISTRY }}/${{ env.CRIU_IMAGE_NAME }} 67 | CRIU_VERSION=${{ env.CRIU_VERSION }} 68 | -------------------------------------------------------------------------------- /.github/workflows/build_push_commit.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish manager/installer 2 | 3 | on: 4 | push: 5 | branches: 6 | - "*" 7 | 8 | env: 9 | REGISTRY: ghcr.io 10 | INSTALLER_IMAGE_NAME: ${{ github.repository }}-installer 11 | MANAGER_IMAGE_NAME: ${{ github.repository }}-manager 12 | CRIU_IMAGE_NAME: ${{ github.repository }}-criu 13 | CRIU_VERSION: v4.1 14 | 15 | jobs: 16 | push_to_registry: 17 | permissions: write-all 18 | name: Push Docker images 19 | runs-on: ubuntu-latest 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v4 23 | 24 | - name: Log in to the Container registry 25 | uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 26 | with: 27 | registry: ${{ env.REGISTRY }} 28 | username: ${{ github.actor }} 29 | password: ${{ secrets.GITHUB_TOKEN }} 30 | 31 | - name: Set up Docker Buildx 32 | uses: docker/setup-buildx-action@v2 33 | 34 | - name: Extract metadata for installer 35 | id: meta_installer 36 | uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 37 | with: 38 | tags: | 39 | type=sha,prefix=dev- 40 | images: ${{ env.REGISTRY }}/${{ env.INSTALLER_IMAGE_NAME }} 41 | 42 | - name: Extract metadata for manager 43 | id: meta_manager 44 | uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 45 | with: 46 | tags: | 47 | type=sha,prefix=dev- 48 | images: ${{ env.REGISTRY }}/${{ env.MANAGER_IMAGE_NAME }} 49 | 50 | - name: Build and push installer 51 | uses: docker/build-push-action@v4 52 | with: 53 | context: . 
54 | file: ./cmd/installer/Dockerfile 55 | platforms: linux/amd64,linux/arm64 56 | push: true 57 | tags: ${{ steps.meta_installer.outputs.tags }} 58 | labels: ${{ steps.meta_installer.outputs.labels }} 59 | 60 | - name: Build and push manager 61 | uses: docker/build-push-action@v4 62 | with: 63 | context: . 64 | file: ./cmd/manager/Dockerfile 65 | platforms: linux/amd64,linux/arm64 66 | push: true 67 | tags: ${{ steps.meta_manager.outputs.tags }} 68 | labels: ${{ steps.meta_manager.outputs.labels }} 69 | build-args: | 70 | CRIU_IMAGE_NAME=${{ env.REGISTRY }}/${{ env.CRIU_IMAGE_NAME }} 71 | CRIU_VERSION=${{ env.CRIU_VERSION }} 72 | -------------------------------------------------------------------------------- /.github/workflows/build_push_criu.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish CRIU image 2 | 3 | on: 4 | push: 5 | paths: 6 | - criu/** 7 | 8 | env: 9 | REGISTRY: ghcr.io 10 | CRIU_IMAGE_NAME: ${{ github.repository }}-criu 11 | CRIU_VERSION: v4.1 12 | 13 | jobs: 14 | push_to_registry: 15 | permissions: write-all 16 | name: Push Docker images 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v4 21 | 22 | - name: Log in to the Container registry 23 | uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 24 | with: 25 | registry: ${{ env.REGISTRY }} 26 | username: ${{ github.actor }} 27 | password: ${{ secrets.GITHUB_TOKEN }} 28 | 29 | - name: Set up Docker Buildx 30 | uses: docker/setup-buildx-action@v2 31 | 32 | - name: Extract metadata for criu 33 | id: meta_criu 34 | uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 35 | with: 36 | images: ${{ env.REGISTRY }}/${{ env.CRIU_IMAGE_NAME }} 37 | 38 | - name: Build and push criu 39 | uses: docker/build-push-action@v4 40 | with: 41 | context: . 42 | file: ./criu/Dockerfile 43 | platforms: linux/amd64,linux/arm64 44 | push: true 45 | tags: ${{ env.REGISTRY }}/${{ env.CRIU_IMAGE_NAME }}:${{env.CRIU_VERSION}} 46 | labels: ${{ steps.meta_criu.outputs.labels }} 47 | build-args: CRIU_VERSION=${{ env.CRIU_VERSION }} 48 | -------------------------------------------------------------------------------- /.github/workflows/build_push_freezer.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish freezer 2 | 3 | on: 4 | push: 5 | paths: 6 | - cmd/freezer/** 7 | 8 | env: 9 | REGISTRY: ghcr.io 10 | FREEZER_IMAGE_NAME: ${{ github.repository }}-freezer 11 | 12 | jobs: 13 | push_to_registry: 14 | permissions: write-all 15 | name: Push Docker images 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v4 20 | 21 | - name: Log in to the Container registry 22 | uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 23 | with: 24 | registry: ${{ env.REGISTRY }} 25 | username: ${{ github.actor }} 26 | password: ${{ secrets.GITHUB_TOKEN }} 27 | 28 | - name: Set up Docker Buildx 29 | uses: docker/setup-buildx-action@v2 30 | 31 | - name: Extract metadata for freezer 32 | id: meta_freezer 33 | uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 34 | with: 35 | images: ${{ env.REGISTRY }}/${{ env.FREEZER_IMAGE_NAME }} 36 | 37 | - name: Build and push freezer 38 | uses: docker/build-push-action@v4 39 | with: 40 | context: . 
41 | file: ./cmd/freezer/Dockerfile 42 | platforms: linux/amd64,linux/arm64 43 | push: true 44 | tags: ${{ env.REGISTRY }}/${{ env.FREEZER_IMAGE_NAME }}:latest 45 | labels: ${{ steps.meta_freezer.outputs.labels }} 46 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: [push] 4 | 5 | jobs: 6 | staticcheck: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | 11 | - name: Set up Go 12 | uses: actions/setup-go@v4 13 | with: 14 | go-version: "1.23" 15 | 16 | - uses: dominikh/staticcheck-action@v1.3.1 17 | with: 18 | install-go: false 19 | version: "2024.1" 20 | 21 | test: 22 | runs-on: ubuntu-latest 23 | steps: 24 | - uses: actions/checkout@v4 25 | 26 | - name: Set up Go 27 | uses: actions/setup-go@v4 28 | with: 29 | go-version: "1.23" 30 | 31 | - name: test 32 | run: sudo --preserve-env make test 33 | 34 | build: 35 | runs-on: ubuntu-24.04 36 | steps: 37 | - uses: actions/checkout@v4 38 | 39 | - name: Set up Docker Buildx 40 | uses: docker/setup-buildx-action@v2 41 | with: 42 | install: true 43 | 44 | - name: Set up Go 45 | uses: actions/setup-go@v4 46 | with: 47 | go-version: "1.23" 48 | 49 | - name: Install protoc-gen-go 50 | run: | 51 | go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.36.3 52 | go install github.com/containerd/ttrpc/cmd/protoc-gen-go-ttrpc@v1.2.4 53 | 54 | - uses: awalsh128/cache-apt-pkgs-action@v1 55 | with: 56 | packages: protobuf-compiler libprotobuf-dev 57 | version: 1.0 58 | 59 | - name: build ebpf image 60 | run: make build-ebpf 61 | 62 | - name: generate ttrpc and ebpf 63 | run: make generate 64 | 65 | - name: check for diff 66 | run: git diff --exit-code 67 | 68 | e2e: 69 | runs-on: ubuntu-latest 70 | steps: 71 | - uses: actions/checkout@v4 72 | 73 | - name: Set up Docker Buildx 74 | uses: docker/setup-buildx-action@v2 75 | with: 76 | install: true 77 | 78 | - name: Set up Go 79 | uses: actions/setup-go@v4 80 | with: 81 | go-version: "1.23" 82 | 83 | - name: e2e 84 | run: make test-e2e 85 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Prepare Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - "release/v[012].[0-9]+.[0-9]+" 7 | 8 | env: 9 | REGISTRY: ghcr.io 10 | INSTALLER_IMAGE_NAME: ${{ github.repository }}-installer 11 | MANAGER_IMAGE_NAME: ${{ github.repository }}-manager 12 | 13 | jobs: 14 | update-manifests: 15 | name: Update Manifests 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v4 20 | 21 | - name: Split branch name 22 | env: 23 | BRANCH: ${{ github.ref_name }} 24 | id: split 25 | run: echo "::set-output name=fragment::${BRANCH##*/}" 26 | 27 | - name: Update Deployment Image Tags 28 | working-directory: "config/production" 29 | run: | 30 | kustomize edit set image installer=${{ env.REGISTRY }}/${{ env.INSTALLER_IMAGE_NAME }}:${{ steps.split.outputs.fragment }} 31 | kustomize edit set image manager=${{ env.REGISTRY }}/${{ env.MANAGER_IMAGE_NAME }}:${{ steps.split.outputs.fragment }} 32 | 33 | - name: Create Pull Request 34 | uses: peter-evans/create-pull-request@v6 35 | with: 36 | commit-message: "chore: update image versions" 37 | title: "Update image versions" 38 | base: main 39 | branch: ${{ steps.split.outputs.fragment }} 40 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | containerd-shim-zeropod-v2 2 | *.log 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | REGISTRY := ghcr.io 2 | NAMESPACE := ctrox 3 | TAG := dev 4 | INSTALLER_IMAGE := $(REGISTRY)/$(NAMESPACE)/zeropod-installer:$(TAG) 5 | MANAGER_IMAGE := $(REGISTRY)/$(NAMESPACE)/zeropod-manager:$(TAG) 6 | TEST_IMAGE := $(REGISTRY)/$(NAMESPACE)/zeropod-test:$(TAG) 7 | # includes fix for https://github.com/checkpoint-restore/criu/issues/2532 8 | CRIU_VERSION := v4.1 9 | CRIU_IMAGE := $(REGISTRY)/$(NAMESPACE)/zeropod-criu:$(CRIU_VERSION) 10 | DOCKER_SOCK := /var/run/docker.sock 11 | EBPF_IMAGE := $(REGISTRY)/$(NAMESPACE)/zeropod-ebpf:$(TAG) 12 | # versioning 13 | PKG=github.com/ctrox/zeropod 14 | CONTAINERD_PKG=github.com/containerd/containerd 15 | VERSION ?= $(shell git describe --match 'v[0-9]*' --dirty='.m' --always --tags) 16 | REVISION=$(shell git rev-parse HEAD)$(shell if ! git diff --no-ext-diff --quiet --exit-code; then echo .m; fi) 17 | LDFLAGS=-s -w 18 | SHIM_LDFLAGS=-X $(CONTAINERD_PKG)/version.Version=$(VERSION) -X $(CONTAINERD_PKG)/version.Revision=$(REVISION) -X $(CONTAINERD_PKG)/version.Package=$(PKG) $(LDFLAGS) 19 | GOARCH ?= $(shell go env GOARCH) 20 | 21 | # build-kind can be used for fast local development. It just builds and 22 | # switches out the shim binary. Running pods have to be recreated to make use 23 | # of the new shim. 24 | build-kind: build 25 | docker cp containerd-shim-zeropod-v2 kind-worker:/opt/zeropod/bin/ 26 | docker cp containerd-shim-zeropod-v2 kind-worker2:/opt/zeropod/bin/ 27 | 28 | install-kind: build-installer build-manager 29 | kind load docker-image $(INSTALLER_IMAGE) 30 | kind load docker-image $(MANAGER_IMAGE) 31 | kubectl apply --context kind-kind -k config/kind 32 | 33 | install-manager: build-manager 34 | kind load docker-image $(MANAGER_IMAGE) 35 | kubectl --context kind-kind -n zeropod-system delete pods -l app.kubernetes.io/name=zeropod-node 36 | 37 | build: 38 | CGO_ENABLED=0 GOARCH=$(GOARCH) GOOS=linux go build -ldflags '${SHIM_LDFLAGS}' -o containerd-shim-zeropod-v2 cmd/shim/main.go 39 | 40 | logs: 41 | docker exec kind-worker journalctl -fu containerd & docker exec kind-worker2 journalctl -fu containerd 42 | 43 | build-criu: 44 | docker buildx build --push --platform linux/arm64,linux/amd64 --build-arg CRIU_VERSION=$(CRIU_VERSION) -t $(CRIU_IMAGE) -f criu/Dockerfile . 45 | 46 | build-installer: 47 | docker build --load -t $(INSTALLER_IMAGE) -f cmd/installer/Dockerfile . 48 | 49 | build-manager: 50 | docker build --build-arg CRIU_VERSION=$(CRIU_VERSION) --load -t $(MANAGER_IMAGE) -f cmd/manager/Dockerfile . 51 | 52 | build-test: 53 | docker build --load -t $(TEST_IMAGE) -f e2e/Dockerfile . 54 | 55 | build-ebpf: 56 | docker build --load -t $(EBPF_IMAGE) -f socket/Dockerfile . 57 | 58 | push-dev: build-installer build-manager 59 | docker push $(INSTALLER_IMAGE) 60 | docker push $(MANAGER_IMAGE) 61 | 62 | test-e2e: 63 | go test -v ./e2e/ 64 | 65 | bench: 66 | go test -bench=. -benchtime=10x -v -run=Bench ./e2e/ 67 | 68 | test: 69 | go test -v -short ./... 70 | 71 | # docker-e2e runs the e2e test in a docker container. 
However, as running the 72 | # e2e test requires a docker socket (for kind), this mounts the docker socket 73 | # of the host into the container. For now this is the only way to run the e2e 74 | # tests on Mac OS with apple silicon as the shim requires GOOS=linux. 75 | docker-test-e2e: build-test 76 | docker run --rm --privileged --network=host --rm -v $(DOCKER_SOCK):$(DOCKER_SOCK) -v $(PWD):/app $(TEST_IMAGE) make test-e2e 77 | 78 | docker-bench: build-test 79 | docker run --rm --privileged --network=host --rm -v $(DOCKER_SOCK):$(DOCKER_SOCK) -v $(PWD):/app $(TEST_IMAGE) make bench 80 | 81 | # has to have SYS_ADMIN because the test tries to set netns and mount bpffs 82 | # we use --pid=host to make the ebpf tracker work without a pid resolver 83 | docker-test: 84 | docker run --rm --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --pid=host -v $(PWD):/app $(TEST_IMAGE) make test 85 | 86 | CLANG ?= clang 87 | CFLAGS := -O2 -g -Wall -Werror 88 | 89 | # $BPF_CLANG is used in go:generate invocations. 90 | # generate runs go generate in a docker container which has all the required 91 | # dependencies installed. 92 | generate: export BPF_CLANG := $(CLANG) 93 | generate: export BPF_CFLAGS := $(CFLAGS) 94 | generate: ttrpc 95 | go generate ./api/... 96 | docker run --rm -v $(PWD):/app:Z --user $(shell id -u):$(shell id -g) --env=BPF_CLANG="$(CLANG)" --env=BPF_CFLAGS="$(CFLAGS)" $(EBPF_IMAGE) 97 | 98 | ttrpc: 99 | go mod download 100 | cd api/shim/v1; protoc --go_out=. --go_opt=paths=source_relative \ 101 | --ttrpc_out=. --plugin=protoc-gen-ttrpc=`which protoc-gen-go-ttrpc` \ 102 | --ttrpc_opt=paths=source_relative *.proto -I. \ 103 | -I $(shell go env GOMODCACHE)/github.com/prometheus/client_model@v0.6.1 104 | cd api/node/v1; protoc --go_out=. --go_opt=paths=source_relative \ 105 | --ttrpc_out=. --plugin=protoc-gen-ttrpc=`which protoc-gen-go-ttrpc` \ 106 | --ttrpc_opt=paths=source_relative *.proto -I. 107 | 108 | # to improve reproducibility of the bpf builds, we dump the vmlinux.h and 109 | # store it compressed in git instead of dumping it during the build. 
110 | update-vmlinux: 111 | docker run --rm -v $(PWD):/app:Z --entrypoint /bin/sh --user $(shell id -u):$(shell id -g) $(EBPF_IMAGE) \ 112 | -c "bpftool btf dump file /sys/kernel/btf/vmlinux format c" | gzip > socket/vmlinux.h.gz 113 | -------------------------------------------------------------------------------- /activator/activator_test.go: -------------------------------------------------------------------------------- 1 | package activator 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io" 7 | "log/slog" 8 | "net" 9 | "net/http" 10 | "net/http/httptest" 11 | "os" 12 | "sync" 13 | "testing" 14 | "time" 15 | 16 | "github.com/containernetworking/plugins/pkg/ns" 17 | "github.com/stretchr/testify/assert" 18 | "github.com/stretchr/testify/require" 19 | ) 20 | 21 | func TestActivator(t *testing.T) { 22 | require.NoError(t, MountBPFFS(BPFFSPath)) 23 | 24 | nn, err := ns.GetCurrentNS() 25 | require.NoError(t, err) 26 | 27 | ctx, cancel := context.WithCancel(context.Background()) 28 | 29 | port, err := freePort() 30 | require.NoError(t, err) 31 | 32 | s, err := NewServer(ctx, nn) 33 | require.NoError(t, err) 34 | 35 | bpf, err := InitBPF(os.Getpid(), slog.Default()) 36 | require.NoError(t, err) 37 | require.NoError(t, bpf.AttachRedirector("lo")) 38 | 39 | response := "ok" 40 | ts := httptest.NewUnstartedServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 41 | fmt.Fprint(w, response) 42 | })) 43 | 44 | once := sync.Once{} 45 | err = s.Start(ctx, []uint16{uint16(port)}, func() error { 46 | once.Do(func() { 47 | // simulate a delay until our server is started 48 | time.Sleep(time.Millisecond * 200) 49 | l, err := net.Listen("tcp4", fmt.Sprintf(":%d", port)) 50 | require.NoError(t, err) 51 | 52 | if err := s.DisableRedirects(); err != nil { 53 | t.Errorf("could not disable redirects: %s", err) 54 | } 55 | 56 | // replace listener of server 57 | ts.Listener.Close() 58 | ts.Listener = l 59 | ts.Start() 60 | t.Logf("listening on :%d", port) 61 | 62 | t.Cleanup(func() { 63 | ts.Close() 64 | }) 65 | }) 66 | return nil 67 | }) 68 | require.NoError(t, err) 69 | t.Cleanup(func() { 70 | s.Stop(ctx) 71 | cancel() 72 | }) 73 | 74 | c := &http.Client{Timeout: time.Second} 75 | 76 | parallelReqs := 10 77 | wg := sync.WaitGroup{} 78 | for _, port := range []int{port} { 79 | port := port 80 | for i := 0; i < parallelReqs; i++ { 81 | wg.Add(1) 82 | go func() { 83 | defer wg.Done() 84 | resp, err := c.Get(fmt.Sprintf("http://localhost:%d", port)) 85 | require.NoError(t, err) 86 | b, err := io.ReadAll(resp.Body) 87 | require.NoError(t, err) 88 | 89 | assert.Equal(t, http.StatusOK, resp.StatusCode) 90 | assert.Equal(t, response, string(b)) 91 | t.Log(string(b)) 92 | }() 93 | } 94 | } 95 | wg.Wait() 96 | } 97 | -------------------------------------------------------------------------------- /activator/bpf.go: -------------------------------------------------------------------------------- 1 | package activator 2 | 3 | import ( 4 | "fmt" 5 | "log/slog" 6 | "net" 7 | "os" 8 | "os/exec" 9 | "path/filepath" 10 | "strconv" 11 | 12 | "github.com/cilium/ebpf" 13 | "github.com/cilium/ebpf/rlimit" 14 | "github.com/vishvananda/netlink" 15 | "golang.org/x/sys/unix" 16 | ) 17 | 18 | // $BPF_CLANG and $BPF_CFLAGS are set by the Makefile. 
19 | //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc $BPF_CLANG -cflags $BPF_CFLAGS bpf redirector.c -- -I/headers 20 | 21 | const BPFFSPath = "/sys/fs/bpf" 22 | 23 | type BPF struct { 24 | pid int 25 | objs *bpfObjects 26 | qdiscs []*netlink.GenericQdisc 27 | filters []*netlink.BpfFilter 28 | log *slog.Logger 29 | } 30 | 31 | func InitBPF(pid int, log *slog.Logger) (*BPF, error) { 32 | // Allow the current process to lock memory for eBPF resources. 33 | if err := rlimit.RemoveMemlock(); err != nil { 34 | return nil, err 35 | } 36 | 37 | // as a single shim process can host multiple containers, we store the map 38 | // in a directory per shim process. 39 | path := PinPath(pid) 40 | if err := os.MkdirAll(path, os.ModePerm); err != nil { 41 | return nil, fmt.Errorf("failed to create bpf fs subpath: %w", err) 42 | } 43 | 44 | objs := bpfObjects{} 45 | if err := loadBpfObjects(&objs, &ebpf.CollectionOptions{ 46 | Maps: ebpf.MapOptions{ 47 | PinPath: path, 48 | }, 49 | }); err != nil { 50 | return nil, fmt.Errorf("loading objects: %w", err) 51 | } 52 | 53 | return &BPF{pid: pid, log: log, objs: &objs}, nil 54 | } 55 | 56 | func (bpf *BPF) Cleanup() error { 57 | if err := bpf.objs.Close(); err != nil { 58 | return fmt.Errorf("unable to close bpf objects: %w", err) 59 | } 60 | 61 | for _, qdisc := range bpf.qdiscs { 62 | if err := netlink.QdiscDel(qdisc); !os.IsNotExist(err) { 63 | return fmt.Errorf("unable to delete qdisc: %w", err) 64 | } 65 | } 66 | for _, filter := range bpf.filters { 67 | if err := netlink.FilterDel(filter); !os.IsNotExist(err) { 68 | return fmt.Errorf("unable to delete filter: %w", err) 69 | } 70 | } 71 | 72 | bpf.log.Info("deleting", "path", PinPath(bpf.pid)) 73 | return os.RemoveAll(PinPath(bpf.pid)) 74 | } 75 | 76 | func (bpf *BPF) AttachRedirector(ifaces ...string) error { 77 | for _, iface := range ifaces { 78 | devID, err := net.InterfaceByName(iface) 79 | if err != nil { 80 | return fmt.Errorf("could not get interface ID: %w", err) 81 | } 82 | 83 | qdisc := &netlink.GenericQdisc{ 84 | QdiscAttrs: netlink.QdiscAttrs{ 85 | LinkIndex: devID.Index, 86 | Handle: netlink.MakeHandle(0xffff, 0), 87 | Parent: netlink.HANDLE_CLSACT, 88 | }, 89 | QdiscType: "clsact", 90 | } 91 | 92 | if err := netlink.QdiscReplace(qdisc); err != nil { 93 | return fmt.Errorf("could not replace qdisc: %w", err) 94 | } 95 | bpf.qdiscs = append(bpf.qdiscs, qdisc) 96 | 97 | ingress := netlink.BpfFilter{ 98 | FilterAttrs: netlink.FilterAttrs{ 99 | LinkIndex: devID.Index, 100 | Parent: netlink.HANDLE_MIN_INGRESS, 101 | Handle: 1, 102 | Protocol: unix.ETH_P_ALL, 103 | }, 104 | Fd: bpf.objs.TcRedirectIngress.FD(), 105 | Name: bpf.objs.TcRedirectIngress.String(), 106 | DirectAction: true, 107 | } 108 | egress := ingress 109 | egress.Parent = netlink.HANDLE_MIN_EGRESS 110 | egress.Fd = bpf.objs.TcRedirectEgress.FD() 111 | egress.Name = bpf.objs.TcRedirectEgress.String() 112 | 113 | if err := netlink.FilterReplace(&ingress); err != nil { 114 | return fmt.Errorf("failed to replace tc filter: %w", err) 115 | } 116 | bpf.filters = append(bpf.filters, &ingress) 117 | 118 | if err := netlink.FilterReplace(&egress); err != nil { 119 | return fmt.Errorf("failed to replace tc filter: %w", err) 120 | } 121 | bpf.filters = append(bpf.filters, &egress) 122 | } 123 | 124 | return nil 125 | } 126 | 127 | func PinPath(pid int) string { 128 | return filepath.Join(MapsPath(), strconv.Itoa(pid)) 129 | } 130 | 131 | func MapsPath() string { 132 | return filepath.Join(BPFFSPath, "zeropod_maps") 133 | } 134 | 135 | 
// MountBPFFS executes a mount -t bpf on the supplied path 136 | func MountBPFFS(path string) error { 137 | return mount("bpf", "bpf", path) 138 | } 139 | 140 | // MountBPFFS mounts the kernel debugfs 141 | func MountDebugFS() error { 142 | return mount("debugfs", "debugfs", "/sys/kernel/debug") 143 | } 144 | 145 | func mount(name, typ, path string) error { 146 | const alreadyMountedCode = 32 147 | out, err := exec.Command("mount", "-t", typ, name, path).CombinedOutput() 148 | if err != nil { 149 | if exitError, ok := err.(*exec.ExitError); ok { 150 | if exitError.ExitCode() == alreadyMountedCode { 151 | return nil 152 | } 153 | } 154 | return fmt.Errorf("unable to mount BPF fs: %s: %s", err, out) 155 | } 156 | 157 | return nil 158 | } 159 | -------------------------------------------------------------------------------- /activator/bpf_bpfeb.go: -------------------------------------------------------------------------------- 1 | // Code generated by bpf2go; DO NOT EDIT. 2 | //go:build mips || mips64 || ppc64 || s390x 3 | 4 | package activator 5 | 6 | import ( 7 | "bytes" 8 | _ "embed" 9 | "fmt" 10 | "io" 11 | 12 | "github.com/cilium/ebpf" 13 | ) 14 | 15 | // loadBpf returns the embedded CollectionSpec for bpf. 16 | func loadBpf() (*ebpf.CollectionSpec, error) { 17 | reader := bytes.NewReader(_BpfBytes) 18 | spec, err := ebpf.LoadCollectionSpecFromReader(reader) 19 | if err != nil { 20 | return nil, fmt.Errorf("can't load bpf: %w", err) 21 | } 22 | 23 | return spec, err 24 | } 25 | 26 | // loadBpfObjects loads bpf and converts it into a struct. 27 | // 28 | // The following types are suitable as obj argument: 29 | // 30 | // *bpfObjects 31 | // *bpfPrograms 32 | // *bpfMaps 33 | // 34 | // See ebpf.CollectionSpec.LoadAndAssign documentation for details. 35 | func loadBpfObjects(obj interface{}, opts *ebpf.CollectionOptions) error { 36 | spec, err := loadBpf() 37 | if err != nil { 38 | return err 39 | } 40 | 41 | return spec.LoadAndAssign(obj, opts) 42 | } 43 | 44 | // bpfSpecs contains maps and programs before they are loaded into the kernel. 45 | // 46 | // It can be passed ebpf.CollectionSpec.Assign. 47 | type bpfSpecs struct { 48 | bpfProgramSpecs 49 | bpfMapSpecs 50 | bpfVariableSpecs 51 | } 52 | 53 | // bpfProgramSpecs contains programs before they are loaded into the kernel. 54 | // 55 | // It can be passed ebpf.CollectionSpec.Assign. 56 | type bpfProgramSpecs struct { 57 | TcRedirectEgress *ebpf.ProgramSpec `ebpf:"tc_redirect_egress"` 58 | TcRedirectIngress *ebpf.ProgramSpec `ebpf:"tc_redirect_ingress"` 59 | } 60 | 61 | // bpfMapSpecs contains maps before they are loaded into the kernel. 62 | // 63 | // It can be passed ebpf.CollectionSpec.Assign. 64 | type bpfMapSpecs struct { 65 | ActiveConnections *ebpf.MapSpec `ebpf:"active_connections"` 66 | DisableRedirect *ebpf.MapSpec `ebpf:"disable_redirect"` 67 | EgressRedirects *ebpf.MapSpec `ebpf:"egress_redirects"` 68 | IngressRedirects *ebpf.MapSpec `ebpf:"ingress_redirects"` 69 | } 70 | 71 | // bpfVariableSpecs contains global variables before they are loaded into the kernel. 72 | // 73 | // It can be passed ebpf.CollectionSpec.Assign. 74 | type bpfVariableSpecs struct { 75 | } 76 | 77 | // bpfObjects contains all objects after they have been loaded into the kernel. 78 | // 79 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 
80 | type bpfObjects struct { 81 | bpfPrograms 82 | bpfMaps 83 | bpfVariables 84 | } 85 | 86 | func (o *bpfObjects) Close() error { 87 | return _BpfClose( 88 | &o.bpfPrograms, 89 | &o.bpfMaps, 90 | ) 91 | } 92 | 93 | // bpfMaps contains all maps after they have been loaded into the kernel. 94 | // 95 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 96 | type bpfMaps struct { 97 | ActiveConnections *ebpf.Map `ebpf:"active_connections"` 98 | DisableRedirect *ebpf.Map `ebpf:"disable_redirect"` 99 | EgressRedirects *ebpf.Map `ebpf:"egress_redirects"` 100 | IngressRedirects *ebpf.Map `ebpf:"ingress_redirects"` 101 | } 102 | 103 | func (m *bpfMaps) Close() error { 104 | return _BpfClose( 105 | m.ActiveConnections, 106 | m.DisableRedirect, 107 | m.EgressRedirects, 108 | m.IngressRedirects, 109 | ) 110 | } 111 | 112 | // bpfVariables contains all global variables after they have been loaded into the kernel. 113 | // 114 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 115 | type bpfVariables struct { 116 | } 117 | 118 | // bpfPrograms contains all programs after they have been loaded into the kernel. 119 | // 120 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 121 | type bpfPrograms struct { 122 | TcRedirectEgress *ebpf.Program `ebpf:"tc_redirect_egress"` 123 | TcRedirectIngress *ebpf.Program `ebpf:"tc_redirect_ingress"` 124 | } 125 | 126 | func (p *bpfPrograms) Close() error { 127 | return _BpfClose( 128 | p.TcRedirectEgress, 129 | p.TcRedirectIngress, 130 | ) 131 | } 132 | 133 | func _BpfClose(closers ...io.Closer) error { 134 | for _, closer := range closers { 135 | if err := closer.Close(); err != nil { 136 | return err 137 | } 138 | } 139 | return nil 140 | } 141 | 142 | // Do not access this directly. 143 | // 144 | //go:embed bpf_bpfeb.o 145 | var _BpfBytes []byte 146 | -------------------------------------------------------------------------------- /activator/bpf_bpfeb.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ctrox/zeropod/74906d25199d6c90954298fb57abc2693a64afc9/activator/bpf_bpfeb.o -------------------------------------------------------------------------------- /activator/bpf_bpfel.go: -------------------------------------------------------------------------------- 1 | // Code generated by bpf2go; DO NOT EDIT. 2 | //go:build 386 || amd64 || arm || arm64 || loong64 || mips64le || mipsle || ppc64le || riscv64 3 | 4 | package activator 5 | 6 | import ( 7 | "bytes" 8 | _ "embed" 9 | "fmt" 10 | "io" 11 | 12 | "github.com/cilium/ebpf" 13 | ) 14 | 15 | // loadBpf returns the embedded CollectionSpec for bpf. 16 | func loadBpf() (*ebpf.CollectionSpec, error) { 17 | reader := bytes.NewReader(_BpfBytes) 18 | spec, err := ebpf.LoadCollectionSpecFromReader(reader) 19 | if err != nil { 20 | return nil, fmt.Errorf("can't load bpf: %w", err) 21 | } 22 | 23 | return spec, err 24 | } 25 | 26 | // loadBpfObjects loads bpf and converts it into a struct. 27 | // 28 | // The following types are suitable as obj argument: 29 | // 30 | // *bpfObjects 31 | // *bpfPrograms 32 | // *bpfMaps 33 | // 34 | // See ebpf.CollectionSpec.LoadAndAssign documentation for details. 
35 | func loadBpfObjects(obj interface{}, opts *ebpf.CollectionOptions) error { 36 | spec, err := loadBpf() 37 | if err != nil { 38 | return err 39 | } 40 | 41 | return spec.LoadAndAssign(obj, opts) 42 | } 43 | 44 | // bpfSpecs contains maps and programs before they are loaded into the kernel. 45 | // 46 | // It can be passed ebpf.CollectionSpec.Assign. 47 | type bpfSpecs struct { 48 | bpfProgramSpecs 49 | bpfMapSpecs 50 | bpfVariableSpecs 51 | } 52 | 53 | // bpfProgramSpecs contains programs before they are loaded into the kernel. 54 | // 55 | // It can be passed ebpf.CollectionSpec.Assign. 56 | type bpfProgramSpecs struct { 57 | TcRedirectEgress *ebpf.ProgramSpec `ebpf:"tc_redirect_egress"` 58 | TcRedirectIngress *ebpf.ProgramSpec `ebpf:"tc_redirect_ingress"` 59 | } 60 | 61 | // bpfMapSpecs contains maps before they are loaded into the kernel. 62 | // 63 | // It can be passed ebpf.CollectionSpec.Assign. 64 | type bpfMapSpecs struct { 65 | ActiveConnections *ebpf.MapSpec `ebpf:"active_connections"` 66 | DisableRedirect *ebpf.MapSpec `ebpf:"disable_redirect"` 67 | EgressRedirects *ebpf.MapSpec `ebpf:"egress_redirects"` 68 | IngressRedirects *ebpf.MapSpec `ebpf:"ingress_redirects"` 69 | } 70 | 71 | // bpfVariableSpecs contains global variables before they are loaded into the kernel. 72 | // 73 | // It can be passed ebpf.CollectionSpec.Assign. 74 | type bpfVariableSpecs struct { 75 | } 76 | 77 | // bpfObjects contains all objects after they have been loaded into the kernel. 78 | // 79 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 80 | type bpfObjects struct { 81 | bpfPrograms 82 | bpfMaps 83 | bpfVariables 84 | } 85 | 86 | func (o *bpfObjects) Close() error { 87 | return _BpfClose( 88 | &o.bpfPrograms, 89 | &o.bpfMaps, 90 | ) 91 | } 92 | 93 | // bpfMaps contains all maps after they have been loaded into the kernel. 94 | // 95 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 96 | type bpfMaps struct { 97 | ActiveConnections *ebpf.Map `ebpf:"active_connections"` 98 | DisableRedirect *ebpf.Map `ebpf:"disable_redirect"` 99 | EgressRedirects *ebpf.Map `ebpf:"egress_redirects"` 100 | IngressRedirects *ebpf.Map `ebpf:"ingress_redirects"` 101 | } 102 | 103 | func (m *bpfMaps) Close() error { 104 | return _BpfClose( 105 | m.ActiveConnections, 106 | m.DisableRedirect, 107 | m.EgressRedirects, 108 | m.IngressRedirects, 109 | ) 110 | } 111 | 112 | // bpfVariables contains all global variables after they have been loaded into the kernel. 113 | // 114 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 115 | type bpfVariables struct { 116 | } 117 | 118 | // bpfPrograms contains all programs after they have been loaded into the kernel. 119 | // 120 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 121 | type bpfPrograms struct { 122 | TcRedirectEgress *ebpf.Program `ebpf:"tc_redirect_egress"` 123 | TcRedirectIngress *ebpf.Program `ebpf:"tc_redirect_ingress"` 124 | } 125 | 126 | func (p *bpfPrograms) Close() error { 127 | return _BpfClose( 128 | p.TcRedirectEgress, 129 | p.TcRedirectIngress, 130 | ) 131 | } 132 | 133 | func _BpfClose(closers ...io.Closer) error { 134 | for _, closer := range closers { 135 | if err := closer.Close(); err != nil { 136 | return err 137 | } 138 | } 139 | return nil 140 | } 141 | 142 | // Do not access this directly. 
143 | // 144 | //go:embed bpf_bpfel.o 145 | var _BpfBytes []byte 146 | -------------------------------------------------------------------------------- /activator/bpf_bpfel.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ctrox/zeropod/74906d25199d6c90954298fb57abc2693a64afc9/activator/bpf_bpfel.o -------------------------------------------------------------------------------- /activator/redirector.c: -------------------------------------------------------------------------------- 1 | //go:build ignore 2 | 3 | #include "vmlinux.h" 4 | #include "bpf_helpers.h" 5 | #include "bpf_endian.h" 6 | 7 | char __license[] SEC("license") = "Dual MIT/GPL"; 8 | 9 | #define TC_ACT_OK 0 10 | 11 | struct { 12 | __uint(type, BPF_MAP_TYPE_LRU_HASH); 13 | __uint(max_entries, 128); 14 | __type(key, __be16); // sport 15 | __type(value, __be16); // dport 16 | __uint(pinning, LIBBPF_PIN_BY_NAME); 17 | } ingress_redirects SEC(".maps"); 18 | 19 | struct { 20 | __uint(type, BPF_MAP_TYPE_LRU_HASH); 21 | __uint(max_entries, 128); 22 | __type(key, __be16); // sport 23 | __type(value, __be16); // dport 24 | __uint(pinning, LIBBPF_PIN_BY_NAME); 25 | } egress_redirects SEC(".maps"); 26 | 27 | struct { 28 | __uint(type, BPF_MAP_TYPE_LRU_HASH); 29 | __uint(max_entries, 512); 30 | __type(key, __be16); // proxy port 31 | __type(value, u8); // unused 32 | __uint(pinning, LIBBPF_PIN_BY_NAME); 33 | } disable_redirect SEC(".maps"); 34 | 35 | struct { 36 | __uint(type, BPF_MAP_TYPE_LRU_HASH); 37 | __uint(max_entries, 512); // TBD but should probably be enough 38 | __type(key, __be16); // remote_port 39 | __type(value, u8); // unused 40 | __uint(pinning, LIBBPF_PIN_BY_NAME); 41 | } active_connections SEC(".maps"); 42 | 43 | static __always_inline int disabled(__be16 sport_h, __be16 dport_h) { 44 | void *disable_redirect_map = &disable_redirect; 45 | 46 | void *disabled_s = bpf_map_lookup_elem(disable_redirect_map, &sport_h); 47 | 48 | if (disabled_s) { 49 | return 1; 50 | } 51 | 52 | void *disabled_d = bpf_map_lookup_elem(disable_redirect_map, &dport_h); 53 | 54 | if (disabled_d) { 55 | return 1; 56 | } 57 | 58 | return 0; 59 | }; 60 | 61 | static __always_inline int ingress_redirect(struct tcphdr *tcp) { 62 | __be16 sport_h = bpf_ntohs(tcp->source); 63 | __be16 dport_h = bpf_ntohs(tcp->dest); 64 | 65 | void *active_connections_map = &active_connections; 66 | 67 | void *redirect_map = &ingress_redirects; 68 | __be16 *new_dest = bpf_map_lookup_elem(redirect_map, &dport_h); 69 | 70 | if (new_dest) { 71 | // check ports which should not be redirected 72 | if (disabled(sport_h, dport_h)) { 73 | // if we can find an acive connection on the source port, we need 74 | // to redirect regardless until the connection is closed. 
75 | void *conn_sport = bpf_map_lookup_elem(active_connections_map, &sport_h); 76 | if (!conn_sport) { 77 | // bpf_printk("ingress: sport %d or dport %d is disabled for redirecting", sport_h, dport_h); 78 | return TC_ACT_OK; 79 | } 80 | // bpf_printk("ingress: port %d found in active connections, redirecting", sport_h); 81 | } 82 | // bpf_printk("ingress: changing destination port from %d to %d for packet from %d", dport_h, *new_dest, sport_h); 83 | tcp->dest = bpf_htons(*new_dest); 84 | } 85 | 86 | return TC_ACT_OK; 87 | } 88 | 89 | static __always_inline int egress_redirect(struct tcphdr *tcp) { 90 | __be16 sport_h = bpf_ntohs(tcp->source); 91 | // __be16 dport_h = bpf_ntohs(tcp->dest); 92 | 93 | void *redirect_map = &egress_redirects; 94 | __be16 *new_source = bpf_map_lookup_elem(redirect_map, &sport_h); 95 | 96 | if (new_source) { 97 | // bpf_printk("egress: changing source port from %d to %d for packet to %d", sport_h, *new_source, dport_h); 98 | tcp->source = bpf_htons(*new_source); 99 | } 100 | 101 | return TC_ACT_OK; 102 | } 103 | 104 | static __always_inline int parse_and_redirect(struct __sk_buff *ctx, bool ingress) { 105 | void *data = (void *)(long)ctx->data; 106 | void *data_end = (void *)(long)ctx->data_end; 107 | struct ethhdr *eth = data; 108 | 109 | if ((void*)eth + sizeof(*eth) <= data_end) { 110 | struct iphdr *ip = data + sizeof(*eth); 111 | 112 | if ((void*)ip + sizeof(*ip) <= data_end) { 113 | if (ip->protocol == IPPROTO_TCP) { 114 | struct tcphdr *tcp = (void*)ip + sizeof(*ip); 115 | if ((void*)tcp + sizeof(*tcp) <= data_end) { 116 | if (ingress) { 117 | return ingress_redirect(tcp); 118 | } 119 | 120 | return egress_redirect(tcp); 121 | } 122 | } 123 | } 124 | } 125 | 126 | return 0; 127 | } 128 | 129 | 130 | SEC("tc") 131 | int tc_redirect_ingress(struct __sk_buff *skb) { 132 | return parse_and_redirect(skb, true); 133 | } 134 | 135 | SEC("tc") 136 | int tc_redirect_egress(struct __sk_buff *skb) { 137 | return parse_and_redirect(skb, false); 138 | } 139 | -------------------------------------------------------------------------------- /api/node/v1/meta.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | import "path/filepath" 4 | 5 | const ( 6 | runPath = "/run/zeropod/" 7 | varPath = "/var/lib/zeropod/" 8 | SocketPath = runPath + "node.sock" 9 | imagesPath = varPath + "i/" 10 | SnapshotSuffix = "snapshot" 11 | WorkDirSuffix = "work" 12 | MigrateAnnotationKey = "zeropod.ctrox.dev/migrate" 13 | LiveMigrateAnnotationKey = "zeropod.ctrox.dev/live-migrate" 14 | NodeNameEnvKey = "NODE_NAME" 15 | PodIPEnvKey = "POD_IP" 16 | preDumpDirName = "pre-dump" 17 | ) 18 | 19 | func ImagePath(id string) string { 20 | return filepath.Join(imagesPath, id) 21 | } 22 | 23 | func WorkDirPath(id string) string { 24 | return filepath.Join(ImagePath(id), WorkDirSuffix) 25 | } 26 | 27 | func SnapshotPath(id string) string { 28 | return filepath.Join(ImagePath(id), SnapshotSuffix) 29 | } 30 | 31 | func LazyPagesSocket(id string) string { 32 | return filepath.Join(runPath, id+".sock") 33 | } 34 | 35 | func PreDumpDir(id string) string { 36 | return filepath.Join(SnapshotPath(id), preDumpDirName) 37 | } 38 | 39 | func RelativePreDumpDir() string { 40 | return filepath.Join("..", SnapshotSuffix, preDumpDirName) 41 | } 42 | -------------------------------------------------------------------------------- /api/node/v1/node.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | 
package zeropod.node.v1; 4 | option go_package = "github.com/ctrox/zeropod/api/node/v1/;v1"; 5 | 6 | import "google/protobuf/empty.proto"; 7 | import "google/protobuf/timestamp.proto"; 8 | 9 | service Node { 10 | rpc Evac(EvacRequest) returns (EvacResponse); 11 | rpc PrepareEvac(EvacRequest) returns (EvacResponse); 12 | rpc Restore(RestoreRequest) returns (RestoreResponse); 13 | rpc FinishRestore(RestoreRequest) returns (RestoreResponse); 14 | rpc NewCriuLazyPages(CriuLazyPagesRequest) returns (google.protobuf.Empty); 15 | rpc PullImage(PullImageRequest) returns (stream Image); 16 | } 17 | 18 | message EvacRequest { 19 | PodInfo pod_info = 1; 20 | MigrationInfo migration_info = 2; 21 | } 22 | 23 | message EvacResponse { 24 | google.protobuf.Empty empty = 1; 25 | } 26 | 27 | message RestoreRequest { 28 | PodInfo pod_info = 1; 29 | MigrationInfo migration_info = 2; 30 | } 31 | 32 | message RestoreResponse { 33 | MigrationInfo migration_info = 1; 34 | } 35 | 36 | message PodInfo { 37 | string name = 1; 38 | string namespace = 2; 39 | string container_name = 3; 40 | repeated int32 ports = 4; 41 | } 42 | 43 | message MigrationInfo { 44 | string image_id = 1; 45 | string bundle_dir = 2; 46 | bool live_migration = 3; 47 | google.protobuf.Timestamp paused_at = 4; 48 | google.protobuf.Timestamp restore_start = 5; 49 | google.protobuf.Timestamp restore_end = 6; 50 | repeated int32 ports = 7; 51 | } 52 | 53 | message Image { 54 | bytes imageData = 1; 55 | } 56 | 57 | message CriuLazyPagesRequest { 58 | string checkpoint_path = 1; 59 | string address = 2; 60 | int32 port = 3; 61 | bool tls = 4; 62 | } 63 | 64 | message PullImageRequest { 65 | string image_id = 1; 66 | } 67 | -------------------------------------------------------------------------------- /api/node/v1/node_ttrpc.pb.go: -------------------------------------------------------------------------------- 1 | // Code generated by protoc-gen-go-ttrpc. DO NOT EDIT. 
2 | // source: node.proto 3 | package v1 4 | 5 | import ( 6 | context "context" 7 | ttrpc "github.com/containerd/ttrpc" 8 | emptypb "google.golang.org/protobuf/types/known/emptypb" 9 | ) 10 | 11 | type NodeService interface { 12 | Evac(context.Context, *EvacRequest) (*EvacResponse, error) 13 | PrepareEvac(context.Context, *EvacRequest) (*EvacResponse, error) 14 | Restore(context.Context, *RestoreRequest) (*RestoreResponse, error) 15 | FinishRestore(context.Context, *RestoreRequest) (*RestoreResponse, error) 16 | NewCriuLazyPages(context.Context, *CriuLazyPagesRequest) (*emptypb.Empty, error) 17 | PullImage(context.Context, *PullImageRequest, Node_PullImageServer) error 18 | } 19 | 20 | type Node_PullImageServer interface { 21 | Send(*Image) error 22 | ttrpc.StreamServer 23 | } 24 | 25 | type nodePullImageServer struct { 26 | ttrpc.StreamServer 27 | } 28 | 29 | func (x *nodePullImageServer) Send(m *Image) error { 30 | return x.StreamServer.SendMsg(m) 31 | } 32 | 33 | func RegisterNodeService(srv *ttrpc.Server, svc NodeService) { 34 | srv.RegisterService("zeropod.node.v1.Node", &ttrpc.ServiceDesc{ 35 | Methods: map[string]ttrpc.Method{ 36 | "Evac": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 37 | var req EvacRequest 38 | if err := unmarshal(&req); err != nil { 39 | return nil, err 40 | } 41 | return svc.Evac(ctx, &req) 42 | }, 43 | "PrepareEvac": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 44 | var req EvacRequest 45 | if err := unmarshal(&req); err != nil { 46 | return nil, err 47 | } 48 | return svc.PrepareEvac(ctx, &req) 49 | }, 50 | "Restore": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 51 | var req RestoreRequest 52 | if err := unmarshal(&req); err != nil { 53 | return nil, err 54 | } 55 | return svc.Restore(ctx, &req) 56 | }, 57 | "FinishRestore": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 58 | var req RestoreRequest 59 | if err := unmarshal(&req); err != nil { 60 | return nil, err 61 | } 62 | return svc.FinishRestore(ctx, &req) 63 | }, 64 | "NewCriuLazyPages": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 65 | var req CriuLazyPagesRequest 66 | if err := unmarshal(&req); err != nil { 67 | return nil, err 68 | } 69 | return svc.NewCriuLazyPages(ctx, &req) 70 | }, 71 | }, 72 | Streams: map[string]ttrpc.Stream{ 73 | "PullImage": { 74 | Handler: func(ctx context.Context, stream ttrpc.StreamServer) (interface{}, error) { 75 | m := new(PullImageRequest) 76 | if err := stream.RecvMsg(m); err != nil { 77 | return nil, err 78 | } 79 | return nil, svc.PullImage(ctx, m, &nodePullImageServer{stream}) 80 | }, 81 | StreamingClient: false, 82 | StreamingServer: true, 83 | }, 84 | }, 85 | }) 86 | } 87 | 88 | type NodeClient interface { 89 | Evac(context.Context, *EvacRequest) (*EvacResponse, error) 90 | PrepareEvac(context.Context, *EvacRequest) (*EvacResponse, error) 91 | Restore(context.Context, *RestoreRequest) (*RestoreResponse, error) 92 | FinishRestore(context.Context, *RestoreRequest) (*RestoreResponse, error) 93 | NewCriuLazyPages(context.Context, *CriuLazyPagesRequest) (*emptypb.Empty, error) 94 | PullImage(context.Context, *PullImageRequest) (Node_PullImageClient, error) 95 | } 96 | 97 | type nodeClient struct { 98 | client *ttrpc.Client 99 | } 100 | 101 | func NewNodeClient(client *ttrpc.Client) NodeClient { 102 | return &nodeClient{ 103 | client: client, 104 | } 105 | } 106 | 107 | func (c 
*nodeClient) Evac(ctx context.Context, req *EvacRequest) (*EvacResponse, error) { 108 | var resp EvacResponse 109 | if err := c.client.Call(ctx, "zeropod.node.v1.Node", "Evac", req, &resp); err != nil { 110 | return nil, err 111 | } 112 | return &resp, nil 113 | } 114 | 115 | func (c *nodeClient) PrepareEvac(ctx context.Context, req *EvacRequest) (*EvacResponse, error) { 116 | var resp EvacResponse 117 | if err := c.client.Call(ctx, "zeropod.node.v1.Node", "PrepareEvac", req, &resp); err != nil { 118 | return nil, err 119 | } 120 | return &resp, nil 121 | } 122 | 123 | func (c *nodeClient) Restore(ctx context.Context, req *RestoreRequest) (*RestoreResponse, error) { 124 | var resp RestoreResponse 125 | if err := c.client.Call(ctx, "zeropod.node.v1.Node", "Restore", req, &resp); err != nil { 126 | return nil, err 127 | } 128 | return &resp, nil 129 | } 130 | 131 | func (c *nodeClient) FinishRestore(ctx context.Context, req *RestoreRequest) (*RestoreResponse, error) { 132 | var resp RestoreResponse 133 | if err := c.client.Call(ctx, "zeropod.node.v1.Node", "FinishRestore", req, &resp); err != nil { 134 | return nil, err 135 | } 136 | return &resp, nil 137 | } 138 | 139 | func (c *nodeClient) NewCriuLazyPages(ctx context.Context, req *CriuLazyPagesRequest) (*emptypb.Empty, error) { 140 | var resp emptypb.Empty 141 | if err := c.client.Call(ctx, "zeropod.node.v1.Node", "NewCriuLazyPages", req, &resp); err != nil { 142 | return nil, err 143 | } 144 | return &resp, nil 145 | } 146 | 147 | func (c *nodeClient) PullImage(ctx context.Context, req *PullImageRequest) (Node_PullImageClient, error) { 148 | stream, err := c.client.NewStream(ctx, &ttrpc.StreamDesc{ 149 | StreamingClient: false, 150 | StreamingServer: true, 151 | }, "zeropod.node.v1.Node", "PullImage", req) 152 | if err != nil { 153 | return nil, err 154 | } 155 | x := &nodePullImageClient{stream} 156 | return x, nil 157 | } 158 | 159 | type Node_PullImageClient interface { 160 | Recv() (*Image, error) 161 | ttrpc.ClientStream 162 | } 163 | 164 | type nodePullImageClient struct { 165 | ttrpc.ClientStream 166 | } 167 | 168 | func (x *nodePullImageClient) Recv() (*Image, error) { 169 | m := new(Image) 170 | if err := x.ClientStream.RecvMsg(m); err != nil { 171 | return nil, err 172 | } 173 | return m, nil 174 | } 175 | -------------------------------------------------------------------------------- /api/runtime/generate.go: -------------------------------------------------------------------------------- 1 | //go:build generate 2 | // +build generate 3 | 4 | // Remove existing CRDs 5 | //go:generate rm -rf ../package/crds 6 | 7 | // Generate deepcopy methodsets and CRD manifests 8 | //go:generate go run -tags generate sigs.k8s.io/controller-tools/cmd/controller-gen object:headerFile="../../hack/boilerplate.go.txt" paths=./... crd:crdVersions=v1 output:artifacts:config=../../config/crds 9 | 10 | // Package runtime contains API types for all runtime related resources. 
11 | package runtime 12 | 13 | import ( 14 | _ "sigs.k8s.io/controller-tools/cmd/controller-gen" //nolint:typecheck 15 | ) 16 | -------------------------------------------------------------------------------- /api/runtime/v1/meta.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | const ( 4 | RuntimeClassName = "zeropod" 5 | ) 6 | -------------------------------------------------------------------------------- /api/runtime/v1/register.go: -------------------------------------------------------------------------------- 1 | // Package v1 contains API Schema definitions for the node v1 API group. 2 | // +groupName=runtime.zeropod.ctrox.dev 3 | // +versionName=v1 4 | package v1 5 | 6 | import ( 7 | reflect "reflect" 8 | 9 | "k8s.io/apimachinery/pkg/runtime/schema" 10 | "sigs.k8s.io/controller-runtime/pkg/scheme" 11 | ) 12 | 13 | const ( 14 | Group = "runtime.zeropod.ctrox.dev" 15 | Version = "v1" 16 | ) 17 | 18 | var ( 19 | // GroupVersion is group version used to register these objects. 20 | GroupVersion = schema.GroupVersion{Group: Group, Version: Version} 21 | 22 | // SchemeBuilder is used to add go types to the GroupVersionKind scheme. 23 | SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} 24 | 25 | // AddToScheme adds the types in this group-version to the given scheme. 26 | AddToScheme = SchemeBuilder.AddToScheme 27 | ) 28 | 29 | var ( 30 | MigrationKind = reflect.TypeOf(Migration{}).Name() 31 | MigrationGroupKind = schema.GroupKind{Group: Group, Kind: MigrationKind}.String() 32 | MigrationKindAPIVersion = MigrationKind + "." + GroupVersion.String() 33 | MigrationGroupVersionKind = GroupVersion.WithKind(MigrationKind) 34 | ) 35 | 36 | func init() { 37 | SchemeBuilder.Register(&Migration{}, &MigrationList{}) 38 | } 39 | -------------------------------------------------------------------------------- /api/runtime/v1/types.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | import ( 4 | "fmt" 5 | 6 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 7 | ) 8 | 9 | type MigrationServer struct { 10 | Host string `json:"host"` 11 | Port int `json:"port"` 12 | } 13 | 14 | func (ms MigrationServer) Address() string { 15 | return fmt.Sprintf("%s:%d", ms.Host, ms.Port) 16 | } 17 | 18 | // +kubebuilder:object:generate:=true 19 | type MigrationSpec struct { 20 | // LiveMigration indicates if this migration is done live (lazy) or not. If 21 | // set, the source node will setup a page server to serve memory pages 22 | // during live migration. If false, the image copy will include all memory 23 | // pages, which might result in a slower migration. 24 | // +optional 25 | LiveMigration bool `json:"liveMigration"` 26 | // SourceNode of the pod to be migrated 27 | SourceNode string `json:"sourceNode"` 28 | // TargetNode of the pod to be migrated 29 | // +optional 30 | TargetNode string `json:"targetNode,omitempty"` 31 | // SourcePod of the migration 32 | // +optional 33 | SourcePod string `json:"sourcePod,omitempty"` 34 | // TargetPod of the migration 35 | // +optional 36 | TargetPod string `json:"targetPod,omitempty"` 37 | // PodTemplateHash of the source pod. This is used to find a suitable target 38 | // pod. 
39 | PodTemplateHash string `json:"podTemplateHash"` 40 | // Containers to be migrated 41 | // +listType:=map 42 | // +listMapKey:=name 43 | Containers []MigrationContainer `json:"containers"` 44 | } 45 | 46 | // +kubebuilder:object:generate:=true 47 | type MigrationContainer struct { 48 | Name string `json:"name"` 49 | ID string `json:"id"` 50 | // ImageServer to pull the CRIU checkpoint image from. 51 | // +optional 52 | ImageServer *MigrationServer `json:"imageServer,omitempty"` 53 | // PageServer to pull the memory pages from during lazy migration. 54 | // +optional 55 | PageServer *MigrationServer `json:"pageServer,omitempty"` 56 | 57 | Ports []int32 `json:"ports,omitempty"` 58 | } 59 | 60 | // +kubebuilder:object:generate:=true 61 | type MigrationStatus struct { 62 | // Containers indicates the status of the individual container migrations. 63 | // +listType:=map 64 | // +listMapKey:=name 65 | Containers []MigrationContainerStatus `json:"containers"` 66 | } 67 | 68 | // +kubebuilder:object:generate:=true 69 | type MigrationContainerStatus struct { 70 | Name string `json:"name"` 71 | Condition MigrationCondition `json:"condition"` 72 | PausedAt metav1.MicroTime `json:"pausedAt,omitempty"` 73 | RestoredAt metav1.MicroTime `json:"restoredAt,omitempty"` 74 | MigrationDuration metav1.Duration `json:"migrationDuration,omitempty"` 75 | } 76 | 77 | type MigrationPhase string 78 | 79 | const ( 80 | MigrationPhasePending MigrationPhase = "Pending" 81 | MigrationPhaseRunning MigrationPhase = "Running" 82 | MigrationPhaseCompleted MigrationPhase = "Completed" 83 | MigrationPhaseFailed MigrationPhase = "Failed" 84 | MigrationPhaseUnclaimed MigrationPhase = "Unclaimed" 85 | ) 86 | 87 | type MigrationCondition struct { 88 | Phase MigrationPhase `json:"phase,omitempty"` 89 | } 90 | 91 | // +kubebuilder:object:root=true 92 | // +kubebuilder:storageversion 93 | // +kubebuilder:subresource:status 94 | // +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.containers[*].condition.phase" 95 | // +kubebuilder:printcolumn:name="Live",type="boolean",JSONPath=".spec.liveMigration" 96 | // +kubebuilder:printcolumn:name="Duration",type="string",JSONPath=".status.containers[*].migrationDuration" 97 | // +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" 98 | // +kubebuilder:resource:scope=Namespaced 99 | // Migration tracks container live migrations done by zeropod. 100 | type Migration struct { 101 | metav1.TypeMeta `json:",inline"` 102 | metav1.ObjectMeta `json:"metadata,omitempty"` 103 | 104 | Spec MigrationSpec `json:"spec,omitempty"` 105 | Status MigrationStatus `json:"status,omitempty"` 106 | } 107 | 108 | // +kubebuilder:object:root=true 109 | 110 | // MigrationList contains a list of Migration. 111 | type MigrationList struct { 112 | metav1.TypeMeta `json:",inline"` 113 | metav1.ListMeta `json:"metadata,omitempty"` 114 | Items []Migration `json:"items"` 115 | } 116 | -------------------------------------------------------------------------------- /api/runtime/v1/zz_generated.deepcopy.go: -------------------------------------------------------------------------------- 1 | //go:build !ignore_autogenerated 2 | 3 | /* 4 | Copyright 2025 The zeropod authors. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | // Code generated by controller-gen. DO NOT EDIT. 20 | 21 | package v1 22 | 23 | import ( 24 | runtime "k8s.io/apimachinery/pkg/runtime" 25 | ) 26 | 27 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 28 | func (in *Migration) DeepCopyInto(out *Migration) { 29 | *out = *in 30 | out.TypeMeta = in.TypeMeta 31 | in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) 32 | in.Spec.DeepCopyInto(&out.Spec) 33 | in.Status.DeepCopyInto(&out.Status) 34 | } 35 | 36 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Migration. 37 | func (in *Migration) DeepCopy() *Migration { 38 | if in == nil { 39 | return nil 40 | } 41 | out := new(Migration) 42 | in.DeepCopyInto(out) 43 | return out 44 | } 45 | 46 | // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 47 | func (in *Migration) DeepCopyObject() runtime.Object { 48 | if c := in.DeepCopy(); c != nil { 49 | return c 50 | } 51 | return nil 52 | } 53 | 54 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 55 | func (in *MigrationContainer) DeepCopyInto(out *MigrationContainer) { 56 | *out = *in 57 | if in.ImageServer != nil { 58 | in, out := &in.ImageServer, &out.ImageServer 59 | *out = new(MigrationServer) 60 | **out = **in 61 | } 62 | if in.PageServer != nil { 63 | in, out := &in.PageServer, &out.PageServer 64 | *out = new(MigrationServer) 65 | **out = **in 66 | } 67 | if in.Ports != nil { 68 | in, out := &in.Ports, &out.Ports 69 | *out = make([]int32, len(*in)) 70 | copy(*out, *in) 71 | } 72 | } 73 | 74 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigrationContainer. 75 | func (in *MigrationContainer) DeepCopy() *MigrationContainer { 76 | if in == nil { 77 | return nil 78 | } 79 | out := new(MigrationContainer) 80 | in.DeepCopyInto(out) 81 | return out 82 | } 83 | 84 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 85 | func (in *MigrationContainerStatus) DeepCopyInto(out *MigrationContainerStatus) { 86 | *out = *in 87 | out.Condition = in.Condition 88 | in.PausedAt.DeepCopyInto(&out.PausedAt) 89 | in.RestoredAt.DeepCopyInto(&out.RestoredAt) 90 | out.MigrationDuration = in.MigrationDuration 91 | } 92 | 93 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigrationContainerStatus. 94 | func (in *MigrationContainerStatus) DeepCopy() *MigrationContainerStatus { 95 | if in == nil { 96 | return nil 97 | } 98 | out := new(MigrationContainerStatus) 99 | in.DeepCopyInto(out) 100 | return out 101 | } 102 | 103 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
104 | func (in *MigrationList) DeepCopyInto(out *MigrationList) { 105 | *out = *in 106 | out.TypeMeta = in.TypeMeta 107 | in.ListMeta.DeepCopyInto(&out.ListMeta) 108 | if in.Items != nil { 109 | in, out := &in.Items, &out.Items 110 | *out = make([]Migration, len(*in)) 111 | for i := range *in { 112 | (*in)[i].DeepCopyInto(&(*out)[i]) 113 | } 114 | } 115 | } 116 | 117 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigrationList. 118 | func (in *MigrationList) DeepCopy() *MigrationList { 119 | if in == nil { 120 | return nil 121 | } 122 | out := new(MigrationList) 123 | in.DeepCopyInto(out) 124 | return out 125 | } 126 | 127 | // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 128 | func (in *MigrationList) DeepCopyObject() runtime.Object { 129 | if c := in.DeepCopy(); c != nil { 130 | return c 131 | } 132 | return nil 133 | } 134 | 135 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 136 | func (in *MigrationSpec) DeepCopyInto(out *MigrationSpec) { 137 | *out = *in 138 | if in.Containers != nil { 139 | in, out := &in.Containers, &out.Containers 140 | *out = make([]MigrationContainer, len(*in)) 141 | for i := range *in { 142 | (*in)[i].DeepCopyInto(&(*out)[i]) 143 | } 144 | } 145 | } 146 | 147 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigrationSpec. 148 | func (in *MigrationSpec) DeepCopy() *MigrationSpec { 149 | if in == nil { 150 | return nil 151 | } 152 | out := new(MigrationSpec) 153 | in.DeepCopyInto(out) 154 | return out 155 | } 156 | 157 | // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 158 | func (in *MigrationStatus) DeepCopyInto(out *MigrationStatus) { 159 | *out = *in 160 | if in.Containers != nil { 161 | in, out := &in.Containers, &out.Containers 162 | *out = make([]MigrationContainerStatus, len(*in)) 163 | for i := range *in { 164 | (*in)[i].DeepCopyInto(&(*out)[i]) 165 | } 166 | } 167 | } 168 | 169 | // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigrationStatus. 
170 | func (in *MigrationStatus) DeepCopy() *MigrationStatus { 171 | if in == nil { 172 | return nil 173 | } 174 | out := new(MigrationStatus) 175 | in.DeepCopyInto(out) 176 | return out 177 | } 178 | -------------------------------------------------------------------------------- /api/shim/v1/shim.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package zeropod.shim.v1; 4 | option go_package = "github.com/ctrox/zeropod/api/shim/v1/;v1"; 5 | 6 | import "google/protobuf/empty.proto"; 7 | import "google/protobuf/timestamp.proto"; 8 | import "google/protobuf/duration.proto"; 9 | 10 | service Shim { 11 | rpc Metrics(MetricsRequest) returns (MetricsResponse); 12 | rpc GetStatus(ContainerRequest) returns (ContainerStatus); 13 | rpc SubscribeStatus(SubscribeStatusRequest) returns (stream ContainerStatus); 14 | } 15 | 16 | message MetricsRequest { 17 | google.protobuf.Empty empty = 1; 18 | } 19 | 20 | message SubscribeStatusRequest { 21 | google.protobuf.Empty empty = 1; 22 | } 23 | 24 | message MetricsResponse { 25 | repeated ContainerMetrics metrics = 1; 26 | } 27 | 28 | message ContainerRequest { 29 | string id = 1; 30 | } 31 | 32 | enum ContainerPhase { 33 | SCALED_DOWN = 0; 34 | RUNNING = 1; 35 | STOPPING = 2; 36 | } 37 | 38 | message ContainerStatus { 39 | string id = 1; 40 | string name = 2; 41 | string pod_name = 3; 42 | string pod_namespace = 4; 43 | ContainerPhase phase = 5; 44 | } 45 | 46 | message ContainerMetrics { 47 | string name = 1; 48 | string pod_name = 2; 49 | string pod_namespace = 3; 50 | google.protobuf.Timestamp lastCheckpoint = 4; 51 | google.protobuf.Timestamp lastRestore = 5; 52 | google.protobuf.Duration lastCheckpointDuration = 6; 53 | google.protobuf.Duration lastRestoreDuration = 7; 54 | bool running = 8; 55 | } 56 | -------------------------------------------------------------------------------- /api/shim/v1/shim_ttrpc.pb.go: -------------------------------------------------------------------------------- 1 | // Code generated by protoc-gen-go-ttrpc. DO NOT EDIT. 
2 | // source: shim.proto 3 | package v1 4 | 5 | import ( 6 | context "context" 7 | ttrpc "github.com/containerd/ttrpc" 8 | ) 9 | 10 | type ShimService interface { 11 | Metrics(context.Context, *MetricsRequest) (*MetricsResponse, error) 12 | GetStatus(context.Context, *ContainerRequest) (*ContainerStatus, error) 13 | SubscribeStatus(context.Context, *SubscribeStatusRequest, Shim_SubscribeStatusServer) error 14 | } 15 | 16 | type Shim_SubscribeStatusServer interface { 17 | Send(*ContainerStatus) error 18 | ttrpc.StreamServer 19 | } 20 | 21 | type shimSubscribeStatusServer struct { 22 | ttrpc.StreamServer 23 | } 24 | 25 | func (x *shimSubscribeStatusServer) Send(m *ContainerStatus) error { 26 | return x.StreamServer.SendMsg(m) 27 | } 28 | 29 | func RegisterShimService(srv *ttrpc.Server, svc ShimService) { 30 | srv.RegisterService("zeropod.shim.v1.Shim", &ttrpc.ServiceDesc{ 31 | Methods: map[string]ttrpc.Method{ 32 | "Metrics": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 33 | var req MetricsRequest 34 | if err := unmarshal(&req); err != nil { 35 | return nil, err 36 | } 37 | return svc.Metrics(ctx, &req) 38 | }, 39 | "GetStatus": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 40 | var req ContainerRequest 41 | if err := unmarshal(&req); err != nil { 42 | return nil, err 43 | } 44 | return svc.GetStatus(ctx, &req) 45 | }, 46 | }, 47 | Streams: map[string]ttrpc.Stream{ 48 | "SubscribeStatus": { 49 | Handler: func(ctx context.Context, stream ttrpc.StreamServer) (interface{}, error) { 50 | m := new(SubscribeStatusRequest) 51 | if err := stream.RecvMsg(m); err != nil { 52 | return nil, err 53 | } 54 | return nil, svc.SubscribeStatus(ctx, m, &shimSubscribeStatusServer{stream}) 55 | }, 56 | StreamingClient: false, 57 | StreamingServer: true, 58 | }, 59 | }, 60 | }) 61 | } 62 | 63 | type ShimClient interface { 64 | Metrics(context.Context, *MetricsRequest) (*MetricsResponse, error) 65 | GetStatus(context.Context, *ContainerRequest) (*ContainerStatus, error) 66 | SubscribeStatus(context.Context, *SubscribeStatusRequest) (Shim_SubscribeStatusClient, error) 67 | } 68 | 69 | type shimClient struct { 70 | client *ttrpc.Client 71 | } 72 | 73 | func NewShimClient(client *ttrpc.Client) ShimClient { 74 | return &shimClient{ 75 | client: client, 76 | } 77 | } 78 | 79 | func (c *shimClient) Metrics(ctx context.Context, req *MetricsRequest) (*MetricsResponse, error) { 80 | var resp MetricsResponse 81 | if err := c.client.Call(ctx, "zeropod.shim.v1.Shim", "Metrics", req, &resp); err != nil { 82 | return nil, err 83 | } 84 | return &resp, nil 85 | } 86 | 87 | func (c *shimClient) GetStatus(ctx context.Context, req *ContainerRequest) (*ContainerStatus, error) { 88 | var resp ContainerStatus 89 | if err := c.client.Call(ctx, "zeropod.shim.v1.Shim", "GetStatus", req, &resp); err != nil { 90 | return nil, err 91 | } 92 | return &resp, nil 93 | } 94 | 95 | func (c *shimClient) SubscribeStatus(ctx context.Context, req *SubscribeStatusRequest) (Shim_SubscribeStatusClient, error) { 96 | stream, err := c.client.NewStream(ctx, &ttrpc.StreamDesc{ 97 | StreamingClient: false, 98 | StreamingServer: true, 99 | }, "zeropod.shim.v1.Shim", "SubscribeStatus", req) 100 | if err != nil { 101 | return nil, err 102 | } 103 | x := &shimSubscribeStatusClient{stream} 104 | return x, nil 105 | } 106 | 107 | type Shim_SubscribeStatusClient interface { 108 | Recv() (*ContainerStatus, error) 109 | ttrpc.ClientStream 110 | } 111 | 112 | type shimSubscribeStatusClient 
struct { 113 | ttrpc.ClientStream 114 | } 115 | 116 | func (x *shimSubscribeStatusClient) Recv() (*ContainerStatus, error) { 117 | m := new(ContainerStatus) 118 | if err := x.ClientStream.RecvMsg(m); err != nil { 119 | return nil, err 120 | } 121 | return m, nil 122 | } 123 | -------------------------------------------------------------------------------- /api/shim/v1/util.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | import ( 4 | "path/filepath" 5 | 6 | "github.com/containerd/containerd/v2/pkg/atomicfile" 7 | ) 8 | 9 | // WriteAddress writes an address file atomically 10 | func WriteAddress(path, address string) error { 11 | path, err := filepath.Abs(path) 12 | if err != nil { 13 | return err 14 | } 15 | f, err := atomicfile.New(path, 0o644) 16 | if err != nil { 17 | return err 18 | } 19 | _, err = f.Write([]byte(address)) 20 | if err != nil { 21 | f.Cancel() 22 | return err 23 | } 24 | return f.Close() 25 | } 26 | -------------------------------------------------------------------------------- /cmd/freezer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=$BUILDPLATFORM golang:1.24 AS builder 2 | 3 | WORKDIR /workspace 4 | COPY go.mod go.mod 5 | COPY go.sum go.sum 6 | RUN go mod download 7 | 8 | COPY cmd/freezer cmd/freezer 9 | 10 | RUN CGO_ENABLED=0 GOOS=linux GOARCH=$TARGETARCH GO111MODULE=on go build -ldflags "-s -w" -a -o freezer cmd/freezer/main.go 11 | 12 | FROM gcr.io/distroless/static-debian12 13 | COPY --from=builder /workspace/freezer / 14 | ENTRYPOINT ["/freezer"] 15 | -------------------------------------------------------------------------------- /cmd/freezer/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "flag" 6 | "fmt" 7 | "io" 8 | "log/slog" 9 | "math/rand/v2" 10 | "net/http" 11 | "time" 12 | ) 13 | 14 | type Freeze struct { 15 | LastObservation time.Time `json:"lastObservation"` 16 | LastFreezeDuration time.Duration `json:"lastFreezeDuration"` 17 | Data string `json:"data"` 18 | } 19 | 20 | var ballast []byte 21 | 22 | // Freezer helps with e2e testing of migrations. It allocates the specified 23 | // amount of memory and constantly stores a timestamp in memory. If the last 24 | // timestamp is older than 50 milliseconds, it will detect that as a "freeze". It 25 | // also exposes a simple HTTP API to get the last freeze and its duration and an 26 | // endpoint to set some string data, used to store arbitrary state.
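//
// Example interaction, for illustration only (the port matches the
// ListenAndServe call below; the payload is an arbitrary sample):
//
//	curl localhost:8080/get
//	curl -d '{"data":"e2e-state"}' localhost:8080/set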
27 | func main() { 28 | mem := flag.Int("memory", 0, "memory to allocate in MiB") 29 | flag.Parse() 30 | 31 | allocateMemory(*mem) 32 | 33 | f := Freeze{ 34 | LastObservation: time.Now(), 35 | } 36 | 37 | http.HandleFunc("/get", func(w http.ResponseWriter, r *http.Request) { 38 | b, err := json.Marshal(&f) 39 | if err != nil { 40 | http.Error(w, err.Error(), http.StatusInternalServerError) 41 | } 42 | slog.Info("get called", "data", b) 43 | fmt.Fprintf(w, "%s", b) 44 | }) 45 | 46 | http.HandleFunc("/set", func(w http.ResponseWriter, r *http.Request) { 47 | b, err := io.ReadAll(r.Body) 48 | if err != nil { 49 | http.Error(w, err.Error(), http.StatusInternalServerError) 50 | slog.Error("reading req body", "error", err) 51 | return 52 | } 53 | fr := Freeze{} 54 | if err := json.Unmarshal(b, &fr); err != nil { 55 | http.Error(w, err.Error(), http.StatusInternalServerError) 56 | slog.Error("unmarshal req body", "error", err) 57 | return 58 | } 59 | f.Data = fr.Data 60 | slog.Info("set called", "data", b) 61 | 62 | freezeJSON, err := json.Marshal(&f) 63 | if err != nil { 64 | http.Error(w, err.Error(), http.StatusInternalServerError) 65 | slog.Error("marshal req body", "error", err) 66 | return 67 | } 68 | fmt.Fprintf(w, "%s", freezeJSON) 69 | }) 70 | 71 | go http.ListenAndServe(":8080", nil) 72 | for { 73 | since := time.Since(f.LastObservation) 74 | if since > time.Millisecond*50 { 75 | f.LastFreezeDuration = since 76 | slog.Info("observed a freeze", "duration", since.String()) 77 | } 78 | f.LastObservation = time.Now() 79 | time.Sleep(time.Millisecond) 80 | } 81 | } 82 | 83 | func allocateMemory(mem int) { 84 | slog.Info("allocating memory", "bytes", mem<<20) 85 | // we don't really care about the randomness too much, we want something 86 | // quick that isn't so easily compressible. 87 | r := rand.New(rand.NewPCG(rand.Uint64(), rand.Uint64())) 88 | ballast = make([]byte, mem<<20) 89 | for i := range len(ballast) { 90 | ballast[i] = byte(r.UintN(255)) 91 | } 92 | slog.Info("done allocating", "bytes", mem<<20) 93 | } 94 | -------------------------------------------------------------------------------- /cmd/installer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=$BUILDPLATFORM golang:1.24 AS builder 2 | 3 | WORKDIR /workspace 4 | COPY go.mod go.mod 5 | COPY go.sum go.sum 6 | RUN go mod download 7 | RUN apt-get update && apt-get install -y make 8 | 9 | COPY . . 10 | 11 | ARG TARGETARCH 12 | RUN CGO_ENABLED=0 GOOS=linux GOARCH=$TARGETARCH GO111MODULE=on go build -ldflags "-s -w" -a -o zeropod-installer cmd/installer/main.go 13 | RUN GOARCH=$TARGETARCH make build 14 | 15 | FROM gcr.io/distroless/static-debian12 16 | WORKDIR /build 17 | COPY --from=builder /workspace/zeropod-installer . 18 | COPY --from=builder /workspace/containerd-shim-zeropod-v2 . 19 | 20 | ENTRYPOINT ["/build/zeropod-installer"] 21 | -------------------------------------------------------------------------------- /cmd/manager/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG CRIU_IMAGE_NAME=ghcr.io/ctrox/zeropod-criu 2 | ARG CRIU_VERSION=v4.0 3 | 4 | FROM --platform=$BUILDPLATFORM golang:1.24 AS builder 5 | 6 | WORKDIR /workspace 7 | COPY go.mod go.mod 8 | COPY go.sum go.sum 9 | RUN go mod download 10 | 11 | COPY . .
12 | 13 | ARG TARGETARCH 14 | RUN CGO_ENABLED=0 GOOS=linux GOARCH=$TARGETARCH GO111MODULE=on go build -ldflags "-s -w" -a -o zeropod-manager cmd/manager/main.go 15 | 16 | FROM ${CRIU_IMAGE_NAME}:${CRIU_VERSION} AS criu 17 | # the manager runs criu lazy-pages, which needs libc so we can't use the static 18 | # image here 19 | FROM gcr.io/distroless/base-debian12 20 | COPY --from=builder /workspace/zeropod-manager / 21 | COPY --from=criu /bin /bin 22 | COPY --from=criu /lib /lib 23 | CMD ["/zeropod-manager"] 24 | -------------------------------------------------------------------------------- /cmd/manager/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "flag" 7 | "fmt" 8 | "log/slog" 9 | "net/http" 10 | "os" 11 | "os/signal" 12 | "syscall" 13 | 14 | v1 "github.com/ctrox/zeropod/api/runtime/v1" 15 | "github.com/ctrox/zeropod/manager" 16 | "github.com/ctrox/zeropod/manager/node" 17 | "github.com/ctrox/zeropod/socket" 18 | "github.com/prometheus/client_golang/prometheus" 19 | "github.com/prometheus/client_golang/prometheus/promhttp" 20 | corev1 "k8s.io/api/core/v1" 21 | "k8s.io/apimachinery/pkg/runtime" 22 | "sigs.k8s.io/controller-runtime/pkg/client/config" 23 | ctrlmanager "sigs.k8s.io/controller-runtime/pkg/manager" 24 | "sigs.k8s.io/controller-runtime/pkg/metrics/server" 25 | ) 26 | 27 | var ( 28 | metricsAddr = flag.String("metrics-addr", ":8080", "address of the metrics server") 29 | nodeServerAddr = flag.String("node-server-addr", ":8090", "address of the node server") 30 | debug = flag.Bool("debug", false, "enable debug logs") 31 | inPlaceScaling = flag.Bool("in-place-scaling", false, 32 | "enable in-place resource scaling, requires InPlacePodVerticalScaling feature flag") 33 | statusLabels = flag.Bool("status-labels", false, "update pod labels to reflect container status") 34 | ) 35 | 36 | func main() { 37 | flag.Parse() 38 | 39 | opts := &slog.HandlerOptions{Level: slog.LevelInfo} 40 | if *debug { 41 | opts.Level = slog.LevelDebug 42 | } 43 | log := slog.New(slog.NewJSONHandler(os.Stdout, opts)) 44 | log.Info("starting manager", "metrics-addr", *metricsAddr) 45 | 46 | ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) 47 | defer stop() 48 | 49 | if err := manager.AttachRedirectors(ctx, log); err != nil { 50 | log.Error("attaching redirectors", "err", err) 51 | os.Exit(1) 52 | } 53 | 54 | cleanSocketTracker, err := socket.LoadEBPFTracker() 55 | if err != nil { 56 | log.Error("loading socket tracker", "err", err) 57 | os.Exit(1) 58 | } 59 | 60 | mgr, err := newControllerManager() 61 | if err != nil { 62 | log.Error("creating controller manager", "err", err) 63 | os.Exit(1) 64 | } 65 | 66 | podHandlers := []manager.PodHandler{} 67 | if *statusLabels { 68 | podHandlers = append(podHandlers, manager.NewPodLabeller(log)) 69 | } 70 | if *inPlaceScaling { 71 | podHandlers = append(podHandlers, manager.NewPodScaler(log)) 72 | } 73 | 74 | col := manager.NewCollector() 75 | sc := manager.SubscriberConfig{Log: log, Kube: mgr.GetClient(), Collector: col} 76 | if err := manager.StartSubscribers(ctx, sc, podHandlers...); err != nil { 77 | log.Error("starting subscribers", "err", err) 78 | os.Exit(1) 79 | } 80 | 81 | registry := prometheus.NewRegistry() 82 | if err := registry.Register(col); err != nil { 83 | slog.Error("registering metrics", "err", err) 84 | os.Exit(1) 85 | } 86 | 87 | mux := http.NewServeMux() 88 | mux.Handle("/metrics", promhttp.HandlerFor( 
89 | registry, 90 | promhttp.HandlerOpts{ 91 | EnableOpenMetrics: true, 92 | })) 93 | server := &http.Server{Addr: *metricsAddr, Handler: mux} 94 | 95 | go func() { 96 | if err := server.ListenAndServe(); err != nil { 97 | if !errors.Is(err, http.ErrServerClosed) { 98 | log.Error("serving metrics", "err", err) 99 | os.Exit(1) 100 | } 101 | } 102 | }() 103 | 104 | nodeServer, err := node.NewServer(*nodeServerAddr, mgr.GetClient(), log) 105 | if err != nil { 106 | log.Error("creating node server", "err", err) 107 | os.Exit(1) 108 | } 109 | go nodeServer.Start(ctx) 110 | 111 | if err := manager.NewPodController(ctx, mgr, log); err != nil { 112 | log.Error("running pod controller", "error", err) 113 | } 114 | 115 | go func() { 116 | if err := mgr.Start(ctx); err != nil { 117 | log.Error("starting controller manager", "error", err) 118 | os.Exit(1) 119 | } 120 | }() 121 | 122 | <-ctx.Done() 123 | log.Info("stopping manager") 124 | cleanSocketTracker() 125 | if err := server.Shutdown(ctx); err != nil { 126 | log.Error("shutting down server", "err", err) 127 | } 128 | } 129 | 130 | func newControllerManager() (ctrlmanager.Manager, error) { 131 | cfg, err := config.GetConfig() 132 | if err != nil { 133 | return nil, fmt.Errorf("getting client config: %w", err) 134 | } 135 | scheme := runtime.NewScheme() 136 | if err := corev1.AddToScheme(scheme); err != nil { 137 | return nil, err 138 | } 139 | if err := v1.AddToScheme(scheme); err != nil { 140 | return nil, err 141 | } 142 | mgr, err := ctrlmanager.New(cfg, ctrlmanager.Options{ 143 | Scheme: scheme, Metrics: server.Options{BindAddress: "0"}, 144 | }) 145 | if err != nil { 146 | return nil, err 147 | } 148 | return mgr, nil 149 | } 150 | -------------------------------------------------------------------------------- /cmd/shim/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "io" 6 | "path/filepath" 7 | 8 | "github.com/containerd/containerd/api/types" 9 | "github.com/containerd/containerd/v2/cmd/containerd-shim-runc-v2/manager" 10 | "github.com/containerd/containerd/v2/pkg/shim" 11 | shimv1 "github.com/ctrox/zeropod/api/shim/v1" 12 | zshim "github.com/ctrox/zeropod/shim" 13 | _ "github.com/ctrox/zeropod/shim/task/plugin" 14 | ) 15 | 16 | // compatManager is a wrapper around [shim.Manager] that allows us to control 17 | // the task API version. This makes it possible to use the containerd v2 shim 18 | // with containerd 1.7. 19 | type compatManager struct { 20 | mgr shim.Manager 21 | } 22 | 23 | func (cm compatManager) Name() string { 24 | return cm.mgr.Name() 25 | } 26 | 27 | func (cm compatManager) Start(ctx context.Context, id string, opts shim.StartOpts) (shim.BootstrapParams, error) { 28 | params, err := cm.mgr.Start(ctx, id, opts) 29 | if err != nil { 30 | return params, err 31 | } 32 | // TODO: would be nice to detect the containerd version and set 3 for 2.0+. 33 | // So far it looks like this is not possible. Since containerd v2 works with 34 | // task v2 right now, this is not a big issue. 
35 | params.Version = 2 36 | path, err := filepath.Abs("address") 37 | if err != nil { 38 | return params, err 39 | } 40 | if err := shimv1.WriteAddress(path, params.Address); err != nil { 41 | return params, err 42 | } 43 | return params, err 44 | } 45 | 46 | func (cm compatManager) Stop(ctx context.Context, id string) (shim.StopStatus, error) { 47 | return cm.mgr.Stop(ctx, id) 48 | } 49 | 50 | func (cm compatManager) Info(ctx context.Context, optionsR io.Reader) (*types.RuntimeInfo, error) { 51 | info, err := cm.mgr.Info(ctx, optionsR) 52 | return info, err 53 | } 54 | 55 | func newCompatManager() shim.Manager { 56 | return &compatManager{mgr: manager.NewShimManager(zshim.RuntimeName)} 57 | } 58 | 59 | func main() { 60 | shim.Run(context.Background(), newCompatManager()) 61 | } 62 | -------------------------------------------------------------------------------- /config/base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | namespace: zeropod-system 4 | resources: 5 | - namespace.yaml 6 | - node-daemonset.yaml 7 | - rbac.yaml 8 | components: 9 | - ../crds 10 | -------------------------------------------------------------------------------- /config/base/namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: ns 5 | -------------------------------------------------------------------------------- /config/base/node-daemonset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: zeropod-node 5 | namespace: zeropod-system 6 | labels: 7 | app.kubernetes.io/name: zeropod-node 8 | spec: 9 | selector: 10 | matchLabels: 11 | app.kubernetes.io/name: zeropod-node 12 | template: 13 | metadata: 14 | labels: 15 | app.kubernetes.io/name: zeropod-node 16 | spec: 17 | serviceAccountName: zeropod-node 18 | nodeSelector: 19 | zeropod.ctrox.dev/node: "true" 20 | initContainers: 21 | - name: installer 22 | image: installer 23 | args: [] 24 | imagePullPolicy: IfNotPresent 25 | volumeMounts: 26 | - mountPath: /host 27 | name: hostroot 28 | - mountPath: /etc/containerd 29 | name: containerd-etc 30 | - mountPath: /run/containerd 31 | name: containerd-run 32 | - mountPath: /opt/zeropod 33 | name: zeropod-opt 34 | - mountPath: /run/systemd 35 | name: systemd-run 36 | - mountPath: /etc/criu 37 | name: criu-etc 38 | - mountPath: /tls 39 | name: tls 40 | - name: prepare-bpf-fs 41 | args: 42 | - mount | grep "/sys/fs/bpf type bpf" || mount -t bpf bpf /sys/fs/bpf 43 | command: 44 | - /bin/sh 45 | - -c 46 | - -- 47 | image: alpine:3.19.1 48 | imagePullPolicy: IfNotPresent 49 | securityContext: 50 | privileged: true 51 | volumeMounts: 52 | - mountPath: /sys/fs/bpf 53 | mountPropagation: Bidirectional 54 | name: bpf 55 | containers: 56 | - name: manager 57 | image: manager 58 | imagePullPolicy: IfNotPresent 59 | command: ["/zeropod-manager"] 60 | args: 61 | - -metrics-addr=:8080 62 | ports: 63 | - name: metrics 64 | containerPort: 8080 65 | - name: node-server 66 | containerPort: 8090 67 | env: 68 | - name: POD_IP 69 | valueFrom: 70 | fieldRef: 71 | fieldPath: status.podIP 72 | - name: NODE_NAME 73 | valueFrom: 74 | fieldRef: 75 | fieldPath: spec.nodeName 76 | volumeMounts: 77 | - mountPath: /run/zeropod 78 | name: zeropod-run 79 | - mountPath: /var/lib/zeropod 80 | name: zeropod-var 81 | - 
mountPath: /hostproc 82 | name: hostproc 83 | - mountPath: /sys/fs/bpf 84 | name: bpf 85 | - mountPath: /tls 86 | name: tls 87 | securityContext: 88 | appArmorProfile: 89 | type: Unconfined 90 | capabilities: 91 | add: 92 | # for net nsenter 93 | - "SYS_PTRACE" 94 | - "SYS_ADMIN" 95 | # for attaching qdiscs/filters 96 | - "NET_ADMIN" 97 | # for setting memlock rlimit 98 | - SYS_RESOURCE 99 | tolerations: 100 | - operator: Exists 101 | volumes: 102 | - name: hostroot 103 | hostPath: 104 | path: / 105 | - name: containerd-etc 106 | hostPath: 107 | path: /etc/containerd 108 | - name: containerd-run 109 | hostPath: 110 | path: /run/containerd 111 | - name: zeropod-var 112 | hostPath: 113 | path: /var/lib/zeropod 114 | - name: zeropod-opt 115 | hostPath: 116 | path: /opt/zeropod 117 | - name: zeropod-run 118 | hostPath: 119 | path: /run/zeropod 120 | - name: systemd-run 121 | hostPath: 122 | path: /run/systemd 123 | - name: criu-etc 124 | hostPath: 125 | path: /etc/criu 126 | - hostPath: 127 | path: /proc 128 | type: Directory 129 | name: hostproc 130 | - hostPath: 131 | path: /sys/fs/bpf 132 | type: Directory 133 | name: bpf 134 | - name: tls 135 | emptyDir: {} 136 | -------------------------------------------------------------------------------- /config/base/rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: zeropod-node 5 | namespace: zeropod-system 6 | --- 7 | # the installer needs to be able to install the zeropod runtimeclass 8 | apiVersion: rbac.authorization.k8s.io/v1 9 | kind: ClusterRole 10 | metadata: 11 | name: zeropod:runtimeclass-installer 12 | rules: 13 | - apiGroups: 14 | - node.k8s.io 15 | resources: 16 | - runtimeclasses 17 | verbs: 18 | - create 19 | - delete 20 | - update 21 | --- 22 | apiVersion: rbac.authorization.k8s.io/v1 23 | kind: ClusterRoleBinding 24 | metadata: 25 | name: zeropod:runtimeclass-installer 26 | roleRef: 27 | apiGroup: rbac.authorization.k8s.io 28 | kind: ClusterRole 29 | name: zeropod:runtimeclass-installer 30 | subjects: 31 | - kind: ServiceAccount 32 | name: zeropod-node 33 | namespace: zeropod-system 34 | --- 35 | apiVersion: rbac.authorization.k8s.io/v1 36 | kind: Role 37 | metadata: 38 | name: zeropod:secret-creator 39 | rules: 40 | - apiGroups: 41 | - "" 42 | resources: 43 | - secrets 44 | verbs: 45 | - get 46 | - create 47 | --- 48 | apiVersion: rbac.authorization.k8s.io/v1 49 | kind: RoleBinding 50 | metadata: 51 | name: zeropod:secret-creator 52 | roleRef: 53 | apiGroup: rbac.authorization.k8s.io 54 | kind: Role 55 | name: zeropod:secret-creator 56 | subjects: 57 | - kind: ServiceAccount 58 | name: zeropod-node 59 | -------------------------------------------------------------------------------- /config/crds/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1alpha1 2 | kind: Component 3 | resources: 4 | - runtime.zeropod.ctrox.dev_migrations.yaml 5 | -------------------------------------------------------------------------------- /config/examples/live-migration.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: freezer 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: freezer 10 | template: 11 | metadata: 12 | labels: 13 | app: freezer 14 | annotations: 15 | zeropod.ctrox.dev/scaledown-duration: 1h 16 | 
zeropod.ctrox.dev/live-migrate: "freezer" 17 | spec: 18 | runtimeClassName: zeropod 19 | containers: 20 | - image: ghcr.io/ctrox/zeropod-freezer:latest 21 | name: freezer 22 | args: ["-memory", "128"] 23 | imagePullPolicy: IfNotPresent 24 | ports: 25 | - containerPort: 8080 26 | resources: 27 | requests: 28 | cpu: 100m 29 | memory: 128Mi 30 | -------------------------------------------------------------------------------- /config/examples/migration.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: freezer 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: freezer 10 | template: 11 | metadata: 12 | labels: 13 | app: freezer 14 | annotations: 15 | zeropod.ctrox.dev/scaledown-duration: 10s 16 | zeropod.ctrox.dev/migrate: "freezer" 17 | spec: 18 | runtimeClassName: zeropod 19 | containers: 20 | - image: ghcr.io/ctrox/zeropod-freezer:latest 21 | name: freezer 22 | args: ["-memory", "128"] 23 | imagePullPolicy: IfNotPresent 24 | ports: 25 | - containerPort: 8080 26 | resources: 27 | requests: 28 | cpu: 100m 29 | memory: 128Mi 30 | -------------------------------------------------------------------------------- /config/examples/nginx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: nginx 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: nginx 10 | template: 11 | metadata: 12 | labels: 13 | app: nginx 14 | annotations: 15 | io.containerd.runc.v2.group: "zeropod" 16 | zeropod.ctrox.dev/scaledown-duration: 10s 17 | spec: 18 | runtimeClassName: zeropod 19 | containers: 20 | - image: nginx 21 | name: nginx 22 | ports: 23 | - containerPort: 80 24 | resources: 25 | requests: 26 | cpu: 100m 27 | memory: 128Mi 28 | -------------------------------------------------------------------------------- /config/examples/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: http-echo 5 | labels: 6 | app: echo 7 | annotations: 8 | zeropod.ctrox.dev/ports-map: "container1=8080;container2=8082" 9 | zeropod.ctrox.dev/container-names: "container1" 10 | zeropod.ctrox.dev/scaledown-duration: 0s 11 | spec: 12 | runtimeClassName: zeropod 13 | containers: 14 | - image: registry.k8s.io/e2e-test-images/agnhost:2.39 15 | name: container1 16 | args: 17 | - netexec 18 | - --http-port=8080 19 | - --udp-port=-1 20 | ports: 21 | - containerPort: 8080 22 | startupProbe: 23 | httpGet: 24 | port: 8080 25 | path: / 26 | periodSeconds: 1 27 | - image: ealen/echo-server 28 | name: container2 29 | args: 30 | - --port=8081 31 | ports: 32 | - containerPort: 8081 33 | -------------------------------------------------------------------------------- /config/examples/redmine.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: redmine 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: redmine 9 | template: 10 | metadata: 11 | labels: 12 | app: redmine 13 | annotations: 14 | zeropod.ctrox.dev/ports-map: "redmine=3000" 15 | zeropod.ctrox.dev/container-names: "redmine" 16 | zeropod.ctrox.dev/scaledown-duration: 0m 17 | spec: 18 | runtimeClassName: zeropod 19 | containers: 20 | - image: redmine 21 | name: redmine 22 | ports: 23 | - containerPort: 3000 24 | env: 25 | - name: REDMINE_DB_MYSQL 26 | value: mysql 27 | - 
name: REDMINE_DB_USER 28 | value: root 29 | - name: REDMINE_DB_PASSWORD 30 | value: password 31 | - name: REDMINE_SECRET_KEY_BASE 32 | value: supersecretkey 33 | --- 34 | apiVersion: v1 35 | kind: Service 36 | metadata: 37 | name: redmine 38 | labels: 39 | app: redmine 40 | spec: 41 | ports: 42 | - port: 80 43 | name: http 44 | targetPort: 3000 45 | type: LoadBalancer 46 | selector: 47 | app: redmine 48 | -------------------------------------------------------------------------------- /config/examples/wildfly.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: wildfly-pod 5 | annotations: 6 | zeropod.ctrox.dev/ports-map: "wildfly-pod=8080" 7 | zeropod.ctrox.dev/container-names: wildfly-pod 8 | zeropod.ctrox.dev/scaledown-duration: 5m 9 | spec: 10 | runtimeClassName: zeropod 11 | containers: 12 | - image: jboss/wildfly 13 | name: wildfly-pod 14 | ports: 15 | - containerPort: 8080 16 | -------------------------------------------------------------------------------- /config/examples/wordpress.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: php 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: php 9 | template: 10 | metadata: 11 | labels: 12 | app: php 13 | annotations: 14 | zeropod.ctrox.dev/ports-map: "wordpress=80" 15 | zeropod.ctrox.dev/container-names: wordpress 16 | zeropod.ctrox.dev/scaledown-duration: 0m 17 | spec: 18 | runtimeClassName: zeropod 19 | containers: 20 | - image: wordpress 21 | name: wordpress 22 | ports: 23 | - containerPort: 80 24 | env: 25 | - name: WORDPRESS_DB_HOST 26 | value: mysql 27 | - name: WORDPRESS_DB_USER 28 | value: root 29 | - name: WORDPRESS_DB_PASSWORD 30 | value: password 31 | - name: WORDPRESS_DB_NAME 32 | value: wordpress 33 | --- 34 | apiVersion: v1 35 | kind: Service 36 | metadata: 37 | name: php 38 | labels: 39 | app: php 40 | spec: 41 | ports: 42 | - port: 8080 43 | name: http 44 | targetPort: 80 45 | type: LoadBalancer 46 | selector: 47 | app: php 48 | --- 49 | apiVersion: v1 50 | kind: Service 51 | metadata: 52 | name: mysql 53 | labels: 54 | app: mysql 55 | spec: 56 | ports: 57 | - port: 3306 58 | name: mysql 59 | clusterIP: None 60 | selector: 61 | app: mysql 62 | --- 63 | apiVersion: apps/v1 64 | kind: StatefulSet 65 | metadata: 66 | name: mysql 67 | spec: 68 | selector: 69 | matchLabels: 70 | app: mysql 71 | serviceName: "mysql" 72 | replicas: 1 73 | template: 74 | metadata: 75 | labels: 76 | app: mysql 77 | spec: 78 | containers: 79 | - image: mysql 80 | name: mysql 81 | ports: 82 | - containerPort: 3306 83 | env: 84 | - name: MYSQL_ROOT_PASSWORD 85 | value: password 86 | volumeMounts: 87 | - name: data 88 | mountPath: /var/lib/mysql 89 | volumeClaimTemplates: 90 | - metadata: 91 | name: data 92 | spec: 93 | accessModes: ["ReadWriteOnce"] 94 | resources: 95 | requests: 96 | storage: 5Gi 97 | -------------------------------------------------------------------------------- /config/gke/daemonset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: zeropod-node 5 | namespace: zeropod-system 6 | spec: 7 | template: 8 | spec: 9 | volumes: 10 | - name: zeropod-opt 11 | hostPath: 12 | path: /var/lib/toolbox/zeropod 13 | -------------------------------------------------------------------------------- /config/gke/kustomization.yaml: 
-------------------------------------------------------------------------------- 1 | resources: 2 | - ../production 3 | patches: 4 | - path: daemonset.yaml 5 | - patch: |- 6 | - op: add 7 | path: /spec/template/spec/initContainers/0/args/- 8 | value: -host-opt-path=/var/lib/toolbox/zeropod 9 | target: 10 | kind: DaemonSet 11 | -------------------------------------------------------------------------------- /config/in-place-scaling/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1alpha1 2 | kind: Component 3 | patches: 4 | - patch: |- 5 | - op: add 6 | path: /spec/template/spec/containers/0/args/- 7 | value: -in-place-scaling=true 8 | target: 9 | kind: DaemonSet 10 | -------------------------------------------------------------------------------- /config/k3s/k3s.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: zeropod-node 5 | namespace: zeropod-system 6 | spec: 7 | template: 8 | spec: 9 | volumes: 10 | - name: containerd-etc 11 | hostPath: 12 | path: /var/lib/rancher/k3s/agent/etc/containerd/ 13 | - name: containerd-run 14 | hostPath: 15 | path: /run/k3s/containerd/ 16 | - name: zeropod-opt 17 | hostPath: 18 | path: /var/lib/rancher/k3s/agent/containerd 19 | -------------------------------------------------------------------------------- /config/k3s/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - ../production 3 | patches: 4 | - path: k3s.yaml 5 | - patch: |- 6 | - op: add 7 | path: /spec/template/spec/initContainers/0/args/- 8 | value: -runtime=k3s 9 | target: 10 | kind: DaemonSet 11 | -------------------------------------------------------------------------------- /config/kind/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - ../base 3 | components: 4 | - ../in-place-scaling 5 | - ../pod-updater 6 | - ../status-labels 7 | - ../migration-manager 8 | images: 9 | - name: manager 10 | newName: ghcr.io/ctrox/zeropod-manager 11 | newTag: dev 12 | - name: installer 13 | newName: ghcr.io/ctrox/zeropod-installer 14 | newTag: dev 15 | patches: 16 | - patch: |- 17 | - op: add 18 | path: /spec/template/spec/containers/0/args/- 19 | value: -debug=true 20 | target: 21 | kind: DaemonSet 22 | -------------------------------------------------------------------------------- /config/migration-manager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1alpha1 2 | kind: Component 3 | resources: 4 | - rbac.yaml 5 | -------------------------------------------------------------------------------- /config/migration-manager/rbac.yaml: -------------------------------------------------------------------------------- 1 | # the manager needs to CRUD migrations 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: zeropod:migration-manager 6 | rules: 7 | - apiGroups: 8 | - runtime.zeropod.ctrox.dev 9 | resources: 10 | - migrations 11 | - migrations/status 12 | verbs: 13 | - get 14 | - list 15 | - watch 16 | - create 17 | - update 18 | - delete 19 | - apiGroups: 20 | - "" 21 | resources: 22 | - pods 23 | verbs: 24 | - get 25 | - list 26 | - watch 27 | --- 28 | apiVersion: rbac.authorization.k8s.io/v1 29 | kind: ClusterRoleBinding 30 | metadata: 31 | 
name: zeropod:migration-manager 32 | roleRef: 33 | apiGroup: rbac.authorization.k8s.io 34 | kind: ClusterRole 35 | name: zeropod:migration-manager 36 | subjects: 37 | - kind: ServiceAccount 38 | name: zeropod-node 39 | namespace: zeropod-system 40 | -------------------------------------------------------------------------------- /config/pod-updater/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1alpha1 2 | kind: Component 3 | resources: 4 | - rbac.yaml 5 | -------------------------------------------------------------------------------- /config/pod-updater/rbac.yaml: -------------------------------------------------------------------------------- 1 | # the manager needs to get/update pods for dynamic resource requests 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: zeropod:pod-updater 6 | rules: 7 | - apiGroups: 8 | - "" 9 | resources: 10 | - pods 11 | - pods/resize 12 | verbs: 13 | - get 14 | - update 15 | --- 16 | apiVersion: rbac.authorization.k8s.io/v1 17 | kind: ClusterRoleBinding 18 | metadata: 19 | name: zeropod:pod-updater 20 | roleRef: 21 | apiGroup: rbac.authorization.k8s.io 22 | kind: ClusterRole 23 | name: zeropod:pod-updater 24 | subjects: 25 | - kind: ServiceAccount 26 | name: zeropod-node 27 | namespace: zeropod-system 28 | -------------------------------------------------------------------------------- /config/production/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - ../base 3 | # pod-updater is required if status-labels or in-place-scaling is enabled 4 | components: 5 | - ../pod-updater 6 | - ../status-labels 7 | - ../migration-manager 8 | # uncomment to enable in-place-scaling 9 | # - ../in-place-scaling 10 | images: 11 | - name: installer 12 | newName: ghcr.io/ctrox/zeropod-installer 13 | newTag: v0.6.3 14 | - name: manager 15 | newName: ghcr.io/ctrox/zeropod-manager 16 | newTag: v0.6.3 17 | patches: 18 | - patch: |- 19 | - op: add 20 | path: /spec/template/spec/initContainers/0/args/- 21 | value: -criu-image=ghcr.io/ctrox/zeropod-criu:v4.1 22 | target: 23 | kind: DaemonSet 24 | apiVersion: kustomize.config.k8s.io/v1beta1 25 | kind: Kustomization 26 | -------------------------------------------------------------------------------- /config/rke2/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - ../production 3 | patches: 4 | - path: rke2.yaml 5 | - patch: |- 6 | - op: add 7 | path: /spec/template/spec/initContainers/0/args/- 8 | value: -runtime=rke2 9 | target: 10 | kind: DaemonSet 11 | -------------------------------------------------------------------------------- /config/rke2/rke2.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: zeropod-node 5 | namespace: zeropod-system 6 | spec: 7 | template: 8 | spec: 9 | volumes: 10 | - name: containerd-etc 11 | hostPath: 12 | path: /var/lib/rancher/rke2/agent/etc/containerd/ 13 | - name: containerd-run 14 | hostPath: 15 | path: /run/k3s/containerd/ 16 | - name: zeropod-opt 17 | hostPath: 18 | path: /var/lib/rancher/rke2/agent/containerd 19 | -------------------------------------------------------------------------------- /config/status-labels/kustomization.yaml: -------------------------------------------------------------------------------- 1 |
apiVersion: kustomize.config.k8s.io/v1alpha1 2 | kind: Component 3 | patches: 4 | - patch: |- 5 | - op: add 6 | path: /spec/template/spec/containers/0/args/- 7 | value: -status-labels=true 8 | target: 9 | kind: DaemonSet 10 | -------------------------------------------------------------------------------- /config/uninstall/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - ../production 3 | patches: 4 | - patch: |- 5 | - op: add 6 | path: /spec/template/spec/initContainers/0/args/- 7 | value: -uninstall 8 | - op: replace 9 | path: /spec/template/spec/initContainers/0/volumeMounts/2 10 | value: 11 | mountPath: /opt 12 | name: opt 13 | - op: replace 14 | path: /spec/template/spec/volumes/2 15 | value: 16 | hostPath: 17 | path: /opt 18 | name: opt 19 | target: 20 | kind: DaemonSet 21 | -------------------------------------------------------------------------------- /criu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:12 as build 2 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 3 | git build-essential libprotobuf-dev libprotobuf-c-dev \ 4 | protobuf-c-compiler protobuf-compiler python3-protobuf \ 5 | libcap-dev libnl-3-dev libnet-dev pkg-config curl ca-certificates \ 6 | gnutls-dev uuid-dev 7 | 8 | WORKDIR /workspace 9 | 10 | RUN git clone https://github.com/checkpoint-restore/criu.git 11 | WORKDIR /workspace/criu 12 | ADD criu/*.patch . 13 | ARG CRIU_VERSION=v4.1 14 | RUN git checkout $CRIU_VERSION 15 | RUN git apply *.patch 16 | RUN make -j $(nproc) 17 | 18 | # copy all libraries that criu needs except libc as replacing libc is not that 19 | # straight-forward. 20 | RUN mkdir criu-libs/ && \ 21 | for l in $(ldd criu/criu | awk '{ print $3 }'); do cp $l criu-libs/; done && \ 22 | rm criu-libs/libc.so* 23 | 24 | # we just add libs and binaries from scratch so this image can be used with 25 | # containerd's client.Install. 
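# Note (assumption, not verified in this repo): containerd's client.Install is
# expected to copy /bin and, when libraries are enabled, /lib from this image
# into containerd's opt directory on the host, which is why only those two
# paths are populated below.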
26 | FROM scratch AS export-stage 27 | COPY --from=build /workspace/criu/criu/criu /bin/ 28 | COPY --from=build /workspace/criu/criu-libs /lib/ 29 | -------------------------------------------------------------------------------- /criu/always-lazy.patch: -------------------------------------------------------------------------------- 1 | diff --git a/criu/cr-service.c b/criu/cr-service.c 2 | index 61a04c5ff..c0cce482e 100644 3 | --- a/criu/cr-service.c 4 | +++ b/criu/cr-service.c 5 | @@ -285,6 +285,12 @@ int exec_rpc_query_external_files(char *name, int sk) 6 | 7 | static char images_dir[PATH_MAX]; 8 | 9 | +bool file_exists(char *filename) 10 | +{ 11 | + struct stat buffer; 12 | + return (stat (filename, &buffer) == 0); 13 | +} 14 | + 15 | static int setup_opts_from_req(int sk, CriuOpts *req) 16 | { 17 | struct ucred ids; 18 | @@ -293,6 +299,7 @@ static int setup_opts_from_req(int sk, CriuOpts *req) 19 | char images_dir_path[PATH_MAX]; 20 | char work_dir_path[PATH_MAX]; 21 | char status_fd[PATH_MAX]; 22 | + char lazy_pages_socket_path[PATH_MAX]; 23 | bool output_changed_by_rpc_conf = false; 24 | bool work_changed_by_rpc_conf = false; 25 | bool imgs_changed_by_rpc_conf = false; 26 | @@ -554,6 +561,14 @@ static int setup_opts_from_req(int sk, CriuOpts *req) 27 | opts.lazy_pages = req->lazy_pages; 28 | } 29 | 30 | + strcpy(lazy_pages_socket_path, images_dir); 31 | + strcat(lazy_pages_socket_path, "/lazy-pages.socket"); 32 | + 33 | + if (file_exists(lazy_pages_socket_path)) { 34 | + // always enable lazy-pages if the socket exists 35 | + opts.lazy_pages = true; 36 | + } 37 | + 38 | if (req->has_pre_dump_mode) { 39 | switch (req->pre_dump_mode) { 40 | case CRIU_PRE_DUMP_MODE__SPLICE: 41 | -------------------------------------------------------------------------------- /criu/rpc-ps-address.patch: -------------------------------------------------------------------------------- 1 | diff --git a/criu/cr-service.c b/criu/cr-service.c 2 | index deb16e553..f86f0e334 100644 3 | --- a/criu/cr-service.c 4 | +++ b/criu/cr-service.c 5 | @@ -598,6 +598,7 @@ static int setup_opts_from_req(int sk, CriuOpts *req) 6 | 7 | if (req->ps) { 8 | opts.port = (short)req->ps->port; 9 | + opts.addr = req->ps->address; 10 | 11 | if (!opts.lazy_pages) { 12 | opts.use_page_server = true; 13 | -------------------------------------------------------------------------------- /criu/unix_sock.patch: -------------------------------------------------------------------------------- 1 | diff --git a/criu/util.c b/criu/util.c 2 | index d2bc9a865..38c1a42e3 100644 3 | --- a/criu/util.c 4 | +++ b/criu/util.c 5 | @@ -12,6 +12,7 @@ 6 | #include 7 | #include 8 | #include 9 | +#include 10 | #include 11 | #include 12 | #include 13 | @@ -1193,10 +1194,9 @@ static int get_sockaddr_in(struct sockaddr_storage *addr, char *host, unsigned s 14 | } else if (inet_pton(AF_INET6, host, &((struct sockaddr_in6 *)addr)->sin6_addr)) { 15 | addr->ss_family = AF_INET6; 16 | } else { 17 | - pr_err("Invalid server address \"%s\". 
" 18 | - "The address must be in IPv4 or IPv6 format.\n", 19 | - host); 20 | - return -1; 21 | + struct sockaddr_un *sun = (struct sockaddr_un *)addr; 22 | + sun->sun_family = AF_UNIX; 23 | + strcpy(sun->sun_path, host); 24 | } 25 | 26 | if (addr->ss_family == AF_INET6) { 27 | @@ -1219,9 +1219,13 @@ int setup_tcp_server(char *type, char *addr, unsigned short *port) 28 | return -1; 29 | } 30 | 31 | - pr_info("Starting %s server on port %u\n", type, *port); 32 | + if (saddr.ss_family == AF_UNIX) { 33 | + pr_info("Starting %s server on socket %s\n", type, addr); 34 | + } else { 35 | + pr_info("Starting %s server on port %u\n", type, *port); 36 | + } 37 | 38 | - sk = socket(saddr.ss_family, SOCK_STREAM, IPPROTO_TCP); 39 | + sk = socket(saddr.ss_family, SOCK_STREAM, IPPROTO_IP); 40 | 41 | if (sk < 0) { 42 | pr_perror("Can't init %s server", type); 43 | @@ -1233,6 +1237,10 @@ int setup_tcp_server(char *type, char *addr, unsigned short *port) 44 | goto out; 45 | } 46 | 47 | + if (saddr.ss_family == AF_UNIX) { 48 | + slen = sizeof(struct sockaddr_un); 49 | + } 50 | + 51 | if (bind(sk, (struct sockaddr *)&saddr, slen)) { 52 | pr_perror("Can't bind %s server", type); 53 | goto out; 54 | @@ -1244,7 +1252,7 @@ int setup_tcp_server(char *type, char *addr, unsigned short *port) 55 | } 56 | 57 | /* Get socket port in case of autobind */ 58 | - if ((*port) == 0) { 59 | + if (saddr.ss_family != AF_UNIX && (*port) == 0) { 60 | if (getsockname(sk, (struct sockaddr *)&saddr, &slen)) { 61 | pr_perror("Can't get %s server name", type); 62 | goto out; 63 | @@ -1306,8 +1314,8 @@ int run_tcp_server(bool daemon_mode, int *ask, int cfd, int sk) 64 | ret = getnameinfo((struct sockaddr *)&caddr, clen, address, sizeof(address), port, sizeof(port), 65 | NI_NUMERICHOST | NI_NUMERICSERV); 66 | if (ret) { 67 | - pr_err("Failed converting address: %s\n", gai_strerror(ret)); 68 | - goto err; 69 | + pr_err("Failed converting address: %d\n", ret); 70 | + // goto err; 71 | } 72 | pr_info("Accepted connection from %s:%s\n", address, port); 73 | close(sk); 74 | -------------------------------------------------------------------------------- /e2e/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker:24.0-cli 2 | 3 | RUN apk add --update go make iptables 4 | WORKDIR /app 5 | ADD go.* /app 6 | RUN go mod download 7 | -------------------------------------------------------------------------------- /e2e/bench_test.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net/http" 7 | "runtime" 8 | "testing" 9 | "time" 10 | 11 | "github.com/stretchr/testify/assert" 12 | ) 13 | 14 | func BenchmarkRestore(b *testing.B) { 15 | e2e := setup(b) 16 | client := e2e.client 17 | port := e2e.port 18 | ctx := context.Background() 19 | 20 | c := &http.Client{ 21 | Timeout: time.Second * 10, 22 | Transport: &http.Transport{ 23 | // disable keepalive as we want the container to checkpoint as soon as possible. 
24 | DisableKeepAlives: true, 25 | }, 26 | } 27 | 28 | benches := map[string]struct { 29 | preDump bool 30 | waitDuration time.Duration 31 | scaleDownAfter time.Duration 32 | }{ 33 | "without pre-dump": { 34 | preDump: false, 35 | scaleDownAfter: 0, 36 | waitDuration: time.Millisecond * 800, 37 | }, 38 | "with pre-dump": { 39 | preDump: true, 40 | scaleDownAfter: 0, 41 | waitDuration: time.Millisecond * 800, 42 | }, 43 | "one second scaledown duration": { 44 | preDump: false, 45 | scaleDownAfter: time.Second, 46 | waitDuration: 0, 47 | }, 48 | } 49 | 50 | for name, bc := range benches { 51 | bc := bc 52 | b.Run(name, func(b *testing.B) { 53 | if bc.preDump && runtime.GOARCH == "arm64" { 54 | b.Skip("skipping pre-dump test as it's not supported on arm64") 55 | } 56 | 57 | // bench does an initial run with 1 iteration which we don't want since 58 | // the setup takes a long time. 59 | if b.N == 1 { 60 | b.ResetTimer() 61 | return 62 | } 63 | 64 | cleanupPod := createPodAndWait(b, ctx, client, testPod(preDump(bc.preDump), scaleDownAfter(bc.scaleDownAfter))) 65 | cleanupService := createServiceAndWait(b, ctx, client, testService(defaultTargetPort), 1) 66 | b.ResetTimer() 67 | 68 | defer func() { 69 | b.StopTimer() 70 | time.Sleep(bc.waitDuration) 71 | cleanupPod() 72 | cleanupService() 73 | b.StartTimer() 74 | }() 75 | 76 | for i := 0; i < b.N; i++ { 77 | b.StopTimer() 78 | // we add a sleep in between requests so we can be sure the 79 | // container has checkpointed completely when we hit it again. 80 | // TODO: once we are able to tell if the container has 81 | // checkpointed from the outside, we should just wait for that 82 | // instead of the static sleep. 83 | time.Sleep(bc.waitDuration) 84 | b.StartTimer() 85 | 86 | before := time.Now() 87 | resp, err := c.Get(fmt.Sprintf("http://localhost:%d", port)) 88 | if !assert.NoError(b, err) { 89 | b.Log("error", err) 90 | time.Sleep(time.Hour) 91 | continue 92 | } 93 | b.Logf("get request took %s", time.Since(before)) 94 | assert.Equal(b, resp.StatusCode, http.StatusOK) 95 | } 96 | }) 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /e2e/kind.yaml: -------------------------------------------------------------------------------- 1 | kind: Cluster 2 | apiVersion: kind.x-k8s.io/v1alpha4 3 | featureGates: 4 | InPlacePodVerticalScaling: true 5 | UserNamespacesSupport: true 6 | InPlacePodVerticalScalingAllocatedStatus: true 7 | nodes: 8 | - role: control-plane 9 | image: kindest/node:v1.32.2@sha256:f226345927d7e348497136874b6d207e0b32cc52154ad8323129352923a3142f 10 | - role: worker 11 | image: kindest/node:v1.32.2@sha256:f226345927d7e348497136874b6d207e0b32cc52154ad8323129352923a3142f 12 | extraMounts: 13 | - hostPath: /proc 14 | containerPath: /host/proc 15 | labels: 16 | zeropod.ctrox.dev/node: "true" 17 | - role: worker 18 | image: kindest/node:v1.32.2@sha256:f226345927d7e348497136874b6d207e0b32cc52154ad8323129352923a3142f 19 | extraMounts: 20 | - hostPath: /proc 21 | containerPath: /host/proc 22 | labels: 23 | zeropod.ctrox.dev/node: "true" 24 | -------------------------------------------------------------------------------- /e2e/main_test.go: -------------------------------------------------------------------------------- 1 | package e2e 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "testing" 7 | ) 8 | 9 | func TestMain(m *testing.M) { 10 | code := m.Run() 11 | if e2e != nil { 12 | if err := e2e.cleanup(); err != nil { 13 | fmt.Printf("error during test cleanup: %s", err) 14 | os.Exit(1) 15 | } 16 | } 
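// os.Exit does not run deferred functions, so the cleanup above has to happen explicitly before exiting.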
17 | os.Exit(code) 18 | } 19 | -------------------------------------------------------------------------------- /hack/boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2025 The zeropod authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | -------------------------------------------------------------------------------- /manager/metrics_collector.go: -------------------------------------------------------------------------------- 1 | package manager 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | "net" 7 | "os" 8 | "path/filepath" 9 | 10 | "github.com/containerd/ttrpc" 11 | v1 "github.com/ctrox/zeropod/api/shim/v1" 12 | "github.com/ctrox/zeropod/shim/task" 13 | "github.com/prometheus/client_golang/prometheus" 14 | ) 15 | 16 | const ( 17 | labelContainerName = "container" 18 | LabelPodName = "pod" 19 | LabelPodNamespace = "namespace" 20 | 21 | MetricsNamespace = "zeropod" 22 | MetricCheckpointDuration = "checkpoint_duration_seconds" 23 | MetricRestoreDuration = "restore_duration_seconds" 24 | MetricLastCheckpointTime = "last_checkpoint_time" 25 | MetricLastRestoreTime = "last_restore_time" 26 | MetricRunning = "running" 27 | ) 28 | 29 | var ( 30 | crBuckets = []float64{ 31 | 0.02, 0.03, 0.04, 0.05, 0.075, 32 | 0.1, 0.12, 0.14, 0.16, 0.18, 33 | 0.2, 0.3, 0.4, 0.5, 1, 34 | } 35 | commonLabels = []string{labelContainerName, LabelPodName, LabelPodNamespace} 36 | ) 37 | 38 | type Collector struct { 39 | checkpointDuration *prometheus.HistogramVec 40 | restoreDuration *prometheus.HistogramVec 41 | lastCheckpointTime *prometheus.GaugeVec 42 | lastRestoreTime *prometheus.GaugeVec 43 | running *prometheus.GaugeVec 44 | } 45 | 46 | func NewCollector() *Collector { 47 | return &Collector{ 48 | checkpointDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ 49 | Namespace: MetricsNamespace, 50 | Name: MetricCheckpointDuration, 51 | Help: "The duration of the last checkpoint in seconds.", 52 | Buckets: crBuckets, 53 | }, commonLabels), 54 | 55 | restoreDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ 56 | Namespace: MetricsNamespace, 57 | Name: MetricRestoreDuration, 58 | Help: "The duration of the last restore in seconds.", 59 | Buckets: crBuckets, 60 | }, commonLabels), 61 | 62 | lastCheckpointTime: prometheus.NewGaugeVec(prometheus.GaugeOpts{ 63 | Namespace: MetricsNamespace, 64 | Name: MetricLastCheckpointTime, 65 | Help: "A unix timestamp in nanoseconds of the last checkpoint.", 66 | }, commonLabels), 67 | 68 | lastRestoreTime: prometheus.NewGaugeVec(prometheus.GaugeOpts{ 69 | Namespace: MetricsNamespace, 70 | Name: MetricLastRestoreTime, 71 | Help: "A unix timestamp in nanoseconds of the last restore.", 72 | }, commonLabels), 73 | 74 | running: prometheus.NewGaugeVec(prometheus.GaugeOpts{ 75 | Namespace: MetricsNamespace, 76 | Name: MetricRunning, 77 | Help: "Reports if the process is currently running or checkpointed.", 78 | }, commonLabels), 79 | } 80 | } 81 | 82 | func (c 
*Collector) Collect(ch chan<- prometheus.Metric) { 83 | socks, err := os.ReadDir(task.ShimSocketPath) 84 | if err != nil { 85 | slog.Error("error listing file in shim socket path", "path", task.ShimSocketPath, "err", err) 86 | return 87 | } 88 | 89 | for _, sock := range socks { 90 | sockName := filepath.Join(task.ShimSocketPath, sock.Name()) 91 | slog.Debug("getting metrics", "name", sockName) 92 | 93 | shimMetrics, err := collectMetricsOverTTRPC(context.Background(), sockName) 94 | if err != nil { 95 | slog.Error("getting metrics", "err", err) 96 | // we still want to read the rest of the sockets 97 | continue 98 | } 99 | 100 | for _, metrics := range shimMetrics { 101 | l := labels(metrics) 102 | r := 0 103 | if metrics.Running { 104 | r = 1 105 | } 106 | // TODO: handle stale metrics 107 | c.running.With(l).Set(float64(r)) 108 | if metrics.LastCheckpointDuration != nil { 109 | c.checkpointDuration.With(l).Observe(metrics.LastCheckpointDuration.AsDuration().Seconds()) 110 | } 111 | if metrics.LastRestoreDuration != nil { 112 | c.restoreDuration.With(l).Observe(metrics.LastRestoreDuration.AsDuration().Seconds()) 113 | } 114 | if metrics.LastCheckpoint != nil { 115 | c.lastCheckpointTime.With(l).Set(float64(metrics.LastCheckpoint.AsTime().UnixNano())) 116 | } 117 | if metrics.LastRestore != nil { 118 | c.lastRestoreTime.With(l).Set(float64(metrics.LastRestore.AsTime().UnixNano())) 119 | } 120 | } 121 | } 122 | c.running.Collect(ch) 123 | c.checkpointDuration.Collect(ch) 124 | c.restoreDuration.Collect(ch) 125 | c.lastCheckpointTime.Collect(ch) 126 | c.lastRestoreTime.Collect(ch) 127 | } 128 | 129 | func (c *Collector) Describe(ch chan<- *prometheus.Desc) { 130 | } 131 | 132 | func collectMetricsOverTTRPC(ctx context.Context, sock string) ([]*v1.ContainerMetrics, error) { 133 | conn, err := net.Dial("unix", sock) 134 | if err != nil { 135 | return nil, err 136 | } 137 | 138 | resp, err := v1.NewShimClient(ttrpc.NewClient(conn)).Metrics(ctx, &v1.MetricsRequest{}) 139 | if err != nil { 140 | return nil, err 141 | } 142 | 143 | return resp.Metrics, nil 144 | } 145 | 146 | func labels(metrics *v1.ContainerMetrics) map[string]string { 147 | return map[string]string{ 148 | labelContainerName: metrics.Name, 149 | LabelPodName: metrics.PodName, 150 | LabelPodNamespace: metrics.PodNamespace, 151 | } 152 | } 153 | 154 | func (c *Collector) deleteMetrics(status *v1.ContainerStatus) { 155 | l := labels(&v1.ContainerMetrics{ 156 | Name: status.Name, 157 | PodName: status.PodName, 158 | PodNamespace: status.PodNamespace, 159 | }) 160 | c.running.Delete(l) 161 | c.checkpointDuration.Delete(l) 162 | c.restoreDuration.Delete(l) 163 | c.lastCheckpointTime.Delete(l) 164 | c.lastRestoreTime.Delete(l) 165 | } 166 | -------------------------------------------------------------------------------- /manager/node/cert.go: -------------------------------------------------------------------------------- 1 | package node 2 | 3 | import ( 4 | "crypto/ed25519" 5 | "crypto/rand" 6 | "crypto/tls" 7 | "crypto/x509" 8 | "crypto/x509/pkix" 9 | "fmt" 10 | "math/big" 11 | "net" 12 | "time" 13 | ) 14 | 15 | func GenCert(caCert *x509.Certificate, caKey ed25519.PrivateKey, ipAddresses ...net.IP) (tls.Certificate, error) { 16 | serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) 17 | serialNumber, err := rand.Int(rand.Reader, serialNumberLimit) 18 | if err != nil { 19 | return tls.Certificate{}, fmt.Errorf("generating serial number: %w", err) 20 | } 21 | 22 | template := x509.Certificate{ 23 | SerialNumber: serialNumber, 24 
| Subject: pkix.Name{ 25 | Organization: []string{"ctrox.dev"}, 26 | Country: []string{"CH"}, 27 | }, 28 | NotBefore: time.Now(), 29 | NotAfter: time.Now().Add(time.Hour * 87600), 30 | KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, 31 | ExtKeyUsage: []x509.ExtKeyUsage{ 32 | x509.ExtKeyUsageClientAuth, 33 | x509.ExtKeyUsageServerAuth, 34 | }, 35 | } 36 | if len(ipAddresses) > 0 { 37 | template.IPAddresses = ipAddresses 38 | } 39 | 40 | pub, priv, err := ed25519.GenerateKey(rand.Reader) 41 | if err != nil { 42 | return tls.Certificate{}, fmt.Errorf("generating key: %w", err) 43 | } 44 | 45 | if caCert == nil { 46 | template.IsCA = true 47 | template.KeyUsage |= x509.KeyUsageCertSign 48 | template.BasicConstraintsValid = true 49 | caCert = &template 50 | } 51 | if caKey == nil { 52 | caKey = priv 53 | } 54 | 55 | derBytes, err := x509.CreateCertificate(rand.Reader, &template, caCert, pub, caKey) 56 | if err != nil { 57 | return tls.Certificate{}, fmt.Errorf("creating certificate: %w", err) 58 | } 59 | 60 | return tls.Certificate{Certificate: [][]byte{derBytes}, PrivateKey: priv}, nil 61 | } 62 | 63 | func initTLS(host string) (*tls.Config, error) { 64 | caCert, err := tls.LoadX509KeyPair(caCertFile, caKeyFile) 65 | if err != nil { 66 | return nil, err 67 | } 68 | ca, err := x509.ParseCertificate(caCert.Certificate[0]) 69 | if err != nil { 70 | return nil, err 71 | } 72 | 73 | cert, err := GenCert(ca, caCert.PrivateKey.(ed25519.PrivateKey), net.ParseIP(host)) 74 | if err != nil { 75 | return nil, err 76 | } 77 | 78 | caCertPool := x509.NewCertPool() 79 | caCertPool.AddCert(ca) 80 | return serverTLSConfig(caCertPool, cert), nil 81 | } 82 | 83 | func serverTLSConfig(ca *x509.CertPool, cert tls.Certificate) *tls.Config { 84 | return &tls.Config{ 85 | ClientAuth: tls.RequireAndVerifyClientCert, 86 | ClientCAs: ca, 87 | Certificates: []tls.Certificate{cert}, 88 | RootCAs: ca, 89 | MinVersion: tls.VersionTLS13, 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /manager/node/criu_check.go: -------------------------------------------------------------------------------- 1 | package node 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/checkpoint-restore/go-criu/v7" 7 | "github.com/checkpoint-restore/go-criu/v7/rpc" 8 | "k8s.io/utils/ptr" 9 | ) 10 | 11 | func checkLazyPages() error { 12 | c := criu.MakeCriu() 13 | feat, err := c.FeatureCheck(&rpc.CriuFeatures{LazyPages: ptr.To(true)}) 14 | if err != nil { 15 | return fmt.Errorf("lazy pages feature check failed with: %w", err) 16 | } 17 | if feat.LazyPages == nil || !*feat.LazyPages { 18 | return fmt.Errorf("lazy pages feature check failed") 19 | } 20 | return nil 21 | } 22 | -------------------------------------------------------------------------------- /manager/node/exec_logger.go: -------------------------------------------------------------------------------- 1 | package node 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "log/slog" 7 | ) 8 | 9 | type execLogger struct { 10 | logger *slog.Logger 11 | level slog.Level 12 | } 13 | 14 | func newExecLogger(name string, l *slog.Logger, level slog.Level) *execLogger { 15 | return &execLogger{ 16 | logger: l.With("exec", name), 17 | level: level, 18 | } 19 | } 20 | 21 | func (el execLogger) Write(p []byte) (n int, err error) { 22 | scanner := bufio.NewScanner(bytes.NewReader(p)) 23 | for scanner.Scan() { 24 | el.logger.Debug(scanner.Text()) 25 | } 26 | return len(p), nil 27 | } 28 | 
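GenCert in cert.go above doubles as a CA generator (passing nil for caCert and caKey makes it self-sign and mark the result as a CA) and as a leaf-certificate generator. A minimal package-internal sketch of how the pieces are meant to fit together, mirroring initTLS above and the prepareTLS test helper further down; newServerTLS is an illustrative name, not part of the source:

// newServerTLS is a hypothetical helper: CA, leaf cert, then a mutual-TLS config.
func newServerTLS(host string) (*tls.Config, error) {
	// nil CA cert/key: GenCert produces a self-signed CA certificate.
	caCert, err := GenCert(nil, nil)
	if err != nil {
		return nil, err
	}
	ca, err := x509.ParseCertificate(caCert.Certificate[0])
	if err != nil {
		return nil, err
	}
	// Issue a server certificate for the given host IP, signed by the CA.
	serverCert, err := GenCert(ca, caCert.PrivateKey.(ed25519.PrivateKey), net.ParseIP(host))
	if err != nil {
		return nil, err
	}
	pool := x509.NewCertPool()
	pool.AddCert(ca)
	// serverTLSConfig enforces client certificates and a TLS 1.3 minimum.
	return serverTLSConfig(pool, serverCert), nil
}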
-------------------------------------------------------------------------------- /manager/node/page_server_proxy.go: -------------------------------------------------------------------------------- 1 | package node 2 | 3 | import ( 4 | "context" 5 | "crypto/tls" 6 | "fmt" 7 | "io" 8 | "log/slog" 9 | "net" 10 | "sync" 11 | ) 12 | 13 | type pageServerProxy struct { 14 | tlsListen, tlsBackend *tls.Config 15 | log *slog.Logger 16 | port int 17 | backendAddr string 18 | listenAddr string 19 | done chan struct{} 20 | err error 21 | listener net.Listener 22 | } 23 | 24 | // newPageServerProxy returns a TCP proxy for use with a criu page server 25 | // listening on a local unix socket. As the page server is one-shot, the proxy 26 | // will automatically stop after the client has disconnected. Depending on the 27 | // passed in tlsListen/tlsBackend it will respectively listen for TLS or connect 28 | // to a TLS backend. A nil config means it will listen on plain TCP and connect 29 | // to a unix socket. 30 | func newPageServerProxy(addr, backendAddr string, tlsListen, tlsBackend *tls.Config, log *slog.Logger) *pageServerProxy { 31 | psp := &pageServerProxy{ 32 | log: log.WithGroup("page-server-proxy"), 33 | tlsListen: tlsListen, 34 | tlsBackend: tlsBackend, 35 | listenAddr: addr, 36 | backendAddr: backendAddr, 37 | done: make(chan struct{}), 38 | } 39 | return psp 40 | } 41 | 42 | func (p *pageServerProxy) listen(network, laddr string) (net.Listener, error) { 43 | var listener net.Listener 44 | var err error 45 | if p.tlsListen != nil { 46 | p.log.Info("listening tls", "addr", p.listenAddr) 47 | listener, err = tls.Listen(network, laddr, p.tlsListen) 48 | } else { 49 | p.log.Info("listening tcp", "addr", p.listenAddr) 50 | listener, err = net.Listen(network, laddr) 51 | } 52 | if err != nil { 53 | return nil, err 54 | } 55 | 56 | addr, ok := listener.Addr().(*net.TCPAddr) 57 | if !ok { 58 | return nil, fmt.Errorf("addr is not a net.TCPAddr: %T", listener.Addr()) 59 | } 60 | p.port = addr.Port 61 | 62 | return listener, nil 63 | } 64 | 65 | func (p *pageServerProxy) Start(ctx context.Context) error { 66 | listener, err := p.listen("tcp", p.listenAddr) 67 | if err != nil { 68 | return err 69 | } 70 | p.listener = listener 71 | 72 | go p.accept(ctx) 73 | go func() { 74 | <-ctx.Done() 75 | p.listener.Close() 76 | }() 77 | return nil 78 | } 79 | 80 | // Port returns the port the proxy is listening on. Only set after Start() has 81 | // returned. 
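// returned. When the configured listen address ends in ":0" (as in the test
// below), the kernel assigns an ephemeral port and listen() records it from
// the *net.TCPAddr, which is the value Port() reports.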
82 | func (p *pageServerProxy) Port() int { 83 | return p.port 84 | } 85 | 86 | func (p *pageServerProxy) accept(ctx context.Context) { 87 | defer func() { p.done <- struct{}{} }() 88 | // as the page server is one-shot we only need to accept exactly once 89 | conn, err := p.listener.Accept() 90 | if err != nil { 91 | if ctx.Err() != nil { 92 | p.err = ctx.Err() 93 | return 94 | } 95 | p.err = err 96 | return 97 | } 98 | 99 | if err := p.HandleConn(ctx, conn); err != nil { 100 | p.log.Info("handling request", "error", err) 101 | p.err = err 102 | } 103 | } 104 | 105 | func (p *pageServerProxy) HandleConn(ctx context.Context, src net.Conn) error { 106 | var target net.Conn 107 | var err error 108 | if p.tlsBackend != nil { 109 | target, err = tls.Dial("tcp", p.backendAddr, p.tlsBackend) 110 | if err != nil { 111 | return fmt.Errorf("dialing target: %w", err) 112 | } 113 | } else { 114 | target, err = net.Dial("unix", p.backendAddr) 115 | if err != nil { 116 | return fmt.Errorf("dialing target: %w", err) 117 | } 118 | } 119 | p.log.Info("handling page server proxy connection", "remote_addr", src.RemoteAddr()) 120 | conn, ok := src.(*tls.Conn) 121 | if ok { 122 | if err := conn.HandshakeContext(ctx); err != nil { 123 | return fmt.Errorf("error during handshake: %w", err) 124 | } 125 | p.log.Info("handshake complete", "tls_version", tls.VersionName(conn.ConnectionState().Version)) 126 | } 127 | 128 | if err := proxy(ctx, src, target); err != nil { 129 | return fmt.Errorf("proxy error: %w", err) 130 | } 131 | 132 | return nil 133 | } 134 | 135 | // Wait waits until the server is done handling a connection or the context is 136 | // cancelled. 137 | func (p *pageServerProxy) Wait() error { 138 | <-p.done 139 | p.log.Info("page server done") 140 | return p.err 141 | } 142 | 143 | // proxy just proxies between conn1 and conn2. If the ctx is cancelled, both 144 | // sides of the connections are closed. 
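// Each direction is copied by its own goroutine in copy() below; when one
// direction finishes, the peer's write half is shut down via closeWrite() so
// the other side observes EOF rather than an abrupt reset, and any copy error
// is reported through the buffered errs channel.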
145 | func proxy(ctx context.Context, conn1, conn2 net.Conn) error { 146 | defer conn1.Close() 147 | defer conn2.Close() 148 | 149 | done := make(chan struct{}, 1) 150 | errs := make(chan error, 1) 151 | go func() { 152 | copy(errs, conn1, conn2) 153 | done <- struct{}{} 154 | }() 155 | select { 156 | case err := <-errs: 157 | return err 158 | case <-done: 159 | return nil 160 | case <-ctx.Done(): 161 | return nil 162 | } 163 | } 164 | 165 | func copy(errs chan error, conn1, conn2 net.Conn) { 166 | var wg sync.WaitGroup 167 | wg.Add(2) 168 | 169 | go func() { 170 | defer wg.Done() 171 | if _, err := io.Copy(conn1, conn2); err != nil { 172 | errs <- err 173 | } 174 | closeWrite(conn1) 175 | }() 176 | go func() { 177 | defer wg.Done() 178 | if _, err := io.Copy(conn2, conn1); err != nil { 179 | errs <- err 180 | } 181 | closeWrite(conn2) 182 | }() 183 | 184 | wg.Wait() 185 | } 186 | 187 | func closeWrite(conn net.Conn) { 188 | switch c := conn.(type) { 189 | case *tls.Conn: 190 | _ = c.CloseWrite() 191 | case *net.TCPConn: 192 | _ = c.CloseWrite() 193 | case *net.UnixConn: 194 | _ = c.CloseWrite() 195 | default: 196 | _ = c.Close() 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /manager/node/page_server_proxy_test.go: -------------------------------------------------------------------------------- 1 | package node 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "crypto/ed25519" 7 | "crypto/tls" 8 | "crypto/x509" 9 | "fmt" 10 | "io" 11 | "log/slog" 12 | "net" 13 | "path/filepath" 14 | "testing" 15 | "time" 16 | ) 17 | 18 | func TestPageServerProxy(t *testing.T) { 19 | socket := filepath.Join(t.TempDir(), "psp.sock") 20 | 21 | l, err := net.Listen("unix", socket) 22 | if err != nil { 23 | t.Fatal(err) 24 | } 25 | 26 | caCertPool := x509.NewCertPool() 27 | ca, serverCert, clientCert := prepareTLS(t) 28 | caCertPool.AddCert(ca) 29 | tlsConfig := serverTLSConfig(caCertPool, serverCert) 30 | psp := newPageServerProxy("localhost:0", socket, tlsConfig, nil, slog.Default()) 31 | 32 | ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) 33 | defer cancel() 34 | if err := psp.Start(ctx); err != nil { 35 | t.Fatal(err) 36 | } 37 | t.Logf("started server on port %d", psp.Port()) 38 | 39 | dial := tls.Dialer{ 40 | Config: &tls.Config{ 41 | Certificates: []tls.Certificate{clientCert}, 42 | RootCAs: caCertPool, 43 | }, 44 | } 45 | 46 | conn, err := dial.DialContext(ctx, "tcp", fmt.Sprintf("127.0.0.1:%d", psp.Port())) 47 | if err != nil { 48 | t.Fatal(err) 49 | } 50 | 51 | sockConn, err := l.Accept() 52 | if err != nil { 53 | t.Fatal(err) 54 | } 55 | go io.Copy(sockConn, sockConn) 56 | 57 | testData := []byte("fooo") 58 | conn.Write(testData) 59 | buf := make([]byte, len(testData)) 60 | n, err := conn.Read(buf) 61 | if err != nil { 62 | t.Fatal("error reading from proxy connection") 63 | } 64 | t.Logf("read %d bytes", n) 65 | 66 | if !bytes.Equal(buf, testData) { 67 | t.Fatalf("read data %q does not equal test data %q", buf, testData) 68 | } 69 | 70 | if err := conn.Close(); err != nil { 71 | t.Fatal(err) 72 | } 73 | if err := sockConn.Close(); err != nil { 74 | t.Fatal(err) 75 | } 76 | 77 | if err := psp.Wait(); err != nil { 78 | t.Fatal(err) 79 | } 80 | } 81 | 82 | func prepareTLS(t *testing.T) (*x509.Certificate, tls.Certificate, tls.Certificate) { 83 | caCert, err := GenCert(nil, nil) 84 | if err != nil { 85 | t.Fatal(err) 86 | } 87 | 88 | ca, err := x509.ParseCertificate(caCert.Certificate[0]) 89 | if err != nil { 90 | 
t.Fatal(err) 91 | } 92 | 93 | serverCert, err := GenCert(ca, caCert.PrivateKey.(ed25519.PrivateKey), net.ParseIP("127.0.0.1")) 94 | if err != nil { 95 | t.Fatal(err) 96 | } 97 | 98 | clientCert, err := GenCert(ca, caCert.PrivateKey.(ed25519.PrivateKey), net.ParseIP("127.0.0.2")) 99 | if err != nil { 100 | t.Fatal(err) 101 | } 102 | 103 | return ca, serverCert, clientCert 104 | } 105 | -------------------------------------------------------------------------------- /manager/node/service_test.go: -------------------------------------------------------------------------------- 1 | package node 2 | 3 | import ( 4 | "testing" 5 | 6 | v1 "github.com/ctrox/zeropod/api/runtime/v1" 7 | "github.com/stretchr/testify/assert" 8 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 9 | ) 10 | 11 | func TestSetContainerStatus(t *testing.T) { 12 | migration := &v1.Migration{ 13 | ObjectMeta: metav1.ObjectMeta{ 14 | Name: "foo", 15 | }, 16 | Status: v1.MigrationStatus{ 17 | Containers: []v1.MigrationContainerStatus{ 18 | { 19 | Name: "container1", 20 | PausedAt: metav1.NowMicro(), 21 | }, 22 | }, 23 | }, 24 | } 25 | 26 | setOrUpdateContainerStatus(migration, "container1", func(cms *v1.MigrationContainerStatus) { 27 | cms.Condition.Phase = v1.MigrationPhaseCompleted 28 | cms.RestoredAt = metav1.NowMicro() 29 | }) 30 | assert.Equal(t, v1.MigrationPhaseCompleted, migration.Status.Containers[0].Condition.Phase) 31 | assert.NotEmpty(t, migration.Status.Containers[0].PausedAt) 32 | assert.NotEmpty(t, migration.Status.Containers[0].RestoredAt) 33 | 34 | setOrUpdateContainerStatus(migration, "container2", func(cms *v1.MigrationContainerStatus) { 35 | cms.Condition.Phase = v1.MigrationPhaseFailed 36 | cms.RestoredAt = metav1.NowMicro() 37 | }) 38 | assert.Len(t, migration.Status.Containers, 2) 39 | assert.Equal(t, v1.MigrationPhaseFailed, migration.Status.Containers[1].Condition.Phase) 40 | assert.NotEmpty(t, migration.Status.Containers[1].RestoredAt) 41 | } 42 | -------------------------------------------------------------------------------- /manager/pod_controller.go: -------------------------------------------------------------------------------- 1 | package manager 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log/slog" 7 | "net/url" 8 | "os" 9 | "path" 10 | 11 | nodev1 "github.com/ctrox/zeropod/api/node/v1" 12 | v1 "github.com/ctrox/zeropod/api/runtime/v1" 13 | shimv1 "github.com/ctrox/zeropod/api/shim/v1" 14 | "github.com/go-logr/logr" 15 | appsv1 "k8s.io/api/apps/v1" 16 | corev1 "k8s.io/api/core/v1" 17 | "k8s.io/apimachinery/pkg/api/errors" 18 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 19 | ctrl "sigs.k8s.io/controller-runtime" 20 | "sigs.k8s.io/controller-runtime/pkg/client" 21 | "sigs.k8s.io/controller-runtime/pkg/controller" 22 | "sigs.k8s.io/controller-runtime/pkg/handler" 23 | "sigs.k8s.io/controller-runtime/pkg/manager" 24 | "sigs.k8s.io/controller-runtime/pkg/reconcile" 25 | "sigs.k8s.io/controller-runtime/pkg/source" 26 | ) 27 | 28 | func NewPodController(ctx context.Context, mgr manager.Manager, log *slog.Logger) error { 29 | ctrl.SetLogger(logr.FromSlogHandler(log.Handler())) 30 | 31 | pr, err := newPodReconciler(mgr.GetClient(), log) 32 | if err != nil { 33 | return err 34 | } 35 | c, err := controller.New("pod-controller", mgr, controller.Options{ 36 | Reconciler: pr, 37 | }) 38 | if err != nil { 39 | return err 40 | } 41 | return c.Watch(source.Kind( 42 | mgr.GetCache(), &corev1.Pod{}, &handler.TypedEnqueueRequestForObject[*corev1.Pod]{}, 43 | )) 44 | } 45 | 46 | type podReconciler struct { 47 | 
kube client.Client 48 | log *slog.Logger 49 | nodeName string 50 | } 51 | 52 | func newPodReconciler(kube client.Client, log *slog.Logger) (*podReconciler, error) { 53 | nodeName, ok := os.LookupEnv(nodev1.NodeNameEnvKey) 54 | if !ok { 55 | return nil, fmt.Errorf("could not find node name, env %s is not set", nodev1.NodeNameEnvKey) 56 | } 57 | return &podReconciler{ 58 | log: log, 59 | kube: kube, 60 | nodeName: nodeName, 61 | }, nil 62 | } 63 | 64 | func (r *podReconciler) Reconcile(ctx context.Context, request reconcile.Request) (reconcile.Result, error) { 65 | log := r.log.With("req", request) 66 | 67 | if err := r.kube.Get(ctx, request.NamespacedName, &v1.Migration{}); err == nil { 68 | // migration already exists, there's nothing for us to do 69 | return reconcile.Result{}, nil 70 | } 71 | 72 | pod := &corev1.Pod{} 73 | if err := r.kube.Get(ctx, request.NamespacedName, pod); err != nil { 74 | if errors.IsNotFound(err) { 75 | // pod is already gone, it's too late to migrate it 76 | return reconcile.Result{}, nil 77 | } 78 | return reconcile.Result{}, err 79 | } 80 | 81 | if !r.isMigratable(pod) { 82 | return reconcile.Result{}, nil 83 | } 84 | 85 | migration, err := newMigration(pod) 86 | if err != nil { 87 | return reconcile.Result{}, fmt.Errorf("initializing migration object: %w", err) 88 | } 89 | if err := r.kube.Create(ctx, migration); err != nil { 90 | if !errors.IsAlreadyExists(err) { 91 | return reconcile.Result{}, fmt.Errorf("creating migration object: %w", err) 92 | } 93 | } 94 | log.Info("created migration for pod", "pod_name", pod.Name, "pod_namespace", pod.Namespace) 95 | 96 | for _, container := range migration.Spec.Containers { 97 | migration.Status.Containers = append(migration.Status.Containers, v1.MigrationContainerStatus{ 98 | Name: container.Name, 99 | Condition: v1.MigrationCondition{ 100 | Phase: v1.MigrationPhasePending, 101 | }, 102 | }) 103 | } 104 | if err := r.kube.Status().Update(ctx, migration); err != nil { 105 | // updating the status to pending is just cosmetic, we don't want to 106 | // slow down the process by retrying here. 
107 | log.Warn("setting migration status failed, ignoring") 108 | return reconcile.Result{}, nil 109 | } 110 | 111 | return reconcile.Result{}, nil 112 | } 113 | 114 | func (r podReconciler) isMigratable(pod *corev1.Pod) bool { 115 | if pod.Spec.RuntimeClassName != nil && *pod.Spec.RuntimeClassName != v1.RuntimeClassName { 116 | return false 117 | } 118 | 119 | if pod.Spec.NodeName != r.nodeName { 120 | return false 121 | } 122 | 123 | if pod.DeletionTimestamp == nil { 124 | return false 125 | } 126 | 127 | if !anyMigrationEnabled(pod) { 128 | return false 129 | } 130 | 131 | if !hasScaledDownContainer(pod) && !liveMigrationEnabled(pod) { 132 | r.log.Info("skipping pod with no scaled down containers and live migration disabled", 133 | "pod_name", pod.Name, "pod_namespace", pod.Namespace) 134 | return false 135 | } 136 | 137 | return true 138 | } 139 | 140 | func newMigration(pod *corev1.Pod) (*v1.Migration, error) { 141 | containers := []v1.MigrationContainer{} 142 | for _, container := range pod.Status.ContainerStatuses { 143 | u, err := url.Parse(container.ContainerID) 144 | if err != nil { 145 | return nil, fmt.Errorf("unable to parse container ID %s", container.ContainerID) 146 | } 147 | containers = append(containers, v1.MigrationContainer{ 148 | Name: container.Name, 149 | ID: u.Host, 150 | }) 151 | } 152 | 153 | return &v1.Migration{ 154 | ObjectMeta: metav1.ObjectMeta{ 155 | Name: pod.Name, 156 | Namespace: pod.Namespace, 157 | }, 158 | Spec: v1.MigrationSpec{ 159 | SourcePod: pod.Name, 160 | SourceNode: pod.Spec.NodeName, 161 | PodTemplateHash: pod.Labels[appsv1.DefaultDeploymentUniqueLabelKey], 162 | Containers: containers, 163 | }, 164 | }, nil 165 | } 166 | 167 | func hasScaledDownContainer(pod *corev1.Pod) bool { 168 | for _, container := range pod.Spec.Containers { 169 | if k, ok := pod.Labels[path.Join(StatusLabelKeyPrefix, container.Name)]; ok { 170 | if k == shimv1.ContainerPhase_SCALED_DOWN.String() { 171 | return true 172 | } 173 | } 174 | } 175 | return false 176 | } 177 | 178 | func anyMigrationEnabled(pod *corev1.Pod) bool { 179 | _, migrate := pod.Annotations[nodev1.MigrateAnnotationKey] 180 | _, liveMigrate := pod.Annotations[nodev1.LiveMigrateAnnotationKey] 181 | return migrate || liveMigrate 182 | } 183 | 184 | func liveMigrationEnabled(pod *corev1.Pod) bool { 185 | _, ok := pod.Annotations[nodev1.LiveMigrateAnnotationKey] 186 | return ok 187 | } 188 | -------------------------------------------------------------------------------- /manager/pod_controller_test.go: -------------------------------------------------------------------------------- 1 | package manager 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | "path" 7 | "testing" 8 | 9 | nodev1 "github.com/ctrox/zeropod/api/node/v1" 10 | v1 "github.com/ctrox/zeropod/api/runtime/v1" 11 | shimv1 "github.com/ctrox/zeropod/api/shim/v1" 12 | "github.com/stretchr/testify/assert" 13 | "github.com/stretchr/testify/require" 14 | corev1 "k8s.io/api/core/v1" 15 | "k8s.io/apimachinery/pkg/api/errors" 16 | "k8s.io/apimachinery/pkg/runtime" 17 | "k8s.io/utils/ptr" 18 | "sigs.k8s.io/controller-runtime/pkg/client" 19 | "sigs.k8s.io/controller-runtime/pkg/client/fake" 20 | "sigs.k8s.io/controller-runtime/pkg/reconcile" 21 | ) 22 | 23 | func TestPodReconciler(t *testing.T) { 24 | slog.SetLogLoggerLevel(slog.LevelDebug) 25 | scheme := runtime.NewScheme() 26 | require.NoError(t, corev1.AddToScheme(scheme)) 27 | require.NoError(t, v1.AddToScheme(scheme)) 28 | kube := 
fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(&v1.Migration{}).Build() 29 | ctx := context.Background() 30 | 31 | for name, tc := range map[string]struct { 32 | pod *corev1.Pod 33 | containerID string 34 | expectedContainerID string 35 | deletePod bool 36 | nodeName string 37 | runtimeClassName string 38 | expectedMigration bool 39 | expectedRequeue bool 40 | }{ 41 | "pod that should be migrated": { 42 | pod: newMigratePod("node1", v1.RuntimeClassName, shimv1.ContainerPhase_SCALED_DOWN), 43 | containerID: "containerd://imageid", 44 | expectedContainerID: "imageid", 45 | deletePod: true, 46 | nodeName: "node1", 47 | expectedMigration: true, 48 | }, 49 | "pod without live migration and running": { 50 | pod: newMigratePod("node1", v1.RuntimeClassName, shimv1.ContainerPhase_RUNNING), 51 | containerID: "containerd://imageid", 52 | expectedContainerID: "imageid", 53 | deletePod: true, 54 | nodeName: "node1", 55 | expectedMigration: false, 56 | }, 57 | "pod with live migration and running": { 58 | pod: enableLiveMigration(newMigratePod("node1", v1.RuntimeClassName, shimv1.ContainerPhase_RUNNING)), 59 | containerID: "containerd://imageid", 60 | expectedContainerID: "imageid", 61 | deletePod: true, 62 | nodeName: "node1", 63 | expectedMigration: true, 64 | }, 65 | "pod on wrong node": { 66 | pod: newMigratePod("node2", v1.RuntimeClassName, shimv1.ContainerPhase_SCALED_DOWN), 67 | deletePod: true, 68 | nodeName: "node1", 69 | expectedMigration: false, 70 | }, 71 | "not a zeropod": { 72 | pod: newMigratePod("node1", "", shimv1.ContainerPhase_SCALED_DOWN), 73 | deletePod: true, 74 | nodeName: "node1", 75 | expectedMigration: false, 76 | }, 77 | "not deleted": { 78 | pod: newMigratePod("node1", "", shimv1.ContainerPhase_SCALED_DOWN), 79 | deletePod: false, 80 | nodeName: "node1", 81 | expectedMigration: false, 82 | }, 83 | } { 84 | t.Run(name, func(t *testing.T) { 85 | t.Setenv(nodev1.NodeNameEnvKey, tc.nodeName) 86 | r, err := newPodReconciler(kube, slog.Default()) 87 | require.NoError(t, err) 88 | 89 | if tc.containerID != "" { 90 | tc.pod.Status.ContainerStatuses[0].ContainerID = tc.containerID 91 | } 92 | require.NoError(t, kube.Create(ctx, tc.pod)) 93 | if tc.deletePod { 94 | // set a finalizer to simulate deletion 95 | tc.pod.SetFinalizers([]string{"foo"}) 96 | require.NoError(t, kube.Update(ctx, tc.pod)) 97 | require.NoError(t, kube.Delete(ctx, tc.pod)) 98 | } 99 | res, err := r.Reconcile(ctx, reconcile.Request{ 100 | NamespacedName: client.ObjectKeyFromObject(tc.pod), 101 | }) 102 | assert.NoError(t, err) 103 | assert.Equal(t, tc.expectedRequeue, res.Requeue) 104 | 105 | if tc.expectedMigration { 106 | migration := &v1.Migration{} 107 | assert.NoError(t, kube.Get(ctx, client.ObjectKeyFromObject(tc.pod), migration)) 108 | require.NotEmpty(t, migration.Spec.Containers) 109 | require.NotEmpty(t, migration.Status.Containers) 110 | assert.Equal(t, v1.MigrationPhasePending, migration.Status.Containers[0].Condition.Phase) 111 | assert.Equal(t, tc.expectedContainerID, migration.Spec.Containers[0].ID) 112 | } else { 113 | assert.True(t, errors.IsNotFound(kube.Get(ctx, client.ObjectKeyFromObject(tc.pod), &v1.Migration{}))) 114 | } 115 | }) 116 | } 117 | } 118 | 119 | func newMigratePod(nodeName, runtimeClassName string, phase shimv1.ContainerPhase) *corev1.Pod { 120 | pod := newPod(corev1.ResourceList{}) 121 | pod.Name = "" 122 | pod.GenerateName = "controller-test-" 123 | pod.Spec.NodeName = nodeName 124 | pod.Spec.RuntimeClassName = ptr.To(runtimeClassName) 125 | containerName := 
pod.Spec.Containers[0].Name 126 | pod.SetAnnotations(map[string]string{ 127 | nodev1.MigrateAnnotationKey: pod.Spec.Containers[0].Name, 128 | }) 129 | pod.SetLabels(map[string]string{ 130 | path.Join(StatusLabelKeyPrefix, containerName): phase.String(), 131 | }) 132 | pod.Status.ContainerStatuses = []corev1.ContainerStatus{{ 133 | Name: containerName, 134 | }} 135 | return pod 136 | } 137 | 138 | func enableLiveMigration(pod *corev1.Pod) *corev1.Pod { 139 | pod.Annotations[nodev1.LiveMigrateAnnotationKey] = pod.Spec.Containers[0].Name 140 | return pod 141 | } 142 | -------------------------------------------------------------------------------- /manager/pod_labeller.go: -------------------------------------------------------------------------------- 1 | package manager 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | "path" 7 | 8 | v1 "github.com/ctrox/zeropod/api/shim/v1" 9 | corev1 "k8s.io/api/core/v1" 10 | ) 11 | 12 | const ( 13 | StatusLabelKeyPrefix = "status.zeropod.ctrox.dev" 14 | ) 15 | 16 | type PodLabeller struct { 17 | log *slog.Logger 18 | } 19 | 20 | func NewPodLabeller(log *slog.Logger) *PodLabeller { 21 | log = log.With("component", "podlabeller") 22 | log.Info("init") 23 | return &PodLabeller{log: log} 24 | } 25 | 26 | func (pl *PodLabeller) Handle(ctx context.Context, status *v1.ContainerStatus, pod *corev1.Pod) error { 27 | clog := pl.log.With("container", status.Name, "pod", status.PodName, 28 | "namespace", status.PodNamespace, "phase", status.Phase) 29 | clog.Info("status event") 30 | 31 | pl.setLabel(pod, status) 32 | return nil 33 | } 34 | 35 | func (pu *PodLabeller) setLabel(pod *corev1.Pod, status *v1.ContainerStatus) { 36 | if pod.Labels == nil { 37 | pod.Labels = map[string]string{} 38 | } 39 | pod.Labels[path.Join(StatusLabelKeyPrefix, status.Name)] = status.Phase.String() 40 | } 41 | -------------------------------------------------------------------------------- /manager/pod_labeller_test.go: -------------------------------------------------------------------------------- 1 | package manager 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | "testing" 7 | 8 | v1 "github.com/ctrox/zeropod/api/shim/v1" 9 | "github.com/stretchr/testify/assert" 10 | corev1 "k8s.io/api/core/v1" 11 | "k8s.io/apimachinery/pkg/runtime" 12 | ) 13 | 14 | func TestPodLabeller(t *testing.T) { 15 | slog.SetLogLoggerLevel(slog.LevelDebug) 16 | 17 | scheme := runtime.NewScheme() 18 | if err := corev1.AddToScheme(scheme); err != nil { 19 | t.Fatal(err) 20 | } 21 | cases := map[string]struct { 22 | statusEventPhase v1.ContainerPhase 23 | beforeEvent map[string]string 24 | expected map[string]string 25 | }{ 26 | "no labels set": { 27 | statusEventPhase: v1.ContainerPhase_RUNNING, 28 | beforeEvent: nil, 29 | expected: map[string]string{ 30 | "status.zeropod.ctrox.dev/first-container": v1.ContainerPhase_RUNNING.String(), 31 | }, 32 | }, 33 | "existing labels are kept": { 34 | statusEventPhase: v1.ContainerPhase_RUNNING, 35 | beforeEvent: map[string]string{"existing": "label"}, 36 | expected: map[string]string{ 37 | "existing": "label", 38 | "status.zeropod.ctrox.dev/first-container": v1.ContainerPhase_RUNNING.String(), 39 | }, 40 | }, 41 | "status label is updated": { 42 | statusEventPhase: v1.ContainerPhase_SCALED_DOWN, 43 | beforeEvent: map[string]string{ 44 | "status.zeropod.ctrox.dev/first-container": v1.ContainerPhase_RUNNING.String(), 45 | }, 46 | expected: map[string]string{ 47 | "status.zeropod.ctrox.dev/first-container": v1.ContainerPhase_SCALED_DOWN.String(), 48 | }, 49 | }, 50 | } 51 | 
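// Each case runs as its own subtest; the tc := tc copy below is the
// pre-Go 1.22 idiom that keeps the loop variable from being shared across
// closures.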
52 | for name, tc := range cases { 53 | tc := tc 54 | t.Run(name, func(t *testing.T) { 55 | pod := newPod(nil) 56 | pod.SetLabels(tc.beforeEvent) 57 | 58 | if err := NewPodLabeller(slog.Default()).Handle( 59 | context.Background(), 60 | &v1.ContainerStatus{ 61 | Name: pod.Spec.Containers[0].Name, 62 | PodName: pod.Name, 63 | PodNamespace: pod.Namespace, 64 | Phase: tc.statusEventPhase, 65 | }, 66 | pod, 67 | ); err != nil { 68 | t.Fatal(err) 69 | } 70 | 71 | assert.Equal(t, pod.GetLabels(), tc.expected) 72 | }) 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /manager/pod_scaler.go: -------------------------------------------------------------------------------- 1 | package manager 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "log/slog" 8 | 9 | v1 "github.com/ctrox/zeropod/api/shim/v1" 10 | corev1 "k8s.io/api/core/v1" 11 | "k8s.io/apimachinery/pkg/api/resource" 12 | ) 13 | 14 | const ( 15 | CPUAnnotationKey = "zeropod.ctrox.dev/cpu-requests" 16 | MemoryAnnotationKey = "zeropod.ctrox.dev/memory-requests" 17 | ) 18 | 19 | var ( 20 | ScaledDownCPU = resource.MustParse("1m") 21 | ScaledDownMemory = resource.MustParse("1Ki") 22 | ) 23 | 24 | type containerResource map[string]resource.Quantity 25 | 26 | type PodScaler struct { 27 | log *slog.Logger 28 | } 29 | 30 | func NewPodScaler(log *slog.Logger) *PodScaler { 31 | log = log.With("component", "podscaler") 32 | log.Info("init") 33 | return &PodScaler{log: log} 34 | } 35 | 36 | func (ps *PodScaler) Handle(ctx context.Context, status *v1.ContainerStatus, pod *corev1.Pod) error { 37 | clog := ps.log.With("container", status.Name, "pod", status.PodName, 38 | "namespace", status.PodNamespace, "phase", status.Phase) 39 | clog.Info("status event") 40 | 41 | if err := ps.setAnnotations(pod); err != nil { 42 | return err 43 | } 44 | 45 | for i, container := range pod.Spec.Containers { 46 | if container.Name != status.Name { 47 | continue 48 | } 49 | 50 | _, hasCPU := container.Resources.Requests[corev1.ResourceCPU] 51 | _, hasMemory := container.Resources.Requests[corev1.ResourceMemory] 52 | if !hasCPU || !hasMemory { 53 | clog.Debug("ignoring container without resources") 54 | continue 55 | } 56 | 57 | initial, err := ps.initialRequests(container, pod.Annotations) 58 | if err != nil { 59 | return fmt.Errorf("getting initial requests from pod failed: %w", err) 60 | } 61 | 62 | current := container.Resources.Requests 63 | if ps.isUpToDate(initial, current, status) { 64 | clog.Debug("container is up to date", "initial", printResources(initial)) 65 | continue 66 | } 67 | 68 | new := ps.newRequests(initial, current.DeepCopy(), status) 69 | pod.Spec.Containers[i].Resources.Requests = new 70 | clog.Debug("container needs to be updated", "current", printResources(current), "new", printResources(new)) 71 | } 72 | 73 | return nil 74 | } 75 | 76 | func (ps *PodScaler) isUpToDate(initial, current corev1.ResourceList, status *v1.ContainerStatus) bool { 77 | switch status.Phase { 78 | case v1.ContainerPhase_SCALED_DOWN: 79 | return current[corev1.ResourceCPU] == ScaledDownCPU && 80 | current[corev1.ResourceMemory] == ScaledDownMemory 81 | case v1.ContainerPhase_RUNNING: 82 | return current[corev1.ResourceCPU] == initial[corev1.ResourceCPU] && 83 | current[corev1.ResourceMemory] == initial[corev1.ResourceMemory] 84 | default: 85 | return true 86 | } 87 | } 88 | 89 | func (ps *PodScaler) newRequests(initial, current corev1.ResourceList, status *v1.ContainerStatus) corev1.ResourceList { 90 | switch 
status.Phase { 91 | case v1.ContainerPhase_SCALED_DOWN: 92 | current[corev1.ResourceCPU] = ScaledDownCPU 93 | current[corev1.ResourceMemory] = ScaledDownMemory 94 | return current 95 | case v1.ContainerPhase_RUNNING: 96 | return initial 97 | default: 98 | return current 99 | } 100 | } 101 | 102 | func (ps *PodScaler) initialRequests(container corev1.Container, podAnnotations map[string]string) (corev1.ResourceList, error) { 103 | initial := container.DeepCopy().Resources.Requests 104 | containerCPUs := containerResource{} 105 | if cpuReq, ok := podAnnotations[CPUAnnotationKey]; ok { 106 | if err := json.Unmarshal([]byte(cpuReq), &containerCPUs); err != nil { 107 | return nil, err 108 | } 109 | } 110 | 111 | containerMemory := containerResource{} 112 | if memortReq, ok := podAnnotations[MemoryAnnotationKey]; ok { 113 | if err := json.Unmarshal([]byte(memortReq), &containerMemory); err != nil { 114 | return nil, err 115 | } 116 | } 117 | 118 | if cpu, ok := containerCPUs[container.Name]; ok { 119 | initial[corev1.ResourceCPU] = cpu 120 | } 121 | 122 | if memory, ok := containerMemory[container.Name]; ok { 123 | initial[corev1.ResourceMemory] = memory 124 | } 125 | 126 | return initial, nil 127 | } 128 | 129 | func (ps *PodScaler) setAnnotations(pod *corev1.Pod) error { 130 | containerCPUs := containerResource{} 131 | containerMemory := containerResource{} 132 | for _, container := range pod.Spec.Containers { 133 | containerCPUs[container.Name] = container.Resources.Requests[corev1.ResourceCPU] 134 | containerMemory[container.Name] = container.Resources.Requests[corev1.ResourceMemory] 135 | } 136 | 137 | if pod.Annotations == nil { 138 | pod.Annotations = map[string]string{} 139 | } 140 | 141 | if _, ok := pod.Annotations[CPUAnnotationKey]; !ok { 142 | val, err := json.Marshal(containerCPUs) 143 | if err != nil { 144 | return err 145 | } 146 | pod.Annotations[CPUAnnotationKey] = string(val) 147 | } 148 | 149 | if _, ok := pod.Annotations[MemoryAnnotationKey]; !ok { 150 | val, err := json.Marshal(containerMemory) 151 | if err != nil { 152 | return err 153 | } 154 | pod.Annotations[MemoryAnnotationKey] = string(val) 155 | } 156 | 157 | return nil 158 | } 159 | 160 | func printResources(res corev1.ResourceList) string { 161 | cpu := res[corev1.ResourceCPU] 162 | memory := res[corev1.ResourceMemory] 163 | return fmt.Sprintf("cpu: %s, memory: %s", cpu.String(), memory.String()) 164 | } 165 | -------------------------------------------------------------------------------- /manager/pod_scaler_test.go: -------------------------------------------------------------------------------- 1 | package manager 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | "testing" 7 | 8 | v1 "github.com/ctrox/zeropod/api/shim/v1" 9 | "github.com/stretchr/testify/assert" 10 | corev1 "k8s.io/api/core/v1" 11 | "k8s.io/apimachinery/pkg/api/resource" 12 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 13 | "k8s.io/apimachinery/pkg/runtime" 14 | ) 15 | 16 | func TestHandlePod(t *testing.T) { 17 | slog.SetLogLoggerLevel(slog.LevelDebug) 18 | 19 | scheme := runtime.NewScheme() 20 | if err := corev1.AddToScheme(scheme); err != nil { 21 | t.Fatal(err) 22 | } 23 | runningCPU, runningMemory := resource.MustParse("100m"), resource.MustParse("100Mi") 24 | 25 | cases := map[string]struct { 26 | statusEventPhase v1.ContainerPhase 27 | beforeEvent corev1.ResourceList 28 | expected corev1.ResourceList 29 | }{ 30 | "running pod is not updated": { 31 | statusEventPhase: v1.ContainerPhase_RUNNING, 32 | beforeEvent: corev1.ResourceList{ 33 | 
corev1.ResourceCPU: runningCPU, 34 | corev1.ResourceMemory: runningMemory, 35 | }, 36 | expected: corev1.ResourceList{ 37 | corev1.ResourceCPU: runningCPU, 38 | corev1.ResourceMemory: runningMemory, 39 | }, 40 | }, 41 | "running is updated when scaling down": { 42 | statusEventPhase: v1.ContainerPhase_SCALED_DOWN, 43 | beforeEvent: corev1.ResourceList{ 44 | corev1.ResourceCPU: runningCPU, 45 | corev1.ResourceMemory: runningMemory, 46 | }, 47 | expected: corev1.ResourceList{ 48 | corev1.ResourceCPU: ScaledDownCPU, 49 | corev1.ResourceMemory: ScaledDownMemory, 50 | }, 51 | }, 52 | "scaled down pod is not updated": { 53 | statusEventPhase: v1.ContainerPhase_SCALED_DOWN, 54 | beforeEvent: corev1.ResourceList{ 55 | corev1.ResourceCPU: ScaledDownCPU, 56 | corev1.ResourceMemory: ScaledDownMemory, 57 | }, 58 | expected: corev1.ResourceList{ 59 | corev1.ResourceCPU: ScaledDownCPU, 60 | corev1.ResourceMemory: ScaledDownMemory, 61 | }, 62 | }, 63 | "scaled down pod requests are restored when starting": { 64 | statusEventPhase: v1.ContainerPhase_RUNNING, 65 | beforeEvent: corev1.ResourceList{ 66 | corev1.ResourceCPU: ScaledDownCPU, 67 | corev1.ResourceMemory: ScaledDownMemory, 68 | }, 69 | expected: corev1.ResourceList{ 70 | corev1.ResourceCPU: runningCPU, 71 | corev1.ResourceMemory: runningMemory, 72 | }, 73 | }, 74 | } 75 | 76 | for name, tc := range cases { 77 | tc := tc 78 | t.Run(name, func(t *testing.T) { 79 | ps := &PodScaler{log: slog.Default()} 80 | 81 | initialPod := newPod(corev1.ResourceList{corev1.ResourceCPU: runningCPU, corev1.ResourceMemory: runningMemory}) 82 | ps.setAnnotations(initialPod) 83 | pod := newPod(tc.beforeEvent) 84 | pod.SetAnnotations(initialPod.GetAnnotations()) 85 | 86 | if err := ps.Handle( 87 | context.Background(), 88 | &v1.ContainerStatus{ 89 | Name: pod.Spec.Containers[0].Name, 90 | PodName: pod.Name, 91 | PodNamespace: pod.Namespace, 92 | Phase: tc.statusEventPhase, 93 | }, 94 | pod, 95 | ); err != nil { 96 | t.Fatal(err) 97 | } 98 | 99 | assert.Equal(t, pod.Spec.Containers[0].Resources.Requests, tc.expected) 100 | }) 101 | } 102 | } 103 | 104 | func newPod(req corev1.ResourceList) *corev1.Pod { 105 | return &corev1.Pod{ 106 | ObjectMeta: metav1.ObjectMeta{ 107 | Name: "scaled-pod", 108 | Namespace: "default", 109 | }, 110 | Spec: corev1.PodSpec{ 111 | Containers: []corev1.Container{ 112 | { 113 | Name: "first-container", 114 | Resources: corev1.ResourceRequirements{ 115 | Requests: req, 116 | }, 117 | }, 118 | { 119 | Name: "second-container", 120 | Resources: corev1.ResourceRequirements{ 121 | Requests: req, 122 | }, 123 | }, 124 | }, 125 | }, 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /manager/redirector_attacher.go: -------------------------------------------------------------------------------- 1 | package manager 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "log/slog" 8 | "os" 9 | "path/filepath" 10 | "strconv" 11 | "sync" 12 | 13 | "github.com/containernetworking/plugins/pkg/ns" 14 | "github.com/ctrox/zeropod/activator" 15 | "github.com/ctrox/zeropod/socket" 16 | "github.com/fsnotify/fsnotify" 17 | ) 18 | 19 | type Redirector struct { 20 | sync.Mutex 21 | activators map[int]*activator.BPF 22 | log *slog.Logger 23 | } 24 | 25 | // AttachRedirectors scans the zeropod maps path in the bpf file system for 26 | // directories named after the pid of the sandbox container. 
It does an 27 | // initial iteration over all directories and then starts a goroutine which 28 | // watches for fsevents. When the associated netns of the sandbox container 29 | // can be found it attaches the redirector BPF programs to the network 30 | // interfaces of the sandbox. The directories are expected to be created by 31 | // the zeropod shim on startup. 32 | func AttachRedirectors(ctx context.Context, log *slog.Logger) error { 33 | r := &Redirector{ 34 | activators: make(map[int]*activator.BPF), 35 | log: log, 36 | } 37 | 38 | if _, err := os.Stat(activator.MapsPath()); os.IsNotExist(err) { 39 | r.log.Info("maps path not found, creating", "path", activator.MapsPath()) 40 | if err := os.Mkdir(activator.MapsPath(), os.ModePerm); err != nil { 41 | return err 42 | } 43 | } 44 | 45 | pids, err := r.getSandboxPids() 46 | if err != nil { 47 | return err 48 | } 49 | 50 | if len(pids) == 0 { 51 | r.log.Info("no sandbox pids found") 52 | } 53 | 54 | for _, pid := range pids { 55 | if err := statNetNS(pid); os.IsNotExist(err) { 56 | r.log.Info("net ns not found, removing leftover pid", "path", netNSPath(pid)) 57 | os.RemoveAll(activator.PinPath(pid)) 58 | continue 59 | } 60 | 61 | if err := r.attachRedirector(pid); err != nil { 62 | return err 63 | } 64 | } 65 | 66 | go r.watchForSandboxPids(ctx) 67 | 68 | return nil 69 | } 70 | 71 | func (r *Redirector) watchForSandboxPids(ctx context.Context) error { 72 | watcher, err := fsnotify.NewWatcher() 73 | if err != nil { 74 | return err 75 | } 76 | defer watcher.Close() 77 | 78 | if err := watcher.Add(activator.MapsPath()); err != nil { 79 | return err 80 | } 81 | 82 | for { 83 | select { 84 | // watch for events 85 | case event := <-watcher.Events: 86 | if filepath.Base(event.Name) == socket.TCPEventsMap { 87 | continue 88 | } 89 | 90 | pid, err := strconv.Atoi(filepath.Base(event.Name)) 91 | if err != nil { 92 | r.log.Warn("unable to parse pid from added name", "name", filepath.Base(event.Name)) 93 | continue 94 | } 95 | 96 | if err := statNetNS(pid); err != nil { 97 | r.log.Warn("ignoring pid as net ns was not found", "pid", pid) 98 | continue 99 | } 100 | 101 | switch event.Op { 102 | case fsnotify.Create: 103 | if err := r.attachRedirector(pid); err != nil { 104 | r.log.Error("unable to attach redirector", "pid", pid, "err", err) 105 | } 106 | case fsnotify.Remove: 107 | r.Lock() 108 | if act, ok := r.activators[pid]; ok { 109 | r.log.Info("cleaning up activator", "pid", pid) 110 | if err := act.Cleanup(); err != nil { 111 | r.log.Error("error cleaning up redirector", "err", err) 112 | } 113 | } 114 | r.Unlock() 115 | } 116 | case err := <-watcher.Errors: 117 | r.log.Error("watch error", "err", err) 118 | case <-ctx.Done(): 119 | return nil 120 | } 121 | } 122 | } 123 | 124 | func (r *Redirector) attachRedirector(pid int) error { 125 | bpf, err := activator.InitBPF(pid, r.log) 126 | if err != nil { 127 | return fmt.Errorf("unable to initialize BPF: %w", err) 128 | } 129 | r.Lock() 130 | r.activators[pid] = bpf 131 | r.Unlock() 132 | 133 | netNS, err := ns.GetNS(netNSPath(pid)) 134 | if err != nil { 135 | return err 136 | } 137 | 138 | if err := netNS.Do(func(nn ns.NetNS) error { 139 | // TODO: is this really always eth0? 140 | // as for loopback, this is required for port-forwarding to work 141 | ifaces := []string{"eth0", "lo"} 142 | r.log.Info("attaching redirector for sandbox", "pid", pid, "links", ifaces) 143 | return bpf.AttachRedirector(ifaces...) 
144 | }); err != nil { 145 | return err 146 | } 147 | 148 | return nil 149 | } 150 | 151 | func statNetNS(pid int) error { 152 | _, err := os.Stat(netNSPath(pid)) 153 | return err 154 | } 155 | 156 | func netNSPath(pid int) string { 157 | return fmt.Sprintf("/hostproc/%d/ns/net", pid) 158 | } 159 | 160 | func (r *Redirector) getSandboxPids() ([]int, error) { 161 | f, err := os.Open(activator.MapsPath()) 162 | if err != nil { 163 | if errors.Is(err, os.ErrNotExist) { 164 | return nil, nil 165 | } 166 | return nil, err 167 | } 168 | 169 | dirs, err := f.Readdirnames(0) 170 | if err != nil { 171 | return nil, err 172 | } 173 | 174 | intPids := make([]int, 0, len(dirs)) 175 | for _, dir := range dirs { 176 | if dir == socket.TCPEventsMap { 177 | continue 178 | } 179 | 180 | intPid, err := strconv.Atoi(dir) 181 | if err != nil { 182 | r.log.Warn("unable to parse pid from dir name", "name", dir) 183 | continue 184 | } 185 | 186 | // before adding this pid, check if the corresponding network ns 187 | // actually exists. This is important when running in a kind environment 188 | // where the bpffs is shared between different "nodes". 189 | if err := statNetNS(intPid); err == nil { 190 | intPids = append(intPids, intPid) 191 | } 192 | } 193 | 194 | return intPids, nil 195 | } 196 | -------------------------------------------------------------------------------- /manager/status_test.go: -------------------------------------------------------------------------------- 1 | package manager 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | "testing" 7 | 8 | v1 "github.com/ctrox/zeropod/api/shim/v1" 9 | "github.com/stretchr/testify/assert" 10 | corev1 "k8s.io/api/core/v1" 11 | "k8s.io/apimachinery/pkg/runtime" 12 | "k8s.io/apimachinery/pkg/types" 13 | "sigs.k8s.io/controller-runtime/pkg/client/fake" 14 | ) 15 | 16 | type annotationHandler struct { 17 | annotations map[string]string 18 | } 19 | 20 | func (fh *annotationHandler) Handle(ctx context.Context, status *v1.ContainerStatus, pod *corev1.Pod) error { 21 | pod.SetAnnotations(fh.annotations) 22 | return nil 23 | } 24 | 25 | func TestOnStatus(t *testing.T) { 26 | slog.SetLogLoggerLevel(slog.LevelDebug) 27 | 28 | scheme := runtime.NewScheme() 29 | if err := corev1.AddToScheme(scheme); err != nil { 30 | t.Fatal(err) 31 | } 32 | cases := map[string]struct { 33 | statusEventPhase v1.ContainerPhase 34 | beforeEvent map[string]string 35 | expected map[string]string 36 | podHandlers []PodHandler 37 | }{ 38 | "pod is updated when we have a pod handler": { 39 | statusEventPhase: v1.ContainerPhase_RUNNING, 40 | podHandlers: []PodHandler{&annotationHandler{annotations: map[string]string{"new": "annotation"}}}, 41 | beforeEvent: map[string]string{"some": "annotation"}, 42 | expected: map[string]string{"new": "annotation"}, 43 | }, 44 | } 45 | 46 | for name, tc := range cases { 47 | tc := tc 48 | t.Run(name, func(t *testing.T) { 49 | client := fake.NewClientBuilder().WithScheme(scheme).Build() 50 | pod := newPod(nil) 51 | pod.SetAnnotations(tc.beforeEvent) 52 | 53 | ctx := context.Background() 54 | if err := client.Create(ctx, pod); err != nil { 55 | t.Fatal(err) 56 | } 57 | 58 | sub := subscriber{ 59 | kube: client, 60 | log: slog.Default(), 61 | podHandlers: tc.podHandlers, 62 | } 63 | assert.NoError(t, sub.onStatus(ctx, &v1.ContainerStatus{ 64 | Name: pod.Spec.Containers[0].Name, 65 | PodName: pod.Name, 66 | PodNamespace: pod.Namespace, 67 | Phase: tc.statusEventPhase, 68 | })) 69 | 70 | if err := client.Get(ctx, types.NamespacedName{Name: pod.Name, Namespace: 
pod.Namespace}, pod); err != nil { 71 | t.Fatal(err) 72 | } 73 | 74 | assert.Equal(t, pod.GetAnnotations(), tc.expected) 75 | }) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /shim/checkpoint.go: -------------------------------------------------------------------------------- 1 | package shim 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "os" 8 | "path" 9 | "time" 10 | 11 | "github.com/containerd/containerd/v2/cmd/containerd-shim-runc-v2/process" 12 | runcC "github.com/containerd/go-runc" 13 | "github.com/containerd/log" 14 | "github.com/ctrox/zeropod/activator" 15 | nodev1 "github.com/ctrox/zeropod/api/node/v1" 16 | "google.golang.org/protobuf/types/known/durationpb" 17 | ) 18 | 19 | const retryInterval = time.Second 20 | 21 | func (c *Container) scaleDown(ctx context.Context) error { 22 | if err := c.startActivator(ctx); err != nil { 23 | if errors.Is(err, errNoPortsDetected) { 24 | log.G(ctx).Infof("no ports detected, rescheduling scale down in %s", retryInterval) 25 | return c.scheduleScaleDownIn(retryInterval) 26 | } 27 | 28 | if errors.Is(err, activator.ErrMapNotFound) { 29 | log.G(ctx).Infof("activator is not ready, rescheduling scale down in %s", retryInterval) 30 | return c.scheduleScaleDownIn(retryInterval) 31 | } 32 | 33 | return err 34 | } 35 | 36 | if err := c.activator.Reset(); err != nil { 37 | return err 38 | } 39 | 40 | if err := c.tracker.RemovePid(uint32(c.process.Pid())); err != nil { 41 | // key could not exist, just log the error for now 42 | log.G(ctx).Errorf("unable to remove pid %d: %s", c.process.Pid(), err) 43 | } 44 | 45 | if c.ScaledDown() { 46 | return nil 47 | } 48 | 49 | if c.cfg.DisableCheckpointing { 50 | if err := c.kill(ctx); err != nil { 51 | return err 52 | } 53 | return nil 54 | } 55 | 56 | if err := c.checkpoint(ctx); err != nil { 57 | return err 58 | } 59 | 60 | return nil 61 | } 62 | 63 | func (c *Container) kill(ctx context.Context) error { 64 | c.checkpointRestore.Lock() 65 | defer c.checkpointRestore.Unlock() 66 | log.G(ctx).Infof("checkpointing is disabled, scaling down by killing") 67 | c.AddCheckpointedPID(c.Pid()) 68 | 69 | if err := c.process.Kill(ctx, 9, false); err != nil { 70 | return err 71 | } 72 | c.SetScaledDown(true) 73 | return nil 74 | } 75 | 76 | func (c *Container) checkpoint(ctx context.Context) error { 77 | c.checkpointRestore.Lock() 78 | defer c.checkpointRestore.Unlock() 79 | 80 | snapshotDir := nodev1.SnapshotPath(c.ID()) 81 | if err := os.RemoveAll(snapshotDir); err != nil { 82 | return fmt.Errorf("unable to prepare snapshot dir: %w", err) 83 | } 84 | 85 | workDir := nodev1.WorkDirPath(c.ID()) 86 | log.G(ctx).Infof("checkpointing process %d of container to %s", c.process.Pid(), snapshotDir) 87 | 88 | initProcess, ok := c.process.(*process.Init) 89 | if !ok { 90 | return fmt.Errorf("process is not of type %T, got %T", process.Init{}, c.process) 91 | } 92 | 93 | opts := &runcC.CheckpointOpts{ 94 | WorkDir: workDir, 95 | AllowOpenTCP: true, 96 | AllowExternalUnixSockets: true, 97 | AllowTerminal: false, 98 | FileLocks: true, 99 | EmptyNamespaces: []string{}, 100 | } 101 | 102 | if c.cfg.PreDump { 103 | // for the pre-dump we set the ImagePath to be a sub-path of our container image path 104 | opts.ImagePath = nodev1.PreDumpDir(c.ID()) 105 | 106 | beforePreDump := time.Now() 107 | if err := initProcess.Runtime().Checkpoint(ctx, c.ID(), opts, runcC.PreDump); err != nil { 108 | log.G(ctx).Errorf("error pre-dumping container: %s", err) 109 | b, err := 
os.ReadFile(path.Join(workDir, "dump.log")) 110 | if err != nil { 111 | log.G(ctx).Errorf("error reading dump.log: %s", err) 112 | } 113 | log.G(ctx).Errorf("dump.log: %s", b) 114 | return err 115 | } 116 | 117 | log.G(ctx).Infof("pre-dumping done in %s", time.Since(beforePreDump)) 118 | } 119 | 120 | if c.cfg.PreDump { 121 | // ParentPath is the relative path from the ImagePath to the pre-dump dir. 122 | opts.ParentPath = nodev1.RelativePreDumpDir() 123 | } 124 | 125 | c.AddCheckpointedPID(c.Pid()) 126 | // ImagePath is always the same, regardless of pre-dump 127 | opts.ImagePath = nodev1.SnapshotPath(c.ID()) 128 | 129 | beforeCheckpoint := time.Now() 130 | if err := initProcess.Runtime().Checkpoint(ctx, c.ID(), opts); err != nil { 131 | log.G(ctx).Errorf("error checkpointing container: %s", err) 132 | b, err := os.ReadFile(path.Join(workDir, "dump.log")) 133 | if err != nil { 134 | log.G(ctx).Errorf("error reading dump.log: %s", err) 135 | } 136 | log.G(ctx).Errorf("dump.log: %s", b) 137 | return err 138 | } 139 | 140 | c.SetScaledDown(true) 141 | c.metrics.LastCheckpointDuration = durationpb.New(time.Since(beforeCheckpoint)) 142 | log.G(ctx).Infof("checkpointing done in %s", time.Since(beforeCheckpoint)) 143 | 144 | return nil 145 | } 146 | -------------------------------------------------------------------------------- /shim/config.go: -------------------------------------------------------------------------------- 1 | package shim 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "runtime" 7 | "slices" 8 | "strconv" 9 | "strings" 10 | "time" 11 | 12 | "github.com/containerd/containerd/v2/pkg/namespaces" 13 | "github.com/containerd/log" 14 | "github.com/opencontainers/runtime-spec/specs-go" 15 | ) 16 | 17 | const ( 18 | NodeLabel = "zeropod.ctrox.dev/node" 19 | PortsAnnotationKey = "zeropod.ctrox.dev/ports-map" 20 | ContainerNamesAnnotationKey = "zeropod.ctrox.dev/container-names" 21 | ScaleDownDurationAnnotationKey = "zeropod.ctrox.dev/scaledown-duration" 22 | DisableCheckpoiningAnnotationKey = "zeropod.ctrox.dev/disable-checkpointing" 23 | PreDumpAnnotationKey = "zeropod.ctrox.dev/pre-dump" 24 | MigrateAnnotationKey = "zeropod.ctrox.dev/migrate" 25 | LiveMigrateAnnotationKey = "zeropod.ctrox.dev/live-migrate" 26 | CRIContainerNameAnnotation = "io.kubernetes.cri.container-name" 27 | CRIContainerTypeAnnotation = "io.kubernetes.cri.container-type" 28 | CRIPodNameAnnotation = "io.kubernetes.cri.sandbox-name" 29 | CRIPodNamespaceAnnotation = "io.kubernetes.cri.sandbox-namespace" 30 | CRIPodUIDAnnotation = "io.kubernetes.cri.sandbox-uid" 31 | 32 | defaultScaleDownDuration = time.Minute 33 | containersDelim = "," 34 | portsDelim = containersDelim 35 | mappingDelim = ";" 36 | mapDelim = "=" 37 | defaultContainerdNS = "k8s.io" 38 | ) 39 | 40 | type Config struct { 41 | ZeropodContainerNames []string 42 | Ports []uint16 43 | ScaleDownDuration time.Duration 44 | DisableCheckpointing bool 45 | PreDump bool 46 | Migrate []string 47 | LiveMigrate string 48 | ContainerName string 49 | ContainerType string 50 | PodName string 51 | PodNamespace string 52 | PodUID string 53 | ContainerdNamespace string 54 | spec *specs.Spec 55 | } 56 | 57 | // NewConfig uses the annotations from the container spec to create a new 58 | // typed ZeropodConfig config. 
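// For illustration (mirroring the config tests below), annotations such as
//
//	zeropod.ctrox.dev/ports-map: "container0=1234;container1=80,81"
//	zeropod.ctrox.dev/container-names: "container0,container1"
//	zeropod.ctrox.dev/scaledown-duration: "5m"
//
// parsed for a container named "container1" result in Ports []uint16{80, 81},
// ZeropodContainerNames []string{"container0", "container1"} and a
// ScaleDownDuration of five minutes; annotations that are not set fall back to
// defaults such as defaultScaleDownDuration.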
59 | func NewConfig(ctx context.Context, spec *specs.Spec) (*Config, error) { 60 | containerName := spec.Annotations[CRIContainerNameAnnotation] 61 | containerType := spec.Annotations[CRIContainerTypeAnnotation] 62 | 63 | var err error 64 | var containerPorts []uint16 65 | portsMap := spec.Annotations[PortsAnnotationKey] 66 | if portsMap != "" { 67 | for _, mapping := range strings.Split(portsMap, mappingDelim) { 68 | namePorts := strings.Split(mapping, mapDelim) 69 | if len(namePorts) != 2 { 70 | return nil, fmt.Errorf("invalid port map, the format needs to be name=port") 71 | } 72 | 73 | name, ports := namePorts[0], namePorts[1] 74 | if name != containerName { 75 | continue 76 | } 77 | 78 | for _, port := range strings.Split(ports, portsDelim) { 79 | p, err := strconv.ParseUint(port, 10, 16) 80 | if err != nil { 81 | return nil, err 82 | } 83 | containerPorts = append(containerPorts, uint16(p)) 84 | } 85 | } 86 | } 87 | 88 | scaleDownDuration := spec.Annotations[ScaleDownDurationAnnotationKey] 89 | dur := defaultScaleDownDuration 90 | if scaleDownDuration != "" { 91 | dur, err = time.ParseDuration(scaleDownDuration) 92 | if err != nil { 93 | return nil, err 94 | } 95 | } 96 | 97 | disableCheckpointValue := spec.Annotations[DisableCheckpoiningAnnotationKey] 98 | disableCheckpointing := false 99 | if disableCheckpointValue != "" { 100 | disableCheckpointing, err = strconv.ParseBool(disableCheckpointValue) 101 | if err != nil { 102 | return nil, err 103 | } 104 | } 105 | 106 | preDump := false 107 | preDumpValue := spec.Annotations[PreDumpAnnotationKey] 108 | if preDumpValue != "" { 109 | preDump, err = strconv.ParseBool(preDumpValue) 110 | if err != nil { 111 | return nil, err 112 | } 113 | if preDump && runtime.GOARCH == "arm64" { 114 | // disable pre-dump on arm64 115 | // https://github.com/checkpoint-restore/criu/issues/1859 116 | log.G(ctx).Warnf("disabling pre-dump: it was requested but is not supported on %s", runtime.GOARCH) 117 | preDump = false 118 | } 119 | } 120 | 121 | containerNames := []string{} 122 | containerNamesValue := spec.Annotations[ContainerNamesAnnotationKey] 123 | if containerNamesValue != "" { 124 | containerNames = strings.Split(containerNamesValue, containersDelim) 125 | } 126 | 127 | migrate := []string{} 128 | migrateValue := spec.Annotations[MigrateAnnotationKey] 129 | if migrateValue != "" { 130 | migrate = strings.Split(migrateValue, containersDelim) 131 | } 132 | 133 | ns, ok := namespaces.Namespace(ctx) 134 | if !ok { 135 | ns = defaultContainerdNS 136 | } 137 | 138 | return &Config{ 139 | Ports: containerPorts, 140 | ScaleDownDuration: dur, 141 | DisableCheckpointing: disableCheckpointing, 142 | PreDump: preDump, 143 | Migrate: migrate, 144 | LiveMigrate: spec.Annotations[LiveMigrateAnnotationKey], 145 | ZeropodContainerNames: containerNames, 146 | ContainerName: containerName, 147 | ContainerType: containerType, 148 | PodName: spec.Annotations[CRIPodNameAnnotation], 149 | PodNamespace: spec.Annotations[CRIPodNamespaceAnnotation], 150 | PodUID: spec.Annotations[CRIPodUIDAnnotation], 151 | ContainerdNamespace: ns, 152 | spec: spec, 153 | }, nil 154 | } 155 | 156 | func (cfg Config) IsZeropodContainer() bool { 157 | for _, n := range cfg.ZeropodContainerNames { 158 | if n == cfg.ContainerName { 159 | return true 160 | } 161 | } 162 | 163 | // if there is none specified, every one of them is considered. 
164 | return len(cfg.ZeropodContainerNames) == 0 165 | } 166 | 167 | func (cfg Config) migrationEnabled() bool { 168 | return slices.Contains(cfg.Migrate, cfg.ContainerName) 169 | } 170 | 171 | func (cfg Config) LiveMigrationEnabled() bool { 172 | return cfg.LiveMigrate == cfg.ContainerName 173 | } 174 | 175 | func (cfg Config) AnyMigrationEnabled() bool { 176 | return cfg.migrationEnabled() || cfg.LiveMigrationEnabled() 177 | } 178 | -------------------------------------------------------------------------------- /shim/config_test.go: -------------------------------------------------------------------------------- 1 | package shim 2 | 3 | import ( 4 | "context" 5 | "runtime" 6 | "testing" 7 | "time" 8 | 9 | "github.com/opencontainers/runtime-spec/specs-go" 10 | "github.com/stretchr/testify/assert" 11 | "github.com/stretchr/testify/require" 12 | ) 13 | 14 | func TestNewConfig(t *testing.T) { 15 | tests := map[string]struct { 16 | annotations map[string]string 17 | assertCfg func(t *testing.T, cfg *Config) 18 | }{ 19 | "ports": { 20 | annotations: map[string]string{ 21 | CRIContainerNameAnnotation: "container1", 22 | PortsAnnotationKey: "container0=1234;container1=80,81;container2=8080", 23 | }, 24 | assertCfg: func(t *testing.T, cfg *Config) { 25 | assert.Equal(t, []uint16{80, 81}, cfg.Ports) 26 | }, 27 | }, 28 | "container names": { 29 | annotations: map[string]string{ 30 | CRIContainerNameAnnotation: "container1", 31 | ContainerNamesAnnotationKey: "container0,container1,container2", 32 | }, 33 | assertCfg: func(t *testing.T, cfg *Config) { 34 | assert.Equal(t, "container1", cfg.ContainerName) 35 | assert.Equal(t, []string{"container0", "container1", "container2"}, 36 | cfg.ZeropodContainerNames) 37 | }, 38 | }, 39 | "scaledown duration": { 40 | annotations: map[string]string{ 41 | ScaleDownDurationAnnotationKey: "5m", 42 | }, 43 | assertCfg: func(t *testing.T, cfg *Config) { 44 | assert.Equal(t, time.Minute*5, cfg.ScaleDownDuration) 45 | }, 46 | }, 47 | "disable checkpointing": { 48 | annotations: map[string]string{ 49 | DisableCheckpoiningAnnotationKey: "true", 50 | }, 51 | assertCfg: func(t *testing.T, cfg *Config) { 52 | assert.True(t, cfg.DisableCheckpointing) 53 | }, 54 | }, 55 | "enable checkpointing": { 56 | annotations: map[string]string{ 57 | DisableCheckpoiningAnnotationKey: "false", 58 | }, 59 | assertCfg: func(t *testing.T, cfg *Config) { 60 | assert.False(t, cfg.DisableCheckpointing) 61 | }, 62 | }, 63 | "predump": { 64 | annotations: map[string]string{ 65 | PreDumpAnnotationKey: "true", 66 | }, 67 | assertCfg: func(t *testing.T, cfg *Config) { 68 | assert.True(t, cfg.PreDump) 69 | }, 70 | }, 71 | "disable predump": { 72 | annotations: map[string]string{ 73 | PreDumpAnnotationKey: "false", 74 | }, 75 | assertCfg: func(t *testing.T, cfg *Config) { 76 | assert.False(t, cfg.PreDump) 77 | }, 78 | }, 79 | } 80 | 81 | for name, tc := range tests { 82 | t.Run(name, func(t *testing.T) { 83 | if tc.annotations[PreDumpAnnotationKey] == "true" && runtime.GOARCH == "arm64" { 84 | t.Skip("skipping pre-dump test as it's not supported on arm64") 85 | } 86 | cfg, err := NewConfig(context.Background(), &specs.Spec{ 87 | Annotations: tc.annotations, 88 | }) 89 | require.NoError(t, err) 90 | tc.assertCfg(t, cfg) 91 | }) 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /shim/io/container_io.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The containerd Authors. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package io 18 | 19 | import ( 20 | "errors" 21 | "fmt" 22 | "io" 23 | "strings" 24 | 25 | "github.com/containerd/log" 26 | 27 | "github.com/containerd/containerd/v2/pkg/cio" 28 | cioutil "github.com/containerd/containerd/v2/pkg/ioutil" 29 | ) 30 | 31 | // streamKey generates a key for the stream. 32 | func streamKey(id, name string, stream StreamType) string { 33 | return strings.Join([]string{id, name, string(stream)}, "-") 34 | } 35 | 36 | // ContainerIO holds the container io. 37 | type ContainerIO struct { 38 | id string 39 | 40 | fifos *cio.FIFOSet 41 | *stdioStream 42 | 43 | stdoutGroup *cioutil.WriterGroup 44 | stderrGroup *cioutil.WriterGroup 45 | 46 | closer *wgCloser 47 | } 48 | 49 | var _ cio.IO = &ContainerIO{} 50 | 51 | // ContainerIOOpts sets specific information to newly created ContainerIO. 52 | type ContainerIOOpts func(*ContainerIO) error 53 | 54 | // WithFIFOs specifies existing fifos for the container io. 55 | func WithFIFOs(fifos *cio.FIFOSet) ContainerIOOpts { 56 | return func(c *ContainerIO) error { 57 | c.fifos = fifos 58 | return nil 59 | } 60 | } 61 | 62 | // WithNewFIFOs creates new fifos for the container io. 63 | func WithNewFIFOs(root string, tty, stdin bool) ContainerIOOpts { 64 | return func(c *ContainerIO) error { 65 | fifos, err := newFifos(root, c.id, tty, stdin) 66 | if err != nil { 67 | return err 68 | } 69 | return WithFIFOs(fifos)(c) 70 | } 71 | } 72 | 73 | // WithStreams creates new streams for the container io. 74 | // The stream address is in format of `protocol://address?stream_id=xyz`. 75 | // It allocates ContainerID-stdin, ContainerID-stdout and ContainerID-stderr as streaming IDs. 76 | // For example, that advertiser address of shim is `ttrpc+unix:///run/demo.sock` and container ID is `app`. 77 | // There are three streams if stdin is enabled and TTY is disabled. 78 | // 79 | // - Stdin: ttrpc+unix:///run/demo.sock?stream_id=app-stdin 80 | // - Stdout: ttrpc+unix:///run/demo.sock?stream_id=app-stdout 81 | // - stderr: ttrpc+unix:///run/demo.sock?stream_id=app-stderr 82 | // 83 | // The streaming IDs will be used as unique key to establish stream tunnel. 84 | // And it should support reconnection with the same streaming ID if containerd restarts. 85 | func WithStreams(address string, tty, stdin bool) ContainerIOOpts { 86 | return func(c *ContainerIO) error { 87 | if address == "" { 88 | return fmt.Errorf("address can not be empty for io stream") 89 | } 90 | fifos, err := newStreams(address, c.id, tty, stdin) 91 | if err != nil { 92 | return err 93 | } 94 | return WithFIFOs(fifos)(c) 95 | } 96 | } 97 | 98 | // NewContainerIO creates container io. 
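// A minimal usage sketch (hypothetical id and fifo root, error handling
// elided; WithNewFIFOs and Pipe are defined in this package):
//
//	cio, err := NewContainerIO("app", WithNewFIFOs("/run/fifos", false, true))
//	if err != nil {
//		return err
//	}
//	cio.Pipe()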
99 | func NewContainerIO(id string, opts ...ContainerIOOpts) (_ *ContainerIO, err error) { 100 | c := &ContainerIO{ 101 | id: id, 102 | stdoutGroup: cioutil.NewWriterGroup(), 103 | stderrGroup: cioutil.NewWriterGroup(), 104 | } 105 | for _, opt := range opts { 106 | if err := opt(c); err != nil { 107 | return nil, err 108 | } 109 | } 110 | if c.fifos == nil { 111 | return nil, errors.New("fifos are not set") 112 | } 113 | // Create actual fifos. 114 | stdio, closer, err := newStdioStream(c.fifos) 115 | if err != nil { 116 | return nil, err 117 | } 118 | c.stdioStream = stdio 119 | c.closer = closer 120 | return c, nil 121 | } 122 | 123 | // Config returns io config. 124 | func (c *ContainerIO) Config() cio.Config { 125 | return c.fifos.Config 126 | } 127 | 128 | // Pipe creates container fifos and pipe container output 129 | // to output stream. 130 | func (c *ContainerIO) Pipe() { 131 | wg := c.closer.wg 132 | if c.stdout != nil { 133 | wg.Add(1) 134 | go func() { 135 | if _, err := io.Copy(c.stdoutGroup, c.stdout); err != nil { 136 | log.L.WithError(err).Errorf("Failed to pipe stdout of container %q", c.id) 137 | } 138 | c.stdout.Close() 139 | c.stdoutGroup.Close() 140 | wg.Done() 141 | log.L.Debugf("Finish piping stdout of container %q", c.id) 142 | }() 143 | } 144 | 145 | if !c.fifos.Terminal && c.stderr != nil { 146 | wg.Add(1) 147 | go func() { 148 | if _, err := io.Copy(c.stderrGroup, c.stderr); err != nil { 149 | log.L.WithError(err).Errorf("Failed to pipe stderr of container %q", c.id) 150 | } 151 | c.stderr.Close() 152 | c.stderrGroup.Close() 153 | wg.Done() 154 | log.L.Debugf("Finish piping stderr of container %q", c.id) 155 | }() 156 | } 157 | } 158 | 159 | // AddOutput adds new write closers to the container stream, and returns existing 160 | // write closers if there are any. 161 | func (c *ContainerIO) AddOutput(name string, stdout, stderr io.WriteCloser) (io.WriteCloser, io.WriteCloser) { 162 | var oldStdout, oldStderr io.WriteCloser 163 | if stdout != nil { 164 | key := streamKey(c.id, name, Stdout) 165 | oldStdout = c.stdoutGroup.Get(key) 166 | c.stdoutGroup.Add(key, stdout) 167 | } 168 | if stderr != nil { 169 | key := streamKey(c.id, name, Stderr) 170 | oldStderr = c.stderrGroup.Get(key) 171 | c.stderrGroup.Add(key, stderr) 172 | } 173 | return oldStdout, oldStderr 174 | } 175 | 176 | // Cancel cancels container io. 177 | func (c *ContainerIO) Cancel() { 178 | c.closer.Cancel() 179 | } 180 | 181 | // Wait waits container io to finish. 182 | func (c *ContainerIO) Wait() { 183 | c.closer.Wait() 184 | } 185 | 186 | // Close closes all FIFOs. 187 | func (c *ContainerIO) Close() error { 188 | c.closer.Close() 189 | if c.fifos != nil { 190 | return c.fifos.Close() 191 | } 192 | return nil 193 | } 194 | -------------------------------------------------------------------------------- /shim/io/cri.go: -------------------------------------------------------------------------------- 1 | package io 2 | 3 | // from https://github.com/kubernetes/cri-api/blob/87ee4e17aba6baf0bfd719ec0532af4000e83c5d/pkg/apis/runtime/v1/constants.go 4 | 5 | // LogTag is the tag of a log line in CRI container log. 6 | // Currently defined log tags: 7 | // * First tag: Partial/Full - P/F. 8 | // The field in the container log format can be extended to include multiple 9 | // tags by using a delimiter, but changes should be rare. If it becomes clear 10 | // that better extensibility is desired, a more extensible format (e.g., json) 11 | // should be adopted as a replacement and/or addition. 
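// An illustrative (made-up) pair of CRI log lines using these tags, following
// the `<timestamp> <stream> <tag> <message>` layout:
//
//	2024-01-01T00:00:00.000000000Z stdout P start of a line that was split up
//	2024-01-01T00:00:00.000000001Z stdout F and the remainder, terminating it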
12 | type LogTag string 13 | 14 | const ( 15 | // LogTagPartial means the line is part of multiple lines. 16 | LogTagPartial LogTag = "P" 17 | // LogTagFull means the line is a single full line or the end of multiple lines. 18 | LogTagFull LogTag = "F" 19 | ) 20 | -------------------------------------------------------------------------------- /shim/io/doc.go: -------------------------------------------------------------------------------- 1 | // Package io has been copied from 2 | // https://github.com/containerd/containerd/tree/c07fb699a9a0c9792d21a759c87d0d60de11bfb9/internal/cri/io 3 | // with some small changes to avoid importing k8s.io/cri-api (except in tests) 4 | // and github.com/docker/go-metrics 5 | package io 6 | -------------------------------------------------------------------------------- /shim/io/helpers_unix.go: -------------------------------------------------------------------------------- 1 | //go:build !windows 2 | 3 | /* 4 | Copyright The containerd Authors. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package io 20 | 21 | import ( 22 | "context" 23 | "io" 24 | "os" 25 | 26 | "github.com/containerd/fifo" 27 | ) 28 | 29 | func openPipe(ctx context.Context, fn string, flag int, perm os.FileMode) (io.ReadWriteCloser, error) { 30 | return fifo.OpenFifo(ctx, fn, flag, perm) 31 | } 32 | -------------------------------------------------------------------------------- /shim/log.go: -------------------------------------------------------------------------------- 1 | package shim 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "path/filepath" 7 | "sort" 8 | ) 9 | 10 | // getLogPath gets the log path of the container by searching for the last log 11 | // file in the CRI pod log path. 12 | // TODO: it would be nicer to get this path via annotations but it looks like 13 | // containerd only passes that to the sandbox container (pause). One possible 14 | // solution would be to implement log restoring in the sandbox container 15 | // instead of the zeropod. 
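// The directory it searches follows the standard CRI pod log layout, e.g.
// (illustrative values):
//
//	/var/log/pods/<namespace>_<pod-name>_<pod-uid>/<container-name>/0.log
//
// where the log files are typically named after the container's restart count.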
16 | func getLogPath(cfg *Config) (string, error) {
17 | 	logDir := fmt.Sprintf("/var/log/pods/%s_%s_%s/%s", cfg.PodNamespace, cfg.PodName, cfg.PodUID, cfg.ContainerName)
18 | 
19 | 	dir, err := os.Open(logDir)
20 | 	if err != nil {
21 | 		return "", err
22 | 	}
23 | 	defer dir.Close()
24 | 
25 | 	names, err := dir.Readdirnames(0)
26 | 	if err != nil {
27 | 		return "", err
28 | 	}
29 | 	sort.Slice(names, func(i, j int) bool {
30 | 		return names[i] < names[j]
31 | 	})
32 | 
33 | 	return filepath.Join(logDir, names[len(names)-1]), nil
34 | }
35 | 
-------------------------------------------------------------------------------- /shim/metrics.go: --------------------------------------------------------------------------------
1 | package shim
2 | 
3 | import (
4 | 	v1 "github.com/ctrox/zeropod/api/shim/v1"
5 | )
6 | 
7 | func newMetrics(cfg *Config, running bool) *v1.ContainerMetrics {
8 | 	return &v1.ContainerMetrics{
9 | 		Name:         cfg.ContainerName,
10 | 		PodName:      cfg.PodName,
11 | 		PodNamespace: cfg.PodNamespace,
12 | 		Running:      running,
13 | 	}
14 | }
15 | 
-------------------------------------------------------------------------------- /shim/port.go: --------------------------------------------------------------------------------
1 | package shim
2 | 
3 | import (
4 | 	"fmt"
5 | 	"os"
6 | 	"path/filepath"
7 | 	"strconv"
8 | 	"strings"
9 | 
10 | 	"github.com/prometheus/procfs"
11 | )
12 | 
13 | const (
14 | 	stateListen  = 10
15 | 	procPath     = "/proc"
16 | 	childrenFile = "children"
17 | 	taskDir      = "task"
18 | )
19 | 
20 | // listeningPortsDeep finds all ports in listen state belonging to the
21 | // supplied process and all of its child processes. It finds both IPv4 and
22 | // IPv6 sockets.
23 | func listeningPortsDeep(pid int) ([]uint16, error) {
24 | 	children, err := findChildren(pid)
25 | 	if err != nil {
26 | 		return nil, fmt.Errorf("finding child pids: %w", err)
27 | 	}
28 | 
29 | 	// use a map to eliminate duplicates
30 | 	portMap := map[uint16]struct{}{}
31 | 	for _, pid := range append([]int{pid}, children...) {
32 | 		p, err := listeningPorts(pid)
33 | 		if err != nil {
34 | 			return nil, err
35 | 		}
36 | 		for _, port := range p {
37 | 			portMap[port] = struct{}{}
38 | 		}
39 | 	}
40 | 	ports := []uint16{}
41 | 	for k := range portMap {
42 | 		ports = append(ports, k)
43 | 	}
44 | 
45 | 	return ports, nil
46 | }
47 | 
48 | // listeningPorts finds all ports in listen state belonging to the supplied
49 | // process. It finds both IPv4 and IPv6 sockets.
50 | func listeningPorts(pid int) ([]uint16, error) {
51 | 	fs, err := procfs.NewFS(filepath.Join(procPath, strconv.Itoa(pid)))
52 | 	if err != nil {
53 | 		return nil, err
54 | 	}
55 | 
56 | 	tcp, err := fs.NetTCP()
57 | 	if err != nil {
58 | 		return nil, err
59 | 	}
60 | 
61 | 	tcp6, err := fs.NetTCP6()
62 | 	if err != nil {
63 | 		return nil, err
64 | 	}
65 | 
66 | 	inos, err := inodes(pid)
67 | 	if err != nil {
68 | 		return nil, err
69 | 	}
70 | 
71 | 	// use a map to eliminate duplicates
72 | 	portMap := map[uint16]struct{}{}
73 | 	for _, line := range append(tcp, tcp6...)
{ 74 | if _, ok := inos[line.Inode]; ok && line.St == stateListen { 75 | portMap[uint16(line.LocalPort)] = struct{}{} 76 | } 77 | } 78 | 79 | ports := []uint16{} 80 | for k := range portMap { 81 | ports = append(ports, k) 82 | } 83 | 84 | return ports, err 85 | } 86 | 87 | func inodes(pid int) (map[uint64]struct{}, error) { 88 | fs, err := procfs.NewFS(procPath) 89 | if err != nil { 90 | return nil, err 91 | } 92 | 93 | proc, err := fs.Proc(pid) 94 | if err != nil { 95 | return nil, err 96 | } 97 | 98 | fdInfos, err := proc.FileDescriptorsInfo() 99 | if err != nil { 100 | return nil, err 101 | } 102 | 103 | inodes := map[uint64]struct{}{} 104 | for _, fdInfo := range fdInfos { 105 | inode, err := strconv.ParseUint(fdInfo.Ino, 10, 64) 106 | if err != nil { 107 | return nil, fmt.Errorf("unable to parse inode to uint: %w", err) 108 | } 109 | inodes[inode] = struct{}{} 110 | } 111 | 112 | return inodes, nil 113 | } 114 | 115 | // findChildren uses the procfs to find all child tasks of the supplied 116 | // process and returns their pids if any are found. 117 | func findChildren(pid int) ([]int, error) { 118 | taskPath := filepath.Join(procPath, strconv.Itoa(pid), taskDir) 119 | tasks, err := os.ReadDir(taskPath) 120 | if err != nil { 121 | return nil, fmt.Errorf("listing tasks dir: %w", err) 122 | } 123 | 124 | pids := []int{} 125 | for _, task := range tasks { 126 | f, err := os.ReadFile(filepath.Join(taskPath, task.Name(), childrenFile)) 127 | if err != nil { 128 | if os.IsNotExist(err) { 129 | continue 130 | } 131 | return nil, fmt.Errorf("reading children file: %w", err) 132 | } 133 | 134 | spl := strings.Split(string(f), " ") 135 | if len(spl) < 2 { 136 | continue 137 | } 138 | 139 | for _, strPID := range spl { 140 | pid, err := strconv.Atoi(strPID) 141 | if err != nil { 142 | continue 143 | } 144 | pids = append(pids, pid) 145 | } 146 | } 147 | 148 | return pids, nil 149 | } 150 | -------------------------------------------------------------------------------- /shim/port_test.go: -------------------------------------------------------------------------------- 1 | package shim 2 | 3 | import ( 4 | "net/http/httptest" 5 | "net/url" 6 | "os" 7 | "strconv" 8 | "testing" 9 | 10 | "github.com/stretchr/testify/assert" 11 | "github.com/stretchr/testify/require" 12 | ) 13 | 14 | func TestListeningPorts(t *testing.T) { 15 | ts := httptest.NewServer(nil) 16 | ts2 := httptest.NewServer(nil) 17 | ports, err := listeningPortsDeep(os.Getpid()) 18 | require.NoError(t, err) 19 | 20 | u, err := url.Parse(ts.URL) 21 | require.NoError(t, err) 22 | u2, err := url.Parse(ts2.URL) 23 | require.NoError(t, err) 24 | 25 | port, err := strconv.Atoi(u.Port()) 26 | require.NoError(t, err) 27 | port2, err := strconv.Atoi(u2.Port()) 28 | require.NoError(t, err) 29 | assert.Contains(t, ports, uint16(port)) 30 | assert.Contains(t, ports, uint16(port2)) 31 | } 32 | -------------------------------------------------------------------------------- /shim/task/plugin/plugin_linux.go: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright The containerd Authors. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package plugin 18 | 19 | import ( 20 | "github.com/containerd/containerd/v2/pkg/shim" 21 | "github.com/containerd/containerd/v2/pkg/shutdown" 22 | "github.com/containerd/containerd/v2/plugins" 23 | "github.com/containerd/plugin" 24 | "github.com/containerd/plugin/registry" 25 | "github.com/ctrox/zeropod/shim/task" 26 | ) 27 | 28 | func init() { 29 | registry.Register(&plugin.Registration{ 30 | Type: plugins.TTRPCPlugin, 31 | ID: "task", 32 | Requires: []plugin.Type{ 33 | plugins.EventPlugin, 34 | plugins.InternalPlugin, 35 | }, 36 | InitFn: func(ic *plugin.InitContext) (interface{}, error) { 37 | pp, err := ic.GetByID(plugins.EventPlugin, "publisher") 38 | if err != nil { 39 | return nil, err 40 | } 41 | ss, err := ic.GetByID(plugins.InternalPlugin, "shutdown") 42 | if err != nil { 43 | return nil, err 44 | } 45 | return task.NewZeropodService(ic.Context, pp.(shim.Publisher), ss.(shutdown.Service)) 46 | }, 47 | }) 48 | } 49 | -------------------------------------------------------------------------------- /shim/task/register.go: -------------------------------------------------------------------------------- 1 | package task 2 | 3 | import ( 4 | "context" 5 | 6 | taskAPI "github.com/containerd/containerd/api/runtime/task/v3" 7 | "github.com/containerd/ttrpc" 8 | ) 9 | 10 | const ( 11 | taskServiceV2 = "containerd.task.v2.Task" 12 | taskServiceV3 = "containerd.task.v3.Task" 13 | ) 14 | 15 | // registerTaskService registers a task service with the provided name. This is 16 | // a bit of a hack to register a v3 task service as a v2 service. Since the API 17 | // has not changed at all this works just fine. 
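// A call presumably looks like the following sketch; the concrete wiring
// happens in the service setup, which is not part of this file:
//
//	registerTaskService(taskServiceV2, server, svc)
//	registerTaskService(taskServiceV3, server, svc)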
18 | func registerTaskService(name string, srv *ttrpc.Server, svc taskAPI.TTRPCTaskService) { 19 | srv.RegisterService(name, &ttrpc.ServiceDesc{ 20 | Methods: map[string]ttrpc.Method{ 21 | "State": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 22 | var req taskAPI.StateRequest 23 | if err := unmarshal(&req); err != nil { 24 | return nil, err 25 | } 26 | return svc.State(ctx, &req) 27 | }, 28 | "Create": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 29 | var req taskAPI.CreateTaskRequest 30 | if err := unmarshal(&req); err != nil { 31 | return nil, err 32 | } 33 | return svc.Create(ctx, &req) 34 | }, 35 | "Start": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 36 | var req taskAPI.StartRequest 37 | if err := unmarshal(&req); err != nil { 38 | return nil, err 39 | } 40 | return svc.Start(ctx, &req) 41 | }, 42 | "Delete": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 43 | var req taskAPI.DeleteRequest 44 | if err := unmarshal(&req); err != nil { 45 | return nil, err 46 | } 47 | return svc.Delete(ctx, &req) 48 | }, 49 | "Pids": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 50 | var req taskAPI.PidsRequest 51 | if err := unmarshal(&req); err != nil { 52 | return nil, err 53 | } 54 | return svc.Pids(ctx, &req) 55 | }, 56 | "Pause": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 57 | var req taskAPI.PauseRequest 58 | if err := unmarshal(&req); err != nil { 59 | return nil, err 60 | } 61 | return svc.Pause(ctx, &req) 62 | }, 63 | "Resume": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 64 | var req taskAPI.ResumeRequest 65 | if err := unmarshal(&req); err != nil { 66 | return nil, err 67 | } 68 | return svc.Resume(ctx, &req) 69 | }, 70 | "Checkpoint": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 71 | var req taskAPI.CheckpointTaskRequest 72 | if err := unmarshal(&req); err != nil { 73 | return nil, err 74 | } 75 | return svc.Checkpoint(ctx, &req) 76 | }, 77 | "Kill": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 78 | var req taskAPI.KillRequest 79 | if err := unmarshal(&req); err != nil { 80 | return nil, err 81 | } 82 | return svc.Kill(ctx, &req) 83 | }, 84 | "Exec": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 85 | var req taskAPI.ExecProcessRequest 86 | if err := unmarshal(&req); err != nil { 87 | return nil, err 88 | } 89 | return svc.Exec(ctx, &req) 90 | }, 91 | "ResizePty": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 92 | var req taskAPI.ResizePtyRequest 93 | if err := unmarshal(&req); err != nil { 94 | return nil, err 95 | } 96 | return svc.ResizePty(ctx, &req) 97 | }, 98 | "CloseIO": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 99 | var req taskAPI.CloseIORequest 100 | if err := unmarshal(&req); err != nil { 101 | return nil, err 102 | } 103 | return svc.CloseIO(ctx, &req) 104 | }, 105 | "Update": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 106 | var req taskAPI.UpdateTaskRequest 107 | if err := unmarshal(&req); err != nil { 108 | return nil, err 109 | } 110 | return svc.Update(ctx, &req) 111 | }, 112 | "Wait": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 113 | 
var req taskAPI.WaitRequest 114 | if err := unmarshal(&req); err != nil { 115 | return nil, err 116 | } 117 | return svc.Wait(ctx, &req) 118 | }, 119 | "Stats": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 120 | var req taskAPI.StatsRequest 121 | if err := unmarshal(&req); err != nil { 122 | return nil, err 123 | } 124 | return svc.Stats(ctx, &req) 125 | }, 126 | "Connect": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 127 | var req taskAPI.ConnectRequest 128 | if err := unmarshal(&req); err != nil { 129 | return nil, err 130 | } 131 | return svc.Connect(ctx, &req) 132 | }, 133 | "Shutdown": func(ctx context.Context, unmarshal func(interface{}) error) (interface{}, error) { 134 | var req taskAPI.ShutdownRequest 135 | if err := unmarshal(&req); err != nil { 136 | return nil, err 137 | } 138 | return svc.Shutdown(ctx, &req) 139 | }, 140 | }, 141 | }) 142 | } 143 | -------------------------------------------------------------------------------- /shim/task/shim.go: -------------------------------------------------------------------------------- 1 | package task 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "os" 8 | "path" 9 | "path/filepath" 10 | 11 | "github.com/containerd/containerd/v2/pkg/shim" 12 | "github.com/containerd/log" 13 | "github.com/containerd/ttrpc" 14 | v1 "github.com/ctrox/zeropod/api/shim/v1" 15 | ) 16 | 17 | const ShimSocketPath = "/run/zeropod/s/" 18 | 19 | func shimSocketAddress(containerdSocket string) string { 20 | return fmt.Sprintf("unix://%s.sock", filepath.Join(ShimSocketPath, path.Base(containerdSocket))) 21 | } 22 | 23 | func shimID() (string, error) { 24 | address, err := shim.ReadAddress("address") 25 | if err == nil { 26 | return address, nil 27 | } 28 | 29 | path, err := filepath.Abs("bootstrap.json") 30 | if err != nil { 31 | return "", fmt.Errorf("reading bootstrap.json: %w", err) 32 | } 33 | data, err := os.ReadFile(path) 34 | if err != nil { 35 | return "", fmt.Errorf("reading bootstrap.json: %w", err) 36 | } 37 | var params shim.BootstrapParams 38 | if err := json.Unmarshal(data, ¶ms); err != nil { 39 | return "", fmt.Errorf("unmarshalling bootstrap.json: %w", err) 40 | } 41 | 42 | return filepath.Base(params.Address), nil 43 | } 44 | 45 | func startShimServer(ctx context.Context, id string, task *wrapper) { 46 | socket := shimSocketAddress(id) 47 | listener, err := shim.NewSocket(socket) 48 | if err != nil { 49 | if !shim.SocketEaddrinuse(err) { 50 | log.G(ctx).WithError(err).Error("listening to socket") 51 | return 52 | } 53 | 54 | if shim.CanConnect(socket) { 55 | log.G(ctx).Debug("shim socket already exists, skipping server start") 56 | return 57 | } 58 | 59 | if err := shim.RemoveSocket(socket); err != nil { 60 | log.G(ctx).WithError(err).Error("remove pre-existing socket") 61 | } 62 | 63 | listener, err = shim.NewSocket(socket) 64 | if err != nil { 65 | log.G(ctx).WithError(err).Error("failed to create shim listener") 66 | } 67 | } 68 | 69 | log.G(ctx).Infof("starting shim server at %s", socket) 70 | // write shim address to filesystem 71 | if err := v1.WriteAddress("shim_address", socket); err != nil { 72 | log.G(ctx).WithError(err).Errorf("failed to write shim address") 73 | return 74 | } 75 | 76 | s, err := ttrpc.NewServer() 77 | if err != nil { 78 | log.G(ctx).WithError(err).Errorf("failed to create ttrpc server") 79 | return 80 | } 81 | defer s.Close() 82 | 83 | v1.RegisterShimService(s, &shimService{task: task}) 84 | 85 | defer func() { 86 | 
s.Close() 87 | listener.Close() 88 | os.Remove(socket) 89 | }() 90 | go s.Serve(ctx, listener) 91 | 92 | <-ctx.Done() 93 | 94 | log.G(ctx).Info("stopping shim server") 95 | } 96 | 97 | // shimService is an extension to the shim task service to provide 98 | // zeropod-specific functions like metrics. 99 | type shimService struct { 100 | task *wrapper 101 | } 102 | 103 | // SubscribeStatus watches for shim events. 104 | func (s *shimService) SubscribeStatus(ctx context.Context, _ *v1.SubscribeStatusRequest, srv v1.Shim_SubscribeStatusServer) error { 105 | for { 106 | select { 107 | case msg := <-s.task.zeropodEvents: 108 | if err := srv.Send(msg); err != nil { 109 | log.G(ctx).Errorf("unable to send event message: %s", err) 110 | } 111 | case <-ctx.Done(): 112 | return nil 113 | } 114 | } 115 | } 116 | 117 | // GetStatus returns the status of a zeropod container. 118 | func (s *shimService) GetStatus(ctx context.Context, req *v1.ContainerRequest) (*v1.ContainerStatus, error) { 119 | container, ok := s.task.zeropodContainers[req.Id] 120 | if !ok { 121 | return nil, fmt.Errorf("could not find zeropod container with id: %s", req.Id) 122 | } 123 | 124 | return container.Status(), nil 125 | } 126 | 127 | // Metrics returns metrics of the zeropod shim instance. 128 | func (s *shimService) Metrics(context.Context, *v1.MetricsRequest) (*v1.MetricsResponse, error) { 129 | containerMetrics := []*v1.ContainerMetrics{} 130 | for _, container := range s.task.zeropodContainers { 131 | containerMetrics = append(containerMetrics, container.GetMetrics()) 132 | } 133 | return &v1.MetricsResponse{Metrics: containerMetrics}, nil 134 | } 135 | -------------------------------------------------------------------------------- /shim/util.go: -------------------------------------------------------------------------------- 1 | package shim 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "os" 7 | "path/filepath" 8 | 9 | "github.com/opencontainers/runtime-spec/specs-go" 10 | ) 11 | 12 | const RuntimeName = "io.containerd.zeropod.v2" 13 | 14 | func GetSpec(bundlePath string) (*specs.Spec, error) { 15 | var bundleSpec specs.Spec 16 | bundleConfigContents, err := os.ReadFile(filepath.Join(bundlePath, "config.json")) 17 | if err != nil { 18 | return nil, fmt.Errorf("failed to read budle: %w", err) 19 | } 20 | 21 | if err := json.Unmarshal(bundleConfigContents, &bundleSpec); err != nil { 22 | return nil, err 23 | } 24 | 25 | return &bundleSpec, nil 26 | } 27 | 28 | // GetNetworkNS reads the bundle's OCI spec and returns the network NS path of 29 | // the container. 30 | func GetNetworkNS(spec *specs.Spec) (string, error) { 31 | for _, ns := range spec.Linux.Namespaces { 32 | if ns.Type == specs.NetworkNamespace { 33 | return ns.Path, nil 34 | } 35 | } 36 | 37 | return "", fmt.Errorf("could not find network namespace in container spec") 38 | } 39 | 40 | // GetPIDNS reads the bundle's OCI spec and returns the PID NS path of the 41 | // container. 
42 | func GetPIDNS(spec *specs.Spec) (string, error) { 43 | for _, ns := range spec.Linux.Namespaces { 44 | if ns.Type == specs.PIDNamespace { 45 | return ns.Path, nil 46 | } 47 | } 48 | 49 | return "", fmt.Errorf("could not find pid namespace in container spec") 50 | } 51 | -------------------------------------------------------------------------------- /socket/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.23 as gomod 2 | 3 | WORKDIR /app 4 | ADD go.* /app 5 | # for some reason, doing go mod download in the golang container results in a 6 | # way smaller image, so we'll do it here and do the rest in a separate stage. 7 | RUN go mod download 8 | 9 | # we use fedora since it has a recent version of bpftool 10 | FROM fedora:41 11 | RUN dnf install -y llvm clang bpftool libbpf-devel golang 12 | 13 | RUN mkdir /headers 14 | RUN cp /usr/include/bpf/bpf_* /headers 15 | COPY socket/vmlinux.h.gz /headers 16 | RUN gunzip /headers/vmlinux.h.gz 17 | 18 | COPY --from=gomod /go /tmp 19 | ENV GOCACHE=/tmp 20 | ENV GOMODCACHE=/tmp/pkg/mod 21 | 22 | RUN mkdir -m 777 /app 23 | WORKDIR /app 24 | 25 | ENTRYPOINT ["go", "generate", "./..."] 26 | -------------------------------------------------------------------------------- /socket/bpf_bpfeb.go: -------------------------------------------------------------------------------- 1 | // Code generated by bpf2go; DO NOT EDIT. 2 | //go:build mips || mips64 || ppc64 || s390x 3 | 4 | package socket 5 | 6 | import ( 7 | "bytes" 8 | _ "embed" 9 | "fmt" 10 | "io" 11 | 12 | "github.com/cilium/ebpf" 13 | ) 14 | 15 | // loadBpf returns the embedded CollectionSpec for bpf. 16 | func loadBpf() (*ebpf.CollectionSpec, error) { 17 | reader := bytes.NewReader(_BpfBytes) 18 | spec, err := ebpf.LoadCollectionSpecFromReader(reader) 19 | if err != nil { 20 | return nil, fmt.Errorf("can't load bpf: %w", err) 21 | } 22 | 23 | return spec, err 24 | } 25 | 26 | // loadBpfObjects loads bpf and converts it into a struct. 27 | // 28 | // The following types are suitable as obj argument: 29 | // 30 | // *bpfObjects 31 | // *bpfPrograms 32 | // *bpfMaps 33 | // 34 | // See ebpf.CollectionSpec.LoadAndAssign documentation for details. 35 | func loadBpfObjects(obj interface{}, opts *ebpf.CollectionOptions) error { 36 | spec, err := loadBpf() 37 | if err != nil { 38 | return err 39 | } 40 | 41 | return spec.LoadAndAssign(obj, opts) 42 | } 43 | 44 | // bpfSpecs contains maps and programs before they are loaded into the kernel. 45 | // 46 | // It can be passed ebpf.CollectionSpec.Assign. 47 | type bpfSpecs struct { 48 | bpfProgramSpecs 49 | bpfMapSpecs 50 | bpfVariableSpecs 51 | } 52 | 53 | // bpfProgramSpecs contains programs before they are loaded into the kernel. 54 | // 55 | // It can be passed ebpf.CollectionSpec.Assign. 56 | type bpfProgramSpecs struct { 57 | KretprobeInetCskAccept *ebpf.ProgramSpec `ebpf:"kretprobe__inet_csk_accept"` 58 | } 59 | 60 | // bpfMapSpecs contains maps before they are loaded into the kernel. 61 | // 62 | // It can be passed ebpf.CollectionSpec.Assign. 63 | type bpfMapSpecs struct { 64 | TcpEvents *ebpf.MapSpec `ebpf:"tcp_events"` 65 | } 66 | 67 | // bpfVariableSpecs contains global variables before they are loaded into the kernel. 68 | // 69 | // It can be passed ebpf.CollectionSpec.Assign. 70 | type bpfVariableSpecs struct { 71 | } 72 | 73 | // bpfObjects contains all objects after they have been loaded into the kernel. 
74 | // 75 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 76 | type bpfObjects struct { 77 | bpfPrograms 78 | bpfMaps 79 | bpfVariables 80 | } 81 | 82 | func (o *bpfObjects) Close() error { 83 | return _BpfClose( 84 | &o.bpfPrograms, 85 | &o.bpfMaps, 86 | ) 87 | } 88 | 89 | // bpfMaps contains all maps after they have been loaded into the kernel. 90 | // 91 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 92 | type bpfMaps struct { 93 | TcpEvents *ebpf.Map `ebpf:"tcp_events"` 94 | } 95 | 96 | func (m *bpfMaps) Close() error { 97 | return _BpfClose( 98 | m.TcpEvents, 99 | ) 100 | } 101 | 102 | // bpfVariables contains all global variables after they have been loaded into the kernel. 103 | // 104 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 105 | type bpfVariables struct { 106 | } 107 | 108 | // bpfPrograms contains all programs after they have been loaded into the kernel. 109 | // 110 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 111 | type bpfPrograms struct { 112 | KretprobeInetCskAccept *ebpf.Program `ebpf:"kretprobe__inet_csk_accept"` 113 | } 114 | 115 | func (p *bpfPrograms) Close() error { 116 | return _BpfClose( 117 | p.KretprobeInetCskAccept, 118 | ) 119 | } 120 | 121 | func _BpfClose(closers ...io.Closer) error { 122 | for _, closer := range closers { 123 | if err := closer.Close(); err != nil { 124 | return err 125 | } 126 | } 127 | return nil 128 | } 129 | 130 | // Do not access this directly. 131 | // 132 | //go:embed bpf_bpfeb.o 133 | var _BpfBytes []byte 134 | -------------------------------------------------------------------------------- /socket/bpf_bpfeb.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ctrox/zeropod/74906d25199d6c90954298fb57abc2693a64afc9/socket/bpf_bpfeb.o -------------------------------------------------------------------------------- /socket/bpf_bpfel.go: -------------------------------------------------------------------------------- 1 | // Code generated by bpf2go; DO NOT EDIT. 2 | //go:build 386 || amd64 || arm || arm64 || loong64 || mips64le || mipsle || ppc64le || riscv64 3 | 4 | package socket 5 | 6 | import ( 7 | "bytes" 8 | _ "embed" 9 | "fmt" 10 | "io" 11 | 12 | "github.com/cilium/ebpf" 13 | ) 14 | 15 | // loadBpf returns the embedded CollectionSpec for bpf. 16 | func loadBpf() (*ebpf.CollectionSpec, error) { 17 | reader := bytes.NewReader(_BpfBytes) 18 | spec, err := ebpf.LoadCollectionSpecFromReader(reader) 19 | if err != nil { 20 | return nil, fmt.Errorf("can't load bpf: %w", err) 21 | } 22 | 23 | return spec, err 24 | } 25 | 26 | // loadBpfObjects loads bpf and converts it into a struct. 27 | // 28 | // The following types are suitable as obj argument: 29 | // 30 | // *bpfObjects 31 | // *bpfPrograms 32 | // *bpfMaps 33 | // 34 | // See ebpf.CollectionSpec.LoadAndAssign documentation for details. 35 | func loadBpfObjects(obj interface{}, opts *ebpf.CollectionOptions) error { 36 | spec, err := loadBpf() 37 | if err != nil { 38 | return err 39 | } 40 | 41 | return spec.LoadAndAssign(obj, opts) 42 | } 43 | 44 | // bpfSpecs contains maps and programs before they are loaded into the kernel. 45 | // 46 | // It can be passed ebpf.CollectionSpec.Assign. 47 | type bpfSpecs struct { 48 | bpfProgramSpecs 49 | bpfMapSpecs 50 | bpfVariableSpecs 51 | } 52 | 53 | // bpfProgramSpecs contains programs before they are loaded into the kernel. 
54 | // 55 | // It can be passed ebpf.CollectionSpec.Assign. 56 | type bpfProgramSpecs struct { 57 | KretprobeInetCskAccept *ebpf.ProgramSpec `ebpf:"kretprobe__inet_csk_accept"` 58 | } 59 | 60 | // bpfMapSpecs contains maps before they are loaded into the kernel. 61 | // 62 | // It can be passed ebpf.CollectionSpec.Assign. 63 | type bpfMapSpecs struct { 64 | TcpEvents *ebpf.MapSpec `ebpf:"tcp_events"` 65 | } 66 | 67 | // bpfVariableSpecs contains global variables before they are loaded into the kernel. 68 | // 69 | // It can be passed ebpf.CollectionSpec.Assign. 70 | type bpfVariableSpecs struct { 71 | } 72 | 73 | // bpfObjects contains all objects after they have been loaded into the kernel. 74 | // 75 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 76 | type bpfObjects struct { 77 | bpfPrograms 78 | bpfMaps 79 | bpfVariables 80 | } 81 | 82 | func (o *bpfObjects) Close() error { 83 | return _BpfClose( 84 | &o.bpfPrograms, 85 | &o.bpfMaps, 86 | ) 87 | } 88 | 89 | // bpfMaps contains all maps after they have been loaded into the kernel. 90 | // 91 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 92 | type bpfMaps struct { 93 | TcpEvents *ebpf.Map `ebpf:"tcp_events"` 94 | } 95 | 96 | func (m *bpfMaps) Close() error { 97 | return _BpfClose( 98 | m.TcpEvents, 99 | ) 100 | } 101 | 102 | // bpfVariables contains all global variables after they have been loaded into the kernel. 103 | // 104 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 105 | type bpfVariables struct { 106 | } 107 | 108 | // bpfPrograms contains all programs after they have been loaded into the kernel. 109 | // 110 | // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. 111 | type bpfPrograms struct { 112 | KretprobeInetCskAccept *ebpf.Program `ebpf:"kretprobe__inet_csk_accept"` 113 | } 114 | 115 | func (p *bpfPrograms) Close() error { 116 | return _BpfClose( 117 | p.KretprobeInetCskAccept, 118 | ) 119 | } 120 | 121 | func _BpfClose(closers ...io.Closer) error { 122 | for _, closer := range closers { 123 | if err := closer.Close(); err != nil { 124 | return err 125 | } 126 | } 127 | return nil 128 | } 129 | 130 | // Do not access this directly. 131 | // 132 | //go:embed bpf_bpfel.o 133 | var _BpfBytes []byte 134 | -------------------------------------------------------------------------------- /socket/bpf_bpfel.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ctrox/zeropod/74906d25199d6c90954298fb57abc2693a64afc9/socket/bpf_bpfel.o -------------------------------------------------------------------------------- /socket/ebpf.go: -------------------------------------------------------------------------------- 1 | package socket 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "path/filepath" 7 | "time" 8 | 9 | "github.com/cilium/ebpf" 10 | "github.com/cilium/ebpf/link" 11 | "github.com/cilium/ebpf/rlimit" 12 | "github.com/ctrox/zeropod/activator" 13 | "golang.org/x/sys/unix" 14 | ) 15 | 16 | // $BPF_CLANG and $BPF_CFLAGS are set by the Makefile. 17 | //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc $BPF_CLANG -cflags $BPF_CFLAGS bpf kprobe.c -- -I/headers 18 | 19 | const TCPEventsMap = "tcp_events" 20 | 21 | // LoadEBPFTracker loads the eBPF program and attaches the kretprobe to track 22 | // connections system-wide. 23 | func LoadEBPFTracker() (func() error, error) { 24 | // Allow the current process to lock memory for eBPF resources. 
25 | if err := rlimit.RemoveMemlock(); err != nil { 26 | return nil, err 27 | } 28 | 29 | pinPath := activator.MapsPath() 30 | if err := os.MkdirAll(pinPath, os.ModePerm); err != nil { 31 | return nil, fmt.Errorf("failed to create bpf fs subpath: %w", err) 32 | } 33 | 34 | // Load pre-compiled programs and maps into the kernel. 35 | objs := bpfObjects{} 36 | if err := loadBpfObjects(&objs, &ebpf.CollectionOptions{ 37 | Maps: ebpf.MapOptions{ 38 | // Pin the map to the BPF filesystem and configure the 39 | // library to automatically re-write it in the BPF 40 | // program so it can be re-used if it already exists or 41 | // create it if not. 42 | PinPath: pinPath, 43 | }, 44 | }); err != nil { 45 | return nil, fmt.Errorf("loading objects: %w", err) 46 | } 47 | 48 | // in the past we used inet_sock_set_state here but we now use a 49 | // kretprobe with inet_csk_accept as inet_sock_set_state is not giving us 50 | // reliable PIDs. https://github.com/iovisor/bcc/issues/2304 51 | kp, err := link.Kretprobe("inet_csk_accept", objs.KretprobeInetCskAccept, &link.KprobeOptions{}) 52 | if err != nil { 53 | return nil, fmt.Errorf("linking kprobe: %w", err) 54 | } 55 | 56 | return func() error { 57 | if err := objs.Close(); err != nil { 58 | return err 59 | } 60 | return kp.Close() 61 | }, nil 62 | } 63 | 64 | // NewEBPFTracker returns a TCP connection tracker that will keep track of the 65 | // last TCP accept of specific processes. It writes the results to an ebpf map 66 | // keyed with the PID and the value contains the timestamp of the last 67 | // observed accept. 68 | func NewEBPFTracker() (Tracker, error) { 69 | var resolver PIDResolver 70 | resolver = noopResolver{} 71 | // if hostProcPath exists, we're probably running in a test container. We 72 | // will use the hostResolver instead of using the actual pids. 73 | if _, err := os.Stat(hostProcPath); err == nil { 74 | resolver = hostResolver{} 75 | } 76 | 77 | tcpEvents, err := ebpf.LoadPinnedMap(filepath.Join(activator.MapsPath(), TCPEventsMap), &ebpf.LoadPinOptions{}) 78 | 79 | return &EBPFTracker{ 80 | PIDResolver: resolver, 81 | tcpEvents: tcpEvents, 82 | }, err 83 | } 84 | 85 | // PIDResolver allows to customize how the PIDs of the connection tracker are 86 | // resolved. This can be useful if the shim is already running in a container 87 | // (e.g. when using Kind), so it can resolve the PID of the container to the 88 | // ones of the host that ebpf sees. 89 | type PIDResolver interface { 90 | Resolve(pid uint32) uint32 91 | } 92 | 93 | // noopResolver does not resolve anything and just returns the actual pid. 94 | type noopResolver struct{} 95 | 96 | func (p noopResolver) Resolve(pid uint32) uint32 { 97 | return pid 98 | } 99 | 100 | type NoActivityRecordedErr struct{} 101 | 102 | func (err NoActivityRecordedErr) Error() string { 103 | return "no activity recorded" 104 | } 105 | 106 | type EBPFTracker struct { 107 | PIDResolver 108 | tcpEvents *ebpf.Map 109 | } 110 | 111 | // TrackPid puts the pid into the TcpEvents map meaning tcp events of the 112 | // process belonging to that pid will be tracked. 113 | func (c *EBPFTracker) TrackPid(pid uint32) error { 114 | val := uint64(0) 115 | pid = c.PIDResolver.Resolve(pid) 116 | if err := c.tcpEvents.Put(&pid, &val); err != nil { 117 | return fmt.Errorf("unable to put pid %d into bpf map: %w", pid, err) 118 | } 119 | 120 | return nil 121 | } 122 | 123 | // RemovePid removes the pid from the TcpEvents map. 
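// A typical lifecycle sketch of this package's tracker API (assumed usage,
// error handling elided, pid is a placeholder):
//
//	cleanup, _ := LoadEBPFTracker()
//	defer cleanup()
//	tracker, _ := NewEBPFTracker()
//	_ = tracker.TrackPid(uint32(pid))
//	lastActivity, _ := tracker.LastActivity(uint32(pid))
//	_ = tracker.RemovePid(uint32(pid))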
124 | func (c *EBPFTracker) RemovePid(pid uint32) error {
125 | 	pid = c.PIDResolver.Resolve(pid)
126 | 	return c.tcpEvents.Delete(&pid)
127 | }
128 | 
129 | // LastActivity returns the time of the last TCP activity recorded for the
130 | // process belonging to the pid (or a child process of the pid).
131 | func (c *EBPFTracker) LastActivity(pid uint32) (time.Time, error) {
132 | 	var val uint64
133 | 
134 | 	pid = c.PIDResolver.Resolve(pid)
135 | 	if err := c.tcpEvents.Lookup(&pid, &val); err != nil {
136 | 		return time.Time{}, fmt.Errorf("looking up %d: %w", pid, err)
137 | 	}
138 | 
139 | 	if val == 0 {
140 | 		return time.Time{}, NoActivityRecordedErr{}
141 | 	}
142 | 
143 | 	return convertBPFTime(val)
144 | }
145 | 
146 | func (c *EBPFTracker) Close() error {
147 | 	return c.tcpEvents.Close()
148 | }
149 | 
150 | // convertBPFTime takes the value of bpf_ktime_get_ns and converts it to a
151 | // time.Time.
152 | func convertBPFTime(t uint64) (time.Time, error) {
153 | 	b, err := getBootTimeNS()
154 | 	if err != nil {
155 | 		return time.Time{}, err
156 | 	}
157 | 
158 | 	return time.Now().Add(-time.Duration(b - int64(t))), nil
159 | }
160 | 
161 | // getBootTimeNS returns the time elapsed since system boot, in nanoseconds.
162 | // Does not include time the system was suspended. Basically the equivalent
163 | // of bpf_ktime_get_ns.
164 | func getBootTimeNS() (int64, error) {
165 | 	var ts unix.Timespec
166 | 	err := unix.ClockGettime(unix.CLOCK_MONOTONIC, &ts)
167 | 	if err != nil {
168 | 		return 0, fmt.Errorf("could not get time: %s", err)
169 | 	}
170 | 
171 | 	return unix.TimespecToNsec(ts), nil
172 | }
173 | 
-------------------------------------------------------------------------------- /socket/host_resolver.go: --------------------------------------------------------------------------------
1 | package socket
2 | 
3 | import (
4 | 	"fmt"
5 | 	"os/exec"
6 | 	"strconv"
7 | 	"strings"
8 | )
9 | 
10 | const hostProcPath = "/host/proc/"
11 | 
12 | // hostResolver uses the procfs of the host to resolve PIDs. With this the
13 | // connection tracker can work when running in a container. As the ebpf
14 | // program is not aware of the PID namespace that the processes are running
15 | // in, we need to find the PIDs of the host processes from the ones in the container.
16 | type hostResolver struct{}
17 | 
18 | func (h hostResolver) Resolve(pid uint32) uint32 {
19 | 	p, err := findHostPid(hostProcPath, pid)
20 | 	if err != nil {
21 | 		return pid
22 | 	}
23 | 
24 | 	return p
25 | }
26 | 
27 | // findHostPid greps through the procfs to find the host pid of the supplied
28 | // namespaced pid. It's very ugly but it works well enough for testing with
29 | // Kind. It would be better to use the procfs package here but NSpid is always
30 | // empty.
31 | func findHostPid(procPath string, nsPid uint32) (uint32, error) { 32 | out, err := exec.Command("bash", "-c", fmt.Sprintf(`grep -P 'NSpid:.*\t%d\t' -ril %s*/status | head -n 1`, nsPid, procPath)).Output() 33 | if err != nil { 34 | return 0, err 35 | } 36 | 37 | strPid := strings.TrimSuffix(strings.TrimPrefix(string(out), procPath), "/status\n") 38 | pid, err := strconv.ParseUint(strPid, 10, 32) 39 | return uint32(pid), err 40 | } 41 | -------------------------------------------------------------------------------- /socket/kprobe.c: -------------------------------------------------------------------------------- 1 | //go:build ignore 2 | 3 | #include "vmlinux.h" 4 | #include "bpf_helpers.h" 5 | 6 | char __license[] SEC("license") = "Dual MIT/GPL"; 7 | 8 | struct { 9 | __uint(type, BPF_MAP_TYPE_HASH); 10 | __uint(max_entries, 1024); // should be enough pids? 11 | __type(key, __u32); // pid 12 | __type(value, __u64); // ktime ns of the last tracked event 13 | __uint(pinning, LIBBPF_PIN_BY_NAME); 14 | } tcp_events SEC(".maps"); 15 | 16 | SEC("kretprobe/inet_csk_accept") 17 | int kretprobe__inet_csk_accept(struct pt_regs *ctx) 18 | { 19 | // TODO: we don't check if the protocol is actually TCP here as this seems quite messy: 20 | // https://github.com/iovisor/bcc/blob/71b5141659aaaf4a7c2172c73a802bd86a256ecd/tools/tcpaccept.py#L118 21 | // does this matter? Which other protocols make use of inet_csk_accept? 22 | 23 | struct task_struct* task = (struct task_struct*)bpf_get_current_task_btf(); 24 | // we use the tgid as our pid as it represents the pid from userspace 25 | __u32 pid = task->tgid; 26 | 27 | void *tcp_event = &tcp_events; 28 | void* found_pid = bpf_map_lookup_elem(tcp_event, &pid); 29 | 30 | if (!found_pid) { 31 | // try ppid, our process might have forks 32 | pid = task->real_parent->tgid; 33 | 34 | void* found_ppid = bpf_map_lookup_elem(tcp_event, &pid); 35 | if (!found_ppid) { 36 | return 0; 37 | } 38 | } 39 | 40 | __u64 time = bpf_ktime_get_ns(); 41 | 42 | // const char fmt_str[] = "%d: accept found on pid %d\n"; 43 | // bpf_trace_printk(fmt_str, sizeof(fmt_str), time, pid); 44 | 45 | return bpf_map_update_elem(tcp_event, &pid, &time, BPF_ANY); 46 | }; 47 | -------------------------------------------------------------------------------- /socket/noop.go: -------------------------------------------------------------------------------- 1 | package socket 2 | 3 | import "time" 4 | 5 | func NewNoopTracker(scaleDownDuration time.Duration) NoopTracker { 6 | return NoopTracker{ 7 | PIDResolver: noopResolver{}, 8 | scaleDownDuration: scaleDownDuration, 9 | } 10 | } 11 | 12 | type NoopTracker struct { 13 | PIDResolver 14 | scaleDownDuration time.Duration 15 | } 16 | 17 | func (n NoopTracker) TrackPid(pid uint32) error { 18 | return nil 19 | } 20 | 21 | func (n NoopTracker) RemovePid(pid uint32) error { 22 | return nil 23 | } 24 | 25 | func (n NoopTracker) LastActivity(pid uint32) (time.Time, error) { 26 | return time.Now().Add(-n.scaleDownDuration), nil 27 | } 28 | 29 | func (n NoopTracker) Close() error { 30 | return nil 31 | } 32 | -------------------------------------------------------------------------------- /socket/tracker.go: -------------------------------------------------------------------------------- 1 | package socket 2 | 3 | import "time" 4 | 5 | type Tracker interface { 6 | PIDResolver 7 | 8 | // TrackPid starts connection tracking of the specified process. 9 | TrackPid(pid uint32) error 10 | // TrackPid stops connection tracking of the specified process. 
11 | RemovePid(pid uint32) error 12 | // LastActivity returns the time of the last TCP activity of the specified process. 13 | LastActivity(pid uint32) (time.Time, error) 14 | // Close the activity tracker. 15 | Close() error 16 | } 17 | -------------------------------------------------------------------------------- /socket/tracker_test.go: -------------------------------------------------------------------------------- 1 | package socket 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "net/http/httptest" 7 | "os" 8 | "testing" 9 | "time" 10 | 11 | "github.com/ctrox/zeropod/activator" 12 | "github.com/stretchr/testify/require" 13 | ) 14 | 15 | // TestEBPFTracker tests the ebpf tcp tracker by getting our own pid, starting 16 | // an HTTP server and doing a request against it. This test requires elevated 17 | // privileges to run. 18 | func TestEBPFTracker(t *testing.T) { 19 | require.NoError(t, activator.MountDebugFS()) 20 | require.NoError(t, activator.MountBPFFS(activator.BPFFSPath)) 21 | 22 | clean, err := LoadEBPFTracker() 23 | require.NoError(t, err) 24 | defer func() { require.NoError(t, clean()) }() 25 | 26 | tracker, err := NewEBPFTracker() 27 | require.NoError(t, err) 28 | 29 | pid := uint32(os.Getpid()) 30 | require.NoError(t, tracker.TrackPid(pid)) 31 | 32 | ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 33 | fmt.Fprintln(w, "ok") 34 | })) 35 | 36 | require.Eventually(t, func() bool { 37 | _, err = http.Get(ts.URL) 38 | return err == nil 39 | }, time.Millisecond*100, time.Millisecond, "waiting for http server to reply") 40 | 41 | require.Eventually(t, func() bool { 42 | activity, err := tracker.LastActivity(pid) 43 | if err != nil { 44 | return false 45 | } 46 | 47 | if time.Since(activity) > time.Millisecond*100 { 48 | t.Fatalf("last activity was %s ago, expected it to be within the last 100ms", time.Since(activity)) 49 | } 50 | 51 | return true 52 | }, time.Millisecond*100, time.Millisecond, "waiting for last tcp activity") 53 | } 54 | -------------------------------------------------------------------------------- /socket/vmlinux.h.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ctrox/zeropod/74906d25199d6c90954298fb57abc2693a64afc9/socket/vmlinux.h.gz --------------------------------------------------------------------------------