├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── pr.yaml │ ├── release.yaml │ └── subcharts.yaml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── chart ├── Chart.lock ├── Chart.yaml ├── charts │ ├── grafana-6.48.0.tgz │ └── victoria-metrics-single-0.8.48.tgz ├── dashboard.json ├── templates │ ├── _helpers.tpl │ ├── daemonset.yaml │ ├── grafana │ │ └── dashboards.yaml │ └── rbac │ │ ├── psp.yaml │ │ ├── role.yaml │ │ ├── rolebinding.yaml │ │ └── serviceaccount.yaml └── values.yaml ├── cmd └── caretta │ └── caretta.go ├── go.mod ├── go.sum ├── images ├── caretta.gif ├── logo.svg └── screenshot.png ├── pkg ├── caretta │ ├── caretta.go │ ├── config.go │ ├── ebpf_map.go │ ├── links_tracer.go │ ├── links_tracer_test.go │ └── types.go ├── k8s │ ├── ipresolver.go │ └── ipresolver_test.go ├── metrics │ └── prometheus.go └── tracing │ ├── ebpf │ ├── arm_support.h │ ├── caretta.bpf.c │ ├── core_structures.h │ ├── ebpf_internal_types.h │ ├── ebpf_utils.h │ └── epbf_shared_types.h │ └── probes.go └── scripts └── build └── download_libbpf_headers.sh /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Environment (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Kubernetes cluster information - distribution, version 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/pr.yaml: -------------------------------------------------------------------------------- 1 | name: pr 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | contents: write 11 | id-token: write 12 | steps: 13 | - 14 | name: Checkout 15 | uses: actions/checkout@v3 16 | - 17 | name: Set Up QEMU 18 | uses: docker/setup-qemu-action@v3 19 | with: 20 | platforms: arm64 21 | - 22 | name: Set up Docker Buildx 23 | uses: docker/setup-buildx-action@v3 24 | - 25 | name: Build Docker Image 26 | uses: docker/build-push-action@v3 27 | with: 28 | context: . 29 | push: false 30 | cache-from: type=gha 31 | cache-to: type=gha,mode=max 32 | tags: caretta 33 | platforms: linux/amd64,linux/arm64 34 | 35 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | 8 | jobs: 9 | release: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | id-token: write 14 | steps: 15 | - 16 | name: Checkout 17 | uses: actions/checkout@v3 18 | - 19 | name: Set Up QEMU 20 | uses: docker/setup-qemu-action@v3 21 | with: 22 | platforms: arm64 23 | - 24 | name: Set up Docker Buildx 25 | uses: docker/setup-buildx-action@v3 26 | - 27 | name: Login to Quay.io 28 | uses: docker/login-action@v2 29 | with: 30 | registry: quay.io 31 | username: ${{ secrets.QUAY_USERNAME }} 32 | password: ${{ secrets.QUAY_ROBOT_TOKEN }} 33 | - 34 | name: Build & Push Docker Image 35 | uses: docker/build-push-action@v3 36 | with: 37 | context: . 38 | push: true 39 | cache-from: type=gha 40 | cache-to: type=gha,mode=max 41 | tags: quay.io/groundcover/caretta:${{ github.ref_name }} 42 | platforms: linux/arm64,linux/amd64 43 | - 44 | name: Checkout Helm Repo 45 | uses: actions/checkout@v3 46 | with: 47 | path: helm-repo 48 | repository: groundcover-com/charts 49 | token: ${{ secrets.HELM_CHARTS_REPO_KEY }} 50 | - 51 | name: Publish Chart 52 | working-directory: helm-repo 53 | env: 54 | GITHUB_TAG: ${{ github.ref_name }} 55 | run: | 56 | version=${GITHUB_TAG#v} 57 | helm lint ../chart 58 | helm package --version ${version} --app-version ${GITHUB_TAG} ../chart 59 | helm repo index --url https://helm.groundcover.com . 60 | git config user.name "ci-groundcover" 61 | git config user.email "ci@groundcover.com" 62 | git add . 
63 | git commit -m "Added caretta ${version} chart" 64 | git push 65 | -------------------------------------------------------------------------------- /.github/workflows/subcharts.yaml: -------------------------------------------------------------------------------- 1 | name: subcharts-images 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'main' 7 | paths: 8 | - 'chart/charts/**' 9 | - '.github/workflows/subcharts.yaml' 10 | 11 | defaults: 12 | run: 13 | working-directory: chart/charts 14 | 15 | jobs: 16 | subchart-images: 17 | runs-on: ubuntu-latest 18 | permissions: 19 | contents: write 20 | id-token: write 21 | steps: 22 | - 23 | name: Checkout 24 | uses: actions/checkout@v3 25 | - 26 | name: Login to Quay.io 27 | uses: docker/login-action@v2 28 | with: 29 | registry: quay.io 30 | username: ${{ secrets.QUAY_USERNAME }} 31 | password: ${{ secrets.QUAY_ROBOT_TOKEN }} 32 | - 33 | name: Set up Docker Buildx 34 | uses: docker/setup-buildx-action@v2 35 | - 36 | name: Push Grafana Image 37 | run: | 38 | IMAGE_TAG=$(helm show chart grafana* | yq e '.appVersion' -) 39 | docker buildx imagetools create grafana/grafana:${IMAGE_TAG} --tag quay.io/groundcover/grafana:${IMAGE_TAG} 40 | - 41 | name: Push Victoria-Metrics Image 42 | run: | 43 | IMAGE_TAG=v$(helm show chart victoria-metrics* | yq e '.appVersion' -) 44 | docker buildx imagetools create victoriametrics/victoria-metrics:${IMAGE_TAG} --tag quay.io/groundcover/victoria-metrics:${IMAGE_TAG} 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | *.o 11 | 12 | # Test binary, built with `go test -c` 13 | *.test 14 | 15 | # Output of the go coverage tool, specifically when used with LiteIDE 16 | *.out 17 | 18 | # Dependency directories (remove the comment below to include it) 19 | # vendor/ 20 | 21 | # Go workspace file 22 | go.work 23 | 24 | 25 | # autogenerated by bpf2go 26 | *_bpfel_*.go 27 | 28 | # binary output 29 | bin/ 30 | vendor/ 31 | 32 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | info@groundcover.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. 
Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/cilium/ebpf-builder:1648566014 AS builder 2 | ARG TARGETARCH 3 | ARG TARGETPLATFORM 4 | RUN echo "Building for $TARGETARCH" 5 | RUN echo "Building for $TARGETPLATFORM" 6 | WORKDIR /build 7 | COPY . /build/ 8 | RUN make build ARCH=$TARGETARCH 9 | 10 | FROM alpine:3.17 11 | 12 | WORKDIR /app 13 | COPY --from=builder build/bin/caretta ./ 14 | 15 | VOLUME /sys/kernel/debug 16 | 17 | CMD ./caretta -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | BIN_DIR:=bin
 2 | BINARY_PATH:=${BIN_DIR}/caretta
 3 | DOCKER_BIN:=docker
 4 | BPF2GO_BINARY := ${BIN_DIR}/bpf2go
 5 | BPF2GO_VERSION := 0.9.0
 6 | REPODIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 7 | UIDGID := $(shell stat -c '%u:%g' ${REPODIR})
 8 | PROJECT_DIRNAME := $(shell basename ${REPODIR})
 9 | CILIUM_EBPF_DIRECTORY := /tmp/cilium-ebpf
10 | BUILD_SCRIPTS_DIRECTORY=scripts/build
11 | BPF_CLANG := clang-14
12 | INCLUDE_C_FLAGS := -I/tmp/caretta_extra/libbpf_headers -I/tmp/${PROJECT_DIRNAME}/
13 | BPF_CFLAGS := -O2 -g -Wall -Werror -fdebug-prefix-map=/ebpf=. ${INCLUDE_C_FLAGS}
14 | IMAGE=quay.io/cilium/ebpf-builder
15 | VERSION=1648566014
16 | 
17 | ARCH=amd64 # amd64 or arm64
18 | 
19 | .PHONY: build
20 | build: ${BIN_DIR} pkg/tracing/bpf_bpfel_x86.go cmd/caretta/caretta.go
21 | 	GOOS=linux GOARCH=${ARCH} CGO_ENABLED=0 go build -o ${BINARY_PATH} cmd/caretta/caretta.go
22 | 
23 | ${BIN_DIR}:
24 | 	mkdir -p ${BIN_DIR}
25 | 
26 | .PHONY: download_libbpf_headers
27 | download_libbpf_headers:
28 | 	${REPODIR}/${BUILD_SCRIPTS_DIRECTORY}/download_libbpf_headers.sh
29 | 
30 | .PHONY: generate_ebpf
31 | generate_ebpf: ${BPF2GO_BINARY}_${BPF2GO_VERSION} \
32 | 	download_libbpf_headers
33 | 	go mod vendor
34 | 	(cd ${REPODIR}/pkg/tracing && \
35 | 	GOPACKAGE=tracing ${REPODIR}/${BPF2GO_BINARY}_${BPF2GO_VERSION} \
36 | 	-cc "${BPF_CLANG}" -cflags "${BPF_CFLAGS}" \
37 | 	-target arm64,amd64 bpf \
38 | 	ebpf/caretta.bpf.c --)
39 | 
40 | ${BPF2GO_BINARY}_${BPF2GO_VERSION}:
41 | 	git clone -q --branch v${BPF2GO_VERSION} https://github.com/cilium/ebpf \
42 | 	${CILIUM_EBPF_DIRECTORY} 2>/dev/null
43 | 	cd ${CILIUM_EBPF_DIRECTORY} && \
44 | 	go build -o ${REPODIR}/${BPF2GO_BINARY}_${BPF2GO_VERSION} ./cmd/bpf2go
45 | 
46 | .PHONY: generate_ebpf_in_docker
47 | generate_ebpf_in_docker: ${BIN_DIR}
48 | 	${DOCKER_BIN} run \
49 | 	-v ${REPODIR}:/tmp/caretta \
50 | 	-w /tmp/${PROJECT_DIRNAME} \
51 | 	--env HOME="/tmp/" \
52 | 	"${IMAGE}:${VERSION}" \
53 | 	${MAKE} generate_ebpf
54 | 
55 | pkg/tracing/bpf_bpfel%.go: pkg/tracing/ebpf/caretta.bpf.c
56 | 	$(MAKE) generate_ebpf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ![Caretta](images/logo.svg)
 3 | 
 4 | ![caretta](images/caretta.gif)
 5 | 
 6 | Caretta - Instant K8s service dependency map, right to your Grafana | Product Hunt
 7 | 
 8 | Instant K8s service dependency map, right to your Grafana.
 9 | 
10 | made by groundcover
11 | 
12 | [![slack](https://img.shields.io/badge/slack-groundcover-yellowgreen.svg?logo=slack)](http://www.groundcover.com/join-slack)
13 | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
14 | 
15 | 
16 | 
17 | 
18 | 
19 | ![caretta-screenshot](images/screenshot.png)
20 | 
21 | 
22 | ## What is Caretta?
23 | 
24 | Caretta is a lightweight, standalone tool that instantly creates a visual network map of the services running in your cluster.
25 | 
26 | Caretta leverages eBPF to efficiently map all service network interactions in a K8s cluster, and Grafana to query and visualize the collected data.
27 | 
28 | Caretta is built to be efficient, with a minimal footprint on the system, and does not require any modifications to the cluster.
29 | 
30 | Caretta demonstrates the power of using eBPF for observability solutions, which is our vision at
 groundcover. If you're interested in understanding how Caretta is built, head over to our Caretta blog post!
31 | 
32 | ## Installing Caretta :zap:
33 | As simple as installing a helm chart. It is recommended to install Caretta in a new, unique namespace.
34 | ```bash
35 | helm repo add groundcover https://helm.groundcover.com/
36 | ```
37 | ```bash
38 | helm repo update
39 | ```
40 | ```bash
41 | helm install caretta --namespace caretta --create-namespace groundcover/caretta
42 | ```
43 | 
44 | ### Configuration
45 | You can configure Caretta using helm values.
46 | Useful values:
47 | * **tolerations** can be specified to make sure Caretta's eBPF agent runs on all nodes in your cluster. *The default value tolerates common control-plane node taints.*
48 | * **victoria-metrics-single.server.persistentVolume.enabled** can be set to *true* if you wish to save Caretta's metrics to a persistent volume. *default: false*
49 | * **pollIntervalSeconds** can be modified to specify the polling and publishing interval of new metrics from the kernel. *default: 5*
50 | * The built-in Victoria Metrics and Grafana instances can be disabled by setting **victoria-metrics-single.enabled** or **grafana.enabled** to false, respectively. _default: true_
51 | * Caretta resolves Kubernetes entities to their owners by default. For example, a pod 'pod1' and another pod 'pod2' both belonging to a deployment 'deployment1' will be resolved to 'deployment1'. This can be disabled by setting **traverseUpHierarchy** to false. _default: true_
52 | 
53 | 
54 | Example yaml for overriding these values:
55 | ```yaml
56 | pollIntervalSeconds: 15 # set metrics polling interval
57 | traverseUpHierarchy: false # disable resolving kubernetes entities to their owners
58 | 
59 | tolerations: # set any desired tolerations
60 |   - key: node-role.kubernetes.io/control-plane
61 |     operator: Exists
62 |     effect: NoSchedule
63 | 
64 | victoria-metrics-single:
65 |   server:
66 |     persistentVolume:
67 |       enabled: true # set to true to use persistent volume
68 | ```
69 | This can also be done using the --set flag on the `helm install` command.
70 | 
71 | ### Uninstallation
72 | To uninstall, delete the helm release:
73 | ```bash
74 | helm delete caretta --namespace caretta
75 | ```
76 | Note that if persistent storage was enabled in the installation, it may not be deleted automatically by this command.
77 | 
78 | ## Requirements
79 | * Linux kernel version >= 4.16
80 | * CO-RE support. Supported linux distributions can be found here. Specifically, Docker for Mac uses a distribution which is not currently supported.
81 | 
82 | 
83 | 
84 | ## Working with Caretta :turtle:
85 | Caretta's helm chart ships an instance of Grafana with a predefined dashboard using data published by Caretta. This dashboard contains some examples to demonstrate the usage of Caretta's metrics.
86 | 
87 | ### Using the provided Grafana instance
88 | To access Grafana, port-forward port `3000` from the Grafana pod in Caretta's namespace.
89 | 
90 | Using *kubectl*, it should look something like this:
91 | 
92 | ```bash
93 | kubectl port-forward --namespace caretta <grafana-pod-name> 3000:3000
94 | ```
95 | 
96 | > **_NOTE:_** Anonymous mode is enabled, making the default dashboard accessible with no login needed.
97 | > To edit the default dashboard or create your own dashboard, use the default administrator's credentials: user `admin`; password `caretta`.
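
If you'd rather not look up the pod name, you can usually port-forward the Grafana service instead. A minimal sketch, assuming the chart's default Grafana service name `caretta-grafana` and the Grafana chart's default service port `80` (verify both with `kubectl get svc --namespace caretta`):

```bash
# Forward local port 3000 to the Grafana service; kubectl picks a backing pod for you.
kubectl port-forward --namespace caretta svc/caretta-grafana 3000:80
```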
98 | 
99 | ### Scraping Caretta's metrics
100 | 
101 | Caretta uses [Victoria Metrics](https://victoriametrics.com/) to collect and publish its metrics, and the outcome can be consumed by **any Prometheus-compatible dashboard**.
102 | 
103 | Caretta's main metric is `caretta_links_observed` (Gauge). It uses the following labels to represent a specific connection (network socket) going through the cluster:
104 | * `client_name` - either the name of a kubernetes entity (if resolved), an external domain (if resolved), or an IP address.
105 | * `client_namespace` - either the namespace of the kubernetes entity, or "node", or "external".
106 | * `client_kind` - either the kind of the kubernetes entity, or "node", or "external".
107 | * `server_name` - either the name of a kubernetes entity (if resolved), an external domain (if resolved), or an IP address.
108 | * `server_namespace` - either the namespace of the kubernetes entity, or "node", or "external".
109 | * `server_kind` - either the kind of the kubernetes entity, or "node", or "external".
110 | * `server_port` - the port used by the server.
111 | * `role` - either 1 (client) or 2 (server).
112 | 
113 | Alongside those labels, Caretta uses other labels for Grafana's Node Graph panel.
114 | 
115 | #### Example metric data
116 | This example shows a connection from a client named `checkoutservice`, controlled by a deployment, to a service named `productcatalogservice` on port 3550, from the perspective of the client. The total number of bytes sent by the client over this connection is 2537.
117 | ```bash
118 | caretta_links_observed{client_id="1074587981",client_kind="Deployment",client_name="checkoutservice",client_namespace="demo-ng",link_id="198768460",role="1",server_id="1112713827",server_kind="Service",server_name="productcatalogservice",server_namespace="demo-ng",server_port="3550"} 2537
119 | ```
120 | 
121 | #### Example queries :star:
122 | ```bash
123 | increase((sum by (server_port) (caretta_links_observed{client_name="some-client", server_name="some-server"}))[15m])
124 | ```
125 | will output the throughput observed between some-client and some-server in the last 15 minutes, aggregated by port.
126 | 
127 | ```bash
128 | sum by (server_name) (rate(caretta_links_observed{client_name="some-client"}))
129 | ```
130 | will output the rate of traffic from some-client to servers it communicates with, aggregated by the server's name.
131 | 
132 | ```bash
133 | sort_desc(increase((sum by (client_name)(caretta_links_observed{server_namespace="external"}))[5m]))
134 | ```
135 | will output communication to external servers by client's name, sorted descending.
136 | 
137 | ## Need help :grey_question:
138 | Feel free to reach out to us on our slack channel, or create an issue in this repository.
139 | 
140 | ## Contribution
141 | Feel free to add your contribution to the project.
142 | 143 | * Open an issue for missing features, or bugs 144 | * Create a pull request for adding code to the project 145 | -------------------------------------------------------------------------------- /chart/Chart.lock: -------------------------------------------------------------------------------- 1 | dependencies: 2 | - name: victoria-metrics-single 3 | repository: https://victoriametrics.github.io/helm-charts 4 | version: 0.8.48 5 | - name: grafana 6 | repository: https://grafana.github.io/helm-charts 7 | version: 6.48.0 8 | digest: sha256:eb7c3b54ae1fef78dae03136bdd7c0e34a3a08a34c147a227e824437a443bccb 9 | generated: "2022-12-26T10:15:04.518501964Z" 10 | -------------------------------------------------------------------------------- /chart/Chart.yaml: -------------------------------------------------------------------------------- 1 | version: 0.0.1 2 | apiVersion: v2 3 | appVersion: v0.0.1 4 | name: caretta 5 | description: A helm chart for Caretta service map. 6 | type: application 7 | dependencies: 8 | - name: victoria-metrics-single 9 | version: "0.8.48" 10 | repository: "https://victoriametrics.github.io/helm-charts" 11 | condition: victoria-metrics-single.enabled 12 | - name: grafana 13 | version: "6.48.0" 14 | repository: "https://grafana.github.io/helm-charts" 15 | condition: grafana.enabled -------------------------------------------------------------------------------- /chart/charts/grafana-6.48.0.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/groundcover-com/caretta/280d1640ce0174b1dfdd7d05bdd104604aa04508/chart/charts/grafana-6.48.0.tgz -------------------------------------------------------------------------------- /chart/charts/victoria-metrics-single-0.8.48.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/groundcover-com/caretta/280d1640ce0174b1dfdd7d05bdd104604aa04508/chart/charts/victoria-metrics-single-0.8.48.tgz -------------------------------------------------------------------------------- /chart/dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "grafana", 8 | "uid": "-- Grafana --" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 0, 26 | "graphTooltip": 0, 27 | "id": 15, 28 | "links": [], 29 | "liveNow": false, 30 | "panels": [ 31 | { 32 | "datasource": { 33 | "type": "prometheus", 34 | "uid": "${DS_PROMETHEUS}" 35 | }, 36 | "description": "", 37 | "gridPos": { 38 | "h": 24, 39 | "w": 17, 40 | "x": 0, 41 | "y": 0 42 | }, 43 | "id": 2, 44 | "interval": "15s", 45 | "options": { 46 | "nodes": { 47 | "arcs": [ 48 | { 49 | "color": "#5794F2", 50 | "field": "arc__color" 51 | } 52 | ] 53 | } 54 | }, 55 | "targets": [ 56 | { 57 | "datasource": { 58 | "type": "prometheus", 59 | "uid": "${DS_PROMETHEUS}" 60 | }, 61 | "editorMode": "code", 62 | "exemplar": false, 63 | "expr": "increase((sum by (id, title, subTitle, detail__kind, arc__color) (label_replace((label_replace(label_replace(label_replace(label_replace((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", 
client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"}), \"detail__kind\", \"$1\", \"server_kind\", \"(.*)\"), \"subTitle\", \"$1\", \"server_namespace\", \"(.*)\"), \"title\", \"$1\", \"server_name\", \"(.*)\"), \"id\", \"$1\", \"server_id\", \"(.*)\") or label_replace(label_replace(label_replace(label_replace((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"}), \"detail__kind\", \"$1\", \"client_kind\", \"(.*)\"), \"subTitle\", \"$1\", \"client_namespace\", \"(.*)\"), \"title\", \"$1\", \"client_name\", \"(.*)\"), \"id\", \"$1\", \"client_id\", \"(.*)\") ), \"arc__color\", \"1\", \"link_id\", \"(.*)\")) )[$__range:$__interval]) > 0", 64 | "format": "table", 65 | "instant": true, 66 | "legendFormat": "__auto", 67 | "range": false, 68 | "refId": "nodes" 69 | }, 70 | { 71 | "datasource": { 72 | "type": "prometheus", 73 | "uid": "${DS_PROMETHEUS}" 74 | }, 75 | "editorMode": "code", 76 | "exemplar": false, 77 | "expr": "increase((sum by (id, source, target, mainStat) ((label_replace(label_replace(label_replace(label_replace((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"}), \"id\", \"$1\", \"link_id\", \"(.*)\"), \"source\", \"$1\", \"client_id\", \"(.*)\"), \"target\", \"$1\", \"server_id\", \"(.*)\"), \"mainStat\", \"$1\", \"server_port\", \"(.*)\"))) )[$__range:$__interval]) > 0", 78 | "format": "table", 79 | "hide": false, 80 | "instant": true, 81 | "legendFormat": "__auto", 82 | "range": false, 83 | "refId": "edges" 84 | } 85 | ], 86 | "title": "Service Map ☸️", 87 | "type": "nodeGraph" 88 | }, 89 | { 90 | "datasource": { 91 | "type": "prometheus", 92 | "uid": "${DS_PROMETHEUS}" 93 | }, 94 | "fieldConfig": { 95 | "defaults": { 96 | "color": { 97 | "fixedColor": "blue", 98 | "mode": "fixed" 99 | }, 100 | "custom": { 101 | "hideFrom": { 102 | "legend": false, 103 | "tooltip": false, 104 | "viz": false 105 | } 106 | }, 107 | "links": [], 108 | "mappings": [] 109 | }, 110 | "overrides": [] 111 | }, 112 | "gridPos": { 113 | "h": 7, 114 | "w": 4, 115 | "x": 17, 116 | "y": 0 117 | }, 118 | "id": 4, 119 | "options": { 120 | "displayLabels": [ 121 | "name" 122 | ], 123 | "legend": { 124 | "displayMode": "list", 125 | "placement": "right", 126 | "showLegend": false 127 | }, 128 | "pieType": "donut", 129 | "reduceOptions": { 130 | "calcs": [ 131 | "lastNotNull" 132 | ], 133 | "fields": "", 134 | "values": false 135 | }, 136 | "tooltip": { 137 | "mode": "single", 138 | "sort": "none" 139 | } 140 | }, 141 | "targets": [ 142 | { 143 | "datasource": { 144 | "type": "prometheus", 145 | "uid": "${DS_PROMETHEUS}" 146 | }, 147 | "editorMode": "code", 148 | "expr": "sum by (server_port) (increase((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"})[$__range:$__interval])) > 0", 149 | "legendFormat": "__auto", 150 | "range": true, 
151 | "refId": "A" 152 | } 153 | ], 154 | "title": "Active Ports", 155 | "type": "piechart" 156 | }, 157 | { 158 | "datasource": { 159 | "type": "datasource", 160 | "uid": "grafana" 161 | }, 162 | "gridPos": { 163 | "h": 7, 164 | "w": 3, 165 | "x": 21, 166 | "y": 0 167 | }, 168 | "id": 10, 169 | "options": { 170 | "code": { 171 | "language": "plaintext", 172 | "showLineNumbers": false, 173 | "showMiniMap": false 174 | }, 175 | "content": "\n \n
\n", 176 | "mode": "markdown" 177 | }, 178 | "pluginVersion": "10.1.2", 179 | "type": "text" 180 | }, 181 | { 182 | "datasource": { 183 | "type": "prometheus", 184 | "uid": "${DS_PROMETHEUS}" 185 | }, 186 | "fieldConfig": { 187 | "defaults": { 188 | "color": { 189 | "fixedColor": "purple", 190 | "mode": "continuous-blues" 191 | }, 192 | "mappings": [], 193 | "thresholds": { 194 | "mode": "absolute", 195 | "steps": [ 196 | { 197 | "color": "green", 198 | "value": null 199 | }, 200 | { 201 | "color": "red", 202 | "value": 80 203 | } 204 | ] 205 | }, 206 | "unit": "Bps" 207 | }, 208 | "overrides": [] 209 | }, 210 | "gridPos": { 211 | "h": 8, 212 | "w": 7, 213 | "x": 17, 214 | "y": 7 215 | }, 216 | "id": 8, 217 | "options": { 218 | "displayMode": "gradient", 219 | "minVizHeight": 10, 220 | "minVizWidth": 0, 221 | "orientation": "horizontal", 222 | "reduceOptions": { 223 | "calcs": [ 224 | "lastNotNull" 225 | ], 226 | "fields": "", 227 | "values": false 228 | }, 229 | "showUnfilled": true, 230 | "valueMode": "color" 231 | }, 232 | "pluginVersion": "10.1.2", 233 | "targets": [ 234 | { 235 | "datasource": { 236 | "type": "prometheus", 237 | "uid": "${DS_PROMETHEUS}" 238 | }, 239 | "editorMode": "code", 240 | "exemplar": false, 241 | "expr": "topk(8, sum by (client_name) ((rate(caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"}[$__range:$__interval]))))", 242 | "format": "time_series", 243 | "instant": true, 244 | "legendFormat": "__auto", 245 | "range": false, 246 | "refId": "A" 247 | } 248 | ], 249 | "title": "Top Throughput Workloads", 250 | "type": "bargauge" 251 | }, 252 | { 253 | "datasource": { 254 | "type": "prometheus", 255 | "uid": "${DS_PROMETHEUS}" 256 | }, 257 | "description": "", 258 | "fieldConfig": { 259 | "defaults": { 260 | "color": { 261 | "mode": "continuous-blues" 262 | }, 263 | "mappings": [], 264 | "thresholds": { 265 | "mode": "absolute", 266 | "steps": [ 267 | { 268 | "color": "green", 269 | "value": null 270 | }, 271 | { 272 | "color": "red", 273 | "value": 80 274 | } 275 | ] 276 | }, 277 | "unit": "Bps" 278 | }, 279 | "overrides": [] 280 | }, 281 | "gridPos": { 282 | "h": 9, 283 | "w": 7, 284 | "x": 17, 285 | "y": 15 286 | }, 287 | "id": 6, 288 | "options": { 289 | "colorMode": "background", 290 | "graphMode": "area", 291 | "justifyMode": "center", 292 | "orientation": "horizontal", 293 | "reduceOptions": { 294 | "calcs": [ 295 | "lastNotNull" 296 | ], 297 | "fields": "", 298 | "values": false 299 | }, 300 | "text": {}, 301 | "textMode": "auto" 302 | }, 303 | "pluginVersion": "10.1.2", 304 | "targets": [ 305 | { 306 | "datasource": { 307 | "type": "prometheus", 308 | "uid": "${DS_PROMETHEUS}" 309 | }, 310 | "editorMode": "code", 311 | "exemplar": false, 312 | "expr": "topk(7, sum by (client_name, server_name) ( rate( (caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\", client_kind!~\"(node|external)\",} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\", server_kind!~\"(node|external)\"})[$__range:$__interval]) ) )", 313 | "format": "time_series", 314 | "instant": true, 315 | "legendFormat": "{{client_name}} ⮂ {{server_name}}", 316 | "range": false, 317 | "refId": "A" 318 | } 319 | ], 320 | "title": "Top Throughput Connections", 321 | "type": "stat" 322 | } 323 | ], 324 | "refresh": "", 325 | "schemaVersion": 38, 326 | "style": 
"dark", 327 | "tags": [], 328 | "templating": { 329 | "list": [ 330 | { 331 | "current": { 332 | "selected": false, 333 | "text": "default", 334 | "value": "default" 335 | }, 336 | "hide": 0, 337 | "includeAll": false, 338 | "label": "datasource", 339 | "multi": false, 340 | "name": "DS_PROMETHEUS", 341 | "options": [], 342 | "query": "prometheus", 343 | "queryValue": "", 344 | "refresh": 1, 345 | "regex": "", 346 | "skipUrlSync": false, 347 | "type": "datasource" 348 | }, 349 | { 350 | "allValue": "(.*)", 351 | "current": { 352 | "selected": true, 353 | "text": [ 354 | "All" 355 | ], 356 | "value": [ 357 | "$__all" 358 | ] 359 | }, 360 | "datasource": { 361 | "type": "prometheus", 362 | "uid": "${DS_PROMETHEUS}" 363 | }, 364 | "definition": "query_result(caretta_links_observed)", 365 | "hide": 0, 366 | "includeAll": true, 367 | "multi": true, 368 | "name": "namespace", 369 | "options": [], 370 | "query": { 371 | "query": "query_result(caretta_links_observed)", 372 | "refId": "StandardVariableQuery" 373 | }, 374 | "refresh": 1, 375 | "regex": "/.*_namespace=\"([^\"]*).*/", 376 | "skipUrlSync": false, 377 | "sort": 1, 378 | "type": "query" 379 | }, 380 | { 381 | "allValue": "(.*)", 382 | "current": { 383 | "selected": true, 384 | "text": [ 385 | "All" 386 | ], 387 | "value": [ 388 | "$__all" 389 | ] 390 | }, 391 | "datasource": { 392 | "type": "prometheus", 393 | "uid": "${DS_PROMETHEUS}" 394 | }, 395 | "definition": "query_result(caretta_links_observed)", 396 | "hide": 0, 397 | "includeAll": true, 398 | "multi": true, 399 | "name": "kind", 400 | "options": [], 401 | "query": { 402 | "query": "query_result(caretta_links_observed)", 403 | "refId": "StandardVariableQuery" 404 | }, 405 | "refresh": 1, 406 | "regex": "/.*_kind=\"([^\"]*).*/", 407 | "skipUrlSync": false, 408 | "sort": 0, 409 | "type": "query" 410 | }, 411 | { 412 | "allValue": "(.*)", 413 | "current": { 414 | "selected": true, 415 | "text": [ 416 | "All" 417 | ], 418 | "value": [ 419 | "$__all" 420 | ] 421 | }, 422 | "datasource": { 423 | "type": "prometheus", 424 | "uid": "${DS_PROMETHEUS}" 425 | }, 426 | "definition": "query_result(caretta_links_observed)", 427 | "hide": 0, 428 | "includeAll": true, 429 | "label": "workload", 430 | "multi": true, 431 | "name": "workload", 432 | "options": [], 433 | "query": { 434 | "query": "query_result(caretta_links_observed)", 435 | "refId": "StandardVariableQuery" 436 | }, 437 | "refresh": 2, 438 | "regex": "/.*_name=\"([^\"]*).*/", 439 | "skipUrlSync": false, 440 | "sort": 1, 441 | "type": "query" 442 | }, 443 | { 444 | "allValue": "(.*)", 445 | "current": { 446 | "selected": true, 447 | "text": [ 448 | "All" 449 | ], 450 | "value": [ 451 | "$__all" 452 | ] 453 | }, 454 | "datasource": { 455 | "type": "prometheus", 456 | "uid": "${DS_PROMETHEUS}" 457 | }, 458 | "definition": "label_values(server_port)", 459 | "hide": 0, 460 | "includeAll": true, 461 | "label": "server port", 462 | "multi": true, 463 | "name": "port", 464 | "options": [], 465 | "query": { 466 | "query": "label_values(server_port)", 467 | "refId": "StandardVariableQuery" 468 | }, 469 | "refresh": 1, 470 | "regex": "", 471 | "skipUrlSync": false, 472 | "sort": 0, 473 | "type": "query" 474 | } 475 | ] 476 | }, 477 | "time": { 478 | "from": "now-5m", 479 | "to": "now" 480 | }, 481 | "timepicker": {}, 482 | "timezone": "", 483 | "title": "Caretta Dashboard", 484 | "uid": "k0Om62pVf", 485 | "version": 2, 486 | "weekStart": "" 487 | } -------------------------------------------------------------------------------- 
/chart/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "caretta.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "caretta.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "caretta.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "caretta.labels" -}} 37 | helm.sh/chart: {{ include "caretta.chart" . }} 38 | {{ include "caretta.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "caretta.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "caretta.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "caretta.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "caretta.fullname" .) .Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | -------------------------------------------------------------------------------- /chart/templates/daemonset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: {{ include "caretta.name" . }} 5 | labels: 6 | app: caretta 7 | {{- include "caretta.labels" . | nindent 4 }} 8 | spec: 9 | selector: 10 | matchLabels: 11 | app: caretta 12 | {{- include "caretta.selectorLabels" . | nindent 6 }} 13 | template: 14 | metadata: 15 | annotations: 16 | {{- with .Values.podAnnotations }} 17 | {{- toYaml . | nindent 8 }} 18 | {{- end }} 19 | labels: 20 | app: caretta 21 | {{- include "caretta.selectorLabels" . | nindent 8 }} 22 | spec: 23 | {{- with .Values.imagePullSecrets }} 24 | imagePullSecrets: 25 | {{- toYaml . | nindent 8 }} 26 | {{- end }} 27 | serviceAccountName: {{ include "caretta.name" . 
}} 28 | {{- if .Values.priorityClassName }} 29 | priorityClassName: {{ .Values.priorityClassName }} 30 | {{- end }} 31 | securityContext: 32 | {{- toYaml .Values.podSecurityContext | nindent 8 }} 33 | containers: 34 | - name: {{ .Chart.Name }} 35 | securityContext: 36 | {{- toYaml .Values.securityContext | nindent 12 }} 37 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" 38 | imagePullPolicy: {{ .Values.image.pullPolicy }} 39 | volumeMounts: 40 | - mountPath: /proc 41 | name: proc 42 | - mountPath: /sys/kernel/debug 43 | name: debug 44 | ports: 45 | - name: prom-metrics 46 | containerPort: {{ .Values.prometheusPort }} 47 | protocol: TCP 48 | env: 49 | - name: "RESOLVE_DNS" 50 | value: "{{ .Values.enableDnsResolving }}" 51 | - name: "PROMETHEUS_PORT" 52 | value: "{{ .Values.prometheusPort }}" 53 | - name: "PROMETHEUS_ENDPOINT" 54 | value: "{{ .Values.prometheusEndpoint }}" 55 | - name: "POLL_INTERVAL" 56 | value: "{{ .Values.pollIntervalSeconds }}" 57 | - name: "TRAVERSE_UP_HIERARCHY" 58 | value: "{{ .Values.traverseUpHierarchy }}" 59 | resources: 60 | {{- toYaml .Values.resources | nindent 12 }} 61 | {{- with .Values.nodeSelector }} 62 | nodeSelector: 63 | {{- toYaml . | nindent 8 }} 64 | {{- end }} 65 | {{- with .Values.affinity }} 66 | affinity: 67 | {{- toYaml . | nindent 8 }} 68 | {{- end }} 69 | {{- with .Values.tolerations }} 70 | tolerations: 71 | {{- toYaml . | nindent 8 }} 72 | {{- end }} 73 | volumes: 74 | - name: proc 75 | hostPath: 76 | path: /proc 77 | - name: debug 78 | hostPath: 79 | path: /sys/kernel/debug -------------------------------------------------------------------------------- /chart/templates/grafana/dashboards.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: caretta-grafana-dashboards 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | {{- if $.Values.grafana.sidecar.dashboards.enabled }} 8 | {{ $.Values.grafana.sidecar.dashboards.label }}: {{ $.Values.grafana.sidecar.dashboards.labelValue | quote }} 9 | {{- end }} 10 | data: 11 | dashboard.json: |- 12 | {{ .Files.Get "dashboard.json" | indent 4}} -------------------------------------------------------------------------------- /chart/templates/rbac/psp.yaml: -------------------------------------------------------------------------------- 1 | {{- if and .Values.rbac.pspEnabled (.Capabilities.APIVersions.Has "policy/v1beta1") }} 2 | apiVersion: policy/v1beta1 3 | kind: PodSecurityPolicy 4 | metadata: 5 | annotations: 6 | seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' 7 | name: {{ template "caretta.fullname" . }} 8 | spec: 9 | allowPrivilegeEscalation: true 10 | allowedCapabilities: 11 | - '*' 12 | fsGroup: 13 | rule: RunAsAny 14 | hostIPC: true 15 | hostNetwork: false 16 | hostPID: true 17 | hostPorts: 18 | - max: 65535 19 | min: 0 20 | privileged: true 21 | runAsUser: 22 | rule: RunAsAny 23 | seLinux: 24 | rule: RunAsAny 25 | supplementalGroups: 26 | rule: RunAsAny 27 | volumes: 28 | - '*' 29 | {{ end -}} 30 | -------------------------------------------------------------------------------- /chart/templates/rbac/role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ include "caretta.fullname" . 
}} 5 | rules: 6 | {{- if and .Values.rbac.pspEnabled (.Capabilities.APIVersions.Has "policy/v1beta1") }} 7 | - apiGroups: 8 | - policy 9 | - extensions 10 | resourceNames: 11 | - {{ template "caretta.fullname" . }} 12 | resources: 13 | - podsecuritypolicies 14 | verbs: 15 | - use 16 | {{- end }} 17 | {{- if and .Values.rbac.sccEnabled (.Capabilities.APIVersions.Has "security.openshift.io/v1")}} 18 | - apiGroups: 19 | - security.openshift.io 20 | resources: 21 | - securitycontextconstraints 22 | verbs: 23 | - use 24 | resourceNames: 25 | - privileged 26 | {{- end }} 27 | - verbs: 28 | - get 29 | - list 30 | - watch 31 | apiGroups: 32 | - '' 33 | resources: 34 | - configmaps 35 | - endpoints 36 | - persistentvolumeclaims 37 | - persistentvolumeclaims/status 38 | - pods 39 | - replicationcontrollers 40 | - replicationcontrollers/scale 41 | - serviceaccounts 42 | - services 43 | - services/status 44 | - verbs: 45 | - get 46 | - list 47 | - watch 48 | apiGroups: 49 | - '' 50 | resources: 51 | - bindings 52 | - events 53 | - limitranges 54 | - namespaces/status 55 | - pods/log 56 | - pods/status 57 | - nodes 58 | - replicationcontrollers/status 59 | - resourcequotas 60 | - resourcequotas/status 61 | - verbs: 62 | - get 63 | - list 64 | - watch 65 | apiGroups: 66 | - '' 67 | resources: 68 | - namespaces 69 | - verbs: 70 | - get 71 | - list 72 | - watch 73 | apiGroups: 74 | - apps 75 | resources: 76 | - controllerrevisions 77 | - daemonsets 78 | - daemonsets/status 79 | - deployments 80 | - deployments/scale 81 | - deployments/status 82 | - replicasets 83 | - replicasets/scale 84 | - replicasets/status 85 | - statefulsets 86 | - statefulsets/scale 87 | - statefulsets/status 88 | - verbs: 89 | - get 90 | - list 91 | - watch 92 | apiGroups: 93 | - batch 94 | resources: 95 | - cronjobs 96 | - cronjobs/status 97 | - jobs 98 | - jobs/status 99 | - verbs: 100 | - get 101 | - list 102 | - watch 103 | apiGroups: 104 | - extensions 105 | resources: 106 | - daemonsets 107 | - daemonsets/status 108 | - deployments 109 | - deployments/scale 110 | - deployments/status 111 | - ingresses 112 | - ingresses/status 113 | - networkpolicies 114 | - replicasets 115 | - replicasets/scale 116 | - replicasets/status 117 | - replicationcontrollers/scale 118 | - verbs: 119 | - get 120 | - list 121 | - watch 122 | apiGroups: 123 | - policy 124 | resources: 125 | - poddisruptionbudgets 126 | - poddisruptionbudgets/status 127 | - verbs: 128 | - get 129 | - list 130 | - watch 131 | apiGroups: 132 | - networking.k8s.io 133 | resources: 134 | - ingresses 135 | - ingresses/status 136 | - networkpolicies 137 | - verbs: 138 | - get 139 | apiGroups: 140 | - discovery.k8s.io 141 | resources: 142 | - endpointslices 143 | - verbs: 144 | - list 145 | apiGroups: 146 | - discovery.k8s.io 147 | resources: 148 | - endpointslices 149 | - verbs: 150 | - watch 151 | apiGroups: 152 | - discovery.k8s.io 153 | resources: 154 | - endpointslices 155 | - verbs: 156 | - get 157 | - list 158 | - watch 159 | apiGroups: 160 | - metrics.k8s.io 161 | resources: 162 | - pods 163 | - nodes 164 | 165 | -------------------------------------------------------------------------------- /chart/templates/rbac/rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: {{ include "caretta.fullname" . }} 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: {{ include "caretta.fullname" . 
}} 9 | subjects: 10 | - kind: ServiceAccount 11 | name: {{ include "caretta.name" . }} 12 | namespace: {{ .Release.Namespace }} -------------------------------------------------------------------------------- /chart/templates/rbac/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ include "caretta.name" . }} 5 | namespace: {{ .Release.Namespace }} 6 | -------------------------------------------------------------------------------- /chart/values.yaml: -------------------------------------------------------------------------------- 1 | enableDnsResolving: true 2 | prometheusPort: 7117 3 | prometheusEndpoint: "/metrics" 4 | pollIntervalSeconds: 5 5 | traverseUpHierarchy: true 6 | 7 | rbac: 8 | pspEnabled: true 9 | sccEnabled: true 10 | image: 11 | repository: quay.io/groundcover/caretta 12 | pullPolicy: Always 13 | tag: "" 14 | 15 | imagePullSecrets: [] 16 | nameOverride: "" 17 | fullnameOverride: "" 18 | 19 | resources: 20 | limits: 21 | cpu: 150m 22 | memory: 500Mi 23 | requests: 24 | cpu: 10m 25 | memory: 50Mi 26 | 27 | serviceAccount: 28 | # Specifies whether a service account should be created 29 | create: true 30 | # Annotations to add to the service account 31 | annotations: {} 32 | # The name of the service account to use. 33 | # If not set and create is true, a name is generated using the fullname template 34 | name: "" 35 | 36 | podAnnotations: {} 37 | 38 | podSecurityContext: {} 39 | # fsGroup: 2000 40 | 41 | securityContext: 42 | privileged: true 43 | readOnlyRootFilesystem: true 44 | # capabilities: 45 | # drop: 46 | # - ALL 47 | # readOnlyRootFilesystem: true 48 | # runAsNonRoot: true 49 | # runAsUser: 1000 50 | 51 | tolerations: 52 | - key: node-role.kubernetes.io/control-plane 53 | operator: Exists 54 | effect: NoSchedule 55 | - key: node-role.kubernetes.io/master 56 | operator: Exists 57 | effect: NoSchedule 58 | 59 | nodeSelector: {} 60 | affinity: {} 61 | priorityClassName: 62 | 63 | victoria-metrics-single: 64 | server: 65 | image: 66 | repository: quay.io/groundcover/victoria-metrics 67 | resources: 68 | limits: 69 | cpu: 300m 70 | memory: 350Mi 71 | requests: 72 | cpu: 5m 73 | memory: 50Mi 74 | fullnameOverride: caretta-vm 75 | persistentVolume: 76 | enabled: false 77 | size: 16Gi # change enabled to true if a PV is required 78 | 79 | scrape: 80 | enabled: true 81 | 82 | config: 83 | global: 84 | scrape_interval: 15s 85 | 86 | scrape_configs: 87 | - job_name: 'caretta' 88 | metrics_path: /metrics 89 | scrape_interval: 5s 90 | kubernetes_sd_configs: 91 | - role: pod 92 | namespaces: 93 | own_namespace: true 94 | relabel_configs: 95 | - source_labels: [__meta_kubernetes_pod_label_app] 96 | separator: ; 97 | regex: caretta 98 | replacement: $1 99 | action: keep 100 | - action: labelmap 101 | regex: __meta_kubernetes_pod_label_(.+) 102 | - source_labels: [__meta_kubernetes_pod_name] 103 | action: replace 104 | target_label: caretta_pod 105 | - source_labels: [__meta_kubernetes_pod_node_name] 106 | action: replace 107 | target_label: caretta_node 108 | 109 | grafana: 110 | image: 111 | repository: quay.io/groundcover/grafana 112 | resources: 113 | limits: 114 | memory: 300Mi 115 | cpu: 300m 116 | requests: 117 | memory: 50Mi 118 | cpu: 5m 119 | datasources: 120 | datasources.yaml: 121 | apiVersion: 1 122 | datasources: 123 | - name: Prometheus 124 | type: prometheus 125 | access: proxy 126 | url: "http://caretta-vm:8428" 127 | editable: "true" 128 | 
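# ---
# Usage sketch (annotation, not part of the chart): the values above can be
# overridden at install time. A minimal example, assuming a local checkout of
# this repository with the chart under ./chart:
#
#   helm install caretta ./chart \
#     --namespace caretta --create-namespace \
#     --set pollIntervalSeconds=10 \
#     --set enableDnsResolving=false
#
# pollIntervalSeconds, enableDnsResolving, prometheusPort, prometheusEndpoint
# and traverseUpHierarchy feed the POLL_INTERVAL, RESOLVE_DNS, PROMETHEUS_PORT,
# PROMETHEUS_ENDPOINT and TRAVERSE_UP_HIERARCHY environment variables on the
# DaemonSet container (see templates/daemonset.yaml and pkg/caretta/config.go).
# ---
129 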
| sidecar: 130 | dashboards: 131 | label: grafana_dashboard 132 | labelValue: "" 133 | 134 | dashboardProviders: 135 | dashboardproviders.yaml: 136 | apiVersion: 1 137 | providers: 138 | - name: 'default' 139 | orgId: 1 140 | folder: '' 141 | type: file 142 | disableDeletion: false 143 | editable: true 144 | options: 145 | path: /var/lib/grafana/dashboards 146 | foldersFromFilesStructure: true 147 | 148 | dashboardsConfigMaps: 149 | default: "caretta-grafana-dashboards" 150 | 151 | grafana.ini: 152 | auth.anonymous: 153 | enabled: true 154 | dashboards: 155 | default_home_dashboard_path: /var/lib/grafana/dashboards/default/dashboard.json 156 | adminUser: "admin" 157 | adminPassword: "caretta" 158 | -------------------------------------------------------------------------------- /cmd/caretta/caretta.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | _ "net/http/pprof" 6 | "os" 7 | "os/signal" 8 | "syscall" 9 | 10 | "github.com/groundcover-com/caretta/pkg/caretta" 11 | ) 12 | 13 | func main() { 14 | log.Print("Caretta starting...") 15 | caretta := caretta.NewCaretta() 16 | 17 | caretta.Start() 18 | 19 | osSignal := make(chan os.Signal, 1) 20 | signal.Notify(osSignal, syscall.SIGINT, syscall.SIGTERM) 21 | <-osSignal 22 | caretta.Stop() 23 | } 24 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/groundcover-com/caretta 2 | 3 | go 1.19 4 | 5 | require ( 6 | github.com/cilium/ebpf v0.10.0 7 | github.com/google/uuid v1.3.0 8 | github.com/hashicorp/golang-lru/v2 v2.0.1 9 | github.com/prometheus/client_golang v1.14.0 10 | github.com/stretchr/testify v1.8.1 11 | k8s.io/api v0.26.0 12 | k8s.io/apimachinery v0.26.0 13 | k8s.io/client-go v0.26.0 14 | ) 15 | 16 | require ( 17 | github.com/beorn7/perks v1.0.1 // indirect 18 | github.com/cespare/xxhash/v2 v2.1.2 // indirect 19 | github.com/davecgh/go-spew v1.1.1 // indirect 20 | github.com/emicklei/go-restful/v3 v3.9.0 // indirect 21 | github.com/evanphx/json-patch v4.12.0+incompatible // indirect 22 | github.com/go-logr/logr v1.2.3 // indirect 23 | github.com/go-openapi/jsonpointer v0.19.5 // indirect 24 | github.com/go-openapi/jsonreference v0.20.0 // indirect 25 | github.com/go-openapi/swag v0.19.14 // indirect 26 | github.com/gogo/protobuf v1.3.2 // indirect 27 | github.com/golang/protobuf v1.5.2 // indirect 28 | github.com/google/gnostic v0.5.7-v3refs // indirect 29 | github.com/google/go-cmp v0.5.9 // indirect 30 | github.com/google/gofuzz v1.1.0 // indirect 31 | github.com/josharian/intern v1.0.0 // indirect 32 | github.com/json-iterator/go v1.1.12 // indirect 33 | github.com/mailru/easyjson v0.7.6 // indirect 34 | github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect 35 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 36 | github.com/modern-go/reflect2 v1.0.2 // indirect 37 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 38 | github.com/pkg/errors v0.9.1 // indirect 39 | github.com/pmezard/go-difflib v1.0.0 // indirect 40 | github.com/prometheus/client_model v0.3.0 // indirect 41 | github.com/prometheus/common v0.37.0 // indirect 42 | github.com/prometheus/procfs v0.8.0 // indirect 43 | golang.org/x/net v0.3.1-0.20221206200815-1e63c2f08a10 // indirect 44 | golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect 45 | golang.org/x/sys v0.3.0 
// indirect 46 | golang.org/x/term v0.3.0 // indirect 47 | golang.org/x/text v0.5.0 // indirect 48 | golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect 49 | google.golang.org/appengine v1.6.7 // indirect 50 | google.golang.org/protobuf v1.28.1 // indirect 51 | gopkg.in/inf.v0 v0.9.1 // indirect 52 | gopkg.in/yaml.v2 v2.4.0 // indirect 53 | gopkg.in/yaml.v3 v3.0.1 // indirect 54 | k8s.io/klog/v2 v2.80.1 // indirect 55 | k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280 // indirect 56 | k8s.io/utils v0.0.0-20221107191617-1a15be271d1d // indirect 57 | sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect 58 | sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect 59 | sigs.k8s.io/yaml v1.3.0 // indirect 60 | ) 61 | -------------------------------------------------------------------------------- /images/caretta.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/groundcover-com/caretta/280d1640ce0174b1dfdd7d05bdd104604aa04508/images/caretta.gif -------------------------------------------------------------------------------- /images/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/groundcover-com/caretta/280d1640ce0174b1dfdd7d05bdd104604aa04508/images/screenshot.png -------------------------------------------------------------------------------- /pkg/caretta/caretta.go: -------------------------------------------------------------------------------- 1 | package caretta 2 | 3 | import ( 4 | "context" 5 | "hash/fnv" 6 | "log" 7 | "net/http" 8 | "strconv" 9 | "time" 10 | 11 | caretta_k8s "github.com/groundcover-com/caretta/pkg/k8s" 12 | "github.com/groundcover-com/caretta/pkg/metrics" 13 | "github.com/prometheus/client_golang/prometheus" 14 | "github.com/prometheus/client_golang/prometheus/promauto" 15 | "k8s.io/client-go/kubernetes" 16 | "k8s.io/client-go/rest" 17 | ) 18 | 19 | var ( 20 | linksMetrics = promauto.NewGaugeVec(prometheus.GaugeOpts{ 21 | Name: "caretta_links_observed", 22 | Help: "total bytes_sent value of links observed by caretta since its launch", 23 | }, []string{ 24 | "link_id", "client_id", "client_name", "client_namespace", "client_kind", "client_owner", "server_id", "server_name", "server_namespace", "server_kind", "server_port", "role", 25 | }) 26 | tcpStateMetrics = promauto.NewGaugeVec(prometheus.GaugeOpts{ 27 | Name: "caretta_tcp_states", 28 | Help: "state of TCP connections observed by caretta since its launch", 29 | }, []string{ 30 | "link_id", "client_id", "client_name", "client_namespace", "client_kind", "client_owner", "server_id", "server_name", "server_namespace", "server_kind", "server_port", "role", 31 | }) 32 | ) 33 | 34 | type Caretta struct { 35 | stopSignal chan bool 36 | tracer LinksTracer 37 | metricsServer *http.Server 38 | config carettaConfig 39 | } 40 | 41 | func NewCaretta() *Caretta { 42 | return &Caretta{ 43 | stopSignal: make(chan bool, 1), 44 | config: readConfig(), 45 | } 46 | } 47 | 48 | func (caretta *Caretta) Start() { 49 | caretta.metricsServer = metrics.StartMetricsServer(caretta.config.prometheusEndpoint, caretta.config.prometheusPort) 50 | 51 | clientset, err := caretta.getClientSet() 52 | if err != nil { 53 | log.Fatalf("Error getting kubernetes clientset: %v", err) 54 | } 55 | resolver, err := caretta_k8s.NewK8sIPResolver(clientset, caretta.config.shouldResolveDns, caretta.config.traverseUpHierarchy) 56 | if err != nil { 57 | log.Fatalf("Error creating resolver: 
%v", err) 58 | } 59 | err = resolver.StartWatching() 60 | if err != nil { 61 | log.Fatalf("Error watching cluster's state: %v", err) 62 | } 63 | 64 | // wait for resolver to populate 65 | time.Sleep(10 * time.Second) 66 | 67 | caretta.tracer = NewTracer(resolver) 68 | err = caretta.tracer.Start() 69 | if err != nil { 70 | log.Fatalf("Couldn't load probes - %v", err) 71 | } 72 | 73 | pollingTicker := time.NewTicker(time.Duration(caretta.config.pollingIntervalSeconds) * time.Second) 74 | 75 | pastLinks := make(map[NetworkLink]uint64) 76 | 77 | go func() { 78 | for { 79 | select { 80 | case <-caretta.stopSignal: 81 | return 82 | case <-pollingTicker.C: 83 | var links map[NetworkLink]uint64 84 | var tcpConnections []TcpConnection 85 | 86 | if err != nil { 87 | log.Printf("Error updating snapshot of cluster state, skipping iteration") 88 | continue 89 | } 90 | 91 | pastLinks, links, tcpConnections = caretta.tracer.TracesPollingIteration(pastLinks) 92 | for link, throughput := range links { 93 | caretta.handleLink(&link, throughput) 94 | } 95 | 96 | for _, connection := range tcpConnections { 97 | caretta.handleTcpConnection(&connection) 98 | } 99 | } 100 | } 101 | }() 102 | } 103 | 104 | func (caretta *Caretta) Stop() { 105 | log.Print("Stopping Caretta...") 106 | caretta.stopSignal <- true 107 | err := caretta.tracer.Stop() 108 | if err != nil { 109 | log.Printf("Error unloading bpf objects: %v", err) 110 | } 111 | err = caretta.metricsServer.Shutdown(context.Background()) 112 | if err != nil { 113 | log.Printf("Error shutting Prometheus server down: %v", err) 114 | } 115 | 116 | } 117 | 118 | func (caretta *Caretta) handleLink(link *NetworkLink, throughput uint64) { 119 | linksMetrics.With(prometheus.Labels{ 120 | "link_id": strconv.Itoa(int(fnvHash(link.Client.Name+link.Client.Namespace+link.Server.Name+link.Server.Namespace) + link.Role)), 121 | "client_id": strconv.Itoa(int(fnvHash(link.Client.Name + link.Client.Namespace))), 122 | "client_name": link.Client.Name, 123 | "client_namespace": link.Client.Namespace, 124 | "client_kind": link.Client.Kind, 125 | "client_owner": link.Client.Owner, 126 | "server_id": strconv.Itoa(int(fnvHash(link.Server.Name + link.Server.Namespace))), 127 | "server_name": link.Server.Name, 128 | "server_namespace": link.Server.Namespace, 129 | "server_kind": link.Server.Kind, 130 | "server_port": strconv.Itoa(int(link.ServerPort)), 131 | "role": strconv.Itoa(int(link.Role)), 132 | }).Set(float64(throughput)) 133 | } 134 | 135 | func (caretta *Caretta) handleTcpConnection(connection *TcpConnection) { 136 | tcpStateMetrics.With(prometheus.Labels{ 137 | "link_id": strconv.Itoa(int(fnvHash(connection.Client.Name+connection.Client.Namespace+connection.Server.Name+connection.Server.Namespace) + connection.Role)), 138 | "client_id": strconv.Itoa(int(fnvHash(connection.Client.Name + connection.Client.Namespace))), 139 | "client_name": connection.Client.Name, 140 | "client_namespace": connection.Client.Namespace, 141 | "client_kind": connection.Client.Kind, 142 | "client_owner": connection.Client.Owner, 143 | "server_id": strconv.Itoa(int(fnvHash(connection.Server.Name + connection.Server.Namespace))), 144 | "server_name": connection.Server.Name, 145 | "server_namespace": connection.Server.Namespace, 146 | "server_kind": connection.Server.Kind, 147 | "server_port": strconv.Itoa(int(connection.ServerPort)), 148 | "role": strconv.Itoa(int(connection.Role)), 149 | }).Set(float64(connection.State)) 150 | } 151 | 152 | func (caretta *Caretta) getClientSet() 
(*kubernetes.Clientset, error) { 153 | config, err := rest.InClusterConfig() 154 | if err != nil { 155 | return nil, err 156 | } 157 | 158 | clientset, err := kubernetes.NewForConfig(config) 159 | if err != nil { 160 | return nil, err 161 | } 162 | return clientset, nil 163 | } 164 | 165 | // simple fnvHash function from string to uint32 166 | func fnvHash(s string) uint32 { 167 | h := fnv.New32a() 168 | h.Write([]byte(s)) 169 | return h.Sum32() 170 | } 171 | -------------------------------------------------------------------------------- /pkg/caretta/config.go: -------------------------------------------------------------------------------- 1 | package caretta 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "strconv" 7 | ) 8 | 9 | const ( 10 | defaultPrometheusEndpoint = "/metrics" 11 | defaultPrometheusPort = ":7117" 12 | defaultPollingIntervalSeconds = 5 13 | defaultShouldResolveDns = false 14 | defaultTraverseUpHierarchy = true 15 | ) 16 | 17 | type carettaConfig struct { 18 | shouldResolveDns bool 19 | prometheusPort string 20 | prometheusEndpoint string 21 | pollingIntervalSeconds int 22 | traverseUpHierarchy bool 23 | } 24 | 25 | // environment variables based, encapsulated to enable future changes 26 | func readConfig() carettaConfig { 27 | port := defaultPrometheusPort 28 | if val := os.Getenv("PROMETHEUS_PORT"); val != "" { 29 | valInt, err := strconv.Atoi(val) 30 | if err == nil { 31 | port = fmt.Sprintf(":%d", valInt) 32 | } 33 | } 34 | 35 | endpoint := defaultPrometheusEndpoint 36 | if val := os.Getenv("PROMETHEUS_ENDPOINT"); val != "" { 37 | endpoint = val 38 | } 39 | 40 | interval := defaultPollingIntervalSeconds 41 | if val := os.Getenv("POLL_INTERVAL"); val != "" { 42 | valInt, err := strconv.Atoi(val) 43 | if err == nil { 44 | interval = valInt 45 | } 46 | } 47 | 48 | shouldResolveDns := defaultShouldResolveDns 49 | if val := os.Getenv("RESOLVE_DNS"); val != "" { 50 | valBool, err := strconv.ParseBool(val) 51 | if err == nil { 52 | shouldResolveDns = valBool 53 | } 54 | } 55 | 56 | traverseUpHierarchy := defaultTraverseUpHierarchy 57 | if val := os.Getenv("TRAVERSE_UP_HIERARCHY"); val != "" { 58 | valBool, err := strconv.ParseBool(val) 59 | if err == nil { 60 | traverseUpHierarchy = valBool 61 | } 62 | } 63 | 64 | return carettaConfig{ 65 | shouldResolveDns: shouldResolveDns, 66 | prometheusPort: port, 67 | prometheusEndpoint: endpoint, 68 | pollingIntervalSeconds: interval, 69 | traverseUpHierarchy: traverseUpHierarchy, 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /pkg/caretta/ebpf_map.go: -------------------------------------------------------------------------------- 1 | package caretta 2 | 3 | import "github.com/cilium/ebpf" 4 | 5 | type IEbpfMapIterator interface { 6 | Next(interface{}, interface{}) bool 7 | } 8 | 9 | type IEbpfMap interface { 10 | Lookup(interface{}, interface{}) error 11 | Iterate() IEbpfMapIterator 12 | Delete(interface{}) error 13 | } 14 | 15 | type EbpfMap struct { 16 | innerMap *ebpf.Map 17 | } 18 | 19 | type EbpfMapIterator struct { 20 | innerIterator *ebpf.MapIterator 21 | } 22 | 23 | func (m *EbpfMap) Lookup(key interface{}, val interface{}) error { 24 | return m.innerMap.Lookup(key, val) 25 | } 26 | 27 | func (m *EbpfMap) Iterate() IEbpfMapIterator { 28 | return &EbpfMapIterator{innerIterator: m.innerMap.Iterate()} 29 | } 30 | 31 | func (m *EbpfMap) Delete(key interface{}) error { 32 | return m.innerMap.Delete(key) 33 | } 34 | 35 | func (it *EbpfMapIterator) Next(key interface{}, val 
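// The IEbpfMap / IEbpfMapIterator pair above exists so the tracer can run
// against an in-memory fake in tests (see links_tracer_test.go) as well as a
// real *ebpf.Map. A minimal read loop through the interface, a sketch
// assuming key/value types that match the loaded connections map:
//
//   var key ConnectionIdentifier
//   var val ConnectionThroughputStats
//   it := m.Iterate() // m is any IEbpfMap
//   for it.Next(&key, &val) {
//       fmt.Printf("%v sent %d bytes\n", key.Tuple, val.BytesSent)
//   }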
interface{}) bool { 36 | return it.innerIterator.Next(key, val) 37 | } 38 | -------------------------------------------------------------------------------- /pkg/caretta/links_tracer.go: -------------------------------------------------------------------------------- 1 | package caretta 2 | 3 | import ( 4 | "encoding/binary" 5 | "errors" 6 | "log" 7 | "net" 8 | 9 | "github.com/groundcover-com/caretta/pkg/k8s" 10 | "github.com/groundcover-com/caretta/pkg/tracing" 11 | 12 | "github.com/prometheus/client_golang/prometheus" 13 | "github.com/prometheus/client_golang/prometheus/promauto" 14 | ) 15 | 16 | var ( 17 | pollsMade = promauto.NewCounter(prometheus.CounterOpts{ 18 | Name: "caretta_polls_made", 19 | Help: "Counter of polls made by caretta", 20 | }) 21 | failedConnectionDeletion = promauto.NewCounter(prometheus.CounterOpts{ 22 | Name: "caretta_failed_deletions", 23 | Help: "Counter of failed deletion of closed connection from map", 24 | }) 25 | filteredLoopbackConnections = promauto.NewGauge(prometheus.GaugeOpts{ 26 | Name: "caretta_current_loopback_connections", 27 | Help: `Number of loopback connections observed in the last iteration`, 28 | }) 29 | mapSize = promauto.NewGauge(prometheus.GaugeOpts{ 30 | Name: "caretta_ebpf_connections_map_size", 31 | Help: "number of items in the connections map iterated from user space per iteration", 32 | }) 33 | mapDeletions = promauto.NewCounter(prometheus.CounterOpts{ 34 | Name: "caretta_connection_deletions", 35 | Help: "total number of deletions from the map done by the userspace", 36 | }) 37 | ) 38 | 39 | type IPResolver interface { 40 | ResolveIP(string) k8s.Workload 41 | StartWatching() error 42 | StopWatching() 43 | } 44 | 45 | type Probes interface { 46 | UnloadProbes() error 47 | } 48 | 49 | type LinksTracer struct { 50 | ebpfObjects Probes 51 | connections IEbpfMap 52 | resolver IPResolver 53 | } 54 | 55 | // initializes a LinksTracer object 56 | func NewTracer(resolver *k8s.K8sIPResolver) LinksTracer { 57 | tracer := LinksTracer{resolver: resolver} 58 | return tracer 59 | } 60 | 61 | func NewTracerWithObjs(resolver IPResolver, connections IEbpfMap, probes Probes) LinksTracer { 62 | return LinksTracer{ 63 | ebpfObjects: probes, 64 | connections: connections, 65 | resolver: resolver, 66 | } 67 | } 68 | 69 | func (tracer *LinksTracer) Start() error { 70 | objs, connMap, err := tracing.LoadProbes() 71 | if err != nil { 72 | return err 73 | } 74 | 75 | tracer.ebpfObjects = &objs 76 | tracer.connections = &EbpfMap{innerMap: connMap} 77 | return nil 78 | } 79 | 80 | func (tracer *LinksTracer) Stop() error { 81 | tracer.resolver.StopWatching() 82 | return tracer.ebpfObjects.UnloadProbes() 83 | } 84 | 85 | // a single polling from the eBPF maps 86 | // iterating the traces from the kernel-space, summing each network link 87 | func (tracer *LinksTracer) TracesPollingIteration(pastLinks map[NetworkLink]uint64) (map[NetworkLink]uint64, map[NetworkLink]uint64, []TcpConnection) { 88 | // outline of an iteration - 89 | // filter unwanted connections, sum all connections as links, add past links, and return the new map 90 | pollsMade.Inc() 91 | loopbackCounter := 0 92 | 93 | currentLinks := make(map[NetworkLink]uint64) 94 | currentTcpConnections := []TcpConnection{} 95 | var connectionsToDelete []ConnectionIdentifier 96 | 97 | var conn ConnectionIdentifier 98 | var throughput ConnectionThroughputStats 99 | 100 | entries := tracer.connections.Iterate() 101 | // iterate the map from the eBPF program 102 | itemsCounter := 0 103 | for entries.Next(&conn, 
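// (entries.Next fills conn and throughput through the pointers passed to it,
// mirroring cilium/ebpf's MapIterator semantics: it returns false once the
// map is exhausted or the iterator fails, so the loop below walks every
// kernel-side connection entry on each poll.)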
&throughput) { 104 | itemsCounter += 1 105 | // filter unnecessary connection 106 | 107 | if throughput.IsActive == 0 { 108 | connectionsToDelete = append(connectionsToDelete, conn) 109 | } 110 | 111 | // skip loopback connections 112 | if conn.Tuple.SrcIp == conn.Tuple.DstIp && isAddressLoopback(conn.Tuple.DstIp) { 113 | loopbackCounter++ 114 | continue 115 | } 116 | 117 | // filter unroled connections (probably indicates a bug) 118 | link, err := tracer.reduceConnectionToLink(conn) 119 | if conn.Role == UnknownConnectionRole || err != nil { 120 | continue 121 | } 122 | 123 | tcpConn, err := tracer.reduceConnectionToTcp(conn, throughput) 124 | if err != nil { 125 | continue 126 | } 127 | 128 | currentLinks[link] += throughput.BytesSent 129 | currentTcpConnections = append(currentTcpConnections, tcpConn) 130 | } 131 | 132 | mapSize.Set(float64(itemsCounter)) 133 | filteredLoopbackConnections.Set(float64(loopbackCounter)) 134 | 135 | // add past links 136 | for pastLink, pastThroughput := range pastLinks { 137 | currentLinks[pastLink] += pastThroughput 138 | } 139 | 140 | // delete connections marked to delete 141 | for _, conn := range connectionsToDelete { 142 | tracer.deleteAndStoreConnection(&conn, pastLinks) 143 | } 144 | 145 | return pastLinks, currentLinks, currentTcpConnections 146 | 147 | } 148 | 149 | func (tracer *LinksTracer) deleteAndStoreConnection(conn *ConnectionIdentifier, pastLinks map[NetworkLink]uint64) { 150 | // newer kernels introduce batch map operation, but it might not be available so we delete item-by-item 151 | var throughput ConnectionThroughputStats 152 | err := tracer.connections.Lookup(conn, &throughput) 153 | if err != nil { 154 | log.Printf("Error retrieving connection to delete, skipping it: %v", err) 155 | failedConnectionDeletion.Inc() 156 | return 157 | } 158 | err = tracer.connections.Delete(conn) 159 | if err != nil { 160 | log.Printf("Error deleting connection from map: %v", err) 161 | failedConnectionDeletion.Inc() 162 | return 163 | } 164 | // if deletion is successful, add it to past links 165 | link, err := tracer.reduceConnectionToLink(*conn) 166 | if err != nil { 167 | log.Printf("Error reducing connection to link when deleting: %v", err) 168 | return 169 | } 170 | 171 | pastLinks[link] += throughput.BytesSent 172 | 173 | mapDeletions.Inc() 174 | } 175 | 176 | // reduce a specific connection to a general link 177 | func (tracer *LinksTracer) reduceConnectionToLink(connection ConnectionIdentifier) (NetworkLink, error) { 178 | var link NetworkLink 179 | link.Role = connection.Role 180 | 181 | srcWorkload := tracer.resolver.ResolveIP(IP(connection.Tuple.SrcIp).String()) 182 | dstWorkload := tracer.resolver.ResolveIP(IP(connection.Tuple.DstIp).String()) 183 | 184 | if connection.Role == ClientConnectionRole { 185 | // Src is Client, Dst is Server, Port is DstPort 186 | link.Client = srcWorkload 187 | link.Server = dstWorkload 188 | link.ServerPort = connection.Tuple.DstPort 189 | } else if connection.Role == ServerConnectionRole { 190 | // Dst is Client, Src is Server, Port is SrcPort 191 | link.Client = dstWorkload 192 | link.Server = srcWorkload 193 | link.ServerPort = connection.Tuple.SrcPort 194 | } else { 195 | return NetworkLink{}, errors.New("connection's role is unknown") 196 | } 197 | return link, nil 198 | } 199 | 200 | // reduce a specific connection to a general tcp connection 201 | func (tracer *LinksTracer) reduceConnectionToTcp(connection ConnectionIdentifier, throughput ConnectionThroughputStats) (TcpConnection, error) { 202 | var 
tcpConn TcpConnection 203 | tcpConn.Role = connection.Role 204 | 205 | srcWorkload := tracer.resolver.ResolveIP(IP(connection.Tuple.SrcIp).String()) 206 | dstWorkload := tracer.resolver.ResolveIP(IP(connection.Tuple.DstIp).String()) 207 | 208 | if connection.Role == ClientConnectionRole { 209 | // Src is Client, Dst is Server, Port is DstPort 210 | tcpConn.Client = srcWorkload 211 | tcpConn.Server = dstWorkload 212 | tcpConn.ServerPort = connection.Tuple.DstPort 213 | tcpConn.State = TcpConnectionOpenState 214 | } else if connection.Role == ServerConnectionRole { 215 | // Dst is Client, Src is Server, Port is SrcPort 216 | tcpConn.Client = dstWorkload 217 | tcpConn.Server = srcWorkload 218 | tcpConn.ServerPort = connection.Tuple.SrcPort 219 | tcpConn.State = TcpConnectionAcceptState 220 | } else { 221 | return TcpConnection{}, errors.New("connection's role is unknown") 222 | } 223 | 224 | if throughput.IsActive == 0 { 225 | tcpConn.State = TcpConnectionClosedState 226 | } 227 | 228 | return tcpConn, nil 229 | } 230 | 231 | func isAddressLoopback(ip uint32) bool { 232 | ipAddr := make(net.IP, 4) 233 | binary.LittleEndian.PutUint32(ipAddr, ip) 234 | return ipAddr.IsLoopback() 235 | } 236 | -------------------------------------------------------------------------------- /pkg/caretta/links_tracer_test.go: -------------------------------------------------------------------------------- 1 | package caretta_test 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "testing" 7 | 8 | "github.com/groundcover-com/caretta/pkg/caretta" 9 | 10 | "github.com/groundcover-com/caretta/pkg/k8s" 11 | "github.com/stretchr/testify/assert" 12 | ) 13 | 14 | // Defining a mock of a map. This is not a complete implementation of a map with iterator 15 | type MockConnectionsMapIterator struct { 16 | innerMap map[caretta.ConnectionIdentifier]caretta.ConnectionThroughputStats 17 | keys []caretta.ConnectionIdentifier 18 | count int 19 | } 20 | 21 | func (mi *MockConnectionsMapIterator) Next(conn interface{}, throughput interface{}) bool { 22 | assertedConn, ok := conn.(*caretta.ConnectionIdentifier) 23 | if !ok { 24 | return false 25 | } 26 | assertedThroughput, ok := throughput.(*caretta.ConnectionThroughputStats) 27 | if !ok { 28 | return false 29 | } 30 | for mi.count < len(mi.keys) { 31 | *assertedConn = mi.keys[mi.count] 32 | *assertedThroughput = mi.innerMap[*assertedConn] 33 | mi.count++ 34 | return true 35 | } 36 | 37 | return false 38 | } 39 | 40 | type MockConnectionsMap struct { 41 | innerMap map[caretta.ConnectionIdentifier]caretta.ConnectionThroughputStats 42 | } 43 | 44 | func NewMockConnectionsMap() *MockConnectionsMap { 45 | return &MockConnectionsMap{innerMap: make(map[caretta.ConnectionIdentifier]caretta.ConnectionThroughputStats)} 46 | } 47 | 48 | func (m *MockConnectionsMap) Lookup(conn interface{}, throughput interface{}) error { 49 | assertedConn, ok := conn.(*caretta.ConnectionIdentifier) 50 | if !ok { 51 | return errors.New("wrong type for Lookup") 52 | } 53 | assertedThroughput, ok := throughput.(*caretta.ConnectionThroughputStats) 54 | if !ok { 55 | return errors.New("wrong type for Lookup") 56 | } 57 | *assertedThroughput, ok = m.innerMap[*assertedConn] 58 | if !ok { 59 | return errors.New("Key not in map") 60 | } 61 | return nil 62 | } 63 | 64 | func (m *MockConnectionsMap) Iterate() caretta.IEbpfMapIterator { 65 | keys := make([]caretta.ConnectionIdentifier, 0, len(m.innerMap)) 66 | for ci := range m.innerMap { 67 | keys = append(keys, ci) 68 | } 69 | 70 | return &MockConnectionsMapIterator{innerMap: 
m.innerMap, keys: keys, count: 0} 71 | } 72 | 73 | func (m *MockConnectionsMap) Delete(key interface{}) error { 74 | assertedKey, ok := key.(*caretta.ConnectionIdentifier) 75 | if !ok { 76 | return errors.New("wrong type in delete") 77 | } 78 | delete(m.innerMap, *assertedKey) 79 | return nil 80 | } 81 | 82 | func (m *MockConnectionsMap) Update(key caretta.ConnectionIdentifier, value caretta.ConnectionThroughputStats) { 83 | m.innerMap[key] = value 84 | } 85 | 86 | type MockResolver struct{} 87 | 88 | func (resolver *MockResolver) ResolveIP(ip string) k8s.Workload { 89 | return k8s.Workload{ 90 | Name: ip, 91 | Namespace: "Namespace", 92 | Kind: "Kind", 93 | } 94 | } 95 | 96 | func (resolver *MockResolver) StartWatching() error { 97 | return nil 98 | } 99 | func (resolver *MockResolver) StopWatching() {} 100 | 101 | type testConnection struct { 102 | connId caretta.ConnectionIdentifier 103 | throughput caretta.ConnectionThroughputStats 104 | } 105 | 106 | type aggregationTest struct { 107 | description string 108 | connections []testConnection 109 | expectedLink caretta.NetworkLink 110 | expectedThroughput uint64 111 | } 112 | 113 | var clientTuple = caretta.ConnectionTuple{ 114 | SrcIp: 1, 115 | DstIp: 2, 116 | SrcPort: 55555, 117 | DstPort: 80, 118 | } 119 | var serverTuple = caretta.ConnectionTuple{ 120 | DstIp: 1, 121 | SrcIp: 2, 122 | DstPort: 55555, 123 | SrcPort: 80, 124 | } 125 | var activeThroughput = caretta.ConnectionThroughputStats{ 126 | BytesSent: 10, 127 | BytesReceived: 2, 128 | IsActive: 1, 129 | } 130 | var inactiveThroughput = caretta.ConnectionThroughputStats{ 131 | BytesSent: 10, 132 | BytesReceived: 2, 133 | IsActive: 0, 134 | } 135 | var clientLink = caretta.NetworkLink{ 136 | Client: k8s.Workload{ 137 | Name: caretta.IP(1).String(), 138 | Namespace: "Namespace", 139 | Kind: "Kind", 140 | }, 141 | Server: k8s.Workload{ 142 | Name: caretta.IP(2).String(), 143 | Namespace: "Namespace", 144 | Kind: "Kind", 145 | }, 146 | ServerPort: 80, 147 | Role: caretta.ClientConnectionRole, 148 | } 149 | var serverLink = caretta.NetworkLink{ 150 | Client: k8s.Workload{ 151 | Name: caretta.IP(1).String(), 152 | Namespace: "Namespace", 153 | Kind: "Kind", 154 | }, 155 | Server: k8s.Workload{ 156 | Name: caretta.IP(2).String(), 157 | Namespace: "Namespace", 158 | Kind: "Kind", 159 | }, 160 | ServerPort: 80, 161 | Role: caretta.ServerConnectionRole, 162 | } 163 | 164 | func TestAggregations(t *testing.T) { 165 | var aggregationTests = []aggregationTest{ 166 | { 167 | description: "single client connection create correct link", 168 | connections: []testConnection{ 169 | { 170 | connId: caretta.ConnectionIdentifier{ 171 | Id: 1, 172 | Pid: 1, 173 | Tuple: clientTuple, 174 | Role: caretta.ClientConnectionRole, 175 | }, 176 | throughput: activeThroughput, 177 | }, 178 | }, 179 | expectedLink: clientLink, 180 | expectedThroughput: activeThroughput.BytesSent, 181 | }, 182 | { 183 | description: "single server connection create correct link", 184 | connections: []testConnection{ 185 | { 186 | connId: caretta.ConnectionIdentifier{ 187 | Id: 1, 188 | Pid: 1, 189 | Tuple: serverTuple, 190 | Role: caretta.ServerConnectionRole, 191 | }, 192 | throughput: activeThroughput, 193 | }, 194 | }, 195 | expectedLink: serverLink, 196 | expectedThroughput: activeThroughput.BytesSent, 197 | }, 198 | { 199 | description: "2 client connections aggregate both to one", 200 | connections: []testConnection{ 201 | { 202 | connId: caretta.ConnectionIdentifier{ 203 | Id: 1, 204 | Pid: 1, 205 | Tuple: clientTuple, 206 
| Role: caretta.ClientConnectionRole, 207 | }, 208 | throughput: activeThroughput, 209 | }, 210 | { 211 | connId: caretta.ConnectionIdentifier{ 212 | Id: 2, 213 | Pid: 1, 214 | Tuple: clientTuple, 215 | Role: caretta.ClientConnectionRole, 216 | }, 217 | throughput: activeThroughput, 218 | }, 219 | }, 220 | expectedLink: clientLink, 221 | expectedThroughput: 2 * activeThroughput.BytesSent, 222 | }, 223 | { 224 | description: "2 server connections aggregate both to one", 225 | connections: []testConnection{ 226 | { 227 | connId: caretta.ConnectionIdentifier{ 228 | Id: 1, 229 | Pid: 1, 230 | Tuple: serverTuple, 231 | Role: caretta.ServerConnectionRole, 232 | }, 233 | throughput: activeThroughput, 234 | }, 235 | { 236 | connId: caretta.ConnectionIdentifier{ 237 | Id: 2, 238 | Pid: 1, 239 | Tuple: serverTuple, 240 | Role: caretta.ServerConnectionRole, 241 | }, 242 | throughput: activeThroughput, 243 | }, 244 | }, 245 | expectedLink: serverLink, 246 | expectedThroughput: 2 * activeThroughput.BytesSent, 247 | }, 248 | { 249 | description: "3 active client connections, 2 inactive aggregate all to one", 250 | connections: []testConnection{ 251 | { 252 | connId: caretta.ConnectionIdentifier{ 253 | Id: 1, 254 | Pid: 1, 255 | Tuple: clientTuple, 256 | Role: caretta.ClientConnectionRole, 257 | }, 258 | throughput: activeThroughput, 259 | }, 260 | { 261 | connId: caretta.ConnectionIdentifier{ 262 | Id: 2, 263 | Pid: 1, 264 | Tuple: clientTuple, 265 | Role: caretta.ClientConnectionRole, 266 | }, 267 | throughput: activeThroughput, 268 | }, 269 | { 270 | connId: caretta.ConnectionIdentifier{ 271 | Id: 3, 272 | Pid: 1, 273 | Tuple: clientTuple, 274 | Role: caretta.ClientConnectionRole, 275 | }, 276 | throughput: activeThroughput, 277 | }, 278 | { 279 | connId: caretta.ConnectionIdentifier{ 280 | Id: 4, 281 | Pid: 1, 282 | Tuple: clientTuple, 283 | Role: caretta.ClientConnectionRole, 284 | }, 285 | throughput: inactiveThroughput, 286 | }, 287 | { 288 | connId: caretta.ConnectionIdentifier{ 289 | Id: 5, 290 | Pid: 1, 291 | Tuple: clientTuple, 292 | Role: caretta.ClientConnectionRole, 293 | }, 294 | throughput: inactiveThroughput, 295 | }, 296 | }, 297 | expectedLink: clientLink, 298 | expectedThroughput: 3*activeThroughput.BytesSent + 2*inactiveThroughput.BytesSent, 299 | }, 300 | { 301 | description: "3 active server connections, 2 inactive aggregate all to one", 302 | connections: []testConnection{ 303 | { 304 | connId: caretta.ConnectionIdentifier{ 305 | Id: 1, 306 | Pid: 1, 307 | Tuple: serverTuple, 308 | Role: caretta.ServerConnectionRole, 309 | }, 310 | throughput: activeThroughput, 311 | }, 312 | { 313 | connId: caretta.ConnectionIdentifier{ 314 | Id: 2, 315 | Pid: 1, 316 | Tuple: serverTuple, 317 | Role: caretta.ServerConnectionRole, 318 | }, 319 | throughput: activeThroughput, 320 | }, 321 | { 322 | connId: caretta.ConnectionIdentifier{ 323 | Id: 3, 324 | Pid: 1, 325 | Tuple: serverTuple, 326 | Role: caretta.ServerConnectionRole, 327 | }, 328 | throughput: activeThroughput, 329 | }, 330 | { 331 | connId: caretta.ConnectionIdentifier{ 332 | Id: 4, 333 | Pid: 1, 334 | Tuple: serverTuple, 335 | Role: caretta.ServerConnectionRole, 336 | }, 337 | throughput: inactiveThroughput, 338 | }, 339 | { 340 | connId: caretta.ConnectionIdentifier{ 341 | Id: 5, 342 | Pid: 1, 343 | Tuple: serverTuple, 344 | Role: caretta.ServerConnectionRole, 345 | }, 346 | throughput: inactiveThroughput, 347 | }, 348 | }, 349 | expectedLink: serverLink, 350 | expectedThroughput: 3*activeThroughput.BytesSent + 
2*inactiveThroughput.BytesSent, 351 | }, 352 | } 353 | for _, test := range aggregationTests { 354 | t.Run(test.description, func(t *testing.T) { 355 | assert := assert.New(t) 356 | m := NewMockConnectionsMap() 357 | 358 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil) 359 | pastLinks := make(map[caretta.NetworkLink]uint64) 360 | var currentLinks map[caretta.NetworkLink]uint64 361 | for _, connection := range test.connections { 362 | m.Update(connection.connId, connection.throughput) 363 | _, currentLinks, _ = tracer.TracesPollingIteration(pastLinks) 364 | } 365 | resultThroughput, ok := currentLinks[test.expectedLink] 366 | assert.True(ok, "expected link not in result map") 367 | assert.Equal(test.expectedThroughput, resultThroughput, "wrong throughput value") 368 | }) 369 | 370 | } 371 | } 372 | 373 | func TestDeletion_ActiveConnection_NotDeleted(t *testing.T) { 374 | assert := assert.New(t) 375 | 376 | // Arrange mock map, initial connection 377 | m := NewMockConnectionsMap() 378 | 379 | conn1 := caretta.ConnectionIdentifier{ 380 | Id: 1, 381 | Pid: 1, 382 | Tuple: serverTuple, 383 | Role: caretta.ServerConnectionRole, 384 | } 385 | throughput1 := activeThroughput 386 | 387 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil) 388 | 389 | pastLinks := make(map[caretta.NetworkLink]uint64) 390 | 391 | // Act 392 | m.Update(conn1, throughput1) 393 | _, currentLinks, _ := tracer.TracesPollingIteration(pastLinks) 394 | 395 | // Assert 396 | resultThroughput, ok := currentLinks[serverLink] 397 | assert.True(ok, "link not in map, map is %v", currentLinks) 398 | assert.Equal(throughput1.BytesSent, resultThroughput) 399 | 400 | var testThroughput caretta.ConnectionThroughputStats 401 | 402 | err := m.Lookup(&conn1, &testThroughput) 403 | assert.NoError(err, "connection should stay on the map") 404 | } 405 | 406 | func TestDeletion_InactiveConnection_AddedToPastLinksAndRemovedFromMap(t *testing.T) { 407 | assert := assert.New(t) 408 | 409 | // Arrange mock map, initial connection 410 | m := NewMockConnectionsMap() 411 | 412 | conn1 := caretta.ConnectionIdentifier{ 413 | Id: 1, 414 | Pid: 1, 415 | Tuple: serverTuple, 416 | Role: caretta.ServerConnectionRole, 417 | } 418 | throughput1 := activeThroughput 419 | m.Update(conn1, throughput1) 420 | 421 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil) 422 | 423 | pastLinks := make(map[caretta.NetworkLink]uint64) 424 | 425 | pastLinks, _, _ = tracer.TracesPollingIteration(pastLinks) 426 | 427 | // Act: update the throughput so the connection is inactive, and iterate 428 | throughput2 := inactiveThroughput 429 | m.Update(conn1, throughput2) 430 | pastLinks, currentLinks, _ := tracer.TracesPollingIteration(pastLinks) 431 | 432 | // Assert: check the past connection is both in past links and in current links 433 | resultThroughput, ok := currentLinks[serverLink] 434 | assert.True(ok, "link not in map, map is %v", currentLinks) 435 | assert.Equal(throughput1.BytesSent, resultThroughput) 436 | _, ok = pastLinks[serverLink] 437 | assert.True(ok, "inactive link not in past links: %v", pastLinks) 438 | 439 | var testThroughput caretta.ConnectionThroughputStats 440 | err := m.Lookup(&conn1, &testThroughput) 441 | assert.Error(err, fmt.Sprintf("inactive connection not deleted from connections map, val is %d", testThroughput.BytesSent)) 442 | } 443 | 444 | func TestDeletion_InactiveConnection_NewConnectionAfterDeletionUpdatesCorrectly(t *testing.T) { 445 | assert := assert.New(t) 446 | 447 | // Arrange mock map, initial 
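// This test walks the full delete-and-resume cycle with the fixtures above
// (BytesSent = 10 in both throughput states): the connection is seen active,
// goes inactive (its 10 bytes are folded into pastLinks and the entry is
// deleted from the map), then a fresh connection on the same tuple sends 10
// more. The final assertion expects 10 + 10 = 20, i.e. per-link byte counts
// accumulate across connection generations rather than resetting.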
connection, inactive connection 448 | m := NewMockConnectionsMap() 449 | 450 | conn1 := caretta.ConnectionIdentifier{ 451 | Id: 1, 452 | Pid: 1, 453 | Tuple: serverTuple, 454 | Role: caretta.ServerConnectionRole, 455 | } 456 | throughput1 := activeThroughput 457 | m.Update(conn1, throughput1) 458 | 459 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil) 460 | 461 | pastLinks := make(map[caretta.NetworkLink]uint64) 462 | 463 | // update the throughput so the connection is inactive 464 | throughput2 := inactiveThroughput 465 | m.Update(conn1, throughput2) 466 | pastLinks, _, _ = tracer.TracesPollingIteration(pastLinks) 467 | 468 | // Act: new connection, same link 469 | throughput3 := activeThroughput 470 | m.Update(conn1, throughput3) 471 | _, currentLinks, _ := tracer.TracesPollingIteration(pastLinks) 472 | 473 | // Assert the new connection is aggregated correctly 474 | resultThroughput, ok := currentLinks[serverLink] 475 | assert.True(ok, "link not in map, map is %v", currentLinks) 476 | assert.Equal(throughput1.BytesSent+throughput3.BytesSent, resultThroughput) 477 | } 478 | 479 | func TestConnectionState_Open(t *testing.T) { 480 | assert := assert.New(t) 481 | 482 | // Arrange mock map, initial connection 483 | m := NewMockConnectionsMap() 484 | 485 | conn1 := caretta.ConnectionIdentifier{ 486 | Id: 1, 487 | Pid: 1, 488 | Tuple: serverTuple, 489 | Role: caretta.ClientConnectionRole, 490 | } 491 | throughput1 := activeThroughput 492 | 493 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil) 494 | 495 | pastLinks := make(map[caretta.NetworkLink]uint64) 496 | 497 | // Act 498 | m.Update(conn1, throughput1) 499 | _, _, currentConnections := tracer.TracesPollingIteration(pastLinks) 500 | 501 | // Assert 502 | assert.Equal(1, len(currentConnections)) 503 | // Get the first element of the map 504 | for _, tcp := range currentConnections { 505 | assert.Equal(uint32(caretta.TcpConnectionOpenState), tcp.State) 506 | break 507 | } 508 | } 509 | 510 | func TestConnectionState_Close(t *testing.T) { 511 | assert := assert.New(t) 512 | 513 | // Arrange mock map, initial connection 514 | m := NewMockConnectionsMap() 515 | 516 | conn1 := caretta.ConnectionIdentifier{ 517 | Id: 1, 518 | Pid: 1, 519 | Tuple: serverTuple, 520 | Role: caretta.ServerConnectionRole, 521 | } 522 | throughput1 := inactiveThroughput 523 | 524 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil) 525 | 526 | pastLinks := make(map[caretta.NetworkLink]uint64) 527 | 528 | // Act 529 | m.Update(conn1, throughput1) 530 | _, _, currentConnections := tracer.TracesPollingIteration(pastLinks) 531 | 532 | // Assert 533 | assert.Equal(1, len(currentConnections)) 534 | for _, tcp := range currentConnections { 535 | assert.Equal(uint32(caretta.TcpConnectionClosedState), tcp.State) 536 | break 537 | } 538 | } 539 | 540 | func TestConnectionState_Accept(t *testing.T) { 541 | assert := assert.New(t) 542 | 543 | // Arrange mock map, initial connection 544 | m := NewMockConnectionsMap() 545 | 546 | conn1 := caretta.ConnectionIdentifier{ 547 | Id: 1, 548 | Pid: 1, 549 | Tuple: serverTuple, 550 | Role: caretta.ServerConnectionRole, 551 | } 552 | throughput1 := activeThroughput 553 | 554 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil) 555 | 556 | pastLinks := make(map[caretta.NetworkLink]uint64) 557 | 558 | // Act 559 | m.Update(conn1, throughput1) 560 | _, _, currentConnections := tracer.TracesPollingIteration(pastLinks) 561 | 562 | // Assert 563 | assert.Equal(1, len(currentConnections)) 564 | for _, 
tcp := range currentConnections { 565 | assert.Equal(uint32(caretta.TcpConnectionAcceptState), tcp.State) 566 | break 567 | } 568 | } 569 | 570 | func TestConnectionState_UnknownRole(t *testing.T) { 571 | assert := assert.New(t) 572 | 573 | // Arrange mock map, initial connection 574 | m := NewMockConnectionsMap() 575 | 576 | conn1 := caretta.ConnectionIdentifier{ 577 | Id: 1, 578 | Pid: 1, 579 | Tuple: serverTuple, 580 | Role: caretta.UnknownConnectionRole, 581 | } 582 | throughput1 := activeThroughput 583 | 584 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil) 585 | 586 | pastLinks := make(map[caretta.NetworkLink]uint64) 587 | 588 | // Act 589 | m.Update(conn1, throughput1) 590 | _, _, currentConnections := tracer.TracesPollingIteration(pastLinks) 591 | 592 | // Assert 593 | assert.Equal(0, len(currentConnections)) 594 | } 595 | -------------------------------------------------------------------------------- /pkg/caretta/types.go: -------------------------------------------------------------------------------- 1 | package caretta 2 | 3 | import ( 4 | "encoding/binary" 5 | "net" 6 | 7 | caretta_k8s "github.com/groundcover-com/caretta/pkg/k8s" 8 | ) 9 | 10 | const ( 11 | UnknownConnectionRole = iota 12 | ClientConnectionRole = iota 13 | ServerConnectionRole = iota 14 | TcpConnectionOpenState = iota 15 | TcpConnectionAcceptState = iota 16 | TcpConnectionClosedState = iota 17 | ) 18 | 19 | type IP uint32 20 | 21 | func (ip IP) String() string { 22 | netIp := make(net.IP, 4) 23 | binary.LittleEndian.PutUint32(netIp, uint32(ip)) 24 | return netIp.String() 25 | } 26 | 27 | // "final" type of link, like an edge on the graph 28 | type NetworkLink struct { 29 | Client caretta_k8s.Workload 30 | Server caretta_k8s.Workload 31 | ServerPort uint16 32 | Role uint32 33 | } 34 | 35 | type TcpConnection struct { 36 | Client caretta_k8s.Workload 37 | Server caretta_k8s.Workload 38 | ServerPort uint16 39 | Role uint32 40 | State uint32 41 | } 42 | 43 | type ConnectionTuple struct { 44 | SrcIp uint32 45 | DstIp uint32 46 | SrcPort uint16 47 | DstPort uint16 48 | } 49 | 50 | type ConnectionIdentifier struct { 51 | Id uint32 52 | Pid uint32 53 | Tuple ConnectionTuple 54 | Role uint32 55 | } 56 | 57 | type ConnectionThroughputStats struct { 58 | BytesSent uint64 59 | BytesReceived uint64 60 | IsActive uint64 61 | } 62 | -------------------------------------------------------------------------------- /pkg/k8s/ipresolver.go: -------------------------------------------------------------------------------- 1 | package k8s 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "log" 8 | "net" 9 | "sync" 10 | "time" 11 | 12 | "k8s.io/apimachinery/pkg/watch" 13 | "k8s.io/client-go/kubernetes" 14 | 15 | lrucache "github.com/hashicorp/golang-lru/v2" 16 | "github.com/prometheus/client_golang/prometheus" 17 | "github.com/prometheus/client_golang/prometheus/promauto" 18 | appsv1 "k8s.io/api/apps/v1" 19 | batchv1 "k8s.io/api/batch/v1" 20 | "k8s.io/api/batch/v1beta1" 21 | v1 "k8s.io/api/core/v1" 22 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 23 | ) 24 | 25 | const MAX_RESOLVED_DNS = 10000 // arbitrary limit 26 | var reregisterWatchSleepDuration = 1 * time.Second 27 | 28 | var ( 29 | watchEventsCounter = promauto.NewCounterVec(prometheus.CounterOpts{ 30 | Name: "caretta_watcher_events_count", 31 | }, []string{"object_type"}) 32 | watchResetsCounter = promauto.NewCounterVec(prometheus.CounterOpts{ 33 | Name: "caretta_watcher_resets_count", 34 | }, []string{"object_type"}) 35 | ) 36 | 37 | type 
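// clusterSnapshot below keeps one sync.Map per watched resource kind, with
// values stored as concrete structs, so readers type-assert on Load. A
// minimal access sketch, assuming a pod UID already present in the map:
//
//   if val, ok := resolver.snapshot.Pods.Load(uid); ok {
//       if pod, ok := val.(v1.Pod); ok { // stored by value in handlePodWatchEvent
//           _ = pod.Status.PodIPs
//       }
//   }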
clusterSnapshot struct { 38 | Pods sync.Map // map[types.UID]v1.Pod 39 | Nodes sync.Map // map[types.UID]v1.Node 40 | ReplicaSets sync.Map // map[types.UID]appsv1.ReplicaSet 41 | DaemonSets sync.Map // map[types.UID]appsv1.DaemonSet 42 | StatefulSets sync.Map // map[types.UID]appsv1.StatefulSet 43 | Jobs sync.Map // map[types.UID]batchv1.Job 44 | Services sync.Map // map[types.UID]v1.Service 45 | Deployments sync.Map // map[types.UID]appsv1.Deployment 46 | CronJobs sync.Map // map[types.UID]batchv1.CronJob or batchv1beta.CronJob 47 | PodDescriptors sync.Map // map[types.UID]Workload 48 | } 49 | 50 | type K8sIPResolver struct { 51 | clientset kubernetes.Interface 52 | snapshot clusterSnapshot 53 | ipsMap sync.Map 54 | stopSignal chan bool 55 | shouldResolveDns bool 56 | traverseUpHierarchy bool 57 | dnsResolvedIps *lrucache.Cache[string, string] 58 | } 59 | 60 | type Workload struct { 61 | Name string 62 | Namespace string 63 | Kind string 64 | Owner string 65 | } 66 | 67 | func NewK8sIPResolver(clientset kubernetes.Interface, resolveDns bool, traverseUpHierarchy bool) (*K8sIPResolver, error) { 68 | var dnsCache *lrucache.Cache[string, string] 69 | if resolveDns { 70 | var err error 71 | dnsCache, err = lrucache.New[string, string](MAX_RESOLVED_DNS) 72 | if err != nil { 73 | return nil, err 74 | } 75 | } else { 76 | dnsCache = nil 77 | } 78 | return &K8sIPResolver{ 79 | clientset: clientset, 80 | snapshot: clusterSnapshot{}, 81 | ipsMap: sync.Map{}, 82 | stopSignal: make(chan bool), 83 | shouldResolveDns: resolveDns, 84 | dnsResolvedIps: dnsCache, 85 | traverseUpHierarchy: traverseUpHierarchy, 86 | }, nil 87 | } 88 | 89 | // resolve the given IP from the resolver's cache 90 | // if not available, return the IP itself. 91 | func (resolver *K8sIPResolver) ResolveIP(ip string) Workload { 92 | if val, ok := resolver.ipsMap.Load(ip); ok { 93 | entry, ok := val.(Workload) 94 | if ok { 95 | return entry 96 | } 97 | log.Printf("type confusion in ipsMap") 98 | } 99 | host := ip 100 | 101 | if resolver.shouldResolveDns { 102 | val, ok := resolver.dnsResolvedIps.Get(ip) 103 | if ok { 104 | host = val 105 | } else { 106 | hosts, err := net.LookupAddr(ip) 107 | if err == nil && len(hosts) > 0 { 108 | host = hosts[0] 109 | } 110 | resolver.dnsResolvedIps.Add(ip, host) 111 | } 112 | } 113 | return Workload{ 114 | Name: host, 115 | Namespace: "external", 116 | Kind: "external", 117 | } 118 | } 119 | 120 | func (resolver *K8sIPResolver) StartWatching() error { 121 | // register watchers 122 | podsWatcher, err := resolver.clientset.CoreV1().Pods("").Watch(context.Background(), metav1.ListOptions{}) 123 | if err != nil { 124 | return fmt.Errorf("error watching pods changes - %v", err) 125 | } 126 | 127 | nodesWatcher, err := resolver.clientset.CoreV1().Nodes().Watch(context.Background(), metav1.ListOptions{}) 128 | if err != nil { 129 | return fmt.Errorf("error watching nodes changes - %v", err) 130 | } 131 | 132 | replicasetsWatcher, err := resolver.clientset.AppsV1().ReplicaSets("").Watch(context.Background(), metav1.ListOptions{}) 133 | if err != nil { 134 | return fmt.Errorf("error watching replicasets changes - %v", err) 135 | } 136 | 137 | daemonsetsWatcher, err := resolver.clientset.AppsV1().DaemonSets("").Watch(context.Background(), metav1.ListOptions{}) 138 | if err != nil { 139 | return fmt.Errorf("error watching daemonsets changes - %v", err) 140 | } 141 | 142 | statefulsetsWatcher, err := resolver.clientset.AppsV1().StatefulSets("").Watch(context.Background(), metav1.ListOptions{}) 143 | if err != 
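// (ResolveIP above resolves in three steps: the ipsMap cache built from
// cluster state, then the DNS LRU cache when RESOLVE_DNS is enabled, and
// finally the raw IP itself; anything unmatched is reported as
// Workload{Name: host, Namespace: "external", Kind: "external"}.)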
nil { 144 | return fmt.Errorf("error watching statefulsets changes - %v", err) 145 | } 146 | 147 | jobsWatcher, err := resolver.clientset.BatchV1().Jobs("").Watch(context.Background(), metav1.ListOptions{}) 148 | if err != nil { 149 | return fmt.Errorf("error watching jobs changes - %v", err) 150 | } 151 | 152 | servicesWatcher, err := resolver.clientset.CoreV1().Services("").Watch(context.Background(), metav1.ListOptions{}) 153 | if err != nil { 154 | return fmt.Errorf("error watching services changes - %v", err) 155 | } 156 | 157 | deploymentsWatcher, err := resolver.clientset.AppsV1().Deployments("").Watch(context.Background(), metav1.ListOptions{}) 158 | if err != nil { 159 | return fmt.Errorf("error watching deployments changes - %v", err) 160 | } 161 | 162 | cronJobsWatcher, err := resolver.startCronjobWatcher() 163 | if err != nil { 164 | return fmt.Errorf("error watching cronjobs changes - %v", err) 165 | } 166 | 167 | // invoke a watching function 168 | go func() { 169 | for { 170 | select { 171 | case <-resolver.stopSignal: 172 | podsWatcher.Stop() 173 | nodesWatcher.Stop() 174 | replicasetsWatcher.Stop() 175 | daemonsetsWatcher.Stop() 176 | statefulsetsWatcher.Stop() 177 | jobsWatcher.Stop() 178 | servicesWatcher.Stop() 179 | deploymentsWatcher.Stop() 180 | cronJobsWatcher.Stop() 181 | return 182 | case podEvent, ok := <-podsWatcher.ResultChan(): 183 | { 184 | if !ok { 185 | watchResetsCounter.WithLabelValues("pod").Inc() 186 | podsWatcher, err = resolver.clientset.CoreV1().Pods("").Watch(context.Background(), metav1.ListOptions{}) 187 | if err != nil { 188 | time.Sleep(reregisterWatchSleepDuration) 189 | } 190 | continue 191 | } 192 | watchEventsCounter.WithLabelValues("pod").Inc() 193 | resolver.handlePodWatchEvent(&podEvent) 194 | } 195 | case nodeEvent, ok := <-nodesWatcher.ResultChan(): 196 | { 197 | if !ok { 198 | watchResetsCounter.WithLabelValues("node").Inc() 199 | nodesWatcher, err = resolver.clientset.CoreV1().Nodes().Watch(context.Background(), metav1.ListOptions{}) 200 | if err != nil { 201 | time.Sleep(reregisterWatchSleepDuration) 202 | } 203 | continue 204 | } 205 | watchEventsCounter.WithLabelValues("node").Inc() 206 | resolver.handleNodeWatchEvent(&nodeEvent) 207 | } 208 | case replicasetsEvent, ok := <-replicasetsWatcher.ResultChan(): 209 | { 210 | if !ok { 211 | watchResetsCounter.WithLabelValues("replicaset").Inc() 212 | replicasetsWatcher, err = resolver.clientset.AppsV1().ReplicaSets("").Watch(context.Background(), metav1.ListOptions{}) 213 | if err != nil { 214 | time.Sleep(reregisterWatchSleepDuration) 215 | } 216 | continue 217 | } 218 | watchEventsCounter.WithLabelValues("replicaset").Inc() 219 | resolver.handleReplicaSetWatchEvent(&replicasetsEvent) 220 | } 221 | case daemonsetsEvent, ok := <-daemonsetsWatcher.ResultChan(): 222 | { 223 | if !ok { 224 | watchResetsCounter.WithLabelValues("daemonset").Inc() 225 | daemonsetsWatcher, err = resolver.clientset.AppsV1().DaemonSets("").Watch(context.Background(), metav1.ListOptions{}) 226 | if err != nil { 227 | time.Sleep(reregisterWatchSleepDuration) 228 | } 229 | continue 230 | } 231 | watchEventsCounter.WithLabelValues("daemonset").Inc() 232 | resolver.handleDaemonSetWatchEvent(&daemonsetsEvent) 233 | } 234 | case statefulsetsEvent, ok := <-statefulsetsWatcher.ResultChan(): 235 | { 236 | if !ok { 237 | watchResetsCounter.WithLabelValues("statefulset").Inc() 238 | statefulsetsWatcher, err = resolver.clientset.AppsV1().StatefulSets("").Watch(context.Background(), metav1.ListOptions{}) 239 | if err != nil { 
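// Every branch of this select follows the recovery pattern shown here for
// statefulsets: a closed result channel bumps the reset counter, the watch
// is re-registered, and a failed re-registration sleeps
// reregisterWatchSleepDuration before the next attempt. In condensed form
// (a sketch only, with newWatch standing in for the per-resource Watch call):
//
//   case ev, ok := <-w.ResultChan():
//       if !ok {
//           if w, err = newWatch(); err != nil {
//               time.Sleep(reregisterWatchSleepDuration)
//           }
//           continue
//       }
//       handle(&ev)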
240 | time.Sleep(reregisterWatchSleepDuration) 241 | } 242 | continue 243 | } 244 | watchEventsCounter.WithLabelValues("statefulset").Inc() 245 | resolver.handleStatefulSetWatchEvent(&statefulsetsEvent) 246 | } 247 | case jobsEvent, ok := <-jobsWatcher.ResultChan(): 248 | { 249 | if !ok { 250 | watchResetsCounter.WithLabelValues("job").Inc() 251 | jobsWatcher, err = resolver.clientset.BatchV1().Jobs("").Watch(context.Background(), metav1.ListOptions{}) 252 | if err != nil { 253 | time.Sleep(reregisterWatchSleepDuration) 254 | } 255 | continue 256 | } 257 | watchEventsCounter.WithLabelValues("job").Inc() 258 | resolver.handleJobsWatchEvent(&jobsEvent) 259 | } 260 | case servicesEvent, ok := <-servicesWatcher.ResultChan(): 261 | { 262 | if !ok { 263 | watchResetsCounter.WithLabelValues("service").Inc() 264 | servicesWatcher, err = resolver.clientset.CoreV1().Services("").Watch(context.Background(), metav1.ListOptions{}) 265 | if err != nil { 266 | time.Sleep(reregisterWatchSleepDuration) 267 | } 268 | continue 269 | } 270 | watchEventsCounter.WithLabelValues("service").Inc() 271 | resolver.handleServicesWatchEvent(&servicesEvent) 272 | } 273 | case deploymentsEvent, ok := <-deploymentsWatcher.ResultChan(): 274 | { 275 | if !ok { 276 | watchResetsCounter.WithLabelValues("deployment").Inc() 277 | deploymentsWatcher, err = resolver.clientset.AppsV1().Deployments("").Watch(context.Background(), metav1.ListOptions{}) 278 | if err != nil { 279 | time.Sleep(reregisterWatchSleepDuration) 280 | } 281 | continue 282 | } 283 | watchEventsCounter.WithLabelValues("deployment").Inc() 284 | resolver.handleDeploymentsWatchEvent(&deploymentsEvent) 285 | } 286 | case cronjobsEvent, ok := <-cronJobsWatcher.ResultChan(): 287 | { 288 | if !ok { 289 | watchResetsCounter.WithLabelValues("cronjob").Inc() 290 | cronJobsWatcher, err = resolver.startCronjobWatcher() 291 | if err != nil { 292 | time.Sleep(reregisterWatchSleepDuration) 293 | } 294 | continue 295 | } 296 | watchEventsCounter.WithLabelValues("cronjob").Inc() 297 | resolver.handleCronJobsWatchEvent(&cronjobsEvent) 298 | } 299 | } 300 | } 301 | }() 302 | 303 | // get initial state 304 | err = resolver.getResolvedClusterSnapshot() 305 | if err != nil { 306 | resolver.StopWatching() 307 | return fmt.Errorf("error retrieving cluster's initial state: %v", err) 308 | } 309 | 310 | return nil 311 | } 312 | 313 | func (resolver *K8sIPResolver) startCronjobWatcher() (watch.Interface, error) { 314 | cronJobsWatcher, err := resolver.clientset.BatchV1().CronJobs("").Watch(context.Background(), metav1.ListOptions{}) 315 | if err != nil { 316 | return resolver.clientset.BatchV1beta1().CronJobs("").Watch(context.Background(), metav1.ListOptions{}) 317 | } 318 | 319 | return cronJobsWatcher, nil 320 | } 321 | 322 | func (resolver *K8sIPResolver) StopWatching() { 323 | resolver.stopSignal <- true 324 | } 325 | 326 | func (resolver *K8sIPResolver) handlePodWatchEvent(podEvent *watch.Event) { 327 | switch podEvent.Type { 328 | case watch.Added: 329 | pod, ok := podEvent.Object.(*v1.Pod) 330 | if !ok { 331 | return 332 | } 333 | resolver.snapshot.Pods.Store(pod.UID, *pod) 334 | entry := resolver.resolvePodDescriptor(pod) 335 | for _, podIp := range pod.Status.PodIPs { 336 | resolver.storeWorkloadsIP(podIp.IP, &entry) 337 | } 338 | case watch.Modified: 339 | pod, ok := podEvent.Object.(*v1.Pod) 340 | if !ok { 341 | return 342 | } 343 | resolver.snapshot.Pods.Store(pod.UID, *pod) 344 | entry := resolver.resolvePodDescriptor(pod) 345 | for _, podIp := range pod.Status.PodIPs { 
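// (Added and Modified pod events are handled identically: both refresh the
// stored pod and re-map each of its IPs to the resolved workload descriptor,
// while Deleted only drops the Pods and PodDescriptors snapshot entries.)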
346 | resolver.storeWorkloadsIP(podIp.IP, &entry) 347 | } 348 | case watch.Deleted: 349 | if val, ok := podEvent.Object.(*v1.Pod); ok { 350 | resolver.snapshot.Pods.Delete(val.UID) 351 | resolver.snapshot.PodDescriptors.Delete(val.UID) 352 | } 353 | } 354 | } 355 | 356 | func (resolver *K8sIPResolver) handleNodeWatchEvent(nodeEvent *watch.Event) { 357 | switch nodeEvent.Type { 358 | case watch.Added, watch.Modified: 359 | node, ok := nodeEvent.Object.(*v1.Node) 360 | if !ok { 361 | return 362 | } 363 | resolver.snapshot.Nodes.Store(node.UID, *node) 364 | for _, nodeAddress := range node.Status.Addresses { 365 | resolver.storeWorkloadsIP(nodeAddress.Address, &Workload{ 366 | Name: node.Name, 367 | Namespace: "node", 368 | Kind: "node", 369 | }) 370 | } 371 | case watch.Deleted: 372 | if val, ok := nodeEvent.Object.(*v1.Node); ok { 373 | resolver.snapshot.Nodes.Delete(val.UID) 374 | } 375 | } 376 | } 377 | 378 | func (resolver *K8sIPResolver) handleReplicaSetWatchEvent(replicasetsEvent *watch.Event) { 379 | switch replicasetsEvent.Type { 380 | case watch.Added: 381 | if val, ok := replicasetsEvent.Object.(*appsv1.ReplicaSet); ok { 382 | resolver.snapshot.ReplicaSets.Store(val.UID, *val) 383 | } 384 | case watch.Deleted: 385 | if val, ok := replicasetsEvent.Object.(*appsv1.ReplicaSet); ok { 386 | resolver.snapshot.ReplicaSets.Delete(val.UID) 387 | } 388 | } 389 | } 390 | 391 | func (resolver *K8sIPResolver) handleDaemonSetWatchEvent(daemonsetsEvent *watch.Event) { 392 | switch daemonsetsEvent.Type { 393 | case watch.Added: 394 | if val, ok := daemonsetsEvent.Object.(*appsv1.DaemonSet); ok { 395 | resolver.snapshot.DaemonSets.Store(val.UID, *val) 396 | } 397 | case watch.Deleted: 398 | if val, ok := daemonsetsEvent.Object.(*appsv1.DaemonSet); ok { 399 | resolver.snapshot.DaemonSets.Delete(val.UID) 400 | } 401 | } 402 | } 403 | 404 | func (resolver *K8sIPResolver) handleStatefulSetWatchEvent(statefulsetsEvent *watch.Event) { 405 | switch statefulsetsEvent.Type { 406 | case watch.Added: 407 | if val, ok := statefulsetsEvent.Object.(*appsv1.StatefulSet); ok { 408 | resolver.snapshot.StatefulSets.Store(val.UID, *val) 409 | } 410 | case watch.Deleted: 411 | if val, ok := statefulsetsEvent.Object.(*appsv1.StatefulSet); ok { 412 | resolver.snapshot.StatefulSets.Delete(val.UID) 413 | } 414 | } 415 | } 416 | 417 | func (resolver *K8sIPResolver) handleJobsWatchEvent(jobsEvent *watch.Event) { 418 | switch jobsEvent.Type { 419 | case watch.Added: 420 | if val, ok := jobsEvent.Object.(*batchv1.Job); ok { 421 | resolver.snapshot.Jobs.Store(val.UID, *val) 422 | } 423 | case watch.Deleted: 424 | if val, ok := jobsEvent.Object.(*batchv1.Job); ok { 425 | resolver.snapshot.Jobs.Delete(val.UID) 426 | } 427 | } 428 | } 429 | 430 | func (resolver *K8sIPResolver) handleServicesWatchEvent(servicesEvent *watch.Event) { 431 | switch servicesEvent.Type { 432 | case watch.Added, watch.Modified: 433 | service, ok := servicesEvent.Object.(*v1.Service) 434 | if !ok { 435 | return 436 | } 437 | resolver.snapshot.Services.Store(service.UID, *service) 438 | 439 | // services has (potentially multiple) ClusterIP 440 | workload := Workload{ 441 | Name: service.Name, 442 | Namespace: service.Namespace, 443 | Kind: "Service", 444 | } 445 | 446 | // TODO maybe try to match service to workload 447 | for _, clusterIp := range service.Spec.ClusterIPs { 448 | if clusterIp != "None" { 449 | _, ok := resolver.ipsMap.Load(clusterIp) 450 | if !ok { 451 | resolver.storeWorkloadsIP(clusterIp, &workload) 452 | } 453 | } 454 | } 455 | case 
watch.Deleted: 456 | if val, ok := servicesEvent.Object.(*v1.Service); ok { 457 | resolver.snapshot.Services.Delete(val.UID) 458 | } 459 | } 460 | } 461 | 462 | func (resolver *K8sIPResolver) handleDeploymentsWatchEvent(deploymentsEvent *watch.Event) { 463 | switch deploymentsEvent.Type { 464 | case watch.Added: 465 | if val, ok := deploymentsEvent.Object.(*appsv1.Deployment); ok { 466 | resolver.snapshot.Deployments.Store(val.UID, *val) 467 | } 468 | case watch.Deleted: 469 | if val, ok := deploymentsEvent.Object.(*appsv1.Deployment); ok { 470 | resolver.snapshot.Deployments.Delete(val.UID) 471 | } 472 | } 473 | } 474 | 475 | func (resolver *K8sIPResolver) handleCronJobsWatchEvent(cronjobsEvent *watch.Event) { 476 | switch cronjobsEvent.Type { 477 | case watch.Added: 478 | if val, ok := cronjobsEvent.Object.(*batchv1.CronJob); ok { 479 | resolver.snapshot.CronJobs.Store(val.UID, *val) 480 | } 481 | if val, ok := cronjobsEvent.Object.(*v1beta1.CronJob); ok { 482 | resolver.snapshot.CronJobs.Store(val.UID, *val) 483 | } 484 | 485 | case watch.Deleted: 486 | if val, ok := cronjobsEvent.Object.(*batchv1.CronJob); ok { 487 | resolver.snapshot.CronJobs.Delete(val.UID) 488 | } 489 | if val, ok := cronjobsEvent.Object.(*v1beta1.CronJob); ok { 490 | resolver.snapshot.CronJobs.Delete(val.UID) 491 | } 492 | } 493 | } 494 | 495 | func (resolver *K8sIPResolver) getResolvedClusterSnapshot() error { 496 | err := resolver.getFullClusterSnapshot() 497 | if err != nil { 498 | return err 499 | } 500 | resolver.updateIpMapping() 501 | return nil 502 | } 503 | 504 | // iterate the API for initial coverage of the cluster's state 505 | func (resolver *K8sIPResolver) getFullClusterSnapshot() error { 506 | pods, err := resolver.clientset.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{}) 507 | if err != nil { 508 | return errors.New("error getting pods, aborting snapshot update") 509 | } 510 | for _, pod := range pods.Items { 511 | resolver.snapshot.Pods.Store(pod.UID, pod) 512 | } 513 | 514 | nodes, err := resolver.clientset.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{}) 515 | if err != nil { 516 | return errors.New("error getting nodes, aborting snapshot update") 517 | } 518 | for _, node := range nodes.Items { 519 | resolver.snapshot.Nodes.Store(node.UID, node) 520 | } 521 | 522 | replicasets, err := resolver.clientset.AppsV1().ReplicaSets("").List(context.Background(), metav1.ListOptions{}) 523 | if err != nil { 524 | return errors.New("error getting replicasets, aborting snapshot update") 525 | } 526 | for _, rs := range replicasets.Items { 527 | resolver.snapshot.ReplicaSets.Store(rs.ObjectMeta.UID, rs) 528 | } 529 | 530 | daemonsets, err := resolver.clientset.AppsV1().DaemonSets("").List(context.Background(), metav1.ListOptions{}) 531 | if err != nil { 532 | return errors.New("error getting daemonsets, aborting snapshot update") 533 | } 534 | for _, ds := range daemonsets.Items { 535 | resolver.snapshot.DaemonSets.Store(ds.ObjectMeta.UID, ds) 536 | } 537 | 538 | statefulsets, err := resolver.clientset.AppsV1().StatefulSets("").List(context.Background(), metav1.ListOptions{}) 539 | if err != nil { 540 | return errors.New("error getting statefulsets, aborting snapshot update") 541 | } 542 | for _, ss := range statefulsets.Items { 543 | resolver.snapshot.StatefulSets.Store(ss.ObjectMeta.UID, ss) 544 | } 545 | 546 | jobs, err := resolver.clientset.BatchV1().Jobs("").List(context.Background(), metav1.ListOptions{}) 547 | if err != nil { 548 | return errors.New("error getting 
jobs, aborting snapshot update") 549 | } 550 | for _, job := range jobs.Items { 551 | resolver.snapshot.Jobs.Store(job.ObjectMeta.UID, job) 552 | } 553 | 554 | services, err := resolver.clientset.CoreV1().Services("").List(context.Background(), metav1.ListOptions{}) 555 | if err != nil { 556 | return errors.New("error getting services, aborting snapshot update") 557 | } 558 | for _, service := range services.Items { 559 | resolver.snapshot.Services.Store(service.UID, service) 560 | } 561 | 562 | deployments, err := resolver.clientset.AppsV1().Deployments("").List(context.Background(), metav1.ListOptions{}) 563 | if err != nil { 564 | return errors.New("error getting deployments, aborting snapshot update") 565 | } 566 | for _, deployment := range deployments.Items { 567 | resolver.snapshot.Deployments.Store(deployment.UID, deployment) 568 | } 569 | 570 | cronJobs, err := resolver.clientset.BatchV1().CronJobs("").List(context.Background(), metav1.ListOptions{}) 571 | if err != nil { 572 | cronJobs, err := resolver.clientset.BatchV1beta1().CronJobs("").List(context.Background(), metav1.ListOptions{}) 573 | if err != nil { 574 | return errors.New("error getting cronjobs, aborting snapshot update") 575 | } 576 | for _, cronJob := range cronJobs.Items { 577 | resolver.snapshot.CronJobs.Store(cronJob.UID, cronJob) 578 | } 579 | } 580 | for _, cronJob := range cronJobs.Items { 581 | resolver.snapshot.CronJobs.Store(cronJob.UID, cronJob) 582 | } 583 | 584 | return nil 585 | } 586 | 587 | // add mapping from ip to resolved host to an existing map, 588 | // based on the given cluster snapshot 589 | func (resolver *K8sIPResolver) updateIpMapping() { 590 | // because IP collisions may occur and lead to overwrites in the map, the order is important 591 | // we go from less "favorable" to more "favorable" - 592 | // services -> running pods -> nodes 593 | 594 | resolver.snapshot.Services.Range(func(key any, val any) bool { 595 | service, ok := val.(v1.Service) 596 | if !ok { 597 | log.Printf("Type confusion in services map") 598 | return true // continue 599 | } 600 | // services has (potentially multiple) ClusterIP 601 | workload := Workload{ 602 | Name: service.Name, 603 | Namespace: service.Namespace, 604 | Kind: "Service", 605 | } 606 | 607 | // TODO maybe try to match service to workload 608 | for _, clusterIp := range service.Spec.ClusterIPs { 609 | if clusterIp != "None" { 610 | resolver.storeWorkloadsIP(clusterIp, &workload) 611 | } 612 | } 613 | return true 614 | }) 615 | 616 | resolver.snapshot.Pods.Range(func(key, value any) bool { 617 | pod, ok := value.(v1.Pod) 618 | if !ok { 619 | log.Printf("Type confusion in pods map") 620 | return true // continue 621 | } 622 | entry := resolver.resolvePodDescriptor(&pod) 623 | for _, podIp := range pod.Status.PodIPs { 624 | // if ip is already in the map, override only if current pod is running 625 | resolver.storeWorkloadsIP(podIp.IP, &entry) 626 | } 627 | return true 628 | }) 629 | 630 | resolver.snapshot.Nodes.Range(func(key any, value any) bool { 631 | node, ok := value.(v1.Node) 632 | if !ok { 633 | log.Printf("Type confusion in nodes map") 634 | return true // continue 635 | } 636 | for _, nodeAddress := range node.Status.Addresses { 637 | workload := Workload{ 638 | Name: node.Name, 639 | Namespace: "node", 640 | Kind: "node", 641 | } 642 | resolver.storeWorkloadsIP(nodeAddress.Address, &workload) 643 | } 644 | return true 645 | }) 646 | } 647 | 648 | func (resolver *K8sIPResolver) storeWorkloadsIP(ip string, newWorkload *Workload) { 649 | // we 
want to override existing workload, unless the existing workload is a node and the new one isn't 650 | val, ok := resolver.ipsMap.Load(ip) 651 | if ok { 652 | existingWorkload, ok := val.(Workload) 653 | if ok { 654 | if existingWorkload.Kind == "node" && newWorkload.Kind != "node" { 655 | return 656 | } 657 | } 658 | } 659 | resolver.ipsMap.Store(ip, *newWorkload) 660 | } 661 | 662 | // an ugly function to go up one level in hierarchy. maybe there's a better way to do it 663 | // the snapshot is maintained to avoid using an API request for each resolving 664 | func (resolver *K8sIPResolver) getControllerOfOwner(originalOwner *metav1.OwnerReference) (*metav1.OwnerReference, error) { 665 | switch originalOwner.Kind { 666 | case "ReplicaSet": 667 | replicaSetVal, ok := resolver.snapshot.ReplicaSets.Load(originalOwner.UID) 668 | if !ok { 669 | return nil, errors.New("Missing replicaset for UID " + string(originalOwner.UID)) 670 | } 671 | replicaSet, ok := replicaSetVal.(appsv1.ReplicaSet) 672 | if !ok { 673 | return nil, errors.New("type confusion in replicasets map") 674 | } 675 | return metav1.GetControllerOf(&replicaSet), nil 676 | case "DaemonSet": 677 | daemonSetVal, ok := resolver.snapshot.DaemonSets.Load(originalOwner.UID) 678 | if !ok { 679 | return nil, errors.New("Missing daemonset for UID " + string(originalOwner.UID)) 680 | } 681 | daemonSet, ok := daemonSetVal.(appsv1.DaemonSet) 682 | if !ok { 683 | return nil, errors.New("type confusion in daemonsets map") 684 | } 685 | return metav1.GetControllerOf(&daemonSet), nil 686 | case "StatefulSet": 687 | statefulSetVal, ok := resolver.snapshot.StatefulSets.Load(originalOwner.UID) 688 | if !ok { 689 | return nil, errors.New("Missing statefulset for UID " + string(originalOwner.UID)) 690 | } 691 | statefulSet, ok := statefulSetVal.(appsv1.StatefulSet) 692 | if !ok { 693 | return nil, errors.New("type confusion in statefulsets map") 694 | } 695 | return metav1.GetControllerOf(&statefulSet), nil 696 | case "Job": 697 | jobVal, ok := resolver.snapshot.Jobs.Load(originalOwner.UID) 698 | if !ok { 699 | return nil, errors.New("Missing job for UID " + string(originalOwner.UID)) 700 | } 701 | job, ok := jobVal.(batchv1.Job) 702 | if !ok { 703 | return nil, errors.New("type confusion in jobs map") 704 | } 705 | return metav1.GetControllerOf(&job), nil 706 | case "Deployment": 707 | deploymentVal, ok := resolver.snapshot.Deployments.Load(originalOwner.UID) 708 | if !ok { 709 | return nil, errors.New("Missing deployment for UID " + string(originalOwner.UID)) 710 | } 711 | deployment, ok := deploymentVal.(appsv1.Deployment) 712 | if !ok { 713 | return nil, errors.New("type confusion in deployments map") 714 | } 715 | return metav1.GetControllerOf(&deployment), nil 716 | case "CronJob": 717 | cronJobVal, ok := resolver.snapshot.CronJobs.Load(originalOwner.UID) 718 | if !ok { 719 | return nil, errors.New("Missing cronjob for UID " + string(originalOwner.UID)) 720 | } 721 | cronJob, ok := cronJobVal.(batchv1.CronJob) 722 | if !ok { 723 | cronJob, ok := cronJobVal.(v1beta1.CronJob) 724 | if !ok { 725 | return nil, errors.New("type confusion in cronjobs map") 726 | } 727 | return metav1.GetControllerOf(&cronJob), nil 728 | } 729 | 730 | return metav1.GetControllerOf(&cronJob), nil 731 | } 732 | return nil, errors.New("Unsupported kind for lookup - " + originalOwner.Kind) 733 | } 734 | 735 | func (resolver *K8sIPResolver) resolvePodDescriptor(pod *v1.Pod) Workload { 736 | existing, ok := resolver.snapshot.PodDescriptors.Load(pod.UID) 737 | if ok { 738 | 
result, ok := existing.(Workload) 739 | if ok { 740 | return result 741 | } 742 | } 743 | var err error 744 | name := pod.Name 745 | namespace := pod.Namespace 746 | kind := "pod" 747 | result := Workload{ 748 | Name: name, 749 | Namespace: namespace, 750 | Kind: kind, 751 | } 752 | 753 | if resolver.traverseUpHierarchy { 754 | owner := metav1.GetControllerOf(pod) 755 | // climbing up the owners' hierarchy. if an error occurs, we take the data we got and save 756 | // the error to know we shouldn't save this resolution to the descriptors map and retry later. 757 | for owner != nil { 758 | name = owner.Name 759 | kind = owner.Kind 760 | owner, err = resolver.getControllerOfOwner(owner) 761 | if err != nil { 762 | log.Printf("Warning: couldn't retrieve owner of %v - %v. This might happen when starting up", name, err) 763 | } 764 | } 765 | 766 | result.Name = name 767 | result.Kind = kind 768 | } else { 769 | owner := metav1.GetControllerOf(pod) 770 | if owner != nil { 771 | result.Owner = owner.Name 772 | } 773 | } 774 | 775 | if err == nil { 776 | resolver.snapshot.PodDescriptors.Store(pod.UID, result) 777 | } 778 | return result 779 | } 780 | -------------------------------------------------------------------------------- /pkg/k8s/ipresolver_test.go: -------------------------------------------------------------------------------- 1 | package k8s_test 2 | 3 | import ( 4 | "log" 5 | "testing" 6 | "time" 7 | 8 | "github.com/groundcover-com/caretta/pkg/k8s" 9 | 10 | "github.com/google/uuid" 11 | "github.com/stretchr/testify/assert" 12 | appsv1 "k8s.io/api/apps/v1" 13 | batchv1 "k8s.io/api/batch/v1" 14 | v1 "k8s.io/api/core/v1" 15 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 16 | "k8s.io/apimachinery/pkg/runtime" 17 | "k8s.io/apimachinery/pkg/types" 18 | "k8s.io/apimachinery/pkg/watch" 19 | testclient "k8s.io/client-go/kubernetes/fake" 20 | k8stesting "k8s.io/client-go/testing" 21 | ) 22 | 23 | type podDescriptor struct { 24 | Name string 25 | Namespace string 26 | IP string 27 | Phase v1.PodPhase 28 | UID types.UID 29 | Controller *workloadResourceDescriptor 30 | } 31 | 32 | type nodeDescriptor struct { 33 | Name string 34 | IP string 35 | UID types.UID 36 | } 37 | 38 | type workloadResourceDescriptor struct { 39 | Name string 40 | Namespace string 41 | UID types.UID 42 | Kind string 43 | } 44 | 45 | func (desc *workloadResourceDescriptor) CreateObject() runtime.Object { 46 | switch desc.Kind { 47 | case "Deployment": 48 | { 49 | return &appsv1.Deployment{ 50 | ObjectMeta: metav1.ObjectMeta{ 51 | Name: desc.Name, 52 | Namespace: desc.Namespace, 53 | UID: desc.UID, 54 | }, 55 | } 56 | } 57 | case "ReplicaSet": 58 | { 59 | return &appsv1.ReplicaSet{ 60 | ObjectMeta: metav1.ObjectMeta{ 61 | Name: desc.Name, 62 | Namespace: desc.Namespace, 63 | UID: desc.UID, 64 | }, 65 | } 66 | } 67 | case "DaemonSet": 68 | { 69 | return &appsv1.DaemonSet{ 70 | ObjectMeta: metav1.ObjectMeta{ 71 | Name: desc.Name, 72 | Namespace: desc.Namespace, 73 | UID: desc.UID, 74 | }, 75 | } 76 | } 77 | case "StatefulSet": 78 | { 79 | return &appsv1.StatefulSet{ 80 | ObjectMeta: metav1.ObjectMeta{ 81 | Name: desc.Name, 82 | Namespace: desc.Namespace, 83 | UID: desc.UID, 84 | }, 85 | } 86 | } 87 | case "Job": 88 | { 89 | return &batchv1.Job{ 90 | ObjectMeta: metav1.ObjectMeta{ 91 | Name: desc.Name, 92 | Namespace: desc.Namespace, 93 | UID: desc.UID, 94 | }, 95 | } 96 | } 97 | case "Service": 98 | { 99 | return &v1.Service{ 100 | ObjectMeta: metav1.ObjectMeta{ 101 | Name: desc.Name, 102 | Namespace: desc.Namespace, 103 | 
UID: desc.UID, 104 | }, 105 | } 106 | } 107 | case "CronJob": 108 | { 109 | return &batchv1.CronJob{ 110 | ObjectMeta: metav1.ObjectMeta{ 111 | Name: desc.Name, 112 | Namespace: desc.Namespace, 113 | UID: desc.UID, 114 | }, 115 | } 116 | } 117 | } 118 | return nil 119 | } 120 | 121 | func generatePod(pod podDescriptor) runtime.Object { 122 | newPod := v1.Pod{ 123 | ObjectMeta: metav1.ObjectMeta{ 124 | Name: pod.Name, 125 | Namespace: pod.Namespace, 126 | UID: pod.UID, 127 | }, 128 | Status: v1.PodStatus{ 129 | PodIP: pod.IP, 130 | PodIPs: []v1.PodIP{ 131 | {IP: pod.IP}, 132 | }, 133 | }, 134 | } 135 | if pod.Controller != nil { 136 | newTrue := new(bool) 137 | *newTrue = true 138 | ref := metav1.OwnerReference{ 139 | Kind: pod.Controller.Kind, 140 | Name: pod.Controller.Name, 141 | UID: pod.Controller.UID, 142 | Controller: newTrue, 143 | } 144 | newPod.OwnerReferences = append(newPod.OwnerReferences, ref) 145 | } 146 | return &newPod 147 | 148 | } 149 | 150 | func generateWorkloadResource(desc workloadResourceDescriptor) runtime.Object { 151 | return desc.CreateObject() 152 | } 153 | 154 | func generateNode(node nodeDescriptor) runtime.Object { 155 | return &v1.Node{ 156 | ObjectMeta: metav1.ObjectMeta{ 157 | Name: node.Name, 158 | UID: node.UID, 159 | }, 160 | Status: v1.NodeStatus{ 161 | Addresses: []v1.NodeAddress{ 162 | { 163 | Type: "InternalIP", 164 | Address: node.IP, 165 | }, 166 | }, 167 | }, 168 | } 169 | } 170 | 171 | func generateClusterObjects(pods []podDescriptor, workloadsResources []workloadResourceDescriptor, nodes []nodeDescriptor) []runtime.Object { 172 | result := make([]runtime.Object, 0, len(pods)+len(workloadsResources)+len(nodes)) 173 | for _, pod := range pods { 174 | newPod := generatePod(pod) 175 | result = append(result, newPod) 176 | } 177 | for _, desc := range workloadsResources { 178 | result = append(result, generateWorkloadResource(desc)) 179 | } 180 | for _, node := range nodes { 181 | result = append(result, generateNode(node)) 182 | } 183 | return result 184 | } 185 | 186 | type testStep struct { 187 | shouldWait bool 188 | newPods []podDescriptor 189 | newNodes []nodeDescriptor 190 | newWorkloadResource []workloadResourceDescriptor 191 | modifiedPods []podDescriptor 192 | modifiedNodes []nodeDescriptor 193 | modifiedWorkloadResources []workloadResourceDescriptor 194 | expectedResolves map[string]k8s.Workload 195 | } 196 | 197 | type testScenario struct { 198 | description string 199 | initialState testStep 200 | shouldTraverse bool 201 | updateSteps []testStep 202 | } 203 | 204 | type fakeWatchers struct { 205 | nodesWatcher *watch.FakeWatcher 206 | podsWatcher *watch.FakeWatcher 207 | deploymentsWatcher *watch.FakeWatcher 208 | replicasetsWatcher *watch.FakeWatcher 209 | daemonsetsWatcher *watch.FakeWatcher 210 | statefulsetsWatcher *watch.FakeWatcher 211 | jobsWatcher *watch.FakeWatcher 212 | servicesWatcher *watch.FakeWatcher 213 | cronjobsWatcher *watch.FakeWatcher 214 | } 215 | 216 | func createPrependWatchers(clientset *testclient.Clientset) fakeWatchers { 217 | watchers := fakeWatchers{ 218 | nodesWatcher: watch.NewFake(), 219 | podsWatcher: watch.NewFake(), 220 | deploymentsWatcher: watch.NewFake(), 221 | replicasetsWatcher: watch.NewFake(), 222 | daemonsetsWatcher: watch.NewFake(), 223 | statefulsetsWatcher: watch.NewFake(), 224 | jobsWatcher: watch.NewFake(), 225 | servicesWatcher: watch.NewFake(), 226 | cronjobsWatcher: watch.NewFake(), 227 | } 228 | clientset.PrependWatchReactor("nodes", 
k8stesting.DefaultWatchReactor(watchers.nodesWatcher, nil)) 229 | clientset.PrependWatchReactor("pods", k8stesting.DefaultWatchReactor(watchers.podsWatcher, nil)) 230 | clientset.PrependWatchReactor("deployments", k8stesting.DefaultWatchReactor(watchers.deploymentsWatcher, nil)) 231 | clientset.PrependWatchReactor("replicasets", k8stesting.DefaultWatchReactor(watchers.replicasetsWatcher, nil)) 232 | clientset.PrependWatchReactor("daemonsets", k8stesting.DefaultWatchReactor(watchers.daemonsetsWatcher, nil)) 233 | clientset.PrependWatchReactor("statefulsets", k8stesting.DefaultWatchReactor(watchers.statefulsetsWatcher, nil)) 234 | clientset.PrependWatchReactor("jobs", k8stesting.DefaultWatchReactor(watchers.jobsWatcher, nil)) 235 | clientset.PrependWatchReactor("services", k8stesting.DefaultWatchReactor(watchers.servicesWatcher, nil)) 236 | clientset.PrependWatchReactor("cronjobs", k8stesting.DefaultWatchReactor(watchers.cronjobsWatcher, nil)) 237 | return watchers 238 | } 239 | 240 | func addObject(watchers fakeWatchers, obj runtime.Object, kind string) { 241 | switch kind { 242 | case "Pod": 243 | { 244 | watchers.podsWatcher.Add(obj) 245 | } 246 | case "node": 247 | { 248 | watchers.nodesWatcher.Add(obj) 249 | } 250 | case "Deployment": 251 | { 252 | watchers.deploymentsWatcher.Add(obj) 253 | } 254 | case "ReplicaSet": 255 | { 256 | watchers.replicasetsWatcher.Add(obj) 257 | } 258 | case "DaemonSet": 259 | { 260 | watchers.daemonsetsWatcher.Add(obj) 261 | } 262 | case "StatefulSet": 263 | { 264 | watchers.statefulsetsWatcher.Add(obj) 265 | } 266 | case "Job": 267 | { 268 | watchers.jobsWatcher.Add(obj) 269 | } 270 | case "Service": 271 | { 272 | watchers.servicesWatcher.Add(obj) 273 | } 274 | case "CronJob": 275 | { 276 | watchers.cronjobsWatcher.Add(obj) 277 | } 278 | } 279 | } 280 | 281 | func modifyObject(watchers fakeWatchers, obj runtime.Object, kind string) { 282 | switch kind { 283 | case "Pod": 284 | { 285 | watchers.podsWatcher.Modify(obj) 286 | } 287 | case "node": 288 | { 289 | watchers.nodesWatcher.Modify(obj) 290 | } 291 | case "Deployment": 292 | { 293 | watchers.deploymentsWatcher.Modify(obj) 294 | } 295 | case "ReplicaSet": 296 | { 297 | watchers.replicasetsWatcher.Modify(obj) 298 | } 299 | case "DaemonSet": 300 | { 301 | watchers.daemonsetsWatcher.Modify(obj) 302 | } 303 | case "StatefulSet": 304 | { 305 | watchers.statefulsetsWatcher.Modify(obj) 306 | } 307 | case "Job": 308 | { 309 | watchers.jobsWatcher.Modify(obj) 310 | } 311 | case "Service": 312 | { 313 | watchers.servicesWatcher.Modify(obj) 314 | } 315 | case "CronJob": 316 | { 317 | watchers.cronjobsWatcher.Modify(obj) 318 | } 319 | default: 320 | { 321 | log.Printf("unhandled kind %v", kind) 322 | } 323 | } 324 | } 325 | 326 | func runTest(t *testing.T, test testScenario) { 327 | assert := assert.New(t) 328 | // Arrange 1: mocks and initial state 329 | originalObjs := generateClusterObjects(test.initialState.newPods, test.initialState.newWorkloadResource, test.initialState.newNodes) 330 | fakeClient := testclient.NewSimpleClientset(originalObjs...) 
331 | fakeWatchers := createPrependWatchers(fakeClient) 332 | 333 | resolver, err := k8s.NewK8sIPResolver(fakeClient, false, test.shouldTraverse) 334 | assert.NoError(err) 335 | 336 | // Act 1: process initial state 337 | err = resolver.StartWatching() 338 | assert.NoError(err) 339 | 340 | // Assert 1: resolve and compare to expected, original state 341 | for ipToCheck, expectedWorkload := range test.initialState.expectedResolves { 342 | resultWorkload := resolver.ResolveIP(ipToCheck) 343 | assert.Equal(expectedWorkload, resultWorkload) 344 | } 345 | 346 | for _, step := range test.updateSteps { 347 | // Arrange 2+n: update the state via watchers 348 | for _, newPod := range step.newPods { 349 | podObj := generatePod(newPod) 350 | addObject(fakeWatchers, podObj, "Pod") 351 | } 352 | for _, newWorkloadResource := range step.newWorkloadResource { 353 | obj := generateWorkloadResource(newWorkloadResource) 354 | addObject(fakeWatchers, obj, newWorkloadResource.Kind) 355 | } 356 | for _, newNode := range step.newNodes { 357 | obj := generateNode(newNode) 358 | addObject(fakeWatchers, obj, "node") 359 | } 360 | for _, modifiedPod := range step.modifiedPods { 361 | podObj := generatePod(modifiedPod) 362 | modifyObject(fakeWatchers, podObj, "Pod") 363 | } 364 | for _, modifiedWorkloadResource := range step.newWorkloadResource { 365 | obj := generateWorkloadResource(modifiedWorkloadResource) 366 | modifyObject(fakeWatchers, obj, modifiedWorkloadResource.Kind) 367 | } 368 | for _, modifiedNode := range step.modifiedNodes { 369 | obj := generateNode(modifiedNode) 370 | modifyObject(fakeWatchers, obj, "node") 371 | } 372 | 373 | if step.shouldWait { 374 | time.Sleep(1 * time.Second) 375 | } 376 | 377 | // Act+Assert 2+n 378 | for ipToResolve, expectedWorkload := range step.expectedResolves { 379 | assert.Equal(expectedWorkload, resolver.ResolveIP(ipToResolve)) 380 | } 381 | 382 | } 383 | } 384 | 385 | var testDeployment = workloadResourceDescriptor{"deployment1", "namespaceA", types.UID(uuid.NewString()), "Deployment"} 386 | var testReplicaSet = workloadResourceDescriptor{"replicaset1", "namespaceA", types.UID(uuid.NewString()), "ReplicaSet"} 387 | var testDaemonSet = workloadResourceDescriptor{"daemonset1", "namespaceA", types.UID(uuid.NewString()), "DaemonSet"} 388 | var testStatefulSet = workloadResourceDescriptor{"statefulset1", "namespaceA", types.UID(uuid.NewString()), "StatefulSet"} 389 | var testJob = workloadResourceDescriptor{"job1", "namespaceA", types.UID(uuid.NewString()), "Job"} 390 | var testCronjob = workloadResourceDescriptor{"cronjob1", "namespaceA", types.UID(uuid.NewString()), "CronJob"} 391 | 392 | func TestResolving(t *testing.T) { 393 | var tests = []testScenario{ 394 | { 395 | description: "unsuccessful resolving result should be external", 396 | shouldTraverse: true, 397 | initialState: testStep{ 398 | shouldWait: false, 399 | newPods: []podDescriptor{ 400 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.New().String()), nil}, 401 | }, 402 | expectedResolves: map[string]k8s.Workload{ 403 | "1.1.1.2": { 404 | Name: "1.1.1.2", 405 | Namespace: "external", 406 | Kind: "external", 407 | }, 408 | }, 409 | }, 410 | }, 411 | { 412 | description: "initial snapshot 1 pod resolve to pod1", 413 | shouldTraverse: true, 414 | initialState: testStep{ 415 | shouldWait: false, 416 | newPods: []podDescriptor{ 417 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.New().String()), nil}, 418 | }, 419 | expectedResolves: map[string]k8s.Workload{ 420 | "1.1.1.1": { 
421 | Name: "pod1", 422 | Namespace: "namespaceA", 423 | Kind: "pod", 424 | }, 425 | }, 426 | }, 427 | }, 428 | { 429 | description: "initial snapshot 3 pods resolve to each pod", 430 | shouldTraverse: true, 431 | initialState: testStep{ 432 | shouldWait: false, 433 | newPods: []podDescriptor{ 434 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.New().String()), nil}, 435 | {"pod2", "namespaceA", "1.1.1.2", v1.PodRunning, types.UID(uuid.New().String()), nil}, 436 | {"pod3", "namespaceA", "1.1.1.3", v1.PodRunning, types.UID(uuid.New().String()), nil}, 437 | }, 438 | expectedResolves: map[string]k8s.Workload{ 439 | "1.1.1.1": { 440 | Name: "pod1", 441 | Namespace: "namespaceA", 442 | Kind: "pod", 443 | }, 444 | "1.1.1.2": { 445 | Name: "pod2", 446 | Namespace: "namespaceA", 447 | Kind: "pod", 448 | }, 449 | "1.1.1.3": { 450 | Name: "pod3", 451 | Namespace: "namespaceA", 452 | Kind: "pod", 453 | }, 454 | }, 455 | }, 456 | }, 457 | { 458 | description: "empty initial 1 pod added resolve to pod", 459 | shouldTraverse: true, 460 | initialState: testStep{ 461 | shouldWait: false, 462 | expectedResolves: map[string]k8s.Workload{ 463 | "1.1.1.1": { 464 | Name: "1.1.1.1", 465 | Namespace: "external", 466 | Kind: "external", 467 | }, 468 | }, 469 | }, 470 | updateSteps: []testStep{ 471 | { 472 | shouldWait: true, 473 | newPods: []podDescriptor{ 474 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.New().String()), nil}, 475 | }, 476 | expectedResolves: map[string]k8s.Workload{ 477 | "1.1.1.1": { 478 | Name: "pod1", 479 | Namespace: "namespaceA", 480 | Kind: "pod", 481 | }, 482 | }, 483 | }, 484 | }, 485 | }, 486 | { 487 | description: "empty initial 1 node added resolve to node", 488 | shouldTraverse: true, 489 | initialState: testStep{ 490 | shouldWait: false, 491 | expectedResolves: map[string]k8s.Workload{ 492 | "1.1.1.0": { 493 | Name: "1.1.1.0", 494 | Namespace: "external", 495 | Kind: "external", 496 | }, 497 | }, 498 | }, 499 | updateSteps: []testStep{ 500 | { 501 | shouldWait: true, 502 | newNodes: []nodeDescriptor{ 503 | {"Node1", "1.1.1.0", types.UID(uuid.NewString())}, 504 | }, 505 | expectedResolves: map[string]k8s.Workload{ 506 | "1.1.1.0": { 507 | Name: "Node1", 508 | Namespace: "node", 509 | Kind: "node", 510 | }, 511 | }, 512 | }, 513 | }, 514 | }, 515 | { 516 | description: "empty initial 1 node, 1 pod added resolve to each", 517 | shouldTraverse: true, 518 | initialState: testStep{ 519 | shouldWait: false, 520 | expectedResolves: map[string]k8s.Workload{ 521 | "1.1.1.0": { 522 | Name: "1.1.1.0", 523 | Namespace: "external", 524 | Kind: "external", 525 | }, 526 | }, 527 | }, 528 | updateSteps: []testStep{ 529 | { 530 | shouldWait: true, 531 | newPods: []podDescriptor{ 532 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.New().String()), nil}, 533 | }, 534 | newNodes: []nodeDescriptor{ 535 | {"Node1", "1.1.1.0", types.UID(uuid.NewString())}, 536 | }, 537 | expectedResolves: map[string]k8s.Workload{ 538 | "1.1.1.0": { 539 | Name: "Node1", 540 | Namespace: "node", 541 | Kind: "node", 542 | }, 543 | "1.1.1.1": { 544 | Name: "pod1", 545 | Namespace: "namespaceA", 546 | Kind: "pod", 547 | }, 548 | }, 549 | }, 550 | }, 551 | }, 552 | { 553 | description: "1 pod changing ip resolve both ips to pod", 554 | shouldTraverse: true, 555 | initialState: testStep{ 556 | shouldWait: false, 557 | newPods: []podDescriptor{ 558 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.New().String()), nil}, 559 | }, 560 | expectedResolves: 
map[string]k8s.Workload{ 561 | "1.1.1.1": { 562 | Name: "pod1", 563 | Namespace: "namespaceA", 564 | Kind: "pod", 565 | }, 566 | "1.1.1.2": { 567 | Name: "1.1.1.2", 568 | Namespace: "external", 569 | Kind: "external", 570 | }, 571 | }, 572 | }, 573 | updateSteps: []testStep{ 574 | { 575 | shouldWait: true, 576 | modifiedPods: []podDescriptor{ 577 | {"pod1", "namespaceA", "1.1.1.2", v1.PodRunning, types.UID(uuid.New().String()), nil}, 578 | }, 579 | expectedResolves: map[string]k8s.Workload{ 580 | "1.1.1.1": { // the resolver shouldn't delete old not-reused entries 581 | Name: "pod1", 582 | Namespace: "namespaceA", 583 | Kind: "pod", 584 | }, 585 | "1.1.1.2": { 586 | Name: "pod1", 587 | Namespace: "namespaceA", 588 | Kind: "pod", 589 | }, 590 | }, 591 | }, 592 | }, 593 | }, 594 | { 595 | description: "1 pod changing ip old ip is reused resolve reused ip to new pod", 596 | shouldTraverse: true, 597 | initialState: testStep{ 598 | shouldWait: false, 599 | newPods: []podDescriptor{ 600 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID("1"), nil}, 601 | }, 602 | expectedResolves: map[string]k8s.Workload{ 603 | "1.1.1.1": { 604 | Name: "pod1", 605 | Namespace: "namespaceA", 606 | Kind: "pod", 607 | }, 608 | "1.1.1.2": { 609 | Name: "1.1.1.2", 610 | Namespace: "external", 611 | Kind: "external", 612 | }, 613 | }, 614 | }, 615 | updateSteps: []testStep{ 616 | { 617 | shouldWait: false, 618 | modifiedPods: []podDescriptor{ 619 | {"pod1", "namespaceA", "1.1.1.2", v1.PodRunning, types.UID("1"), nil}, 620 | }, 621 | expectedResolves: map[string]k8s.Workload{}, 622 | }, 623 | { 624 | shouldWait: true, 625 | newPods: []podDescriptor{ 626 | {"pod2", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.New().String()), nil}, 627 | }, 628 | expectedResolves: map[string]k8s.Workload{ 629 | "1.1.1.1": { 630 | Name: "pod2", 631 | Namespace: "namespaceA", 632 | Kind: "pod", 633 | }, 634 | "1.1.1.2": { 635 | Name: "pod1", 636 | Namespace: "namespaceA", 637 | Kind: "pod", 638 | }, 639 | }, 640 | }, 641 | }, 642 | }, 643 | { 644 | description: "1 pod changing ip old ip is reused by node resolve ip to new node", 645 | shouldTraverse: true, 646 | initialState: testStep{ 647 | shouldWait: false, 648 | newPods: []podDescriptor{ 649 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID("1"), nil}, 650 | }, 651 | expectedResolves: map[string]k8s.Workload{ 652 | "1.1.1.1": { 653 | Name: "pod1", 654 | Namespace: "namespaceA", 655 | Kind: "pod", 656 | }, 657 | "1.1.1.2": { 658 | Name: "1.1.1.2", 659 | Namespace: "external", 660 | Kind: "external", 661 | }, 662 | }, 663 | }, 664 | updateSteps: []testStep{ 665 | { 666 | shouldWait: false, 667 | modifiedPods: []podDescriptor{ 668 | {"pod1", "namespaceA", "1.1.1.2", v1.PodRunning, types.UID("1"), nil}, 669 | }, 670 | expectedResolves: map[string]k8s.Workload{}, 671 | }, 672 | { 673 | shouldWait: true, 674 | newNodes: []nodeDescriptor{ 675 | {"Node1", "1.1.1.1", types.UID(uuid.NewString())}, 676 | }, 677 | expectedResolves: map[string]k8s.Workload{ 678 | "1.1.1.1": { 679 | Name: "Node1", 680 | Namespace: "node", 681 | Kind: "node", 682 | }, 683 | "1.1.1.2": { 684 | Name: "pod1", 685 | Namespace: "namespaceA", 686 | Kind: "pod", 687 | }, 688 | }, 689 | }, 690 | }, 691 | }, 692 | { 693 | description: "1 node changing ip resolve both ips to node", 694 | shouldTraverse: true, 695 | initialState: testStep{ 696 | shouldWait: false, 697 | newPods: []podDescriptor{}, 698 | newNodes: []nodeDescriptor{ 699 | {"Node1", "1.1.1.0", types.UID("1")}, 700 | }, 701 | 
expectedResolves: map[string]k8s.Workload{}, 702 | }, 703 | updateSteps: []testStep{ 704 | { 705 | shouldWait: true, 706 | modifiedNodes: []nodeDescriptor{ 707 | {"Node1", "1.1.2.0", types.UID("1")}, 708 | }, 709 | modifiedWorkloadResources: []workloadResourceDescriptor{}, 710 | expectedResolves: map[string]k8s.Workload{ 711 | "1.1.1.0": { // resolver isn't expected to delete old not-reused entries 712 | Name: "Node1", 713 | Namespace: "node", 714 | Kind: "node", 715 | }, 716 | "1.1.2.0": { 717 | Name: "Node1", 718 | Namespace: "node", 719 | Kind: "node", 720 | }, 721 | }, 722 | }, 723 | }, 724 | }, 725 | { 726 | description: "1 node changing ip, reused by another node resolve reused ip to new node", 727 | shouldTraverse: true, 728 | initialState: testStep{ 729 | shouldWait: true, 730 | newNodes: []nodeDescriptor{ 731 | {"Node1", "1.1.1.0", types.UID("1")}, 732 | }, 733 | expectedResolves: map[string]k8s.Workload{}, 734 | }, 735 | updateSteps: []testStep{ 736 | { 737 | shouldWait: true, 738 | modifiedNodes: []nodeDescriptor{ 739 | {"Node1", "1.1.2.0", types.UID("1")}, 740 | }, 741 | expectedResolves: map[string]k8s.Workload{}, 742 | }, 743 | { 744 | shouldWait: true, 745 | newNodes: []nodeDescriptor{ 746 | {"Node2", "1.1.1.0", types.UID("2")}, 747 | }, 748 | modifiedNodes: []nodeDescriptor{ 749 | {"Node1", "1.1.2.0", types.UID("1")}, 750 | }, 751 | expectedResolves: map[string]k8s.Workload{ 752 | "1.1.1.0": { 753 | Name: "Node2", 754 | Namespace: "node", 755 | Kind: "node", 756 | }, 757 | "1.1.2.0": { 758 | Name: "Node1", 759 | Namespace: "node", 760 | Kind: "node", 761 | }, 762 | }, 763 | }, 764 | }, 765 | }, 766 | { 767 | description: "pod with hostip wont override node", 768 | shouldTraverse: true, 769 | initialState: testStep{ 770 | shouldWait: false, 771 | newNodes: []nodeDescriptor{ 772 | {"Node1", "1.1.1.0", types.UID(uuid.NewString())}, 773 | }, 774 | expectedResolves: map[string]k8s.Workload{}, 775 | }, 776 | updateSteps: []testStep{ 777 | { 778 | shouldWait: true, 779 | newPods: []podDescriptor{ 780 | {"pod1", "namespaceA", "1.1.1.0", v1.PodRunning, types.UID(uuid.New().String()), nil}, 781 | }, 782 | expectedResolves: map[string]k8s.Workload{ 783 | "1.1.1.0": { 784 | Name: "Node1", 785 | Namespace: "node", 786 | Kind: "node", 787 | }, 788 | }, 789 | }, 790 | }, 791 | }, 792 | } 793 | for _, test := range tests { 794 | t.Run(test.description, func(t *testing.T) { 795 | runTest(t, test) 796 | }) 797 | } 798 | } 799 | 800 | func TestControllersResolving(t *testing.T) { 801 | var controllersTests = []testScenario{ 802 | { 803 | description: "initial snapshot 1 pod controlled by deployment resolve to deployment", 804 | shouldTraverse: true, 805 | initialState: testStep{ 806 | shouldWait: false, 807 | newPods: []podDescriptor{ 808 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testDeployment}, 809 | }, 810 | newWorkloadResource: []workloadResourceDescriptor{testDeployment}, 811 | expectedResolves: map[string]k8s.Workload{ 812 | "1.1.1.1": { 813 | Name: testDeployment.Name, 814 | Namespace: testDeployment.Namespace, 815 | Kind: testDeployment.Kind, 816 | }, 817 | }, 818 | }, 819 | }, 820 | { 821 | description: "initial snapshot 1 pod controlled by replicaset resolve to replicaset", 822 | shouldTraverse: true, 823 | initialState: testStep{ 824 | shouldWait: false, 825 | newPods: []podDescriptor{ 826 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testReplicaSet}, 827 | }, 828 | newWorkloadResource: 
[]workloadResourceDescriptor{testReplicaSet}, 829 | expectedResolves: map[string]k8s.Workload{ 830 | "1.1.1.1": { 831 | Name: testReplicaSet.Name, 832 | Namespace: testReplicaSet.Namespace, 833 | Kind: testReplicaSet.Kind, 834 | }, 835 | }, 836 | }, 837 | }, 838 | { 839 | description: "initial snapshot 1 pod controlled by daemonset resolve to daemonset", 840 | shouldTraverse: true, 841 | initialState: testStep{ 842 | shouldWait: false, 843 | newPods: []podDescriptor{ 844 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testDaemonSet}, 845 | }, 846 | newWorkloadResource: []workloadResourceDescriptor{testDaemonSet}, 847 | expectedResolves: map[string]k8s.Workload{ 848 | "1.1.1.1": { 849 | Name: testDaemonSet.Name, 850 | Namespace: testDaemonSet.Namespace, 851 | Kind: testDaemonSet.Kind, 852 | }, 853 | }, 854 | }, 855 | }, 856 | { 857 | description: "initial snapshot 1 pod controlled by statefulset resolve to statefulset", 858 | shouldTraverse: true, 859 | initialState: testStep{ 860 | shouldWait: false, 861 | newPods: []podDescriptor{ 862 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testStatefulSet}, 863 | }, 864 | newWorkloadResource: []workloadResourceDescriptor{testStatefulSet}, 865 | expectedResolves: map[string]k8s.Workload{ 866 | "1.1.1.1": { 867 | Name: testStatefulSet.Name, 868 | Namespace: testStatefulSet.Namespace, 869 | Kind: testStatefulSet.Kind, 870 | }, 871 | }, 872 | }, 873 | }, 874 | { 875 | description: "initial snapshot 1 pod controlled by job resolve to job", 876 | shouldTraverse: true, 877 | initialState: testStep{ 878 | shouldWait: false, 879 | newPods: []podDescriptor{ 880 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testJob}, 881 | }, 882 | newWorkloadResource: []workloadResourceDescriptor{testJob}, 883 | expectedResolves: map[string]k8s.Workload{ 884 | "1.1.1.1": { 885 | Name: testJob.Name, 886 | Namespace: testJob.Namespace, 887 | Kind: testJob.Kind, 888 | }, 889 | }, 890 | }, 891 | }, 892 | { 893 | description: "initial snapshot 1 pod controlled by cronjob resolve to cronjob", 894 | shouldTraverse: true, 895 | initialState: testStep{ 896 | shouldWait: false, 897 | newPods: []podDescriptor{ 898 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testCronjob}, 899 | }, 900 | newWorkloadResource: []workloadResourceDescriptor{testCronjob}, 901 | expectedResolves: map[string]k8s.Workload{ 902 | "1.1.1.1": { 903 | Name: testCronjob.Name, 904 | Namespace: testCronjob.Namespace, 905 | Kind: testCronjob.Kind, 906 | }, 907 | }, 908 | }, 909 | }, 910 | { 911 | description: "initial snapshot 1 pod controlled by deployment owned by deployment", 912 | shouldTraverse: false, 913 | initialState: testStep{ 914 | shouldWait: false, 915 | newPods: []podDescriptor{ 916 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testDeployment}, 917 | }, 918 | newWorkloadResource: []workloadResourceDescriptor{testDeployment}, 919 | expectedResolves: map[string]k8s.Workload{ 920 | "1.1.1.1": { 921 | Name: "pod1", 922 | Namespace: "namespaceA", 923 | Kind: "pod", 924 | Owner: testDeployment.Name, 925 | }, 926 | }, 927 | }, 928 | }, 929 | { 930 | description: "initial snapshot 1 pod controlled by replicaset owned by replicaset", 931 | shouldTraverse: false, 932 | initialState: testStep{ 933 | shouldWait: false, 934 | newPods: []podDescriptor{ 935 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), 
&testReplicaSet}, 936 | }, 937 | newWorkloadResource: []workloadResourceDescriptor{testReplicaSet}, 938 | expectedResolves: map[string]k8s.Workload{ 939 | "1.1.1.1": { 940 | Name: "pod1", 941 | Namespace: "namespaceA", 942 | Kind: "pod", 943 | Owner: testReplicaSet.Name, 944 | }, 945 | }, 946 | }, 947 | }, 948 | { 949 | description: "initial snapshot 1 pod controlled by daemonset owned by daemonset", 950 | shouldTraverse: false, 951 | initialState: testStep{ 952 | shouldWait: false, 953 | newPods: []podDescriptor{ 954 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testDaemonSet}, 955 | }, 956 | newWorkloadResource: []workloadResourceDescriptor{testDaemonSet}, 957 | expectedResolves: map[string]k8s.Workload{ 958 | "1.1.1.1": { 959 | Name: "pod1", 960 | Namespace: "namespaceA", 961 | Kind: "pod", 962 | Owner: testDaemonSet.Name, 963 | }, 964 | }, 965 | }, 966 | }, 967 | { 968 | description: "initial snapshot 1 pod controlled by statefulset owned by statefulset", 969 | shouldTraverse: false, 970 | initialState: testStep{ 971 | shouldWait: false, 972 | newPods: []podDescriptor{ 973 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testStatefulSet}, 974 | }, 975 | newWorkloadResource: []workloadResourceDescriptor{testStatefulSet}, 976 | expectedResolves: map[string]k8s.Workload{ 977 | "1.1.1.1": { 978 | Name: "pod1", 979 | Namespace: "namespaceA", 980 | Kind: "pod", 981 | Owner: testStatefulSet.Name, 982 | }, 983 | }, 984 | }, 985 | }, 986 | { 987 | description: "initial snapshot 1 pod controlled by job owned by job", 988 | shouldTraverse: false, 989 | initialState: testStep{ 990 | shouldWait: false, 991 | newPods: []podDescriptor{ 992 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testJob}, 993 | }, 994 | newWorkloadResource: []workloadResourceDescriptor{testJob}, 995 | expectedResolves: map[string]k8s.Workload{ 996 | "1.1.1.1": { 997 | Name: "pod1", 998 | Namespace: "namespaceA", 999 | Kind: "pod", 1000 | Owner: testJob.Name, 1001 | }, 1002 | }, 1003 | }, 1004 | }, 1005 | { 1006 | description: "initial snapshot 1 pod controlled by cronjob owned by cronjob", 1007 | shouldTraverse: false, 1008 | initialState: testStep{ 1009 | shouldWait: false, 1010 | newPods: []podDescriptor{ 1011 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testCronjob}, 1012 | }, 1013 | newWorkloadResource: []workloadResourceDescriptor{testCronjob}, 1014 | expectedResolves: map[string]k8s.Workload{ 1015 | "1.1.1.1": { 1016 | Name: "pod1", 1017 | Namespace: "namespaceA", 1018 | Kind: "pod", 1019 | Owner: testCronjob.Name, 1020 | }, 1021 | }, 1022 | }, 1023 | }, 1024 | } 1025 | for _, test := range controllersTests { 1026 | t.Run(test.description, func(t *testing.T) { 1027 | runTest(t, test) 1028 | }) 1029 | } 1030 | } 1031 | -------------------------------------------------------------------------------- /pkg/metrics/prometheus.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | import ( 4 | "log" 5 | "net/http" 6 | 7 | "github.com/prometheus/client_golang/prometheus/promhttp" 8 | ) 9 | 10 | func StartMetricsServer(endpoint string, port string) *http.Server { 11 | http.Handle(endpoint, promhttp.Handler()) 12 | server := &http.Server{Addr: port} 13 | go func() { 14 | err := server.ListenAndServe() 15 | if err != nil { 16 | log.Fatalf("Error starting prometheus server on port %v", port) 17 | } 18 | }() 19 | return server 20 | } 21 | 
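
A note on StartMetricsServer above: net/http's ListenAndServe always returns a non-nil error, so a graceful Shutdown() of the returned server lands in the log.Fatalf branch via http.ErrServerClosed and kills the process. The sketch below is one shutdown-tolerant variant, not caretta's actual API: the function name and the dedicated ServeMux (instead of the global http.Handle, which also panics if the endpoint is ever registered twice) are assumptions for illustration.

package metrics

import (
	"errors"
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// StartMetricsServerGraceful is a hypothetical variant of StartMetricsServer:
// it treats http.ErrServerClosed as a normal shutdown so callers can call
// server.Shutdown() without crashing the process.
func StartMetricsServerGraceful(endpoint string, port string) *http.Server {
	mux := http.NewServeMux()
	mux.Handle(endpoint, promhttp.Handler())
	server := &http.Server{Addr: port, Handler: mux}
	go func() {
		if err := server.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
			log.Fatalf("error running prometheus server on %v: %v", port, err)
		}
	}()
	return server
}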
-------------------------------------------------------------------------------- /pkg/tracing/ebpf/arm_support.h: --------------------------------------------------------------------------------
1 | #ifndef __ARM_SUPPORT_H__
2 | #define __ARM_SUPPORT_H__
3 |
4 | struct user_pt_regs {
5 |   __u64 regs[31];
6 |   __u64 sp;
7 |   __u64 pc;
8 |   __u64 pstate;
9 | };
10 |
11 | #endif
-------------------------------------------------------------------------------- /pkg/tracing/ebpf/caretta.bpf.c: --------------------------------------------------------------------------------
1 | #include "core_structures.h"
2 | #include "arm_support.h"
3 | #include <bpf_core_read.h>
4 | #include <bpf_helpers.h>
5 | #include <bpf_tracing.h>
6 | #include "ebpf_utils.h"
7 | #include "epbf_shared_types.h"
8 | #include "ebpf_internal_types.h"
9 |
10 | char __license[] SEC("license") = "Dual MIT/GPL";
11 |
12 | // internal kernel-only map to hold state for each sock observed.
13 | struct bpf_map_def SEC("maps") sock_infos = {
14 |   .type = BPF_MAP_TYPE_HASH,
15 |   .key_size = sizeof(struct sock *),
16 |   .value_size = sizeof(struct sock_info),
17 |   .max_entries = MAX_CONNECTIONS,
18 | };
19 |
20 | // the main product of the tracing - map containing all connections observed,
21 | // with metadata and throughput stats.
22 | // key is a whole identifier struct and not a single id to split the constant
23 | // and dynamic values and to resemble as closely as possible the end result in
24 | // the userspace code.
25 | struct bpf_map_def SEC("maps") connections = {
26 |   .type = BPF_MAP_TYPE_HASH,
27 |   .key_size = sizeof(struct connection_identifier),
28 |   .value_size = sizeof(struct connection_throughput_stats),
29 |   .max_entries = MAX_CONNECTIONS,
30 | };
31 |
32 | // helper to convert short int from BE to LE
33 | static inline u16 be_to_le(__be16 be) { return (be >> 8) | (be << 8); }
34 |
35 | static inline u32 get_unique_id() {
36 |   return bpf_ktime_get_ns() % __UINT32_MAX__; // no reason to use 64 bit for this
37 | }
38 |
39 | // function for parsing the struct sock
40 | static inline int
41 | parse_sock_data(struct sock *sock, struct connection_tuple *out_tuple,
42 |                 struct connection_throughput_stats *out_throughput) {
43 |
44 |   if (sock == NULL) {
45 |     return BPF_ERROR;
46 |   }
47 |
48 |   // struct sock wraps struct tcp_sock and struct inet_sock as its first member
49 |   struct tcp_sock *tcp = (struct tcp_sock *)sock;
50 |   struct inet_sock *inet = (struct inet_sock *)sock;
51 |
52 |   // initialize variables. IP addresses and ports are read originally
53 |   // big-endian, and we will convert the ports to little-endian.
54 | __be16 src_port_be = 0; 55 | __be16 dst_port_be = 0; 56 | 57 | // read connection tuple 58 | 59 | if (0 != bpf_core_read(&out_tuple->src_ip, sizeof(out_tuple->src_ip), 60 | &inet->inet_saddr)) { 61 | return BPF_ERROR; 62 | } 63 | 64 | if (0 != bpf_core_read(&out_tuple->dst_ip, sizeof(out_tuple->dst_ip), 65 | &inet->inet_daddr)) { 66 | return BPF_ERROR; 67 | } 68 | 69 | if (0 != bpf_core_read(&src_port_be, sizeof(src_port_be), &inet->inet_sport)) { 70 | return BPF_ERROR; 71 | } 72 | out_tuple->src_port = be_to_le(src_port_be); 73 | 74 | if (0 != bpf_core_read(&dst_port_be, sizeof(dst_port_be), &inet->inet_dport)) { 75 | return BPF_ERROR; 76 | } 77 | out_tuple->dst_port = be_to_le(dst_port_be); 78 | 79 | // read throughput data 80 | 81 | if (0 != bpf_core_read(&out_throughput->bytes_received, 82 | sizeof(out_throughput->bytes_received), 83 | &tcp->bytes_received)) { 84 | return BPF_ERROR; 85 | } 86 | if (0 != bpf_core_read(&out_throughput->bytes_sent, 87 | sizeof(out_throughput->bytes_sent), &tcp->bytes_sent)) { 88 | return BPF_ERROR; 89 | } 90 | 91 | return BPF_SUCCESS; 92 | }; 93 | 94 | static inline enum connection_role get_sock_role(struct sock* sock) { 95 | // the max_ack_backlog holds the limit for the accept queue 96 | // if it is a server, it will not be 0 97 | int max_ack_backlog = 0; 98 | if (0 != bpf_core_read(&max_ack_backlog, sizeof(max_ack_backlog), 99 | &sock->sk_max_ack_backlog)) { 100 | return CONNECTION_ROLE_UNKNOWN; 101 | } 102 | 103 | return max_ack_backlog == 0 ? CONNECTION_ROLE_CLIENT : CONNECTION_ROLE_SERVER; 104 | } 105 | 106 | // probing the tcp_data_queue kernel function, and adding the connection 107 | // observed to the map. 108 | SEC("kprobe/tcp_data_queue") 109 | static int handle_tcp_data_queue(struct pt_regs *ctx) { 110 | // first argument to tcp_data_queue is a struct sock* 111 | struct sock *sock = (struct sock *)PT_REGS_PARM1(ctx); 112 | 113 | struct connection_identifier conn_id = {}; 114 | struct connection_throughput_stats throughput = {}; 115 | 116 | if (parse_sock_data(sock, &conn_id.tuple, &throughput) == BPF_ERROR) { 117 | return BPF_ERROR; 118 | } 119 | 120 | // skip unconnected sockets 121 | if (conn_id.tuple.dst_port == 0 && conn_id.tuple.dst_ip == BPF_SUCCESS) { 122 | return BPF_SUCCESS; 123 | } 124 | 125 | // fill the conn_id extra details from sock_info map entry, or create one 126 | struct sock_info *sock_info = bpf_map_lookup_elem(&sock_infos, &sock); 127 | if (sock_info == NULL) { 128 | // first time we encounter this sock 129 | // check if server or client and insert to the maps 130 | 131 | enum connection_role role = get_sock_role(sock); 132 | 133 | struct sock_info info = { 134 | .pid = 0, // can't associate to pid anyway 135 | .role = role, 136 | .is_active = true, 137 | .id = get_unique_id(), 138 | }; 139 | bpf_map_update_elem(&sock_infos, &sock, &info, BPF_ANY); 140 | 141 | conn_id.pid = info.pid; 142 | conn_id.id = info.id; 143 | conn_id.role = info.role; 144 | throughput.is_active = true; 145 | 146 | bpf_map_update_elem(&connections, &conn_id, &throughput, BPF_ANY); 147 | 148 | return BPF_SUCCESS; 149 | 150 | } 151 | 152 | conn_id.pid = sock_info->pid; 153 | conn_id.id = sock_info->id; 154 | conn_id.role = sock_info->role; 155 | if (!sock_info->is_active) { 156 | return -1; 157 | } 158 | throughput.is_active = sock_info->is_active; 159 | 160 | bpf_map_update_elem(&connections, &conn_id, &throughput, BPF_ANY); 161 | 162 | return BPF_SUCCESS; 163 | }; 164 | 165 | static inline int handle_set_tcp_syn_sent(struct sock* sock) { 166 | 
// start of a client session 167 | u32 pid = bpf_get_current_pid_tgid() >> 32; 168 | 169 | struct sock_info info = { 170 | .pid = pid, 171 | .role = CONNECTION_ROLE_CLIENT, 172 | .is_active = true, 173 | .id = get_unique_id(), 174 | }; 175 | 176 | bpf_map_update_elem(&sock_infos, &sock, &info, BPF_ANY); 177 | 178 | return BPF_SUCCESS; 179 | } 180 | 181 | static inline int handle_set_tcp_syn_recv(struct sock* sock) { 182 | // this is a server getting syn after listen 183 | struct connection_identifier conn_id = {}; 184 | struct connection_throughput_stats throughput = {}; 185 | 186 | if (parse_sock_data(sock, &conn_id.tuple, &throughput) == BPF_ERROR) { 187 | return BPF_ERROR; 188 | } 189 | 190 | struct sock_info info = { 191 | .pid = 0, // can't associate to process 192 | .role = CONNECTION_ROLE_SERVER, 193 | .is_active = true, 194 | .id = get_unique_id(), 195 | }; 196 | 197 | bpf_map_update_elem(&sock_infos, &sock, &info, BPF_ANY); 198 | 199 | // probably the dst ip will still be uninitialized 200 | if (conn_id.tuple.dst_ip == 0) { 201 | return BPF_SUCCESS; 202 | } 203 | 204 | conn_id.pid = info.pid; 205 | conn_id.id = info.id; 206 | conn_id.role = info.role; 207 | 208 | bpf_map_update_elem(&connections, &conn_id, &throughput, BPF_ANY); 209 | 210 | return BPF_SUCCESS; 211 | } 212 | 213 | static inline int handle_set_tcp_close(struct sock* sock) { 214 | // mark as inactive 215 | struct connection_identifier conn_id = {}; 216 | struct connection_throughput_stats throughput = {}; 217 | 218 | if (parse_sock_data(sock, &conn_id.tuple, &throughput) == BPF_ERROR) { 219 | return BPF_ERROR; 220 | } 221 | 222 | struct sock_info *info = bpf_map_lookup_elem(&sock_infos, &sock); 223 | if (info == NULL) { 224 | conn_id.id = get_unique_id(); 225 | conn_id.pid = 0; // cannot associate to PID in this state 226 | conn_id.role = get_sock_role(sock); 227 | } else { 228 | conn_id.id = info->id; 229 | conn_id.pid = info->pid; 230 | conn_id.role = info->role; 231 | bpf_map_delete_elem(&sock_infos, &sock); 232 | } 233 | 234 | throughput.is_active = false; 235 | bpf_map_update_elem(&connections, &conn_id, &throughput, BPF_ANY); 236 | 237 | return BPF_SUCCESS; 238 | } 239 | 240 | SEC("tracepoint/sock/inet_sock_set_state") 241 | static int handle_sock_set_state(struct set_state_args *args) { 242 | struct sock *sock = (struct sock *)args->skaddr; 243 | 244 | switch(args->newstate) { 245 | case TCP_SYN_RECV: { 246 | return handle_set_tcp_syn_recv(sock) == BPF_ERROR; 247 | } 248 | case TCP_SYN_SENT: { 249 | return handle_set_tcp_syn_sent(sock) == BPF_ERROR; 250 | } 251 | case TCP_CLOSE: { 252 | return handle_set_tcp_close(sock); 253 | } 254 | } 255 | 256 | return BPF_SUCCESS; 257 | } -------------------------------------------------------------------------------- /pkg/tracing/ebpf/core_structures.h: -------------------------------------------------------------------------------- 1 | #ifndef __CORE_STRUCTURES_H__ 2 | #define __CORE_STRUCTURES_H__ 3 | 4 | #include 5 | 6 | /* 7 | * All structs and unions in this file should have a "preserve access index" 8 | * attribute. The following attaches this attribute to all records (structs, 9 | * unions, classes). 
10 | * @see https://clang.llvm.org/docs/LanguageExtensions.html 11 | */ 12 | #pragma clang attribute push 13 | #pragma clang attribute(__attribute__((preserve_access_index)), \ 14 | apply_to = record) 15 | 16 | // this is not core structure per se, but it would have been defined in a full 17 | // vmlinux.h 18 | enum { 19 | false = 0, 20 | true = 1, 21 | }; 22 | 23 | enum { 24 | TCP_ESTABLISHED = 1, 25 | TCP_SYN_SENT = 2, 26 | TCP_SYN_RECV = 3, 27 | TCP_FIN_WAIT1 = 4, 28 | TCP_FIN_WAIT2 = 5, 29 | TCP_TIME_WAIT = 6, 30 | TCP_CLOSE = 7, 31 | TCP_CLOSE_WAIT = 8, 32 | TCP_LAST_ACK = 9, 33 | TCP_LISTEN = 10, 34 | TCP_CLOSING = 11, 35 | TCP_NEW_SYN_RECV = 12, 36 | TCP_MAX_STATES = 13, 37 | }; 38 | 39 | 40 | typedef u16 sa_family_t; 41 | typedef u32 socklen_t; 42 | 43 | struct in_addr { 44 | __be32 s_addr; 45 | }; 46 | 47 | struct in6_addr { 48 | union { 49 | __u8 u6_addr8[16]; 50 | } in6_u; 51 | }; 52 | 53 | struct sockaddr_in { 54 | sa_family_t sin_family; 55 | __be16 sin_port; 56 | struct in_addr sin_addr; 57 | }; 58 | 59 | struct sockaddr_in6 { 60 | sa_family_t sin6_family; 61 | __be16 sin6_port; 62 | struct in6_addr sin6_addr; 63 | }; 64 | 65 | struct sockaddr { 66 | sa_family_t sa_family; 67 | }; 68 | 69 | struct sock_common { 70 | struct { 71 | __be32 skc_daddr; 72 | __be32 skc_rcv_saddr; 73 | }; 74 | struct { 75 | __be16 skc_dport; 76 | __u16 skc_num; 77 | }; 78 | unsigned short skc_family; 79 | struct in6_addr skc_v6_daddr; 80 | }; 81 | 82 | struct sock { 83 | struct sock_common __sk_common; 84 | unsigned int sk_shutdown : 2, sk_no_check_tx : 1, sk_no_check_rx : 1, 85 | sk_userlocks : 4, sk_protocol : 8, sk_type : 16; 86 | u32 sk_max_ack_backlog; 87 | }; 88 | 89 | struct socket { 90 | struct sock *sk; 91 | }; 92 | 93 | struct ipv6_pinfo { 94 | struct in6_addr saddr; 95 | }; 96 | 97 | struct inet_sock { 98 | struct sock sk; 99 | struct ipv6_pinfo *pinet6; 100 | __be32 inet_saddr; 101 | __be16 inet_sport; 102 | }; 103 | 104 | struct tcp_sock { 105 | u64 bytes_received; 106 | u64 bytes_sent; 107 | }; 108 | 109 | typedef u8 u_int8_t; 110 | typedef u16 u_int16_t; 111 | 112 | #pragma clang attribute pop 113 | 114 | #endif // __KERNEL_CORE_STRUCTURES_H__ 115 | -------------------------------------------------------------------------------- /pkg/tracing/ebpf/ebpf_internal_types.h: -------------------------------------------------------------------------------- 1 | #include "epbf_shared_types.h" 2 | 3 | #define MAX_CONNECTIONS 1000000 4 | 5 | // internal kernel-only struct to hold socket information which can't be parsed 6 | // from struct sock. 7 | struct sock_info { 8 | u32 pid; 9 | enum connection_role role; 10 | u32 is_active; 11 | u32 id; 12 | }; 13 | 14 | // partial struct of args for tcp_set_state 15 | struct set_state_args { 16 | u64 padding; 17 | struct sock *skaddr; 18 | u32 oldstate; 19 | u32 newstate; 20 | // more... 21 | }; 22 | 23 | 24 | -------------------------------------------------------------------------------- /pkg/tracing/ebpf/ebpf_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef __EBPF_UTILS_H__ 2 | #define __EBPF_UTILS_H__ 3 | 4 | #define BPF_SUCCESS 0 5 | #define BPF_ERROR -1 6 | 7 | #endif -------------------------------------------------------------------------------- /pkg/tracing/ebpf/epbf_shared_types.h: -------------------------------------------------------------------------------- 1 | #ifndef __EBPF_SHARED_TYPES_H__ 2 | #define __EBPF_SHARED_TYPES_H__ 3 | #include "vmlinux.h" 4 | 5 | // helper defs for inet_sock. 
--------------------------------------------------------------------------------
/pkg/tracing/ebpf/epbf_shared_types.h:
--------------------------------------------------------------------------------
1 | #ifndef __EBPF_SHARED_TYPES_H__
2 | #define __EBPF_SHARED_TYPES_H__
3 | #include "vmlinux.h"
4 | 
5 | // helper defs for inet_sock. These are defined in inet_sock.h, but are not
6 | // copied automatically to vmlinux.h
7 | #define inet_daddr sk.__sk_common.skc_daddr
8 | #define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr
9 | #define inet_dport sk.__sk_common.skc_dport
10 | #define inet_num sk.__sk_common.skc_num
11 | 
12 | 
13 | enum connection_role {
14 |   CONNECTION_ROLE_UNKNOWN = 0,
15 |   CONNECTION_ROLE_CLIENT,
16 |   CONNECTION_ROLE_SERVER,
17 | };
18 | 
19 | // describes the two sides of a connection; constant for each connection.
20 | struct connection_tuple {
21 |   __be32 src_ip;
22 |   __be32 dst_ip;
23 |   u16 src_port;
24 |   u16 dst_port;
25 | };
26 | 
27 | // all information needed to identify a specific connection.
28 | // due to socket reuse, each of the members (besides id) may change while
29 | // the others stay the same.
30 | struct connection_identifier {
31 |   u32 id; // uniquely generated id
32 |   u32 pid;
33 |   struct connection_tuple tuple;
34 |   enum connection_role role;
35 | };
36 | 
37 | // dynamic information about the state of a connection.
38 | struct connection_throughput_stats {
39 |   u64 bytes_sent;
40 |   u64 bytes_received;
41 |   u64 is_active; // u64 because it will be padded anyway; should be revisited
42 |                  // if new members are added
43 | };
44 | 
45 | #endif
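These shared structs define the key/value layout of the connections map that both the kernel and userspace sides read. The repository keeps its real Go equivalents in pkg/caretta/types.go (see the tree above); the sketch below shows one plausible mirror, with field order and widths chosen to match the C definitions so that cilium/ebpf can decode map entries directly. The Go type and field names here are illustrative assumptions, not the repository's actual declarations.

    // types_sketch.go - illustrative only; the repository's real equivalents
    // live in pkg/caretta/types.go. Field order and widths must match the
    // C structs in epbf_shared_types.h.
    package caretta

    // ConnectionTuple mirrors struct connection_tuple. The IPs are __be32,
    // i.e. network byte order.
    type ConnectionTuple struct {
        SrcIp   uint32
        DstIp   uint32
        SrcPort uint16
        DstPort uint16
    }

    // ConnectionIdentifier mirrors struct connection_identifier (24 bytes,
    // naturally aligned).
    type ConnectionIdentifier struct {
        Id    uint32
        Pid   uint32
        Tuple ConnectionTuple
        Role  uint32 // enum connection_role
    }

    // ConnectionThroughputStats mirrors struct connection_throughput_stats.
    type ConnectionThroughputStats struct {
        BytesSent     uint64
        BytesReceived uint64
        IsActive      uint64 // u64 to match the padded C layout
    }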
%v", err) 33 | } 34 | log.Printf("BPF objects loaded") 35 | 36 | // attach a kprobe and tracepoint 37 | kp, err := link.Kprobe("tcp_data_queue", objs.bpfPrograms.HandleTcpDataQueue, nil) 38 | if err != nil { 39 | return Probes{}, nil, fmt.Errorf("error attaching kprobe: %v", err) 40 | } 41 | log.Printf("Kprobe attached successfully") 42 | 43 | tp, err := link.Tracepoint("sock", "inet_sock_set_state", objs.bpfPrograms.HandleSockSetState, nil) 44 | if err != nil { 45 | return Probes{}, nil, fmt.Errorf("error attaching tracepoint: %v", err) 46 | } 47 | log.Printf("Tracepoint attached successfully") 48 | 49 | // We are done with loading kprobes - clear the btf cache 50 | btf.FlushKernelSpec() 51 | 52 | return Probes{ 53 | Kprobe: kp, 54 | Tracepoint: tp, 55 | BpfObjs: objs, 56 | }, objs.Connections, nil 57 | } 58 | 59 | func (objs *Probes) UnloadProbes() error { 60 | // if any close operation fails, will continue to try closing the rest of the struct, 61 | // and return the first error 62 | var resultErr error 63 | resultErr = nil 64 | 65 | err := objs.Kprobe.Close() 66 | if err != nil { 67 | resultErr = err 68 | } 69 | err = objs.Tracepoint.Close() 70 | if err != nil && resultErr == nil { 71 | resultErr = err 72 | } 73 | err = objs.BpfObjs.Close() 74 | if err != nil && resultErr == nil { 75 | resultErr = err 76 | } 77 | 78 | return resultErr 79 | } 80 | -------------------------------------------------------------------------------- /scripts/build/download_libbpf_headers.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This downloads the libbpf headers we need to compile eBPF code. 4 | # The script is based on cilium's update headers script, 5 | # https://github.com/cilium/ebpf/blob/4420605496c54a45653a7f1d277896e71e6705e2/examples/headers/update.sh#L1 6 | 7 | # Version of libbpf to fetch headers from 8 | LIBBPF_VERSION=0.6.1 9 | 10 | # Version of cilium ebpf repository to fetch vmlinux from 11 | CILIUM_VMLINUX_VERSION=0.10.0 12 | 13 | HEADERS_DIRECTORY="/tmp/caretta_extra/libbpf_headers" 14 | 15 | # The headers we want 16 | prefix=libbpf-"$LIBBPF_VERSION" 17 | headers=( 18 | "$prefix"/src/bpf_endian.h 19 | "$prefix"/src/bpf_helper_defs.h 20 | "$prefix"/src/bpf_helpers.h 21 | "$prefix"/src/bpf_tracing.h 22 | "$prefix"/src/bpf_core_read.h 23 | ) 24 | 25 | if [ ! -d "pkg" ] ; then 26 | echo "Run this scripts from the repository's root directory." 1>&2 27 | exit 1 28 | fi 29 | 30 | if [ ! -d "$HEADERS_DIRECTORY" ]; then 31 | mkdir -p "$HEADERS_DIRECTORY" 32 | if [ "$?" -ne 0 ]; then 33 | echo "Failed to create libbpf headers directory \""$HEADERS_DIRECTORY"\"." 1>&2 34 | exit 1 35 | fi 36 | fi 37 | 38 | # Fetch libbpf release and extract the desired headers 39 | curl -sL --connect-timeout 10 --max-time 10 \ 40 | "https://github.com/libbpf/libbpf/archive/refs/tags/v${LIBBPF_VERSION}.tar.gz" | \ 41 | tar -xz --xform='s#.*/##' -C "$HEADERS_DIRECTORY" "${headers[@]}" 42 | if [ "$?" -ne 0 ]; then 43 | echo "Failed to download and extract the needed libbpf headers." 1>&2 44 | exit 1 45 | fi 46 | 47 | # Fetch compact vmlinux file from cilium's ebpf repository. 48 | # This is not a libbpf header per-se, but it's close enough that we put it in the same location. 49 | curl -s -o "$HEADERS_DIRECTORY"/vmlinux.h \ 50 | https://raw.githubusercontent.com/cilium/ebpf/v${CILIUM_VMLINUX_VERSION}/examples/headers/common.h 51 | if [ "$?" -ne 0 ]; then 52 | echo "Failed to download vmlinux compact version from cilium's repository." 
--------------------------------------------------------------------------------
/scripts/build/download_libbpf_headers.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | # This downloads the libbpf headers we need to compile eBPF code.
4 | # The script is based on cilium's update-headers script,
5 | # https://github.com/cilium/ebpf/blob/4420605496c54a45653a7f1d277896e71e6705e2/examples/headers/update.sh#L1
6 | 
7 | # Version of libbpf to fetch headers from
8 | LIBBPF_VERSION=0.6.1
9 | 
10 | # Version of the cilium ebpf repository to fetch vmlinux.h from
11 | CILIUM_VMLINUX_VERSION=0.10.0
12 | 
13 | HEADERS_DIRECTORY="/tmp/caretta_extra/libbpf_headers"
14 | 
15 | # The headers we want
16 | prefix=libbpf-"$LIBBPF_VERSION"
17 | headers=(
18 |     "$prefix"/src/bpf_endian.h
19 |     "$prefix"/src/bpf_helper_defs.h
20 |     "$prefix"/src/bpf_helpers.h
21 |     "$prefix"/src/bpf_tracing.h
22 |     "$prefix"/src/bpf_core_read.h
23 | )
24 | 
25 | if [ ! -d "pkg" ] ; then
26 |     echo "Run this script from the repository's root directory." 1>&2
27 |     exit 1
28 | fi
29 | 
30 | if [ ! -d "$HEADERS_DIRECTORY" ]; then
31 |     mkdir -p "$HEADERS_DIRECTORY"
32 |     if [ "$?" -ne 0 ]; then
33 |         echo "Failed to create libbpf headers directory \"$HEADERS_DIRECTORY\"." 1>&2
34 |         exit 1
35 |     fi
36 | fi
37 | 
38 | # Fetch the libbpf release and extract the desired headers
39 | curl -sL --connect-timeout 10 --max-time 10 \
40 |     "https://github.com/libbpf/libbpf/archive/refs/tags/v${LIBBPF_VERSION}.tar.gz" | \
41 |     tar -xz --xform='s#.*/##' -C "$HEADERS_DIRECTORY" "${headers[@]}"
42 | if [ "$?" -ne 0 ]; then
43 |     echo "Failed to download and extract the needed libbpf headers." 1>&2
44 |     exit 1
45 | fi
46 | 
47 | # Fetch the compact vmlinux.h from cilium's ebpf repository.
48 | # This is not a libbpf header per se, but it's close enough that we put it in the same location.
49 | curl -s -o "$HEADERS_DIRECTORY"/vmlinux.h \
50 |     https://raw.githubusercontent.com/cilium/ebpf/v${CILIUM_VMLINUX_VERSION}/examples/headers/common.h
51 | if [ "$?" -ne 0 ]; then
52 |     echo "Failed to download the compact vmlinux.h from cilium's repository." 1>&2
53 |     exit 1
54 | fi
55 | 
56 | echo "Successfully downloaded libbpf headers." 1>&2
57 | 
--------------------------------------------------------------------------------
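The headers staged in /tmp/caretta_extra/libbpf_headers are consumed when the eBPF C source is compiled into Go bindings; the bpfObjects, bpfPrograms, and loadBpfObjects symbols used by probes.go are the shape of bpf2go output generated with the identifier "bpf". A hypothetical generate directive wiring the script's output directory into that compilation could look like the following; the repository actually drives the build from its Makefile, and the exact flags may differ.

    // generate_sketch.go - hypothetical; the repository drives compilation
    // from its Makefile, and the exact flags may differ.
    package tracing

    // Compile pkg/tracing/ebpf/caretta.bpf.c into Go bindings (bpfObjects,
    // bpfPrograms, loadBpfObjects) using the headers staged by
    // scripts/build/download_libbpf_headers.sh.
    //go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc clang bpf ebpf/caretta.bpf.c -- -I/tmp/caretta_extra/libbpf_headers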