├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   └── workflows
│       ├── pr.yaml
│       ├── release.yaml
│       └── subcharts.yaml
├── .gitignore
├── CODE_OF_CONDUCT.md
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── chart
│   ├── Chart.lock
│   ├── Chart.yaml
│   ├── charts
│   │   ├── grafana-6.48.0.tgz
│   │   └── victoria-metrics-single-0.8.48.tgz
│   ├── dashboard.json
│   ├── templates
│   │   ├── _helpers.tpl
│   │   ├── daemonset.yaml
│   │   ├── grafana
│   │   │   └── dashboards.yaml
│   │   └── rbac
│   │       ├── psp.yaml
│   │       ├── role.yaml
│   │       ├── rolebinding.yaml
│   │       └── serviceaccount.yaml
│   └── values.yaml
├── cmd
│   └── caretta
│       └── caretta.go
├── go.mod
├── go.sum
├── images
│   ├── caretta.gif
│   ├── logo.svg
│   └── screenshot.png
├── pkg
│   ├── caretta
│   │   ├── caretta.go
│   │   ├── config.go
│   │   ├── ebpf_map.go
│   │   ├── links_tracer.go
│   │   ├── links_tracer_test.go
│   │   └── types.go
│   ├── k8s
│   │   ├── ipresolver.go
│   │   └── ipresolver_test.go
│   ├── metrics
│   │   └── prometheus.go
│   └── tracing
│       ├── ebpf
│       │   ├── arm_support.h
│       │   ├── caretta.bpf.c
│       │   ├── core_structures.h
│       │   ├── ebpf_internal_types.h
│       │   ├── ebpf_utils.h
│       │   └── epbf_shared_types.h
│       └── probes.go
└── scripts
    └── build
        └── download_libbpf_headers.sh
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Environment (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Kubernetes cluster information - distribution, version
30 |
31 | **Additional context**
32 | Add any other context about the problem here.
33 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/workflows/pr.yaml:
--------------------------------------------------------------------------------
1 | name: pr
2 |
3 | on:
4 | pull_request:
5 |
6 | jobs:
7 | build:
8 | runs-on: ubuntu-latest
9 | permissions:
10 | contents: write
11 | id-token: write
12 | steps:
13 | -
14 | name: Checkout
15 | uses: actions/checkout@v3
16 | -
17 | name: Set Up QEMU
18 | uses: docker/setup-qemu-action@v3
19 | with:
20 | platforms: arm64
21 | -
22 | name: Set up Docker Buildx
23 | uses: docker/setup-buildx-action@v3
24 | -
25 | name: Build Docker Image
26 | uses: docker/build-push-action@v3
27 | with:
28 | context: .
29 | push: false
30 | cache-from: type=gha
31 | cache-to: type=gha,mode=max
32 | tags: caretta
33 | platforms: linux/amd64,linux/arm64
34 |
35 |
--------------------------------------------------------------------------------
/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
1 | name: release
2 |
3 | on:
4 | push:
5 | tags:
6 | - 'v*.*.*'
7 |
8 | jobs:
9 | release:
10 | runs-on: ubuntu-latest
11 | permissions:
12 | contents: write
13 | id-token: write
14 | steps:
15 | -
16 | name: Checkout
17 | uses: actions/checkout@v3
18 | -
19 | name: Set Up QEMU
20 | uses: docker/setup-qemu-action@v3
21 | with:
22 | platforms: arm64
23 | -
24 | name: Set up Docker Buildx
25 | uses: docker/setup-buildx-action@v3
26 | -
27 | name: Login to Quay.io
28 | uses: docker/login-action@v2
29 | with:
30 | registry: quay.io
31 | username: ${{ secrets.QUAY_USERNAME }}
32 | password: ${{ secrets.QUAY_ROBOT_TOKEN }}
33 | -
34 | name: Build & Push Docker Image
35 | uses: docker/build-push-action@v3
36 | with:
37 | context: .
38 | push: true
39 | cache-from: type=gha
40 | cache-to: type=gha,mode=max
41 | tags: quay.io/groundcover/caretta:${{ github.ref_name }}
42 | platforms: linux/arm64,linux/amd64
43 | -
44 | name: Checkout Helm Repo
45 | uses: actions/checkout@v3
46 | with:
47 | path: helm-repo
48 | repository: groundcover-com/charts
49 | token: ${{ secrets.HELM_CHARTS_REPO_KEY }}
50 | -
51 | name: Publish Chart
52 | working-directory: helm-repo
53 | env:
54 | GITHUB_TAG: ${{ github.ref_name }}
55 | run: |
56 | version=${GITHUB_TAG#v}
57 | helm lint ../chart
58 | helm package --version ${version} --app-version ${GITHUB_TAG} ../chart
59 | helm repo index --url https://helm.groundcover.com .
60 | git config user.name "ci-groundcover"
61 | git config user.email "ci@groundcover.com"
62 | git add .
63 | git commit -m "Added caretta ${version} chart"
64 | git push
65 |
--------------------------------------------------------------------------------
/.github/workflows/subcharts.yaml:
--------------------------------------------------------------------------------
1 | name: subcharts-images
2 |
3 | on:
4 | push:
5 | branches:
6 | - 'main'
7 | paths:
8 | - 'chart/charts/**'
9 | - '.github/workflows/subcharts.yaml'
10 |
11 | defaults:
12 | run:
13 | working-directory: chart/charts
14 |
15 | jobs:
16 | subchart-images:
17 | runs-on: ubuntu-latest
18 | permissions:
19 | contents: write
20 | id-token: write
21 | steps:
22 | -
23 | name: Checkout
24 | uses: actions/checkout@v3
25 | -
26 | name: Login to Quay.io
27 | uses: docker/login-action@v2
28 | with:
29 | registry: quay.io
30 | username: ${{ secrets.QUAY_USERNAME }}
31 | password: ${{ secrets.QUAY_ROBOT_TOKEN }}
32 | -
33 | name: Set up Docker Buildx
34 | uses: docker/setup-buildx-action@v2
35 | -
36 | name: Push Grafana Image
37 | run: |
38 | IMAGE_TAG=$(helm show chart grafana* | yq e '.appVersion' -)
39 | docker buildx imagetools create grafana/grafana:${IMAGE_TAG} --tag quay.io/groundcover/grafana:${IMAGE_TAG}
40 | -
41 | name: Push Victoria-Metrics Image
42 | run: |
43 | IMAGE_TAG=v$(helm show chart victoria-metrics* | yq e '.appVersion' -)
44 | docker buildx imagetools create victoriametrics/victoria-metrics:${IMAGE_TAG} --tag quay.io/groundcover/victoria-metrics:${IMAGE_TAG}
45 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # If you prefer the allow list template instead of the deny list, see community template:
2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
3 | #
4 | # Binaries for programs and plugins
5 | *.exe
6 | *.exe~
7 | *.dll
8 | *.so
9 | *.dylib
10 | *.o
11 |
12 | # Test binary, built with `go test -c`
13 | *.test
14 |
15 | # Output of the go coverage tool, specifically when used with LiteIDE
16 | *.out
17 |
18 | # Dependency directories (remove the comment below to include it)
19 | # vendor/
20 |
21 | # Go workspace file
22 | go.work
23 |
24 |
25 | # autogenerated by bpf2go
26 | *_bpfel_*.go
27 |
28 | # binary output
29 | bin/
30 | vendor/
31 |
32 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, religion, or sexual identity
10 | and orientation.
11 |
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 |
15 | ## Our Standards
16 |
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 |
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 | and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the
26 | overall community
27 |
28 | Examples of unacceptable behavior include:
29 |
30 | * The use of sexualized language or imagery, and sexual attention or
31 | advances of any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email
35 | address, without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 | professional setting
38 |
39 | ## Enforcement Responsibilities
40 |
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 |
46 | Community leaders have the right and responsibility to remove, edit, or reject
47 | comments, commits, code, wiki edits, issues, and other contributions that are
48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
49 | decisions when appropriate.
50 |
51 | ## Scope
52 |
53 | This Code of Conduct applies within all community spaces, and also applies when
54 | an individual is officially representing the community in public spaces.
55 | Examples of representing our community include using an official e-mail address,
56 | posting via an official social media account, or acting as an appointed
57 | representative at an online or offline event.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported to the community leaders responsible for enforcement at
63 | info@groundcover.com.
64 | All complaints will be reviewed and investigated promptly and fairly.
65 |
66 | All community leaders are obligated to respect the privacy and security of the
67 | reporter of any incident.
68 |
69 | ## Enforcement Guidelines
70 |
71 | Community leaders will follow these Community Impact Guidelines in determining
72 | the consequences for any action they deem in violation of this Code of Conduct:
73 |
74 | ### 1. Correction
75 |
76 | **Community Impact**: Use of inappropriate language or other behavior deemed
77 | unprofessional or unwelcome in the community.
78 |
79 | **Consequence**: A private, written warning from community leaders, providing
80 | clarity around the nature of the violation and an explanation of why the
81 | behavior was inappropriate. A public apology may be requested.
82 |
83 | ### 2. Warning
84 |
85 | **Community Impact**: A violation through a single incident or series
86 | of actions.
87 |
88 | **Consequence**: A warning with consequences for continued behavior. No
89 | interaction with the people involved, including unsolicited interaction with
90 | those enforcing the Code of Conduct, for a specified period of time. This
91 | includes avoiding interactions in community spaces as well as external channels
92 | like social media. Violating these terms may lead to a temporary or
93 | permanent ban.
94 |
95 | ### 3. Temporary Ban
96 |
97 | **Community Impact**: A serious violation of community standards, including
98 | sustained inappropriate behavior.
99 |
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 |
106 | ### 4. Permanent Ban
107 |
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 |
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 |
115 | ## Attribution
116 |
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120 |
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 |
124 | [homepage]: https://www.contributor-covenant.org
125 |
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | https://www.contributor-covenant.org/faq. Translations are available at
128 | https://www.contributor-covenant.org/translations.
129 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM quay.io/cilium/ebpf-builder:1648566014 AS builder
2 | ARG TARGETARCH
3 | ARG TARGETPLATFORM
4 | RUN echo "Building for $TARGETARCH"
5 | RUN echo "Building for $TARGETPLATFORM"
6 | WORKDIR /build
7 | COPY . /build/
8 | RUN make build ARCH=$TARGETARCH
9 |
10 | FROM alpine:3.17
11 |
12 | WORKDIR /app
13 | COPY --from=builder build/bin/caretta ./
14 |
15 | VOLUME /sys/kernel/debug
16 |
17 | CMD ./caretta
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | BIN_DIR:=bin
2 | BINARY_PATH:=${BIN_DIR}/caretta
3 | DOCKER_BIN:=docker
4 | BPF2GO_BINARY := ${BIN_DIR}/bpf2go
5 | BPF2GO_VERSION := 0.9.0
6 | REPODIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
7 | UIDGID := $(shell stat -c '%u:%g' ${REPODIR})
8 | PROJECT_DIRNAME := $(shell basename ${REPODIR})
9 | CILIUM_EBPF_DIRECTORY := /tmp/cilium-ebpf
10 | BUILD_SCRIPTS_DIRECTORY=scripts/build
11 | BPF_CLANG := clang-14
12 | INCLUDE_C_FLAGS := -I/tmp/caretta_extra/libbpf_headers -I/tmp/${PROJECT_DIRNAME}/
13 | BPF_CFLAGS := -O2 -g -Wall -Werror -fdebug-prefix-map=/ebpf=. ${INCLUDE_C_FLAGS}
14 | IMAGE=quay.io/cilium/ebpf-builder
15 | VERSION=1648566014
16 |
17 | # amd64 or arm64
18 | ARCH=amd64
19 | .PHONY: build
20 | build: ${BIN_DIR} pkg/tracing/bpf_bpfel_x86.go cmd/caretta/caretta.go
21 | GOOS=linux GOARCH=${ARCH} CGO_ENABLED=0 go build -o ${BINARY_PATH} cmd/caretta/caretta.go
22 |
23 | ${BIN_DIR}:
24 | mkdir -p ${BIN_DIR}
25 |
26 | .PHONY: download_libbpf_headers
27 | download_libbpf_headers:
28 | ${REPODIR}/${BUILD_SCRIPTS_DIRECTORY}/download_libbpf_headers.sh
29 |
30 | .PHONY: generate_ebpf
31 | generate_ebpf: ${BPF2GO_BINARY}_${BPF2GO_VERSION} \
32 | download_libbpf_headers
33 | go mod vendor
34 | (cd ${REPODIR}/pkg/tracing && \
35 | GOPACKAGE=tracing ${REPODIR}/${BPF2GO_BINARY}_${BPF2GO_VERSION} \
36 | -cc "${BPF_CLANG}" -cflags "${BPF_CFLAGS}" \
37 | -target arm64,amd64 bpf \
38 | ebpf/caretta.bpf.c --)
39 |
40 | ${BPF2GO_BINARY}_${BPF2GO_VERSION}:
41 | git clone -q --branch v${BPF2GO_VERSION} https://github.com/cilium/ebpf \
42 | ${CILIUM_EBPF_DIRECTORY} 2>/dev/null
43 | cd ${CILIUM_EBPF_DIRECTORY} && \
44 | go build -o ${REPODIR}/${BPF2GO_BINARY}_${BPF2GO_VERSION} ./cmd/bpf2go
45 |
46 | .PHONY: generate_ebpf_in_docker
47 | generate_ebpf_in_docker: ${BIN_DIR}
48 | ${DOCKER_BIN} run \
49 | -v ${REPODIR}:/tmp/caretta \
50 | -w /tmp/${PROJECT_DIRNAME} \
51 | --env HOME="/tmp/" \
52 | "${IMAGE}:${VERSION}" \
53 | ${MAKE} generate_ebpf
54 |
55 | pkg/tracing/bpf_bpfel%.go: pkg/tracing/ebpf/caretta.bpf.c
56 | $(MAKE) generate_ebpf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Caretta
2 |
3 | Instant K8s service dependency map, right to your Grafana.
4 |
5 | [Slack](http://www.groundcover.com/join-slack)
6 | [License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
22 | ## What is Caretta?
23 |
24 | Caretta is a lightweight, standalone tool that instantly creates a visual network map of the services running in your cluster.
25 |
26 | Caretta leverages eBPF to efficiently map all service network interactions in a K8s cluster, and Grafana to query and visualize the collected data.
27 |
28 | Caretta is built to be efficient, with a minimal footprint on the system, and does not require any modifications to the cluster.
29 |
30 | Caretta demonstrates the power of using eBPF for observability solutions, which is our vision at groundcover. If you're interested in understanding how Caretta is built, head over to our Caretta blog post!
31 |
32 | ## Installing Caretta :zap:
33 | As simple as installing a helm chart. It is recommended to install Caretta in a new, unique namespace.
34 | ```bash
35 | helm repo add groundcover https://helm.groundcover.com/
36 | ```
37 | ```bash
38 | helm repo update
39 | ```
40 | ```bash
41 | helm install caretta --namespace caretta --create-namespace groundcover/caretta
42 | ```
43 |
44 | ### Configuration
45 | You can configure Caretta using helm values.
46 | Useful values:
47 | * **tolerations** can be specified to make sure Caretta's eBPF agent runs on all nodes in your cluster. *The default value tolerates common control-plane taints.*
48 | * **victoria-metrics-single.server.persistentVolume.enabled** can be set to *true* if you wish to save Caretta's metrics to a persistent volume. *default: false*
49 | * **pollIntervalSeconds** can be modified to specify the interval at which new metrics are polled from the kernel and published. *default: 5*
50 | * The built-in Victoria Metrics and Grafana instances can be disabled by setting **victoria-metrics-single.enabled** or **grafana.enabled** to false, respectively. _default: true_
51 | * Caretta resolves Kubernetes entities to their owners by default. For example, pods 'pod1' and 'pod2', both belonging to a deployment 'deployment1', will be resolved to 'deployment1'. This can be disabled by setting **traverseUpHierarchy** to false. _default: true_
52 |
53 |
54 | Example yaml for overriding these values:
55 | ```yaml
56 | pollIntervalSeconds: 15 # set metrics polling interval
57 | traverseUpHierarchy: false # disable resolving kubernetes entities to their owners
58 |
59 | tolerations: # set any desired tolerations
60 | - key: node-role.kubernetes.io/control-plane
61 | operator: Exists
62 | effect: NoSchedule
63 |
64 | victoria-metrics-single:
65 | server:
66 | persistentVolume:
67 | enabled: true # set to true to use persistent volume
68 | ```
69 | This can also be done using the --set flag on the `helm install` command.
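
For example, a minimal sketch of the same overrides passed inline (the values shown are illustrative):

```bash
helm install caretta groundcover/caretta \
  --namespace caretta --create-namespace \
  --set pollIntervalSeconds=15 \
  --set traverseUpHierarchy=false \
  --set victoria-metrics-single.server.persistentVolume.enabled=true
```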
70 |
71 | ### Uninstallation
72 | To uninstall, delete the helm release:
73 | ```bash
74 | helm delete caretta --namespace caretta
75 | ```
76 | Note that if persistent storage was enabled in the installation, it may not be deleted automatically by this command.
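
If persistent storage was enabled, you can list and remove any leftover persistent volume claims manually (a sketch; the PVC name depends on your release and may differ):

```bash
# List PVCs left behind by the release
kubectl get pvc --namespace caretta

# Delete a leftover PVC by name
kubectl delete pvc <victoria-metrics-pvc-name> --namespace caretta
```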
77 |
78 | ## Requirements
79 | * Linux kernel version >= 4.16
80 | * CO-RE support. Supported Linux distributions can be found here. Notably, Docker for Mac uses a distribution that is not currently supported.
81 |
82 |
83 |
84 | ## Working with Caretta :turtle:
85 | Caretta's helm chart ships an instance of Grafana with a predefined dashboard using data published by Caretta. This dashboard contains some examples to demonstrate the usage of Caretta's metrics.
86 |
87 | ### Using the provided Grafana instance
88 | To access Grafana, port-forward port `3000` from the Grafana pod in Caretta's namespace.
89 |
90 | Using *kubectl*, it should look something like this:
91 |
92 | ```bash
93 | kubectl port-forward --namespace caretta <grafana-pod-name> 3000:3000
94 | ```
95 |
96 | > **_NOTE:_** Anonymous mode is enabled, making the default dashboard accessible with no login needed.
> To edit the default dashboard or create your own, use the default administrator credentials: user `admin`, password `caretta`.
98 |
99 | ### Scraping Caretta's metrics
100 |
101 | Caretta uses [Victoria Metrics](https://victoriametrics.com/) to collect and publish its metrics, and the data can be consumed by **any Prometheus-compatible dashboard**.
102 |
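To sanity-check the raw metrics without Grafana, you can port-forward a Caretta pod and query its Prometheus endpoint directly. A minimal sketch, using the chart's `app=caretta` pod label and `<port>` as a placeholder for the configured **prometheusPort**:

```bash
# Pick one Caretta pod from the DaemonSet
kubectl get pods --namespace caretta -l app=caretta

# Forward the metrics port (replace <port> with your configured prometheusPort)
kubectl port-forward --namespace caretta <caretta-pod-name> <port>:<port>

# In another shell, fetch the raw metrics and look for Caretta's main metric
curl http://localhost:<port>/metrics | grep caretta_links_observed
```
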
103 | Caretta's main metric is `caretta_links_observed` (Gauge). It uses the following labels to represent a specific connection (network socket) going through the cluster:
104 | * `client_name` - the name of a Kubernetes entity (if resolved), an external domain (if resolved), or an IP address.
105 | * `client_namespace` - either the namespace of the Kubernetes entity, or "node", or "external".
106 | * `client_kind` - either the kind of the Kubernetes entity, or "node", or "external".
107 | * `server_name` - the name of a Kubernetes entity (if resolved), an external domain (if resolved), or an IP address.
108 | * `server_namespace` - either the namespace of the Kubernetes entity, or "node", or "external".
109 | * `server_kind` - either the kind of the Kubernetes entity, or "node", or "external".
110 | * `server_port` - the port used by the server.
111 | * `role` - either 1 (client) or 2 (server).
112 |
113 | Alongside these labels, Caretta exposes additional labels used by Grafana's Node Graph panel.
114 |
115 | #### Example metric data
116 | This example shows a connection from a client named `checkoutservice`, controlled by a deployment, to a service named `productcatalogservice` on port 3550, from the perspective of the client. The total number of bytes sent by the client over this connection is 2537.
117 | ```bash
118 | caretta_links_observed{client_id="1074587981",client_kind="Deployment",client_name="checkoutservice",client_namespace="demo-ng",link_id="198768460",role="1",server_id="1112713827",server_kind="Service",server_name="productcatalogservice",server_namespace="demo-ng",server_port="3550"} 2537
119 | ```
120 |
121 | #### Example queries :star:
122 | ```bash
123 | increase((sum by (server_port) (caretta_links_observed{client_name="some-client", server_name="some-server"}))[15m:])
124 | ```
125 | will output the throughput observed between some-client and some-server in the last 15 minutes, aggregated by port.
126 |
127 | ```bash
128 | sum by (server_name) (rate(caretta_links_observed{client_name="some-client"}[5m]))
129 | ```
130 | will output the rate of traffic from some-client to servers it communicates with, aggregated by the server's name.
131 |
132 | ```bash
133 | sort_desc(increase((sum by (client_name) (caretta_links_observed{server_namespace="external"}))[5m:]))
134 | ```
135 | will output communication to external servers by client's name, sorted descending.
136 |
137 | ## Need help:grey_question:
138 | Feel free to reach out to us on our Slack channel, or open an issue in this repository.
139 |
140 | ## Contribution
141 | Feel free to contribute to the project.
142 |
143 | * Open an issue for missing features or bugs
144 | * Open a pull request to add code to the project
145 |
--------------------------------------------------------------------------------
/chart/Chart.lock:
--------------------------------------------------------------------------------
1 | dependencies:
2 | - name: victoria-metrics-single
3 | repository: https://victoriametrics.github.io/helm-charts
4 | version: 0.8.48
5 | - name: grafana
6 | repository: https://grafana.github.io/helm-charts
7 | version: 6.48.0
8 | digest: sha256:eb7c3b54ae1fef78dae03136bdd7c0e34a3a08a34c147a227e824437a443bccb
9 | generated: "2022-12-26T10:15:04.518501964Z"
10 |
--------------------------------------------------------------------------------
/chart/Chart.yaml:
--------------------------------------------------------------------------------
1 | version: 0.0.1
2 | apiVersion: v2
3 | appVersion: v0.0.1
4 | name: caretta
5 | description: A helm chart for Caretta service map.
6 | type: application
7 | dependencies:
8 | - name: victoria-metrics-single
9 | version: "0.8.48"
10 | repository: "https://victoriametrics.github.io/helm-charts"
11 | condition: victoria-metrics-single.enabled
12 | - name: grafana
13 | version: "6.48.0"
14 | repository: "https://grafana.github.io/helm-charts"
15 | condition: grafana.enabled
--------------------------------------------------------------------------------
/chart/charts/grafana-6.48.0.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/groundcover-com/caretta/280d1640ce0174b1dfdd7d05bdd104604aa04508/chart/charts/grafana-6.48.0.tgz
--------------------------------------------------------------------------------
/chart/charts/victoria-metrics-single-0.8.48.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/groundcover-com/caretta/280d1640ce0174b1dfdd7d05bdd104604aa04508/chart/charts/victoria-metrics-single-0.8.48.tgz
--------------------------------------------------------------------------------
/chart/dashboard.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotations": {
3 | "list": [
4 | {
5 | "builtIn": 1,
6 | "datasource": {
7 | "type": "grafana",
8 | "uid": "-- Grafana --"
9 | },
10 | "enable": true,
11 | "hide": true,
12 | "iconColor": "rgba(0, 211, 255, 1)",
13 | "name": "Annotations & Alerts",
14 | "target": {
15 | "limit": 100,
16 | "matchAny": false,
17 | "tags": [],
18 | "type": "dashboard"
19 | },
20 | "type": "dashboard"
21 | }
22 | ]
23 | },
24 | "editable": true,
25 | "fiscalYearStartMonth": 0,
26 | "graphTooltip": 0,
27 | "id": 15,
28 | "links": [],
29 | "liveNow": false,
30 | "panels": [
31 | {
32 | "datasource": {
33 | "type": "prometheus",
34 | "uid": "${DS_PROMETHEUS}"
35 | },
36 | "description": "",
37 | "gridPos": {
38 | "h": 24,
39 | "w": 17,
40 | "x": 0,
41 | "y": 0
42 | },
43 | "id": 2,
44 | "interval": "15s",
45 | "options": {
46 | "nodes": {
47 | "arcs": [
48 | {
49 | "color": "#5794F2",
50 | "field": "arc__color"
51 | }
52 | ]
53 | }
54 | },
55 | "targets": [
56 | {
57 | "datasource": {
58 | "type": "prometheus",
59 | "uid": "${DS_PROMETHEUS}"
60 | },
61 | "editorMode": "code",
62 | "exemplar": false,
63 | "expr": "increase((sum by (id, title, subTitle, detail__kind, arc__color) (label_replace((label_replace(label_replace(label_replace(label_replace((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"}), \"detail__kind\", \"$1\", \"server_kind\", \"(.*)\"), \"subTitle\", \"$1\", \"server_namespace\", \"(.*)\"), \"title\", \"$1\", \"server_name\", \"(.*)\"), \"id\", \"$1\", \"server_id\", \"(.*)\") or label_replace(label_replace(label_replace(label_replace((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"}), \"detail__kind\", \"$1\", \"client_kind\", \"(.*)\"), \"subTitle\", \"$1\", \"client_namespace\", \"(.*)\"), \"title\", \"$1\", \"client_name\", \"(.*)\"), \"id\", \"$1\", \"client_id\", \"(.*)\") ), \"arc__color\", \"1\", \"link_id\", \"(.*)\")) )[$__range:$__interval]) > 0",
64 | "format": "table",
65 | "instant": true,
66 | "legendFormat": "__auto",
67 | "range": false,
68 | "refId": "nodes"
69 | },
70 | {
71 | "datasource": {
72 | "type": "prometheus",
73 | "uid": "${DS_PROMETHEUS}"
74 | },
75 | "editorMode": "code",
76 | "exemplar": false,
77 | "expr": "increase((sum by (id, source, target, mainStat) ((label_replace(label_replace(label_replace(label_replace((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"}), \"id\", \"$1\", \"link_id\", \"(.*)\"), \"source\", \"$1\", \"client_id\", \"(.*)\"), \"target\", \"$1\", \"server_id\", \"(.*)\"), \"mainStat\", \"$1\", \"server_port\", \"(.*)\"))) )[$__range:$__interval]) > 0",
78 | "format": "table",
79 | "hide": false,
80 | "instant": true,
81 | "legendFormat": "__auto",
82 | "range": false,
83 | "refId": "edges"
84 | }
85 | ],
86 | "title": "Service Map ☸️",
87 | "type": "nodeGraph"
88 | },
89 | {
90 | "datasource": {
91 | "type": "prometheus",
92 | "uid": "${DS_PROMETHEUS}"
93 | },
94 | "fieldConfig": {
95 | "defaults": {
96 | "color": {
97 | "fixedColor": "blue",
98 | "mode": "fixed"
99 | },
100 | "custom": {
101 | "hideFrom": {
102 | "legend": false,
103 | "tooltip": false,
104 | "viz": false
105 | }
106 | },
107 | "links": [],
108 | "mappings": []
109 | },
110 | "overrides": []
111 | },
112 | "gridPos": {
113 | "h": 7,
114 | "w": 4,
115 | "x": 17,
116 | "y": 0
117 | },
118 | "id": 4,
119 | "options": {
120 | "displayLabels": [
121 | "name"
122 | ],
123 | "legend": {
124 | "displayMode": "list",
125 | "placement": "right",
126 | "showLegend": false
127 | },
128 | "pieType": "donut",
129 | "reduceOptions": {
130 | "calcs": [
131 | "lastNotNull"
132 | ],
133 | "fields": "",
134 | "values": false
135 | },
136 | "tooltip": {
137 | "mode": "single",
138 | "sort": "none"
139 | }
140 | },
141 | "targets": [
142 | {
143 | "datasource": {
144 | "type": "prometheus",
145 | "uid": "${DS_PROMETHEUS}"
146 | },
147 | "editorMode": "code",
148 | "expr": "sum by (server_port) (increase((caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\"})[$__range:$__interval])) > 0",
149 | "legendFormat": "__auto",
150 | "range": true,
151 | "refId": "A"
152 | }
153 | ],
154 | "title": "Active Ports",
155 | "type": "piechart"
156 | },
157 | {
158 | "datasource": {
159 | "type": "datasource",
160 | "uid": "grafana"
161 | },
162 | "gridPos": {
163 | "h": 7,
164 | "w": 3,
165 | "x": 21,
166 | "y": 0
167 | },
168 | "id": 10,
169 | "options": {
170 | "code": {
171 | "language": "plaintext",
172 | "showLineNumbers": false,
173 | "showMiniMap": false
174 | },
175 | "content": "\n \n\n \n \n \n\n \n [](http://www.groundcover.com/join-slack)\n \n \n\n\n | \n
\n",
176 | "mode": "markdown"
177 | },
178 | "pluginVersion": "10.1.2",
179 | "type": "text"
180 | },
181 | {
182 | "datasource": {
183 | "type": "prometheus",
184 | "uid": "${DS_PROMETHEUS}"
185 | },
186 | "fieldConfig": {
187 | "defaults": {
188 | "color": {
189 | "fixedColor": "purple",
190 | "mode": "continuous-blues"
191 | },
192 | "mappings": [],
193 | "thresholds": {
194 | "mode": "absolute",
195 | "steps": [
196 | {
197 | "color": "green",
198 | "value": null
199 | },
200 | {
201 | "color": "red",
202 | "value": 80
203 | }
204 | ]
205 | },
206 | "unit": "Bps"
207 | },
208 | "overrides": []
209 | },
210 | "gridPos": {
211 | "h": 8,
212 | "w": 7,
213 | "x": 17,
214 | "y": 7
215 | },
216 | "id": 8,
217 | "options": {
218 | "displayMode": "gradient",
219 | "minVizHeight": 10,
220 | "minVizWidth": 0,
221 | "orientation": "horizontal",
222 | "reduceOptions": {
223 | "calcs": [
224 | "lastNotNull"
225 | ],
226 | "fields": "",
227 | "values": false
228 | },
229 | "showUnfilled": true,
230 | "valueMode": "color"
231 | },
232 | "pluginVersion": "10.1.2",
233 | "targets": [
234 | {
235 | "datasource": {
236 | "type": "prometheus",
237 | "uid": "${DS_PROMETHEUS}"
238 | },
239 | "editorMode": "code",
240 | "exemplar": false,
241 | "expr": "topk(8, sum by (client_name) ((rate(caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\"}[$__range:$__interval]))))",
242 | "format": "time_series",
243 | "instant": true,
244 | "legendFormat": "__auto",
245 | "range": false,
246 | "refId": "A"
247 | }
248 | ],
249 | "title": "Top Throughput Workloads",
250 | "type": "bargauge"
251 | },
252 | {
253 | "datasource": {
254 | "type": "prometheus",
255 | "uid": "${DS_PROMETHEUS}"
256 | },
257 | "description": "",
258 | "fieldConfig": {
259 | "defaults": {
260 | "color": {
261 | "mode": "continuous-blues"
262 | },
263 | "mappings": [],
264 | "thresholds": {
265 | "mode": "absolute",
266 | "steps": [
267 | {
268 | "color": "green",
269 | "value": null
270 | },
271 | {
272 | "color": "red",
273 | "value": 80
274 | }
275 | ]
276 | },
277 | "unit": "Bps"
278 | },
279 | "overrides": []
280 | },
281 | "gridPos": {
282 | "h": 9,
283 | "w": 7,
284 | "x": 17,
285 | "y": 15
286 | },
287 | "id": 6,
288 | "options": {
289 | "colorMode": "background",
290 | "graphMode": "area",
291 | "justifyMode": "center",
292 | "orientation": "horizontal",
293 | "reduceOptions": {
294 | "calcs": [
295 | "lastNotNull"
296 | ],
297 | "fields": "",
298 | "values": false
299 | },
300 | "text": {},
301 | "textMode": "auto"
302 | },
303 | "pluginVersion": "10.1.2",
304 | "targets": [
305 | {
306 | "datasource": {
307 | "type": "prometheus",
308 | "uid": "${DS_PROMETHEUS}"
309 | },
310 | "editorMode": "code",
311 | "exemplar": false,
312 | "expr": "topk(7, sum by (client_name, server_name) ( rate( (caretta_links_observed{client_namespace=~\"$namespace\", client_kind=~\"$kind\", client_name=~\"$workload\", server_port=~\"$port\", client_kind!~\"(node|external)\",} or caretta_links_observed{server_namespace=~\"$namespace\", server_kind=~\"$kind\", server_name=~\"$workload\", server_port=~\"$port\", server_kind!~\"(node|external)\"})[$__range:$__interval]) ) )",
313 | "format": "time_series",
314 | "instant": true,
315 | "legendFormat": "{{client_name}} ⮂ {{server_name}}",
316 | "range": false,
317 | "refId": "A"
318 | }
319 | ],
320 | "title": "Top Throughput Connections",
321 | "type": "stat"
322 | }
323 | ],
324 | "refresh": "",
325 | "schemaVersion": 38,
326 | "style": "dark",
327 | "tags": [],
328 | "templating": {
329 | "list": [
330 | {
331 | "current": {
332 | "selected": false,
333 | "text": "default",
334 | "value": "default"
335 | },
336 | "hide": 0,
337 | "includeAll": false,
338 | "label": "datasource",
339 | "multi": false,
340 | "name": "DS_PROMETHEUS",
341 | "options": [],
342 | "query": "prometheus",
343 | "queryValue": "",
344 | "refresh": 1,
345 | "regex": "",
346 | "skipUrlSync": false,
347 | "type": "datasource"
348 | },
349 | {
350 | "allValue": "(.*)",
351 | "current": {
352 | "selected": true,
353 | "text": [
354 | "All"
355 | ],
356 | "value": [
357 | "$__all"
358 | ]
359 | },
360 | "datasource": {
361 | "type": "prometheus",
362 | "uid": "${DS_PROMETHEUS}"
363 | },
364 | "definition": "query_result(caretta_links_observed)",
365 | "hide": 0,
366 | "includeAll": true,
367 | "multi": true,
368 | "name": "namespace",
369 | "options": [],
370 | "query": {
371 | "query": "query_result(caretta_links_observed)",
372 | "refId": "StandardVariableQuery"
373 | },
374 | "refresh": 1,
375 | "regex": "/.*_namespace=\"([^\"]*).*/",
376 | "skipUrlSync": false,
377 | "sort": 1,
378 | "type": "query"
379 | },
380 | {
381 | "allValue": "(.*)",
382 | "current": {
383 | "selected": true,
384 | "text": [
385 | "All"
386 | ],
387 | "value": [
388 | "$__all"
389 | ]
390 | },
391 | "datasource": {
392 | "type": "prometheus",
393 | "uid": "${DS_PROMETHEUS}"
394 | },
395 | "definition": "query_result(caretta_links_observed)",
396 | "hide": 0,
397 | "includeAll": true,
398 | "multi": true,
399 | "name": "kind",
400 | "options": [],
401 | "query": {
402 | "query": "query_result(caretta_links_observed)",
403 | "refId": "StandardVariableQuery"
404 | },
405 | "refresh": 1,
406 | "regex": "/.*_kind=\"([^\"]*).*/",
407 | "skipUrlSync": false,
408 | "sort": 0,
409 | "type": "query"
410 | },
411 | {
412 | "allValue": "(.*)",
413 | "current": {
414 | "selected": true,
415 | "text": [
416 | "All"
417 | ],
418 | "value": [
419 | "$__all"
420 | ]
421 | },
422 | "datasource": {
423 | "type": "prometheus",
424 | "uid": "${DS_PROMETHEUS}"
425 | },
426 | "definition": "query_result(caretta_links_observed)",
427 | "hide": 0,
428 | "includeAll": true,
429 | "label": "workload",
430 | "multi": true,
431 | "name": "workload",
432 | "options": [],
433 | "query": {
434 | "query": "query_result(caretta_links_observed)",
435 | "refId": "StandardVariableQuery"
436 | },
437 | "refresh": 2,
438 | "regex": "/.*_name=\"([^\"]*).*/",
439 | "skipUrlSync": false,
440 | "sort": 1,
441 | "type": "query"
442 | },
443 | {
444 | "allValue": "(.*)",
445 | "current": {
446 | "selected": true,
447 | "text": [
448 | "All"
449 | ],
450 | "value": [
451 | "$__all"
452 | ]
453 | },
454 | "datasource": {
455 | "type": "prometheus",
456 | "uid": "${DS_PROMETHEUS}"
457 | },
458 | "definition": "label_values(server_port)",
459 | "hide": 0,
460 | "includeAll": true,
461 | "label": "server port",
462 | "multi": true,
463 | "name": "port",
464 | "options": [],
465 | "query": {
466 | "query": "label_values(server_port)",
467 | "refId": "StandardVariableQuery"
468 | },
469 | "refresh": 1,
470 | "regex": "",
471 | "skipUrlSync": false,
472 | "sort": 0,
473 | "type": "query"
474 | }
475 | ]
476 | },
477 | "time": {
478 | "from": "now-5m",
479 | "to": "now"
480 | },
481 | "timepicker": {},
482 | "timezone": "",
483 | "title": "Caretta Dashboard",
484 | "uid": "k0Om62pVf",
485 | "version": 2,
486 | "weekStart": ""
487 | }
--------------------------------------------------------------------------------
/chart/templates/_helpers.tpl:
--------------------------------------------------------------------------------
1 | {{/*
2 | Expand the name of the chart.
3 | */}}
4 | {{- define "caretta.name" -}}
5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
6 | {{- end }}
7 |
8 | {{/*
9 | Create a default fully qualified app name.
10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
11 | If release name contains chart name it will be used as a full name.
12 | */}}
13 | {{- define "caretta.fullname" -}}
14 | {{- if .Values.fullnameOverride }}
15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
16 | {{- else }}
17 | {{- $name := default .Chart.Name .Values.nameOverride }}
18 | {{- if contains $name .Release.Name }}
19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }}
20 | {{- else }}
21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
22 | {{- end }}
23 | {{- end }}
24 | {{- end }}
25 |
26 | {{/*
27 | Create chart name and version as used by the chart label.
28 | */}}
29 | {{- define "caretta.chart" -}}
30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
31 | {{- end }}
32 |
33 | {{/*
34 | Common labels
35 | */}}
36 | {{- define "caretta.labels" -}}
37 | helm.sh/chart: {{ include "caretta.chart" . }}
38 | {{ include "caretta.selectorLabels" . }}
39 | {{- if .Chart.AppVersion }}
40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
41 | {{- end }}
42 | app.kubernetes.io/managed-by: {{ .Release.Service }}
43 | {{- end }}
44 |
45 | {{/*
46 | Selector labels
47 | */}}
48 | {{- define "caretta.selectorLabels" -}}
49 | app.kubernetes.io/name: {{ include "caretta.name" . }}
50 | app.kubernetes.io/instance: {{ .Release.Name }}
51 | {{- end }}
52 |
53 | {{/*
54 | Create the name of the service account to use
55 | */}}
56 | {{- define "caretta.serviceAccountName" -}}
57 | {{- if .Values.serviceAccount.create }}
58 | {{- default (include "caretta.fullname" .) .Values.serviceAccount.name }}
59 | {{- else }}
60 | {{- default "default" .Values.serviceAccount.name }}
61 | {{- end }}
62 | {{- end }}
63 |
--------------------------------------------------------------------------------
/chart/templates/daemonset.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: DaemonSet
3 | metadata:
4 | name: {{ include "caretta.name" . }}
5 | labels:
6 | app: caretta
7 | {{- include "caretta.labels" . | nindent 4 }}
8 | spec:
9 | selector:
10 | matchLabels:
11 | app: caretta
12 | {{- include "caretta.selectorLabels" . | nindent 6 }}
13 | template:
14 | metadata:
15 | annotations:
16 | {{- with .Values.podAnnotations }}
17 | {{- toYaml . | nindent 8 }}
18 | {{- end }}
19 | labels:
20 | app: caretta
21 | {{- include "caretta.selectorLabels" . | nindent 8 }}
22 | spec:
23 | {{- with .Values.imagePullSecrets }}
24 | imagePullSecrets:
25 | {{- toYaml . | nindent 8 }}
26 | {{- end }}
27 | serviceAccountName: {{ include "caretta.name" . }}
28 | {{- if .Values.priorityClassName }}
29 | priorityClassName: {{ .Values.priorityClassName }}
30 | {{- end }}
31 | securityContext:
32 | {{- toYaml .Values.podSecurityContext | nindent 8 }}
33 | containers:
34 | - name: {{ .Chart.Name }}
35 | securityContext:
36 | {{- toYaml .Values.securityContext | nindent 12 }}
37 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
38 | imagePullPolicy: {{ .Values.image.pullPolicy }}
39 | volumeMounts:
40 | - mountPath: /proc
41 | name: proc
42 | - mountPath: /sys/kernel/debug
43 | name: debug
44 | ports:
45 | - name: prom-metrics
46 | containerPort: {{ .Values.prometheusPort }}
47 | protocol: TCP
48 | env:
49 | - name: "RESOLVE_DNS"
50 | value: "{{ .Values.enableDnsResolving }}"
51 | - name: "PROMETHEUS_PORT"
52 | value: "{{ .Values.prometheusPort }}"
53 | - name: "PROMETHEUS_ENDPOINT"
54 | value: "{{ .Values.prometheusEndpoint }}"
55 | - name: "POLL_INTERVAL"
56 | value: "{{ .Values.pollIntervalSeconds }}"
57 | - name: "TRAVERSE_UP_HIERARCHY"
58 | value: "{{ .Values.traverseUpHierarchy }}"
59 | resources:
60 | {{- toYaml .Values.resources | nindent 12 }}
61 | {{- with .Values.nodeSelector }}
62 | nodeSelector:
63 | {{- toYaml . | nindent 8 }}
64 | {{- end }}
65 | {{- with .Values.affinity }}
66 | affinity:
67 | {{- toYaml . | nindent 8 }}
68 | {{- end }}
69 | {{- with .Values.tolerations }}
70 | tolerations:
71 | {{- toYaml . | nindent 8 }}
72 | {{- end }}
73 | volumes:
74 | - name: proc
75 | hostPath:
76 | path: /proc
77 | - name: debug
78 | hostPath:
79 | path: /sys/kernel/debug
--------------------------------------------------------------------------------
/chart/templates/grafana/dashboards.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 | name: caretta-grafana-dashboards
5 | namespace: {{ .Release.Namespace }}
6 | labels:
7 | {{- if $.Values.grafana.sidecar.dashboards.enabled }}
8 | {{ $.Values.grafana.sidecar.dashboards.label }}: {{ $.Values.grafana.sidecar.dashboards.labelValue | quote }}
9 | {{- end }}
10 | data:
11 | dashboard.json: |-
12 | {{ .Files.Get "dashboard.json" | indent 4}}
--------------------------------------------------------------------------------
/chart/templates/rbac/psp.yaml:
--------------------------------------------------------------------------------
1 | {{- if and .Values.rbac.pspEnabled (.Capabilities.APIVersions.Has "policy/v1beta1") }}
2 | apiVersion: policy/v1beta1
3 | kind: PodSecurityPolicy
4 | metadata:
5 | annotations:
6 | seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
7 | name: {{ template "caretta.fullname" . }}
8 | spec:
9 | allowPrivilegeEscalation: true
10 | allowedCapabilities:
11 | - '*'
12 | fsGroup:
13 | rule: RunAsAny
14 | hostIPC: true
15 | hostNetwork: false
16 | hostPID: true
17 | hostPorts:
18 | - max: 65535
19 | min: 0
20 | privileged: true
21 | runAsUser:
22 | rule: RunAsAny
23 | seLinux:
24 | rule: RunAsAny
25 | supplementalGroups:
26 | rule: RunAsAny
27 | volumes:
28 | - '*'
29 | {{ end -}}
30 |
--------------------------------------------------------------------------------
/chart/templates/rbac/role.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: ClusterRole
3 | metadata:
4 | name: {{ include "caretta.fullname" . }}
5 | rules:
6 | {{- if and .Values.rbac.pspEnabled (.Capabilities.APIVersions.Has "policy/v1beta1") }}
7 | - apiGroups:
8 | - policy
9 | - extensions
10 | resourceNames:
11 | - {{ template "caretta.fullname" . }}
12 | resources:
13 | - podsecuritypolicies
14 | verbs:
15 | - use
16 | {{- end }}
17 | {{- if and .Values.rbac.sccEnabled (.Capabilities.APIVersions.Has "security.openshift.io/v1")}}
18 | - apiGroups:
19 | - security.openshift.io
20 | resources:
21 | - securitycontextconstraints
22 | verbs:
23 | - use
24 | resourceNames:
25 | - privileged
26 | {{- end }}
27 | - verbs:
28 | - get
29 | - list
30 | - watch
31 | apiGroups:
32 | - ''
33 | resources:
34 | - configmaps
35 | - endpoints
36 | - persistentvolumeclaims
37 | - persistentvolumeclaims/status
38 | - pods
39 | - replicationcontrollers
40 | - replicationcontrollers/scale
41 | - serviceaccounts
42 | - services
43 | - services/status
44 | - verbs:
45 | - get
46 | - list
47 | - watch
48 | apiGroups:
49 | - ''
50 | resources:
51 | - bindings
52 | - events
53 | - limitranges
54 | - namespaces/status
55 | - pods/log
56 | - pods/status
57 | - nodes
58 | - replicationcontrollers/status
59 | - resourcequotas
60 | - resourcequotas/status
61 | - verbs:
62 | - get
63 | - list
64 | - watch
65 | apiGroups:
66 | - ''
67 | resources:
68 | - namespaces
69 | - verbs:
70 | - get
71 | - list
72 | - watch
73 | apiGroups:
74 | - apps
75 | resources:
76 | - controllerrevisions
77 | - daemonsets
78 | - daemonsets/status
79 | - deployments
80 | - deployments/scale
81 | - deployments/status
82 | - replicasets
83 | - replicasets/scale
84 | - replicasets/status
85 | - statefulsets
86 | - statefulsets/scale
87 | - statefulsets/status
88 | - verbs:
89 | - get
90 | - list
91 | - watch
92 | apiGroups:
93 | - batch
94 | resources:
95 | - cronjobs
96 | - cronjobs/status
97 | - jobs
98 | - jobs/status
99 | - verbs:
100 | - get
101 | - list
102 | - watch
103 | apiGroups:
104 | - extensions
105 | resources:
106 | - daemonsets
107 | - daemonsets/status
108 | - deployments
109 | - deployments/scale
110 | - deployments/status
111 | - ingresses
112 | - ingresses/status
113 | - networkpolicies
114 | - replicasets
115 | - replicasets/scale
116 | - replicasets/status
117 | - replicationcontrollers/scale
118 | - verbs:
119 | - get
120 | - list
121 | - watch
122 | apiGroups:
123 | - policy
124 | resources:
125 | - poddisruptionbudgets
126 | - poddisruptionbudgets/status
127 | - verbs:
128 | - get
129 | - list
130 | - watch
131 | apiGroups:
132 | - networking.k8s.io
133 | resources:
134 | - ingresses
135 | - ingresses/status
136 | - networkpolicies
137 | - verbs:
138 | - get
139 | apiGroups:
140 | - discovery.k8s.io
141 | resources:
142 | - endpointslices
143 | - verbs:
144 | - list
145 | apiGroups:
146 | - discovery.k8s.io
147 | resources:
148 | - endpointslices
149 | - verbs:
150 | - watch
151 | apiGroups:
152 | - discovery.k8s.io
153 | resources:
154 | - endpointslices
155 | - verbs:
156 | - get
157 | - list
158 | - watch
159 | apiGroups:
160 | - metrics.k8s.io
161 | resources:
162 | - pods
163 | - nodes
164 |
165 |
--------------------------------------------------------------------------------
/chart/templates/rbac/rolebinding.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: rbac.authorization.k8s.io/v1
2 | kind: ClusterRoleBinding
3 | metadata:
4 | name: {{ include "caretta.fullname" . }}
5 | roleRef:
6 | apiGroup: rbac.authorization.k8s.io
7 | kind: ClusterRole
8 | name: {{ include "caretta.fullname" . }}
9 | subjects:
10 | - kind: ServiceAccount
11 | name: {{ include "caretta.name" . }}
12 | namespace: {{ .Release.Namespace }}
--------------------------------------------------------------------------------
/chart/templates/rbac/serviceaccount.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 | name: {{ include "caretta.name" . }}
5 | namespace: {{ .Release.Namespace }}
6 |
--------------------------------------------------------------------------------
/chart/values.yaml:
--------------------------------------------------------------------------------
1 | enableDnsResolving: true
2 | prometheusPort: 7117
3 | prometheusEndpoint: "/metrics"
4 | pollIntervalSeconds: 5
5 | traverseUpHierarchy: true
6 |
7 | rbac:
8 | pspEnabled: true
9 | sccEnabled: true
10 | image:
11 | repository: quay.io/groundcover/caretta
12 | pullPolicy: Always
13 | tag: ""
14 |
15 | imagePullSecrets: []
16 | nameOverride: ""
17 | fullnameOverride: ""
18 |
19 | resources:
20 | limits:
21 | cpu: 150m
22 | memory: 500Mi
23 | requests:
24 | cpu: 10m
25 | memory: 50Mi
26 |
27 | serviceAccount:
28 | # Specifies whether a service account should be created
29 | create: true
30 | # Annotations to add to the service account
31 | annotations: {}
32 | # The name of the service account to use.
33 | # If not set and create is true, a name is generated using the fullname template
34 | name: ""
35 |
36 | podAnnotations: {}
37 |
38 | podSecurityContext: {}
39 | # fsGroup: 2000
40 |
41 | securityContext:
42 | privileged: true
43 | readOnlyRootFilesystem: true
44 | # capabilities:
45 | # drop:
46 | # - ALL
47 | # readOnlyRootFilesystem: true
48 | # runAsNonRoot: true
49 | # runAsUser: 1000
50 |
51 | tolerations:
52 | - key: node-role.kubernetes.io/control-plane
53 | operator: Exists
54 | effect: NoSchedule
55 | - key: node-role.kubernetes.io/master
56 | operator: Exists
57 | effect: NoSchedule
58 |
59 | nodeSelector: {}
60 | affinity: {}
61 | priorityClassName:
62 |
63 | victoria-metrics-single:
64 | server:
65 | image:
66 | repository: quay.io/groundcover/victoria-metrics
67 | resources:
68 | limits:
69 | cpu: 300m
70 | memory: 350Mi
71 | requests:
72 | cpu: 5m
73 | memory: 50Mi
74 | fullnameOverride: caretta-vm
75 | persistentVolume:
76 | enabled: false
77 | size: 16Gi # set enabled to true if a persistent volume is required
78 |
79 | scrape:
80 | enabled: true
81 |
82 | config:
83 | global:
84 | scrape_interval: 15s
85 |
86 | scrape_configs:
87 | - job_name: 'caretta'
88 | metrics_path: /metrics
89 | scrape_interval: 5s
90 | kubernetes_sd_configs:
91 | - role: pod
92 | namespaces:
93 | own_namespace: true
94 | relabel_configs:
95 | - source_labels: [__meta_kubernetes_pod_label_app]
96 | separator: ;
97 | regex: caretta
98 | replacement: $1
99 | action: keep
100 | - action: labelmap
101 | regex: __meta_kubernetes_pod_label_(.+)
102 | - source_labels: [__meta_kubernetes_pod_name]
103 | action: replace
104 | target_label: caretta_pod
105 | - source_labels: [__meta_kubernetes_pod_node_name]
106 | action: replace
107 | target_label: caretta_node
108 |
109 | grafana:
110 | image:
111 | repository: quay.io/groundcover/grafana
112 | resources:
113 | limits:
114 | memory: 300Mi
115 | cpu: 300m
116 | requests:
117 | memory: 50Mi
118 | cpu: 5m
119 | datasources:
120 | datasources.yaml:
121 | apiVersion: 1
122 | datasources:
123 | - name: Prometheus
124 | type: prometheus
125 | access: proxy
126 | url: "http://caretta-vm:8428"
127 | editable: "true"
128 |
129 | sidecar:
130 | dashboards:
131 | label: grafana_dashboard
132 | labelValue: ""
133 |
134 | dashboardProviders:
135 | dashboardproviders.yaml:
136 | apiVersion: 1
137 | providers:
138 | - name: 'default'
139 | orgId: 1
140 | folder: ''
141 | type: file
142 | disableDeletion: false
143 | editable: true
144 | options:
145 | path: /var/lib/grafana/dashboards
146 | foldersFromFilesStructure: true
147 |
148 | dashboardsConfigMaps:
149 | default: "caretta-grafana-dashboards"
150 |
151 | grafana.ini:
152 | auth.anonymous:
153 | enabled: true
154 | dashboards:
155 | default_home_dashboard_path: /var/lib/grafana/dashboards/default/dashboard.json
156 | adminUser: "admin"
157 | adminPassword: "caretta"
158 |
--------------------------------------------------------------------------------
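[Note] The scrape config above keeps only pods labeled app=caretta, so every caretta_* series lands in VictoriaMetrics with caretta_pod and caretta_node labels attached. As a quick sanity check, the exposition endpoint can be read directly; a minimal sketch in Go, assuming the agent is reachable on localhost:7117 (e.g. via kubectl port-forward):

package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// fetch the raw Prometheus exposition from a caretta pod
	resp, err := http.Get("http://localhost:7117/metrics")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Print(string(body))
}

--------------------------------------------------------------------------------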
/cmd/caretta/caretta.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "log"
5 | _ "net/http/pprof"
6 | "os"
7 | "os/signal"
8 | "syscall"
9 |
10 | "github.com/groundcover-com/caretta/pkg/caretta"
11 | )
12 |
13 | func main() {
14 | log.Print("Caretta starting...")
15 | caretta := caretta.NewCaretta()
16 |
17 | caretta.Start()
18 |
19 | osSignal := make(chan os.Signal, 1)
20 | signal.Notify(osSignal, syscall.SIGINT, syscall.SIGTERM)
21 | <-osSignal
22 | caretta.Stop()
23 | }
24 |
--------------------------------------------------------------------------------
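[Note] main blocks on SIGINT/SIGTERM and tears everything down via caretta.Stop(). The blank net/http/pprof import registers the /debug/pprof/* handlers on http.DefaultServeMux; whether they are actually reachable depends on how metrics.StartMetricsServer builds its mux, which is not shown in this section. A minimal sketch of serving them explicitly on a dedicated local-only listener (this helper is hypothetical, not part of the codebase):

package main

import (
	"log"
	"net/http"
	_ "net/http/pprof" // registers /debug/pprof/* on http.DefaultServeMux
	"time"
)

func main() {
	// serve the default mux (and therefore the pprof handlers) on a local port
	go func() {
		log.Println(http.ListenAndServe("localhost:6060", nil))
	}()
	time.Sleep(time.Minute) // stand-in for the real program's lifetime
}

--------------------------------------------------------------------------------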
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/groundcover-com/caretta
2 |
3 | go 1.19
4 |
5 | require (
6 | github.com/cilium/ebpf v0.10.0
7 | github.com/google/uuid v1.3.0
8 | github.com/hashicorp/golang-lru/v2 v2.0.1
9 | github.com/prometheus/client_golang v1.14.0
10 | github.com/stretchr/testify v1.8.1
11 | k8s.io/api v0.26.0
12 | k8s.io/apimachinery v0.26.0
13 | k8s.io/client-go v0.26.0
14 | )
15 |
16 | require (
17 | github.com/beorn7/perks v1.0.1 // indirect
18 | github.com/cespare/xxhash/v2 v2.1.2 // indirect
19 | github.com/davecgh/go-spew v1.1.1 // indirect
20 | github.com/emicklei/go-restful/v3 v3.9.0 // indirect
21 | github.com/evanphx/json-patch v4.12.0+incompatible // indirect
22 | github.com/go-logr/logr v1.2.3 // indirect
23 | github.com/go-openapi/jsonpointer v0.19.5 // indirect
24 | github.com/go-openapi/jsonreference v0.20.0 // indirect
25 | github.com/go-openapi/swag v0.19.14 // indirect
26 | github.com/gogo/protobuf v1.3.2 // indirect
27 | github.com/golang/protobuf v1.5.2 // indirect
28 | github.com/google/gnostic v0.5.7-v3refs // indirect
29 | github.com/google/go-cmp v0.5.9 // indirect
30 | github.com/google/gofuzz v1.1.0 // indirect
31 | github.com/josharian/intern v1.0.0 // indirect
32 | github.com/json-iterator/go v1.1.12 // indirect
33 | github.com/mailru/easyjson v0.7.6 // indirect
34 | github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect
35 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
36 | github.com/modern-go/reflect2 v1.0.2 // indirect
37 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
38 | github.com/pkg/errors v0.9.1 // indirect
39 | github.com/pmezard/go-difflib v1.0.0 // indirect
40 | github.com/prometheus/client_model v0.3.0 // indirect
41 | github.com/prometheus/common v0.37.0 // indirect
42 | github.com/prometheus/procfs v0.8.0 // indirect
43 | golang.org/x/net v0.3.1-0.20221206200815-1e63c2f08a10 // indirect
44 | golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect
45 | golang.org/x/sys v0.3.0 // indirect
46 | golang.org/x/term v0.3.0 // indirect
47 | golang.org/x/text v0.5.0 // indirect
48 | golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect
49 | google.golang.org/appengine v1.6.7 // indirect
50 | google.golang.org/protobuf v1.28.1 // indirect
51 | gopkg.in/inf.v0 v0.9.1 // indirect
52 | gopkg.in/yaml.v2 v2.4.0 // indirect
53 | gopkg.in/yaml.v3 v3.0.1 // indirect
54 | k8s.io/klog/v2 v2.80.1 // indirect
55 | k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280 // indirect
56 | k8s.io/utils v0.0.0-20221107191617-1a15be271d1d // indirect
57 | sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect
58 | sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
59 | sigs.k8s.io/yaml v1.3.0 // indirect
60 | )
61 |
--------------------------------------------------------------------------------
/images/caretta.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/groundcover-com/caretta/280d1640ce0174b1dfdd7d05bdd104604aa04508/images/caretta.gif
--------------------------------------------------------------------------------
/images/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/groundcover-com/caretta/280d1640ce0174b1dfdd7d05bdd104604aa04508/images/screenshot.png
--------------------------------------------------------------------------------
/pkg/caretta/caretta.go:
--------------------------------------------------------------------------------
1 | package caretta
2 |
3 | import (
4 | "context"
5 | "hash/fnv"
6 | "log"
7 | "net/http"
8 | "strconv"
9 | "time"
10 |
11 | caretta_k8s "github.com/groundcover-com/caretta/pkg/k8s"
12 | "github.com/groundcover-com/caretta/pkg/metrics"
13 | "github.com/prometheus/client_golang/prometheus"
14 | "github.com/prometheus/client_golang/prometheus/promauto"
15 | "k8s.io/client-go/kubernetes"
16 | "k8s.io/client-go/rest"
17 | )
18 |
19 | var (
20 | linksMetrics = promauto.NewGaugeVec(prometheus.GaugeOpts{
21 | Name: "caretta_links_observed",
22 | Help: "total bytes_sent value of links observed by caretta since its launch",
23 | }, []string{
24 | "link_id", "client_id", "client_name", "client_namespace", "client_kind", "client_owner", "server_id", "server_name", "server_namespace", "server_kind", "server_port", "role",
25 | })
26 | tcpStateMetrics = promauto.NewGaugeVec(prometheus.GaugeOpts{
27 | Name: "caretta_tcp_states",
28 | Help: "state of TCP connections observed by caretta since its launch",
29 | }, []string{
30 | "link_id", "client_id", "client_name", "client_namespace", "client_kind", "client_owner", "server_id", "server_name", "server_namespace", "server_kind", "server_port", "role",
31 | })
32 | )
33 |
34 | type Caretta struct {
35 | stopSignal chan bool
36 | tracer LinksTracer
37 | metricsServer *http.Server
38 | config carettaConfig
39 | }
40 |
41 | func NewCaretta() *Caretta {
42 | return &Caretta{
43 | stopSignal: make(chan bool, 1),
44 | config: readConfig(),
45 | }
46 | }
47 |
48 | func (caretta *Caretta) Start() {
49 | caretta.metricsServer = metrics.StartMetricsServer(caretta.config.prometheusEndpoint, caretta.config.prometheusPort)
50 |
51 | clientset, err := caretta.getClientSet()
52 | if err != nil {
53 | log.Fatalf("Error getting kubernetes clientset: %v", err)
54 | }
55 | resolver, err := caretta_k8s.NewK8sIPResolver(clientset, caretta.config.shouldResolveDns, caretta.config.traverseUpHierarchy)
56 | if err != nil {
57 | log.Fatalf("Error creating resolver: %v", err)
58 | }
59 | err = resolver.StartWatching()
60 | if err != nil {
61 | log.Fatalf("Error watching cluster's state: %v", err)
62 | }
63 |
64 | // give the resolver time to populate its view of the cluster before tracing starts
65 | time.Sleep(10 * time.Second)
66 |
67 | caretta.tracer = NewTracer(resolver)
68 | err = caretta.tracer.Start()
69 | if err != nil {
70 | log.Fatalf("Couldn't load probes - %v", err)
71 | }
72 |
73 | pollingTicker := time.NewTicker(time.Duration(caretta.config.pollingIntervalSeconds) * time.Second)
74 |
75 | pastLinks := make(map[NetworkLink]uint64)
76 |
77 | go func() {
78 | for {
79 | select {
80 | case <-caretta.stopSignal:
81 | return
82 | case <-pollingTicker.C:
83 | var links map[NetworkLink]uint64
84 | var tcpConnections []TcpConnection
85 | // note: err here still holds the (nil) result of tracer.Start() above, so this guard never fires
86 | if err != nil {
87 | log.Printf("Error updating snapshot of cluster state, skipping iteration")
88 | continue
89 | }
90 |
91 | pastLinks, links, tcpConnections = caretta.tracer.TracesPollingIteration(pastLinks)
92 | for link, throughput := range links {
93 | caretta.handleLink(&link, throughput)
94 | }
95 |
96 | for _, connection := range tcpConnections {
97 | caretta.handleTcpConnection(&connection)
98 | }
99 | }
100 | }
101 | }()
102 | }
103 |
104 | func (caretta *Caretta) Stop() {
105 | log.Print("Stopping Caretta...")
106 | caretta.stopSignal <- true
107 | err := caretta.tracer.Stop()
108 | if err != nil {
109 | log.Printf("Error unloading bpf objects: %v", err)
110 | }
111 | err = caretta.metricsServer.Shutdown(context.Background())
112 | if err != nil {
113 | log.Printf("Error shutting Prometheus server down: %v", err)
114 | }
115 |
116 | }
117 |
118 | func (caretta *Caretta) handleLink(link *NetworkLink, throughput uint64) {
119 | linksMetrics.With(prometheus.Labels{
120 | "link_id": strconv.Itoa(int(fnvHash(link.Client.Name+link.Client.Namespace+link.Server.Name+link.Server.Namespace) + link.Role)),
121 | "client_id": strconv.Itoa(int(fnvHash(link.Client.Name + link.Client.Namespace))),
122 | "client_name": link.Client.Name,
123 | "client_namespace": link.Client.Namespace,
124 | "client_kind": link.Client.Kind,
125 | "client_owner": link.Client.Owner,
126 | "server_id": strconv.Itoa(int(fnvHash(link.Server.Name + link.Server.Namespace))),
127 | "server_name": link.Server.Name,
128 | "server_namespace": link.Server.Namespace,
129 | "server_kind": link.Server.Kind,
130 | "server_port": strconv.Itoa(int(link.ServerPort)),
131 | "role": strconv.Itoa(int(link.Role)),
132 | }).Set(float64(throughput))
133 | }
134 |
135 | func (caretta *Caretta) handleTcpConnection(connection *TcpConnection) {
136 | tcpStateMetrics.With(prometheus.Labels{
137 | "link_id": strconv.Itoa(int(fnvHash(connection.Client.Name+connection.Client.Namespace+connection.Server.Name+connection.Server.Namespace) + connection.Role)),
138 | "client_id": strconv.Itoa(int(fnvHash(connection.Client.Name + connection.Client.Namespace))),
139 | "client_name": connection.Client.Name,
140 | "client_namespace": connection.Client.Namespace,
141 | "client_kind": connection.Client.Kind,
142 | "client_owner": connection.Client.Owner,
143 | "server_id": strconv.Itoa(int(fnvHash(connection.Server.Name + connection.Server.Namespace))),
144 | "server_name": connection.Server.Name,
145 | "server_namespace": connection.Server.Namespace,
146 | "server_kind": connection.Server.Kind,
147 | "server_port": strconv.Itoa(int(connection.ServerPort)),
148 | "role": strconv.Itoa(int(connection.Role)),
149 | }).Set(float64(connection.State))
150 | }
151 |
152 | func (caretta *Caretta) getClientSet() (*kubernetes.Clientset, error) {
153 | config, err := rest.InClusterConfig()
154 | if err != nil {
155 | return nil, err
156 | }
157 |
158 | clientset, err := kubernetes.NewForConfig(config)
159 | if err != nil {
160 | return nil, err
161 | }
162 | return clientset, nil
163 | }
164 |
165 | // simple FNV-1a hash from string to uint32
166 | func fnvHash(s string) uint32 {
167 | h := fnv.New32a()
168 | h.Write([]byte(s))
169 | return h.Sum32()
170 | }
171 |
--------------------------------------------------------------------------------
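[Note] Both gauge vectors identify a link by FNV-1a hashing the client and server names/namespaces and adding the connection role. A minimal sketch reproducing the link_id recipe outside the package (the workload names are illustrative):

package main

import (
	"fmt"
	"hash/fnv"
	"strconv"
)

func fnvHash(s string) uint32 {
	h := fnv.New32a()
	h.Write([]byte(s))
	return h.Sum32()
}

func main() {
	// same recipe as handleLink: hash(clientName+clientNs+serverName+serverNs) + role
	const clientRole = 1
	id := fnvHash("frontend"+"default"+"backend"+"default") + clientRole
	fmt.Println("link_id =", strconv.Itoa(int(id)))
}

--------------------------------------------------------------------------------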
/pkg/caretta/config.go:
--------------------------------------------------------------------------------
1 | package caretta
2 |
3 | import (
4 | "fmt"
5 | "os"
6 | "strconv"
7 | )
8 |
9 | const (
10 | defaultPrometheusEndpoint = "/metrics"
11 | defaultPrometheusPort = ":7117"
12 | defaultPollingIntervalSeconds = 5
13 | defaultShouldResolveDns = false
14 | defaultTraverseUpHierarchy = true
15 | )
16 |
17 | type carettaConfig struct {
18 | shouldResolveDns bool
19 | prometheusPort string
20 | prometheusEndpoint string
21 | pollingIntervalSeconds int
22 | traverseUpHierarchy bool
23 | }
24 |
25 | // reads configuration from environment variables; encapsulated to allow changing the source later
26 | func readConfig() carettaConfig {
27 | port := defaultPrometheusPort
28 | if val := os.Getenv("PROMETHEUS_PORT"); val != "" {
29 | valInt, err := strconv.Atoi(val)
30 | if err == nil {
31 | port = fmt.Sprintf(":%d", valInt)
32 | }
33 | }
34 |
35 | endpoint := defaultPrometheusEndpoint
36 | if val := os.Getenv("PROMETHEUS_ENDPOINT"); val != "" {
37 | endpoint = val
38 | }
39 |
40 | interval := defaultPollingIntervalSeconds
41 | if val := os.Getenv("POLL_INTERVAL"); val != "" {
42 | valInt, err := strconv.Atoi(val)
43 | if err == nil {
44 | interval = valInt
45 | }
46 | }
47 |
48 | shouldResolveDns := defaultShouldResolveDns
49 | if val := os.Getenv("RESOLVE_DNS"); val != "" {
50 | valBool, err := strconv.ParseBool(val)
51 | if err == nil {
52 | shouldResolveDns = valBool
53 | }
54 | }
55 |
56 | traverseUpHierarchy := defaultTraverseUpHierarchy
57 | if val := os.Getenv("TRAVERSE_UP_HIERARCHY"); val != "" {
58 | valBool, err := strconv.ParseBool(val)
59 | if err == nil {
60 | traverseUpHierarchy = valBool
61 | }
62 | }
63 |
64 | return carettaConfig{
65 | shouldResolveDns: shouldResolveDns,
66 | prometheusPort: port,
67 | prometheusEndpoint: endpoint,
68 | pollingIntervalSeconds: interval,
69 | traverseUpHierarchy: traverseUpHierarchy,
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
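[Note] Every variable falls back to its compiled-in default when unset or unparsable, so a malformed value silently keeps the default. readConfig itself is unexported; the sketch below just mirrors its parsing rule for POLL_INTERVAL in isolation:

package main

import (
	"fmt"
	"os"
	"strconv"
)

func main() {
	// default 5; override only when POLL_INTERVAL parses as an integer
	interval := 5
	if val := os.Getenv("POLL_INTERVAL"); val != "" {
		if parsed, err := strconv.Atoi(val); err == nil {
			interval = parsed
		}
	}
	fmt.Println("polling interval (seconds):", interval)
}

--------------------------------------------------------------------------------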
/pkg/caretta/ebpf_map.go:
--------------------------------------------------------------------------------
1 | package caretta
2 |
3 | import "github.com/cilium/ebpf"
4 |
5 | type IEbpfMapIterator interface {
6 | Next(interface{}, interface{}) bool
7 | }
8 |
9 | type IEbpfMap interface {
10 | Lookup(interface{}, interface{}) error
11 | Iterate() IEbpfMapIterator
12 | Delete(interface{}) error
13 | }
14 |
15 | type EbpfMap struct {
16 | innerMap *ebpf.Map
17 | }
18 |
19 | type EbpfMapIterator struct {
20 | innerIterator *ebpf.MapIterator
21 | }
22 |
23 | func (m *EbpfMap) Lookup(key interface{}, val interface{}) error {
24 | return m.innerMap.Lookup(key, val)
25 | }
26 |
27 | func (m *EbpfMap) Iterate() IEbpfMapIterator {
28 | return &EbpfMapIterator{innerIterator: m.innerMap.Iterate()}
29 | }
30 |
31 | func (m *EbpfMap) Delete(key interface{}) error {
32 | return m.innerMap.Delete(key)
33 | }
34 |
35 | func (it *EbpfMapIterator) Next(key interface{}, val interface{}) bool {
36 | return it.innerIterator.Next(key, val)
37 | }
38 |
--------------------------------------------------------------------------------
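[Note] These thin wrappers let LinksTracer depend on interfaces rather than on cilium/ebpf directly, which is what makes the mock connections map in links_tracer_test.go possible. A compile-time assertion (a common Go idiom, not present in this file, placed inside package caretta) would make that contract explicit:

// a build error on either line would flag drift between wrapper and interface
var _ IEbpfMap = (*EbpfMap)(nil)
var _ IEbpfMapIterator = (*EbpfMapIterator)(nil)

--------------------------------------------------------------------------------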
/pkg/caretta/links_tracer.go:
--------------------------------------------------------------------------------
1 | package caretta
2 |
3 | import (
4 | "encoding/binary"
5 | "errors"
6 | "log"
7 | "net"
8 |
9 | "github.com/groundcover-com/caretta/pkg/k8s"
10 | "github.com/groundcover-com/caretta/pkg/tracing"
11 |
12 | "github.com/prometheus/client_golang/prometheus"
13 | "github.com/prometheus/client_golang/prometheus/promauto"
14 | )
15 |
16 | var (
17 | pollsMade = promauto.NewCounter(prometheus.CounterOpts{
18 | Name: "caretta_polls_made",
19 | Help: "Counter of polls made by caretta",
20 | })
21 | failedConnectionDeletion = promauto.NewCounter(prometheus.CounterOpts{
22 | Name: "caretta_failed_deletions",
23 | Help: "Counter of failed deletion of closed connection from map",
24 | })
25 | filteredLoopbackConnections = promauto.NewGauge(prometheus.GaugeOpts{
26 | Name: "caretta_current_loopback_connections",
27 | Help: `Number of loopback connections observed in the last iteration`,
28 | })
29 | mapSize = promauto.NewGauge(prometheus.GaugeOpts{
30 | Name: "caretta_ebpf_connections_map_size",
31 | Help: "number of items in the connections map iterated from user space per iteration",
32 | })
33 | mapDeletions = promauto.NewCounter(prometheus.CounterOpts{
34 | Name: "caretta_connection_deletions",
35 | Help: "total number of deletions from the map done by the userspace",
36 | })
37 | )
38 |
39 | type IPResolver interface {
40 | ResolveIP(string) k8s.Workload
41 | StartWatching() error
42 | StopWatching()
43 | }
44 |
45 | type Probes interface {
46 | UnloadProbes() error
47 | }
48 |
49 | type LinksTracer struct {
50 | ebpfObjects Probes
51 | connections IEbpfMap
52 | resolver IPResolver
53 | }
54 |
55 | // initializes a LinksTracer object
56 | func NewTracer(resolver *k8s.K8sIPResolver) LinksTracer {
57 | tracer := LinksTracer{resolver: resolver}
58 | return tracer
59 | }
60 |
61 | func NewTracerWithObjs(resolver IPResolver, connections IEbpfMap, probes Probes) LinksTracer {
62 | return LinksTracer{
63 | ebpfObjects: probes,
64 | connections: connections,
65 | resolver: resolver,
66 | }
67 | }
68 |
69 | func (tracer *LinksTracer) Start() error {
70 | objs, connMap, err := tracing.LoadProbes()
71 | if err != nil {
72 | return err
73 | }
74 |
75 | tracer.ebpfObjects = &objs
76 | tracer.connections = &EbpfMap{innerMap: connMap}
77 | return nil
78 | }
79 |
80 | func (tracer *LinksTracer) Stop() error {
81 | tracer.resolver.StopWatching()
82 | return tracer.ebpfObjects.UnloadProbes()
83 | }
84 |
85 | // a single poll of the eBPF maps,
86 | // iterating the traces from kernel space and summing each network link
87 | func (tracer *LinksTracer) TracesPollingIteration(pastLinks map[NetworkLink]uint64) (map[NetworkLink]uint64, map[NetworkLink]uint64, []TcpConnection) {
88 | // outline of an iteration -
89 | // filter unwanted connections, sum all connections as links, add past links, and return the new map
90 | pollsMade.Inc()
91 | loopbackCounter := 0
92 |
93 | currentLinks := make(map[NetworkLink]uint64)
94 | currentTcpConnections := []TcpConnection{}
95 | var connectionsToDelete []ConnectionIdentifier
96 |
97 | var conn ConnectionIdentifier
98 | var throughput ConnectionThroughputStats
99 |
100 | entries := tracer.connections.Iterate()
101 | // iterate the map from the eBPF program
102 | itemsCounter := 0
103 | for entries.Next(&conn, &throughput) {
104 | itemsCounter += 1
105 | // filter out unwanted connections
106 |
107 | if throughput.IsActive == 0 {
108 | connectionsToDelete = append(connectionsToDelete, conn)
109 | }
110 |
111 | // skip loopback connections
112 | if conn.Tuple.SrcIp == conn.Tuple.DstIp && isAddressLoopback(conn.Tuple.DstIp) {
113 | loopbackCounter++
114 | continue
115 | }
116 |
117 | // skip connections with an unknown role (likely indicates a bug)
118 | link, err := tracer.reduceConnectionToLink(conn)
119 | if conn.Role == UnknownConnectionRole || err != nil {
120 | continue
121 | }
122 |
123 | tcpConn, err := tracer.reduceConnectionToTcp(conn, throughput)
124 | if err != nil {
125 | continue
126 | }
127 |
128 | currentLinks[link] += throughput.BytesSent
129 | currentTcpConnections = append(currentTcpConnections, tcpConn)
130 | }
131 |
132 | mapSize.Set(float64(itemsCounter))
133 | filteredLoopbackConnections.Set(float64(loopbackCounter))
134 |
135 | // add past links
136 | for pastLink, pastThroughput := range pastLinks {
137 | currentLinks[pastLink] += pastThroughput
138 | }
139 |
140 | // delete connections marked to delete
141 | for _, conn := range connectionsToDelete {
142 | tracer.deleteAndStoreConnection(&conn, pastLinks)
143 | }
144 |
145 | return pastLinks, currentLinks, currentTcpConnections
146 |
147 | }
148 |
149 | func (tracer *LinksTracer) deleteAndStoreConnection(conn *ConnectionIdentifier, pastLinks map[NetworkLink]uint64) {
150 | // newer kernels introduce batched map operations, but they might not be available, so we delete item-by-item
151 | var throughput ConnectionThroughputStats
152 | err := tracer.connections.Lookup(conn, &throughput)
153 | if err != nil {
154 | log.Printf("Error retrieving connection to delete, skipping it: %v", err)
155 | failedConnectionDeletion.Inc()
156 | return
157 | }
158 | err = tracer.connections.Delete(conn)
159 | if err != nil {
160 | log.Printf("Error deleting connection from map: %v", err)
161 | failedConnectionDeletion.Inc()
162 | return
163 | }
164 | // if deletion is successful, add it to past links
165 | link, err := tracer.reduceConnectionToLink(*conn)
166 | if err != nil {
167 | log.Printf("Error reducing connection to link when deleting: %v", err)
168 | return
169 | }
170 |
171 | pastLinks[link] += throughput.BytesSent
172 |
173 | mapDeletions.Inc()
174 | }
175 |
176 | // reduce a specific connection to a general link
177 | func (tracer *LinksTracer) reduceConnectionToLink(connection ConnectionIdentifier) (NetworkLink, error) {
178 | var link NetworkLink
179 | link.Role = connection.Role
180 |
181 | srcWorkload := tracer.resolver.ResolveIP(IP(connection.Tuple.SrcIp).String())
182 | dstWorkload := tracer.resolver.ResolveIP(IP(connection.Tuple.DstIp).String())
183 |
184 | if connection.Role == ClientConnectionRole {
185 | // Src is Client, Dst is Server, Port is DstPort
186 | link.Client = srcWorkload
187 | link.Server = dstWorkload
188 | link.ServerPort = connection.Tuple.DstPort
189 | } else if connection.Role == ServerConnectionRole {
190 | // Dst is Client, Src is Server, Port is SrcPort
191 | link.Client = dstWorkload
192 | link.Server = srcWorkload
193 | link.ServerPort = connection.Tuple.SrcPort
194 | } else {
195 | return NetworkLink{}, errors.New("connection's role is unknown")
196 | }
197 | return link, nil
198 | }
199 |
200 | // reduce a specific connection to a general tcp connection
201 | func (tracer *LinksTracer) reduceConnectionToTcp(connection ConnectionIdentifier, throughput ConnectionThroughputStats) (TcpConnection, error) {
202 | var tcpConn TcpConnection
203 | tcpConn.Role = connection.Role
204 |
205 | srcWorkload := tracer.resolver.ResolveIP(IP(connection.Tuple.SrcIp).String())
206 | dstWorkload := tracer.resolver.ResolveIP(IP(connection.Tuple.DstIp).String())
207 |
208 | if connection.Role == ClientConnectionRole {
209 | // Src is Client, Dst is Server, Port is DstPort
210 | tcpConn.Client = srcWorkload
211 | tcpConn.Server = dstWorkload
212 | tcpConn.ServerPort = connection.Tuple.DstPort
213 | tcpConn.State = TcpConnectionOpenState
214 | } else if connection.Role == ServerConnectionRole {
215 | // Dst is Client, Src is Server, Port is SrcPort
216 | tcpConn.Client = dstWorkload
217 | tcpConn.Server = srcWorkload
218 | tcpConn.ServerPort = connection.Tuple.SrcPort
219 | tcpConn.State = TcpConnectionAcceptState
220 | } else {
221 | return TcpConnection{}, errors.New("connection's role is unknown")
222 | }
223 |
224 | if throughput.IsActive == 0 {
225 | tcpConn.State = TcpConnectionClosedState
226 | }
227 |
228 | return tcpConn, nil
229 | }
230 |
231 | func isAddressLoopback(ip uint32) bool {
232 | ipAddr := make(net.IP, 4)
233 | binary.LittleEndian.PutUint32(ipAddr, ip)
234 | return ipAddr.IsLoopback()
235 | }
236 |
--------------------------------------------------------------------------------
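[Note] The pastLinks map is what keeps per-link byte counters monotonic: when a connection goes inactive it is deleted from the kernel map, but its final byte count is folded into pastLinks, which is re-added on every later poll. A toy sketch of that bookkeeping with plain maps and a string link key:

package main

import "fmt"

func main() {
	past := map[string]uint64{}

	// poll 1: one live connection on link "frontend->backend", 10 bytes sent
	current := map[string]uint64{"frontend->backend": 10}
	for link, bytes := range past {
		current[link] += bytes
	}
	fmt.Println(current["frontend->backend"]) // 10

	// the connection closes: its final count moves into past before the next poll
	past["frontend->backend"] += 10

	// poll 2: nothing live anymore, but the reported total does not regress
	current = map[string]uint64{}
	for link, bytes := range past {
		current[link] += bytes
	}
	fmt.Println(current["frontend->backend"]) // still 10
}

--------------------------------------------------------------------------------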
/pkg/caretta/links_tracer_test.go:
--------------------------------------------------------------------------------
1 | package caretta_test
2 |
3 | import (
4 | "errors"
5 | "fmt"
6 | "testing"
7 |
8 | "github.com/groundcover-com/caretta/pkg/caretta"
9 |
10 | "github.com/groundcover-com/caretta/pkg/k8s"
11 | "github.com/stretchr/testify/assert"
12 | )
13 |
14 | // Defines a mock connections map. This is not a complete implementation of a map with an iterator.
15 | type MockConnectionsMapIterator struct {
16 | innerMap map[caretta.ConnectionIdentifier]caretta.ConnectionThroughputStats
17 | keys []caretta.ConnectionIdentifier
18 | count int
19 | }
20 |
21 | func (mi *MockConnectionsMapIterator) Next(conn interface{}, throughput interface{}) bool {
22 | assertedConn, ok := conn.(*caretta.ConnectionIdentifier)
23 | if !ok {
24 | return false
25 | }
26 | assertedThroughput, ok := throughput.(*caretta.ConnectionThroughputStats)
27 | if !ok {
28 | return false
29 | }
30 | if mi.count < len(mi.keys) {
31 | *assertedConn = mi.keys[mi.count]
32 | *assertedThroughput = mi.innerMap[*assertedConn]
33 | mi.count++
34 | return true
35 | }
36 |
37 | return false
38 | }
39 |
40 | type MockConnectionsMap struct {
41 | innerMap map[caretta.ConnectionIdentifier]caretta.ConnectionThroughputStats
42 | }
43 |
44 | func NewMockConnectionsMap() *MockConnectionsMap {
45 | return &MockConnectionsMap{innerMap: make(map[caretta.ConnectionIdentifier]caretta.ConnectionThroughputStats)}
46 | }
47 |
48 | func (m *MockConnectionsMap) Lookup(conn interface{}, throughput interface{}) error {
49 | assertedConn, ok := conn.(*caretta.ConnectionIdentifier)
50 | if !ok {
51 | return errors.New("wrong type for Lookup")
52 | }
53 | assertedThroughput, ok := throughput.(*caretta.ConnectionThroughputStats)
54 | if !ok {
55 | return errors.New("wrong type for Lookup")
56 | }
57 | *assertedThroughput, ok = m.innerMap[*assertedConn]
58 | if !ok {
59 | return errors.New("Key not in map")
60 | }
61 | return nil
62 | }
63 |
64 | func (m *MockConnectionsMap) Iterate() caretta.IEbpfMapIterator {
65 | keys := make([]caretta.ConnectionIdentifier, 0, len(m.innerMap))
66 | for ci := range m.innerMap {
67 | keys = append(keys, ci)
68 | }
69 |
70 | return &MockConnectionsMapIterator{innerMap: m.innerMap, keys: keys, count: 0}
71 | }
72 |
73 | func (m *MockConnectionsMap) Delete(key interface{}) error {
74 | assertedKey, ok := key.(*caretta.ConnectionIdentifier)
75 | if !ok {
76 | return errors.New("wrong type in delete")
77 | }
78 | delete(m.innerMap, *assertedKey)
79 | return nil
80 | }
81 |
82 | func (m *MockConnectionsMap) Update(key caretta.ConnectionIdentifier, value caretta.ConnectionThroughputStats) {
83 | m.innerMap[key] = value
84 | }
85 |
86 | type MockResolver struct{}
87 |
88 | func (resolver *MockResolver) ResolveIP(ip string) k8s.Workload {
89 | return k8s.Workload{
90 | Name: ip,
91 | Namespace: "Namespace",
92 | Kind: "Kind",
93 | }
94 | }
95 |
96 | func (resolver *MockResolver) StartWatching() error {
97 | return nil
98 | }
99 | func (resolver *MockResolver) StopWatching() {}
100 |
101 | type testConnection struct {
102 | connId caretta.ConnectionIdentifier
103 | throughput caretta.ConnectionThroughputStats
104 | }
105 |
106 | type aggregationTest struct {
107 | description string
108 | connections []testConnection
109 | expectedLink caretta.NetworkLink
110 | expectedThroughput uint64
111 | }
112 |
113 | var clientTuple = caretta.ConnectionTuple{
114 | SrcIp: 1,
115 | DstIp: 2,
116 | SrcPort: 55555,
117 | DstPort: 80,
118 | }
119 | var serverTuple = caretta.ConnectionTuple{
120 | DstIp: 1,
121 | SrcIp: 2,
122 | DstPort: 55555,
123 | SrcPort: 80,
124 | }
125 | var activeThroughput = caretta.ConnectionThroughputStats{
126 | BytesSent: 10,
127 | BytesReceived: 2,
128 | IsActive: 1,
129 | }
130 | var inactiveThroughput = caretta.ConnectionThroughputStats{
131 | BytesSent: 10,
132 | BytesReceived: 2,
133 | IsActive: 0,
134 | }
135 | var clientLink = caretta.NetworkLink{
136 | Client: k8s.Workload{
137 | Name: caretta.IP(1).String(),
138 | Namespace: "Namespace",
139 | Kind: "Kind",
140 | },
141 | Server: k8s.Workload{
142 | Name: caretta.IP(2).String(),
143 | Namespace: "Namespace",
144 | Kind: "Kind",
145 | },
146 | ServerPort: 80,
147 | Role: caretta.ClientConnectionRole,
148 | }
149 | var serverLink = caretta.NetworkLink{
150 | Client: k8s.Workload{
151 | Name: caretta.IP(1).String(),
152 | Namespace: "Namespace",
153 | Kind: "Kind",
154 | },
155 | Server: k8s.Workload{
156 | Name: caretta.IP(2).String(),
157 | Namespace: "Namespace",
158 | Kind: "Kind",
159 | },
160 | ServerPort: 80,
161 | Role: caretta.ServerConnectionRole,
162 | }
163 |
164 | func TestAggregations(t *testing.T) {
165 | var aggregationTests = []aggregationTest{
166 | {
167 | description: "single client connection create correct link",
168 | connections: []testConnection{
169 | {
170 | connId: caretta.ConnectionIdentifier{
171 | Id: 1,
172 | Pid: 1,
173 | Tuple: clientTuple,
174 | Role: caretta.ClientConnectionRole,
175 | },
176 | throughput: activeThroughput,
177 | },
178 | },
179 | expectedLink: clientLink,
180 | expectedThroughput: activeThroughput.BytesSent,
181 | },
182 | {
183 | description: "single server connection create correct link",
184 | connections: []testConnection{
185 | {
186 | connId: caretta.ConnectionIdentifier{
187 | Id: 1,
188 | Pid: 1,
189 | Tuple: serverTuple,
190 | Role: caretta.ServerConnectionRole,
191 | },
192 | throughput: activeThroughput,
193 | },
194 | },
195 | expectedLink: serverLink,
196 | expectedThroughput: activeThroughput.BytesSent,
197 | },
198 | {
199 | description: "2 client connections aggregate both to one",
200 | connections: []testConnection{
201 | {
202 | connId: caretta.ConnectionIdentifier{
203 | Id: 1,
204 | Pid: 1,
205 | Tuple: clientTuple,
206 | Role: caretta.ClientConnectionRole,
207 | },
208 | throughput: activeThroughput,
209 | },
210 | {
211 | connId: caretta.ConnectionIdentifier{
212 | Id: 2,
213 | Pid: 1,
214 | Tuple: clientTuple,
215 | Role: caretta.ClientConnectionRole,
216 | },
217 | throughput: activeThroughput,
218 | },
219 | },
220 | expectedLink: clientLink,
221 | expectedThroughput: 2 * activeThroughput.BytesSent,
222 | },
223 | {
224 | description: "2 server connections aggregate both to one",
225 | connections: []testConnection{
226 | {
227 | connId: caretta.ConnectionIdentifier{
228 | Id: 1,
229 | Pid: 1,
230 | Tuple: serverTuple,
231 | Role: caretta.ServerConnectionRole,
232 | },
233 | throughput: activeThroughput,
234 | },
235 | {
236 | connId: caretta.ConnectionIdentifier{
237 | Id: 2,
238 | Pid: 1,
239 | Tuple: serverTuple,
240 | Role: caretta.ServerConnectionRole,
241 | },
242 | throughput: activeThroughput,
243 | },
244 | },
245 | expectedLink: serverLink,
246 | expectedThroughput: 2 * activeThroughput.BytesSent,
247 | },
248 | {
249 | description: "3 active client connections, 2 inactive aggregate all to one",
250 | connections: []testConnection{
251 | {
252 | connId: caretta.ConnectionIdentifier{
253 | Id: 1,
254 | Pid: 1,
255 | Tuple: clientTuple,
256 | Role: caretta.ClientConnectionRole,
257 | },
258 | throughput: activeThroughput,
259 | },
260 | {
261 | connId: caretta.ConnectionIdentifier{
262 | Id: 2,
263 | Pid: 1,
264 | Tuple: clientTuple,
265 | Role: caretta.ClientConnectionRole,
266 | },
267 | throughput: activeThroughput,
268 | },
269 | {
270 | connId: caretta.ConnectionIdentifier{
271 | Id: 3,
272 | Pid: 1,
273 | Tuple: clientTuple,
274 | Role: caretta.ClientConnectionRole,
275 | },
276 | throughput: activeThroughput,
277 | },
278 | {
279 | connId: caretta.ConnectionIdentifier{
280 | Id: 4,
281 | Pid: 1,
282 | Tuple: clientTuple,
283 | Role: caretta.ClientConnectionRole,
284 | },
285 | throughput: inactiveThroughput,
286 | },
287 | {
288 | connId: caretta.ConnectionIdentifier{
289 | Id: 5,
290 | Pid: 1,
291 | Tuple: clientTuple,
292 | Role: caretta.ClientConnectionRole,
293 | },
294 | throughput: inactiveThroughput,
295 | },
296 | },
297 | expectedLink: clientLink,
298 | expectedThroughput: 3*activeThroughput.BytesSent + 2*inactiveThroughput.BytesSent,
299 | },
300 | {
301 | description: "3 active server connections, 2 inactive aggregate all to one",
302 | connections: []testConnection{
303 | {
304 | connId: caretta.ConnectionIdentifier{
305 | Id: 1,
306 | Pid: 1,
307 | Tuple: serverTuple,
308 | Role: caretta.ServerConnectionRole,
309 | },
310 | throughput: activeThroughput,
311 | },
312 | {
313 | connId: caretta.ConnectionIdentifier{
314 | Id: 2,
315 | Pid: 1,
316 | Tuple: serverTuple,
317 | Role: caretta.ServerConnectionRole,
318 | },
319 | throughput: activeThroughput,
320 | },
321 | {
322 | connId: caretta.ConnectionIdentifier{
323 | Id: 3,
324 | Pid: 1,
325 | Tuple: serverTuple,
326 | Role: caretta.ServerConnectionRole,
327 | },
328 | throughput: activeThroughput,
329 | },
330 | {
331 | connId: caretta.ConnectionIdentifier{
332 | Id: 4,
333 | Pid: 1,
334 | Tuple: serverTuple,
335 | Role: caretta.ServerConnectionRole,
336 | },
337 | throughput: inactiveThroughput,
338 | },
339 | {
340 | connId: caretta.ConnectionIdentifier{
341 | Id: 5,
342 | Pid: 1,
343 | Tuple: serverTuple,
344 | Role: caretta.ServerConnectionRole,
345 | },
346 | throughput: inactiveThroughput,
347 | },
348 | },
349 | expectedLink: serverLink,
350 | expectedThroughput: 3*activeThroughput.BytesSent + 2*inactiveThroughput.BytesSent,
351 | },
352 | }
353 | for _, test := range aggregationTests {
354 | t.Run(test.description, func(t *testing.T) {
355 | assert := assert.New(t)
356 | m := NewMockConnectionsMap()
357 |
358 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil)
359 | pastLinks := make(map[caretta.NetworkLink]uint64)
360 | var currentLinks map[caretta.NetworkLink]uint64
361 | for _, connection := range test.connections {
362 | m.Update(connection.connId, connection.throughput)
363 | _, currentLinks, _ = tracer.TracesPollingIteration(pastLinks)
364 | }
365 | resultThroughput, ok := currentLinks[test.expectedLink]
366 | assert.True(ok, "expected link not in result map")
367 | assert.Equal(test.expectedThroughput, resultThroughput, "wrong throughput value")
368 | })
369 |
370 | }
371 | }
372 |
373 | func TestDeletion_ActiveConnection_NotDeleted(t *testing.T) {
374 | assert := assert.New(t)
375 |
376 | // Arrange mock map, initial connection
377 | m := NewMockConnectionsMap()
378 |
379 | conn1 := caretta.ConnectionIdentifier{
380 | Id: 1,
381 | Pid: 1,
382 | Tuple: serverTuple,
383 | Role: caretta.ServerConnectionRole,
384 | }
385 | throughput1 := activeThroughput
386 |
387 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil)
388 |
389 | pastLinks := make(map[caretta.NetworkLink]uint64)
390 |
391 | // Act
392 | m.Update(conn1, throughput1)
393 | _, currentLinks, _ := tracer.TracesPollingIteration(pastLinks)
394 |
395 | // Assert
396 | resultThroughput, ok := currentLinks[serverLink]
397 | assert.True(ok, "link not in map, map is %v", currentLinks)
398 | assert.Equal(throughput1.BytesSent, resultThroughput)
399 |
400 | var testThroughput caretta.ConnectionThroughputStats
401 |
402 | err := m.Lookup(&conn1, &testThroughput)
403 | assert.NoError(err, "connection should stay on the map")
404 | }
405 |
406 | func TestDeletion_InactiveConnection_AddedToPastLinksAndRemovedFromMap(t *testing.T) {
407 | assert := assert.New(t)
408 |
409 | // Arrange mock map, initial connection
410 | m := NewMockConnectionsMap()
411 |
412 | conn1 := caretta.ConnectionIdentifier{
413 | Id: 1,
414 | Pid: 1,
415 | Tuple: serverTuple,
416 | Role: caretta.ServerConnectionRole,
417 | }
418 | throughput1 := activeThroughput
419 | m.Update(conn1, throughput1)
420 |
421 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil)
422 |
423 | pastLinks := make(map[caretta.NetworkLink]uint64)
424 |
425 | pastLinks, _, _ = tracer.TracesPollingIteration(pastLinks)
426 |
427 | // Act: update the throughput so the connection is inactive, and iterate
428 | throughput2 := inactiveThroughput
429 | m.Update(conn1, throughput2)
430 | pastLinks, currentLinks, _ := tracer.TracesPollingIteration(pastLinks)
431 |
432 | // Assert: check the past connection is both in past links and in current links
433 | resultThroughput, ok := currentLinks[serverLink]
434 | assert.True(ok, "link not in map, map is %v", currentLinks)
435 | assert.Equal(throughput1.BytesSent, resultThroughput)
436 | _, ok = pastLinks[serverLink]
437 | assert.True(ok, "inactive link not in past links: %v", pastLinks)
438 |
439 | var testThroughput caretta.ConnectionThroughputStats
440 | err := m.Lookup(&conn1, &testThroughput)
441 | assert.Error(err, fmt.Sprintf("inactive connection not deleted from connections map, val is %d", testThroughput.BytesSent))
442 | }
443 |
444 | func TestDeletion_InactiveConnection_NewConnectionAfterDeletionUpdatesCorrectly(t *testing.T) {
445 | assert := assert.New(t)
446 |
447 | // Arrange mock map, initial connection, inactive connection
448 | m := NewMockConnectionsMap()
449 |
450 | conn1 := caretta.ConnectionIdentifier{
451 | Id: 1,
452 | Pid: 1,
453 | Tuple: serverTuple,
454 | Role: caretta.ServerConnectionRole,
455 | }
456 | throughput1 := activeThroughput
457 | m.Update(conn1, throughput1)
458 |
459 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil)
460 |
461 | pastLinks := make(map[caretta.NetworkLink]uint64)
462 |
463 | // update the throughput so the connection is inactive
464 | throughput2 := inactiveThroughput
465 | m.Update(conn1, throughput2)
466 | pastLinks, _, _ = tracer.TracesPollingIteration(pastLinks)
467 |
468 | // Act: new connection, same link
469 | throughput3 := activeThroughput
470 | m.Update(conn1, throughput3)
471 | _, currentLinks, _ := tracer.TracesPollingIteration(pastLinks)
472 |
473 | // Assert the new connection is aggregated correctly
474 | resultThroughput, ok := currentLinks[serverLink]
475 | assert.True(ok, "link not in map, map is %v", currentLinks)
476 | assert.Equal(throughput1.BytesSent+throughput3.BytesSent, resultThroughput)
477 | }
478 |
479 | func TestConnectionState_Open(t *testing.T) {
480 | assert := assert.New(t)
481 |
482 | // Arrange mock map, initial connection
483 | m := NewMockConnectionsMap()
484 |
485 | conn1 := caretta.ConnectionIdentifier{
486 | Id: 1,
487 | Pid: 1,
488 | Tuple: serverTuple,
489 | Role: caretta.ClientConnectionRole,
490 | }
491 | throughput1 := activeThroughput
492 |
493 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil)
494 |
495 | pastLinks := make(map[caretta.NetworkLink]uint64)
496 |
497 | // Act
498 | m.Update(conn1, throughput1)
499 | _, _, currentConnections := tracer.TracesPollingIteration(pastLinks)
500 |
501 | // Assert
502 | assert.Equal(1, len(currentConnections))
503 | // Inspect the first (and only) connection in the slice
504 | for _, tcp := range currentConnections {
505 | assert.Equal(uint32(caretta.TcpConnectionOpenState), tcp.State)
506 | break
507 | }
508 | }
509 |
510 | func TestConnectionState_Close(t *testing.T) {
511 | assert := assert.New(t)
512 |
513 | // Arrange mock map, initial connection
514 | m := NewMockConnectionsMap()
515 |
516 | conn1 := caretta.ConnectionIdentifier{
517 | Id: 1,
518 | Pid: 1,
519 | Tuple: serverTuple,
520 | Role: caretta.ServerConnectionRole,
521 | }
522 | throughput1 := inactiveThroughput
523 |
524 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil)
525 |
526 | pastLinks := make(map[caretta.NetworkLink]uint64)
527 |
528 | // Act
529 | m.Update(conn1, throughput1)
530 | _, _, currentConnections := tracer.TracesPollingIteration(pastLinks)
531 |
532 | // Assert
533 | assert.Equal(1, len(currentConnections))
534 | for _, tcp := range currentConnections {
535 | assert.Equal(uint32(caretta.TcpConnectionClosedState), tcp.State)
536 | break
537 | }
538 | }
539 |
540 | func TestConnectionState_Accept(t *testing.T) {
541 | assert := assert.New(t)
542 |
543 | // Arrange mock map, initial connection
544 | m := NewMockConnectionsMap()
545 |
546 | conn1 := caretta.ConnectionIdentifier{
547 | Id: 1,
548 | Pid: 1,
549 | Tuple: serverTuple,
550 | Role: caretta.ServerConnectionRole,
551 | }
552 | throughput1 := activeThroughput
553 |
554 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil)
555 |
556 | pastLinks := make(map[caretta.NetworkLink]uint64)
557 |
558 | // Act
559 | m.Update(conn1, throughput1)
560 | _, _, currentConnections := tracer.TracesPollingIteration(pastLinks)
561 |
562 | // Assert
563 | assert.Equal(1, len(currentConnections))
564 | for _, tcp := range currentConnections {
565 | assert.Equal(uint32(caretta.TcpConnectionAcceptState), tcp.State)
566 | break
567 | }
568 | }
569 |
570 | func TestConnectionState_UnknownRole(t *testing.T) {
571 | assert := assert.New(t)
572 |
573 | // Arrange mock map, initial connection
574 | m := NewMockConnectionsMap()
575 |
576 | conn1 := caretta.ConnectionIdentifier{
577 | Id: 1,
578 | Pid: 1,
579 | Tuple: serverTuple,
580 | Role: caretta.UnknownConnectionRole,
581 | }
582 | throughput1 := activeThroughput
583 |
584 | tracer := caretta.NewTracerWithObjs(&MockResolver{}, m, nil)
585 |
586 | pastLinks := make(map[caretta.NetworkLink]uint64)
587 |
588 | // Act
589 | m.Update(conn1, throughput1)
590 | _, _, currentConnections := tracer.TracesPollingIteration(pastLinks)
591 |
592 | // Assert
593 | assert.Equal(0, len(currentConnections))
594 | }
595 |
--------------------------------------------------------------------------------
/pkg/caretta/types.go:
--------------------------------------------------------------------------------
1 | package caretta
2 |
3 | import (
4 | "encoding/binary"
5 | "net"
6 |
7 | caretta_k8s "github.com/groundcover-com/caretta/pkg/k8s"
8 | )
9 |
10 | const (
11 | UnknownConnectionRole = iota
12 | ClientConnectionRole = iota
13 | ServerConnectionRole = iota
14 | TcpConnectionOpenState = iota
15 | TcpConnectionAcceptState = iota
16 | TcpConnectionClosedState = iota
17 | )
18 |
19 | type IP uint32
20 |
21 | func (ip IP) String() string {
22 | netIp := make(net.IP, 4)
23 | binary.LittleEndian.PutUint32(netIp, uint32(ip))
24 | return netIp.String()
25 | }
26 |
27 | // "final" type of link, like an edge on the graph
28 | type NetworkLink struct {
29 | Client caretta_k8s.Workload
30 | Server caretta_k8s.Workload
31 | ServerPort uint16
32 | Role uint32
33 | }
34 |
35 | type TcpConnection struct {
36 | Client caretta_k8s.Workload
37 | Server caretta_k8s.Workload
38 | ServerPort uint16
39 | Role uint32
40 | State uint32
41 | }
42 |
43 | type ConnectionTuple struct {
44 | SrcIp uint32
45 | DstIp uint32
46 | SrcPort uint16
47 | DstPort uint16
48 | }
49 |
50 | type ConnectionIdentifier struct {
51 | Id uint32
52 | Pid uint32
53 | Tuple ConnectionTuple
54 | Role uint32
55 | }
56 |
57 | type ConnectionThroughputStats struct {
58 | BytesSent uint64
59 | BytesReceived uint64
60 | IsActive uint64
61 | }
62 |
--------------------------------------------------------------------------------
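[Note] IP.String assumes the uint32 stores the address so that its least-significant byte is the first octet; writing it back with LittleEndian recovers the dotted quad. For example:

package main

import (
	"encoding/binary"
	"fmt"
	"net"
)

func main() {
	// 0x0100007f laid out little-endian is the byte sequence 7f 00 00 01,
	// which net.IP renders as 127.0.0.1
	ip := make(net.IP, 4)
	binary.LittleEndian.PutUint32(ip, 0x0100007f)
	fmt.Println(ip.String()) // 127.0.0.1
}

--------------------------------------------------------------------------------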
/pkg/k8s/ipresolver.go:
--------------------------------------------------------------------------------
1 | package k8s
2 |
3 | import (
4 | "context"
5 | "errors"
6 | "fmt"
7 | "log"
8 | "net"
9 | "sync"
10 | "time"
11 |
12 | "k8s.io/apimachinery/pkg/watch"
13 | "k8s.io/client-go/kubernetes"
14 |
15 | lrucache "github.com/hashicorp/golang-lru/v2"
16 | "github.com/prometheus/client_golang/prometheus"
17 | "github.com/prometheus/client_golang/prometheus/promauto"
18 | appsv1 "k8s.io/api/apps/v1"
19 | batchv1 "k8s.io/api/batch/v1"
20 | "k8s.io/api/batch/v1beta1"
21 | v1 "k8s.io/api/core/v1"
22 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
23 | )
24 |
25 | const MAX_RESOLVED_DNS = 10000 // arbitrary limit
26 | var reregisterWatchSleepDuration = 1 * time.Second
27 |
28 | var (
29 | watchEventsCounter = promauto.NewCounterVec(prometheus.CounterOpts{
30 | Name: "caretta_watcher_events_count",
31 | }, []string{"object_type"})
32 | watchResetsCounter = promauto.NewCounterVec(prometheus.CounterOpts{
33 | Name: "caretta_watcher_resets_count",
34 | }, []string{"object_type"})
35 | )
36 |
37 | type clusterSnapshot struct {
38 | Pods sync.Map // map[types.UID]v1.Pod
39 | Nodes sync.Map // map[types.UID]v1.Node
40 | ReplicaSets sync.Map // map[types.UID]appsv1.ReplicaSet
41 | DaemonSets sync.Map // map[types.UID]appsv1.DaemonSet
42 | StatefulSets sync.Map // map[types.UID]appsv1.StatefulSet
43 | Jobs sync.Map // map[types.UID]batchv1.Job
44 | Services sync.Map // map[types.UID]v1.Service
45 | Deployments sync.Map // map[types.UID]appsv1.Deployment
46 | CronJobs sync.Map // map[types.UID]batchv1.CronJob or v1beta1.CronJob
47 | PodDescriptors sync.Map // map[types.UID]Workload
48 | }
49 |
50 | type K8sIPResolver struct {
51 | clientset kubernetes.Interface
52 | snapshot clusterSnapshot
53 | ipsMap sync.Map
54 | stopSignal chan bool
55 | shouldResolveDns bool
56 | traverseUpHierarchy bool
57 | dnsResolvedIps *lrucache.Cache[string, string]
58 | }
59 |
60 | type Workload struct {
61 | Name string
62 | Namespace string
63 | Kind string
64 | Owner string
65 | }
66 |
67 | func NewK8sIPResolver(clientset kubernetes.Interface, resolveDns bool, traverseUpHierarchy bool) (*K8sIPResolver, error) {
68 | var dnsCache *lrucache.Cache[string, string]
69 | if resolveDns {
70 | var err error
71 | dnsCache, err = lrucache.New[string, string](MAX_RESOLVED_DNS)
72 | if err != nil {
73 | return nil, err
74 | }
75 | } else {
76 | dnsCache = nil
77 | }
78 | return &K8sIPResolver{
79 | clientset: clientset,
80 | snapshot: clusterSnapshot{},
81 | ipsMap: sync.Map{},
82 | stopSignal: make(chan bool),
83 | shouldResolveDns: resolveDns,
84 | dnsResolvedIps: dnsCache,
85 | traverseUpHierarchy: traverseUpHierarchy,
86 | }, nil
87 | }
88 |
89 | // resolve the given IP from the resolver's cache;
90 | // if not cached, fall back to reverse DNS (when enabled) or the IP itself, marked as external
91 | func (resolver *K8sIPResolver) ResolveIP(ip string) Workload {
92 | if val, ok := resolver.ipsMap.Load(ip); ok {
93 | entry, ok := val.(Workload)
94 | if ok {
95 | return entry
96 | }
97 | log.Printf("type confusion in ipsMap")
98 | }
99 | host := ip
100 |
101 | if resolver.shouldResolveDns {
102 | val, ok := resolver.dnsResolvedIps.Get(ip)
103 | if ok {
104 | host = val
105 | } else {
106 | hosts, err := net.LookupAddr(ip)
107 | if err == nil && len(hosts) > 0 {
108 | host = hosts[0]
109 | }
110 | resolver.dnsResolvedIps.Add(ip, host)
111 | }
112 | }
113 | return Workload{
114 | Name: host,
115 | Namespace: "external",
116 | Kind: "external",
117 | }
118 | }
119 |
120 | func (resolver *K8sIPResolver) StartWatching() error {
121 | // register watchers
122 | podsWatcher, err := resolver.clientset.CoreV1().Pods("").Watch(context.Background(), metav1.ListOptions{})
123 | if err != nil {
124 | return fmt.Errorf("error watching pods changes - %v", err)
125 | }
126 |
127 | nodesWatcher, err := resolver.clientset.CoreV1().Nodes().Watch(context.Background(), metav1.ListOptions{})
128 | if err != nil {
129 | return fmt.Errorf("error watching nodes changes - %v", err)
130 | }
131 |
132 | replicasetsWatcher, err := resolver.clientset.AppsV1().ReplicaSets("").Watch(context.Background(), metav1.ListOptions{})
133 | if err != nil {
134 | return fmt.Errorf("error watching replicasets changes - %v", err)
135 | }
136 |
137 | daemonsetsWatcher, err := resolver.clientset.AppsV1().DaemonSets("").Watch(context.Background(), metav1.ListOptions{})
138 | if err != nil {
139 | return fmt.Errorf("error watching daemonsets changes - %v", err)
140 | }
141 |
142 | statefulsetsWatcher, err := resolver.clientset.AppsV1().StatefulSets("").Watch(context.Background(), metav1.ListOptions{})
143 | if err != nil {
144 | return fmt.Errorf("error watching statefulsets changes - %v", err)
145 | }
146 |
147 | jobsWatcher, err := resolver.clientset.BatchV1().Jobs("").Watch(context.Background(), metav1.ListOptions{})
148 | if err != nil {
149 | return fmt.Errorf("error watching jobs changes - %v", err)
150 | }
151 |
152 | servicesWatcher, err := resolver.clientset.CoreV1().Services("").Watch(context.Background(), metav1.ListOptions{})
153 | if err != nil {
154 | return fmt.Errorf("error watching services changes - %v", err)
155 | }
156 |
157 | deploymentsWatcher, err := resolver.clientset.AppsV1().Deployments("").Watch(context.Background(), metav1.ListOptions{})
158 | if err != nil {
159 | return fmt.Errorf("error watching deployments changes - %v", err)
160 | }
161 |
162 | cronJobsWatcher, err := resolver.startCronjobWatcher()
163 | if err != nil {
164 | return fmt.Errorf("error watching cronjobs changes - %v", err)
165 | }
166 |
167 | // invoke a watching function
168 | go func() {
169 | for {
170 | select {
171 | case <-resolver.stopSignal:
172 | podsWatcher.Stop()
173 | nodesWatcher.Stop()
174 | replicasetsWatcher.Stop()
175 | daemonsetsWatcher.Stop()
176 | statefulsetsWatcher.Stop()
177 | jobsWatcher.Stop()
178 | servicesWatcher.Stop()
179 | deploymentsWatcher.Stop()
180 | cronJobsWatcher.Stop()
181 | return
182 | case podEvent, ok := <-podsWatcher.ResultChan():
183 | {
184 | if !ok {
185 | watchResetsCounter.WithLabelValues("pod").Inc()
186 | podsWatcher, err = resolver.clientset.CoreV1().Pods("").Watch(context.Background(), metav1.ListOptions{})
187 | if err != nil {
188 | time.Sleep(reregisterWatchSleepDuration)
189 | }
190 | continue
191 | }
192 | watchEventsCounter.WithLabelValues("pod").Inc()
193 | resolver.handlePodWatchEvent(&podEvent)
194 | }
195 | case nodeEvent, ok := <-nodesWatcher.ResultChan():
196 | {
197 | if !ok {
198 | watchResetsCounter.WithLabelValues("node").Inc()
199 | nodesWatcher, err = resolver.clientset.CoreV1().Nodes().Watch(context.Background(), metav1.ListOptions{})
200 | if err != nil {
201 | time.Sleep(reregisterWatchSleepDuration)
202 | }
203 | continue
204 | }
205 | watchEventsCounter.WithLabelValues("node").Inc()
206 | resolver.handleNodeWatchEvent(&nodeEvent)
207 | }
208 | case replicasetsEvent, ok := <-replicasetsWatcher.ResultChan():
209 | {
210 | if !ok {
211 | watchResetsCounter.WithLabelValues("replicaset").Inc()
212 | replicasetsWatcher, err = resolver.clientset.AppsV1().ReplicaSets("").Watch(context.Background(), metav1.ListOptions{})
213 | if err != nil {
214 | time.Sleep(reregisterWatchSleepDuration)
215 | }
216 | continue
217 | }
218 | watchEventsCounter.WithLabelValues("replicaset").Inc()
219 | resolver.handleReplicaSetWatchEvent(&replicasetsEvent)
220 | }
221 | case daemonsetsEvent, ok := <-daemonsetsWatcher.ResultChan():
222 | {
223 | if !ok {
224 | watchResetsCounter.WithLabelValues("daemonset").Inc()
225 | daemonsetsWatcher, err = resolver.clientset.AppsV1().DaemonSets("").Watch(context.Background(), metav1.ListOptions{})
226 | if err != nil {
227 | time.Sleep(reregisterWatchSleepDuration)
228 | }
229 | continue
230 | }
231 | watchEventsCounter.WithLabelValues("daemonset").Inc()
232 | resolver.handleDaemonSetWatchEvent(&daemonsetsEvent)
233 | }
234 | case statefulsetsEvent, ok := <-statefulsetsWatcher.ResultChan():
235 | {
236 | if !ok {
237 | watchResetsCounter.WithLabelValues("statefulset").Inc()
238 | statefulsetsWatcher, err = resolver.clientset.AppsV1().StatefulSets("").Watch(context.Background(), metav1.ListOptions{})
239 | if err != nil {
240 | time.Sleep(reregisterWatchSleepDuration)
241 | }
242 | continue
243 | }
244 | watchEventsCounter.WithLabelValues("statefulset").Inc()
245 | resolver.handleStatefulSetWatchEvent(&statefulsetsEvent)
246 | }
247 | case jobsEvent, ok := <-jobsWatcher.ResultChan():
248 | {
249 | if !ok {
250 | watchResetsCounter.WithLabelValues("job").Inc()
251 | jobsWatcher, err = resolver.clientset.BatchV1().Jobs("").Watch(context.Background(), metav1.ListOptions{})
252 | if err != nil {
253 | time.Sleep(reregisterWatchSleepDuration)
254 | }
255 | continue
256 | }
257 | watchEventsCounter.WithLabelValues("job").Inc()
258 | resolver.handleJobsWatchEvent(&jobsEvent)
259 | }
260 | case servicesEvent, ok := <-servicesWatcher.ResultChan():
261 | {
262 | if !ok {
263 | watchResetsCounter.WithLabelValues("service").Inc()
264 | servicesWatcher, err = resolver.clientset.CoreV1().Services("").Watch(context.Background(), metav1.ListOptions{})
265 | if err != nil {
266 | time.Sleep(reregisterWatchSleepDuration)
267 | }
268 | continue
269 | }
270 | watchEventsCounter.WithLabelValues("service").Inc()
271 | resolver.handleServicesWatchEvent(&servicesEvent)
272 | }
273 | case deploymentsEvent, ok := <-deploymentsWatcher.ResultChan():
274 | {
275 | if !ok {
276 | watchResetsCounter.WithLabelValues("deployment").Inc()
277 | deploymentsWatcher, err = resolver.clientset.AppsV1().Deployments("").Watch(context.Background(), metav1.ListOptions{})
278 | if err != nil {
279 | time.Sleep(reregisterWatchSleepDuration)
280 | }
281 | continue
282 | }
283 | watchEventsCounter.WithLabelValues("deployment").Inc()
284 | resolver.handleDeploymentsWatchEvent(&deploymentsEvent)
285 | }
286 | case cronjobsEvent, ok := <-cronJobsWatcher.ResultChan():
287 | {
288 | if !ok {
289 | watchResetsCounter.WithLabelValues("cronjob").Inc()
290 | cronJobsWatcher, err = resolver.startCronjobWatcher()
291 | if err != nil {
292 | time.Sleep(reregisterWatchSleepDuration)
293 | }
294 | continue
295 | }
296 | watchEventsCounter.WithLabelValues("cronjob").Inc()
297 | resolver.handleCronJobsWatchEvent(&cronjobsEvent)
298 | }
299 | }
300 | }
301 | }()
302 |
303 | // get initial state
304 | err = resolver.getResolvedClusterSnapshot()
305 | if err != nil {
306 | resolver.StopWatching()
307 | return fmt.Errorf("error retrieving cluster's initial state: %v", err)
308 | }
309 |
310 | return nil
311 | }
312 |
313 | func (resolver *K8sIPResolver) startCronjobWatcher() (watch.Interface, error) {
314 | cronJobsWatcher, err := resolver.clientset.BatchV1().CronJobs("").Watch(context.Background(), metav1.ListOptions{})
315 | if err != nil {
316 | return resolver.clientset.BatchV1beta1().CronJobs("").Watch(context.Background(), metav1.ListOptions{})
317 | }
318 |
319 | return cronJobsWatcher, nil
320 | }
321 |
322 | func (resolver *K8sIPResolver) StopWatching() {
323 | resolver.stopSignal <- true
324 | }
325 |
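The lifecycle above pairs StartWatching with StopWatching. A minimal caller sketch — assuming an existing kubernetes clientset; the constructor flags are passed as in ipresolver_test.go below (the third enables owner-hierarchy traversal), and the resolved address is hypothetical:

resolver, err := k8s.NewK8sIPResolver(clientset, false, true)
if err != nil {
    log.Fatal(err)
}
if err := resolver.StartWatching(); err != nil {
    log.Fatal(err)
}
defer resolver.StopWatching()
workload := resolver.ResolveIP("10.0.0.12") // hypothetical address
log.Printf("%s/%s (%s)", workload.Namespace, workload.Name, workload.Kind)
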
326 | func (resolver *K8sIPResolver) handlePodWatchEvent(podEvent *watch.Event) {
327 | switch podEvent.Type {
328 | case watch.Added:
329 | pod, ok := podEvent.Object.(*v1.Pod)
330 | if !ok {
331 | return
332 | }
333 | resolver.snapshot.Pods.Store(pod.UID, *pod)
334 | entry := resolver.resolvePodDescriptor(pod)
335 | for _, podIp := range pod.Status.PodIPs {
336 | resolver.storeWorkloadsIP(podIp.IP, &entry)
337 | }
338 | case watch.Modified:
339 | pod, ok := podEvent.Object.(*v1.Pod)
340 | if !ok {
341 | return
342 | }
343 | resolver.snapshot.Pods.Store(pod.UID, *pod)
344 | entry := resolver.resolvePodDescriptor(pod)
345 | for _, podIp := range pod.Status.PodIPs {
346 | resolver.storeWorkloadsIP(podIp.IP, &entry)
347 | }
348 | case watch.Deleted:
349 | if val, ok := podEvent.Object.(*v1.Pod); ok {
350 | resolver.snapshot.Pods.Delete(val.UID)
351 | resolver.snapshot.PodDescriptors.Delete(val.UID)
352 | }
353 | }
354 | }
355 |
356 | func (resolver *K8sIPResolver) handleNodeWatchEvent(nodeEvent *watch.Event) {
357 | switch nodeEvent.Type {
358 | case watch.Added, watch.Modified:
359 | node, ok := nodeEvent.Object.(*v1.Node)
360 | if !ok {
361 | return
362 | }
363 | resolver.snapshot.Nodes.Store(node.UID, *node)
364 | for _, nodeAddress := range node.Status.Addresses {
365 | resolver.storeWorkloadsIP(nodeAddress.Address, &Workload{
366 | Name: node.Name,
367 | Namespace: "node",
368 | Kind: "node",
369 | })
370 | }
371 | case watch.Deleted:
372 | if val, ok := nodeEvent.Object.(*v1.Node); ok {
373 | resolver.snapshot.Nodes.Delete(val.UID)
374 | }
375 | }
376 | }
377 |
378 | func (resolver *K8sIPResolver) handleReplicaSetWatchEvent(replicasetsEvent *watch.Event) {
379 | switch replicasetsEvent.Type {
380 | case watch.Added:
381 | if val, ok := replicasetsEvent.Object.(*appsv1.ReplicaSet); ok {
382 | resolver.snapshot.ReplicaSets.Store(val.UID, *val)
383 | }
384 | case watch.Deleted:
385 | if val, ok := replicasetsEvent.Object.(*appsv1.ReplicaSet); ok {
386 | resolver.snapshot.ReplicaSets.Delete(val.UID)
387 | }
388 | }
389 | }
390 |
391 | func (resolver *K8sIPResolver) handleDaemonSetWatchEvent(daemonsetsEvent *watch.Event) {
392 | switch daemonsetsEvent.Type {
393 | case watch.Added:
394 | if val, ok := daemonsetsEvent.Object.(*appsv1.DaemonSet); ok {
395 | resolver.snapshot.DaemonSets.Store(val.UID, *val)
396 | }
397 | case watch.Deleted:
398 | if val, ok := daemonsetsEvent.Object.(*appsv1.DaemonSet); ok {
399 | resolver.snapshot.DaemonSets.Delete(val.UID)
400 | }
401 | }
402 | }
403 |
404 | func (resolver *K8sIPResolver) handleStatefulSetWatchEvent(statefulsetsEvent *watch.Event) {
405 | switch statefulsetsEvent.Type {
406 | case watch.Added:
407 | if val, ok := statefulsetsEvent.Object.(*appsv1.StatefulSet); ok {
408 | resolver.snapshot.StatefulSets.Store(val.UID, *val)
409 | }
410 | case watch.Deleted:
411 | if val, ok := statefulsetsEvent.Object.(*appsv1.StatefulSet); ok {
412 | resolver.snapshot.StatefulSets.Delete(val.UID)
413 | }
414 | }
415 | }
416 |
417 | func (resolver *K8sIPResolver) handleJobsWatchEvent(jobsEvent *watch.Event) {
418 | switch jobsEvent.Type {
419 | case watch.Added:
420 | if val, ok := jobsEvent.Object.(*batchv1.Job); ok {
421 | resolver.snapshot.Jobs.Store(val.UID, *val)
422 | }
423 | case watch.Deleted:
424 | if val, ok := jobsEvent.Object.(*batchv1.Job); ok {
425 | resolver.snapshot.Jobs.Delete(val.UID)
426 | }
427 | }
428 | }
429 |
430 | func (resolver *K8sIPResolver) handleServicesWatchEvent(servicesEvent *watch.Event) {
431 | switch servicesEvent.Type {
432 | case watch.Added, watch.Modified:
433 | service, ok := servicesEvent.Object.(*v1.Service)
434 | if !ok {
435 | return
436 | }
437 | resolver.snapshot.Services.Store(service.UID, *service)
438 |
439 | // a service may expose multiple ClusterIPs
440 | workload := Workload{
441 | Name: service.Name,
442 | Namespace: service.Namespace,
443 | Kind: "Service",
444 | }
445 |
446 | // TODO maybe try to match service to workload
447 | for _, clusterIp := range service.Spec.ClusterIPs {
448 | if clusterIp != "None" {
449 | _, ok := resolver.ipsMap.Load(clusterIp)
450 | if !ok {
451 | resolver.storeWorkloadsIP(clusterIp, &workload)
452 | }
453 | }
454 | }
455 | case watch.Deleted:
456 | if val, ok := servicesEvent.Object.(*v1.Service); ok {
457 | resolver.snapshot.Services.Delete(val.UID)
458 | }
459 | }
460 | }
461 |
462 | func (resolver *K8sIPResolver) handleDeploymentsWatchEvent(deploymentsEvent *watch.Event) {
463 | switch deploymentsEvent.Type {
464 | case watch.Added:
465 | if val, ok := deploymentsEvent.Object.(*appsv1.Deployment); ok {
466 | resolver.snapshot.Deployments.Store(val.UID, *val)
467 | }
468 | case watch.Deleted:
469 | if val, ok := deploymentsEvent.Object.(*appsv1.Deployment); ok {
470 | resolver.snapshot.Deployments.Delete(val.UID)
471 | }
472 | }
473 | }
474 |
475 | func (resolver *K8sIPResolver) handleCronJobsWatchEvent(cronjobsEvent *watch.Event) {
476 | switch cronjobsEvent.Type {
477 | case watch.Added:
478 | if val, ok := cronjobsEvent.Object.(*batchv1.CronJob); ok {
479 | resolver.snapshot.CronJobs.Store(val.UID, *val)
480 | }
481 | if val, ok := cronjobsEvent.Object.(*v1beta1.CronJob); ok {
482 | resolver.snapshot.CronJobs.Store(val.UID, *val)
483 | }
484 |
485 | case watch.Deleted:
486 | if val, ok := cronjobsEvent.Object.(*batchv1.CronJob); ok {
487 | resolver.snapshot.CronJobs.Delete(val.UID)
488 | }
489 | if val, ok := cronjobsEvent.Object.(*v1beta1.CronJob); ok {
490 | resolver.snapshot.CronJobs.Delete(val.UID)
491 | }
492 | }
493 | }
494 |
495 | func (resolver *K8sIPResolver) getResolvedClusterSnapshot() error {
496 | err := resolver.getFullClusterSnapshot()
497 | if err != nil {
498 | return err
499 | }
500 | resolver.updateIpMapping()
501 | return nil
502 | }
503 |
504 | // iterate the API for initial coverage of the cluster's state
505 | func (resolver *K8sIPResolver) getFullClusterSnapshot() error {
506 | pods, err := resolver.clientset.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{})
507 | if err != nil {
508 | return errors.New("error getting pods, aborting snapshot update")
509 | }
510 | for _, pod := range pods.Items {
511 | resolver.snapshot.Pods.Store(pod.UID, pod)
512 | }
513 |
514 | nodes, err := resolver.clientset.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
515 | if err != nil {
516 | return errors.New("error getting nodes, aborting snapshot update")
517 | }
518 | for _, node := range nodes.Items {
519 | resolver.snapshot.Nodes.Store(node.UID, node)
520 | }
521 |
522 | replicasets, err := resolver.clientset.AppsV1().ReplicaSets("").List(context.Background(), metav1.ListOptions{})
523 | if err != nil {
524 | return errors.New("error getting replicasets, aborting snapshot update")
525 | }
526 | for _, rs := range replicasets.Items {
527 | resolver.snapshot.ReplicaSets.Store(rs.ObjectMeta.UID, rs)
528 | }
529 |
530 | daemonsets, err := resolver.clientset.AppsV1().DaemonSets("").List(context.Background(), metav1.ListOptions{})
531 | if err != nil {
532 | return errors.New("error getting daemonsets, aborting snapshot update")
533 | }
534 | for _, ds := range daemonsets.Items {
535 | resolver.snapshot.DaemonSets.Store(ds.ObjectMeta.UID, ds)
536 | }
537 |
538 | statefulsets, err := resolver.clientset.AppsV1().StatefulSets("").List(context.Background(), metav1.ListOptions{})
539 | if err != nil {
540 | return errors.New("error getting statefulsets, aborting snapshot update")
541 | }
542 | for _, ss := range statefulsets.Items {
543 | resolver.snapshot.StatefulSets.Store(ss.ObjectMeta.UID, ss)
544 | }
545 |
546 | jobs, err := resolver.clientset.BatchV1().Jobs("").List(context.Background(), metav1.ListOptions{})
547 | if err != nil {
548 | return errors.New("error getting jobs, aborting snapshot update")
549 | }
550 | for _, job := range jobs.Items {
551 | resolver.snapshot.Jobs.Store(job.ObjectMeta.UID, job)
552 | }
553 |
554 | services, err := resolver.clientset.CoreV1().Services("").List(context.Background(), metav1.ListOptions{})
555 | if err != nil {
556 | return errors.New("error getting services, aborting snapshot update")
557 | }
558 | for _, service := range services.Items {
559 | resolver.snapshot.Services.Store(service.UID, service)
560 | }
561 |
562 | deployments, err := resolver.clientset.AppsV1().Deployments("").List(context.Background(), metav1.ListOptions{})
563 | if err != nil {
564 | return errors.New("error getting deployments, aborting snapshot update")
565 | }
566 | for _, deployment := range deployments.Items {
567 | resolver.snapshot.Deployments.Store(deployment.UID, deployment)
568 | }
569 |
570 | cronJobs, err := resolver.clientset.BatchV1().CronJobs("").List(context.Background(), metav1.ListOptions{})
571 | if err != nil { // batch/v1 may be unavailable on older clusters - fall back to batch/v1beta1
572 | betaCronJobs, betaErr := resolver.clientset.BatchV1beta1().CronJobs("").List(context.Background(), metav1.ListOptions{})
573 | if betaErr != nil {
574 | return errors.New("error getting cronjobs, aborting snapshot update")
575 | }
576 | for _, cronJob := range betaCronJobs.Items {
577 | resolver.snapshot.CronJobs.Store(cronJob.UID, cronJob)
578 | }
579 | }
580 | for _, cronJob := range cronJobs.Items {
581 | resolver.snapshot.CronJobs.Store(cronJob.UID, cronJob)
582 | }
583 |
584 | return nil
585 | }
586 |
587 | // add mappings from IP addresses to resolved workloads to the existing map,
588 | // based on the current cluster snapshot
589 | func (resolver *K8sIPResolver) updateIpMapping() {
590 | // because IP collisions may occur and lead to overwrites in the map, the order is important
591 | // we go from less "favorable" to more "favorable" -
592 | // services -> running pods -> nodes
593 |
594 | resolver.snapshot.Services.Range(func(key any, val any) bool {
595 | service, ok := val.(v1.Service)
596 | if !ok {
597 | log.Printf("Type confusion in services map")
598 | return true // continue
599 | }
600 | // a service may expose multiple ClusterIPs
601 | workload := Workload{
602 | Name: service.Name,
603 | Namespace: service.Namespace,
604 | Kind: "Service",
605 | }
606 |
607 | // TODO maybe try to match service to workload
608 | for _, clusterIp := range service.Spec.ClusterIPs {
609 | if clusterIp != "None" {
610 | resolver.storeWorkloadsIP(clusterIp, &workload)
611 | }
612 | }
613 | return true
614 | })
615 |
616 | resolver.snapshot.Pods.Range(func(key, value any) bool {
617 | pod, ok := value.(v1.Pod)
618 | if !ok {
619 | log.Printf("Type confusion in pods map")
620 | return true // continue
621 | }
622 | entry := resolver.resolvePodDescriptor(&pod)
623 | for _, podIp := range pod.Status.PodIPs {
624 | // override any existing entry for this ip unless it belongs to a node (see storeWorkloadsIP)
625 | resolver.storeWorkloadsIP(podIp.IP, &entry)
626 | }
627 | return true
628 | })
629 |
630 | resolver.snapshot.Nodes.Range(func(key any, value any) bool {
631 | node, ok := value.(v1.Node)
632 | if !ok {
633 | log.Printf("Type confusion in nodes map")
634 | return true // continue
635 | }
636 | for _, nodeAddress := range node.Status.Addresses {
637 | workload := Workload{
638 | Name: node.Name,
639 | Namespace: "node",
640 | Kind: "node",
641 | }
642 | resolver.storeWorkloadsIP(nodeAddress.Address, &workload)
643 | }
644 | return true
645 | })
646 | }
647 |
648 | func (resolver *K8sIPResolver) storeWorkloadsIP(ip string, newWorkload *Workload) {
649 | // we want to override existing workload, unless the existing workload is a node and the new one isn't
650 | val, ok := resolver.ipsMap.Load(ip)
651 | if ok {
652 | existingWorkload, ok := val.(Workload)
653 | if ok {
654 | if existingWorkload.Kind == "node" && newWorkload.Kind != "node" {
655 | return
656 | }
657 | }
658 | }
659 | resolver.ipsMap.Store(ip, *newWorkload)
660 | }
661 |
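A hypothetical sequence illustrating the rule above — the node entry wins regardless of insertion order:

resolver.storeWorkloadsIP("10.0.0.7", &Workload{Name: "node-a", Namespace: "node", Kind: "node"})
resolver.storeWorkloadsIP("10.0.0.7", &Workload{Name: "pod-x", Namespace: "default", Kind: "pod"})
// ipsMap still resolves 10.0.0.7 to node-a: the second call returns early
// because the existing entry's Kind is "node" and the new one's isn't
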
662 | // go up one level in the ownership hierarchy.
663 | // the snapshot is maintained to avoid an API request for each resolution
664 | func (resolver *K8sIPResolver) getControllerOfOwner(originalOwner *metav1.OwnerReference) (*metav1.OwnerReference, error) {
665 | switch originalOwner.Kind {
666 | case "ReplicaSet":
667 | replicaSetVal, ok := resolver.snapshot.ReplicaSets.Load(originalOwner.UID)
668 | if !ok {
669 | return nil, errors.New("Missing replicaset for UID " + string(originalOwner.UID))
670 | }
671 | replicaSet, ok := replicaSetVal.(appsv1.ReplicaSet)
672 | if !ok {
673 | return nil, errors.New("type confusion in replicasets map")
674 | }
675 | return metav1.GetControllerOf(&replicaSet), nil
676 | case "DaemonSet":
677 | daemonSetVal, ok := resolver.snapshot.DaemonSets.Load(originalOwner.UID)
678 | if !ok {
679 | return nil, errors.New("Missing daemonset for UID " + string(originalOwner.UID))
680 | }
681 | daemonSet, ok := daemonSetVal.(appsv1.DaemonSet)
682 | if !ok {
683 | return nil, errors.New("type confusion in daemonsets map")
684 | }
685 | return metav1.GetControllerOf(&daemonSet), nil
686 | case "StatefulSet":
687 | statefulSetVal, ok := resolver.snapshot.StatefulSets.Load(originalOwner.UID)
688 | if !ok {
689 | return nil, errors.New("Missing statefulset for UID " + string(originalOwner.UID))
690 | }
691 | statefulSet, ok := statefulSetVal.(appsv1.StatefulSet)
692 | if !ok {
693 | return nil, errors.New("type confusion in statefulsets map")
694 | }
695 | return metav1.GetControllerOf(&statefulSet), nil
696 | case "Job":
697 | jobVal, ok := resolver.snapshot.Jobs.Load(originalOwner.UID)
698 | if !ok {
699 | return nil, errors.New("Missing job for UID " + string(originalOwner.UID))
700 | }
701 | job, ok := jobVal.(batchv1.Job)
702 | if !ok {
703 | return nil, errors.New("type confusion in jobs map")
704 | }
705 | return metav1.GetControllerOf(&job), nil
706 | case "Deployment":
707 | deploymentVal, ok := resolver.snapshot.Deployments.Load(originalOwner.UID)
708 | if !ok {
709 | return nil, errors.New("Missing deployment for UID " + string(originalOwner.UID))
710 | }
711 | deployment, ok := deploymentVal.(appsv1.Deployment)
712 | if !ok {
713 | return nil, errors.New("type confusion in deployments map")
714 | }
715 | return metav1.GetControllerOf(&deployment), nil
716 | case "CronJob":
717 | cronJobVal, ok := resolver.snapshot.CronJobs.Load(originalOwner.UID)
718 | if !ok {
719 | return nil, errors.New("Missing cronjob for UID " + string(originalOwner.UID))
720 | }
721 | cronJob, ok := cronJobVal.(batchv1.CronJob)
722 | if !ok {
723 | betaCronJob, ok := cronJobVal.(v1beta1.CronJob)
724 | if !ok {
725 | return nil, errors.New("type confusion in cronjobs map")
726 | }
727 | return metav1.GetControllerOf(&betaCronJob), nil
728 | }
729 |
730 | return metav1.GetControllerOf(&cronJob), nil
731 | }
732 | return nil, errors.New("Unsupported kind for lookup - " + originalOwner.Kind)
733 | }
734 |
735 | func (resolver *K8sIPResolver) resolvePodDescriptor(pod *v1.Pod) Workload {
736 | existing, ok := resolver.snapshot.PodDescriptors.Load(pod.UID)
737 | if ok {
738 | result, ok := existing.(Workload)
739 | if ok {
740 | return result
741 | }
742 | }
743 | var err error
744 | name := pod.Name
745 | namespace := pod.Namespace
746 | kind := "pod"
747 | result := Workload{
748 | Name: name,
749 | Namespace: namespace,
750 | Kind: kind,
751 | }
752 |
753 | if resolver.traverseUpHierarchy {
754 | owner := metav1.GetControllerOf(pod)
755 | // climb up the owners' hierarchy. if an error occurs, keep the data gathered so far and record
756 | // the error so this resolution isn't cached in the descriptors map and will be retried later.
757 | for owner != nil {
758 | name = owner.Name
759 | kind = owner.Kind
760 | owner, err = resolver.getControllerOfOwner(owner)
761 | if err != nil {
762 | log.Printf("Warning: couldn't retrieve owner of %v - %v. This might happen when starting up", name, err)
763 | }
764 | }
765 |
766 | result.Name = name
767 | result.Kind = kind
768 | } else {
769 | owner := metav1.GetControllerOf(pod)
770 | if owner != nil {
771 | result.Owner = owner.Name
772 | }
773 | }
774 |
775 | if err == nil {
776 | resolver.snapshot.PodDescriptors.Store(pod.UID, result)
777 | }
778 | return result
779 | }
780 |
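Taken together, the two modes of resolvePodDescriptor behave as follows for a pod owned by a ReplicaSet that is itself owned by a Deployment (hypothetical names; the tests below pin down the exact expectations):

// traverseUpHierarchy == true: resolve to the top of the ownership chain
//   => Workload{Name: "my-deployment", Namespace: "ns-a", Kind: "Deployment"}
// traverseUpHierarchy == false: resolve to the pod itself, Owner = its direct controller
//   => Workload{Name: "pod-1", Namespace: "ns-a", Kind: "pod", Owner: "my-replicaset"}
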
--------------------------------------------------------------------------------
/pkg/k8s/ipresolver_test.go:
--------------------------------------------------------------------------------
1 | package k8s_test
2 |
3 | import (
4 | "log"
5 | "testing"
6 | "time"
7 |
8 | "github.com/groundcover-com/caretta/pkg/k8s"
9 |
10 | "github.com/google/uuid"
11 | "github.com/stretchr/testify/assert"
12 | appsv1 "k8s.io/api/apps/v1"
13 | batchv1 "k8s.io/api/batch/v1"
14 | v1 "k8s.io/api/core/v1"
15 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
16 | "k8s.io/apimachinery/pkg/runtime"
17 | "k8s.io/apimachinery/pkg/types"
18 | "k8s.io/apimachinery/pkg/watch"
19 | testclient "k8s.io/client-go/kubernetes/fake"
20 | k8stesting "k8s.io/client-go/testing"
21 | )
22 |
23 | type podDescriptor struct {
24 | Name string
25 | Namespace string
26 | IP string
27 | Phase v1.PodPhase
28 | UID types.UID
29 | Controller *workloadResourceDescriptor
30 | }
31 |
32 | type nodeDescriptor struct {
33 | Name string
34 | IP string
35 | UID types.UID
36 | }
37 |
38 | type workloadResourceDescriptor struct {
39 | Name string
40 | Namespace string
41 | UID types.UID
42 | Kind string
43 | }
44 |
45 | func (desc *workloadResourceDescriptor) CreateObject() runtime.Object {
46 | switch desc.Kind {
47 | case "Deployment":
48 | {
49 | return &appsv1.Deployment{
50 | ObjectMeta: metav1.ObjectMeta{
51 | Name: desc.Name,
52 | Namespace: desc.Namespace,
53 | UID: desc.UID,
54 | },
55 | }
56 | }
57 | case "ReplicaSet":
58 | {
59 | return &appsv1.ReplicaSet{
60 | ObjectMeta: metav1.ObjectMeta{
61 | Name: desc.Name,
62 | Namespace: desc.Namespace,
63 | UID: desc.UID,
64 | },
65 | }
66 | }
67 | case "DaemonSet":
68 | {
69 | return &appsv1.DaemonSet{
70 | ObjectMeta: metav1.ObjectMeta{
71 | Name: desc.Name,
72 | Namespace: desc.Namespace,
73 | UID: desc.UID,
74 | },
75 | }
76 | }
77 | case "StatefulSet":
78 | {
79 | return &appsv1.StatefulSet{
80 | ObjectMeta: metav1.ObjectMeta{
81 | Name: desc.Name,
82 | Namespace: desc.Namespace,
83 | UID: desc.UID,
84 | },
85 | }
86 | }
87 | case "Job":
88 | {
89 | return &batchv1.Job{
90 | ObjectMeta: metav1.ObjectMeta{
91 | Name: desc.Name,
92 | Namespace: desc.Namespace,
93 | UID: desc.UID,
94 | },
95 | }
96 | }
97 | case "Service":
98 | {
99 | return &v1.Service{
100 | ObjectMeta: metav1.ObjectMeta{
101 | Name: desc.Name,
102 | Namespace: desc.Namespace,
103 | UID: desc.UID,
104 | },
105 | }
106 | }
107 | case "CronJob":
108 | {
109 | return &batchv1.CronJob{
110 | ObjectMeta: metav1.ObjectMeta{
111 | Name: desc.Name,
112 | Namespace: desc.Namespace,
113 | UID: desc.UID,
114 | },
115 | }
116 | }
117 | }
118 | return nil
119 | }
120 |
121 | func generatePod(pod podDescriptor) runtime.Object {
122 | newPod := v1.Pod{
123 | ObjectMeta: metav1.ObjectMeta{
124 | Name: pod.Name,
125 | Namespace: pod.Namespace,
126 | UID: pod.UID,
127 | },
128 | Status: v1.PodStatus{
129 | PodIP: pod.IP,
130 | PodIPs: []v1.PodIP{
131 | {IP: pod.IP},
132 | },
133 | },
134 | }
135 | if pod.Controller != nil {
136 | newTrue := new(bool)
137 | *newTrue = true
138 | ref := metav1.OwnerReference{
139 | Kind: pod.Controller.Kind,
140 | Name: pod.Controller.Name,
141 | UID: pod.Controller.UID,
142 | Controller: newTrue,
143 | }
144 | newPod.OwnerReferences = append(newPod.OwnerReferences, ref)
145 | }
146 | return &newPod
147 |
148 | }
149 |
150 | func generateWorkloadResource(desc workloadResourceDescriptor) runtime.Object {
151 | return desc.CreateObject()
152 | }
153 |
154 | func generateNode(node nodeDescriptor) runtime.Object {
155 | return &v1.Node{
156 | ObjectMeta: metav1.ObjectMeta{
157 | Name: node.Name,
158 | UID: node.UID,
159 | },
160 | Status: v1.NodeStatus{
161 | Addresses: []v1.NodeAddress{
162 | {
163 | Type: "InternalIP",
164 | Address: node.IP,
165 | },
166 | },
167 | },
168 | }
169 | }
170 |
171 | func generateClusterObjects(pods []podDescriptor, workloadsResources []workloadResourceDescriptor, nodes []nodeDescriptor) []runtime.Object {
172 | result := make([]runtime.Object, 0, len(pods)+len(workloadsResources)+len(nodes))
173 | for _, pod := range pods {
174 | newPod := generatePod(pod)
175 | result = append(result, newPod)
176 | }
177 | for _, desc := range workloadsResources {
178 | result = append(result, generateWorkloadResource(desc))
179 | }
180 | for _, node := range nodes {
181 | result = append(result, generateNode(node))
182 | }
183 | return result
184 | }
185 |
186 | type testStep struct {
187 | shouldWait bool
188 | newPods []podDescriptor
189 | newNodes []nodeDescriptor
190 | newWorkloadResource []workloadResourceDescriptor
191 | modifiedPods []podDescriptor
192 | modifiedNodes []nodeDescriptor
193 | modifiedWorkloadResources []workloadResourceDescriptor
194 | expectedResolves map[string]k8s.Workload
195 | }
196 |
197 | type testScenario struct {
198 | description string
199 | initialState testStep
200 | shouldTraverse bool
201 | updateSteps []testStep
202 | }
203 |
204 | type fakeWatchers struct {
205 | nodesWatcher *watch.FakeWatcher
206 | podsWatcher *watch.FakeWatcher
207 | deploymentsWatcher *watch.FakeWatcher
208 | replicasetsWatcher *watch.FakeWatcher
209 | daemonsetsWatcher *watch.FakeWatcher
210 | statefulsetsWatcher *watch.FakeWatcher
211 | jobsWatcher *watch.FakeWatcher
212 | servicesWatcher *watch.FakeWatcher
213 | cronjobsWatcher *watch.FakeWatcher
214 | }
215 |
216 | func createPrependWatchers(clientset *testclient.Clientset) fakeWatchers {
217 | watchers := fakeWatchers{
218 | nodesWatcher: watch.NewFake(),
219 | podsWatcher: watch.NewFake(),
220 | deploymentsWatcher: watch.NewFake(),
221 | replicasetsWatcher: watch.NewFake(),
222 | daemonsetsWatcher: watch.NewFake(),
223 | statefulsetsWatcher: watch.NewFake(),
224 | jobsWatcher: watch.NewFake(),
225 | servicesWatcher: watch.NewFake(),
226 | cronjobsWatcher: watch.NewFake(),
227 | }
228 | clientset.PrependWatchReactor("nodes", k8stesting.DefaultWatchReactor(watchers.nodesWatcher, nil))
229 | clientset.PrependWatchReactor("pods", k8stesting.DefaultWatchReactor(watchers.podsWatcher, nil))
230 | clientset.PrependWatchReactor("deployments", k8stesting.DefaultWatchReactor(watchers.deploymentsWatcher, nil))
231 | clientset.PrependWatchReactor("replicasets", k8stesting.DefaultWatchReactor(watchers.replicasetsWatcher, nil))
232 | clientset.PrependWatchReactor("daemonsets", k8stesting.DefaultWatchReactor(watchers.daemonsetsWatcher, nil))
233 | clientset.PrependWatchReactor("statefulsets", k8stesting.DefaultWatchReactor(watchers.statefulsetsWatcher, nil))
234 | clientset.PrependWatchReactor("jobs", k8stesting.DefaultWatchReactor(watchers.jobsWatcher, nil))
235 | clientset.PrependWatchReactor("services", k8stesting.DefaultWatchReactor(watchers.servicesWatcher, nil))
236 | clientset.PrependWatchReactor("cronjobs", k8stesting.DefaultWatchReactor(watchers.cronjobsWatcher, nil))
237 | return watchers
238 | }
239 |
240 | func addObject(watchers fakeWatchers, obj runtime.Object, kind string) {
241 | switch kind {
242 | case "Pod":
243 | {
244 | watchers.podsWatcher.Add(obj)
245 | }
246 | case "node":
247 | {
248 | watchers.nodesWatcher.Add(obj)
249 | }
250 | case "Deployment":
251 | {
252 | watchers.deploymentsWatcher.Add(obj)
253 | }
254 | case "ReplicaSet":
255 | {
256 | watchers.replicasetsWatcher.Add(obj)
257 | }
258 | case "DaemonSet":
259 | {
260 | watchers.daemonsetsWatcher.Add(obj)
261 | }
262 | case "StatefulSet":
263 | {
264 | watchers.statefulsetsWatcher.Add(obj)
265 | }
266 | case "Job":
267 | {
268 | watchers.jobsWatcher.Add(obj)
269 | }
270 | case "Service":
271 | {
272 | watchers.servicesWatcher.Add(obj)
273 | }
274 | case "CronJob":
275 | {
276 | watchers.cronjobsWatcher.Add(obj)
277 | }
278 | }
279 | }
280 |
281 | func modifyObject(watchers fakeWatchers, obj runtime.Object, kind string) {
282 | switch kind {
283 | case "Pod":
284 | {
285 | watchers.podsWatcher.Modify(obj)
286 | }
287 | case "node":
288 | {
289 | watchers.nodesWatcher.Modify(obj)
290 | }
291 | case "Deployment":
292 | {
293 | watchers.deploymentsWatcher.Modify(obj)
294 | }
295 | case "ReplicaSet":
296 | {
297 | watchers.replicasetsWatcher.Modify(obj)
298 | }
299 | case "DaemonSet":
300 | {
301 | watchers.daemonsetsWatcher.Modify(obj)
302 | }
303 | case "StatefulSet":
304 | {
305 | watchers.statefulsetsWatcher.Modify(obj)
306 | }
307 | case "Job":
308 | {
309 | watchers.jobsWatcher.Modify(obj)
310 | }
311 | case "Service":
312 | {
313 | watchers.servicesWatcher.Modify(obj)
314 | }
315 | case "CronJob":
316 | {
317 | watchers.cronjobsWatcher.Modify(obj)
318 | }
319 | default:
320 | {
321 | log.Printf("unhandled kind %v", kind)
322 | }
323 | }
324 | }
325 |
326 | func runTest(t *testing.T, test testScenario) {
327 | assert := assert.New(t)
328 | // Arrange 1: mocks and initial state
329 | originalObjs := generateClusterObjects(test.initialState.newPods, test.initialState.newWorkloadResource, test.initialState.newNodes)
330 | fakeClient := testclient.NewSimpleClientset(originalObjs...)
331 | fakeWatchers := createPrependWatchers(fakeClient)
332 |
333 | resolver, err := k8s.NewK8sIPResolver(fakeClient, false, test.shouldTraverse)
334 | assert.NoError(err)
335 |
336 | // Act 1: process initial state
337 | err = resolver.StartWatching()
338 | assert.NoError(err)
339 |
340 | // Assert 1: resolve and compare to expected, original state
341 | for ipToCheck, expectedWorkload := range test.initialState.expectedResolves {
342 | resultWorkload := resolver.ResolveIP(ipToCheck)
343 | assert.Equal(expectedWorkload, resultWorkload)
344 | }
345 |
346 | for _, step := range test.updateSteps {
347 | // Arrange 2+n: update the state via watchers
348 | for _, newPod := range step.newPods {
349 | podObj := generatePod(newPod)
350 | addObject(fakeWatchers, podObj, "Pod")
351 | }
352 | for _, newWorkloadResource := range step.newWorkloadResource {
353 | obj := generateWorkloadResource(newWorkloadResource)
354 | addObject(fakeWatchers, obj, newWorkloadResource.Kind)
355 | }
356 | for _, newNode := range step.newNodes {
357 | obj := generateNode(newNode)
358 | addObject(fakeWatchers, obj, "node")
359 | }
360 | for _, modifiedPod := range step.modifiedPods {
361 | podObj := generatePod(modifiedPod)
362 | modifyObject(fakeWatchers, podObj, "Pod")
363 | }
364 | for _, modifiedWorkloadResource := range step.modifiedWorkloadResources {
365 | obj := generateWorkloadResource(modifiedWorkloadResource)
366 | modifyObject(fakeWatchers, obj, modifiedWorkloadResource.Kind)
367 | }
368 | for _, modifiedNode := range step.modifiedNodes {
369 | obj := generateNode(modifiedNode)
370 | modifyObject(fakeWatchers, obj, "node")
371 | }
372 |
373 | if step.shouldWait {
374 | time.Sleep(1 * time.Second)
375 | }
376 |
377 | // Act+Assert 2+n
378 | for ipToResolve, expectedWorkload := range step.expectedResolves {
379 | assert.Equal(expectedWorkload, resolver.ResolveIP(ipToResolve))
380 | }
381 |
382 | }
383 | }
384 |
385 | var testDeployment = workloadResourceDescriptor{"deployment1", "namespaceA", types.UID(uuid.NewString()), "Deployment"}
386 | var testReplicaSet = workloadResourceDescriptor{"replicaset1", "namespaceA", types.UID(uuid.NewString()), "ReplicaSet"}
387 | var testDaemonSet = workloadResourceDescriptor{"daemonset1", "namespaceA", types.UID(uuid.NewString()), "DaemonSet"}
388 | var testStatefulSet = workloadResourceDescriptor{"statefulset1", "namespaceA", types.UID(uuid.NewString()), "StatefulSet"}
389 | var testJob = workloadResourceDescriptor{"job1", "namespaceA", types.UID(uuid.NewString()), "Job"}
390 | var testCronjob = workloadResourceDescriptor{"cronjob1", "namespaceA", types.UID(uuid.NewString()), "CronJob"}
391 |
392 | func TestResolving(t *testing.T) {
393 | var tests = []testScenario{
394 | {
395 | description: "unsuccessful resolving result should be external",
396 | shouldTraverse: true,
397 | initialState: testStep{
398 | shouldWait: false,
399 | newPods: []podDescriptor{
400 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.New().String()), nil},
401 | },
402 | expectedResolves: map[string]k8s.Workload{
403 | "1.1.1.2": {
404 | Name: "1.1.1.2",
405 | Namespace: "external",
406 | Kind: "external",
407 | },
408 | },
409 | },
410 | },
411 | {
412 | description: "initial snapshot 1 pod resolve to pod1",
413 | shouldTraverse: true,
414 | initialState: testStep{
415 | shouldWait: false,
416 | newPods: []podDescriptor{
417 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.New().String()), nil},
418 | },
419 | expectedResolves: map[string]k8s.Workload{
420 | "1.1.1.1": {
421 | Name: "pod1",
422 | Namespace: "namespaceA",
423 | Kind: "pod",
424 | },
425 | },
426 | },
427 | },
428 | {
429 | description: "initial snapshot 3 pods resolve to each pod",
430 | shouldTraverse: true,
431 | initialState: testStep{
432 | shouldWait: false,
433 | newPods: []podDescriptor{
434 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.New().String()), nil},
435 | {"pod2", "namespaceA", "1.1.1.2", v1.PodRunning, types.UID(uuid.New().String()), nil},
436 | {"pod3", "namespaceA", "1.1.1.3", v1.PodRunning, types.UID(uuid.New().String()), nil},
437 | },
438 | expectedResolves: map[string]k8s.Workload{
439 | "1.1.1.1": {
440 | Name: "pod1",
441 | Namespace: "namespaceA",
442 | Kind: "pod",
443 | },
444 | "1.1.1.2": {
445 | Name: "pod2",
446 | Namespace: "namespaceA",
447 | Kind: "pod",
448 | },
449 | "1.1.1.3": {
450 | Name: "pod3",
451 | Namespace: "namespaceA",
452 | Kind: "pod",
453 | },
454 | },
455 | },
456 | },
457 | {
458 | description: "empty initial 1 pod added resolve to pod",
459 | shouldTraverse: true,
460 | initialState: testStep{
461 | shouldWait: false,
462 | expectedResolves: map[string]k8s.Workload{
463 | "1.1.1.1": {
464 | Name: "1.1.1.1",
465 | Namespace: "external",
466 | Kind: "external",
467 | },
468 | },
469 | },
470 | updateSteps: []testStep{
471 | {
472 | shouldWait: true,
473 | newPods: []podDescriptor{
474 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.New().String()), nil},
475 | },
476 | expectedResolves: map[string]k8s.Workload{
477 | "1.1.1.1": {
478 | Name: "pod1",
479 | Namespace: "namespaceA",
480 | Kind: "pod",
481 | },
482 | },
483 | },
484 | },
485 | },
486 | {
487 | description: "empty initial 1 node added resolve to node",
488 | shouldTraverse: true,
489 | initialState: testStep{
490 | shouldWait: false,
491 | expectedResolves: map[string]k8s.Workload{
492 | "1.1.1.0": {
493 | Name: "1.1.1.0",
494 | Namespace: "external",
495 | Kind: "external",
496 | },
497 | },
498 | },
499 | updateSteps: []testStep{
500 | {
501 | shouldWait: true,
502 | newNodes: []nodeDescriptor{
503 | {"Node1", "1.1.1.0", types.UID(uuid.NewString())},
504 | },
505 | expectedResolves: map[string]k8s.Workload{
506 | "1.1.1.0": {
507 | Name: "Node1",
508 | Namespace: "node",
509 | Kind: "node",
510 | },
511 | },
512 | },
513 | },
514 | },
515 | {
516 | description: "empty initial 1 node, 1 pod added resolve to each",
517 | shouldTraverse: true,
518 | initialState: testStep{
519 | shouldWait: false,
520 | expectedResolves: map[string]k8s.Workload{
521 | "1.1.1.0": {
522 | Name: "1.1.1.0",
523 | Namespace: "external",
524 | Kind: "external",
525 | },
526 | },
527 | },
528 | updateSteps: []testStep{
529 | {
530 | shouldWait: true,
531 | newPods: []podDescriptor{
532 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.New().String()), nil},
533 | },
534 | newNodes: []nodeDescriptor{
535 | {"Node1", "1.1.1.0", types.UID(uuid.NewString())},
536 | },
537 | expectedResolves: map[string]k8s.Workload{
538 | "1.1.1.0": {
539 | Name: "Node1",
540 | Namespace: "node",
541 | Kind: "node",
542 | },
543 | "1.1.1.1": {
544 | Name: "pod1",
545 | Namespace: "namespaceA",
546 | Kind: "pod",
547 | },
548 | },
549 | },
550 | },
551 | },
552 | {
553 | description: "1 pod changing ip resolve both ips to pod",
554 | shouldTraverse: true,
555 | initialState: testStep{
556 | shouldWait: false,
557 | newPods: []podDescriptor{
558 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.New().String()), nil},
559 | },
560 | expectedResolves: map[string]k8s.Workload{
561 | "1.1.1.1": {
562 | Name: "pod1",
563 | Namespace: "namespaceA",
564 | Kind: "pod",
565 | },
566 | "1.1.1.2": {
567 | Name: "1.1.1.2",
568 | Namespace: "external",
569 | Kind: "external",
570 | },
571 | },
572 | },
573 | updateSteps: []testStep{
574 | {
575 | shouldWait: true,
576 | modifiedPods: []podDescriptor{
577 | {"pod1", "namespaceA", "1.1.1.2", v1.PodRunning, types.UID(uuid.New().String()), nil},
578 | },
579 | expectedResolves: map[string]k8s.Workload{
580 | "1.1.1.1": { // the resolver shouldn't delete old not-reused entries
581 | Name: "pod1",
582 | Namespace: "namespaceA",
583 | Kind: "pod",
584 | },
585 | "1.1.1.2": {
586 | Name: "pod1",
587 | Namespace: "namespaceA",
588 | Kind: "pod",
589 | },
590 | },
591 | },
592 | },
593 | },
594 | {
595 | description: "1 pod changing ip old ip is reused resolve reused ip to new pod",
596 | shouldTraverse: true,
597 | initialState: testStep{
598 | shouldWait: false,
599 | newPods: []podDescriptor{
600 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID("1"), nil},
601 | },
602 | expectedResolves: map[string]k8s.Workload{
603 | "1.1.1.1": {
604 | Name: "pod1",
605 | Namespace: "namespaceA",
606 | Kind: "pod",
607 | },
608 | "1.1.1.2": {
609 | Name: "1.1.1.2",
610 | Namespace: "external",
611 | Kind: "external",
612 | },
613 | },
614 | },
615 | updateSteps: []testStep{
616 | {
617 | shouldWait: false,
618 | modifiedPods: []podDescriptor{
619 | {"pod1", "namespaceA", "1.1.1.2", v1.PodRunning, types.UID("1"), nil},
620 | },
621 | expectedResolves: map[string]k8s.Workload{},
622 | },
623 | {
624 | shouldWait: true,
625 | newPods: []podDescriptor{
626 | {"pod2", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.New().String()), nil},
627 | },
628 | expectedResolves: map[string]k8s.Workload{
629 | "1.1.1.1": {
630 | Name: "pod2",
631 | Namespace: "namespaceA",
632 | Kind: "pod",
633 | },
634 | "1.1.1.2": {
635 | Name: "pod1",
636 | Namespace: "namespaceA",
637 | Kind: "pod",
638 | },
639 | },
640 | },
641 | },
642 | },
643 | {
644 | description: "1 pod changing ip old ip is reused by node resolve ip to new node",
645 | shouldTraverse: true,
646 | initialState: testStep{
647 | shouldWait: false,
648 | newPods: []podDescriptor{
649 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID("1"), nil},
650 | },
651 | expectedResolves: map[string]k8s.Workload{
652 | "1.1.1.1": {
653 | Name: "pod1",
654 | Namespace: "namespaceA",
655 | Kind: "pod",
656 | },
657 | "1.1.1.2": {
658 | Name: "1.1.1.2",
659 | Namespace: "external",
660 | Kind: "external",
661 | },
662 | },
663 | },
664 | updateSteps: []testStep{
665 | {
666 | shouldWait: false,
667 | modifiedPods: []podDescriptor{
668 | {"pod1", "namespaceA", "1.1.1.2", v1.PodRunning, types.UID("1"), nil},
669 | },
670 | expectedResolves: map[string]k8s.Workload{},
671 | },
672 | {
673 | shouldWait: true,
674 | newNodes: []nodeDescriptor{
675 | {"Node1", "1.1.1.1", types.UID(uuid.NewString())},
676 | },
677 | expectedResolves: map[string]k8s.Workload{
678 | "1.1.1.1": {
679 | Name: "Node1",
680 | Namespace: "node",
681 | Kind: "node",
682 | },
683 | "1.1.1.2": {
684 | Name: "pod1",
685 | Namespace: "namespaceA",
686 | Kind: "pod",
687 | },
688 | },
689 | },
690 | },
691 | },
692 | {
693 | description: "1 node changing ip resolve both ips to node",
694 | shouldTraverse: true,
695 | initialState: testStep{
696 | shouldWait: false,
697 | newPods: []podDescriptor{},
698 | newNodes: []nodeDescriptor{
699 | {"Node1", "1.1.1.0", types.UID("1")},
700 | },
701 | expectedResolves: map[string]k8s.Workload{},
702 | },
703 | updateSteps: []testStep{
704 | {
705 | shouldWait: true,
706 | modifiedNodes: []nodeDescriptor{
707 | {"Node1", "1.1.2.0", types.UID("1")},
708 | },
709 | modifiedWorkloadResources: []workloadResourceDescriptor{},
710 | expectedResolves: map[string]k8s.Workload{
711 | "1.1.1.0": { // resolver isn't expected to delete old not-reused entries
712 | Name: "Node1",
713 | Namespace: "node",
714 | Kind: "node",
715 | },
716 | "1.1.2.0": {
717 | Name: "Node1",
718 | Namespace: "node",
719 | Kind: "node",
720 | },
721 | },
722 | },
723 | },
724 | },
725 | {
726 | description: "1 node changing ip, reused by another node resolve reused ip to new node",
727 | shouldTraverse: true,
728 | initialState: testStep{
729 | shouldWait: true,
730 | newNodes: []nodeDescriptor{
731 | {"Node1", "1.1.1.0", types.UID("1")},
732 | },
733 | expectedResolves: map[string]k8s.Workload{},
734 | },
735 | updateSteps: []testStep{
736 | {
737 | shouldWait: true,
738 | modifiedNodes: []nodeDescriptor{
739 | {"Node1", "1.1.2.0", types.UID("1")},
740 | },
741 | expectedResolves: map[string]k8s.Workload{},
742 | },
743 | {
744 | shouldWait: true,
745 | newNodes: []nodeDescriptor{
746 | {"Node2", "1.1.1.0", types.UID("2")},
747 | },
748 | modifiedNodes: []nodeDescriptor{
749 | {"Node1", "1.1.2.0", types.UID("1")},
750 | },
751 | expectedResolves: map[string]k8s.Workload{
752 | "1.1.1.0": {
753 | Name: "Node2",
754 | Namespace: "node",
755 | Kind: "node",
756 | },
757 | "1.1.2.0": {
758 | Name: "Node1",
759 | Namespace: "node",
760 | Kind: "node",
761 | },
762 | },
763 | },
764 | },
765 | },
766 | {
767 | description: "pod with hostip wont override node",
768 | shouldTraverse: true,
769 | initialState: testStep{
770 | shouldWait: false,
771 | newNodes: []nodeDescriptor{
772 | {"Node1", "1.1.1.0", types.UID(uuid.NewString())},
773 | },
774 | expectedResolves: map[string]k8s.Workload{},
775 | },
776 | updateSteps: []testStep{
777 | {
778 | shouldWait: true,
779 | newPods: []podDescriptor{
780 | {"pod1", "namespaceA", "1.1.1.0", v1.PodRunning, types.UID(uuid.New().String()), nil},
781 | },
782 | expectedResolves: map[string]k8s.Workload{
783 | "1.1.1.0": {
784 | Name: "Node1",
785 | Namespace: "node",
786 | Kind: "node",
787 | },
788 | },
789 | },
790 | },
791 | },
792 | }
793 | for _, test := range tests {
794 | t.Run(test.description, func(t *testing.T) {
795 | runTest(t, test)
796 | })
797 | }
798 | }
799 |
800 | func TestControllersResolving(t *testing.T) {
801 | var controllersTests = []testScenario{
802 | {
803 | description: "initial snapshot 1 pod controlled by deployment resolve to deployment",
804 | shouldTraverse: true,
805 | initialState: testStep{
806 | shouldWait: false,
807 | newPods: []podDescriptor{
808 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testDeployment},
809 | },
810 | newWorkloadResource: []workloadResourceDescriptor{testDeployment},
811 | expectedResolves: map[string]k8s.Workload{
812 | "1.1.1.1": {
813 | Name: testDeployment.Name,
814 | Namespace: testDeployment.Namespace,
815 | Kind: testDeployment.Kind,
816 | },
817 | },
818 | },
819 | },
820 | {
821 | description: "initial snapshot 1 pod controlled by replicaset resolve to replicaset",
822 | shouldTraverse: true,
823 | initialState: testStep{
824 | shouldWait: false,
825 | newPods: []podDescriptor{
826 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testReplicaSet},
827 | },
828 | newWorkloadResource: []workloadResourceDescriptor{testReplicaSet},
829 | expectedResolves: map[string]k8s.Workload{
830 | "1.1.1.1": {
831 | Name: testReplicaSet.Name,
832 | Namespace: testReplicaSet.Namespace,
833 | Kind: testReplicaSet.Kind,
834 | },
835 | },
836 | },
837 | },
838 | {
839 | description: "initial snapshot 1 pod controlled by daemonset resolve to daemonset",
840 | shouldTraverse: true,
841 | initialState: testStep{
842 | shouldWait: false,
843 | newPods: []podDescriptor{
844 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testDaemonSet},
845 | },
846 | newWorkloadResource: []workloadResourceDescriptor{testDaemonSet},
847 | expectedResolves: map[string]k8s.Workload{
848 | "1.1.1.1": {
849 | Name: testDaemonSet.Name,
850 | Namespace: testDaemonSet.Namespace,
851 | Kind: testDaemonSet.Kind,
852 | },
853 | },
854 | },
855 | },
856 | {
857 | description: "initial snapshot 1 pod controlled by statefulset resolve to statefulset",
858 | shouldTraverse: true,
859 | initialState: testStep{
860 | shouldWait: false,
861 | newPods: []podDescriptor{
862 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testStatefulSet},
863 | },
864 | newWorkloadResource: []workloadResourceDescriptor{testStatefulSet},
865 | expectedResolves: map[string]k8s.Workload{
866 | "1.1.1.1": {
867 | Name: testStatefulSet.Name,
868 | Namespace: testStatefulSet.Namespace,
869 | Kind: testStatefulSet.Kind,
870 | },
871 | },
872 | },
873 | },
874 | {
875 | description: "initial snapshot 1 pod controlled by job resolve to job",
876 | shouldTraverse: true,
877 | initialState: testStep{
878 | shouldWait: false,
879 | newPods: []podDescriptor{
880 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testJob},
881 | },
882 | newWorkloadResource: []workloadResourceDescriptor{testJob},
883 | expectedResolves: map[string]k8s.Workload{
884 | "1.1.1.1": {
885 | Name: testJob.Name,
886 | Namespace: testJob.Namespace,
887 | Kind: testJob.Kind,
888 | },
889 | },
890 | },
891 | },
892 | {
893 | description: "initial snapshot 1 pod controlled by cronjob resolve to cronjob",
894 | shouldTraverse: true,
895 | initialState: testStep{
896 | shouldWait: false,
897 | newPods: []podDescriptor{
898 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testCronjob},
899 | },
900 | newWorkloadResource: []workloadResourceDescriptor{testCronjob},
901 | expectedResolves: map[string]k8s.Workload{
902 | "1.1.1.1": {
903 | Name: testCronjob.Name,
904 | Namespace: testCronjob.Namespace,
905 | Kind: testCronjob.Kind,
906 | },
907 | },
908 | },
909 | },
910 | {
911 | description: "initial snapshot 1 pod controlled by deployment owned by deployment",
912 | shouldTraverse: false,
913 | initialState: testStep{
914 | shouldWait: false,
915 | newPods: []podDescriptor{
916 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testDeployment},
917 | },
918 | newWorkloadResource: []workloadResourceDescriptor{testDeployment},
919 | expectedResolves: map[string]k8s.Workload{
920 | "1.1.1.1": {
921 | Name: "pod1",
922 | Namespace: "namespaceA",
923 | Kind: "pod",
924 | Owner: testDeployment.Name,
925 | },
926 | },
927 | },
928 | },
929 | {
930 | description: "initial snapshot 1 pod controlled by replicaset owned by replicaset",
931 | shouldTraverse: false,
932 | initialState: testStep{
933 | shouldWait: false,
934 | newPods: []podDescriptor{
935 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testReplicaSet},
936 | },
937 | newWorkloadResource: []workloadResourceDescriptor{testReplicaSet},
938 | expectedResolves: map[string]k8s.Workload{
939 | "1.1.1.1": {
940 | Name: "pod1",
941 | Namespace: "namespaceA",
942 | Kind: "pod",
943 | Owner: testReplicaSet.Name,
944 | },
945 | },
946 | },
947 | },
948 | {
949 | description: "initial snapshot 1 pod controlled by daemonset owned by daemonset",
950 | shouldTraverse: false,
951 | initialState: testStep{
952 | shouldWait: false,
953 | newPods: []podDescriptor{
954 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testDaemonSet},
955 | },
956 | newWorkloadResource: []workloadResourceDescriptor{testDaemonSet},
957 | expectedResolves: map[string]k8s.Workload{
958 | "1.1.1.1": {
959 | Name: "pod1",
960 | Namespace: "namespaceA",
961 | Kind: "pod",
962 | Owner: testDaemonSet.Name,
963 | },
964 | },
965 | },
966 | },
967 | {
968 | description: "initial snapshot 1 pod controlled by statefulset owned by statefulset",
969 | shouldTraverse: false,
970 | initialState: testStep{
971 | shouldWait: false,
972 | newPods: []podDescriptor{
973 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testStatefulSet},
974 | },
975 | newWorkloadResource: []workloadResourceDescriptor{testStatefulSet},
976 | expectedResolves: map[string]k8s.Workload{
977 | "1.1.1.1": {
978 | Name: "pod1",
979 | Namespace: "namespaceA",
980 | Kind: "pod",
981 | Owner: testStatefulSet.Name,
982 | },
983 | },
984 | },
985 | },
986 | {
987 | description: "initial snapshot 1 pod controlled by job owned by job",
988 | shouldTraverse: false,
989 | initialState: testStep{
990 | shouldWait: false,
991 | newPods: []podDescriptor{
992 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testJob},
993 | },
994 | newWorkloadResource: []workloadResourceDescriptor{testJob},
995 | expectedResolves: map[string]k8s.Workload{
996 | "1.1.1.1": {
997 | Name: "pod1",
998 | Namespace: "namespaceA",
999 | Kind: "pod",
1000 | Owner: testJob.Name,
1001 | },
1002 | },
1003 | },
1004 | },
1005 | {
1006 | description: "initial snapshot 1 pod controlled by cronjob owned by cronjob",
1007 | shouldTraverse: false,
1008 | initialState: testStep{
1009 | shouldWait: false,
1010 | newPods: []podDescriptor{
1011 | {"pod1", "namespaceA", "1.1.1.1", v1.PodRunning, types.UID(uuid.NewString()), &testCronjob},
1012 | },
1013 | newWorkloadResource: []workloadResourceDescriptor{testCronjob},
1014 | expectedResolves: map[string]k8s.Workload{
1015 | "1.1.1.1": {
1016 | Name: "pod1",
1017 | Namespace: "namespaceA",
1018 | Kind: "pod",
1019 | Owner: testCronjob.Name,
1020 | },
1021 | },
1022 | },
1023 | },
1024 | }
1025 | for _, test := range controllersTests {
1026 | t.Run(test.description, func(t *testing.T) {
1027 | runTest(t, test)
1028 | })
1029 | }
1030 | }
1031 |
--------------------------------------------------------------------------------
/pkg/metrics/prometheus.go:
--------------------------------------------------------------------------------
1 | package metrics
2 |
3 | import (
4 | "log"
5 | "net/http"
6 |
7 | "github.com/prometheus/client_golang/prometheus/promhttp"
8 | )
9 |
10 | func StartMetricsServer(endpoint string, port string) *http.Server {
11 | http.Handle(endpoint, promhttp.Handler())
12 | server := &http.Server{Addr: port}
13 | go func() {
14 | err := server.ListenAndServe()
15 | if err != nil && err != http.ErrServerClosed {
16 | log.Fatalf("Error running prometheus server on port %v - %v", port, err)
17 | }
18 | }()
19 | return server
20 | }
21 |
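A minimal caller sketch — the endpoint and port values are hypothetical; note that http.Server.Addr expects a host:port string, so the port argument should include the leading colon:

server := metrics.StartMetricsServer("/metrics", ":7117")
// ... serve until shutdown ...
_ = server.Shutdown(context.Background()) // ListenAndServe then returns http.ErrServerClosed
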
--------------------------------------------------------------------------------
/pkg/tracing/ebpf/arm_support.h:
--------------------------------------------------------------------------------
1 | #ifndef __ARM_SUPPORT_H__
2 | #define __ARM_SUPPORT_H__
3 |
4 | struct user_pt_regs {
5 | __u64 regs[31];
6 | __u64 sp;
7 | __u64 pc;
8 | __u64 pstate;
9 | };
10 |
11 | #endif
--------------------------------------------------------------------------------
/pkg/tracing/ebpf/caretta.bpf.c:
--------------------------------------------------------------------------------
1 | #include "core_structures.h"
2 | #include "arm_support.h"
3 | #include <bpf/bpf_core_read.h>
4 | #include <bpf/bpf_helpers.h>
5 | #include <bpf/bpf_tracing.h>
6 | #include "ebpf_utils.h"
7 | #include "epbf_shared_types.h"
8 | #include "ebpf_internal_types.h"
9 |
10 | char __license[] SEC("license") = "Dual MIT/GPL";
11 |
12 | // internal kernel-only map to hold state for each sock observed.
13 | struct bpf_map_def SEC("maps") sock_infos = {
14 | .type = BPF_MAP_TYPE_HASH,
15 | .key_size = sizeof(struct sock *),
16 | .value_size = sizeof(struct sock_info),
17 | .max_entries = MAX_CONNECTIONS,
18 | };
19 |
20 | // the main product of the tracing - map containing all connections observed,
21 | // with metadata and throughput stats.
22 | // key is a whole identifier struct and not a single id to split the constant
23 | // and dynamic values and to resemble as closely as possible the end result in
24 | // the userspace code.
25 | struct bpf_map_def SEC("maps") connections = {
26 | .type = BPF_MAP_TYPE_HASH,
27 | .key_size = sizeof(struct connection_identifier),
28 | .value_size = sizeof(struct connection_throughput_stats),
29 | .max_entries = MAX_CONNECTIONS,
30 | };
31 |
32 | // helper to convert a 16-bit int from big-endian to little-endian, e.g. port 80 read as 0x5000 -> 0x0050
33 | static inline u16 be_to_le(__be16 be) { return (be >> 8) | (be << 8); }
34 |
35 | static inline u32 get_unique_id() {
36 | return bpf_ktime_get_ns() % __UINT32_MAX__; // no reason to use 64 bit for this
37 | }
38 |
39 | // function for parsing the struct sock
40 | static inline int
41 | parse_sock_data(struct sock *sock, struct connection_tuple *out_tuple,
42 | struct connection_throughput_stats *out_throughput) {
43 |
44 | if (sock == NULL) {
45 | return BPF_ERROR;
46 | }
47 |
48 | // struct sock wraps struct tcp_sock and struct inet_sock as its first member
49 | struct tcp_sock *tcp = (struct tcp_sock *)sock;
50 | struct inet_sock *inet = (struct inet_sock *)sock;
51 |
52 | // initialize variables. IP addresses and ports are read originally
53 | // big-endian, and we will convert the ports to little-endian.
54 | __be16 src_port_be = 0;
55 | __be16 dst_port_be = 0;
56 |
57 | // read connection tuple
58 |
59 | if (0 != bpf_core_read(&out_tuple->src_ip, sizeof(out_tuple->src_ip),
60 | &inet->inet_saddr)) {
61 | return BPF_ERROR;
62 | }
63 |
64 | if (0 != bpf_core_read(&out_tuple->dst_ip, sizeof(out_tuple->dst_ip),
65 | &inet->inet_daddr)) {
66 | return BPF_ERROR;
67 | }
68 |
69 | if (0 != bpf_core_read(&src_port_be, sizeof(src_port_be), &inet->inet_sport)) {
70 | return BPF_ERROR;
71 | }
72 | out_tuple->src_port = be_to_le(src_port_be);
73 |
74 | if (0 != bpf_core_read(&dst_port_be, sizeof(dst_port_be), &inet->inet_dport)) {
75 | return BPF_ERROR;
76 | }
77 | out_tuple->dst_port = be_to_le(dst_port_be);
78 |
79 | // read throughput data
80 |
81 | if (0 != bpf_core_read(&out_throughput->bytes_received,
82 | sizeof(out_throughput->bytes_received),
83 | &tcp->bytes_received)) {
84 | return BPF_ERROR;
85 | }
86 | if (0 != bpf_core_read(&out_throughput->bytes_sent,
87 | sizeof(out_throughput->bytes_sent), &tcp->bytes_sent)) {
88 | return BPF_ERROR;
89 | }
90 |
91 | return BPF_SUCCESS;
92 | };
93 |
94 | static inline enum connection_role get_sock_role(struct sock* sock) {
95 | // max_ack_backlog holds the limit of the accept queue;
96 | // it is non-zero only on server-side (listening) sockets
97 | int max_ack_backlog = 0;
98 | if (0 != bpf_core_read(&max_ack_backlog, sizeof(max_ack_backlog),
99 | &sock->sk_max_ack_backlog)) {
100 | return CONNECTION_ROLE_UNKNOWN;
101 | }
102 |
103 | return max_ack_backlog == 0 ? CONNECTION_ROLE_CLIENT : CONNECTION_ROLE_SERVER;
104 | }
105 |
106 | // probing the tcp_data_queue kernel function, and adding the connection
107 | // observed to the map.
108 | SEC("kprobe/tcp_data_queue")
109 | static int handle_tcp_data_queue(struct pt_regs *ctx) {
110 | // first argument to tcp_data_queue is a struct sock*
111 | struct sock *sock = (struct sock *)PT_REGS_PARM1(ctx);
112 |
113 | struct connection_identifier conn_id = {};
114 | struct connection_throughput_stats throughput = {};
115 |
116 | if (parse_sock_data(sock, &conn_id.tuple, &throughput) == BPF_ERROR) {
117 | return BPF_ERROR;
118 | }
119 |
120 | // skip unconnected sockets
121 | if (conn_id.tuple.dst_port == 0 && conn_id.tuple.dst_ip == 0) {
122 | return BPF_SUCCESS;
123 | }
124 |
125 | // fill the conn_id extra details from sock_info map entry, or create one
126 | struct sock_info *sock_info = bpf_map_lookup_elem(&sock_infos, &sock);
127 | if (sock_info == NULL) {
128 | // first time we encounter this sock
129 | // check if server or client and insert to the maps
130 |
131 | enum connection_role role = get_sock_role(sock);
132 |
133 | struct sock_info info = {
134 | .pid = 0, // can't associate to pid anyway
135 | .role = role,
136 | .is_active = true,
137 | .id = get_unique_id(),
138 | };
139 | bpf_map_update_elem(&sock_infos, &sock, &info, BPF_ANY);
140 |
141 | conn_id.pid = info.pid;
142 | conn_id.id = info.id;
143 | conn_id.role = info.role;
144 | throughput.is_active = true;
145 |
146 | bpf_map_update_elem(&connections, &conn_id, &throughput, BPF_ANY);
147 |
148 | return BPF_SUCCESS;
149 |
150 | }
151 |
152 | conn_id.pid = sock_info->pid;
153 | conn_id.id = sock_info->id;
154 | conn_id.role = sock_info->role;
155 | if (!sock_info->is_active) {
156 | return BPF_ERROR;
157 | }
158 | throughput.is_active = sock_info->is_active;
159 |
160 | bpf_map_update_elem(&connections, &conn_id, &throughput, BPF_ANY);
161 |
162 | return BPF_SUCCESS;
163 | };
164 |
165 | static inline int handle_set_tcp_syn_sent(struct sock* sock) {
166 | // start of a client session
167 | u32 pid = bpf_get_current_pid_tgid() >> 32;
168 |
169 | struct sock_info info = {
170 | .pid = pid,
171 | .role = CONNECTION_ROLE_CLIENT,
172 | .is_active = true,
173 | .id = get_unique_id(),
174 | };
175 |
176 | bpf_map_update_elem(&sock_infos, &sock, &info, BPF_ANY);
177 |
178 | return BPF_SUCCESS;
179 | }
180 |
181 | static inline int handle_set_tcp_syn_recv(struct sock* sock) {
182 | // this is a server getting syn after listen
183 | struct connection_identifier conn_id = {};
184 | struct connection_throughput_stats throughput = {};
185 |
186 | if (parse_sock_data(sock, &conn_id.tuple, &throughput) == BPF_ERROR) {
187 | return BPF_ERROR;
188 | }
189 |
190 | struct sock_info info = {
191 | .pid = 0, // can't associate to process
192 | .role = CONNECTION_ROLE_SERVER,
193 | .is_active = true,
194 | .id = get_unique_id(),
195 | };
196 |
197 | bpf_map_update_elem(&sock_infos, &sock, &info, BPF_ANY);
198 |
199 | // at this point the dst ip may still be uninitialized - if so, skip publishing the connection
200 | if (conn_id.tuple.dst_ip == 0) {
201 | return BPF_SUCCESS;
202 | }
203 |
204 | conn_id.pid = info.pid;
205 | conn_id.id = info.id;
206 | conn_id.role = info.role;
207 |
208 | bpf_map_update_elem(&connections, &conn_id, &throughput, BPF_ANY);
209 |
210 | return BPF_SUCCESS;
211 | }
212 |
213 | static inline int handle_set_tcp_close(struct sock* sock) {
214 | // mark as inactive
215 | struct connection_identifier conn_id = {};
216 | struct connection_throughput_stats throughput = {};
217 |
218 | if (parse_sock_data(sock, &conn_id.tuple, &throughput) == BPF_ERROR) {
219 | return BPF_ERROR;
220 | }
221 |
222 | struct sock_info *info = bpf_map_lookup_elem(&sock_infos, &sock);
223 | if (info == NULL) {
224 | conn_id.id = get_unique_id();
225 | conn_id.pid = 0; // cannot associate to PID in this state
226 | conn_id.role = get_sock_role(sock);
227 | } else {
228 | conn_id.id = info->id;
229 | conn_id.pid = info->pid;
230 | conn_id.role = info->role;
231 | bpf_map_delete_elem(&sock_infos, &sock);
232 | }
233 |
234 | throughput.is_active = false;
235 | bpf_map_update_elem(&connections, &conn_id, &throughput, BPF_ANY);
236 |
237 | return BPF_SUCCESS;
238 | }
239 |
240 | SEC("tracepoint/sock/inet_sock_set_state")
241 | int handle_sock_set_state(struct set_state_args *args) {
242 | struct sock *sock = (struct sock *)args->skaddr;
243 |
244 | switch(args->newstate) {
245 | case TCP_SYN_RECV: {
246 | return handle_set_tcp_syn_recv(sock) == BPF_ERROR;
247 | }
248 | case TCP_SYN_SENT: {
249 | return handle_set_tcp_syn_sent(sock) == BPF_ERROR;
250 | }
251 | case TCP_CLOSE: {
252 |       return handle_set_tcp_close(sock) == BPF_ERROR;
253 | }
254 | }
255 |
256 | return BPF_SUCCESS;
257 | }
--------------------------------------------------------------------------------
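The handlers above publish one `connection_identifier` → `connection_throughput_stats` entry per socket into the `connections` map, which user space then reads. As a rough illustration of the consumer side, here is a hedged Go sketch of mirror structs matching the C layout on a 64-bit target. The names are illustrative only; the project's real user-space definitions live in pkg/caretta/types.go and may differ.

```go
// Package example: illustrative Go mirrors of the structs shared with the
// eBPF side. Field order and sizes follow epbf_shared_types.h; both
// top-level structs are naturally packed at 24 bytes, so no explicit
// padding fields are needed.
package example

type ConnectionTuple struct {
	SrcIP   uint32 // __be32
	DstIP   uint32 // __be32
	SrcPort uint16 // byte order depends on how parse_sock_data filled it
	DstPort uint16 // likewise
}

type ConnectionIdentifier struct {
	ID    uint32 // uniquely generated id
	PID   uint32 // 0 when the probe could not associate a process
	Tuple ConnectionTuple
	Role  uint32 // enum connection_role
}

type ConnectionThroughputStats struct {
	BytesSent     uint64
	BytesReceived uint64
	IsActive      uint64 // cleared by handle_set_tcp_close
}
```

Because cilium/ebpf unmarshals map keys and values byte-for-byte into fixed-size Go types, keeping these mirrors layout-compatible with the C structs is what makes map iteration work.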
/pkg/tracing/ebpf/core_structures.h:
--------------------------------------------------------------------------------
1 | #ifndef __CORE_STRUCTURES_H__
2 | #define __CORE_STRUCTURES_H__
3 |
4 | #include <vmlinux.h>
5 |
6 | /*
7 | * All structs and unions in this file should have a "preserve access index"
8 | * attribute. The following attaches this attribute to all records (structs,
9 | * unions, classes).
10 | * @see https://clang.llvm.org/docs/LanguageExtensions.html
11 | */
12 | #pragma clang attribute push
13 | #pragma clang attribute(__attribute__((preserve_access_index)), \
14 | apply_to = record)
15 |
16 | // this is not a core structure per se, but it would have been defined in a
17 | // full vmlinux.h
18 | enum {
19 | false = 0,
20 | true = 1,
21 | };
22 |
23 | enum {
24 | TCP_ESTABLISHED = 1,
25 | TCP_SYN_SENT = 2,
26 | TCP_SYN_RECV = 3,
27 | TCP_FIN_WAIT1 = 4,
28 | TCP_FIN_WAIT2 = 5,
29 | TCP_TIME_WAIT = 6,
30 | TCP_CLOSE = 7,
31 | TCP_CLOSE_WAIT = 8,
32 | TCP_LAST_ACK = 9,
33 | TCP_LISTEN = 10,
34 | TCP_CLOSING = 11,
35 | TCP_NEW_SYN_RECV = 12,
36 | TCP_MAX_STATES = 13,
37 | };
38 |
39 |
40 | typedef u16 sa_family_t;
41 | typedef u32 socklen_t;
42 |
43 | struct in_addr {
44 | __be32 s_addr;
45 | };
46 |
47 | struct in6_addr {
48 | union {
49 | __u8 u6_addr8[16];
50 | } in6_u;
51 | };
52 |
53 | struct sockaddr_in {
54 | sa_family_t sin_family;
55 | __be16 sin_port;
56 | struct in_addr sin_addr;
57 | };
58 |
59 | struct sockaddr_in6 {
60 | sa_family_t sin6_family;
61 | __be16 sin6_port;
62 | struct in6_addr sin6_addr;
63 | };
64 |
65 | struct sockaddr {
66 | sa_family_t sa_family;
67 | };
68 |
69 | struct sock_common {
70 | struct {
71 | __be32 skc_daddr;
72 | __be32 skc_rcv_saddr;
73 | };
74 | struct {
75 | __be16 skc_dport;
76 | __u16 skc_num;
77 | };
78 | unsigned short skc_family;
79 | struct in6_addr skc_v6_daddr;
80 | };
81 |
82 | struct sock {
83 | struct sock_common __sk_common;
84 | unsigned int sk_shutdown : 2, sk_no_check_tx : 1, sk_no_check_rx : 1,
85 | sk_userlocks : 4, sk_protocol : 8, sk_type : 16;
86 | u32 sk_max_ack_backlog;
87 | };
88 |
89 | struct socket {
90 | struct sock *sk;
91 | };
92 |
93 | struct ipv6_pinfo {
94 | struct in6_addr saddr;
95 | };
96 |
97 | struct inet_sock {
98 | struct sock sk;
99 | struct ipv6_pinfo *pinet6;
100 | __be32 inet_saddr;
101 | __be16 inet_sport;
102 | };
103 |
104 | struct tcp_sock {
105 | u64 bytes_received;
106 | u64 bytes_sent;
107 | };
108 |
109 | typedef u8 u_int8_t;
110 | typedef u16 u_int16_t;
111 |
112 | #pragma clang attribute pop
113 |
114 | #endif // __CORE_STRUCTURES_H__
115 |
--------------------------------------------------------------------------------
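The `__be32`/`__be16` fields in `sock_common` above are stored in network (big-endian) byte order, while `skc_num` is host order. When user space copies these values verbatim out of a BPF map, it has to restore the byte order itself. Below is a small hedged Go sketch of what that conversion can look like; the helper names are hypothetical, and `binary.NativeEndian` requires Go 1.21+.

```go
package main

import (
	"encoding/binary"
	"fmt"
	"net"
)

// be32ToIP reinterprets a __be32 IPv4 address (e.g. skc_daddr) that was
// copied verbatim out of kernel memory into a Go uint32. Writing the
// value back out in native order restores the original byte sequence.
func be32ToIP(addr uint32) net.IP {
	ip := make(net.IP, 4)
	binary.NativeEndian.PutUint32(ip, addr)
	return ip
}

// be16ToPort converts a __be16 port (e.g. skc_dport) to host order.
// Note that skc_num is already host order and needs no conversion.
func be16ToPort(port uint16) uint16 {
	var raw [2]byte
	binary.NativeEndian.PutUint16(raw[:], port)
	return binary.BigEndian.Uint16(raw[:])
}

func main() {
	// On a little-endian host, 127.0.0.1:80 as read from sock_common:
	fmt.Println(be32ToIP(0x0100007f), be16ToPort(0x5000)) // 127.0.0.1 80
}
```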
/pkg/tracing/ebpf/ebpf_internal_types.h:
--------------------------------------------------------------------------------
1 | #ifndef __EBPF_INTERNAL_TYPES_H__
2 | #define __EBPF_INTERNAL_TYPES_H__
3 |
4 | #include "epbf_shared_types.h"
5 |
6 | #define MAX_CONNECTIONS 1000000
7 |
8 | // internal kernel-only struct to hold socket information which can't be parsed
9 | // from struct sock.
10 | struct sock_info {
11 |   u32 pid;
12 |   enum connection_role role;
13 |   u32 is_active;
14 |   u32 id;
15 | };
16 |
17 | // partial struct of the arguments for the inet_sock_set_state tracepoint;
18 | // the leading u64 stands in for the common tracepoint header fields
19 | struct set_state_args {
20 |   u64 padding;
21 |   struct sock *skaddr;
22 |   u32 oldstate;
23 |   u32 newstate;
24 |   // more fields follow, unused here
25 | };
26 |
27 | #endif // __EBPF_INTERNAL_TYPES_H__
28 |
--------------------------------------------------------------------------------
/pkg/tracing/ebpf/ebpf_utils.h:
--------------------------------------------------------------------------------
1 | #ifndef __EBPF_UTILS_H__
2 | #define __EBPF_UTILS_H__
3 |
4 | #define BPF_SUCCESS 0
5 | #define BPF_ERROR -1
6 |
7 | #endif
--------------------------------------------------------------------------------
/pkg/tracing/ebpf/epbf_shared_types.h:
--------------------------------------------------------------------------------
1 | #ifndef __EBPF_SHARED_TYPES_H__
2 | #define __EBPF_SHARED_TYPES_H__
3 | #include "vmlinux.h"
4 |
5 | // helper defs for inet_sock. These are defined in inet_sock.h, but not copied
6 | // automatically to vmlinux.h
7 | #define inet_daddr sk.__sk_common.skc_daddr
8 | #define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr
9 | #define inet_dport sk.__sk_common.skc_dport
10 | #define inet_num sk.__sk_common.skc_num
11 |
12 |
13 | enum connection_role {
14 | CONNECTION_ROLE_UNKNOWN = 0,
15 | CONNECTION_ROLE_CLIENT,
16 | CONNECTION_ROLE_SERVER,
17 | };
18 |
19 | // describes the two sides of a connection; constant for the connection's lifetime.
20 | struct connection_tuple {
21 | __be32 src_ip;
22 | __be32 dst_ip;
23 | u16 src_port;
24 | u16 dst_port;
25 | };
26 |
27 | // all information needed to identify a specific connection.
28 | // due to socket reuse, each of the members (besides id) may change while
29 | // the others stay the same.
30 | struct connection_identifier {
31 | u32 id; // uniquely generated id
32 | u32 pid;
33 | struct connection_tuple tuple;
34 | enum connection_role role;
35 | };
36 |
37 | // dynamic information about the state of a connection.
38 | struct connection_throughput_stats {
39 | u64 bytes_sent;
40 | u64 bytes_received;
41 |   u64 is_active; // u64 because it will be padded anyway; revisit if new
42 |                  // members are added
43 | };
44 |
45 | #endif
--------------------------------------------------------------------------------
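Since `is_active` is only cleared, never deleted, by the close handler, user space is responsible for pruning finished connections after exporting their final byte counts. The following hedged Go sketch shows one way to do that with cilium/ebpf, reusing the mirror types sketched after caretta.bpf.c above; `pruneInactive` is hypothetical and is not caretta's actual pruning logic.

```go
package example

import "github.com/cilium/ebpf"

// pruneInactive walks the connections map once, collects entries whose
// is_active flag was cleared by handle_set_tcp_close, and deletes them.
// Deletion is deferred until after iteration so the hash-map iterator
// is not invalidated mid-walk.
func pruneInactive(connMap *ebpf.Map) error {
	var (
		key      ConnectionIdentifier      // mirror of struct connection_identifier
		val      ConnectionThroughputStats // mirror of struct connection_throughput_stats
		toDelete []ConnectionIdentifier
	)
	it := connMap.Iterate()
	for it.Next(&key, &val) {
		if val.IsActive == 0 {
			toDelete = append(toDelete, key)
		}
	}
	if err := it.Err(); err != nil {
		return err
	}
	for i := range toDelete {
		if err := connMap.Delete(&toDelete[i]); err != nil {
			return err
		}
	}
	return nil
}
```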
/pkg/tracing/probes.go:
--------------------------------------------------------------------------------
1 | package tracing
2 |
3 | import (
4 | "errors"
5 | "fmt"
6 | "log"
7 |
8 | "github.com/cilium/ebpf"
9 | "github.com/cilium/ebpf/btf"
10 | "github.com/cilium/ebpf/link"
11 | "github.com/cilium/ebpf/rlimit"
12 | )
13 |
14 | type Probes struct {
15 | Kprobe link.Link
16 | Tracepoint link.Link
17 | BpfObjs bpfObjects
18 | }
19 |
20 | func LoadProbes() (Probes, *ebpf.Map, error) {
21 | if err := rlimit.RemoveMemlock(); err != nil {
22 | return Probes{}, nil, fmt.Errorf("error removing memory lock - %v", err)
23 | }
24 |
25 | objs := bpfObjects{}
26 | err := loadBpfObjects(&objs, &ebpf.CollectionOptions{})
27 | if err != nil {
28 | var ve *ebpf.VerifierError
29 | if errors.As(err, &ve) {
30 | fmt.Printf("Verifier Error: %+v\n", ve)
31 | }
32 | 		return Probes{}, nil, fmt.Errorf("error loading BPF objects from the Go side: %v", err)
33 | }
34 | log.Printf("BPF objects loaded")
35 |
36 | // attach a kprobe and tracepoint
37 | kp, err := link.Kprobe("tcp_data_queue", objs.bpfPrograms.HandleTcpDataQueue, nil)
38 | if err != nil {
39 | return Probes{}, nil, fmt.Errorf("error attaching kprobe: %v", err)
40 | }
41 | log.Printf("Kprobe attached successfully")
42 |
43 | tp, err := link.Tracepoint("sock", "inet_sock_set_state", objs.bpfPrograms.HandleSockSetState, nil)
44 | if err != nil {
45 | return Probes{}, nil, fmt.Errorf("error attaching tracepoint: %v", err)
46 | }
47 | log.Printf("Tracepoint attached successfully")
48 |
49 | 	// We are done loading the probes - clear the cached kernel BTF
50 | btf.FlushKernelSpec()
51 |
52 | return Probes{
53 | Kprobe: kp,
54 | Tracepoint: tp,
55 | BpfObjs: objs,
56 | }, objs.Connections, nil
57 | }
58 |
59 | func (objs *Probes) UnloadProbes() error {
60 | 	// if any close operation fails, we still try to close the rest of the
61 | 	// struct's resources, and return the first error encountered
62 | 	var resultErr error
63 |
64 | 	if err := objs.Kprobe.Close(); err != nil {
65 | 		resultErr = err
66 | 	}
67 | 	if err := objs.Tracepoint.Close(); err != nil && resultErr == nil {
68 | 		resultErr = err
69 | 	}
70 | 	if err := objs.BpfObjs.Close(); err != nil && resultErr == nil {
71 | 		resultErr = err
72 | 	}
73 |
74 | 	return resultErr
75 | }
76 |
--------------------------------------------------------------------------------
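For completeness, here is a hedged sketch of how a caller might wire `LoadProbes` and `UnloadProbes` together. The import path is assumed from the repository name (check go.mod), and this hypothetical `main` stands in for the real entry point in cmd/caretta/caretta.go.

```go
package main

import (
	"log"
	"os"
	"os/signal"

	"github.com/groundcover-com/caretta/pkg/tracing" // module path assumed, check go.mod
)

func main() {
	probes, connMap, err := tracing.LoadProbes()
	if err != nil {
		log.Fatalf("failed to load probes: %v", err)
	}
	log.Printf("connections map ready: %v", connMap)

	// block until interrupted, then detach the kprobe, the tracepoint,
	// and the BPF objects in one call
	sig := make(chan os.Signal, 1)
	signal.Notify(sig, os.Interrupt)
	<-sig

	if err := probes.UnloadProbes(); err != nil {
		log.Printf("failed to unload probes: %v", err)
	}
}
```

Deferring cleanup to a single `UnloadProbes` call matches the struct's design: the kprobe, tracepoint, and loaded objects are owned together and torn down together.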
/scripts/build/download_libbpf_headers.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # This downloads the libbpf headers we need to compile eBPF code.
4 | # The script is based on cilium's update headers script,
5 | # https://github.com/cilium/ebpf/blob/4420605496c54a45653a7f1d277896e71e6705e2/examples/headers/update.sh#L1
6 |
7 | # Version of libbpf to fetch headers from
8 | LIBBPF_VERSION=0.6.1
9 |
10 | # Version of cilium ebpf repository to fetch vmlinux from
11 | CILIUM_VMLINUX_VERSION=0.10.0
12 |
13 | HEADERS_DIRECTORY="/tmp/caretta_extra/libbpf_headers"
14 |
15 | # The headers we want
16 | prefix=libbpf-"$LIBBPF_VERSION"
17 | headers=(
18 | "$prefix"/src/bpf_endian.h
19 | "$prefix"/src/bpf_helper_defs.h
20 | "$prefix"/src/bpf_helpers.h
21 | "$prefix"/src/bpf_tracing.h
22 | "$prefix"/src/bpf_core_read.h
23 | )
24 |
25 | if [ ! -d "pkg" ] ; then
26 | echo "Run this scripts from the repository's root directory." 1>&2
27 | exit 1
28 | fi
29 |
30 | if [ ! -d "$HEADERS_DIRECTORY" ]; then
31 | mkdir -p "$HEADERS_DIRECTORY"
32 | if [ "$?" -ne 0 ]; then
33 | echo "Failed to create libbpf headers directory \""$HEADERS_DIRECTORY"\"." 1>&2
34 | exit 1
35 | fi
36 | fi
37 |
38 | # Fetch libbpf release and extract the desired headers
39 | curl -sL --connect-timeout 10 --max-time 10 \
40 | "https://github.com/libbpf/libbpf/archive/refs/tags/v${LIBBPF_VERSION}.tar.gz" | \
41 | tar -xz --xform='s#.*/##' -C "$HEADERS_DIRECTORY" "${headers[@]}"
42 | if [ "$?" -ne 0 ]; then
43 | echo "Failed to download and extract the needed libbpf headers." 1>&2
44 | exit 1
45 | fi
46 |
47 | # Fetch compact vmlinux file from cilium's ebpf repository.
48 | # This is not a libbpf header per se, but it's close enough that we put it in the same location.
49 | curl -s -o "$HEADERS_DIRECTORY"/vmlinux.h \
50 | https://raw.githubusercontent.com/cilium/ebpf/v${CILIUM_VMLINUX_VERSION}/examples/headers/common.h
51 | if [ "$?" -ne 0 ]; then
52 | echo "Failed to download vmlinux compact version from cilium's repository."
53 | exit 1
54 | fi
55 |
56 | echo "Successfully downloaded libbpf headers." 1>&2
57 |
--------------------------------------------------------------------------------