├── assets └── podtrace-logo.png ├── embedded_bpf.go ├── test ├── pool-test │ ├── go.mod │ ├── go.sum │ └── pool-test-app.go ├── prometheus.yml ├── docker-compose.yml ├── README.md ├── quick-test.sh ├── cleanup-test-pods.sh ├── test-debug.sh ├── test-pods.yaml ├── setup-test-pods.sh ├── run-tests.sh ├── test-pods-full.yaml └── integration_test.go ├── internal ├── kubernetes │ ├── export_test.go │ ├── resolver.go │ ├── errors.go │ ├── errors_test.go │ ├── service_resolver.go │ └── events_correlator.go ├── ebpf │ ├── tracer_wrapper.go │ ├── tracer │ │ ├── interface.go │ │ ├── errors.go │ │ └── errors_test.go │ ├── loader │ │ ├── errors.go │ │ ├── loader.go │ │ ├── loader_test.go │ │ └── errors_test.go │ ├── probes │ │ └── errors.go │ ├── cache │ │ ├── pathcache.go │ │ ├── cache.go │ │ └── lru.go │ ├── tracer_wrapper_test.go │ └── parser │ │ └── parser.go ├── cri │ ├── jsonfind_test.go │ ├── jsonfind.go │ └── resolver_test.go ├── diagnose │ ├── analyzer │ │ ├── common.go │ │ ├── cpu.go │ │ ├── filesystem.go │ │ ├── dns.go │ │ ├── tls.go │ │ ├── network.go │ │ ├── pool.go │ │ └── tls_test.go │ ├── sampling.go │ ├── errors.go │ ├── profiling │ │ └── cpu_profiling_test.go │ ├── detector │ │ └── issues.go │ ├── formatter │ │ ├── formatter.go │ │ └── formatter_test.go │ ├── tracker │ │ ├── pod_communication_test.go │ │ ├── process.go │ │ ├── connection.go │ │ └── trace_tracker_test.go │ └── stacktrace │ │ └── stacktrace.go ├── alerting │ ├── rate_limiter.go │ ├── rate_limiter_test.go │ ├── deduplicator.go │ ├── sender.go │ ├── deduplicator_test.go │ ├── logger_hook.go │ ├── manager_test.go │ ├── webhook.go │ ├── splunk.go │ ├── alert.go │ └── manager.go ├── logger │ ├── logger_test.go │ └── logger.go └── tracing │ ├── extractor │ ├── http_test.go │ └── http.go │ ├── exporter │ ├── splunk.go │ └── otlp.go │ └── context │ └── context.go ├── bpf ├── podtrace.bpf.c ├── filesystem.h ├── events.h ├── common.h ├── memory.c ├── resources.c └── maps.h ├── .gitignore ├── scripts ├── build-and-setup.sh └── setup-capabilities.sh ├── cmd └── podtrace │ ├── interrupt_test.go │ ├── mocks.go │ ├── export_test.go │ ├── main_test.go │ └── diagnose_env.go ├── .github ├── ISSUE_TEMPLATE │ ├── epic.yaml │ ├── feature_request.yaml │ ├── adopters.yaml │ └── bug_report.yaml └── workflows │ ├── bash-checks.yml │ ├── security.yml │ └── ebpf-build.yml ├── doc └── README.md └── go.mod /assets/podtrace-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gma1k/podtrace/HEAD/assets/podtrace-logo.png -------------------------------------------------------------------------------- /embedded_bpf.go: -------------------------------------------------------------------------------- 1 | package podtrace 2 | 3 | import _ "embed" 4 | 5 | var EmbeddedPodtraceBPFObj []byte 6 | -------------------------------------------------------------------------------- /test/pool-test/go.mod: -------------------------------------------------------------------------------- 1 | module pool-test 2 | 3 | go 1.21 4 | 5 | require github.com/mattn/go-sqlite3 v1.14.32 6 | -------------------------------------------------------------------------------- /test/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 5s 3 | 4 | scrape_configs: 5 | - job_name: 'podtrace' 6 | static_configs: 7 | - targets: ['172.17.0.1:3000'] 8 | -------------------------------------------------------------------------------- 
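Note: `embedded_bpf.go` above declares `EmbeddedPodtraceBPFObj` with a blank `embed` import but no visible `//go:embed` directive, so in this snapshot the slice stays empty and the loader (see `internal/ebpf/loader/loader.go` below) treats it as an optional fallback. A minimal sketch of how the declaration could be wired up once `bpf/podtrace.bpf.o` has been built — this is an assumption, not something the snapshot confirms:

```go
package podtrace

import _ "embed"

// Assumption: bpf/podtrace.bpf.o must exist at compile time for go:embed,
// which is likely why this snapshot ships the directive-less variant.
//go:embed bpf/podtrace.bpf.o
var EmbeddedPodtraceBPFObj []byte
```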
/test/pool-test/go.sum: -------------------------------------------------------------------------------- 1 | github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs= 2 | github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= 3 | -------------------------------------------------------------------------------- /internal/kubernetes/export_test.go: -------------------------------------------------------------------------------- 1 | package kubernetes 2 | 3 | import "k8s.io/client-go/kubernetes" 4 | 5 | func NewPodResolverForTesting(clientset kubernetes.Interface) *PodResolver { 6 | return &PodResolver{ 7 | clientset: clientset, 8 | } 9 | } 10 | 11 | -------------------------------------------------------------------------------- /internal/ebpf/tracer_wrapper.go: -------------------------------------------------------------------------------- 1 | package ebpf 2 | 3 | import ( 4 | "github.com/podtrace/podtrace/internal/ebpf/tracer" 5 | ) 6 | 7 | type TracerInterface = tracer.TracerInterface 8 | 9 | func NewTracer() (TracerInterface, error) { 10 | return tracer.NewTracer() 11 | } 12 | 13 | func WaitForInterrupt() { 14 | tracer.WaitForInterrupt() 15 | } 16 | 17 | -------------------------------------------------------------------------------- /internal/ebpf/tracer/interface.go: -------------------------------------------------------------------------------- 1 | package tracer 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/podtrace/podtrace/internal/events" 7 | ) 8 | 9 | type TracerInterface interface { 10 | AttachToCgroup(cgroupPath string) error 11 | SetContainerID(containerID string) error 12 | Start(ctx context.Context, eventChan chan<- *events.Event) error 13 | Stop() error 14 | } 15 | 16 | -------------------------------------------------------------------------------- /internal/kubernetes/resolver.go: -------------------------------------------------------------------------------- 1 | package kubernetes 2 | 3 | import ( 4 | "context" 5 | 6 | "k8s.io/client-go/kubernetes" 7 | ) 8 | 9 | type PodResolverInterface interface { 10 | ResolvePod(ctx context.Context, podName, namespace, containerName string) (*PodInfo, error) 11 | } 12 | 13 | type ClientsetProvider interface { 14 | GetClientset() kubernetes.Interface 15 | } 16 | 17 | -------------------------------------------------------------------------------- /bpf/podtrace.bpf.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 2 | 3 | #include "common.h" 4 | #include "maps.h" 5 | #include "events.h" 6 | #include "helpers.h" 7 | 8 | #include "network.c" 9 | #include "filesystem.c" 10 | #include "cpu.c" 11 | #include "memory.c" 12 | #include "syscalls.c" 13 | #include "resources.c" 14 | #include "database.c" 15 | 16 | char LICENSE[] SEC("license") = "GPL"; 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries 2 | bin/ 3 | *.o 4 | *.so 5 | 6 | # Go 7 | *.exe 8 | *.test 9 | *.prof 10 | vendor/ 11 | 12 | # eBPF 13 | bpf/*.o 14 | vmlinux.h 15 | 16 | # IDE 17 | .idea/ 18 | .vscode/ 19 | *.swp 20 | *.swo 21 | 22 | # OS 23 | .DS_Store 24 | Thumbs.db 25 | 26 | # Build artifacts 27 | *.a 28 | 29 | # Test coverage 30 | coverage.out 31 | *.coverprofile 32 | *.out 33 | test_output.log 34 | -------------------------------------------------------------------------------- 
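The tracer pieces above compose in a predictable order: resolve the pod's cgroup, attach, scope to a container, then stream events. A minimal caller sketch using only the `TracerInterface` methods defined in `interface.go` — the function name and channel capacity are illustrative assumptions, mirroring what `cmd/podtrace` presumably does inside the module:

```go
package main

import (
	"context"

	"github.com/podtrace/podtrace/internal/ebpf"
	"github.com/podtrace/podtrace/internal/events"
)

// traceCgroup is a hypothetical driver showing the call order the
// interface implies: attach, scope to a container, then start streaming.
func traceCgroup(ctx context.Context, cgroupPath, containerID string) error {
	tr, err := ebpf.NewTracer()
	if err != nil {
		return err
	}
	defer func() { _ = tr.Stop() }() // detach probes on exit

	if err := tr.AttachToCgroup(cgroupPath); err != nil {
		return err
	}
	if err := tr.SetContainerID(containerID); err != nil {
		return err
	}

	eventChan := make(chan *events.Event, 1024) // capacity is an arbitrary choice
	return tr.Start(ctx, eventChan)
}
```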
/bpf/filesystem.h: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 2 | 3 | #ifndef PODTRACE_FILESYSTEM_H 4 | #define PODTRACE_FILESYSTEM_H 5 | 6 | #include "common.h" 7 | #include "maps.h" 8 | 9 | static inline int get_path_str_from_file(struct file *file, char *out_buf, u32 buf_size) 10 | { 11 | if (file == NULL || out_buf == NULL || buf_size < 2) 12 | return 0; 13 | 14 | struct path path; 15 | bpf_core_read(&path, sizeof(path), &file->f_path); 16 | 17 | struct dentry *dentry; 18 | bpf_core_read(&dentry, sizeof(dentry), &path.dentry); 19 | if (dentry == NULL) 20 | return 0; 21 | 22 | out_buf[0] = '\0'; 23 | return 0; 24 | } 25 | 26 | #endif 27 | 28 | -------------------------------------------------------------------------------- /internal/ebpf/loader/errors.go: -------------------------------------------------------------------------------- 1 | package loader 2 | 3 | import "fmt" 4 | 5 | type ErrorCode int 6 | 7 | const ( 8 | ErrCodeLoadFailed ErrorCode = iota + 1 9 | ) 10 | 11 | type LoaderError struct { 12 | Code ErrorCode 13 | Message string 14 | Err error 15 | } 16 | 17 | func (e *LoaderError) Error() string { 18 | if e.Err != nil { 19 | return fmt.Sprintf("%s: %v", e.Message, e.Err) 20 | } 21 | return e.Message 22 | } 23 | 24 | func (e *LoaderError) Unwrap() error { 25 | return e.Err 26 | } 27 | 28 | func NewLoadError(path string, err error) *LoaderError { 29 | return &LoaderError{ 30 | Code: ErrCodeLoadFailed, 31 | Message: fmt.Sprintf("failed to load eBPF program from %s", path), 32 | Err: err, 33 | } 34 | } 35 | 36 | -------------------------------------------------------------------------------- /internal/ebpf/probes/errors.go: -------------------------------------------------------------------------------- 1 | package probes 2 | 3 | import "fmt" 4 | 5 | type ErrorCode int 6 | 7 | const ( 8 | ErrCodeProbeAttachFailed ErrorCode = iota + 1 9 | ) 10 | 11 | type ProbeError struct { 12 | Code ErrorCode 13 | Message string 14 | Err error 15 | } 16 | 17 | func (e *ProbeError) Error() string { 18 | if e.Err != nil { 19 | return fmt.Sprintf("%s: %v", e.Message, e.Err) 20 | } 21 | return e.Message 22 | } 23 | 24 | func (e *ProbeError) Unwrap() error { 25 | return e.Err 26 | } 27 | 28 | func NewProbeAttachError(probeName string, err error) *ProbeError { 29 | return &ProbeError{ 30 | Code: ErrCodeProbeAttachFailed, 31 | Message: fmt.Sprintf("failed to attach probe %s", probeName), 32 | Err: err, 33 | } 34 | } 35 | 36 | -------------------------------------------------------------------------------- /test/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | # WARNING: This docker-compose file contains hardcoded test credentials (admin/admin) 4 | # DO NOT USE IN PRODUCTION - These credentials are for testing purposes only 5 | # In production, use environment variables or secrets management for credentials 6 | 7 | services: 8 | prometheus: 9 | image: prom/prometheus:latest 10 | container_name: prometheus 11 | volumes: 12 | - ./prometheus.yml:/etc/prometheus/prometheus.yml 13 | ports: 14 | - "9090:9090" 15 | 16 | grafana: 17 | image: grafana/grafana:latest 18 | container_name: grafana 19 | environment: 20 | - GF_SECURITY_ADMIN_USER=admin 21 | - GF_SECURITY_ADMIN_PASSWORD=admin 22 | ports: 23 | - "3001:3000" 24 | -------------------------------------------------------------------------------- /internal/cri/jsonfind_test.go: 
-------------------------------------------------------------------------------- 1 | package cri 2 | 3 | import "testing" 4 | 5 | func TestFindJSONHelpers(t *testing.T) { 6 | obj := map[string]any{ 7 | "pid": float64(123), 8 | "runtimeSpec": map[string]any{ 9 | "linux": map[string]any{ 10 | "cgroupsPath": "/kubepods.slice/test.scope", 11 | }, 12 | }, 13 | } 14 | 15 | if pid, ok := findJSONInt(obj, []string{"pid"}); !ok || pid != 123 { 16 | t.Fatalf("expected pid=123, got pid=%d ok=%v", pid, ok) 17 | } 18 | 19 | if cg, ok := findJSONString(obj, []string{"runtimeSpec.linux.cgroupsPath"}); !ok || cg != "/kubepods.slice/test.scope" { 20 | t.Fatalf("expected cgroupsPath, got %q ok=%v", cg, ok) 21 | } 22 | 23 | if _, ok := findJSONString(obj, []string{"missing.path"}); ok { 24 | t.Fatalf("expected missing path to return ok=false") 25 | } 26 | } 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /internal/diagnose/analyzer/common.go: -------------------------------------------------------------------------------- 1 | package analyzer 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/podtrace/podtrace/internal/config" 7 | ) 8 | 9 | func Percentile(sorted []float64, p float64) float64 { 10 | if len(sorted) == 0 { 11 | return 0 12 | } 13 | index := int(float64(len(sorted)-1) * p / 100) 14 | return sorted[index] 15 | } 16 | 17 | func FormatBytes(bytes uint64) string { 18 | if bytes < config.KB { 19 | return fmt.Sprintf("%d B", bytes) 20 | } else if bytes < config.MB { 21 | return fmt.Sprintf("%.2f KB", float64(bytes)/float64(config.KB)) 22 | } else if bytes < config.GB { 23 | return fmt.Sprintf("%.2f MB", float64(bytes)/float64(config.MB)) 24 | } else { 25 | return fmt.Sprintf("%.2f GB", float64(bytes)/float64(config.GB)) 26 | } 27 | } 28 | 29 | type TargetCount struct { 30 | Target string 31 | Count int 32 | } 33 | -------------------------------------------------------------------------------- /internal/diagnose/analyzer/cpu.go: -------------------------------------------------------------------------------- 1 | package analyzer 2 | 3 | import ( 4 | "sort" 5 | 6 | "github.com/podtrace/podtrace/internal/config" 7 | "github.com/podtrace/podtrace/internal/events" 8 | ) 9 | 10 | func AnalyzeCPU(events []*events.Event) (avgBlock, maxBlock float64, p50, p95, p99 float64) { 11 | var totalBlock float64 12 | var blocks []float64 13 | maxBlock = 0 14 | 15 | for _, e := range events { 16 | blockMs := float64(e.LatencyNS) / float64(config.NSPerMS) 17 | blocks = append(blocks, blockMs) 18 | totalBlock += blockMs 19 | if blockMs > maxBlock { 20 | maxBlock = blockMs 21 | } 22 | } 23 | 24 | if len(events) > 0 { 25 | avgBlock = totalBlock / float64(len(events)) 26 | sort.Float64s(blocks) 27 | p50 = Percentile(blocks, 50) 28 | p95 = Percentile(blocks, 95) 29 | p99 = Percentile(blocks, 99) 30 | } 31 | return 32 | } 33 | -------------------------------------------------------------------------------- /internal/ebpf/loader/loader.go: -------------------------------------------------------------------------------- 1 | package loader 2 | 3 | import ( 4 | "bytes" 5 | 6 | "github.com/cilium/ebpf" 7 | 8 | podtrace "github.com/podtrace/podtrace" 9 | "github.com/podtrace/podtrace/internal/config" 10 | ) 11 | 12 | func LoadPodtrace() (*ebpf.CollectionSpec, error) { 13 | spec, err := ebpf.LoadCollectionSpec(config.BPFObjectPath) 14 | if err != nil { 15 | spec, err = ebpf.LoadCollectionSpec("../" + config.BPFObjectPath) 16 | if err != nil { 17 | if config.BPFObjectPath == 
"bpf/podtrace.bpf.o" && len(podtrace.EmbeddedPodtraceBPFObj) > 0 { 18 | if embeddedSpec, embeddedErr := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(podtrace.EmbeddedPodtraceBPFObj)); embeddedErr == nil { 19 | return embeddedSpec, nil 20 | } 21 | } 22 | return nil, NewLoadError(config.BPFObjectPath, err) 23 | } 24 | } 25 | 26 | return spec, nil 27 | } 28 | -------------------------------------------------------------------------------- /scripts/build-and-setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Build podtrace and automatically set capabilities 3 | 4 | set -e 5 | 6 | ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)" 7 | 8 | build_podtrace() { 9 | echo "Building podtrace..." 10 | cd "${ROOT_DIR}" 11 | 12 | make clean 13 | make build 14 | 15 | if [[ ! -f "./bin/podtrace" ]]; then 16 | echo "Error: Build failed - bin/podtrace not found" 17 | exit 1 18 | fi 19 | } 20 | 21 | set_capabilities() { 22 | echo "" 23 | echo "Setting capabilities..." 24 | if sudo ./scripts/setup-capabilities.sh; then 25 | echo "" 26 | echo "Build and setup complete!" 27 | echo "" 28 | echo "You can now run podtrace:" 29 | echo " ./bin/podtrace -n " 30 | else 31 | echo "" 32 | echo "Build succeeded but failed to set capabilities." 33 | echo "Run manually: sudo ./scripts/setup-capabilities.sh" 34 | exit 1 35 | fi 36 | } 37 | 38 | main() { 39 | build_podtrace 40 | set_capabilities 41 | } 42 | 43 | main "$@" 44 | -------------------------------------------------------------------------------- /internal/alerting/rate_limiter.go: -------------------------------------------------------------------------------- 1 | package alerting 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | ) 7 | 8 | type RateLimiter struct { 9 | limit int 10 | window time.Duration 11 | counts []time.Time 12 | mu sync.Mutex 13 | } 14 | 15 | func NewRateLimiter(limitPerMinute int) *RateLimiter { 16 | return &RateLimiter{ 17 | limit: limitPerMinute, 18 | window: time.Minute, 19 | counts: make([]time.Time, 0, limitPerMinute), 20 | } 21 | } 22 | 23 | func (rl *RateLimiter) Allow() bool { 24 | rl.mu.Lock() 25 | defer rl.mu.Unlock() 26 | now := time.Now() 27 | cutoff := now.Add(-rl.window) 28 | validCounts := make([]time.Time, 0, rl.limit) 29 | for _, t := range rl.counts { 30 | if t.After(cutoff) { 31 | validCounts = append(validCounts, t) 32 | } 33 | } 34 | if len(validCounts) >= rl.limit { 35 | return false 36 | } 37 | validCounts = append(validCounts, now) 38 | rl.counts = validCounts 39 | return true 40 | } 41 | 42 | func (rl *RateLimiter) Reset() { 43 | rl.mu.Lock() 44 | defer rl.mu.Unlock() 45 | rl.counts = make([]time.Time, 0, rl.limit) 46 | } 47 | 48 | 49 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # podtrace Test Environment 2 | 3 | This directory contains test pods and scripts to test the podtrace CLI. 4 | 5 | ## Quick Start 6 | 7 | ### 1. Setup Test Pods 8 | 9 | ```bash 10 | cd /path/to/podtrace/test 11 | ./setup-test-pods.sh 12 | ``` 13 | 14 | This will: 15 | - Create a `podtrace-test` namespace 16 | - Deploy 3 test pods: 17 | - `nginx-test` 18 | - `busybox-test` 19 | - `alpine-test` 20 | 21 | ### 2. Test podtrace 22 | 23 | ```bash 24 | sudo ./bin/podtrace -n podtrace-test nginx-test 25 | ``` 26 | 27 | ### 3. 
Cleanup 28 | 29 | ```bash 30 | ./cleanup-test-pods.sh 31 | ``` 32 | 33 | ## Automated Test Runner 34 | 35 | Run all tests automatically: 36 | 37 | ```bash 38 | ./test/run-tests.sh 39 | ``` 40 | 41 | ## Files 42 | 43 | - `test-pods.yaml` - Kubernetes manifests for test pods 44 | - `setup-test-pods.sh` - Script to create test environment 45 | - `cleanup-test-pods.sh` - Script to clean up test environment 46 | - `run-tests.sh` - Automated test runner 47 | - `quick-test.sh` - Quick test script 48 | - `test-debug.sh` - Debug test script 49 | - `test-cpu-usage.sh` - CPU usage test script 50 | -------------------------------------------------------------------------------- /bpf/events.h: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 2 | 3 | #ifndef PODTRACE_EVENTS_H 4 | #define PODTRACE_EVENTS_H 5 | 6 | #include "common.h" 7 | 8 | enum event_type { 9 | EVENT_DNS, 10 | EVENT_CONNECT, 11 | EVENT_TCP_SEND, 12 | EVENT_TCP_RECV, 13 | EVENT_WRITE, 14 | EVENT_READ, 15 | EVENT_FSYNC, 16 | EVENT_SCHED_SWITCH, 17 | EVENT_TCP_STATE, 18 | EVENT_PAGE_FAULT, 19 | EVENT_OOM_KILL, 20 | EVENT_UDP_SEND, 21 | EVENT_UDP_RECV, 22 | EVENT_HTTP_REQ, 23 | EVENT_HTTP_RESP, 24 | EVENT_LOCK_CONTENTION, 25 | EVENT_TCP_RETRANS, 26 | EVENT_NET_DEV_ERROR, 27 | EVENT_DB_QUERY, 28 | EVENT_EXEC, 29 | EVENT_FORK, 30 | EVENT_OPEN, 31 | EVENT_CLOSE, 32 | EVENT_TLS_HANDSHAKE, 33 | EVENT_TLS_ERROR, 34 | EVENT_RESOURCE_LIMIT, 35 | EVENT_POOL_ACQUIRE, 36 | EVENT_POOL_RELEASE, 37 | EVENT_POOL_EXHAUSTED, 38 | }; 39 | 40 | struct event { 41 | u64 timestamp; 42 | u32 pid; 43 | u32 type; 44 | u64 latency_ns; 45 | s32 error; 46 | u64 bytes; 47 | u32 tcp_state; 48 | u64 stack_key; 49 | u64 cgroup_id; 50 | char comm[COMM_LEN]; 51 | char target[MAX_STRING_LEN]; 52 | char details[MAX_STRING_LEN]; 53 | }; 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /internal/cri/jsonfind.go: -------------------------------------------------------------------------------- 1 | package cri 2 | 3 | import ( 4 | "strings" 5 | ) 6 | 7 | func findJSONString(obj any, keyPaths []string) (string, bool) { 8 | for _, kp := range keyPaths { 9 | if v, ok := findJSONValue(obj, strings.Split(kp, ".")); ok { 10 | if s, ok := v.(string); ok && s != "" { 11 | return s, true 12 | } 13 | } 14 | } 15 | return "", false 16 | } 17 | 18 | func findJSONInt(obj any, keys []string) (int64, bool) { 19 | for _, k := range keys { 20 | if v, ok := findJSONValue(obj, strings.Split(k, ".")); ok { 21 | switch t := v.(type) { 22 | case float64: 23 | return int64(t), true 24 | case int64: 25 | return t, true 26 | case int: 27 | return int64(t), true 28 | } 29 | } 30 | } 31 | return 0, false 32 | } 33 | 34 | func findJSONValue(obj any, path []string) (any, bool) { 35 | if len(path) == 0 { 36 | return obj, true 37 | } 38 | cur := obj 39 | for _, p := range path { 40 | m, ok := cur.(map[string]any) 41 | if !ok { 42 | return nil, false 43 | } 44 | next, ok := m[p] 45 | if !ok { 46 | return nil, false 47 | } 48 | cur = next 49 | } 50 | return cur, true 51 | } 52 | -------------------------------------------------------------------------------- /test/quick-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | NAMESPACE="podtrace-test" 5 | POD_NAME="${1:-nginx-cpu-test}" 6 | DURATION="${2:-20s}" 7 | 8 | log() { 9 | echo "$@" 10 | } 11 | 12 | check_pod_exists() { 13 | local pod="$1" 14 | local ns="$2" 15 | 
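# Fail fast when the target pod is missing; the listing below helps the caller pick a valid pod name.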
16 | if ! kubectl get pod "${pod}" -n "${ns}" &>/dev/null; then 17 | log "Error: Pod ${pod} not found in namespace ${ns}" 18 | log "Available pods:" 19 | kubectl get pods -n "${ns}" || log "Namespace ${ns} not found" 20 | exit 1 21 | fi 22 | } 23 | 24 | check_podtrace_binary() { 25 | if [[ ! -f "./bin/podtrace" ]]; then 26 | log "Error: ./bin/podtrace not found. Run 'make build' first." 27 | exit 1 28 | fi 29 | } 30 | 31 | run_diagnose() { 32 | local pod="$1" 33 | local ns="$2" 34 | local duration="$3" 35 | 36 | log "Running diagnose mode..." 37 | ./bin/podtrace -n "${ns}" "${pod}" --diagnose "${duration}" 38 | } 39 | 40 | main() { 41 | log "=== Testing podtrace on ${POD_NAME} for ${DURATION} ===" 42 | log "" 43 | 44 | check_pod_exists "${POD_NAME}" "${NAMESPACE}" 45 | check_podtrace_binary 46 | run_diagnose "${POD_NAME}" "${NAMESPACE}" "${DURATION}" 47 | } 48 | 49 | main 50 | -------------------------------------------------------------------------------- /cmd/podtrace/interrupt_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | func TestInterruptChan(t *testing.T) { 10 | ch := interruptChan() 11 | if ch == nil { 12 | t.Fatal("interruptChan returned nil channel") 13 | } 14 | 15 | go func() { 16 | time.Sleep(50 * time.Millisecond) 17 | proc, _ := os.FindProcess(os.Getpid()) 18 | _ = proc.Signal(os.Interrupt) 19 | }() 20 | 21 | select { 22 | case sig := <-ch: 23 | if sig != os.Interrupt { 24 | t.Errorf("Expected os.Interrupt, got %v", sig) 25 | } 26 | case <-time.After(1 * time.Second): 27 | t.Error("interruptChan did not receive signal in time") 28 | } 29 | } 30 | 31 | func TestInterruptChan_PanicRecovery(t *testing.T) { 32 | ch := interruptChan() 33 | if ch == nil { 34 | t.Fatal("interruptChan returned nil channel") 35 | } 36 | 37 | go func() { 38 | time.Sleep(10 * time.Millisecond) 39 | proc, _ := os.FindProcess(os.Getpid()) 40 | _ = proc.Signal(os.Interrupt) 41 | }() 42 | 43 | select { 44 | case <-ch: 45 | case <-time.After(1 * time.Second): 46 | t.Error("interruptChan did not receive signal in time") 47 | } 48 | } 49 | 50 | -------------------------------------------------------------------------------- /test/cleanup-test-pods.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Cleanup script for podtrace test pods 3 | 4 | set -e 5 | 6 | NAMESPACE="podtrace-test" 7 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 8 | 9 | GREEN='\033[0;32m' 10 | # YELLOW='\033[1;33m' 11 | RED='\033[0;31m' 12 | NC='\033[0m' 13 | 14 | print_header() { 15 | echo "=== Cleaning up podtrace test environment ===" 16 | echo "" 17 | } 18 | 19 | check_kubectl() { 20 | if ! command -v kubectl &>/dev/null; then 21 | echo -e "${RED}Error: kubectl is not installed${NC}" 22 | exit 1 23 | fi 24 | } 25 | 26 | delete_resources() { 27 | echo "Deleting test pods and namespace..." 28 | kubectl delete -f "${SCRIPT_DIR}/test-pods.yaml" --ignore-not-found=true 29 | } 30 | 31 | wait_for_namespace_deletion() { 32 | echo "" 33 | echo "Waiting for namespace to be deleted..." 
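# kubectl wait exits non-zero when the namespace is already gone or the timeout elapses; the trailing '|| true' keeps cleanup best-effort.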
34 | kubectl wait --for=delete namespace/"${NAMESPACE}" --timeout=60s 2>/dev/null || true 35 | } 36 | 37 | print_success() { 38 | echo "" 39 | echo -e "${GREEN}✓ Cleanup completed${NC}" 40 | } 41 | 42 | main() { 43 | print_header 44 | check_kubectl 45 | delete_resources 46 | wait_for_namespace_deletion 47 | print_success 48 | } 49 | 50 | main "$@" 51 | -------------------------------------------------------------------------------- /internal/alerting/rate_limiter_test.go: -------------------------------------------------------------------------------- 1 | package alerting 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestRateLimiter_Allow(t *testing.T) { 9 | rl := NewRateLimiter(5) 10 | for i := 0; i < 5; i++ { 11 | if !rl.Allow() { 12 | t.Errorf("Request %d should be allowed", i+1) 13 | } 14 | } 15 | if rl.Allow() { 16 | t.Error("Request 6 should not be allowed (rate limit exceeded)") 17 | } 18 | } 19 | 20 | func TestRateLimiter_Allow_TimeWindow(t *testing.T) { 21 | rl := NewRateLimiter(2) 22 | if !rl.Allow() { 23 | t.Error("First request should be allowed") 24 | } 25 | if !rl.Allow() { 26 | t.Error("Second request should be allowed") 27 | } 28 | if rl.Allow() { 29 | t.Error("Third request should not be allowed") 30 | } 31 | time.Sleep(61 * time.Second) 32 | if !rl.Allow() { 33 | t.Error("Request after window should be allowed") 34 | } 35 | } 36 | 37 | func TestRateLimiter_Reset(t *testing.T) { 38 | rl := NewRateLimiter(2) 39 | rl.Allow() 40 | rl.Allow() 41 | if rl.Allow() { 42 | t.Error("Request should not be allowed after limit") 43 | } 44 | rl.Reset() 45 | if !rl.Allow() { 46 | t.Error("Request should be allowed after reset") 47 | } 48 | } 49 | 50 | 51 | -------------------------------------------------------------------------------- /internal/alerting/deduplicator.go: -------------------------------------------------------------------------------- 1 | package alerting 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | ) 7 | 8 | type AlertDeduplicator struct { 9 | seenAlerts map[string]time.Time 10 | window time.Duration 11 | mu sync.RWMutex 12 | } 13 | 14 | func NewAlertDeduplicator(window time.Duration) *AlertDeduplicator { 15 | return &AlertDeduplicator{ 16 | seenAlerts: make(map[string]time.Time), 17 | window: window, 18 | } 19 | } 20 | 21 | func (d *AlertDeduplicator) ShouldSend(alert *Alert) bool { 22 | if alert == nil { 23 | return false 24 | } 25 | key := alert.Key() 26 | d.mu.Lock() 27 | defer d.mu.Unlock() 28 | if lastSent, exists := d.seenAlerts[key]; exists { 29 | if time.Since(lastSent) < d.window { 30 | return false 31 | } 32 | } 33 | d.seenAlerts[key] = time.Now() 34 | return true 35 | } 36 | 37 | func (d *AlertDeduplicator) Cleanup(olderThan time.Duration) { 38 | d.mu.Lock() 39 | defer d.mu.Unlock() 40 | now := time.Now() 41 | for key, timestamp := range d.seenAlerts { 42 | if now.Sub(timestamp) > olderThan { 43 | delete(d.seenAlerts, key) 44 | } 45 | } 46 | } 47 | 48 | func (d *AlertDeduplicator) Reset() { 49 | d.mu.Lock() 50 | defer d.mu.Unlock() 51 | d.seenAlerts = make(map[string]time.Time) 52 | } 53 | 54 | 55 | -------------------------------------------------------------------------------- /internal/diagnose/analyzer/filesystem.go: -------------------------------------------------------------------------------- 1 | package analyzer 2 | 3 | import ( 4 | "sort" 5 | 6 | "github.com/podtrace/podtrace/internal/config" 7 | "github.com/podtrace/podtrace/internal/events" 8 | ) 9 | 10 | func AnalyzeFS(events []*events.Event, fsSlowThreshold float64) (avgLatency, 
maxLatency float64, slowOps int, p50, p95, p99 float64, totalBytes, avgBytes uint64) { 11 | var totalLatency float64 12 | var latencies []float64 13 | maxLatency = 0 14 | slowOps = 0 15 | totalBytes = 0 16 | 17 | for _, e := range events { 18 | latencyMs := float64(e.LatencyNS) / float64(config.NSPerMS) 19 | latencies = append(latencies, latencyMs) 20 | totalLatency += latencyMs 21 | if latencyMs > maxLatency { 22 | maxLatency = latencyMs 23 | } 24 | if latencyMs > fsSlowThreshold { 25 | slowOps++ 26 | } 27 | if e.Bytes > 0 && e.Bytes < uint64(config.MaxBytesForBandwidth) { 28 | totalBytes += e.Bytes 29 | } 30 | } 31 | 32 | if len(events) > 0 { 33 | avgLatency = totalLatency / float64(len(events)) 34 | sort.Float64s(latencies) 35 | p50 = Percentile(latencies, 50) 36 | p95 = Percentile(latencies, 95) 37 | p99 = Percentile(latencies, 99) 38 | if totalBytes > 0 { 39 | avgBytes = totalBytes / uint64(len(events)) 40 | } 41 | } 42 | return 43 | } 44 | -------------------------------------------------------------------------------- /internal/ebpf/loader/loader_test.go: -------------------------------------------------------------------------------- 1 | package loader 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | 7 | "github.com/podtrace/podtrace/internal/config" 8 | ) 9 | 10 | func TestLoadPodtrace_ExplicitPathIsStrict(t *testing.T) { 11 | originalPath := config.BPFObjectPath 12 | defer func() { config.BPFObjectPath = originalPath }() 13 | 14 | config.BPFObjectPath = "/nonexistent/path/to/bpf.o" 15 | spec, err := LoadPodtrace() 16 | if err == nil { 17 | t.Fatalf("expected error for explicit non-existent path, got nil") 18 | } 19 | if spec != nil { 20 | t.Fatalf("expected nil spec on error, got non-nil") 21 | } 22 | } 23 | 24 | func TestLoadPodtrace_DefaultPathFallsBackToEmbedded(t *testing.T) { 25 | originalPath := config.BPFObjectPath 26 | defer func() { config.BPFObjectPath = originalPath }() 27 | 28 | oldWD, err := os.Getwd() 29 | if err != nil { 30 | t.Fatalf("Getwd: %v", err) 31 | } 32 | defer func() { _ = os.Chdir(oldWD) }() 33 | 34 | emptyWD := t.TempDir() 35 | if err := os.Chdir(emptyWD); err != nil { 36 | t.Fatalf("Chdir: %v", err) 37 | } 38 | 39 | config.BPFObjectPath = "bpf/podtrace.bpf.o" 40 | spec, err := LoadPodtrace() 41 | if err != nil { 42 | t.Skipf("BPF object not available in test environment: %v", err) 43 | } 44 | if spec == nil { 45 | t.Fatalf("expected non-nil spec from embedded fallback") 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /internal/diagnose/analyzer/dns.go: -------------------------------------------------------------------------------- 1 | package analyzer 2 | 3 | import ( 4 | "sort" 5 | 6 | "github.com/podtrace/podtrace/internal/config" 7 | "github.com/podtrace/podtrace/internal/events" 8 | ) 9 | 10 | func AnalyzeDNS(events []*events.Event) (avgLatency, maxLatency float64, errors int, p50, p95, p99 float64, topTargets []TargetCount) { 11 | var totalLatency float64 12 | var latencies []float64 13 | maxLatency = 0 14 | errors = 0 15 | targetMap := make(map[string]int) 16 | 17 | for _, e := range events { 18 | latencyMs := float64(e.LatencyNS) / float64(config.NSPerMS) 19 | latencies = append(latencies, latencyMs) 20 | totalLatency += latencyMs 21 | if latencyMs > maxLatency { 22 | maxLatency = latencyMs 23 | } 24 | if e.Error != 0 { 25 | errors++ 26 | } 27 | if e.Target != "" && e.Target != "?" 
{ 28 | targetMap[e.Target]++ 29 | } 30 | } 31 | 32 | if len(events) > 0 { 33 | avgLatency = totalLatency / float64(len(events)) 34 | sort.Float64s(latencies) 35 | p50 = Percentile(latencies, 50) 36 | p95 = Percentile(latencies, 95) 37 | p99 = Percentile(latencies, 99) 38 | } 39 | 40 | for target, count := range targetMap { 41 | topTargets = append(topTargets, TargetCount{target, count}) 42 | } 43 | sort.Slice(topTargets, func(i, j int) bool { 44 | return topTargets[i].Count > topTargets[j].Count 45 | }) 46 | 47 | return 48 | } 49 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/epic.yaml: -------------------------------------------------------------------------------- 1 | name: Epic 2 | description: For tracking a large feature, including how to demo it. 3 | title: "epic: " 4 | labels: 5 | - epic 6 | body: 7 | - type: textarea 8 | id: objective 9 | attributes: 10 | label: Demo Objective 11 | description: Please describe the objective of your demo. 12 | placeholder: | 13 | - [ ] User should be able to ... 14 | - [ ] ... 15 | validations: 16 | required: true 17 | 18 | - type: textarea 19 | id: steps 20 | attributes: 21 | label: Demo Steps 22 | description: Please describe the steps for the demo. 23 | placeholder: | 24 | 1. Admin does X 25 | 1. User does Y 26 | 1. Everyone is happy :) 27 | 28 | - type: checkboxes 29 | id: action-items 30 | attributes: 31 | label: Action Items 32 | description: Please check the following 33 | options: 34 | - label: Scope of the current demo is necessary to fit in the prototype boundaries 35 | required: true 36 | - label: Contribute to the final demo script and recording 37 | 38 | - type: textarea 39 | id: stories 40 | attributes: 41 | label: Stories 42 | placeholder: | 43 | - [ ] (Example) **stretch-goal:** Add Widgets to `podtrace` CLI 44 | - Out-of-scope (prototype x): Send Widgets to space 45 | validations: 46 | required: false 47 | -------------------------------------------------------------------------------- /internal/ebpf/tracer/errors.go: -------------------------------------------------------------------------------- 1 | package tracer 2 | 3 | import "fmt" 4 | 5 | type ErrorCode int 6 | 7 | const ( 8 | ErrCodeCollectionFailed ErrorCode = iota + 1 9 | ErrCodeRingBufferFailed 10 | ErrCodeMapLookupFailed 11 | ErrCodeInvalidEvent 12 | ) 13 | 14 | type TracerError struct { 15 | Code ErrorCode 16 | Message string 17 | Err error 18 | } 19 | 20 | func (e *TracerError) Error() string { 21 | if e.Err != nil { 22 | return fmt.Sprintf("%s: %v", e.Message, e.Err) 23 | } 24 | return e.Message 25 | } 26 | 27 | func (e *TracerError) Unwrap() error { 28 | return e.Err 29 | } 30 | 31 | func NewCollectionError(err error) *TracerError { 32 | return &TracerError{ 33 | Code: ErrCodeCollectionFailed, 34 | Message: "failed to create eBPF collection", 35 | Err: err, 36 | } 37 | } 38 | 39 | func NewRingBufferError(err error) *TracerError { 40 | return &TracerError{ 41 | Code: ErrCodeRingBufferFailed, 42 | Message: "failed to create ring buffer reader", 43 | Err: err, 44 | } 45 | } 46 | 47 | func NewMapLookupError(mapName string, err error) *TracerError { 48 | return &TracerError{ 49 | Code: ErrCodeMapLookupFailed, 50 | Message: fmt.Sprintf("failed to lookup map %s", mapName), 51 | Err: err, 52 | } 53 | } 54 | 55 | func NewInvalidEventError(reason string) *TracerError { 56 | return &TracerError{ 57 | Code: ErrCodeInvalidEvent, 58 | Message: fmt.Sprintf("invalid event: %s", reason), 59 | } 60 | } 61 | 62 | 
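// Usage sketch (not part of the original file): callers can unwrap a
// *TracerError with errors.As and branch on its code, e.g.:
//
//	var terr *TracerError
//	if errors.As(err, &terr) && terr.Code == ErrCodeRingBufferFailed {
//		// retry ring buffer setup or surface a clearer message
//	}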
-------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # Podtrace Documentation 2 | 3 | Welcome to the `Podtrace` documentation. This directory contains comprehensive guides for using, understanding, and developing with `Podtrace`. 4 | 5 | ## Documentation Index 6 | 7 | - **[Architecture](architecture.md)** - System architecture, components, and data flow 8 | - **[Installation](installation.md)** - Installation guide, prerequisites, and troubleshooting 9 | - **[Usage Guide](usage.md)** - Usage examples, command-line options, and tips 10 | - **[eBPF Internals](ebpf-internals.md)** - Deep dive into eBPF programs and tracing mechanisms 11 | - **[Metrics](metrics.md)** - Prometheus metrics, Grafana integration, and query examples 12 | - **[Development](development.md)** - Development guide, code structure, testing, and contributing 13 | - **[Distributed Tracing Guide](distributed-tracing.md)** - Complete distributed tracing user guide 14 | - **[Tracing Exporters Setup](tracing-exporters.md)** - Detailed exporter configuration (OTLP, Jaeger, Splunk) 15 | 16 | ## Quick Start 17 | 18 | 1. **New to Podtrace?** Start with [Installation](installation.md) and [Usage Guide](usage.md) 19 | 2. **Want to understand how it works?** Read [Architecture](architecture.md) 20 | 3. **Need to integrate metrics?** Check [Metrics](metrics.md) 21 | 4. **Setting up distributed tracing?** See [Distributed Tracing Guide](distributed-tracing.md) and [Tracing Exporters Setup](tracing-exporters.md) 22 | 5. **Contributing?** See [Development](development.md) -------------------------------------------------------------------------------- /internal/ebpf/cache/pathcache.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | 7 | "github.com/podtrace/podtrace/internal/config" 8 | ) 9 | 10 | type pathCacheEntry struct { 11 | path string 12 | timestamp time.Time 13 | } 14 | 15 | type PathCache struct { 16 | mu sync.RWMutex 17 | cache map[string]*pathCacheEntry 18 | ttl time.Duration 19 | } 20 | 21 | func NewPathCache() *PathCache { 22 | ttl := time.Duration(config.CacheTTLSeconds) * time.Second 23 | return &PathCache{ 24 | cache: make(map[string]*pathCacheEntry), 25 | ttl: ttl, 26 | } 27 | } 28 | 29 | func (pc *PathCache) Get(key string) (string, bool) { 30 | pc.mu.RLock() 31 | defer pc.mu.RUnlock() 32 | entry, ok := pc.cache[key] 33 | if !ok { 34 | return "", false 35 | } 36 | if time.Since(entry.timestamp) > pc.ttl { 37 | return "", false 38 | } 39 | return entry.path, true 40 | } 41 | 42 | func (pc *PathCache) Set(key, path string) { 43 | if path == "" { 44 | return 45 | } 46 | pc.mu.Lock() 47 | defer pc.mu.Unlock() 48 | pc.cache[key] = &pathCacheEntry{ 49 | path: path, 50 | timestamp: time.Now(), 51 | } 52 | } 53 | 54 | func (pc *PathCache) Clear() { 55 | pc.mu.Lock() 56 | defer pc.mu.Unlock() 57 | pc.cache = make(map[string]*pathCacheEntry) 58 | } 59 | 60 | func (pc *PathCache) CleanupExpired() { 61 | pc.mu.Lock() 62 | defer pc.mu.Unlock() 63 | now := time.Now() 64 | for key, entry := range pc.cache { 65 | if now.Sub(entry.timestamp) > pc.ttl { 66 | delete(pc.cache, key) 67 | } 68 | } 69 | } 70 | 71 | -------------------------------------------------------------------------------- /internal/ebpf/tracer_wrapper_test.go: -------------------------------------------------------------------------------- 1 | 
package ebpf 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | "time" 7 | 8 | "github.com/podtrace/podtrace/internal/events" 9 | ) 10 | 11 | func TestNewTracer(t *testing.T) { 12 | tracer, err := NewTracer() 13 | if err == nil && tracer == nil { 14 | t.Log("NewTracer returned nil tracer without error (expected for non-existent BPF object)") 15 | } 16 | if err != nil { 17 | t.Logf("NewTracer returned error as expected: %v", err) 18 | } 19 | } 20 | 21 | func TestWaitForInterrupt(t *testing.T) { 22 | done := make(chan bool, 1) 23 | go func() { 24 | defer func() { 25 | if r := recover(); r != nil { 26 | done <- true 27 | } 28 | }() 29 | WaitForInterrupt() 30 | done <- true 31 | }() 32 | 33 | select { 34 | case <-done: 35 | t.Log("WaitForInterrupt completed") 36 | case <-time.After(100 * time.Millisecond): 37 | t.Log("WaitForInterrupt is waiting for signal (expected behavior)") 38 | } 39 | } 40 | 41 | func TestTracerInterface(t *testing.T) { 42 | var _ TracerInterface = (*mockTracerForInterface)(nil) 43 | } 44 | 45 | type mockTracerForInterface struct{} 46 | 47 | func (m *mockTracerForInterface) AttachToCgroup(cgroupPath string) error { 48 | return nil 49 | } 50 | 51 | func (m *mockTracerForInterface) SetContainerID(containerID string) error { 52 | return nil 53 | } 54 | 55 | func (m *mockTracerForInterface) Start(ctx context.Context, eventChan chan<- *events.Event) error { 56 | return nil 57 | } 58 | 59 | func (m *mockTracerForInterface) Stop() error { 60 | return nil 61 | } 62 | 63 | -------------------------------------------------------------------------------- /internal/diagnose/analyzer/tls.go: -------------------------------------------------------------------------------- 1 | package analyzer 2 | 3 | import ( 4 | "sort" 5 | 6 | "github.com/podtrace/podtrace/internal/config" 7 | "github.com/podtrace/podtrace/internal/events" 8 | ) 9 | 10 | func AnalyzeTLS(events []*events.Event) ( 11 | avgLatency, maxLatency float64, 12 | errors int, 13 | p50, p95, p99 float64, 14 | errorBreakdown map[int32]int, 15 | topTargets []TargetCount, 16 | ) { 17 | var totalLatency float64 18 | var latencies []float64 19 | maxLatency = 0 20 | errors = 0 21 | errorBreakdown = make(map[int32]int) 22 | targetMap := make(map[string]int) 23 | 24 | for _, e := range events { 25 | latencyMs := float64(e.LatencyNS) / float64(config.NSPerMS) 26 | latencies = append(latencies, latencyMs) 27 | totalLatency += latencyMs 28 | 29 | if latencyMs > maxLatency { 30 | maxLatency = latencyMs 31 | } 32 | 33 | if e.Error != 0 { 34 | errors++ 35 | errorBreakdown[e.Error]++ 36 | } 37 | 38 | if e.Target != "" && e.Target != "?" 
&& e.Target != "unknown" && e.Target != "file" { 39 | targetMap[e.Target]++ 40 | } 41 | } 42 | 43 | if len(events) > 0 { 44 | avgLatency = totalLatency / float64(len(events)) 45 | sort.Float64s(latencies) 46 | p50 = Percentile(latencies, 50) 47 | p95 = Percentile(latencies, 95) 48 | p99 = Percentile(latencies, 99) 49 | } 50 | 51 | for target, count := range targetMap { 52 | topTargets = append(topTargets, TargetCount{target, count}) 53 | } 54 | sort.Slice(topTargets, func(i, j int) bool { 55 | return topTargets[i].Count > topTargets[j].Count 56 | }) 57 | 58 | return 59 | } 60 | 61 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yaml: -------------------------------------------------------------------------------- 1 | name: Feature Request 2 | description: Suggest an idea for this project 3 | title: "feature: " 4 | labels: 5 | - kind/feature 6 | body: 7 | - type: textarea 8 | id: problem 9 | attributes: 10 | label: Feature Description 11 | description: Is your feature request related to a problem? A clear and concise description of what the problem is. 12 | placeholder: I'm always frustrated when [...] 13 | validations: 14 | required: true 15 | 16 | - type: textarea 17 | id: solution 18 | attributes: 19 | label: Proposed Solution 20 | description: A clear and concise description of what you want to happen. 21 | placeholder: We can do [...] 22 | validations: 23 | required: true 24 | 25 | - type: textarea 26 | id: alternatives 27 | attributes: 28 | label: Alternative Solutions 29 | description: A clear and concise description of any alternative solutions or features that you've considered. 30 | placeholder: I think another approach would be [...] 31 | validations: 32 | required: false 33 | 34 | - type: checkboxes 35 | id: contribute 36 | attributes: 37 | label: Want to contribute? 38 | options: 39 | - label: I would like to work on this issue. 40 | required: false 41 | 42 | - type: textarea 43 | id: additional 44 | attributes: 45 | label: Additional Context 46 | description: Add any other context or screenshots about the feature request here. 47 | validations: 48 | required: false 49 | -------------------------------------------------------------------------------- /test/test-debug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | NAMESPACE="${1:-podtrace-test}" 5 | POD_NAME="${2:-nginx-cpu-test}" 6 | DURATION="${3:-20s}" 7 | 8 | log() { 9 | echo "$@" 10 | } 11 | 12 | check_pod_exists() { 13 | local ns="$1" 14 | local pod="$2" 15 | 16 | if ! kubectl get pod "${pod}" -n "${ns}" &>/dev/null; then 17 | log "Error: Pod ${pod} not found" 18 | exit 1 19 | fi 20 | } 21 | 22 | show_pod_info() { 23 | local ns="$1" 24 | local pod="$2" 25 | 26 | log "Pod Info:" 27 | kubectl get pod "${pod}" -n "${ns}" -o wide 28 | log "" 29 | } 30 | 31 | show_recent_logs() { 32 | local ns="$1" 33 | local pod="$2" 34 | 35 | log "Recent Pod Logs:" 36 | kubectl logs "${pod}" -n "${ns}" --tail=10 || log "No logs available" 37 | log "" 38 | } 39 | 40 | show_pod_activity() { 41 | local ns="$1" 42 | local pod="$2" 43 | 44 | log "Checking pod activity..." 45 | kubectl exec "${pod}" -n "${ns}" -- ps aux 2>/dev/null || log "Cannot exec into pod" 46 | log "" 47 | } 48 | 49 | run_podtrace() { 50 | local ns="$1" 51 | local pod="$2" 52 | local duration="$3" 53 | 54 | log "Running podtrace (check stderr for eBPF attachment info)..." 
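# '2>&1' on the podtrace invocation below folds stderr (where the eBPF attachment info is printed) into stdout so it appears inline.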
55 | log "---" 56 | ./bin/podtrace -n "${ns}" "${pod}" --diagnose "${duration}" 2>&1 57 | log "---" 58 | } 59 | 60 | main() { 61 | log "=== Debug Test: ${POD_NAME} for ${DURATION} ===" 62 | log "" 63 | 64 | check_pod_exists "${NAMESPACE}" "${POD_NAME}" 65 | show_pod_info "${NAMESPACE}" "${POD_NAME}" 66 | show_recent_logs "${NAMESPACE}" "${POD_NAME}" 67 | show_pod_activity "${NAMESPACE}" "${POD_NAME}" 68 | run_podtrace "${NAMESPACE}" "${POD_NAME}" "${DURATION}" 69 | } 70 | 71 | main 72 | -------------------------------------------------------------------------------- /internal/alerting/sender.go: -------------------------------------------------------------------------------- 1 | package alerting 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "time" 8 | ) 9 | 10 | type Sender interface { 11 | Send(ctx context.Context, alert *Alert) error 12 | Name() string 13 | } 14 | 15 | type RetrySender struct { 16 | sender Sender 17 | maxRetries int 18 | backoffBase time.Duration 19 | } 20 | 21 | func NewRetrySender(sender Sender, maxRetries int, backoffBase time.Duration) *RetrySender { 22 | return &RetrySender{ 23 | sender: sender, 24 | maxRetries: maxRetries, 25 | backoffBase: backoffBase, 26 | } 27 | } 28 | 29 | func (rs *RetrySender) Send(ctx context.Context, alert *Alert) error { 30 | if alert == nil { 31 | return fmt.Errorf("alert is nil") 32 | } 33 | if err := alert.Validate(); err != nil { 34 | return fmt.Errorf("invalid alert: %w", err) 35 | } 36 | alert.Sanitize() 37 | var lastErr error 38 | for attempt := 0; attempt <= rs.maxRetries; attempt++ { 39 | if attempt > 0 { 40 | backoff := rs.backoffBase * time.Duration(1< 30*time.Second { 42 | backoff = 30 * time.Second 43 | } 44 | select { 45 | case <-ctx.Done(): 46 | return ctx.Err() 47 | case <-time.After(backoff): 48 | } 49 | } 50 | err := rs.sender.Send(ctx, alert) 51 | if err == nil { 52 | return nil 53 | } 54 | lastErr = err 55 | if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { 56 | return err 57 | } 58 | } 59 | return fmt.Errorf("failed after %d attempts: %w", rs.maxRetries+1, lastErr) 60 | } 61 | 62 | func (rs *RetrySender) Name() string { 63 | return rs.sender.Name() 64 | } 65 | 66 | 67 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/adopters.yaml: -------------------------------------------------------------------------------- 1 | name: Register as adopter 2 | description: If your organization is using podtrace, we would be delighted to add you to our list of adopters. Please report how you use podtrace and we will take care of adding it to our adopters list. 3 | title: "adopter: COMPANY_NAME" 4 | labels: 5 | - kind/documentation 6 | body: 7 | - type: input 8 | id: contact 9 | attributes: 10 | label: Contact Details 11 | description: How can we get in touch with you if we need more info? 12 | placeholder: eg. email@example.com 13 | validations: 14 | required: false 15 | 16 | - type: input 17 | id: org 18 | attributes: 19 | label: Organization 20 | description: Your organization's name. 21 | validations: 22 | required: true 23 | 24 | - type: textarea 25 | id: description 26 | attributes: 27 | label: Description 28 | description: What are you using podtrace for at your organization? Are you using it for a specific product or project? 29 | validations: 30 | required: true 31 | 32 | - type: dropdown 33 | id: maturity 34 | attributes: 35 | label: Maturity Stage 36 | description: What stage are you at in your adoption of podtrace? 
37 | multiple: false 38 | options: 39 | - Production 40 | - Pre-production 41 | - Development 42 | - Conceptual 43 | - I don't know 44 | validations: 45 | required: false 46 | 47 | - type: input 48 | id: url 49 | attributes: 50 | label: Info Link 51 | description: If you have public documentation for the product or project, feel free to share it here. 52 | validations: 53 | required: false 54 | -------------------------------------------------------------------------------- /internal/alerting/deduplicator_test.go: -------------------------------------------------------------------------------- 1 | package alerting 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestAlertDeduplicator_ShouldSend(t *testing.T) { 9 | dedup := NewAlertDeduplicator(5 * time.Minute) 10 | alert := &Alert{ 11 | Severity: SeverityWarning, 12 | Title: "Test Alert", 13 | Message: "Test message", 14 | Timestamp: time.Now(), 15 | Source: "test", 16 | } 17 | if !dedup.ShouldSend(alert) { 18 | t.Error("First alert should be sent") 19 | } 20 | if dedup.ShouldSend(alert) { 21 | t.Error("Duplicate alert should not be sent") 22 | } 23 | } 24 | 25 | func TestAlertDeduplicator_ShouldSend_Nil(t *testing.T) { 26 | dedup := NewAlertDeduplicator(5 * time.Minute) 27 | if dedup.ShouldSend(nil) { 28 | t.Error("Nil alert should not be sent") 29 | } 30 | } 31 | 32 | func TestAlertDeduplicator_Cleanup(t *testing.T) { 33 | dedup := NewAlertDeduplicator(1 * time.Second) 34 | alert := &Alert{ 35 | Severity: SeverityWarning, 36 | Title: "Test Alert", 37 | Message: "Test message", 38 | Timestamp: time.Now(), 39 | Source: "test", 40 | } 41 | dedup.ShouldSend(alert) 42 | time.Sleep(2 * time.Second) 43 | dedup.Cleanup(1 * time.Second) 44 | if !dedup.ShouldSend(alert) { 45 | t.Error("Alert should be sendable after cleanup") 46 | } 47 | } 48 | 49 | func TestAlertDeduplicator_Reset(t *testing.T) { 50 | dedup := NewAlertDeduplicator(5 * time.Minute) 51 | alert := &Alert{ 52 | Severity: SeverityWarning, 53 | Title: "Test Alert", 54 | Message: "Test message", 55 | Timestamp: time.Now(), 56 | Source: "test", 57 | } 58 | dedup.ShouldSend(alert) 59 | dedup.Reset() 60 | if !dedup.ShouldSend(alert) { 61 | t.Error("Alert should be sendable after reset") 62 | } 63 | } 64 | 65 | 66 | -------------------------------------------------------------------------------- /bpf/common.h: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 2 | 3 | #ifndef PODTRACE_COMMON_H 4 | #define PODTRACE_COMMON_H 5 | 6 | #include "vmlinux.h" 7 | #include <bpf/bpf_helpers.h> 8 | #include <bpf/bpf_core_read.h> 9 | #include <bpf/bpf_tracing.h> 10 | 11 | #ifndef PODTRACE_VMLINUX_FROM_BTF 12 | struct pt_regs { 13 | long unsigned int r15; 14 | long unsigned int r14; 15 | long unsigned int r13; 16 | long unsigned int r12; 17 | long unsigned int bp; 18 | long unsigned int bx; 19 | long unsigned int r11; 20 | long unsigned int r10; 21 | long unsigned int r9; 22 | long unsigned int r8; 23 | long unsigned int ax; 24 | long unsigned int cx; 25 | long unsigned int dx; 26 | long unsigned int si; 27 | long unsigned int di; 28 | long unsigned int orig_ax; 29 | long unsigned int ip; 30 | long unsigned int cs; 31 | long unsigned int flags; 32 | long unsigned int sp; 33 | long unsigned int ss; 34 | }; 35 | 36 | struct sockaddr_in { 37 | u16 sin_family; 38 | u16 sin_port; 39 | struct { 40 | u32 s_addr; 41 | } sin_addr; 42 | u8 sin_zero[8]; 43 | }; 44 | #endif 45 | 46 | #define MAX_STRING_LEN 128 47 | #define MAX_STACK_DEPTH 64 48 | 49 | #define NS_PER_MS 1000000ULL 50 | #define
PAGE_SIZE 4096 51 | #define MAX_BYTES_THRESHOLD (10ULL * 1024ULL * 1024ULL) 52 | #define MIN_LATENCY_NS (1ULL * NS_PER_MS) 53 | 54 | #define AF_INET 2 55 | #define AF_INET6 10 56 | #define EAGAIN 11 57 | #define HEX_ADDR_LEN 16 58 | #define COMM_LEN 16 59 | 60 | #ifndef BPF_MAP_TYPE_RINGBUF 61 | #define BPF_MAP_TYPE_RINGBUF 27 62 | #endif 63 | #ifndef BPF_MAP_TYPE_HASH 64 | #define BPF_MAP_TYPE_HASH 1 65 | #endif 66 | #ifndef BPF_MAP_TYPE_ARRAY 67 | #define BPF_MAP_TYPE_ARRAY 2 68 | #endif 69 | #ifndef BPF_ANY 70 | #define BPF_ANY 0 71 | #endif 72 | #ifndef BPF_F_USER_STACK 73 | #define BPF_F_USER_STACK 8 74 | #endif 75 | 76 | #endif 77 | -------------------------------------------------------------------------------- /internal/cri/resolver_test.go: -------------------------------------------------------------------------------- 1 | package cri 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestDefaultCandidateEndpoints_PodmanDisabled(t *testing.T) { 8 | t.Setenv("PODTRACE_CRI_ALLOW_PODMAN", "") 9 | eps := DefaultCandidateEndpoints() 10 | for _, ep := range eps { 11 | if ep == "unix:///run/podman/podman.sock" || ep == "unix:///var/run/podman/podman.sock" { 12 | t.Fatalf("expected podman endpoints to be excluded by default, got %v", eps) 13 | } 14 | } 15 | } 16 | 17 | func TestDefaultCandidateEndpoints_PodmanEnabled(t *testing.T) { 18 | t.Setenv("PODTRACE_CRI_ALLOW_PODMAN", "1") 19 | eps := DefaultCandidateEndpoints() 20 | foundRun := false 21 | foundVar := false 22 | for _, ep := range eps { 23 | if ep == "unix:///run/podman/podman.sock" { 24 | foundRun = true 25 | } 26 | if ep == "unix:///var/run/podman/podman.sock" { 27 | foundVar = true 28 | } 29 | } 30 | if !foundRun || !foundVar { 31 | t.Fatalf("expected podman endpoints when enabled, got %v", eps) 32 | } 33 | } 34 | 35 | func TestNormalizeUnixTarget(t *testing.T) { 36 | cases := []struct { 37 | in string 38 | want string 39 | }{ 40 | {"unix:///run/containerd/containerd.sock", "unix:///run/containerd/containerd.sock"}, 41 | {"/run/containerd/containerd.sock", "unix:///run/containerd/containerd.sock"}, 42 | {"something", "something"}, 43 | } 44 | for _, tc := range cases { 45 | if got := normalizeUnixTarget(tc.in); got != tc.want { 46 | t.Fatalf("normalizeUnixTarget(%q)=%q, want %q", tc.in, got, tc.want) 47 | } 48 | } 49 | } 50 | 51 | func TestExtractLooseCgroupsPath(t *testing.T) { 52 | s := `{"cgroupsPath":"\\/kubepods.slice\\/kubepods-burstable.slice\\/cri-containerd-abcdef.scope"}` 53 | got := extractLooseCgroupsPath(s) 54 | if got != "/kubepods.slice/kubepods-burstable.slice/cri-containerd-abcdef.scope" { 55 | t.Fatalf("unexpected cgroups path: %q", got) 56 | } 57 | } 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /test/test-pods.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Namespace 4 | metadata: 5 | name: podtrace-test 6 | --- 7 | apiVersion: v1 8 | kind: Pod 9 | metadata: 10 | name: nginx-test 11 | namespace: podtrace-test 12 | labels: 13 | app: nginx 14 | test: podtrace 15 | spec: 16 | containers: 17 | - name: nginx 18 | image: nginx:alpine 19 | ports: 20 | - containerPort: 80 21 | - containerPort: 3000 22 | command: ["/bin/sh", "-c"] 23 | args: 24 | - | 25 | nginx 26 | while true; do 27 | wget -q -O /dev/null http://localhost/ || true 28 | nslookup google.com || true 29 | sleep 5 30 | done 31 | --- 32 | apiVersion: v1 33 | kind: Pod 34 | metadata: 35 | name: busybox-test 36 | namespace: 
podtrace-test 37 | labels: 38 | app: busybox 39 | test: podtrace 40 | spec: 41 | containers: 42 | - name: busybox 43 | image: busybox:latest 44 | command: ["/bin/sh", "-c"] 45 | args: 46 | - | 47 | apk add --no-cache curl bind-tools 2>/dev/null || true 48 | while true; do 49 | nslookup kubernetes.default.svc.cluster.local || true 50 | nslookup google.com || true 51 | wget -q -O /dev/null http://www.google.com || true 52 | echo "test $(date)" >> /tmp/test.log 53 | sync 54 | sleep 3 55 | done 56 | --- 57 | apiVersion: v1 58 | kind: Pod 59 | metadata: 60 | name: alpine-test 61 | namespace: podtrace-test 62 | labels: 63 | app: alpine 64 | test: podtrace 65 | spec: 66 | containers: 67 | - name: alpine 68 | image: alpine:latest 69 | command: ["/bin/sh", "-c"] 70 | args: 71 | - | 72 | apk add --no-cache curl bind-tools 73 | while true; do 74 | curl -s --max-time 5 http://www.google.com > /dev/null || true 75 | nslookup github.com || true 76 | echo "$(date) - test log entry" >> /var/log/app.log 77 | sync 78 | sleep 4 79 | done 80 | -------------------------------------------------------------------------------- /cmd/podtrace/mocks.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/podtrace/podtrace/internal/ebpf" 7 | "github.com/podtrace/podtrace/internal/events" 8 | "github.com/podtrace/podtrace/internal/kubernetes" 9 | ) 10 | 11 | type mockPodResolver struct { 12 | resolvePodFunc func(ctx context.Context, podName, namespace, containerName string) (*kubernetes.PodInfo, error) 13 | } 14 | 15 | func (m *mockPodResolver) ResolvePod(ctx context.Context, podName, namespace, containerName string) (*kubernetes.PodInfo, error) { 16 | if m.resolvePodFunc != nil { 17 | return m.resolvePodFunc(ctx, podName, namespace, containerName) 18 | } 19 | return &kubernetes.PodInfo{ 20 | PodName: podName, 21 | Namespace: namespace, 22 | ContainerID: "test-container-id", 23 | CgroupPath: "/sys/fs/cgroup/test", 24 | ContainerName: containerName, 25 | }, nil 26 | } 27 | 28 | type mockTracer struct { 29 | attachToCgroupFunc func(cgroupPath string) error 30 | setContainerIDFunc func(containerID string) error 31 | startFunc func(ctx context.Context, eventChan chan<- *events.Event) error 32 | stopFunc func() error 33 | } 34 | 35 | func (m *mockTracer) AttachToCgroup(cgroupPath string) error { 36 | if m.attachToCgroupFunc != nil { 37 | return m.attachToCgroupFunc(cgroupPath) 38 | } 39 | return nil 40 | } 41 | 42 | func (m *mockTracer) SetContainerID(containerID string) error { 43 | if m.setContainerIDFunc != nil { 44 | return m.setContainerIDFunc(containerID) 45 | } 46 | return nil 47 | } 48 | 49 | func (m *mockTracer) Start(ctx context.Context, eventChan chan<- *events.Event) error { 50 | if m.startFunc != nil { 51 | return m.startFunc(ctx, eventChan) 52 | } 53 | return nil 54 | } 55 | 56 | func (m *mockTracer) Stop() error { 57 | if m.stopFunc != nil { 58 | return m.stopFunc() 59 | } 60 | return nil 61 | } 62 | 63 | var _ ebpf.TracerInterface = (*mockTracer)(nil) 64 | var _ kubernetes.PodResolverInterface = (*mockPodResolver)(nil) -------------------------------------------------------------------------------- /internal/alerting/logger_hook.go: -------------------------------------------------------------------------------- 1 | package alerting 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | 7 | "go.uber.org/zap" 8 | "go.uber.org/zap/zapcore" 9 | ) 10 | 11 | var ( 12 | globalManager *Manager 13 | managerMu sync.RWMutex 14 | ) 15 
| 16 | func SetGlobalManager(manager *Manager) { 17 | managerMu.Lock() 18 | defer managerMu.Unlock() 19 | globalManager = manager 20 | } 21 | 22 | func GetGlobalManager() *Manager { 23 | managerMu.RLock() 24 | defer managerMu.RUnlock() 25 | return globalManager 26 | } 27 | 28 | func CreateAlertFromLog(level zapcore.Level, msg string, fields []zap.Field, podName, namespace string) *Alert { 29 | managerMu.RLock() 30 | manager := globalManager 31 | managerMu.RUnlock() 32 | if manager == nil || !manager.IsEnabled() { 33 | return nil 34 | } 35 | var severity AlertSeverity 36 | switch level { 37 | case zapcore.FatalLevel: 38 | severity = SeverityFatal 39 | case zapcore.ErrorLevel: 40 | severity = SeverityError 41 | case zapcore.WarnLevel: 42 | severity = SeverityWarning 43 | default: 44 | return nil 45 | } 46 | context := make(map[string]interface{}) 47 | errorCode := "" 48 | for _, field := range fields { 49 | switch field.Type { 50 | case zapcore.StringType: 51 | context[field.Key] = field.String 52 | case zapcore.Int64Type, zapcore.Int32Type: 53 | context[field.Key] = field.Integer 54 | case zapcore.ErrorType: 55 | if field.Interface != nil { 56 | context[field.Key] = field.Interface.(error).Error() 57 | } 58 | } 59 | if field.Key == "error_code" || field.Key == "code" { 60 | if field.Type == zapcore.StringType { 61 | errorCode = field.String 62 | } 63 | } 64 | } 65 | alert := &Alert{ 66 | Severity: severity, 67 | Title: "Podtrace " + level.String() + " Error", 68 | Message: msg, 69 | Timestamp: time.Now(), 70 | Source: "logger", 71 | PodName: podName, 72 | Namespace: namespace, 73 | Context: context, 74 | ErrorCode: errorCode, 75 | } 76 | return alert 77 | } 78 | 79 | -------------------------------------------------------------------------------- /internal/ebpf/cache/cache.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "strings" 7 | "time" 8 | 9 | "github.com/podtrace/podtrace/internal/config" 10 | "github.com/podtrace/podtrace/internal/metricsexporter" 11 | "github.com/podtrace/podtrace/internal/validation" 12 | ) 13 | 14 | var ( 15 | globalCache *LRUCache 16 | ) 17 | 18 | func init() { 19 | ttl := time.Duration(config.CacheTTLSeconds) * time.Second 20 | globalCache = NewLRUCache(config.CacheMaxSize, ttl) 21 | } 22 | 23 | func ResetGlobalCache() { 24 | if globalCache != nil { 25 | globalCache.Close() 26 | } 27 | ttl := time.Duration(config.CacheTTLSeconds) * time.Second 28 | globalCache = NewLRUCache(config.CacheMaxSize, ttl) 29 | } 30 | 31 | func GetProcessNameQuick(pid uint32) string { 32 | if !validation.ValidatePID(pid) { 33 | return "" 34 | } 35 | 36 | if name, ok := globalCache.Get(pid); ok { 37 | return name 38 | } 39 | 40 | metricsexporter.RecordProcessCacheMiss() 41 | 42 | name := "" 43 | 44 | cmdlinePath := fmt.Sprintf("%s/%d/cmdline", config.ProcBasePath, pid) 45 | if cmdline, err := os.ReadFile(cmdlinePath); err == nil { 46 | parts := strings.Split(string(cmdline), "\x00") 47 | if len(parts) > 0 && parts[0] != "" { 48 | name = parts[0] 49 | if idx := strings.LastIndex(name, "/"); idx >= 0 { 50 | name = name[idx+1:] 51 | } 52 | } 53 | } 54 | 55 | if name == "" { 56 | statPath := fmt.Sprintf("%s/%d/stat", config.ProcBasePath, pid) 57 | if data, err := os.ReadFile(statPath); err == nil { 58 | statStr := string(data) 59 | start := strings.Index(statStr, "(") 60 | end := strings.LastIndex(statStr, ")") 61 | if start >= 0 && end > start { 62 | name = statStr[start+1 : end] 63 | } 64 | } 65 
| } 66 | 67 | if name == "" { 68 | commPath := fmt.Sprintf("%s/%d/comm", config.ProcBasePath, pid) 69 | if data, err := os.ReadFile(commPath); err == nil { 70 | name = strings.TrimSpace(string(data)) 71 | } 72 | } 73 | 74 | sanitized := validation.SanitizeProcessName(name) 75 | globalCache.Set(pid, sanitized) 76 | return sanitized 77 | } 78 | 79 | -------------------------------------------------------------------------------- /bpf/memory.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 2 | 3 | #include "common.h" 4 | #include "maps.h" 5 | #include "events.h" 6 | #include "helpers.h" 7 | 8 | SEC("tp/exceptions/page_fault_user") 9 | int tracepoint_page_fault_user(void *ctx) { 10 | // NOTE: Tracepoint argument layouts can differ across kernels/distros when using raw tracepoints. 11 | // For stability, avoid relying on tracepoint "common_pid" field offsets here and use the 12 | // current task PID from bpf_get_current_pid_tgid(). 13 | u32 pid = bpf_get_current_pid_tgid() >> 32; 14 | 15 | struct event *e = get_event_buf(); 16 | if (!e) { 17 | return 0; 18 | } 19 | e->timestamp = bpf_ktime_get_ns(); 20 | e->pid = pid; 21 | e->type = EVENT_PAGE_FAULT; 22 | e->latency_ns = 0; 23 | // Best-effort: omit error_code (layout is not stable without BTF-typed tracepoints). 24 | e->error = 0; 25 | e->bytes = 0; 26 | e->tcp_state = 0; 27 | e->target[0] = '\0'; 28 | 29 | capture_user_stack(ctx, e->pid, 0, e); 30 | bpf_ringbuf_output(&events, e, sizeof(*e), 0); 31 | return 0; 32 | } 33 | 34 | SEC("tp/oom/oom_kill_process") 35 | int tracepoint_oom_kill_process(void *ctx) { 36 | struct { 37 | unsigned short common_type; 38 | unsigned char common_flags; 39 | unsigned char common_preempt_count; 40 | int common_pid; 41 | char comm[16]; 42 | u32 pid; 43 | u32 tid; 44 | u64 totalpages; 45 | u64 points; 46 | u64 victim_points; 47 | const char *constraint; 48 | u32 constraint_kind; 49 | u32 gfp_mask; 50 | int order; 51 | } args_local; 52 | 53 | bpf_probe_read_kernel(&args_local, sizeof(args_local), ctx); 54 | 55 | struct event *e = get_event_buf(); 56 | if (!e) { 57 | return 0; 58 | } 59 | e->timestamp = bpf_ktime_get_ns(); 60 | e->pid = args_local.pid; 61 | e->type = EVENT_OOM_KILL; 62 | e->latency_ns = 0; 63 | e->error = 0; 64 | e->bytes = args_local.totalpages * PAGE_SIZE; 65 | e->tcp_state = 0; 66 | 67 | bpf_probe_read_kernel_str(e->target, sizeof(e->target), args_local.comm); 68 | 69 | capture_user_stack(ctx, e->pid, 0, e); 70 | bpf_ringbuf_output(&events, e, sizeof(*e), 0); 71 | return 0; 72 | } 73 | -------------------------------------------------------------------------------- /.github/workflows/bash-checks.yml: -------------------------------------------------------------------------------- 1 | name: Bash Script Checks 2 | 3 | on: 4 | push: 5 | branches: ["**"] 6 | paths: 7 | - "**/*.sh" 8 | - "scripts/**" 9 | - "build.sh" 10 | - "test/**/*.sh" 11 | pull_request: 12 | branches: ["**"] 13 | paths: 14 | - "**/*.sh" 15 | - "scripts/**" 16 | - "build.sh" 17 | - "test/**/*.sh" 18 | 19 | jobs: 20 | bash-lint: 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - name: Checkout 25 | uses: actions/checkout@v4 26 | 27 | - name: Install tools 28 | run: | 29 | sudo apt-get update 30 | sudo apt-get install -y \ 31 | shellcheck \ 32 | devscripts \ 33 | shfmt \ 34 | python3-pip 35 | pip3 install bashate || echo "bashate installation failed, will skip bashate checks" 36 | 37 | - name: Syntax Check 38 | run: | 39 | echo "Running bash syntax check..." 
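# bash -n parses each script without executing it, so plain syntax errors fail fast before the heavier linters in the later steps run.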
40 | for file in $(git ls-files '*.sh'); do
41 | echo "→ Checking syntax: $file"
42 | bash -n "$file"
43 | done
44 |
45 | - name: Run ShellCheck
46 | run: |
47 | echo "Running strict ShellCheck..."
48 | shellcheck --enable=all -x $(git ls-files '*.sh')
49 |
50 | - name: Check for bashisms
51 | run: |
52 | echo "Checking for bashisms..."
53 | checkbashisms $(git ls-files '*.sh')
54 |
55 | - name: Run bashate
56 | run: |
57 | echo "Running bashate..."
58 | if command -v bashate >/dev/null 2>&1; then
59 | bashate $(git ls-files '*.sh') || echo "bashate check completed with warnings"
60 | else
61 | echo "bashate not available, skipping..."
62 | fi
63 |
64 | - name: Check formatting (shfmt)
65 | run: |
66 | echo "Checking formatting with shfmt..."
67 | shfmt -d . || {
68 | echo "Formatting issues found. To fix, run: shfmt -w ."
69 | echo "Then commit the reformatted files and re-run this workflow."
70 | exit 1
71 | }
72 |
--------------------------------------------------------------------------------
/internal/diagnose/sampling.go:
--------------------------------------------------------------------------------
1 | package diagnose
2 |
3 | import (
4 | "github.com/podtrace/podtrace/internal/config"
5 | "github.com/podtrace/podtrace/internal/events"
6 | )
7 |
8 | var eventTypeSamplingRates = map[events.EventType]int{
9 | events.EventOOMKill: 1,
10 | events.EventPageFault: 1,
11 | events.EventNetDevError: 1,
12 | events.EventTCPRetrans: 5,
13 | events.EventDNS: 10,
14 | events.EventConnect: 20,
15 | events.EventHTTPReq: 30,
16 | events.EventHTTPResp: 30,
17 | events.EventTCPSend: 50,
18 | events.EventTCPRecv: 50,
19 | events.EventUDPSend: 50,
20 | events.EventUDPRecv: 50,
21 | events.EventWrite: 100,
22 | events.EventRead: 100,
23 | events.EventFsync: 100,
24 | events.EventSchedSwitch: 200,
25 | events.EventLockContention: 50,
26 | events.EventDBQuery: 20,
27 | events.EventExec: 10,
28 | events.EventFork: 10,
29 | events.EventOpen: 100,
30 | events.EventClose: 200,
31 | events.EventTCPState: 100,
32 | }
33 |
34 | func getEventPriority(event *events.Event) int {
35 | if event == nil {
36 | return config.PriorityLow
37 | }
38 |
39 | if event.Error != 0 {
40 | return config.PriorityCritical
41 | }
42 |
43 | switch event.Type {
44 | case events.EventOOMKill, events.EventPageFault, events.EventNetDevError:
45 | return config.PriorityCritical
46 | case events.EventTCPRetrans, events.EventLockContention:
47 | return config.PriorityHigh
48 | case events.EventDNS, events.EventConnect, events.EventHTTPReq, events.EventHTTPResp:
49 | return config.PriorityNormal
50 | default:
51 | return config.PriorityLow
52 | }
53 | }
54 |
55 | func shouldSampleEvent(event *events.Event, eventCount int) bool {
56 | if event == nil {
57 | return false
58 | }
59 |
60 | priority := getEventPriority(event)
61 | if priority == config.PriorityCritical {
62 | return true
63 | }
64 |
65 | samplingRate, ok := eventTypeSamplingRates[event.Type]
66 | if !ok {
67 | samplingRate = config.EventSamplingRate
68 | }
69 |
70 | return eventCount%samplingRate == 0
71 | }
72 |
73 |
--------------------------------------------------------------------------------
/scripts/setup-capabilities.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euo pipefail
3 |
4 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
5 | PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
&& pwd)" 6 | BINARY="${PROJECT_ROOT}/bin/podtrace" 7 | 8 | REQUIRED_CAPS="cap_bpf,cap_sys_admin,cap_sys_resource,cap_net_admin" 9 | 10 | check_binary_exists() { 11 | if [[ ! -f "${BINARY}" ]]; then 12 | echo "Error: ${BINARY} not found" >&2 13 | echo "Build it first with: make build" >&2 14 | exit 1 15 | fi 16 | 17 | if [[ ! -x "${BINARY}" ]]; then 18 | echo "Error: ${BINARY} is not executable" >&2 19 | exit 1 20 | fi 21 | } 22 | 23 | check_root() { 24 | if [[ ${EUID} -ne 0 ]]; then 25 | echo "Error: This script must be run as root (use sudo)" >&2 26 | exit 1 27 | fi 28 | } 29 | 30 | set_capabilities() { 31 | echo "Setting capabilities on ${BINARY}..." 32 | echo "Capabilities: ${REQUIRED_CAPS}" 33 | echo "" 34 | 35 | if ! command -v setcap &>/dev/null; then 36 | echo "Error: setcap command not found. Install libcap2-bin package." >&2 37 | exit 1 38 | fi 39 | 40 | if ! setcap "${REQUIRED_CAPS}+ep" "${BINARY}"; then 41 | echo "Error: Failed to set capabilities" >&2 42 | exit 1 43 | fi 44 | } 45 | 46 | verify_capabilities() { 47 | if ! command -v getcap &>/dev/null; then 48 | echo "Warning: getcap command not found, cannot verify capabilities" >&2 49 | return 0 50 | fi 51 | 52 | local current_caps 53 | current_caps=$(getcap "${BINARY}" 2>/dev/null || echo "") 54 | 55 | if [[ -z "${current_caps}" ]]; then 56 | echo "Error: Failed to verify capabilities were set" >&2 57 | exit 1 58 | fi 59 | 60 | echo "Verified capabilities:" 61 | echo " ${current_caps}" 62 | echo "" 63 | } 64 | 65 | print_success_message() { 66 | echo "✓ Capabilities set successfully!" 67 | echo "" 68 | echo "You can now run podtrace without sudo:" 69 | echo " ${BINARY} -n " 70 | echo "" 71 | echo "To verify capabilities:" 72 | echo " getcap ${BINARY}" 73 | echo "" 74 | echo "To remove capabilities:" 75 | echo " sudo setcap -r ${BINARY}" 76 | } 77 | 78 | main() { 79 | check_binary_exists 80 | check_root 81 | set_capabilities 82 | verify_capabilities 83 | print_success_message 84 | } 85 | 86 | main "$@" 87 | -------------------------------------------------------------------------------- /internal/kubernetes/errors.go: -------------------------------------------------------------------------------- 1 | package kubernetes 2 | 3 | import "fmt" 4 | 5 | type ErrorCode int 6 | 7 | const ( 8 | ErrCodeKubeconfigFailed ErrorCode = iota + 1 9 | ErrCodeClientsetFailed 10 | ErrCodePodNotFound 11 | ErrCodeNoContainers 12 | ErrCodeContainerNotFound 13 | ErrCodeInvalidContainerID 14 | ErrCodeCgroupNotFound 15 | ) 16 | 17 | type KubernetesError struct { 18 | Code ErrorCode 19 | Message string 20 | Err error 21 | } 22 | 23 | func (e *KubernetesError) Error() string { 24 | if e.Err != nil { 25 | return fmt.Sprintf("%s: %v", e.Message, e.Err) 26 | } 27 | return e.Message 28 | } 29 | 30 | func (e *KubernetesError) Unwrap() error { 31 | return e.Err 32 | } 33 | 34 | func NewKubeconfigError(err error) *KubernetesError { 35 | return &KubernetesError{ 36 | Code: ErrCodeKubeconfigFailed, 37 | Message: "failed to get kubeconfig", 38 | Err: err, 39 | } 40 | } 41 | 42 | func NewClientsetError(err error) *KubernetesError { 43 | return &KubernetesError{ 44 | Code: ErrCodeClientsetFailed, 45 | Message: "failed to create Kubernetes clientset", 46 | Err: err, 47 | } 48 | } 49 | 50 | func NewPodNotFoundError(podName, namespace string, err error) *KubernetesError { 51 | return &KubernetesError{ 52 | Code: ErrCodePodNotFound, 53 | Message: fmt.Sprintf("failed to get pod %s in namespace %s", podName, namespace), 54 | Err: err, 55 | } 56 | } 57 | 58 | func 
NewNoContainersError() *KubernetesError { 59 | return &KubernetesError{ 60 | Code: ErrCodeNoContainers, 61 | Message: "pod has no containers", 62 | } 63 | } 64 | 65 | func NewContainerNotFoundError(containerName string) *KubernetesError { 66 | return &KubernetesError{ 67 | Code: ErrCodeContainerNotFound, 68 | Message: fmt.Sprintf("container %s not found in pod", containerName), 69 | } 70 | } 71 | 72 | func NewInvalidContainerIDError(reason string) *KubernetesError { 73 | return &KubernetesError{ 74 | Code: ErrCodeInvalidContainerID, 75 | Message: fmt.Sprintf("invalid container ID: %s", reason), 76 | } 77 | } 78 | 79 | func NewCgroupNotFoundError(containerID string) *KubernetesError { 80 | return &KubernetesError{ 81 | Code: ErrCodeCgroupNotFound, 82 | Message: fmt.Sprintf("cgroup path not found for container %s", containerID), 83 | } 84 | } 85 | 86 | -------------------------------------------------------------------------------- /internal/alerting/manager_test.go: -------------------------------------------------------------------------------- 1 | package alerting 2 | 3 | import ( 4 | "context" 5 | "os" 6 | "testing" 7 | "time" 8 | 9 | "github.com/podtrace/podtrace/internal/config" 10 | ) 11 | 12 | func TestNewManager_Disabled(t *testing.T) { 13 | _ = os.Setenv("PODTRACE_ALERTING_ENABLED", "false") 14 | defer func() { 15 | _ = os.Unsetenv("PODTRACE_ALERTING_ENABLED") 16 | }() 17 | config.AlertingEnabled = false 18 | manager, err := NewManager() 19 | if err != nil { 20 | t.Fatalf("NewManager() error = %v", err) 21 | } 22 | if manager.IsEnabled() { 23 | t.Error("Manager should be disabled") 24 | } 25 | } 26 | 27 | func TestManager_SendAlert_Disabled(t *testing.T) { 28 | manager := &Manager{enabled: false} 29 | alert := &Alert{ 30 | Severity: SeverityWarning, 31 | Title: "Test", 32 | Message: "Test", 33 | Timestamp: time.Now(), 34 | Source: "test", 35 | } 36 | manager.SendAlert(alert) 37 | } 38 | 39 | func TestManager_SendAlert_Nil(t *testing.T) { 40 | manager := &Manager{enabled: true} 41 | manager.SendAlert(nil) 42 | } 43 | 44 | func TestManager_Shutdown(t *testing.T) { 45 | manager := &Manager{ 46 | enabled: true, 47 | stopCh: make(chan struct{}), 48 | } 49 | manager.cleanupTicker = time.NewTicker(1 * time.Hour) 50 | manager.wg.Add(1) 51 | go func() { 52 | defer manager.wg.Done() 53 | <-manager.stopCh 54 | }() 55 | ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) 56 | defer cancel() 57 | err := manager.Shutdown(ctx) 58 | if err != nil { 59 | t.Errorf("Shutdown() error = %v", err) 60 | } 61 | } 62 | 63 | func TestManager_AddSender(t *testing.T) { 64 | manager := &Manager{ 65 | enabled: true, 66 | senders: make([]Sender, 0), 67 | } 68 | mockSender := &testMockSender{name: "test"} 69 | manager.AddSender(mockSender) 70 | if len(manager.senders) != 1 { 71 | t.Errorf("Expected 1 sender, got %d", len(manager.senders)) 72 | } 73 | manager.AddSender(nil) 74 | if len(manager.senders) != 1 { 75 | t.Errorf("Expected 1 sender after adding nil, got %d", len(manager.senders)) 76 | } 77 | } 78 | 79 | type testMockSender struct { 80 | sendFunc func(ctx context.Context, alert *Alert) error 81 | name string 82 | } 83 | 84 | func (m *testMockSender) Send(ctx context.Context, alert *Alert) error { 85 | if m.sendFunc != nil { 86 | return m.sendFunc(ctx, alert) 87 | } 88 | return nil 89 | } 90 | 91 | func (m *testMockSender) Name() string { 92 | return m.name 93 | } 94 | -------------------------------------------------------------------------------- /test/setup-test-pods.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Setup script for podtrace test pods 3 | 4 | set -e 5 | 6 | NAMESPACE="podtrace-test" 7 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 8 | 9 | GREEN='\033[0;32m' 10 | RED='\033[0;31m' 11 | NC='\033[0m' 12 | 13 | print_header() { 14 | echo "=== Setting up podtrace test environment ===" 15 | echo "" 16 | } 17 | 18 | check_kubectl_installed() { 19 | if ! command -v kubectl &>/dev/null; then 20 | echo -e "${RED}Error: kubectl is not installed${NC}" 21 | exit 1 22 | fi 23 | } 24 | 25 | check_cluster_access() { 26 | if ! kubectl cluster-info &>/dev/null; then 27 | echo -e "${RED}Error: Cannot connect to Kubernetes cluster${NC}" 28 | echo "Please verify: kubectl cluster-info" 29 | exit 1 30 | fi 31 | 32 | echo -e "${GREEN}✓ Kubernetes cluster accessible${NC}" 33 | echo "" 34 | } 35 | 36 | apply_test_resources() { 37 | echo "Creating test namespace and pods..." 38 | kubectl apply -f "${SCRIPT_DIR}/test-pods.yaml" 39 | echo "" 40 | } 41 | 42 | wait_for_pods_ready() { 43 | echo "Waiting for pods to be ready..." 44 | kubectl wait --for=condition=Ready pod/nginx-test -n "${NAMESPACE}" --timeout=120s || true 45 | kubectl wait --for=condition=Ready pod/busybox-test -n "${NAMESPACE}" --timeout=120s || true 46 | kubectl wait --for=condition=Ready pod/alpine-test -n "${NAMESPACE}" --timeout=120s || true 47 | echo "" 48 | } 49 | 50 | print_pod_status() { 51 | echo "=== Test Pods Status ===" 52 | kubectl get pods -n "${NAMESPACE}" 53 | echo "" 54 | } 55 | 56 | print_instructions() { 57 | echo -e "${GREEN}=== Test pods are ready! ===${NC}" 58 | echo "" 59 | echo "You can now test podtrace with:" 60 | echo "" 61 | echo " # Test with nginx pod" 62 | echo " sudo ./bin/podtrace -n ${NAMESPACE} nginx-test" 63 | echo "" 64 | echo " # Test with busybox pod" 65 | echo " sudo ./bin/podtrace -n ${NAMESPACE} busybox-test" 66 | echo "" 67 | echo " # Test with alpine pod" 68 | echo " sudo ./bin/podtrace -n ${NAMESPACE} alpine-test" 69 | echo "" 70 | echo " # Test diagnose mode" 71 | echo " sudo ./bin/podtrace -n ${NAMESPACE} nginx-test --diagnose 10s" 72 | echo "" 73 | echo "To clean up, run:" 74 | echo " ./test/cleanup-test-pods.sh" 75 | echo "" 76 | } 77 | 78 | main() { 79 | print_header 80 | check_kubectl_installed 81 | check_cluster_access 82 | apply_test_resources 83 | wait_for_pods_ready 84 | print_pod_status 85 | print_instructions 86 | } 87 | 88 | main "$@" 89 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yaml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: Create a report to help us improve 3 | title: "bug: " 4 | labels: 5 | - kind/bug 6 | body: 7 | - type: textarea 8 | id: description 9 | attributes: 10 | label: Describe the bug 11 | description: Please provide a clear and concise description of the bug. 12 | placeholder: | 13 | Add logs and screenshots if any. 14 | validations: 15 | required: true 16 | 17 | - type: textarea 18 | id: reproducing 19 | attributes: 20 | label: Steps To Reproduce 21 | description: Steps to reproduce the behavior. 22 | placeholder: | 23 | 1. Run 'podtrace -n ...' 24 | 2. Execute '...' 25 | 3. Observe the error 26 | validations: 27 | required: true 28 | 29 | - type: textarea 30 | id: expected 31 | attributes: 32 | label: Expected Behaviour 33 | description: A clear and concise description of what you expected to happen. 
34 | validations: 35 | required: true 36 | 37 | - type: dropdown 38 | id: component 39 | attributes: 40 | label: Component 41 | description: Which component is affected? 42 | options: 43 | - eBPF Programs 44 | - Event Collection 45 | - Kubernetes Integration 46 | - Event Processing 47 | - Diagnostics 48 | - Metrics Export 49 | - CLI 50 | - Other 51 | validations: 52 | required: true 53 | 54 | - type: input 55 | id: kubernetes_version 56 | attributes: 57 | label: Kubernetes version 58 | description: Output of `kubectl version` (server version) 59 | placeholder: e.g., v1.28.0 60 | validations: 61 | required: false 62 | 63 | - type: input 64 | id: podtrace_version 65 | attributes: 66 | label: podtrace version 67 | description: Version of podtrace binary 68 | placeholder: e.g., v0.1.0 69 | validations: 70 | required: false 71 | 72 | - type: input 73 | id: kernel_version 74 | attributes: 75 | label: Kernel version 76 | description: Output of `uname -r` on the node where podtrace runs 77 | placeholder: e.g., 5.15.0 78 | validations: 79 | required: false 80 | 81 | - type: textarea 82 | id: additional 83 | attributes: 84 | label: Additional Context 85 | description: Add any other context about the problem here (logs, cluster setup, pod configuration, etc.). 86 | -------------------------------------------------------------------------------- /internal/logger/logger_test.go: -------------------------------------------------------------------------------- 1 | package logger 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | 7 | "go.uber.org/zap" 8 | "go.uber.org/zap/zapcore" 9 | ) 10 | 11 | func TestLogger(t *testing.T) { 12 | log := Logger() 13 | if log == nil { 14 | t.Error("Logger() should not return nil") 15 | } 16 | } 17 | 18 | func TestSetLevel(t *testing.T) { 19 | originalLevel := atomicLevel.Level() 20 | defer SetLevel(originalLevel.String()) 21 | 22 | tests := []struct { 23 | name string 24 | levelStr string 25 | expected zapcore.Level 26 | }{ 27 | {"debug", "debug", zapcore.DebugLevel}, 28 | {"info", "info", zapcore.InfoLevel}, 29 | {"warn", "warn", zapcore.WarnLevel}, 30 | {"error", "error", zapcore.ErrorLevel}, 31 | {"fatal", "fatal", zapcore.FatalLevel}, 32 | {"invalid", "invalid", zapcore.InfoLevel}, 33 | {"empty", "", zapcore.InfoLevel}, 34 | } 35 | 36 | for _, tt := range tests { 37 | t.Run(tt.name, func(t *testing.T) { 38 | SetLevel(tt.levelStr) 39 | if atomicLevel.Level() != tt.expected { 40 | t.Errorf("Expected level %v, got %v", tt.expected, atomicLevel.Level()) 41 | } 42 | }) 43 | } 44 | } 45 | 46 | func TestLogFunctions(t *testing.T) { 47 | SetLevel("debug") 48 | 49 | Debug("test debug message", zap.String("key", "value")) 50 | Info("test info message", zap.String("key", "value")) 51 | Warn("test warn message", zap.String("key", "value")) 52 | Error("test error message", zap.String("key", "value")) 53 | } 54 | 55 | 56 | func TestSync(t *testing.T) { 57 | Sync() 58 | } 59 | 60 | func TestParseLogLevel(t *testing.T) { 61 | key := "PODTRACE_LOG_LEVEL" 62 | originalValue := os.Getenv(key) 63 | defer func() { 64 | if originalValue != "" { 65 | _ = os.Setenv(key, originalValue) 66 | } else { 67 | _ = os.Unsetenv(key) 68 | } 69 | }() 70 | 71 | tests := []struct { 72 | name string 73 | levelStr string 74 | expected zapcore.Level 75 | }{ 76 | {"debug", "debug", zapcore.DebugLevel}, 77 | {"info", "info", zapcore.InfoLevel}, 78 | {"warn", "warn", zapcore.WarnLevel}, 79 | {"error", "error", zapcore.ErrorLevel}, 80 | {"fatal", "fatal", zapcore.FatalLevel}, 81 | {"invalid", "invalid", 
zapcore.InfoLevel}, 82 | {"uppercase", "DEBUG", zapcore.InfoLevel}, 83 | } 84 | 85 | for _, tt := range tests { 86 | t.Run(tt.name, func(t *testing.T) { 87 | result := parseLogLevel(tt.levelStr) 88 | if result != tt.expected { 89 | t.Errorf("Expected %v, got %v", tt.expected, result) 90 | } 91 | }) 92 | } 93 | } 94 | 95 | 96 | -------------------------------------------------------------------------------- /internal/diagnose/errors.go: -------------------------------------------------------------------------------- 1 | package diagnose 2 | 3 | import "fmt" 4 | 5 | type ErrorCode int 6 | 7 | const ( 8 | ErrCodeEventLimitReached ErrorCode = iota + 1 9 | ErrCodeContextCancelled 10 | ErrCodeTimeout 11 | ErrCodeInvalidOperation 12 | ErrCodeReportGenerationFailed 13 | ErrCodeStackResolveFailed 14 | ErrCodeAddr2lineFailed 15 | ErrCodeNoEvents 16 | ) 17 | 18 | type DiagnoseError struct { 19 | Code ErrorCode 20 | Message string 21 | Err error 22 | } 23 | 24 | func (e *DiagnoseError) Error() string { 25 | if e.Err != nil { 26 | return fmt.Sprintf("%s: %v", e.Message, e.Err) 27 | } 28 | return e.Message 29 | } 30 | 31 | func (e *DiagnoseError) Unwrap() error { 32 | return e.Err 33 | } 34 | 35 | func NewEventLimitError(dropped int) *DiagnoseError { 36 | return &DiagnoseError{ 37 | Code: ErrCodeEventLimitReached, 38 | Message: fmt.Sprintf("event limit reached, %d events dropped", dropped), 39 | } 40 | } 41 | 42 | func NewContextCancelledError(err error) *DiagnoseError { 43 | return &DiagnoseError{ 44 | Code: ErrCodeContextCancelled, 45 | Message: "operation cancelled", 46 | Err: err, 47 | } 48 | } 49 | 50 | func NewTimeoutError(operation string) *DiagnoseError { 51 | return &DiagnoseError{ 52 | Code: ErrCodeTimeout, 53 | Message: fmt.Sprintf("operation timed out: %s", operation), 54 | } 55 | } 56 | 57 | func NewInvalidOperationError(operation string) *DiagnoseError { 58 | return &DiagnoseError{ 59 | Code: ErrCodeInvalidOperation, 60 | Message: fmt.Sprintf("invalid operation: %s", operation), 61 | } 62 | } 63 | 64 | func NewReportGenerationError(err error) *DiagnoseError { 65 | return &DiagnoseError{ 66 | Code: ErrCodeReportGenerationFailed, 67 | Message: "failed to generate report", 68 | Err: err, 69 | } 70 | } 71 | 72 | func NewStackResolveError(pid uint32, addr uint64, err error) *DiagnoseError { 73 | return &DiagnoseError{ 74 | Code: ErrCodeStackResolveFailed, 75 | Message: fmt.Sprintf("failed to resolve stack trace for PID %d at address 0x%x", pid, addr), 76 | Err: err, 77 | } 78 | } 79 | 80 | func NewAddr2lineError(exePath string, addr uint64, err error) *DiagnoseError { 81 | return &DiagnoseError{ 82 | Code: ErrCodeAddr2lineFailed, 83 | Message: fmt.Sprintf("addr2line failed for %s at 0x%x", exePath, addr), 84 | Err: err, 85 | } 86 | } 87 | 88 | func NewNoEventsError() *DiagnoseError { 89 | return &DiagnoseError{ 90 | Code: ErrCodeNoEvents, 91 | Message: "no events collected during the diagnostic period", 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /internal/kubernetes/errors_test.go: -------------------------------------------------------------------------------- 1 | package kubernetes 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | ) 7 | 8 | func TestNewKubeconfigError(t *testing.T) { 9 | originalErr := errors.New("kubeconfig error") 10 | err := NewKubeconfigError(originalErr) 11 | 12 | if err == nil { 13 | t.Fatal("NewKubeconfigError returned nil") 14 | } 15 | if err.Code != ErrCodeKubeconfigFailed { 16 | t.Errorf("Expected error code %d, 
got %d", ErrCodeKubeconfigFailed, err.Code) 17 | } 18 | if err.Message != "failed to get kubeconfig" { 19 | t.Errorf("Expected message 'failed to get kubeconfig', got %q", err.Message) 20 | } 21 | if err.Unwrap() != originalErr { 22 | t.Errorf("Expected unwrapped error to be original error") 23 | } 24 | } 25 | 26 | func TestNewClientsetError(t *testing.T) { 27 | originalErr := errors.New("clientset error") 28 | err := NewClientsetError(originalErr) 29 | 30 | if err == nil { 31 | t.Fatal("NewClientsetError returned nil") 32 | } 33 | if err.Code != ErrCodeClientsetFailed { 34 | t.Errorf("Expected error code %d, got %d", ErrCodeClientsetFailed, err.Code) 35 | } 36 | if err.Message != "failed to create Kubernetes clientset" { 37 | t.Errorf("Expected message 'failed to create Kubernetes clientset', got %q", err.Message) 38 | } 39 | if err.Unwrap() != originalErr { 40 | t.Errorf("Expected unwrapped error to be original error") 41 | } 42 | } 43 | 44 | func TestKubernetesError_Error_WithErr(t *testing.T) { 45 | originalErr := errors.New("underlying error") 46 | kerr := &KubernetesError{ 47 | Code: ErrCodePodNotFound, 48 | Message: "test message", 49 | Err: originalErr, 50 | } 51 | 52 | errStr := kerr.Error() 53 | if errStr == "" { 54 | t.Error("Error() should return non-empty string") 55 | } 56 | if !contains(errStr, "test message") { 57 | t.Errorf("Expected error string to contain 'test message', got %q", errStr) 58 | } 59 | } 60 | 61 | func TestKubernetesError_Error_WithoutErr(t *testing.T) { 62 | kerr := &KubernetesError{ 63 | Code: ErrCodePodNotFound, 64 | Message: "test message", 65 | Err: nil, 66 | } 67 | 68 | errStr := kerr.Error() 69 | if errStr != "test message" { 70 | t.Errorf("Expected error string 'test message', got %q", errStr) 71 | } 72 | } 73 | 74 | func contains(s, substr string) bool { 75 | return len(s) >= len(substr) && (s == substr || 76 | (len(s) > len(substr) && containsMiddle(s, substr))) 77 | } 78 | 79 | func containsMiddle(s, substr string) bool { 80 | for i := 0; i <= len(s)-len(substr); i++ { 81 | if s[i:i+len(substr)] == substr { 82 | return true 83 | } 84 | } 85 | return false 86 | } 87 | 88 | -------------------------------------------------------------------------------- /.github/workflows/security.yml: -------------------------------------------------------------------------------- 1 | name: Security - CodeQL 2 | 3 | on: 4 | push: 5 | branches: ["**"] 6 | pull_request: 7 | branches: ["main"] 8 | schedule: 9 | - cron: "0 3 * * 0" 10 | 11 | jobs: 12 | analyze: 13 | permissions: 14 | contents: read 15 | security-events: write 16 | actions: read 17 | 18 | runs-on: ubuntu-latest 19 | 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | language: ["go", "cpp"] 24 | 25 | steps: 26 | - name: Checkout 27 | uses: actions/checkout@v4 28 | 29 | - name: Set up Go 30 | if: matrix.language == 'go' 31 | uses: actions/setup-go@v4 32 | with: 33 | go-version-file: go.mod 34 | cache: true 35 | 36 | - name: Initialize CodeQL 37 | uses: github/codeql-action/init@v4 38 | with: 39 | languages: ${{ matrix.language }} 40 | 41 | - name: Install build dependencies 42 | run: | 43 | sudo apt-get update 44 | sudo apt-get install -y clang llvm libbpf-dev linux-headers-$(uname -r) linux-tools-$(uname -r) || sudo apt-get install -y clang llvm libbpf-dev linux-headers-$(uname -r) linux-tools-generic 45 | 46 | - name: Generate vmlinux.h from BTF 47 | if: matrix.language == 'cpp' 48 | run: | 49 | echo "Generating vmlinux.h from BTF..." 
50 | if [ -f /sys/kernel/btf/vmlinux ]; then 51 | bpftool btf dump file /sys/kernel/btf/vmlinux format c > bpf/vmlinux.h 52 | echo "vmlinux.h generated successfully" 53 | else 54 | echo "Warning: /sys/kernel/btf/vmlinux not found, using placeholder vmlinux.h" 55 | fi 56 | 57 | - name: Build Go 58 | if: matrix.language == 'go' 59 | run: | 60 | GOTOOLCHAIN=auto go mod download 61 | GOTOOLCHAIN=auto go build ./... 62 | 63 | - name: Build eBPF C 64 | if: matrix.language == 'cpp' 65 | run: | 66 | echo "Compiling podtrace.bpf.c..." 67 | if [ -f /sys/kernel/btf/vmlinux ]; then 68 | clang -O2 -g \ 69 | -target bpf \ 70 | -D__TARGET_ARCH_x86 \ 71 | -DPODTRACE_VMLINUX_FROM_BTF \ 72 | -I./bpf \ 73 | -c bpf/podtrace.bpf.c \ 74 | -o bpf/podtrace.bpf.o 75 | else 76 | clang -O2 -g \ 77 | -target bpf \ 78 | -D__TARGET_ARCH_x86 \ 79 | -I./bpf \ 80 | -c bpf/podtrace.bpf.c \ 81 | -o bpf/podtrace.bpf.o 82 | fi 83 | 84 | - name: Analyze 85 | uses: github/codeql-action/analyze@v4 86 | -------------------------------------------------------------------------------- /bpf/resources.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 2 | 3 | #include "common.h" 4 | #include "maps.h" 5 | #include "events.h" 6 | #include "helpers.h" 7 | 8 | #define RESOURCE_CPU 0 9 | #define RESOURCE_MEMORY 1 10 | #define RESOURCE_IO 2 11 | 12 | static inline u32 calculate_utilization(u64 usage, u64 limit) { 13 | if (limit == 0 || limit == ~0ULL) { 14 | return 0; 15 | } 16 | if (usage > limit) { 17 | return 100; 18 | } 19 | u64 percent = (usage * 100) / limit; 20 | return (u32)(percent > 100 ? 100 : percent); 21 | } 22 | 23 | static inline u32 check_alert_threshold(u32 utilization) { 24 | if (utilization >= 95) { 25 | return 3; 26 | } else if (utilization >= 90) { 27 | return 2; 28 | } else if (utilization >= 80) { 29 | return 1; 30 | } 31 | return 0; 32 | } 33 | 34 | static inline void emit_resource_alert(u64 cgroup_id, u32 resource_type, u32 utilization, u64 limit, u64 usage) { 35 | struct event *e = get_event_buf(); 36 | if (!e) { 37 | return; 38 | } 39 | 40 | e->timestamp = bpf_ktime_get_ns(); 41 | e->pid = 0; 42 | e->type = EVENT_RESOURCE_LIMIT; 43 | e->latency_ns = 0; 44 | e->error = (s32)utilization; 45 | e->bytes = usage; 46 | e->tcp_state = resource_type; 47 | e->target[0] = '\0'; 48 | 49 | char *details = e->details; 50 | u32 idx = 0; 51 | u32 max_idx = MAX_STRING_LEN - 1; 52 | 53 | const char *resource_names[] = {"CPU", "MEM", "IO"}; 54 | if (resource_type < 3) { 55 | const char *name = resource_names[resource_type]; 56 | for (int i = 0; name[i] != '\0' && idx < max_idx; i++) { 57 | details[idx++] = name[i]; 58 | } 59 | } 60 | 61 | if (idx < max_idx) details[idx++] = ':'; 62 | 63 | if (utilization >= 100 && idx < max_idx - 2) { 64 | details[idx++] = '1'; 65 | details[idx++] = '0'; 66 | details[idx++] = '0'; 67 | } else if (utilization >= 10 && idx < max_idx - 1) { 68 | details[idx++] = '0' + (utilization / 10); 69 | details[idx++] = '0' + (utilization % 10); 70 | } else if (idx < max_idx) { 71 | details[idx++] = '0' + utilization; 72 | } 73 | 74 | if (idx < max_idx) details[idx++] = '%'; 75 | details[idx < MAX_STRING_LEN ? 
idx : max_idx] = '\0'; 76 | 77 | bpf_ringbuf_output(&events, e, sizeof(*e), 0); 78 | 79 | u32 alert_level = check_alert_threshold(utilization); 80 | if (alert_level > 0) { 81 | bpf_map_update_elem(&cgroup_alerts, &cgroup_id, &alert_level, BPF_ANY); 82 | } else { 83 | bpf_map_delete_elem(&cgroup_alerts, &cgroup_id); 84 | } 85 | } -------------------------------------------------------------------------------- /internal/diagnose/analyzer/network.go: -------------------------------------------------------------------------------- 1 | package analyzer 2 | 3 | import ( 4 | "sort" 5 | 6 | "github.com/podtrace/podtrace/internal/config" 7 | "github.com/podtrace/podtrace/internal/events" 8 | ) 9 | 10 | func AnalyzeTCP(events []*events.Event, rttSpikeThreshold float64) (avgRTT, maxRTT float64, spikes int, p50, p95, p99 float64, errors int, totalBytes, avgBytes, peakBytes uint64) { 11 | var totalRTT float64 12 | var rtts []float64 13 | maxRTT = 0 14 | spikes = 0 15 | errors = 0 16 | totalBytes = 0 17 | peakBytes = 0 18 | 19 | for _, e := range events { 20 | rttMs := float64(e.LatencyNS) / float64(config.NSPerMS) 21 | rtts = append(rtts, rttMs) 22 | totalRTT += rttMs 23 | if rttMs > maxRTT { 24 | maxRTT = rttMs 25 | } 26 | if rttMs > rttSpikeThreshold { 27 | spikes++ 28 | } 29 | if e.Error < 0 && e.Error != -config.EAGAIN { 30 | errors++ 31 | } 32 | if e.Bytes > 0 && e.Bytes < uint64(config.MaxBytesForBandwidth) { 33 | totalBytes += e.Bytes 34 | if e.Bytes > peakBytes { 35 | peakBytes = e.Bytes 36 | } 37 | } 38 | } 39 | 40 | if len(events) > 0 { 41 | avgRTT = totalRTT / float64(len(events)) 42 | sort.Float64s(rtts) 43 | p50 = Percentile(rtts, 50) 44 | p95 = Percentile(rtts, 95) 45 | p99 = Percentile(rtts, 99) 46 | if totalBytes > 0 { 47 | avgBytes = totalBytes / uint64(len(events)) 48 | } 49 | } 50 | return 51 | } 52 | 53 | func AnalyzeConnections(events []*events.Event) (avgLatency, maxLatency float64, errors int, p50, p95, p99 float64, topTargets []TargetCount, errorBreakdown map[int32]int) { 54 | var totalLatency float64 55 | var latencies []float64 56 | maxLatency = 0 57 | errors = 0 58 | targetMap := make(map[string]int) 59 | errorBreakdown = make(map[int32]int) 60 | 61 | for _, e := range events { 62 | latencyMs := float64(e.LatencyNS) / float64(config.NSPerMS) 63 | latencies = append(latencies, latencyMs) 64 | totalLatency += latencyMs 65 | if latencyMs > maxLatency { 66 | maxLatency = latencyMs 67 | } 68 | if e.Error != 0 { 69 | errors++ 70 | errorBreakdown[e.Error]++ 71 | } 72 | if e.Target != "" && e.Target != "?" && e.Target != "unknown" && e.Target != "file" { 73 | targetMap[e.Target]++ 74 | } 75 | } 76 | 77 | if len(events) > 0 { 78 | avgLatency = totalLatency / float64(len(events)) 79 | sort.Float64s(latencies) 80 | p50 = Percentile(latencies, 50) 81 | p95 = Percentile(latencies, 95) 82 | p99 = Percentile(latencies, 99) 83 | } 84 | 85 | for target, count := range targetMap { 86 | topTargets = append(topTargets, TargetCount{target, count}) 87 | } 88 | sort.Slice(topTargets, func(i, j int) bool { 89 | return topTargets[i].Count > topTargets[j].Count 90 | }) 91 | 92 | return 93 | } 94 | -------------------------------------------------------------------------------- /test/run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Automated test runner for podtrace 3 | 4 | set -e 5 | 6 | NAMESPACE="podtrace-test" 7 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 8 | PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" 9 | 10 | GREEN='\033[0;32m' 11 | YELLOW='\033[1;33m' 12 | RED='\033[0;31m' 13 | BLUE='\033[0;34m' 14 | NC='\033[0m' 15 | 16 | print_header() { 17 | echo -e "${BLUE}=== podtrace Test Runner ===${NC}" 18 | echo "" 19 | } 20 | 21 | check_dependencies() { 22 | if ! command -v kubectl &>/dev/null; then 23 | echo -e "${RED}Error: kubectl is not installed${NC}" 24 | exit 1 25 | fi 26 | 27 | if ! kubectl cluster-info &>/dev/null; then 28 | echo -e "${RED}Error: Cannot connect to Kubernetes cluster${NC}" 29 | exit 1 30 | fi 31 | 32 | if [[ ! -f "${PROJECT_ROOT}/bin/podtrace" ]]; then 33 | echo -e "${RED}Error: podtrace binary not found. Run 'make build' first.${NC}" 34 | exit 1 35 | fi 36 | } 37 | 38 | setup_test_environment() { 39 | echo -e "${YELLOW}[1/4] Setting up test pods...${NC}" 40 | "${SCRIPT_DIR}/setup-test-pods.sh" >/dev/null 2>&1 41 | } 42 | 43 | wait_for_pods() { 44 | echo -e "${YELLOW}[2/4] Waiting for pods to be active...${NC}" 45 | sleep 10 46 | echo "" 47 | } 48 | 49 | run_test() { 50 | local test_name="$1" 51 | local pod_name="$2" 52 | local duration="$3" 53 | 54 | echo -e "${BLUE}${test_name}${NC}" 55 | echo "Running: sudo ${PROJECT_ROOT}/bin/podtrace -n ${NAMESPACE} ${pod_name} --diagnose ${duration}" 56 | echo "" 57 | 58 | local test_output 59 | local test_exit_code 60 | set +e 61 | test_output=$(sudo "${PROJECT_ROOT}/bin/podtrace" -n "${NAMESPACE}" "${pod_name}" --diagnose "${duration}" 2>&1 | head -30 || true) 62 | test_exit_code=${PIPESTATUS[0]} 63 | set -e 64 | echo "${test_output}" 65 | if [[ ${test_exit_code} -eq 0 ]]; then 66 | echo -e "${GREEN}✓ ${test_name} passed${NC}" 67 | else 68 | echo -e "${RED}✗ ${test_name} failed${NC}" 69 | fi 70 | 71 | echo "" 72 | } 73 | 74 | cleanup_test_environment() { 75 | echo -e "${YELLOW}[4/4] Cleaning up...${NC}" 76 | "${SCRIPT_DIR}/cleanup-test-pods.sh" >/dev/null 2>&1 77 | echo "" 78 | } 79 | 80 | print_footer() { 81 | echo -e "${GREEN}=== Tests completed ===${NC}" 82 | } 83 | 84 | main() { 85 | print_header 86 | check_dependencies 87 | setup_test_environment 88 | wait_for_pods 89 | 90 | echo -e "${YELLOW}[3/4] Running tests...${NC}" 91 | echo "" 92 | 93 | run_test "Test 1: Basic tracing (nginx-test)" "nginx-test" "5s" 94 | run_test "Test 2: Diagnose mode (busybox-test)" "busybox-test" "10s" 95 | 96 | cleanup_test_environment 97 | print_footer 98 | } 99 | 100 | main "$@" 101 | -------------------------------------------------------------------------------- /internal/diagnose/profiling/cpu_profiling_test.go: -------------------------------------------------------------------------------- 1 | package profiling 2 | 3 | import ( 4 | "os" 5 | "strings" 6 | "testing" 7 | "time" 8 | 9 | "github.com/podtrace/podtrace/internal/events" 10 | ) 11 | 12 | func TestGenerateCPUUsageReport(t *testing.T) { 13 | duration := 10 * time.Second 14 | 15 | var testEvents []*events.Event 16 | report := GenerateCPUUsageReport(testEvents, duration) 17 | if report == "" { 18 | t.Error("GenerateCPUUsageReport should return a report even with no events") 19 | } 20 | if !strings.Contains(report, "CPU Usage by Process") { 21 | t.Error("Report should contain 'CPU Usage by Process'") 22 | } 23 | 24 | testEvents = []*events.Event{ 25 | { 26 | PID: 1, 27 | ProcessName: "init", 28 | Type: events.EventDNS, 29 | Timestamp: uint64(time.Now().UnixNano()), 30 | }, 31 | { 32 | PID: 1, 33 | ProcessName: "init", 34 | Type: events.EventConnect, 35 | Timestamp: uint64(time.Now().UnixNano()), 36 | }, 37 | } 38 | 39 | selfPID := uint32(os.Getpid()) 40 | if selfPID > 1 { 41 
| testEvents = append(testEvents, &events.Event{ 42 | PID: selfPID, 43 | ProcessName: "test-process", 44 | Type: events.EventTCPSend, 45 | Timestamp: uint64(time.Now().UnixNano()), 46 | }) 47 | } 48 | 49 | report = GenerateCPUUsageReport(testEvents, duration) 50 | if !strings.Contains(report, "CPU Usage by Process") { 51 | t.Error("Report should contain 'CPU Usage by Process'") 52 | } 53 | if strings.Contains(report, "Pod Processes") || strings.Contains(report, "System/Kernel Processes") { 54 | } else { 55 | if !strings.Contains(report, "No CPU events") && !strings.Contains(report, "Total CPU usage") { 56 | t.Error("Report should contain either process information or indicate no events") 57 | } 58 | } 59 | } 60 | 61 | 62 | func TestCPUUsageReportWithKernelThreads(t *testing.T) { 63 | duration := 10 * time.Second 64 | 65 | testEvents := []*events.Event{ 66 | { 67 | PID: 1, 68 | ProcessName: "init", 69 | Type: events.EventDNS, 70 | Timestamp: uint64(time.Now().UnixNano()), 71 | }, 72 | { 73 | PID: 2, 74 | ProcessName: "kthreadd", 75 | Type: events.EventSchedSwitch, 76 | Timestamp: uint64(time.Now().UnixNano()), 77 | }, 78 | } 79 | 80 | report := GenerateCPUUsageReport(testEvents, duration) 81 | if !strings.Contains(report, "CPU Usage by Process") { 82 | t.Error("Report should contain 'CPU Usage by Process'") 83 | } 84 | if !strings.Contains(report, "Pod Processes") && !strings.Contains(report, "System/Kernel Processes") { 85 | if !strings.Contains(report, "Total CPU usage") { 86 | t.Error("Report should contain process information or total CPU usage") 87 | } 88 | } 89 | _ = report 90 | } 91 | 92 | -------------------------------------------------------------------------------- /.github/workflows/ebpf-build.yml: -------------------------------------------------------------------------------- 1 | name: eBPF Build 2 | 3 | on: 4 | push: 5 | branches: ["**"] 6 | paths: 7 | - "bpf/**" 8 | - "**/*.c" 9 | - "go.mod" 10 | - "go.sum" 11 | - "cmd/**" 12 | - "internal/**" 13 | - "Makefile" 14 | pull_request: 15 | branches: ["**"] 16 | paths: 17 | - "bpf/**" 18 | - "**/*.c" 19 | - "go.mod" 20 | - "go.sum" 21 | - "cmd/**" 22 | - "internal/**" 23 | - "Makefile" 24 | 25 | jobs: 26 | build-ebpf: 27 | runs-on: ubuntu-latest 28 | 29 | steps: 30 | - name: Checkout 31 | uses: actions/checkout@v4 32 | 33 | - name: Set up Go 34 | uses: actions/setup-go@v4 35 | with: 36 | go-version-file: go.mod 37 | cache: true 38 | 39 | - name: Install build dependencies 40 | run: | 41 | sudo apt-get update 42 | sudo apt-get install -y clang llvm libbpf-dev linux-headers-$(uname -r) linux-tools-$(uname -r) || sudo apt-get install -y clang llvm libbpf-dev linux-headers-$(uname -r) linux-tools-generic 43 | 44 | - name: Generate vmlinux.h from BTF 45 | run: | 46 | echo "Generating vmlinux.h from BTF..." 47 | if [ -f /sys/kernel/btf/vmlinux ]; then 48 | bpftool btf dump file /sys/kernel/btf/vmlinux format c > bpf/vmlinux.h 49 | echo "vmlinux.h generated successfully" 50 | else 51 | echo "Warning: /sys/kernel/btf/vmlinux not found, using placeholder vmlinux.h" 52 | fi 53 | 54 | - name: Install Go dependencies 55 | run: | 56 | echo "Installing Go dependencies..." 57 | GOTOOLCHAIN=auto go mod download 58 | GOTOOLCHAIN=auto go mod verify 59 | 60 | - name: Compile eBPF program 61 | run: | 62 | echo "Compiling podtrace.bpf.c..." 
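# Two compile paths: when the runner kernel exposes BTF, build against the freshly generated vmlinux.h (the PODTRACE_VMLINUX_FROM_BTF define presumably tells the sources to include it); otherwise fall back to the in-tree headers alone.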
63 | if [ -f /sys/kernel/btf/vmlinux ]; then 64 | clang -O2 -g \ 65 | -target bpf \ 66 | -D__TARGET_ARCH_x86 \ 67 | -DPODTRACE_VMLINUX_FROM_BTF \ 68 | -I./bpf \ 69 | -c bpf/podtrace.bpf.c \ 70 | -o bpf/podtrace.bpf.o 71 | else 72 | clang -O2 -g \ 73 | -target bpf \ 74 | -D__TARGET_ARCH_x86 \ 75 | -I./bpf \ 76 | -c bpf/podtrace.bpf.c \ 77 | -o bpf/podtrace.bpf.o 78 | fi 79 | 80 | - name: Build Go binary 81 | run: | 82 | echo "Building Go binary..." 83 | GOTOOLCHAIN=auto go build -o bin/podtrace ./cmd/podtrace 84 | 85 | - name: List outputs 86 | run: | 87 | echo "eBPF object:" 88 | ls -l bpf/ || true 89 | echo "Go binary:" 90 | ls -l bin/ || true 91 | -------------------------------------------------------------------------------- /internal/diagnose/detector/issues.go: -------------------------------------------------------------------------------- 1 | package detector 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/podtrace/podtrace/internal/config" 7 | "github.com/podtrace/podtrace/internal/events" 8 | ) 9 | 10 | func DetectIssues(allEvents []*events.Event, errorRateThreshold, rttSpikeThreshold float64) []string { 11 | var issues []string 12 | 13 | var connectEvents []*events.Event 14 | for _, e := range allEvents { 15 | if e.Type == events.EventConnect { 16 | connectEvents = append(connectEvents, e) 17 | } 18 | } 19 | 20 | if len(connectEvents) > 0 { 21 | errors := 0 22 | for _, e := range connectEvents { 23 | if e.Error != 0 { 24 | errors++ 25 | } 26 | } 27 | errorRate := float64(errors) / float64(len(connectEvents)) * 100 28 | if errorRate > errorRateThreshold { 29 | issues = append(issues, fmt.Sprintf("High connection failure rate: %.1f%% (%d/%d) (threshold: %.1f%%)", errorRate, errors, len(connectEvents), errorRateThreshold)) 30 | } 31 | } 32 | 33 | var tcpEvents []*events.Event 34 | for _, e := range allEvents { 35 | if e.Type == events.EventTCPSend || e.Type == events.EventTCPRecv { 36 | tcpEvents = append(tcpEvents, e) 37 | } 38 | } 39 | 40 | if len(tcpEvents) > 0 { 41 | spikes := 0 42 | for _, e := range tcpEvents { 43 | if float64(e.LatencyNS)/float64(config.NSPerMS) > rttSpikeThreshold { 44 | spikes++ 45 | } 46 | } 47 | spikeRate := float64(spikes) / float64(len(tcpEvents)) * 100 48 | if spikeRate > config.SpikeRateThreshold { 49 | issues = append(issues, fmt.Sprintf("High TCP RTT spike rate: %.1f%% (%d/%d) (threshold: %.1fms)", spikeRate, spikes, len(tcpEvents), rttSpikeThreshold)) 50 | } 51 | } 52 | 53 | var resourceAlerts = make(map[string]int) 54 | for _, e := range allEvents { 55 | if e.Type == events.EventResourceLimit { 56 | utilization := uint32(e.Error) 57 | resourceType := e.TCPState 58 | 59 | var resourceName string 60 | switch resourceType { 61 | case 0: 62 | resourceName = "CPU" 63 | case 1: 64 | resourceName = "Memory" 65 | case 2: 66 | resourceName = "I/O" 67 | default: 68 | resourceName = "Resource" 69 | } 70 | 71 | key := resourceName 72 | if current, ok := resourceAlerts[key]; !ok || utilization > uint32(current) { 73 | resourceAlerts[key] = int(utilization) 74 | } 75 | } 76 | } 77 | 78 | for resourceName, maxUtil := range resourceAlerts { 79 | var severity string 80 | if maxUtil >= 95 { 81 | severity = "EMERGENCY" 82 | } else if maxUtil >= 90 { 83 | severity = "CRITICAL" 84 | } else if maxUtil >= 80 { 85 | severity = "WARNING" 86 | } 87 | 88 | if severity != "" { 89 | issues = append(issues, fmt.Sprintf("Resource limit %s: %s - %d%% utilization (threshold: 80%% warning, 90%% critical, 95%% emergency)", 90 | severity, resourceName, maxUtil)) 91 | } 92 | } 93 | 
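// Note: EVENT_RESOURCE_LIMIT reuses generic event fields; the BPF side
// (emit_resource_alert in bpf/resources.c) packs the utilization percentage
// into Error and the resource kind (0=CPU, 1=Memory, 2=I/O) into TCPState,
// which is why the loop above decodes those two fields.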
94 | return issues 95 | } 96 | -------------------------------------------------------------------------------- /internal/diagnose/analyzer/pool.go: -------------------------------------------------------------------------------- 1 | package analyzer 2 | 3 | import ( 4 | "sort" 5 | "time" 6 | 7 | "github.com/podtrace/podtrace/internal/config" 8 | "github.com/podtrace/podtrace/internal/events" 9 | ) 10 | 11 | type PoolStats struct { 12 | TotalAcquires int 13 | TotalReleases int 14 | ExhaustedCount int 15 | AvgWaitTime time.Duration 16 | MaxWaitTime time.Duration 17 | ReuseRate float64 18 | PeakConnections int 19 | AvgConnections float64 20 | P50WaitTime float64 21 | P95WaitTime float64 22 | P99WaitTime float64 23 | } 24 | 25 | func AnalyzePool(acquireEvents, releaseEvents, exhaustedEvents []*events.Event) PoolStats { 26 | stats := PoolStats{ 27 | TotalAcquires: len(acquireEvents), 28 | TotalReleases: len(releaseEvents), 29 | ExhaustedCount: len(exhaustedEvents), 30 | } 31 | 32 | if stats.TotalAcquires > 0 { 33 | stats.ReuseRate = float64(stats.TotalReleases) / float64(stats.TotalAcquires) 34 | } 35 | 36 | var waitTimes []float64 37 | var totalWaitTime time.Duration 38 | maxWaitTime := time.Duration(0) 39 | 40 | for _, e := range exhaustedEvents { 41 | waitTime := e.Latency() 42 | waitTimes = append(waitTimes, float64(waitTime.Nanoseconds())/float64(config.NSPerMS)) 43 | totalWaitTime += waitTime 44 | if waitTime > maxWaitTime { 45 | maxWaitTime = waitTime 46 | } 47 | } 48 | 49 | if stats.ExhaustedCount > 0 { 50 | stats.AvgWaitTime = totalWaitTime / time.Duration(stats.ExhaustedCount) 51 | stats.MaxWaitTime = maxWaitTime 52 | 53 | if len(waitTimes) > 0 { 54 | sort.Float64s(waitTimes) 55 | stats.P50WaitTime = Percentile(waitTimes, 50) 56 | stats.P95WaitTime = Percentile(waitTimes, 95) 57 | stats.P99WaitTime = Percentile(waitTimes, 99) 58 | } 59 | } 60 | 61 | poolTracker := make(map[string]struct { 62 | current int 63 | peak int 64 | }) 65 | 66 | for _, e := range acquireEvents { 67 | poolID := e.Target 68 | if poolID == "" { 69 | poolID = "default" 70 | } 71 | pool := poolTracker[poolID] 72 | pool.current++ 73 | if pool.current > pool.peak { 74 | pool.peak = pool.current 75 | } 76 | poolTracker[poolID] = pool 77 | } 78 | 79 | for _, e := range releaseEvents { 80 | poolID := e.Target 81 | if poolID == "" { 82 | poolID = "default" 83 | } 84 | pool := poolTracker[poolID] 85 | if pool.current > 0 { 86 | pool.current-- 87 | } 88 | poolTracker[poolID] = pool 89 | } 90 | 91 | totalPeak := 0 92 | totalCurrent := 0 93 | for _, pool := range poolTracker { 94 | if pool.peak > totalPeak { 95 | totalPeak = pool.peak 96 | } 97 | totalCurrent += pool.current 98 | } 99 | 100 | stats.PeakConnections = totalPeak 101 | if len(poolTracker) > 0 { 102 | stats.AvgConnections = float64(totalCurrent) / float64(len(poolTracker)) 103 | } 104 | 105 | return stats 106 | } 107 | -------------------------------------------------------------------------------- /internal/logger/logger.go: -------------------------------------------------------------------------------- 1 | package logger 2 | 3 | import ( 4 | "os" 5 | 6 | "go.uber.org/zap" 7 | "go.uber.org/zap/zapcore" 8 | 9 | "github.com/podtrace/podtrace/internal/alerting" 10 | "github.com/podtrace/podtrace/internal/config" 11 | ) 12 | 13 | var ( 14 | log *zap.Logger 15 | atomicLevel zap.AtomicLevel 16 | ) 17 | 18 | func init() { 19 | level := getLogLevel() 20 | atomicLevel = zap.NewAtomicLevelAt(level) 21 | encoderConfig := zap.NewProductionEncoderConfig() 22 | 
encoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder
23 | encoderConfig.EncodeLevel = zapcore.LowercaseLevelEncoder
24 |
25 | core := zapcore.NewCore(
26 | zapcore.NewJSONEncoder(encoderConfig),
27 | zapcore.AddSync(os.Stderr),
28 | atomicLevel,
29 | )
30 |
31 | log = zap.New(core, zap.AddCaller(), zap.AddStacktrace(zapcore.ErrorLevel))
32 | }
33 |
34 | func getLogLevel() zapcore.Level {
35 | levelStr := os.Getenv("PODTRACE_LOG_LEVEL")
36 | if levelStr == "" {
37 | levelStr = config.DefaultLogLevel
38 | }
39 | return parseLogLevel(levelStr)
40 | }
41 |
42 | func Debug(msg string, fields ...zap.Field) {
43 | log.Debug(msg, fields...)
44 | }
45 |
46 | func Info(msg string, fields ...zap.Field) {
47 | log.Info(msg, fields...)
48 | }
49 |
50 | func Warn(msg string, fields ...zap.Field) {
51 | log.Warn(msg, fields...)
52 | manager := alerting.GetGlobalManager()
53 | if manager != nil {
54 | if alert := alerting.CreateAlertFromLog(zapcore.WarnLevel, msg, fields, "", ""); alert != nil {
55 | manager.SendAlert(alert)
56 | }
57 | }
58 | }
59 |
60 | func Error(msg string, fields ...zap.Field) {
61 | log.Error(msg, fields...)
62 | manager := alerting.GetGlobalManager()
63 | if manager != nil {
64 | if alert := alerting.CreateAlertFromLog(zapcore.ErrorLevel, msg, fields, "", ""); alert != nil {
65 | manager.SendAlert(alert)
66 | }
67 | }
68 | }
69 |
70 | func Fatal(msg string, fields ...zap.Field) {
71 | manager := alerting.GetGlobalManager()
72 | if manager != nil {
73 | if alert := alerting.CreateAlertFromLog(zapcore.FatalLevel, msg, fields, "", ""); alert != nil {
74 | manager.SendAlert(alert)
75 | }
76 | }
77 | log.Fatal(msg, fields...) // last: zap's Fatal calls os.Exit, so code after it never runs
78 | }
79 |
80 | func Logger() *zap.Logger {
81 | return log
82 | }
83 |
84 | func Sync() {
85 | _ = log.Sync()
86 | }
87 |
88 | func SetLevel(levelStr string) {
89 | level := parseLogLevel(levelStr)
90 | atomicLevel.SetLevel(level)
91 | }
92 |
93 | func parseLogLevel(levelStr string) zapcore.Level {
94 | switch levelStr {
95 | case "debug":
96 | return zapcore.DebugLevel
97 | case "info":
98 | return zapcore.InfoLevel
99 | case "warn":
100 | return zapcore.WarnLevel
101 | case "error":
102 | return zapcore.ErrorLevel
103 | case "fatal":
104 | return zapcore.FatalLevel
105 | default:
106 | return zapcore.InfoLevel
107 | }
108 | }
109 |
--------------------------------------------------------------------------------
/internal/diagnose/formatter/formatter.go:
--------------------------------------------------------------------------------
1 | package formatter
2 |
3 | import (
4 | "fmt"
5 | "sort"
6 |
7 | "github.com/podtrace/podtrace/internal/config"
8 | "github.com/podtrace/podtrace/internal/diagnose/analyzer"
9 | )
10 |
11 | func SectionHeader(title string) string {
12 | return fmt.Sprintf("%s Statistics:\n", title)
13 | }
14 |
15 | func TotalWithRate(label string, count int, rate float64) string {
16 | return fmt.Sprintf(" Total %s: %d (%.1f/sec)\n", label, count, rate)
17 | }
18 |
19 | func LatencyMetrics(avgLatency, maxLatency float64) string {
20 | return fmt.Sprintf(" Average latency: %.2fms\n Max latency: %.2fms\n", avgLatency, maxLatency)
21 | }
22 |
23 | func Percentiles(p50, p95, p99 float64) string {
24 | return fmt.Sprintf(" Percentiles: P50=%.2fms, P95=%.2fms, P99=%.2fms\n", p50, p95, p99)
25 | }
26 |
27 | func ErrorRate(errors, total int) string {
28 | if total == 0 {
29 | return fmt.Sprintf(" Errors: %d (0.0%%)\n", errors)
30 | }
31 | return fmt.Sprintf(" Errors: %d (%.1f%%)\n", errors, float64(errors)*float64(config.Percent100)/float64(total))
32 |
} 33 | 34 | func TopTargets(targets []analyzer.TargetCount, limit int, headerLabel, countLabel string) string { 35 | if len(targets) == 0 { 36 | return "" 37 | } 38 | var result string 39 | result += fmt.Sprintf(" Top %s:\n", headerLabel) 40 | for i, target := range targets { 41 | if i >= limit { 42 | break 43 | } 44 | result += fmt.Sprintf(" - %s (%d %s)\n", target.Target, target.Count, countLabel) 45 | } 46 | return result 47 | } 48 | 49 | func BytesSection(totalBytes, avgBytes uint64, throughput uint64) string { 50 | if totalBytes == 0 { 51 | return "" 52 | } 53 | var result string 54 | result += fmt.Sprintf(" Total bytes transferred: %s\n", analyzer.FormatBytes(totalBytes)) 55 | result += fmt.Sprintf(" Average bytes per operation: %s\n", analyzer.FormatBytes(avgBytes)) 56 | if throughput > 0 { 57 | result += fmt.Sprintf(" Average throughput: %s/sec\n", analyzer.FormatBytes(throughput)) 58 | } 59 | return result 60 | } 61 | 62 | func Rate(count int, duration float64) string { 63 | if duration > 0 { 64 | return fmt.Sprintf(" (%.1f/sec)", float64(count)/duration) 65 | } 66 | return "" 67 | } 68 | 69 | func TopItems(items map[string]int, limit int, headerLabel, itemLabel string) string { 70 | if len(items) == 0 { 71 | return "" 72 | } 73 | type itemCount struct { 74 | name string 75 | count int 76 | } 77 | var itemCounts []itemCount 78 | for name, count := range items { 79 | itemCounts = append(itemCounts, itemCount{name: name, count: count}) 80 | } 81 | sort.Slice(itemCounts, func(i, j int) bool { 82 | return itemCounts[i].count > itemCounts[j].count 83 | }) 84 | var result string 85 | result += fmt.Sprintf(" Top %s:\n", headerLabel) 86 | for i, ic := range itemCounts { 87 | if i >= limit { 88 | break 89 | } 90 | result += fmt.Sprintf(" - %s (%d %s)\n", ic.name, ic.count, itemLabel) 91 | } 92 | return result 93 | } 94 | 95 | -------------------------------------------------------------------------------- /cmd/podtrace/export_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "os" 7 | "strings" 8 | "testing" 9 | 10 | "github.com/podtrace/podtrace/internal/diagnose" 11 | "github.com/podtrace/podtrace/internal/events" 12 | ) 13 | 14 | func TestExportReport_JSON(t *testing.T) { 15 | d := diagnose.NewDiagnostician() 16 | d.AddEvent(&events.Event{Type: events.EventDNS, LatencyNS: 5000000, Target: "example.com"}) 17 | d.Finish() 18 | 19 | originalStdout := os.Stdout 20 | r, w, _ := os.Pipe() 21 | os.Stdout = w 22 | 23 | err := exportReport("test report", "json", d) 24 | _ = w.Close() 25 | os.Stdout = originalStdout 26 | 27 | if err == nil { 28 | var buf bytes.Buffer 29 | _, _ = io.Copy(&buf, r) 30 | t.Logf("JSON export test completed, output length: %d", buf.Len()) 31 | } 32 | } 33 | 34 | func TestExportReport_CSV(t *testing.T) { 35 | d := diagnose.NewDiagnostician() 36 | d.AddEvent(&events.Event{Type: events.EventDNS, LatencyNS: 5000000, Target: "example.com"}) 37 | d.Finish() 38 | 39 | var buf bytes.Buffer 40 | originalStdout := os.Stdout 41 | r, w, _ := os.Pipe() 42 | os.Stdout = w 43 | 44 | err := exportReport("test report", "csv", d) 45 | _ = w.Close() 46 | os.Stdout = originalStdout 47 | 48 | if err == nil { 49 | _, _ = io.Copy(&buf, r) 50 | t.Logf("CSV export test completed, output length: %d", buf.Len()) 51 | } 52 | } 53 | 54 | func TestExportReport_InvalidFormat(t *testing.T) { 55 | d := diagnose.NewDiagnostician() 56 | err := exportReport("test report", "invalid", d) 57 | 58 | if err == 
nil { 59 | t.Error("Expected error for invalid format") 60 | } 61 | 62 | if err != nil && !strings.Contains(err.Error(), "unsupported") { 63 | t.Errorf("Expected error message to contain 'unsupported', got: %v", err) 64 | } 65 | } 66 | 67 | func TestExportReport_FormatVariations(t *testing.T) { 68 | d := diagnose.NewDiagnostician() 69 | d.AddEvent(&events.Event{Type: events.EventDNS, LatencyNS: 5000000, Target: "example.com"}) 70 | d.Finish() 71 | 72 | tests := []struct { 73 | name string 74 | format string 75 | expectError bool 76 | }{ 77 | {"uppercase JSON", "JSON", false}, 78 | {"uppercase CSV", "CSV", false}, 79 | {"mixed case", "Json", false}, 80 | {"with spaces", " json ", false}, 81 | {"invalid format", "xml", true}, 82 | } 83 | 84 | for _, tt := range tests { 85 | t.Run(tt.name, func(t *testing.T) { 86 | originalStdout := os.Stdout 87 | r, w, _ := os.Pipe() 88 | os.Stdout = w 89 | 90 | err := exportReport("test report", tt.format, d) 91 | _ = w.Close() 92 | os.Stdout = originalStdout 93 | 94 | if tt.expectError && err == nil { 95 | t.Error("Expected error but got none") 96 | } 97 | if !tt.expectError && err != nil { 98 | t.Errorf("Unexpected error: %v", err) 99 | } 100 | if !tt.expectError { 101 | _, _ = io.Copy(io.Discard, r) 102 | } 103 | }) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /test/test-pods-full.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: podtrace-test 5 | --- 6 | apiVersion: v1 7 | kind: Pod 8 | metadata: 9 | name: nginx-cpu-test 10 | namespace: podtrace-test 11 | labels: 12 | app: nginx-cpu 13 | spec: 14 | containers: 15 | - name: nginx 16 | image: nginx:latest 17 | resources: 18 | requests: 19 | cpu: "100m" 20 | memory: "128Mi" 21 | limits: 22 | cpu: "500m" 23 | memory: "256Mi" 24 | command: 25 | - sh 26 | - -c 27 | - | 28 | nginx # nginx daemonizes here; the loop below keeps the container alive and generates CPU load 29 | while true; do 30 | for i in $(seq 1 1000); do # {1..1000} is a bashism; the image's /bin/sh is POSIX and needs seq 31 | echo "CPU test $i" > /dev/null 32 | done 33 | sleep 0.1 34 | done 35 | --- 36 | apiVersion: v1 37 | kind: Pod 38 | metadata: 39 | name: network-test 40 | namespace: podtrace-test 41 | labels: 42 | app: network-test 43 | spec: 44 | containers: 45 | - name: network 46 | image: curlimages/curl:latest 47 | command: 48 | - sh 49 | - -c 50 | - | 51 | while true; do 52 | nslookup google.com || true 53 | curl -s -o /dev/null http://httpbin.org/get || true 54 | sleep 1 55 | done 56 | --- 57 | apiVersion: v1 58 | kind: Pod 59 | metadata: 60 | name: io-test 61 | namespace: podtrace-test 62 | labels: 63 | app: io-test 64 | spec: 65 | containers: 66 | - name: io 67 | image: busybox:latest 68 | command: 69 | - sh 70 | - -c 71 | - | 72 | while true; do 73 | echo "I/O test $(date)" >> /tmp/io-test.log 74 | sync 75 | sleep 0.5 76 | done 77 | --- 78 | apiVersion: v1 79 | kind: Pod 80 | metadata: 81 | name: memory-test 82 | namespace: podtrace-test 83 | labels: 84 | app: memory-test 85 | spec: 86 | containers: 87 | - name: memory 88 | image: python:3.9-slim 89 | command: 90 | - python3 91 | - -c 92 | - | 93 | import time 94 | data = [] 95 | while True: 96 | data.append(bytearray(1024 * 1024)) 97 | time.sleep(0.1) 98 | if len(data) > 10: 99 | data.pop(0) 100 | --- 101 | apiVersion: v1 102 | kind: Pod 103 | metadata: 104 | name: multithread-test 105 | namespace: podtrace-test 106 | labels: 107 | app: multithread-test 108 | spec: 109 | containers: 110 | - name: multithread 111 | image: python:3.9-slim 112 | command: 113 | - python3 114 | - -c 115 |
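# The block scalar below is handed to "python3 -c" as one inline script: four daemon threads burn CPU in short bursts while the main thread sleeps to keep the pod alive.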
- | 116 | import threading 117 | import time 118 | 119 | def cpu_worker(worker_id): 120 | while True: 121 | sum(range(10000)) 122 | time.sleep(0.01) 123 | 124 | threads = [] 125 | for i in range(4): 126 | t = threading.Thread(target=cpu_worker, args=(i,)) 127 | t.daemon = True 128 | t.start() 129 | threads.append(t) 130 | 131 | while True: 132 | time.sleep(1) 133 | -------------------------------------------------------------------------------- /internal/kubernetes/service_resolver.go: -------------------------------------------------------------------------------- 1 | package kubernetes 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "sync" 7 | "time" 8 | 9 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 10 | "k8s.io/client-go/kubernetes" 11 | ) 12 | 13 | type ServiceInfo struct { 14 | Name string 15 | Namespace string 16 | Port int 17 | } 18 | 19 | type ServiceResolver struct { 20 | clientset kubernetes.Interface 21 | endpointCache *sync.Map 22 | cacheTTL time.Duration 23 | informerCache *InformerCache 24 | } 25 | 26 | type endpointCacheEntry struct { 27 | serviceInfo ServiceInfo 28 | expiresAt time.Time 29 | } 30 | 31 | func NewServiceResolver(clientset kubernetes.Interface) *ServiceResolver { 32 | return NewServiceResolverWithCache(clientset, nil) 33 | } 34 | 35 | func NewServiceResolverWithCache(clientset kubernetes.Interface, ic *InformerCache) *ServiceResolver { 36 | ttl := time.Duration(getIntEnvOrDefault("PODTRACE_K8S_CACHE_TTL", 300)) * time.Second 37 | return &ServiceResolver{ 38 | clientset: clientset, 39 | endpointCache: &sync.Map{}, 40 | cacheTTL: ttl, 41 | informerCache: ic, 42 | } 43 | } 44 | 45 | func (sr *ServiceResolver) ResolveService(ctx context.Context, ip string, port int) *ServiceInfo { 46 | if ip == "" || port == 0 || sr.clientset == nil { 47 | if sr.informerCache != nil && ip != "" { 48 | return sr.informerCache.GetServiceByEndpoint(ip, port) 49 | } 50 | return nil 51 | } 52 | 53 | if sr.informerCache != nil { 54 | if svc := sr.informerCache.GetServiceByEndpoint(ip, port); svc != nil { 55 | return svc 56 | } 57 | } 58 | 59 | cacheKey := fmt.Sprintf("%s:%d", ip, port) 60 | if cached, ok := sr.endpointCache.Load(cacheKey); ok { 61 | entry := cached.(*endpointCacheEntry) 62 | if time.Now().Before(entry.expiresAt) { 63 | return &entry.serviceInfo 64 | } 65 | sr.endpointCache.Delete(cacheKey) 66 | } 67 | 68 | serviceInfo := sr.fetchServiceByEndpoint(ctx, ip, port) 69 | if serviceInfo != nil { 70 | sr.endpointCache.Store(cacheKey, &endpointCacheEntry{ 71 | serviceInfo: *serviceInfo, 72 | expiresAt: time.Now().Add(sr.cacheTTL), 73 | }) 74 | } 75 | 76 | return serviceInfo 77 | } 78 | 79 | func (sr *ServiceResolver) fetchServiceByEndpoint(ctx context.Context, ip string, port int) *ServiceInfo { 80 | endpointsList, err := sr.clientset.CoreV1().Endpoints(metav1.NamespaceAll).List(ctx, metav1.ListOptions{}) 81 | if err != nil { 82 | return nil 83 | } 84 | 85 | for _, endpoint := range endpointsList.Items { 86 | for _, subset := range endpoint.Subsets { 87 | for _, addr := range subset.Addresses { 88 | if addr.IP == ip { 89 | for _, epPort := range subset.Ports { 90 | if int(epPort.Port) == port { 91 | return &ServiceInfo{ 92 | Name: endpoint.Name, 93 | Namespace: endpoint.Namespace, 94 | Port: port, 95 | } 96 | } 97 | } 98 | } 99 | } 100 | } 101 | } 102 | 103 | return nil 104 | } 105 | -------------------------------------------------------------------------------- /internal/alerting/webhook.go: -------------------------------------------------------------------------------- 1 | 
package alerting 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "encoding/json" 7 | "fmt" 8 | "io" 9 | "net/http" 10 | "net/url" 11 | "strings" 12 | "time" 13 | 14 | "github.com/podtrace/podtrace/internal/config" 15 | ) 16 | 17 | type WebhookSender struct { 18 | url string 19 | client *http.Client 20 | timeout time.Duration 21 | } 22 | 23 | func NewWebhookSender(webhookURL string, timeout time.Duration) (*WebhookSender, error) { 24 | if webhookURL == "" { 25 | return nil, fmt.Errorf("webhook URL is required") 26 | } 27 | parsedURL, err := url.Parse(webhookURL) 28 | if err != nil { 29 | return nil, fmt.Errorf("invalid webhook URL: %w", err) 30 | } 31 | if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" { 32 | return nil, fmt.Errorf("webhook URL must use http or https scheme") 33 | } 34 | host := strings.ToLower(parsedURL.Hostname()) 35 | if host != "localhost" && host != "127.0.0.1" && host != "::1" { 36 | if parsedURL.Scheme == "http" { 37 | return nil, fmt.Errorf("non-localhost URLs must use https") 38 | } 39 | } 40 | return &WebhookSender{ 41 | url: webhookURL, 42 | client: &http.Client{Timeout: timeout}, 43 | timeout: timeout, 44 | }, nil 45 | } 46 | 47 | func (w *WebhookSender) Send(ctx context.Context, alert *Alert) error { 48 | if alert == nil { 49 | return fmt.Errorf("alert is nil") 50 | } 51 | payload := map[string]interface{}{ 52 | "severity": string(alert.Severity), 53 | "title": alert.Title, 54 | "message": alert.Message, 55 | "timestamp": alert.Timestamp.Format(time.RFC3339), 56 | "source": alert.Source, 57 | "pod": alert.PodName, 58 | "namespace": alert.Namespace, 59 | "context": alert.Context, 60 | } 61 | if alert.ErrorCode != "" { 62 | payload["error_code"] = alert.ErrorCode 63 | } 64 | if len(alert.Recommendations) > 0 { 65 | payload["recommendations"] = alert.Recommendations 66 | } 67 | jsonData, err := json.Marshal(payload) 68 | if err != nil { 69 | return fmt.Errorf("failed to marshal alert: %w", err) 70 | } 71 | if int64(len(jsonData)) > config.AlertMaxPayloadSize { 72 | return fmt.Errorf("payload size %d exceeds maximum %d", len(jsonData), config.AlertMaxPayloadSize) 73 | } 74 | req, err := http.NewRequestWithContext(ctx, "POST", w.url, bytes.NewReader(jsonData)) 75 | if err != nil { 76 | return fmt.Errorf("failed to create request: %w", err) 77 | } 78 | req.Header.Set("Content-Type", "application/json") 79 | req.Header.Set("User-Agent", config.GetUserAgent()) 80 | resp, err := w.client.Do(req) 81 | if err != nil { 82 | return fmt.Errorf("failed to send request: %w", err) 83 | } 84 | defer func() { 85 | _, _ = io.Copy(io.Discard, resp.Body) 86 | _ = resp.Body.Close() 87 | }() 88 | if resp.StatusCode < 200 || resp.StatusCode >= 300 { 89 | bodyBytes, _ := io.ReadAll(io.LimitReader(resp.Body, 512)) 90 | return fmt.Errorf("unexpected status code %d: %s", resp.StatusCode, string(bodyBytes)) 91 | } 92 | return nil 93 | } 94 | 95 | func (w *WebhookSender) Name() string { 96 | return "webhook" 97 | } 98 | -------------------------------------------------------------------------------- /cmd/podtrace/main_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | func TestMain_CommandExecution(t *testing.T) { 10 | if testing.Short() { 11 | t.Skip("Skipping main function test in short mode") 12 | } 13 | 14 | origArgs := os.Args 15 | defer func() { os.Args = origArgs }() 16 | 17 | os.Args = []string{"podtrace", "--help"} 18 | 19 | oldExit := exitFunc 20 | 
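// exitFunc is assumed to be a package-level func(int) seam (likely defined in mocks.go); swapping it out lets the test observe exit requests without terminating the test binary.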
exited := false 21 | exitFunc = func(code int) { 22 | exited = true 23 | } 24 | defer func() { exitFunc = oldExit }() 25 | 26 | done := make(chan bool, 1) 27 | go func() { 28 | main() 29 | done <- true 30 | }() 31 | 32 | select { 33 | case <-done: 34 | if !exited { 35 | t.Log("main function executed (help command)") 36 | } 37 | case <-time.After(1 * time.Second): 38 | t.Log("main function test completed") 39 | } 40 | } 41 | 42 | func TestMain_InvalidArgs(t *testing.T) { 43 | if testing.Short() { 44 | t.Skip("Skipping main function test in short mode") 45 | } 46 | 47 | origArgs := os.Args 48 | defer func() { os.Args = origArgs }() 49 | 50 | os.Args = []string{"podtrace"} 51 | 52 | oldExit := exitFunc 53 | exited := false 54 | exitFunc = func(code int) { 55 | exited = true 56 | } 57 | defer func() { exitFunc = oldExit }() 58 | 59 | done := make(chan bool, 1) 60 | go func() { 61 | main() 62 | done <- true 63 | }() 64 | 65 | select { 66 | case <-done: 67 | if !exited { 68 | t.Log("main function executed (invalid args)") 69 | } 70 | case <-time.After(1 * time.Second): 71 | t.Log("main function test completed") 72 | } 73 | } 74 | 75 | func TestMain_LogLevel(t *testing.T) { 76 | origLogLevel := logLevel 77 | origArgs := os.Args 78 | defer func() { 79 | logLevel = origLogLevel 80 | os.Args = origArgs 81 | }() 82 | 83 | os.Args = []string{"podtrace", "--log-level", "debug", "--help"} 84 | 85 | oldExit := exitFunc 86 | exitFunc = func(code int) { 87 | } 88 | defer func() { exitFunc = oldExit }() 89 | 90 | done := make(chan bool, 1) 91 | go func() { 92 | main() 93 | done <- true 94 | }() 95 | 96 | select { 97 | case <-done: 98 | t.Log("main function executed with log level") 99 | case <-time.After(1 * time.Second): 100 | t.Log("main function test completed") 101 | } 102 | } 103 | 104 | func TestMain_CommandError(t *testing.T) { 105 | if testing.Short() { 106 | t.Skip("Skipping main function test in short mode") 107 | } 108 | 109 | origArgs := os.Args 110 | defer func() { os.Args = origArgs }() 111 | 112 | os.Args = []string{"podtrace", "test-pod", "--invalid-flag"} 113 | 114 | oldExit := exitFunc 115 | exited := false 116 | exitFunc = func(code int) { 117 | exited = true 118 | } 119 | defer func() { exitFunc = oldExit }() 120 | 121 | done := make(chan bool, 1) 122 | go func() { 123 | main() 124 | done <- true 125 | }() 126 | 127 | select { 128 | case <-done: 129 | if !exited { 130 | t.Log("main function executed (command error)") 131 | } 132 | case <-time.After(1 * time.Second): 133 | t.Log("main function test completed") 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /internal/ebpf/loader/errors_test.go: -------------------------------------------------------------------------------- 1 | package loader 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | ) 7 | 8 | func TestLoaderError_Error_WithErr(t *testing.T) { 9 | originalErr := errors.New("underlying error") 10 | loaderErr := &LoaderError{ 11 | Code: ErrCodeLoadFailed, 12 | Message: "test message", 13 | Err: originalErr, 14 | } 15 | 16 | errStr := loaderErr.Error() 17 | if errStr == "" { 18 | t.Error("Error() should return non-empty string") 19 | } 20 | if !contains(errStr, "test message") { 21 | t.Errorf("Expected error string to contain 'test message', got %q", errStr) 22 | } 23 | if !contains(errStr, "underlying error") { 24 | t.Errorf("Expected error string to contain 'underlying error', got %q", errStr) 25 | } 26 | } 27 | 28 | func TestLoaderError_Error_WithoutErr(t *testing.T) { 29 | 
loaderErr := &LoaderError{ 30 | Code: ErrCodeLoadFailed, 31 | Message: "test message", 32 | Err: nil, 33 | } 34 | 35 | errStr := loaderErr.Error() 36 | if errStr != "test message" { 37 | t.Errorf("Expected error string 'test message', got %q", errStr) 38 | } 39 | } 40 | 41 | func TestLoaderError_Unwrap(t *testing.T) { 42 | originalErr := errors.New("underlying error") 43 | loaderErr := &LoaderError{ 44 | Code: ErrCodeLoadFailed, 45 | Message: "test message", 46 | Err: originalErr, 47 | } 48 | 49 | unwrapped := loaderErr.Unwrap() 50 | if unwrapped != originalErr { 51 | t.Errorf("Expected unwrapped error to be original error, got %v", unwrapped) 52 | } 53 | } 54 | 55 | func TestLoaderError_Unwrap_Nil(t *testing.T) { 56 | loaderErr := &LoaderError{ 57 | Code: ErrCodeLoadFailed, 58 | Message: "test message", 59 | Err: nil, 60 | } 61 | 62 | unwrapped := loaderErr.Unwrap() 63 | if unwrapped != nil { 64 | t.Errorf("Expected unwrapped error to be nil, got %v", unwrapped) 65 | } 66 | } 67 | 68 | func TestNewLoadError(t *testing.T) { 69 | originalErr := errors.New("file not found") 70 | loaderErr := NewLoadError("/path/to/bpf.o", originalErr) 71 | 72 | if loaderErr == nil { 73 | t.Fatal("NewLoadError returned nil") 74 | } 75 | if loaderErr.Code != ErrCodeLoadFailed { 76 | t.Errorf("Expected error code %d, got %d", ErrCodeLoadFailed, loaderErr.Code) 77 | } 78 | if !contains(loaderErr.Message, "failed to load eBPF program") { 79 | t.Errorf("Expected message to contain 'failed to load eBPF program', got %q", loaderErr.Message) 80 | } 81 | if !contains(loaderErr.Message, "/path/to/bpf.o") { 82 | t.Errorf("Expected message to contain '/path/to/bpf.o', got %q", loaderErr.Message) 83 | } 84 | if loaderErr.Unwrap() != originalErr { 85 | t.Errorf("Expected unwrapped error to be original error") 86 | } 87 | } 88 | 89 | func contains(s, substr string) bool { 90 | return len(s) >= len(substr) && (s == substr || 91 | (len(s) > len(substr) && containsMiddle(s, substr))) 92 | } 93 | 94 | func containsMiddle(s, substr string) bool { 95 | for i := 0; i <= len(s)-len(substr); i++ { 96 | if s[i:i+len(substr)] == substr { 97 | return true 98 | } 99 | } 100 | return false 101 | } 102 | 103 | -------------------------------------------------------------------------------- /internal/tracing/extractor/http_test.go: -------------------------------------------------------------------------------- 1 | package extractor 2 | 3 | import ( 4 | "net/http" 5 | "testing" 6 | ) 7 | 8 | func TestHTTPExtractor_ExtractFromHeaders(t *testing.T) { 9 | extractor := NewHTTPExtractor() 10 | 11 | tests := []struct { 12 | name string 13 | headers map[string]string 14 | wantNil bool 15 | }{ 16 | {"W3C traceparent", map[string]string{"traceparent": "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01"}, false}, 17 | {"B3 headers", map[string]string{"x-b3-traceid": "abc", "x-b3-spanid": "def"}, false}, 18 | {"Splunk", map[string]string{"x-splunk-requestid": "req123"}, false}, 19 | {"empty", map[string]string{}, true}, 20 | {"no trace headers", map[string]string{"content-type": "application/json"}, true}, 21 | } 22 | 23 | for _, tt := range tests { 24 | t.Run(tt.name, func(t *testing.T) { 25 | tc := extractor.ExtractFromHeaders(tt.headers) 26 | if (tc == nil) != tt.wantNil { 27 | t.Errorf("ExtractFromHeaders() = %v, want nil = %v", tc, tt.wantNil) 28 | } 29 | }) 30 | } 31 | } 32 | 33 | func TestHTTPExtractor_ExtractFromHTTPRequest(t *testing.T) { 34 | extractor := NewHTTPExtractor() 35 | 36 | req, _ := http.NewRequest("GET", 
"http://example.com", nil) 37 | req.Header.Set("traceparent", "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01") 38 | 39 | tc := extractor.ExtractFromHTTPRequest(req) 40 | if tc == nil { 41 | t.Fatal("ExtractFromHTTPRequest() returned nil") 42 | } 43 | if tc.TraceID == "" { 44 | t.Error("TraceID should be extracted") 45 | } 46 | } 47 | 48 | func TestHTTPExtractor_ExtractFromHTTPResponse(t *testing.T) { 49 | extractor := NewHTTPExtractor() 50 | 51 | resp := &http.Response{ 52 | Header: make(http.Header), 53 | } 54 | resp.Header.Set("traceparent", "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01") 55 | 56 | tc := extractor.ExtractFromHTTPResponse(resp) 57 | if tc == nil { 58 | t.Fatal("ExtractFromHTTPResponse() returned nil") 59 | } 60 | if tc.TraceID == "" { 61 | t.Error("TraceID should be extracted") 62 | } 63 | } 64 | 65 | func TestHTTPExtractor_ExtractFromRawHeaders(t *testing.T) { 66 | extractor := NewHTTPExtractor() 67 | 68 | rawHeaders := "traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01\r\ncontent-type: application/json" 69 | 70 | tc := extractor.ExtractFromRawHeaders(rawHeaders) 71 | if tc == nil { 72 | t.Fatal("ExtractFromRawHeaders() returned nil") 73 | } 74 | if tc.TraceID == "" { 75 | t.Error("TraceID should be extracted") 76 | } 77 | } 78 | 79 | func TestHTTPExtractor_ExtractFromRawHeaders_Empty(t *testing.T) { 80 | extractor := NewHTTPExtractor() 81 | tc := extractor.ExtractFromRawHeaders("") 82 | if tc != nil { 83 | t.Error("ExtractFromRawHeaders(\"\") should return nil") 84 | } 85 | } 86 | 87 | func TestParseRawHeaders(t *testing.T) { 88 | raw := "header1: value1\r\nheader2: value2\r\n\r\n" 89 | headers := parseRawHeaders(raw) 90 | 91 | if len(headers) != 2 { 92 | t.Errorf("Expected 2 headers, got %d", len(headers)) 93 | } 94 | if headers["header1"] != "value1" { 95 | t.Errorf("header1 = %s, want value1", headers["header1"]) 96 | } 97 | if headers["header2"] != "value2" { 98 | t.Errorf("header2 = %s, want value2", headers["header2"]) 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /internal/ebpf/cache/lru.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "container/list" 5 | "sync" 6 | "time" 7 | 8 | "github.com/podtrace/podtrace/internal/config" 9 | "github.com/podtrace/podtrace/internal/metricsexporter" 10 | "github.com/podtrace/podtrace/internal/validation" 11 | ) 12 | 13 | type cacheEntry struct { 14 | pid uint32 15 | name string 16 | expiresAt time.Time 17 | element *list.Element 18 | } 19 | 20 | type LRUCache struct { 21 | cache map[uint32]*list.Element 22 | list *list.List 23 | maxSize int 24 | ttl time.Duration 25 | mutex sync.RWMutex 26 | stopCleanup chan struct{} 27 | } 28 | 29 | func NewLRUCache(maxSize int, ttl time.Duration) *LRUCache { 30 | c := &LRUCache{ 31 | cache: make(map[uint32]*list.Element), 32 | list: list.New(), 33 | maxSize: maxSize, 34 | ttl: ttl, 35 | stopCleanup: make(chan struct{}), 36 | } 37 | go c.cleanupExpired() 38 | return c 39 | } 40 | 41 | func (c *LRUCache) Get(pid uint32) (string, bool) { 42 | if !validation.ValidatePID(pid) { 43 | return "", false 44 | } 45 | 46 | c.mutex.Lock() 47 | defer c.mutex.Unlock() 48 | 49 | elem, ok := c.cache[pid] 50 | if !ok { 51 | return "", false 52 | } 53 | 54 | entry := elem.Value.(*cacheEntry) 55 | if time.Now().After(entry.expiresAt) { 56 | delete(c.cache, pid) 57 | c.list.Remove(elem) 58 | return "", false 59 | } 60 | 61 | 
c.list.MoveToFront(elem) 62 | metricsexporter.RecordProcessCacheHit() 63 | return entry.name, true 64 | } 65 | 66 | func (c *LRUCache) Set(pid uint32, name string) { 67 | if !validation.ValidatePID(pid) { 68 | return 69 | } 70 | 71 | c.mutex.Lock() 72 | defer c.mutex.Unlock() 73 | 74 | if elem, ok := c.cache[pid]; ok { 75 | entry := elem.Value.(*cacheEntry) 76 | entry.name = name 77 | entry.expiresAt = time.Now().Add(c.ttl) 78 | c.list.MoveToFront(elem) 79 | return 80 | } 81 | 82 | if len(c.cache) >= c.maxSize { 83 | c.evict() 84 | } 85 | 86 | entry := &cacheEntry{ 87 | pid: pid, 88 | name: name, 89 | expiresAt: time.Now().Add(c.ttl), 90 | } 91 | elem := c.list.PushFront(entry) 92 | entry.element = elem 93 | c.cache[pid] = elem 94 | } 95 | 96 | func (c *LRUCache) evict() { 97 | evictTarget := int(float64(c.maxSize) * config.CacheEvictionThreshold) 98 | for len(c.cache) >= evictTarget { 99 | back := c.list.Back() 100 | if back == nil { 101 | break 102 | } 103 | entry := back.Value.(*cacheEntry) 104 | delete(c.cache, entry.pid) 105 | c.list.Remove(back) 106 | } 107 | } 108 | 109 | func (c *LRUCache) cleanupExpired() { 110 | ticker := time.NewTicker(c.ttl / 2) 111 | defer ticker.Stop() 112 | 113 | for { 114 | select { 115 | case <-ticker.C: 116 | c.mutex.Lock() 117 | now := time.Now() 118 | var toRemove []*list.Element 119 | for _, elem := range c.cache { 120 | entry := elem.Value.(*cacheEntry) 121 | if now.After(entry.expiresAt) { 122 | toRemove = append(toRemove, elem) 123 | } 124 | } 125 | for _, elem := range toRemove { 126 | entry := elem.Value.(*cacheEntry) 127 | delete(c.cache, entry.pid) 128 | c.list.Remove(elem) 129 | } 130 | c.mutex.Unlock() 131 | case <-c.stopCleanup: 132 | return 133 | } 134 | } 135 | } 136 | 137 | func (c *LRUCache) Close() { 138 | close(c.stopCleanup) 139 | } 140 | 141 | -------------------------------------------------------------------------------- /internal/diagnose/analyzer/tls_test.go: -------------------------------------------------------------------------------- 1 | package analyzer 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/podtrace/podtrace/internal/events" 7 | ) 8 | 9 | func TestAnalyzeTLS(t *testing.T) { 10 | tests := []struct { 11 | name string 12 | events []*events.Event 13 | wantAvgLatency float64 14 | wantMaxLatency float64 15 | wantErrors int 16 | wantP50 float64 17 | wantP95 float64 18 | wantP99 float64 19 | }{ 20 | { 21 | name: "empty events", 22 | events: []*events.Event{}, 23 | wantAvgLatency: 0, 24 | wantMaxLatency: 0, 25 | wantErrors: 0, 26 | wantP50: 0, 27 | wantP95: 0, 28 | wantP99: 0, 29 | }, 30 | { 31 | name: "single successful handshake", 32 | events: []*events.Event{ 33 | { 34 | Type: events.EventTLSHandshake, 35 | LatencyNS: 100000000, 36 | Error: 0, 37 | }, 38 | }, 39 | wantAvgLatency: 100.0, 40 | wantMaxLatency: 100.0, 41 | wantErrors: 0, 42 | wantP50: 100.0, 43 | wantP95: 100.0, 44 | wantP99: 100.0, 45 | }, 46 | { 47 | name: "multiple handshakes with errors", 48 | events: []*events.Event{ 49 | { 50 | Type: events.EventTLSHandshake, 51 | LatencyNS: 50000000, 52 | Error: 0, 53 | Target: "example.com:443", 54 | }, 55 | { 56 | Type: events.EventTLSHandshake, 57 | LatencyNS: 200000000, 58 | Error: -1, 59 | Target: "bad.example.com:443", 60 | }, 61 | { 62 | Type: events.EventTLSHandshake, 63 | LatencyNS: 150000000, 64 | Error: 0, 65 | Target: "example.com:443", 66 | }, 67 | }, 68 | wantAvgLatency: 133.33, 69 | wantMaxLatency: 200.0, 70 | wantErrors: 1, 71 | wantP50: 150.0, 72 | wantP95: 150.0, 73 | wantP99: 150.0, 74 | }, 75 | 
} 76 | 77 | for _, tt := range tests { 78 | t.Run(tt.name, func(t *testing.T) { 79 | avgLatency, maxLatency, errors, p50, p95, p99, errorBreakdown, topTargets := AnalyzeTLS(tt.events) 80 | 81 | if errors != tt.wantErrors { 82 | t.Errorf("AnalyzeTLS() errors = %v, want %v", errors, tt.wantErrors) 83 | } 84 | 85 | if len(tt.events) == 0 { 86 | return 87 | } 88 | 89 | if avgLatency < tt.wantAvgLatency-1 || avgLatency > tt.wantAvgLatency+1 { 90 | t.Errorf("AnalyzeTLS() avgLatency = %v, want %v", avgLatency, tt.wantAvgLatency) 91 | } 92 | 93 | if maxLatency != tt.wantMaxLatency { 94 | t.Errorf("AnalyzeTLS() maxLatency = %v, want %v", maxLatency, tt.wantMaxLatency) 95 | } 96 | 97 | if p50 < tt.wantP50-1 || p50 > tt.wantP50+1 { 98 | t.Errorf("AnalyzeTLS() p50 = %v, want %v", p50, tt.wantP50) 99 | } 100 | 101 | if p95 < tt.wantP95-1 || p95 > tt.wantP95+1 { 102 | t.Errorf("AnalyzeTLS() p95 = %v, want %v", p95, tt.wantP95) 103 | } 104 | 105 | if p99 < tt.wantP99-1 || p99 > tt.wantP99+1 { 106 | t.Errorf("AnalyzeTLS() p99 = %v, want %v", p99, tt.wantP99) 107 | } 108 | 109 | if tt.wantErrors > 0 && len(errorBreakdown) == 0 { 110 | t.Errorf("AnalyzeTLS() expected error breakdown but got none") 111 | } 112 | 113 | if len(tt.events) > 1 && len(topTargets) == 0 { 114 | t.Errorf("AnalyzeTLS() expected top targets but got none") 115 | } 116 | }) 117 | } 118 | } 119 | 120 | -------------------------------------------------------------------------------- /internal/diagnose/tracker/pod_communication_test.go: -------------------------------------------------------------------------------- 1 | package tracker 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/podtrace/podtrace/internal/events" 8 | ) 9 | 10 | func TestPodCommunicationTracker_ProcessEvent(t *testing.T) { 11 | tracker := NewPodCommunicationTracker("source-pod", "default") 12 | 13 | event := &events.Event{ 14 | Type: events.EventConnect, 15 | Target: "10.244.1.5:8080", 16 | Timestamp: uint64(time.Now().UnixNano()), 17 | Error: 0, 18 | } 19 | 20 | k8sContext := map[string]interface{}{ 21 | "target_pod": "target-pod", 22 | "target_service": "test-service", 23 | "target_namespace": "default", 24 | } 25 | 26 | tracker.ProcessEvent(event, k8sContext) 27 | 28 | summaries := tracker.GetSummary() 29 | if len(summaries) == 0 { 30 | t.Fatal("expected at least one communication summary") 31 | } 32 | 33 | if summaries[0].Target != "test-service" { 34 | t.Errorf("expected target 'test-service', got %q", summaries[0].Target) 35 | } 36 | 37 | if summaries[0].Namespace != "default" { 38 | t.Errorf("expected namespace 'default', got %q", summaries[0].Namespace) 39 | } 40 | } 41 | 42 | func TestPodCommunicationTracker_ProcessEvent_NoContext(t *testing.T) { 43 | tracker := NewPodCommunicationTracker("source-pod", "default") 44 | 45 | event := &events.Event{ 46 | Type: events.EventConnect, 47 | Target: "10.244.1.5:8080", 48 | Timestamp: uint64(time.Now().UnixNano()), 49 | } 50 | 51 | tracker.ProcessEvent(event, nil) 52 | 53 | summaries := tracker.GetSummary() 54 | if len(summaries) != 0 { 55 | t.Errorf("expected no summaries without context, got %d", len(summaries)) 56 | } 57 | } 58 | 59 | func TestPodCommunicationTracker_ProcessEvent_NonNetwork(t *testing.T) { 60 | tracker := NewPodCommunicationTracker("source-pod", "default") 61 | 62 | event := &events.Event{ 63 | Type: events.EventRead, 64 | Target: "file.txt", 65 | Timestamp: uint64(time.Now().UnixNano()), 66 | } 67 | 68 | k8sContext := map[string]interface{}{ 69 | "target_pod": "target-pod", 70 | } 71 | 
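// EventRead is filesystem activity, not a network event, so the tracker is expected to drop it even though Kubernetes context is attached.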
72 | tracker.ProcessEvent(event, k8sContext) 73 | 74 | summaries := tracker.GetSummary() 75 | if len(summaries) != 0 { 76 | t.Errorf("expected no summaries for non-network events, got %d", len(summaries)) 77 | } 78 | } 79 | 80 | func TestGeneratePodCommunicationReport(t *testing.T) { 81 | summaries := []PodCommunicationSummary{ 82 | { 83 | Target: "service-1", 84 | Namespace: "default", 85 | ConnectionCount: 10, 86 | TotalBytes: 1024, 87 | AvgLatency: time.Millisecond * 10, 88 | ErrorCount: 0, 89 | LastSeen: time.Now(), 90 | }, 91 | } 92 | 93 | report := GeneratePodCommunicationReport(summaries) 94 | if report == "" { 95 | t.Fatal("expected non-empty report") 96 | } 97 | 98 | if !contains(report, "service-1") { 99 | t.Error("report should contain service name") 100 | } 101 | } 102 | 103 | func contains(s, substr string) bool { 104 | return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && (s[:len(substr)] == substr || s[len(s)-len(substr):] == substr || containsMiddle(s, substr))) 105 | } 106 | 107 | func containsMiddle(s, substr string) bool { 108 | for i := 0; i <= len(s)-len(substr); i++ { 109 | if s[i:i+len(substr)] == substr { 110 | return true 111 | } 112 | } 113 | return false 114 | } 115 | 116 | -------------------------------------------------------------------------------- /internal/diagnose/tracker/process.go: -------------------------------------------------------------------------------- 1 | package tracker 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "sort" 7 | "strings" 8 | "github.com/podtrace/podtrace/internal/config" 9 | "github.com/podtrace/podtrace/internal/events" 10 | "github.com/podtrace/podtrace/internal/validation" 11 | ) 12 | 13 | type PidInfo struct { 14 | Pid uint32 15 | Name string 16 | Count int 17 | Percentage float64 18 | } 19 | 20 | func AnalyzeProcessActivity(events []*events.Event) []PidInfo { 21 | pidMap := make(map[uint32]int) 22 | totalEvents := len(events) 23 | 24 | for _, e := range events { 25 | pidMap[e.PID]++ 26 | } 27 | 28 | var pidInfos []PidInfo 29 | for pid, count := range pidMap { 30 | percentage := float64(count) / float64(totalEvents) * 100 31 | name := "" 32 | for _, e := range events { 33 | if e.PID == pid && e.ProcessName != "" { 34 | name = e.ProcessName 35 | break 36 | } 37 | } 38 | if name == "" { 39 | name = getProcessName(pid) 40 | } 41 | if name == "" { 42 | name = "unknown" 43 | } 44 | pidInfos = append(pidInfos, PidInfo{ 45 | Pid: pid, 46 | Name: name, 47 | Count: count, 48 | Percentage: percentage, 49 | }) 50 | } 51 | 52 | sort.Slice(pidInfos, func(i, j int) bool { 53 | return pidInfos[i].Count > pidInfos[j].Count 54 | }) 55 | 56 | return pidInfos 57 | } 58 | 59 | func getProcessName(pid uint32) string { 60 | name := getProcessNameFromProc(pid) 61 | return validation.SanitizeProcessName(name) 62 | } 63 | 64 | func getProcessNameFromProc(pid uint32) string { 65 | if !validation.ValidatePID(pid) { 66 | return "" 67 | } 68 | 69 | name := "" 70 | 71 | statPath := fmt.Sprintf("%s/%d/stat", config.ProcBasePath, pid) 72 | if data, err := os.ReadFile(statPath); err == nil { 73 | statStr := string(data) 74 | start := strings.Index(statStr, "(") 75 | end := strings.LastIndex(statStr, ")") 76 | if start >= 0 && end > start { 77 | name = statStr[start+1 : end] 78 | } 79 | } 80 | 81 | if name == "" { 82 | commPath := fmt.Sprintf("%s/%d/comm", config.ProcBasePath, pid) 83 | if data, err := os.ReadFile(commPath); err == nil { 84 | name = strings.TrimSpace(string(data)) 85 | } 86 | } 87 | 88 | if name == "" { 89 | cmdlinePath := 
fmt.Sprintf("%s/%d/cmdline", config.ProcBasePath, pid) 90 | if cmdline, err := os.ReadFile(cmdlinePath); err == nil { 91 | parts := strings.Split(string(cmdline), "\x00") 92 | if len(parts) > 0 && parts[0] != "" { 93 | name = parts[0] 94 | if idx := strings.LastIndex(name, "/"); idx >= 0 { 95 | name = name[idx+1:] 96 | } 97 | } 98 | } 99 | } 100 | 101 | if name == "" { 102 | exePath := fmt.Sprintf("%s/%d/exe", config.ProcBasePath, pid) 103 | if link, err := os.Readlink(exePath); err == nil { 104 | if idx := strings.LastIndex(link, "/"); idx >= 0 { 105 | name = link[idx+1:] 106 | } else { 107 | name = link 108 | } 109 | } 110 | } 111 | 112 | if name == "" { 113 | statusPath := fmt.Sprintf("%s/%d/status", config.ProcBasePath, pid) 114 | if data, err := os.ReadFile(statusPath); err == nil { 115 | lines := strings.Split(string(data), "\n") 116 | for _, line := range lines { 117 | if strings.HasPrefix(line, "Name:") { 118 | parts := strings.Fields(line) 119 | if len(parts) >= 2 { 120 | name = parts[1] 121 | break 122 | } 123 | } 124 | } 125 | } 126 | } 127 | 128 | return name 129 | } 130 | -------------------------------------------------------------------------------- /internal/kubernetes/events_correlator.go: -------------------------------------------------------------------------------- 1 | package kubernetes 2 | 3 | import ( 4 | "context" 5 | "sync" 6 | "time" 7 | 8 | corev1 "k8s.io/api/core/v1" 9 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 10 | "k8s.io/apimachinery/pkg/watch" 11 | "k8s.io/client-go/kubernetes" 12 | ) 13 | 14 | type K8sEvent struct { 15 | Type string 16 | Reason string 17 | Message string 18 | Timestamp time.Time 19 | Count int32 20 | } 21 | 22 | type EventsCorrelator struct { 23 | clientset kubernetes.Interface 24 | podName string 25 | namespace string 26 | events []*K8sEvent 27 | mu sync.RWMutex 28 | eventWatcher watch.Interface 29 | stopCh chan struct{} 30 | } 31 | 32 | func NewEventsCorrelator(clientset kubernetes.Interface, podName, namespace string) *EventsCorrelator { 33 | return &EventsCorrelator{ 34 | clientset: clientset, 35 | podName: podName, 36 | namespace: namespace, 37 | events: make([]*K8sEvent, 0), 38 | stopCh: make(chan struct{}), 39 | } 40 | } 41 | 42 | func (ec *EventsCorrelator) Start(ctx context.Context) error { 43 | if ec.clientset == nil { 44 | return nil 45 | } 46 | 47 | watcher, err := ec.clientset.CoreV1().Events(ec.namespace).Watch(ctx, metav1.ListOptions{ 48 | FieldSelector: "involvedObject.name=" + ec.podName, 49 | }) 50 | if err != nil { 51 | return err 52 | } 53 | 54 | ec.eventWatcher = watcher 55 | 56 | go ec.watchEvents(ctx) 57 | return nil 58 | } 59 | 60 | func (ec *EventsCorrelator) watchEvents(ctx context.Context) { 61 | defer func() { 62 | if ec.eventWatcher != nil { 63 | ec.eventWatcher.Stop() 64 | } 65 | }() 66 | 67 | for { 68 | select { 69 | case <-ctx.Done(): 70 | return 71 | case <-ec.stopCh: 72 | return 73 | case event, ok := <-ec.eventWatcher.ResultChan(): 74 | if !ok { 75 | return 76 | } 77 | 78 | if k8sEvent, ok := event.Object.(*corev1.Event); ok { 79 | ec.addEvent(k8sEvent) 80 | } 81 | } 82 | } 83 | } 84 | 85 | func (ec *EventsCorrelator) addEvent(event *corev1.Event) { 86 | if event.InvolvedObject.Name != ec.podName { 87 | return 88 | } 89 | 90 | ec.mu.Lock() 91 | defer ec.mu.Unlock() 92 | 93 | k8sEvent := &K8sEvent{ 94 | Type: event.Type, 95 | Reason: event.Reason, 96 | Message: event.Message, 97 | Timestamp: event.FirstTimestamp.Time, 98 | Count: event.Count, 99 | } 100 | 101 | ec.events = append(ec.events, k8sEvent) 
102 | 103 | maxEvents := 100 104 | if len(ec.events) > maxEvents { 105 | ec.events = ec.events[len(ec.events)-maxEvents:] 106 | } 107 | } 108 | 109 | func (ec *EventsCorrelator) GetEvents() []*K8sEvent { 110 | ec.mu.RLock() 111 | defer ec.mu.RUnlock() 112 | 113 | result := make([]*K8sEvent, len(ec.events)) 114 | copy(result, ec.events) 115 | return result 116 | } 117 | 118 | func (ec *EventsCorrelator) Stop() { 119 | close(ec.stopCh) 120 | if ec.eventWatcher != nil { 121 | ec.eventWatcher.Stop() 122 | } 123 | } 124 | 125 | func (ec *EventsCorrelator) CorrelateWithAppEvents(appEventTime time.Time, window time.Duration) []*K8sEvent { 126 | ec.mu.RLock() 127 | defer ec.mu.RUnlock() 128 | 129 | var correlated []*K8sEvent 130 | windowStart := appEventTime.Add(-window) 131 | windowEnd := appEventTime.Add(window) 132 | 133 | for _, k8sEvent := range ec.events { 134 | if k8sEvent.Timestamp.After(windowStart) && k8sEvent.Timestamp.Before(windowEnd) { 135 | correlated = append(correlated, k8sEvent) 136 | } 137 | } 138 | 139 | return correlated 140 | } 141 | 142 | -------------------------------------------------------------------------------- /internal/tracing/exporter/splunk.go: -------------------------------------------------------------------------------- 1 | package exporter 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "encoding/json" 7 | "net/http" 8 | "time" 9 | 10 | "github.com/podtrace/podtrace/internal/config" 11 | "github.com/podtrace/podtrace/internal/diagnose/tracker" 12 | ) 13 | 14 | type SplunkExporter struct { 15 | endpoint string 16 | token string 17 | client *http.Client 18 | enabled bool 19 | sampleRate float64 20 | } 21 | 22 | type SplunkEvent struct { 23 | Time int64 `json:"time"` 24 | Host string `json:"host,omitempty"` 25 | Source string `json:"source,omitempty"` 26 | Sourcetype string `json:"sourcetype,omitempty"` 27 | Event map[string]interface{} `json:"event"` 28 | } 29 | 30 | func NewSplunkExporter(endpoint, token string, sampleRate float64) (*SplunkExporter, error) { 31 | if endpoint == "" { 32 | endpoint = config.DefaultSplunkEndpoint 33 | } 34 | 35 | return &SplunkExporter{ 36 | endpoint: endpoint, 37 | token: token, 38 | client: &http.Client{Timeout: config.TracingExporterTimeout}, 39 | enabled: true, 40 | sampleRate: sampleRate, 41 | }, nil 42 | } 43 | 44 | func (e *SplunkExporter) ExportTraces(traces []*tracker.Trace) error { 45 | if !e.enabled || len(traces) == 0 { 46 | return nil 47 | } 48 | 49 | for _, t := range traces { 50 | if !e.shouldSample(t) { 51 | continue 52 | } 53 | 54 | if err := e.exportTrace(t); err != nil { 55 | continue 56 | } 57 | } 58 | 59 | return nil 60 | } 61 | 62 | func (e *SplunkExporter) shouldSample(_ *tracker.Trace) bool { 63 | if e.sampleRate >= 1.0 { 64 | return true 65 | } 66 | if e.sampleRate <= 0.0 { 67 | return false 68 | } 69 | return time.Now().UnixNano()%int64(1.0/e.sampleRate) == 0 70 | } 71 | 72 | func (e *SplunkExporter) exportTrace(t *tracker.Trace) error { 73 | if len(t.Spans) == 0 { 74 | return nil 75 | } 76 | 77 | events := make([]SplunkEvent, 0) 78 | 79 | for _, span := range t.Spans { 80 | span.UpdateDuration() 81 | 82 | eventData := map[string]interface{}{ 83 | "trace_id": span.TraceID, 84 | "span_id": span.SpanID, 85 | "parent_span_id": span.ParentSpanID, 86 | "operation": span.Operation, 87 | "service": span.Service, 88 | "start_time": span.StartTime.Unix(), 89 | "duration_ms": span.Duration.Milliseconds(), 90 | "span_count": len(span.Events), 91 | } 92 | 93 | for k, v := range span.Attributes { 94 | eventData[k] = v 
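// Span attributes are flattened into the top-level event map; an attribute whose key collides with a fixed field (e.g. "trace_id") silently overwrites it.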
95 | } 96 | 97 | if span.Error { 98 | eventData["error"] = true 99 | } 100 | 101 | event := SplunkEvent{ 102 | Time: span.StartTime.Unix(), 103 | Sourcetype: "Podtrace:trace", 104 | Event: eventData, 105 | } 106 | 107 | events = append(events, event) 108 | } 109 | 110 | for _, event := range events { 111 | payload, err := json.Marshal(event) 112 | if err != nil { 113 | continue 114 | } 115 | 116 | req, err := http.NewRequestWithContext(context.Background(), "POST", e.endpoint, bytes.NewReader(payload)) 117 | if err != nil { 118 | continue 119 | } 120 | 121 | req.Header.Set("Content-Type", "application/json") 122 | if e.token != "" { 123 | req.Header.Set("Authorization", "Splunk "+e.token) 124 | } 125 | 126 | resp, err := e.client.Do(req) 127 | if err != nil { 128 | continue 129 | } 130 | _ = resp.Body.Close() 131 | } 132 | 133 | return nil 134 | } 135 | 136 | func (e *SplunkExporter) Shutdown(ctx context.Context) error { 137 | return nil 138 | } 139 | -------------------------------------------------------------------------------- /test/integration_test.go: -------------------------------------------------------------------------------- 1 | //go:build integration 2 | // +build integration 3 | 4 | package test 5 | 6 | import ( 7 | "strings" 8 | "testing" 9 | "time" 10 | 11 | "github.com/podtrace/podtrace/internal/diagnose" 12 | "github.com/podtrace/podtrace/internal/events" 13 | ) 14 | 15 | func TestDiagnostician_RealWorldScenario(t *testing.T) { 16 | if testing.Short() { 17 | t.Skip("Skipping integration test in short mode") 18 | } 19 | 20 | d := diagnose.NewDiagnostician() 21 | 22 | eventTypes := []events.EventType{ 23 | events.EventDNS, 24 | events.EventConnect, 25 | events.EventTCPSend, 26 | events.EventTCPRecv, 27 | events.EventRead, 28 | events.EventWrite, 29 | } 30 | 31 | for i := 0; i < 100; i++ { 32 | eventType := eventTypes[i%len(eventTypes)] 33 | event := &events.Event{ 34 | Type: eventType, 35 | PID: uint32(1000 + i%10), 36 | LatencyNS: uint64((i + 1) * 1000000), // 1ms to 100ms 37 | Target: "example.com", 38 | Error: 0, 39 | } 40 | 41 | if i%10 == 0 { 42 | event.Error = 111 43 | } 44 | 45 | d.AddEvent(event) 46 | } 47 | 48 | d.Finish() 49 | 50 | report := d.GenerateReport() 51 | if report == "" { 52 | t.Error("Report should not be empty") 53 | } 54 | 55 | sections := []string{ 56 | "Summary", 57 | "DNS Statistics", 58 | "TCP Statistics", 59 | "Connection Statistics", 60 | } 61 | 62 | for _, section := range sections { 63 | if !contains(report, section) { 64 | t.Errorf("Report should contain section '%s'", section) 65 | } 66 | } 67 | } 68 | 69 | func TestDiagnostician_ExportFormats(t *testing.T) { 70 | if testing.Short() { 71 | t.Skip("Skipping integration test in short mode") 72 | } 73 | 74 | d := diagnose.NewDiagnostician() 75 | 76 | for i := 0; i < 50; i++ { 77 | d.AddEvent(&events.Event{ 78 | Type: events.EventDNS, 79 | LatencyNS: uint64(i * 1000000), 80 | Target: "example.com", 81 | }) 82 | } 83 | 84 | d.Finish() 85 | 86 | jsonData := d.ExportJSON() 87 | if jsonData.Summary == nil { 88 | t.Error("JSON export should include summary") 89 | } 90 | 91 | var csvBuf []byte 92 | writer := &testWriter{data: &csvBuf} 93 | err := d.ExportCSV(writer) 94 | if err != nil { 95 | t.Errorf("CSV export should not fail: %v", err) 96 | } 97 | if len(csvBuf) == 0 { 98 | t.Error("CSV export should produce output") 99 | } 100 | } 101 | 102 | func TestDiagnostician_Performance(t *testing.T) { 103 | if testing.Short() { 104 | t.Skip("Skipping integration test in short mode") 105 | } 106 | 107 | d 
:= diagnose.NewDiagnostician() 108 | 109 | start := time.Now() 110 | for i := 0; i < 10000; i++ { 111 | d.AddEvent(&events.Event{ 112 | Type: events.EventDNS, 113 | LatencyNS: uint64(i * 1000000), 114 | Target: "example.com", 115 | }) 116 | } 117 | addDuration := time.Since(start) 118 | 119 | if addDuration > 1*time.Second { 120 | t.Errorf("Adding 10000 events took too long: %v", addDuration) 121 | } 122 | 123 | d.Finish() 124 | 125 | start = time.Now() 126 | _ = d.GenerateReport() 127 | reportDuration := time.Since(start) 128 | 129 | if reportDuration > 5*time.Second { 130 | t.Errorf("Generating report took too long: %v", reportDuration) 131 | } 132 | } 133 | 134 | func contains(s, substr string) bool { 135 | return len(s) >= len(substr) && strings.Contains(s, substr) 136 | } 137 | 138 | type testWriter struct { 139 | data *[]byte 140 | } 141 | 142 | func (w *testWriter) Write(p []byte) (n int, err error) { 143 | *w.data = append(*w.data, p...) 144 | return len(p), nil 145 | } 146 | -------------------------------------------------------------------------------- /internal/alerting/splunk.go: -------------------------------------------------------------------------------- 1 | package alerting 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "encoding/json" 7 | "fmt" 8 | "io" 9 | "net/http" 10 | "net/url" 11 | "time" 12 | 13 | "github.com/podtrace/podtrace/internal/config" 14 | ) 15 | 16 | type SplunkAlertSender struct { 17 | endpoint string 18 | token string 19 | client *http.Client 20 | timeout time.Duration 21 | } 22 | 23 | type SplunkAlertEvent struct { 24 | Time int64 `json:"time"` 25 | Host string `json:"host,omitempty"` 26 | Source string `json:"source,omitempty"` 27 | Sourcetype string `json:"sourcetype,omitempty"` 28 | Event map[string]interface{} `json:"event"` 29 | } 30 | 31 | func NewSplunkAlertSender(endpoint, token string, timeout time.Duration) (*SplunkAlertSender, error) { 32 | if endpoint == "" { 33 | return nil, fmt.Errorf("splunk endpoint is required") 34 | } 35 | parsedURL, err := url.Parse(endpoint) 36 | if err != nil { 37 | return nil, fmt.Errorf("invalid splunk endpoint: %w", err) 38 | } 39 | if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" { 40 | return nil, fmt.Errorf("splunk endpoint must use http or https scheme") 41 | } 42 | if token == "" { 43 | return nil, fmt.Errorf("splunk token is required") 44 | } 45 | return &SplunkAlertSender{ 46 | endpoint: endpoint, 47 | token: token, 48 | client: &http.Client{Timeout: timeout}, 49 | timeout: timeout, 50 | }, nil 51 | } 52 | 53 | func (s *SplunkAlertSender) Send(ctx context.Context, alert *Alert) error { 54 | if alert == nil { 55 | return fmt.Errorf("alert is nil") 56 | } 57 | eventData := map[string]interface{}{ 58 | "severity": string(alert.Severity), 59 | "title": alert.Title, 60 | "message": alert.Message, 61 | "source": alert.Source, 62 | "pod": alert.PodName, 63 | "namespace": alert.Namespace, 64 | } 65 | if alert.ErrorCode != "" { 66 | eventData["error_code"] = alert.ErrorCode 67 | } 68 | if len(alert.Recommendations) > 0 { 69 | eventData["recommendations"] = alert.Recommendations 70 | } 71 | if len(alert.Context) > 0 { 72 | for k, v := range alert.Context { 73 | if len(k) <= 64 { 74 | eventData[k] = v 75 | } 76 | } 77 | } 78 | event := SplunkAlertEvent{ 79 | Time: alert.Timestamp.Unix(), 80 | Sourcetype: "Podtrace:alert", 81 | Event: eventData, 82 | } 83 | jsonData, err := json.Marshal(event) 84 | if err != nil { 85 | return fmt.Errorf("failed to marshal Splunk event: %w", err) 86 | } 87 | if 
int64(len(jsonData)) > config.AlertMaxPayloadSize { 88 | return fmt.Errorf("payload size %d exceeds maximum %d", len(jsonData), config.AlertMaxPayloadSize) 89 | } 90 | req, err := http.NewRequestWithContext(ctx, "POST", s.endpoint, bytes.NewReader(jsonData)) 91 | if err != nil { 92 | return fmt.Errorf("failed to create request: %w", err) 93 | } 94 | req.Header.Set("Content-Type", "application/json") 95 | req.Header.Set("Authorization", "Splunk "+s.token) 96 | req.Header.Set("User-Agent", config.GetUserAgent()) 97 | resp, err := s.client.Do(req) 98 | if err != nil { 99 | return fmt.Errorf("failed to send request: %w", err) 100 | } 101 | defer func() { 102 | _, _ = io.Copy(io.Discard, resp.Body) 103 | _ = resp.Body.Close() 104 | }() 105 | if resp.StatusCode < 200 || resp.StatusCode >= 300 { 106 | bodyBytes, _ := io.ReadAll(io.LimitReader(resp.Body, 512)) 107 | return fmt.Errorf("unexpected status code %d: %s", resp.StatusCode, string(bodyBytes)) 108 | } 109 | return nil 110 | } 111 | 112 | func (s *SplunkAlertSender) Name() string { 113 | return "splunk" 114 | } 115 | -------------------------------------------------------------------------------- /internal/alerting/alert.go: -------------------------------------------------------------------------------- 1 | package alerting 2 | 3 | import ( 4 | "crypto/sha256" 5 | "encoding/hex" 6 | "fmt" 7 | "time" 8 | 9 | "github.com/podtrace/podtrace/internal/config" 10 | ) 11 | 12 | type AlertSeverity string 13 | 14 | const ( 15 | SeverityFatal AlertSeverity = "fatal" 16 | SeverityCritical AlertSeverity = "critical" 17 | SeverityWarning AlertSeverity = "warning" 18 | SeverityError AlertSeverity = "error" 19 | ) 20 | 21 | type Alert struct { 22 | Severity AlertSeverity 23 | Title string 24 | Message string 25 | Timestamp time.Time 26 | Source string 27 | PodName string 28 | Namespace string 29 | Context map[string]interface{} 30 | ErrorCode string 31 | Recommendations []string 32 | } 33 | 34 | func (a *Alert) Key() string { 35 | if a == nil { 36 | return "" 37 | } 38 | h := sha256.New() 39 | h.Write([]byte(a.Severity)) 40 | h.Write([]byte(a.Source)) 41 | h.Write([]byte(a.PodName)) 42 | h.Write([]byte(a.Namespace)) 43 | h.Write([]byte(a.Title)) 44 | return hex.EncodeToString(h.Sum(nil))[:16] 45 | } 46 | 47 | func (a *Alert) Validate() error { 48 | if a == nil { 49 | return fmt.Errorf("alert is nil") 50 | } 51 | if a.Severity == "" { 52 | return fmt.Errorf("alert severity is required") 53 | } 54 | if a.Title == "" { 55 | return fmt.Errorf("alert title is required") 56 | } 57 | if a.Message == "" { 58 | return fmt.Errorf("alert message is required") 59 | } 60 | if a.Timestamp.IsZero() { 61 | return fmt.Errorf("alert timestamp is required") 62 | } 63 | if a.Source == "" { 64 | return fmt.Errorf("alert source is required") 65 | } 66 | return nil 67 | } 68 | 69 | func (a *Alert) Sanitize() { 70 | if a == nil { 71 | return 72 | } 73 | if len(a.Title) > 256 { 74 | a.Title = a.Title[:253] + "..." 75 | } 76 | if len(a.Message) > 1024 { 77 | a.Message = a.Message[:1021] + "..." 78 | } 79 | if len(a.PodName) > 256 { 80 | a.PodName = a.PodName[:253] + "..." 81 | } 82 | if len(a.Namespace) > 256 { 83 | a.Namespace = a.Namespace[:253] + "..." 84 | } 85 | if len(a.Source) > 128 { 86 | a.Source = a.Source[:125] + "..." 87 | } 88 | if len(a.ErrorCode) > 64 { 89 | a.ErrorCode = a.ErrorCode[:61] + "..." 
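// Each truncation keeps three characters for the "..." marker, so the result lands exactly on the field limit (for example 61+3 = 64 for ErrorCode here).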
90 | } 91 | if len(a.Recommendations) > 10 { 92 | a.Recommendations = a.Recommendations[:10] 93 | } 94 | for i, rec := range a.Recommendations { 95 | if len(rec) > 512 { 96 | a.Recommendations[i] = rec[:509] + "..." 97 | } 98 | } 99 | } 100 | 101 | func MapResourceAlertLevel(level uint32) AlertSeverity { 102 | switch level { 103 | case 3: 104 | return SeverityFatal 105 | case 2: 106 | return SeverityCritical 107 | case 1: 108 | return SeverityWarning 109 | default: 110 | return SeverityError 111 | } 112 | } 113 | 114 | func ParseSeverity(severity string) AlertSeverity { 115 | switch severity { 116 | case "fatal": 117 | return SeverityFatal 118 | case "critical": 119 | return SeverityCritical 120 | case "warning": 121 | return SeverityWarning 122 | case "error": 123 | return SeverityError 124 | default: 125 | return SeverityError 126 | } 127 | } 128 | 129 | func SeverityLevel(severity AlertSeverity) int { 130 | switch severity { 131 | case SeverityFatal: 132 | return 4 133 | case SeverityCritical: 134 | return 3 135 | case SeverityWarning: 136 | return 2 137 | case SeverityError: 138 | return 1 139 | default: 140 | return 0 141 | } 142 | } 143 | 144 | func ShouldSendAlert(severity AlertSeverity) bool { 145 | if !config.AlertingEnabled { 146 | return false 147 | } 148 | minSeverity := ParseSeverity(config.GetAlertMinSeverity()) 149 | return SeverityLevel(severity) >= SeverityLevel(minSeverity) 150 | } 151 | 152 | -------------------------------------------------------------------------------- /bpf/maps.h: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 2 | 3 | #ifndef PODTRACE_MAPS_H 4 | #define PODTRACE_MAPS_H 5 | 6 | #include "common.h" 7 | #include "events.h" 8 | 9 | struct stack_trace_t { 10 | u64 ips[MAX_STACK_DEPTH]; 11 | u32 nr; 12 | }; 13 | 14 | struct { 15 | __uint(type, BPF_MAP_TYPE_RINGBUF); 16 | __uint(max_entries, 2 * 1024 * 1024); 17 | } events SEC(".maps"); 18 | 19 | struct { 20 | __uint(type, BPF_MAP_TYPE_HASH); 21 | __uint(max_entries, 1024); 22 | __type(key, u64); 23 | __type(value, u64); 24 | } start_times SEC(".maps"); 25 | 26 | struct { 27 | __uint(type, BPF_MAP_TYPE_HASH); 28 | __uint(max_entries, 1024); 29 | __type(key, u64); 30 | __type(value, char[MAX_STRING_LEN]); 31 | } dns_targets SEC(".maps"); 32 | 33 | struct { 34 | __uint(type, BPF_MAP_TYPE_HASH); 35 | __uint(max_entries, 1024); 36 | __type(key, u64); 37 | __type(value, char[MAX_STRING_LEN]); 38 | } socket_conns SEC(".maps"); 39 | 40 | struct { 41 | __uint(type, BPF_MAP_TYPE_HASH); 42 | __uint(max_entries, 1024); 43 | __type(key, u64); 44 | __type(value, u64); 45 | } tcp_sockets SEC(".maps"); 46 | 47 | struct { 48 | __uint(type, BPF_MAP_TYPE_HASH); 49 | __uint(max_entries, 2048); 50 | __type(key, u64); 51 | __type(value, struct stack_trace_t); 52 | } stack_traces SEC(".maps"); 53 | 54 | struct { 55 | __uint(type, BPF_MAP_TYPE_HASH); 56 | __uint(max_entries, 1024); 57 | __type(key, u64); 58 | __type(value, char[MAX_STRING_LEN]); 59 | } lock_targets SEC(".maps"); 60 | 61 | struct { 62 | __uint(type, BPF_MAP_TYPE_HASH); 63 | __uint(max_entries, 1024); 64 | __type(key, u64); 65 | __type(value, char[MAX_STRING_LEN]); 66 | } db_queries SEC(".maps"); 67 | 68 | struct { 69 | __uint(type, BPF_MAP_TYPE_HASH); 70 | __uint(max_entries, 1024); 71 | __type(key, u64); 72 | __type(value, char[MAX_STRING_LEN]); 73 | } syscall_paths SEC(".maps"); 74 | 75 | struct { 76 | __uint(type, BPF_MAP_TYPE_HASH); 77 | __uint(max_entries, 1024); 78 | __type(key, u64); 
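/* Assumed semantics: keyed by the pid_tgid of the handshaking thread; the u64 value declared below holds the handshake start timestamp in ns, used to compute TLS handshake latency. */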
79 | __type(value, u64); 80 | } tls_handshakes SEC(".maps"); 81 | 82 | struct resource_limit { 83 | u64 limit_bytes; 84 | u64 usage_bytes; 85 | u64 last_update_ns; 86 | u32 resource_type; 87 | }; 88 | 89 | struct { 90 | __uint(type, BPF_MAP_TYPE_HASH); 91 | __uint(max_entries, 1024); 92 | __type(key, u64); 93 | __type(value, struct resource_limit); 94 | } cgroup_limits SEC(".maps"); 95 | 96 | struct { 97 | __uint(type, BPF_MAP_TYPE_HASH); 98 | __uint(max_entries, 1024); 99 | __type(key, u64); 100 | __type(value, u32); 101 | } cgroup_alerts SEC(".maps"); 102 | 103 | struct { 104 | __uint(type, BPF_MAP_TYPE_ARRAY); 105 | __uint(max_entries, 1); 106 | __type(key, u32); 107 | __type(value, u64); 108 | } target_cgroup_id SEC(".maps"); 109 | 110 | struct pool_state { 111 | u64 last_use_ns; 112 | u32 connection_id; 113 | u32 in_use; 114 | }; 115 | 116 | struct { 117 | __uint(type, BPF_MAP_TYPE_HASH); 118 | __uint(max_entries, 1024); 119 | __type(key, u64); 120 | __type(value, struct pool_state); 121 | } pool_states SEC(".maps"); 122 | 123 | struct { 124 | __uint(type, BPF_MAP_TYPE_HASH); 125 | __uint(max_entries, 1024); 126 | __type(key, u64); 127 | __type(value, u64); 128 | } pool_acquire_times SEC(".maps"); 129 | 130 | struct { 131 | __uint(type, BPF_MAP_TYPE_HASH); 132 | __uint(max_entries, 1024); 133 | __type(key, u64); 134 | __type(value, u32); 135 | } pool_db_types SEC(".maps"); 136 | 137 | struct { 138 | __uint(type, BPF_MAP_TYPE_ARRAY); 139 | __uint(max_entries, 1); 140 | __type(key, u32); 141 | __type(value, struct event); 142 | } event_buf SEC(".maps"); 143 | 144 | struct { 145 | __uint(type, BPF_MAP_TYPE_ARRAY); 146 | __uint(max_entries, 1); 147 | __type(key, u32); 148 | __type(value, struct stack_trace_t); 149 | } stack_buf SEC(".maps"); 150 | 151 | #endif 152 | -------------------------------------------------------------------------------- /internal/alerting/manager.go: -------------------------------------------------------------------------------- 1 | package alerting 2 | 3 | import ( 4 | "context" 5 | "sync" 6 | "time" 7 | 8 | "github.com/podtrace/podtrace/internal/config" 9 | ) 10 | 11 | type Manager struct { 12 | senders []Sender 13 | deduplicator *AlertDeduplicator 14 | rateLimiter *RateLimiter 15 | enabled bool 16 | mu sync.RWMutex 17 | cleanupTicker *time.Ticker 18 | stopCh chan struct{} 19 | wg sync.WaitGroup 20 | } 21 | 22 | func NewManager() (*Manager, error) { 23 | if !config.AlertingEnabled { 24 | return &Manager{enabled: false}, nil 25 | } 26 | manager := &Manager{ 27 | senders: make([]Sender, 0), 28 | deduplicator: NewAlertDeduplicator(config.AlertDeduplicationWindow), 29 | rateLimiter: NewRateLimiter(config.AlertRateLimitPerMinute), 30 | enabled: true, 31 | stopCh: make(chan struct{}), 32 | } 33 | if config.AlertWebhookURL != "" { 34 | webhookSender, err := NewWebhookSender(config.AlertWebhookURL, config.AlertHTTPTimeout) 35 | if err != nil { 36 | } else { 37 | retrySender := NewRetrySender(webhookSender, config.AlertMaxRetries, config.DefaultAlertRetryBackoffBase) 38 | manager.senders = append(manager.senders, retrySender) 39 | } 40 | } 41 | if config.AlertSlackWebhookURL != "" { 42 | slackSender, err := NewSlackSender(config.AlertSlackWebhookURL, config.AlertSlackChannel, config.AlertHTTPTimeout) 43 | if err != nil { 44 | } else { 45 | retrySender := NewRetrySender(slackSender, config.AlertMaxRetries, config.DefaultAlertRetryBackoffBase) 46 | manager.senders = append(manager.senders, retrySender) 47 | } 48 | } 49 | if config.AlertSplunkEnabled { 50 | splunkEndpoint 
:= config.GetSplunkEndpoint() 51 | splunkToken := config.GetSplunkToken() 52 | if splunkEndpoint != "" && splunkToken != "" { 53 | splunkSender, err := NewSplunkAlertSender(splunkEndpoint, splunkToken, config.AlertHTTPTimeout) 54 | if err != nil { 55 | } else { 56 | retrySender := NewRetrySender(splunkSender, config.AlertMaxRetries, config.DefaultAlertRetryBackoffBase) 57 | manager.senders = append(manager.senders, retrySender) 58 | } 59 | } 60 | } 61 | if len(manager.senders) == 0 { 62 | return &Manager{enabled: false}, nil 63 | } 64 | manager.cleanupTicker = time.NewTicker(1 * time.Hour) 65 | manager.wg.Add(1) 66 | go manager.cleanupLoop() 67 | return manager, nil 68 | } 69 | 70 | func (m *Manager) SendAlert(alert *Alert) { 71 | if !m.enabled || alert == nil { 72 | return 73 | } 74 | if !ShouldSendAlert(alert.Severity) { 75 | return 76 | } 77 | if !m.rateLimiter.Allow() { 78 | return 79 | } 80 | if !m.deduplicator.ShouldSend(alert) { 81 | return 82 | } 83 | m.mu.RLock() 84 | senders := make([]Sender, len(m.senders)) 85 | copy(senders, m.senders) 86 | m.mu.RUnlock() 87 | for _, sender := range senders { 88 | go func(s Sender) { 89 | ctx, cancel := context.WithTimeout(context.Background(), config.AlertHTTPTimeout*2) 90 | defer cancel() 91 | _ = s.Send(ctx, alert) 92 | }(sender) 93 | } 94 | } 95 | 96 | func (m *Manager) cleanupLoop() { 97 | defer m.wg.Done() 98 | for { 99 | select { 100 | case <-m.stopCh: 101 | return 102 | case <-m.cleanupTicker.C: 103 | m.deduplicator.Cleanup(config.AlertDeduplicationWindow * 2) 104 | } 105 | } 106 | } 107 | 108 | func (m *Manager) Shutdown(ctx context.Context) error { 109 | if !m.enabled { 110 | return nil 111 | } 112 | close(m.stopCh) 113 | if m.cleanupTicker != nil { 114 | m.cleanupTicker.Stop() 115 | } 116 | done := make(chan struct{}) 117 | go func() { 118 | m.wg.Wait() 119 | close(done) 120 | }() 121 | select { 122 | case <-done: 123 | return nil 124 | case <-ctx.Done(): 125 | return ctx.Err() 126 | } 127 | } 128 | 129 | func (m *Manager) AddSender(sender Sender) { 130 | if sender == nil { 131 | return 132 | } 133 | m.mu.Lock() 134 | defer m.mu.Unlock() 135 | m.senders = append(m.senders, sender) 136 | } 137 | 138 | func (m *Manager) IsEnabled() bool { 139 | return m.enabled 140 | } 141 | 142 | -------------------------------------------------------------------------------- /cmd/podtrace/diagnose_env.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "fmt" 7 | "os" 8 | "runtime" 9 | "time" 10 | 11 | "github.com/spf13/cobra" 12 | "golang.org/x/sys/unix" 13 | 14 | podtrace "github.com/podtrace/podtrace" 15 | "github.com/podtrace/podtrace/internal/config" 16 | "github.com/podtrace/podtrace/internal/cri" 17 | "github.com/podtrace/podtrace/internal/ebpf/loader" 18 | ) 19 | 20 | type envReport struct { 21 | Time string `json:"time"` 22 | GoVersion string `json:"goVersion"` 23 | GOOS string `json:"goos"` 24 | GOARCH string `json:"goarch"` 25 | KernelRelease string `json:"kernelRelease"` 26 | CgroupBase string `json:"cgroupBase"` 27 | ProcBase string `json:"procBase"` 28 | CgroupV2 bool `json:"cgroupV2"` 29 | BTFVmlinux bool `json:"btfVmlinuxPresent"` 30 | BTFFile string `json:"btfFile"` 31 | CRIEndpointEnv string `json:"criEndpointEnv"` 32 | CRICandidates []string `json:"criCandidates"` 33 | CRIDetected string `json:"criDetected"` 34 | BPFObjectPath string `json:"bpfObjectPath"` 35 | BPFEmbedded bool `json:"bpfEmbeddedAvailable"` 36 | BPFPrograms 
[]string `json:"bpfPrograms"` 37 | BPFMaps []string `json:"bpfMaps"` 38 | HasCgroupIDMap bool `json:"hasTargetCgroupIdMap"` 39 | Warnings []string `json:"warnings"` 40 | } 41 | 42 | func newDiagnoseEnvCmd() *cobra.Command { 43 | cmd := &cobra.Command{ 44 | Use: "diagnose-env", 45 | Short: "Print environment diagnostics for Podtrace (kernel/BTF/cgroups/CRI/BPF)", 46 | RunE: func(cmd *cobra.Command, args []string) error { 47 | rep := collectEnvReport() 48 | out, _ := json.MarshalIndent(rep, "", " ") 49 | fmt.Println(string(out)) 50 | return nil 51 | }, 52 | } 53 | return cmd 54 | } 55 | 56 | func collectEnvReport() envReport { 57 | rep := envReport{ 58 | Time: time.Now().Format(time.RFC3339), 59 | GoVersion: runtime.Version(), 60 | GOOS: runtime.GOOS, 61 | GOARCH: runtime.GOARCH, 62 | CgroupBase: config.CgroupBasePath, 63 | ProcBase: config.ProcBasePath, 64 | BTFFile: config.BTFFilePath, 65 | CRIEndpointEnv: os.Getenv("PODTRACE_CRI_ENDPOINT"), 66 | CRICandidates: cri.DefaultCandidateEndpoints(), 67 | BPFObjectPath: config.BPFObjectPath, 68 | BPFEmbedded: len(podtrace.EmbeddedPodtraceBPFObj) > 0, 69 | } 70 | 71 | var u unix.Utsname 72 | if err := unix.Uname(&u); err == nil { 73 | rep.KernelRelease = bytesToString(u.Release[:]) 74 | } 75 | 76 | if _, err := os.Stat("/sys/fs/cgroup/cgroup.controllers"); err == nil { 77 | rep.CgroupV2 = true 78 | } 79 | if _, err := os.Stat("/sys/kernel/btf/vmlinux"); err == nil { 80 | rep.BTFVmlinux = true 81 | } 82 | 83 | if r, err := cri.NewResolver(); err == nil { 84 | rep.CRIDetected = r.Endpoint() 85 | _ = r.Close() 86 | } 87 | 88 | if spec, err := loader.LoadPodtrace(); err == nil && spec != nil { 89 | for name := range spec.Programs { 90 | rep.BPFPrograms = append(rep.BPFPrograms, name) 91 | } 92 | for name := range spec.Maps { 93 | rep.BPFMaps = append(rep.BPFMaps, name) 94 | if name == "target_cgroup_id" { 95 | rep.HasCgroupIDMap = true 96 | } 97 | } 98 | } else if err != nil { 99 | rep.Warnings = append(rep.Warnings, fmt.Sprintf("failed to load BPF spec: %v", err)) 100 | } 101 | 102 | if !rep.BTFVmlinux && rep.BTFFile == "" { 103 | rep.Warnings = append(rep.Warnings, "kernel BTF (/sys/kernel/btf/vmlinux) not found and PODTRACE_BTF_FILE not set; CO-RE relocations may fail") 104 | } 105 | if rep.CgroupV2 && !rep.HasCgroupIDMap { 106 | rep.Warnings = append(rep.Warnings, "cgroup v2 detected but BPF map target_cgroup_id missing; kernel-side cgroup filtering will be unavailable") 107 | } 108 | 109 | return rep 110 | } 111 | 112 | func bytesToString(bts []byte) string { 113 | var b bytes.Buffer 114 | for _, c := range bts { 115 | if c == 0 { 116 | break 117 | } 118 | b.WriteByte(c) 119 | } 120 | return b.String() 121 | } 122 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/podtrace/podtrace 2 | 3 | go 1.24.0 4 | 5 | require ( 6 | github.com/cilium/ebpf v0.20.0 7 | github.com/spf13/cobra v1.10.2 8 | go.opentelemetry.io/otel v1.38.0 9 | go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 10 | go.opentelemetry.io/otel/sdk v1.38.0 11 | go.opentelemetry.io/otel/trace v1.38.0 12 | go.uber.org/zap v1.27.1 13 | golang.org/x/sys v0.38.0 14 | google.golang.org/grpc v1.75.0 15 | k8s.io/api v0.34.2 16 | k8s.io/apimachinery v0.34.2 17 | k8s.io/client-go v0.34.2 18 | k8s.io/cri-api v0.34.2 19 | ) 20 | 21 | require ( 22 | github.com/beorn7/perks v1.0.1 // indirect 23 | github.com/cenkalti/backoff/v5 v5.0.3 // 
indirect 24 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 25 | github.com/go-logr/stdr v1.2.2 // indirect 26 | github.com/google/go-cmp v0.7.0 // indirect 27 | github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect 28 | github.com/pmezard/go-difflib v1.0.0 // indirect 29 | github.com/prometheus/client_model v0.6.2 // indirect 30 | github.com/prometheus/common v0.67.4 // indirect 31 | github.com/prometheus/procfs v0.19.2 // indirect 32 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect 33 | go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect 34 | go.opentelemetry.io/otel/metric v1.38.0 // indirect 35 | go.opentelemetry.io/proto/otlp v1.7.1 // indirect 36 | go.uber.org/multierr v1.10.0 // indirect 37 | google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 // indirect 38 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 // indirect 39 | ) 40 | 41 | require ( 42 | github.com/davecgh/go-spew v1.1.1 // indirect 43 | github.com/emicklei/go-restful/v3 v3.13.0 // indirect 44 | github.com/fxamacker/cbor/v2 v2.9.0 // indirect 45 | github.com/go-logr/logr v1.4.3 // indirect 46 | github.com/go-openapi/jsonpointer v0.22.3 // indirect 47 | github.com/go-openapi/jsonreference v0.21.3 // indirect 48 | github.com/go-openapi/swag v0.25.3 // indirect 49 | github.com/go-openapi/swag/cmdutils v0.25.3 // indirect 50 | github.com/go-openapi/swag/conv v0.25.3 // indirect 51 | github.com/go-openapi/swag/fileutils v0.25.3 // indirect 52 | github.com/go-openapi/swag/jsonname v0.25.3 // indirect 53 | github.com/go-openapi/swag/jsonutils v0.25.3 // indirect 54 | github.com/go-openapi/swag/loading v0.25.3 // indirect 55 | github.com/go-openapi/swag/mangling v0.25.3 // indirect 56 | github.com/go-openapi/swag/netutils v0.25.3 // indirect 57 | github.com/go-openapi/swag/stringutils v0.25.3 // indirect 58 | github.com/go-openapi/swag/typeutils v0.25.3 // indirect 59 | github.com/go-openapi/swag/yamlutils v0.25.3 // indirect 60 | github.com/gogo/protobuf v1.3.2 // indirect 61 | github.com/google/gnostic-models v0.7.1 // indirect 62 | github.com/google/uuid v1.6.0 // indirect 63 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 64 | github.com/json-iterator/go v1.1.12 // indirect 65 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 66 | github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect 67 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 68 | github.com/prometheus/client_golang v1.23.2 69 | github.com/spf13/pflag v1.0.10 // indirect 70 | github.com/x448/float16 v0.8.4 // indirect 71 | go.yaml.in/yaml/v2 v2.4.3 // indirect 72 | go.yaml.in/yaml/v3 v3.0.4 // indirect 73 | golang.org/x/net v0.47.0 // indirect 74 | golang.org/x/oauth2 v0.33.0 // indirect 75 | golang.org/x/term v0.37.0 // indirect 76 | golang.org/x/text v0.31.0 // indirect 77 | golang.org/x/time v0.14.0 78 | google.golang.org/protobuf v1.36.10 // indirect 79 | gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect 80 | gopkg.in/inf.v0 v0.9.1 // indirect 81 | k8s.io/klog/v2 v2.130.1 // indirect 82 | k8s.io/kube-openapi v0.0.0-20251121143641-b6aabc6c6745 // indirect 83 | k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect 84 | sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect 85 | sigs.k8s.io/randfill v1.0.0 // indirect 86 | sigs.k8s.io/structured-merge-diff/v6 v6.3.1 // indirect 87 | sigs.k8s.io/yaml v1.6.0 // indirect 88 | ) 89 | 
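
A minimal consumer sketch, not a file in this repository, showing how raw ring-buffer records might be decoded with the parser defined in internal/ebpf/parser/parser.go below. The record channel is hypothetical, and because the package lives under internal/, such a consumer would have to sit inside this module:

package main

import (
	"fmt"

	"github.com/podtrace/podtrace/internal/ebpf/parser"
)

// consume drains raw ring-buffer records, decodes each into a pooled
// events.Event, and returns the event to the pool once it is done with it.
func consume(records <-chan []byte) {
	for raw := range records {
		ev := parser.ParseEvent(raw) // nil for short or undecodable records
		if ev == nil {
			continue
		}
		fmt.Printf("pid=%d type=%d target=%s\n", ev.PID, ev.Type, ev.Target)
		parser.PutEvent(ev) // recycle the event; do not retain ev after this call
	}
}

func main() {
	records := make(chan []byte) // hypothetical source; a real caller would feed ring-buffer payloads here
	close(records)
	consume(records)
}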
--------------------------------------------------------------------------------
/internal/ebpf/parser/parser.go:
--------------------------------------------------------------------------------
1 | package parser
2 | 
3 | import (
4 | "bytes"
5 | "encoding/binary"
6 | "sync"
7 | "unsafe"
8 | 
9 | "github.com/podtrace/podtrace/internal/events"
10 | )
11 | 
12 | var (
13 | binaryRead = binary.Read
14 | eventPool = sync.Pool{
15 | New: func() interface{} {
16 | return &events.Event{}
17 | },
18 | }
19 | )
20 | 
21 | type rawEvent struct {
22 | Timestamp uint64
23 | PID uint32
24 | Type uint32
25 | LatencyNS uint64
26 | Error int32
27 | _ uint32
28 | Bytes uint64
29 | TCPState uint32
30 | _ uint32
31 | StackKey uint64
32 | Target [128]byte
33 | Details [128]byte
34 | }
35 | 
36 | func ParseEvent(data []byte) *events.Event {
37 | type rawEventV1 = rawEvent
38 | type rawEventV2 struct {
39 | Timestamp uint64
40 | PID uint32
41 | Type uint32
42 | LatencyNS uint64
43 | Error int32
44 | _ uint32
45 | Bytes uint64
46 | TCPState uint32
47 | _ uint32
48 | StackKey uint64
49 | CgroupID uint64
50 | Target [128]byte
51 | Details [128]byte
52 | }
53 | type rawEventV3 struct {
54 | Timestamp uint64
55 | PID uint32
56 | Type uint32
57 | LatencyNS uint64
58 | Error int32
59 | _ uint32
60 | Bytes uint64
61 | TCPState uint32
62 | _ uint32
63 | StackKey uint64
64 | CgroupID uint64
65 | Comm [16]byte
66 | Target [128]byte
67 | Details [128]byte
68 | }
69 | 
70 | expectedV3 := int(unsafe.Sizeof(rawEventV3{}))
71 | expectedV2 := int(unsafe.Sizeof(rawEventV2{}))
72 | expectedV1 := int(unsafe.Sizeof(rawEventV1{}))
73 | if len(data) < expectedV1 {
74 | return nil
75 | }
76 | 
77 | event := eventPool.Get().(*events.Event)
78 | event.ProcessName = ""
79 | event.Stack = nil
80 | event.CgroupID = 0
81 | 
82 | if len(data) >= expectedV3 {
83 | var e rawEventV3
84 | if err := binaryRead(bytes.NewReader(data[:expectedV3]), binary.LittleEndian, &e); err != nil {
85 | PutEvent(event); return nil // recycle the pooled event on decode failure instead of dropping it
86 | }
87 | 
88 | event.Timestamp = e.Timestamp
89 | event.PID = e.PID
90 | event.Type = events.EventType(e.Type)
91 | event.LatencyNS = e.LatencyNS
92 | event.Error = e.Error
93 | event.Bytes = e.Bytes
94 | event.TCPState = e.TCPState
95 | event.StackKey = e.StackKey
96 | event.CgroupID = e.CgroupID
97 | event.ProcessName = string(bytes.TrimRight(e.Comm[:], "\x00"))
98 | event.Target = string(bytes.TrimRight(e.Target[:], "\x00"))
99 | event.Details = string(bytes.TrimRight(e.Details[:], "\x00"))
100 | 
101 | return event
102 | }
103 | 
104 | if len(data) >= expectedV2 {
105 | var e rawEventV2
106 | if err := binaryRead(bytes.NewReader(data[:expectedV2]), binary.LittleEndian, &e); err != nil {
107 | PutEvent(event); return nil // recycle the pooled event on decode failure instead of dropping it
108 | }
109 | 
110 | event.Timestamp = e.Timestamp
111 | event.PID = e.PID
112 | event.Type = events.EventType(e.Type)
113 | event.LatencyNS = e.LatencyNS
114 | event.Error = e.Error
115 | event.Bytes = e.Bytes
116 | event.TCPState = e.TCPState
117 | event.StackKey = e.StackKey
118 | event.CgroupID = e.CgroupID
119 | event.Target = string(bytes.TrimRight(e.Target[:], "\x00"))
120 | event.Details = string(bytes.TrimRight(e.Details[:], "\x00"))
121 | 
122 | return event
123 | }
124 | 
125 | var e rawEventV1
126 | if err := binaryRead(bytes.NewReader(data[:expectedV1]), binary.LittleEndian, &e); err != nil {
127 | PutEvent(event); return nil // recycle the pooled event on decode failure instead of dropping it
128 | }
129 | 
130 | event.Timestamp = e.Timestamp
131 | event.PID = e.PID
132 | event.Type = events.EventType(e.Type)
133 | event.LatencyNS = e.LatencyNS
134 | event.Error = e.Error
135 | event.Bytes = e.Bytes
136 | event.TCPState = 
e.TCPState 137 | event.StackKey = e.StackKey 138 | event.Target = string(bytes.TrimRight(e.Target[:], "\x00")) 139 | event.Details = string(bytes.TrimRight(e.Details[:], "\x00")) 140 | 141 | return event 142 | } 143 | 144 | func PutEvent(event *events.Event) { 145 | if event == nil { 146 | return 147 | } 148 | event.Stack = nil 149 | event.ProcessName = "" 150 | event.Target = "" 151 | event.Details = "" 152 | eventPool.Put(event) 153 | } 154 | -------------------------------------------------------------------------------- /internal/diagnose/stacktrace/stacktrace.go: -------------------------------------------------------------------------------- 1 | package stacktrace 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "os" 7 | "os/exec" 8 | "path/filepath" 9 | "sort" 10 | "strings" 11 | 12 | "github.com/podtrace/podtrace/internal/config" 13 | "github.com/podtrace/podtrace/internal/events" 14 | ) 15 | 16 | type Diagnostician interface { 17 | GetEvents() []*events.Event 18 | } 19 | 20 | type stackSummary struct { 21 | Key string 22 | Count int 23 | Sample *events.Event 24 | FirstFrame string 25 | } 26 | 27 | type stackResolver struct { 28 | cache map[string]string 29 | } 30 | 31 | func (r *stackResolver) resolve(ctx context.Context, pid uint32, addr uint64) string { 32 | select { 33 | case <-ctx.Done(): 34 | return "" 35 | default: 36 | } 37 | 38 | if addr == 0 { 39 | return "" 40 | } 41 | if r.cache == nil { 42 | r.cache = make(map[string]string) 43 | } 44 | exePath, err := os.Readlink(fmt.Sprintf("/proc/%d/exe", pid)) 45 | if err != nil || exePath == "" { 46 | return fmt.Sprintf("0x%x", addr) 47 | } 48 | key := exePath + "|" + fmt.Sprintf("%x", addr) 49 | if v, ok := r.cache[key]; ok { 50 | return v 51 | } 52 | timeoutCtx, cancel := context.WithTimeout(ctx, config.DefaultAddr2lineTimeout) 53 | defer cancel() 54 | cmd := exec.CommandContext(timeoutCtx, "addr2line", "-e", exePath, fmt.Sprintf("%#x", addr)) 55 | out, err := cmd.Output() 56 | if err != nil { 57 | v := fmt.Sprintf("%s@0x%x", filepath.Base(exePath), addr) 58 | r.cache[key] = v 59 | return v 60 | } 61 | line := strings.TrimSpace(string(out)) 62 | if line == "" || line == "??:0" || line == "??:?" 
{
63 | line = fmt.Sprintf("%s@0x%x", filepath.Base(exePath), addr)
64 | } else {
65 | line = filepath.Base(exePath) + ":" + line
66 | }
67 | r.cache[key] = line
68 | return line
69 | }
70 | 
71 | func GenerateStackTraceSectionWithContext(d Diagnostician, ctx context.Context) string {
72 | allEvents := d.GetEvents()
73 | if len(allEvents) == 0 {
74 | return ""
75 | }
76 | 
77 | resolver := &stackResolver{cache: make(map[string]string)}
78 | stackMap := make(map[string]*stackSummary)
79 | processed := 0
80 | 
81 | for _, e := range allEvents {
82 | if processed >= config.MaxEventsForStacks {
83 | break
84 | }
85 | if e == nil {
86 | continue
87 | }
88 | if len(e.Stack) == 0 {
89 | continue
90 | }
91 | if e.LatencyNS < uint64(config.MinLatencyForStackNS) && e.Type != events.EventLockContention && e.Type != events.EventDBQuery {
92 | continue
93 | }
94 | processed++
95 | top := e.Stack[0]
96 | frame := resolver.resolve(ctx, e.PID, top)
97 | if frame == "" {
98 | continue
99 | }
100 | key := fmt.Sprintf("%s|%d", frame, e.Type)
101 | if entry, ok := stackMap[key]; ok {
102 | entry.Count++
103 | } else {
104 | stackMap[key] = &stackSummary{
105 | Key: key,
106 | Count: 1,
107 | Sample: e,
108 | FirstFrame: frame,
109 | }
110 | }
111 | }
112 | 
113 | if len(stackMap) == 0 {
114 | return ""
115 | }
116 | 
117 | var summaries []*stackSummary
118 | for _, v := range stackMap {
119 | summaries = append(summaries, v)
120 | }
121 | sort.Slice(summaries, func(i, j int) bool {
122 | return summaries[i].Count > summaries[j].Count
123 | })
124 | 
125 | var report string
126 | report += "Stack Traces for Slow Operations:\n"
127 | limit := config.MaxStackTracesLimit
128 | if len(summaries) < limit {
129 | limit = len(summaries)
130 | }
131 | for i := 0; i < limit; i++ {
132 | s := summaries[i]
133 | e := s.Sample
134 | if e == nil {
135 | continue
136 | }
137 | report += fmt.Sprintf(" Hot stack %d: %d events, type=%s, target=%s, sample latency=%.2fms\n", i+1, s.Count, e.TypeString(), e.Target, float64(e.LatencyNS)/float64(config.NSPerMS)) // labeled as sample latency: e is the single sampled event, not an average over the group
138 | maxFrames := config.MaxStackFramesLimit
139 | if len(e.Stack) < maxFrames {
140 | maxFrames = len(e.Stack)
141 | }
142 | for j := 0; j < maxFrames; j++ {
143 | addr := e.Stack[j]
144 | frame := resolver.resolve(ctx, e.PID, addr)
145 | report += fmt.Sprintf(" #%d %s\n", j, frame)
146 | }
147 | }
148 | report += "\n"
149 | return report
150 | }
151 | 
152 | 
--------------------------------------------------------------------------------
/internal/tracing/extractor/http.go:
--------------------------------------------------------------------------------
1 | package extractor
2 | 
3 | import (
4 | "net/http"
5 | "strings"
6 | 
7 | "github.com/podtrace/podtrace/internal/tracing/context"
8 | )
9 | 
10 | const (
11 | MaxHeaderNameLength = 256
12 | MaxHeaderValueLength = 4096
13 | MaxHeaderCount = 100
14 | )
15 | 
16 | type HTTPExtractor struct {
17 | extractW3C bool
18 | extractB3 bool
19 | extractSplunk bool
20 | }
21 | 
22 | func NewHTTPExtractor() *HTTPExtractor {
23 | return &HTTPExtractor{
24 | extractW3C: true,
25 | extractB3: true,
26 | extractSplunk: true,
27 | }
28 | }
29 | 
30 | func (e *HTTPExtractor) ExtractFromHeaders(headers map[string]string) *context.TraceContext {
31 | if headers == nil {
32 | return nil
33 | }
34 | 
35 | if len(headers) > MaxHeaderCount {
36 | return nil
37 | }
38 | 
39 | normalized := make(map[string]string, len(headers))
40 | for k, v := range headers {
41 | if len(k) > MaxHeaderNameLength || len(v) > MaxHeaderValueLength {
42 | continue
43 | }
44 | if
strings.ContainsAny(k, "\r\n") || strings.ContainsAny(v, "\r\n") { 45 | continue 46 | } 47 | normalized[strings.ToLower(k)] = v 48 | } 49 | 50 | if e.extractW3C { 51 | if traceParent, ok := normalized["traceparent"]; ok { 52 | if tc, err := context.ParseW3CTraceParent(traceParent); err == nil { 53 | if tracestate, ok := normalized["tracestate"]; ok { 54 | tc.State = tracestate 55 | } 56 | return tc 57 | } 58 | } 59 | } 60 | 61 | if e.extractB3 { 62 | b3Headers := make(map[string]string) 63 | for k, v := range normalized { 64 | if strings.HasPrefix(k, "x-b3-") { 65 | b3Headers[k] = v 66 | } 67 | } 68 | if tc := context.ParseB3TraceContext(b3Headers); tc != nil { 69 | return tc 70 | } 71 | } 72 | 73 | if e.extractSplunk { 74 | if requestID, ok := normalized["x-splunk-requestid"]; ok { 75 | tc := context.NewTraceContext() 76 | tc.State = requestID 77 | return tc 78 | } 79 | } 80 | 81 | return nil 82 | } 83 | 84 | func (e *HTTPExtractor) ExtractFromHTTPRequest(req *http.Request) *context.TraceContext { 85 | if req == nil || req.Header == nil { 86 | return nil 87 | } 88 | 89 | if len(req.Header) > MaxHeaderCount { 90 | return nil 91 | } 92 | 93 | headers := make(map[string]string) 94 | for k, v := range req.Header { 95 | if len(v) > 0 { 96 | headerValue := v[0] 97 | if len(k) > MaxHeaderNameLength || len(headerValue) > MaxHeaderValueLength { 98 | continue 99 | } 100 | headers[k] = headerValue 101 | } 102 | } 103 | 104 | return e.ExtractFromHeaders(headers) 105 | } 106 | 107 | func (e *HTTPExtractor) ExtractFromHTTPResponse(resp *http.Response) *context.TraceContext { 108 | if resp == nil || resp.Header == nil { 109 | return nil 110 | } 111 | 112 | if len(resp.Header) > MaxHeaderCount { 113 | return nil 114 | } 115 | 116 | headers := make(map[string]string) 117 | for k, v := range resp.Header { 118 | if len(v) > 0 { 119 | headerValue := v[0] 120 | if len(k) > MaxHeaderNameLength || len(headerValue) > MaxHeaderValueLength { 121 | continue 122 | } 123 | headers[k] = headerValue 124 | } 125 | } 126 | 127 | return e.ExtractFromHeaders(headers) 128 | } 129 | 130 | func (e *HTTPExtractor) ExtractFromRawHeaders(rawHeaders string) *context.TraceContext { 131 | if rawHeaders == "" { 132 | return nil 133 | } 134 | 135 | headers := parseRawHeaders(rawHeaders) 136 | return e.ExtractFromHeaders(headers) 137 | } 138 | 139 | func parseRawHeaders(raw string) map[string]string { 140 | headers := make(map[string]string) 141 | lines := strings.Split(raw, "\r\n") 142 | 143 | for _, line := range lines { 144 | if line == "" { 145 | continue 146 | } 147 | if len(headers) >= MaxHeaderCount { 148 | break 149 | } 150 | idx := strings.Index(line, ":") 151 | if idx <= 0 { 152 | continue 153 | } 154 | key := strings.TrimSpace(line[:idx]) 155 | value := strings.TrimSpace(line[idx+1:]) 156 | if key != "" && value != "" { 157 | if len(key) > MaxHeaderNameLength || len(value) > MaxHeaderValueLength { 158 | continue 159 | } 160 | headers[key] = value 161 | } 162 | } 163 | 164 | return headers 165 | } 166 | -------------------------------------------------------------------------------- /internal/diagnose/tracker/connection.go: -------------------------------------------------------------------------------- 1 | package tracker 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | "time" 7 | 8 | "github.com/podtrace/podtrace/internal/config" 9 | "github.com/podtrace/podtrace/internal/events" 10 | ) 11 | 12 | type ConnectionInfo struct { 13 | Target string 14 | ConnectTime time.Time 15 | SendCount int 16 | RecvCount int 17 | TotalLatency 
time.Duration
18 | LastActivity time.Time
19 | }
20 | 
21 | type ConnectionTracker struct {
22 | connections map[string]*ConnectionInfo
23 | }
24 | 
25 | func NewConnectionTracker() *ConnectionTracker {
26 | return &ConnectionTracker{
27 | connections: make(map[string]*ConnectionInfo),
28 | }
29 | }
30 | 
31 | func (ct *ConnectionTracker) ProcessEvent(event *events.Event) {
32 | if event == nil {
33 | return
34 | }
35 | 
36 | switch event.Type {
37 | case events.EventConnect:
38 | if event.Error == 0 && event.Target != "" {
39 | conn := &ConnectionInfo{
40 | Target: event.Target,
41 | ConnectTime: event.TimestampTime(),
42 | LastActivity: event.TimestampTime(),
43 | }
44 | ct.connections[event.Target] = conn
45 | }
46 | 
47 | case events.EventTCPSend, events.EventTCPRecv:
48 | if event.Target != "" {
49 | if conn, exists := ct.connections[event.Target]; exists {
50 | if event.Type == events.EventTCPSend {
51 | conn.SendCount++
52 | } else {
53 | conn.RecvCount++
54 | }
55 | conn.TotalLatency += event.Latency()
56 | conn.LastActivity = event.TimestampTime()
57 | } else {
58 | conn := &ConnectionInfo{
59 | Target: event.Target,
60 | ConnectTime: event.TimestampTime(),
61 | LastActivity: event.TimestampTime(),
62 | }
63 | if event.Type == events.EventTCPSend {
64 | conn.SendCount = 1
65 | } else {
66 | conn.RecvCount = 1
67 | }
68 | conn.TotalLatency = event.Latency()
69 | ct.connections[event.Target] = conn
70 | }
71 | }
72 | }
73 | }
74 | 
75 | func (ct *ConnectionTracker) GetConnectionSummary() []ConnectionSummary {
76 | var summaries []ConnectionSummary
77 | for target, conn := range ct.connections {
78 | avgLatency := time.Duration(0)
79 | totalOps := conn.SendCount + conn.RecvCount
80 | if totalOps > 0 {
81 | avgLatency = conn.TotalLatency / time.Duration(totalOps)
82 | }
83 | summaries = append(summaries, ConnectionSummary{
84 | Target: target,
85 | ConnectTime: conn.ConnectTime,
86 | SendCount: conn.SendCount,
87 | RecvCount: conn.RecvCount,
88 | TotalOps: totalOps,
89 | AvgLatency: avgLatency,
90 | LastActivity: conn.LastActivity,
91 | })
92 | }
93 | sort.Slice(summaries, func(i, j int) bool {
94 | return summaries[i].TotalOps > summaries[j].TotalOps
95 | })
96 | return summaries
97 | }
98 | 
99 | type ConnectionSummary struct {
100 | Target string
101 | ConnectTime time.Time
102 | SendCount int
103 | RecvCount int
104 | TotalOps int
105 | AvgLatency time.Duration
106 | LastActivity time.Time
107 | }
108 | 
109 | func GenerateConnectionCorrelation(evts []*events.Event) string { // parameter renamed from "events" so it no longer shadows the events package
110 | tracker := NewConnectionTracker()
111 | for _, event := range evts {
112 | tracker.ProcessEvent(event)
113 | }
114 | 
115 | summaries := tracker.GetConnectionSummary()
116 | if len(summaries) == 0 {
117 | return ""
118 | }
119 | 
120 | report := "Connection Correlation:\n"
121 | report += fmt.Sprintf(" Active connections: %d\n", len(summaries))
122 | report += " Top connections by activity:\n"
123 | for i, summary := range summaries {
124 | if i >= config.MaxConnectionTargets {
125 | break
126 | }
127 | report += fmt.Sprintf(" - %s:\n", summary.Target)
128 | report += fmt.Sprintf(" Connect: %s\n", summary.ConnectTime.Format("15:04:05"))
129 | report += fmt.Sprintf(" Operations: %d send, %d recv (total: %d)\n", summary.SendCount, summary.RecvCount, summary.TotalOps)
130 | report += fmt.Sprintf(" Avg latency: %.2fms\n", float64(summary.AvgLatency.Nanoseconds())/float64(config.NSPerMS))
131 | report += fmt.Sprintf(" Last activity: %s\n", summary.LastActivity.Format("15:04:05.000"))
132 | }
133 | report += "\n"
134 | return report 135 | } 136 | -------------------------------------------------------------------------------- /internal/tracing/context/context.go: -------------------------------------------------------------------------------- 1 | package context 2 | 3 | import ( 4 | "crypto/rand" 5 | "encoding/hex" 6 | "fmt" 7 | "strings" 8 | ) 9 | 10 | type TraceContext struct { 11 | TraceID string 12 | SpanID string 13 | ParentSpanID string 14 | Flags uint8 15 | State string 16 | } 17 | 18 | func NewTraceContext() *TraceContext { 19 | return &TraceContext{ 20 | TraceID: generateTraceID(), 21 | SpanID: generateSpanID(), 22 | Flags: 0x01, 23 | } 24 | } 25 | 26 | func (tc *TraceContext) IsValid() bool { 27 | return tc.TraceID != "" && tc.SpanID != "" 28 | } 29 | 30 | func (tc *TraceContext) IsSampled() bool { 31 | return (tc.Flags & 0x01) == 0x01 32 | } 33 | 34 | func (tc *TraceContext) SetSampled(sampled bool) { 35 | if sampled { 36 | tc.Flags |= 0x01 37 | } else { 38 | tc.Flags &= 0xFE 39 | } 40 | } 41 | 42 | func (tc *TraceContext) CreateChild() *TraceContext { 43 | child := &TraceContext{ 44 | TraceID: tc.TraceID, 45 | ParentSpanID: tc.SpanID, 46 | SpanID: generateSpanID(), 47 | Flags: tc.Flags, 48 | State: tc.State, 49 | } 50 | return child 51 | } 52 | 53 | func generateTraceID() string { 54 | b := make([]byte, 16) 55 | if _, err := rand.Read(b); err != nil { 56 | return "" 57 | } 58 | return hex.EncodeToString(b) 59 | } 60 | 61 | func generateSpanID() string { 62 | b := make([]byte, 8) 63 | if _, err := rand.Read(b); err != nil { 64 | return "" 65 | } 66 | return hex.EncodeToString(b) 67 | } 68 | 69 | func ParseW3CTraceParent(traceParent string) (*TraceContext, error) { 70 | if traceParent == "" { 71 | return nil, fmt.Errorf("empty traceparent") 72 | } 73 | 74 | parts := strings.Split(traceParent, "-") 75 | if len(parts) != 4 { 76 | return nil, fmt.Errorf("invalid traceparent format") 77 | } 78 | 79 | if parts[0] != "00" { 80 | return nil, fmt.Errorf("unsupported version: %s", parts[0]) 81 | } 82 | 83 | traceID := parts[1] 84 | parentID := parts[2] 85 | flags := parts[3] 86 | 87 | if len(traceID) != 32 { 88 | return nil, fmt.Errorf("invalid trace ID length: %d", len(traceID)) 89 | } 90 | if len(parentID) != 16 { 91 | return nil, fmt.Errorf("invalid parent ID length: %d", len(parentID)) 92 | } 93 | if len(flags) != 2 { 94 | return nil, fmt.Errorf("invalid flags length: %d", len(flags)) 95 | } 96 | 97 | var flagsByte uint8 98 | if _, err := fmt.Sscanf(flags, "%02x", &flagsByte); err != nil { 99 | return nil, fmt.Errorf("invalid flags: %w", err) 100 | } 101 | 102 | return &TraceContext{ 103 | TraceID: traceID, 104 | ParentSpanID: parentID, 105 | SpanID: generateSpanID(), 106 | Flags: flagsByte, 107 | }, nil 108 | } 109 | 110 | func ParseB3TraceContext(headers map[string]string) *TraceContext { 111 | var traceID, spanID, parentSpanID, sampled, flags string 112 | 113 | for k, v := range headers { 114 | lowerK := strings.ToLower(k) 115 | switch lowerK { 116 | case "x-b3-traceid": 117 | traceID = v 118 | case "x-b3-spanid": 119 | spanID = v 120 | case "x-b3-parentspanid": 121 | parentSpanID = v 122 | case "x-b3-sampled": 123 | sampled = v 124 | case "x-b3-flags": 125 | flags = v 126 | } 127 | } 128 | 129 | if traceID == "" || spanID == "" { 130 | return nil 131 | } 132 | 133 | tc := &TraceContext{ 134 | TraceID: traceID, 135 | SpanID: spanID, 136 | ParentSpanID: parentSpanID, 137 | } 138 | 139 | if sampled == "1" || sampled == "true" || flags == "1" { 140 | tc.Flags = 0x01 141 | } 142 | 143 | return 
tc 144 | } 145 | 146 | func (tc *TraceContext) ToW3CTraceParent() string { 147 | if !tc.IsValid() { 148 | return "" 149 | } 150 | flags := fmt.Sprintf("%02x", tc.Flags) 151 | return fmt.Sprintf("00-%s-%s-%s", tc.TraceID, tc.SpanID, flags) 152 | } 153 | 154 | func (tc *TraceContext) ToB3Headers() map[string]string { 155 | if !tc.IsValid() { 156 | return nil 157 | } 158 | headers := map[string]string{ 159 | "X-B3-TraceId": tc.TraceID, 160 | "X-B3-SpanId": tc.SpanID, 161 | } 162 | if tc.ParentSpanID != "" { 163 | headers["X-B3-ParentSpanID"] = tc.ParentSpanID 164 | } 165 | if tc.IsSampled() { 166 | headers["X-B3-Sampled"] = "1" 167 | } 168 | return headers 169 | } 170 | -------------------------------------------------------------------------------- /internal/diagnose/tracker/trace_tracker_test.go: -------------------------------------------------------------------------------- 1 | package tracker 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/podtrace/podtrace/internal/events" 8 | ) 9 | 10 | func TestNewTraceTracker(t *testing.T) { 11 | tt := NewTraceTracker() 12 | if tt == nil { 13 | t.Fatal("NewTraceTracker returned nil") 14 | } 15 | if tt.GetTraceCount() != 0 { 16 | t.Error("New tracker should have 0 traces") 17 | } 18 | } 19 | 20 | func TestTraceTracker_ProcessEvent(t *testing.T) { 21 | tt := NewTraceTracker() 22 | 23 | event := &events.Event{ 24 | TraceID: "trace123", 25 | SpanID: "span123", 26 | ParentSpanID: "", 27 | Type: events.EventHTTPReq, 28 | Timestamp: uint64(time.Now().UnixNano()), 29 | ProcessName: "test-process", 30 | Target: "http://example.com", 31 | } 32 | 33 | tt.ProcessEvent(event, nil) 34 | 35 | if tt.GetTraceCount() != 1 { 36 | t.Errorf("Expected 1 trace, got %d", tt.GetTraceCount()) 37 | } 38 | 39 | trace := tt.GetTrace("trace123") 40 | if trace == nil { 41 | t.Fatal("Trace not found") 42 | } 43 | if len(trace.Spans) != 1 { 44 | t.Errorf("Expected 1 span, got %d", len(trace.Spans)) 45 | } 46 | } 47 | 48 | func TestTraceTracker_ProcessEvent_WithK8sContext(t *testing.T) { 49 | tt := NewTraceTracker() 50 | 51 | event := &events.Event{ 52 | TraceID: "trace123", 53 | SpanID: "span123", 54 | Type: events.EventHTTPReq, 55 | Timestamp: uint64(time.Now().UnixNano()), 56 | ProcessName: "test-process", 57 | } 58 | 59 | k8sCtx := map[string]interface{}{ 60 | "target_service": "test-service", 61 | "target_namespace": "default", 62 | "target_pod": "test-pod", 63 | "target_labels": map[string]string{"app": "test"}, 64 | } 65 | 66 | tt.ProcessEvent(event, k8sCtx) 67 | 68 | trace := tt.GetTrace("trace123") 69 | if trace == nil { 70 | t.Fatal("Trace not found") 71 | } 72 | 73 | if len(trace.Services) == 0 { 74 | t.Error("Services should be populated") 75 | } 76 | } 77 | 78 | func TestTraceTracker_ProcessEvent_NoTraceID(t *testing.T) { 79 | tt := NewTraceTracker() 80 | 81 | event := &events.Event{ 82 | TraceID: "", 83 | SpanID: "span123", 84 | Type: events.EventHTTPReq, 85 | } 86 | 87 | tt.ProcessEvent(event, nil) 88 | 89 | if tt.GetTraceCount() != 0 { 90 | t.Error("Event without TraceID should not create trace") 91 | } 92 | } 93 | 94 | func TestTraceTracker_GetAllTraces(t *testing.T) { 95 | tt := NewTraceTracker() 96 | 97 | event1 := &events.Event{ 98 | TraceID: "trace1", 99 | SpanID: "span1", 100 | Timestamp: uint64(time.Now().UnixNano()), 101 | Type: events.EventHTTPReq, 102 | } 103 | 104 | event2 := &events.Event{ 105 | TraceID: "trace2", 106 | SpanID: "span2", 107 | Timestamp: uint64(time.Now().UnixNano()), 108 | Type: events.EventHTTPReq, 109 | } 110 | 111 | 
tt.ProcessEvent(event1, nil) 112 | tt.ProcessEvent(event2, nil) 113 | 114 | traces := tt.GetAllTraces() 115 | if len(traces) != 2 { 116 | t.Errorf("Expected 2 traces, got %d", len(traces)) 117 | } 118 | } 119 | 120 | func TestTraceTracker_CleanupOldTraces(t *testing.T) { 121 | tt := NewTraceTracker() 122 | 123 | oldTime := time.Now().Add(-15 * time.Minute) 124 | event := &events.Event{ 125 | TraceID: "old-trace", 126 | SpanID: "span1", 127 | Timestamp: uint64(oldTime.UnixNano()), 128 | Type: events.EventHTTPReq, 129 | } 130 | 131 | tt.ProcessEvent(event, nil) 132 | 133 | tt.CleanupOldTraces(10 * time.Minute) 134 | 135 | if tt.GetTraceCount() != 0 { 136 | t.Error("Old traces should be cleaned up") 137 | } 138 | } 139 | 140 | func TestSpan_UpdateDuration(t *testing.T) { 141 | span := &Span{ 142 | TraceID: "trace1", 143 | SpanID: "span1", 144 | StartTime: time.Now(), 145 | Events: []*events.Event{ 146 | { 147 | Timestamp: uint64(time.Now().UnixNano()), 148 | Type: events.EventHTTPReq, 149 | }, 150 | { 151 | Timestamp: uint64(time.Now().Add(100 * time.Millisecond).UnixNano()), 152 | Type: events.EventHTTPResp, 153 | }, 154 | }, 155 | } 156 | 157 | span.UpdateDuration() 158 | 159 | if span.Duration == 0 { 160 | t.Error("Duration should be updated") 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /internal/ebpf/tracer/errors_test.go: -------------------------------------------------------------------------------- 1 | package tracer 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | ) 7 | 8 | func TestTracerError_Error(t *testing.T) { 9 | tests := []struct { 10 | name string 11 | err *TracerError 12 | wantMsg string 13 | }{ 14 | { 15 | name: "error with wrapped error", 16 | err: &TracerError{ 17 | Code: ErrCodeCollectionFailed, 18 | Message: "test error", 19 | Err: errors.New("wrapped error"), 20 | }, 21 | wantMsg: "test error: wrapped error", 22 | }, 23 | { 24 | name: "error without wrapped error", 25 | err: &TracerError{ 26 | Code: ErrCodeRingBufferFailed, 27 | Message: "test error", 28 | Err: nil, 29 | }, 30 | wantMsg: "test error", 31 | }, 32 | } 33 | 34 | for _, tt := range tests { 35 | t.Run(tt.name, func(t *testing.T) { 36 | got := tt.err.Error() 37 | if got != tt.wantMsg { 38 | t.Errorf("Error() = %v, want %v", got, tt.wantMsg) 39 | } 40 | }) 41 | } 42 | } 43 | 44 | func TestTracerError_Unwrap(t *testing.T) { 45 | wrappedErr := errors.New("wrapped error") 46 | err := &TracerError{ 47 | Code: ErrCodeCollectionFailed, 48 | Message: "test error", 49 | Err: wrappedErr, 50 | } 51 | 52 | unwrapped := err.Unwrap() 53 | if unwrapped != wrappedErr { 54 | t.Errorf("Unwrap() = %v, want %v", unwrapped, wrappedErr) 55 | } 56 | 57 | errNoWrap := &TracerError{ 58 | Code: ErrCodeRingBufferFailed, 59 | Message: "test error", 60 | Err: nil, 61 | } 62 | 63 | unwrapped = errNoWrap.Unwrap() 64 | if unwrapped != nil { 65 | t.Errorf("Unwrap() = %v, want nil", unwrapped) 66 | } 67 | } 68 | 69 | func TestNewCollectionError(t *testing.T) { 70 | wrappedErr := errors.New("collection failed") 71 | err := NewCollectionError(wrappedErr) 72 | 73 | if err == nil { 74 | t.Fatal("Expected non-nil error") 75 | } 76 | 77 | if err.Code != ErrCodeCollectionFailed { 78 | t.Errorf("Expected Code %d, got %d", ErrCodeCollectionFailed, err.Code) 79 | } 80 | 81 | if err.Message != "failed to create eBPF collection" { 82 | t.Errorf("Expected Message 'failed to create eBPF collection', got %q", err.Message) 83 | } 84 | 85 | if err.Err != wrappedErr { 86 | t.Errorf("Expected wrapped 
error %v, got %v", wrappedErr, err.Err) 87 | } 88 | } 89 | 90 | func TestNewRingBufferError(t *testing.T) { 91 | wrappedErr := errors.New("ring buffer failed") 92 | err := NewRingBufferError(wrappedErr) 93 | 94 | if err == nil { 95 | t.Fatal("Expected non-nil error") 96 | } 97 | 98 | if err.Code != ErrCodeRingBufferFailed { 99 | t.Errorf("Expected Code %d, got %d", ErrCodeRingBufferFailed, err.Code) 100 | } 101 | 102 | if err.Message != "failed to create ring buffer reader" { 103 | t.Errorf("Expected Message 'failed to create ring buffer reader', got %q", err.Message) 104 | } 105 | 106 | if err.Err != wrappedErr { 107 | t.Errorf("Expected wrapped error %v, got %v", wrappedErr, err.Err) 108 | } 109 | } 110 | 111 | func TestNewMapLookupError(t *testing.T) { 112 | mapName := "test_map" 113 | wrappedErr := errors.New("lookup failed") 114 | err := NewMapLookupError(mapName, wrappedErr) 115 | 116 | if err == nil { 117 | t.Fatal("Expected non-nil error") 118 | } 119 | 120 | if err.Code != ErrCodeMapLookupFailed { 121 | t.Errorf("Expected Code %d, got %d", ErrCodeMapLookupFailed, err.Code) 122 | } 123 | 124 | expectedMsg := "failed to lookup map test_map" 125 | if err.Message != expectedMsg { 126 | t.Errorf("Expected Message %q, got %q", expectedMsg, err.Message) 127 | } 128 | 129 | if err.Err != wrappedErr { 130 | t.Errorf("Expected wrapped error %v, got %v", wrappedErr, err.Err) 131 | } 132 | } 133 | 134 | func TestNewInvalidEventError(t *testing.T) { 135 | reason := "invalid format" 136 | err := NewInvalidEventError(reason) 137 | 138 | if err == nil { 139 | t.Fatal("Expected non-nil error") 140 | } 141 | 142 | if err.Code != ErrCodeInvalidEvent { 143 | t.Errorf("Expected Code %d, got %d", ErrCodeInvalidEvent, err.Code) 144 | } 145 | 146 | expectedMsg := "invalid event: invalid format" 147 | if err.Message != expectedMsg { 148 | t.Errorf("Expected Message %q, got %q", expectedMsg, err.Message) 149 | } 150 | 151 | if err.Err != nil { 152 | t.Errorf("Expected nil wrapped error, got %v", err.Err) 153 | } 154 | } 155 | 156 | -------------------------------------------------------------------------------- /test/pool-test/pool-test-app.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "database/sql" 6 | "fmt" 7 | "log" 8 | "sync" 9 | "time" 10 | 11 | _ "github.com/mattn/go-sqlite3" 12 | ) 13 | 14 | func main() { 15 | db, err := sql.Open("sqlite3", ":memory:") 16 | if err != nil { 17 | log.Fatalf("Failed to open database: %v", err) 18 | } 19 | defer db.Close() 20 | 21 | maxOpenConns := 5 22 | maxIdleConns := 2 23 | db.SetMaxOpenConns(maxOpenConns) 24 | db.SetMaxIdleConns(maxIdleConns) 25 | db.SetConnMaxLifetime(time.Hour) 26 | 27 | _, err = db.Exec("CREATE TABLE IF NOT EXISTS test (id INTEGER PRIMARY KEY, data TEXT)") 28 | if err != nil { 29 | log.Fatalf("Failed to create table: %v", err) 30 | } 31 | 32 | fmt.Println("=== Connection Pool Test App Started ===") 33 | fmt.Printf("MaxOpenConns: %d, MaxIdleConns: %d\n", maxOpenConns, maxIdleConns) 34 | fmt.Println("") 35 | 36 | var wg sync.WaitGroup 37 | acquireCount := int64(0) 38 | releaseCount := int64(0) 39 | var mu sync.Mutex 40 | 41 | phase1 := func() { 42 | fmt.Println("Phase 1: Normal operations (100 inserts)") 43 | for i := 0; i < 100; i++ { 44 | _, err := db.Exec("INSERT INTO test (data) VALUES (?)", fmt.Sprintf("data-%d", i)) 45 | if err != nil { 46 | log.Printf("Failed to execute query: %v", err) 47 | continue 48 | } 49 | 50 | mu.Lock() 51 | acquireCount++ 52 
| releaseCount++ 53 | mu.Unlock() 54 | 55 | if i%10 == 0 { 56 | fmt.Printf(" Insert %d completed\n", i) 57 | } 58 | 59 | time.Sleep(50 * time.Millisecond) 60 | } 61 | fmt.Println("Phase 1 completed") 62 | fmt.Println("") 63 | } 64 | 65 | phase2 := func() { 66 | fmt.Println("Phase 2: Concurrent queries (20 queries)") 67 | for i := 0; i < 20; i++ { 68 | var count int 69 | err := db.QueryRow("SELECT COUNT(*) FROM test").Scan(&count) 70 | if err != nil { 71 | log.Printf("Query failed: %v", err) 72 | } else { 73 | mu.Lock() 74 | acquireCount++ 75 | releaseCount++ 76 | mu.Unlock() 77 | } 78 | time.Sleep(100 * time.Millisecond) 79 | } 80 | fmt.Println("Phase 2 completed") 81 | fmt.Println("") 82 | } 83 | 84 | phase3 := func() { 85 | fmt.Println("Phase 3: Pool exhaustion test (10 concurrent connections)") 86 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 87 | defer cancel() 88 | 89 | for i := 0; i < 10; i++ { 90 | wg.Add(1) 91 | go func(id int) { 92 | defer wg.Done() 93 | conn, err := db.Conn(ctx) 94 | if err != nil { 95 | fmt.Printf(" Connection %d: Failed to acquire (pool exhausted): %v\n", id, err) 96 | return 97 | } 98 | defer conn.Close() 99 | 100 | mu.Lock() 101 | acquireCount++ 102 | mu.Unlock() 103 | 104 | fmt.Printf(" Connection %d: Acquired\n", id) 105 | time.Sleep(2 * time.Second) 106 | 107 | mu.Lock() 108 | releaseCount++ 109 | mu.Unlock() 110 | 111 | fmt.Printf(" Connection %d: Released\n", id) 112 | }(i) 113 | } 114 | wg.Wait() 115 | fmt.Println("Phase 3 completed") 116 | fmt.Println("") 117 | } 118 | 119 | phase4 := func() { 120 | fmt.Println("Phase 4: Continuous operations (running indefinitely)") 121 | ticker := time.NewTicker(500 * time.Millisecond) 122 | defer ticker.Stop() 123 | 124 | for range ticker.C { 125 | var count int 126 | err := db.QueryRow("SELECT COUNT(*) FROM test").Scan(&count) 127 | if err != nil { 128 | log.Printf("Query failed: %v", err) 129 | continue 130 | } 131 | 132 | mu.Lock() 133 | acquireCount++ 134 | releaseCount++ 135 | currentAcq := acquireCount 136 | currentRel := releaseCount 137 | mu.Unlock() 138 | 139 | if currentAcq%10 == 0 { 140 | fmt.Printf("Running: %d acquires, %d releases\n", currentAcq, currentRel) 141 | } 142 | } 143 | } 144 | 145 | phase1() 146 | phase2() 147 | phase3() 148 | 149 | mu.Lock() 150 | totalAcq := acquireCount 151 | totalRel := releaseCount 152 | mu.Unlock() 153 | 154 | fmt.Printf("=== Summary ===\n") 155 | fmt.Printf("Total acquires: %d\n", totalAcq) 156 | fmt.Printf("Total releases: %d\n", totalRel) 157 | fmt.Printf("Reuse rate: %.2f%%\n", float64(totalRel)/float64(totalAcq)*100) 158 | fmt.Println("") 159 | fmt.Println("Starting continuous operations...") 160 | fmt.Println("") 161 | 162 | phase4() 163 | } 164 | -------------------------------------------------------------------------------- /internal/tracing/exporter/otlp.go: -------------------------------------------------------------------------------- 1 | package exporter 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "time" 7 | 8 | "go.opentelemetry.io/otel" 9 | "go.opentelemetry.io/otel/attribute" 10 | "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" 11 | "go.opentelemetry.io/otel/sdk/resource" 12 | sdktrace "go.opentelemetry.io/otel/sdk/trace" 13 | semconv "go.opentelemetry.io/otel/semconv/v1.24.0" 14 | "go.opentelemetry.io/otel/trace" 15 | 16 | "github.com/podtrace/podtrace/internal/config" 17 | "github.com/podtrace/podtrace/internal/diagnose/tracker" 18 | ) 19 | 20 | type OTLPExporter struct { 21 | exporter 
sdktrace.SpanExporter 22 | tracer trace.Tracer 23 | tp *sdktrace.TracerProvider 24 | endpoint string 25 | enabled bool 26 | sampleRate float64 27 | } 28 | 29 | func NewOTLPExporter(endpoint string, sampleRate float64) (*OTLPExporter, error) { 30 | if endpoint == "" { 31 | endpoint = config.DefaultOTLPEndpoint 32 | } 33 | 34 | ctx := context.Background() 35 | otlpExporter, err := otlptracehttp.New(ctx, 36 | otlptracehttp.WithEndpoint(endpoint), 37 | otlptracehttp.WithInsecure(), 38 | ) 39 | if err != nil { 40 | return nil, fmt.Errorf("failed to create OTLP exporter: %w", err) 41 | } 42 | 43 | res, err := resource.New(ctx, 44 | resource.WithAttributes( 45 | semconv.ServiceNameKey.String("Podtrace"), 46 | ), 47 | ) 48 | if err != nil { 49 | return nil, fmt.Errorf("failed to create resource: %w", err) 50 | } 51 | 52 | tp := sdktrace.NewTracerProvider( 53 | sdktrace.WithBatcher(otlpExporter), 54 | sdktrace.WithResource(res), 55 | ) 56 | 57 | otel.SetTracerProvider(tp) 58 | 59 | return &OTLPExporter{ 60 | exporter: otlpExporter, 61 | tp: tp, 62 | tracer: tp.Tracer("Podtrace"), 63 | endpoint: endpoint, 64 | enabled: true, 65 | sampleRate: sampleRate, 66 | }, nil 67 | } 68 | 69 | func (e *OTLPExporter) ExportTraces(traces []*tracker.Trace) error { 70 | if !e.enabled || len(traces) == 0 { 71 | return nil 72 | } 73 | 74 | ctx := context.Background() 75 | for _, t := range traces { 76 | if !e.shouldSample(t) { 77 | continue 78 | } 79 | 80 | for _, span := range t.Spans { 81 | if err := e.exportSpan(ctx, span, t); err != nil { 82 | continue 83 | } 84 | } 85 | } 86 | 87 | return nil 88 | } 89 | 90 | func (e *OTLPExporter) shouldSample(_ *tracker.Trace) bool { 91 | if e.sampleRate >= 1.0 { 92 | return true 93 | } 94 | if e.sampleRate <= 0.0 { 95 | return false 96 | } 97 | return time.Now().UnixNano()%int64(1.0/e.sampleRate) == 0 98 | } 99 | 100 | func (e *OTLPExporter) exportSpan(ctx context.Context, span *tracker.Span, _ *tracker.Trace) error { 101 | span.UpdateDuration() 102 | 103 | traceID, err := trace.TraceIDFromHex(span.TraceID) 104 | if err != nil { 105 | return fmt.Errorf("invalid trace ID: %w", err) 106 | } 107 | 108 | spanID, err := trace.SpanIDFromHex(span.SpanID) 109 | if err != nil { 110 | return fmt.Errorf("invalid span ID: %w", err) 111 | } 112 | 113 | spanContext := trace.NewSpanContext(trace.SpanContextConfig{ 114 | TraceID: traceID, 115 | SpanID: spanID, 116 | Remote: false, 117 | TraceFlags: trace.FlagsSampled, 118 | }) 119 | 120 | ctx = trace.ContextWithSpanContext(ctx, spanContext) 121 | 122 | _, otelSpan := e.tracer.Start(ctx, span.Operation, 123 | trace.WithTimestamp(span.StartTime), 124 | ) 125 | 126 | attrs := make([]attribute.KeyValue, 0, len(span.Attributes)) 127 | for k, v := range span.Attributes { 128 | attrs = append(attrs, attribute.String(k, v)) 129 | } 130 | if span.ParentSpanID != "" { 131 | attrs = append(attrs, attribute.String("parent_span_id", span.ParentSpanID)) 132 | } 133 | otelSpan.SetAttributes(attrs...) 
134 | 135 | if span.Error { 136 | otelSpan.RecordError(fmt.Errorf("span error")) 137 | } 138 | 139 | for _, event := range span.Events { 140 | otelSpan.AddEvent(event.TypeString(), 141 | trace.WithTimestamp(event.TimestampTime()), 142 | trace.WithAttributes( 143 | attribute.String("target", event.Target), 144 | attribute.Int64("latency_ns", int64(event.LatencyNS)), 145 | ), 146 | ) 147 | } 148 | 149 | otelSpan.End(trace.WithTimestamp(span.StartTime.Add(span.Duration))) 150 | 151 | return nil 152 | } 153 | 154 | func (e *OTLPExporter) Shutdown(ctx context.Context) error { 155 | if e.tp != nil { 156 | return e.tp.Shutdown(ctx) 157 | } 158 | return nil 159 | } 160 | -------------------------------------------------------------------------------- /internal/diagnose/formatter/formatter_test.go: -------------------------------------------------------------------------------- 1 | package formatter 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/podtrace/podtrace/internal/diagnose/analyzer" 7 | ) 8 | 9 | func TestSectionHeader(t *testing.T) { 10 | result := SectionHeader("DNS") 11 | if result != "DNS Statistics:\n" { 12 | t.Errorf("Expected 'DNS Statistics:\\n', got %q", result) 13 | } 14 | } 15 | 16 | func TestTotalWithRate(t *testing.T) { 17 | result := TotalWithRate("lookups", 100, 10.5) 18 | expected := " Total lookups: 100 (10.5/sec)\n" 19 | if result != expected { 20 | t.Errorf("Expected %q, got %q", expected, result) 21 | } 22 | } 23 | 24 | func TestLatencyMetrics(t *testing.T) { 25 | result := LatencyMetrics(5.5, 10.2) 26 | if !contains(result, "5.50") || !contains(result, "10.20") { 27 | t.Errorf("Expected latency metrics, got %q", result) 28 | } 29 | } 30 | 31 | func TestPercentiles(t *testing.T) { 32 | result := Percentiles(1.0, 2.0, 3.0) 33 | if !contains(result, "P50=1.00") || !contains(result, "P95=2.00") || !contains(result, "P99=3.00") { 34 | t.Errorf("Expected percentiles, got %q", result) 35 | } 36 | } 37 | 38 | func TestErrorRate_ZeroTotal(t *testing.T) { 39 | result := ErrorRate(5, 0) 40 | if !contains(result, "0.0%") { 41 | t.Errorf("Expected 0.0%% for zero total, got %q", result) 42 | } 43 | } 44 | 45 | func TestErrorRate_WithErrors(t *testing.T) { 46 | result := ErrorRate(5, 100) 47 | if !contains(result, "5.0%") { 48 | t.Errorf("Expected 5.0%% error rate, got %q", result) 49 | } 50 | } 51 | 52 | func TestTopTargets_Empty(t *testing.T) { 53 | result := TopTargets([]analyzer.TargetCount{}, 5, "targets", "counts") 54 | if result != "" { 55 | t.Errorf("Expected empty string for empty targets, got %q", result) 56 | } 57 | } 58 | 59 | func TestTopTargets_WithLimit(t *testing.T) { 60 | targets := []analyzer.TargetCount{ 61 | {Target: "target1", Count: 10}, 62 | {Target: "target2", Count: 20}, 63 | {Target: "target3", Count: 30}, 64 | {Target: "target4", Count: 40}, 65 | {Target: "target5", Count: 50}, 66 | {Target: "target6", Count: 60}, 67 | } 68 | result := TopTargets(targets, 3, "targets", "counts") 69 | if countOccurrences(result, "-") > 3 { 70 | t.Errorf("Expected at most 3 targets, got more") 71 | } 72 | } 73 | 74 | func TestBytesSection_Empty(t *testing.T) { 75 | result := BytesSection(0, 0, 0) 76 | if result != "" { 77 | t.Errorf("Expected empty string for zero bytes, got %q", result) 78 | } 79 | } 80 | 81 | func TestBytesSection_WithBytes(t *testing.T) { 82 | result := BytesSection(1024, 512, 256) 83 | if result == "" { 84 | t.Error("Expected non-empty bytes section") 85 | } 86 | if !contains(result, "KB") && !contains(result, "B") { 87 | t.Errorf("Expected bytes 
section with formatted bytes, got %q", result)
88 | }
89 | }
90 | 
91 | func TestRate_ZeroDuration(t *testing.T) {
92 | result := Rate(100, 0)
93 | if result != "" {
94 | t.Errorf("Expected empty string for zero duration, got %q", result)
95 | }
96 | }
97 | 
98 | func TestRate_WithDuration(t *testing.T) {
99 | result := Rate(100, 10.0)
100 | if !contains(result, "10.0") {
101 | t.Errorf("Expected rate string, got %q", result)
102 | }
103 | }
104 | 
105 | func TestTopItems_Empty(t *testing.T) {
106 | result := TopItems(map[string]int{}, 5, "items", "counts")
107 | if result != "" {
108 | t.Errorf("Expected empty string for empty items, got %q", result)
109 | }
110 | }
111 | 
112 | func TestTopItems_WithLimit(t *testing.T) {
113 | items := map[string]int{
114 | "item1": 10,
115 | "item2": 20,
116 | "item3": 30,
117 | "item4": 40,
118 | "item5": 50,
119 | "item6": 60,
120 | }
121 | result := TopItems(items, 3, "items", "counts")
122 | if countOccurrences(result, "-") > 3 {
123 | t.Errorf("Expected at most 3 items, got more")
124 | }
125 | }
126 | 
127 | // contains reports whether substr occurs anywhere in s; the single scan also
128 | // covers the prefix, suffix, and equality cases that were previously special-cased.
129 | func contains(s, substr string) bool {
130 | for i := 0; i+len(substr) <= len(s); i++ {
131 | if s[i:i+len(substr)] == substr {
132 | return true
133 | }
134 | }
135 | return false
136 | }
137 | 
138 | func countOccurrences(s, substr string) int {
139 | count := 0
140 | for i := 0; i <= len(s)-len(substr); i++ {
141 | if s[i:i+len(substr)] == substr {
142 | count++
143 | }
144 | }
145 | return count
146 | }
147 | 
148 | 
--------------------------------------------------------------------------------
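
A minimal usage sketch, not a file in this repository, composing a report section from the formatter helpers exercised by the tests above. Argument types are inferred from those test calls, and since the package lives under internal/, such a caller would have to sit inside this module:

package main

import (
	"fmt"

	"github.com/podtrace/podtrace/internal/diagnose/formatter"
)

func main() {
	// Build a report section the way the tests exercise the helpers:
	// a header line, a total-with-rate line, percentiles, and an error rate.
	report := formatter.SectionHeader("DNS") // yields "DNS Statistics:\n"
	report += formatter.TotalWithRate("lookups", 100, 10.5)
	report += formatter.Percentiles(1.0, 2.0, 3.0)
	report += formatter.ErrorRate(5, 100) // 5 errors out of 100 operations
	fmt.Print(report)
}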