├── resource └── npd │ ├── yml-json-rules.png │ ├── yml-your_json_name.png │ ├── yml-json-conditions.png │ ├── yml-json-your_json_name.png │ ├── npd-dashboard-DNSProblem.png │ └── npd-dashboard-daemonsets.png ├── Dockerfile ├── pkg ├── env │ ├── darwin.go │ ├── windows.go │ ├── linux.go │ ├── env.go │ ├── k8s.go │ └── azure.go ├── tools │ ├── aadssh │ │ ├── ssh_agent_unix.go │ │ ├── ssh_agent_win.go │ │ ├── ssh_agent.go │ │ ├── transport.go │ │ ├── token_azure_cli.go │ │ ├── aadssh.go │ │ ├── token.go │ │ └── ssh.go │ ├── registry.go │ ├── tool.go │ ├── vmrebootdetector │ │ ├── vmrebootdetector_test.go │ │ └── vmrebootdetector.go │ ├── upgradeinspector │ │ ├── upgradeinspector_test.go │ │ └── upgradeinspector.go │ ├── tcpdump │ │ ├── tcpdump_test.go │ │ └── tcpdump.go │ └── netexec │ │ ├── netexec_test.go │ │ └── netexec.go ├── batch │ ├── static_discoverer.go │ ├── batch.go │ ├── file_discoverer_test.go │ ├── file_discoverer.go │ ├── kube_discoverer_test.go │ ├── kube_discoverer.go │ ├── ssh_executor.go │ └── pod_executor.go ├── formatters │ ├── formatter.go │ ├── json.go │ ├── oneline.go │ └── text.go ├── checkers │ ├── dummy │ │ └── dummy.go │ ├── checker.go │ ├── tcpping │ │ ├── tcpping_test.go │ │ └── tcpping.go │ ├── kube │ │ ├── objectsize │ │ │ ├── objectsize_test.go │ │ │ └── objectsize.go │ │ └── pod │ │ │ └── pod_restart_reason_checker.go │ ├── systemload │ │ ├── systemload_test.go │ │ └── systemload.go │ ├── podschedule │ │ ├── podschedule_test.go │ │ └── podschedule.go │ ├── icmp │ │ ├── icmp_test.go │ │ └── icmp.go │ ├── registry.go │ ├── liveness │ │ └── liveness.go │ ├── oom │ │ ├── oom_test.go │ │ └── oom.go │ ├── diskusage │ │ ├── diskusage_test.go │ │ └── diskusage.go │ ├── http │ │ └── http.go │ ├── diskreadonly │ │ └── disk_readonly.go │ ├── kmscachesize │ │ └── kms_cache_size.go │ └── dns │ │ ├── dns_test.go │ │ └── dns.go └── base │ └── models.go ├── cmd ├── pause.go ├── batch.go ├── run-as-host │ └── main.go └── main.go ├── .gitignore ├── 
Makefile ├── CODE_OF_CONDUCT.md ├── .github └── workflows │ ├── go.yml │ ├── release.yml │ └── container.yml ├── LICENSE ├── .devcontainer ├── Dockerfile └── devcontainer.json ├── SUPPORT.md ├── SECURITY.md ├── deploy └── node-problem-detector │ ├── README.md │ ├── node-problem-detector.yaml │ └── node-problem-detector-template.yaml ├── go.mod └── README.md /resource/npd/yml-json-rules.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/kdebug/HEAD/resource/npd/yml-json-rules.png -------------------------------------------------------------------------------- /resource/npd/yml-your_json_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/kdebug/HEAD/resource/npd/yml-your_json_name.png -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/distroless/static-debian11 2 | 3 | ADD bin/kdebug bin/run-as-host / 4 | 5 | CMD [ "/kdebug" ] 6 | -------------------------------------------------------------------------------- /resource/npd/yml-json-conditions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/kdebug/HEAD/resource/npd/yml-json-conditions.png -------------------------------------------------------------------------------- /resource/npd/yml-json-your_json_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/kdebug/HEAD/resource/npd/yml-json-your_json_name.png -------------------------------------------------------------------------------- /resource/npd/npd-dashboard-DNSProblem.png: -------------------------------------------------------------------------------- 
// StaticBatchDiscoverer is a discoverer backed by a fixed, caller-supplied
// list of machines.
type StaticBatchDiscoverer struct {
	// Machines is the list handed back verbatim by Discover.
	Machines []string
}

// Discover returns the configured machine list as-is (the same slice, not a
// copy). The returned error is always nil.
func (d *StaticBatchDiscoverer) Discover() (machines []string, err error) {
	machines = d.Machines
	return machines, nil
}
// pause blocks the calling goroutine until the process receives an interrupt
// signal, then prints which signal arrived and returns.
func pause() {
	// Buffered with capacity 1: signal.Notify does not block when sending, so
	// an unbuffered channel could miss a signal that arrives before the
	// receive below is reached.
	c := make(chan os.Signal, 1)

	// Only os.Interrupt is registered. os.Kill (SIGKILL) can never be
	// delivered to a handler, so subscribing to it had no effect.
	// NOTE(review): a termination request (SIGTERM) is not handled here;
	// supporting it requires importing syscall - confirm whether it is wanted.
	signal.Notify(c, os.Interrupt)

	s := <-c
	// Terminate the line so the message does not run into the shell prompt.
	fmt.Printf("Shutting down, got signal: %s\n", s)
}
github.com/Azure/kdebug/cmd 3 | CGO_ENABLED=0 go build -o bin/run-as-host github.com/Azure/kdebug/cmd/run-as-host 4 | 5 | build-win: 6 | CGO_ENABLED=0 GOOS=windows go build -o bin/kdebug.exe github.com/Azure/kdebug/cmd 7 | 8 | test: 9 | CGO_ENABLED=0 go test -v github.com/Azure/kdebug/... 10 | -------------------------------------------------------------------------------- /pkg/env/linux.go: -------------------------------------------------------------------------------- 1 | //go:build linux 2 | 3 | package env 4 | 5 | import ( 6 | "os" 7 | "runtime" 8 | "strings" 9 | 10 | "github.com/zcalusic/sysinfo" 11 | ) 12 | 13 | func getLinuxFlags() []string { 14 | var si sysinfo.SysInfo 15 | si.GetSysInfo() 16 | flags := []string{ 17 | runtime.GOOS, 18 | strings.ToLower(si.OS.Vendor), 19 | } 20 | 21 | if os.Geteuid() == 0 { 22 | flags = append(flags, "root") 23 | } 24 | 25 | return flags 26 | } 27 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /pkg/formatters/json.go: -------------------------------------------------------------------------------- 1 | package formatters 2 | 3 | import ( 4 | "encoding/json" 5 | "io" 6 | 7 | "github.com/Azure/kdebug/pkg/base" 8 | "github.com/Azure/kdebug/pkg/batch" 9 | ) 10 | 11 | type JsonFormatter struct{} 12 | 13 | func (f *JsonFormatter) WriteResults(w io.Writer, results []*base.CheckResult) error { 14 | enc := json.NewEncoder(w) 15 | enc.SetIndent("", " ") 16 | return enc.Encode(results) 17 | } 18 | 19 | func (f *JsonFormatter) WriteBatchResults(w io.Writer, results []*batch.BatchResult) error { 20 | // TODO 21 | return nil 22 | } 23 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | 16 | - name: Set up Go 17 | uses: actions/setup-go@v2 18 | with: 19 | go-version: "1.20" 20 | 21 | - name: Build 22 | run: make build 23 | 24 | - name: Test 25 | run: make test 26 | 27 | - name: Upload 28 | uses: actions/upload-artifact@v3 29 | with: 30 | name: kdebug 31 | path: bin/kdebug 32 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | release: 5 | types: [ created ] 6 | 7 | jobs: 
// StaticEnvironment is an environment description whose flags are fixed once
// the value has been built.
type StaticEnvironment struct {
	// Flags lists every flag this environment reports as present.
	Flags []string
}

// HasFlag reports whether flag is exactly equal to one of e.Flags.
func (e *StaticEnvironment) HasFlag(flag string) bool {
	for i := 0; i < len(e.Flags); i++ {
		if e.Flags[i] == flag {
			return true
		}
	}
	return false
}
31 | 32 | return flags 33 | } 34 | -------------------------------------------------------------------------------- /pkg/batch/batch.go: -------------------------------------------------------------------------------- 1 | package batch 2 | 3 | import "github.com/Azure/kdebug/pkg/base" 4 | 5 | type BatchOptions struct { 6 | Machines []string 7 | Checkers []string 8 | Concurrency int 9 | Reporter BatchReportor 10 | } 11 | 12 | type batchTask struct { 13 | Machine string 14 | Checkers []string 15 | } 16 | 17 | type BatchResult struct { 18 | Machine string 19 | Error error 20 | CheckResults []*base.CheckResult 21 | } 22 | 23 | type BatchExecutor interface { 24 | Execute(opts *BatchOptions) ([]*BatchResult, error) 25 | } 26 | 27 | type BatchReportor interface { 28 | OnResult(result *BatchResult) 29 | } 30 | 31 | type BatchDiscoverer interface { 32 | Discover() ([]string, error) 33 | } 34 | -------------------------------------------------------------------------------- /pkg/batch/file_discoverer_test.go: -------------------------------------------------------------------------------- 1 | package batch 2 | 3 | import ( 4 | "io/ioutil" 5 | "os" 6 | "reflect" 7 | "testing" 8 | ) 9 | 10 | func TestFileBatchDiscoverer(t *testing.T) { 11 | f, err := ioutil.TempFile("", "batch-discover") 12 | if err != nil { 13 | t.Errorf("Fail to create temp file") 14 | } 15 | defer os.Remove(f.Name()) 16 | if _, err = f.Write([]byte("m1\nm2")); err != nil { 17 | t.Errorf("Fail to write temp file") 18 | } 19 | f.Close() 20 | 21 | d := &FileBatchDiscoverer{Path: f.Name()} 22 | machines, err := d.Discover() 23 | if err != nil { 24 | t.Errorf("Expect no error but got: %+v", err) 25 | } 26 | if !reflect.DeepEqual(machines, []string{"m1", "m2"}) { 27 | t.Errorf("Discovered machines list is not correct: %+v", machines) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /pkg/tools/registry.go: 
-------------------------------------------------------------------------------- 1 | package tools 2 | 3 | import ( 4 | "sort" 5 | 6 | "github.com/Azure/kdebug/pkg/tools/aadssh" 7 | "github.com/Azure/kdebug/pkg/tools/netexec" 8 | "github.com/Azure/kdebug/pkg/tools/tcpdump" 9 | "github.com/Azure/kdebug/pkg/tools/upgradeinspector" 10 | "github.com/Azure/kdebug/pkg/tools/vmrebootdetector" 11 | ) 12 | 13 | var allTools = map[string]Tool{ 14 | "tcpdump": tcpdump.New(), 15 | "vmrebootinspect": vmrebootdetector.New(), 16 | "upgradesinspect": upgradeinspector.New(), 17 | "aadssh": aadssh.New(), 18 | "netexec": netexec.New(), 19 | } 20 | 21 | func ListAllToolNames() []string { 22 | names := make([]string, 0, len(allTools)) 23 | for n := range allTools { 24 | names = append(names, n) 25 | } 26 | sort.Strings(names) 27 | return names 28 | } 29 | -------------------------------------------------------------------------------- /pkg/checkers/dummy/dummy.go: -------------------------------------------------------------------------------- 1 | package dummy 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/Azure/kdebug/pkg/base" 7 | ) 8 | 9 | type DummyChecker struct { 10 | } 11 | 12 | var okResult = base.CheckResult{ 13 | Checker: "Dummy", 14 | } 15 | 16 | var failResult = base.CheckResult{ 17 | Checker: "Dummy", 18 | Error: "Dummy failure", 19 | Description: "This is a dummy failure", 20 | Recommendations: []string{ 21 | "Remove environment variable `KDEBUG_DUMMY_FAIL`.", 22 | }, 23 | } 24 | 25 | func (c *DummyChecker) Name() string { 26 | return "Dummy" 27 | } 28 | 29 | func (c *DummyChecker) Check(_ *base.CheckContext) ([]*base.CheckResult, error) { 30 | if os.Getenv("KDEBUG_DUMMY_FAIL") == "1" { 31 | return []*base.CheckResult{&failResult}, nil 32 | } else { 33 | return []*base.CheckResult{&okResult}, nil 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /pkg/tools/tool.go: 
-------------------------------------------------------------------------------- 1 | package tools 2 | 3 | import ( 4 | "errors" 5 | 6 | "github.com/Azure/kdebug/pkg/base" 7 | ) 8 | 9 | type Tool interface { 10 | Name() string 11 | ParseArgs(*base.ToolContext, []string) error 12 | Run(*base.ToolContext) error 13 | } 14 | 15 | func getTool(name string) (Tool, error) { 16 | if tool, ok := allTools[name]; ok { 17 | return tool, nil 18 | } else { 19 | return nil, errors.New("Unknown tool: " + name) 20 | } 21 | } 22 | 23 | func ParseArgs(ctx *base.ToolContext, name string, args []string) error { 24 | tool, err := getTool(name) 25 | if err != nil { 26 | return err 27 | } 28 | return tool.ParseArgs(ctx, args) 29 | } 30 | 31 | func Run(ctx *base.ToolContext, name string) error { 32 | tool, err := getTool(name) 33 | if err != nil { 34 | return err 35 | } 36 | return tool.Run(ctx) 37 | } 38 | -------------------------------------------------------------------------------- /pkg/tools/vmrebootdetector/vmrebootdetector_test.go: -------------------------------------------------------------------------------- 1 | package vmrebootdetector 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | 7 | "github.com/Azure/kdebug/pkg/base" 8 | ) 9 | 10 | func TestReboot(t *testing.T) { 11 | tool := Tool{} 12 | err := tool.Run(&base.ToolContext{ 13 | Config: &Config{}, 14 | }) 15 | if err != nil { 16 | t.Error(err) 17 | } 18 | } 19 | 20 | func TestRebootParser(t *testing.T) { 21 | lastContent := "reboot system boot 5.4.0-1074-azure 2022-05-27T04:51:43+0000 still running\nreboot system boot 5.4.0-1074-azure 2022-04-04T07:49:09+0000 - 2022-04-20T17:12:20+0000 (16+09:23)\n\nwtmp begins 2022-04-04T07:47:27+0000\n" 22 | tool := Tool{} 23 | result := tool.parseResult(lastContent) 24 | if !strings.Contains(result, "Detect") { 25 | t.Error("VMRebootCheck failed to parse reboot result") 26 | } 27 | } 28 | -------------------------------------------------------------------------------- 
/pkg/tools/aadssh/ssh_agent.go: -------------------------------------------------------------------------------- 1 | package aadssh 2 | 3 | import ( 4 | "crypto/rsa" 5 | "time" 6 | 7 | "golang.org/x/crypto/ssh" 8 | "golang.org/x/crypto/ssh/agent" 9 | ) 10 | 11 | // addSSHKeyToAgent adds SSH key to SSH agent 12 | // sockPath can be a unix socket on Unix or a named pipe on Windows 13 | func addSSHKeyToAgent( 14 | sockPath string, 15 | sshPrivKey *rsa.PrivateKey, 16 | sshCert *ssh.Certificate) error { 17 | 18 | conn, err := dialSSHAgent(sockPath) 19 | if err != nil { 20 | return err 21 | } 22 | defer conn.Close() 23 | 24 | lifeTimeSecs := uint32(uint64(time.Now().Unix()) - sshCert.ValidBefore) 25 | 26 | client := agent.NewClient(conn) 27 | return client.Add(agent.AddedKey{ 28 | Comment: "AAD SSH Key", 29 | PrivateKey: sshPrivKey, 30 | Certificate: sshCert, 31 | LifetimeSecs: lifeTimeSecs, 32 | }) 33 | } 34 | -------------------------------------------------------------------------------- /pkg/checkers/checker.go: -------------------------------------------------------------------------------- 1 | package checker 2 | 3 | import ( 4 | "errors" 5 | 6 | log "github.com/sirupsen/logrus" 7 | 8 | "github.com/Azure/kdebug/pkg/base" 9 | ) 10 | 11 | type Checker interface { 12 | Name() string 13 | Check(*base.CheckContext) ([]*base.CheckResult, error) 14 | } 15 | 16 | func Check(ctx *base.CheckContext, checkerNames []string) ([]*base.CheckResult, error) { 17 | checkers := make([]Checker, 0, len(checkerNames)) 18 | 19 | for _, name := range checkerNames { 20 | if checker, ok := allCheckers[name]; ok { 21 | checkers = append(checkers, checker) 22 | } else { 23 | return nil, errors.New("Unknown checker: " + name) 24 | } 25 | } 26 | var results []*base.CheckResult 27 | for _, checker := range checkers { 28 | r, err := checker.Check(ctx) 29 | if err != nil { 30 | log.Warnf("Checker(%s): %s", checker.Name(), err) 31 | } 32 | results = append(results, r...) 
// FileBatchDiscoverer discovers machines by reading a list with one machine
// per line.
type FileBatchDiscoverer struct {
	// Path is the machines list file; the special value "-" reads standard
	// input instead.
	Path string
}

// Discover reads the machines list from d.Path (or from standard input when
// Path is "-") and returns one entry per non-blank line, with surrounding
// whitespace trimmed.
//
// Blank and whitespace-only lines are skipped so that stray or trailing empty
// lines do not turn into empty machine names. An error is returned when the
// file cannot be opened or read; the underlying error is wrapped and can be
// inspected with errors.Is / errors.As.
func (d *FileBatchDiscoverer) Discover() ([]string, error) {
	file := os.Stdin
	if d.Path != "-" {
		f, err := os.Open(d.Path)
		if err != nil {
			return nil, fmt.Errorf("Fail to open machines list file %s due to: %w",
				d.Path, err)
		}
		// Only files opened here are closed; stdin is left untouched.
		defer f.Close()
		file = f
	}

	var machines []string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}
		machines = append(machines, line)
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("Fail to read machines list file %s due to: %w",
			d.Path, err)
	}

	return machines, nil
}
// CheckResult is the outcome reported by a checker.
type CheckResult struct {
	// Checker names the checker that produced the result.
	Checker string
	// Error is empty for a passing result; see Ok.
	Error string
	// Description, Recommendations, Logs and HelpLinks carry additional
	// human-readable detail about the result.
	Description     string
	Recommendations []string
	Logs            []string
	HelpLinks       []string
}

// Ok reports whether the result carries no error, i.e. Error is empty.
func (r *CheckResult) Ok() bool {
	return len(r.Error) == 0
}
result but got %+v", result) 19 | } 20 | } 21 | 22 | func TestCheckObjectSize_Warn(t *testing.T) { 23 | cm := v1.ConfigMap{ 24 | BinaryData: map[string][]byte{ 25 | "key": make([]byte, WarnSizeThreshold+1), 26 | }, 27 | } 28 | checker := New() 29 | result := checker.checkObjectSize("ConfigMap", "default", "cm", cm) 30 | if result.Ok() { 31 | t.Errorf("Expect non ok result but got %+v", result) 32 | } 33 | if result.Error == "" || result.Description == "" || len(result.Recommendations) == 0 { 34 | t.Errorf("Expect non empty result but got %+v", result) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /pkg/env/k8s.go: -------------------------------------------------------------------------------- 1 | package env 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "strings" 7 | 8 | "github.com/shirou/gopsutil/v3/process" 9 | log "github.com/sirupsen/logrus" 10 | ) 11 | 12 | const KubernetesServiceHost = "KUBERNETES_SERVICE_HOST" 13 | 14 | func getK8sFlags() []string { 15 | var flags []string 16 | if inK8s() { 17 | flags = append(flags, "k8s") 18 | } 19 | return flags 20 | } 21 | 22 | func inK8s() bool { 23 | //check if in a pod 24 | for _, e := range os.Environ() { 25 | if strings.Contains(e, KubernetesServiceHost) { 26 | return true 27 | } 28 | } 29 | // check in a host vm 30 | processes, err := process.Processes() 31 | if err != nil { 32 | log.Warn(fmt.Sprintf("List process error %v\n", err)) 33 | return false 34 | } else { 35 | for _, proc := range processes { 36 | name, err := proc.Name() 37 | if err != nil { 38 | log.Warn(fmt.Sprintf("List process error %v. 
Skip in-cluster tcp checking\n", err)) 39 | return false 40 | } 41 | if name == "kubelet" { 42 | return true 43 | } 44 | } 45 | } 46 | return false 47 | } 48 | -------------------------------------------------------------------------------- /pkg/batch/kube_discoverer_test.go: -------------------------------------------------------------------------------- 1 | package batch 2 | 3 | import ( 4 | "testing" 5 | 6 | corev1 "k8s.io/api/core/v1" 7 | ) 8 | 9 | func TestMatchNode(t *testing.T) { 10 | d := &KubeBatchDiscoverer{} 11 | node := &corev1.Node{} 12 | if !d.matchNode(node) { 13 | t.Errorf("Expect matchNode == true when not specifying unready but got false") 14 | } 15 | 16 | d = &KubeBatchDiscoverer{unready: true} 17 | node = &corev1.Node{ 18 | Status: corev1.NodeStatus{ 19 | Conditions: []corev1.NodeCondition{ 20 | { 21 | Type: corev1.NodeReady, 22 | Status: corev1.ConditionFalse, 23 | }, 24 | }, 25 | }, 26 | } 27 | if !d.matchNode(node) { 28 | t.Errorf("Expect matchNode == true when specifying unready and node is unready but got false") 29 | } 30 | 31 | node = &corev1.Node{ 32 | Status: corev1.NodeStatus{ 33 | Conditions: []corev1.NodeCondition{ 34 | { 35 | Type: corev1.NodeReady, 36 | Status: corev1.ConditionTrue, 37 | }, 38 | }, 39 | }, 40 | } 41 | if d.matchNode(node) { 42 | t.Errorf("Expect matchNode == false when specifying unready and node is ready but got true") 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /pkg/env/azure.go: -------------------------------------------------------------------------------- 1 | package env 2 | 3 | import ( 4 | "net/http" 5 | "os" 6 | "time" 7 | ) 8 | 9 | const ( 10 | AzureIMDSEndpoint = "http://169.254.169.254/metadata" 11 | ) 12 | 13 | func getAzureFlags() []string { 14 | // IMDS should exist on Azure VMs 15 | client := &http.Client{ 16 | Timeout: time.Second, 17 | } 18 | req, _ := http.NewRequest("GET", AzureIMDSEndpoint+"/instance?api-version=2017-03-01", nil) 19 | 
// getAksFlags returns []string{"aks"} when /etc/kubernetes exists and is a
// directory, and an empty (non-nil) slice otherwise.
func getAksFlags() []string {
	// Check kubernetes directory to see if it's a AKS node
	info, err := os.Stat("/etc/kubernetes")
	if err == nil && info.IsDir() {
		return []string{"aks"}
	}
	// Missing path, stat failure, or not a directory.
	return []string{}
}
percentage is 10 but got %f", usage) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.234.0/containers/go/.devcontainer/base.Dockerfile 2 | 3 | # [Choice] Go version (use -bullseye variants on local arm64/Apple Silicon): 1, 1.16, 1.17, 1-bullseye, 1.16-bullseye, 1.17-bullseye, 1-buster, 1.16-buster, 1.17-buster 4 | ARG VARIANT="1.18-bullseye" 5 | FROM mcr.microsoft.com/vscode/devcontainers/go:0-${VARIANT} 6 | 7 | # [Choice] Node.js version: none, lts/*, 16, 14, 12, 10 8 | ARG NODE_VERSION="none" 9 | RUN if [ "${NODE_VERSION}" != "none" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi 10 | 11 | # [Optional] Uncomment this section to install additional OS packages. 12 | # RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ 13 | # && apt-get -y install --no-install-recommends 14 | 15 | # [Optional] Uncomment the next lines to use go get to install anything else you need 16 | # USER vscode 17 | # RUN go get -x 18 | 19 | # [Optional] Uncomment this line to install global node packages. 
20 | # RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g " 2>&1 21 | -------------------------------------------------------------------------------- /pkg/tools/upgradeinspector/upgradeinspector_test.go: -------------------------------------------------------------------------------- 1 | package upgradeinspector 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | "time" 7 | 8 | "github.com/Azure/kdebug/pkg/base" 9 | ) 10 | 11 | func TestUpgradeParser_Success(t *testing.T) { 12 | upgradeInspectTool := New() 13 | 14 | ctx := &base.ToolContext{ 15 | Config: &Config{CheckDays: 7, RecordLimit: 10}, 16 | } 17 | 18 | upgradeInspectTool.parseArgument(ctx) 19 | 20 | logTime := time.Now().AddDate(0, 0, -1) 21 | dateStr := logTime.Format("2006-01-02") 22 | 23 | logs := fmt.Sprintf("%s 17:12:13 upgrade libubsan1:amd64 12-20220319-1ubuntu1 12.1.0-2ubuntu1~22.04\n", dateStr) + 24 | fmt.Sprintf("%s 17:12:13 upgrade gcc-12-base:amd64 12-20220319-1ubuntu1 12.1.0-2ubuntu1~22.04\n", dateStr) 25 | 26 | expected := fmt.Sprintf("\n%-19s\t%-40s\t%-30s\t%-30s\n\n", "Timestamp", "Package", "OldVer", "NewVer") + 27 | fmt.Sprintf("%v-%v\t%-40s\t%-30s\t%-30s\n", dateStr, "17:12:13", "libubsan1:amd64", "12-20220319-1ubuntu1", "12.1.0-2ubuntu1~22.04") + 28 | fmt.Sprintf("%v-%v\t%-40s\t%-30s\t%-30s\n", dateStr, "17:12:13", "gcc-12-base:amd64", "12-20220319-1ubuntu1", "12.1.0-2ubuntu1~22.04") 29 | 30 | output := upgradeInspectTool.parseResult(logs) 31 | 32 | if output != expected { 33 | t.Errorf("UpgradeInspectTool parser output is expected to be\n%s\n, but got\n%s\n", expected, output) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /pkg/checkers/podschedule/podschedule_test.go: -------------------------------------------------------------------------------- 1 | package podschedule 2 | 3 | import ( 4 | "testing" 5 | 6 | v1 "k8s.io/api/core/v1" 7 | ) 8 | 9 | func TestPodSchedule_Single_Panic(t *testing.T) { 10 | podList := 
[]v1.Pod{ 11 | { 12 | Spec: v1.PodSpec{ 13 | NodeName: "a", 14 | }, 15 | }, 16 | } 17 | 18 | defer func() { 19 | if recover() == nil { 20 | t.Errorf("Expect panic") 21 | } 22 | }() 23 | 24 | checker := New() 25 | checker.checkPodsScheduleInReplicaSet("rc1", podList) 26 | } 27 | 28 | func TestPodSchedule_DifferentName_OK(t *testing.T) { 29 | podList := []v1.Pod{ 30 | { 31 | Spec: v1.PodSpec{ 32 | NodeName: "a", 33 | }, 34 | }, 35 | { 36 | Spec: v1.PodSpec{ 37 | NodeName: "b", 38 | }, 39 | }, 40 | } 41 | 42 | checker := New() 43 | result := checker.checkPodsScheduleInReplicaSet("rc1", podList) 44 | if !result.Ok() { 45 | t.Errorf("Expect ok result but got %+v", result) 46 | } 47 | } 48 | 49 | func TestPodSchedule_Failed(t *testing.T) { 50 | podList := []v1.Pod{ 51 | { 52 | Spec: v1.PodSpec{ 53 | NodeName: "a", 54 | }, 55 | }, 56 | { 57 | Spec: v1.PodSpec{ 58 | NodeName: "a", 59 | }, 60 | }, 61 | } 62 | 63 | checker := New() 64 | result := checker.checkPodsScheduleInReplicaSet("rc1", podList) 65 | if result.Ok() { 66 | t.Errorf("Expect failed result but got %+v", result) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 
8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.234.0/containers/go 3 | { 4 | "name": "Go", 5 | "build": { 6 | "dockerfile": "Dockerfile", 7 | "args": { 8 | // Update the VARIANT arg to pick a version of Go: 1, 1.18, 1.17 9 | // Append -bullseye or -buster to pin to an OS version. 10 | // Use -bullseye variants on local arm64/Apple Silicon. 11 | "VARIANT": "1-bullseye", 12 | // Options 13 | "NODE_VERSION": "lts/*" 14 | } 15 | }, 16 | "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined" ], 17 | 18 | // Set *default* container specific settings.json values on container create. 19 | "settings": { 20 | "go.toolsManagement.checkForUpdates": "local", 21 | "go.useLanguageServer": true, 22 | "go.gopath": "/go" 23 | }, 24 | 25 | // Add the IDs of extensions you want installed when the container is created. 
26 | "extensions": [ 27 | "golang.Go" 28 | ], 29 | 30 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 31 | // "forwardPorts": [], 32 | 33 | // Use 'postCreateCommand' to run commands after the container is created. 34 | // "postCreateCommand": "go version", 35 | 36 | // Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 37 | "remoteUser": "vscode" 38 | } 39 | -------------------------------------------------------------------------------- /pkg/formatters/oneline.go: -------------------------------------------------------------------------------- 1 | package formatters 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "strings" 7 | 8 | "github.com/Azure/kdebug/pkg/base" 9 | "github.com/Azure/kdebug/pkg/batch" 10 | "github.com/fatih/color" 11 | log "github.com/sirupsen/logrus" 12 | ) 13 | 14 | type OneLineFormatter struct{} 15 | 16 | func (f *OneLineFormatter) WriteResults(w io.Writer, results []*base.CheckResult) error { 17 | failedCheckers := make(map[string]struct{}) 18 | failures := []*base.CheckResult{} 19 | for _, r := range results { 20 | if r.Ok() { 21 | if log.IsLevelEnabled(log.DebugLevel) { 22 | fmt.Fprintf(w, "[%s] %s\n", r.Checker, r.Description) 23 | } 24 | } else { 25 | failures = append(failures, r) 26 | failedCheckers[r.Checker] = struct{}{} 27 | } 28 | } 29 | 30 | if len(failures) == 0 { 31 | fmt.Fprintf(w, "All %v checks passed!\n", 32 | color.GreenString("%d", len(results))) 33 | return nil 34 | } 35 | 36 | failedCheckersList := []string{} 37 | for c := range failedCheckers { 38 | failedCheckersList = append(failedCheckersList, c) 39 | } 40 | 41 | fmt.Fprintf(w, "%v checks passed, %v failed: %s", 42 | color.GreenString("%d", len(results)-len(failures)), 43 | color.RedString("%d", len(failures)), 44 | strings.Join(failedCheckersList, ", ")) 45 | 46 | return nil 47 | } 48 | 49 | func (f *OneLineFormatter) WriteBatchResults(w io.Writer, results []*batch.BatchResult) error 
{ 50 | return fmt.Errorf("not implemented: one line formatter for batch results") 51 | } 52 | -------------------------------------------------------------------------------- /pkg/tools/tcpdump/tcpdump_test.go: -------------------------------------------------------------------------------- 1 | package tcpdump 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestParseIPAndPort_Success(t *testing.T) { 8 | ip, port := ParseIPAndPort("192.168.1.1:7") 9 | if ip != "192.168.1.1" || port != "7" { 10 | t.Errorf("Parsing 192.168.1.1:7 and expect ip is 192.168.1.1 and port is 7 but got %s and %s", ip, port) 11 | } 12 | 13 | ip, port = ParseIPAndPort("192.168.1.1") 14 | if ip != "192.168.1.1" || len(port) != 0 { 15 | t.Errorf("Parsing 192.168.1.1 and expect ip is 192.168.1.1 and no port but got %s and %s", ip, port) 16 | } 17 | 18 | ip, port = ParseIPAndPort("192.168.1.1:") 19 | if ip != "192.168.1.1" || len(port) != 0 { 20 | t.Errorf("Parsing 192.168.1.1: and expect ip is 192.168.1.1 and no port but got %s and %s", ip, port) 21 | } 22 | 23 | ip, port = ParseIPAndPort(":80") 24 | if len(ip) != 0 || port != "80" { 25 | t.Errorf("Parsing :80 and expect no ip and port is 80 but got %s and %s", ip, port) 26 | } 27 | } 28 | 29 | func TestGenerateTcpdumpParamerters_Success(t *testing.T) { 30 | tcpdumptool := New() 31 | 32 | config := &Config{"192.168.1.1:1", "23.32.10.2:80", ":443", "19920", true} 33 | tcpdumptool.ParseParameters(config) 34 | parameter := tcpdumptool.GenerateTcpdumpParamerters() 35 | 36 | expected := "-nvvv src 192.168.1.1 and src port 1 and dst 23.32.10.2 and dst port 80 and port 443 and tcp" 37 | if parameter != expected { 38 | t.Errorf("Generate parameter is expected to be %s but actually %s", expected, parameter) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /pkg/tools/aadssh/transport.go: -------------------------------------------------------------------------------- 1 | package aadssh 2 | 3 | import ( 4 
| "fmt" 5 | "io" 6 | "io/ioutil" 7 | "net/http" 8 | "net/url" 9 | "strings" 10 | 11 | log "github.com/sirupsen/logrus" 12 | ) 13 | 14 | const ( 15 | TokenURLSuffix = "/oauth2/v2.0/token" 16 | ) 17 | 18 | // An HTTP transport for adding additional parameters in AAD token request 19 | type Transport struct { 20 | // Additional parameter key-value pairs 21 | data map[string]string 22 | } 23 | 24 | // RoundTrip modifies AAD token request 25 | func (t *Transport) RoundTrip(req *http.Request) (*http.Response, error) { 26 | log.WithFields(log.Fields{"url": *req.URL}).Debug("MSAL request") 27 | 28 | if strings.HasSuffix(req.URL.Path, TokenURLSuffix) { 29 | bodyBuf, err := ioutil.ReadAll(req.Body) 30 | if err != nil { 31 | return nil, err 32 | } 33 | defer req.Body.Close() 34 | 35 | log.WithFields(log.Fields{"body": string(bodyBuf)}).Debug("Original request body") 36 | 37 | values, err := url.ParseQuery(string(bodyBuf)) 38 | if err != nil { 39 | return nil, err 40 | } 41 | 42 | for k, v := range t.data { 43 | values.Add(k, v) 44 | } 45 | 46 | bodyString := values.Encode() 47 | log.WithFields(log.Fields{"body": bodyString}).Debug("Modified request body") 48 | 49 | bodyStream := strings.NewReader(bodyString) 50 | req.ContentLength = bodyStream.Size() 51 | req.Header.Set("Content-Length", fmt.Sprintf("%d", bodyStream.Size())) 52 | req.Body = io.NopCloser(bodyStream) 53 | } 54 | 55 | return http.DefaultTransport.RoundTrip(req) 56 | } 57 | -------------------------------------------------------------------------------- /.github/workflows/container.yml: -------------------------------------------------------------------------------- 1 | name: Container 2 | 3 | on: 4 | push: 5 | branches: ['main'] 6 | release: 7 | types: [ created ] 8 | 9 | env: 10 | REGISTRY: ghcr.io 11 | IMAGE_NAME: ${{ github.repository }} 12 | 13 | jobs: 14 | build-and-push-image: 15 | runs-on: ubuntu-latest 16 | permissions: 17 | contents: read 18 | packages: write 19 | 20 | steps: 21 | - name: Checkout
repository 22 | uses: actions/checkout@v3 23 | 24 | - name: Set up Go 25 | uses: actions/setup-go@v2 26 | with: 27 | go-version: "1.20" 28 | 29 | - name: Build 30 | run: make build 31 | 32 | - name: Log in container registry 33 | uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 34 | with: 35 | registry: ${{ env.REGISTRY }} 36 | username: ${{ github.actor }} 37 | password: ${{ secrets.GITHUB_TOKEN }} 38 | 39 | - name: Generate metadata 40 | id: meta 41 | uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 42 | with: 43 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 44 | tags: | 45 | type=ref,event=branch 46 | type=ref,event=tag 47 | type=sha,prefix=,format=long 48 | 49 | - name: Build and push container image 50 | uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc 51 | with: 52 | context: . 53 | push: true 54 | tags: ${{ steps.meta.outputs.tags }} 55 | labels: ${{ steps.meta.outputs.labels }} 56 | -------------------------------------------------------------------------------- /pkg/checkers/icmp/icmp_test.go: -------------------------------------------------------------------------------- 1 | package icmpping 2 | 3 | import ( 4 | "os" 5 | "strings" 6 | "testing" 7 | 8 | "github.com/Azure/kdebug/pkg/base" 9 | "github.com/Azure/kdebug/pkg/env" 10 | ) 11 | 12 | func TestICMPCheckRoot(t *testing.T) { 13 | if os.Geteuid() != 0 { 14 | t.Skip("Must run with root") 15 | return 16 | } 17 | targets := []pingTarget{{Address: "x.x.x.x"}, 18 | {Address: "127.0.0.1"}, 19 | } 20 | checker := ICMPChecker{targets: targets} 21 | context := &base.CheckContext{ 22 | Environment: &env.StaticEnvironment{ 23 | Flags: []string{"root"}, 24 | }, 25 | KubeClient: nil, 26 | } 27 | results, _ := checker.Check(context) 28 | for _, result := range results { 29 | if strings.Contains(result.Description, "x.x.x.x") { 30 | if result.Error == "" { 31 | t.Errorf("ping x.x.x.x should fail") 32 | } 33 | } 34 | if 
strings.Contains(result.Description, "127.0.0.1") { 35 | if result.Error != "" { 36 | t.Errorf("ping 127.0.0.1 failed %v\n", result.Error) 37 | } 38 | } 39 | } 40 | } 41 | 42 | func TestICMPCheckNonRoot(t *testing.T) { 43 | if os.Geteuid() == 0 { 44 | t.Skip("Must run with non-root") 45 | return 46 | } 47 | 48 | targets := []pingTarget{{Address: "x.x.x.x"}, 49 | {Address: "127.0.0.1"}, 50 | } 51 | checker := ICMPChecker{targets: targets} 52 | context := &base.CheckContext{ 53 | Environment: &env.StaticEnvironment{ 54 | Flags: []string{}, 55 | }, 56 | KubeClient: nil, 57 | } 58 | results, _ := checker.Check(context) 59 | if len(results) != 0 { 60 | t.Errorf("icmp checker unexpected results when not in root mode") 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /pkg/batch/kube_discoverer.go: -------------------------------------------------------------------------------- 1 | package batch 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "time" 7 | 8 | corev1 "k8s.io/api/core/v1" 9 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 10 | "k8s.io/client-go/kubernetes" 11 | ) 12 | 13 | type KubeBatchDiscoverer struct { 14 | client *kubernetes.Clientset 15 | labelSelector string 16 | unready bool 17 | } 18 | 19 | func NewKubeBatchDiscoverer(client *kubernetes.Clientset, labelSelector string, unready bool) *KubeBatchDiscoverer { 20 | return &KubeBatchDiscoverer{ 21 | client: client, 22 | labelSelector: labelSelector, 23 | unready: unready, 24 | } 25 | } 26 | 27 | func (d *KubeBatchDiscoverer) Discover() ([]string, error) { 28 | if d.client == nil { 29 | return nil, fmt.Errorf("Kubernetes client is not initialized") 30 | } 31 | 32 | ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 33 | defer cancel() 34 | 35 | resp, err := d.client.CoreV1().Nodes().List(ctx, metav1.ListOptions{ 36 | LabelSelector: d.labelSelector, 37 | }) 38 | if err != nil { 39 | return nil, fmt.Errorf("Fail to list nodes from API server: 
%+v", err) 40 | } 41 | 42 | var names []string 43 | for _, node := range resp.Items { 44 | if d.matchNode(&node) { 45 | names = append(names, node.ObjectMeta.Name) 46 | } 47 | } 48 | 49 | return names, nil 50 | } 51 | 52 | func (d *KubeBatchDiscoverer) matchNode(node *corev1.Node) bool { 53 | if d.unready { 54 | // Unready only 55 | for _, cond := range node.Status.Conditions { 56 | if cond.Type == corev1.NodeReady { 57 | return cond.Status != corev1.ConditionTrue 58 | } 59 | } 60 | } 61 | 62 | return true 63 | } 64 | -------------------------------------------------------------------------------- /pkg/checkers/registry.go: -------------------------------------------------------------------------------- 1 | package checker 2 | 3 | import ( 4 | "sort" 5 | 6 | "github.com/Azure/kdebug/pkg/checkers/diskreadonly" 7 | "github.com/Azure/kdebug/pkg/checkers/diskusage" 8 | "github.com/Azure/kdebug/pkg/checkers/dns" 9 | "github.com/Azure/kdebug/pkg/checkers/dummy" 10 | "github.com/Azure/kdebug/pkg/checkers/http" 11 | icmpping "github.com/Azure/kdebug/pkg/checkers/icmp" 12 | "github.com/Azure/kdebug/pkg/checkers/kmscachesize" 13 | kubeobjectsize "github.com/Azure/kdebug/pkg/checkers/kube/objectsize" 14 | "github.com/Azure/kdebug/pkg/checkers/kube/pod" 15 | "github.com/Azure/kdebug/pkg/checkers/liveness" 16 | "github.com/Azure/kdebug/pkg/checkers/oom" 17 | "github.com/Azure/kdebug/pkg/checkers/podschedule" 18 | "github.com/Azure/kdebug/pkg/checkers/systemload" 19 | "github.com/Azure/kdebug/pkg/checkers/tcpping" 20 | ) 21 | 22 | var allCheckers = map[string]Checker{ 23 | "dummy": &dummy.DummyChecker{}, 24 | "dns": dns.New(), 25 | "oom": oom.New(), 26 | "kubeobjectsize": kubeobjectsize.New(), 27 | "diskusage": diskusage.New(), 28 | "diskreadonly": diskreadonly.New(), 29 | "kubepod": pod.New(), 30 | "liveness": liveness.New(), 31 | "http": http.New(), 32 | "tcp": tcpping.New(), 33 | "ping": icmpping.New(), 34 | "systemload": systemload.New(), 35 | "kmscachesize": 
kmscachesize.New(), 36 | "podschedule": podschedule.New(), 37 | } 38 | 39 | func ListAllCheckerNames() []string { 40 | names := make([]string, 0, len(allCheckers)) 41 | for n := range allCheckers { 42 | names = append(names, n) 43 | } 44 | sort.Strings(names) 45 | return names 46 | } 47 | -------------------------------------------------------------------------------- /pkg/checkers/liveness/liveness.go: -------------------------------------------------------------------------------- 1 | package liveness 2 | 3 | import ( 4 | "os/exec" 5 | "regexp" 6 | "strings" 7 | 8 | "github.com/Azure/kdebug/pkg/base" 9 | log "github.com/sirupsen/logrus" 10 | ) 11 | 12 | const ( 13 | CheckerName = "Liveness (kubelet)" 14 | FailedToCheckLiveness = "Failed to check liveness." 15 | ) 16 | 17 | type LivenessChecker struct { 18 | } 19 | 20 | func New() *LivenessChecker { 21 | return &LivenessChecker{} 22 | } 23 | 24 | func (c *LivenessChecker) Name() string { 25 | return CheckerName 26 | } 27 | 28 | func (c *LivenessChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) { 29 | results := []*base.CheckResult{} 30 | 31 | out, err := exec.Command("systemctl", "status", "kubelet").Output() 32 | 33 | if err != nil { 34 | log.Debugf("systemctl status returned non-zero exit code: %+v", err) 35 | } 36 | 37 | results = append(results, parseOutput(out)) 38 | return results, nil 39 | } 40 | 41 | func parseOutput(output []byte) *base.CheckResult { 42 | rows := strings.Split(string(output), "\n") 43 | re := regexp.MustCompile(`active \(running\) since`) 44 | isActive := false 45 | var details string 46 | 47 | for _, row := range rows { 48 | if len(row) == 0 { 49 | continue 50 | } 51 | 52 | if re.MatchString(row) { 53 | isActive = true 54 | details = row 55 | break 56 | } 57 | } 58 | 59 | if isActive { 60 | return &base.CheckResult{ 61 | Checker: CheckerName, 62 | Description: details, 63 | Logs: rows, 64 | } 65 | } 66 | 67 | return &base.CheckResult{ 68 | Checker: CheckerName, 69 | 
Error: "Kubelet is NOT running well in this node. Please check the logs for more details.", 70 | Logs: rows, 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /pkg/checkers/oom/oom_test.go: -------------------------------------------------------------------------------- 1 | package oom 2 | 3 | import ( 4 | "io/ioutil" 5 | "os" 6 | "testing" 7 | 8 | "github.com/Azure/kdebug/pkg/base" 9 | "github.com/Azure/kdebug/pkg/env" 10 | ) 11 | 12 | var testStrings = []string{ 13 | "Feb 22 16:15:02 k8s-ingress-11186066-z1-vmss0000B3 kernel: [989751.247878] Memory cgroup out of memory: Killed process 3841 (nginx) total-vm:240652kB, anon-rss:130344kB, file-rss:5212kB, shmem-rss:208kB, UID:101 pgtables:332kB oom_score_adj:986\n", 14 | "Feb 22 16:15:02 k8s-ingress-11186066-z1-vmss0000B3 kernel: [989751.247878] Out of memory: Killed process 3841 (nginx) total-vm:240652kB, anon-rss:130344kB, file-rss:5212kB, shmem-rss:208kB, UID:101 pgtables:332kB oom_score_adj:986\n", 15 | } 16 | 17 | func TestCheckOOMLogWhenOOM(t *testing.T) { 18 | environment := &env.StaticEnvironment{ 19 | Flags: []string{"linux"}, 20 | } 21 | for _, testString := range testStrings { 22 | 23 | tmp, err := ioutil.TempFile("", "kernlog") 24 | if err != nil { 25 | t.Fatalf("error creating tmp file:%v", err) 26 | } 27 | check := OOMChecker{kernLogPath: tmp.Name()} 28 | defer func() { 29 | os.Remove(check.kernLogPath) 30 | }() 31 | //should be 600. 
But it fails in 600 32 | err = os.WriteFile(check.kernLogPath, []byte(testString), 777) 33 | if err != nil { 34 | t.Errorf("Create tmp file error:%v", err) 35 | } 36 | result, err := check.Check(&base.CheckContext{ 37 | Environment: environment, 38 | }) 39 | if err != nil { 40 | t.Errorf("Expect no error but got: %s", err) 41 | } 42 | if len(result) != 1 { 43 | t.Errorf("Get unexpected OOM result length %v", len(result)) 44 | } 45 | checkErr := result[0].Error 46 | if checkErr != "progress:[3841 nginx] is OOM kill at time [Feb 22 16:15:02]. [rss:130344kB] [oom_score_adj:986]\n" { 47 | t.Errorf("Unexpected check result:\n %v \n %v", result[0].Description, checkErr) 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /pkg/formatters/text.go: -------------------------------------------------------------------------------- 1 | package formatters 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | 7 | "github.com/Azure/kdebug/pkg/base" 8 | "github.com/Azure/kdebug/pkg/batch" 9 | "github.com/fatih/color" 10 | log "github.com/sirupsen/logrus" 11 | ) 12 | 13 | type TextFormatter struct{} 14 | 15 | func (f *TextFormatter) WriteResults(w io.Writer, results []*base.CheckResult) error { 16 | failures := []*base.CheckResult{} 17 | for _, r := range results { 18 | if r.Ok() { 19 | if log.IsLevelEnabled(log.DebugLevel) { 20 | fmt.Fprintf(w, "[%s] %s\n", r.Checker, r.Description) 21 | } 22 | } else { 23 | failures = append(failures, r) 24 | } 25 | } 26 | 27 | fmt.Fprintf(w, "------------------------------\n") 28 | 29 | if len(failures) == 0 { 30 | fmt.Fprintf(w, "All %v checks passed!\n", 31 | color.GreenString("%d", len(results))) 32 | return nil 33 | } 34 | 35 | fmt.Fprintf(w, "%v checks passed. 
%v failed.\n", 36 | color.GreenString("%d", len(results)-len(failures)), 37 | color.RedString("%d", len(failures))) 38 | fmt.Fprintf(w, "------------------------------\n") 39 | fmt.Fprintf(w, "kdebug has detected these problems for you:\n") 40 | 41 | for _, r := range failures { 42 | fmt.Fprintf(w, "------------------------------\n") 43 | fmt.Fprintf(w, color.YellowString("Checker: %s\n", r.Checker)) 44 | fmt.Fprintf(w, "Error: %s\n", r.Error) 45 | fmt.Fprintf(w, "Description: %s\n", r.Description) 46 | if len(r.Recommendations) > 0 { 47 | fmt.Fprintf(w, "Recommendations:\n") 48 | for i, rec := range r.Recommendations { 49 | fmt.Fprintf(w, "[%d] %s\n", i+1, rec) 50 | } 51 | } 52 | // TODO: Make logs more pretty 53 | if len(r.Logs) > 0 { 54 | fmt.Fprintf(w, "Logs:\n") 55 | for _, l := range r.Logs { 56 | fmt.Fprintf(w, "%s\n", l) 57 | } 58 | } 59 | if len(r.HelpLinks) > 0 { 60 | fmt.Fprintf(w, "Help links:\n") 61 | for i, l := range r.HelpLinks { 62 | fmt.Fprintf(w, "[%d] %s\n", i+1, l) 63 | } 64 | } 65 | } 66 | 67 | return nil 68 | } 69 | 70 | func (f *TextFormatter) WriteBatchResults(w io.Writer, results []*batch.BatchResult) error { 71 | for _, result := range results { 72 | fmt.Fprintf(w, color.BlueString("=============== Machine: %s ===============\n", 73 | result.Machine)) 74 | if result.Error == nil { 75 | f.WriteResults(w, result.CheckResults) 76 | } else { 77 | fmt.Fprintf(w, "Remote execution error: %s\n", result.Error) 78 | } 79 | } 80 | return nil 81 | } 82 | -------------------------------------------------------------------------------- /pkg/tools/netexec/netexec_test.go: -------------------------------------------------------------------------------- 1 | package netexec 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/Azure/kdebug/pkg/base" 7 | "k8s.io/cli-runtime/pkg/genericclioptions" 8 | ) 9 | 10 | func TestParseParameterPid_Success(t *testing.T) { 11 | netexec := &NetexecTool{} 12 | netexec.parseAndCheckParameters(&base.ToolContext{ 13 | Config: 
&Config{ 14 | Pid: "1", 15 | Command: "bash", 16 | }, 17 | }) 18 | 19 | if netexec.pid != "1" { 20 | t.Errorf("pid should got %s but got %s", "1", netexec.pid) 21 | } 22 | 23 | if netexec.command != "bash" { 24 | t.Errorf("command should got %s but got %s", "bash", netexec.command) 25 | } 26 | } 27 | 28 | func TestParseParameterPod_Success(t *testing.T) { 29 | netexec := &NetexecTool{} 30 | netexec.parseAndCheckParameters(&base.ToolContext{ 31 | Config: &Config{ 32 | PodName: "pod", 33 | Command: "bash", 34 | Namespace: "kube-system", 35 | Image: "image", 36 | }, 37 | KubeConfigFlag: &genericclioptions.ConfigFlags{}, 38 | }) 39 | 40 | if netexec.podName != "pod" { 41 | t.Errorf("podname should got %s but got %s", "pod", netexec.podName) 42 | } 43 | 44 | if netexec.command != "bash" { 45 | t.Errorf("command should got %s but got %s", "bash", netexec.command) 46 | } 47 | 48 | if netexec.namespace != "kube-system" { 49 | t.Errorf("namespace should got %s but got %s", "kube-system", netexec.namespace) 50 | } 51 | 52 | if netexec.image != "image" { 53 | t.Errorf("image should got %s but got %s", "image", netexec.image) 54 | } 55 | } 56 | 57 | func TestParseParameter_Failed(t *testing.T) { 58 | netexec := &NetexecTool{} 59 | err := netexec.parseAndCheckParameters(&base.ToolContext{ 60 | Config: &Config{}, 61 | }) 62 | 63 | if err == nil { 64 | t.Error("Should got err: 'Either --pid and --pod should be set.', but error is not raised") 65 | } 66 | 67 | err = netexec.parseAndCheckParameters(&base.ToolContext{ 68 | Config: &Config{ 69 | Pid: "1", 70 | PodName: "pod", 71 | }, 72 | }) 73 | 74 | if err == nil { 75 | t.Error("Should got err: '--pid and --pod can not be assigned together. 
Please set either of them.', but error is not raised") 76 | } 77 | 78 | err = netexec.parseAndCheckParameters(&base.ToolContext{ 79 | Config: &Config{ 80 | PodName: "pod", 81 | Command: "bash", 82 | Namespace: "kube-system", 83 | Image: "image", 84 | }, 85 | }) 86 | 87 | if err == nil { 88 | t.Error("Should got err: 'kubernetes client is not availble. Check kubeconfig.', but error is not raised") 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /pkg/checkers/diskusage/diskusage_test.go: -------------------------------------------------------------------------------- 1 | package diskusage 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestDfParse_Success(t *testing.T) { 8 | dfOutput := `Filesystem Size Used Avail Use% Mounted on 9 | /dev/sdb 251G 11G 228G 5% /` 10 | 11 | result, _ := parseDfResult(dfOutput, DfHeaders["LINUX"]) 12 | if len(result) != 1 { 13 | t.Errorf("Expect the length of result is 1 but got %+v", len(result)) 14 | } 15 | 16 | if result[0].Filesystem != "/dev/sdb" { 17 | t.Errorf("Expect Filesystem is /dev/sdb but got %s", result[0].Filesystem) 18 | } 19 | 20 | if result[0].Size != "251G" { 21 | t.Errorf("Expect Size is 251G but got %s", result[0].Size) 22 | } 23 | 24 | if result[0].Used != "11G" { 25 | t.Errorf("Expect Used is 11G but got %s", result[0].Used) 26 | } 27 | 28 | if result[0].Avail != "228G" { 29 | t.Errorf("Expect Avail is 228G but got %s", result[0].Avail) 30 | } 31 | 32 | if result[0].Use != 5 { 33 | t.Errorf("Expect Use is 5 but got %v", result[0].Use) 34 | } 35 | 36 | if result[0].MountedOn != "/" { 37 | t.Errorf("Expect MountedOn is / but got %s", result[0].MountedOn) 38 | } 39 | } 40 | 41 | func TestDfParse_FreeBSD_Success(t *testing.T) { 42 | dfOutput := `Filesystem Size Used Avail Capacity Mounted on 43 | /dev/gpt/rootfs 29G 4.0G 23G 15% /` 44 | 45 | result, _ := parseDfResult(dfOutput, DfHeaders["FREEBSD"]) 46 | if len(result) != 1 { 47 | t.Errorf("Expect the length of 
result is 1 but got %+v", len(result)) 48 | } 49 | 50 | if result[0].Filesystem != "/dev/gpt/rootfs" { 51 | t.Errorf("Expect Filesystem is /dev/gpt/rootfs but got %s", result[0].Filesystem) 52 | } 53 | 54 | if result[0].Size != "29G" { 55 | t.Errorf("Expect Size is 29G but got %s", result[0].Size) 56 | } 57 | 58 | if result[0].Used != "4.0G" { 59 | t.Errorf("Expect Used is 4.0G but got %s", result[0].Used) 60 | } 61 | 62 | if result[0].Avail != "23G" { 63 | t.Errorf("Expect Avail is 23G but got %s", result[0].Avail) 64 | } 65 | 66 | if result[0].Use != 15 { 67 | t.Errorf("Expect Use is 15 but got %v", result[0].Use) 68 | } 69 | 70 | if result[0].MountedOn != "/" { 71 | t.Errorf("Expect MountedOn is / but got %s", result[0].MountedOn) 72 | } 73 | } 74 | 75 | func TestDfParse_Failed(t *testing.T) { 76 | dfOutput := `Filesystem Size Used Avail Use% MountedOn 77 | /dev/sdb 251G 11G 228G 5% /` 78 | 79 | _, err := parseDfResult(dfOutput, DfHeaders["LINUX"]) 80 | if err == nil { 81 | t.Errorf("Expect error in parseDfResult but not") 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /pkg/checkers/http/http.go: -------------------------------------------------------------------------------- 1 | package http 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "time" 7 | 8 | "github.com/Azure/kdebug/pkg/base" 9 | "github.com/Azure/kdebug/pkg/env" 10 | ) 11 | 12 | var ( 13 | GoogleTarget = HttpTarget{ 14 | Name: "Google HTTP endpoint", 15 | URL: "https://google.com", 16 | } 17 | AzureIMDSTarget = HttpTarget{ 18 | Name: "Azure IMDS HTTP endpoint", 19 | URL: "http://169.254.169.254/metadata/versions", 20 | Header: http.Header{ 21 | "Metadata": {"true"}, 22 | }, 23 | } 24 | ) 25 | 26 | type HttpTarget struct { 27 | Name string 28 | URL string 29 | Header http.Header 30 | } 31 | 32 | type HttpChecker struct { 33 | Client HttpClient 34 | } 35 | 36 | type HttpClient interface { 37 | Do(req *http.Request) (*http.Response, error) 38 | } 39 | 
40 | func New() *HttpChecker { 41 | return &HttpChecker{ 42 | Client: &http.Client{ 43 | // Disable proxy. Azure IMDS doesn't support being used behind a proxy. 44 | Transport: &http.Transport{Proxy: nil}, 45 | Timeout: 10 * time.Second, 46 | }, 47 | } 48 | } 49 | 50 | func (c *HttpChecker) Name() string { 51 | return "Http" 52 | } 53 | 54 | func (c *HttpChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) { 55 | results := []*base.CheckResult{} 56 | targets := getCheckTargets(ctx.Environment) 57 | var result *base.CheckResult 58 | for _, httpTarget := range targets { 59 | request, err := http.NewRequest("GET", httpTarget.URL, nil) 60 | if err != nil { 61 | return nil, fmt.Errorf("Fail to create request for target %s: %+v", 62 | httpTarget.Name, err) 63 | } 64 | request.Header = httpTarget.Header 65 | 66 | response, err := c.Client.Do(request) 67 | if err != nil { 68 | result = &base.CheckResult{ 69 | Checker: c.Name(), 70 | Error: fmt.Sprintf("Fail to invoke HTTP GET method on URL %s.", 71 | httpTarget.URL), 72 | Description: err.Error(), 73 | //todo: Recommendations and help links 74 | } 75 | } else { 76 | defer response.Body.Close() 77 | result = &base.CheckResult{ 78 | Checker: c.Name(), 79 | Description: fmt.Sprintf("Successfully invoke HTTP GET on URL %s , response status code is %s.", 80 | httpTarget.URL, response.Status), 81 | } 82 | } 83 | results = append(results, result) 84 | } 85 | 86 | return results, nil 87 | } 88 | 89 | func getCheckTargets(e env.Environment) []HttpTarget { 90 | targets := []HttpTarget{ 91 | GoogleTarget, 92 | } 93 | 94 | if e.HasFlag("azure") { 95 | targets = append(targets, AzureIMDSTarget) 96 | } 97 | 98 | return targets 99 | } 100 | -------------------------------------------------------------------------------- /cmd/batch.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "io" 5 | 6 | "github.com/schollz/progressbar/v3" 7 | log
"github.com/sirupsen/logrus" 8 | 9 | "github.com/Azure/kdebug/pkg/base" 10 | "github.com/Azure/kdebug/pkg/batch" 11 | "github.com/Azure/kdebug/pkg/formatters" 12 | ) 13 | 14 | func getBatchDiscoverer(opts *Options, chkCtx *base.CheckContext) batch.BatchDiscoverer { 15 | if opts.Batch.KubeMachines || opts.Batch.KubeMachinesUnready || len(opts.Batch.KubeMachinesLabelSelector) > 0 { 16 | return batch.NewKubeBatchDiscoverer(chkCtx.KubeClient, opts.Batch.KubeMachinesLabelSelector, opts.Batch.KubeMachinesUnready) 17 | } else if opts.Batch.MachinesFile != "" { 18 | return &batch.FileBatchDiscoverer{ 19 | Path: opts.Batch.MachinesFile, 20 | } 21 | } else { 22 | return &batch.StaticBatchDiscoverer{ 23 | Machines: opts.Batch.Machines, 24 | } 25 | } 26 | } 27 | 28 | func getBatchExecutor(opts *Options, chkCtx *base.CheckContext) batch.BatchExecutor { 29 | if opts.Batch.SshUser != "" { 30 | return batch.NewSshBatchExecutor(opts.Batch.SshUser) 31 | } else if chkCtx.KubeClient != nil { 32 | return batch.NewPodBatchExecutor( 33 | chkCtx.KubeClient, 34 | opts.Batch.PodExecutorImage, 35 | opts.Batch.PodExecutorNamespace, 36 | opts.Batch.PodExecutorMode, 37 | ) 38 | } else { 39 | log.Fatal("No batch executor configured") 40 | return nil 41 | } 42 | } 43 | 44 | type batchReporter struct { 45 | out io.Writer 46 | bar *progressbar.ProgressBar 47 | } 48 | 49 | func newBatchReporter(out io.Writer, max int64) *batchReporter { 50 | return &batchReporter{ 51 | out: out, 52 | bar: progressbar.Default(max), 53 | } 54 | } 55 | 56 | func (r *batchReporter) OnResult(result *batch.BatchResult) { 57 | r.bar.Add(1) 58 | } 59 | 60 | func runBatch(opts *Options, chkCtx *base.CheckContext, formatter formatters.Formatter) { 61 | discoverer := getBatchDiscoverer(opts, chkCtx) 62 | machines, err := discoverer.Discover() 63 | if err != nil { 64 | log.Fatalf("Fail to discover machines: %+v", err) 65 | } 66 | 67 | log.WithFields(log.Fields{"count": len(machines)}).Info("Discovered machines list") 68 | 69 | 
executor := getBatchExecutor(opts, chkCtx) 70 | concurrency := 1 71 | if opts.Batch.Concurrency > 0 { 72 | concurrency = opts.Batch.Concurrency 73 | } 74 | batchOpts := &batch.BatchOptions{ 75 | Machines: machines, 76 | Checkers: opts.Checkers, 77 | Concurrency: concurrency, 78 | Reporter: newBatchReporter(chkCtx.Output, int64(len(machines))), 79 | } 80 | batchResults, err := executor.Execute(batchOpts) 81 | if err != nil { 82 | log.Fatalf("Fail to run batch: %s", err) 83 | } 84 | 85 | err = formatter.WriteBatchResults(chkCtx.Output, batchResults) 86 | if err != nil { 87 | log.Fatal(err) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /pkg/checkers/icmp/icmp.go: -------------------------------------------------------------------------------- 1 | package icmpping 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "time" 7 | 8 | probing "github.com/prometheus-community/pro-bing" 9 | log "github.com/sirupsen/logrus" 10 | 11 | "github.com/Azure/kdebug/pkg/base" 12 | ) 13 | 14 | var PublicTargets = []pingTarget{ 15 | { 16 | Address: "8.8.8.8", 17 | Name: "GoogleDns", 18 | Recomendations: []string{"Google DNS is not reachable. Check firewall settings if this is not desired."}, 19 | }, 20 | { 21 | Address: "10.0.0.10", 22 | Name: "ClusterDns", 23 | Recomendations: []string{"Cluster CoreDNS is not reachable. Check CoreDNS pods and network settings."}, 24 | }, 25 | } 26 | 27 | type ICMPChecker struct { 28 | targets []pingTarget 29 | } 30 | 31 | type pingTarget struct { 32 | Address string 33 | Name string 34 | Recomendations []string 35 | } 36 | 37 | func New() *ICMPChecker { 38 | return &ICMPChecker{} 39 | } 40 | 41 | func (c *ICMPChecker) Name() string { 42 | return "icmp" 43 | } 44 | 45 | func (c *ICMPChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) { 46 | var results []*base.CheckResult 47 | // TODO: Invoke `ping` command if non-root 48 | if !ctx.Environment.HasFlag("root") { 49 | log.Debug("Not root. 
Skip ICMP checker") 50 | return results, nil 51 | } 52 | if !ctx.Environment.HasFlag("azure") { 53 | c.targets = append(c.targets, PublicTargets...) 54 | } 55 | if ctx.KubeClient != nil { 56 | 57 | } 58 | resultChan := make(chan *base.CheckResult, len(c.targets)) 59 | for _, target := range c.targets { 60 | go func(pingTarget pingTarget) { 61 | result := &base.CheckResult{ 62 | Checker: c.Name(), 63 | } 64 | err := pingOne(pingTarget.Address) 65 | if err != nil { 66 | result.Error = err.Error() 67 | result.Description = fmt.Sprintf("ping %s[%s] failed", pingTarget.Address, pingTarget.Name) 68 | result.Recommendations = pingTarget.Recomendations 69 | } else { 70 | result.Description = fmt.Sprintf("ping %s[%s] succeeded", pingTarget.Address, pingTarget.Name) 71 | } 72 | resultChan <- result 73 | 74 | }(target) 75 | } 76 | for i := 0; i < len(c.targets); i++ { 77 | result := <-resultChan 78 | results = append(results, result) 79 | } 80 | return results, nil 81 | } 82 | 83 | func pingOne(ip string) error { 84 | pinger, err := probing.NewPinger(ip) 85 | if err != nil { 86 | return err 87 | } 88 | 89 | pinger.Count = 3 90 | pinger.Interval = time.Millisecond * 20 91 | pinger.Timeout = time.Millisecond * 1000 92 | err = pinger.Run() 93 | if err != nil { 94 | return err 95 | } 96 | stats := pinger.Statistics() 97 | if stats.PacketsRecv <= 0 { 98 | return errors.New("ping receives no reply") 99 | } 100 | return nil 101 | } 102 | -------------------------------------------------------------------------------- /pkg/tools/vmrebootdetector/vmrebootdetector.go: -------------------------------------------------------------------------------- 1 | package vmrebootdetector 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "os/exec" 7 | "strings" 8 | "time" 9 | 10 | "github.com/fatih/color" 11 | flags "github.com/jessevdk/go-flags" 12 | 13 | "github.com/Azure/kdebug/pkg/base" 14 | ) 15 | 16 | var helpLink = []string{ 17 | "https://www.baeldung.com/linux/last-command", 18 | 
"https://man7.org/linux/man-pages/man1/last.1.html", 19 | } 20 | 21 | var explain = "You can also use `last` command to inspect above events. Columns in its output are user, login terminal, kernel version, login time, login period\n" 22 | 23 | type Tool struct { 24 | rebootCheckTImeInDay int 25 | } 26 | 27 | type Config struct { 28 | CheckDays int `short:"d" long:"checkdays" description:"Days you want to look back to search for reboot events." default:"30"` 29 | } 30 | 31 | func (t *Tool) Name() string { 32 | return "vmrebootDetector" 33 | } 34 | 35 | func New() *Tool { 36 | return &Tool{} 37 | } 38 | 39 | func (t *Tool) ParseArgs(ctx *base.ToolContext, args []string) error { 40 | var config Config 41 | remaningArgs, err := flags.ParseArgs(&config, args) 42 | if err != nil { 43 | return err 44 | } 45 | ctx.Config = &config 46 | ctx.Args = remaningArgs 47 | return nil 48 | } 49 | 50 | // Run todo: support batch mode 51 | func (t *Tool) Run(ctx *base.ToolContext) error { 52 | t.parseArgument(ctx) 53 | return t.exec() 54 | } 55 | 56 | func (t *Tool) parseArgument(ctx *base.ToolContext) { 57 | config := ctx.Config.(*Config) 58 | t.rebootCheckTImeInDay = config.CheckDays 59 | } 60 | 61 | func (t *Tool) exec() error { 62 | sinceTime := time.Now().Add(-time.Hour * 24 * time.Duration(t.rebootCheckTImeInDay)).Format("2006-01-02 15:04:05") 63 | cmd := exec.Command("last", "reboot", "--since", sinceTime, "--time-format", "iso") 64 | stdout, err := cmd.Output() 65 | if err != nil { 66 | return err 67 | } 68 | fmt.Println(t.parseResult(string(stdout))) 69 | return nil 70 | } 71 | 72 | func (t *Tool) parseResult(result string) string { 73 | sb := strings.Builder{} 74 | scanner := bufio.NewScanner(strings.NewReader(result)) 75 | var reboots []string 76 | for scanner.Scan() { 77 | text := scanner.Text() 78 | if text == "" { 79 | break 80 | } else { 81 | reboots = append(reboots, text) 82 | } 83 | } 84 | if reboots == nil { 85 | sb.WriteString(color.GreenString("No reboot found in 
past %v days\n", t.rebootCheckTImeInDay)) 86 | } else { 87 | sb.WriteString(color.YellowString("Detected following VM reboots:\n")) 88 | sb.WriteString("\n") 89 | sb.WriteString(strings.Join(reboots, "\n")) 90 | sb.WriteString("\n\n") 91 | sb.WriteString(color.YellowString(explain)) 92 | sb.WriteString("\n") 93 | sb.WriteString(color.YellowString("See also:\n")) 94 | sb.WriteString(color.YellowString(strings.Join(helpLink, "\n"))) 95 | } 96 | return sb.String() 97 | } 98 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). 
If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /pkg/checkers/podschedule/podschedule.go: -------------------------------------------------------------------------------- 1 | package podschedule 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | 7 | log "github.com/sirupsen/logrus" 8 | corev1 "k8s.io/api/core/v1" 9 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 10 | "k8s.io/client-go/kubernetes" 11 | 12 | "github.com/Azure/kdebug/pkg/base" 13 | ) 14 | 15 | type PodScheduleChecker struct { 16 | } 17 | 18 | func New() *PodScheduleChecker { 19 | return &PodScheduleChecker{} 20 | } 21 | 22 | func (c *PodScheduleChecker) Name() string { 23 | return "PodSchedule" 24 | } 25 | 26 | func (c *PodScheduleChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) { 27 | results := []*base.CheckResult{} 28 | 29 | if ctx.KubeClient != nil { 30 | results = append(results, c.checkPodSchedule(ctx.KubeClient)...) 31 | } else { 32 | log.Debugf("Skip %s due to missing Kubernetes config", c.Name()) 33 | } 34 | 35 | return results, nil 36 | } 37 | 38 | func (c *PodScheduleChecker) checkPodSchedule(clientset *kubernetes.Clientset) []*base.CheckResult { 39 | results := []*base.CheckResult{} 40 | 41 | // List all pods 42 | pods, err := clientset.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{}) 43 | if err != nil { 44 | log.WithFields(log.Fields{"error": err}).Warn("Fail to list pods") 45 | return results 46 | } 47 | 48 | // Group pods by replicaset 49 | podsByRs := make(map[string][]corev1.Pod) 50 | for _, pod := range pods.Items { 51 | if pod.ObjectMeta.OwnerReferences == nil || len(pod.ObjectMeta.OwnerReferences) == 0 { 52 | continue 53 | } 54 | 55 | ownerRef := pod.ObjectMeta.OwnerReferences[0] 56 | if ownerRef.APIVersion == "apps/v1" && 57 | ownerRef.Kind == "ReplicaSet" { 58 | 59 | rsName := pod.ObjectMeta.Namespace + "/" + ownerRef.Name 60 | if rsPods, ok := podsByRs[rsName]; ok { 61 | podsByRs[rsName] = append(rsPods, 
pod) 62 | } else { 63 | podsByRs[rsName] = []corev1.Pod{pod} 64 | } 65 | } 66 | } 67 | 68 | // Check replica sets 69 | for rsName, rsPods := range podsByRs { 70 | if len(rsPods) <= 1 { 71 | continue 72 | } 73 | 74 | results = append(results, c.checkPodsScheduleInReplicaSet(rsName, rsPods)) 75 | } 76 | 77 | return results 78 | } 79 | 80 | func (c *PodScheduleChecker) checkPodsScheduleInReplicaSet(rsName string, pods []corev1.Pod) *base.CheckResult { 81 | if len(pods) <= 1 { 82 | panic("Should not be called with less than 2 pods") 83 | } 84 | 85 | node := "" 86 | for _, pod := range pods { 87 | if node == "" { 88 | node = pod.Spec.NodeName 89 | } else if node != pod.Spec.NodeName { 90 | return &base.CheckResult{ 91 | Checker: c.Name(), 92 | Description: fmt.Sprintf("Pods in replica set %s are scheduled to different nodes", rsName), 93 | } 94 | } 95 | } 96 | return &base.CheckResult{ 97 | Checker: c.Name(), 98 | Error: fmt.Sprintf("All pods of replica set %s are scheduled on same node", rsName), 99 | Recommendations: []string{ 100 | "Please reference to document to set Affinity and anti-affinity: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity", 101 | }, 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /pkg/checkers/diskreadonly/disk_readonly.go: -------------------------------------------------------------------------------- 1 | package diskreadonly 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "os/exec" 7 | "strings" 8 | 9 | log "github.com/sirupsen/logrus" 10 | 11 | "github.com/Azure/kdebug/pkg/base" 12 | ) 13 | 14 | const ( 15 | Reason = "The filesystem mignt enter read-only state due to underlying data integrity issues." 16 | GeneralRecommendation = "Find out which filesystem your home dir is mounted on via 'df' command. Try to use 'fsck' command to fix the filesystem and then reboot the vm." 
17 | ) 18 | 19 | var helpLink = []string{ 20 | "linux.die.net/man/8/mount", 21 | "linux.die.net/man/8/fsck", 22 | "https://askubuntu.com/a/197468", 23 | } 24 | 25 | type DiskReadOnlyChecker struct { 26 | } 27 | 28 | func New() *DiskReadOnlyChecker { 29 | return &DiskReadOnlyChecker{} 30 | } 31 | 32 | func (c *DiskReadOnlyChecker) Name() string { 33 | return "DiskReadOnly" 34 | } 35 | 36 | func (c *DiskReadOnlyChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) { 37 | if !ctx.Environment.HasFlag("linux") { 38 | // This checker is only valid on Linux. 39 | log.Debugf("Skip %s checker in non-linux os", c.Name()) 40 | return []*base.CheckResult{}, nil 41 | } 42 | 43 | homeDir, err := os.UserHomeDir() 44 | if err != nil { 45 | return nil, fmt.Errorf("Fail to get user home dir. %w", err) 46 | } 47 | 48 | f, err := os.CreateTemp(homeDir, "testReadOnlyFile") 49 | var result *base.CheckResult 50 | if err != nil { 51 | var recommendation string 52 | if strings.Contains(strings.ToLower(err.Error()), "read-only") { 53 | mountSrc, mountTarget, findMntErr := getMountSrcAndTarget(homeDir) 54 | if findMntErr != nil { 55 | log.Warnf("Fail to find mount src for %s: %s", homeDir, findMntErr) 56 | recommendation = fmt.Sprintf("%s%s", Reason, GeneralRecommendation) 57 | } else { 58 | recommendation = fmt.Sprintf("%s Try to use 'fsck' command to fix the %s mounted on %s and then reboot the vm.", Reason, mountSrc, mountTarget) 59 | } 60 | result = &base.CheckResult{ 61 | Checker: c.Name(), 62 | Error: "Disk might be read-only", 63 | Description: fmt.Sprintf("Cannot create a temp file in %s due to %s", homeDir, err), 64 | Recommendations: []string{recommendation}, 65 | HelpLinks: []string{}, 66 | } 67 | } else { 68 | return nil, fmt.Errorf("Fail to create a temp file in %s due to unexpected error: %w", homeDir, err) 69 | } 70 | } else { 71 | defer os.Remove(f.Name()) 72 | result = &base.CheckResult{ 73 | Checker: c.Name(), 74 | Description: fmt.Sprintf("%s is not 
read-only", homeDir), 75 | } 76 | } 77 | 78 | return []*base.CheckResult{result}, nil 79 | } 80 | 81 | func getMountSrcAndTarget(homeDir string) (string, string, error) { 82 | findMntCmd := exec.Command("findmnt", "--target", homeDir, "--output", "SOURCE,TARGET", "--noheadings") 83 | mountDescription, err := findMntCmd.Output() 84 | if err != nil { 85 | return "", "", fmt.Errorf("Fail to find the filesystem of %s with command '%s': %w", 86 | homeDir, findMntCmd.String(), err) 87 | } else { 88 | mountDescriptions := strings.Split(strings.TrimSuffix(string(mountDescription), "\n"), " ") 89 | // mount source, mount target, error 90 | return mountDescriptions[0], mountDescriptions[1], nil 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /pkg/checkers/kube/objectsize/objectsize.go: -------------------------------------------------------------------------------- 1 | package dns 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | 8 | log "github.com/sirupsen/logrus" 9 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 10 | "k8s.io/client-go/kubernetes" 11 | 12 | "github.com/Azure/kdebug/pkg/base" 13 | "github.com/dustin/go-humanize" 14 | ) 15 | 16 | const ( 17 | WarnSizeThreshold = 800 * (1 << 10) // 800 KB 18 | ) 19 | 20 | type KubeObjectSizeChecker struct { 21 | } 22 | 23 | func New() *KubeObjectSizeChecker { 24 | return &KubeObjectSizeChecker{} 25 | } 26 | 27 | func (c *KubeObjectSizeChecker) Name() string { 28 | return "KubeObjectSize" 29 | } 30 | 31 | func (c *KubeObjectSizeChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) { 32 | results := []*base.CheckResult{} 33 | 34 | if ctx.KubeClient != nil { 35 | results = append(results, c.checkConfigMaps(ctx.KubeClient)...) 36 | results = append(results, c.checkSecrets(ctx.KubeClient)...) 
37 | } else { 38 | log.Warn("Skip KubeObjectSizeChecker due to missing kube client") 39 | } 40 | 41 | return results, nil 42 | } 43 | 44 | func (c *KubeObjectSizeChecker) checkConfigMaps(clientset *kubernetes.Clientset) []*base.CheckResult { 45 | results := []*base.CheckResult{} 46 | 47 | cms, err := clientset.CoreV1().ConfigMaps("").List(context.Background(), metav1.ListOptions{}) 48 | if err != nil { 49 | log.WithFields(log.Fields{"error": err}).Warn("Fail to list config maps") 50 | return results 51 | } 52 | 53 | for _, cm := range cms.Items { 54 | result := c.checkObjectSize("ConfigMap", cm.ObjectMeta.Namespace, cm.ObjectMeta.Name, cm) 55 | if result != nil { 56 | results = append(results, result) 57 | } 58 | } 59 | 60 | return results 61 | } 62 | 63 | func (c *KubeObjectSizeChecker) checkSecrets(clientset *kubernetes.Clientset) []*base.CheckResult { 64 | results := []*base.CheckResult{} 65 | 66 | cms, err := clientset.CoreV1().Secrets("").List(context.Background(), metav1.ListOptions{}) 67 | if err != nil { 68 | log.WithFields(log.Fields{"error": err}).Warn("Fail to list secrets") 69 | return results 70 | } 71 | 72 | for _, cm := range cms.Items { 73 | result := c.checkObjectSize("Secret", cm.ObjectMeta.Namespace, cm.ObjectMeta.Name, cm) 74 | if result != nil { 75 | results = append(results, result) 76 | } 77 | } 78 | 79 | return results 80 | } 81 | 82 | func (c *KubeObjectSizeChecker) checkObjectSize(kind, ns, name string, obj interface{}) *base.CheckResult { 83 | data, err := json.Marshal(obj) 84 | if err != nil { 85 | return nil 86 | } 87 | 88 | if len(data) > WarnSizeThreshold { 89 | return &base.CheckResult{ 90 | Checker: c.Name(), 91 | Error: fmt.Sprintf("%s %s/%s reaching size limit.", kind, ns, name), 92 | Description: fmt.Sprintf("%s %s/%s of size %s is reaching size limit. 
// copyFile copies the content of src to dst and flushes it to stable storage.
//
// dst is created with mode 0755 when it does not exist. When it already
// exists its previous content is discarded, so the result never contains
// stale trailing bytes from a longer earlier file. The first error from
// open, copy, sync or close is returned.
func copyFile(src, dst string) (err error) {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0755)
	if err != nil {
		return err
	}
	// A failing Close on a written file can mean lost data; surface it
	// unless an earlier error is already being returned.
	defer func() {
		if closeErr := out.Close(); err == nil {
			err = closeErr
		}
	}()

	if _, err = io.Copy(out, in); err != nil {
		return err
	}

	return out.Sync()
}
os.Remove(unitConfigPath) 63 | } 64 | 65 | func readOutputs() ([]byte, error) { 66 | f, err := os.Open(OutputFile) 67 | if err != nil { 68 | return nil, err 69 | } 70 | defer f.Close() 71 | return ioutil.ReadAll(f) 72 | } 73 | 74 | func main() { 75 | if len(os.Args) < 2 { 76 | log.Fatal("not enough args") 77 | } 78 | 79 | cmd := os.Args[1] 80 | cmdArgs := append(os.Args[2:], "--output", OutputFile) 81 | 82 | // Copy binary to host 83 | baseName := path.Base(cmd) 84 | dstPath := path.Join("/tmp", baseName) 85 | if err := copyFile(cmd, dstPath); err != nil { 86 | log.Fatalf("fail to copy file: %+v", err) 87 | } 88 | 89 | // Set up system config 90 | dstCmd := dstPath + " " + strings.Join(cmdArgs, " ") 91 | if err := writeSystemdUnit(dstCmd); err != nil { 92 | log.Fatalf("fail to write unit file: %+v", err) 93 | } 94 | 95 | // Invoke 96 | conn, err := dbus.NewSystemConnectionContext(context.Background()) 97 | if err != nil { 98 | log.Fatalf("fail to connect to systemd: %+v", err) 99 | } 100 | defer conn.Close() 101 | 102 | if err = conn.ReloadContext(context.Background()); err != nil { 103 | log.Fatalf("fail to reload systemd: %+v", err) 104 | } 105 | 106 | ch := make(chan string) 107 | _, err = conn.StartUnitContext(context.Background(), 108 | SystemdUnitName, "replace", ch) 109 | if err != nil { 110 | log.Fatalf("fail to start systemd unit: %+v", err) 111 | } 112 | 113 | select { 114 | case <-ch: 115 | break 116 | case <-time.After(75 * time.Second): 117 | log.Fatalf("timeout starting systemd unit") 118 | } 119 | 120 | output, err := readOutputs() 121 | if err != nil { 122 | log.Fatalf("fail to read output: %+v", err) 123 | } 124 | 125 | // Cleanup 126 | if err = removeSystemdUnit(); err != nil { 127 | log.Fatalf("fail to remove systemd unit: %+v", err) 128 | } 129 | 130 | if err = os.Remove(OutputFile); err != nil { 131 | log.Fatalf("fail to remove stdout file: %+v", err) 132 | } 133 | 134 | // Output 135 | os.Stdout.Write(output) 136 | } 137 | 
// Config holds the command line options of the tcpdump tool. The struct tags
// are read by the go-flags parser in ParseArgs.
//
// Source, Destination and Host take the form "<ip>:<port>". The value is
// split at the colon by ParseIPAndPort, and a value without a colon is
// treated as an address without a port.
type Config struct {
	Source      string `long:"source" description:"The source of the connection. Format: <ip>:<port>. Watch all sources if not assigned."`
	Destination string `long:"destination" description:"The destination of the connection. Format: <ip>:<port>. Watch all destination if not assigned."`
	Host        string `long:"host" description:"The host(either src or dst) of the connection. Format: <ip>:<port>. Watch all hosts if not assigned."`
	Pid         string `short:"p" long:"pid" description:"Attach into a specific pid's network namespace. Use current namespace if not assigned"`
	TcpOnly     bool   `long:"tcponly" description:"Only watch tcp connections"`
}
49 | } 50 | 51 | func (c *TcpdumpTool) ParseArgs(ctx *base.ToolContext, args []string) error { 52 | var config Config 53 | remainingArgs, err := flags.ParseArgs(&config, args) 54 | if err != nil { 55 | return err 56 | } 57 | ctx.Config = &config 58 | ctx.Args = remainingArgs 59 | return nil 60 | } 61 | 62 | func (c *TcpdumpTool) Run(ctx *base.ToolContext) error { 63 | config := ctx.Config.(*Config) 64 | c.ParseParameters(config) 65 | tcpdumpArgs := c.GenerateTcpdumpParamerters() 66 | 67 | // Attch pid 68 | if len(config.Pid) > 0 { 69 | _, err := logAndExec("nsenter", "-n", "-t", config.Pid).Output() 70 | 71 | if err != nil { 72 | return err 73 | } 74 | } 75 | 76 | cmd := logAndExec("tcpdump", strings.Split(tcpdumpArgs, " ")...) 77 | cmd.Stdout = os.Stdout 78 | cmd.Stderr = os.Stderr 79 | err := cmd.Run() 80 | return err 81 | } 82 | 83 | func (c *TcpdumpTool) ParseParameters(config *Config) { 84 | c.srcIP, c.srcPort = ParseIPAndPort(config.Source) 85 | c.dstIP, c.dstPort = ParseIPAndPort(config.Destination) 86 | c.hostIP, c.hostPort = ParseIPAndPort(config.Host) 87 | c.pid = config.Pid 88 | c.tcpOnly = config.TcpOnly 89 | } 90 | 91 | func (c *TcpdumpTool) GenerateTcpdumpParamerters() string { 92 | var cmd []string 93 | if len(c.srcIP) > 0 { 94 | cmd = append(cmd, fmt.Sprintf("src %s", c.srcIP)) 95 | } 96 | if len(c.srcPort) > 0 { 97 | cmd = append(cmd, fmt.Sprintf("src port %s", c.srcPort)) 98 | } 99 | if len(c.dstIP) > 0 { 100 | cmd = append(cmd, fmt.Sprintf("dst %s", c.dstIP)) 101 | } 102 | if len(c.dstPort) > 0 { 103 | cmd = append(cmd, fmt.Sprintf("dst port %s", c.dstPort)) 104 | } 105 | if len(c.hostIP) > 0 { 106 | cmd = append(cmd, fmt.Sprintf("host %s", c.hostIP)) 107 | } 108 | if len(c.hostPort) > 0 { 109 | cmd = append(cmd, fmt.Sprintf("port %s", c.hostPort)) 110 | } 111 | if c.tcpOnly { 112 | cmd = append(cmd, "tcp") 113 | } 114 | return DefaultTcpdumpArguments + " " + strings.Join(cmd, " and ") 115 | } 116 | 117 | func ParseIPAndPort(s string) (ip string, 
port string) { 118 | colon := strings.Index(s, ":") 119 | if colon == -1 { 120 | return s, "" 121 | } 122 | 123 | return s[0:colon], s[colon+1:] 124 | } 125 | -------------------------------------------------------------------------------- /pkg/tools/upgradeinspector/upgradeinspector.go: -------------------------------------------------------------------------------- 1 | package upgradeinspector 2 | 3 | import ( 4 | "fmt" 5 | "os/exec" 6 | "strings" 7 | "time" 8 | 9 | "github.com/Azure/kdebug/pkg/base" 10 | "github.com/Azure/kdebug/pkg/env" 11 | "github.com/fatih/color" 12 | flags "github.com/jessevdk/go-flags" 13 | ) 14 | 15 | const logPath = "/var/log/dpkg.log" 16 | 17 | const suggestion = "You can check '/var/log/dpkg.log' and '/var/log/apt/history.log' for further detail." 18 | 19 | var columns = []string{ 20 | "Timestamp", 21 | "Package", 22 | "OldVer", 23 | "NewVer", 24 | } 25 | 26 | type UpgradeInspectTool struct { 27 | checkDays int 28 | recordLimit int 29 | } 30 | 31 | type Config struct { 32 | CheckDays int `long:"checkdays" default:"7" description:"Days you want to look back to search for package upgrade history. Default is 7."` 33 | RecordLimit int `long:"recordlimit" default:"50" description:"Number of records you want to inspect for package upgrade history. 
Default is 50."` 34 | } 35 | 36 | func (t *UpgradeInspectTool) Name() string { 37 | return "upgradeinspector" 38 | } 39 | 40 | func New() *UpgradeInspectTool { 41 | return &UpgradeInspectTool{} 42 | } 43 | 44 | func (t *UpgradeInspectTool) ParseArgs(ctx *base.ToolContext, args []string) error { 45 | var config Config 46 | remaningArgs, err := flags.ParseArgs(&config, args) 47 | if err != nil { 48 | return err 49 | } 50 | ctx.Config = &config 51 | ctx.Args = remaningArgs 52 | return nil 53 | } 54 | 55 | func (t *UpgradeInspectTool) Run(ctx *base.ToolContext) error { 56 | t.parseArgument(ctx) 57 | if !envCheck(ctx.Environment) { 58 | fmt.Println(color.YellowString("Skip upgrade inspect in non ubuntu/debian os")) 59 | return nil 60 | } 61 | return t.exec() 62 | } 63 | 64 | func (t *UpgradeInspectTool) parseArgument(ctx *base.ToolContext) { 65 | config := ctx.Config.(*Config) 66 | t.checkDays = config.CheckDays 67 | t.recordLimit = config.RecordLimit 68 | } 69 | 70 | func (t *UpgradeInspectTool) exec() error { 71 | cmd := exec.Command("grep", " upgrade ", logPath) 72 | stdout, err := cmd.Output() 73 | if err != nil { 74 | return err 75 | } 76 | fmt.Println(t.parseResult(string(stdout))) 77 | fmt.Println(color.YellowString("\n%v\n", suggestion)) 78 | return nil 79 | } 80 | 81 | func (t *UpgradeInspectTool) parseResult(result string) string { 82 | sb := strings.Builder{} 83 | logs := t.filterResult(result) 84 | logNum := len(logs) 85 | 86 | if logNum == 0 { 87 | sb.WriteString(color.GreenString("\nNo package upgrade log found\n")) 88 | } else { 89 | sb.WriteString(fmt.Sprintf("\n%-19s\t%-40s\t%-30s\t%-30s\n\n", columns[0], columns[1], columns[2], columns[3])) 90 | } 91 | 92 | for i := 0; i < logNum && i < t.recordLimit; i++ { 93 | strs := strings.Split(logs[i], " ") 94 | sb.WriteString(fmt.Sprintf("%v-%v\t%-40s\t%-30s\t%-30s\n", strs[0], strs[1], strs[3], strs[4], strs[5])) 95 | } 96 | if t.recordLimit < logNum { 97 | sb.WriteString(color.YellowString("\n%v package(s) 
omitted\n", logNum-t.recordLimit)) 98 | } 99 | return sb.String() 100 | } 101 | 102 | func (t *UpgradeInspectTool) filterResult(result string) []string { 103 | logs := strings.Split(result, "\n") 104 | filtered := []string{} 105 | cutTime := time.Now().AddDate(0, 0, -t.checkDays) 106 | 107 | for i := 0; i < len(logs)-1; i++ { 108 | strs := strings.Split(logs[i], " ") 109 | logTime, err := time.Parse("2006-01-02 15:04:05", fmt.Sprintf(`%s %s`, strs[0], strs[1])) 110 | if err == nil && logTime.After(cutTime) { 111 | filtered = append(filtered, logs[i]) 112 | } 113 | } 114 | return filtered 115 | } 116 | 117 | func envCheck(environment env.Environment) bool { 118 | return environment.HasFlag("ubuntu") || environment.HasFlag("debian") 119 | } 120 | -------------------------------------------------------------------------------- /pkg/tools/aadssh/token_azure_cli.go: -------------------------------------------------------------------------------- 1 | package aadssh 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "io/ioutil" 8 | "net/http" 9 | "net/url" 10 | "os" 11 | "path" 12 | "strings" 13 | "time" 14 | 15 | msal "github.com/AzureAD/microsoft-authentication-library-for-go/apps/public" 16 | log "github.com/sirupsen/logrus" 17 | ) 18 | 19 | // acquireTokenByAzureCLI acquires a token from AAD using Azure CLI credentials 20 | func acquireTokenByAzureCLI(ctx context.Context, scopes []string, data map[string]string) (msal.AuthResult, error) { 21 | homeDir, err := os.UserHomeDir() 22 | if err != nil { 23 | return msal.AuthResult{}, fmt.Errorf("Fail to get OS home dir: %+v", err) 24 | } 25 | 26 | tokenCacheFilePath := path.Join(homeDir, AzureCLIDirName, AzureCLITokenCacheFileName) 27 | f, err := os.Open(tokenCacheFilePath) 28 | if err != nil { 29 | return msal.AuthResult{}, fmt.Errorf("Fail to read Azure CLI token cache: %+v", err) 30 | } 31 | defer f.Close() 32 | 33 | decoder := json.NewDecoder(f) 34 | var tokenCache struct { 35 | RefreshToken 
map[string]struct { 36 | CredentialType string `json:"credential_type"` 37 | Secret string `json:"secret"` 38 | ClientID string `json:"client_id"` 39 | HomeAccountID string `json:"home_account_id"` 40 | Environment string `json:"environment"` 41 | } `json:"RefreshToken"` 42 | } 43 | err = decoder.Decode(&tokenCache) 44 | if err != nil { 45 | return msal.AuthResult{}, fmt.Errorf("Fail to decode Azure CLI token cache: %+v", err) 46 | } 47 | 48 | var refreshToken string 49 | var tenantId string 50 | var clientId string 51 | var host string 52 | for _, token := range tokenCache.RefreshToken { 53 | // TODO: Add more checks 54 | if token.CredentialType == "RefreshToken" { 55 | refreshToken = token.Secret 56 | tenantId = strings.Split(token.HomeAccountID, ".")[1] 57 | host = token.Environment 58 | break 59 | } 60 | } 61 | 62 | if refreshToken == "" { 63 | return msal.AuthResult{}, fmt.Errorf("Cannot find any refresh token in Azure CLI token cache. Please do `az login`") 64 | } 65 | 66 | defaultScopes := []string{ 67 | "openid", 68 | "profile", 69 | "offline_access", 70 | } 71 | values := url.Values{} 72 | values.Add("client_id", clientId) 73 | values.Add("grant_type", "refresh_token") 74 | values.Add("scope", strings.Join(append(scopes, defaultScopes...), " ")) 75 | values.Add("refresh_token", refreshToken) 76 | for k, v := range data { 77 | values.Add(k, v) 78 | } 79 | bodyString := values.Encode() 80 | bodyStream := strings.NewReader(bodyString) 81 | 82 | url := fmt.Sprintf("https://%s/%s%s", host, tenantId, TokenURLSuffix) 83 | log.WithFields(log.Fields{"body": bodyString, "url": url}).Debug("Token request") 84 | 85 | req, err := http.NewRequestWithContext(ctx, "POST", url, bodyStream) 86 | if err != nil { 87 | return msal.AuthResult{}, fmt.Errorf("Fail to construct request: %+v", err) 88 | } 89 | 90 | httpClient := &http.Client{ 91 | Timeout: time.Minute, 92 | } 93 | resp, err := httpClient.Do(req) 94 | if err != nil { 95 | return msal.AuthResult{}, fmt.Errorf("Fail 
to request token: %+v", err) 96 | } 97 | defer resp.Body.Close() 98 | 99 | if resp.StatusCode != http.StatusOK { 100 | respContent, _ := ioutil.ReadAll(resp.Body) 101 | return msal.AuthResult{}, 102 | fmt.Errorf("Unexpected token response status code: %d. Body: %s", 103 | resp.StatusCode, string(respContent)) 104 | } 105 | 106 | var body struct { 107 | AccessToken string `json:"access_token"` 108 | TokenType string `json:"token_type"` 109 | ExpiresIn int `json:"expires_in"` 110 | Scope string `json:"scope"` 111 | RefreshToken string `json:"refresh_token"` 112 | IDToken string `json:"id_token"` 113 | } 114 | decoder = json.NewDecoder(resp.Body) 115 | err = decoder.Decode(&body) 116 | if err != nil { 117 | return msal.AuthResult{}, fmt.Errorf("Fail to decode token response: %+v", err) 118 | } 119 | 120 | log.WithFields(log.Fields{"body": fmt.Sprintf("%+v", body)}).Debug("Token response") 121 | 122 | return msal.AuthResult{ 123 | AccessToken: body.AccessToken, 124 | ExpiresOn: time.Now().Add(time.Duration(body.ExpiresIn) * time.Second), 125 | }, nil 126 | } 127 | -------------------------------------------------------------------------------- /pkg/tools/netexec/netexec.go: -------------------------------------------------------------------------------- 1 | package netexec 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "os/exec" 7 | "strings" 8 | 9 | "github.com/Azure/kdebug/pkg/base" 10 | "github.com/jessevdk/go-flags" 11 | log "github.com/sirupsen/logrus" 12 | "k8s.io/cli-runtime/pkg/genericclioptions" 13 | kubecmd "k8s.io/kubectl/pkg/cmd" 14 | ) 15 | 16 | type Config struct { 17 | Pid string `long:"pid" description:"Attach into a specific pid's network namespace."` 18 | PodName string `long:"pod" description:"Attach into a specific pod's network namespace. 
Caution: The command will use ephemeral debug container to attach a container with 'ghcr.io/azure/kdebug:main' to the target pod."` 19 | Namespace string `long:"namespace" description:"The namespace of the pod."` 20 | Command string `long:"command" description:"Customize the command to be run in container namespace. Leave it blank to use 'sh'."` 21 | Image string `long:"image" description:"Customize the image to be used to run command when using --pod. Leave it blank to use busybox."` 22 | } 23 | 24 | type NetexecTool struct { 25 | pid string 26 | podName string 27 | namespace string 28 | command string 29 | image string 30 | } 31 | 32 | const ( 33 | DefaultCommand = "sh" 34 | DefaultContainerImage = "busybox" 35 | DefaultNamespace = "default" 36 | DefaultKubectlBasicCommandFormat = "debug -ti %s --image %s -n %s -- " 37 | ) 38 | 39 | func New() *NetexecTool { 40 | return &NetexecTool{} 41 | } 42 | 43 | func (c *NetexecTool) Name() string { 44 | return "Netexec" 45 | } 46 | 47 | func logAndExec(name string, args ...string) *exec.Cmd { 48 | log.Infof("Exec %s %+v", name, args) 49 | return exec.Command(name, args...) 
50 | } 51 | 52 | func (c *NetexecTool) Run(ctx *base.ToolContext) error { 53 | if len(c.pid) > 0 { 54 | return c.checkWithPid() 55 | } 56 | 57 | return c.checkWithPod(ctx.KubeConfigFlag) 58 | } 59 | 60 | func (c *NetexecTool) ParseArgs(ctx *base.ToolContext, args []string) error { 61 | var config Config 62 | remainingArgs, err := flags.ParseArgs(&config, args) 63 | if err != nil { 64 | return err 65 | } 66 | ctx.Config = &config 67 | ctx.Args = remainingArgs 68 | return c.parseAndCheckParameters(ctx) 69 | } 70 | 71 | func (c *NetexecTool) parseAndCheckParameters(ctx *base.ToolContext) error { 72 | config := ctx.Config.(*Config) 73 | 74 | if len(config.Pid) == 0 && len(config.PodName) == 0 { 75 | return fmt.Errorf("Either --pid and --pod should be set.") 76 | } 77 | if len(config.Pid) > 0 && len(config.PodName) > 0 { 78 | return fmt.Errorf("--pid and --pod can not be assigned together. Please set either of them.") 79 | } 80 | if len(config.PodName) > 0 { 81 | if ctx.KubeConfigFlag == nil { 82 | return fmt.Errorf("kubernetes client is not availble. Check kubeconfig.") 83 | } 84 | } 85 | 86 | c.pid = config.Pid 87 | c.podName = config.PodName 88 | if len(config.Command) > 0 { 89 | c.command = config.Command 90 | } else { 91 | c.command = DefaultCommand 92 | } 93 | 94 | if len(config.Image) > 0 { 95 | c.image = config.Image 96 | } else { 97 | c.image = DefaultContainerImage 98 | } 99 | 100 | if len(config.Namespace) > 0 { 101 | c.namespace = config.Namespace 102 | } else { 103 | c.namespace = DefaultNamespace 104 | } 105 | 106 | return nil 107 | } 108 | 109 | func (c *NetexecTool) checkWithPid() error { 110 | _, err := logAndExec("nsenter", "-n", "-t", c.pid).Output() 111 | if err != nil { 112 | return err 113 | } 114 | 115 | args := strings.Fields(c.command) 116 | cmd := logAndExec(args[0], args[1:]...) 
117 | cmd.Stdin = os.Stdin 118 | cmd.Stdout = os.Stdout 119 | cmd.Stderr = os.Stderr 120 | return cmd.Run() 121 | } 122 | 123 | func (c *NetexecTool) checkWithPod(configFlags *genericclioptions.ConfigFlags) error { 124 | cmd := fmt.Sprintf("%s%s", fmt.Sprintf(DefaultKubectlBasicCommandFormat, c.podName, c.image, c.namespace), c.command) 125 | arg := strings.Fields(cmd) 126 | log.Infof("The command is equivalent to 'kubectl %s'", cmd) 127 | kubectlCmd := kubecmd.NewKubectlCommand(kubecmd.KubectlOptions{ 128 | ConfigFlags: configFlags, 129 | IOStreams: genericclioptions.IOStreams{In: os.Stdin, Out: os.Stdout, ErrOut: os.Stderr}, 130 | }) 131 | kubectlCmd.SetArgs(arg) 132 | 133 | return kubectlCmd.Execute() 134 | } 135 | -------------------------------------------------------------------------------- /pkg/batch/ssh_executor.go: -------------------------------------------------------------------------------- 1 | package batch 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "net" 8 | "os" 9 | "os/user" 10 | 11 | scp "github.com/bramvdbogaerde/go-scp" 12 | log "github.com/sirupsen/logrus" 13 | "golang.org/x/crypto/ssh" 14 | "golang.org/x/crypto/ssh/agent" 15 | ) 16 | 17 | type SshBatchExecutor struct { 18 | User string 19 | } 20 | 21 | func NewSshBatchExecutor(userName string) *SshBatchExecutor { 22 | e := &SshBatchExecutor{ 23 | User: userName, 24 | } 25 | if len(e.User) == 0 { 26 | // Use current user 27 | ui, err := user.Current() 28 | if err == nil { 29 | e.User = ui.Username 30 | } 31 | } 32 | return e 33 | } 34 | 35 | func (e *SshBatchExecutor) Execute(opts *BatchOptions) ([]*BatchResult, error) { 36 | taskChan := make(chan *batchTask, opts.Concurrency) 37 | resultChan := make(chan *BatchResult, opts.Concurrency) 38 | 39 | for i := 0; i < opts.Concurrency; i++ { 40 | go e.startWorker(taskChan, resultChan) 41 | } 42 | 43 | for _, machine := range opts.Machines { 44 | go func(m string) { 45 | taskChan <- &batchTask{ 46 | Machine: m, 47 | Checkers: 
opts.Checkers, 48 | } 49 | }(machine) 50 | } 51 | 52 | results := make([]*BatchResult, 0, len(opts.Machines)) 53 | for i := 0; i < len(opts.Machines); i++ { 54 | result := <-resultChan 55 | results = append(results, result) 56 | opts.Reporter.OnResult(result) 57 | } 58 | 59 | close(taskChan) 60 | 61 | return results, nil 62 | } 63 | 64 | func (e *SshBatchExecutor) startWorker(taskChan chan *batchTask, resultChan chan *BatchResult) { 65 | for task := range taskChan { 66 | resultChan <- e.executeTask(task) 67 | } 68 | } 69 | 70 | func (e *SshBatchExecutor) createSshClient(machine string) (*ssh.Client, error) { 71 | // TODO: One per SSH client 72 | authSock := os.Getenv("SSH_AUTH_SOCK") 73 | authConn, err := net.Dial("unix", authSock) 74 | if err != nil { 75 | return nil, fmt.Errorf("fail to connect to SSH_AUTH_SOCK: %+v", err) 76 | } 77 | 78 | agentClient := agent.NewClient(authConn) 79 | config := &ssh.ClientConfig{ 80 | User: e.User, 81 | Auth: []ssh.AuthMethod{ 82 | ssh.PublicKeysCallback(agentClient.Signers), 83 | }, 84 | HostKeyCallback: ssh.InsecureIgnoreHostKey(), 85 | } 86 | 87 | return ssh.Dial("tcp", machine+":22", config) 88 | } 89 | 90 | func (e *SshBatchExecutor) executeTask(task *batchTask) *BatchResult { 91 | result := &BatchResult{ 92 | Machine: task.Machine, 93 | } 94 | 95 | sshClient, err := e.createSshClient(task.Machine) 96 | if err != nil { 97 | result.Error = fmt.Errorf("fail to create SSH client: %+v", err) 98 | return result 99 | } 100 | defer sshClient.Close() 101 | 102 | // Copy binary to remote 103 | log.Debugf("Copy kdebug to %s", task.Machine) 104 | err = copyExecutable(sshClient) 105 | if err != nil { 106 | result.Error = fmt.Errorf("fail to copy kdebug to remote machine: %+v", err) 107 | return result 108 | } 109 | 110 | sess, err := sshClient.NewSession() 111 | if err != nil { 112 | result.Error = fmt.Errorf("fail to create SSH session: %+v", err) 113 | return result 114 | } 115 | defer sess.Close() 116 | 117 | // Execute command 118 | 
cmd := fmt.Sprintf("/tmp/kdebug -f json --no-set-exit-code") 119 | for _, c := range task.Checkers { 120 | cmd += fmt.Sprintf(" -c %s", c) 121 | } 122 | log.Debugf("Execute kdebug on %s. Cmd: %s", task.Machine, cmd) 123 | output, err := sess.Output(cmd) 124 | if err != nil { 125 | result.Error = fmt.Errorf("fail to run kdebug on remote machine: %+v", err) 126 | return result 127 | } 128 | 129 | // Build result 130 | log.Debugf("Aggregate results from %s", task.Machine) 131 | result.Error = json.Unmarshal(output, &result.CheckResults) 132 | return result 133 | } 134 | 135 | func copyExecutable(sshClient *ssh.Client) error { 136 | path, err := os.Executable() 137 | if err != nil { 138 | return fmt.Errorf("fail to determine current executable location: %+v", err) 139 | } 140 | 141 | f, err := os.Open(path) 142 | if err != nil { 143 | return fmt.Errorf("fail to open file %s: %+v", path, err) 144 | } 145 | defer f.Close() 146 | 147 | scpClient, err := scp.NewClientBySSH(sshClient) 148 | if err != nil { 149 | return fmt.Errorf("fail to create SCP client: %+v", err) 150 | } 151 | 152 | return scpClient.CopyFromFile(context.Background(), *f, "/tmp/kdebug", "0755") 153 | } 154 | -------------------------------------------------------------------------------- /pkg/checkers/oom/oom.go: -------------------------------------------------------------------------------- 1 | package oom 2 | 3 | import ( 4 | "bufio" 5 | "errors" 6 | "fmt" 7 | "io" 8 | "os" 9 | "regexp" 10 | "strings" 11 | "syscall" 12 | 13 | "github.com/Azure/kdebug/pkg/base" 14 | ) 15 | 16 | const ( 17 | kmsgLogPath = "/dev/kmsg" 18 | ubuntuLogPath = "/var/log/kern.log" 19 | cgroupOOMKeyStr = "Memory cgroup out of memory" 20 | outOfMemoryKey = "Out of memory" 21 | ) 22 | 23 | var helpLink = []string{ 24 | "https://www.kernel.org/doc/gorman/html/understand/understand016.html", 25 | "https://stackoverflow.com/questions/18845857/what-does-anon-rss-and-total-vm-mean", 26 | 
"https://medium.com/tailwinds-navigator/kubernetes-tip-how-does-oomkilled-work-ba71b135993b", 27 | } 28 | 29 | var oomRegex = regexp.MustCompile("^(.*:.{2}:.{2}) .* process (.*) \\((.*)\\) .* anon-rss:(.*), file-rss.* oom_score_adj:(.*)") 30 | 31 | type OOMChecker struct { 32 | kernLogPath string 33 | } 34 | 35 | func (c *OOMChecker) Name() string { 36 | return "OOM" 37 | } 38 | 39 | func New() *OOMChecker { 40 | paths := []string{kmsgLogPath, ubuntuLogPath} 41 | for _, path := range paths { 42 | if file, err := os.Open(path); err == nil { 43 | file.Close() 44 | return &OOMChecker{ 45 | kernLogPath: path, 46 | } 47 | } 48 | } 49 | return &OOMChecker{ 50 | kernLogPath: "", 51 | } 52 | } 53 | 54 | func (c *OOMChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) { 55 | var results []*base.CheckResult 56 | oomResult, err := c.checkOOM(ctx) 57 | if err != nil { 58 | return nil, err 59 | } 60 | results = append(results, oomResult) 61 | return results, nil 62 | } 63 | 64 | func (c *OOMChecker) checkOOM(ctx *base.CheckContext) (*base.CheckResult, error) { 65 | result := &base.CheckResult{ 66 | Checker: c.Name(), 67 | } 68 | //todo:support other os 69 | if !ctx.Environment.HasFlag("linux") { 70 | result.Description = fmt.Sprint("Skip oom check in non-linux os") 71 | return result, nil 72 | } 73 | if c.kernLogPath == "" { 74 | result.Description = fmt.Sprint("Skip oom check because of can't access supported kern log path") 75 | return result, nil 76 | } 77 | oomInfos, err := c.getAndParseOOMLog() 78 | if err != nil { 79 | return nil, err 80 | } else if len(oomInfos) > 0 { 81 | result.Error = strings.Join(oomInfos, "\n") 82 | result.Description = "Detect process oom killed" 83 | result.HelpLinks = helpLink 84 | } else { 85 | result.Description = "No OOM found in recent kernlog." 
86 | } 87 | return result, nil 88 | } 89 | 90 | type nonBlockReader struct { 91 | fd int 92 | } 93 | 94 | func (r *nonBlockReader) Read(buf []byte) (n int, err error) { 95 | n, err = syscall.Read(r.fd, buf) 96 | if err != nil { 97 | if errors.Is(err, syscall.EAGAIN) { 98 | return 0, io.EOF 99 | } 100 | } 101 | if n == 0 && err == nil { 102 | return 0, io.EOF 103 | } 104 | return n, err 105 | } 106 | 107 | func (c *OOMChecker) getAndParseOOMLog() ([]string, error) { 108 | file, err := os.Open(c.kernLogPath) 109 | if err != nil { 110 | return nil, err 111 | } 112 | defer file.Close() 113 | 114 | fd := int(file.Fd()) 115 | if err = syscall.SetNonblock(fd, true); err != nil { 116 | return nil, fmt.Errorf("Fail to read in non-block mode: %s", err) 117 | } 118 | 119 | var oomInfos []string 120 | scanner := bufio.NewScanner(&nonBlockReader{fd}) 121 | for scanner.Scan() { 122 | tmp := scanner.Text() 123 | //todo: more sophisticated OOM context 124 | //pattern match. https://github.com/torvalds/linux/blob/551acdc3c3d2b6bc97f11e31dcf960bc36343bfc/mm/oom_kill.c#L1120, https://github.com/torvalds/linux/blob/551acdc3c3d2b6bc97f11e31dcf960bc36343bfc/mm/oom_kill.c#L895 125 | if strings.Contains(tmp, cgroupOOMKeyStr) || strings.Contains(tmp, outOfMemoryKey) { 126 | oomInfo, err := parseOOMContent(tmp) 127 | if err != nil { 128 | return nil, err 129 | } else { 130 | oomInfos = append(oomInfos, oomInfo) 131 | } 132 | } 133 | } 134 | 135 | if err := scanner.Err(); err != nil { 136 | return nil, err 137 | } 138 | return oomInfos, nil 139 | } 140 | 141 | func parseOOMContent(content string) (string, error) { 142 | match := oomRegex.FindStringSubmatch(content) 143 | if len(match) != 6 { 144 | err := fmt.Errorf("Can't parse oom content:%s \n", content) 145 | return "", err 146 | } else { 147 | return fmt.Sprintf("progress:[%s %s] is OOM kill at time [%s]. 
[rss:%s] [oom_score_adj:%s]\n", match[2], match[3], match[1], match[4], match[5]), nil 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /pkg/checkers/tcpping/tcpping.go: -------------------------------------------------------------------------------- 1 | package tcpping 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net" 7 | "strings" 8 | "time" 9 | 10 | v1 "k8s.io/api/core/v1" 11 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 12 | 13 | "github.com/Azure/kdebug/pkg/base" 14 | ) 15 | 16 | const KubernetesServiceHost = "KUBERNETES_SERVICE_HOST" 17 | const TimeOut = 1000 * time.Millisecond 18 | 19 | var PublicTargets = []pingEndpoint{ 20 | { 21 | ServerAddress: "www.google.com:443", 22 | Name: "Google", 23 | NameSpace: "", 24 | }, 25 | } 26 | 27 | type pingEndpoint struct { 28 | ServerAddress string 29 | Name string 30 | NameSpace string 31 | } 32 | 33 | func (t *TCPChecker) ping(serverAddr string) error { 34 | conn, err := t.dialer.Dial("tcp", serverAddr) 35 | if err != nil { 36 | return err 37 | } 38 | defer conn.Close() 39 | conn.(*net.TCPConn).SetLinger(0) 40 | return nil 41 | } 42 | 43 | type TCPChecker struct { 44 | dialer net.Dialer 45 | targets []pingEndpoint 46 | } 47 | 48 | func New() *TCPChecker { 49 | return &TCPChecker{ 50 | dialer: net.Dialer{ 51 | Timeout: TimeOut, 52 | }, 53 | } 54 | } 55 | 56 | func (t *TCPChecker) Name() string { 57 | return "TcpChecker" 58 | } 59 | 60 | func (t *TCPChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) { 61 | var results []*base.CheckResult 62 | targets := append(t.targets, getCheckTargets(ctx)...) 
63 | resultChan := make(chan *base.CheckResult, len(targets)) 64 | for _, pingTarget := range targets { 65 | go func(target pingEndpoint) { 66 | result := &base.CheckResult{ 67 | Checker: t.Name(), 68 | } 69 | err := t.ping(target.ServerAddress) 70 | sb := strings.Builder{} 71 | if err != nil { 72 | sb.WriteString(fmt.Sprintf("Fail to establish tcp connection to %s (%s) ", 73 | target.ServerAddress, target.Name)) 74 | result.Error = err.Error() 75 | result.Recommendations = []string{"Check firewall settings if this is not expected."} 76 | } else { 77 | sb.WriteString(fmt.Sprintf("Successfully establish tcp connection to %s (%s)", target.ServerAddress, target.Name)) 78 | } 79 | if target.NameSpace != "" { 80 | sb.WriteString(fmt.Sprintf(" in namespace %s", target.NameSpace)) 81 | } 82 | sb.WriteString("\n") 83 | result.Description = sb.String() 84 | resultChan <- result 85 | }(pingTarget) 86 | } 87 | for i := 0; i < len(targets); i++ { 88 | result := <-resultChan 89 | results = append(results, result) 90 | } 91 | return results, nil 92 | } 93 | 94 | func getCheckTargets(c *base.CheckContext) []pingEndpoint { 95 | var targets []pingEndpoint 96 | targets = append(targets, PublicTargets...) 97 | // TODO: A bit noisy. Maybe add a new subset option for user to enable these checks 98 | // if c.KubeClient != nil { 99 | // services, err := getServicePingEndpoint(c) 100 | // if err != nil { 101 | // log.Warnf("Fetch cluster service ping endpoint error %v.Skip those checks", err) 102 | // } else { 103 | // targets = append(targets, services...) 
104 | // } 105 | // } 106 | return targets 107 | } 108 | 109 | func getServicePingEndpoint(c *base.CheckContext) ([]pingEndpoint, error) { 110 | services, err := c.KubeClient.CoreV1().Services("").List(context.TODO(), metav1.ListOptions{}) 111 | isInKubernetes := c.Environment.HasFlag("k8s") 112 | if err != nil { 113 | return nil, err 114 | } 115 | var pingEndpoints []pingEndpoint 116 | for _, service := range services.Items { 117 | for _, port := range service.Spec.Ports { 118 | if port.Protocol == v1.ProtocolTCP { 119 | address := formatIP(service.Spec.LoadBalancerIP) 120 | if address == "" && len(service.Status.LoadBalancer.Ingress) > 0 { 121 | address = formatIP(service.Status.LoadBalancer.Ingress[0].IP) 122 | } 123 | if address == "" && isInKubernetes { 124 | address = formatIP(service.Spec.ClusterIP) 125 | } 126 | if address != "" { 127 | serverUrl := fmt.Sprintf("%s:%d", address, port.Port) 128 | pingEndpoints = append(pingEndpoints, pingEndpoint{ 129 | ServerAddress: serverUrl, 130 | Name: service.Name, 131 | NameSpace: service.Namespace, 132 | }) 133 | } 134 | } 135 | } 136 | 137 | } 138 | return pingEndpoints, nil 139 | } 140 | 141 | func formatIP(address string) string { 142 | if address == "" || address == "None" { 143 | return "" 144 | } 145 | if strings.Contains(address, ":") { 146 | return fmt.Sprintf("[%s]", address) 147 | } else { 148 | return address 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /deploy/node-problem-detector/README.md: -------------------------------------------------------------------------------- 1 | ## What is npd-kdebug 2 | 3 | [node-problem-detector](https://github.com/kubernetes/node-problem-detector) aims to make various node problems visible to the upstream layers in the cluster management stack. It is a daemon that runs on each node, detects node problems and reports them to apiserver. node-problem-detector can either run as a DaemonSet or run standalone. 
Now it is running as a Kubernetes Addon enabled by default in the GCE cluster. 4 | 5 | In this example, we integrate the node-problem-detector with kdebug. After deploying kdebug with a specific check mode and node-problem-detector, kdebug will try to detect potential problems under the check mode. The results of the check are passed to node-problem-detector and reported through the problem APIs of node-problem-detector. 6 | * `NodeCondition`: Permanent problem that makes the node unavailable for pods should 7 | be reported as `NodeCondition`. 8 | * `Event`: Temporary problem that has limited impact on pod but is informative 9 | should be reported as `Event`. 10 | 11 | We call the integration of node-problem-detector (npd) and kdebug **npd-kdebug**. 12 | 13 | ## How to deploy npd-kdebug 14 | 15 | We have already prepared a [sample yaml](./node-problem-detector.yaml) file to help you deploy the npd-kdebug integration with the DNS check mode. You can run the following command to deploy the integrated daemon app to your Kubernetes cluster. 16 | ```shell 17 | kubectl apply -f ./node-problem-detector.yaml 18 | ``` 19 | 20 | ## What can npd-kdebug show you 21 | 22 | ### Check the npd-kdebug is ready 23 | * In the Kubernetes dashboard, you can click `Daemon Sets` in the side bar. 24 | * If you see information like the following picture, it means that npd-kdebug is working on your cluster. 25 | 26 | ![image](../../resource/npd/npd-dashboard-daemonsets.png) 27 | 28 | ### Check the problem detecting result of kdebug check 29 | * Click `Cluster` > `Nodes`, and select a node. 30 | * In the 'Conditions' tag, you can see a `'DNSProblem'` type. It is a type of problem that is detected by kdebug and reported to node-problem-detector, which then shows the `Status` and `Messages`. 31 | * If `Status=False`, it means there is no DNS problem. 
32 | * If `Status=True`, it means npd-kdebug detected some DNS problems, and the error messages are shown in `Messages`. 33 | 34 | ![image](../../resource/npd/npd-dashboard-DNSProblem.png) 35 | 36 | ## Customization 37 | 38 | Besides the `DNSProblem` check, you can integrate other kdebug check modes with npd. To customize npd-kdebug for a different check mode, you can follow the step-by-step tutorial in this section. 39 | 40 | * Step 1: Copy the [template yaml](./node-problem-detector-template.yaml) and open it. 41 | * Step 2: Replace `` with a json file name you want at `line 42`. We recommend including the name of the check mode you want to deploy. For example, `kdebug-http`. 42 | ![image](../../resource/npd/yml-your_json_name.png) 43 | * Step 3: Now you should edit your config json. This part describes the parameters used to run `kdebug`, which acts as the custom plugin of npd. 44 | * Step 4: You should replace `` with the json file name you entered in Step 2. 45 | ![image](../../resource/npd/yml-json-your_json_name.png) 46 | 47 | * Step 5: In `"conditions"`, you should fill in the values of `type`, `reason` and `message`, which will be shown in the dashboard if the check by `kdebug` passes. 48 | ![image](../../resource/npd/yml-json-conditions.png) 49 | 50 | * Step 6: You can define rules by editing the `rules` property. Because npd supports two different types of check, you should define two rules, of the `temporary` and `permanent` types. In practice, you can define both rules with the same `kdebug` command parameters. 51 | 52 | You can replace `` and `reason` as you want. For `` in the `args` property, please replace it with the name of the check mode (e.g. `'http'` for the HTTP check). Because you can only define certain conditions in Step 5, you should include the `-c` flag in `args`; otherwise kdebug will execute all check modes. The `-f` flag formats the output of the kdebug check, which will be shown as [Message](#what-can-npd-kdebug-show-you) in the dashboard. 
For more supported arguments of kdebug you can use, please refer to the help messages by running the following command. 53 | 54 | ```shell 55 | kdebug -h 56 | ``` 57 | ![image](../../resource/npd/yml-json-rules.png) 58 | 59 | * Step 7: Now you can use your customized yaml file to deploy your npd-kdebug by following [How to deploy npd-kdebug](#how-to-deploy-npd-kdebug) -------------------------------------------------------------------------------- /pkg/tools/aadssh/aadssh.go: -------------------------------------------------------------------------------- 1 | package aadssh 2 | 3 | import ( 4 | "encoding/base64" 5 | "fmt" 6 | "os" 7 | "os/exec" 8 | "path" 9 | 10 | flags "github.com/jessevdk/go-flags" 11 | log "github.com/sirupsen/logrus" 12 | "golang.org/x/crypto/ssh" 13 | 14 | "github.com/Azure/kdebug/pkg/base" 15 | ) 16 | 17 | const ( 18 | // Extracted from Azure CLI code 19 | AzureCLIClientId = "04b07795-8ddb-461a-bbee-02f9e1bf7b46" 20 | AzureCLIDirName = ".azure" 21 | AzureCLITokenCacheFileName = "msal_token_cache.json" 22 | SSHDirName = ".aad-ssh" 23 | SSHPrivateKeyName = "id_rsa" 24 | SSHPublicKeyName = "id_rsa.pub" 25 | SSHCertificateName = "id_rsa-cert.pub" 26 | ) 27 | 28 | type AadSsh struct { 29 | } 30 | 31 | type Config struct { 32 | Cloud string `long:"cloud" description:"Azure cloud name. 
Support values are: azurecloud, azurechinacloud, azureusgovernment"` 33 | UseAzureCLI bool `long:"use-azure-cli" description:"Use Azure CLI credentials"` 34 | } 35 | 36 | func New() *AadSsh { 37 | return &AadSsh{} 38 | } 39 | 40 | func (c *AadSsh) Name() string { 41 | return "AAD SSH" 42 | } 43 | 44 | func (c *AadSsh) ParseArgs(ctx *base.ToolContext, args []string) error { 45 | var config Config 46 | remainingArgs, err := flags.ParseArgs(&config, args) 47 | if err != nil { 48 | return err 49 | } 50 | ctx.Args = remainingArgs 51 | ctx.Config = &config 52 | return nil 53 | } 54 | 55 | func (c *AadSsh) Run(ctx *base.ToolContext) error { 56 | config := ctx.Config.(*Config) 57 | 58 | if config.Cloud == "" { 59 | // Default to public cloud 60 | config.Cloud = "azurecloud" 61 | } 62 | 63 | // Ensure key dir 64 | sshDir, err := ensureSSHKeyDir(SSHDirName) 65 | if err != nil { 66 | return fmt.Errorf("Fail to ensure SSH directory: %+v", err) 67 | } 68 | 69 | // Load SSH private key 70 | sshPrivKeyPath := path.Join(sshDir, SSHPrivateKeyName) 71 | sshPrivKey, err := createOrLoadSSHPrivateKey(sshPrivKeyPath) 72 | if err != nil { 73 | return fmt.Errorf("Fail to create or load SSH private key: %+v", err) 74 | } 75 | log.WithFields(log.Fields{"path": sshPrivKeyPath}).Info("Loaded SSH private key") 76 | 77 | // Save SSH public key 78 | sshPubKey, err := ssh.NewPublicKey(&sshPrivKey.PublicKey) 79 | if err != nil { 80 | return fmt.Errorf("Fail to create SSH public key: %+v", err) 81 | } 82 | sshPubKeyPath := path.Join(sshDir, SSHPublicKeyName) 83 | if err = saveSSHPublicKey(sshPubKey, sshPubKeyPath); err != nil { 84 | return fmt.Errorf("Fail to save SSH public key: %+v", err) 85 | } 86 | log.WithFields(log.Fields{"path": sshPubKeyPath}).Info("Saved SSH public key") 87 | 88 | // Try existing certificate 89 | sshCertPath := path.Join(sshDir, SSHCertificateName) 90 | sshCert, err := loadSSHCertificate(sshCertPath) 91 | if err != nil { 92 | log.WithFields(log.Fields{"error": 
err}).Debug("Fail to load existing SSH certificate") 93 | log.Info("Acquire a new SSH certificate from AAD") 94 | 95 | // Acquire a certificate from AAD 96 | sshCert, err = acquireCertificate(config.Cloud, config.UseAzureCLI, sshPubKey) 97 | if err != nil { 98 | return fmt.Errorf("Fail to acquire SSH certificate from AAD: %+v", err) 99 | } 100 | 101 | // Save SSH certificate to file 102 | sshCertContent := ssh.CertAlgoRSAv01 + " " + base64.StdEncoding.EncodeToString(sshCert.Marshal()) 103 | if err = saveSSHCertificate(sshCertContent, sshCertPath); err != nil { 104 | return fmt.Errorf("Fail to save SSH certificate: %+v", err) 105 | } 106 | log.WithFields(log.Fields{"path": sshCertPath}).Info("Saved SSH certificate") 107 | } else { 108 | log.WithFields(log.Fields{"path": sshCertPath}).Info("Loaded valid SSH certificate") 109 | } 110 | 111 | // Add SSH key to SSH agent 112 | sshAuthSock := os.Getenv("SSH_AUTH_SOCK") 113 | if sshAuthSock != "" { 114 | if err = addSSHKeyToAgent(sshAuthSock, sshPrivKey, sshCert); err != nil { 115 | return fmt.Errorf("Fail to add SSH key to agent: %+v", err) 116 | } 117 | log.WithFields(log.Fields{"path": sshPrivKeyPath}).Info("Added SSH key to agent") 118 | } 119 | 120 | // Call SSH there are remaining args 121 | if len(ctx.Args) > 0 { 122 | args := getSSHArgs(ctx.Args, sshPrivKeyPath, sshAuthSock != "") 123 | log.WithFields(log.Fields{"args": args}).Info("Starting SSH") 124 | cmd := exec.Command("ssh", args...) 
125 | cmd.Stdin = os.Stdin 126 | cmd.Stdout = os.Stdout 127 | cmd.Stderr = os.Stderr 128 | if err = cmd.Start(); err != nil { 129 | return fmt.Errorf("Fail to start SSH: %+v", err) 130 | } 131 | cmd.Wait() 132 | } 133 | 134 | return nil 135 | } 136 | -------------------------------------------------------------------------------- /pkg/checkers/kmscachesize/kms_cache_size.go: -------------------------------------------------------------------------------- 1 | package kmscachesize 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "os" 8 | "strings" 9 | 10 | "github.com/shirou/gopsutil/v3/process" 11 | log "github.com/sirupsen/logrus" 12 | "gopkg.in/yaml.v3" 13 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 14 | 15 | "github.com/Azure/kdebug/pkg/base" 16 | ) 17 | 18 | var helpLink = []string{ 19 | "https://kubernetes.io/docs/tasks/administer-cluster/kms-provider/#configuring-the-kms-provider-kms-v2", 20 | } 21 | 22 | const cacheSizeAlertThreshold = 0.8 23 | const kmsConfigCmd = "--encryption-provider-config=" 24 | 25 | type encConfig struct { 26 | Resources []encResource `yaml:"resources"` 27 | } 28 | 29 | type encResource struct { 30 | Providers []encProvider `yaml:"providers"` 31 | } 32 | 33 | type encProvider struct { 34 | Kms encKms `yaml:"kms"` 35 | } 36 | 37 | type encKms struct { 38 | CacheSize int `yaml:"cachesize"` 39 | } 40 | 41 | type KMSCacheSizeChecker struct { 42 | } 43 | 44 | func (c *KMSCacheSizeChecker) Name() string { 45 | return "KMSCacheSize" 46 | } 47 | 48 | func New() *KMSCacheSizeChecker { 49 | return &KMSCacheSizeChecker{} 50 | } 51 | 52 | func (c *KMSCacheSizeChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) { 53 | if !ctx.Environment.HasFlag("linux") { 54 | log.Debugf("Skip %s checker in non-linux os", c.Name()) 55 | return []*base.CheckResult{}, nil 56 | } 57 | 58 | if ctx.KubeClient == nil { 59 | log.Debugf("Skip %s checker due to no kube config provided", c.Name()) 60 | return []*base.CheckResult{}, nil 61 | } 62 
| 63 | kmsConfigPath, err := getKmsConfigPath() 64 | if err != nil { 65 | log.Debugf("Cannot find KMS config file: %s", err) 66 | return []*base.CheckResult{}, nil 67 | } 68 | 69 | cacheSize, err := getKmsCacheSize(kmsConfigPath) 70 | if err != nil { 71 | return nil, err 72 | } 73 | 74 | log.Debugf("KMS cache size: %d", cacheSize) 75 | 76 | if cacheSize == 0 { 77 | log.Debugf("There's no limit for KMS cache size") 78 | return []*base.CheckResult{}, nil 79 | } 80 | 81 | secretsCount, err := c.getCurrentSecretsCount(ctx) 82 | if err != nil { 83 | return nil, err 84 | } 85 | 86 | log.Debugf("Secrets count: %d", secretsCount) 87 | 88 | result := &base.CheckResult{ 89 | Checker: c.Name(), 90 | Description: fmt.Sprintf("Current secrets:%d, cache size:%d.", secretsCount, cacheSize), 91 | } 92 | 93 | if float32(secretsCount) > (float32(cacheSize) * cacheSizeAlertThreshold) { 94 | result.Error = fmt.Sprintf("KMS cache size is insufficient.") 95 | result.Description += fmt.Sprintf(" When number of secrets exceeds KMS cache size, Kubernetes may suffer frmo significant performance issue.") 96 | result.HelpLinks = helpLink 97 | } 98 | 99 | return []*base.CheckResult{result}, nil 100 | } 101 | 102 | func getKmsConfigPath() (string, error) { 103 | procs, err := process.Processes() 104 | if err != nil { 105 | return "", err 106 | } 107 | 108 | for _, proc := range procs { 109 | procName, err := proc.Name() 110 | if err != nil { 111 | log.Errorf("Fail get proc name for pid: %d", proc.Pid) 112 | continue 113 | } 114 | 115 | if strings.ToLower(procName) == "kube-apiserver" { 116 | cmds, err := proc.CmdlineSlice() 117 | if err != nil { 118 | log.Errorf("Fail get proc cmdline for: %s", procName) 119 | continue 120 | } 121 | 122 | for _, cmd := range cmds { 123 | if strings.HasPrefix(cmd, kmsConfigCmd) { 124 | return cmd[len(kmsConfigCmd):], nil 125 | } 126 | } 127 | 128 | return "", errors.New("API server doesn't have KMS configured") 129 | } 130 | } 131 | 132 | return "", 
errors.New("Fail to find api server process") 133 | } 134 | 135 | func getKmsCacheSize(path string) (int, error) { 136 | f, err := os.Open(path) 137 | if err != nil { 138 | return 0, err 139 | } 140 | defer f.Close() 141 | 142 | decoder := yaml.NewDecoder(f) 143 | var config encConfig 144 | err = decoder.Decode(&config) 145 | if err != nil { 146 | return 0, err 147 | } 148 | 149 | if len(config.Resources) > 0 && len(config.Resources[0].Providers) > 0 { 150 | return config.Resources[0].Providers[0].Kms.CacheSize, nil 151 | } else { 152 | return 0, fmt.Errorf("Fail to parse cache size from kms config: %s", path) 153 | } 154 | } 155 | 156 | func (c *KMSCacheSizeChecker) getCurrentSecretsCount(ctx *base.CheckContext) (int, error) { 157 | client := ctx.KubeClient 158 | secrets, err := client.CoreV1().Secrets("").List(context.TODO(), metav1.ListOptions{}) 159 | if err != nil { 160 | return 0, fmt.Errorf("Fail to list secrets from Kubernetes: %s", err) 161 | } 162 | return len(secrets.Items), nil 163 | } 164 | -------------------------------------------------------------------------------- /pkg/tools/aadssh/token.go: -------------------------------------------------------------------------------- 1 | package aadssh 2 | 3 | import ( 4 | "context" 5 | "crypto/sha256" 6 | "encoding/base64" 7 | "encoding/hex" 8 | "encoding/json" 9 | "fmt" 10 | "net/http" 11 | "sort" 12 | "strings" 13 | "time" 14 | 15 | msal "github.com/AzureAD/microsoft-authentication-library-for-go/apps/public" 16 | log "github.com/sirupsen/logrus" 17 | "golang.org/x/crypto/ssh" 18 | ) 19 | 20 | var cloudToScope = map[string]string{ 21 | "azurecloud": "https://pas.windows.net/CheckMyAccess/Linux/.default", 22 | "azurechinacloud": "https://pas.chinacloudapi.cn/CheckMyAccess/Linux/.default", 23 | "azureusgovernment": "https://pasff.usgovcloudapi.net/CheckMyAccess/Linux/.default", 24 | } 25 | 26 | var cloudToAuthority = map[string]string{ 27 | "azurecloud": "https://login.microsoftonline.com/common", 28 | 
"azurechinacloud": "https://login.chinacloudapi.cn/common", 29 | "azureusgovernment": "https://login.microsoftonline.us/common", 30 | } 31 | 32 | // prepareRequestData prepares AAD token request data 33 | func prepareRequestData(sshPubKey ssh.PublicKey) (map[string]string, error) { 34 | exponentString, modulusString, err := parseSSHPublicKey(sshPubKey) 35 | if err != nil { 36 | return nil, fmt.Errorf("Fail to parse SSH public key due to: %+v", err) 37 | } 38 | 39 | hash := sha256.New() 40 | hash.Write([]byte(modulusString)) 41 | hash.Write([]byte(exponentString)) 42 | keyId := hex.EncodeToString(hash.Sum(nil)) 43 | jwk := map[string]string{ 44 | "kty": "RSA", 45 | "n": modulusString, 46 | "e": exponentString, 47 | "kid": keyId, 48 | } 49 | jwkJson, err := json.Marshal(jwk) 50 | if err != nil { 51 | return nil, fmt.Errorf("Fail to parse encode JWK payload due to: %+v", err) 52 | } 53 | 54 | data := map[string]string{ 55 | "token_type": "ssh-cert", 56 | "req_cnf": string(jwkJson), 57 | "key_id": keyId, 58 | } 59 | 60 | return data, nil 61 | } 62 | 63 | // getSupportedClouds returns supported cloud names 64 | func getSupportedClouds() []string { 65 | cloudNames := []string{} 66 | for n := range cloudToScope { 67 | cloudNames = append(cloudNames, n) 68 | } 69 | sort.Strings(cloudNames) 70 | return cloudNames 71 | } 72 | 73 | // acquireCertificate acquires SSH certificate from AAD 74 | func acquireCertificate(cloud string, useAzureCLI bool, sshPubKey ssh.PublicKey) (*ssh.Certificate, error) { 75 | // Prepare token request data 76 | data, err := prepareRequestData(sshPubKey) 77 | if err != nil { 78 | return nil, fmt.Errorf("Fail to prepare request data: %+v", err) 79 | } 80 | log.WithFields(log.Fields{ 81 | "data": data, 82 | }).Debug("Token request data") 83 | 84 | // Request token 85 | authority := cloudToAuthority[cloud] 86 | if authority == "" { 87 | return nil, fmt.Errorf("Unsupported cloud: %s. 
Supported clouds include %+v", cloud, getSupportedClouds()) 88 | } 89 | httpClient := &http.Client{ 90 | Timeout: time.Minute, 91 | Transport: &Transport{data: data}, 92 | } 93 | client, err := msal.New(AzureCLIClientId, 94 | msal.WithAuthority(authority), 95 | msal.WithHTTPClient(httpClient)) 96 | if err != nil { 97 | return nil, fmt.Errorf("Fail to create MSAL client: %+v", err) 98 | } 99 | 100 | scope := cloudToScope[strings.ToLower(cloud)] 101 | if scope == "" { 102 | return nil, fmt.Errorf("Unsupported cloud: %s. Supported clouds include %+v", cloud, getSupportedClouds()) 103 | } 104 | 105 | scopes := []string{scope} 106 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) 107 | defer cancel() 108 | var authResult msal.AuthResult 109 | if useAzureCLI { 110 | authResult, err = acquireTokenByAzureCLI(ctx, scopes, data) 111 | } else { 112 | authResult, err = client.AcquireTokenInteractive(ctx, scopes) 113 | } 114 | if err != nil { 115 | return nil, fmt.Errorf("Fail to create acquire AAD token: %+v", err) 116 | } 117 | 118 | log.WithFields(log.Fields{"authResult": fmt.Sprintf("%+v", authResult)}).Debug("Got AAD auth result") 119 | 120 | sshCertData, err := base64.StdEncoding.DecodeString(authResult.AccessToken) 121 | if err != nil { 122 | return nil, fmt.Errorf("Fail to base64 decode SSH certificate: %+v", err) 123 | } 124 | sshPub, err := ssh.ParsePublicKey(sshCertData) 125 | if err != nil { 126 | return nil, fmt.Errorf("Fail to parse SSH certificate: %+v", err) 127 | } 128 | sshCert, ok := sshPub.(*ssh.Certificate) 129 | if !ok { 130 | return nil, fmt.Errorf("Not a SSH certificate") 131 | } 132 | 133 | validBefore := time.Unix(int64(sshCert.ValidBefore), 0) 134 | log.WithFields(log.Fields{"validBefore": validBefore}).Info("Got SSH certificate. 
Re-run this command to obtain a new one after it expires.") 135 | 136 | return sshCert, nil 137 | } 138 | -------------------------------------------------------------------------------- /pkg/checkers/dns/dns_test.go: -------------------------------------------------------------------------------- 1 | package dns 2 | 3 | import ( 4 | "errors" 5 | "reflect" 6 | "testing" 7 | "time" 8 | 9 | "github.com/Azure/kdebug/pkg/base" 10 | "github.com/Azure/kdebug/pkg/env" 11 | "github.com/miekg/dns" 12 | ) 13 | 14 | type FakeDnsClient struct { 15 | r *dns.Msg 16 | e error 17 | m *dns.Msg 18 | a string 19 | } 20 | 21 | func (c *FakeDnsClient) Exchange(m *dns.Msg, a string) (r *dns.Msg, rtt time.Duration, err error) { 22 | c.m = m 23 | c.a = a 24 | return c.r, time.Duration(0), c.e 25 | } 26 | 27 | func TestCheckServer(t *testing.T) { 28 | client := &FakeDnsClient{ 29 | r: &dns.Msg{ 30 | MsgHdr: dns.MsgHdr{ 31 | Rcode: dns.RcodeSuccess, 32 | }, 33 | }, 34 | } 35 | checker := &DnsChecker{ 36 | client: client, 37 | } 38 | r, err := checker.checkServer(GoogleDnsServer, "www.bing.com") 39 | if err != nil { 40 | t.Errorf("expect no error but got: %+v", err) 41 | } 42 | if !r.Ok() { 43 | t.Errorf("expect ok but not") 44 | } 45 | if client.a != GoogleDnsServer.Server+":53" { 46 | t.Errorf("dns request server is wrong: %s", client.a) 47 | } 48 | if client.m.Question[0].String() != ";www.bing.com.\tIN\t A" { 49 | t.Errorf("wrong dns question: %s", client.m.Question[0].String()) 50 | } 51 | } 52 | 53 | func TestCheckServerBadRcode(t *testing.T) { 54 | client := &FakeDnsClient{ 55 | r: &dns.Msg{ 56 | MsgHdr: dns.MsgHdr{ 57 | Rcode: dns.RcodeServerFailure, 58 | }, 59 | }, 60 | } 61 | checker := &DnsChecker{ 62 | client: client, 63 | } 64 | r, err := checker.checkServer(GoogleDnsServer, "www.bing.com") 65 | if err != nil { 66 | t.Errorf("expect no error but got: %+v", err) 67 | } 68 | if r.Ok() { 69 | t.Errorf("expect not ok") 70 | } 71 | if r.Error == "" || r.Description == "" || 72 | 
!reflect.DeepEqual(r.Recommendations, GoogleDnsServer.Recommendations) || 73 | !reflect.DeepEqual(r.HelpLinks, GoogleDnsServer.HelpLinks) { 74 | t.Errorf("unexpected result") 75 | } 76 | if client.a != GoogleDnsServer.Server+":53" { 77 | t.Errorf("dns request server is wrong: %s", client.a) 78 | } 79 | if client.m.Question[0].String() != ";www.bing.com.\tIN\t A" { 80 | t.Errorf("wrong dns question: %s", client.m.Question[0].String()) 81 | } 82 | } 83 | 84 | func TestCheckServerError(t *testing.T) { 85 | client := &FakeDnsClient{ 86 | e: errors.New("err"), 87 | } 88 | checker := &DnsChecker{ 89 | client: client, 90 | } 91 | r, err := checker.checkServer(GoogleDnsServer, "www.bing.com") 92 | if err != nil { 93 | t.Errorf("expect no error but got: %+v", err) 94 | } 95 | if r.Ok() { 96 | t.Errorf("expect not ok") 97 | } 98 | if r.Error == "" || r.Description != "err" || 99 | !reflect.DeepEqual(r.Recommendations, GoogleDnsServer.Recommendations) || 100 | !reflect.DeepEqual(r.HelpLinks, GoogleDnsServer.HelpLinks) { 101 | t.Errorf("unexpected result") 102 | } 103 | if client.a != GoogleDnsServer.Server+":53" { 104 | t.Errorf("dns request server is wrong: %s", client.a) 105 | } 106 | if client.m.Question[0].String() != ";www.bing.com.\tIN\t A" { 107 | t.Errorf("wrong dns question: %s", client.m.Question[0].String()) 108 | } 109 | } 110 | 111 | func TestGetCheckTargets(t *testing.T) { 112 | { 113 | e := &env.StaticEnvironment{ 114 | Flags: []string{"ubuntu"}, 115 | } 116 | servers := getCheckTargets(e) 117 | if !reflect.DeepEqual(servers, []DnsServer{GoogleDnsServer, SystemdResolvedDnsServer}) { 118 | t.Errorf("unexpected check targets on 'ubuntu'") 119 | } 120 | } 121 | 122 | { 123 | e := &env.StaticEnvironment{ 124 | Flags: []string{"azure"}, 125 | } 126 | servers := getCheckTargets(e) 127 | if !reflect.DeepEqual(servers, 128 | []DnsServer{GoogleDnsServer, AzureDnsServer, AksCoreDnsServerPublic, AksCoreDnsServerInCluster}) { 129 | t.Errorf("unexpected check targets on 
'azure'") 130 | } 131 | } 132 | 133 | { 134 | e := &env.StaticEnvironment{ 135 | Flags: []string{""}, 136 | } 137 | servers := getCheckTargets(e) 138 | if !reflect.DeepEqual(servers, []DnsServer{GoogleDnsServer}) { 139 | t.Errorf("unexpected check targets on ''") 140 | } 141 | } 142 | } 143 | 144 | func TestCheck(t *testing.T) { 145 | client := &FakeDnsClient{ 146 | r: &dns.Msg{ 147 | MsgHdr: dns.MsgHdr{ 148 | Rcode: dns.RcodeSuccess, 149 | }, 150 | }, 151 | } 152 | checker := &DnsChecker{ 153 | client: client, 154 | } 155 | 156 | ctx := &base.CheckContext{ 157 | Environment: &env.StaticEnvironment{ 158 | Flags: []string{"ubuntu"}, 159 | }, 160 | } 161 | r, err := checker.Check(ctx) 162 | if err != nil { 163 | t.Errorf("expect no error but got: %+v", err) 164 | } 165 | if len(r) != 4 { 166 | t.Errorf("expect 4 results but got %d", len(r)) 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /deploy/node-problem-detector/node-problem-detector.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: node-problem-detector 5 | namespace: kube-system 6 | labels: 7 | app: node-problem-detector 8 | spec: 9 | selector: 10 | matchLabels: 11 | app: node-problem-detector 12 | template: 13 | metadata: 14 | labels: 15 | app: node-problem-detector 16 | spec: 17 | serviceAccountName: node-problem-detector 18 | affinity: 19 | nodeAffinity: 20 | requiredDuringSchedulingIgnoredDuringExecution: 21 | nodeSelectorTerms: 22 | - matchExpressions: 23 | - key: kubernetes.io/os 24 | operator: In 25 | values: 26 | - linux 27 | initContainers: 28 | - name: download-kdebug 29 | image: k8s.gcr.io/busybox:1.27 30 | command: 31 | - 'sh' 32 | - '-c' 33 | - 'wget -O /opt/kdebug/kdebug https://github.com/Azure/kdebug/releases/download/v0.4-beta-1/kdebug && chmod +x /opt/kdebug/kdebug' 34 | volumeMounts: 35 | - name: kdebug 36 | mountPath: /opt/kdebug 37 
| containers: 38 | - name: node-problem-detector 39 | command: 40 | - /node-problem-detector 41 | - --logtostderr 42 | - --config.custom-plugin-monitor=/config/kdebug-dns.json 43 | - --apiserver-override=kubernetes 44 | image: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.7 45 | resources: 46 | limits: 47 | cpu: 10m 48 | memory: 80Mi 49 | requests: 50 | cpu: 10m 51 | memory: 80Mi 52 | imagePullPolicy: Always 53 | securityContext: 54 | privileged: true 55 | env: 56 | - name: NODE_NAME 57 | valueFrom: 58 | fieldRef: 59 | fieldPath: spec.nodeName 60 | volumeMounts: 61 | # Make sure node problem detector is in the same timezone 62 | # with the host. 63 | - name: localtime 64 | mountPath: /etc/localtime 65 | readOnly: true 66 | - name: config 67 | mountPath: /config 68 | readOnly: true 69 | - name: kdebug 70 | mountPath: /opt/kdebug 71 | readOnly: true 72 | volumes: 73 | - name: localtime 74 | hostPath: 75 | path: /etc/localtime 76 | - name: config 77 | configMap: 78 | name: node-problem-detector-config 79 | - name: kdebug 80 | emptyDir: {} 81 | tolerations: 82 | - effect: NoSchedule 83 | operator: Exists 84 | - effect: NoExecute 85 | operator: Exists 86 | --- 87 | apiVersion: v1 88 | kind: ConfigMap 89 | metadata: 90 | name: node-problem-detector-config 91 | namespace: kube-system 92 | data: 93 | kdebug-dns.json: | 94 | { 95 | "plugin": "custom", 96 | "pluginConfig": { 97 | "invoke_interval": "30s", 98 | "timeout": "30s", 99 | "max_output_length": 80, 100 | "concurrency": 3, 101 | "enable_message_change_based_condition_update": false 102 | }, 103 | "source": "kdebug-dns", 104 | "metricsReporting": true, 105 | "conditions": [ 106 | { 107 | "type": "DNSProblem", 108 | "reason": "DNSChecksPass", 109 | "message": "No DNS problem found" 110 | } 111 | ], 112 | "rules": [ 113 | { 114 | "type": "temporary", 115 | "reason": "DNSHasProblem", 116 | "path": "/opt/kdebug/kdebug", 117 | "args": [ 118 | "-c", 119 | "dns", 120 | "-f", 121 | "oneline" 122 | ] 123 | }, 124 
| { 125 | "type": "permanent", 126 | "condition": "DNSProblem", 127 | "reason": "DNSHasProblem", 128 | "path": "/opt/kdebug/kdebug", 129 | "args": [ 130 | "-c", 131 | "dns", 132 | "-f", 133 | "oneline" 134 | ] 135 | } 136 | ] 137 | } 138 | --- 139 | apiVersion: v1 140 | kind: ServiceAccount 141 | metadata: 142 | name: node-problem-detector 143 | labels: 144 | app: node-problem-detector 145 | namespace: kube-system 146 | --- 147 | apiVersion: rbac.authorization.k8s.io/v1 148 | kind: ClusterRoleBinding 149 | metadata: 150 | name: node-problem-detector 151 | labels: 152 | app: node-problem-detector 153 | subjects: 154 | - kind: ServiceAccount 155 | name: node-problem-detector 156 | namespace: kube-system 157 | roleRef: 158 | kind: ClusterRole 159 | name: system:node-problem-detector 160 | apiGroup: rbac.authorization.k8s.io 161 | -------------------------------------------------------------------------------- /deploy/node-problem-detector/node-problem-detector-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: node-problem-detector 5 | namespace: kube-system 6 | labels: 7 | app: node-problem-detector 8 | spec: 9 | selector: 10 | matchLabels: 11 | app: node-problem-detector 12 | template: 13 | metadata: 14 | labels: 15 | app: node-problem-detector 16 | spec: 17 | serviceAccountName: node-problem-detector 18 | affinity: 19 | nodeAffinity: 20 | requiredDuringSchedulingIgnoredDuringExecution: 21 | nodeSelectorTerms: 22 | - matchExpressions: 23 | - key: kubernetes.io/os 24 | operator: In 25 | values: 26 | - linux 27 | initContainers: 28 | - name: download-kdebug 29 | image: k8s.gcr.io/busybox:1.27 30 | command: 31 | - 'sh' 32 | - '-c' 33 | - 'wget -O /opt/kdebug/kdebug https://github.com/Azure/kdebug/releases/download/v0.4-beta-1/kdebug && chmod +x /opt/kdebug/kdebug' 34 | volumeMounts: 35 | - name: kdebug 36 | mountPath: /opt/kdebug 37 | containers: 38 | - 
name: node-problem-detector 39 | command: 40 | - /node-problem-detector 41 | - --logtostderr 42 | - --config.custom-plugin-monitor=/config/.json 43 | - --apiserver-override=kubernetes 44 | image: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.7 45 | resources: 46 | limits: 47 | cpu: 10m 48 | memory: 80Mi 49 | requests: 50 | cpu: 10m 51 | memory: 80Mi 52 | imagePullPolicy: Always 53 | securityContext: 54 | privileged: true 55 | env: 56 | - name: NODE_NAME 57 | valueFrom: 58 | fieldRef: 59 | fieldPath: spec.nodeName 60 | volumeMounts: 61 | # Make sure node problem detector is in the same timezone 62 | # with the host. 63 | - name: localtime 64 | mountPath: /etc/localtime 65 | readOnly: true 66 | - name: config 67 | mountPath: /config 68 | readOnly: true 69 | - name: kdebug 70 | mountPath: /opt/kdebug 71 | readOnly: true 72 | volumes: 73 | - name: localtime 74 | hostPath: 75 | path: /etc/localtime 76 | - name: config 77 | configMap: 78 | name: node-problem-detector-config 79 | - name: kdebug 80 | emptyDir: {} 81 | tolerations: 82 | - effect: NoSchedule 83 | operator: Exists 84 | - effect: NoExecute 85 | operator: Exists 86 | --- 87 | apiVersion: v1 88 | kind: ConfigMap 89 | metadata: 90 | name: node-problem-detector-config 91 | namespace: kube-system 92 | data: 93 | .json: | 94 | { 95 | "plugin": "custom", 96 | "pluginConfig": { 97 | "invoke_interval": "30s", 98 | "timeout": "30s", 99 | "max_output_length": 80, 100 | "concurrency": 3, 101 | "enable_message_change_based_condition_update": false 102 | }, 103 | "source": "", 104 | "metricsReporting": true, 105 | "conditions": [ 106 | { 107 | "type": "", 108 | "reason": "ChecksPass", 109 | "message": "No problem found" 110 | } 111 | ], 112 | "rules": [ 113 | { 114 | "type": "temporary", 115 | "reason": "HasProblem", 116 | "path": "/opt/kdebug/kdebug", 117 | "args": [ 118 | "-c", 119 | "", 120 | "-f", 121 | "oneline" 122 | ] 123 | }, 124 | { 125 | "type": "permanent", 126 | "condition": "", 127 | "reason": 
// ensureSSHKeyDir makes sure the directory dirName exists under the current
// user's home directory and returns its path.
//
// The directory is created with mode 0700 when it does not exist yet. Any
// error from looking up the home directory, from inspecting the path, or from
// creating the directory is returned together with an empty path.
func ensureSSHKeyDir(dirName string) (string, error) {
	home, err := os.UserHomeDir()
	if err != nil {
		return "", err
	}

	keyDir := path.Join(home, dirName)
	_, statErr := os.Stat(keyDir)
	switch {
	case statErr == nil:
		// Already there, nothing to create.
		return keyDir, nil
	case !os.IsNotExist(statErr):
		return "", statErr
	}

	if mkdirErr := os.Mkdir(keyDir, 0700); mkdirErr != nil {
		return "", mkdirErr
	}
	return keyDir, nil
}
| f, err := os.Open(keyPath) 45 | if err != nil { 46 | return nil, err 47 | } 48 | defer f.Close() 49 | 50 | content, err := ioutil.ReadAll(f) 51 | if err != nil { 52 | return nil, err 53 | } 54 | 55 | block, _ := pem.Decode(content) 56 | if block == nil { 57 | return nil, fmt.Errorf("Empty PEM block") 58 | } 59 | 60 | return x509.ParsePKCS1PrivateKey(block.Bytes) 61 | } else { 62 | if os.IsNotExist(err) { 63 | key, err := rsa.GenerateKey(rand.Reader, 4096) 64 | if err != nil { 65 | return nil, err 66 | } 67 | 68 | der := x509.MarshalPKCS1PrivateKey(key) 69 | content := pem.EncodeToMemory(&pem.Block{ 70 | Type: "RSA PRIVATE KEY", 71 | Bytes: der, 72 | }) 73 | err = os.WriteFile(keyPath, content, 0600) 74 | if err != nil { 75 | return nil, err 76 | } 77 | 78 | return key, nil 79 | } else { 80 | return nil, err 81 | } 82 | } 83 | } 84 | 85 | // parseSSHPublicKey parses exponent and modulus part from SSH public key 86 | // returns base64 encoded exponent and modulus 87 | func parseSSHPublicKey(pubKey ssh.PublicKey) (e string, n string, err error) { 88 | keyBytes := pubKey.Marshal() 89 | // ,, 90 | fields := [][]byte{} 91 | 92 | read := 0 93 | for read < len(keyBytes) { 94 | length := int(binary.BigEndian.Uint32(keyBytes[read : read+4])) 95 | read += 4 96 | fields = append(fields, keyBytes[read:read+length]) 97 | read += length 98 | } 99 | 100 | return base64.RawURLEncoding.EncodeToString(fields[1]), 101 | base64.RawURLEncoding.EncodeToString(fields[2]), 102 | nil 103 | } 104 | 105 | // saveSSHPublicKey saves SSH public key to file 106 | func saveSSHPublicKey(key ssh.PublicKey, path string) error { 107 | content := ssh.MarshalAuthorizedKey(key) 108 | return os.WriteFile(path, content, 0600) 109 | } 110 | 111 | // loadSSHCertificate loads SSH certificate from file 112 | func loadSSHCertificate(path string) (*ssh.Certificate, error) { 113 | f, err := os.Open(path) 114 | if err != nil { 115 | return nil, fmt.Errorf("Fail to open SSH certificate file: %+v", err) 116 | } 
117 | defer f.Close() 118 | 119 | content, err := ioutil.ReadAll(f) 120 | if err != nil { 121 | return nil, fmt.Errorf("Fail to read SSH certificate file: %+v", err) 122 | } 123 | 124 | parts := strings.Split(string(content), " ") 125 | if len(parts) < 2 { 126 | return nil, fmt.Errorf("SSH certificate file is in bad format") 127 | } 128 | 129 | data, err := base64.StdEncoding.DecodeString(parts[1]) 130 | if err != nil { 131 | return nil, fmt.Errorf("Fail to decode SSH certificate: %+v", err) 132 | } 133 | 134 | pubKey, err := ssh.ParsePublicKey(data) 135 | if err != nil { 136 | return nil, fmt.Errorf("Fail to parse SSH certificate: %+v", err) 137 | } 138 | 139 | sshCert, ok := pubKey.(*ssh.Certificate) 140 | if !ok { 141 | return nil, fmt.Errorf("Not a SSH certificate") 142 | } 143 | 144 | validBefore := time.Unix(int64(sshCert.ValidBefore), 0) 145 | validAfter := time.Unix(int64(sshCert.ValidAfter), 0) 146 | valid := time.Now().Before(validBefore) && time.Now().After(validAfter) 147 | if !valid { 148 | return nil, fmt.Errorf("SSH certificate has expired. Valid before: %s. 
// getSSHArgs returns command line arguments when calling SSH command.
//
// The result starts with a copy of inputArgs. "-A" is appended when
// useSSHAgent is true and inputArgs does not already contain "-A". "-i"
// followed by sshPrivKeyPath is appended when useSSHAgent is false and
// inputArgs does not already contain "-i".
//
// The returned slice never shares its backing array with inputArgs, so the
// caller's slice is left untouched.
func getSSHArgs(inputArgs []string, sshPrivKeyPath string, useSSHAgent bool) []string {
	// Appending to inputArgs directly could write into the caller's backing
	// array when it has spare capacity, so work on a private copy. Two extra
	// slots cover the largest possible addition ("-i" plus the key path).
	args := make([]string, len(inputArgs), len(inputArgs)+2)
	copy(args, inputArgs)

	hasAgentFlag, hasIdentityFlag := false, false
	for _, arg := range inputArgs {
		switch arg {
		case "-A":
			hasAgentFlag = true
		case "-i":
			hasIdentityFlag = true
		}
	}

	if useSSHAgent {
		if !hasAgentFlag {
			args = append(args, "-A")
		}
		return args
	}

	if !hasIdentityFlag {
		args = append(args, "-i", sshPrivKeyPath)
	}
	return args
}
15 | ) 16 | 17 | var ( 18 | GoogleDnsServer = DnsServer{ 19 | Name: "Google DNS", 20 | Server: "8.8.8.8", 21 | Queries: []string{ 22 | "www.google.com", 23 | "www.bing.com", 24 | }, 25 | Recommendations: []string{PublicDnsRecommendation}, 26 | HelpLinks: []string{ 27 | "https://developers.google.com/speed/public-dns", 28 | }, 29 | } 30 | AzureDnsServer = DnsServer{ 31 | Name: "Azure DNS", 32 | Server: "168.63.129.16", 33 | Queries: []string{ 34 | "www.google.com", 35 | "www.bing.com", 36 | }, 37 | Recommendations: []string{ 38 | PublicDnsRecommendation, 39 | "VM might be on a bad host. Try to `redeploy` it.", 40 | }, 41 | HelpLinks: []string{ 42 | "https://docs.microsoft.com/en-us/azure/virtual-network/what-is-ip-address-168-63-129-16", 43 | "https://docs.microsoft.com/en-us/azure/virtual-network/virtual-networks-name-resolution-for-vms-and-role-instances#azure-provided-name-resolution", 44 | }, 45 | } 46 | AksCoreDnsServerPublic = DnsServer{ 47 | Name: "AKS CoreDNS", 48 | Server: "10.0.0.10", 49 | Queries: []string{ 50 | "www.google.com", 51 | "www.bing.com", 52 | }, 53 | Recommendations: []string{ 54 | PublicDnsRecommendation, 55 | }, 56 | HelpLinks: []string{ 57 | "https://kubernetes.io/docs/tasks/administer-cluster/dns-custom-nameservers/", 58 | "https://kubernetes.io/docs/tasks/administer-cluster/coredns/", 59 | "https://coredns.io/plugins/kubernetes/", 60 | }, 61 | } 62 | AksCoreDnsServerInCluster = DnsServer{ 63 | Name: "AKS CoreDNS", 64 | Server: "10.0.0.10", 65 | Queries: []string{ 66 | "kubernetes.default.svc.cluster.local", 67 | }, 68 | Recommendations: []string{ 69 | CoreDnsRecommendation, 70 | }, 71 | HelpLinks: []string{ 72 | "https://kubernetes.io/docs/tasks/administer-cluster/dns-custom-nameservers/", 73 | "https://kubernetes.io/docs/tasks/administer-cluster/coredns/", 74 | "https://coredns.io/plugins/kubernetes/", 75 | }, 76 | } 77 | SystemdResolvedDnsServer = DnsServer{ 78 | Name: "systemd-resolved", 79 | Server: "127.0.0.53", 80 | Queries: 
[]string{ 81 | "www.google.com", 82 | "www.bing.com", 83 | }, 84 | Recommendations: []string{ 85 | "systemd-resolved service might not be running. Check by running `sudo systemctl status systemd-resolved`.", 86 | }, 87 | HelpLinks: []string{ 88 | "https://www.freedesktop.org/software/systemd/man/systemd-resolved.service.html", 89 | }, 90 | } 91 | ) 92 | 93 | type DnsServer struct { 94 | Name string 95 | Server string 96 | Queries []string 97 | Recommendations []string 98 | HelpLinks []string 99 | } 100 | 101 | type DnsClient interface { 102 | Exchange(m *dns.Msg, a string) (r *dns.Msg, rtt time.Duration, err error) 103 | } 104 | 105 | type DnsChecker struct { 106 | client DnsClient 107 | } 108 | 109 | func New() *DnsChecker { 110 | return &DnsChecker{ 111 | client: &dns.Client{ 112 | Timeout: time.Second, 113 | }, 114 | } 115 | } 116 | 117 | func (c *DnsChecker) Name() string { 118 | return "Dns" 119 | } 120 | 121 | func (c *DnsChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) { 122 | result := []*base.CheckResult{} 123 | targets := getCheckTargets(ctx.Environment) 124 | for _, server := range targets { 125 | for _, query := range server.Queries { 126 | r, err := c.checkServer(server, query) 127 | if err != nil { 128 | return result, err 129 | } 130 | result = append(result, r) 131 | } 132 | } 133 | return result, nil 134 | } 135 | 136 | func getCheckTargets(e env.Environment) []DnsServer { 137 | targets := []DnsServer{ 138 | GoogleDnsServer, 139 | } 140 | 141 | if e.HasFlag("ubuntu") { 142 | targets = append(targets, SystemdResolvedDnsServer) 143 | } 144 | 145 | if e.HasFlag("azure") { 146 | targets = append(targets, 147 | AzureDnsServer, 148 | AksCoreDnsServerPublic, 149 | AksCoreDnsServerInCluster) 150 | } 151 | 152 | return targets 153 | } 154 | 155 | func (c *DnsChecker) checkServer(server DnsServer, query string) (*base.CheckResult, error) { 156 | m := new(dns.Msg) 157 | m.SetQuestion(query+".", dns.TypeA) 158 | m.RecursionDesired = true 159 
| r, _, err := c.client.Exchange(m, server.Server+":53") 160 | if err != nil { 161 | return &base.CheckResult{ 162 | Checker: c.Name(), 163 | Error: fmt.Sprintf("Fail to query domain name %s from server %s(%s)", 164 | query, server.Name, server.Server), 165 | Description: err.Error(), 166 | Recommendations: server.Recommendations, 167 | HelpLinks: server.HelpLinks, 168 | }, nil 169 | } 170 | if r.Rcode != dns.RcodeSuccess { 171 | return &base.CheckResult{ 172 | Checker: c.Name(), 173 | Error: fmt.Sprintf("Fail to query domain name %s from server %s(%s)", query, 174 | server.Name, server.Server), 175 | Description: fmt.Sprintf("Unexpected rcode: %d", r.Rcode), 176 | Recommendations: server.Recommendations, 177 | HelpLinks: server.HelpLinks, 178 | }, nil 179 | } 180 | return &base.CheckResult{ 181 | Checker: c.Name(), 182 | Description: fmt.Sprintf("Successfully query domain name %s from server %s(%s)", 183 | query, server.Name, server.Server), 184 | }, nil 185 | } 186 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/Azure/kdebug 2 | 3 | go 1.17 4 | 5 | require ( 6 | github.com/AzureAD/microsoft-authentication-library-for-go v0.6.1 7 | github.com/Microsoft/go-winio v0.5.2 8 | github.com/bramvdbogaerde/go-scp v1.2.0 9 | github.com/coreos/go-systemd/v22 v22.5.0 10 | github.com/dustin/go-humanize v1.0.0 11 | github.com/fatih/color v1.7.0 12 | github.com/jessevdk/go-flags v1.5.0 13 | github.com/mattn/go-isatty v0.0.14 14 | github.com/miekg/dns v1.1.43 15 | github.com/schollz/progressbar/v3 v3.8.6 16 | github.com/shirou/gopsutil/v3 v3.23.2 17 | github.com/sirupsen/logrus v1.8.1 18 | github.com/zcalusic/sysinfo v0.0.0-20210905121133-6fa2f969a900 19 | golang.org/x/crypto v0.1.0 20 | k8s.io/api v0.24.7 21 | k8s.io/apimachinery v0.24.7 22 | k8s.io/cli-runtime v0.24.7 23 | k8s.io/client-go v0.24.7 24 | k8s.io/kubectl v0.24.7 25 | ) 
26 | 27 | require ( 28 | github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect 29 | github.com/MakeNowJust/heredoc v0.0.0-20170808103936-bb23615498cd // indirect 30 | github.com/PuerkitoBio/purell v1.1.1 // indirect 31 | github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect 32 | github.com/chai2010/gettext-go v0.0.0-20160711120539-c6fed771bfd5 // indirect 33 | github.com/davecgh/go-spew v1.1.1 // indirect 34 | github.com/daviddengcn/go-colortext v0.0.0-20160507010035-511bcaf42ccd // indirect 35 | github.com/docker/distribution v2.8.1+incompatible // indirect 36 | github.com/emicklei/go-restful v2.16.0+incompatible // indirect 37 | github.com/evanphx/json-patch v4.12.0+incompatible // indirect 38 | github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d // indirect 39 | github.com/fatih/camelcase v1.0.0 // indirect 40 | github.com/fvbommel/sortorder v1.0.1 // indirect 41 | github.com/go-errors/errors v1.0.1 // indirect 42 | github.com/go-logr/logr v1.2.0 // indirect 43 | github.com/go-ole/go-ole v1.2.6 // indirect 44 | github.com/go-openapi/jsonpointer v0.19.5 // indirect 45 | github.com/go-openapi/jsonreference v0.19.5 // indirect 46 | github.com/go-openapi/swag v0.19.14 // indirect 47 | github.com/godbus/dbus/v5 v5.0.4 // indirect 48 | github.com/gogo/protobuf v1.3.2 // indirect 49 | github.com/golang-jwt/jwt/v4 v4.4.2 // indirect 50 | github.com/golang/protobuf v1.5.2 // indirect 51 | github.com/google/btree v1.0.1 // indirect 52 | github.com/google/gnostic v0.5.7-v3refs // indirect 53 | github.com/google/go-cmp v0.5.9 // indirect 54 | github.com/google/gofuzz v1.1.0 // indirect 55 | github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect 56 | github.com/google/uuid v1.3.0 // indirect 57 | github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect 58 | github.com/imdario/mergo v0.3.5 // indirect 59 | github.com/inconshreveable/mousetrap v1.0.0 // indirect 60 | 
github.com/jonboulle/clockwork v0.2.2 // indirect 61 | github.com/josharian/intern v1.0.0 // indirect 62 | github.com/json-iterator/go v1.1.12 // indirect 63 | github.com/kylelemons/godebug v1.1.0 // indirect 64 | github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect 65 | github.com/lithammer/dedent v1.1.0 // indirect 66 | github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect 67 | github.com/mailru/easyjson v0.7.6 // indirect 68 | github.com/mattn/go-colorable v0.0.9 // indirect 69 | github.com/mattn/go-runewidth v0.0.13 // indirect 70 | github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect 71 | github.com/mitchellh/go-wordwrap v1.0.0 // indirect 72 | github.com/moby/spdystream v0.2.0 // indirect 73 | github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6 // indirect 74 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 75 | github.com/modern-go/reflect2 v1.0.2 // indirect 76 | github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect 77 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 78 | github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect 79 | github.com/opencontainers/go-digest v1.0.0 // indirect 80 | github.com/peterbourgon/diskv v2.0.1+incompatible // indirect 81 | github.com/pkg/browser v0.0.0-20210115035449-ce105d075bb4 // indirect 82 | github.com/pkg/errors v0.9.1 // indirect 83 | github.com/pmezard/go-difflib v1.0.0 // indirect 84 | github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect 85 | github.com/rivo/uniseg v0.2.0 // indirect 86 | github.com/russross/blackfriday v1.5.2 // indirect 87 | github.com/spf13/cobra v1.4.0 // indirect 88 | github.com/spf13/pflag v1.0.5 // indirect 89 | github.com/stretchr/testify v1.8.2 // indirect 90 | github.com/tklauser/go-sysconf v0.3.11 // indirect 91 | github.com/tklauser/numcpus v0.6.0 // indirect 92 | 
github.com/xlab/treeprint v0.0.0-20181112141820-a009c3971eca // indirect 93 | github.com/yusufpapurcu/wmi v1.2.2 // indirect 94 | go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 // indirect 95 | golang.org/x/net v0.7.0 // indirect 96 | golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 // indirect 97 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 // indirect 98 | golang.org/x/sys v0.5.0 // indirect 99 | golang.org/x/term v0.5.0 // indirect 100 | golang.org/x/text v0.7.0 // indirect 101 | golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect 102 | google.golang.org/appengine v1.6.7 // indirect 103 | google.golang.org/protobuf v1.27.1 // indirect 104 | gopkg.in/inf.v0 v0.9.1 // indirect 105 | gopkg.in/yaml.v2 v2.4.0 // indirect 106 | k8s.io/component-base v0.24.7 // indirect 107 | k8s.io/component-helpers v0.24.7 // indirect 108 | k8s.io/klog/v2 v2.60.1 // indirect 109 | k8s.io/kube-openapi v0.0.0-20220328201542-3ee0da9b0b42 // indirect 110 | k8s.io/metrics v0.24.7 // indirect 111 | k8s.io/utils v0.0.0-20220210201930-3a6ce19ff2f9 // indirect 112 | sigs.k8s.io/json v0.0.0-20211208200746-9f7c6b3444d2 // indirect 113 | sigs.k8s.io/kustomize/api v0.11.4 // indirect 114 | sigs.k8s.io/kustomize/kustomize/v4 v4.5.4 // indirect 115 | sigs.k8s.io/kustomize/kyaml v0.13.6 // indirect 116 | sigs.k8s.io/structured-merge-diff/v4 v4.2.1 // indirect 117 | sigs.k8s.io/yaml v1.2.0 // indirect 118 | ) 119 | 120 | require ( 121 | github.com/c9s/goprocinfo v0.0.0-20210130143923-c95fcf8c64a8 122 | github.com/prometheus-community/pro-bing v0.1.0 123 | gopkg.in/yaml.v3 v3.0.1 124 | ) 125 | -------------------------------------------------------------------------------- /pkg/checkers/diskusage/diskusage.go: -------------------------------------------------------------------------------- 1 | package diskusage 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "reflect" 7 | "strconv" 8 | "strings" 9 | 10 | "os/exec" 11 | 12 | "github.com/Azure/kdebug/pkg/base" 13 | ) 14 | 15 
| const ( 16 | NoHighDiskUsageResult = "Disk usage is in normal size. No additional action required." 17 | HighUsageRecommandation = "Check files listed. If it's just log files or can be deleted, run bash command: `truncate -s 0 /path/to/file` to reduce disk usage. Note: `rm` will not really delete the file if it's opened by processes." 18 | FailedToRunCommand = "Failed to check disk usage with '%s'" 19 | NotSupportedOS = "The OS is not supported: %s" 20 | ) 21 | 22 | var ( 23 | DfHeaders = map[string][]string{ 24 | "LINUX": { 25 | "Filesystem", 26 | "Size", 27 | "Used", 28 | "Avail", 29 | "Use%", 30 | "Mounted", 31 | "on", 32 | }, 33 | "FREEBSD": { 34 | "Filesystem", 35 | "Size", 36 | "Used", 37 | "Avail", 38 | "Capacity", 39 | "Mounted", 40 | "on", 41 | }, 42 | } 43 | 44 | DiskUsageRateThreshold = 90 45 | InterestedBigFilePath = []string{ 46 | "/var/log", 47 | } 48 | InterestedBigFileNum = 10 49 | 50 | HighdfRecommandations = []string{HighUsageRecommandation} 51 | ) 52 | 53 | type DfRow struct { 54 | Filesystem string 55 | Size string 56 | Used string 57 | Avail string 58 | Use int 59 | MountedOn string 60 | } 61 | 62 | type DiskUsageChecker struct { 63 | } 64 | 65 | func New() *DiskUsageChecker { 66 | return &DiskUsageChecker{} 67 | } 68 | 69 | func (c *DiskUsageChecker) Name() string { 70 | return "DiskUsage" 71 | } 72 | 73 | func (c *DiskUsageChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) { 74 | result := []*base.CheckResult{} 75 | 76 | rst, err := c.getDiskUsage() 77 | if err != nil { 78 | return result, err 79 | } 80 | result = append(result, rst) 81 | 82 | return result, nil 83 | } 84 | 85 | func (c *DiskUsageChecker) getDiskUsage() (*base.CheckResult, error) { 86 | out, err := exec.Command("uname").Output() 87 | if err != nil { 88 | return &base.CheckResult{ 89 | Checker: c.Name(), 90 | Error: fmt.Sprintf(FailedToRunCommand, "uname"), 91 | Description: err.Error(), 92 | }, nil 93 | } 94 | 95 | uname := strings.TrimSpace(string(out)) 96 
| dfHeaders, ok := DfHeaders[strings.ToUpper(uname)] 97 | if !ok { 98 | return &base.CheckResult{ 99 | Checker: c.Name(), 100 | Error: fmt.Sprintf(NotSupportedOS, uname), 101 | }, nil 102 | } 103 | 104 | out, err = exec.Command("df", "-h").Output() 105 | if err != nil { 106 | return &base.CheckResult{ 107 | Checker: c.Name(), 108 | Error: fmt.Sprintf(FailedToRunCommand, "df -h"), 109 | Description: err.Error(), 110 | }, nil 111 | } 112 | 113 | rows, err := parseDfResult(string(out), dfHeaders) 114 | if err != nil { 115 | return &base.CheckResult{ 116 | Checker: c.Name(), 117 | Error: FailedToRunCommand, 118 | Description: err.Error(), 119 | }, nil 120 | } 121 | 122 | found, row := getUsageAt("/", rows) 123 | if found && row.Use > DiskUsageRateThreshold { 124 | bigFiles := []string{} 125 | 126 | for _, path := range InterestedBigFilePath { 127 | output, err := FindTopSizeFiles(path, InterestedBigFileNum) 128 | if err != nil { 129 | return &base.CheckResult{ 130 | Checker: c.Name(), 131 | Description: FormatHighDfDescription(row), 132 | Error: err.Error(), 133 | Recommendations: HighdfRecommandations, 134 | }, nil 135 | } 136 | 137 | bigFiles = append(bigFiles, output) 138 | } 139 | 140 | return &base.CheckResult{ 141 | Checker: c.Name(), 142 | Error: "Disk is reaching high usage. 
Details: " + FormatHighDfDescription(row), 143 | Description: "\n" + strings.Join(bigFiles, "\n"), 144 | Recommendations: HighdfRecommandations, 145 | }, nil 146 | } 147 | 148 | return &base.CheckResult{ 149 | Checker: c.Name(), 150 | Description: fmt.Sprintf("%s Current %v%%, Threshold %v%%", NoHighDiskUsageResult, row.Use, DiskUsageRateThreshold), 151 | }, nil 152 | } 153 | 154 | func getUsageAt(path string, rows []DfRow) (bool, DfRow) { 155 | for _, row := range rows { 156 | if row.MountedOn == path { 157 | return true, row 158 | } 159 | } 160 | 161 | return false, DfRow{} 162 | } 163 | 164 | func parseDfResult(output string, dfHeaders []string) ([]DfRow, error) { 165 | lines := strings.Split(output, "\n") 166 | result := make([]DfRow, 0, len(lines)) 167 | 168 | for _, line := range lines { 169 | if len(line) == 0 { 170 | continue 171 | } 172 | 173 | ds := strings.Fields(strings.TrimSpace(line)) 174 | if ds[0] == dfHeaders[0] { 175 | // header 176 | if !reflect.DeepEqual(ds, dfHeaders) { 177 | return result, errors.New(fmt.Sprintf("Result in df has wrong header format. 
Expected %v, Actually %v", dfHeaders, ds)) 178 | } 179 | continue 180 | } 181 | 182 | row, err := parseDfRow(ds, dfHeaders) 183 | if err != nil { 184 | return nil, err 185 | } 186 | 187 | result = append(result, row) 188 | } 189 | 190 | return result, nil 191 | } 192 | 193 | func parseDfRow(row []string, dfHeader []string) (DfRow, error) { 194 | if len(row) != len(dfHeader)-1 { 195 | return DfRow{}, fmt.Errorf(`unexpected row column number %v (expected %v)`, row, dfHeader) 196 | } 197 | 198 | return DfRow{ 199 | Filesystem: strings.TrimSpace(row[0]), 200 | Size: strings.TrimSpace(row[1]), 201 | Used: strings.TrimSpace(row[2]), 202 | Avail: strings.TrimSpace(row[3]), 203 | Use: AtoiHepler(strings.TrimSpace(strings.Replace(row[4], "%", "", -1))), 204 | MountedOn: strings.TrimSpace(row[5]), 205 | }, nil 206 | } 207 | 208 | func AtoiHepler(s string) int { 209 | rst, _ := strconv.Atoi(s) 210 | return rst 211 | } 212 | 213 | func FormatHighDfDescription(row DfRow) string { 214 | return fmt.Sprintf("[Used %d%%] Filesystem: %s, UsedSize: %s, AvailableSize: %s, MountedOn %s", row.Use, row.Filesystem, row.Used, row.Avail, row.MountedOn) 215 | } 216 | 217 | func FindTopSizeFiles(path string, topCount int) (string, error) { 218 | commandline := fmt.Sprintf("du -ah %s | sort -rh | head -n %d", path, topCount) 219 | out, err := exec.Command("bash", "-c", commandline).Output() 220 | 221 | if err != nil { 222 | return "", err 223 | } 224 | 225 | return string(out), nil 226 | } 227 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # kdebug 2 | 3 | kdebug is a command line utility that helps troubleshoot a running Kubernetes cluster and apps in it. 
4 | 5 | It focuses on DevOps scenarios and covers these areas: 6 | 7 | * OS diagnostics 8 | * Kubernetes components diagnostics 9 | * Lightweight application diagnostics 10 | 11 | ## Check mode 12 | 13 | kdebug runs in check mode by default. 14 | By running a set of predefined checks, it gives diagnostics information and guides you to next steps. 15 | 16 | Currently kdebug supports the following checks: 17 | 18 | * Disk usage: Check disk usage and identify the top large files. 19 | * Disk read-only: Check if the user home directory is read-only. 20 | * DNS: Check cluster DNS. 21 | * HTTP: Check HTTP connectivity to well known endpoints. 22 | * Kube object size: Check configmap/secret object size. 23 | * Kube pod: Check pod restart reasons. 24 | * Kube pod scheduling: Check pods scheduling issues. 25 | * OOM: Analyze out-of-memory events. 26 | * System load: Check the CPU and Memory of VM and some primary processes (etcd, kubelet...) 27 | * TCP: Check if the TCP connection could be established to exposed services (external load balancer, internal cluster service) 28 | * Ping: Check if the icmp ping/pong could work towards public IP (8.8.8.8) and cluster IP (node, pod) 29 | * KMS cache size: Check if API server KMS cache size is sufficient to hold all secrets. 30 | 31 | ## How to use 32 | 33 | ### Basic 34 | 35 | Run all checks: 36 | 37 | ```bash 38 | kdebug 39 | ``` 40 | 41 | Run a specific check: 42 | 43 | ```bash 44 | kdebug -c dns 45 | ``` 46 | 47 | List available checks: 48 | 49 | ```bash 50 | kdebug --list 51 | ``` 52 | 53 | See full supported arguments and help: 54 | 55 | ```bash 56 | kdebug -h 57 | ``` 58 | 59 | ### Kubernetes checks 60 | 61 | Kubernetes related checks require a working kubeconfig.
You can either put it at the default location `$HOME/.kube/config`, or you can specify via `--kube-config-path`: 62 | 63 | ```bash 64 | kdebug -c kubepod \ 65 | --kube-config-path /path/to/kubeconfig 66 | ``` 67 | 68 | ### Batch mode 69 | 70 | kdebug supports running on a batch of remote machines simultaneously via SSH. 71 | 72 | Explicitly specify a list of machine names: 73 | 74 | ```bash 75 | kdebug -c dns \ 76 | --batch.machines=machine-1 \ 77 | --batch.machines=machine-2 \ 78 | --batch.concurrency=2 \ 79 | --batch.ssh-user=azureuser 80 | ``` 81 | 82 | Read machine names list from a file or stdin: 83 | 84 | ```bash 85 | # From file 86 | kdebug -c dns \ 87 | --batch.machines-file=/path/to/machine/names/file 88 | 89 | # From stdin 90 | kubectl get nodes | grep NotReady | awk '{print $1}' | kdebug -c dns --batch.machines-file=- 91 | ``` 92 | 93 | Auto discover machines list via Kubernetes API server. 94 | 95 | ```bash 96 | kdebug -c dns --batch.kube-machines 97 | ``` 98 | 99 | In addition, you can specify a label selector: 100 | 101 | ```bash 102 | kdebug -c dns \ 103 | --batch.kube-machines \ 104 | --batch.kube-machines-label=kubernetes.io/role=agent 105 | ``` 106 | 107 | Or select only unready nodes: 108 | 109 | ```bash 110 | kdebug -c dns \ 111 | --batch.kube-machines-unready 112 | ``` 113 | 114 | ## Tool mode 115 | 116 | In addition to the default check mode, kdebug also supports a tool mode. 117 | Tool mode wraps useful commands and makes them easier to use in typical scenarios. 118 | 119 | 120 | Currently kdebug provides the following tools: 121 | 122 | * Tcpdump: Wraps the tcpdump command and provides a simpler interface for container scenarios. 123 | * Reboot reason: Inspect last reboot reason. 124 | * AAD SSH: SSH via AAD. This is a handy replacement for the original Azure CLI based implementation. 125 | * NetExec: Execute the command with the same network namespace with a specific process or pod.
126 | 127 | You can see a full list with: 128 | 129 | ```bash 130 | kdebug --list 131 | ``` 132 | 133 | Use following command to start a tool: 134 | 135 | ```bash 136 | kdebug -t 137 | ``` 138 | 139 | Show tool specific options: 140 | 141 | ```bash 142 | kdebug -t -h 143 | ``` 144 | 145 | ### Tcpdump 146 | 147 | Attach to network namespace of a process with pid=100 and capture all traffic: 148 | 149 | ```bash 150 | kdebug -t tcpdump --pid=100 151 | ``` 152 | 153 | With source and destination specified, and TCP only: 154 | 155 | ```bash 156 | kdebug -t tcpdump \ 157 | --pid=100 \ 158 | --source=10.0.0.1:1000 \ 159 | --destination=10.0.0.2:2000 \ 160 | --tcponly 161 | ``` 162 | 163 | `--host` matches either source or destination: 164 | 165 | ```bash 166 | kdebug -t tcpdump --host=10.0.0.1:1000 167 | ``` 168 | 169 | ### Reboot reason 170 | 171 | Check VM last reboot reason within last 1 day: 172 | 173 | ``` 174 | kdebug -t vmrebootdetector 175 | ``` 176 | 177 | Check VM last reboot reason within last 100 days: 178 | 179 | ``` 180 | kdebug -t vmrebootdetector \ 181 | --checkdays=100 182 | ``` 183 | 184 | ### Package upgrade inspect 185 | 186 | Check upgraded packages within last 14 days: 187 | 188 | ``` 189 | kdebug --tool upgradeinspector --checkdays 14 190 | ``` 191 | 192 | Check upgraded package within last 7 days, limit 10 records: 193 | 194 | ``` 195 | kdebug --tool upgradeinspector --recordlimit 10 196 | ``` 197 | 198 | ### AAD SSH 199 | 200 | SSH via AAD. See [Azure Linux VMs and Azure AD](https://learn.microsoft.com/en-us/azure/active-directory/devices/howto-vm-sign-in-azure-ad-linux). 201 | 202 | This is a handy replacement for the original Azure CLI based implementation. 203 | 204 | Login via interactive flow: 205 | 206 | ```bash 207 | kdebug -t aadssh @@ 208 | ``` 209 | 210 | A browser will pop up for credentials. 
211 | 212 | Login via Azure CLI credentials: 213 | 214 | ```bash 215 | az login 216 | kdebug -t aadssh --use-azure-cli @@ 217 | ``` 218 | 219 | ### NetExec 220 | To execute a command in the same network namespace as a process, you need to be on the VM where the process is running. 221 | 222 | ```bash 223 | kdebug -t netexec --pid= 224 | ``` 225 | 226 | To execute a command in the same network namespace as a pod, you need a working kubeconfig. 227 | 228 | ```bash 229 | kdebug -t netexec --pod= --namespace= 230 | ``` 231 | 232 | And specify the command with `--command=`. The default command is `sh`. 233 | 234 | ## Development 235 | 236 | Prerequisite: 237 | 238 | * [Golang](https://go.dev/dl/) 239 | 240 | Build: 241 | 242 | ```bash 243 | make build 244 | ``` 245 | 246 | Test: 247 | 248 | ```bash 249 | make test 250 | ``` 251 | 252 | ## Contributing 253 | 254 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 255 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 256 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 257 | 258 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 259 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 260 | provided by the bot. You will only need to do this once across all repos using our CLA. 261 | 262 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 263 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 264 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 265 | 266 | ## Trademarks 267 | 268 | This project may contain trademarks or logos for projects, products, or services.
Authorized use of Microsoft 269 | trademarks or logos is subject to and must follow 270 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 271 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 272 | Any use of third-party trademarks or logos are subject to those third-party's policies. 273 | -------------------------------------------------------------------------------- /pkg/checkers/systemload/systemload.go: -------------------------------------------------------------------------------- 1 | package systemload 2 | 3 | import ( 4 | "fmt" 5 | "path/filepath" 6 | "time" 7 | 8 | "github.com/Azure/kdebug/pkg/base" 9 | linuxproc "github.com/c9s/goprocinfo/linux" 10 | ) 11 | 12 | const ( 13 | GlobalCPUTooHigh = "The VM's CPU usage is higher than threshold. Currently %.1f%% (threshold is %.1f%%)." 14 | GlobalMemoryTooHigh = "The VM's Memory usage is higher than threshold. Currently %.1f%% (threshold is %.1f%%)" 15 | ProcessCPUTooHigh = "The CPU usage of process [%d] (%s) is higher than threshold. The proportion of cpu is %.1f%% to whole capacity (threshold is %.1f%%). The proportion of cpu is %.1f%% to one core (threshold is %.1f%%)" 16 | GloablHighCPURecommandation = "You may remote to the target VM and use 'top' to find out which process consumes most of CPU. Further actions may depends." 17 | GloablHighMemoryRecommandation = "You may remote to the target VM and use 'top' to find out which process consumes most of Memory. Further actions may depends." 18 | ProcessHighCPURecommandation = "You may restart to process if feasible and see whether the CPU usage comes to normal. Or you can 'perf' to diagnose the root cause." 19 | ) 20 | 21 | var ( 22 | VMCPUPercentageLimit float64 = 80 // The percentage compare to the whole VM CPU capacity. 
100 means using up all the cpu capacity 23 | VMMemoryPercentageLimit float64 = 90 // The percentage compare to the VM Total Memory. 100 means using up all the memory capacity 24 | ClkTck float64 = 100 // default value of cycles per seconds 25 | CPUSpan float64 = 1 // The timespan of CPU load in seconds 26 | InterestedProcNames = map[string]ProcLimitMeasurement{ 27 | "etcd": {CPULimitAsGloabl: 50, CPULimitAsSingleCore: 80}, 28 | "kubelet": {CPULimitAsGloabl: 50, CPULimitAsSingleCore: 80}, 29 | "kube-apiserver": {CPULimitAsGloabl: 50, CPULimitAsSingleCore: 80}} 30 | ) 31 | 32 | type InterestedProc struct { 33 | StatFilePath string // Process stat file location. Should follow /proc/[pid]/stat 34 | Name string // The command of the process 35 | Pid uint64 // Pid 36 | TotalTime uint64 // Time of the process used in cpu cycle 37 | CPULimitAsGloabl float64 // CPU limit compare to the whole VM CPU capacity 38 | CPULimitAsSingleCore float64 // CPU limit compare to one core 39 | } 40 | 41 | type ProcLimitMeasurement struct { 42 | CPULimitAsGloabl float64 // The percentage compare to the whole VM CPU capacity. 100 means using up all the cpu capacity 43 | CPULimitAsSingleCore float64 // The percentage compare to one core. 100 means using up 1 core's capacity. 
Maximum number can be 100 * cores 44 | } 45 | 46 | type SystemLoadChecker struct { 47 | } 48 | 49 | func New() *SystemLoadChecker { 50 | return &SystemLoadChecker{} 51 | } 52 | 53 | func (c *SystemLoadChecker) Name() string { 54 | return "SystemLoad" 55 | } 56 | 57 | func (c *SystemLoadChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) { 58 | result := []*base.CheckResult{} 59 | 60 | if !ctx.Environment.HasFlag("linux") { 61 | return result, nil 62 | } 63 | 64 | // VM Memory 65 | memInfo, err := linuxproc.ReadMemInfo("/proc/meminfo") 66 | if err != nil { 67 | return result, err 68 | } 69 | var memUsage = getMemPercentage(memInfo.MemAvailable, memInfo.MemTotal) 70 | if memUsage > VMMemoryPercentageLimit { 71 | result = append(result, &base.CheckResult{ 72 | Checker: c.Name(), 73 | Error: fmt.Sprintf(GlobalMemoryTooHigh, memUsage, VMMemoryPercentageLimit), 74 | Description: GloablHighMemoryRecommandation, 75 | }) 76 | } 77 | 78 | interestedProcesses, err := getInterestedProc() 79 | if err != nil { 80 | return result, err 81 | } 82 | 83 | // Read global status 84 | stat, err := linuxproc.ReadStat("/proc/stat") 85 | if err != nil { 86 | return result, err 87 | } 88 | 89 | // How to calculate global cpu usage: https://rosettacode.org/wiki/Linux_CPU_utilization 90 | previousIdleTime, previousTotalTime := getSystemCPUTime(stat.CPUStatAll) 91 | 92 | // Sleep a time span and check cpu time again to get average CPU load 93 | time.Sleep(time.Duration(CPUSpan * float64(time.Second))) 94 | 95 | stat, err = linuxproc.ReadStat("/proc/stat") 96 | if err != nil { 97 | return result, err 98 | } 99 | 100 | idleTime, totalTime := getSystemCPUTime(stat.CPUStatAll) 101 | var deltaSystemIdleTime = idleTime - previousIdleTime 102 | var deltaSystemTotalTime = totalTime - previousTotalTime 103 | var usage = getSystemCPUPercentage(deltaSystemIdleTime, deltaSystemTotalTime) 104 | 105 | // VM CPU 106 | if usage > VMCPUPercentageLimit { 107 | result = append(result, 
&base.CheckResult{ 108 | Checker: c.Name(), 109 | Error: fmt.Sprintf(GlobalCPUTooHigh, usage, VMCPUPercentageLimit), 110 | Description: GloablHighCPURecommandation, 111 | }) 112 | } 113 | 114 | // Interested proc cpu 115 | for _, proc := range interestedProcesses { 116 | stat, err := linuxproc.ReadProcessStat(proc.StatFilePath) 117 | if err != nil { 118 | continue 119 | } 120 | 121 | // https://stackoverflow.com/questions/16726779/how-do-i-get-the-total-cpu-usage-of-an-application-from-proc-pid-stat/16736599#16736599 122 | totalTime := stat.Utime + stat.Stime 123 | cpuUsageAsGlobal := getProcessCPUPercentageAsGlobal(totalTime-proc.TotalTime, deltaSystemTotalTime) 124 | cpuUsageAsSingleCore := getProcessCPUPercentageAsSingleCore(totalTime-proc.TotalTime, CPUSpan) 125 | 126 | if cpuUsageAsGlobal > proc.CPULimitAsGloabl || cpuUsageAsSingleCore > proc.CPULimitAsSingleCore { 127 | result = append(result, &base.CheckResult{ 128 | Checker: c.Name(), 129 | Error: fmt.Sprintf(ProcessCPUTooHigh, proc.Pid, proc.Name, cpuUsageAsGlobal, proc.CPULimitAsGloabl, cpuUsageAsSingleCore, proc.CPULimitAsSingleCore), 130 | Description: ProcessHighCPURecommandation, 131 | }) 132 | } 133 | } 134 | 135 | return result, nil 136 | } 137 | 138 | func getTotalTime(stat linuxproc.CPUStat) uint64 { 139 | return stat.User + stat.Nice + stat.System + stat.Idle + stat.IOWait + stat.IRQ + stat.SoftIRQ + 140 | stat.Steal + stat.Guest + stat.GuestNice 141 | } 142 | 143 | func getInterestedProc() ([]*InterestedProc, error) { 144 | result := []*InterestedProc{} 145 | 146 | procStatusFiles, err := filepath.Glob("/proc/[0-9]*/stat") 147 | if err != nil { 148 | return result, err 149 | } 150 | 151 | // Read status and find out interested process 152 | for _, f := range procStatusFiles { 153 | stat, err := linuxproc.ReadProcessStat(f) 154 | if err != nil { 155 | continue 156 | } 157 | 158 | var cmd = stat.Comm[1 : len(stat.Comm)-1] // name: (cmd) 159 | if limit, ok := InterestedProcNames[cmd]; ok { 160 | 
result = append(result, &InterestedProc{ 161 | StatFilePath: f, 162 | Name: cmd, 163 | Pid: stat.Pid, 164 | CPULimitAsGloabl: limit.CPULimitAsGloabl, 165 | CPULimitAsSingleCore: limit.CPULimitAsSingleCore, 166 | TotalTime: stat.Utime + stat.Stime, // Time in user space + Time in kernal space 167 | }) 168 | } 169 | } 170 | 171 | return result, nil 172 | } 173 | 174 | func getSystemCPUTime(stat linuxproc.CPUStat) (idleTime uint64, totalTime uint64) { 175 | return stat.Idle, getTotalTime(stat) 176 | } 177 | 178 | func getMemPercentage(memAvailable uint64, memTotal uint64) float64 { 179 | return 100 - (float64(100*memAvailable) / float64(memTotal)) 180 | } 181 | 182 | func getSystemCPUPercentage(deltaSystemIdleTime uint64, deltaSystemTime uint64) float64 { 183 | return 100 - (float64(100*(deltaSystemIdleTime)) / float64(deltaSystemTime)) 184 | } 185 | 186 | func getProcessCPUPercentageAsGlobal(deltaProcessCPUTime uint64, deltaSystemCPUTime uint64) float64 { 187 | return 100 * float64(deltaProcessCPUTime) / float64(deltaSystemCPUTime) 188 | } 189 | 190 | func getProcessCPUPercentageAsSingleCore(deltaProcessCPUTime uint64, deltaRealTimeInSeconds float64) float64 { 191 | return 100 * float64(deltaProcessCPUTime) / deltaRealTimeInSeconds / ClkTck // deltaCPUTime / ClrTck = deltaProcessCPUTime in seconds 192 | } 193 | -------------------------------------------------------------------------------- /pkg/batch/pod_executor.go: -------------------------------------------------------------------------------- 1 | package batch 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "math/rand" 8 | "time" 9 | 10 | log "github.com/sirupsen/logrus" 11 | batchv1 "k8s.io/api/batch/v1" 12 | corev1 "k8s.io/api/core/v1" 13 | apierrors "k8s.io/apimachinery/pkg/api/errors" 14 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 15 | "k8s.io/client-go/kubernetes" 16 | ) 17 | 18 | type PodBatchExecutor struct { 19 | Client *kubernetes.Clientset 20 | Image string 21 | Namespace string 22 
| Mode string 23 | } 24 | 25 | func NewPodBatchExecutor(kubeClient *kubernetes.Clientset, image, ns, mode string) *PodBatchExecutor { 26 | e := &PodBatchExecutor{ 27 | Client: kubeClient, 28 | Image: image, 29 | Namespace: ns, 30 | Mode: mode, 31 | } 32 | 33 | log.WithFields(log.Fields{ 34 | "image": image, "namespace": ns, "mode": mode, 35 | }).Debug("NewPodBatchExecutor") 36 | 37 | return e 38 | } 39 | 40 | func (e *PodBatchExecutor) generateRunName() string { 41 | rand.Seed(time.Now().UnixNano()) 42 | b := make([]byte, 10) 43 | rand.Read(b) 44 | return fmt.Sprintf("kdebug-%x", b) 45 | } 46 | 47 | func (e *PodBatchExecutor) isJobCompleted(job *batchv1.Job) bool { 48 | if job.Status.Conditions != nil { 49 | for _, cond := range job.Status.Conditions { 50 | if cond.Type == "Complete" && cond.Status == "True" { 51 | return true 52 | } 53 | } 54 | } 55 | return false 56 | } 57 | 58 | func (e *PodBatchExecutor) Execute(opts *BatchOptions) ([]*BatchResult, error) { 59 | ns := &corev1.Namespace{ 60 | ObjectMeta: metav1.ObjectMeta{ 61 | Name: e.Namespace, 62 | }, 63 | } 64 | _, err := e.Client.CoreV1().Namespaces().Create( 65 | context.Background(), ns, metav1.CreateOptions{}) 66 | if err != nil && !apierrors.IsAlreadyExists(err) { 67 | return nil, fmt.Errorf("Fail to create namespace %s for batch operations: %s", 68 | e.Namespace, err) 69 | } 70 | 71 | taskChan := make(chan *batchTask, opts.Concurrency) 72 | resultChan := make(chan *BatchResult, opts.Concurrency) 73 | runName := e.generateRunName() 74 | 75 | for i := 0; i < opts.Concurrency; i++ { 76 | go e.startWorker(runName, taskChan, resultChan) 77 | } 78 | 79 | for _, machine := range opts.Machines { 80 | go func(m string) { 81 | taskChan <- &batchTask{ 82 | Machine: m, 83 | Checkers: opts.Checkers, 84 | } 85 | }(machine) 86 | } 87 | 88 | results := make([]*BatchResult, 0, len(opts.Machines)) 89 | for i := 0; i < len(opts.Machines); i++ { 90 | result := <-resultChan 91 | results = append(results, result) 92 | 
opts.Reporter.OnResult(result) 93 | } 94 | 95 | close(taskChan) 96 | 97 | return results, nil 98 | } 99 | 100 | func (e *PodBatchExecutor) startWorker(runName string, taskChan chan *batchTask, resultChan chan *BatchResult) { 101 | for task := range taskChan { 102 | resultChan <- e.executeTask(runName, task) 103 | } 104 | } 105 | 106 | func (e *PodBatchExecutor) getPodTemplateSpecContainerMode(cmd []string, machine string) corev1.PodTemplateSpec { 107 | return corev1.PodTemplateSpec{ 108 | Spec: corev1.PodSpec{ 109 | Containers: []corev1.Container{ 110 | corev1.Container{ 111 | Name: "kdebug", 112 | Image: e.Image, 113 | Command: cmd, 114 | ImagePullPolicy: corev1.PullIfNotPresent, 115 | }, 116 | }, 117 | RestartPolicy: "Never", 118 | NodeName: machine, 119 | }, 120 | } 121 | } 122 | 123 | func (e *PodBatchExecutor) getPodTemplateSpecHostMode(rawCmd []string, machine string) corev1.PodTemplateSpec { 124 | cmd := []string{"/run-as-host"} 125 | cmd = append(cmd, rawCmd...) 126 | 127 | privileged := true 128 | hostPathSocket := corev1.HostPathSocket 129 | hostPathDirectory := corev1.HostPathDirectory 130 | 131 | return corev1.PodTemplateSpec{ 132 | Spec: corev1.PodSpec{ 133 | Containers: []corev1.Container{ 134 | corev1.Container{ 135 | Name: "kdebug", 136 | Image: e.Image, 137 | Command: cmd, 138 | SecurityContext: &corev1.SecurityContext{ 139 | Privileged: &privileged, 140 | }, 141 | VolumeMounts: []corev1.VolumeMount{ 142 | corev1.VolumeMount{ 143 | Name: "system-bus-socket", 144 | MountPath: "/var/run/dbus/system_bus_socket", 145 | }, 146 | corev1.VolumeMount{ 147 | Name: "systemd-system-config", 148 | MountPath: "/etc/systemd/system", 149 | }, 150 | corev1.VolumeMount{ 151 | Name: "tmp", 152 | MountPath: "/tmp", 153 | }, 154 | }, 155 | ImagePullPolicy: corev1.PullIfNotPresent, 156 | }, 157 | }, 158 | Volumes: []corev1.Volume{ 159 | corev1.Volume{ 160 | Name: "system-bus-socket", 161 | VolumeSource: corev1.VolumeSource{ 162 | HostPath: &corev1.HostPathVolumeSource{ 
163 | Path: "/var/run/dbus/system_bus_socket", 164 | Type: &hostPathSocket, 165 | }, 166 | }, 167 | }, 168 | corev1.Volume{ 169 | Name: "systemd-system-config", 170 | VolumeSource: corev1.VolumeSource{ 171 | HostPath: &corev1.HostPathVolumeSource{ 172 | Path: "/etc/systemd/system", 173 | Type: &hostPathDirectory, 174 | }, 175 | }, 176 | }, 177 | corev1.Volume{ 178 | Name: "tmp", 179 | VolumeSource: corev1.VolumeSource{ 180 | HostPath: &corev1.HostPathVolumeSource{ 181 | Path: "/tmp", 182 | Type: &hostPathDirectory, 183 | }, 184 | }, 185 | }, 186 | }, 187 | RestartPolicy: "Never", 188 | NodeName: machine, 189 | }, 190 | } 191 | } 192 | 193 | func (e *PodBatchExecutor) executeTask(runName string, task *batchTask) *BatchResult { 194 | result := &BatchResult{ 195 | Machine: task.Machine, 196 | } 197 | 198 | // Create job 199 | cmd := []string{ 200 | "/kdebug", 201 | "-f", "json", 202 | "--no-set-exit-code", 203 | "-v", "none", 204 | } 205 | for _, checker := range task.Checkers { 206 | cmd = append(cmd, "-c") 207 | cmd = append(cmd, checker) 208 | } 209 | 210 | ttl := int32(300) 211 | backoff := int32(0) 212 | job := &batchv1.Job{ 213 | ObjectMeta: metav1.ObjectMeta{ 214 | Name: fmt.Sprintf("%s-%s", runName, task.Machine), 215 | Namespace: e.Namespace, 216 | Labels: map[string]string{ 217 | "kdebug-run": runName, 218 | }, 219 | }, 220 | Spec: batchv1.JobSpec{ 221 | TTLSecondsAfterFinished: &ttl, 222 | BackoffLimit: &backoff, 223 | }, 224 | } 225 | 226 | if e.Mode == "host" { 227 | job.Spec.Template = e.getPodTemplateSpecHostMode(cmd, task.Machine) 228 | } else { 229 | job.Spec.Template = e.getPodTemplateSpecContainerMode(cmd, task.Machine) 230 | } 231 | 232 | job, err := e.Client.BatchV1().Jobs(e.Namespace).Create( 233 | context.Background(), job, metav1.CreateOptions{}) 234 | if err != nil { 235 | result.Error = fmt.Errorf("fail to create Kubernetes job: %+v", err) 236 | return result 237 | } 238 | 239 | // Wait for job 240 | timeout := 5 * time.Minute 241 | startTime 
:= time.Now() 242 | for { 243 | time.Sleep(5 * time.Second) 244 | 245 | job, err := e.Client.BatchV1().Jobs(e.Namespace).Get( 246 | context.Background(), job.Name, metav1.GetOptions{}) 247 | if err != nil { 248 | result.Error = fmt.Errorf("fail to get Kubernetes job %s: %+v", job.Name, err) 249 | return result 250 | } 251 | 252 | if e.isJobCompleted(job) { 253 | break 254 | } 255 | 256 | if time.Now().Sub(startTime) >= timeout { 257 | result.Error = fmt.Errorf("timeout waiting for Kubernetes job %s: %+v", job.Name, err) 258 | return result 259 | } 260 | } 261 | 262 | // Fetch pod log 263 | pods, err := e.Client.CoreV1().Pods(e.Namespace).List(context.Background(), metav1.ListOptions{ 264 | LabelSelector: "job-name=" + job.Name, 265 | }) 266 | if err != nil { 267 | result.Error = fmt.Errorf("fail to get Kubernetes pods of job %s: %+v", job.Name, err) 268 | return result 269 | } 270 | 271 | // Parse result 272 | pod := pods.Items[0] 273 | req := e.Client.CoreV1().Pods(e.Namespace).GetLogs( 274 | pod.Name, &corev1.PodLogOptions{}) 275 | logs, err := req.Stream(context.Background()) 276 | if err != nil { 277 | result.Error = fmt.Errorf("fail to stream logs of pod %s: %+v", pod.Name, err) 278 | return result 279 | } 280 | defer logs.Close() 281 | 282 | decoder := json.NewDecoder(logs) 283 | 284 | result.Error = decoder.Decode(&result.CheckResults) 285 | 286 | return result 287 | } 288 | -------------------------------------------------------------------------------- /cmd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | "os" 7 | "path/filepath" 8 | "runtime/debug" 9 | 10 | "github.com/fatih/color" 11 | flags "github.com/jessevdk/go-flags" 12 | "github.com/mattn/go-isatty" 13 | "github.com/sirupsen/logrus" 14 | log "github.com/sirupsen/logrus" 15 | "k8s.io/cli-runtime/pkg/genericclioptions" 16 | "k8s.io/client-go/kubernetes" 17 | "k8s.io/client-go/tools/clientcmd" 18 | 
"k8s.io/client-go/util/homedir" 19 | 20 | "github.com/Azure/kdebug/pkg/base" 21 | chks "github.com/Azure/kdebug/pkg/checkers" 22 | "github.com/Azure/kdebug/pkg/env" 23 | "github.com/Azure/kdebug/pkg/formatters" 24 | tools "github.com/Azure/kdebug/pkg/tools" 25 | ) 26 | 27 | type Options struct { 28 | ListCheckers bool `short:"l" long:"list" description:"List all checks and tools"` 29 | Checkers []string `short:"c" long:"check" description:"Check name. Can specify multiple times."` 30 | Tool string `short:"t" long:"tool" description:"Use tool"` 31 | Format string `short:"f" long:"format" description:"Output format"` 32 | KubeMasterUrl string `long:"kube-master-url" description:"Kubernetes API server URL"` 33 | KubeConfigPath string `long:"kube-config-path" description:"Path to kubeconfig file"` 34 | Verbose string `short:"v" long:"verbose" description:"Log level"` 35 | NoColor bool `long:"no-color" description:"Disable colorized output"` 36 | Pause bool `long:"pause" description:"Pause until interrupted"` 37 | Help bool `short:"h" long:"help" description:"Show help message"` 38 | NoSetExitCode bool `long:"no-set-exit-code" hidden:"-"` 39 | Output string `short:"o" long:"output" description:"Output file"` 40 | 41 | Batch struct { 42 | KubeMachines bool `long:"kube-machines" description:"Discover machines from Kubernetes API server"` 43 | KubeMachinesUnready bool `long:"kube-machines-unready" description:"Discover unready machines from Kubernetes API server"` 44 | KubeMachinesLabelSelector string `long:"kube-machines-label" description:"Label selector for Kubernetes machines"` 45 | Machines []string `long:"machines" description:"Machine names"` 46 | MachinesFile string `long:"machines-file" description:"Path to a file that contains machine names list. 
Can use - to read from stdin."` 47 | Concurrency int `long:"concurrency" default:"4" description:"Batch concurrency"` 48 | SshUser string `long:"ssh-user" description:"SSH user"` 49 | PodExecutorImage string `long:"pod-executor-image" description:"Container image used by pod executor"` 50 | PodExecutorNamespace string `long:"pod-executor-namespace" description:"Namespace used by pod executor" default:"kdebug"` 51 | PodExecutorMode string `long:"pod-executor-mode" choice:"host" choice:"container" default:"host" description:"Run as container or run as host"` 52 | } `group:"Batch Options" namespace:"batch" description:"Batch mode"` 53 | 54 | RemainingArgs []string 55 | } 56 | 57 | func (o *Options) IsBatchMode() bool { 58 | return o.Batch.KubeMachines || o.Batch.KubeMachinesUnready || len(o.Batch.Machines) > 0 || len(o.Batch.MachinesFile) > 0 59 | } 60 | 61 | func (o *Options) IsToolMode() bool { 62 | return len(o.Tool) > 0 63 | } 64 | 65 | func getDefaultPodExecutorImage() string { 66 | tag := "main" 67 | if info, ok := debug.ReadBuildInfo(); ok { 68 | for _, setting := range info.Settings { 69 | if setting.Key == "vcs.revision" { 70 | tag = setting.Value 71 | break 72 | } 73 | } 74 | } 75 | return "ghcr.io/azure/kdebug:" + tag 76 | } 77 | 78 | func processOptions(o *Options) { 79 | // Run all checkers if not specified 80 | if len(o.Checkers) == 0 { 81 | o.Checkers = chks.ListAllCheckerNames() 82 | } 83 | if o.Batch.PodExecutorImage == "" { 84 | o.Batch.PodExecutorImage = getDefaultPodExecutorImage() 85 | } 86 | } 87 | 88 | func buildKubeClient(masterUrl, kubeConfigPath string) (*kubernetes.Clientset, *genericclioptions.ConfigFlags, error) { 89 | // Try env 90 | if kubeConfigPath == "" { 91 | if path := os.Getenv("KUBECONFIG"); path != "" { 92 | kubeConfigPath = path 93 | } 94 | } 95 | 96 | // Try default path 97 | if kubeConfigPath == "" { 98 | if home := homedir.HomeDir(); home != "" { 99 | kubeConfigPath = filepath.Join(home, ".kube", "config") 100 | } 101 | } 102 
| 103 | config, err := clientcmd.BuildConfigFromFlags(masterUrl, kubeConfigPath) 104 | if err != nil { 105 | return nil, nil, err 106 | } 107 | clientSet, err := kubernetes.NewForConfig(config) 108 | if err != nil { 109 | return nil, nil, err 110 | } 111 | kubeConfigFlag := genericclioptions.NewConfigFlags(false) 112 | kubeConfigFlag.APIServer = &masterUrl 113 | kubeConfigFlag.KubeConfig = &kubeConfigPath 114 | 115 | return clientSet, kubeConfigFlag, nil 116 | } 117 | 118 | func buildCheckContext(opts *Options) (*base.CheckContext, error) { 119 | ctx := &base.CheckContext{ 120 | Environment: env.GetEnvironment(), 121 | } 122 | 123 | log.WithFields(log.Fields{ 124 | "env": ctx.Environment, 125 | }).Debug("Environment") 126 | 127 | kubeClient, _, err := buildKubeClient(opts.KubeMasterUrl, opts.KubeConfigPath) 128 | if err == nil { 129 | ctx.KubeClient = kubeClient 130 | } else { 131 | log.WithFields(log.Fields{ 132 | "error": err, 133 | }).Warn("Kubernetes related checkers will not work") 134 | } 135 | 136 | return ctx, nil 137 | } 138 | 139 | func buildToolContext(opts *Options) (*base.ToolContext, error) { 140 | // Add back help arg so tool can see it 141 | if opts.Help { 142 | opts.RemainingArgs = append(opts.RemainingArgs, "-h") 143 | } 144 | log.WithFields(log.Fields{"args": opts.RemainingArgs}).Debug("Tool context") 145 | ctx := &base.ToolContext{ 146 | Args: opts.RemainingArgs, 147 | Environment: env.GetEnvironment(), 148 | } 149 | if _, configFlags, err := buildKubeClient(opts.KubeMasterUrl, opts.KubeConfigPath); err == nil { 150 | ctx.KubeConfigFlag = configFlags 151 | } 152 | return ctx, nil 153 | } 154 | 155 | func main() { 156 | // Process options 157 | var opts Options 158 | flagsParser := flags.NewParser(&opts, flags.PrintErrors|flags.PassDoubleDash|flags.IgnoreUnknown) 159 | remainingArgs, err := flagsParser.Parse() 160 | if err != nil { 161 | log.Fatal(err) 162 | return 163 | } 164 | opts.RemainingArgs = remainingArgs 165 | 166 | processOptions(&opts) 
167 | 168 | if len(opts.Verbose) > 0 { 169 | if opts.Verbose == "none" { 170 | logrus.SetOutput(ioutil.Discard) 171 | } else { 172 | logLevel, err := logrus.ParseLevel(opts.Verbose) 173 | if err != nil { 174 | log.Fatal(err) 175 | } 176 | logrus.SetLevel(logLevel) 177 | } 178 | } 179 | 180 | if !isatty.IsTerminal(os.Stdout.Fd()) || opts.NoColor || opts.Output != "" { 181 | color.NoColor = true 182 | } 183 | 184 | if opts.ListCheckers { 185 | fmt.Print("checks: ") 186 | fmt.Println(chks.ListAllCheckerNames()) 187 | fmt.Print("tools: ") 188 | fmt.Println(tools.ListAllToolNames()) 189 | return 190 | } 191 | 192 | if opts.Pause { 193 | pause() 194 | return 195 | } 196 | 197 | var formatter formatters.Formatter 198 | if opts.Format == "json" { 199 | formatter = &formatters.JsonFormatter{} 200 | } else if opts.Format == "oneline" { 201 | formatter = &formatters.OneLineFormatter{} 202 | } else { 203 | formatter = &formatters.TextFormatter{} 204 | } 205 | 206 | // Tool Mode 207 | if opts.IsToolMode() { 208 | ctx, err := buildToolContext(&opts) 209 | if err != nil { 210 | log.Fatal(err) 211 | } 212 | 213 | err = tools.ParseArgs(ctx, opts.Tool, opts.RemainingArgs) 214 | if err != nil { 215 | if !flags.WroteHelp(err) { 216 | log.Fatal(err) 217 | } 218 | return 219 | } 220 | 221 | err = tools.Run(ctx, opts.Tool) 222 | if err != nil { 223 | log.Fatal(err) 224 | } 225 | return 226 | } 227 | 228 | if opts.Help { 229 | flagsParser.WriteHelp(os.Stdout) 230 | return 231 | } 232 | 233 | // Prepare dependencies 234 | ctx, err := buildCheckContext(&opts) 235 | if err != nil { 236 | log.Fatal(err) 237 | } 238 | 239 | ctx.Output = os.Stdout 240 | if opts.Output != "" { 241 | outFile, err := os.OpenFile(opts.Output, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) 242 | if err != nil { 243 | log.Fatalf("Fail to open output file: %s", opts.Output) 244 | } 245 | defer outFile.Close() 246 | ctx.Output = outFile 247 | } 248 | 249 | // Batch mode 250 | if opts.IsBatchMode() { 251 | runBatch(&opts, 
ctx, formatter) 252 | return 253 | } 254 | 255 | // Check 256 | results, err := chks.Check(ctx, opts.Checkers) 257 | if err != nil { 258 | log.Fatal(err) 259 | } 260 | 261 | // Output 262 | err = formatter.WriteResults(ctx.Output, results) 263 | if err != nil { 264 | log.Fatal(err) 265 | } 266 | 267 | if !opts.NoSetExitCode { 268 | for _, r := range results { 269 | if !r.Ok() { 270 | os.Exit(1) 271 | } 272 | } 273 | } 274 | } 275 | -------------------------------------------------------------------------------- /pkg/checkers/kube/pod/pod_restart_reason_checker.go: -------------------------------------------------------------------------------- 1 | package pod 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "fmt" 7 | "io" 8 | "strings" 9 | "text/tabwriter" 10 | "time" 11 | 12 | "github.com/Azure/kdebug/pkg/base" 13 | log "github.com/sirupsen/logrus" 14 | 15 | corev1 "k8s.io/api/core/v1" 16 | v1 "k8s.io/api/core/v1" 17 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 18 | "k8s.io/apimachinery/pkg/runtime" 19 | "k8s.io/apimachinery/pkg/types" 20 | "k8s.io/apimachinery/pkg/util/duration" 21 | runtimeresource "k8s.io/cli-runtime/pkg/resource" 22 | corev1client "k8s.io/client-go/kubernetes/typed/core/v1" 23 | "k8s.io/client-go/tools/reference" 24 | "k8s.io/kubectl/pkg/cmd/util" 25 | "k8s.io/kubectl/pkg/describe" 26 | "k8s.io/kubectl/pkg/scheme" 27 | "k8s.io/kubectl/pkg/util/qos" 28 | ) 29 | 30 | const levelSpace = " " 31 | 32 | type KubePodRestartReasonChecker struct { 33 | } 34 | 35 | func New() *KubePodRestartReasonChecker { 36 | return &KubePodRestartReasonChecker{} 37 | } 38 | 39 | func (c *KubePodRestartReasonChecker) Name() string { 40 | return "KubePodRestartReason" 41 | } 42 | 43 | // Check borrows many logic and helper functions from src/k8s.io/kubectl/pkg/describe to check Pod status and events. 
44 | func (c *KubePodRestartReasonChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) { 45 | if ctx.KubeClient == nil { 46 | log.Warn("Skip KubePodRestartReasonChecker due to missing kube client") 47 | return nil, nil 48 | } 49 | 50 | pods, err := ctx.KubeClient.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{}) 51 | if err != nil { 52 | log.WithFields(log.Fields{"error": err}).Warn("Fail to list pods") 53 | return nil, err 54 | } 55 | 56 | results := []*base.CheckResult{} 57 | for _, pod := range pods.Items { 58 | var crashing = false 59 | for _, containerStatus := range pod.Status.ContainerStatuses { 60 | if containerStatus.State.Waiting != nil && containerStatus.State.Waiting.Reason == "CrashLoopBackOff" { 61 | crashing = true 62 | break 63 | } 64 | } 65 | 66 | if crashing { 67 | result := c.checkPod(ctx, &pod) 68 | if result != nil { 69 | results = append(results, result) 70 | } 71 | } 72 | } 73 | 74 | return results, nil 75 | } 76 | 77 | func (c *KubePodRestartReasonChecker) checkPod(ctx *base.CheckContext, pod *v1.Pod) *base.CheckResult { 78 | var events *corev1.EventList 79 | ref, err := reference.GetReference(scheme.Scheme, pod) 80 | if err != nil { 81 | log.WithFields(log.Fields{"pod": pod, "error": err}).Warn("Unable to construct reference") 82 | return nil 83 | } 84 | 85 | ref.Kind = "" 86 | if _, isMirrorPod := pod.Annotations[corev1.MirrorPodAnnotationKey]; isMirrorPod { 87 | ref.UID = types.UID(pod.Annotations[corev1.MirrorPodAnnotationKey]) 88 | } 89 | events, _ = searchEvents(ctx.KubeClient.CoreV1(), ref, util.DefaultChunkSize) 90 | text, _ := describePodStatus(pod, events) 91 | logs := strings.Split(text, "\n") 92 | 93 | for i := range logs { 94 | logs[i] = levelSpace + logs[i] 95 | } 96 | 97 | return &base.CheckResult{ 98 | Checker: c.Name(), 99 | Error: fmt.Sprintf("one or more containers of %s/%s are failing and restarting repeatedly.", pod.Namespace, pod.Name), 100 | Description: fmt.Sprintf("%s/%s is not running 
well.", pod.Namespace, pod.Name), 101 | Logs: logs, 102 | } 103 | } 104 | 105 | func describePodStatus(pod *corev1.Pod, events *corev1.EventList) (string, error) { 106 | return tabbedString(func(out io.Writer) error { 107 | w := describe.NewPrefixWriter(out) 108 | w.Write(describe.LEVEL_0, "Name:\t%s\n", pod.Name) 109 | w.Write(describe.LEVEL_0, "Namespace:\t%s\n", pod.Namespace) 110 | if pod.Status.StartTime != nil { 111 | w.Write(describe.LEVEL_0, "Start Time:\t%s\n", pod.Status.StartTime.Time.Format(time.RFC1123Z)) 112 | } 113 | if pod.DeletionTimestamp != nil { 114 | w.Write(describe.LEVEL_0, "Status:\tTerminating (lasts %s)\n", translateTimestampSince(*pod.DeletionTimestamp)) 115 | w.Write(describe.LEVEL_0, "Termination Grace Period:\t%ds\n", *pod.DeletionGracePeriodSeconds) 116 | } else { 117 | w.Write(describe.LEVEL_0, "Status:\t%s\n", string(pod.Status.Phase)) 118 | } 119 | if len(pod.Status.Reason) > 0 { 120 | w.Write(describe.LEVEL_0, "Reason:\t%s\n", pod.Status.Reason) 121 | } 122 | if len(pod.Status.Message) > 0 { 123 | w.Write(describe.LEVEL_0, "Message:\t%s\n", pod.Status.Message) 124 | } 125 | describeContainers("Containers", pod.Spec.Containers, pod.Status.ContainerStatuses, describe.EnvValueRetriever(pod), w, "") 126 | if len(pod.Status.Conditions) > 0 { 127 | w.Write(describe.LEVEL_0, "Conditions:\n Type\tStatus\n") 128 | for _, c := range pod.Status.Conditions { 129 | w.Write(describe.LEVEL_1, "%v \t%v \n", 130 | c.Type, 131 | c.Status) 132 | } 133 | } 134 | if pod.Status.QOSClass != "" { 135 | w.Write(describe.LEVEL_0, "QoS Class:\t%s\n", pod.Status.QOSClass) 136 | } else { 137 | w.Write(describe.LEVEL_0, "QoS Class:\t%s\n", qos.GetPodQOS(pod)) 138 | } 139 | if events != nil { 140 | describe.DescribeEvents(events, w) 141 | } 142 | return nil 143 | }) 144 | } 145 | 146 | func tabbedString(f func(io.Writer) error) (string, error) { 147 | out := new(tabwriter.Writer) 148 | buf := &bytes.Buffer{} 149 | out.Init(buf, 0, 8, 2, ' ', 0) 150 | 151 | err 
:= f(out) 152 | if err != nil { 153 | return "", err 154 | } 155 | 156 | out.Flush() 157 | str := string(buf.String()) 158 | return str, nil 159 | } 160 | 161 | func searchEvents(client corev1client.EventsGetter, objOrRef runtime.Object, limit int64) (*corev1.EventList, error) { 162 | ref, err := reference.GetReference(scheme.Scheme, objOrRef) 163 | if err != nil { 164 | return nil, err 165 | } 166 | stringRefKind := string(ref.Kind) 167 | var refKind *string 168 | if len(stringRefKind) > 0 { 169 | refKind = &stringRefKind 170 | } 171 | stringRefUID := string(ref.UID) 172 | var refUID *string 173 | if len(stringRefUID) > 0 { 174 | refUID = &stringRefUID 175 | } 176 | 177 | e := client.Events(ref.Namespace) 178 | fieldSelector := e.GetFieldSelector(&ref.Name, &ref.Namespace, refKind, refUID) 179 | initialOpts := metav1.ListOptions{FieldSelector: fieldSelector.String(), Limit: limit} 180 | eventList := &corev1.EventList{} 181 | err = runtimeresource.FollowContinue(&initialOpts, 182 | func(options metav1.ListOptions) (runtime.Object, error) { 183 | newEvents, err := e.List(context.TODO(), options) 184 | if err != nil { 185 | return nil, runtimeresource.EnhanceListError(err, options, "events") 186 | } 187 | eventList.Items = append(eventList.Items, newEvents.Items...) 
188 | return newEvents, nil 189 | }) 190 | return eventList, err 191 | } 192 | 193 | func describeContainers(label string, containers []corev1.Container, containerStatuses []corev1.ContainerStatus, 194 | resolverFn describe.EnvVarResolverFunc, w describe.PrefixWriter, space string) { 195 | statuses := map[string]corev1.ContainerStatus{} 196 | for _, status := range containerStatuses { 197 | statuses[status.Name] = status 198 | } 199 | 200 | for _, container := range containers { 201 | status, ok := statuses[container.Name] 202 | describeContainerBasicInfo(container, status, ok, space, w) 203 | if ok { 204 | describeContainerState(status, w) 205 | } 206 | } 207 | } 208 | 209 | func describeContainerBasicInfo(container corev1.Container, status corev1.ContainerStatus, ok bool, space string, w describe.PrefixWriter) { 210 | nameIndent := "" 211 | if len(space) > 0 { 212 | nameIndent = " " 213 | } 214 | w.Write(describe.LEVEL_1, "%s%v:\n", nameIndent, container.Name) 215 | if ok { 216 | w.Write(describe.LEVEL_2, "Container ID:\t%s\n", status.ContainerID) 217 | } 218 | w.Write(describe.LEVEL_2, "Image:\t%s\n", container.Image) 219 | if ok { 220 | w.Write(describe.LEVEL_2, "Image ID:\t%s\n", status.ImageID) 221 | } 222 | } 223 | 224 | func describeContainerState(status corev1.ContainerStatus, w describe.PrefixWriter) { 225 | describeStatus("State", status.State, w) 226 | if status.LastTerminationState.Terminated != nil { 227 | describeStatus("Last State", status.LastTerminationState, w) 228 | } 229 | w.Write(describe.LEVEL_2, "Ready:\t%v\n", printBool(status.Ready)) 230 | w.Write(describe.LEVEL_2, "Restart Count:\t%d\n", status.RestartCount) 231 | } 232 | 233 | func describeStatus(stateName string, state corev1.ContainerState, w describe.PrefixWriter) { 234 | switch { 235 | case state.Running != nil: 236 | w.Write(describe.LEVEL_2, "%s:\tRunning\n", stateName) 237 | w.Write(describe.LEVEL_3, "Started:\t%v\n", state.Running.StartedAt.Time.Format(time.RFC1123Z)) 238 | case 
state.Waiting != nil: 239 | w.Write(describe.LEVEL_2, "%s:\tWaiting\n", stateName) 240 | if state.Waiting.Reason != "" { 241 | w.Write(describe.LEVEL_3, "Reason:\t%s\n", state.Waiting.Reason) 242 | } 243 | case state.Terminated != nil: 244 | w.Write(describe.LEVEL_2, "%s:\tTerminated\n", stateName) 245 | if state.Terminated.Reason != "" { 246 | w.Write(describe.LEVEL_3, "Reason:\t%s\n", state.Terminated.Reason) 247 | } 248 | if state.Terminated.Message != "" { 249 | w.Write(describe.LEVEL_3, "Message:\t%s\n", state.Terminated.Message) 250 | } 251 | w.Write(describe.LEVEL_3, "Exit Code:\t%d\n", state.Terminated.ExitCode) 252 | if state.Terminated.Signal > 0 { 253 | w.Write(describe.LEVEL_3, "Signal:\t%d\n", state.Terminated.Signal) 254 | } 255 | w.Write(describe.LEVEL_3, "Started:\t%s\n", state.Terminated.StartedAt.Time.Format(time.RFC1123Z)) 256 | w.Write(describe.LEVEL_3, "Finished:\t%s\n", state.Terminated.FinishedAt.Time.Format(time.RFC1123Z)) 257 | default: 258 | w.Write(describe.LEVEL_2, "%s:\tWaiting\n", stateName) 259 | } 260 | } 261 | 262 | func translateTimestampSince(timestamp metav1.Time) string { 263 | if timestamp.IsZero() { 264 | return "" 265 | } 266 | 267 | return duration.HumanDuration(time.Since(timestamp.Time)) 268 | } 269 | 270 | func printBool(value bool) string { 271 | if value { 272 | return "True" 273 | } 274 | 275 | return "False" 276 | } 277 | --------------------------------------------------------------------------------