├── resource
    └── npd
    │   ├── yml-json-rules.png
    │   ├── yml-your_json_name.png
    │   ├── yml-json-conditions.png
    │   ├── yml-json-your_json_name.png
    │   ├── npd-dashboard-DNSProblem.png
    │   └── npd-dashboard-daemonsets.png
├── Dockerfile
├── pkg
    ├── env
    │   ├── darwin.go
    │   ├── windows.go
    │   ├── linux.go
    │   ├── env.go
    │   ├── k8s.go
    │   └── azure.go
    ├── tools
    │   ├── aadssh
    │   │   ├── ssh_agent_unix.go
    │   │   ├── ssh_agent_win.go
    │   │   ├── ssh_agent.go
    │   │   ├── transport.go
    │   │   ├── token_azure_cli.go
    │   │   ├── aadssh.go
    │   │   ├── token.go
    │   │   └── ssh.go
    │   ├── registry.go
    │   ├── tool.go
    │   ├── vmrebootdetector
    │   │   ├── vmrebootdetector_test.go
    │   │   └── vmrebootdetector.go
    │   ├── upgradeinspector
    │   │   ├── upgradeinspector_test.go
    │   │   └── upgradeinspector.go
    │   ├── tcpdump
    │   │   ├── tcpdump_test.go
    │   │   └── tcpdump.go
    │   └── netexec
    │   │   ├── netexec_test.go
    │   │   └── netexec.go
    ├── batch
    │   ├── static_discoverer.go
    │   ├── batch.go
    │   ├── file_discoverer_test.go
    │   ├── file_discoverer.go
    │   ├── kube_discoverer_test.go
    │   ├── kube_discoverer.go
    │   ├── ssh_executor.go
    │   └── pod_executor.go
    ├── formatters
    │   ├── formatter.go
    │   ├── json.go
    │   ├── oneline.go
    │   └── text.go
    ├── checkers
    │   ├── dummy
    │   │   └── dummy.go
    │   ├── checker.go
    │   ├── tcpping
    │   │   ├── tcpping_test.go
    │   │   └── tcpping.go
    │   ├── kube
    │   │   ├── objectsize
    │   │   │   ├── objectsize_test.go
    │   │   │   └── objectsize.go
    │   │   └── pod
    │   │   │   └── pod_restart_reason_checker.go
    │   ├── systemload
    │   │   ├── systemload_test.go
    │   │   └── systemload.go
    │   ├── podschedule
    │   │   ├── podschedule_test.go
    │   │   └── podschedule.go
    │   ├── icmp
    │   │   ├── icmp_test.go
    │   │   └── icmp.go
    │   ├── registry.go
    │   ├── liveness
    │   │   └── liveness.go
    │   ├── oom
    │   │   ├── oom_test.go
    │   │   └── oom.go
    │   ├── diskusage
    │   │   ├── diskusage_test.go
    │   │   └── diskusage.go
    │   ├── http
    │   │   └── http.go
    │   ├── diskreadonly
    │   │   └── disk_readonly.go
    │   ├── kmscachesize
    │   │   └── kms_cache_size.go
    │   └── dns
    │   │   ├── dns_test.go
    │   │   └── dns.go
    └── base
    │   └── models.go
├── cmd
    ├── pause.go
    ├── batch.go
    ├── run-as-host
    │   └── main.go
    └── main.go
├── .gitignore
├── Makefile
├── CODE_OF_CONDUCT.md
├── .github
    └── workflows
    │   ├── go.yml
    │   ├── release.yml
    │   └── container.yml
├── LICENSE
├── .devcontainer
    ├── Dockerfile
    └── devcontainer.json
├── SUPPORT.md
├── SECURITY.md
├── deploy
    └── node-problem-detector
    │   ├── README.md
    │   ├── node-problem-detector.yaml
    │   └── node-problem-detector-template.yaml
├── go.mod
└── README.md


/resource/npd/yml-json-rules.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/kdebug/HEAD/resource/npd/yml-json-rules.png


--------------------------------------------------------------------------------
/resource/npd/yml-your_json_name.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/kdebug/HEAD/resource/npd/yml-your_json_name.png


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM gcr.io/distroless/static-debian11
2 | 
3 | ADD bin/kdebug bin/run-as-host /
4 | 
5 | CMD [ "/kdebug" ]
6 | 


--------------------------------------------------------------------------------
/resource/npd/yml-json-conditions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/kdebug/HEAD/resource/npd/yml-json-conditions.png


--------------------------------------------------------------------------------
/resource/npd/yml-json-your_json_name.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/kdebug/HEAD/resource/npd/yml-json-your_json_name.png


--------------------------------------------------------------------------------
/resource/npd/npd-dashboard-DNSProblem.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/kdebug/HEAD/resource/npd/npd-dashboard-DNSProblem.png


--------------------------------------------------------------------------------
/resource/npd/npd-dashboard-daemonsets.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure/kdebug/HEAD/resource/npd/npd-dashboard-daemonsets.png


--------------------------------------------------------------------------------
/pkg/env/darwin.go:
--------------------------------------------------------------------------------
 1 | //go:build darwin
 2 | 
 3 | package env
 4 | 
 5 | import "runtime"
 6 | 
 7 | func getLinuxFlags() []string {
 8 | 	return []string{
 9 | 		runtime.GOOS,
10 | 	}
11 | }
12 | 


--------------------------------------------------------------------------------
/pkg/env/windows.go:
--------------------------------------------------------------------------------
 1 | //go:build windows
 2 | 
 3 | package env
 4 | 
 5 | import (
 6 | 	"runtime"
 7 | )
 8 | 
 9 | func getLinuxFlags() []string {
10 | 	return []string{
11 | 		runtime.GOOS,
12 | 	}
13 | }
14 | 


--------------------------------------------------------------------------------
/pkg/tools/aadssh/ssh_agent_unix.go:
--------------------------------------------------------------------------------
 1 | //go:build !windows
 2 | 
 3 | package aadssh
 4 | 
 5 | import "net"
 6 | 
 7 | func dialSSHAgent(path string) (net.Conn, error) {
 8 | 	return net.Dial("unix", path)
 9 | }
10 | 


--------------------------------------------------------------------------------
/pkg/batch/static_discoverer.go:
--------------------------------------------------------------------------------
 1 | package batch
 2 | 
 3 | type StaticBatchDiscoverer struct {
 4 | 	Machines []string
 5 | }
 6 | 
 7 | func (d *StaticBatchDiscoverer) Discover() ([]string, error) {
 8 | 	return d.Machines, nil
 9 | }
10 | 


--------------------------------------------------------------------------------
/pkg/tools/aadssh/ssh_agent_win.go:
--------------------------------------------------------------------------------
 1 | //go:build windows
 2 | 
 3 | package aadssh
 4 | 
 5 | import (
 6 | 	"net"
 7 | 
 8 | 	"github.com/Microsoft/go-winio"
 9 | )
10 | 
// dialSSHAgent connects to the SSH agent at path by calling winio.DialPipe.
// The second argument to DialPipe is passed as nil.
func dialSSHAgent(path string) (net.Conn, error) {
	return winio.DialPipe(path, nil)
}
14 | 


--------------------------------------------------------------------------------
/cmd/pause.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)
 8 | 
 9 | func pause() {
10 | 	c := make(chan os.Signal, 1)
11 | 	signal.Notify(c, os.Interrupt)
12 | 	signal.Notify(c, os.Kill)
13 | 	s := <-c
14 | 	fmt.Printf("Shutting down, got signal: %s", s)
15 | }
16 | 


--------------------------------------------------------------------------------
/pkg/formatters/formatter.go:
--------------------------------------------------------------------------------
 1 | package formatters
 2 | 
 3 | import (
 4 | 	"io"
 5 | 
 6 | 	"github.com/Azure/kdebug/pkg/base"
 7 | 	"github.com/Azure/kdebug/pkg/batch"
 8 | )
 9 | 
// Formatter writes check results to an io.Writer in some output format.
type Formatter interface {
	// WriteResults writes the given check results to the writer.
	WriteResults(io.Writer, []*base.CheckResult) error
	// WriteBatchResults writes the given batch results to the writer.
	WriteBatchResults(io.Writer, []*batch.BatchResult) error
}
14 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Binaries for programs and plugins
 2 | *.exe
 3 | *.exe~
 4 | *.dll
 5 | *.so
 6 | *.dylib
 7 | 
 8 | # Test binary, built with `go test -c`
 9 | *.test
10 | 
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 | 
14 | # Dependency directories
15 | vendor/
16 | 
17 | # Output directory
18 | bin/
19 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | build:
 2 | 	CGO_ENABLED=0 go build -o bin/kdebug github.com/Azure/kdebug/cmd
 3 | 	CGO_ENABLED=0 go build -o bin/run-as-host github.com/Azure/kdebug/cmd/run-as-host
 4 | 
 5 | build-win:
 6 | 	CGO_ENABLED=0 GOOS=windows go build -o bin/kdebug.exe github.com/Azure/kdebug/cmd
 7 | 
 8 | test:
 9 | 	CGO_ENABLED=0 go test -v github.com/Azure/kdebug/...
10 | 


--------------------------------------------------------------------------------
/pkg/env/linux.go:
--------------------------------------------------------------------------------
 1 | //go:build linux
 2 | 
 3 | package env
 4 | 
 5 | import (
 6 | 	"os"
 7 | 	"runtime"
 8 | 	"strings"
 9 | 
10 | 	"github.com/zcalusic/sysinfo"
11 | )
12 | 
13 | func getLinuxFlags() []string {
14 | 	var si sysinfo.SysInfo
15 | 	si.GetSysInfo()
16 | 	flags := []string{
17 | 		runtime.GOOS,
18 | 		strings.ToLower(si.OS.Vendor),
19 | 	}
20 | 
21 | 	if os.Geteuid() == 0 {
22 | 		flags = append(flags, "root")
23 | 	}
24 | 
25 | 	return flags
26 | }
27 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 


--------------------------------------------------------------------------------
/pkg/formatters/json.go:
--------------------------------------------------------------------------------
 1 | package formatters
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 	"io"
 6 | 
 7 | 	"github.com/Azure/kdebug/pkg/base"
 8 | 	"github.com/Azure/kdebug/pkg/batch"
 9 | )
10 | 
11 | type JsonFormatter struct{}
12 | 
13 | func (f *JsonFormatter) WriteResults(w io.Writer, results []*base.CheckResult) error {
14 | 	enc := json.NewEncoder(w)
15 | 	enc.SetIndent("", "    ")
16 | 	return enc.Encode(results)
17 | }
18 | 
19 | func (f *JsonFormatter) WriteBatchResults(w io.Writer, results []*batch.BatchResult) error {
20 | 	// TODO
21 | 	return nil
22 | }
23 | 


--------------------------------------------------------------------------------
/.github/workflows/go.yml:
--------------------------------------------------------------------------------
 1 | name: Go
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]
 6 |   pull_request:
 7 |     branches: [ main ]
 8 | 
 9 | jobs:
10 | 
11 |   build:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |     - uses: actions/checkout@v2
15 | 
16 |     - name: Set up Go
17 |       uses: actions/setup-go@v2
18 |       with:
19 |         go-version: "1.20"
20 | 
21 |     - name: Build
22 |       run: make build
23 | 
24 |     - name: Test
25 |       run: make test
26 | 
27 |     - name: Upload
28 |       uses: actions/upload-artifact@v3
29 |       with:
30 |         name: kdebug
31 |         path: bin/kdebug
32 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Release
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [ created ]
 6 | 
 7 | jobs:
 8 | 
 9 |   build:
10 |     runs-on: ubuntu-latest
11 |     permissions:
12 |       contents: write
13 |     steps:
14 |     - uses: actions/checkout@v2
15 | 
16 |     - name: Set up Go
17 |       uses: actions/setup-go@v2
18 |       with:
19 |         go-version: "1.20"
20 | 
21 |     - name: Build
22 |       run: make build
23 | 
24 |     - name: Test
25 |       run: make test
26 | 
27 |     - name: Release
28 |       uses: softprops/action-gh-release@v1
29 |       with:
30 |         files: bin/kdebug
31 |         generate_release_notes: true
32 | 
33 | 


--------------------------------------------------------------------------------
/pkg/env/env.go:
--------------------------------------------------------------------------------
 1 | package env
 2 | 
// Environment answers questions about where the program is running,
// expressed as a set of string flags.
type Environment interface {
	// HasFlag reports whether the given flag is set for this environment.
	HasFlag(flag string) bool
}
 6 | 
 7 | type StaticEnvironment struct {
 8 | 	Flags []string
 9 | }
10 | 
11 | func (e *StaticEnvironment) HasFlag(flag string) bool {
12 | 	for _, f := range e.Flags {
13 | 		if flag == f {
14 | 			return true
15 | 		}
16 | 	}
17 | 	return false
18 | }
19 | 
20 | func GetEnvironment() Environment {
21 | 	return &StaticEnvironment{
22 | 		Flags: getFlags(),
23 | 	}
24 | }
25 | 
26 | func getFlags() []string {
27 | 	flags := []string{}
28 | 	flags = append(flags, getLinuxFlags()...)
29 | 	flags = append(flags, getAzureFlags()...)
30 | 	flags = append(flags, getK8sFlags()...)
31 | 
32 | 	return flags
33 | }
34 | 


--------------------------------------------------------------------------------
/pkg/batch/batch.go:
--------------------------------------------------------------------------------
 1 | package batch
 2 | 
 3 | import "github.com/Azure/kdebug/pkg/base"
 4 | 
// BatchOptions is the input passed to BatchExecutor.Execute.
type BatchOptions struct {
	Machines    []string
	Checkers    []string
	Concurrency int
	Reporter    BatchReportor
}

// batchTask pairs a single machine with the list of checkers for it.
type batchTask struct {
	Machine  string
	Checkers []string
}

// BatchResult carries, for one machine, either an error or the check
// results collected for it.
type BatchResult struct {
	Machine      string
	Error        error
	CheckResults []*base.CheckResult
}

// BatchExecutor runs a batch described by BatchOptions and returns the
// per-machine results.
type BatchExecutor interface {
	Execute(opts *BatchOptions) ([]*BatchResult, error)
}

// BatchReportor receives batch results one at a time through OnResult.
type BatchReportor interface {
	OnResult(result *BatchResult)
}

// BatchDiscoverer produces a list of machine names.
type BatchDiscoverer interface {
	Discover() ([]string, error)
}
34 | 


--------------------------------------------------------------------------------
/pkg/batch/file_discoverer_test.go:
--------------------------------------------------------------------------------
 1 | package batch
 2 | 
 3 | import (
 4 | 	"io/ioutil"
 5 | 	"os"
 6 | 	"reflect"
 7 | 	"testing"
 8 | )
 9 | 
10 | func TestFileBatchDiscoverer(t *testing.T) {
11 | 	f, err := ioutil.TempFile("", "batch-discover")
12 | 	if err != nil {
13 | 		t.Errorf("Fail to create temp file")
14 | 	}
15 | 	defer os.Remove(f.Name())
16 | 	if _, err = f.Write([]byte("m1\nm2")); err != nil {
17 | 		t.Errorf("Fail to write temp file")
18 | 	}
19 | 	f.Close()
20 | 
21 | 	d := &FileBatchDiscoverer{Path: f.Name()}
22 | 	machines, err := d.Discover()
23 | 	if err != nil {
24 | 		t.Errorf("Expect no error but got: %+v", err)
25 | 	}
26 | 	if !reflect.DeepEqual(machines, []string{"m1", "m2"}) {
27 | 		t.Errorf("Discovered machines list is not correct: %+v", machines)
28 | 	}
29 | }
30 | 


--------------------------------------------------------------------------------
/pkg/tools/registry.go:
--------------------------------------------------------------------------------
 1 | package tools
 2 | 
 3 | import (
 4 | 	"sort"
 5 | 
 6 | 	"github.com/Azure/kdebug/pkg/tools/aadssh"
 7 | 	"github.com/Azure/kdebug/pkg/tools/netexec"
 8 | 	"github.com/Azure/kdebug/pkg/tools/tcpdump"
 9 | 	"github.com/Azure/kdebug/pkg/tools/upgradeinspector"
10 | 	"github.com/Azure/kdebug/pkg/tools/vmrebootdetector"
11 | )
12 | 
// allTools maps each tool name to a Tool instance created by that tool
// package's New function.
var allTools = map[string]Tool{
	"tcpdump":         tcpdump.New(),
	"vmrebootinspect": vmrebootdetector.New(),
	"upgradesinspect": upgradeinspector.New(),
	"aadssh":          aadssh.New(),
	"netexec":         netexec.New(),
}
20 | 
21 | func ListAllToolNames() []string {
22 | 	names := make([]string, 0, len(allTools))
23 | 	for n := range allTools {
24 | 		names = append(names, n)
25 | 	}
26 | 	sort.Strings(names)
27 | 	return names
28 | }
29 | 


--------------------------------------------------------------------------------
/pkg/checkers/dummy/dummy.go:
--------------------------------------------------------------------------------
 1 | package dummy
 2 | 
 3 | import (
 4 | 	"os"
 5 | 
 6 | 	"github.com/Azure/kdebug/pkg/base"
 7 | )
 8 | 
// DummyChecker is a checker with no state of its own.
type DummyChecker struct {
}

// okResult is the result with no error set.
var okResult = base.CheckResult{
	Checker: "Dummy",
}

// failResult is the canned failure result, including a recommendation
// pointing at the KDEBUG_DUMMY_FAIL environment variable.
var failResult = base.CheckResult{
	Checker:     "Dummy",
	Error:       "Dummy failure",
	Description: "This is a dummy failure",
	Recommendations: []string{
		"Remove environment variable `KDEBUG_DUMMY_FAIL`.",
	},
}
24 | 
// Name returns the checker's name, "Dummy".
func (c *DummyChecker) Name() string {
	return "Dummy"
}
28 | 
29 | func (c *DummyChecker) Check(_ *base.CheckContext) ([]*base.CheckResult, error) {
30 | 	if os.Getenv("KDEBUG_DUMMY_FAIL") == "1" {
31 | 		return []*base.CheckResult{&failResult}, nil
32 | 	} else {
33 | 		return []*base.CheckResult{&okResult}, nil
34 | 	}
35 | }
36 | 


--------------------------------------------------------------------------------
/pkg/tools/tool.go:
--------------------------------------------------------------------------------
 1 | package tools
 2 | 
 3 | import (
 4 | 	"errors"
 5 | 
 6 | 	"github.com/Azure/kdebug/pkg/base"
 7 | )
 8 | 
// Tool is the interface every registered tool implements.
type Tool interface {
	// Name returns the tool's name.
	Name() string
	// ParseArgs receives the tool context and the raw argument list.
	ParseArgs(*base.ToolContext, []string) error
	// Run executes the tool with the given context.
	Run(*base.ToolContext) error
}
14 | 
15 | func getTool(name string) (Tool, error) {
16 | 	if tool, ok := allTools[name]; ok {
17 | 		return tool, nil
18 | 	} else {
19 | 		return nil, errors.New("Unknown tool: " + name)
20 | 	}
21 | }
22 | 
23 | func ParseArgs(ctx *base.ToolContext, name string, args []string) error {
24 | 	tool, err := getTool(name)
25 | 	if err != nil {
26 | 		return err
27 | 	}
28 | 	return tool.ParseArgs(ctx, args)
29 | }
30 | 
31 | func Run(ctx *base.ToolContext, name string) error {
32 | 	tool, err := getTool(name)
33 | 	if err != nil {
34 | 		return err
35 | 	}
36 | 	return tool.Run(ctx)
37 | }
38 | 


--------------------------------------------------------------------------------
/pkg/tools/vmrebootdetector/vmrebootdetector_test.go:
--------------------------------------------------------------------------------
 1 | package vmrebootdetector
 2 | 
 3 | import (
 4 | 	"strings"
 5 | 	"testing"
 6 | 
 7 | 	"github.com/Azure/kdebug/pkg/base"
 8 | )
 9 | 
10 | func TestReboot(t *testing.T) {
11 | 	tool := Tool{}
12 | 	err := tool.Run(&base.ToolContext{
13 | 		Config: &Config{},
14 | 	})
15 | 	if err != nil {
16 | 		t.Error(err)
17 | 	}
18 | }
19 | 
20 | func TestRebootParser(t *testing.T) {
21 | 	lastContent := "reboot   system boot  5.4.0-1074-azure 2022-05-27T04:51:43+0000   still running\nreboot   system boot  5.4.0-1074-azure 2022-04-04T07:49:09+0000 - 2022-04-20T17:12:20+0000 (16+09:23)\n\nwtmp begins 2022-04-04T07:47:27+0000\n"
22 | 	tool := Tool{}
23 | 	result := tool.parseResult(lastContent)
24 | 	if !strings.Contains(result, "Detect") {
25 | 		t.Error("VMRebootCheck failed to parse reboot result")
26 | 	}
27 | }
28 | 


--------------------------------------------------------------------------------
/pkg/tools/aadssh/ssh_agent.go:
--------------------------------------------------------------------------------
 1 | package aadssh
 2 | 
 3 | import (
 4 | 	"crypto/rsa"
 5 | 	"time"
 6 | 
 7 | 	"golang.org/x/crypto/ssh"
 8 | 	"golang.org/x/crypto/ssh/agent"
 9 | )
10 | 
11 | // addSSHKeyToAgent adds SSH key to SSH agent
12 | // sockPath can be a unix socket on Unix or a named pipe on Windows
13 | func addSSHKeyToAgent(
14 | 	sockPath string,
15 | 	sshPrivKey *rsa.PrivateKey,
16 | 	sshCert *ssh.Certificate) error {
17 | 
18 | 	conn, err := dialSSHAgent(sockPath)
19 | 	if err != nil {
20 | 		return err
21 | 	}
22 | 	defer conn.Close()
23 | 
24 | 	lifeTimeSecs := uint32(uint64(time.Now().Unix()) - sshCert.ValidBefore)
25 | 
26 | 	client := agent.NewClient(conn)
27 | 	return client.Add(agent.AddedKey{
28 | 		Comment:      "AAD SSH Key",
29 | 		PrivateKey:   sshPrivKey,
30 | 		Certificate:  sshCert,
31 | 		LifetimeSecs: lifeTimeSecs,
32 | 	})
33 | }
34 | 


--------------------------------------------------------------------------------
/pkg/checkers/checker.go:
--------------------------------------------------------------------------------
 1 | package checker
 2 | 
 3 | import (
 4 | 	"errors"
 5 | 
 6 | 	log "github.com/sirupsen/logrus"
 7 | 
 8 | 	"github.com/Azure/kdebug/pkg/base"
 9 | )
10 | 
// Checker is the interface every registered checker implements.
type Checker interface {
	// Name returns the checker's name.
	Name() string
	// Check runs the checker with the given context and returns its results.
	Check(*base.CheckContext) ([]*base.CheckResult, error)
}
15 | 
16 | func Check(ctx *base.CheckContext, checkerNames []string) ([]*base.CheckResult, error) {
17 | 	checkers := make([]Checker, 0, len(checkerNames))
18 | 
19 | 	for _, name := range checkerNames {
20 | 		if checker, ok := allCheckers[name]; ok {
21 | 			checkers = append(checkers, checker)
22 | 		} else {
23 | 			return nil, errors.New("Unknown checker: " + name)
24 | 		}
25 | 	}
26 | 	var results []*base.CheckResult
27 | 	for _, checker := range checkers {
28 | 		r, err := checker.Check(ctx)
29 | 		if err != nil {
30 | 			log.Warnf("Checker(%s): %s", checker.Name(), err)
31 | 		}
32 | 		results = append(results, r...)
33 | 	}
34 | 
35 | 	return results, nil
36 | }
37 | 


--------------------------------------------------------------------------------
/pkg/batch/file_discoverer.go:
--------------------------------------------------------------------------------
 1 | package batch
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"fmt"
 6 | 	"os"
 7 | 	"strings"
 8 | )
 9 | 
// FileBatchDiscoverer reads a list of machines from a text file, one machine
// per line.
type FileBatchDiscoverer struct {
	// Path is the file to read. The special value "-" means standard input.
	Path string
}

// Discover reads the machine list from d.Path (or from stdin when Path is
// "-") and returns one entry per non-blank line, with surrounding whitespace
// trimmed.
//
// Blank and whitespace-only lines are skipped so that trailing newlines or
// spacing in the file do not produce empty machine names. Open and read
// failures are returned wrapped, so callers can inspect the cause with
// errors.Is / errors.As.
func (d *FileBatchDiscoverer) Discover() ([]string, error) {
	var file *os.File
	var err error

	if d.Path == "-" {
		// Stdin is owned by the process; it is deliberately not closed here.
		file = os.Stdin
	} else {
		file, err = os.Open(d.Path)
		if err != nil {
			return nil, fmt.Errorf("Fail to open machines list file %s due to: %w",
				d.Path, err)
		}
		defer file.Close()
	}

	var machines []string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}
		machines = append(machines, line)
	}

	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("Fail to read machines list file %s due to: %w",
			d.Path, err)
	}

	return machines, nil
}
44 | 


--------------------------------------------------------------------------------
/pkg/checkers/tcpping/tcpping_test.go:
--------------------------------------------------------------------------------
 1 | package tcpping
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"github.com/Azure/kdebug/pkg/base"
 6 | 	"math/rand"
 7 | 	"net"
 8 | 	"strings"
 9 | 	"testing"
10 | )
11 | 
12 | func TestCheck(t *testing.T) {
13 | 	checker := &TCPChecker{
14 | 		dialer: net.Dialer{
15 | 			Timeout: TimeOut,
16 | 		},
17 | 		targets: []pingEndpoint{
18 | 			{
19 | 				ServerAddress: "fooTest",
20 | 				Name:          fmt.Sprintf("%d.kdebug:80", rand.Int()),
21 | 			},
22 | 		},
23 | 	}
24 | 	context := &base.CheckContext{}
25 | 	results, err := checker.Check(context)
26 | 	if err != nil {
27 | 		t.Errorf("check fail %v\n", err)
28 | 	}
29 | 	for _, result := range results {
30 | 		if strings.Contains(result.Description, "fooTest") {
31 | 			if result.Error == "" {
32 | 				t.Errorf("fooTest didn't fail")
33 | 			}
34 | 		}
35 | 		if strings.Contains(result.Description, "Google") {
36 | 			if result.Error != "" {
37 | 				t.Errorf("google test fail %v\n", result.Error)
38 | 			}
39 | 		}
40 | 	}
41 | }
42 | 


--------------------------------------------------------------------------------
/pkg/base/models.go:
--------------------------------------------------------------------------------
 1 | package base
 2 | 
 3 | import (
 4 | 	"io"
 5 | 
 6 | 	"k8s.io/cli-runtime/pkg/genericclioptions"
 7 | 	"k8s.io/client-go/kubernetes"
 8 | 
 9 | 	"github.com/Azure/kdebug/pkg/env"
10 | )
11 | 
// CheckContext is the context value passed to every checker's Check method.
type CheckContext struct {
	// TODO: Add user input here
	// Pod names a pod by name and namespace.
	Pod struct {
		Name      string
		Namespace string
	}

	// TODO: Add shared dependencies here, for example, kube-client
	Environment env.Environment
	KubeClient  *kubernetes.Clientset
	Output      io.Writer
}

// ToolContext is the context value passed to a tool's ParseArgs and Run
// methods.
type ToolContext struct {
	Args []string
	// Config is untyped here; presumably each tool stores its own config
	// type in it — verify against the individual tools.
	Config         interface{}
	Environment    env.Environment
	KubeConfigFlag *genericclioptions.ConfigFlags
}
31 | 
// CheckResult is a single finding produced by a checker.
type CheckResult struct {
	// Checker is the name of the checker that produced the result.
	Checker string
	// Error is empty for a passing result; see Ok.
	Error           string
	Description     string
	Recommendations []string
	Logs            []string
	HelpLinks       []string
}

// Ok reports whether the result is a success, i.e. carries no error text.
func (r *CheckResult) Ok() bool {
	return len(r.Error) == 0
}
44 | 


--------------------------------------------------------------------------------
/pkg/checkers/kube/objectsize/objectsize_test.go:
--------------------------------------------------------------------------------
 1 | package dns
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	v1 "k8s.io/api/core/v1"
 7 | )
 8 | 
 9 | func TestCheckObjectSize_OK(t *testing.T) {
10 | 	cm := v1.ConfigMap{
11 | 		BinaryData: map[string][]byte{
12 | 			"key": make([]byte, 100),
13 | 		},
14 | 	}
15 | 	checker := New()
16 | 	result := checker.checkObjectSize("ConfigMap", "default", "cm", cm)
17 | 	if !result.Ok() {
18 | 		t.Errorf("Expect ok result but got %+v", result)
19 | 	}
20 | }
21 | 
22 | func TestCheckObjectSize_Warn(t *testing.T) {
23 | 	cm := v1.ConfigMap{
24 | 		BinaryData: map[string][]byte{
25 | 			"key": make([]byte, WarnSizeThreshold+1),
26 | 		},
27 | 	}
28 | 	checker := New()
29 | 	result := checker.checkObjectSize("ConfigMap", "default", "cm", cm)
30 | 	if result.Ok() {
31 | 		t.Errorf("Expect non ok result but got %+v", result)
32 | 	}
33 | 	if result.Error == "" || result.Description == "" || len(result.Recommendations) == 0 {
34 | 		t.Errorf("Expect non empty result but got %+v", result)
35 | 	}
36 | }
37 | 


--------------------------------------------------------------------------------
/pkg/env/k8s.go:
--------------------------------------------------------------------------------
 1 | package env
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"os"
 6 | 	"strings"
 7 | 
 8 | 	"github.com/shirou/gopsutil/v3/process"
 9 | 	log "github.com/sirupsen/logrus"
10 | )
11 | 
// KubernetesServiceHost is the name of the environment variable that inK8s
// looks for when deciding whether the process runs inside a pod.
const KubernetesServiceHost = "KUBERNETES_SERVICE_HOST"
13 | 
14 | func getK8sFlags() []string {
15 | 	var flags []string
16 | 	if inK8s() {
17 | 		flags = append(flags, "k8s")
18 | 	}
19 | 	return flags
20 | }
21 | 
22 | func inK8s() bool {
23 | 	//check if in a pod
24 | 	for _, e := range os.Environ() {
25 | 		if strings.Contains(e, KubernetesServiceHost) {
26 | 			return true
27 | 		}
28 | 	}
29 | 	// check in a host vm
30 | 	processes, err := process.Processes()
31 | 	if err != nil {
32 | 		log.Warn(fmt.Sprintf("List process error %v\n", err))
33 | 		return false
34 | 	} else {
35 | 		for _, proc := range processes {
36 | 			name, err := proc.Name()
37 | 			if err != nil {
38 | 				log.Warn(fmt.Sprintf("List process error %v. Skip in-cluster tcp checking\n", err))
39 | 				return false
40 | 			}
41 | 			if name == "kubelet" {
42 | 				return true
43 | 			}
44 | 		}
45 | 	}
46 | 	return false
47 | }
48 | 


--------------------------------------------------------------------------------
/pkg/batch/kube_discoverer_test.go:
--------------------------------------------------------------------------------
 1 | package batch
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	corev1 "k8s.io/api/core/v1"
 7 | )
 8 | 
 9 | func TestMatchNode(t *testing.T) {
10 | 	d := &KubeBatchDiscoverer{}
11 | 	node := &corev1.Node{}
12 | 	if !d.matchNode(node) {
13 | 		t.Errorf("Expect matchNode == true when not specifying unready but got false")
14 | 	}
15 | 
16 | 	d = &KubeBatchDiscoverer{unready: true}
17 | 	node = &corev1.Node{
18 | 		Status: corev1.NodeStatus{
19 | 			Conditions: []corev1.NodeCondition{
20 | 				{
21 | 					Type:   corev1.NodeReady,
22 | 					Status: corev1.ConditionFalse,
23 | 				},
24 | 			},
25 | 		},
26 | 	}
27 | 	if !d.matchNode(node) {
28 | 		t.Errorf("Expect matchNode == true when specifying unready and node is unready but got false")
29 | 	}
30 | 
31 | 	node = &corev1.Node{
32 | 		Status: corev1.NodeStatus{
33 | 			Conditions: []corev1.NodeCondition{
34 | 				{
35 | 					Type:   corev1.NodeReady,
36 | 					Status: corev1.ConditionTrue,
37 | 				},
38 | 			},
39 | 		},
40 | 	}
41 | 	if d.matchNode(node) {
42 | 		t.Errorf("Expect matchNode == false when specifying unready and node is ready but got true")
43 | 	}
44 | }
45 | 


--------------------------------------------------------------------------------
/pkg/env/azure.go:
--------------------------------------------------------------------------------
 1 | package env
 2 | 
 3 | import (
 4 | 	"net/http"
 5 | 	"os"
 6 | 	"time"
 7 | )
 8 | 
const (
	// AzureIMDSEndpoint is the base URL of the metadata endpoint that
	// getAzureFlags queries to detect an Azure VM.
	AzureIMDSEndpoint = "http://169.254.169.254/metadata"
)
12 | 
13 | func getAzureFlags() []string {
14 | 	// IMDS should exist on Azure VMs
15 | 	client := &http.Client{
16 | 		Timeout: time.Second,
17 | 	}
18 | 	req, _ := http.NewRequest("GET", AzureIMDSEndpoint+"/instance?api-version=2017-03-01", nil)
19 | 	req.Header.Set("Metadata", "true")
20 | 	resp, err := client.Do(req)
21 | 	if err != nil {
22 | 		// Not on Azure
23 | 		return []string{}
24 | 	}
25 | 	defer resp.Body.Close()
26 | 	if resp.StatusCode != http.StatusOK {
27 | 		// Not 200 status, might not be on Azure
28 | 		return []string{}
29 | 	}
30 | 
31 | 	// If we are on Azure, check if it's AKS
32 | 	return append([]string{"azure"}, getAksFlags()...)
33 | }
34 | 
// getAksFlags returns ["aks"] when /etc/kubernetes exists and is a
// directory, which is taken as the sign of an AKS node. In every other case
// it returns an empty, non-nil slice.
func getAksFlags() []string {
	info, err := os.Stat("/etc/kubernetes")
	if err != nil || !info.IsDir() {
		// Missing, unreadable, or not a directory.
		return []string{}
	}
	return []string{"aks"}
}
47 | 


--------------------------------------------------------------------------------
/pkg/checkers/systemload/systemload_test.go:
--------------------------------------------------------------------------------
 1 | package systemload
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
 7 | func TestMemPercentage_Success(t *testing.T) {
 8 | 	usage := getMemPercentage(30, 100)
 9 | 	if usage != 70 {
10 | 		t.Errorf("Expect the mem percentage is 70 but got %f", usage)
11 | 	}
12 | }
13 | 
14 | func TestSystemCPUPercentage_Success(t *testing.T) {
15 | 	usage := getSystemCPUPercentage(2000, 5000)
16 | 	if usage != 60 {
17 | 		t.Errorf("Expect the cpu percentage is 60 but got %f", usage)
18 | 	}
19 | }
20 | 
21 | func TestProcessCPUPercentageAsGlobal_Success(t *testing.T) {
22 | 	usage := getProcessCPUPercentageAsGlobal(50, 5000)
23 | 	if usage != 1 {
24 | 		t.Errorf("Expect the process cpu percentage is 1 but got %f", usage)
25 | 	}
26 | }
27 | 
28 | // TestProcessCPUPercentageAsSingleCore_Success expects 200 for inputs
29 | // (400, 2) and 10 for inputs (100, 10).
30 | func TestProcessCPUPercentageAsSingleCore_Success(t *testing.T) {
31 | 	if got := getProcessCPUPercentageAsSingleCore(400, 2); got != 200 {
32 | 		t.Errorf("Expect the process cpu percentage is 200 but got %f", got)
33 | 	}
34 | 
35 | 	if got := getProcessCPUPercentageAsSingleCore(100, 10); got != 10 {
36 | 		t.Errorf("Expect the process cpu percentage is 10 but got %f", got)
37 | 	}
38 | }
39 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE
22 | 


--------------------------------------------------------------------------------
/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
 1 | # See here for image contents: https://github.com/microsoft/vscode-dev-containers/tree/v0.234.0/containers/go/.devcontainer/base.Dockerfile
 2 | 
 3 | # [Choice] Go version (use -bullseye variants on local arm64/Apple Silicon): 1, 1.16, 1.17, 1.18, 1-bullseye, 1.16-bullseye, 1.17-bullseye, 1.18-bullseye, 1-buster, 1.16-buster, 1.17-buster, 1.18-buster
 4 | ARG VARIANT="1.18-bullseye"
 5 | FROM mcr.microsoft.com/vscode/devcontainers/go:0-${VARIANT}
 6 | 
 7 | # [Choice] Node.js version: none, lts/*, 16, 14, 12, 10
 8 | ARG NODE_VERSION="none"
 9 | RUN if [ "${NODE_VERSION}" != "none" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi
10 | 
11 | # [Optional] Uncomment this section to install additional OS packages.
12 | # RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
13 | #     && apt-get -y install --no-install-recommends <your-package-list-here>
14 | 
15 | # [Optional] Uncomment the next lines to use go get to install anything else you need
16 | # USER vscode
17 | # RUN go get -x <your-dependency-or-tool>
18 | 
19 | # [Optional] Uncomment this line to install global node packages.
20 | # RUN su vscode -c "source /usr/local/share/nvm/nvm.sh && npm install -g <your-package-here>" 2>&1
21 | 


--------------------------------------------------------------------------------
/pkg/tools/upgradeinspector/upgradeinspector_test.go:
--------------------------------------------------------------------------------
 1 | package upgradeinspector
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"testing"
 6 | 	"time"
 7 | 
 8 | 	"github.com/Azure/kdebug/pkg/base"
 9 | )
10 | 
11 | // TestUpgradeParser_Success feeds parseResult two "upgrade" log lines dated
12 | // one day ago and expects a table with a header row followed by one row per
13 | // logged package.
14 | func TestUpgradeParser_Success(t *testing.T) {
15 | 	tool := New()
16 | 	tool.parseArgument(&base.ToolContext{
17 | 		Config: &Config{CheckDays: 7, RecordLimit: 10},
18 | 	})
19 | 
20 | 	// Both log entries carry yesterday's date.
21 | 	day := time.Now().AddDate(0, 0, -1).Format("2006-01-02")
22 | 
23 | 	logs := fmt.Sprintf("%s 17:12:13 upgrade libubsan1:amd64 12-20220319-1ubuntu1 12.1.0-2ubuntu1~22.04\n", day)
24 | 	logs += fmt.Sprintf("%s 17:12:13 upgrade gcc-12-base:amd64 12-20220319-1ubuntu1 12.1.0-2ubuntu1~22.04\n", day)
25 | 
26 | 	// Header row first, then one formatted row for each package above.
27 | 	want := fmt.Sprintf("\n%-19s\t%-40s\t%-30s\t%-30s\n\n", "Timestamp", "Package", "OldVer", "NewVer")
28 | 	for _, pkg := range []string{"libubsan1:amd64", "gcc-12-base:amd64"} {
29 | 		want += fmt.Sprintf("%v-%v\t%-40s\t%-30s\t%-30s\n", day, "17:12:13", pkg, "12-20220319-1ubuntu1", "12.1.0-2ubuntu1~22.04")
30 | 	}
31 | 
32 | 	if got := tool.parseResult(logs); got != want {
33 | 		t.Errorf("UpgradeInspectTool parser output is expected to be\n%s\n, but got\n%s\n", want, got)
34 | 	}
35 | }
36 | 


--------------------------------------------------------------------------------
/pkg/checkers/podschedule/podschedule_test.go:
--------------------------------------------------------------------------------
 1 | package podschedule
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	v1 "k8s.io/api/core/v1"
 7 | )
 8 | 
 9 | // TestPodSchedule_Single_Panic expects checkPodsScheduleInReplicaSet to
10 | // panic when it is handed a list containing a single pod.
11 | func TestPodSchedule_Single_Panic(t *testing.T) {
12 | 	// Registered first; it fails the test if nothing panicked by the time
13 | 	// the function returns.
14 | 	defer func() {
15 | 		if r := recover(); r == nil {
16 | 			t.Errorf("Expect panic")
17 | 		}
18 | 	}()
19 | 
20 | 	pods := []v1.Pod{
21 | 		{Spec: v1.PodSpec{NodeName: "a"}},
22 | 	}
23 | 
24 | 	checker := New()
25 | 	checker.checkPodsScheduleInReplicaSet("rc1", pods)
26 | }
27 | 
28 | // TestPodSchedule_DifferentName_OK expects an OK result when the two pods
29 | // passed for replica set "rc1" carry different node names.
30 | func TestPodSchedule_DifferentName_OK(t *testing.T) {
31 | 	// One pod per node name; only Spec.NodeName is populated.
32 | 	nodeNames := []string{"a", "b"}
33 | 	pods := make([]v1.Pod, 0, len(nodeNames))
34 | 	for _, name := range nodeNames {
35 | 		pods = append(pods, v1.Pod{
36 | 			Spec: v1.PodSpec{
37 | 				NodeName: name,
38 | 			},
39 | 		})
40 | 	}
41 | 
42 | 	checker := New()
43 | 	outcome := checker.checkPodsScheduleInReplicaSet("rc1", pods)
44 | 	if !outcome.Ok() {
45 | 		t.Errorf("Expect ok result but got %+v", outcome)
46 | 	}
47 | }
48 | 
49 | // TestPodSchedule_Failed expects a non-OK result when both pods passed for
50 | // replica set "rc1" carry the same node name.
51 | func TestPodSchedule_Failed(t *testing.T) {
52 | 	// Two identical pods, both reporting node "a".
53 | 	onNodeA := v1.Pod{
54 | 		Spec: v1.PodSpec{
55 | 			NodeName: "a",
56 | 		},
57 | 	}
58 | 	pods := []v1.Pod{onNodeA, onNodeA}
59 | 
60 | 	checker := New()
61 | 	outcome := checker.checkPodsScheduleInReplicaSet("rc1", pods)
62 | 
63 | 	// Anything other than an OK result is what this test wants.
64 | 	if !outcome.Ok() {
65 | 		return
66 | 	}
67 | 	t.Errorf("Expect failed result but got %+v", outcome)
68 | }
69 | 


--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
 1 | # TODO: The maintainer of this repo has not yet edited this file
 2 | 
 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
 4 | 
 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help.
 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport).
 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide.
 8 | 
 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10 | 
11 | # Support
12 | 
13 | ## How to file issues and get help  
14 | 
15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 
16 | issues before filing new issues to avoid duplicates.  For new issues, file your bug or 
17 | feature request as a new Issue.
18 | 
19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 
20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22 | 
23 | ## Microsoft Support Policy  
24 | 
25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
26 | 


--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.234.0/containers/go
 3 | {
 4 | 	"name": "Go",
 5 | 	"build": {
 6 | 		"dockerfile": "Dockerfile",
 7 | 		"args": {
 8 | 			// Update the VARIANT arg to pick a version of Go: 1, 1.18, 1.17
 9 | 			// Append -bullseye or -buster to pin to an OS version.
10 | 			// Use -bullseye variants on local arm64/Apple Silicon.
11 | 			"VARIANT": "1-bullseye",
12 | 			// Options
13 | 			"NODE_VERSION": "lts/*"
14 | 		}
15 | 	},
16 | 	"runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined" ],
17 | 
18 | 	// Set *default* container specific settings.json values on container create.
19 | 	"settings": {
20 | 		"go.toolsManagement.checkForUpdates": "local",
21 | 		"go.useLanguageServer": true,
22 | 		"go.gopath": "/go"
23 | 	},
24 | 
25 | 	// Add the IDs of extensions you want installed when the container is created.
26 | 	"extensions": [
27 | 		"golang.Go"
28 | 	],
29 | 
30 | 	// Use 'forwardPorts' to make a list of ports inside the container available locally.
31 | 	// "forwardPorts": [],
32 | 
33 | 	// Use 'postCreateCommand' to run commands after the container is created.
34 | 	// "postCreateCommand": "go version",
35 | 
36 | 	// Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
37 | 	"remoteUser": "vscode"
38 | }
39 | 


--------------------------------------------------------------------------------
/pkg/formatters/oneline.go:
--------------------------------------------------------------------------------
 1 | package formatters
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"io"
 6 | 	"strings"
 7 | 
 8 | 	"github.com/Azure/kdebug/pkg/base"
 9 | 	"github.com/Azure/kdebug/pkg/batch"
10 | 	"github.com/fatih/color"
11 | 	log "github.com/sirupsen/logrus"
12 | )
13 | 
14 | // OneLineFormatter writes check results as a short pass/fail summary.
15 | type OneLineFormatter struct{}
16 | 
17 | // WriteResults writes a summary of results to w and always returns nil.
18 | // Passing results are listed one by one only when debug logging is enabled.
19 | // Failed checkers are named once each, in the order they first failed.
20 | func (f *OneLineFormatter) WriteResults(w io.Writer, results []*base.CheckResult) error {
21 | 	seen := make(map[string]struct{})
22 | 	var failedCheckers []string
23 | 	failed := 0
24 | 	for _, r := range results {
25 | 		if r.Ok() {
26 | 			if log.IsLevelEnabled(log.DebugLevel) {
27 | 				fmt.Fprintf(w, "[%s] %s\n", r.Checker, r.Description)
28 | 			}
29 | 			continue
30 | 		}
31 | 		failed++
32 | 		// Keep first-seen order: ranging over a map would shuffle the names.
33 | 		if _, dup := seen[r.Checker]; !dup {
34 | 			seen[r.Checker] = struct{}{}
35 | 			failedCheckers = append(failedCheckers, r.Checker)
36 | 		}
37 | 	}
38 | 
39 | 	if failed == 0 {
40 | 		fmt.Fprintf(w, "All %v checks passed!\n", color.GreenString("%d", len(results)))
41 | 		return nil
42 | 	}
43 | 	fmt.Fprintf(w, "%v checks passed, %v failed: %s", color.GreenString("%d", len(results)-failed),
44 | 		color.RedString("%d", failed), strings.Join(failedCheckers, ", "))
45 | 	return nil
46 | }
47 | 
48 | // WriteBatchResults is not implemented for this formatter; it always errors.
49 | func (f *OneLineFormatter) WriteBatchResults(w io.Writer, results []*batch.BatchResult) error {
50 | 	return fmt.Errorf("not implemented: one line formatter for batch results")
51 | }
52 | 


--------------------------------------------------------------------------------
/pkg/tools/tcpdump/tcpdump_test.go:
--------------------------------------------------------------------------------
 1 | package tcpdump
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
 7 | // TestParseIPAndPort_Success checks ParseIPAndPort on "ip:port", "ip", "ip:"
 8 | // and ":port" inputs; a part that is absent is expected to come back empty.
 9 | func TestParseIPAndPort_Success(t *testing.T) {
10 | 	if ip, port := ParseIPAndPort("192.168.1.1:7"); !(ip == "192.168.1.1" && port == "7") {
11 | 		t.Errorf("Parsing 192.168.1.1:7 and expect ip is 192.168.1.1 and port is 7 but got %s and %s", ip, port)
12 | 	}
13 | 
14 | 	if ip, port := ParseIPAndPort("192.168.1.1"); !(ip == "192.168.1.1" && port == "") {
15 | 		t.Errorf("Parsing 192.168.1.1 and expect ip is 192.168.1.1 and no port but got %s and %s", ip, port)
16 | 	}
17 | 
18 | 	// A trailing colon with nothing after it still means "no port".
19 | 	if ip, port := ParseIPAndPort("192.168.1.1:"); !(ip == "192.168.1.1" && port == "") {
20 | 		t.Errorf("Parsing 192.168.1.1: and expect ip is 192.168.1.1 and no port but got %s and %s", ip, port)
21 | 	}
22 | 
23 | 	// A leading colon means "no IP".
24 | 	if ip, port := ParseIPAndPort(":80"); !(ip == "" && port == "80") {
25 | 		t.Errorf("Parsing :80 and expect no ip and port is 80 but got %s and %s", ip, port)
26 | 	}
27 | }
28 | 
29 | func TestGenerateTcpdumpParamerters_Success(t *testing.T) {
30 | 	tcpdumptool := New()
31 | 
32 | 	config := &Config{"192.168.1.1:1", "23.32.10.2:80", ":443", "19920", true}
33 | 	tcpdumptool.ParseParameters(config)
34 | 	parameter := tcpdumptool.GenerateTcpdumpParamerters()
35 | 
36 | 	expected := "-nvvv src 192.168.1.1 and src port 1 and dst 23.32.10.2 and dst port 80 and port 443 and tcp"
37 | 	if parameter != expected {
38 | 		t.Errorf("Generate parameter is expected to be %s but actually %s", expected, parameter)
39 | 	}
40 | }
41 | 


--------------------------------------------------------------------------------
/pkg/tools/aadssh/transport.go:
--------------------------------------------------------------------------------
 1 | package aadssh
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"io"
 6 | 	"io/ioutil"
 7 | 	"net/http"
 8 | 	"net/url"
 9 | 	"strings"
10 | 
11 | 	log "github.com/sirupsen/logrus"
12 | )
13 | 
14 | const (
15 | 	TokenURLSuffix = "/oauth2/v2.0/token"
16 | )
17 | 
18 | // Transport is an HTTP transport that adds additional parameters to AAD token requests.
19 | type Transport struct {
20 | 	// Additional parameter key-value pairs
21 | 	data map[string]string
22 | }
23 | 
24 | // RoundTrip modifies AAD token request
25 | func (t *Transport) RoundTrip(req *http.Request) (*http.Response, error) {
26 | 	log.WithFields(log.Fields{"url": *req.URL}).Debug("MSAL request")
27 | 
28 | 	if strings.HasSuffix(req.URL.Path, TokenURLSuffix) {
29 | 		bodyBuf, err := ioutil.ReadAll(req.Body)
30 | 		if err != nil {
31 | 			return nil, err
32 | 		}
33 | 		defer req.Body.Close()
34 | 
35 | 		log.WithFields(log.Fields{"body": string(bodyBuf)}).Debug("Original request body")
36 | 
37 | 		values, err := url.ParseQuery(string(bodyBuf))
38 | 		if err != nil {
39 | 			return nil, err
40 | 		}
41 | 
42 | 		for k, v := range t.data {
43 | 			values.Add(k, v)
44 | 		}
45 | 
46 | 		bodyString := values.Encode()
47 | 		log.WithFields(log.Fields{"body": bodyString}).Debug("Modified request body")
48 | 
49 | 		bodyStream := strings.NewReader(bodyString)
50 | 		req.ContentLength = bodyStream.Size()
51 | 		req.Header.Set("Content-Length", fmt.Sprintf("%d", bodyStream.Size()))
52 | 		req.Body = io.NopCloser(bodyStream)
53 | 	}
54 | 
55 | 	return http.DefaultTransport.RoundTrip(req)
56 | }
57 | 


--------------------------------------------------------------------------------
/.github/workflows/container.yml:
--------------------------------------------------------------------------------
 1 | name: Container
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: ['main']
 6 |   release:
 7 |     types: [ created ]
 8 | 
 9 | env:
10 |   REGISTRY: ghcr.io
11 |   IMAGE_NAME: ${{ github.repository }}
12 | 
13 | jobs:
14 |   build-and-push-image:
15 |     runs-on: ubuntu-latest
16 |     permissions:
17 |       contents: read
18 |       packages: write
19 | 
20 |     steps:
21 |     - name: Checkout repository
22 |       uses: actions/checkout@v3
23 | 
24 |     - name: Set up Go
25 |       uses: actions/setup-go@v2
26 |       with:
27 |         go-version: "1.20"
28 | 
29 |     - name: Build
30 |       run: make build
31 | 
32 |     - name: Log in container registry
33 |       uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
34 |       with:
35 |         registry: ${{ env.REGISTRY }}
36 |         username: ${{ github.actor }}
37 |         password: ${{ secrets.GITHUB_TOKEN }}
38 | 
39 |     - name: Generate metadata
40 |       id: meta
41 |       uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
42 |       with:
43 |         images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
44 |         tags: |
45 |           type=ref,event=branch
46 |           type=ref,event=tag
47 |           type=sha,prefix=,format=long
48 | 
49 |     - name: Build and push container image
50 |       uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
51 |       with:
52 |         context: .
53 |         push: true
54 |         tags: ${{ steps.meta.outputs.tags }}
55 |         labels: ${{ steps.meta.outputs.labels }}
56 | 


--------------------------------------------------------------------------------
/pkg/checkers/icmp/icmp_test.go:
--------------------------------------------------------------------------------
 1 | package icmpping
 2 | 
 3 | import (
 4 | 	"os"
 5 | 	"strings"
 6 | 	"testing"
 7 | 
 8 | 	"github.com/Azure/kdebug/pkg/base"
 9 | 	"github.com/Azure/kdebug/pkg/env"
10 | )
11 | 
12 | // TestICMPCheckRoot runs the ICMP checker against an invalid address and
13 | // the loopback address. It requires root and is skipped otherwise.
14 | func TestICMPCheckRoot(t *testing.T) {
15 | 	if os.Geteuid() != 0 {
16 | 		t.Skip("Must run with root")
17 | 	}
18 | 
19 | 	checker := ICMPChecker{
20 | 		targets: []pingTarget{
21 | 			{Address: "x.x.x.x"},
22 | 			{Address: "127.0.0.1"},
23 | 		},
24 | 	}
25 | 	ctx := &base.CheckContext{
26 | 		Environment: &env.StaticEnvironment{Flags: []string{"root"}},
27 | 		KubeClient:  nil,
28 | 	}
29 | 
30 | 	results, _ := checker.Check(ctx)
31 | 	for _, res := range results {
32 | 		// "x.x.x.x" is not a valid address, so its ping is expected to fail.
33 | 		if strings.Contains(res.Description, "x.x.x.x") && res.Error == "" {
34 | 			t.Errorf("ping x.x.x.x should fail")
35 | 		}
36 | 		if strings.Contains(res.Description, "127.0.0.1") && res.Error != "" {
37 | 			t.Errorf("ping 127.0.0.1 failed %v\n", res.Error)
38 | 		}
39 | 	}
40 | }
41 | 
42 | // TestICMPCheckNonRoot expects the ICMP checker to return no results when
43 | // the environment lacks the "root" flag. It is skipped when run as root.
44 | func TestICMPCheckNonRoot(t *testing.T) {
45 | 	if os.Geteuid() == 0 {
46 | 		t.Skip("Must run with non-root")
47 | 	}
48 | 
49 | 	checker := ICMPChecker{
50 | 		targets: []pingTarget{
51 | 			{Address: "x.x.x.x"},
52 | 			{Address: "127.0.0.1"},
53 | 		},
54 | 	}
55 | 	ctx := &base.CheckContext{
56 | 		Environment: &env.StaticEnvironment{Flags: []string{}},
57 | 		KubeClient:  nil,
58 | 	}
59 | 	if results, _ := checker.Check(ctx); len(results) != 0 {
60 | 		t.Errorf("icmp checker unexpected results when not in root mode")
61 | 	}
62 | }
63 | 


--------------------------------------------------------------------------------
/pkg/batch/kube_discoverer.go:
--------------------------------------------------------------------------------
 1 | package batch
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"fmt"
 6 | 	"time"
 7 | 
 8 | 	corev1 "k8s.io/api/core/v1"
 9 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
10 | 	"k8s.io/client-go/kubernetes"
11 | )
12 | 
13 | // KubeBatchDiscoverer discovers machines by listing nodes from a Kubernetes API server.
14 | type KubeBatchDiscoverer struct {
15 | 	client        *kubernetes.Clientset
16 | 	labelSelector string // label selector applied to the node list call
17 | 	unready       bool   // when set, nodes whose Ready condition is True are filtered out
18 | }
19 | 
20 | // NewKubeBatchDiscoverer returns a KubeBatchDiscoverer holding the given settings.
21 | func NewKubeBatchDiscoverer(client *kubernetes.Clientset, labelSelector string, unready bool) *KubeBatchDiscoverer {
22 | 	d := new(KubeBatchDiscoverer)
23 | 	d.client, d.labelSelector, d.unready = client, labelSelector, unready
24 | 	return d
25 | }
26 | 
27 | // Discover lists the nodes matching the label selector and returns the names
28 | // of those accepted by matchNode. The API call is limited to 10 seconds. A
29 | // failed call is returned wrapped, so callers can inspect it with errors.Is.
30 | func (d *KubeBatchDiscoverer) Discover() ([]string, error) {
31 | 	if d.client == nil {
32 | 		return nil, fmt.Errorf("Kubernetes client is not initialized")
33 | 	}
34 | 
35 | 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
36 | 	defer cancel()
37 | 	opts := metav1.ListOptions{LabelSelector: d.labelSelector}
38 | 	resp, err := d.client.CoreV1().Nodes().List(ctx, opts)
39 | 	if err != nil {
40 | 		return nil, fmt.Errorf("Fail to list nodes from API server: %w", err)
41 | 	}
42 | 
43 | 	var names []string
44 | 	for i := range resp.Items { // index to avoid copying each Node
45 | 		if node := &resp.Items[i]; d.matchNode(node) {
46 | 			names = append(names, node.ObjectMeta.Name)
47 | 		}
48 | 	}
49 | 	return names, nil
50 | }
51 | 
52 | // matchNode reports whether node should be included in the discovery
53 | // result. Every node matches unless the discoverer is in unready-only mode,
54 | // in which case a node whose NodeReady condition is True is rejected. A node
55 | // without a NodeReady condition always matches.
56 | func (d *KubeBatchDiscoverer) matchNode(node *corev1.Node) bool {
57 | 	for _, c := range node.Status.Conditions {
58 | 		if d.unready && c.Type == corev1.NodeReady {
59 | 			return c.Status != corev1.ConditionTrue
60 | 		}
61 | 	}
62 | 	return true
63 | }
64 | 


--------------------------------------------------------------------------------
/pkg/checkers/registry.go:
--------------------------------------------------------------------------------
 1 | package checker
 2 | 
 3 | import (
 4 | 	"sort"
 5 | 
 6 | 	"github.com/Azure/kdebug/pkg/checkers/diskreadonly"
 7 | 	"github.com/Azure/kdebug/pkg/checkers/diskusage"
 8 | 	"github.com/Azure/kdebug/pkg/checkers/dns"
 9 | 	"github.com/Azure/kdebug/pkg/checkers/dummy"
10 | 	"github.com/Azure/kdebug/pkg/checkers/http"
11 | 	icmpping "github.com/Azure/kdebug/pkg/checkers/icmp"
12 | 	"github.com/Azure/kdebug/pkg/checkers/kmscachesize"
13 | 	kubeobjectsize "github.com/Azure/kdebug/pkg/checkers/kube/objectsize"
14 | 	"github.com/Azure/kdebug/pkg/checkers/kube/pod"
15 | 	"github.com/Azure/kdebug/pkg/checkers/liveness"
16 | 	"github.com/Azure/kdebug/pkg/checkers/oom"
17 | 	"github.com/Azure/kdebug/pkg/checkers/podschedule"
18 | 	"github.com/Azure/kdebug/pkg/checkers/systemload"
19 | 	"github.com/Azure/kdebug/pkg/checkers/tcpping"
20 | )
21 | 
22 | // allCheckers maps each registered checker's name to its instance.
23 | var allCheckers = map[string]Checker{
24 | 	"dummy":          &dummy.DummyChecker{},
25 | 	"dns":            dns.New(),
26 | 	"oom":            oom.New(),
27 | 	"kubeobjectsize": kubeobjectsize.New(),
28 | 	"diskusage":      diskusage.New(),
29 | 	"diskreadonly":   diskreadonly.New(),
30 | 	"kubepod":        pod.New(),
31 | 	"liveness":       liveness.New(),
32 | 	"http":           http.New(),
33 | 	"tcp":            tcpping.New(),
34 | 	"ping":           icmpping.New(),
35 | 	"systemload":     systemload.New(),
36 | 	"kmscachesize":   kmscachesize.New(),
37 | 	"podschedule":    podschedule.New(),
38 | }
39 | 
40 | // ListAllCheckerNames returns the keys of allCheckers in ascending order.
41 | func ListAllCheckerNames() []string {
42 | 	sorted := make([]string, 0, len(allCheckers))
43 | 	for name := range allCheckers {
44 | 		sorted = append(sorted, name)
45 | 	}
46 | 	sort.Strings(sorted)
47 | 	return sorted
48 | }
49 | 


--------------------------------------------------------------------------------
/pkg/checkers/liveness/liveness.go:
--------------------------------------------------------------------------------
 1 | package liveness
 2 | 
 3 | import (
 4 | 	"os/exec"
 5 | 	"regexp"
 6 | 	"strings"
 7 | 
 8 | 	"github.com/Azure/kdebug/pkg/base"
 9 | 	log "github.com/sirupsen/logrus"
10 | )
11 | 
12 | const (
13 | 	CheckerName           = "Liveness (kubelet)"        // checker name set on every result built in this file
14 | 	FailedToCheckLiveness = "Failed to check liveness." // message constant; no use is visible in this file — TODO confirm usage
15 | )
16 | 
17 | // LivenessChecker checks whether kubelet is running, based on systemctl output.
18 | type LivenessChecker struct{}
19 | 
20 | // New returns a new LivenessChecker.
21 | func New() *LivenessChecker {
22 | 	return new(LivenessChecker)
23 | }
24 | 
25 | // Name returns CheckerName.
26 | func (c *LivenessChecker) Name() string { return CheckerName }
27 | 
28 | // Check runs "systemctl status kubelet" and returns a single result derived
29 | // from its output. A command error is only logged at debug level, and the
30 | // returned error is always nil.
31 | func (c *LivenessChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
32 | 	out, err := exec.Command("systemctl", "status", "kubelet").Output()
33 | 	if err != nil {
34 | 		log.Debugf("systemctl status returned non-zero exit code: %+v", err)
35 | 	}
36 | 
37 | 	// Whatever output was captured is parsed, even after a command error.
38 | 	return []*base.CheckResult{parseOutput(out)}, nil
39 | }
40 | 
41 | // kubeletActiveRe matches the line systemctl prints for a running unit. It
42 | // is compiled once at package initialization instead of on every call.
43 | var kubeletActiveRe = regexp.MustCompile(`active \(running\) since`)
44 | 
45 | // parseOutput turns the output of "systemctl status kubelet" into a check
46 | // result. If any line matches kubeletActiveRe, the result has no error and
47 | // carries that line as its description; otherwise the result carries an
48 | // error. In both cases every output line is attached as logs.
49 | func parseOutput(output []byte) *base.CheckResult {
50 | 	rows := strings.Split(string(output), "\n")
51 | 	for _, row := range rows {
52 | 		// The pattern cannot match an empty row, so no separate check is needed.
53 | 		if kubeletActiveRe.MatchString(row) {
54 | 			return &base.CheckResult{
55 | 				Checker:     CheckerName,
56 | 				Description: row,
57 | 				Logs:        rows,
58 | 			}
59 | 		}
60 | 	}
61 | 
62 | 	return &base.CheckResult{
63 | 		Checker: CheckerName,
64 | 		Error:   "Kubelet is NOT running well in this node. Please check the logs for more details.",
65 | 		Logs:    rows,
66 | 	}
67 | }
68 | 


--------------------------------------------------------------------------------
/pkg/checkers/oom/oom_test.go:
--------------------------------------------------------------------------------
 1 | package oom
 2 | 
 3 | import (
 4 | 	"io/ioutil"
 5 | 	"os"
 6 | 	"testing"
 7 | 
 8 | 	"github.com/Azure/kdebug/pkg/base"
 9 | 	"github.com/Azure/kdebug/pkg/env"
10 | )
11 | 
12 | // testStrings holds sample kernel log lines for an OOM kill of process 3841
13 | // (nginx): one cgroup-level ("Memory cgroup out of memory") and one
14 | // system-level ("Out of memory").
15 | var testStrings = []string{
16 | 	"Feb 22 16:15:02 k8s-ingress-11186066-z1-vmss0000B3 kernel: [989751.247878] Memory cgroup out of memory: Killed process 3841 (nginx) total-vm:240652kB, anon-rss:130344kB, file-rss:5212kB, shmem-rss:208kB, UID:101 pgtables:332kB oom_score_adj:986\n",
17 | 	"Feb 22 16:15:02 k8s-ingress-11186066-z1-vmss0000B3 kernel: [989751.247878] Out of memory: Killed process 3841 (nginx) total-vm:240652kB, anon-rss:130344kB, file-rss:5212kB, shmem-rss:208kB, UID:101 pgtables:332kB oom_score_adj:986\n",
18 | }
19 | 
20 | // TestCheckOOMLogWhenOOM writes each sample line to a temporary kernel log
21 | // and expects OOMChecker to report exactly one OOM kill for it.
22 | func TestCheckOOMLogWhenOOM(t *testing.T) {
23 | 	environment := &env.StaticEnvironment{
24 | 		Flags: []string{"linux"},
25 | 	}
26 | 	for _, testString := range testStrings {
27 | 		tmp, err := ioutil.TempFile("", "kernlog")
28 | 		if err != nil {
29 | 			t.Fatalf("error creating tmp file:%v", err)
30 | 		}
31 | 		// Only the path is needed; close the handle so it is not leaked.
32 | 		tmp.Close()
33 | 		check := OOMChecker{kernLogPath: tmp.Name()}
34 | 		defer os.Remove(check.kernLogPath)
35 | 
36 | 		// 0600 in octal; WriteFile applies it only when it has to create the file.
37 | 		err = os.WriteFile(check.kernLogPath, []byte(testString), 0600)
38 | 		if err != nil {
39 | 			t.Fatalf("Create tmp file error:%v", err)
40 | 		}
41 | 		result, err := check.Check(&base.CheckContext{
42 | 			Environment: environment,
43 | 		})
44 | 		if err != nil {
45 | 			t.Errorf("Expect no error but got: %s", err)
46 | 		}
47 | 		// Fatalf, since indexing result[0] below would panic on an empty slice.
48 | 		if len(result) != 1 {
49 | 			t.Fatalf("Get unexpected OOM result length %v", len(result))
50 | 		}
51 | 		checkErr := result[0].Error
52 | 		if checkErr != "progress:[3841 nginx] is OOM kill at time [Feb 22 16:15:02]. [rss:130344kB] [oom_score_adj:986]\n" {
53 | 			t.Errorf("Unexpected check result:\n %v \n %v", result[0].Description, checkErr)
54 | 		}
55 | 	}
56 | }
57 | 


--------------------------------------------------------------------------------
/pkg/formatters/text.go:
--------------------------------------------------------------------------------
 1 | package formatters
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"io"
 6 | 
 7 | 	"github.com/Azure/kdebug/pkg/base"
 8 | 	"github.com/Azure/kdebug/pkg/batch"
 9 | 	"github.com/fatih/color"
10 | 	log "github.com/sirupsen/logrus"
11 | )
12 | 
13 | // TextFormatter writes check results as a multi-line, human-readable report.
14 | type TextFormatter struct{}
15 | 
16 | // WriteResults writes a report for results to w and always returns nil.
17 | // Passing results are listed only when debug logging is enabled. The report
18 | // then shows the pass/fail totals and, for every failed result, its error,
19 | // description, recommendations, logs and help links.
20 | func (f *TextFormatter) WriteResults(w io.Writer, results []*base.CheckResult) error {
21 | 	const separator = "------------------------------\n"
22 | 
23 | 	failures := []*base.CheckResult{}
24 | 	for _, r := range results {
25 | 		if !r.Ok() {
26 | 			failures = append(failures, r)
27 | 		} else if log.IsLevelEnabled(log.DebugLevel) {
28 | 			fmt.Fprintf(w, "[%s] %s\n", r.Checker, r.Description)
29 | 		}
30 | 	}
31 | 
32 | 	fmt.Fprint(w, separator)
33 | 
34 | 	if len(failures) == 0 {
35 | 		fmt.Fprintf(w, "All %v checks passed!\n",
36 | 			color.GreenString("%d", len(results)))
37 | 		return nil
38 | 	}
39 | 
40 | 	fmt.Fprintf(w, "%v checks passed. %v failed.\n",
41 | 		color.GreenString("%d", len(results)-len(failures)),
42 | 		color.RedString("%d", len(failures)))
43 | 	fmt.Fprint(w, separator)
44 | 	fmt.Fprintf(w, "kdebug has detected these problems for you:\n")
45 | 
46 | 	for _, r := range failures {
47 | 		fmt.Fprint(w, separator)
48 | 		// Fprint, not Fprintf: the colored text is already formatted, and a
49 | 		// '%' in the checker name must not be parsed as a format verb.
50 | 		fmt.Fprint(w, color.YellowString("Checker: %s\n", r.Checker))
51 | 		fmt.Fprintf(w, "Error: %s\n", r.Error)
52 | 		fmt.Fprintf(w, "Description: %s\n", r.Description)
53 | 		if len(r.Recommendations) > 0 {
54 | 			fmt.Fprintf(w, "Recommendations:\n")
55 | 			for i, rec := range r.Recommendations {
56 | 				fmt.Fprintf(w, "[%d] %s\n", i+1, rec)
57 | 			}
58 | 		}
59 | 		// TODO: Make logs more pretty
60 | 		if len(r.Logs) > 0 {
61 | 			fmt.Fprintf(w, "Logs:\n")
62 | 			for _, l := range r.Logs {
63 | 				fmt.Fprintf(w, "%s\n", l)
64 | 			}
65 | 		}
66 | 		if len(r.HelpLinks) > 0 {
67 | 			fmt.Fprintf(w, "Help links:\n")
68 | 			for i, l := range r.HelpLinks {
69 | 				fmt.Fprintf(w, "[%d] %s\n", i+1, l)
70 | 			}
71 | 		}
72 | 	}
73 | 
74 | 	return nil
75 | }
76 | 
77 | // WriteBatchResults writes one section per batch result to w: a banner with
78 | // the machine name, then either the machine's check report or its remote
79 | // execution error. It always returns nil.
80 | func (f *TextFormatter) WriteBatchResults(w io.Writer, results []*batch.BatchResult) error {
81 | 	for _, result := range results {
82 | 		// Fprint, not Fprintf: a '%' in the machine name must stay literal.
83 | 		fmt.Fprint(w, color.BlueString("=============== Machine: %s ===============\n",
84 | 			result.Machine))
85 | 		if result.Error != nil {
86 | 			fmt.Fprintf(w, "Remote execution error: %s\n", result.Error)
87 | 			continue
88 | 		}
89 | 		f.WriteResults(w, result.CheckResults)
90 | 	}
91 | 	return nil
92 | }
93 | 


--------------------------------------------------------------------------------
/pkg/tools/netexec/netexec_test.go:
--------------------------------------------------------------------------------
 1 | package netexec
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/Azure/kdebug/pkg/base"
 7 | 	"k8s.io/cli-runtime/pkg/genericclioptions"
 8 | )
 9 | 
10 | // TestParseParameterPid_Success parses a config that sets only Pid and
11 | // Command and expects both values to be copied onto the tool.
12 | func TestParseParameterPid_Success(t *testing.T) {
13 | 	tool := &NetexecTool{}
14 | 	cfg := &Config{
15 | 		Pid:     "1",
16 | 		Command: "bash",
17 | 	}
18 | 	tool.parseAndCheckParameters(&base.ToolContext{Config: cfg})
19 | 
20 | 	if got := tool.pid; got != "1" {
21 | 		t.Errorf("pid should got %s but got %s", "1", got)
22 | 	}
23 | 	if got := tool.command; got != "bash" {
24 | 		t.Errorf("command should got %s but got %s", "bash", got)
25 | 	}
26 | }
27 | 
28 | // TestParseParameterPod_Success parses a pod-targeted config with kube config
29 | // flags and expects pod name, command, namespace and image on the tool.
30 | func TestParseParameterPod_Success(t *testing.T) {
31 | 	tool := &NetexecTool{}
32 | 	cfg := &Config{
33 | 		PodName:   "pod",
34 | 		Command:   "bash",
35 | 		Namespace: "kube-system",
36 | 		Image:     "image",
37 | 	}
38 | 	tool.parseAndCheckParameters(&base.ToolContext{
39 | 		Config:         cfg,
40 | 		KubeConfigFlag: &genericclioptions.ConfigFlags{},
41 | 	})
42 | 
43 | 	if got := tool.podName; got != "pod" {
44 | 		t.Errorf("podname should got %s but got %s", "pod", got)
45 | 	}
46 | 	if got := tool.command; got != "bash" {
47 | 		t.Errorf("command should got %s but got %s", "bash", got)
48 | 	}
49 | 	if got := tool.namespace; got != "kube-system" {
50 | 		t.Errorf("namespace should got %s but got %s", "kube-system", got)
51 | 	}
52 | 	if got := tool.image; got != "image" {
53 | 		t.Errorf("image should got %s but got %s", "image", got)
54 | 	}
55 | }
56 | 
57 | // TestParseParameter_Failed expects parseAndCheckParameters to return an
58 | // error for each of three invalid tool contexts.
59 | func TestParseParameter_Failed(t *testing.T) {
60 | 	tool := &NetexecTool{}
61 | 
62 | 	// Neither a pid nor a pod is given.
63 | 	empty := &base.ToolContext{
64 | 		Config: &Config{},
65 | 	}
66 | 	if err := tool.parseAndCheckParameters(empty); err == nil {
67 | 		t.Error("Should got err: 'Either --pid and --pod should be set.', but error is not raised")
68 | 	}
69 | 
70 | 	// A pid and a pod are given at the same time.
71 | 	both := &base.ToolContext{
72 | 		Config: &Config{Pid: "1", PodName: "pod"},
73 | 	}
74 | 	if err := tool.parseAndCheckParameters(both); err == nil {
75 | 		t.Error("Should got err: '--pid and --pod can not be assigned together. Please set either of them.', but error is not raised")
76 | 	}
77 | 
78 | 	// A pod is given, but the context has no KubeConfigFlag.
79 | 	noKube := &base.ToolContext{
80 | 		Config: &Config{
81 | 			PodName:   "pod",
82 | 			Command:   "bash",
83 | 			Namespace: "kube-system",
84 | 			Image:     "image",
85 | 		},
86 | 	}
87 | 	if err := tool.parseAndCheckParameters(noKube); err == nil {
88 | 		t.Error("Should got err: 'kubernetes client is not availble. Check kubeconfig.', but error is not raised")
89 | 	}
90 | }
91 | 


--------------------------------------------------------------------------------
/pkg/checkers/diskusage/diskusage_test.go:
--------------------------------------------------------------------------------
 1 | package diskusage
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
 7 | // TestDfParse_Success parses a two-line Linux df output and checks every
 8 | // field of the single parsed row.
 9 | func TestDfParse_Success(t *testing.T) {
10 | 	dfOutput := `Filesystem      Size  Used Avail Use% Mounted on
11 | 	/dev/sdb        251G   11G  228G   5% /`
12 | 
13 | 	rows, _ := parseDfResult(dfOutput, DfHeaders["LINUX"])
14 | 	if n := len(rows); n != 1 {
15 | 		t.Errorf("Expect the length of result is 1 but got %+v", n)
16 | 	}
17 | 	row := rows[0]
18 | 
19 | 	if row.Filesystem != "/dev/sdb" {
20 | 		t.Errorf("Expect Filesystem is /dev/sdb but got %s", row.Filesystem)
21 | 	}
22 | 	if row.Size != "251G" {
23 | 		t.Errorf("Expect Size is 251G but got %s", row.Size)
24 | 	}
25 | 	if row.Used != "11G" {
26 | 		t.Errorf("Expect Used is 11G but got %s", row.Used)
27 | 	}
28 | 	if row.Avail != "228G" {
29 | 		t.Errorf("Expect Avail is 228G but got %s", row.Avail)
30 | 	}
31 | 
32 | 	if row.Use != 5 {
33 | 		t.Errorf("Expect Use is 5 but got %v", row.Use)
34 | 	}
35 | 
36 | 	if row.MountedOn != "/" {
37 | 		t.Errorf("Expect MountedOn is / but got %s", row.MountedOn)
38 | 	}
39 | }
40 | 
41 | func TestDfParse_FreeBSD_Success(t *testing.T) {
42 | 	dfOutput := `Filesystem         Size    Used   Avail Capacity  Mounted on
43 | 	/dev/gpt/rootfs     29G    4.0G     23G    15%    /`
44 | 
45 | 	result, _ := parseDfResult(dfOutput, DfHeaders["FREEBSD"])
46 | 	if len(result) != 1 {
47 | 		t.Errorf("Expect the length of result is 1 but got %+v", len(result))
48 | 	}
49 | 
50 | 	if result[0].Filesystem != "/dev/gpt/rootfs" {
51 | 		t.Errorf("Expect Filesystem is /dev/gpt/rootfs but got %s", result[0].Filesystem)
52 | 	}
53 | 
54 | 	if result[0].Size != "29G" {
55 | 		t.Errorf("Expect Size is 29G but got %s", result[0].Size)
56 | 	}
57 | 
58 | 	if result[0].Used != "4.0G" {
59 | 		t.Errorf("Expect Used is 4.0G but got %s", result[0].Used)
60 | 	}
61 | 
62 | 	if result[0].Avail != "23G" {
63 | 		t.Errorf("Expect Avail is 23G but got %s", result[0].Avail)
64 | 	}
65 | 
66 | 	if result[0].Use != 15 {
67 | 		t.Errorf("Expect Use is 15 but got %v", result[0].Use)
68 | 	}
69 | 
70 | 	if result[0].MountedOn != "/" {
71 | 		t.Errorf("Expect MountedOn is / but got %s", result[0].MountedOn)
72 | 	}
73 | }
74 | 
75 | func TestDfParse_Failed(t *testing.T) {
76 | 	dfOutput := `Filesystem      Size  Used Avail Use% MountedOn
77 | 	/dev/sdb        251G   11G  228G   5% /`
78 | 
79 | 	_, err := parseDfResult(dfOutput, DfHeaders["LINUX"])
80 | 	if err == nil {
81 | 		t.Errorf("Expect error in parseDfResult but not")
82 | 	}
83 | }
84 | 


--------------------------------------------------------------------------------
/pkg/checkers/http/http.go:
--------------------------------------------------------------------------------
  1 | package http
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"net/http"
  6 | 	"time"
  7 | 
  8 | 	"github.com/Azure/kdebug/pkg/base"
  9 | 	"github.com/Azure/kdebug/pkg/env"
 10 | )
 11 | 
var (
	// GoogleTarget is the HTTP target for https://google.com.
	GoogleTarget = HttpTarget{
		Name: "Google HTTP endpoint",
		URL:  "https://google.com",
	}
	// AzureIMDSTarget is the HTTP target for the Azure IMDS versions
	// endpoint. It carries a "Metadata: true" header.
	AzureIMDSTarget = HttpTarget{
		Name: "Azure IMDS HTTP endpoint",
		URL:  "http://169.254.169.254/metadata/versions",
		Header: http.Header{
			"Metadata": {"true"},
		},
	}
)
 25 | 
// HttpTarget describes one endpoint to probe: a display name, the URL to
// request, and optional request headers.
type HttpTarget struct {
	Name   string
	URL    string
	Header http.Header
}
 31 | 
// HttpChecker probes HTTP targets through the configured Client.
type HttpChecker struct {
	Client HttpClient
}
 35 | 
// HttpClient is the subset of *http.Client used by HttpChecker. Any type with
// a matching Do method can be substituted.
type HttpClient interface {
	Do(req *http.Request) (*http.Response, error)
}
 39 | 
// New returns an HttpChecker backed by an http.Client with a 10 second
// timeout and proxying disabled.
func New() *HttpChecker {
	return &HttpChecker{
		Client: &http.Client{
			// Disable proxy. Azure IMDS does not support being used behind a proxy.
			Transport: &http.Transport{Proxy: nil},
			Timeout:   10 * time.Second,
		},
	}
}
 49 | 
// Name returns the checker identifier, "Http".
func (c *HttpChecker) Name() string {
	return "Http"
}
 53 | 
 54 | func (c *HttpChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
 55 | 	results := []*base.CheckResult{}
 56 | 	targets := getCheckTargets(ctx.Environment)
 57 | 	var result *base.CheckResult
 58 | 	for _, httpTarget := range targets {
 59 | 		request, err := http.NewRequest("GET", httpTarget.URL, nil)
 60 | 		if err != nil {
 61 | 			return nil, fmt.Errorf("Fail to create request for target %s: %+v",
 62 | 				httpTarget.Name, err)
 63 | 		}
 64 | 		request.Header = httpTarget.Header
 65 | 
 66 | 		response, err := c.Client.Do(request)
 67 | 		if err != nil {
 68 | 			result = &base.CheckResult{
 69 | 				Checker: c.Name(),
 70 | 				Error: fmt.Sprintf("Fail to invoke HTTP GET method on URL %s.",
 71 | 					httpTarget.URL),
 72 | 				Description: err.Error(),
 73 | 				//todo: Recommendations and help links
 74 | 			}
 75 | 		} else {
 76 | 			defer response.Body.Close()
 77 | 			result = &base.CheckResult{
 78 | 				Checker: c.Name(),
 79 | 				Description: fmt.Sprintf("Successfully invoke HTTP GET on URL %s , response status code is %s.",
 80 | 					httpTarget.URL, response.Status),
 81 | 			}
 82 | 		}
 83 | 		results = append(results, result)
 84 | 	}
 85 | 
 86 | 	return results, nil
 87 | }
 88 | 
 89 | func getCheckTargets(e env.Environment) []HttpTarget {
 90 | 	targets := []HttpTarget{
 91 | 		GoogleTarget,
 92 | 	}
 93 | 
 94 | 	if e.HasFlag("azure") {
 95 | 		targets = append(targets, AzureIMDSTarget)
 96 | 	}
 97 | 
 98 | 	return targets
 99 | }
100 | 


--------------------------------------------------------------------------------
/cmd/batch.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"io"
 5 | 
 6 | 	"github.com/schollz/progressbar/v3"
 7 | 	log "github.com/sirupsen/logrus"
 8 | 
 9 | 	"github.com/Azure/kdebug/pkg/base"
10 | 	"github.com/Azure/kdebug/pkg/batch"
11 | 	"github.com/Azure/kdebug/pkg/formatters"
12 | )
13 | 
14 | func getBatchDiscoverer(opts *Options, chkCtx *base.CheckContext) batch.BatchDiscoverer {
15 | 	if opts.Batch.KubeMachines || opts.Batch.KubeMachinesUnready || len(opts.Batch.KubeMachinesLabelSelector) > 0 {
16 | 		return batch.NewKubeBatchDiscoverer(chkCtx.KubeClient, opts.Batch.KubeMachinesLabelSelector, opts.Batch.KubeMachinesUnready)
17 | 	} else if opts.Batch.MachinesFile != "" {
18 | 		return &batch.FileBatchDiscoverer{
19 | 			Path: opts.Batch.MachinesFile,
20 | 		}
21 | 	} else {
22 | 		return &batch.StaticBatchDiscoverer{
23 | 			Machines: opts.Batch.Machines,
24 | 		}
25 | 	}
26 | }
27 | 
28 | func getBatchExecutor(opts *Options, chkCtx *base.CheckContext) batch.BatchExecutor {
29 | 	if opts.Batch.SshUser != "" {
30 | 		return batch.NewSshBatchExecutor(opts.Batch.SshUser)
31 | 	} else if chkCtx.KubeClient != nil {
32 | 		return batch.NewPodBatchExecutor(
33 | 			chkCtx.KubeClient,
34 | 			opts.Batch.PodExecutorImage,
35 | 			opts.Batch.PodExecutorNamespace,
36 | 			opts.Batch.PodExecutorMode,
37 | 		)
38 | 	} else {
39 | 		log.Fatal("No batch executor configured")
40 | 		return nil
41 | 	}
42 | }
43 | 
// batchReporter tracks batch progress with a progress bar.
// NOTE(review): out is stored by newBatchReporter but not read by any code in
// this file.
type batchReporter struct {
	out io.Writer
	bar *progressbar.ProgressBar
}
48 | 
// newBatchReporter creates a reporter whose progress bar has max steps.
func newBatchReporter(out io.Writer, max int64) *batchReporter {
	return &batchReporter{
		out: out,
		bar: progressbar.Default(max),
	}
}
55 | 
// OnResult advances the progress bar by one step. The result itself is not
// inspected, and the value returned by bar.Add is ignored.
func (r *batchReporter) OnResult(result *batch.BatchResult) {
	r.bar.Add(1)
}
59 | 
60 | func runBatch(opts *Options, chkCtx *base.CheckContext, formatter formatters.Formatter) {
61 | 	discoverer := getBatchDiscoverer(opts, chkCtx)
62 | 	machines, err := discoverer.Discover()
63 | 	if err != nil {
64 | 		log.Fatalf("Fail to discover machines: %+v", err)
65 | 	}
66 | 
67 | 	log.WithFields(log.Fields{"count": len(machines)}).Info("Discovered machines list")
68 | 
69 | 	executor := getBatchExecutor(opts, chkCtx)
70 | 	concurrency := 1
71 | 	if opts.Batch.Concurrency > 0 {
72 | 		concurrency = opts.Batch.Concurrency
73 | 	}
74 | 	batchOpts := &batch.BatchOptions{
75 | 		Machines:    machines,
76 | 		Checkers:    opts.Checkers,
77 | 		Concurrency: concurrency,
78 | 		Reporter:    newBatchReporter(chkCtx.Output, int64(len(machines))),
79 | 	}
80 | 	batchResults, err := executor.Execute(batchOpts)
81 | 	if err != nil {
82 | 		log.Fatalf("Fail to run batch: %s", err)
83 | 	}
84 | 
85 | 	err = formatter.WriteBatchResults(chkCtx.Output, batchResults)
86 | 	if err != nil {
87 | 		log.Fatal(err)
88 | 	}
89 | }
90 | 


--------------------------------------------------------------------------------
/pkg/checkers/icmp/icmp.go:
--------------------------------------------------------------------------------
  1 | package icmpping
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"fmt"
  6 | 	"time"
  7 | 
  8 | 	probing "github.com/prometheus-community/pro-bing"
  9 | 	log "github.com/sirupsen/logrus"
 10 | 
 11 | 	"github.com/Azure/kdebug/pkg/base"
 12 | )
 13 | 
// PublicTargets lists the default ping targets together with the
// recommendation shown when a target is unreachable.
// NOTE(review): 10.0.0.10 is a private address labelled as cluster DNS —
// confirm it belongs in a list named "public".
var PublicTargets = []pingTarget{
	{
		Address:        "8.8.8.8",
		Name:           "GoogleDns",
		Recomendations: []string{"Google DNS is not reachable. Check firewall settings if this is not desired."},
	},
	{
		Address:        "10.0.0.10",
		Name:           "ClusterDns",
		Recomendations: []string{"Cluster CoreDNS is not reachable. Check CoreDNS pods and network settings."},
	},
}
 26 | 
// ICMPChecker pings a list of targets and reports their reachability.
type ICMPChecker struct {
	targets []pingTarget
}
 30 | 
// pingTarget is one address to ping, its display name, and the
// recommendations attached to the result when the ping fails.
type pingTarget struct {
	Address        string
	Name           string
	Recomendations []string
}
 36 | 
// New returns an ICMPChecker with no preconfigured targets.
func New() *ICMPChecker {
	return &ICMPChecker{}
}
 40 | 
// Name returns the checker identifier, "icmp".
func (c *ICMPChecker) Name() string {
	return "icmp"
}
 44 | 
 45 | func (c *ICMPChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
 46 | 	var results []*base.CheckResult
 47 | 	// TODO: Invoke `ping` command if non-root
 48 | 	if !ctx.Environment.HasFlag("root") {
 49 | 		log.Debug("Not root. Skip ICMP checker")
 50 | 		return results, nil
 51 | 	}
 52 | 	if !ctx.Environment.HasFlag("azure") {
 53 | 		c.targets = append(c.targets, PublicTargets...)
 54 | 	}
 55 | 	if ctx.KubeClient != nil {
 56 | 
 57 | 	}
 58 | 	resultChan := make(chan *base.CheckResult, len(c.targets))
 59 | 	for _, target := range c.targets {
 60 | 		go func(pingTarget pingTarget) {
 61 | 			result := &base.CheckResult{
 62 | 				Checker: c.Name(),
 63 | 			}
 64 | 			err := pingOne(pingTarget.Address)
 65 | 			if err != nil {
 66 | 				result.Error = err.Error()
 67 | 				result.Description = fmt.Sprintf("ping %s[%s] failed", pingTarget.Address, pingTarget.Name)
 68 | 				result.Recommendations = pingTarget.Recomendations
 69 | 			} else {
 70 | 				result.Description = fmt.Sprintf("ping %s[%s] succeeded", pingTarget.Address, pingTarget.Name)
 71 | 			}
 72 | 			resultChan <- result
 73 | 
 74 | 		}(target)
 75 | 	}
 76 | 	for i := 0; i < len(c.targets); i++ {
 77 | 		result := <-resultChan
 78 | 		results = append(results, result)
 79 | 	}
 80 | 	return results, nil
 81 | }
 82 | 
 83 | func pingOne(ip string) error {
 84 | 	pinger, err := probing.NewPinger(ip)
 85 | 	if err != nil {
 86 | 		return err
 87 | 	}
 88 | 
 89 | 	pinger.Count = 3
 90 | 	pinger.Interval = time.Millisecond * 20
 91 | 	pinger.Timeout = time.Millisecond * 1000
 92 | 	err = pinger.Run()
 93 | 	if err != nil {
 94 | 		return err
 95 | 	}
 96 | 	stats := pinger.Statistics()
 97 | 	if stats.PacketsRecv <= 0 {
 98 | 		return errors.New("ping receives no reply")
 99 | 	}
100 | 	return nil
101 | }
102 | 


--------------------------------------------------------------------------------
/pkg/tools/vmrebootdetector/vmrebootdetector.go:
--------------------------------------------------------------------------------
 1 | package vmrebootdetector
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"fmt"
 6 | 	"os/exec"
 7 | 	"strings"
 8 | 	"time"
 9 | 
10 | 	"github.com/fatih/color"
11 | 	flags "github.com/jessevdk/go-flags"
12 | 
13 | 	"github.com/Azure/kdebug/pkg/base"
14 | )
15 | 
// helpLink lists reference pages about the `last` command. They are appended
// to the report when reboots are detected.
var helpLink = []string{
	"https://www.baeldung.com/linux/last-command",
	"https://man7.org/linux/man-pages/man1/last.1.html",
}
20 | 
// explain is the hint printed after the reboot list describing the columns of
// the `last` output.
var explain = "You can also use `last` command to inspect above events. Columns in its output are user, login terminal, kernel version, login time, login period\n"
22 | 
// Tool detects VM reboots by inspecting the output of `last reboot`.
type Tool struct {
	// rebootCheckTImeInDay is the look-back window in days.
	rebootCheckTImeInDay int
}
26 | 
// Config holds the command-line options of the tool, parsed by go-flags.
type Config struct {
	CheckDays int `short:"d" long:"checkdays" description:"Days you want to look back to search for reboot events." default:"30"`
}
30 | 
// Name returns the tool identifier, "vmrebootDetector".
func (t *Tool) Name() string {
	return "vmrebootDetector"
}
34 | 
// New returns a zero-valued Tool. The look-back window is set later from the
// parsed configuration.
func New() *Tool {
	return &Tool{}
}
38 | 
39 | func (t *Tool) ParseArgs(ctx *base.ToolContext, args []string) error {
40 | 	var config Config
41 | 	remaningArgs, err := flags.ParseArgs(&config, args)
42 | 	if err != nil {
43 | 		return err
44 | 	}
45 | 	ctx.Config = &config
46 | 	ctx.Args = remaningArgs
47 | 	return nil
48 | }
49 | 
// Run copies the look-back window from ctx.Config into the tool and then runs
// the reboot detection, returning its error if any.
// todo: support batch mode
func (t *Tool) Run(ctx *base.ToolContext) error {
	t.parseArgument(ctx)
	return t.exec()
}
55 | 
// parseArgument reads CheckDays from ctx.Config into the tool.
// The unchecked type assertion panics if ctx.Config is not a *Config.
func (t *Tool) parseArgument(ctx *base.ToolContext) {
	config := ctx.Config.(*Config)
	t.rebootCheckTImeInDay = config.CheckDays
}
60 | 
61 | func (t *Tool) exec() error {
62 | 	sinceTime := time.Now().Add(-time.Hour * 24 * time.Duration(t.rebootCheckTImeInDay)).Format("2006-01-02 15:04:05")
63 | 	cmd := exec.Command("last", "reboot", "--since", sinceTime, "--time-format", "iso")
64 | 	stdout, err := cmd.Output()
65 | 	if err != nil {
66 | 		return err
67 | 	}
68 | 	fmt.Println(t.parseResult(string(stdout)))
69 | 	return nil
70 | }
71 | 
72 | func (t *Tool) parseResult(result string) string {
73 | 	sb := strings.Builder{}
74 | 	scanner := bufio.NewScanner(strings.NewReader(result))
75 | 	var reboots []string
76 | 	for scanner.Scan() {
77 | 		text := scanner.Text()
78 | 		if text == "" {
79 | 			break
80 | 		} else {
81 | 			reboots = append(reboots, text)
82 | 		}
83 | 	}
84 | 	if reboots == nil {
85 | 		sb.WriteString(color.GreenString("No reboot found in past %v days\n", t.rebootCheckTImeInDay))
86 | 	} else {
87 | 		sb.WriteString(color.YellowString("Detected following VM reboots:\n"))
88 | 		sb.WriteString("\n")
89 | 		sb.WriteString(strings.Join(reboots, "\n"))
90 | 		sb.WriteString("\n\n")
91 | 		sb.WriteString(color.YellowString(explain))
92 | 		sb.WriteString("\n")
93 | 		sb.WriteString(color.YellowString("See also:\n"))
94 | 		sb.WriteString(color.YellowString(strings.Join(helpLink, "\n")))
95 | 	}
96 | 	return sb.String()
97 | }
98 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
 6 | 
 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
 8 | 
 9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 |   * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 |   * Full paths of source file(s) related to the manifestation of the issue
23 |   * The location of the affected source code (tag/branch/commit or direct URL)
24 |   * Any special configuration required to reproduce the issue
25 |   * Step-by-step instructions to reproduce the issue
26 |   * Proof-of-concept or exploit code (if possible)
27 |   * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
40 | 
41 | <!-- END MICROSOFT SECURITY.MD BLOCK -->


--------------------------------------------------------------------------------
/pkg/checkers/podschedule/podschedule.go:
--------------------------------------------------------------------------------
  1 | package podschedule
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"fmt"
  6 | 
  7 | 	log "github.com/sirupsen/logrus"
  8 | 	corev1 "k8s.io/api/core/v1"
  9 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 10 | 	"k8s.io/client-go/kubernetes"
 11 | 
 12 | 	"github.com/Azure/kdebug/pkg/base"
 13 | )
 14 | 
// PodScheduleChecker reports replica sets whose pods are all scheduled on the
// same node.
type PodScheduleChecker struct {
}
 17 | 
// New returns a PodScheduleChecker.
func New() *PodScheduleChecker {
	return &PodScheduleChecker{}
}
 21 | 
// Name returns the checker identifier, "PodSchedule".
func (c *PodScheduleChecker) Name() string {
	return "PodSchedule"
}
 25 | 
 26 | func (c *PodScheduleChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
 27 | 	results := []*base.CheckResult{}
 28 | 
 29 | 	if ctx.KubeClient != nil {
 30 | 		results = append(results, c.checkPodSchedule(ctx.KubeClient)...)
 31 | 	} else {
 32 | 		log.Debugf("Skip %s due to missing Kubernetes config", c.Name())
 33 | 	}
 34 | 
 35 | 	return results, nil
 36 | }
 37 | 
 38 | func (c *PodScheduleChecker) checkPodSchedule(clientset *kubernetes.Clientset) []*base.CheckResult {
 39 | 	results := []*base.CheckResult{}
 40 | 
 41 | 	// List all pods
 42 | 	pods, err := clientset.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{})
 43 | 	if err != nil {
 44 | 		log.WithFields(log.Fields{"error": err}).Warn("Fail to list pods")
 45 | 		return results
 46 | 	}
 47 | 
 48 | 	// Group pods by replicaset
 49 | 	podsByRs := make(map[string][]corev1.Pod)
 50 | 	for _, pod := range pods.Items {
 51 | 		if pod.ObjectMeta.OwnerReferences == nil || len(pod.ObjectMeta.OwnerReferences) == 0 {
 52 | 			continue
 53 | 		}
 54 | 
 55 | 		ownerRef := pod.ObjectMeta.OwnerReferences[0]
 56 | 		if ownerRef.APIVersion == "apps/v1" &&
 57 | 			ownerRef.Kind == "ReplicaSet" {
 58 | 
 59 | 			rsName := pod.ObjectMeta.Namespace + "/" + ownerRef.Name
 60 | 			if rsPods, ok := podsByRs[rsName]; ok {
 61 | 				podsByRs[rsName] = append(rsPods, pod)
 62 | 			} else {
 63 | 				podsByRs[rsName] = []corev1.Pod{pod}
 64 | 			}
 65 | 		}
 66 | 	}
 67 | 
 68 | 	// Check replica sets
 69 | 	for rsName, rsPods := range podsByRs {
 70 | 		if len(rsPods) <= 1 {
 71 | 			continue
 72 | 		}
 73 | 
 74 | 		results = append(results, c.checkPodsScheduleInReplicaSet(rsName, rsPods))
 75 | 	}
 76 | 
 77 | 	return results
 78 | }
 79 | 
 80 | func (c *PodScheduleChecker) checkPodsScheduleInReplicaSet(rsName string, pods []corev1.Pod) *base.CheckResult {
 81 | 	if len(pods) <= 1 {
 82 | 		panic("Should not be called with less than 2 pods")
 83 | 	}
 84 | 
 85 | 	node := ""
 86 | 	for _, pod := range pods {
 87 | 		if node == "" {
 88 | 			node = pod.Spec.NodeName
 89 | 		} else if node != pod.Spec.NodeName {
 90 | 			return &base.CheckResult{
 91 | 				Checker:     c.Name(),
 92 | 				Description: fmt.Sprintf("Pods in replica set %s are scheduled to different nodes", rsName),
 93 | 			}
 94 | 		}
 95 | 	}
 96 | 	return &base.CheckResult{
 97 | 		Checker: c.Name(),
 98 | 		Error:   fmt.Sprintf("All pods of replica set %s are scheduled on same node", rsName),
 99 | 		Recommendations: []string{
100 | 			"Please reference to document to set Affinity and anti-affinity: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity",
101 | 		},
102 | 	}
103 | }
104 | 


--------------------------------------------------------------------------------
/pkg/checkers/diskreadonly/disk_readonly.go:
--------------------------------------------------------------------------------
 1 | package diskreadonly
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"os"
 6 | 	"os/exec"
 7 | 	"strings"
 8 | 
 9 | 	log "github.com/sirupsen/logrus"
10 | 
11 | 	"github.com/Azure/kdebug/pkg/base"
12 | )
13 | 
const (
	// Reason explains why a filesystem may have turned read-only. It is shown
	// to the user as the first part of the recommendation.
	Reason = "The filesystem might enter read-only state due to underlying data integrity issues."
	// GeneralRecommendation is the fallback advice used when the mount source
	// of the home directory cannot be determined.
	GeneralRecommendation = "Find out which filesystem your home dir is mounted on via 'df' command. Try to use 'fsck' command to fix the filesystem and then reboot the vm."
)
18 | 
// helpLink lists reference pages about mount, fsck and read-only filesystems.
// NOTE(review): the first two entries have no URL scheme, unlike the third —
// confirm whether that is intended.
var helpLink = []string{
	"linux.die.net/man/8/mount",
	"linux.die.net/man/8/fsck",
	"https://askubuntu.com/a/197468",
}
24 | 
// DiskReadOnlyChecker detects a read-only filesystem by trying to create a
// temporary file in the user's home directory.
type DiskReadOnlyChecker struct {
}
27 | 
// New returns a DiskReadOnlyChecker.
func New() *DiskReadOnlyChecker {
	return &DiskReadOnlyChecker{}
}
31 | 
// Name returns the checker identifier, "DiskReadOnly".
func (c *DiskReadOnlyChecker) Name() string {
	return "DiskReadOnly"
}
35 | 
36 | func (c *DiskReadOnlyChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
37 | 	if !ctx.Environment.HasFlag("linux") {
38 | 		// This checker is only valid on Linux.
39 | 		log.Debugf("Skip %s checker in non-linux os", c.Name())
40 | 		return []*base.CheckResult{}, nil
41 | 	}
42 | 
43 | 	homeDir, err := os.UserHomeDir()
44 | 	if err != nil {
45 | 		return nil, fmt.Errorf("Fail to get user home dir. %w", err)
46 | 	}
47 | 
48 | 	f, err := os.CreateTemp(homeDir, "testReadOnlyFile")
49 | 	var result *base.CheckResult
50 | 	if err != nil {
51 | 		var recommendation string
52 | 		if strings.Contains(strings.ToLower(err.Error()), "read-only") {
53 | 			mountSrc, mountTarget, findMntErr := getMountSrcAndTarget(homeDir)
54 | 			if findMntErr != nil {
55 | 				log.Warnf("Fail to find mount src for %s: %s", homeDir, findMntErr)
56 | 				recommendation = fmt.Sprintf("%s%s", Reason, GeneralRecommendation)
57 | 			} else {
58 | 				recommendation = fmt.Sprintf("%s Try to use 'fsck' command to fix the %s mounted on %s and then reboot the vm.", Reason, mountSrc, mountTarget)
59 | 			}
60 | 			result = &base.CheckResult{
61 | 				Checker:         c.Name(),
62 | 				Error:           "Disk might be read-only",
63 | 				Description:     fmt.Sprintf("Cannot create a temp file in %s due to %s", homeDir, err),
64 | 				Recommendations: []string{recommendation},
65 | 				HelpLinks:       []string{},
66 | 			}
67 | 		} else {
68 | 			return nil, fmt.Errorf("Fail to create a temp file in %s due to unexpected error: %w", homeDir, err)
69 | 		}
70 | 	} else {
71 | 		defer os.Remove(f.Name())
72 | 		result = &base.CheckResult{
73 | 			Checker:     c.Name(),
74 | 			Description: fmt.Sprintf("%s is not read-only", homeDir),
75 | 		}
76 | 	}
77 | 
78 | 	return []*base.CheckResult{result}, nil
79 | }
80 | 
// getMountSrcAndTarget runs `findmnt` for homeDir and returns the mount
// source and mount target it reports.
//
// The output is split on any run of whitespace, so column padding does not
// produce empty fields. An error is returned when the command fails or when
// its output does not contain at least two fields.
func getMountSrcAndTarget(homeDir string) (string, string, error) {
	findMntCmd := exec.Command("findmnt", "--target", homeDir, "--output", "SOURCE,TARGET", "--noheadings")
	mountDescription, err := findMntCmd.Output()
	if err != nil {
		return "", "", fmt.Errorf("Fail to find the filesystem of %s with command '%s': %w",
			homeDir, findMntCmd.String(), err)
	}

	fields := strings.Fields(string(mountDescription))
	if len(fields) < 2 {
		return "", "", fmt.Errorf("Fail to parse output %q of command '%s'",
			string(mountDescription), findMntCmd.String())
	}

	// mount source, mount target, error
	return fields[0], fields[1], nil
}
93 | 


--------------------------------------------------------------------------------
/pkg/checkers/kube/objectsize/objectsize.go:
--------------------------------------------------------------------------------
  1 | package dns
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"encoding/json"
  6 | 	"fmt"
  7 | 
  8 | 	log "github.com/sirupsen/logrus"
  9 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 10 | 	"k8s.io/client-go/kubernetes"
 11 | 
 12 | 	"github.com/Azure/kdebug/pkg/base"
 13 | 	"github.com/dustin/go-humanize"
 14 | )
 15 | 
const (
	// WarnSizeThreshold is the serialized size in bytes above which an object
	// is reported as reaching the size limit.
	WarnSizeThreshold = 800 * (1 << 10) // 800 KB
)
 19 | 
// KubeObjectSizeChecker reports config maps and secrets whose JSON-encoded
// size exceeds WarnSizeThreshold.
type KubeObjectSizeChecker struct {
}
 22 | 
// New returns a KubeObjectSizeChecker.
func New() *KubeObjectSizeChecker {
	return &KubeObjectSizeChecker{}
}
 26 | 
// Name returns the checker identifier, "KubeObjectSize".
func (c *KubeObjectSizeChecker) Name() string {
	return "KubeObjectSize"
}
 30 | 
 31 | func (c *KubeObjectSizeChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
 32 | 	results := []*base.CheckResult{}
 33 | 
 34 | 	if ctx.KubeClient != nil {
 35 | 		results = append(results, c.checkConfigMaps(ctx.KubeClient)...)
 36 | 		results = append(results, c.checkSecrets(ctx.KubeClient)...)
 37 | 	} else {
 38 | 		log.Warn("Skip KubeObjectSizeChecker due to missing kube client")
 39 | 	}
 40 | 
 41 | 	return results, nil
 42 | }
 43 | 
 44 | func (c *KubeObjectSizeChecker) checkConfigMaps(clientset *kubernetes.Clientset) []*base.CheckResult {
 45 | 	results := []*base.CheckResult{}
 46 | 
 47 | 	cms, err := clientset.CoreV1().ConfigMaps("").List(context.Background(), metav1.ListOptions{})
 48 | 	if err != nil {
 49 | 		log.WithFields(log.Fields{"error": err}).Warn("Fail to list config maps")
 50 | 		return results
 51 | 	}
 52 | 
 53 | 	for _, cm := range cms.Items {
 54 | 		result := c.checkObjectSize("ConfigMap", cm.ObjectMeta.Namespace, cm.ObjectMeta.Name, cm)
 55 | 		if result != nil {
 56 | 			results = append(results, result)
 57 | 		}
 58 | 	}
 59 | 
 60 | 	return results
 61 | }
 62 | 
 63 | func (c *KubeObjectSizeChecker) checkSecrets(clientset *kubernetes.Clientset) []*base.CheckResult {
 64 | 	results := []*base.CheckResult{}
 65 | 
 66 | 	cms, err := clientset.CoreV1().Secrets("").List(context.Background(), metav1.ListOptions{})
 67 | 	if err != nil {
 68 | 		log.WithFields(log.Fields{"error": err}).Warn("Fail to list secrets")
 69 | 		return results
 70 | 	}
 71 | 
 72 | 	for _, cm := range cms.Items {
 73 | 		result := c.checkObjectSize("Secret", cm.ObjectMeta.Namespace, cm.ObjectMeta.Name, cm)
 74 | 		if result != nil {
 75 | 			results = append(results, result)
 76 | 		}
 77 | 	}
 78 | 
 79 | 	return results
 80 | }
 81 | 
 82 | func (c *KubeObjectSizeChecker) checkObjectSize(kind, ns, name string, obj interface{}) *base.CheckResult {
 83 | 	data, err := json.Marshal(obj)
 84 | 	if err != nil {
 85 | 		return nil
 86 | 	}
 87 | 
 88 | 	if len(data) > WarnSizeThreshold {
 89 | 		return &base.CheckResult{
 90 | 			Checker:     c.Name(),
 91 | 			Error:       fmt.Sprintf("%s %s/%s reaching size limit.", kind, ns, name),
 92 | 			Description: fmt.Sprintf("%s %s/%s of size %s is reaching size limit. It cannot exceed 1MiB.", kind, ns, name, humanize.Bytes(uint64(len(data)))),
 93 | 			Recommendations: []string{
 94 | 				"Consider mounting a volume or use a separate database or file service.",
 95 | 			},
 96 | 		}
 97 | 	}
 98 | 
 99 | 	return &base.CheckResult{
100 | 		Checker:     c.Name(),
101 | 		Description: fmt.Sprintf("%s %s/%s of size %s is not reaching size limit.", kind, ns, name, humanize.Bytes(uint64(len(data)))),
102 | 	}
103 | }
104 | 


--------------------------------------------------------------------------------
/cmd/run-as-host/main.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"io"
  6 | 	"io/ioutil"
  7 | 	"os"
  8 | 	"path"
  9 | 	"strings"
 10 | 	"time"
 11 | 
 12 | 	"github.com/coreos/go-systemd/v22/dbus"
 13 | 	log "github.com/sirupsen/logrus"
 14 | )
 15 | 
const (
	// SystemdConfigDir is the directory the unit file is written to.
	SystemdConfigDir = "/etc/systemd/system"
	// SystemdUnitName is the file name of the unit, also used to start it.
	SystemdUnitName = "kdebug.service"
	// SystemdUnitTemplate is the unit file content. The TODO_EXEC_START
	// placeholder is replaced with the command line to run.
	SystemdUnitTemplate = `[Unit]
Description=kdebug

[Service]
Type=oneshot
ExecStart=TODO_EXEC_START
TimeoutSec=60

[Install]
WantedBy=multi-user.target
`
	// OutputFile is the path passed to the command via --output and read back
	// after the unit has run.
	OutputFile = "/tmp/kdebug.stdout.log"
)
 32 | 
 33 | func copyFile(src, dst string) error {
 34 | 	in, err := os.Open(src)
 35 | 	if err != nil {
 36 | 		return err
 37 | 	}
 38 | 	defer in.Close()
 39 | 
 40 | 	out, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY, 0755)
 41 | 	if err != nil {
 42 | 		return err
 43 | 	}
 44 | 	defer out.Close()
 45 | 
 46 | 	if _, err = io.Copy(out, in); err != nil {
 47 | 		return err
 48 | 	}
 49 | 
 50 | 	return out.Sync()
 51 | }
 52 | 
 53 | func writeSystemdUnit(cmd string) error {
 54 | 	unitConfig := strings.Replace(SystemdUnitTemplate,
 55 | 		"TODO_EXEC_START", cmd, 1)
 56 | 	unitConfigPath := path.Join(SystemdConfigDir, SystemdUnitName)
 57 | 	return ioutil.WriteFile(unitConfigPath, []byte(unitConfig), 0644)
 58 | }
 59 | 
 60 | func removeSystemdUnit() error {
 61 | 	unitConfigPath := path.Join(SystemdConfigDir, SystemdUnitName)
 62 | 	return os.Remove(unitConfigPath)
 63 | }
 64 | 
 65 | func readOutputs() ([]byte, error) {
 66 | 	f, err := os.Open(OutputFile)
 67 | 	if err != nil {
 68 | 		return nil, err
 69 | 	}
 70 | 	defer f.Close()
 71 | 	return ioutil.ReadAll(f)
 72 | }
 73 | 
// main copies the binary named by the first argument to /tmp, wraps it in a
// one-shot systemd unit, starts that unit over D-Bus, waits for the job to
// finish, and finally prints the content of the output file to stdout.
//
// Every failure is reported with log.Fatal/Fatalf.
// NOTE(review): logrus Fatal exits the process, so on those paths the later
// cleanup steps (unit file and output file removal) are not reached — confirm
// this is acceptable.
func main() {
	if len(os.Args) < 2 {
		log.Fatal("not enough args")
	}

	cmd := os.Args[1]
	// The remaining arguments are forwarded, with --output pointing at the
	// file that is read back below.
	cmdArgs := append(os.Args[2:], "--output", OutputFile)

	// Copy binary to host
	baseName := path.Base(cmd)
	dstPath := path.Join("/tmp", baseName)
	if err := copyFile(cmd, dstPath); err != nil {
		log.Fatalf("fail to copy file: %+v", err)
	}

	// Set up system config
	// Arguments are joined with plain spaces; no quoting is applied.
	dstCmd := dstPath + " " + strings.Join(cmdArgs, " ")
	if err := writeSystemdUnit(dstCmd); err != nil {
		log.Fatalf("fail to write unit file: %+v", err)
	}

	// Invoke
	conn, err := dbus.NewSystemConnectionContext(context.Background())
	if err != nil {
		log.Fatalf("fail to connect to systemd: %+v", err)
	}
	defer conn.Close()

	// Reload so systemd picks up the unit file written above.
	if err = conn.ReloadContext(context.Background()); err != nil {
		log.Fatalf("fail to reload systemd: %+v", err)
	}

	// ch receives the job result once the start job completes.
	ch := make(chan string)
	_, err = conn.StartUnitContext(context.Background(),
		SystemdUnitName, "replace", ch)
	if err != nil {
		log.Fatalf("fail to start systemd unit: %+v", err)
	}

	// Wait for the job, but no longer than 75 seconds (the unit itself has
	// TimeoutSec=60). The received job result string is not inspected.
	select {
	case <-ch:
		break
	case <-time.After(75 * time.Second):
		log.Fatalf("timeout starting systemd unit")
	}

	output, err := readOutputs()
	if err != nil {
		log.Fatalf("fail to read output: %+v", err)
	}

	// Cleanup
	if err = removeSystemdUnit(); err != nil {
		log.Fatalf("fail to remove systemd unit: %+v", err)
	}

	if err = os.Remove(OutputFile); err != nil {
		log.Fatalf("fail to remove stdout file: %+v", err)
	}

	// Output
	os.Stdout.Write(output)
}
137 | 


--------------------------------------------------------------------------------
/pkg/tools/tcpdump/tcpdump.go:
--------------------------------------------------------------------------------
  1 | package tcpdump
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"os"
  6 | 	"os/exec"
  7 | 	"strings"
  8 | 
  9 | 	flags "github.com/jessevdk/go-flags"
 10 | 
 11 | 	"github.com/Azure/kdebug/pkg/base"
 12 | 	log "github.com/sirupsen/logrus"
 13 | )
 14 | 
// TcpdumpTool holds the capture filter pieces that ParseParameters extracts
// from Config and that GenerateTcpdumpParamerters turns into tcpdump arguments.
type TcpdumpTool struct {
	srcIP    string
	srcPort  string
	dstIP    string
	dstPort  string
	hostIP   string
	hostPort string
	pid      string
	tcpOnly  bool
}

const (
	// DefaultTcpdumpArguments is always placed in front of the generated filter.
	DefaultTcpdumpArguments = "-nvvv"
)

// Config is the set of command-line options accepted by the tcpdump tool. The
// Source, Destination and Host values are split at the first ':' by
// ParseIPAndPort.
type Config struct {
	Source      string `long:"source" description:"The source of the connection. Format: <ip>:<port>. Watch all sources if not assigned."`
	Destination string `long:"destination" description:"The destination of the connection. Format: <ip>:<port>. Watch all destination if not assigned."`
	Host        string `long:"host" description:"The host(either src or dst) of the connection. Format: <ip>:<port>. Watch if not assigned."`
	Pid         string `short:"p" long:"pid" description:"Attach into a specific pid's network namespace. Use current namespace if not assigned"`
	TcpOnly     bool   `long:"tcponly" description:"Only watch tcp connections"`
}
 37 | 
 38 | func New() *TcpdumpTool {
 39 | 	return &TcpdumpTool{}
 40 | }
 41 | 
 42 | func (c *TcpdumpTool) Name() string {
 43 | 	return "Tcpdump"
 44 | }
 45 | 
 46 | func logAndExec(name string, args ...string) *exec.Cmd {
 47 | 	log.Infof("Exec %s %+v", name, args)
 48 | 	return exec.Command(name, args...)
 49 | }
 50 | 
 51 | func (c *TcpdumpTool) ParseArgs(ctx *base.ToolContext, args []string) error {
 52 | 	var config Config
 53 | 	remainingArgs, err := flags.ParseArgs(&config, args)
 54 | 	if err != nil {
 55 | 		return err
 56 | 	}
 57 | 	ctx.Config = &config
 58 | 	ctx.Args = remainingArgs
 59 | 	return nil
 60 | }
 61 | 
 62 | func (c *TcpdumpTool) Run(ctx *base.ToolContext) error {
 63 | 	config := ctx.Config.(*Config)
 64 | 	c.ParseParameters(config)
 65 | 	tcpdumpArgs := c.GenerateTcpdumpParamerters()
 66 | 
 67 | 	// Attch pid
 68 | 	if len(config.Pid) > 0 {
 69 | 		_, err := logAndExec("nsenter", "-n", "-t", config.Pid).Output()
 70 | 
 71 | 		if err != nil {
 72 | 			return err
 73 | 		}
 74 | 	}
 75 | 
 76 | 	cmd := logAndExec("tcpdump", strings.Split(tcpdumpArgs, " ")...)
 77 | 	cmd.Stdout = os.Stdout
 78 | 	cmd.Stderr = os.Stderr
 79 | 	err := cmd.Run()
 80 | 	return err
 81 | }
 82 | 
 83 | func (c *TcpdumpTool) ParseParameters(config *Config) {
 84 | 	c.srcIP, c.srcPort = ParseIPAndPort(config.Source)
 85 | 	c.dstIP, c.dstPort = ParseIPAndPort(config.Destination)
 86 | 	c.hostIP, c.hostPort = ParseIPAndPort(config.Host)
 87 | 	c.pid = config.Pid
 88 | 	c.tcpOnly = config.TcpOnly
 89 | }
 90 | 
 91 | func (c *TcpdumpTool) GenerateTcpdumpParamerters() string {
 92 | 	var cmd []string
 93 | 	if len(c.srcIP) > 0 {
 94 | 		cmd = append(cmd, fmt.Sprintf("src %s", c.srcIP))
 95 | 	}
 96 | 	if len(c.srcPort) > 0 {
 97 | 		cmd = append(cmd, fmt.Sprintf("src port %s", c.srcPort))
 98 | 	}
 99 | 	if len(c.dstIP) > 0 {
100 | 		cmd = append(cmd, fmt.Sprintf("dst %s", c.dstIP))
101 | 	}
102 | 	if len(c.dstPort) > 0 {
103 | 		cmd = append(cmd, fmt.Sprintf("dst port %s", c.dstPort))
104 | 	}
105 | 	if len(c.hostIP) > 0 {
106 | 		cmd = append(cmd, fmt.Sprintf("host %s", c.hostIP))
107 | 	}
108 | 	if len(c.hostPort) > 0 {
109 | 		cmd = append(cmd, fmt.Sprintf("port %s", c.hostPort))
110 | 	}
111 | 	if c.tcpOnly {
112 | 		cmd = append(cmd, "tcp")
113 | 	}
114 | 	return DefaultTcpdumpArguments + " " + strings.Join(cmd, " and ")
115 | }
116 | 
// ParseIPAndPort splits s at its first ':' and returns the text before it as
// ip and the text after it as port. When s contains no ':' the whole string is
// returned as ip and port is empty.
func ParseIPAndPort(s string) (ip string, port string) {
	for i := 0; i < len(s); i++ {
		if s[i] == ':' {
			return s[:i], s[i+1:]
		}
	}
	return s, ""
}
125 | 


--------------------------------------------------------------------------------
/pkg/tools/upgradeinspector/upgradeinspector.go:
--------------------------------------------------------------------------------
  1 | package upgradeinspector
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"os/exec"
  6 | 	"strings"
  7 | 	"time"
  8 | 
  9 | 	"github.com/Azure/kdebug/pkg/base"
 10 | 	"github.com/Azure/kdebug/pkg/env"
 11 | 	"github.com/fatih/color"
 12 | 	flags "github.com/jessevdk/go-flags"
 13 | )
 14 | 
// logPath is the dpkg log file that is searched for upgrade records.
const logPath = "/var/log/dpkg.log"

// suggestion is printed after the report to point at the underlying logs.
const suggestion = "You can check '/var/log/dpkg.log' and '/var/log/apt/history.log' for further detail."

// columns are the header titles of the report table, in output order.
var columns = []string{
	"Timestamp",
	"Package",
	"OldVer",
	"NewVer",
}

// UpgradeInspectTool reports package upgrade records found in logPath.
type UpgradeInspectTool struct {
	// checkDays is how many days back from now a record may be to be kept.
	checkDays int
	// recordLimit caps how many records are printed.
	recordLimit int
}

// Config is the set of command-line options accepted by the tool.
type Config struct {
	CheckDays   int `long:"checkdays" default:"7" description:"Days you want to look back to search for package upgrade history. Default is 7."`
	RecordLimit int `long:"recordlimit" default:"50" description:"Number of records you want to inspect for package upgrade history. Default is 50."`
}
 35 | 
 36 | func (t *UpgradeInspectTool) Name() string {
 37 | 	return "upgradeinspector"
 38 | }
 39 | 
 40 | func New() *UpgradeInspectTool {
 41 | 	return &UpgradeInspectTool{}
 42 | }
 43 | 
 44 | func (t *UpgradeInspectTool) ParseArgs(ctx *base.ToolContext, args []string) error {
 45 | 	var config Config
 46 | 	remaningArgs, err := flags.ParseArgs(&config, args)
 47 | 	if err != nil {
 48 | 		return err
 49 | 	}
 50 | 	ctx.Config = &config
 51 | 	ctx.Args = remaningArgs
 52 | 	return nil
 53 | }
 54 | 
 55 | func (t *UpgradeInspectTool) Run(ctx *base.ToolContext) error {
 56 | 	t.parseArgument(ctx)
 57 | 	if !envCheck(ctx.Environment) {
 58 | 		fmt.Println(color.YellowString("Skip upgrade inspect in non ubuntu/debian os"))
 59 | 		return nil
 60 | 	}
 61 | 	return t.exec()
 62 | }
 63 | 
 64 | func (t *UpgradeInspectTool) parseArgument(ctx *base.ToolContext) {
 65 | 	config := ctx.Config.(*Config)
 66 | 	t.checkDays = config.CheckDays
 67 | 	t.recordLimit = config.RecordLimit
 68 | }
 69 | 
 70 | func (t *UpgradeInspectTool) exec() error {
 71 | 	cmd := exec.Command("grep", " upgrade ", logPath)
 72 | 	stdout, err := cmd.Output()
 73 | 	if err != nil {
 74 | 		return err
 75 | 	}
 76 | 	fmt.Println(t.parseResult(string(stdout)))
 77 | 	fmt.Println(color.YellowString("\n%v\n", suggestion))
 78 | 	return nil
 79 | }
 80 | 
 81 | func (t *UpgradeInspectTool) parseResult(result string) string {
 82 | 	sb := strings.Builder{}
 83 | 	logs := t.filterResult(result)
 84 | 	logNum := len(logs)
 85 | 
 86 | 	if logNum == 0 {
 87 | 		sb.WriteString(color.GreenString("\nNo package upgrade log found\n"))
 88 | 	} else {
 89 | 		sb.WriteString(fmt.Sprintf("\n%-19s\t%-40s\t%-30s\t%-30s\n\n", columns[0], columns[1], columns[2], columns[3]))
 90 | 	}
 91 | 
 92 | 	for i := 0; i < logNum && i < t.recordLimit; i++ {
 93 | 		strs := strings.Split(logs[i], " ")
 94 | 		sb.WriteString(fmt.Sprintf("%v-%v\t%-40s\t%-30s\t%-30s\n", strs[0], strs[1], strs[3], strs[4], strs[5]))
 95 | 	}
 96 | 	if t.recordLimit < logNum {
 97 | 		sb.WriteString(color.YellowString("\n%v package(s) omitted\n", logNum-t.recordLimit))
 98 | 	}
 99 | 	return sb.String()
100 | }
101 | 
102 | func (t *UpgradeInspectTool) filterResult(result string) []string {
103 | 	logs := strings.Split(result, "\n")
104 | 	filtered := []string{}
105 | 	cutTime := time.Now().AddDate(0, 0, -t.checkDays)
106 | 
107 | 	for i := 0; i < len(logs)-1; i++ {
108 | 		strs := strings.Split(logs[i], " ")
109 | 		logTime, err := time.Parse("2006-01-02 15:04:05", fmt.Sprintf(`%s %s`, strs[0], strs[1]))
110 | 		if err == nil && logTime.After(cutTime) {
111 | 			filtered = append(filtered, logs[i])
112 | 		}
113 | 	}
114 | 	return filtered
115 | }
116 | 
117 | func envCheck(environment env.Environment) bool {
118 | 	return environment.HasFlag("ubuntu") || environment.HasFlag("debian")
119 | }
120 | 


--------------------------------------------------------------------------------
/pkg/tools/aadssh/token_azure_cli.go:
--------------------------------------------------------------------------------
  1 | package aadssh
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"encoding/json"
  6 | 	"fmt"
  7 | 	"io/ioutil"
  8 | 	"net/http"
  9 | 	"net/url"
 10 | 	"os"
 11 | 	"path"
 12 | 	"strings"
 13 | 	"time"
 14 | 
 15 | 	msal "github.com/AzureAD/microsoft-authentication-library-for-go/apps/public"
 16 | 	log "github.com/sirupsen/logrus"
 17 | )
 18 | 
 19 | // acquireTokenByAzureCLI acquires a token from AAD using Azure CLI credentials
 20 | func acquireTokenByAzureCLI(ctx context.Context, scopes []string, data map[string]string) (msal.AuthResult, error) {
 21 | 	homeDir, err := os.UserHomeDir()
 22 | 	if err != nil {
 23 | 		return msal.AuthResult{}, fmt.Errorf("Fail to get OS home dir: %+v", err)
 24 | 	}
 25 | 
 26 | 	tokenCacheFilePath := path.Join(homeDir, AzureCLIDirName, AzureCLITokenCacheFileName)
 27 | 	f, err := os.Open(tokenCacheFilePath)
 28 | 	if err != nil {
 29 | 		return msal.AuthResult{}, fmt.Errorf("Fail to read Azure CLI token cache: %+v", err)
 30 | 	}
 31 | 	defer f.Close()
 32 | 
 33 | 	decoder := json.NewDecoder(f)
 34 | 	var tokenCache struct {
 35 | 		RefreshToken map[string]struct {
 36 | 			CredentialType string `json:"credential_type"`
 37 | 			Secret         string `json:"secret"`
 38 | 			ClientID       string `json:"client_id"`
 39 | 			HomeAccountID  string `json:"home_account_id"`
 40 | 			Environment    string `json:"environment"`
 41 | 		} `json:"RefreshToken"`
 42 | 	}
 43 | 	err = decoder.Decode(&tokenCache)
 44 | 	if err != nil {
 45 | 		return msal.AuthResult{}, fmt.Errorf("Fail to decode Azure CLI token cache: %+v", err)
 46 | 	}
 47 | 
 48 | 	var refreshToken string
 49 | 	var tenantId string
 50 | 	var clientId string
 51 | 	var host string
 52 | 	for _, token := range tokenCache.RefreshToken {
 53 | 		// TODO: Add more checks
 54 | 		if token.CredentialType == "RefreshToken" {
 55 | 			refreshToken = token.Secret
 56 | 			tenantId = strings.Split(token.HomeAccountID, ".")[1]
 57 | 			host = token.Environment
 58 | 			break
 59 | 		}
 60 | 	}
 61 | 
 62 | 	if refreshToken == "" {
 63 | 		return msal.AuthResult{}, fmt.Errorf("Cannot find any refresh token in Azure CLI token cache. Please do `az login`")
 64 | 	}
 65 | 
 66 | 	defaultScopes := []string{
 67 | 		"openid",
 68 | 		"profile",
 69 | 		"offline_access",
 70 | 	}
 71 | 	values := url.Values{}
 72 | 	values.Add("client_id", clientId)
 73 | 	values.Add("grant_type", "refresh_token")
 74 | 	values.Add("scope", strings.Join(append(scopes, defaultScopes...), " "))
 75 | 	values.Add("refresh_token", refreshToken)
 76 | 	for k, v := range data {
 77 | 		values.Add(k, v)
 78 | 	}
 79 | 	bodyString := values.Encode()
 80 | 	bodyStream := strings.NewReader(bodyString)
 81 | 
 82 | 	url := fmt.Sprintf("https://%s/%s%s", host, tenantId, TokenURLSuffix)
 83 | 	log.WithFields(log.Fields{"body": bodyString, "url": url}).Debug("Token request")
 84 | 
 85 | 	req, err := http.NewRequestWithContext(ctx, "POST", url, bodyStream)
 86 | 	if err != nil {
 87 | 		return msal.AuthResult{}, fmt.Errorf("Fail to construct request: %+v", err)
 88 | 	}
 89 | 
 90 | 	httpClient := &http.Client{
 91 | 		Timeout: time.Minute,
 92 | 	}
 93 | 	resp, err := httpClient.Do(req)
 94 | 	if err != nil {
 95 | 		return msal.AuthResult{}, fmt.Errorf("Fail to request token: %+v", err)
 96 | 	}
 97 | 	defer resp.Body.Close()
 98 | 
 99 | 	if resp.StatusCode != http.StatusOK {
100 | 		respContent, _ := ioutil.ReadAll(resp.Body)
101 | 		return msal.AuthResult{},
102 | 			fmt.Errorf("Unexpected token response status code: %d. Body: %s",
103 | 				resp.StatusCode, string(respContent))
104 | 	}
105 | 
106 | 	var body struct {
107 | 		AccessToken  string `json:"access_token"`
108 | 		TokenType    string `json:"token_type"`
109 | 		ExpiresIn    int    `json:"expires_in"`
110 | 		Scope        string `json:"scope"`
111 | 		RefreshToken string `json:"refresh_token"`
112 | 		IDToken      string `json:"id_token"`
113 | 	}
114 | 	decoder = json.NewDecoder(resp.Body)
115 | 	err = decoder.Decode(&body)
116 | 	if err != nil {
117 | 		return msal.AuthResult{}, fmt.Errorf("Fail to decode token response: %+v", err)
118 | 	}
119 | 
120 | 	log.WithFields(log.Fields{"body": fmt.Sprintf("%+v", body)}).Debug("Token response")
121 | 
122 | 	return msal.AuthResult{
123 | 		AccessToken: body.AccessToken,
124 | 		ExpiresOn:   time.Now().Add(time.Duration(body.ExpiresIn) * time.Second),
125 | 	}, nil
126 | }
127 | 


--------------------------------------------------------------------------------
/pkg/tools/netexec/netexec.go:
--------------------------------------------------------------------------------
  1 | package netexec
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"os"
  6 | 	"os/exec"
  7 | 	"strings"
  8 | 
  9 | 	"github.com/Azure/kdebug/pkg/base"
 10 | 	"github.com/jessevdk/go-flags"
 11 | 	log "github.com/sirupsen/logrus"
 12 | 	"k8s.io/cli-runtime/pkg/genericclioptions"
 13 | 	kubecmd "k8s.io/kubectl/pkg/cmd"
 14 | )
 15 | 
// Config is the set of command-line options accepted by the netexec tool.
// Exactly one of Pid and PodName must be given (enforced by
// parseAndCheckParameters).
//
// NOTE(review): the --pod description mentions 'ghcr.io/azure/kdebug:main'
// while the --image description and DefaultContainerImage say busybox; confirm
// which one is intended.
type Config struct {
	Pid       string `long:"pid" description:"Attach into a specific pid's network namespace."`
	PodName   string `long:"pod" description:"Attach into a specific pod's network namespace. Caution: The command will use ephemeral debug container to attach a container with 'ghcr.io/azure/kdebug:main' to the target pod."`
	Namespace string `long:"namespace" description:"The namespace of the pod."`
	Command   string `long:"command" description:"Customize the command to be run in container namespace. Leave it blank to use 'sh'."`
	Image     string `long:"image" description:"Customize the image to be used to run command when using --pod. Leave it blank to use busybox."`
}

// NetexecTool holds the validated options, with defaults applied, that
// parseAndCheckParameters derives from Config.
type NetexecTool struct {
	pid       string
	podName   string
	namespace string
	command   string
	image     string
}

const (
	// DefaultCommand is used when --command is not given.
	DefaultCommand = "sh"
	// DefaultContainerImage is used when --image is not given.
	DefaultContainerImage = "busybox"
	// DefaultNamespace is used when --namespace is not given.
	DefaultNamespace = "default"
	// DefaultKubectlBasicCommandFormat is the kubectl argument template; its
	// verbs are filled with pod name, image and namespace, in that order, and
	// the command to run is appended after the trailing "-- ".
	DefaultKubectlBasicCommandFormat = "debug -ti %s --image %s -n %s -- "
)
 38 | 
 39 | func New() *NetexecTool {
 40 | 	return &NetexecTool{}
 41 | }
 42 | 
 43 | func (c *NetexecTool) Name() string {
 44 | 	return "Netexec"
 45 | }
 46 | 
 47 | func logAndExec(name string, args ...string) *exec.Cmd {
 48 | 	log.Infof("Exec %s %+v", name, args)
 49 | 	return exec.Command(name, args...)
 50 | }
 51 | 
 52 | func (c *NetexecTool) Run(ctx *base.ToolContext) error {
 53 | 	if len(c.pid) > 0 {
 54 | 		return c.checkWithPid()
 55 | 	}
 56 | 
 57 | 	return c.checkWithPod(ctx.KubeConfigFlag)
 58 | }
 59 | 
 60 | func (c *NetexecTool) ParseArgs(ctx *base.ToolContext, args []string) error {
 61 | 	var config Config
 62 | 	remainingArgs, err := flags.ParseArgs(&config, args)
 63 | 	if err != nil {
 64 | 		return err
 65 | 	}
 66 | 	ctx.Config = &config
 67 | 	ctx.Args = remainingArgs
 68 | 	return c.parseAndCheckParameters(ctx)
 69 | }
 70 | 
 71 | func (c *NetexecTool) parseAndCheckParameters(ctx *base.ToolContext) error {
 72 | 	config := ctx.Config.(*Config)
 73 | 
 74 | 	if len(config.Pid) == 0 && len(config.PodName) == 0 {
 75 | 		return fmt.Errorf("Either --pid and --pod should be set.")
 76 | 	}
 77 | 	if len(config.Pid) > 0 && len(config.PodName) > 0 {
 78 | 		return fmt.Errorf("--pid and --pod can not be assigned together. Please set either of them.")
 79 | 	}
 80 | 	if len(config.PodName) > 0 {
 81 | 		if ctx.KubeConfigFlag == nil {
 82 | 			return fmt.Errorf("kubernetes client is not availble. Check kubeconfig.")
 83 | 		}
 84 | 	}
 85 | 
 86 | 	c.pid = config.Pid
 87 | 	c.podName = config.PodName
 88 | 	if len(config.Command) > 0 {
 89 | 		c.command = config.Command
 90 | 	} else {
 91 | 		c.command = DefaultCommand
 92 | 	}
 93 | 
 94 | 	if len(config.Image) > 0 {
 95 | 		c.image = config.Image
 96 | 	} else {
 97 | 		c.image = DefaultContainerImage
 98 | 	}
 99 | 
100 | 	if len(config.Namespace) > 0 {
101 | 		c.namespace = config.Namespace
102 | 	} else {
103 | 		c.namespace = DefaultNamespace
104 | 	}
105 | 
106 | 	return nil
107 | }
108 | 
109 | func (c *NetexecTool) checkWithPid() error {
110 | 	_, err := logAndExec("nsenter", "-n", "-t", c.pid).Output()
111 | 	if err != nil {
112 | 		return err
113 | 	}
114 | 
115 | 	args := strings.Fields(c.command)
116 | 	cmd := logAndExec(args[0], args[1:]...)
117 | 	cmd.Stdin = os.Stdin
118 | 	cmd.Stdout = os.Stdout
119 | 	cmd.Stderr = os.Stderr
120 | 	return cmd.Run()
121 | }
122 | 
123 | func (c *NetexecTool) checkWithPod(configFlags *genericclioptions.ConfigFlags) error {
124 | 	cmd := fmt.Sprintf("%s%s", fmt.Sprintf(DefaultKubectlBasicCommandFormat, c.podName, c.image, c.namespace), c.command)
125 | 	arg := strings.Fields(cmd)
126 | 	log.Infof("The command is equivalent to 'kubectl %s'", cmd)
127 | 	kubectlCmd := kubecmd.NewKubectlCommand(kubecmd.KubectlOptions{
128 | 		ConfigFlags: configFlags,
129 | 		IOStreams:   genericclioptions.IOStreams{In: os.Stdin, Out: os.Stdout, ErrOut: os.Stderr},
130 | 	})
131 | 	kubectlCmd.SetArgs(arg)
132 | 
133 | 	return kubectlCmd.Execute()
134 | }
135 | 


--------------------------------------------------------------------------------
/pkg/batch/ssh_executor.go:
--------------------------------------------------------------------------------
  1 | package batch
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"encoding/json"
  6 | 	"fmt"
  7 | 	"net"
  8 | 	"os"
  9 | 	"os/user"
 10 | 
 11 | 	scp "github.com/bramvdbogaerde/go-scp"
 12 | 	log "github.com/sirupsen/logrus"
 13 | 	"golang.org/x/crypto/ssh"
 14 | 	"golang.org/x/crypto/ssh/agent"
 15 | )
 16 | 
// SshBatchExecutor runs kdebug on remote machines over SSH.
type SshBatchExecutor struct {
	// User is the login name used for every SSH connection.
	User string
}
 20 | 
 21 | func NewSshBatchExecutor(userName string) *SshBatchExecutor {
 22 | 	e := &SshBatchExecutor{
 23 | 		User: userName,
 24 | 	}
 25 | 	if len(e.User) == 0 {
 26 | 		// Use current user
 27 | 		ui, err := user.Current()
 28 | 		if err == nil {
 29 | 			e.User = ui.Username
 30 | 		}
 31 | 	}
 32 | 	return e
 33 | }
 34 | 
 35 | func (e *SshBatchExecutor) Execute(opts *BatchOptions) ([]*BatchResult, error) {
 36 | 	taskChan := make(chan *batchTask, opts.Concurrency)
 37 | 	resultChan := make(chan *BatchResult, opts.Concurrency)
 38 | 
 39 | 	for i := 0; i < opts.Concurrency; i++ {
 40 | 		go e.startWorker(taskChan, resultChan)
 41 | 	}
 42 | 
 43 | 	for _, machine := range opts.Machines {
 44 | 		go func(m string) {
 45 | 			taskChan <- &batchTask{
 46 | 				Machine:  m,
 47 | 				Checkers: opts.Checkers,
 48 | 			}
 49 | 		}(machine)
 50 | 	}
 51 | 
 52 | 	results := make([]*BatchResult, 0, len(opts.Machines))
 53 | 	for i := 0; i < len(opts.Machines); i++ {
 54 | 		result := <-resultChan
 55 | 		results = append(results, result)
 56 | 		opts.Reporter.OnResult(result)
 57 | 	}
 58 | 
 59 | 	close(taskChan)
 60 | 
 61 | 	return results, nil
 62 | }
 63 | 
 64 | func (e *SshBatchExecutor) startWorker(taskChan chan *batchTask, resultChan chan *BatchResult) {
 65 | 	for task := range taskChan {
 66 | 		resultChan <- e.executeTask(task)
 67 | 	}
 68 | }
 69 | 
 70 | func (e *SshBatchExecutor) createSshClient(machine string) (*ssh.Client, error) {
 71 | 	// TODO: One per SSH client
 72 | 	authSock := os.Getenv("SSH_AUTH_SOCK")
 73 | 	authConn, err := net.Dial("unix", authSock)
 74 | 	if err != nil {
 75 | 		return nil, fmt.Errorf("fail to connect to SSH_AUTH_SOCK: %+v", err)
 76 | 	}
 77 | 
 78 | 	agentClient := agent.NewClient(authConn)
 79 | 	config := &ssh.ClientConfig{
 80 | 		User: e.User,
 81 | 		Auth: []ssh.AuthMethod{
 82 | 			ssh.PublicKeysCallback(agentClient.Signers),
 83 | 		},
 84 | 		HostKeyCallback: ssh.InsecureIgnoreHostKey(),
 85 | 	}
 86 | 
 87 | 	return ssh.Dial("tcp", machine+":22", config)
 88 | }
 89 | 
 90 | func (e *SshBatchExecutor) executeTask(task *batchTask) *BatchResult {
 91 | 	result := &BatchResult{
 92 | 		Machine: task.Machine,
 93 | 	}
 94 | 
 95 | 	sshClient, err := e.createSshClient(task.Machine)
 96 | 	if err != nil {
 97 | 		result.Error = fmt.Errorf("fail to create SSH client: %+v", err)
 98 | 		return result
 99 | 	}
100 | 	defer sshClient.Close()
101 | 
102 | 	// Copy binary to remote
103 | 	log.Debugf("Copy kdebug to %s", task.Machine)
104 | 	err = copyExecutable(sshClient)
105 | 	if err != nil {
106 | 		result.Error = fmt.Errorf("fail to copy kdebug to remote machine: %+v", err)
107 | 		return result
108 | 	}
109 | 
110 | 	sess, err := sshClient.NewSession()
111 | 	if err != nil {
112 | 		result.Error = fmt.Errorf("fail to create SSH session: %+v", err)
113 | 		return result
114 | 	}
115 | 	defer sess.Close()
116 | 
117 | 	// Execute command
118 | 	cmd := fmt.Sprintf("/tmp/kdebug -f json --no-set-exit-code")
119 | 	for _, c := range task.Checkers {
120 | 		cmd += fmt.Sprintf(" -c %s", c)
121 | 	}
122 | 	log.Debugf("Execute kdebug on %s. Cmd: %s", task.Machine, cmd)
123 | 	output, err := sess.Output(cmd)
124 | 	if err != nil {
125 | 		result.Error = fmt.Errorf("fail to run kdebug on remote machine: %+v", err)
126 | 		return result
127 | 	}
128 | 
129 | 	// Build result
130 | 	log.Debugf("Aggregate results from %s", task.Machine)
131 | 	result.Error = json.Unmarshal(output, &result.CheckResults)
132 | 	return result
133 | }
134 | 
135 | func copyExecutable(sshClient *ssh.Client) error {
136 | 	path, err := os.Executable()
137 | 	if err != nil {
138 | 		return fmt.Errorf("fail to determine current executable location: %+v", err)
139 | 	}
140 | 
141 | 	f, err := os.Open(path)
142 | 	if err != nil {
143 | 		return fmt.Errorf("fail to open file %s: %+v", path, err)
144 | 	}
145 | 	defer f.Close()
146 | 
147 | 	scpClient, err := scp.NewClientBySSH(sshClient)
148 | 	if err != nil {
149 | 		return fmt.Errorf("fail to create SCP client: %+v", err)
150 | 	}
151 | 
152 | 	return scpClient.CopyFromFile(context.Background(), *f, "/tmp/kdebug", "0755")
153 | }
154 | 


--------------------------------------------------------------------------------
/pkg/checkers/oom/oom.go:
--------------------------------------------------------------------------------
  1 | package oom
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"errors"
  6 | 	"fmt"
  7 | 	"io"
  8 | 	"os"
  9 | 	"regexp"
 10 | 	"strings"
 11 | 	"syscall"
 12 | 
 13 | 	"github.com/Azure/kdebug/pkg/base"
 14 | )
 15 | 
const (
	// kmsgLogPath and ubuntuLogPath are the kernel log sources tried by New,
	// in this order.
	kmsgLogPath   = "/dev/kmsg"
	ubuntuLogPath = "/var/log/kern.log"
	// cgroupOOMKeyStr and outOfMemoryKey are the substrings that mark a log
	// line as an OOM kill record.
	cgroupOOMKeyStr = "Memory cgroup out of memory"
	outOfMemoryKey  = "Out of memory"
)

// helpLink lists the references attached to a result when an OOM kill is found.
var helpLink = []string{
	"https://www.kernel.org/doc/gorman/html/understand/understand016.html",
	"https://stackoverflow.com/questions/18845857/what-does-anon-rss-and-total-vm-mean",
	"https://medium.com/tailwinds-navigator/kubernetes-tip-how-does-oomkilled-work-ba71b135993b",
}

// oomRegex extracts five groups from an OOM log line: the leading text up to
// an "..:xx:xx" time, the process id, the process name, the anon-rss value and
// the oom_score_adj value.
var oomRegex = regexp.MustCompile("^(.*:.{2}:.{2}) .* process (.*) \\((.*)\\) .* anon-rss:(.*), file-rss.* oom_score_adj:(.*)")

// OOMChecker scans a kernel log for OOM kill records.
type OOMChecker struct {
	// kernLogPath is the log file to read; empty when none could be opened.
	kernLogPath string
}
 34 | 
 35 | func (c *OOMChecker) Name() string {
 36 | 	return "OOM"
 37 | }
 38 | 
 39 | func New() *OOMChecker {
 40 | 	paths := []string{kmsgLogPath, ubuntuLogPath}
 41 | 	for _, path := range paths {
 42 | 		if file, err := os.Open(path); err == nil {
 43 | 			file.Close()
 44 | 			return &OOMChecker{
 45 | 				kernLogPath: path,
 46 | 			}
 47 | 		}
 48 | 	}
 49 | 	return &OOMChecker{
 50 | 		kernLogPath: "",
 51 | 	}
 52 | }
 53 | 
 54 | func (c *OOMChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
 55 | 	var results []*base.CheckResult
 56 | 	oomResult, err := c.checkOOM(ctx)
 57 | 	if err != nil {
 58 | 		return nil, err
 59 | 	}
 60 | 	results = append(results, oomResult)
 61 | 	return results, nil
 62 | }
 63 | 
 64 | func (c *OOMChecker) checkOOM(ctx *base.CheckContext) (*base.CheckResult, error) {
 65 | 	result := &base.CheckResult{
 66 | 		Checker: c.Name(),
 67 | 	}
 68 | 	//todo:support other os
 69 | 	if !ctx.Environment.HasFlag("linux") {
 70 | 		result.Description = fmt.Sprint("Skip oom check in non-linux os")
 71 | 		return result, nil
 72 | 	}
 73 | 	if c.kernLogPath == "" {
 74 | 		result.Description = fmt.Sprint("Skip oom check because of can't access supported kern log path")
 75 | 		return result, nil
 76 | 	}
 77 | 	oomInfos, err := c.getAndParseOOMLog()
 78 | 	if err != nil {
 79 | 		return nil, err
 80 | 	} else if len(oomInfos) > 0 {
 81 | 		result.Error = strings.Join(oomInfos, "\n")
 82 | 		result.Description = "Detect process oom killed"
 83 | 		result.HelpLinks = helpLink
 84 | 	} else {
 85 | 		result.Description = "No OOM found in recent kernlog."
 86 | 	}
 87 | 	return result, nil
 88 | }
 89 | 
 90 | type nonBlockReader struct {
 91 | 	fd int
 92 | }
 93 | 
 94 | func (r *nonBlockReader) Read(buf []byte) (n int, err error) {
 95 | 	n, err = syscall.Read(r.fd, buf)
 96 | 	if err != nil {
 97 | 		if errors.Is(err, syscall.EAGAIN) {
 98 | 			return 0, io.EOF
 99 | 		}
100 | 	}
101 | 	if n == 0 && err == nil {
102 | 		return 0, io.EOF
103 | 	}
104 | 	return n, err
105 | }
106 | 
107 | func (c *OOMChecker) getAndParseOOMLog() ([]string, error) {
108 | 	file, err := os.Open(c.kernLogPath)
109 | 	if err != nil {
110 | 		return nil, err
111 | 	}
112 | 	defer file.Close()
113 | 
114 | 	fd := int(file.Fd())
115 | 	if err = syscall.SetNonblock(fd, true); err != nil {
116 | 		return nil, fmt.Errorf("Fail to read in non-block mode: %s", err)
117 | 	}
118 | 
119 | 	var oomInfos []string
120 | 	scanner := bufio.NewScanner(&nonBlockReader{fd})
121 | 	for scanner.Scan() {
122 | 		tmp := scanner.Text()
123 | 		//todo: more sophisticated OOM context
124 | 		//pattern match. https://github.com/torvalds/linux/blob/551acdc3c3d2b6bc97f11e31dcf960bc36343bfc/mm/oom_kill.c#L1120, https://github.com/torvalds/linux/blob/551acdc3c3d2b6bc97f11e31dcf960bc36343bfc/mm/oom_kill.c#L895
125 | 		if strings.Contains(tmp, cgroupOOMKeyStr) || strings.Contains(tmp, outOfMemoryKey) {
126 | 			oomInfo, err := parseOOMContent(tmp)
127 | 			if err != nil {
128 | 				return nil, err
129 | 			} else {
130 | 				oomInfos = append(oomInfos, oomInfo)
131 | 			}
132 | 		}
133 | 	}
134 | 
135 | 	if err := scanner.Err(); err != nil {
136 | 		return nil, err
137 | 	}
138 | 	return oomInfos, nil
139 | }
140 | 
141 | func parseOOMContent(content string) (string, error) {
142 | 	match := oomRegex.FindStringSubmatch(content)
143 | 	if len(match) != 6 {
144 | 		err := fmt.Errorf("Can't parse oom content:%s \n", content)
145 | 		return "", err
146 | 	} else {
147 | 		return fmt.Sprintf("progress:[%s %s] is OOM kill at time [%s]. [rss:%s] [oom_score_adj:%s]\n", match[2], match[3], match[1], match[4], match[5]), nil
148 | 	}
149 | }
150 | 


--------------------------------------------------------------------------------
/pkg/checkers/tcpping/tcpping.go:
--------------------------------------------------------------------------------
  1 | package tcpping
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"fmt"
  6 | 	"net"
  7 | 	"strings"
  8 | 	"time"
  9 | 
 10 | 	v1 "k8s.io/api/core/v1"
 11 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 12 | 
 13 | 	"github.com/Azure/kdebug/pkg/base"
 14 | )
 15 | 
// KubernetesServiceHost is the name of an environment variable; it is not
// referenced in the code visible in this file.
const KubernetesServiceHost = "KUBERNETES_SERVICE_HOST"

// TimeOut is the dial timeout used for every TCP ping.
const TimeOut = 1000 * time.Millisecond

// PublicTargets are the endpoints that are always checked.
var PublicTargets = []pingEndpoint{
	{
		ServerAddress: "www.google.com:443",
		Name:          "Google",
		NameSpace:     "",
	},
}

// pingEndpoint describes one TCP target to dial.
type pingEndpoint struct {
	// ServerAddress is the "host:port" passed to the dialer.
	ServerAddress string
	// Name is a human-readable label used in result descriptions.
	Name string
	// NameSpace, when non-empty, is appended to the result description.
	NameSpace string
}
 32 | 
 33 | func (t *TCPChecker) ping(serverAddr string) error {
 34 | 	conn, err := t.dialer.Dial("tcp", serverAddr)
 35 | 	if err != nil {
 36 | 		return err
 37 | 	}
 38 | 	defer conn.Close()
 39 | 	conn.(*net.TCPConn).SetLinger(0)
 40 | 	return nil
 41 | }
 42 | 
// TCPChecker verifies TCP connectivity to a set of endpoints.
type TCPChecker struct {
	// dialer is used for every connection attempt.
	dialer net.Dialer
	// targets are checked in addition to those returned by getCheckTargets.
	targets []pingEndpoint
}
 47 | 
 48 | func New() *TCPChecker {
 49 | 	return &TCPChecker{
 50 | 		dialer: net.Dialer{
 51 | 			Timeout: TimeOut,
 52 | 		},
 53 | 	}
 54 | }
 55 | 
 56 | func (t *TCPChecker) Name() string {
 57 | 	return "TcpChecker"
 58 | }
 59 | 
 60 | func (t *TCPChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
 61 | 	var results []*base.CheckResult
 62 | 	targets := append(t.targets, getCheckTargets(ctx)...)
 63 | 	resultChan := make(chan *base.CheckResult, len(targets))
 64 | 	for _, pingTarget := range targets {
 65 | 		go func(target pingEndpoint) {
 66 | 			result := &base.CheckResult{
 67 | 				Checker: t.Name(),
 68 | 			}
 69 | 			err := t.ping(target.ServerAddress)
 70 | 			sb := strings.Builder{}
 71 | 			if err != nil {
 72 | 				sb.WriteString(fmt.Sprintf("Fail to establish tcp connection to %s (%s) ",
 73 | 					target.ServerAddress, target.Name))
 74 | 				result.Error = err.Error()
 75 | 				result.Recommendations = []string{"Check firewall settings if this is not expected."}
 76 | 			} else {
 77 | 				sb.WriteString(fmt.Sprintf("Successfully establish tcp connection to %s (%s)", target.ServerAddress, target.Name))
 78 | 			}
 79 | 			if target.NameSpace != "" {
 80 | 				sb.WriteString(fmt.Sprintf(" in namespace %s", target.NameSpace))
 81 | 			}
 82 | 			sb.WriteString("\n")
 83 | 			result.Description = sb.String()
 84 | 			resultChan <- result
 85 | 		}(pingTarget)
 86 | 	}
 87 | 	for i := 0; i < len(targets); i++ {
 88 | 		result := <-resultChan
 89 | 		results = append(results, result)
 90 | 	}
 91 | 	return results, nil
 92 | }
 93 | 
 94 | func getCheckTargets(c *base.CheckContext) []pingEndpoint {
 95 | 	var targets []pingEndpoint
 96 | 	targets = append(targets, PublicTargets...)
 97 | 	// TODO: A bit noisy. Maybe add a new subset option for user to enable these checks
 98 | 	// if c.KubeClient != nil {
 99 | 	// 	services, err := getServicePingEndpoint(c)
100 | 	// 	if err != nil {
101 | 	// 		log.Warnf("Fetch cluster service ping endpoint error %v.Skip those checks", err)
102 | 	// 	} else {
103 | 	// 		targets = append(targets, services...)
104 | 	// 	}
105 | 	// }
106 | 	return targets
107 | }
108 | 
109 | func getServicePingEndpoint(c *base.CheckContext) ([]pingEndpoint, error) {
110 | 	services, err := c.KubeClient.CoreV1().Services("").List(context.TODO(), metav1.ListOptions{})
111 | 	isInKubernetes := c.Environment.HasFlag("k8s")
112 | 	if err != nil {
113 | 		return nil, err
114 | 	}
115 | 	var pingEndpoints []pingEndpoint
116 | 	for _, service := range services.Items {
117 | 		for _, port := range service.Spec.Ports {
118 | 			if port.Protocol == v1.ProtocolTCP {
119 | 				address := formatIP(service.Spec.LoadBalancerIP)
120 | 				if address == "" && len(service.Status.LoadBalancer.Ingress) > 0 {
121 | 					address = formatIP(service.Status.LoadBalancer.Ingress[0].IP)
122 | 				}
123 | 				if address == "" && isInKubernetes {
124 | 					address = formatIP(service.Spec.ClusterIP)
125 | 				}
126 | 				if address != "" {
127 | 					serverUrl := fmt.Sprintf("%s:%d", address, port.Port)
128 | 					pingEndpoints = append(pingEndpoints, pingEndpoint{
129 | 						ServerAddress: serverUrl,
130 | 						Name:          service.Name,
131 | 						NameSpace:     service.Namespace,
132 | 					})
133 | 				}
134 | 			}
135 | 		}
136 | 
137 | 	}
138 | 	return pingEndpoints, nil
139 | }
140 | 
// formatIP prepares address for use in a "host:port" string. An empty address
// or the literal "None" yields "", an address containing ':' is wrapped in
// square brackets, and anything else is returned unchanged.
func formatIP(address string) string {
	switch {
	case address == "", address == "None":
		return ""
	case strings.Contains(address, ":"):
		return fmt.Sprintf("[%s]", address)
	default:
		return address
	}
}
151 | 


--------------------------------------------------------------------------------
/deploy/node-problem-detector/README.md:
--------------------------------------------------------------------------------
 1 | ## What is npd-kdebug
 2 | 
 3 | [node-problem-detector](https://github.com/kubernetes/node-problem-detector) aims to make various node problems visible to the upstream layers in the cluster management stack. It is a daemon that runs on each node, detects node problems and reports them to apiserver. node-problem-detector can either run as a DaemonSet or run standalone. Now it is running as a Kubernetes Addon enabled by default in the GCE cluster.
 4 | 
 5 | In this example, we integrate node-problem-detector with kdebug. After kdebug is deployed with a specific check mode alongside node-problem-detector, kdebug tries to detect potential problems covered by that check mode. The check results are passed to node-problem-detector and reported through its problem APIs:
 6 | * `NodeCondition`: Permanent problem that makes the node unavailable for pods should
 7 | be reported as `NodeCondition`.
 8 | * `Event`: Temporary problem that has limited impact on pod but is informative
 9 | should be reported as `Event`.
10 | 
11 | We call the integration of node-problem-detector(npd) and kdebug as **npd-kdebug**.
12 | 
13 | ## How to deploy npd-kdebug
14 | 
15 | We have already prepared a [sample yaml](./node-problem-detector.yaml) file to help you deploy the npd-kdebug integration with the DNS check mode. Run the following command to deploy the integrated daemon to your Kubernetes cluster.
16 | ```shell
17 | kubectl apply -f ./node-problem-detector.yaml
18 | ```
19 | 
20 | ## What can npd-kdebug show you
21 | 
22 | ### Check the npd-kdebug is ready
23 | * In kubernetes dashboard, you can click `Daemon Sets` in the side bar. 
24 | * If you see information like the following picture, it means that npd-kdebug is working on your cluster. 
25 | 
26 | ![image](../../resource/npd/npd-dashboard-daemonsets.png)
27 | 
28 | ### Check the problem detecting result of kdebug check
29 | * Click `Cluster` > `Nodes`, and select a node. 
30 | * In the 'Conditions' tag, you can see a `'DNSProblem'` type. This is a type of problem that is detected by kdebug and reported to node-problem-detector, which in turn shows the `Status` and `Messages`. 
31 |     * If `Status=False`, it means there is no DNS problem. 
32 |     * If `Status=True`, it means npd-kdebug detected some DNS problems, and error messages show in `Messages`.
33 | 
34 | ![image](../../resource/npd/npd-dashboard-DNSProblem.png)
35 | 
36 | ## Customization
37 | 
38 | Besides the `DNSProblem` check, you can integrate other kdebug check modes with npd. To customize npd-kdebug for a different check mode, follow the step-by-step tutorial in this section.
39 | 
40 | * Step 1: Copy the [template yaml](./node-problem-detector-template.yaml) and open it.
41 | * Step 2: Replace `<YOUR_JSON_NAME>` at `line 42` with the json file name you want. We recommend including the name of the check mode you want to deploy, for example `kdebug-http`.
42 | ![image](../../resource/npd/yml-your_json_name.png)
43 | * Step 3: Now edit your config json. This part describes the parameters used to run `kdebug`, which acts as the custom plugin of npd.
44 | * Step 4: You should replace `<YOUR_JSON_NAME>` with the json file name you entered in Step 2. 
45 | ![image](../../resource/npd/yml-json-your_json_name.png)
46 |     
47 | * Step 5: In `"conditions"`, fill in the values of `type`, `reason` and `message`, which will be shown in the dashboard if the `kdebug` check passes. 
48 | ![image](../../resource/npd/yml-json-conditions.png) 
49 | 
50 | * Step 6: Define rules by editing the `rules` property. Because npd supports two different types of checks, you should define two rules, one of type `temporary` and one of type `permanent`. You can define both rules with the same `kdebug` command parameters.
51 | 
52 |     You can replace `<CONDITION_NAME>` and `reason` as you want. For `<CHECK_MODE>` in the `args` property, replace it with the name of the check mode (e.g. `'http'` for the HTTP check). Because only the conditions defined in Step 5 can be reported, you should include the `-c` flag in `args`; otherwise kdebug would execute all check modes. The `-f` flag formats the output of the kdebug check, which will be shown as [Message](#what-can-npd-kdebug-show-you) in the dashboard. For more supported kdebug arguments, refer to the help message by running the following command.
53 | 
54 |     ```shell
55 |     kdebug -h
56 |     ``` 
57 |     ![image](../../resource/npd/yml-json-rules.png) 
58 | 
59 | * Step 7: Now you can use your customized yaml file to deploy your npd-kdebug by following [How to deploy npd-kdebug](#how-to-deploy-npd-kdebug)


--------------------------------------------------------------------------------
/pkg/tools/aadssh/aadssh.go:
--------------------------------------------------------------------------------
  1 | package aadssh
  2 | 
  3 | import (
  4 | 	"encoding/base64"
  5 | 	"fmt"
  6 | 	"os"
  7 | 	"os/exec"
  8 | 	"path"
  9 | 
 10 | 	flags "github.com/jessevdk/go-flags"
 11 | 	log "github.com/sirupsen/logrus"
 12 | 	"golang.org/x/crypto/ssh"
 13 | 
 14 | 	"github.com/Azure/kdebug/pkg/base"
 15 | )
 16 | 
// Identifiers and file names used by the AAD SSH tool. The SSH* names are
// joined onto the SSH key directory in Run to locate the private key, the
// public key and the certificate.
// NOTE(review): the AzureCLIDirName and AzureCLITokenCacheFileName usages are
// not visible in this file; presumably they locate the Azure CLI token cache.
const (
	// Extracted from Azure CLI code
	AzureCLIClientId           = "04b07795-8ddb-461a-bbee-02f9e1bf7b46"
	AzureCLIDirName            = ".azure"
	AzureCLITokenCacheFileName = "msal_token_cache.json"
	SSHDirName                 = ".aad-ssh"
	SSHPrivateKeyName          = "id_rsa"
	SSHPublicKeyName           = "id_rsa.pub"
	SSHCertificateName         = "id_rsa-cert.pub"
)
 27 | 
// AadSsh is the "AAD SSH" tool. It carries no state of its own; per-run
// settings are read from the tool context.
type AadSsh struct {
}
 30 | 
// Config holds the command-line options of the AAD SSH tool. It is filled by
// ParseArgs from the struct tags below.
type Config struct {
	// Cloud selects the Azure cloud; Run defaults it to "azurecloud" when empty.
	Cloud       string `long:"cloud" description:"Azure cloud name. Support values are: azurecloud, azurechinacloud, azureusgovernment"`
	// UseAzureCLI is forwarded to acquireCertificate when a new certificate is needed.
	UseAzureCLI bool   `long:"use-azure-cli" description:"Use Azure CLI credentials"`
}
 35 | 
// New returns a new AadSsh tool instance.
func New() *AadSsh {
	return &AadSsh{}
}
 39 | 
// Name returns the display name of the tool.
func (c *AadSsh) Name() string {
	return "AAD SSH"
}
 43 | 
 44 | func (c *AadSsh) ParseArgs(ctx *base.ToolContext, args []string) error {
 45 | 	var config Config
 46 | 	remainingArgs, err := flags.ParseArgs(&config, args)
 47 | 	if err != nil {
 48 | 		return err
 49 | 	}
 50 | 	ctx.Args = remainingArgs
 51 | 	ctx.Config = &config
 52 | 	return nil
 53 | }
 54 | 
 55 | func (c *AadSsh) Run(ctx *base.ToolContext) error {
 56 | 	config := ctx.Config.(*Config)
 57 | 
 58 | 	if config.Cloud == "" {
 59 | 		// Default to public cloud
 60 | 		config.Cloud = "azurecloud"
 61 | 	}
 62 | 
 63 | 	// Ensure key dir
 64 | 	sshDir, err := ensureSSHKeyDir(SSHDirName)
 65 | 	if err != nil {
 66 | 		return fmt.Errorf("Fail to ensure SSH directory: %+v", err)
 67 | 	}
 68 | 
 69 | 	// Load SSH private key
 70 | 	sshPrivKeyPath := path.Join(sshDir, SSHPrivateKeyName)
 71 | 	sshPrivKey, err := createOrLoadSSHPrivateKey(sshPrivKeyPath)
 72 | 	if err != nil {
 73 | 		return fmt.Errorf("Fail to create or load SSH private key: %+v", err)
 74 | 	}
 75 | 	log.WithFields(log.Fields{"path": sshPrivKeyPath}).Info("Loaded SSH private key")
 76 | 
 77 | 	// Save SSH public key
 78 | 	sshPubKey, err := ssh.NewPublicKey(&sshPrivKey.PublicKey)
 79 | 	if err != nil {
 80 | 		return fmt.Errorf("Fail to create SSH public key: %+v", err)
 81 | 	}
 82 | 	sshPubKeyPath := path.Join(sshDir, SSHPublicKeyName)
 83 | 	if err = saveSSHPublicKey(sshPubKey, sshPubKeyPath); err != nil {
 84 | 		return fmt.Errorf("Fail to save SSH public key: %+v", err)
 85 | 	}
 86 | 	log.WithFields(log.Fields{"path": sshPubKeyPath}).Info("Saved SSH public key")
 87 | 
 88 | 	// Try existing certificate
 89 | 	sshCertPath := path.Join(sshDir, SSHCertificateName)
 90 | 	sshCert, err := loadSSHCertificate(sshCertPath)
 91 | 	if err != nil {
 92 | 		log.WithFields(log.Fields{"error": err}).Debug("Fail to load existing SSH certificate")
 93 | 		log.Info("Acquire a new SSH certificate from AAD")
 94 | 
 95 | 		// Acquire a certificate from AAD
 96 | 		sshCert, err = acquireCertificate(config.Cloud, config.UseAzureCLI, sshPubKey)
 97 | 		if err != nil {
 98 | 			return fmt.Errorf("Fail to acquire SSH certificate from AAD: %+v", err)
 99 | 		}
100 | 
101 | 		// Save SSH certificate to file
102 | 		sshCertContent := ssh.CertAlgoRSAv01 + " " + base64.StdEncoding.EncodeToString(sshCert.Marshal())
103 | 		if err = saveSSHCertificate(sshCertContent, sshCertPath); err != nil {
104 | 			return fmt.Errorf("Fail to save SSH certificate: %+v", err)
105 | 		}
106 | 		log.WithFields(log.Fields{"path": sshCertPath}).Info("Saved SSH certificate")
107 | 	} else {
108 | 		log.WithFields(log.Fields{"path": sshCertPath}).Info("Loaded valid SSH certificate")
109 | 	}
110 | 
111 | 	// Add SSH key to SSH agent
112 | 	sshAuthSock := os.Getenv("SSH_AUTH_SOCK")
113 | 	if sshAuthSock != "" {
114 | 		if err = addSSHKeyToAgent(sshAuthSock, sshPrivKey, sshCert); err != nil {
115 | 			return fmt.Errorf("Fail to add SSH key to agent: %+v", err)
116 | 		}
117 | 		log.WithFields(log.Fields{"path": sshPrivKeyPath}).Info("Added SSH key to agent")
118 | 	}
119 | 
120 | 	// Call SSH there are remaining args
121 | 	if len(ctx.Args) > 0 {
122 | 		args := getSSHArgs(ctx.Args, sshPrivKeyPath, sshAuthSock != "")
123 | 		log.WithFields(log.Fields{"args": args}).Info("Starting SSH")
124 | 		cmd := exec.Command("ssh", args...)
125 | 		cmd.Stdin = os.Stdin
126 | 		cmd.Stdout = os.Stdout
127 | 		cmd.Stderr = os.Stderr
128 | 		if err = cmd.Start(); err != nil {
129 | 			return fmt.Errorf("Fail to start SSH: %+v", err)
130 | 		}
131 | 		cmd.Wait()
132 | 	}
133 | 
134 | 	return nil
135 | }
136 | 


--------------------------------------------------------------------------------
/pkg/checkers/kmscachesize/kms_cache_size.go:
--------------------------------------------------------------------------------
  1 | package kmscachesize
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"errors"
  6 | 	"fmt"
  7 | 	"os"
  8 | 	"strings"
  9 | 
 10 | 	"github.com/shirou/gopsutil/v3/process"
 11 | 	log "github.com/sirupsen/logrus"
 12 | 	"gopkg.in/yaml.v3"
 13 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 14 | 
 15 | 	"github.com/Azure/kdebug/pkg/base"
 16 | )
 17 | 
// helpLink is attached to the check result when the KMS cache size is
// reported as insufficient.
var helpLink = []string{
	"https://kubernetes.io/docs/tasks/administer-cluster/kms-provider/#configuring-the-kms-provider-kms-v2",
}
 21 | 
// cacheSizeAlertThreshold is the fraction of the configured cache size above
// which the number of secrets is reported as a problem.
const cacheSizeAlertThreshold = 0.8
// kmsConfigCmd is the kube-apiserver argument prefix that carries the path of
// the encryption provider config file.
const kmsConfigCmd = "--encryption-provider-config="
 24 | 
// encConfig is the part of the encryption provider config file that this
// checker decodes from YAML.
type encConfig struct {
	Resources []encResource `yaml:"resources"`
}
 28 | 
// encResource is one entry of the "resources" list; only its providers are decoded.
type encResource struct {
	Providers []encProvider `yaml:"providers"`
}
 32 | 
// encProvider is one entry of a "providers" list; only its "kms" section is decoded.
type encProvider struct {
	Kms encKms `yaml:"kms"`
}
 36 | 
// encKms holds the "cachesize" value of a "kms" provider section. It is zero
// when the key is absent from the file.
type encKms struct {
	CacheSize int `yaml:"cachesize"`
}
 40 | 
// KMSCacheSizeChecker compares the number of secrets in the cluster with the
// KMS cache size configured for the local kube-apiserver process.
type KMSCacheSizeChecker struct {
}
 43 | 
// Name returns the checker name used in logs and check results.
func (c *KMSCacheSizeChecker) Name() string {
	return "KMSCacheSize"
}
 47 | 
// New returns a new KMSCacheSizeChecker.
func New() *KMSCacheSizeChecker {
	return &KMSCacheSizeChecker{}
}
 51 | 
 52 | func (c *KMSCacheSizeChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
 53 | 	if !ctx.Environment.HasFlag("linux") {
 54 | 		log.Debugf("Skip %s checker in non-linux os", c.Name())
 55 | 		return []*base.CheckResult{}, nil
 56 | 	}
 57 | 
 58 | 	if ctx.KubeClient == nil {
 59 | 		log.Debugf("Skip %s checker due to no kube config provided", c.Name())
 60 | 		return []*base.CheckResult{}, nil
 61 | 	}
 62 | 
 63 | 	kmsConfigPath, err := getKmsConfigPath()
 64 | 	if err != nil {
 65 | 		log.Debugf("Cannot find KMS config file: %s", err)
 66 | 		return []*base.CheckResult{}, nil
 67 | 	}
 68 | 
 69 | 	cacheSize, err := getKmsCacheSize(kmsConfigPath)
 70 | 	if err != nil {
 71 | 		return nil, err
 72 | 	}
 73 | 
 74 | 	log.Debugf("KMS cache size: %d", cacheSize)
 75 | 
 76 | 	if cacheSize == 0 {
 77 | 		log.Debugf("There's no limit for KMS cache size")
 78 | 		return []*base.CheckResult{}, nil
 79 | 	}
 80 | 
 81 | 	secretsCount, err := c.getCurrentSecretsCount(ctx)
 82 | 	if err != nil {
 83 | 		return nil, err
 84 | 	}
 85 | 
 86 | 	log.Debugf("Secrets count: %d", secretsCount)
 87 | 
 88 | 	result := &base.CheckResult{
 89 | 		Checker:     c.Name(),
 90 | 		Description: fmt.Sprintf("Current secrets:%d, cache size:%d.", secretsCount, cacheSize),
 91 | 	}
 92 | 
 93 | 	if float32(secretsCount) > (float32(cacheSize) * cacheSizeAlertThreshold) {
 94 | 		result.Error = fmt.Sprintf("KMS cache size is insufficient.")
 95 | 		result.Description += fmt.Sprintf(" When number of secrets exceeds KMS cache size, Kubernetes may suffer frmo significant performance issue.")
 96 | 		result.HelpLinks = helpLink
 97 | 	}
 98 | 
 99 | 	return []*base.CheckResult{result}, nil
100 | }
101 | 
102 | func getKmsConfigPath() (string, error) {
103 | 	procs, err := process.Processes()
104 | 	if err != nil {
105 | 		return "", err
106 | 	}
107 | 
108 | 	for _, proc := range procs {
109 | 		procName, err := proc.Name()
110 | 		if err != nil {
111 | 			log.Errorf("Fail get proc name for pid: %d", proc.Pid)
112 | 			continue
113 | 		}
114 | 
115 | 		if strings.ToLower(procName) == "kube-apiserver" {
116 | 			cmds, err := proc.CmdlineSlice()
117 | 			if err != nil {
118 | 				log.Errorf("Fail get proc cmdline for: %s", procName)
119 | 				continue
120 | 			}
121 | 
122 | 			for _, cmd := range cmds {
123 | 				if strings.HasPrefix(cmd, kmsConfigCmd) {
124 | 					return cmd[len(kmsConfigCmd):], nil
125 | 				}
126 | 			}
127 | 
128 | 			return "", errors.New("API server doesn't have KMS configured")
129 | 		}
130 | 	}
131 | 
132 | 	return "", errors.New("Fail to find api server process")
133 | }
134 | 
135 | func getKmsCacheSize(path string) (int, error) {
136 | 	f, err := os.Open(path)
137 | 	if err != nil {
138 | 		return 0, err
139 | 	}
140 | 	defer f.Close()
141 | 
142 | 	decoder := yaml.NewDecoder(f)
143 | 	var config encConfig
144 | 	err = decoder.Decode(&config)
145 | 	if err != nil {
146 | 		return 0, err
147 | 	}
148 | 
149 | 	if len(config.Resources) > 0 && len(config.Resources[0].Providers) > 0 {
150 | 		return config.Resources[0].Providers[0].Kms.CacheSize, nil
151 | 	} else {
152 | 		return 0, fmt.Errorf("Fail to parse cache size from kms config: %s", path)
153 | 	}
154 | }
155 | 
156 | func (c *KMSCacheSizeChecker) getCurrentSecretsCount(ctx *base.CheckContext) (int, error) {
157 | 	client := ctx.KubeClient
158 | 	secrets, err := client.CoreV1().Secrets("").List(context.TODO(), metav1.ListOptions{})
159 | 	if err != nil {
160 | 		return 0, fmt.Errorf("Fail to list secrets from Kubernetes: %s", err)
161 | 	}
162 | 	return len(secrets.Items), nil
163 | }
164 | 


--------------------------------------------------------------------------------
/pkg/tools/aadssh/token.go:
--------------------------------------------------------------------------------
  1 | package aadssh
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"crypto/sha256"
  6 | 	"encoding/base64"
  7 | 	"encoding/hex"
  8 | 	"encoding/json"
  9 | 	"fmt"
 10 | 	"net/http"
 11 | 	"sort"
 12 | 	"strings"
 13 | 	"time"
 14 | 
 15 | 	msal "github.com/AzureAD/microsoft-authentication-library-for-go/apps/public"
 16 | 	log "github.com/sirupsen/logrus"
 17 | 	"golang.org/x/crypto/ssh"
 18 | )
 19 | 
// cloudToScope maps a lower-case cloud name to the token scope requested from
// AAD. Its keys also define the list of supported clouds.
var cloudToScope = map[string]string{
	"azurecloud":        "https://pas.windows.net/CheckMyAccess/Linux/.default",
	"azurechinacloud":   "https://pas.chinacloudapi.cn/CheckMyAccess/Linux/.default",
	"azureusgovernment": "https://pasff.usgovcloudapi.net/CheckMyAccess/Linux/.default",
}
 25 | 
// cloudToAuthority maps a lower-case cloud name to the authority URL handed to
// the MSAL client.
var cloudToAuthority = map[string]string{
	"azurecloud":        "https://login.microsoftonline.com/common",
	"azurechinacloud":   "https://login.chinacloudapi.cn/common",
	"azureusgovernment": "https://login.microsoftonline.us/common",
}
 31 | 
 32 | // prepareRequestData prepares AAD token request data
 33 | func prepareRequestData(sshPubKey ssh.PublicKey) (map[string]string, error) {
 34 | 	exponentString, modulusString, err := parseSSHPublicKey(sshPubKey)
 35 | 	if err != nil {
 36 | 		return nil, fmt.Errorf("Fail to parse SSH public key due to: %+v", err)
 37 | 	}
 38 | 
 39 | 	hash := sha256.New()
 40 | 	hash.Write([]byte(modulusString))
 41 | 	hash.Write([]byte(exponentString))
 42 | 	keyId := hex.EncodeToString(hash.Sum(nil))
 43 | 	jwk := map[string]string{
 44 | 		"kty": "RSA",
 45 | 		"n":   modulusString,
 46 | 		"e":   exponentString,
 47 | 		"kid": keyId,
 48 | 	}
 49 | 	jwkJson, err := json.Marshal(jwk)
 50 | 	if err != nil {
 51 | 		return nil, fmt.Errorf("Fail to parse encode JWK payload due to: %+v", err)
 52 | 	}
 53 | 
 54 | 	data := map[string]string{
 55 | 		"token_type": "ssh-cert",
 56 | 		"req_cnf":    string(jwkJson),
 57 | 		"key_id":     keyId,
 58 | 	}
 59 | 
 60 | 	return data, nil
 61 | }
 62 | 
 63 | // getSupportedClouds returns supported cloud names
 64 | func getSupportedClouds() []string {
 65 | 	cloudNames := []string{}
 66 | 	for n := range cloudToScope {
 67 | 		cloudNames = append(cloudNames, n)
 68 | 	}
 69 | 	sort.Strings(cloudNames)
 70 | 	return cloudNames
 71 | }
 72 | 
 73 | // acquireCertificate acquires SSH certificate from AAD
 74 | func acquireCertificate(cloud string, useAzureCLI bool, sshPubKey ssh.PublicKey) (*ssh.Certificate, error) {
 75 | 	// Prepare token request data
 76 | 	data, err := prepareRequestData(sshPubKey)
 77 | 	if err != nil {
 78 | 		return nil, fmt.Errorf("Fail to prepare request data: %+v", err)
 79 | 	}
 80 | 	log.WithFields(log.Fields{
 81 | 		"data": data,
 82 | 	}).Debug("Token request data")
 83 | 
 84 | 	// Request token
 85 | 	authority := cloudToAuthority[cloud]
 86 | 	if authority == "" {
 87 | 		return nil, fmt.Errorf("Unsupported cloud: %s. Supported clouds include %+v", cloud, getSupportedClouds())
 88 | 	}
 89 | 	httpClient := &http.Client{
 90 | 		Timeout:   time.Minute,
 91 | 		Transport: &Transport{data: data},
 92 | 	}
 93 | 	client, err := msal.New(AzureCLIClientId,
 94 | 		msal.WithAuthority(authority),
 95 | 		msal.WithHTTPClient(httpClient))
 96 | 	if err != nil {
 97 | 		return nil, fmt.Errorf("Fail to create MSAL client: %+v", err)
 98 | 	}
 99 | 
100 | 	scope := cloudToScope[strings.ToLower(cloud)]
101 | 	if scope == "" {
102 | 		return nil, fmt.Errorf("Unsupported cloud: %s. Supported clouds include %+v", cloud, getSupportedClouds())
103 | 	}
104 | 
105 | 	scopes := []string{scope}
106 | 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
107 | 	defer cancel()
108 | 	var authResult msal.AuthResult
109 | 	if useAzureCLI {
110 | 		authResult, err = acquireTokenByAzureCLI(ctx, scopes, data)
111 | 	} else {
112 | 		authResult, err = client.AcquireTokenInteractive(ctx, scopes)
113 | 	}
114 | 	if err != nil {
115 | 		return nil, fmt.Errorf("Fail to create acquire AAD token: %+v", err)
116 | 	}
117 | 
118 | 	log.WithFields(log.Fields{"authResult": fmt.Sprintf("%+v", authResult)}).Debug("Got AAD auth result")
119 | 
120 | 	sshCertData, err := base64.StdEncoding.DecodeString(authResult.AccessToken)
121 | 	if err != nil {
122 | 		return nil, fmt.Errorf("Fail to base64 decode SSH certificate: %+v", err)
123 | 	}
124 | 	sshPub, err := ssh.ParsePublicKey(sshCertData)
125 | 	if err != nil {
126 | 		return nil, fmt.Errorf("Fail to parse SSH certificate: %+v", err)
127 | 	}
128 | 	sshCert, ok := sshPub.(*ssh.Certificate)
129 | 	if !ok {
130 | 		return nil, fmt.Errorf("Not a SSH certificate")
131 | 	}
132 | 
133 | 	validBefore := time.Unix(int64(sshCert.ValidBefore), 0)
134 | 	log.WithFields(log.Fields{"validBefore": validBefore}).Info("Got SSH certificate. Re-run this command to obtain a new one after it expires.")
135 | 
136 | 	return sshCert, nil
137 | }
138 | 


--------------------------------------------------------------------------------
/pkg/checkers/dns/dns_test.go:
--------------------------------------------------------------------------------
  1 | package dns
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"reflect"
  6 | 	"testing"
  7 | 	"time"
  8 | 
  9 | 	"github.com/Azure/kdebug/pkg/base"
 10 | 	"github.com/Azure/kdebug/pkg/env"
 11 | 	"github.com/miekg/dns"
 12 | )
 13 | 
// FakeDnsClient is a test double for the DNS client used by DnsChecker. It
// returns the canned reply r and error e, and records the last request
// message in m and the last server address in a.
type FakeDnsClient struct {
	r *dns.Msg
	e error
	m *dns.Msg
	a string
}
 20 | 
// Exchange records the request and address, then returns the canned reply and
// error with a zero round-trip time.
func (c *FakeDnsClient) Exchange(m *dns.Msg, a string) (r *dns.Msg, rtt time.Duration, err error) {
	c.m = m
	c.a = a
	return c.r, time.Duration(0), c.e
}
 26 | 
 27 | func TestCheckServer(t *testing.T) {
 28 | 	client := &FakeDnsClient{
 29 | 		r: &dns.Msg{
 30 | 			MsgHdr: dns.MsgHdr{
 31 | 				Rcode: dns.RcodeSuccess,
 32 | 			},
 33 | 		},
 34 | 	}
 35 | 	checker := &DnsChecker{
 36 | 		client: client,
 37 | 	}
 38 | 	r, err := checker.checkServer(GoogleDnsServer, "www.bing.com")
 39 | 	if err != nil {
 40 | 		t.Errorf("expect no error but got: %+v", err)
 41 | 	}
 42 | 	if !r.Ok() {
 43 | 		t.Errorf("expect ok but not")
 44 | 	}
 45 | 	if client.a != GoogleDnsServer.Server+":53" {
 46 | 		t.Errorf("dns request server is wrong: %s", client.a)
 47 | 	}
 48 | 	if client.m.Question[0].String() != ";www.bing.com.\tIN\t A" {
 49 | 		t.Errorf("wrong dns question: %s", client.m.Question[0].String())
 50 | 	}
 51 | }
 52 | 
 53 | func TestCheckServerBadRcode(t *testing.T) {
 54 | 	client := &FakeDnsClient{
 55 | 		r: &dns.Msg{
 56 | 			MsgHdr: dns.MsgHdr{
 57 | 				Rcode: dns.RcodeServerFailure,
 58 | 			},
 59 | 		},
 60 | 	}
 61 | 	checker := &DnsChecker{
 62 | 		client: client,
 63 | 	}
 64 | 	r, err := checker.checkServer(GoogleDnsServer, "www.bing.com")
 65 | 	if err != nil {
 66 | 		t.Errorf("expect no error but got: %+v", err)
 67 | 	}
 68 | 	if r.Ok() {
 69 | 		t.Errorf("expect not ok")
 70 | 	}
 71 | 	if r.Error == "" || r.Description == "" ||
 72 | 		!reflect.DeepEqual(r.Recommendations, GoogleDnsServer.Recommendations) ||
 73 | 		!reflect.DeepEqual(r.HelpLinks, GoogleDnsServer.HelpLinks) {
 74 | 		t.Errorf("unexpected result")
 75 | 	}
 76 | 	if client.a != GoogleDnsServer.Server+":53" {
 77 | 		t.Errorf("dns request server is wrong: %s", client.a)
 78 | 	}
 79 | 	if client.m.Question[0].String() != ";www.bing.com.\tIN\t A" {
 80 | 		t.Errorf("wrong dns question: %s", client.m.Question[0].String())
 81 | 	}
 82 | }
 83 | 
 84 | func TestCheckServerError(t *testing.T) {
 85 | 	client := &FakeDnsClient{
 86 | 		e: errors.New("err"),
 87 | 	}
 88 | 	checker := &DnsChecker{
 89 | 		client: client,
 90 | 	}
 91 | 	r, err := checker.checkServer(GoogleDnsServer, "www.bing.com")
 92 | 	if err != nil {
 93 | 		t.Errorf("expect no error but got: %+v", err)
 94 | 	}
 95 | 	if r.Ok() {
 96 | 		t.Errorf("expect not ok")
 97 | 	}
 98 | 	if r.Error == "" || r.Description != "err" ||
 99 | 		!reflect.DeepEqual(r.Recommendations, GoogleDnsServer.Recommendations) ||
100 | 		!reflect.DeepEqual(r.HelpLinks, GoogleDnsServer.HelpLinks) {
101 | 		t.Errorf("unexpected result")
102 | 	}
103 | 	if client.a != GoogleDnsServer.Server+":53" {
104 | 		t.Errorf("dns request server is wrong: %s", client.a)
105 | 	}
106 | 	if client.m.Question[0].String() != ";www.bing.com.\tIN\t A" {
107 | 		t.Errorf("wrong dns question: %s", client.m.Question[0].String())
108 | 	}
109 | }
110 | 
111 | func TestGetCheckTargets(t *testing.T) {
112 | 	{
113 | 		e := &env.StaticEnvironment{
114 | 			Flags: []string{"ubuntu"},
115 | 		}
116 | 		servers := getCheckTargets(e)
117 | 		if !reflect.DeepEqual(servers, []DnsServer{GoogleDnsServer, SystemdResolvedDnsServer}) {
118 | 			t.Errorf("unexpected check targets on 'ubuntu'")
119 | 		}
120 | 	}
121 | 
122 | 	{
123 | 		e := &env.StaticEnvironment{
124 | 			Flags: []string{"azure"},
125 | 		}
126 | 		servers := getCheckTargets(e)
127 | 		if !reflect.DeepEqual(servers,
128 | 			[]DnsServer{GoogleDnsServer, AzureDnsServer, AksCoreDnsServerPublic, AksCoreDnsServerInCluster}) {
129 | 			t.Errorf("unexpected check targets on 'azure'")
130 | 		}
131 | 	}
132 | 
133 | 	{
134 | 		e := &env.StaticEnvironment{
135 | 			Flags: []string{""},
136 | 		}
137 | 		servers := getCheckTargets(e)
138 | 		if !reflect.DeepEqual(servers, []DnsServer{GoogleDnsServer}) {
139 | 			t.Errorf("unexpected check targets on ''")
140 | 		}
141 | 	}
142 | }
143 | 
144 | func TestCheck(t *testing.T) {
145 | 	client := &FakeDnsClient{
146 | 		r: &dns.Msg{
147 | 			MsgHdr: dns.MsgHdr{
148 | 				Rcode: dns.RcodeSuccess,
149 | 			},
150 | 		},
151 | 	}
152 | 	checker := &DnsChecker{
153 | 		client: client,
154 | 	}
155 | 
156 | 	ctx := &base.CheckContext{
157 | 		Environment: &env.StaticEnvironment{
158 | 			Flags: []string{"ubuntu"},
159 | 		},
160 | 	}
161 | 	r, err := checker.Check(ctx)
162 | 	if err != nil {
163 | 		t.Errorf("expect no error but got: %+v", err)
164 | 	}
165 | 	if len(r) != 4 {
166 | 		t.Errorf("expect 4 results but got %d", len(r))
167 | 	}
168 | }
169 | 


--------------------------------------------------------------------------------
/deploy/node-problem-detector/node-problem-detector.yaml:
--------------------------------------------------------------------------------
# DaemonSet that runs node-problem-detector with kdebug as a custom plugin.
# An init container downloads the kdebug binary into a shared emptyDir volume,
# which the main container mounts read-only at /opt/kdebug.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-problem-detector
  namespace: kube-system
  labels:
    app: node-problem-detector
spec:
  selector:
    matchLabels:
      app: node-problem-detector
  template:
    metadata:
      labels:
        app: node-problem-detector
    spec:
      serviceAccountName: node-problem-detector
      # Only schedule onto Linux nodes.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/os
                    operator: In
                    values:
                      - linux
      initContainers:
      # Fetches the kdebug release binary and marks it executable.
      - name: download-kdebug
        image: k8s.gcr.io/busybox:1.27
        command:
        - 'sh'
        - '-c'
        - 'wget -O /opt/kdebug/kdebug https://github.com/Azure/kdebug/releases/download/v0.4-beta-1/kdebug && chmod +x /opt/kdebug/kdebug'
        volumeMounts:
        - name: kdebug
          mountPath: /opt/kdebug
      containers:
      - name: node-problem-detector
        command:
        - /node-problem-detector
        - --logtostderr
        # The file name must match a key of the ConfigMap mounted at /config.
        - --config.custom-plugin-monitor=/config/kdebug-dns.json
        - --apiserver-override=kubernetes
        image: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.7
        resources:
          limits:
            cpu: 10m
            memory: 80Mi
          requests:
            cpu: 10m
            memory: 80Mi
        imagePullPolicy: Always
        securityContext:
          privileged: true
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        volumeMounts:
        # Make sure node problem detector is in the same timezone
        # with the host.
        - name: localtime
          mountPath: /etc/localtime
          readOnly: true
        - name: config
          mountPath: /config
          readOnly: true
        - name: kdebug
          mountPath: /opt/kdebug
          readOnly: true
      volumes:
      - name: localtime
        hostPath:
          path: /etc/localtime
      - name: config
        configMap:
          name: node-problem-detector-config
      # Scratch volume shared between the init container and the main container.
      - name: kdebug
        emptyDir: {}
      # Tolerate every NoSchedule and NoExecute taint.
      tolerations:
        - effect: NoSchedule
          operator: Exists
        - effect: NoExecute
          operator: Exists
 86 | ---
# Custom plugin monitor configuration for node-problem-detector.
# The JSON below declares the DNSProblem condition and two rules (temporary and
# permanent) that both run "/opt/kdebug/kdebug -c dns -f oneline".
# The JSON is a literal block scalar, so comments cannot be placed inside it.
apiVersion: v1
kind: ConfigMap
metadata:
  name: node-problem-detector-config
  namespace: kube-system
data:
  kdebug-dns.json: |
    {
        "plugin": "custom",
        "pluginConfig": {
          "invoke_interval": "30s",
          "timeout": "30s",
          "max_output_length": 80,
          "concurrency": 3,
          "enable_message_change_based_condition_update": false
        },
        "source": "kdebug-dns",
        "metricsReporting": true,
        "conditions": [
            {
                "type": "DNSProblem",
                "reason": "DNSChecksPass",
                "message": "No DNS problem found"
            }
        ],
        "rules": [
            {
                "type": "temporary",
                "reason": "DNSHasProblem",
                "path": "/opt/kdebug/kdebug",
                "args": [
                  "-c",
                  "dns",
                  "-f",
                  "oneline"
                ]
            },
            {
                "type": "permanent",
                "condition": "DNSProblem",
                "reason": "DNSHasProblem",
                "path": "/opt/kdebug/kdebug",
                "args": [
                  "-c",
                  "dns",
                  "-f",
                  "oneline"
                ]
            }
        ]
    }
138 | ---
# Service account used by the node-problem-detector pods.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-problem-detector
  labels:
    app: node-problem-detector
  namespace: kube-system
146 | ---
# Binds the node-problem-detector service account to the
# system:node-problem-detector ClusterRole. That ClusterRole is not defined in
# this file and must already exist in the cluster.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: node-problem-detector
  labels:
    app: node-problem-detector
subjects:
- kind: ServiceAccount
  name: node-problem-detector
  namespace: kube-system
roleRef:
  kind: ClusterRole
  name: system:node-problem-detector
  apiGroup: rbac.authorization.k8s.io
161 | 


--------------------------------------------------------------------------------
/deploy/node-problem-detector/node-problem-detector-template.yaml:
--------------------------------------------------------------------------------
# Template DaemonSet that runs node-problem-detector with kdebug as a custom
# plugin. Replace <YOUR_JSON_NAME> before applying this file.
# An init container downloads the kdebug binary into a shared emptyDir volume,
# which the main container mounts read-only at /opt/kdebug.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-problem-detector
  namespace: kube-system
  labels:
    app: node-problem-detector
spec:
  selector:
    matchLabels:
      app: node-problem-detector
  template:
    metadata:
      labels:
        app: node-problem-detector
    spec:
      serviceAccountName: node-problem-detector
      # Only schedule onto Linux nodes.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/os
                    operator: In
                    values:
                      - linux
      initContainers:
      # Fetches the kdebug release binary and marks it executable.
      - name: download-kdebug
        image: k8s.gcr.io/busybox:1.27
        command:
        - 'sh'
        - '-c'
        - 'wget -O /opt/kdebug/kdebug https://github.com/Azure/kdebug/releases/download/v0.4-beta-1/kdebug && chmod +x /opt/kdebug/kdebug'
        volumeMounts:
        - name: kdebug
          mountPath: /opt/kdebug
      containers:
      - name: node-problem-detector
        command:
        - /node-problem-detector
        - --logtostderr
        # Placeholder: must match the JSON key name used in the ConfigMap.
        - --config.custom-plugin-monitor=/config/<YOUR_JSON_NAME>.json
        - --apiserver-override=kubernetes
        image: k8s.gcr.io/node-problem-detector/node-problem-detector:v0.8.7
        resources:
          limits:
            cpu: 10m
            memory: 80Mi
          requests:
            cpu: 10m
            memory: 80Mi
        imagePullPolicy: Always
        securityContext:
          privileged: true
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        volumeMounts:
        # Make sure node problem detector is in the same timezone
        # with the host.
        - name: localtime
          mountPath: /etc/localtime
          readOnly: true
        - name: config
          mountPath: /config
          readOnly: true
        - name: kdebug
          mountPath: /opt/kdebug
          readOnly: true
      volumes:
      - name: localtime
        hostPath:
          path: /etc/localtime
      - name: config
        configMap:
          name: node-problem-detector-config
      # Scratch volume shared between the init container and the main container.
      - name: kdebug
        emptyDir: {}
      # Tolerate every NoSchedule and NoExecute taint.
      tolerations:
        - effect: NoSchedule
          operator: Exists
        - effect: NoExecute
          operator: Exists
 86 | ---
# Custom plugin monitor configuration consumed by node-problem-detector.
# Placeholders to replace before applying:
#   <YOUR_JSON_NAME>   - name of the config file; must match the
#                        --config.custom-plugin-monitor argument of the DaemonSet
#   <CONDITION_NAME>   - node condition type reported by the plugin
#   <CHECK_MODE_NAME>  - prefix used in the reason and message strings
#   <CHECK_MODE>       - value passed to kdebug via -c
# Both rules run "/opt/kdebug/kdebug -c <CHECK_MODE> -f oneline".
apiVersion: v1
kind: ConfigMap
metadata:
  name: node-problem-detector-config
  namespace: kube-system
data:
  <YOUR_JSON_NAME>.json: |
    {
        "plugin": "custom",
        "pluginConfig": {
          "invoke_interval": "30s",
          "timeout": "30s",
          "max_output_length": 80,
          "concurrency": 3,
          "enable_message_change_based_condition_update": false
        },
        "source": "<YOUR_JSON_NAME>",
        "metricsReporting": true,
        "conditions": [
            {
                "type": "<CONDITION_NAME>",
                "reason": "<CHECK_MODE_NAME>ChecksPass",
                "message": "No <CHECK_MODE_NAME> problem found"
            }
        ],
        "rules": [
            {
                "type": "temporary",
                "reason": "<CHECK_MODE_NAME>HasProblem",
                "path": "/opt/kdebug/kdebug",
                "args": [
                  "-c",
                  "<CHECK_MODE>",
                  "-f",
                  "oneline"
                ]
            },
            {
                "type": "permanent",
                "condition": "<CONDITION_NAME>",
                "reason": "<CHECK_MODE_NAME>HasProblem",
                "path": "/opt/kdebug/kdebug",
                "args": [
                  "-c",
                  "<CHECK_MODE>",
                  "-f",
                  "oneline"
                ]
            }
        ]
    }
138 | ---
# Service account named node-problem-detector in the kube-system namespace.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-problem-detector
  labels:
    app: node-problem-detector
  namespace: kube-system
146 | ---
# Grants the kube-system/node-problem-detector service account the
# permissions of the system:node-problem-detector ClusterRole.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: node-problem-detector
  labels:
    app: node-problem-detector
subjects:
- kind: ServiceAccount
  name: node-problem-detector
  namespace: kube-system
roleRef:
  kind: ClusterRole
  name: system:node-problem-detector
  apiGroup: rbac.authorization.k8s.io
161 | 


--------------------------------------------------------------------------------
/pkg/tools/aadssh/ssh.go:
--------------------------------------------------------------------------------
  1 | package aadssh
  2 | 
  3 | import (
  4 | 	"crypto/rand"
  5 | 	"crypto/rsa"
  6 | 	"crypto/x509"
  7 | 	"encoding/base64"
  8 | 	"encoding/binary"
  9 | 	"encoding/pem"
 10 | 	"fmt"
 11 | 	"io/ioutil"
 12 | 	"os"
 13 | 	"path"
 14 | 	"strings"
 15 | 	"time"
 16 | 
 17 | 	"golang.org/x/crypto/ssh"
 18 | )
 19 | 
 20 | // ensureSSHKeyDir creates a directory under user home for storing SSH keys
 21 | // returns directory path
 22 | func ensureSSHKeyDir(dirName string) (string, error) {
 23 | 	homeDir, err := os.UserHomeDir()
 24 | 	if err != nil {
 25 | 		return "", err
 26 | 	}
 27 | 	sshDir := path.Join(homeDir, dirName)
 28 | 	if _, err = os.Stat(sshDir); err != nil {
 29 | 		if os.IsNotExist(err) {
 30 | 			if err = os.Mkdir(sshDir, 0700); err != nil {
 31 | 				return "", err
 32 | 			}
 33 | 		} else {
 34 | 			return "", err
 35 | 		}
 36 | 	}
 37 | 	return sshDir, nil
 38 | }
 39 | 
 40 | // createOrLoadSSHPrivateKey creates or loads a SSH private key from file
 41 | // returns RSA private key
 42 | func createOrLoadSSHPrivateKey(keyPath string) (*rsa.PrivateKey, error) {
 43 | 	if _, err := os.Stat(keyPath); err == nil {
 44 | 		f, err := os.Open(keyPath)
 45 | 		if err != nil {
 46 | 			return nil, err
 47 | 		}
 48 | 		defer f.Close()
 49 | 
 50 | 		content, err := ioutil.ReadAll(f)
 51 | 		if err != nil {
 52 | 			return nil, err
 53 | 		}
 54 | 
 55 | 		block, _ := pem.Decode(content)
 56 | 		if block == nil {
 57 | 			return nil, fmt.Errorf("Empty PEM block")
 58 | 		}
 59 | 
 60 | 		return x509.ParsePKCS1PrivateKey(block.Bytes)
 61 | 	} else {
 62 | 		if os.IsNotExist(err) {
 63 | 			key, err := rsa.GenerateKey(rand.Reader, 4096)
 64 | 			if err != nil {
 65 | 				return nil, err
 66 | 			}
 67 | 
 68 | 			der := x509.MarshalPKCS1PrivateKey(key)
 69 | 			content := pem.EncodeToMemory(&pem.Block{
 70 | 				Type:  "RSA PRIVATE KEY",
 71 | 				Bytes: der,
 72 | 			})
 73 | 			err = os.WriteFile(keyPath, content, 0600)
 74 | 			if err != nil {
 75 | 				return nil, err
 76 | 			}
 77 | 
 78 | 			return key, nil
 79 | 		} else {
 80 | 			return nil, err
 81 | 		}
 82 | 	}
 83 | }
 84 | 
 85 | // parseSSHPublicKey parses exponent and modulus part from SSH public key
 86 | // returns base64 encoded exponent and modulus
 87 | func parseSSHPublicKey(pubKey ssh.PublicKey) (e string, n string, err error) {
 88 | 	keyBytes := pubKey.Marshal()
 89 | 	// <algorithm>,<exponent>,<modulus>
 90 | 	fields := [][]byte{}
 91 | 
 92 | 	read := 0
 93 | 	for read < len(keyBytes) {
 94 | 		length := int(binary.BigEndian.Uint32(keyBytes[read : read+4]))
 95 | 		read += 4
 96 | 		fields = append(fields, keyBytes[read:read+length])
 97 | 		read += length
 98 | 	}
 99 | 
100 | 	return base64.RawURLEncoding.EncodeToString(fields[1]),
101 | 		base64.RawURLEncoding.EncodeToString(fields[2]),
102 | 		nil
103 | }
104 | 
105 | // saveSSHPublicKey saves SSH public key to file
106 | func saveSSHPublicKey(key ssh.PublicKey, path string) error {
107 | 	content := ssh.MarshalAuthorizedKey(key)
108 | 	return os.WriteFile(path, content, 0600)
109 | }
110 | 
111 | // loadSSHCertificate loads SSH certificate from file
112 | func loadSSHCertificate(path string) (*ssh.Certificate, error) {
113 | 	f, err := os.Open(path)
114 | 	if err != nil {
115 | 		return nil, fmt.Errorf("Fail to open SSH certificate file: %+v", err)
116 | 	}
117 | 	defer f.Close()
118 | 
119 | 	content, err := ioutil.ReadAll(f)
120 | 	if err != nil {
121 | 		return nil, fmt.Errorf("Fail to read SSH certificate file: %+v", err)
122 | 	}
123 | 
124 | 	parts := strings.Split(string(content), " ")
125 | 	if len(parts) < 2 {
126 | 		return nil, fmt.Errorf("SSH certificate file is in bad format")
127 | 	}
128 | 
129 | 	data, err := base64.StdEncoding.DecodeString(parts[1])
130 | 	if err != nil {
131 | 		return nil, fmt.Errorf("Fail to decode SSH certificate: %+v", err)
132 | 	}
133 | 
134 | 	pubKey, err := ssh.ParsePublicKey(data)
135 | 	if err != nil {
136 | 		return nil, fmt.Errorf("Fail to parse SSH certificate: %+v", err)
137 | 	}
138 | 
139 | 	sshCert, ok := pubKey.(*ssh.Certificate)
140 | 	if !ok {
141 | 		return nil, fmt.Errorf("Not a SSH certificate")
142 | 	}
143 | 
144 | 	validBefore := time.Unix(int64(sshCert.ValidBefore), 0)
145 | 	validAfter := time.Unix(int64(sshCert.ValidAfter), 0)
146 | 	valid := time.Now().Before(validBefore) && time.Now().After(validAfter)
147 | 	if !valid {
148 | 		return nil, fmt.Errorf("SSH certificate has expired. Valid before: %s. Valid after: %s",
149 | 			validBefore, validAfter)
150 | 	}
151 | 
152 | 	return sshCert, nil
153 | }
154 | 
// saveSSHCertificate writes the certificate text in content to path,
// using file mode 0600.
func saveSSHCertificate(content, path string) error {
	raw := []byte(content)
	return os.WriteFile(path, raw, 0600)
}
159 | 
// getSSHArgs returns the command line arguments to pass to the ssh command.
//
// The result starts with a copy of inputArgs. When useSSHAgent is true, "-A"
// is appended unless inputArgs already contains it. Otherwise "-i" followed by
// sshPrivKeyPath is appended unless inputArgs already contains "-i".
// inputArgs and its backing array are never modified.
func getSSHArgs(inputArgs []string, sshPrivKeyPath string, useSSHAgent bool) []string {
	// Copy first: appending to inputArgs directly could overwrite elements of
	// the caller's backing array when it has spare capacity. At most two
	// arguments are added, so reserve room for them.
	args := make([]string, len(inputArgs), len(inputArgs)+2)
	copy(args, inputArgs)

	contains := func(flag string) bool {
		for _, arg := range inputArgs {
			if arg == flag {
				return true
			}
		}
		return false
	}

	if useSSHAgent {
		if !contains("-A") {
			args = append(args, "-A")
		}
		return args
	}

	if !contains("-i") {
		args = append(args, "-i", sshPrivKeyPath)
	}
	return args
}
178 | 


--------------------------------------------------------------------------------
/pkg/checkers/dns/dns.go:
--------------------------------------------------------------------------------
  1 | package dns
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"time"
  6 | 
  7 | 	"github.com/Azure/kdebug/pkg/base"
  8 | 	"github.com/Azure/kdebug/pkg/env"
  9 | 	"github.com/miekg/dns"
 10 | )
 11 | 
// Recommendation texts attached to failed DNS check results.
// PublicDnsRecommendation is used by the presets that resolve public names;
// CoreDnsRecommendation is used by the in-cluster CoreDNS preset.
const (
	PublicDnsRecommendation = "Check your public network connectivity and outbound security settings."
	CoreDnsRecommendation   = "CoreDNS pods might be down. Check their liveness using `kubectl get pods -n kube-system -o wide -l k8s-app=kube-dns`."
)
 16 | 
// Predefined DNS servers the checker can query, together with the names to
// resolve and the advice to show when a query fails.
var (
	// GoogleDnsServer is always part of the check targets.
	GoogleDnsServer = DnsServer{
		Name:   "Google DNS",
		Server: "8.8.8.8",
		Queries: []string{
			"www.google.com",
			"www.bing.com",
		},
		Recommendations: []string{PublicDnsRecommendation},
		HelpLinks: []string{
			"https://developers.google.com/speed/public-dns",
		},
	}
	// AzureDnsServer is checked when the environment has the "azure" flag.
	AzureDnsServer = DnsServer{
		Name:   "Azure DNS",
		Server: "168.63.129.16",
		Queries: []string{
			"www.google.com",
			"www.bing.com",
		},
		Recommendations: []string{
			PublicDnsRecommendation,
			"VM might be on a bad host. Try to `redeploy` it.",
		},
		HelpLinks: []string{
			"https://docs.microsoft.com/en-us/azure/virtual-network/what-is-ip-address-168-63-129-16",
			"https://docs.microsoft.com/en-us/azure/virtual-network/virtual-networks-name-resolution-for-vms-and-role-instances#azure-provided-name-resolution",
		},
	}
	// AksCoreDnsServerPublic resolves public names through the AKS CoreDNS
	// address. It is checked when the environment has the "azure" flag.
	AksCoreDnsServerPublic = DnsServer{
		Name:   "AKS CoreDNS",
		Server: "10.0.0.10",
		Queries: []string{
			"www.google.com",
			"www.bing.com",
		},
		Recommendations: []string{
			PublicDnsRecommendation,
		},
		HelpLinks: []string{
			"https://kubernetes.io/docs/tasks/administer-cluster/dns-custom-nameservers/",
			"https://kubernetes.io/docs/tasks/administer-cluster/coredns/",
			"https://coredns.io/plugins/kubernetes/",
		},
	}
	// AksCoreDnsServerInCluster resolves a cluster-internal name through the
	// AKS CoreDNS address. It is checked when the environment has the "azure"
	// flag.
	AksCoreDnsServerInCluster = DnsServer{
		Name:   "AKS CoreDNS",
		Server: "10.0.0.10",
		Queries: []string{
			"kubernetes.default.svc.cluster.local",
		},
		Recommendations: []string{
			CoreDnsRecommendation,
		},
		HelpLinks: []string{
			"https://kubernetes.io/docs/tasks/administer-cluster/dns-custom-nameservers/",
			"https://kubernetes.io/docs/tasks/administer-cluster/coredns/",
			"https://coredns.io/plugins/kubernetes/",
		},
	}
	// SystemdResolvedDnsServer is checked when the environment has the
	// "ubuntu" flag.
	SystemdResolvedDnsServer = DnsServer{
		Name:   "systemd-resolved",
		Server: "127.0.0.53",
		Queries: []string{
			"www.google.com",
			"www.bing.com",
		},
		Recommendations: []string{
			"systemd-resolved service might not be running. Check by running `sudo systemctl status systemd-resolved`.",
		},
		HelpLinks: []string{
			"https://www.freedesktop.org/software/systemd/man/systemd-resolved.service.html",
		},
	}
)
 92 | 
// DnsServer describes one DNS server to check.
//
// Name is the display name used in result messages and Server is the address
// queried on port 53. Queries lists the domain names to resolve (without the
// trailing dot). Recommendations and HelpLinks are copied into the result
// when a query against this server fails.
type DnsServer struct {
	Name            string
	Server          string
	Queries         []string
	Recommendations []string
	HelpLinks       []string
}
100 | 
// DnsClient is the part of a DNS client the checker depends on. New supplies
// a *dns.Client; the interface allows another implementation to be
// substituted for the client field.
type DnsClient interface {
	Exchange(m *dns.Msg, a string) (r *dns.Msg, rtt time.Duration, err error)
}
104 | 
105 | type DnsChecker struct {
106 | 	client DnsClient
107 | }
108 | 
109 | func New() *DnsChecker {
110 | 	return &DnsChecker{
111 | 		client: &dns.Client{
112 | 			Timeout: time.Second,
113 | 		},
114 | 	}
115 | }
116 | 
117 | func (c *DnsChecker) Name() string {
118 | 	return "Dns"
119 | }
120 | 
121 | func (c *DnsChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
122 | 	result := []*base.CheckResult{}
123 | 	targets := getCheckTargets(ctx.Environment)
124 | 	for _, server := range targets {
125 | 		for _, query := range server.Queries {
126 | 			r, err := c.checkServer(server, query)
127 | 			if err != nil {
128 | 				return result, err
129 | 			}
130 | 			result = append(result, r)
131 | 		}
132 | 	}
133 | 	return result, nil
134 | }
135 | 
136 | func getCheckTargets(e env.Environment) []DnsServer {
137 | 	targets := []DnsServer{
138 | 		GoogleDnsServer,
139 | 	}
140 | 
141 | 	if e.HasFlag("ubuntu") {
142 | 		targets = append(targets, SystemdResolvedDnsServer)
143 | 	}
144 | 
145 | 	if e.HasFlag("azure") {
146 | 		targets = append(targets,
147 | 			AzureDnsServer,
148 | 			AksCoreDnsServerPublic,
149 | 			AksCoreDnsServerInCluster)
150 | 	}
151 | 
152 | 	return targets
153 | }
154 | 
155 | func (c *DnsChecker) checkServer(server DnsServer, query string) (*base.CheckResult, error) {
156 | 	m := new(dns.Msg)
157 | 	m.SetQuestion(query+".", dns.TypeA)
158 | 	m.RecursionDesired = true
159 | 	r, _, err := c.client.Exchange(m, server.Server+":53")
160 | 	if err != nil {
161 | 		return &base.CheckResult{
162 | 			Checker: c.Name(),
163 | 			Error: fmt.Sprintf("Fail to query domain name %s from server %s(%s)",
164 | 				query, server.Name, server.Server),
165 | 			Description:     err.Error(),
166 | 			Recommendations: server.Recommendations,
167 | 			HelpLinks:       server.HelpLinks,
168 | 		}, nil
169 | 	}
170 | 	if r.Rcode != dns.RcodeSuccess {
171 | 		return &base.CheckResult{
172 | 			Checker: c.Name(),
173 | 			Error: fmt.Sprintf("Fail to query domain name %s from server %s(%s)", query,
174 | 				server.Name, server.Server),
175 | 			Description:     fmt.Sprintf("Unexpected rcode: %d", r.Rcode),
176 | 			Recommendations: server.Recommendations,
177 | 			HelpLinks:       server.HelpLinks,
178 | 		}, nil
179 | 	}
180 | 	return &base.CheckResult{
181 | 		Checker: c.Name(),
182 | 		Description: fmt.Sprintf("Successfully query domain name %s from server %s(%s)",
183 | 			query, server.Name, server.Server),
184 | 	}, nil
185 | }
186 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
  1 | module github.com/Azure/kdebug
  2 | 
  3 | go 1.17
  4 | 
  5 | require (
  6 | 	github.com/AzureAD/microsoft-authentication-library-for-go v0.6.1
  7 | 	github.com/Microsoft/go-winio v0.5.2
  8 | 	github.com/bramvdbogaerde/go-scp v1.2.0
  9 | 	github.com/coreos/go-systemd/v22 v22.5.0
 10 | 	github.com/dustin/go-humanize v1.0.0
 11 | 	github.com/fatih/color v1.7.0
 12 | 	github.com/jessevdk/go-flags v1.5.0
 13 | 	github.com/mattn/go-isatty v0.0.14
 14 | 	github.com/miekg/dns v1.1.43
 15 | 	github.com/schollz/progressbar/v3 v3.8.6
 16 | 	github.com/shirou/gopsutil/v3 v3.23.2
 17 | 	github.com/sirupsen/logrus v1.8.1
 18 | 	github.com/zcalusic/sysinfo v0.0.0-20210905121133-6fa2f969a900
 19 | 	golang.org/x/crypto v0.1.0
 20 | 	k8s.io/api v0.24.7
 21 | 	k8s.io/apimachinery v0.24.7
 22 | 	k8s.io/cli-runtime v0.24.7
 23 | 	k8s.io/client-go v0.24.7
 24 | 	k8s.io/kubectl v0.24.7
 25 | )
 26 | 
 27 | require (
 28 | 	github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect
 29 | 	github.com/MakeNowJust/heredoc v0.0.0-20170808103936-bb23615498cd // indirect
 30 | 	github.com/PuerkitoBio/purell v1.1.1 // indirect
 31 | 	github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect
 32 | 	github.com/chai2010/gettext-go v0.0.0-20160711120539-c6fed771bfd5 // indirect
 33 | 	github.com/davecgh/go-spew v1.1.1 // indirect
 34 | 	github.com/daviddengcn/go-colortext v0.0.0-20160507010035-511bcaf42ccd // indirect
 35 | 	github.com/docker/distribution v2.8.1+incompatible // indirect
 36 | 	github.com/emicklei/go-restful v2.16.0+incompatible // indirect
 37 | 	github.com/evanphx/json-patch v4.12.0+incompatible // indirect
 38 | 	github.com/exponent-io/jsonpath v0.0.0-20151013193312-d6023ce2651d // indirect
 39 | 	github.com/fatih/camelcase v1.0.0 // indirect
 40 | 	github.com/fvbommel/sortorder v1.0.1 // indirect
 41 | 	github.com/go-errors/errors v1.0.1 // indirect
 42 | 	github.com/go-logr/logr v1.2.0 // indirect
 43 | 	github.com/go-ole/go-ole v1.2.6 // indirect
 44 | 	github.com/go-openapi/jsonpointer v0.19.5 // indirect
 45 | 	github.com/go-openapi/jsonreference v0.19.5 // indirect
 46 | 	github.com/go-openapi/swag v0.19.14 // indirect
 47 | 	github.com/godbus/dbus/v5 v5.0.4 // indirect
 48 | 	github.com/gogo/protobuf v1.3.2 // indirect
 49 | 	github.com/golang-jwt/jwt/v4 v4.4.2 // indirect
 50 | 	github.com/golang/protobuf v1.5.2 // indirect
 51 | 	github.com/google/btree v1.0.1 // indirect
 52 | 	github.com/google/gnostic v0.5.7-v3refs // indirect
 53 | 	github.com/google/go-cmp v0.5.9 // indirect
 54 | 	github.com/google/gofuzz v1.1.0 // indirect
 55 | 	github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect
 56 | 	github.com/google/uuid v1.3.0 // indirect
 57 | 	github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 // indirect
 58 | 	github.com/imdario/mergo v0.3.5 // indirect
 59 | 	github.com/inconshreveable/mousetrap v1.0.0 // indirect
 60 | 	github.com/jonboulle/clockwork v0.2.2 // indirect
 61 | 	github.com/josharian/intern v1.0.0 // indirect
 62 | 	github.com/json-iterator/go v1.1.12 // indirect
 63 | 	github.com/kylelemons/godebug v1.1.0 // indirect
 64 | 	github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
 65 | 	github.com/lithammer/dedent v1.1.0 // indirect
 66 | 	github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
 67 | 	github.com/mailru/easyjson v0.7.6 // indirect
 68 | 	github.com/mattn/go-colorable v0.0.9 // indirect
 69 | 	github.com/mattn/go-runewidth v0.0.13 // indirect
 70 | 	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
 71 | 	github.com/mitchellh/go-wordwrap v1.0.0 // indirect
 72 | 	github.com/moby/spdystream v0.2.0 // indirect
 73 | 	github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6 // indirect
 74 | 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 75 | 	github.com/modern-go/reflect2 v1.0.2 // indirect
 76 | 	github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect
 77 | 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 78 | 	github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
 79 | 	github.com/opencontainers/go-digest v1.0.0 // indirect
 80 | 	github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
 81 | 	github.com/pkg/browser v0.0.0-20210115035449-ce105d075bb4 // indirect
 82 | 	github.com/pkg/errors v0.9.1 // indirect
 83 | 	github.com/pmezard/go-difflib v1.0.0 // indirect
 84 | 	github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
 85 | 	github.com/rivo/uniseg v0.2.0 // indirect
 86 | 	github.com/russross/blackfriday v1.5.2 // indirect
 87 | 	github.com/spf13/cobra v1.4.0 // indirect
 88 | 	github.com/spf13/pflag v1.0.5 // indirect
 89 | 	github.com/stretchr/testify v1.8.2 // indirect
 90 | 	github.com/tklauser/go-sysconf v0.3.11 // indirect
 91 | 	github.com/tklauser/numcpus v0.6.0 // indirect
 92 | 	github.com/xlab/treeprint v0.0.0-20181112141820-a009c3971eca // indirect
 93 | 	github.com/yusufpapurcu/wmi v1.2.2 // indirect
 94 | 	go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 // indirect
 95 | 	golang.org/x/net v0.7.0 // indirect
 96 | 	golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 // indirect
 97 | 	golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 // indirect
 98 | 	golang.org/x/sys v0.5.0 // indirect
 99 | 	golang.org/x/term v0.5.0 // indirect
100 | 	golang.org/x/text v0.7.0 // indirect
101 | 	golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect
102 | 	google.golang.org/appengine v1.6.7 // indirect
103 | 	google.golang.org/protobuf v1.27.1 // indirect
104 | 	gopkg.in/inf.v0 v0.9.1 // indirect
105 | 	gopkg.in/yaml.v2 v2.4.0 // indirect
106 | 	k8s.io/component-base v0.24.7 // indirect
107 | 	k8s.io/component-helpers v0.24.7 // indirect
108 | 	k8s.io/klog/v2 v2.60.1 // indirect
109 | 	k8s.io/kube-openapi v0.0.0-20220328201542-3ee0da9b0b42 // indirect
110 | 	k8s.io/metrics v0.24.7 // indirect
111 | 	k8s.io/utils v0.0.0-20220210201930-3a6ce19ff2f9 // indirect
112 | 	sigs.k8s.io/json v0.0.0-20211208200746-9f7c6b3444d2 // indirect
113 | 	sigs.k8s.io/kustomize/api v0.11.4 // indirect
114 | 	sigs.k8s.io/kustomize/kustomize/v4 v4.5.4 // indirect
115 | 	sigs.k8s.io/kustomize/kyaml v0.13.6 // indirect
116 | 	sigs.k8s.io/structured-merge-diff/v4 v4.2.1 // indirect
117 | 	sigs.k8s.io/yaml v1.2.0 // indirect
118 | )
119 | 
120 | require (
121 | 	github.com/c9s/goprocinfo v0.0.0-20210130143923-c95fcf8c64a8
122 | 	github.com/prometheus-community/pro-bing v0.1.0
123 | 	gopkg.in/yaml.v3 v3.0.1
124 | )
125 | 


--------------------------------------------------------------------------------
/pkg/checkers/diskusage/diskusage.go:
--------------------------------------------------------------------------------
  1 | package diskusage
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"fmt"
  6 | 	"reflect"
  7 | 	"strconv"
  8 | 	"strings"
  9 | 
 10 | 	"os/exec"
 11 | 
 12 | 	"github.com/Azure/kdebug/pkg/base"
 13 | )
 14 | 
// Message texts used in disk usage check results. FailedToRunCommand and
// NotSupportedOS are format strings that take one %s argument: the command
// that failed and the reported OS name respectively.
const (
	NoHighDiskUsageResult   = "Disk usage is in normal size. No additional action required."
	HighUsageRecommandation = "Check files listed. If it's just log files or can be deleted, run bash command: `truncate -s 0 /path/to/file` to reduce disk usage. Note: `rm` will not really delete the file if it's opened by processes."
	FailedToRunCommand      = "Failed to check disk usage with '%s'"
	NotSupportedOS          = "The OS is not supported: %s"
)
 21 | 
// Settings of the disk usage checker.
var (
	// DfHeaders maps the upper-cased output of `uname` to the header columns
	// expected in the output of `df -h` on that OS.
	DfHeaders = map[string][]string{
		"LINUX": {
			"Filesystem",
			"Size",
			"Used",
			"Avail",
			"Use%",
			"Mounted",
			"on",
		},
		"FREEBSD": {
			"Filesystem",
			"Size",
			"Used",
			"Avail",
			"Capacity",
			"Mounted",
			"on",
		},
	}

	// DiskUsageRateThreshold is the usage percentage of "/" above which the
	// check reports a problem. InterestedBigFilePath lists the directories
	// searched for large files in that case.
	DiskUsageRateThreshold = 90
	InterestedBigFilePath  = []string{
		"/var/log",
	}
	// InterestedBigFileNum is the number of entries requested per directory.
	InterestedBigFileNum = 10

	// HighdfRecommandations is attached to results reporting high usage.
	HighdfRecommandations = []string{HighUsageRecommandation}
)
 52 | 
// DfRow is one parsed data row of `df -h` output. Size, Used and Avail keep
// the textual values printed by df; Use is the usage percentage with the "%"
// sign removed; MountedOn is the mount point.
type DfRow struct {
	Filesystem string
	Size       string
	Used       string
	Avail      string
	Use        int
	MountedOn  string
}
 61 | 
 62 | type DiskUsageChecker struct {
 63 | }
 64 | 
 65 | func New() *DiskUsageChecker {
 66 | 	return &DiskUsageChecker{}
 67 | }
 68 | 
 69 | func (c *DiskUsageChecker) Name() string {
 70 | 	return "DiskUsage"
 71 | }
 72 | 
 73 | func (c *DiskUsageChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
 74 | 	result := []*base.CheckResult{}
 75 | 
 76 | 	rst, err := c.getDiskUsage()
 77 | 	if err != nil {
 78 | 		return result, err
 79 | 	}
 80 | 	result = append(result, rst)
 81 | 
 82 | 	return result, nil
 83 | }
 84 | 
 85 | func (c *DiskUsageChecker) getDiskUsage() (*base.CheckResult, error) {
 86 | 	out, err := exec.Command("uname").Output()
 87 | 	if err != nil {
 88 | 		return &base.CheckResult{
 89 | 			Checker:     c.Name(),
 90 | 			Error:       fmt.Sprintf(FailedToRunCommand, "uname"),
 91 | 			Description: err.Error(),
 92 | 		}, nil
 93 | 	}
 94 | 
 95 | 	uname := strings.TrimSpace(string(out))
 96 | 	dfHeaders, ok := DfHeaders[strings.ToUpper(uname)]
 97 | 	if !ok {
 98 | 		return &base.CheckResult{
 99 | 			Checker: c.Name(),
100 | 			Error:   fmt.Sprintf(NotSupportedOS, uname),
101 | 		}, nil
102 | 	}
103 | 
104 | 	out, err = exec.Command("df", "-h").Output()
105 | 	if err != nil {
106 | 		return &base.CheckResult{
107 | 			Checker:     c.Name(),
108 | 			Error:       fmt.Sprintf(FailedToRunCommand, "df -h"),
109 | 			Description: err.Error(),
110 | 		}, nil
111 | 	}
112 | 
113 | 	rows, err := parseDfResult(string(out), dfHeaders)
114 | 	if err != nil {
115 | 		return &base.CheckResult{
116 | 			Checker:     c.Name(),
117 | 			Error:       FailedToRunCommand,
118 | 			Description: err.Error(),
119 | 		}, nil
120 | 	}
121 | 
122 | 	found, row := getUsageAt("/", rows)
123 | 	if found && row.Use > DiskUsageRateThreshold {
124 | 		bigFiles := []string{}
125 | 
126 | 		for _, path := range InterestedBigFilePath {
127 | 			output, err := FindTopSizeFiles(path, InterestedBigFileNum)
128 | 			if err != nil {
129 | 				return &base.CheckResult{
130 | 					Checker:         c.Name(),
131 | 					Description:     FormatHighDfDescription(row),
132 | 					Error:           err.Error(),
133 | 					Recommendations: HighdfRecommandations,
134 | 				}, nil
135 | 			}
136 | 
137 | 			bigFiles = append(bigFiles, output)
138 | 		}
139 | 
140 | 		return &base.CheckResult{
141 | 			Checker:         c.Name(),
142 | 			Error:           "Disk is reaching high usage. Details: " + FormatHighDfDescription(row),
143 | 			Description:     "\n" + strings.Join(bigFiles, "\n"),
144 | 			Recommendations: HighdfRecommandations,
145 | 		}, nil
146 | 	}
147 | 
148 | 	return &base.CheckResult{
149 | 		Checker:     c.Name(),
150 | 		Description: fmt.Sprintf("%s Current %v%%, Threshold %v%%", NoHighDiskUsageResult, row.Use, DiskUsageRateThreshold),
151 | 	}, nil
152 | }
153 | 
154 | func getUsageAt(path string, rows []DfRow) (bool, DfRow) {
155 | 	for _, row := range rows {
156 | 		if row.MountedOn == path {
157 | 			return true, row
158 | 		}
159 | 	}
160 | 
161 | 	return false, DfRow{}
162 | }
163 | 
164 | func parseDfResult(output string, dfHeaders []string) ([]DfRow, error) {
165 | 	lines := strings.Split(output, "\n")
166 | 	result := make([]DfRow, 0, len(lines))
167 | 
168 | 	for _, line := range lines {
169 | 		if len(line) == 0 {
170 | 			continue
171 | 		}
172 | 
173 | 		ds := strings.Fields(strings.TrimSpace(line))
174 | 		if ds[0] == dfHeaders[0] {
175 | 			// header
176 | 			if !reflect.DeepEqual(ds, dfHeaders) {
177 | 				return result, errors.New(fmt.Sprintf("Result in df has wrong header format. Expected %v, Actually %v", dfHeaders, ds))
178 | 			}
179 | 			continue
180 | 		}
181 | 
182 | 		row, err := parseDfRow(ds, dfHeaders)
183 | 		if err != nil {
184 | 			return nil, err
185 | 		}
186 | 
187 | 		result = append(result, row)
188 | 	}
189 | 
190 | 	return result, nil
191 | }
192 | 
193 | func parseDfRow(row []string, dfHeader []string) (DfRow, error) {
194 | 	if len(row) != len(dfHeader)-1 {
195 | 		return DfRow{}, fmt.Errorf(`unexpected row column number %v (expected %v)`, row, dfHeader)
196 | 	}
197 | 
198 | 	return DfRow{
199 | 		Filesystem: strings.TrimSpace(row[0]),
200 | 		Size:       strings.TrimSpace(row[1]),
201 | 		Used:       strings.TrimSpace(row[2]),
202 | 		Avail:      strings.TrimSpace(row[3]),
203 | 		Use:        AtoiHepler(strings.TrimSpace(strings.Replace(row[4], "%", "", -1))),
204 | 		MountedOn:  strings.TrimSpace(row[5]),
205 | 	}, nil
206 | }
207 | 
// AtoiHepler converts the decimal string s to an int and discards the
// conversion error, returning whatever value the conversion produced (0 for
// input that is not a number).
func AtoiHepler(s string) int {
	// Bit size 0 selects the size of int.
	value, _ := strconv.ParseInt(s, 10, 0)
	return int(value)
}
212 | 
213 | func FormatHighDfDescription(row DfRow) string {
214 | 	return fmt.Sprintf("[Used %d%%] Filesystem: %s, UsedSize: %s, AvailableSize: %s, MountedOn %s", row.Use, row.Filesystem, row.Used, row.Avail, row.MountedOn)
215 | }
216 | 
// FindTopSizeFiles returns the topCount largest entries below path as printed
// by `du -ah`, sorted from largest to smallest, one entry per line.
//
// The pipeline is run through bash. An error is returned when bash cannot be
// started or the pipeline exits with a non-zero status.
func FindTopSizeFiles(path string, topCount int) (string, error) {
	// path and topCount are handed to the script as positional parameters
	// instead of being pasted into the command line, so paths containing
	// spaces or shell metacharacters are treated as one literal argument.
	const script = `du -ah -- "$1" | sort -rh | head -n "$2"`
	out, err := exec.Command("bash", "-c", script, "bash", path, strconv.Itoa(topCount)).Output()
	if err != nil {
		return "", err
	}

	return string(out), nil
}
227 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # kdebug
  2 | 
  3 | kdebug is a command line utility that helps troubleshoot a running Kubernetes cluster and apps in it.
  4 | 
  5 | It focuses on DevOps scenarios and covers these areas:
  6 | 
  7 | * OS diagnostics
  8 | * Kubernetes components diagnostics
  9 | * Lightweight application diagnostics
 10 | 
 11 | ## Check mode
 12 | 
 13 | kdebug runs in check mode by default.
 14 | By running a set of predefined checks, it gives diagnostics information and guides you to next steps.
 15 | 
 16 | Currently kdebug supports following checks:
 17 | 
 18 | * Disk usage: Check disk usage and identify the largest files.
 19 | * Disk read-only: Check if the user home directory is read-only.
 20 | * DNS: Check cluster DNS.
 21 | * HTTP: Check HTTP connectivity to well known endpoints.
 22 | * Kube object size: Check configmap/secret object size.
 23 | * Kube pod: Check pod restart reasons.
 24 | * Kube pod scheduling: Check pods scheduling issues.
 25 | * OOM: Analyze out-of-memory events.
 26 | * System load: Check the CPU and Memory of VM and some primary processes (etcd, kubelet...)
 27 | * TCP: Check if the TCP connection could be established to exposed services (external load balancer, internal cluster service)
 28 | * Ping: Check if ICMP ping works towards a public IP (8.8.8.8) and cluster IPs (node, pod).
 29 | * KMS cache size: Check if API server KMS cache size is sufficient to hold all secrets.
 30 | 
 31 | ## How to use
 32 | 
 33 | ### Basic
 34 | 
 35 | Run all checks:
 36 | 
 37 | ```bash
 38 | kdebug
 39 | ```
 40 | 
 41 | Run a specific check:
 42 | 
 43 | ```bash
 44 | kdebug -c dns
 45 | ```
 46 | 
 47 | List available checks:
 48 | 
 49 | ```bash
 50 | kdebug --list
 51 | ```
 52 | 
 53 | See full supported arguments and help:
 54 | 
 55 | ```bash
 56 | kdebug -h
 57 | ```
 58 | 
 59 | ### Kubernetes checks
 60 | 
 61 | Kubernetes related checks require a working kubeconfig. You can either put it at the default location `$HOME/.kube/config`, or you can specify via `--kube-config-path`:
 62 | 
 63 | ```bash
 64 | kdebug -c kubepod \
 65 |     --kube-config-path /path/to/kubeconfig
 66 | ```
 67 | 
 68 | ### Batch mode
 69 | 
 70 | kdebug supports running on a batch of remote machines simultaneously via SSH.
 71 | 
 72 | Explicitly specify a list of machine names:
 73 | 
 74 | ```bash
 75 | kdebug -c dns \
 76 |     --batch.machines=machine-1 \
 77 |     --batch.machines=machine-2 \
 78 |     --batch.concurrency=2 \
 79 |     --batch.ssh-user=azureuser
 80 | ```
 81 | 
 82 | Read machine names list from a file or stdin:
 83 | 
 84 | ```bash
 85 | # From file
 86 | kdebug -c dns \
 87 |     --batch.machines-file=/path/to/machine/names/file
 88 | 
 89 | # From stdin
 90 | kubectl get nodes | grep NotReady | awk '{print $1}' | kdebug -c dns --batch.machines-file=-
 91 | ```
 92 | 
 93 | Auto-discover the machine list via the Kubernetes API server:
 94 | 
 95 | ```bash
 96 | kdebug -c dns --batch.kube-machines
 97 | ```
 98 | 
 99 | In addition, you can specify a label selector:
100 | 
101 | ```bash
102 | kdebug -c dns \
103 |     --batch.kube-machines \
104 |     --batch.kube-machines-label=kubernetes.io/role=agent
105 | ```
106 | 
107 | Or filter out unready nodes only:
108 | 
109 | ```bash
110 | kdebug -c dns \
111 |     --batch.kube-machines-unready
112 | ```
113 | 
114 | ## Tool mode
115 | 
116 | In addition to the default check mode, kdebug also supports a tool mode.
117 | Tool mode wraps useful commands and makes them easier to use in typical scenarios.
118 | 
119 | 
120 | Currently kdebug provides following tools:
121 | 
122 | * Tcpdump: Wrap tcpdump command and provides a simpler interface for container scenarios.
123 | * Reboot reason: Inspect last reboot reason.
124 | * AAD SSH: SSH via AAD. This is a handy replacement for the original Azure CLI based implementation.
125 | * NetExec: Execute a command in the same network namespace as a specific process or pod.
126 | 
127 | You can see a full list with:
128 | 
129 | ```bash
130 | kdebug --list
131 | ```
132 | 
133 | Use following command to start a tool:
134 | 
135 | ```bash
136 | kdebug -t <tool>
137 | ```
138 | 
139 | Show tool specific options:
140 | 
141 | ```bash
142 | kdebug -t <tool> -h
143 | ```
144 | 
145 | ### Tcpdump
146 | 
147 | Attach to network namespace of a process with pid=100 and capture all traffic:
148 | 
149 | ```bash
150 | kdebug -t tcpdump --pid=100
151 | ```
152 | 
153 | With source and destination specified, and TCP only:
154 | 
155 | ```bash
156 | kdebug -t tcpdump \
157 |     --pid=100 \
158 |     --source=10.0.0.1:1000 \
159 |     --destination=10.0.0.2:2000 \
160 |     --tcponly
161 | ```
162 | 
163 | `--host` matches either source or destination:
164 | 
165 | ```bash
166 | kdebug -t tcpdump --host=10.0.0.1:1000
167 | ```
168 | 
169 | ### Reboot reason
170 | 
171 | Check VM last reboot reason within last 1 day:
172 | 
173 | ```
174 | kdebug -t vmrebootdetector
175 | ```
176 | 
177 | Check VM last reboot reason within last 100 days:
178 | 
179 | ```
180 | kdebug -t vmrebootdetector \
181 |     --checkdays=100
182 | ```
183 | 
184 | ### Package upgrade inspect
185 | 
186 | Check upgraded packages within last 14 days:
187 | 
188 | ```
189 | kdebug --tool upgradeinspector --checkdays 14
190 | ```
191 | 
192 | Check upgraded package within last 7 days, limit 10 records:
193 | 
194 | ```
195 | kdebug --tool upgradeinspector --recordlimit 10
196 | ```
197 | 
198 | ### AAD SSH
199 | 
200 | SSH via AAD. See [Azure Linux VMs and Azure AD](https://learn.microsoft.com/en-us/azure/active-directory/devices/howto-vm-sign-in-azure-ad-linux).
201 | 
202 | This is a handy replacement for the original Azure CLI based implementation.
203 | 
204 | Login via interactive flow:
205 | 
206 | ```bash
207 | kdebug -t aadssh <user>@<tenant>@<hostname-or-ip>
208 | ```
209 | 
210 | A browser will pop up for credentials.
211 | 
212 | Login via Azure CLI credentials:
213 | 
214 | ```bash
215 | az login
216 | kdebug -t aadssh --use-azure-cli <user>@<tenant>@<hostname-or-ip>
217 | ```
218 | 
219 | ### NetExec
220 | Execute a command in the same network namespace as a process. You need to be on the VM where the process is running.
221 | 
222 | ```bash
223 | kdebug -t netexec --pid=<process-pid>
224 | ```
225 | 
226 | Execute a command in the same network namespace as a pod. You need a working kubeconfig.
227 | 
228 | ```bash
229 | kdebug -t netexec --pod=<pod-name> --namespace=<pod-namespace>
230 | ```
231 | 
232 | Specify the command to run with `--command=`. The default command is `sh`.
233 | 
234 | ## Development
235 | 
236 | Prerequisite:
237 | 
238 | * [Golang](https://go.dev/dl/)
239 | 
240 | Build:
241 | 
242 | ```bash
243 | make build
244 | ```
245 | 
246 | Test:
247 | 
248 | ```bash
249 | make test
250 | ```
251 | 
252 | ## Contributing
253 | 
254 | This project welcomes contributions and suggestions.  Most contributions require you to agree to a
255 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
256 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
257 | 
258 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
259 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
260 | provided by the bot. You will only need to do this once across all repos using our CLA.
261 | 
262 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
263 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
264 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
265 | 
266 | ## Trademarks
267 | 
268 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
269 | trademarks or logos is subject to and must follow
270 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
271 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
272 | Any use of third-party trademarks or logos are subject to those third-party's policies.
273 | 


--------------------------------------------------------------------------------
/pkg/checkers/systemload/systemload.go:
--------------------------------------------------------------------------------
  1 | package systemload
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"path/filepath"
  6 | 	"time"
  7 | 
  8 | 	"github.com/Azure/kdebug/pkg/base"
  9 | 	linuxproc "github.com/c9s/goprocinfo/linux"
 10 | )
 11 | 
// Message templates produced by the SystemLoad checker. The *TooHigh values
// are fmt format strings filled in by Check; the *Recommandation values are
// used unchanged as the Description of the matching check result.
const (
	// Arguments: current VM CPU usage (%), VM CPU threshold (%).
	GlobalCPUTooHigh               = "The VM's CPU usage is higher than threshold. Currently %.1f%% (threshold is %.1f%%)."
	// Arguments: current VM memory usage (%), VM memory threshold (%).
	GlobalMemoryTooHigh            = "The VM's Memory usage is higher than threshold. Currently %.1f%% (threshold is %.1f%%)"
	// Arguments: pid, process name, usage relative to the whole VM (%), its
	// threshold (%), usage relative to one core (%), its threshold (%).
	ProcessCPUTooHigh              = "The CPU usage of process [%d] (%s) is higher than threshold. The proportion of cpu is %.1f%% to whole capacity (threshold is %.1f%%). The proportion of cpu is %.1f%% to one core (threshold is %.1f%%)"
	// Advice attached to a GlobalCPUTooHigh result.
	GloablHighCPURecommandation    = "You may remote to the target VM and use 'top' to find out which process consumes most of CPU. Further actions may depends."
	// Advice attached to a GlobalMemoryTooHigh result.
	GloablHighMemoryRecommandation = "You may remote to the target VM and use 'top' to find out which process consumes most of Memory. Further actions may depends."
	// Advice attached to a ProcessCPUTooHigh result.
	ProcessHighCPURecommandation   = "You may restart to process if feasible and see whether the CPU usage comes to normal. Or you can 'perf' to diagnose the root cause."
)
 20 | 
// Tunable thresholds and sampling parameters read by Check and its helpers.
var (
	VMCPUPercentageLimit    float64 = 80  // VM-wide CPU usage threshold, as a percentage of the whole VM CPU capacity (100 = all capacity in use)
	VMMemoryPercentageLimit float64 = 90  // VM-wide memory usage threshold, as a percentage of total memory (100 = all memory in use)
	ClkTck                  float64 = 100 // Clock ticks per second used to convert process CPU time into seconds; presumably the usual Linux USER_HZ value — confirm for the target platform
	CPUSpan                 float64 = 1   // Length, in seconds, of the sleep between the two CPU samples taken by Check
	// InterestedProcNames maps a process command name to the CPU limits that
	// Check applies to every running process with that name.
	InterestedProcNames             = map[string]ProcLimitMeasurement{
		"etcd":           {CPULimitAsGloabl: 50, CPULimitAsSingleCore: 80},
		"kubelet":        {CPULimitAsGloabl: 50, CPULimitAsSingleCore: 80},
		"kube-apiserver": {CPULimitAsGloabl: 50, CPULimitAsSingleCore: 80}}
)
 31 | 
// InterestedProc is a snapshot of one monitored process, taken by
// getInterestedProc before the CPU sampling interval starts. Check later
// re-reads StatFilePath and compares the new CPU time against TotalTime.
type InterestedProc struct {
	StatFilePath         string  // Path of the process stat file, of the form /proc/[pid]/stat
	Name                 string  // Command name of the process, with the surrounding parentheses removed
	Pid                  uint64  // Process id as reported in the stat file
	TotalTime            uint64  // Utime + Stime of the process at snapshot time, in clock ticks
	CPULimitAsGloabl     float64 // CPU limit (%) relative to the whole VM CPU capacity
	CPULimitAsSingleCore float64 // CPU limit (%) relative to a single core
}
 40 | 
// ProcLimitMeasurement holds the two CPU thresholds configured for a process
// name in InterestedProcNames. A process is reported when either is exceeded.
type ProcLimitMeasurement struct {
	CPULimitAsGloabl     float64 // Threshold (%) relative to the whole VM CPU capacity; 100 means all CPU capacity is in use
	CPULimitAsSingleCore float64 // Threshold (%) relative to one core; 100 means one full core, so the maximum is 100 * number of cores
}
 45 | 
 46 | type SystemLoadChecker struct {
 47 | }
 48 | 
 49 | func New() *SystemLoadChecker {
 50 | 	return &SystemLoadChecker{}
 51 | }
 52 | 
 53 | func (c *SystemLoadChecker) Name() string {
 54 | 	return "SystemLoad"
 55 | }
 56 | 
 57 | func (c *SystemLoadChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
 58 | 	result := []*base.CheckResult{}
 59 | 
 60 | 	if !ctx.Environment.HasFlag("linux") {
 61 | 		return result, nil
 62 | 	}
 63 | 
 64 | 	// VM Memory
 65 | 	memInfo, err := linuxproc.ReadMemInfo("/proc/meminfo")
 66 | 	if err != nil {
 67 | 		return result, err
 68 | 	}
 69 | 	var memUsage = getMemPercentage(memInfo.MemAvailable, memInfo.MemTotal)
 70 | 	if memUsage > VMMemoryPercentageLimit {
 71 | 		result = append(result, &base.CheckResult{
 72 | 			Checker:     c.Name(),
 73 | 			Error:       fmt.Sprintf(GlobalMemoryTooHigh, memUsage, VMMemoryPercentageLimit),
 74 | 			Description: GloablHighMemoryRecommandation,
 75 | 		})
 76 | 	}
 77 | 
 78 | 	interestedProcesses, err := getInterestedProc()
 79 | 	if err != nil {
 80 | 		return result, err
 81 | 	}
 82 | 
 83 | 	// Read global status
 84 | 	stat, err := linuxproc.ReadStat("/proc/stat")
 85 | 	if err != nil {
 86 | 		return result, err
 87 | 	}
 88 | 
 89 | 	// How to calculate global cpu usage: https://rosettacode.org/wiki/Linux_CPU_utilization
 90 | 	previousIdleTime, previousTotalTime := getSystemCPUTime(stat.CPUStatAll)
 91 | 
 92 | 	// Sleep a time span and check cpu time again to get average CPU load
 93 | 	time.Sleep(time.Duration(CPUSpan * float64(time.Second)))
 94 | 
 95 | 	stat, err = linuxproc.ReadStat("/proc/stat")
 96 | 	if err != nil {
 97 | 		return result, err
 98 | 	}
 99 | 
100 | 	idleTime, totalTime := getSystemCPUTime(stat.CPUStatAll)
101 | 	var deltaSystemIdleTime = idleTime - previousIdleTime
102 | 	var deltaSystemTotalTime = totalTime - previousTotalTime
103 | 	var usage = getSystemCPUPercentage(deltaSystemIdleTime, deltaSystemTotalTime)
104 | 
105 | 	// VM CPU
106 | 	if usage > VMCPUPercentageLimit {
107 | 		result = append(result, &base.CheckResult{
108 | 			Checker:     c.Name(),
109 | 			Error:       fmt.Sprintf(GlobalCPUTooHigh, usage, VMCPUPercentageLimit),
110 | 			Description: GloablHighCPURecommandation,
111 | 		})
112 | 	}
113 | 
114 | 	// Interested proc cpu
115 | 	for _, proc := range interestedProcesses {
116 | 		stat, err := linuxproc.ReadProcessStat(proc.StatFilePath)
117 | 		if err != nil {
118 | 			continue
119 | 		}
120 | 
121 | 		// https://stackoverflow.com/questions/16726779/how-do-i-get-the-total-cpu-usage-of-an-application-from-proc-pid-stat/16736599#16736599
122 | 		totalTime := stat.Utime + stat.Stime
123 | 		cpuUsageAsGlobal := getProcessCPUPercentageAsGlobal(totalTime-proc.TotalTime, deltaSystemTotalTime)
124 | 		cpuUsageAsSingleCore := getProcessCPUPercentageAsSingleCore(totalTime-proc.TotalTime, CPUSpan)
125 | 
126 | 		if cpuUsageAsGlobal > proc.CPULimitAsGloabl || cpuUsageAsSingleCore > proc.CPULimitAsSingleCore {
127 | 			result = append(result, &base.CheckResult{
128 | 				Checker:     c.Name(),
129 | 				Error:       fmt.Sprintf(ProcessCPUTooHigh, proc.Pid, proc.Name, cpuUsageAsGlobal, proc.CPULimitAsGloabl, cpuUsageAsSingleCore, proc.CPULimitAsSingleCore),
130 | 				Description: ProcessHighCPURecommandation,
131 | 			})
132 | 		}
133 | 	}
134 | 
135 | 	return result, nil
136 | }
137 | 
138 | func getTotalTime(stat linuxproc.CPUStat) uint64 {
139 | 	return stat.User + stat.Nice + stat.System + stat.Idle + stat.IOWait + stat.IRQ + stat.SoftIRQ +
140 | 		stat.Steal + stat.Guest + stat.GuestNice
141 | }
142 | 
143 | func getInterestedProc() ([]*InterestedProc, error) {
144 | 	result := []*InterestedProc{}
145 | 
146 | 	procStatusFiles, err := filepath.Glob("/proc/[0-9]*/stat")
147 | 	if err != nil {
148 | 		return result, err
149 | 	}
150 | 
151 | 	// Read status and find out interested process
152 | 	for _, f := range procStatusFiles {
153 | 		stat, err := linuxproc.ReadProcessStat(f)
154 | 		if err != nil {
155 | 			continue
156 | 		}
157 | 
158 | 		var cmd = stat.Comm[1 : len(stat.Comm)-1] // name: (cmd)
159 | 		if limit, ok := InterestedProcNames[cmd]; ok {
160 | 			result = append(result, &InterestedProc{
161 | 				StatFilePath:         f,
162 | 				Name:                 cmd,
163 | 				Pid:                  stat.Pid,
164 | 				CPULimitAsGloabl:     limit.CPULimitAsGloabl,
165 | 				CPULimitAsSingleCore: limit.CPULimitAsSingleCore,
166 | 				TotalTime:            stat.Utime + stat.Stime, // Time in user space + Time in kernal space
167 | 			})
168 | 		}
169 | 	}
170 | 
171 | 	return result, nil
172 | }
173 | 
174 | func getSystemCPUTime(stat linuxproc.CPUStat) (idleTime uint64, totalTime uint64) {
175 | 	return stat.Idle, getTotalTime(stat)
176 | }
177 | 
// getMemPercentage returns the percentage of memory in use, computed as
// 100 minus the available share of memTotal. Both arguments must use the same
// unit. When memTotal is 0 the ratio is undefined, so 0 is returned instead
// of NaN.
func getMemPercentage(memAvailable uint64, memTotal uint64) float64 {
	if memTotal == 0 {
		return 0
	}
	return 100 - (float64(100*memAvailable) / float64(memTotal))
}
181 | 
// getSystemCPUPercentage returns the VM-wide CPU usage over a sampling
// interval as a percentage: 100 minus the idle share of the elapsed CPU time.
// When deltaSystemTime is 0 (no time elapsed between the samples) the ratio
// is undefined, so 0 is returned instead of NaN.
func getSystemCPUPercentage(deltaSystemIdleTime uint64, deltaSystemTime uint64) float64 {
	if deltaSystemTime == 0 {
		return 0
	}
	return 100 - (float64(100*(deltaSystemIdleTime)) / float64(deltaSystemTime))
}
185 | 
// getProcessCPUPercentageAsGlobal returns the share of the whole VM CPU time
// that a process consumed over a sampling interval, as a percentage. When
// deltaSystemCPUTime is 0 the ratio is undefined; 0 is returned rather than
// +Inf or NaN, either of which could trip a threshold comparison or be
// printed in a report.
func getProcessCPUPercentageAsGlobal(deltaProcessCPUTime uint64, deltaSystemCPUTime uint64) float64 {
	if deltaSystemCPUTime == 0 {
		return 0
	}
	return 100 * float64(deltaProcessCPUTime) / float64(deltaSystemCPUTime)
}
189 | 
190 | func getProcessCPUPercentageAsSingleCore(deltaProcessCPUTime uint64, deltaRealTimeInSeconds float64) float64 {
191 | 	return 100 * float64(deltaProcessCPUTime) / deltaRealTimeInSeconds / ClkTck // deltaCPUTime / ClrTck = deltaProcessCPUTime in seconds
192 | }
193 | 


--------------------------------------------------------------------------------
/pkg/batch/pod_executor.go:
--------------------------------------------------------------------------------
  1 | package batch
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"encoding/json"
  6 | 	"fmt"
  7 | 	"math/rand"
  8 | 	"time"
  9 | 
 10 | 	log "github.com/sirupsen/logrus"
 11 | 	batchv1 "k8s.io/api/batch/v1"
 12 | 	corev1 "k8s.io/api/core/v1"
 13 | 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 14 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 15 | 	"k8s.io/client-go/kubernetes"
 16 | )
 17 | 
// PodBatchExecutor runs kdebug checks on a batch of machines by creating one
// Kubernetes Job per machine and reading the check results from the pod log.
type PodBatchExecutor struct {
	Client    *kubernetes.Clientset // Client used to create the namespace and jobs and to read pod logs
	Image     string                // Container image the job pods run
	Namespace string                // Namespace the jobs are created in; created on demand by Execute
	Mode      string                // "host" selects the host-mode pod template; any other value selects container mode
}
 24 | 
 25 | func NewPodBatchExecutor(kubeClient *kubernetes.Clientset, image, ns, mode string) *PodBatchExecutor {
 26 | 	e := &PodBatchExecutor{
 27 | 		Client:    kubeClient,
 28 | 		Image:     image,
 29 | 		Namespace: ns,
 30 | 		Mode:      mode,
 31 | 	}
 32 | 
 33 | 	log.WithFields(log.Fields{
 34 | 		"image": image, "namespace": ns, "mode": mode,
 35 | 	}).Debug("NewPodBatchExecutor")
 36 | 
 37 | 	return e
 38 | }
 39 | 
 40 | func (e *PodBatchExecutor) generateRunName() string {
 41 | 	rand.Seed(time.Now().UnixNano())
 42 | 	b := make([]byte, 10)
 43 | 	rand.Read(b)
 44 | 	return fmt.Sprintf("kdebug-%x", b)
 45 | }
 46 | 
 47 | func (e *PodBatchExecutor) isJobCompleted(job *batchv1.Job) bool {
 48 | 	if job.Status.Conditions != nil {
 49 | 		for _, cond := range job.Status.Conditions {
 50 | 			if cond.Type == "Complete" && cond.Status == "True" {
 51 | 				return true
 52 | 			}
 53 | 		}
 54 | 	}
 55 | 	return false
 56 | }
 57 | 
 58 | func (e *PodBatchExecutor) Execute(opts *BatchOptions) ([]*BatchResult, error) {
 59 | 	ns := &corev1.Namespace{
 60 | 		ObjectMeta: metav1.ObjectMeta{
 61 | 			Name: e.Namespace,
 62 | 		},
 63 | 	}
 64 | 	_, err := e.Client.CoreV1().Namespaces().Create(
 65 | 		context.Background(), ns, metav1.CreateOptions{})
 66 | 	if err != nil && !apierrors.IsAlreadyExists(err) {
 67 | 		return nil, fmt.Errorf("Fail to create namespace %s for batch operations: %s",
 68 | 			e.Namespace, err)
 69 | 	}
 70 | 
 71 | 	taskChan := make(chan *batchTask, opts.Concurrency)
 72 | 	resultChan := make(chan *BatchResult, opts.Concurrency)
 73 | 	runName := e.generateRunName()
 74 | 
 75 | 	for i := 0; i < opts.Concurrency; i++ {
 76 | 		go e.startWorker(runName, taskChan, resultChan)
 77 | 	}
 78 | 
 79 | 	for _, machine := range opts.Machines {
 80 | 		go func(m string) {
 81 | 			taskChan <- &batchTask{
 82 | 				Machine:  m,
 83 | 				Checkers: opts.Checkers,
 84 | 			}
 85 | 		}(machine)
 86 | 	}
 87 | 
 88 | 	results := make([]*BatchResult, 0, len(opts.Machines))
 89 | 	for i := 0; i < len(opts.Machines); i++ {
 90 | 		result := <-resultChan
 91 | 		results = append(results, result)
 92 | 		opts.Reporter.OnResult(result)
 93 | 	}
 94 | 
 95 | 	close(taskChan)
 96 | 
 97 | 	return results, nil
 98 | }
 99 | 
100 | func (e *PodBatchExecutor) startWorker(runName string, taskChan chan *batchTask, resultChan chan *BatchResult) {
101 | 	for task := range taskChan {
102 | 		resultChan <- e.executeTask(runName, task)
103 | 	}
104 | }
105 | 
106 | func (e *PodBatchExecutor) getPodTemplateSpecContainerMode(cmd []string, machine string) corev1.PodTemplateSpec {
107 | 	return corev1.PodTemplateSpec{
108 | 		Spec: corev1.PodSpec{
109 | 			Containers: []corev1.Container{
110 | 				corev1.Container{
111 | 					Name:            "kdebug",
112 | 					Image:           e.Image,
113 | 					Command:         cmd,
114 | 					ImagePullPolicy: corev1.PullIfNotPresent,
115 | 				},
116 | 			},
117 | 			RestartPolicy: "Never",
118 | 			NodeName:      machine,
119 | 		},
120 | 	}
121 | }
122 | 
123 | func (e *PodBatchExecutor) getPodTemplateSpecHostMode(rawCmd []string, machine string) corev1.PodTemplateSpec {
124 | 	cmd := []string{"/run-as-host"}
125 | 	cmd = append(cmd, rawCmd...)
126 | 
127 | 	privileged := true
128 | 	hostPathSocket := corev1.HostPathSocket
129 | 	hostPathDirectory := corev1.HostPathDirectory
130 | 
131 | 	return corev1.PodTemplateSpec{
132 | 		Spec: corev1.PodSpec{
133 | 			Containers: []corev1.Container{
134 | 				corev1.Container{
135 | 					Name:    "kdebug",
136 | 					Image:   e.Image,
137 | 					Command: cmd,
138 | 					SecurityContext: &corev1.SecurityContext{
139 | 						Privileged: &privileged,
140 | 					},
141 | 					VolumeMounts: []corev1.VolumeMount{
142 | 						corev1.VolumeMount{
143 | 							Name:      "system-bus-socket",
144 | 							MountPath: "/var/run/dbus/system_bus_socket",
145 | 						},
146 | 						corev1.VolumeMount{
147 | 							Name:      "systemd-system-config",
148 | 							MountPath: "/etc/systemd/system",
149 | 						},
150 | 						corev1.VolumeMount{
151 | 							Name:      "tmp",
152 | 							MountPath: "/tmp",
153 | 						},
154 | 					},
155 | 					ImagePullPolicy: corev1.PullIfNotPresent,
156 | 				},
157 | 			},
158 | 			Volumes: []corev1.Volume{
159 | 				corev1.Volume{
160 | 					Name: "system-bus-socket",
161 | 					VolumeSource: corev1.VolumeSource{
162 | 						HostPath: &corev1.HostPathVolumeSource{
163 | 							Path: "/var/run/dbus/system_bus_socket",
164 | 							Type: &hostPathSocket,
165 | 						},
166 | 					},
167 | 				},
168 | 				corev1.Volume{
169 | 					Name: "systemd-system-config",
170 | 					VolumeSource: corev1.VolumeSource{
171 | 						HostPath: &corev1.HostPathVolumeSource{
172 | 							Path: "/etc/systemd/system",
173 | 							Type: &hostPathDirectory,
174 | 						},
175 | 					},
176 | 				},
177 | 				corev1.Volume{
178 | 					Name: "tmp",
179 | 					VolumeSource: corev1.VolumeSource{
180 | 						HostPath: &corev1.HostPathVolumeSource{
181 | 							Path: "/tmp",
182 | 							Type: &hostPathDirectory,
183 | 						},
184 | 					},
185 | 				},
186 | 			},
187 | 			RestartPolicy: "Never",
188 | 			NodeName:      machine,
189 | 		},
190 | 	}
191 | }
192 | 
193 | func (e *PodBatchExecutor) executeTask(runName string, task *batchTask) *BatchResult {
194 | 	result := &BatchResult{
195 | 		Machine: task.Machine,
196 | 	}
197 | 
198 | 	// Create job
199 | 	cmd := []string{
200 | 		"/kdebug",
201 | 		"-f", "json",
202 | 		"--no-set-exit-code",
203 | 		"-v", "none",
204 | 	}
205 | 	for _, checker := range task.Checkers {
206 | 		cmd = append(cmd, "-c")
207 | 		cmd = append(cmd, checker)
208 | 	}
209 | 
210 | 	ttl := int32(300)
211 | 	backoff := int32(0)
212 | 	job := &batchv1.Job{
213 | 		ObjectMeta: metav1.ObjectMeta{
214 | 			Name:      fmt.Sprintf("%s-%s", runName, task.Machine),
215 | 			Namespace: e.Namespace,
216 | 			Labels: map[string]string{
217 | 				"kdebug-run": runName,
218 | 			},
219 | 		},
220 | 		Spec: batchv1.JobSpec{
221 | 			TTLSecondsAfterFinished: &ttl,
222 | 			BackoffLimit:            &backoff,
223 | 		},
224 | 	}
225 | 
226 | 	if e.Mode == "host" {
227 | 		job.Spec.Template = e.getPodTemplateSpecHostMode(cmd, task.Machine)
228 | 	} else {
229 | 		job.Spec.Template = e.getPodTemplateSpecContainerMode(cmd, task.Machine)
230 | 	}
231 | 
232 | 	job, err := e.Client.BatchV1().Jobs(e.Namespace).Create(
233 | 		context.Background(), job, metav1.CreateOptions{})
234 | 	if err != nil {
235 | 		result.Error = fmt.Errorf("fail to create Kubernetes job: %+v", err)
236 | 		return result
237 | 	}
238 | 
239 | 	// Wait for job
240 | 	timeout := 5 * time.Minute
241 | 	startTime := time.Now()
242 | 	for {
243 | 		time.Sleep(5 * time.Second)
244 | 
245 | 		job, err := e.Client.BatchV1().Jobs(e.Namespace).Get(
246 | 			context.Background(), job.Name, metav1.GetOptions{})
247 | 		if err != nil {
248 | 			result.Error = fmt.Errorf("fail to get Kubernetes job %s: %+v", job.Name, err)
249 | 			return result
250 | 		}
251 | 
252 | 		if e.isJobCompleted(job) {
253 | 			break
254 | 		}
255 | 
256 | 		if time.Now().Sub(startTime) >= timeout {
257 | 			result.Error = fmt.Errorf("timeout waiting for Kubernetes job %s: %+v", job.Name, err)
258 | 			return result
259 | 		}
260 | 	}
261 | 
262 | 	// Fetch pod log
263 | 	pods, err := e.Client.CoreV1().Pods(e.Namespace).List(context.Background(), metav1.ListOptions{
264 | 		LabelSelector: "job-name=" + job.Name,
265 | 	})
266 | 	if err != nil {
267 | 		result.Error = fmt.Errorf("fail to get Kubernetes pods of job %s: %+v", job.Name, err)
268 | 		return result
269 | 	}
270 | 
271 | 	// Parse result
272 | 	pod := pods.Items[0]
273 | 	req := e.Client.CoreV1().Pods(e.Namespace).GetLogs(
274 | 		pod.Name, &corev1.PodLogOptions{})
275 | 	logs, err := req.Stream(context.Background())
276 | 	if err != nil {
277 | 		result.Error = fmt.Errorf("fail to stream logs of pod %s: %+v", pod.Name, err)
278 | 		return result
279 | 	}
280 | 	defer logs.Close()
281 | 
282 | 	decoder := json.NewDecoder(logs)
283 | 
284 | 	result.Error = decoder.Decode(&result.CheckResults)
285 | 
286 | 	return result
287 | }
288 | 


--------------------------------------------------------------------------------
/cmd/main.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io/ioutil"
  6 | 	"os"
  7 | 	"path/filepath"
  8 | 	"runtime/debug"
  9 | 
 10 | 	"github.com/fatih/color"
 11 | 	flags "github.com/jessevdk/go-flags"
 12 | 	"github.com/mattn/go-isatty"
 13 | 	"github.com/sirupsen/logrus"
 14 | 	log "github.com/sirupsen/logrus"
 15 | 	"k8s.io/cli-runtime/pkg/genericclioptions"
 16 | 	"k8s.io/client-go/kubernetes"
 17 | 	"k8s.io/client-go/tools/clientcmd"
 18 | 	"k8s.io/client-go/util/homedir"
 19 | 
 20 | 	"github.com/Azure/kdebug/pkg/base"
 21 | 	chks "github.com/Azure/kdebug/pkg/checkers"
 22 | 	"github.com/Azure/kdebug/pkg/env"
 23 | 	"github.com/Azure/kdebug/pkg/formatters"
 24 | 	tools "github.com/Azure/kdebug/pkg/tools"
 25 | )
 26 | 
 27 | type Options struct {
 28 | 	ListCheckers   bool     `short:"l" long:"list" description:"List all checks and tools"`
 29 | 	Checkers       []string `short:"c" long:"check" description:"Check name. Can specify multiple times."`
 30 | 	Tool           string   `short:"t" long:"tool" description:"Use tool"`
 31 | 	Format         string   `short:"f" long:"format" description:"Output format"`
 32 | 	KubeMasterUrl  string   `long:"kube-master-url" description:"Kubernetes API server URL"`
 33 | 	KubeConfigPath string   `long:"kube-config-path" description:"Path to kubeconfig file"`
 34 | 	Verbose        string   `short:"v" long:"verbose" description:"Log level"`
 35 | 	NoColor        bool     `long:"no-color" description:"Disable colorized output"`
 36 | 	Pause          bool     `long:"pause" description:"Pause until interrupted"`
 37 | 	Help           bool     `short:"h" long:"help" description:"Show help message"`
 38 | 	NoSetExitCode  bool     `long:"no-set-exit-code" hidden:"-"`
 39 | 	Output         string   `short:"o" long:"output" description:"Output file"`
 40 | 
 41 | 	Batch struct {
 42 | 		KubeMachines              bool     `long:"kube-machines" description:"Discover machines from Kubernetes API server"`
 43 | 		KubeMachinesUnready       bool     `long:"kube-machines-unready" description:"Discover unready machines from Kubernetes API server"`
 44 | 		KubeMachinesLabelSelector string   `long:"kube-machines-label" description:"Label selector for Kubernetes machines"`
 45 | 		Machines                  []string `long:"machines" description:"Machine names"`
 46 | 		MachinesFile              string   `long:"machines-file" description:"Path to a file that contains machine names list. Can use - to read from stdin."`
 47 | 		Concurrency               int      `long:"concurrency" default:"4" description:"Batch concurrency"`
 48 | 		SshUser                   string   `long:"ssh-user" description:"SSH user"`
 49 | 		PodExecutorImage          string   `long:"pod-executor-image" description:"Container image used by pod executor"`
 50 | 		PodExecutorNamespace      string   `long:"pod-executor-namespace" description:"Namespace used by pod executor" default:"kdebug"`
 51 | 		PodExecutorMode           string   `long:"pod-executor-mode" choice:"host" choice:"container" default:"host" description:"Run as container or run as host"`
 52 | 	} `group:"Batch Options" namespace:"batch" description:"Batch mode"`
 53 | 
 54 | 	RemainingArgs []string
 55 | }
 56 | 
 57 | func (o *Options) IsBatchMode() bool {
 58 | 	return o.Batch.KubeMachines || o.Batch.KubeMachinesUnready || len(o.Batch.Machines) > 0 || len(o.Batch.MachinesFile) > 0
 59 | }
 60 | 
 61 | func (o *Options) IsToolMode() bool {
 62 | 	return len(o.Tool) > 0
 63 | }
 64 | 
 65 | func getDefaultPodExecutorImage() string {
 66 | 	tag := "main"
 67 | 	if info, ok := debug.ReadBuildInfo(); ok {
 68 | 		for _, setting := range info.Settings {
 69 | 			if setting.Key == "vcs.revision" {
 70 | 				tag = setting.Value
 71 | 				break
 72 | 			}
 73 | 		}
 74 | 	}
 75 | 	return "ghcr.io/azure/kdebug:" + tag
 76 | }
 77 | 
 78 | func processOptions(o *Options) {
 79 | 	// Run all checkers if not specified
 80 | 	if len(o.Checkers) == 0 {
 81 | 		o.Checkers = chks.ListAllCheckerNames()
 82 | 	}
 83 | 	if o.Batch.PodExecutorImage == "" {
 84 | 		o.Batch.PodExecutorImage = getDefaultPodExecutorImage()
 85 | 	}
 86 | }
 87 | 
 88 | func buildKubeClient(masterUrl, kubeConfigPath string) (*kubernetes.Clientset, *genericclioptions.ConfigFlags, error) {
 89 | 	// Try env
 90 | 	if kubeConfigPath == "" {
 91 | 		if path := os.Getenv("KUBECONFIG"); path != "" {
 92 | 			kubeConfigPath = path
 93 | 		}
 94 | 	}
 95 | 
 96 | 	// Try default path
 97 | 	if kubeConfigPath == "" {
 98 | 		if home := homedir.HomeDir(); home != "" {
 99 | 			kubeConfigPath = filepath.Join(home, ".kube", "config")
100 | 		}
101 | 	}
102 | 
103 | 	config, err := clientcmd.BuildConfigFromFlags(masterUrl, kubeConfigPath)
104 | 	if err != nil {
105 | 		return nil, nil, err
106 | 	}
107 | 	clientSet, err := kubernetes.NewForConfig(config)
108 | 	if err != nil {
109 | 		return nil, nil, err
110 | 	}
111 | 	kubeConfigFlag := genericclioptions.NewConfigFlags(false)
112 | 	kubeConfigFlag.APIServer = &masterUrl
113 | 	kubeConfigFlag.KubeConfig = &kubeConfigPath
114 | 
115 | 	return clientSet, kubeConfigFlag, nil
116 | }
117 | 
118 | func buildCheckContext(opts *Options) (*base.CheckContext, error) {
119 | 	ctx := &base.CheckContext{
120 | 		Environment: env.GetEnvironment(),
121 | 	}
122 | 
123 | 	log.WithFields(log.Fields{
124 | 		"env": ctx.Environment,
125 | 	}).Debug("Environment")
126 | 
127 | 	kubeClient, _, err := buildKubeClient(opts.KubeMasterUrl, opts.KubeConfigPath)
128 | 	if err == nil {
129 | 		ctx.KubeClient = kubeClient
130 | 	} else {
131 | 		log.WithFields(log.Fields{
132 | 			"error": err,
133 | 		}).Warn("Kubernetes related checkers will not work")
134 | 	}
135 | 
136 | 	return ctx, nil
137 | }
138 | 
139 | func buildToolContext(opts *Options) (*base.ToolContext, error) {
140 | 	// Add back help arg so tool can see it
141 | 	if opts.Help {
142 | 		opts.RemainingArgs = append(opts.RemainingArgs, "-h")
143 | 	}
144 | 	log.WithFields(log.Fields{"args": opts.RemainingArgs}).Debug("Tool context")
145 | 	ctx := &base.ToolContext{
146 | 		Args:        opts.RemainingArgs,
147 | 		Environment: env.GetEnvironment(),
148 | 	}
149 | 	if _, configFlags, err := buildKubeClient(opts.KubeMasterUrl, opts.KubeConfigPath); err == nil {
150 | 		ctx.KubeConfigFlag = configFlags
151 | 	}
152 | 	return ctx, nil
153 | }
154 | 
155 | func main() {
156 | 	// Process options
157 | 	var opts Options
158 | 	flagsParser := flags.NewParser(&opts, flags.PrintErrors|flags.PassDoubleDash|flags.IgnoreUnknown)
159 | 	remainingArgs, err := flagsParser.Parse()
160 | 	if err != nil {
161 | 		log.Fatal(err)
162 | 		return
163 | 	}
164 | 	opts.RemainingArgs = remainingArgs
165 | 
166 | 	processOptions(&opts)
167 | 
168 | 	if len(opts.Verbose) > 0 {
169 | 		if opts.Verbose == "none" {
170 | 			logrus.SetOutput(ioutil.Discard)
171 | 		} else {
172 | 			logLevel, err := logrus.ParseLevel(opts.Verbose)
173 | 			if err != nil {
174 | 				log.Fatal(err)
175 | 			}
176 | 			logrus.SetLevel(logLevel)
177 | 		}
178 | 	}
179 | 
180 | 	if !isatty.IsTerminal(os.Stdout.Fd()) || opts.NoColor || opts.Output != "" {
181 | 		color.NoColor = true
182 | 	}
183 | 
184 | 	if opts.ListCheckers {
185 | 		fmt.Print("checks: ")
186 | 		fmt.Println(chks.ListAllCheckerNames())
187 | 		fmt.Print("tools: ")
188 | 		fmt.Println(tools.ListAllToolNames())
189 | 		return
190 | 	}
191 | 
192 | 	if opts.Pause {
193 | 		pause()
194 | 		return
195 | 	}
196 | 
197 | 	var formatter formatters.Formatter
198 | 	if opts.Format == "json" {
199 | 		formatter = &formatters.JsonFormatter{}
200 | 	} else if opts.Format == "oneline" {
201 | 		formatter = &formatters.OneLineFormatter{}
202 | 	} else {
203 | 		formatter = &formatters.TextFormatter{}
204 | 	}
205 | 
206 | 	// Tool Mode
207 | 	if opts.IsToolMode() {
208 | 		ctx, err := buildToolContext(&opts)
209 | 		if err != nil {
210 | 			log.Fatal(err)
211 | 		}
212 | 
213 | 		err = tools.ParseArgs(ctx, opts.Tool, opts.RemainingArgs)
214 | 		if err != nil {
215 | 			if !flags.WroteHelp(err) {
216 | 				log.Fatal(err)
217 | 			}
218 | 			return
219 | 		}
220 | 
221 | 		err = tools.Run(ctx, opts.Tool)
222 | 		if err != nil {
223 | 			log.Fatal(err)
224 | 		}
225 | 		return
226 | 	}
227 | 
228 | 	if opts.Help {
229 | 		flagsParser.WriteHelp(os.Stdout)
230 | 		return
231 | 	}
232 | 
233 | 	// Prepare dependencies
234 | 	ctx, err := buildCheckContext(&opts)
235 | 	if err != nil {
236 | 		log.Fatal(err)
237 | 	}
238 | 
239 | 	ctx.Output = os.Stdout
240 | 	if opts.Output != "" {
241 | 		outFile, err := os.OpenFile(opts.Output, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
242 | 		if err != nil {
243 | 			log.Fatalf("Fail to open output file: %s", opts.Output)
244 | 		}
245 | 		defer outFile.Close()
246 | 		ctx.Output = outFile
247 | 	}
248 | 
249 | 	// Batch mode
250 | 	if opts.IsBatchMode() {
251 | 		runBatch(&opts, ctx, formatter)
252 | 		return
253 | 	}
254 | 
255 | 	// Check
256 | 	results, err := chks.Check(ctx, opts.Checkers)
257 | 	if err != nil {
258 | 		log.Fatal(err)
259 | 	}
260 | 
261 | 	// Output
262 | 	err = formatter.WriteResults(ctx.Output, results)
263 | 	if err != nil {
264 | 		log.Fatal(err)
265 | 	}
266 | 
267 | 	if !opts.NoSetExitCode {
268 | 		for _, r := range results {
269 | 			if !r.Ok() {
270 | 				os.Exit(1)
271 | 			}
272 | 		}
273 | 	}
274 | }
275 | 


--------------------------------------------------------------------------------
/pkg/checkers/kube/pod/pod_restart_reason_checker.go:
--------------------------------------------------------------------------------
  1 | package pod
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"context"
  6 | 	"fmt"
  7 | 	"io"
  8 | 	"strings"
  9 | 	"text/tabwriter"
 10 | 	"time"
 11 | 
 12 | 	"github.com/Azure/kdebug/pkg/base"
 13 | 	log "github.com/sirupsen/logrus"
 14 | 
 15 | 	corev1 "k8s.io/api/core/v1"
 16 | 	v1 "k8s.io/api/core/v1"
 17 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 18 | 	"k8s.io/apimachinery/pkg/runtime"
 19 | 	"k8s.io/apimachinery/pkg/types"
 20 | 	"k8s.io/apimachinery/pkg/util/duration"
 21 | 	runtimeresource "k8s.io/cli-runtime/pkg/resource"
 22 | 	corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
 23 | 	"k8s.io/client-go/tools/reference"
 24 | 	"k8s.io/kubectl/pkg/cmd/util"
 25 | 	"k8s.io/kubectl/pkg/describe"
 26 | 	"k8s.io/kubectl/pkg/scheme"
 27 | 	"k8s.io/kubectl/pkg/util/qos"
 28 | )
 29 | 
 30 | const levelSpace = "  "
 31 | 
 32 | type KubePodRestartReasonChecker struct {
 33 | }
 34 | 
 35 | func New() *KubePodRestartReasonChecker {
 36 | 	return &KubePodRestartReasonChecker{}
 37 | }
 38 | 
 39 | func (c *KubePodRestartReasonChecker) Name() string {
 40 | 	return "KubePodRestartReason"
 41 | }
 42 | 
 43 | // Check borrows many logic and helper functions from src/k8s.io/kubectl/pkg/describe to check Pod status and events.
 44 | func (c *KubePodRestartReasonChecker) Check(ctx *base.CheckContext) ([]*base.CheckResult, error) {
 45 | 	if ctx.KubeClient == nil {
 46 | 		log.Warn("Skip KubePodRestartReasonChecker due to missing kube client")
 47 | 		return nil, nil
 48 | 	}
 49 | 
 50 | 	pods, err := ctx.KubeClient.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{})
 51 | 	if err != nil {
 52 | 		log.WithFields(log.Fields{"error": err}).Warn("Fail to list pods")
 53 | 		return nil, err
 54 | 	}
 55 | 
 56 | 	results := []*base.CheckResult{}
 57 | 	for _, pod := range pods.Items {
 58 | 		var crashing = false
 59 | 		for _, containerStatus := range pod.Status.ContainerStatuses {
 60 | 			if containerStatus.State.Waiting != nil && containerStatus.State.Waiting.Reason == "CrashLoopBackOff" {
 61 | 				crashing = true
 62 | 				break
 63 | 			}
 64 | 		}
 65 | 
 66 | 		if crashing {
 67 | 			result := c.checkPod(ctx, &pod)
 68 | 			if result != nil {
 69 | 				results = append(results, result)
 70 | 			}
 71 | 		}
 72 | 	}
 73 | 
 74 | 	return results, nil
 75 | }
 76 | 
 77 | func (c *KubePodRestartReasonChecker) checkPod(ctx *base.CheckContext, pod *v1.Pod) *base.CheckResult {
 78 | 	var events *corev1.EventList
 79 | 	ref, err := reference.GetReference(scheme.Scheme, pod)
 80 | 	if err != nil {
 81 | 		log.WithFields(log.Fields{"pod": pod, "error": err}).Warn("Unable to construct reference")
 82 | 		return nil
 83 | 	}
 84 | 
 85 | 	ref.Kind = ""
 86 | 	if _, isMirrorPod := pod.Annotations[corev1.MirrorPodAnnotationKey]; isMirrorPod {
 87 | 		ref.UID = types.UID(pod.Annotations[corev1.MirrorPodAnnotationKey])
 88 | 	}
 89 | 	events, _ = searchEvents(ctx.KubeClient.CoreV1(), ref, util.DefaultChunkSize)
 90 | 	text, _ := describePodStatus(pod, events)
 91 | 	logs := strings.Split(text, "\n")
 92 | 
 93 | 	for i := range logs {
 94 | 		logs[i] = levelSpace + logs[i]
 95 | 	}
 96 | 
 97 | 	return &base.CheckResult{
 98 | 		Checker:     c.Name(),
 99 | 		Error:       fmt.Sprintf("one or more containers of %s/%s are failing and restarting repeatedly.", pod.Namespace, pod.Name),
100 | 		Description: fmt.Sprintf("%s/%s is not running well.", pod.Namespace, pod.Name),
101 | 		Logs:        logs,
102 | 	}
103 | }
104 | 
105 | func describePodStatus(pod *corev1.Pod, events *corev1.EventList) (string, error) {
106 | 	return tabbedString(func(out io.Writer) error {
107 | 		w := describe.NewPrefixWriter(out)
108 | 		w.Write(describe.LEVEL_0, "Name:\t%s\n", pod.Name)
109 | 		w.Write(describe.LEVEL_0, "Namespace:\t%s\n", pod.Namespace)
110 | 		if pod.Status.StartTime != nil {
111 | 			w.Write(describe.LEVEL_0, "Start Time:\t%s\n", pod.Status.StartTime.Time.Format(time.RFC1123Z))
112 | 		}
113 | 		if pod.DeletionTimestamp != nil {
114 | 			w.Write(describe.LEVEL_0, "Status:\tTerminating (lasts %s)\n", translateTimestampSince(*pod.DeletionTimestamp))
115 | 			w.Write(describe.LEVEL_0, "Termination Grace Period:\t%ds\n", *pod.DeletionGracePeriodSeconds)
116 | 		} else {
117 | 			w.Write(describe.LEVEL_0, "Status:\t%s\n", string(pod.Status.Phase))
118 | 		}
119 | 		if len(pod.Status.Reason) > 0 {
120 | 			w.Write(describe.LEVEL_0, "Reason:\t%s\n", pod.Status.Reason)
121 | 		}
122 | 		if len(pod.Status.Message) > 0 {
123 | 			w.Write(describe.LEVEL_0, "Message:\t%s\n", pod.Status.Message)
124 | 		}
125 | 		describeContainers("Containers", pod.Spec.Containers, pod.Status.ContainerStatuses, describe.EnvValueRetriever(pod), w, "")
126 | 		if len(pod.Status.Conditions) > 0 {
127 | 			w.Write(describe.LEVEL_0, "Conditions:\n  Type\tStatus\n")
128 | 			for _, c := range pod.Status.Conditions {
129 | 				w.Write(describe.LEVEL_1, "%v \t%v \n",
130 | 					c.Type,
131 | 					c.Status)
132 | 			}
133 | 		}
134 | 		if pod.Status.QOSClass != "" {
135 | 			w.Write(describe.LEVEL_0, "QoS Class:\t%s\n", pod.Status.QOSClass)
136 | 		} else {
137 | 			w.Write(describe.LEVEL_0, "QoS Class:\t%s\n", qos.GetPodQOS(pod))
138 | 		}
139 | 		if events != nil {
140 | 			describe.DescribeEvents(events, w)
141 | 		}
142 | 		return nil
143 | 	})
144 | }
145 | 
// tabbedString runs f against a tabwriter and returns the aligned text it
// produced. Columns are separated by tabs in the input and padded with at
// least two spaces in the output.
//
// If f returns an error, or flushing the tabwriter fails, the empty string and
// that error are returned.
func tabbedString(f func(io.Writer) error) (string, error) {
	var buf bytes.Buffer
	out := tabwriter.NewWriter(&buf, 0, 8, 2, ' ', 0)

	if err := f(out); err != nil {
		return "", err
	}

	// The tabwriter buffers cells until flushed; nothing reaches buf before.
	if err := out.Flush(); err != nil {
		return "", err
	}
	return buf.String(), nil
}
160 | 
161 | func searchEvents(client corev1client.EventsGetter, objOrRef runtime.Object, limit int64) (*corev1.EventList, error) {
162 | 	ref, err := reference.GetReference(scheme.Scheme, objOrRef)
163 | 	if err != nil {
164 | 		return nil, err
165 | 	}
166 | 	stringRefKind := string(ref.Kind)
167 | 	var refKind *string
168 | 	if len(stringRefKind) > 0 {
169 | 		refKind = &stringRefKind
170 | 	}
171 | 	stringRefUID := string(ref.UID)
172 | 	var refUID *string
173 | 	if len(stringRefUID) > 0 {
174 | 		refUID = &stringRefUID
175 | 	}
176 | 
177 | 	e := client.Events(ref.Namespace)
178 | 	fieldSelector := e.GetFieldSelector(&ref.Name, &ref.Namespace, refKind, refUID)
179 | 	initialOpts := metav1.ListOptions{FieldSelector: fieldSelector.String(), Limit: limit}
180 | 	eventList := &corev1.EventList{}
181 | 	err = runtimeresource.FollowContinue(&initialOpts,
182 | 		func(options metav1.ListOptions) (runtime.Object, error) {
183 | 			newEvents, err := e.List(context.TODO(), options)
184 | 			if err != nil {
185 | 				return nil, runtimeresource.EnhanceListError(err, options, "events")
186 | 			}
187 | 			eventList.Items = append(eventList.Items, newEvents.Items...)
188 | 			return newEvents, nil
189 | 		})
190 | 	return eventList, err
191 | }
192 | 
193 | func describeContainers(label string, containers []corev1.Container, containerStatuses []corev1.ContainerStatus,
194 | 	resolverFn describe.EnvVarResolverFunc, w describe.PrefixWriter, space string) {
195 | 	statuses := map[string]corev1.ContainerStatus{}
196 | 	for _, status := range containerStatuses {
197 | 		statuses[status.Name] = status
198 | 	}
199 | 
200 | 	for _, container := range containers {
201 | 		status, ok := statuses[container.Name]
202 | 		describeContainerBasicInfo(container, status, ok, space, w)
203 | 		if ok {
204 | 			describeContainerState(status, w)
205 | 		}
206 | 	}
207 | }
208 | 
209 | func describeContainerBasicInfo(container corev1.Container, status corev1.ContainerStatus, ok bool, space string, w describe.PrefixWriter) {
210 | 	nameIndent := ""
211 | 	if len(space) > 0 {
212 | 		nameIndent = " "
213 | 	}
214 | 	w.Write(describe.LEVEL_1, "%s%v:\n", nameIndent, container.Name)
215 | 	if ok {
216 | 		w.Write(describe.LEVEL_2, "Container ID:\t%s\n", status.ContainerID)
217 | 	}
218 | 	w.Write(describe.LEVEL_2, "Image:\t%s\n", container.Image)
219 | 	if ok {
220 | 		w.Write(describe.LEVEL_2, "Image ID:\t%s\n", status.ImageID)
221 | 	}
222 | }
223 | 
224 | func describeContainerState(status corev1.ContainerStatus, w describe.PrefixWriter) {
225 | 	describeStatus("State", status.State, w)
226 | 	if status.LastTerminationState.Terminated != nil {
227 | 		describeStatus("Last State", status.LastTerminationState, w)
228 | 	}
229 | 	w.Write(describe.LEVEL_2, "Ready:\t%v\n", printBool(status.Ready))
230 | 	w.Write(describe.LEVEL_2, "Restart Count:\t%d\n", status.RestartCount)
231 | }
232 | 
233 | func describeStatus(stateName string, state corev1.ContainerState, w describe.PrefixWriter) {
234 | 	switch {
235 | 	case state.Running != nil:
236 | 		w.Write(describe.LEVEL_2, "%s:\tRunning\n", stateName)
237 | 		w.Write(describe.LEVEL_3, "Started:\t%v\n", state.Running.StartedAt.Time.Format(time.RFC1123Z))
238 | 	case state.Waiting != nil:
239 | 		w.Write(describe.LEVEL_2, "%s:\tWaiting\n", stateName)
240 | 		if state.Waiting.Reason != "" {
241 | 			w.Write(describe.LEVEL_3, "Reason:\t%s\n", state.Waiting.Reason)
242 | 		}
243 | 	case state.Terminated != nil:
244 | 		w.Write(describe.LEVEL_2, "%s:\tTerminated\n", stateName)
245 | 		if state.Terminated.Reason != "" {
246 | 			w.Write(describe.LEVEL_3, "Reason:\t%s\n", state.Terminated.Reason)
247 | 		}
248 | 		if state.Terminated.Message != "" {
249 | 			w.Write(describe.LEVEL_3, "Message:\t%s\n", state.Terminated.Message)
250 | 		}
251 | 		w.Write(describe.LEVEL_3, "Exit Code:\t%d\n", state.Terminated.ExitCode)
252 | 		if state.Terminated.Signal > 0 {
253 | 			w.Write(describe.LEVEL_3, "Signal:\t%d\n", state.Terminated.Signal)
254 | 		}
255 | 		w.Write(describe.LEVEL_3, "Started:\t%s\n", state.Terminated.StartedAt.Time.Format(time.RFC1123Z))
256 | 		w.Write(describe.LEVEL_3, "Finished:\t%s\n", state.Terminated.FinishedAt.Time.Format(time.RFC1123Z))
257 | 	default:
258 | 		w.Write(describe.LEVEL_2, "%s:\tWaiting\n", stateName)
259 | 	}
260 | }
261 | 
262 | func translateTimestampSince(timestamp metav1.Time) string {
263 | 	if timestamp.IsZero() {
264 | 		return "<unknown>"
265 | 	}
266 | 
267 | 	return duration.HumanDuration(time.Since(timestamp.Time))
268 | }
269 | 
// printBool renders value as the capitalized word "True" or "False".
func printBool(value bool) string {
	if !value {
		return "False"
	}
	return "True"
}
277 | 


--------------------------------------------------------------------------------