├── .github ├── pull_request_template.md └── workflows │ └── ci.yml ├── .gitignore ├── .golangci.yml ├── LICENSE ├── Makefile ├── OWNERS ├── README.md ├── component ├── conprof │ ├── README.md │ ├── contprof.go │ ├── http │ │ ├── api.go │ │ ├── api_test.go │ │ └── svg.go │ ├── jeprof │ │ ├── jeprof.go │ │ └── jeprof.in │ ├── meta │ │ ├── meta.go │ │ └── meta_test.go │ ├── scrape │ │ ├── manager.go │ │ ├── manager_test.go │ │ ├── scrape.go │ │ ├── ticker.go │ │ └── ticker_test.go │ └── store │ │ ├── gc.go │ │ ├── store.go │ │ └── store_test.go ├── domain │ ├── client.go │ ├── domain.go │ └── domain_test.go ├── subscriber │ ├── main_test.go │ ├── manager.go │ ├── mock_sub_controller_test.go │ ├── model │ │ └── model.go │ ├── sub_controller.go │ ├── subscriber.go │ └── subscriber_test.go ├── topology │ ├── discovery.go │ ├── syncer.go │ ├── topology.go │ └── topology_test.go └── topsql │ ├── codec │ ├── plan │ │ ├── plan.go │ │ ├── plan_test.go │ │ └── testdata │ │ │ ├── big_decoded_plan.txt │ │ │ └── big_encoded_plan.txt │ └── resource_group_tag │ │ └── resource_group_tag.go │ ├── mock │ ├── mem_store.go │ └── pubsub.go │ ├── query │ ├── default_query.go │ ├── default_query_test.go │ ├── model.go │ ├── pools.go │ └── query.go │ ├── service │ ├── http.go │ └── pools.go │ ├── store │ ├── default_store.go │ ├── model.go │ ├── pools.go │ └── store.go │ ├── subscriber │ ├── main_test.go │ ├── pools.go │ ├── scraper.go │ ├── scraper_test.go │ ├── subscriber.go │ └── subscriber_test.go │ ├── topsql.go │ └── topsql_test.go ├── config ├── config.go ├── config.toml.example ├── config_test.go ├── pdvariable │ ├── pdvariable.go │ └── pdvariable_test.go ├── persist.go ├── service.go └── service_test.go ├── database ├── database.go ├── docdb │ ├── docdb.go │ ├── docdb_test.go │ ├── genji.go │ ├── genji_logger.go │ ├── genji_test.go │ ├── sqlite.go │ └── sqlite_test.go └── timeseries │ ├── handler.go │ ├── syscall_linux.go │ ├── syscall_not_linux_unix.go │ └── vm.go ├── go.mod ├── go.sum ├── main.go ├── service ├── http │ └── http.go └── service.go ├── tests ├── mock.go └── topsql_test.go └── utils ├── limiter.go ├── misc.go ├── pools.go ├── printer ├── fips.go └── printer.go ├── resp_writer.go ├── retry.go ├── retry_test.go └── testutil └── util.go /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | ### What problem does this PR solve? 6 | 14 | 15 | Issue Number: close #xxx 16 | 17 | ### What is changed and how it works? 
18 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - 'release-**' 8 | pull_request: 9 | branches: 10 | - main 11 | - 'release-**' 12 | 13 | jobs: 14 | ci: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v3 18 | - uses: actions/setup-go@v3 19 | with: 20 | go-version: '1.24.2' 21 | - name: Format 22 | run: make fmt 23 | - name: Lint 24 | run: make lint 25 | - name: Test 26 | run: make test 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | data 3 | bin 4 | log 5 | *.log 6 | coverage.txt 7 | tools/bin/ 8 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | run: 2 | timeout: 10m 3 | linters: 4 | disable-all: true 5 | enable: 6 | - misspell 7 | - ineffassign 8 | - typecheck 9 | # - varcheck 10 | - unused 11 | # - structcheck 12 | # - deadcode 13 | - gosimple 14 | - goimports 15 | - errcheck 16 | - staticcheck 17 | - stylecheck 18 | - gosec 19 | - asciicheck 20 | - bodyclose 21 | # - exportloopref 22 | # - copyloopvar 23 | - rowserrcheck 24 | - unconvert 25 | - makezero 26 | - durationcheck 27 | - prealloc 28 | 29 | linters-settings: 30 | staticcheck: 31 | checks: ["S1002","S1004","S1007","S1009","S1010","S1012","S1019","S1020","S1021","S1024","S1030","SA2*","SA3*","SA4009","SA5*","SA6000","SA6001","SA6005", "-SA2002"] 32 | stylecheck: 33 | checks: ["-ST1003"] 34 | issues: 35 | exclude-rules: 36 | - path: _test\.go 37 | linters: 38 | - errcheck 39 | - gosec 40 | - rowserrcheck 41 | - makezero 42 | - linters: 43 | - gosec 44 | text: "G115:" 45 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PACKAGE_LIST := go list ./...| grep -E "github.com/pingcap/ng-monitoring/" 2 | PACKAGE_LIST_TESTS := go list ./... 
| grep -E "github.com/pingcap/ng-monitoring/" 3 | PACKAGES ?= $$($(PACKAGE_LIST)) 4 | PACKAGES_TESTS ?= $$($(PACKAGE_LIST_TESTS)) 5 | PACKAGE_DIRECTORIES := $(PACKAGE_LIST) | sed 's|github.com/pingcap/ng-monitoring/||' 6 | FILES := $$(find $$($(PACKAGE_DIRECTORIES)) -name "*.go") 7 | FAIL_ON_STDOUT := awk '{ print } END { if (NR > 0) { exit 1 } }' 8 | BUILD_TS := $(shell date -u '+%Y-%m-%d %H:%M:%S') 9 | GIT_HASH := $(shell git rev-parse HEAD) 10 | GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD) 11 | 12 | BUILD_GOEXPERIMENT ?= 13 | BUILD_CGO_ENABLED ?= 14 | BUILD_TAGS ?= 15 | ifeq ("${ENABLE_FIPS}", "1") 16 | GIT_HASH := $(GIT_HASH) (with fips) 17 | GIT_BRANCH := $(GIT_BRANCH) (with fips) 18 | BUILD_TAGS += boringcrypto 19 | BUILD_GOEXPERIMENT = GOEXPERIMENT=boringcrypto 20 | BUILD_CGO_ENABLED = CGO_ENABLED=1 21 | endif 22 | 23 | LDFLAGS += -X "github.com/pingcap/ng-monitoring/utils/printer.NGMBuildTS=$(BUILD_TS)" 24 | LDFLAGS += -X "github.com/pingcap/ng-monitoring/utils/printer.NGMGitHash=$(GIT_HASH)" 25 | LDFLAGS += -X "github.com/pingcap/ng-monitoring/utils/printer.NGMGitBranch=$(GIT_BRANCH)" 26 | 27 | GO := $(BUILD_GOEXPERIMENT) $(BUILD_CGO_ENABLED) GO111MODULE=on go 28 | GOBUILD := $(GO) build 29 | GOTEST := $(GO) test -p 8 30 | 31 | 32 | 33 | default: 34 | $(GOBUILD) -ldflags '$(LDFLAGS)' -tags '${BUILD_TAGS}' -o bin/ng-monitoring-server ./main.go 35 | @echo Build successfully! 36 | 37 | fmt: 38 | @echo "gofmt (simplify)" 39 | @gofmt -s -l -w . 2>&1 | $(FAIL_ON_STDOUT) 40 | @gofmt -s -l -w $(FILES) 2>&1 | $(FAIL_ON_STDOUT) 41 | 42 | test: 43 | @echo "Running test" 44 | @export log_level=info; export TZ='Asia/Shanghai'; \ 45 | $(GOTEST) -cover $(PACKAGES_TESTS) -coverprofile=coverage.txt 46 | 47 | lint: tools/bin/golangci-lint 48 | GO111MODULE=on tools/bin/golangci-lint run -v $$($(PACKAGE_DIRECTORIES)) --config .golangci.yml 49 | 50 | tools/bin/golangci-lint: 51 | curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh| sh -s -- -b ./tools/bin v1.64.5 52 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | # See the OWNERS docs at https://go.k8s.io/owners 2 | approvers: 3 | - breezewish 4 | - c4pt0r 5 | - crazycs520 6 | - iamxy 7 | - mornyx 8 | - ngaut 9 | - qiuyesuifeng 10 | - shenli 11 | - siddontang 12 | - ucjmh 13 | - zhangyangyu 14 | - zhongzc 15 | - zhangpeijin-milo 16 | - z2665 17 | - just1900 18 | reviewers: 19 | - breezewish 20 | - c4pt0r 21 | - crazycs520 22 | - iamxy 23 | - mornyx 24 | - ngaut 25 | - qiuyesuifeng 26 | - shenli 27 | - siddontang 28 | - ucjmh 29 | - zhangyangyu 30 | - zhongzc 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Next Generation Monitoring Server 2 | 3 | ## Build 4 | 5 | ```shell 6 | make 7 | ``` 8 | 9 | ## Arguments 10 | 11 | ```shell 12 | $ bin/ng-monitoring-server --help 13 | Usage of bin/ng-monitoring-server: 14 | --address string TCP address to listen for http connections 15 | --advertise-address string tidb server advertise IP 16 | --config string config file path 17 | --log.path string Log path of ng monitoring server 18 | --pd.endpoints strings Addresses of PD instances within the TiDB cluster. Multiple addresses are separated by commas, e.g. 
--pd.endpoints 10.0.0.1:2379,10.0.0.2:2379 19 | --retention-period string Data with timestamps outside the retentionPeriod is automatically deleted 20 | The following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default "1") 21 | --storage.path string Storage path of ng monitoring server 22 | pflag: help requested 23 | ``` 24 | 25 | ## Config Example 26 | 27 | ```shell 28 | $ cat config/config.toml.example 29 | # NG Monitoring Server Configuration. 30 | 31 | # Server address. 32 | address = "0.0.0.0:12020" 33 | 34 | advertise-address = "0.0.0.0:12020" 35 | 36 | [log] 37 | # Log path 38 | path = "log" 39 | 40 | # Log level: DEBUG, INFO, WARN, ERROR 41 | level = "INFO" 42 | 43 | [pd] 44 | # Addresses of PD instances within the TiDB cluster. Multiple addresses are separated by commas, e.g. ["10.0.0.1:2379","10.0.0.2:2379"] 45 | endpoints = ["0.0.0.0:2379"] 46 | 47 | [storage] 48 | # Storage path of ng monitoring server 49 | path = "data" 50 | 51 | [security] 52 | ca-path = "" 53 | cert-path = "" 54 | key-path = "" 55 | ``` 56 | 57 | ## Reload Config 58 | 59 | ```shell 60 | $ bin/ng-monitoring-server --config config/config.toml.example 61 | 62 | # Another shell session 63 | $ pkill -SIGHUP ng-monitoring-server 64 | ``` 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /component/conprof/README.md: -------------------------------------------------------------------------------- 1 | # HTTP API 2 | 3 | ```shell 4 | # get current config 5 | curl http://0.0.0.0:12020/config 6 | 7 | # modify config 8 | curl -X POST -d '{"continuous_profiling": {"enable": false,"profile_seconds":6,"interval_seconds":11}}' http://0.0.0.0:12020/config 9 | 10 | # estimate size profile data size 11 | curl http://0.0.0.0:12020/continuous_profiling/estimate_size\?days\=3 12 | 13 | # query group profiles 14 | 15 | curl "http://0.0.0.0:12020/continuous_profiling/group_profiles?begin_time=1634836900&end_time=1654836910" 16 | [ 17 | { 18 | "ts": 1634836900, 19 | "profile_duration_secs": 5, 20 | "state": "success", 21 | "component_num": { 22 | "tidb": 1, 23 | "pd": 1, 24 | "tikv": 1, 25 | "tiflash": 0 26 | } 27 | }, 28 | { 29 | "ts": 1634836910, 30 | "profile_duration_secs": 5, 31 | "state": "success", 32 | "component_num": { 33 | "tidb": 1, 34 | "pd": 1, 35 | "tikv": 1, 36 | "tiflash": 0 37 | } 38 | } 39 | ] 40 | 41 | # query group profile detail. 
42 | curl "http://0.0.0.0:12020/continuous_profiling/group_profile/detail?ts=1634836910" 43 | { 44 | "ts": 1634836910, 45 | "profile_duration_secs": 5, 46 | "state": "success", 47 | "target_profiles": [ 48 | { 49 | "state": "success", 50 | "error": "", 51 | "profile_type": "profile", 52 | "target": { 53 | "component": "tikv", 54 | "address": "10.0.1.21:20180" 55 | } 56 | }, 57 | { 58 | "state": "success", 59 | "error": "", 60 | "profile_type": "profile", 61 | "target": { 62 | "component": "pd", 63 | "address": "10.0.1.21:2379" 64 | } 65 | }, 66 | { 67 | "state": "success", 68 | "error": "", 69 | "profile_type": "mutex", 70 | "target": { 71 | "component": "tidb", 72 | "address": "10.0.1.21:10080" 73 | } 74 | } 75 | ] 76 | } 77 | 78 | # view single profile data 79 | curl "http://0.0.0.0:12020/continuous_profiling/single_profile/view?ts=1634836910&profile_type=profile&component=tidb&address=10.0.1.21:10080" > profile 80 | 81 | # view single profile data and specify data type 82 | curl "http://0.0.0.0:12020/continuous_profiling/single_profile/view?ts=1635480630&profile_type=profile&component=tidb&address=10.0.1.21:10080&data_format=protobuf" > profile 83 | 84 | # Download profile 85 | curl "http://0.0.0.0:12020/continuous_profiling/download?ts=1634836910" > d.zip 86 | 87 | # Download profile data and specify data type 88 | curl "http://0.0.0.0:12020/continuous_profiling/download?ts=1635480630&data_format=protobuf" > d.zip 89 | 90 | # Download single profile data by specify target 91 | curl "http://0.0.0.0:12020/continuous_profiling/download?ts=1639733820&profile_type=profile&component=pd&address=10.0.1.11:21379&data_format=protobuf" > profile.zip 92 | ``` 93 | -------------------------------------------------------------------------------- /component/conprof/contprof.go: -------------------------------------------------------------------------------- 1 | package conprof 2 | 3 | import ( 4 | "github.com/pingcap/ng-monitoring/component/conprof/scrape" 5 | "github.com/pingcap/ng-monitoring/component/conprof/store" 6 | "github.com/pingcap/ng-monitoring/component/topology" 7 | "github.com/pingcap/ng-monitoring/database/docdb" 8 | ) 9 | 10 | var ( 11 | storage *store.ProfileStorage 12 | manager *scrape.Manager 13 | ) 14 | 15 | func Init(db docdb.DocDB, subscriber topology.Subscriber) error { 16 | var err error 17 | storage, err = store.NewProfileStorage(db) 18 | if err != nil { 19 | return err 20 | } 21 | manager = scrape.NewManager(storage, subscriber) 22 | manager.Start() 23 | return nil 24 | } 25 | 26 | func Stop() { 27 | manager.Close() 28 | } 29 | 30 | func GetStorage() *store.ProfileStorage { 31 | return storage 32 | } 33 | 34 | func GetManager() *scrape.Manager { 35 | return manager 36 | } 37 | -------------------------------------------------------------------------------- /component/conprof/http/svg.go: -------------------------------------------------------------------------------- 1 | package http 2 | 3 | import ( 4 | "bytes" 5 | "flag" 6 | "io" 7 | "strconv" 8 | "time" 9 | 10 | graphviz "github.com/goccy/go-graphviz" 11 | "github.com/google/pprof/driver" 12 | "github.com/google/pprof/profile" 13 | ) 14 | 15 | func ConvertToSVG(protoData []byte) ([]byte, error) { 16 | p, err := profile.ParseData(protoData) 17 | if err != nil { 18 | return nil, err 19 | } 20 | 21 | dotData, err := convertToDot(p) 22 | if err != nil { 23 | return nil, err 24 | } 25 | 26 | g := graphviz.New() 27 | graph, err := graphviz.ParseBytes(dotData) 28 | if err != nil { 29 | return nil, err 30 | } 31 | 32 | buf := 
bytes.NewBuffer(nil) 33 | err = g.Render(graph, graphviz.SVG, buf) 34 | if err != nil { 35 | return nil, err 36 | } 37 | return buf.Bytes(), nil 38 | } 39 | 40 | func convertToDot(p *profile.Profile) ([]byte, error) { 41 | args := []string{ 42 | "-dot", 43 | // prevent printing stdout 44 | "-output", "dummy", 45 | "-seconds", strconv.Itoa(int(30)), 46 | } 47 | // mock address 48 | args = append(args, "127.0.0.1:10080") 49 | f := &flagSet{ 50 | FlagSet: flag.NewFlagSet("pprof", flag.PanicOnError), 51 | args: args, 52 | } 53 | 54 | bufw := &bufWriteCloser{Buffer: bytes.NewBuffer(nil)} 55 | err := driver.PProf(&driver.Options{ 56 | Fetch: &localProfileFetcher{p: p}, 57 | Flagset: f, 58 | UI: &blankPprofUI{}, 59 | Writer: bufw, 60 | }) 61 | return bufw.Bytes(), err 62 | } 63 | 64 | type bufWriteCloser struct { 65 | *bytes.Buffer 66 | } 67 | 68 | func (o *bufWriteCloser) Open(_ string) (io.WriteCloser, error) { 69 | return o, nil 70 | } 71 | 72 | func (o *bufWriteCloser) Close() error { 73 | return nil 74 | } 75 | 76 | type localProfileFetcher struct { 77 | p *profile.Profile 78 | } 79 | 80 | func (s *localProfileFetcher) Fetch(src string, duration, timeout time.Duration) (*profile.Profile, string, error) { 81 | return s.p, "", nil 82 | } 83 | 84 | type flagSet struct { 85 | *flag.FlagSet 86 | args []string 87 | } 88 | 89 | func (f *flagSet) StringList(o, d, c string) *[]*string { 90 | return &[]*string{f.String(o, d, c)} 91 | } 92 | 93 | func (f *flagSet) ExtraUsage() string { 94 | return "" 95 | } 96 | 97 | func (f *flagSet) Parse(usage func()) []string { 98 | f.Usage = usage 99 | _ = f.FlagSet.Parse(f.args) 100 | return f.Args() 101 | } 102 | 103 | func (f *flagSet) AddExtraUsage(eu string) {} 104 | 105 | // blankPprofUI is used to eliminate the pprof logs 106 | type blankPprofUI struct { 107 | } 108 | 109 | func (b blankPprofUI) ReadLine(prompt string) (string, error) { 110 | panic("not support") 111 | } 112 | 113 | func (b blankPprofUI) Print(i ...interface{}) { 114 | } 115 | 116 | func (b blankPprofUI) PrintErr(i ...interface{}) { 117 | } 118 | 119 | func (b blankPprofUI) IsTerminal() bool { 120 | return false 121 | } 122 | 123 | func (b blankPprofUI) WantBrowser() bool { 124 | return false 125 | } 126 | 127 | func (b blankPprofUI) SetAutoComplete(complete func(string) string) { 128 | } 129 | -------------------------------------------------------------------------------- /component/conprof/jeprof/jeprof.go: -------------------------------------------------------------------------------- 1 | package jeprof 2 | 3 | import ( 4 | "bytes" 5 | _ "embed" 6 | "fmt" 7 | "io" 8 | "os" 9 | "os/exec" 10 | "strings" 11 | 12 | graphviz "github.com/goccy/go-graphviz" 13 | "github.com/prometheus/common/config" 14 | ) 15 | 16 | //go:embed jeprof.in 17 | var jeprof string 18 | 19 | func FetchRaw(url string, cfg config.HTTPClientConfig) ([]byte, error) { 20 | cmd := exec.Command("perl", "/dev/stdin", "--raw", url) //nolint:gosec 21 | cmd.Stdin = strings.NewReader(jeprof) 22 | if len(cfg.TLSConfig.CertFile) != 0 && len(cfg.TLSConfig.KeyFile) != 0 { 23 | cmd.Env = append(os.Environ(), fmt.Sprintf( 24 | "URL_FETCHER=curl -s --cert %s --key %s --cacert %s", 25 | cfg.TLSConfig.CertFile, 26 | cfg.TLSConfig.KeyFile, 27 | cfg.TLSConfig.CAFile, 28 | )) 29 | } 30 | stdout, err := cmd.StdoutPipe() 31 | if err != nil { 32 | return nil, err 33 | } 34 | stderr, err := cmd.StderrPipe() 35 | if err != nil { 36 | return nil, err 37 | } 38 | err = cmd.Start() 39 | if err != nil { 40 | return nil, err 41 | } 42 | data, err 
:= io.ReadAll(stdout) 43 | if err != nil { 44 | return nil, err 45 | } 46 | errMsg, err := io.ReadAll(stderr) 47 | if err != nil { 48 | return nil, err 49 | } 50 | err = cmd.Wait() 51 | if err != nil { 52 | return nil, fmt.Errorf("failed to fetch tikv heap profile: %s", errMsg) 53 | } 54 | return data, nil 55 | } 56 | 57 | func ConvertToSVG(data []byte) ([]byte, error) { 58 | f, err := os.CreateTemp("", "prof") 59 | if err != nil { 60 | return nil, err 61 | } 62 | defer os.Remove(f.Name()) 63 | _, err = f.Write(data) 64 | if err != nil { 65 | return nil, err 66 | } 67 | 68 | cmd := exec.Command("perl", "/dev/stdin", "--dot", f.Name()) //nolint:gosec 69 | cmd.Stdin = strings.NewReader(jeprof) 70 | dotContent, err := cmd.Output() 71 | if err != nil { 72 | return nil, err 73 | } 74 | svgContent, err := convertDotToSVG(dotContent) 75 | if err != nil { 76 | return nil, err 77 | } 78 | return svgContent, nil 79 | } 80 | 81 | func convertDotToSVG(dotData []byte) ([]byte, error) { 82 | g := graphviz.New() 83 | graph, err := graphviz.ParseBytes(dotData) 84 | if err != nil { 85 | return nil, err 86 | } 87 | 88 | buf := bytes.NewBuffer(nil) 89 | err = g.Render(graph, graphviz.SVG, buf) 90 | if err != nil { 91 | return nil, err 92 | } 93 | return buf.Bytes(), nil 94 | } 95 | 96 | func ConvertToText(data []byte) ([]byte, error) { 97 | f, err := os.CreateTemp("", "prof") 98 | if err != nil { 99 | return nil, err 100 | } 101 | defer os.Remove(f.Name()) 102 | _, err = f.Write(data) 103 | if err != nil { 104 | return nil, err 105 | } 106 | 107 | // Brendan Gregg's collapsed stack format 108 | cmd := exec.Command("perl", "/dev/stdin", "--collapsed", f.Name()) //nolint:gosec 109 | cmd.Stdin = strings.NewReader(jeprof) 110 | textContent, err := cmd.Output() 111 | if err != nil { 112 | return nil, err 113 | } 114 | return textContent, nil 115 | } 116 | -------------------------------------------------------------------------------- /component/conprof/meta/meta.go: -------------------------------------------------------------------------------- 1 | package meta 2 | 3 | const ( 4 | ProfileKindProfile = "profile" 5 | ProfileKindGoroutine = "goroutine" 6 | ProfileKindHeap = "heap" 7 | ProfileKindMutex = "mutex" 8 | ProfileDataFormatSVG = "svg" 9 | ProfileDataFormatText = "text" 10 | ProfileDataFormatProtobuf = "protobuf" 11 | ProfileDataFormatJeprof = "jeprof" 12 | ) 13 | 14 | type ProfileStatus int64 15 | 16 | const ( 17 | ProfileStatusFinished ProfileStatus = 0 18 | ProfileStatusFailed ProfileStatus = 1 19 | ProfileStatusRunning ProfileStatus = 2 20 | ProfileStatusFinishedWithError ProfileStatus = 3 21 | ) 22 | 23 | type ProfileTarget struct { 24 | Kind string `json:"kind"` 25 | Component string `json:"component"` 26 | Address string `json:"address"` 27 | } 28 | 29 | type TargetInfo struct { 30 | ID int64 31 | LastScrapeTs int64 32 | } 33 | 34 | type BasicQueryParam struct { 35 | Begin int64 `json:"begin_time"` 36 | End int64 `json:"end_time"` 37 | Limit int64 `json:"limit"` 38 | Targets []ProfileTarget `json:"targets"` 39 | DataFormat string `json:"data_format"` 40 | } 41 | 42 | type ProfileList struct { 43 | Target ProfileTarget `json:"target"` 44 | ErrorList []string `json:"-"` 45 | TsList []int64 `json:"timestamp_list"` 46 | } 47 | 48 | func (s ProfileStatus) String() string { 49 | switch s { 50 | case ProfileStatusFinished: 51 | return "finished" 52 | case ProfileStatusFailed: 53 | return "failed" 54 | case ProfileStatusRunning: 55 | return "running" 56 | case ProfileStatusFinishedWithError: 57 | return 
"finished_with_error" 58 | default: 59 | return "unknown_state" 60 | } 61 | } 62 | 63 | type StatusCounter struct { 64 | finishedCount int 65 | runningCount int 66 | failedCount int 67 | totalCount int 68 | } 69 | 70 | func (s *StatusCounter) AddStatus(status ProfileStatus) { 71 | s.totalCount++ 72 | switch status { 73 | case ProfileStatusFinished: 74 | s.finishedCount++ 75 | case ProfileStatusFailed: 76 | s.failedCount++ 77 | case ProfileStatusRunning: 78 | s.runningCount++ 79 | } 80 | } 81 | 82 | func (s *StatusCounter) GetFinalStatus() ProfileStatus { 83 | if s.finishedCount == s.totalCount { 84 | return ProfileStatusFinished 85 | } 86 | if s.failedCount == s.totalCount { 87 | return ProfileStatusFailed 88 | } 89 | if s.runningCount > 0 { 90 | return ProfileStatusRunning 91 | } 92 | return ProfileStatusFinishedWithError 93 | } 94 | -------------------------------------------------------------------------------- /component/conprof/meta/meta_test.go: -------------------------------------------------------------------------------- 1 | package meta 2 | 3 | import ( 4 | "encoding/json" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/require" 8 | ) 9 | 10 | func TestMetaJson(t *testing.T) { 11 | target := ProfileTarget{ 12 | Kind: "profile", 13 | Component: "tidb", 14 | Address: "10.0.1.21:10080", 15 | } 16 | data, err := json.Marshal(target) 17 | require.NoError(t, err) 18 | require.Equal(t, `{"kind":"profile","component":"tidb","address":"10.0.1.21:10080"}`, string(data)) 19 | 20 | param := BasicQueryParam{ 21 | Begin: 1, 22 | End: 2, 23 | Limit: 100, 24 | Targets: []ProfileTarget{target}, 25 | DataFormat: ProfileDataFormatProtobuf, 26 | } 27 | data, err = json.Marshal(param) 28 | require.NoError(t, err) 29 | require.Equal(t, `{"begin_time":1,"end_time":2,"limit":100,"targets":[{"kind":"profile","component":"tidb","address":"10.0.1.21:10080"}],"data_format":"protobuf"}`, string(data)) 30 | 31 | list := ProfileList{ 32 | Target: target, 33 | TsList: []int64{1, 2, 3, 4}, 34 | } 35 | data, err = json.Marshal(list) 36 | require.NoError(t, err) 37 | require.Equal(t, `{"target":{"kind":"profile","component":"tidb","address":"10.0.1.21:10080"},"timestamp_list":[1,2,3,4]}`, string(data)) 38 | } 39 | 40 | func TestStatusCounter(t *testing.T) { 41 | cases := []struct { 42 | statusList []ProfileStatus 43 | expect string 44 | }{ 45 | {[]ProfileStatus{ProfileStatusFinished, ProfileStatusFailed}, "finished_with_error"}, 46 | {[]ProfileStatus{ProfileStatusFailed, ProfileStatusFailed}, "failed"}, 47 | {[]ProfileStatus{ProfileStatusFinished, ProfileStatusRunning, ProfileStatusFailed}, "running"}, 48 | {[]ProfileStatus{ProfileStatusFinished, ProfileStatusRunning, ProfileStatusRunning}, "running"}, 49 | {[]ProfileStatus{ProfileStatusRunning, ProfileStatusRunning, ProfileStatusRunning}, "running"}, 50 | {[]ProfileStatus{ProfileStatusFinished, ProfileStatusFinished, ProfileStatusFinished}, "finished"}, 51 | } 52 | for _, ca := range cases { 53 | sc := StatusCounter{} 54 | for _, status := range ca.statusList { 55 | sc.AddStatus(status) 56 | } 57 | final := sc.GetFinalStatus() 58 | require.Equal(t, ca.expect, final.String()) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /component/conprof/scrape/manager_test.go: -------------------------------------------------------------------------------- 1 | package scrape 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "strings" 7 | "testing" 8 | "time" 9 | 10 | "github.com/pingcap/ng-monitoring/component/conprof/meta" 11 | 
"github.com/pingcap/ng-monitoring/component/conprof/store" 12 | "github.com/pingcap/ng-monitoring/component/topology" 13 | "github.com/pingcap/ng-monitoring/config" 14 | "github.com/pingcap/ng-monitoring/database/docdb" 15 | "github.com/pingcap/ng-monitoring/utils/testutil" 16 | 17 | "github.com/pingcap/log" 18 | "github.com/stretchr/testify/require" 19 | "go.uber.org/goleak" 20 | "go.uber.org/zap" 21 | ) 22 | 23 | func TestMain(m *testing.M) { 24 | opts := []goleak.Option{ 25 | goleak.IgnoreTopFunction("github.com/golang/glog.(*loggingT).flushDaemon"), 26 | goleak.IgnoreTopFunction("github.com/golang/glog.(*fileSink).flushDaemon"), 27 | goleak.IgnoreTopFunction("github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime.init.0.func1"), 28 | } 29 | 30 | goleak.VerifyTestMain(m, opts...) 31 | } 32 | 33 | func TestManager(t *testing.T) { 34 | tmpDir, err := os.MkdirTemp(os.TempDir(), "ngm-test-.*") 35 | require.NoError(t, err) 36 | defer func() { 37 | err := os.RemoveAll(tmpDir) 38 | require.NoError(t, err) 39 | }() 40 | 41 | cfg := config.GetDefaultConfig() 42 | cfg.ContinueProfiling.Enable = true 43 | cfg.ContinueProfiling.ProfileSeconds = 1 44 | cfg.ContinueProfiling.IntervalSeconds = 1 45 | config.StoreGlobalConfig(cfg) 46 | 47 | db, err := docdb.NewGenjiDBFromGenji(testutil.NewGenjiDB(t, tmpDir)) 48 | require.NoError(t, err) 49 | defer db.Close() 50 | storage, err := store.NewProfileStorage(db) 51 | require.NoError(t, err) 52 | 53 | topoSubScribe := make(topology.Subscriber) 54 | updateTargetMetaInterval = time.Millisecond * 100 55 | manager := NewManager(storage, topoSubScribe) 56 | manager.Start() 57 | defer manager.Close() 58 | 59 | mockServer := testutil.CreateMockProfileServer(t) 60 | defer mockServer.Stop(t) 61 | 62 | addr := mockServer.Addr 63 | port := mockServer.Port 64 | components := []topology.Component{ 65 | {Name: topology.ComponentPD, IP: addr, Port: port, StatusPort: port}, 66 | {Name: topology.ComponentTiDB, IP: addr, Port: port, StatusPort: port}, 67 | {Name: topology.ComponentTiKV, IP: addr, Port: port, StatusPort: port}, 68 | {Name: topology.ComponentTiFlash, IP: addr, Port: port, StatusPort: port}, 69 | } 70 | // notify topology 71 | topoSubScribe <- topoGetter(components) 72 | 73 | t1 := time.Now() 74 | // wait for scrape finish 75 | time.Sleep(time.Millisecond * 1500) 76 | 77 | t2 := time.Now() 78 | param := &meta.BasicQueryParam{ 79 | Begin: t1.Unix(), 80 | End: t2.Unix(), 81 | Limit: 1000, 82 | Targets: nil, 83 | } 84 | plists, err := storage.QueryGroupProfiles(param) 85 | require.NoError(t, err) 86 | checkListData := func(plists []meta.ProfileList, components []topology.Component, param *meta.BasicQueryParam) { 87 | require.True(t, len(plists) > len(components)) 88 | maxTs := int64(0) 89 | for _, list := range plists { 90 | found := false 91 | for _, comp := range components { 92 | if list.Target.Component == comp.Name && list.Target.Address == fmt.Sprintf("%v:%v", comp.IP, comp.Port) { 93 | found = true 94 | break 95 | } 96 | } 97 | // TODO: remove this after support tiflash 98 | require.True(t, list.Target.Component != topology.ComponentTiFlash) 99 | require.True(t, found, fmt.Sprintf("%#v", list)) 100 | for _, ts := range list.TsList { 101 | require.True(t, ts >= param.Begin && ts <= param.End) 102 | if ts > maxTs { 103 | maxTs = ts 104 | } 105 | } 106 | } 107 | require.True(t, maxTs > 0) 108 | } 109 | checkListData(plists, components, param) 110 | 111 | // test for scrape profile data 112 | count := 0 113 | err = storage.QueryProfileData(param, func(target 
meta.ProfileTarget, i int64, data []byte) error { 114 | count++ 115 | found := false 116 | for _, comp := range components { 117 | if target.Component == comp.Name && target.Address == fmt.Sprintf("%v:%v", comp.IP, comp.Port) { 118 | found = true 119 | break 120 | } 121 | } 122 | require.True(t, found, fmt.Sprintf("%#v", target)) 123 | require.True(t, strings.Contains(string(data), target.Kind)) 124 | return nil 125 | }) 126 | require.True(t, count > len(components)) 127 | require.NoError(t, err) 128 | 129 | // test for update target meta. 130 | for _, list := range plists { 131 | info := storage.GetTargetInfoFromCache(list.Target) 132 | require.NotNil(t, info) 133 | require.True(t, info.ID > 0) 134 | require.True(t, info.LastScrapeTs >= t1.Unix()) 135 | } 136 | 137 | // test for GetCurrentScrapeComponents 138 | comp := manager.GetCurrentScrapeComponents() 139 | // TODO: update this after support tiflash 140 | require.Equal(t, len(comp), len(components)-1) 141 | 142 | // test for topology changed. 143 | mockServer2 := testutil.CreateMockProfileServer(t) 144 | defer mockServer2.Stop(t) 145 | addr2 := mockServer2.Addr 146 | port2 := mockServer2.Port 147 | log.Info("new mock server", zap.Uint("port", port2)) 148 | components = []topology.Component{ 149 | {Name: topology.ComponentPD, IP: addr2, Port: port2, StatusPort: port2}, 150 | {Name: topology.ComponentTiDB, IP: addr, Port: port, StatusPort: port}, 151 | {Name: topology.ComponentTiDB, IP: addr2, Port: port2, StatusPort: port2}, 152 | {Name: topology.ComponentTiKV, IP: addr2, Port: port2, StatusPort: port2}, 153 | } 154 | 155 | // mock for disable conprof 156 | cfg.ContinueProfiling.Enable = false 157 | config.StoreGlobalConfig(cfg) 158 | 159 | // notify topology 160 | topoSubScribe <- topoGetter(components) 161 | 162 | // wait for stop scrape 163 | time.Sleep(time.Millisecond * 100) 164 | 165 | // currently, shouldn't have any scrape component. 
166 | comp = manager.GetCurrentScrapeComponents() 167 | require.Equal(t, len(comp), 0) 168 | 169 | cfg.ContinueProfiling.Enable = true 170 | config.StoreGlobalConfig(cfg) 171 | // renotify topology 172 | topoSubScribe <- topoGetter(components) 173 | // wait for scrape finish 174 | time.Sleep(time.Millisecond * 3000) 175 | 176 | t3 := time.Now() 177 | param = &meta.BasicQueryParam{ 178 | Begin: t3.Unix() - 1, 179 | End: t3.Unix(), 180 | Limit: 1000, 181 | Targets: nil, 182 | } 183 | plists, err = storage.QueryGroupProfiles(param) 184 | require.NoError(t, err) 185 | checkListData(plists, components, param) 186 | 187 | comp = manager.GetCurrentScrapeComponents() 188 | require.Equal(t, len(comp), len(components), fmt.Sprintf("%#v \n %#v", comp, components)) 189 | 190 | status := manager.GetRunningStatus() 191 | require.True(t, status == meta.ProfileStatusRunning || status == meta.ProfileStatusFinished) 192 | } 193 | 194 | func topoGetter(components []topology.Component) topology.GetLatestTopology { 195 | return func() []topology.Component { 196 | return components 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /component/conprof/scrape/scrape.go: -------------------------------------------------------------------------------- 1 | package scrape 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "fmt" 7 | "io" 8 | "net/http" 9 | "net/url" 10 | "strconv" 11 | "time" 12 | 13 | "github.com/pingcap/ng-monitoring/component/conprof/jeprof" 14 | "github.com/pingcap/ng-monitoring/component/conprof/meta" 15 | "github.com/pingcap/ng-monitoring/component/conprof/store" 16 | "github.com/pingcap/ng-monitoring/component/topology" 17 | "github.com/pingcap/ng-monitoring/config" 18 | 19 | "github.com/pingcap/log" 20 | "github.com/pkg/errors" 21 | "go.uber.org/atomic" 22 | "go.uber.org/zap" 23 | "golang.org/x/net/context/ctxhttp" 24 | ) 25 | 26 | type ScrapeSuite struct { 27 | scraper Scraper 28 | lastScrape time.Time 29 | lastScrapeStatus atomic.Int64 30 | lastScrapeSize int 31 | store *store.ProfileStorage 32 | ctx context.Context 33 | cancel func() 34 | } 35 | 36 | func newScrapeSuite(ctx context.Context, sc Scraper, store *store.ProfileStorage) *ScrapeSuite { 37 | sl := &ScrapeSuite{ 38 | scraper: sc, 39 | store: store, 40 | } 41 | sl.ctx, sl.cancel = context.WithCancel(ctx) 42 | return sl 43 | } 44 | 45 | func (sl *ScrapeSuite) run(ticker *TickerChan) { 46 | target := sl.scraper.target 47 | 48 | defer func() { 49 | ticker.Stop() 50 | log.Debug("scraper stop running", 51 | zap.String("component", target.Component), 52 | zap.String("address", target.Address), 53 | zap.String("kind", target.Kind)) 54 | }() 55 | 56 | log.Debug("scraper start to run", 57 | zap.String("component", target.Component), 58 | zap.String("address", target.Address), 59 | zap.String("kind", target.Kind)) 60 | 61 | buf := bytes.NewBuffer(make([]byte, 0, 1024)) 62 | sl.lastScrapeSize = 0 63 | var start time.Time 64 | for { 65 | select { 66 | case <-sl.ctx.Done(): 67 | return 68 | case start = <-ticker.ch: 69 | sl.lastScrape = start 70 | sl.lastScrapeStatus.Store(int64(meta.ProfileStatusRunning)) 71 | } 72 | 73 | if sl.lastScrapeSize > 0 && buf.Cap() > 2*sl.lastScrapeSize { 74 | // shrink the buffer size. 
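			// Reallocating at the size of the last scrape prevents one oversized profile from pinning a large buffer for the rest of the scrape loop.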
75 | buf = bytes.NewBuffer(make([]byte, 0, sl.lastScrapeSize)) 76 | } 77 | 78 | buf.Reset() 79 | scrapeCtx, cancel := context.WithTimeout(sl.ctx, time.Second*time.Duration(config.GetGlobalConfig().ContinueProfiling.TimeoutSeconds)) 80 | scrapeErr := sl.scraper.scrape(scrapeCtx, buf) 81 | cancel() 82 | if scrapeErr != nil { 83 | if scrapeErr != context.Canceled { 84 | log.Error("scrape failed", 85 | zap.String("component", target.Component), 86 | zap.String("address", target.Address), 87 | zap.String("kind", target.Kind), 88 | zap.Error(scrapeErr)) 89 | } 90 | } 91 | sl.lastScrapeSize = buf.Len() 92 | 93 | err := sl.store.AddProfile(meta.ProfileTarget{ 94 | Kind: sl.scraper.target.Kind, 95 | Component: sl.scraper.target.Component, 96 | Address: sl.scraper.target.Address, 97 | }, start, buf.Bytes(), scrapeErr) 98 | if err != nil { 99 | log.Error("save scrape data failed", 100 | zap.String("component", target.Component), 101 | zap.String("address", target.Address), 102 | zap.String("kind", target.Kind), 103 | zap.Time("start", start), 104 | zap.Error(err)) 105 | } 106 | 107 | if scrapeErr != nil || err != nil { 108 | sl.lastScrapeStatus.Store(int64(meta.ProfileStatusFailed)) 109 | } else { 110 | sl.lastScrapeStatus.Store(int64(meta.ProfileStatusFinished)) 111 | } 112 | } 113 | } 114 | 115 | // Stop the scraping. May still write data and stale markers after it has 116 | // returned. Cancel the context to stop all writes. 117 | func (sl *ScrapeSuite) stop() { 118 | sl.cancel() 119 | } 120 | 121 | type Scraper struct { 122 | target *Target 123 | client *http.Client 124 | req *http.Request 125 | } 126 | 127 | func newScraper(target *Target, client *http.Client) Scraper { 128 | return Scraper{ 129 | target: target, 130 | client: client, 131 | } 132 | } 133 | 134 | func (s *Scraper) scrape(ctx context.Context, w io.Writer) error { 135 | cfg := config.GetGlobalConfig() 136 | if !cfg.ContinueProfiling.Enable { 137 | return nil 138 | } 139 | 140 | if s.target.Component == topology.ComponentTiKV && s.target.Kind == meta.ProfileKindHeap { 141 | // use jeprof to fetch tikv heap profile 142 | data, err := jeprof.FetchRaw(s.target.GetURLString(), cfg.Security.GetHTTPClientConfig()) 143 | if err != nil { 144 | return err 145 | } 146 | _, err = w.Write(data) 147 | return err 148 | } 149 | 150 | if s.req == nil { 151 | req, err := http.NewRequest("GET", s.target.GetURLString(), nil) 152 | if err != nil { 153 | return err 154 | } 155 | if header := s.target.header; len(header) > 0 { 156 | for k, v := range header { 157 | req.Header.Set(k, v) 158 | } 159 | } 160 | 161 | s.req = req 162 | } 163 | 164 | resp, err := ctxhttp.Do(ctx, s.client, s.req) 165 | if err != nil { 166 | return err 167 | } 168 | defer resp.Body.Close() 169 | 170 | if resp.StatusCode != http.StatusOK { 171 | return fmt.Errorf("server returned HTTP status %s", resp.Status) 172 | } 173 | 174 | b, err := io.ReadAll(resp.Body) 175 | if err != nil { 176 | return errors.Wrap(err, "failed to read body") 177 | } 178 | 179 | _, err = w.Write(b) 180 | return err 181 | } 182 | 183 | // Target refers to a singular HTTP or HTTPS endpoint. 
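// It bundles the logical ProfileTarget (kind, component, address) with the extra request headers and the fully built URL used to scrape the profile.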
184 | type Target struct { 185 | meta.ProfileTarget 186 | header map[string]string 187 | *url.URL 188 | } 189 | 190 | func NewTarget(component, address, scrapeAddress, kind, schema string, cfg *config.PprofProfilingConfig) *Target { 191 | t := &Target{ 192 | ProfileTarget: meta.ProfileTarget{ 193 | Kind: kind, 194 | Component: component, 195 | Address: address, 196 | }, 197 | } 198 | vs := url.Values{} 199 | for k, v := range cfg.Params { 200 | vs.Set(k, v) 201 | } 202 | if cfg.Seconds > 0 { 203 | vs.Add("seconds", strconv.Itoa(cfg.Seconds)) 204 | } 205 | 206 | t.header = cfg.Header 207 | t.URL = &url.URL{ 208 | Scheme: schema, 209 | Host: scrapeAddress, 210 | Path: cfg.Path, 211 | RawQuery: vs.Encode(), 212 | } 213 | return t 214 | } 215 | 216 | func (t *Target) GetURLString() string { 217 | return t.URL.String() 218 | } 219 | -------------------------------------------------------------------------------- /component/conprof/scrape/ticker.go: -------------------------------------------------------------------------------- 1 | package scrape 2 | 3 | import ( 4 | "context" 5 | "sync" 6 | "time" 7 | 8 | "github.com/pingcap/ng-monitoring/utils" 9 | ) 10 | 11 | type Ticker struct { 12 | sync.Mutex 13 | interval time.Duration 14 | subscribers map[int]chan time.Time 15 | idAlloc int 16 | cancel context.CancelFunc 17 | lastTime time.Time 18 | } 19 | 20 | func NewTicker(d time.Duration) *Ticker { 21 | if d == 0 { 22 | panic("should never happen") 23 | } 24 | ctx, cancel := context.WithCancel(context.Background()) 25 | t := &Ticker{ 26 | interval: d, 27 | subscribers: make(map[int]chan time.Time), 28 | cancel: cancel, 29 | } 30 | go utils.GoWithRecovery(func() { 31 | t.run(ctx) 32 | }, nil) 33 | return t 34 | } 35 | 36 | type TickerChan struct { 37 | id int 38 | ch chan time.Time 39 | ticker *Ticker 40 | } 41 | 42 | func (tc *TickerChan) Stop() { 43 | tc.ticker.Lock() 44 | defer tc.ticker.Unlock() 45 | delete(tc.ticker.subscribers, tc.id) 46 | } 47 | 48 | func (t *Ticker) Subscribe() *TickerChan { 49 | ch := make(chan time.Time, 1) 50 | t.Lock() 51 | defer t.Unlock() 52 | 53 | t.idAlloc += 1 54 | id := t.idAlloc 55 | t.subscribers[id] = ch 56 | return &TickerChan{ 57 | id: id, 58 | ch: ch, 59 | ticker: t, 60 | } 61 | } 62 | 63 | func (t *Ticker) Reset(d time.Duration) { 64 | if t.interval == d { 65 | return 66 | } 67 | t.Stop() 68 | 69 | ctx, cancel := context.WithCancel(context.Background()) 70 | t.cancel = cancel 71 | t.interval = d 72 | go utils.GoWithRecovery(func() { 73 | t.run(ctx) 74 | }, nil) 75 | } 76 | 77 | func (t *Ticker) run(ctx context.Context) { 78 | nextStart := int64(t.interval) - time.Now().UnixNano()%int64(t.interval) 79 | select { 80 | case <-time.After(time.Duration(nextStart)): 81 | // Continue after the scraping offset. 
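		// Waiting out this offset aligns the first tick to a whole multiple of the interval, so every subscriber fires on the same wall-clock boundary.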
82 | case <-ctx.Done(): 83 | return 84 | } 85 | 86 | t.notify(time.Now()) 87 | 88 | ticker := time.NewTicker(t.interval) 89 | defer ticker.Stop() 90 | for { 91 | select { 92 | case <-ctx.Done(): 93 | return 94 | case now := <-ticker.C: 95 | t.notify(now) 96 | } 97 | } 98 | } 99 | 100 | func (t *Ticker) notify(now time.Time) { 101 | t.Lock() 102 | defer t.Unlock() 103 | t.lastTime = now 104 | for _, ch := range t.subscribers { 105 | select { 106 | case ch <- now: 107 | default: 108 | } 109 | } 110 | } 111 | 112 | func (t *Ticker) Stop() { 113 | if t.cancel != nil { 114 | t.cancel() 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /component/conprof/scrape/ticker_test.go: -------------------------------------------------------------------------------- 1 | package scrape 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/stretchr/testify/require" 8 | ) 9 | 10 | func TestTicker(t *testing.T) { 11 | ticker := NewTicker(time.Millisecond * 50) 12 | defer ticker.Stop() 13 | tc := ticker.Subscribe() 14 | require.Equal(t, 1, len(ticker.subscribers)) 15 | tc.Stop() 16 | require.Equal(t, 0, len(ticker.subscribers)) 17 | 18 | tc1 := ticker.Subscribe() 19 | tc2 := ticker.Subscribe() 20 | t1 := <-tc1.ch 21 | t2 := <-tc2.ch 22 | require.Equal(t, t1.Unix(), t2.Unix()) 23 | 24 | ticker.Reset(time.Millisecond * 70) 25 | t1 = <-tc1.ch 26 | t2 = <-tc2.ch 27 | require.Equal(t, t1.Unix(), t2.Unix()) 28 | } 29 | -------------------------------------------------------------------------------- /component/conprof/store/gc.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "time" 5 | 6 | "github.com/pingcap/ng-monitoring/component/conprof/meta" 7 | "github.com/pingcap/ng-monitoring/config" 8 | 9 | "github.com/pingcap/log" 10 | "go.uber.org/zap" 11 | ) 12 | 13 | const ( 14 | gcInterval = time.Minute * 10 15 | ) 16 | 17 | func (s *ProfileStorage) doGCLoop() { 18 | ticker := time.NewTicker(gcInterval) 19 | defer ticker.Stop() 20 | // run gc when started. 
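	// An immediate pass reclaims expired profile data right after startup instead of waiting for the first gcInterval (10 minute) tick.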
21 | s.runGC() 22 | for { 23 | select { 24 | case <-s.ctx.Done(): 25 | return 26 | case <-ticker.C: 27 | s.runGC() 28 | } 29 | } 30 | } 31 | 32 | func (s *ProfileStorage) runGC() { 33 | start := time.Now() 34 | allTargets, allInfos, err := s.loadAllTargetsFromTable() 35 | if err != nil { 36 | log.Info("gc load all target info from meta table failed", zap.Error(err)) 37 | return 38 | } 39 | safePointTs := s.getLastSafePointTs() 40 | for i, target := range allTargets { 41 | info := allInfos[i] 42 | err := s.db.ConprofDeleteProfileDataBeforeTs(s.ctx, info.ID, safePointTs) 43 | if err != nil { 44 | log.Error("gc delete target data failed", zap.Error(err)) 45 | } 46 | err = s.db.ConprofDeleteProfileMetaBeforeTs(s.ctx, info.ID, safePointTs) 47 | if err != nil { 48 | log.Error("gc delete target meta failed", zap.Error(err)) 49 | } 50 | err = s.dropProfileTableIfStaled(target, info, safePointTs) 51 | if err != nil { 52 | log.Error("gc drop target table failed", zap.Error(err)) 53 | } 54 | } 55 | log.Info("gc finished", 56 | zap.Int("total-targets", len(allTargets)), 57 | zap.Int64("safepoint", safePointTs), 58 | zap.Duration("cost", time.Since(start))) 59 | } 60 | 61 | func (s *ProfileStorage) loadAllTargetsFromTable() ([]meta.ProfileTarget, []meta.TargetInfo, error) { 62 | targets := make([]meta.ProfileTarget, 0, 16) 63 | infos := make([]meta.TargetInfo, 0, 16) 64 | err := s.db.ConprofQueryAllProfileTargets(s.ctx, func(target meta.ProfileTarget, info meta.TargetInfo) error { 65 | s.rebaseID(info.ID) 66 | targets = append(targets, target) 67 | infos = append(infos, info) 68 | return nil 69 | }) 70 | if err != nil { 71 | return nil, nil, err 72 | } 73 | log.Info("gc load all target info from meta table", zap.Int("all-target-count", len(targets))) 74 | return targets, infos, nil 75 | } 76 | 77 | func (s *ProfileStorage) getLastSafePointTs() int64 { 78 | cfg := config.GetGlobalConfig() 79 | safePoint := time.Now().Add(time.Duration(-cfg.ContinueProfiling.DataRetentionSeconds) * time.Second) 80 | return safePoint.Unix() 81 | } 82 | -------------------------------------------------------------------------------- /component/domain/client.go: -------------------------------------------------------------------------------- 1 | package domain 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "sync/atomic" 8 | "time" 9 | 10 | "github.com/pingcap/ng-monitoring/config" 11 | 12 | "github.com/pingcap/log" 13 | "github.com/pingcap/tidb-dashboard/util/client/httpclient" 14 | "github.com/pingcap/tidb-dashboard/util/client/pdclient" 15 | clientv3 "go.etcd.io/etcd/client/v3" 16 | "go.uber.org/zap" 17 | ) 18 | 19 | const ( 20 | minRetryInterval = time.Millisecond * 10 21 | maxRetryInterval = time.Second 22 | ) 23 | 24 | type ClientMaintainer struct { 25 | pdCli atomic.Value // *pdclient.APIClient 26 | etcdCli atomic.Value // *clientv3.Client 27 | pdCfg config.PD 28 | initialized chan struct{} 29 | } 30 | 31 | func NewClientMaintainer() *ClientMaintainer { 32 | return &ClientMaintainer{ 33 | initialized: make(chan struct{}), 34 | } 35 | } 36 | 37 | func (cm *ClientMaintainer) Init(pdCfg config.PD, pdCli *pdclient.APIClient, etcdCli *clientv3.Client) { 38 | cm.pdCfg = pdCfg 39 | cm.pdCli.Store(pdCli) 40 | cm.etcdCli.Store(etcdCli) 41 | close(cm.initialized) 42 | } 43 | 44 | // WARN: call this function will blocked until successfully created PD client. 
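// Pass a cancellable context: the wait ends with ctx.Err() as soon as the context is done, which is the only way out before Init has been called.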
45 | func (cm *ClientMaintainer) GetPDClient(ctx context.Context) (*pdclient.APIClient, error) { 46 | err := cm.waitUntilInitialized(ctx) 47 | if err != nil { 48 | return nil, err 49 | } 50 | cli := cm.pdCli.Load() 51 | return cli.(*pdclient.APIClient), nil 52 | } 53 | 54 | // WARN: call this function will blocked until successfully created etcd client. 55 | func (cm *ClientMaintainer) GetEtcdClient(ctx context.Context) (*clientv3.Client, error) { 56 | err := cm.waitUntilInitialized(ctx) 57 | if err != nil { 58 | return nil, err 59 | } 60 | cli := cm.etcdCli.Load() 61 | return cli.(*clientv3.Client), nil 62 | } 63 | 64 | func (cm *ClientMaintainer) waitUntilInitialized(ctx context.Context) error { 65 | select { 66 | case <-ctx.Done(): 67 | return ctx.Err() 68 | case <-cm.initialized: 69 | return nil 70 | } 71 | } 72 | 73 | func (cm *ClientMaintainer) IsInitialized() bool { 74 | select { 75 | case <-cm.initialized: 76 | return true 77 | default: 78 | return false 79 | } 80 | } 81 | 82 | func (cm *ClientMaintainer) NeedRecreateClient(pdCfg config.PD) bool { 83 | return !cm.pdCfg.Equal(pdCfg) 84 | } 85 | 86 | func (cm *ClientMaintainer) Close() { 87 | select { 88 | case <-cm.initialized: 89 | etcdCli := cm.etcdCli.Load() 90 | _ = etcdCli.(*clientv3.Client).Close() 91 | default: 92 | return 93 | } 94 | } 95 | 96 | func createClientWithRetry(ctx context.Context) (*pdclient.APIClient, *clientv3.Client, error) { 97 | latest := time.Now() 98 | backoff := minRetryInterval 99 | for { 100 | select { 101 | case <-ctx.Done(): 102 | return nil, nil, ctx.Err() 103 | default: 104 | } 105 | 106 | cfg := config.GetGlobalConfig() 107 | pdCli, etcdCli, err := createClient(&cfg) 108 | if err == nil { 109 | return pdCli, etcdCli, nil 110 | } 111 | if time.Since(latest) > time.Second*5 { 112 | latest = time.Now() 113 | log.Warn("create pd/etcd client failed", zap.Error(err)) 114 | } 115 | time.Sleep(backoff) 116 | backoff = backoff * 2 117 | if backoff > maxRetryInterval { 118 | backoff = maxRetryInterval 119 | } 120 | } 121 | } 122 | 123 | func createClient(cfg *config.Config) (*pdclient.APIClient, *clientv3.Client, error) { 124 | if len(cfg.PD.Endpoints) == 0 { 125 | return nil, nil, fmt.Errorf("unexpected empty pd endpoints, please specify at least one pd endpoint") 126 | } 127 | etcdCli, err := pdclient.NewEtcdClient(pdclient.EtcdClientConfig{ 128 | Endpoints: cfg.PD.Endpoints, 129 | Context: context.Background(), 130 | TLS: cfg.Security.GetTLSConfig(), 131 | }) 132 | if err != nil { 133 | return nil, nil, err 134 | } 135 | 136 | pdCli, err := CreatePDClient(cfg) 137 | if err != nil { 138 | etcdCli.Close() 139 | return nil, nil, err 140 | } 141 | return pdCli, etcdCli, nil 142 | } 143 | 144 | func CreatePDClient(cfg *config.Config) (*pdclient.APIClient, error) { 145 | if cfg == nil || len(cfg.PD.Endpoints) == 0 { 146 | return nil, errors.New("need specify pd endpoints") 147 | } 148 | var pdCli *pdclient.APIClient 149 | var err error 150 | for _, endpoint := range cfg.PD.Endpoints { 151 | pdCli = pdclient.NewAPIClient(httpclient.Config{ 152 | // TODO: support all PD endpoints. 
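			// Each client is currently bound to a single endpoint; the surrounding loop keeps the first endpoint whose health check succeeds.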
153 | DefaultBaseURL: fmt.Sprintf("%v://%v", cfg.GetHTTPScheme(), endpoint), 154 | DefaultCtx: context.Background(), 155 | TLSConfig: cfg.Security.GetTLSConfig(), 156 | }) 157 | _, err = pdCli.GetHealth(context.Background()) 158 | if err == nil { 159 | log.Info("create pd client success", zap.String("pd-address", endpoint)) 160 | return pdCli, nil 161 | } 162 | } 163 | if err != nil { 164 | return nil, err 165 | } 166 | if pdCli == nil { 167 | return nil, fmt.Errorf("can't create pd client, should never happen") 168 | } 169 | return pdCli, err 170 | } 171 | -------------------------------------------------------------------------------- /component/domain/domain.go: -------------------------------------------------------------------------------- 1 | package domain 2 | 3 | import ( 4 | "context" 5 | 6 | "github.com/pingcap/ng-monitoring/config" 7 | "github.com/pingcap/ng-monitoring/utils" 8 | 9 | "github.com/pingcap/tidb-dashboard/util/client/pdclient" 10 | clientv3 "go.etcd.io/etcd/client/v3" 11 | ) 12 | 13 | type Domain struct { 14 | ctx context.Context 15 | cancel context.CancelFunc 16 | cm *ClientMaintainer 17 | cfgChangeCh config.Subscriber 18 | } 19 | 20 | func NewDomain() *Domain { 21 | cfgSub := config.Subscribe() 22 | getCurCfg := <-cfgSub 23 | curCfg := getCurCfg() 24 | 25 | ctx, cancel := context.WithCancel(context.Background()) 26 | do := &Domain{ 27 | ctx: ctx, 28 | cancel: cancel, 29 | cm: NewClientMaintainer(), 30 | cfgChangeCh: cfgSub, 31 | } 32 | go utils.GoWithRecovery(func() { 33 | do.start(curCfg) 34 | }, nil) 35 | return do 36 | } 37 | 38 | func NewDomainForTest(pdCli *pdclient.APIClient, etcdCli *clientv3.Client) *Domain { 39 | cfgSub := config.Subscribe() 40 | getCurCfg := <-cfgSub 41 | curCfg := getCurCfg() 42 | 43 | ctx, cancel := context.WithCancel(context.Background()) 44 | do := &Domain{ 45 | ctx: ctx, 46 | cancel: cancel, 47 | cm: NewClientMaintainer(), 48 | cfgChangeCh: cfgSub, 49 | } 50 | do.cm.Init(curCfg.PD, pdCli, etcdCli) 51 | return do 52 | } 53 | 54 | // WARN: call this function will blocked until successfully created PD client. 55 | func (do *Domain) GetPDClient() (*pdclient.APIClient, error) { 56 | return do.cm.GetPDClient(do.ctx) 57 | } 58 | 59 | // WARN: call this function will blocked until successfully created etcd client. 
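// The wait is bounded by do.ctx, so closing the domain unblocks any pending caller with a context error.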
60 | func (do *Domain) GetEtcdClient() (*clientv3.Client, error) { 61 | return do.cm.GetEtcdClient(do.ctx) 62 | } 63 | 64 | func (do *Domain) start(cfg config.Config) { 65 | err := do.createClientWithRetry(cfg) 66 | if err != nil { 67 | return 68 | } 69 | for { 70 | select { 71 | case <-do.ctx.Done(): 72 | return 73 | case getCfg := <-do.cfgChangeCh: 74 | cfg = getCfg() 75 | err = do.createClientWithRetry(cfg) 76 | if err == context.Canceled { 77 | return 78 | } 79 | } 80 | } 81 | } 82 | 83 | func (do *Domain) createClientWithRetry(cfg config.Config) error { 84 | if do.cm.IsInitialized() { 85 | if !do.cm.NeedRecreateClient(cfg.PD) { 86 | return nil 87 | } 88 | do.cm.Close() 89 | do.cm = NewClientMaintainer() 90 | } 91 | pdCli, etcdCli, err := createClientWithRetry(do.ctx) 92 | if err != nil { 93 | return err 94 | } 95 | do.cm.Init(cfg.PD, pdCli, etcdCli) 96 | return nil 97 | } 98 | 99 | func (do *Domain) Close() { 100 | do.cm.Close() 101 | if do.cancel != nil { 102 | do.cancel() 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /component/domain/domain_test.go: -------------------------------------------------------------------------------- 1 | package domain 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | "time" 7 | 8 | "github.com/pingcap/ng-monitoring/config" 9 | "github.com/pingcap/ng-monitoring/utils/testutil" 10 | 11 | "github.com/stretchr/testify/require" 12 | "go.uber.org/goleak" 13 | ) 14 | 15 | func TestMain(m *testing.M) { 16 | opts := []goleak.Option{ 17 | goleak.IgnoreTopFunction("github.com/golang/glog.(*loggingT).flushDaemon"), 18 | goleak.IgnoreTopFunction("github.com/golang/glog.(*fileSink).flushDaemon"), 19 | goleak.IgnoreTopFunction("github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime.init.0.func1"), 20 | } 21 | 22 | goleak.VerifyTestMain(m, opts...) 
23 | } 24 | 25 | func TestDomain(t *testing.T) { 26 | cfg := config.GetDefaultConfig() 27 | config.StoreGlobalConfig(cfg) 28 | do := NewDomain() 29 | do.Close() 30 | _, err := do.GetEtcdClient() 31 | require.Error(t, err, context.Canceled) 32 | _, err = do.GetPDClient() 33 | require.Error(t, err, context.Canceled) 34 | 35 | mockPD := testutil.MockPDHTTPServer{} 36 | mockPD.Setup(t) 37 | defer mockPD.Close(t) 38 | cfg.PD.Endpoints = []string{mockPD.Addr} 39 | config.StoreGlobalConfig(cfg) 40 | 41 | do = NewDomain() 42 | defer do.Close() 43 | 44 | pdCli1, err := do.GetPDClient() 45 | require.NoError(t, err) 46 | cfg.ContinueProfiling.Enable = true 47 | config.StoreGlobalConfig(cfg) 48 | pdCli2, err := do.GetPDClient() 49 | require.NoError(t, err) 50 | require.Equal(t, pdCli1, pdCli2) 51 | 52 | mockPD2 := testutil.MockPDHTTPServer{} 53 | mockPD2.Setup(t) 54 | defer mockPD2.Close(t) 55 | cfg.PD.Endpoints = []string{mockPD2.Addr} 56 | config.StoreGlobalConfig(cfg) 57 | time.Sleep(time.Millisecond * 10) 58 | pdCli3, err := do.GetPDClient() 59 | require.NoError(t, err) 60 | require.NotEqual(t, pdCli1, pdCli3) 61 | } 62 | 63 | func TestClientMaintainer(t *testing.T) { 64 | cfg := config.GetDefaultConfig() 65 | cfg.PD.Endpoints = nil 66 | config.StoreGlobalConfig(cfg) 67 | ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*100) 68 | _, _, err := createClientWithRetry(ctx) 69 | cancel() 70 | require.Error(t, err, context.DeadlineExceeded) 71 | 72 | mockPD := testutil.MockPDHTTPServer{} 73 | mockPD.Setup(t) 74 | defer mockPD.Close(t) 75 | 76 | _, err = CreatePDClient(&cfg) 77 | require.NotNil(t, err) 78 | require.Equal(t, "need specify pd endpoints", err.Error()) 79 | cfg.PD.Endpoints = []string{mockPD.Addr} 80 | config.StoreGlobalConfig(cfg) 81 | mockPD.Health = false 82 | _, err = CreatePDClient(&cfg) 83 | require.NotNil(t, err) 84 | require.Contains(t, err.Error(), "Response status 503") 85 | mockPD.Health = true 86 | pdCli, etcdCli, err := createClientWithRetry(context.Background()) 87 | require.NoError(t, err) 88 | require.NotNil(t, pdCli) 89 | require.NotNil(t, etcdCli) 90 | 91 | cm := NewClientMaintainer() 92 | require.False(t, cm.IsInitialized()) 93 | ctx, cancel = context.WithTimeout(context.Background(), time.Millisecond*10) 94 | _, err = cm.GetPDClient(ctx) 95 | cancel() 96 | require.NotNil(t, err) 97 | require.Equal(t, context.DeadlineExceeded, err) 98 | 99 | ctx, cancel = context.WithTimeout(context.Background(), time.Millisecond*10) 100 | _, err = cm.GetEtcdClient(ctx) 101 | cancel() 102 | require.NotNil(t, err) 103 | require.Equal(t, context.DeadlineExceeded, err) 104 | 105 | cm.Init(cfg.PD, pdCli, etcdCli) 106 | require.True(t, cm.IsInitialized()) 107 | ctx, cancel = context.WithTimeout(context.Background(), time.Millisecond*10) 108 | pdCli, err = cm.GetPDClient(ctx) 109 | cancel() 110 | require.NoError(t, err) 111 | require.NotNil(t, pdCli) 112 | 113 | ctx, cancel = context.WithTimeout(context.Background(), time.Millisecond*10) 114 | etcdCli, err = cm.GetEtcdClient(ctx) 115 | cancel() 116 | require.NoError(t, err) 117 | require.NotNil(t, etcdCli) 118 | 119 | cm.Close() 120 | } 121 | -------------------------------------------------------------------------------- /component/subscriber/main_test.go: -------------------------------------------------------------------------------- 1 | package subscriber_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "go.uber.org/goleak" 7 | ) 8 | 9 | func TestMain(m *testing.M) { 10 | opts := []goleak.Option{ 11 | 
goleak.IgnoreTopFunction("github.com/golang/glog.(*fileSink).flushDaemon"), 12 | goleak.IgnoreTopFunction("github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime.init.0.func1"), 13 | } 14 | 15 | goleak.VerifyTestMain(m, opts...) 16 | } 17 | -------------------------------------------------------------------------------- /component/subscriber/manager.go: -------------------------------------------------------------------------------- 1 | package subscriber 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "net/http" 8 | "net/url" 9 | "reflect" 10 | "strconv" 11 | "sync" 12 | "time" 13 | 14 | "github.com/pingcap/ng-monitoring/component/domain" 15 | "github.com/pingcap/ng-monitoring/component/subscriber/model" 16 | "github.com/pingcap/ng-monitoring/component/topology" 17 | "github.com/pingcap/ng-monitoring/config" 18 | "github.com/pingcap/ng-monitoring/config/pdvariable" 19 | "github.com/pingcap/ng-monitoring/utils" 20 | "go.uber.org/zap" 21 | 22 | "github.com/pingcap/log" 23 | ) 24 | 25 | const schemaCheckInterval = 2 * time.Second 26 | 27 | type Manager struct { 28 | ctx context.Context 29 | wg *sync.WaitGroup 30 | 31 | prevEnabled bool 32 | components []topology.Component 33 | 34 | scrapers map[topology.Component]Scraper 35 | 36 | topoSubscriber topology.Subscriber 37 | cfgSubscriber config.Subscriber 38 | varSubscriber pdvariable.Subscriber 39 | httpCli *http.Client 40 | 41 | do *domain.Domain 42 | schemaCache *sync.Map 43 | schemaVersion int64 44 | 45 | subscribeController SubscribeController 46 | } 47 | 48 | func NewManager( 49 | ctx context.Context, 50 | wg *sync.WaitGroup, 51 | do *domain.Domain, 52 | varSubscriber pdvariable.Subscriber, 53 | topoSubscriber topology.Subscriber, 54 | cfgSubscriber config.Subscriber, 55 | subscribeController SubscribeController, 56 | ) *Manager { 57 | return &Manager{ 58 | ctx: ctx, 59 | wg: wg, 60 | 61 | scrapers: make(map[topology.Component]Scraper), 62 | schemaCache: &sync.Map{}, 63 | 64 | varSubscriber: varSubscriber, 65 | topoSubscriber: topoSubscriber, 66 | cfgSubscriber: cfgSubscriber, 67 | 68 | do: do, 69 | prevEnabled: subscribeController.IsEnabled(), 70 | subscribeController: subscribeController, 71 | } 72 | } 73 | 74 | func (m *Manager) Run() { 75 | defer m.clearScrapers() 76 | ticker := time.NewTicker(schemaCheckInterval) 77 | for { 78 | select { 79 | case getCfg := <-m.cfgSubscriber: 80 | m.subscribeController.UpdateConfig(getCfg()) 81 | m.httpCli = m.subscribeController.NewHTTPClient() 82 | case getVars := <-m.varSubscriber: 83 | m.subscribeController.UpdatePDVariable(getVars()) 84 | case getTopology := <-m.topoSubscriber: 85 | m.components = getTopology() 86 | m.subscribeController.UpdateTopology(getTopology()) 87 | case <-ticker.C: 88 | m.updateSchemaCache() 89 | continue 90 | case <-m.ctx.Done(): 91 | return 92 | } 93 | 94 | curEnabled := m.subscribeController.IsEnabled() 95 | if curEnabled != m.prevEnabled { // switch 96 | action := "off" 97 | if curEnabled { 98 | action = "on" 99 | } 100 | log.Info(fmt.Sprintf("%s is turned %s", m.subscribeController.Name(), action)) 101 | } 102 | m.prevEnabled = curEnabled 103 | 104 | if curEnabled { 105 | m.updateScrapers() 106 | } else { 107 | m.clearScrapers() 108 | } 109 | } 110 | } 111 | 112 | func (m *Manager) updateSchemaCache() { 113 | if !m.subscribeController.IsEnabled() { 114 | // clear cache 115 | m.schemaCache.Range(func(k, v interface{}) bool { 116 | m.schemaCache.Delete(k) 117 | return true 118 | }) 119 | m.schemaVersion = 0 120 | return 121 | } 122 | if m.do == nil { 
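		// Without a domain there is no etcd client to read the schema version from, so skip this round.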
123 | return 124 | } 125 | 126 | ectx, cancel := context.WithTimeout(context.TODO(), 3*time.Second) 127 | etcdCli, err := m.do.GetEtcdClient() 128 | defer cancel() 129 | if err != nil { 130 | log.Error("failed to get etcd client", zap.Error(err)) 131 | return 132 | } 133 | resp, err := etcdCli.Get(ectx, model.SchemaVersionPath) 134 | if err != nil || len(resp.Kvs) != 1 { 135 | if resp != nil && len(resp.Kvs) == 0 { 136 | return 137 | } 138 | log.Warn("failed to get tidb schema version", zap.Error(err)) 139 | return 140 | } 141 | schemaVersion, err := strconv.ParseInt(string(resp.Kvs[0].Value), 10, 64) 142 | if err != nil { 143 | log.Warn("failed to get tidb schema version", zap.Error(err)) 144 | return 145 | } 146 | if schemaVersion == m.schemaVersion { 147 | return 148 | } 149 | log.Info("schema version changed", zap.Int64("old", m.schemaVersion), zap.Int64("new", schemaVersion)) 150 | m.tryUpdateSchemaCache(schemaVersion) 151 | } 152 | 153 | type getConfig interface { 154 | GetConfig() *config.Config 155 | } 156 | 157 | func (m *Manager) requestDB(path string, v interface{}) error { 158 | schema := "http" 159 | if sc, ok := m.subscribeController.(getConfig); ok && sc.GetConfig().Security.GetTLSConfig() != nil { 160 | schema = "https" 161 | } 162 | for _, compt := range m.components { 163 | if compt.Name != topology.ComponentTiDB { 164 | continue 165 | } 166 | 167 | url := fmt.Sprintf("%s://%s:%d%s", schema, compt.IP, compt.StatusPort, path) 168 | resp, err := m.httpCli.Get(url) 169 | if err != nil { 170 | log.Error("request failed", zap.Error(err)) 171 | continue 172 | } 173 | defer resp.Body.Close() 174 | if resp.StatusCode != http.StatusOK { 175 | log.Error("request failed", zap.String("status", resp.Status)) 176 | continue 177 | } 178 | if err := json.NewDecoder(resp.Body).Decode(v); err != nil { 179 | log.Error("decode response failed", zap.Error(err)) 180 | continue 181 | } 182 | return nil 183 | } 184 | return fmt.Errorf("all request failed") 185 | } 186 | 187 | func (m *Manager) tryUpdateSchemaCache(schemaVersion int64) { 188 | // get all database info 189 | var dbInfos []*model.DBInfo 190 | if err := m.requestDB("/schema", &dbInfos); err != nil { 191 | return 192 | } 193 | 194 | // get all table info 195 | updateSuccess := true 196 | for _, db := range dbInfos { 197 | if db.State == model.StateNone { 198 | continue 199 | } 200 | var tableInfos []*model.TableInfo 201 | encodeName := url.PathEscape(db.Name.O) 202 | if err := m.requestDB(fmt.Sprintf("/schema/%s?id_name_only=true", encodeName), &tableInfos); err != nil { 203 | updateSuccess = false 204 | continue 205 | } 206 | log.Info("update table info", zap.String("db", db.Name.O), zap.Reflect("table-info", tableInfos)) 207 | if len(tableInfos) == 0 { 208 | continue 209 | } 210 | for _, table := range tableInfos { 211 | indices := make(map[int64]string, len(table.Indices)) 212 | for _, index := range table.Indices { 213 | indices[index.ID] = index.Name.O 214 | } 215 | detail := &model.TableDetail{ 216 | Name: table.Name.O, 217 | DB: db.Name.O, 218 | ID: table.ID, 219 | Indices: indices, 220 | } 221 | m.schemaCache.Store(table.ID, detail) 222 | if partition := table.GetPartitionInfo(); partition != nil { 223 | for _, partitionDef := range partition.Definitions { 224 | detail := &model.TableDetail{ 225 | Name: fmt.Sprintf("%s/%s", table.Name.O, partitionDef.Name.O), 226 | DB: db.Name.O, 227 | ID: partitionDef.ID, 228 | Indices: indices, 229 | } 230 | m.schemaCache.Store(partitionDef.ID, detail) 231 | } 232 | } 233 | } 234 | } 235 | 
if updateSuccess { 236 | m.schemaVersion = schemaVersion 237 | } 238 | } 239 | 240 | func (m *Manager) updateScrapers() { 241 | // clean up closed scrapers 242 | for component, scraper := range m.scrapers { 243 | if !isNil(scraper) && scraper.IsDown() { 244 | scraper.Close() 245 | delete(m.scrapers, component) 246 | } 247 | } 248 | 249 | in, out := m.getTopoChange() 250 | 251 | // clean up stale scrapers 252 | for i := range out { 253 | scraper := m.scrapers[out[i]] 254 | if !isNil(scraper) { 255 | scraper.Close() 256 | } 257 | delete(m.scrapers, out[i]) 258 | } 259 | 260 | // set up incoming scrapers 261 | for i := range in { 262 | scraper := m.subscribeController.NewScraper(m.ctx, in[i], m.schemaCache) 263 | m.scrapers[in[i]] = scraper 264 | 265 | if !isNil(scraper) { 266 | m.wg.Add(1) 267 | go utils.GoWithRecovery(func() { 268 | defer m.wg.Done() 269 | scraper.Run() 270 | }, nil) 271 | } 272 | } 273 | } 274 | 275 | func (m *Manager) getTopoChange() (in, out []topology.Component) { 276 | curMap := make(map[topology.Component]struct{}) 277 | 278 | for i := range m.components { 279 | component := m.components[i] 280 | curMap[component] = struct{}{} 281 | if _, contains := m.scrapers[component]; !contains { 282 | in = append(in, component) 283 | } 284 | } 285 | 286 | for c := range m.scrapers { 287 | if _, contains := curMap[c]; !contains { 288 | out = append(out, c) 289 | } 290 | } 291 | 292 | return 293 | } 294 | 295 | func (m *Manager) clearScrapers() { 296 | for component, scraper := range m.scrapers { 297 | if !isNil(scraper) { 298 | scraper.Close() 299 | } 300 | delete(m.scrapers, component) 301 | } 302 | } 303 | 304 | func isNil(scraper Scraper) bool { 305 | if scraper == nil { 306 | return true 307 | } 308 | switch reflect.TypeOf(scraper).Kind() { 309 | case reflect.Ptr, reflect.Map, reflect.Array, reflect.Chan, reflect.Slice: 310 | return reflect.ValueOf(scraper).IsNil() 311 | } 312 | return false 313 | } 314 | -------------------------------------------------------------------------------- /component/subscriber/mock_sub_controller_test.go: -------------------------------------------------------------------------------- 1 | // Code generated by MockGen. DO NOT EDIT. 2 | // Source: github.com/pingcap/ng-monitoring/component/subscriber (interfaces: SubscribeController,Scraper) 3 | 4 | // Package subscriber_test is a generated GoMock package. 5 | package subscriber_test 6 | 7 | import ( 8 | "net/http" 9 | "sync" 10 | 11 | context "context" 12 | reflect "reflect" 13 | 14 | gomock "github.com/golang/mock/gomock" 15 | subscriber "github.com/pingcap/ng-monitoring/component/subscriber" 16 | topology "github.com/pingcap/ng-monitoring/component/topology" 17 | config "github.com/pingcap/ng-monitoring/config" 18 | pdvariable "github.com/pingcap/ng-monitoring/config/pdvariable" 19 | ) 20 | 21 | // MockSubscribeController is a mock of SubscribeController interface. 22 | type MockSubscribeController struct { 23 | ctrl *gomock.Controller 24 | recorder *MockSubscribeControllerMockRecorder 25 | } 26 | 27 | // MockSubscribeControllerMockRecorder is the mock recorder for MockSubscribeController. 28 | type MockSubscribeControllerMockRecorder struct { 29 | mock *MockSubscribeController 30 | } 31 | 32 | // NewMockSubscribeController creates a new mock instance. 
33 | func NewMockSubscribeController(ctrl *gomock.Controller) *MockSubscribeController { 34 | mock := &MockSubscribeController{ctrl: ctrl} 35 | mock.recorder = &MockSubscribeControllerMockRecorder{mock} 36 | return mock 37 | } 38 | 39 | // EXPECT returns an object that allows the caller to indicate expected use. 40 | func (m *MockSubscribeController) EXPECT() *MockSubscribeControllerMockRecorder { 41 | return m.recorder 42 | } 43 | 44 | func (m *MockSubscribeController) NewHTTPClient() *http.Client { 45 | return nil 46 | } 47 | 48 | // IsEnabled mocks base method. 49 | func (m *MockSubscribeController) IsEnabled() bool { 50 | m.ctrl.T.Helper() 51 | ret := m.ctrl.Call(m, "IsEnabled") 52 | ret0, _ := ret[0].(bool) 53 | return ret0 54 | } 55 | 56 | // IsEnabled indicates an expected call of IsEnabled. 57 | func (mr *MockSubscribeControllerMockRecorder) IsEnabled() *gomock.Call { 58 | mr.mock.ctrl.T.Helper() 59 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsEnabled", reflect.TypeOf((*MockSubscribeController)(nil).IsEnabled)) 60 | } 61 | 62 | // Name mocks base method. 63 | func (m *MockSubscribeController) Name() string { 64 | m.ctrl.T.Helper() 65 | ret := m.ctrl.Call(m, "Name") 66 | ret0, _ := ret[0].(string) 67 | return ret0 68 | } 69 | 70 | // Name indicates an expected call of Name. 71 | func (mr *MockSubscribeControllerMockRecorder) Name() *gomock.Call { 72 | mr.mock.ctrl.T.Helper() 73 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Name", reflect.TypeOf((*MockSubscribeController)(nil).Name)) 74 | } 75 | 76 | // NewScraper mocks base method. 77 | func (m *MockSubscribeController) NewScraper(arg0 context.Context, arg1 topology.Component, _ *sync.Map) subscriber.Scraper { 78 | m.ctrl.T.Helper() 79 | ret := m.ctrl.Call(m, "NewScraper", arg0, arg1) 80 | ret0, _ := ret[0].(subscriber.Scraper) 81 | return ret0 82 | } 83 | 84 | // NewScraper indicates an expected call of NewScraper. 85 | func (mr *MockSubscribeControllerMockRecorder) NewScraper(arg0, arg1 interface{}) *gomock.Call { 86 | mr.mock.ctrl.T.Helper() 87 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "NewScraper", reflect.TypeOf((*MockSubscribeController)(nil).NewScraper), arg0, arg1) 88 | } 89 | 90 | // UpdateConfig mocks base method. 91 | func (m *MockSubscribeController) UpdateConfig(arg0 config.Config) { 92 | m.ctrl.T.Helper() 93 | m.ctrl.Call(m, "UpdateConfig", arg0) 94 | } 95 | 96 | // UpdateConfig indicates an expected call of UpdateConfig. 97 | func (mr *MockSubscribeControllerMockRecorder) UpdateConfig(arg0 interface{}) *gomock.Call { 98 | mr.mock.ctrl.T.Helper() 99 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateConfig", reflect.TypeOf((*MockSubscribeController)(nil).UpdateConfig), arg0) 100 | } 101 | 102 | // UpdatePDVariable mocks base method. 103 | func (m *MockSubscribeController) UpdatePDVariable(arg0 pdvariable.PDVariable) { 104 | m.ctrl.T.Helper() 105 | m.ctrl.Call(m, "UpdatePDVariable", arg0) 106 | } 107 | 108 | // UpdatePDVariable indicates an expected call of UpdatePDVariable. 109 | func (mr *MockSubscribeControllerMockRecorder) UpdatePDVariable(arg0 interface{}) *gomock.Call { 110 | mr.mock.ctrl.T.Helper() 111 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdatePDVariable", reflect.TypeOf((*MockSubscribeController)(nil).UpdatePDVariable), arg0) 112 | } 113 | 114 | // UpdateTopology mocks base method. 
115 | func (m *MockSubscribeController) UpdateTopology(arg0 []topology.Component) { 116 | m.ctrl.T.Helper() 117 | m.ctrl.Call(m, "UpdateTopology", arg0) 118 | } 119 | 120 | // UpdateTopology indicates an expected call of UpdateTopology. 121 | func (mr *MockSubscribeControllerMockRecorder) UpdateTopology(arg0 interface{}) *gomock.Call { 122 | mr.mock.ctrl.T.Helper() 123 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateTopology", reflect.TypeOf((*MockSubscribeController)(nil).UpdateTopology), arg0) 124 | } 125 | 126 | // MockScraper is a mock of Scraper interface. 127 | type MockScraper struct { 128 | ctrl *gomock.Controller 129 | recorder *MockScraperMockRecorder 130 | } 131 | 132 | // MockScraperMockRecorder is the mock recorder for MockScraper. 133 | type MockScraperMockRecorder struct { 134 | mock *MockScraper 135 | } 136 | 137 | // NewMockScraper creates a new mock instance. 138 | func NewMockScraper(ctrl *gomock.Controller) *MockScraper { 139 | mock := &MockScraper{ctrl: ctrl} 140 | mock.recorder = &MockScraperMockRecorder{mock} 141 | return mock 142 | } 143 | 144 | // EXPECT returns an object that allows the caller to indicate expected use. 145 | func (m *MockScraper) EXPECT() *MockScraperMockRecorder { 146 | return m.recorder 147 | } 148 | 149 | // Close mocks base method. 150 | func (m *MockScraper) Close() { 151 | m.ctrl.T.Helper() 152 | m.ctrl.Call(m, "Close") 153 | } 154 | 155 | // Close indicates an expected call of Close. 156 | func (mr *MockScraperMockRecorder) Close() *gomock.Call { 157 | mr.mock.ctrl.T.Helper() 158 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Close", reflect.TypeOf((*MockScraper)(nil).Close)) 159 | } 160 | 161 | // IsDown mocks base method. 162 | func (m *MockScraper) IsDown() bool { 163 | m.ctrl.T.Helper() 164 | ret := m.ctrl.Call(m, "IsDown") 165 | ret0, _ := ret[0].(bool) 166 | return ret0 167 | } 168 | 169 | // IsDown indicates an expected call of IsDown. 170 | func (mr *MockScraperMockRecorder) IsDown() *gomock.Call { 171 | mr.mock.ctrl.T.Helper() 172 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsDown", reflect.TypeOf((*MockScraper)(nil).IsDown)) 173 | } 174 | 175 | // Run mocks base method. 176 | func (m *MockScraper) Run() { 177 | m.ctrl.T.Helper() 178 | m.ctrl.Call(m, "Run") 179 | } 180 | 181 | // Run indicates an expected call of Run. 182 | func (mr *MockScraperMockRecorder) Run() *gomock.Call { 183 | mr.mock.ctrl.T.Helper() 184 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Run", reflect.TypeOf((*MockScraper)(nil).Run)) 185 | } 186 | -------------------------------------------------------------------------------- /component/subscriber/model/model.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 PingCAP, Inc. Licensed under Apache-2.0. 2 | 3 | package model 4 | 5 | // SchemaState is the state for schema elements. 6 | type SchemaState byte 7 | 8 | const ( 9 | // StateNone means this schema element is absent and can't be used. 10 | StateNone SchemaState = iota 11 | // StateDeleteOnly means we can only delete items for this schema element. 12 | StateDeleteOnly 13 | // StateWriteOnly means we can use any write operation on this schema element, 14 | // but outer can't read the changed data. 15 | StateWriteOnly 16 | // StateWriteReorganization means we are re-organizing whole data after write only state. 17 | StateWriteReorganization 18 | // StateDeleteReorganization means we are re-organizing whole data after delete only state. 
19 | StateDeleteReorganization 20 | // StatePublic means this schema element is ok for all write and read operations. 21 | StatePublic 22 | ) 23 | 24 | const ( 25 | SchemaVersionPath = "/tidb/ddl/global_schema_version" 26 | ) 27 | 28 | // CIStr is case insensitive string. 29 | type CIStr struct { 30 | O string `json:"O"` // Original string. 31 | L string `json:"L"` // Lower case string. 32 | } 33 | 34 | // DBInfo provides meta data describing a DB. 35 | type DBInfo struct { 36 | ID int64 `json:"id"` 37 | Name CIStr `json:"db_name"` 38 | State SchemaState `json:"state"` 39 | } 40 | 41 | // IndexInfo provides meta data describing a DB index. 42 | // It corresponds to the statement `CREATE INDEX Name ON Table (Column);` 43 | // See https://dev.mysql.com/doc/refman/5.7/en/create-index.html 44 | type IndexInfo struct { 45 | ID int64 `json:"id"` 46 | Name CIStr `json:"idx_name"` 47 | } 48 | 49 | // PartitionDefinition defines a single partition. 50 | type PartitionDefinition struct { 51 | ID int64 `json:"id"` 52 | Name CIStr `json:"name"` 53 | } 54 | 55 | // PartitionInfo provides table partition info. 56 | type PartitionInfo struct { 57 | // User may already creates table with partition but table partition is not 58 | // yet supported back then. When Enable is true, write/read need use tid 59 | // rather than pid. 60 | Enable bool `json:"enable"` 61 | Definitions []*PartitionDefinition `json:"definitions"` 62 | } 63 | 64 | // TableInfo provides meta data describing a DB table. 65 | type TableInfo struct { 66 | ID int64 `json:"id"` 67 | Name CIStr `json:"name"` 68 | Indices []*IndexInfo `json:"index_info"` 69 | Partition *PartitionInfo `json:"partition"` 70 | } 71 | 72 | // GetPartitionInfo returns the partition information. 73 | func (t *TableInfo) GetPartitionInfo() *PartitionInfo { 74 | if t.Partition != nil && t.Partition.Enable { 75 | return t.Partition 76 | } 77 | return nil 78 | } 79 | 80 | type DBTablesInfo struct { 81 | DB DBInfo `json:"db"` 82 | Tables []TableInfo `json:"tables"` 83 | } 84 | 85 | type DBTableInfo struct { 86 | DB DBInfo 87 | Table IndexedTableInfo 88 | } 89 | 90 | type IndexedTableInfo struct { 91 | ID int64 92 | Name CIStr 93 | Indices map[int64]string 94 | } 95 | 96 | type TableDetail struct { 97 | Name string 98 | DB string 99 | ID int64 100 | Indices map[int64]string 101 | } 102 | -------------------------------------------------------------------------------- /component/subscriber/sub_controller.go: -------------------------------------------------------------------------------- 1 | package subscriber 2 | 3 | import ( 4 | "context" 5 | "net/http" 6 | "sync" 7 | 8 | "github.com/pingcap/ng-monitoring/component/topology" 9 | "github.com/pingcap/ng-monitoring/config" 10 | "github.com/pingcap/ng-monitoring/config/pdvariable" 11 | ) 12 | 13 | type SubscribeController interface { 14 | ScraperFactory 15 | 16 | Name() string 17 | IsEnabled() bool 18 | UpdatePDVariable(pdvariable.PDVariable) 19 | UpdateConfig(config.Config) 20 | UpdateTopology([]topology.Component) 21 | NewHTTPClient() *http.Client 22 | } 23 | 24 | type ScraperFactory interface { 25 | NewScraper(ctx context.Context, component topology.Component, schemaInfo *sync.Map) Scraper 26 | } 27 | 28 | type Scraper interface { 29 | Run() 30 | IsDown() bool 31 | Close() 32 | } 33 | -------------------------------------------------------------------------------- /component/subscriber/subscriber.go: -------------------------------------------------------------------------------- 1 | package subscriber 2 | 3 | import ( 4 
| "context" 5 | "fmt" 6 | "sync" 7 | 8 | "github.com/pingcap/ng-monitoring/component/domain" 9 | "github.com/pingcap/ng-monitoring/component/topology" 10 | "github.com/pingcap/ng-monitoring/config" 11 | "github.com/pingcap/ng-monitoring/config/pdvariable" 12 | "github.com/pingcap/ng-monitoring/utils" 13 | 14 | "github.com/pingcap/log" 15 | ) 16 | 17 | type Subscriber struct { 18 | ctx context.Context 19 | cancel context.CancelFunc 20 | wg *sync.WaitGroup 21 | subscribeController SubscribeController 22 | } 23 | 24 | func NewSubscriber( 25 | do *domain.Domain, 26 | topoSubscriber topology.Subscriber, 27 | varSubscriber pdvariable.Subscriber, 28 | cfgSubscriber config.Subscriber, 29 | subscribeController SubscribeController, 30 | ) *Subscriber { 31 | ctx, cancel := context.WithCancel(context.Background()) 32 | wg := &sync.WaitGroup{} 33 | 34 | wg.Add(1) 35 | sm := NewManager(ctx, wg, do, varSubscriber, topoSubscriber, cfgSubscriber, subscribeController) 36 | go utils.GoWithRecovery(func() { 37 | defer wg.Done() 38 | sm.Run() 39 | }, nil) 40 | 41 | return &Subscriber{ 42 | ctx: ctx, 43 | cancel: cancel, 44 | wg: wg, 45 | subscribeController: subscribeController, 46 | } 47 | } 48 | 49 | func (s *Subscriber) Close() { 50 | log.Info(fmt.Sprintf("stopping %s scrapers", s.subscribeController.Name())) 51 | s.cancel() 52 | s.wg.Wait() 53 | log.Info(fmt.Sprintf("stop %s scrapers successfully", s.subscribeController.Name())) 54 | } 55 | -------------------------------------------------------------------------------- /component/topology/discovery.go: -------------------------------------------------------------------------------- 1 | package topology 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "strconv" 7 | "strings" 8 | "sync" 9 | "sync/atomic" 10 | "time" 11 | 12 | "github.com/pingcap/ng-monitoring/component/domain" 13 | "github.com/pingcap/ng-monitoring/utils" 14 | clientv3 "go.etcd.io/etcd/client/v3" 15 | 16 | "github.com/pingcap/log" 17 | "github.com/pingcap/tidb-dashboard/util/topo" 18 | "github.com/pingcap/tidb-dashboard/util/topo/pdtopo" 19 | "go.uber.org/zap" 20 | ) 21 | 22 | const ( 23 | ComponentTiDB = "tidb" 24 | ComponentTiKV = "tikv" 25 | ComponentTiFlash = "tiflash" 26 | ComponentPD = "pd" 27 | ComponentTiCDC = "ticdc" 28 | ) 29 | 30 | var ( 31 | discoverInterval = time.Second * 30 32 | ) 33 | 34 | type TopologyDiscoverer struct { 35 | sync.Mutex 36 | do *domain.Domain 37 | subscriber []Subscriber 38 | components atomic.Value 39 | closed chan struct{} 40 | } 41 | 42 | type Component struct { 43 | Name string `json:"name"` 44 | IP string `json:"ip"` 45 | Port uint `json:"port"` 46 | StatusPort uint `json:"status_port"` 47 | } 48 | 49 | type Subscriber = chan GetLatestTopology 50 | type GetLatestTopology = func() []Component 51 | 52 | func NewTopologyDiscoverer(do *domain.Domain) (*TopologyDiscoverer, error) { 53 | d := &TopologyDiscoverer{ 54 | do: do, 55 | closed: make(chan struct{}), 56 | } 57 | return d, nil 58 | } 59 | 60 | func (d *TopologyDiscoverer) Subscribe() Subscriber { 61 | ch := make(Subscriber, 1) 62 | d.Lock() 63 | d.subscriber = append(d.subscriber, ch) 64 | ch <- d.load 65 | d.Unlock() 66 | return ch 67 | } 68 | 69 | func (d *TopologyDiscoverer) Start() { 70 | go utils.GoWithRecovery(d.loadTopologyLoop, nil) 71 | } 72 | 73 | func (d *TopologyDiscoverer) Close() error { 74 | close(d.closed) 75 | return nil 76 | } 77 | 78 | func (d *TopologyDiscoverer) loadTopologyLoop() { 79 | err := d.fetchTopology() 80 | log.Info("first load topology", zap.Reflect("component", 
d.components), zap.Error(err)) 81 | ticker := time.NewTicker(discoverInterval) 82 | defer ticker.Stop() 83 | for { 84 | select { 85 | case <-d.closed: 86 | return 87 | case <-ticker.C: 88 | err = d.fetchTopology() 89 | if err != nil { 90 | log.Error("load topology failed", zap.Error(err)) 91 | } else { 92 | log.Debug("load topology success", zap.Reflect("component", d.components)) 93 | } 94 | d.notifySubscriber() 95 | } 96 | } 97 | } 98 | 99 | func (d *TopologyDiscoverer) fetchTopology() error { 100 | ctx, cancel := context.WithTimeout(context.Background(), discoverInterval) 101 | defer cancel() 102 | components, err := d.fetchAllScrapeTargets(ctx) 103 | if err != nil { 104 | return err 105 | } 106 | d.components.Store(components) 107 | return nil 108 | } 109 | 110 | func (d *TopologyDiscoverer) load() []Component { 111 | v := d.components.Load() 112 | if v == nil { 113 | return nil 114 | } 115 | return d.components.Load().([]Component) 116 | } 117 | 118 | func (d *TopologyDiscoverer) notifySubscriber() { 119 | d.Lock() 120 | for _, ch := range d.subscriber { 121 | select { 122 | case ch <- d.load: 123 | default: 124 | } 125 | } 126 | d.Unlock() 127 | } 128 | 129 | func (d *TopologyDiscoverer) fetchAllScrapeTargets(ctx context.Context) ([]Component, error) { 130 | fns := []func(context.Context) ([]Component, error){ 131 | d.getTiDBComponents, 132 | d.getPDComponents, 133 | d.getStoreComponents, 134 | d.getTiCDCComponents, 135 | } 136 | components := make([]Component, 0, 8) 137 | for _, fn := range fns { 138 | nodes, err := fn(ctx) 139 | if err != nil { 140 | return nil, err 141 | } 142 | components = append(components, nodes...) 143 | } 144 | return components, nil 145 | } 146 | 147 | func (d *TopologyDiscoverer) getTiDBComponents(ctx context.Context) ([]Component, error) { 148 | etcdCli, err := d.do.GetEtcdClient() 149 | if err != nil { 150 | return nil, err 151 | } 152 | instances, err := pdtopo.GetTiDBInstances(ctx, etcdCli) 153 | if err != nil { 154 | return nil, err 155 | } 156 | components := make([]Component, 0, len(instances)) 157 | for _, instance := range instances { 158 | if instance.Status != topo.CompStatusUp { 159 | continue 160 | } 161 | components = append(components, Component{ 162 | Name: ComponentTiDB, 163 | IP: instance.IP, 164 | Port: instance.Port, 165 | StatusPort: instance.StatusPort, 166 | }) 167 | } 168 | return components, nil 169 | } 170 | 171 | func (d *TopologyDiscoverer) getPDComponents(ctx context.Context) ([]Component, error) { 172 | pdCli, err := d.do.GetPDClient() 173 | if err != nil { 174 | return nil, err 175 | } 176 | instances, err := pdtopo.GetPDInstances(ctx, pdCli) 177 | if err != nil { 178 | return nil, err 179 | } 180 | components := make([]Component, 0, len(instances)) 181 | for _, instance := range instances { 182 | if instance.Status != topo.CompStatusUp { 183 | continue 184 | } 185 | components = append(components, Component{ 186 | Name: ComponentPD, 187 | IP: instance.IP, 188 | Port: instance.Port, 189 | StatusPort: instance.Port, 190 | }) 191 | } 192 | return components, nil 193 | } 194 | 195 | func (d *TopologyDiscoverer) getStoreComponents(ctx context.Context) ([]Component, error) { 196 | pdCli, err := d.do.GetPDClient() 197 | if err != nil { 198 | return nil, err 199 | } 200 | tikvInstances, tiflashInstances, err := pdtopo.GetStoreInstances(ctx, pdCli) 201 | if err != nil { 202 | return nil, err 203 | } 204 | components := make([]Component, 0, len(tikvInstances)+len(tiflashInstances)) 205 | for _, instance := range tikvInstances { 206 | 
if instance.Status != topo.CompStatusUp { 207 | continue 208 | } 209 | components = append(components, Component{ 210 | Name: ComponentTiKV, 211 | IP: instance.IP, 212 | Port: instance.Port, 213 | StatusPort: instance.StatusPort, 214 | }) 215 | } 216 | for _, instance := range tiflashInstances { 217 | if instance.Status != topo.CompStatusUp { 218 | continue 219 | } 220 | components = append(components, Component{ 221 | Name: ComponentTiFlash, 222 | IP: instance.IP, 223 | Port: instance.Port, 224 | StatusPort: instance.StatusPort, 225 | }) 226 | } 227 | return components, nil 228 | } 229 | 230 | func (d *TopologyDiscoverer) getTiCDCComponents(ctx context.Context) ([]Component, error) { 231 | etcdCli, err := d.do.GetEtcdClient() 232 | if err != nil { 233 | return nil, err 234 | } 235 | return getTiCDCComponents(ctx, etcdCli) 236 | } 237 | 238 | const ticdcTopologyKeyPrefix = "/tidb/cdc/default/__cdc_meta__/capture/" 239 | 240 | type ticdcNodeItem struct { 241 | ID string `json:"id"` 242 | Address string `json:"address"` 243 | Version string `json:"version"` 244 | } 245 | 246 | func getTiCDCComponents(ctx context.Context, etcdCli *clientv3.Client) ([]Component, error) { 247 | resp, err := etcdCli.Get(ctx, ticdcTopologyKeyPrefix, clientv3.WithPrefix()) 248 | if err != nil { 249 | return nil, err 250 | } 251 | components := make([]Component, 0, 3) 252 | for _, kv := range resp.Kvs { 253 | key := string(kv.Key) 254 | if !strings.HasPrefix(key, ticdcTopologyKeyPrefix) { 255 | continue 256 | } 257 | var item ticdcNodeItem 258 | if err := json.Unmarshal(kv.Value, &item); err != nil { 259 | log.Warn("invalid ticdc node item in etcd", zap.Error(err)) 260 | continue 261 | } 262 | arr := strings.Split(item.Address, ":") 263 | if len(arr) != 2 { 264 | log.Warn("invalid ticdc node address in etcd", zap.String("address", item.Address)) 265 | continue 266 | } 267 | ip := arr[0] 268 | port, err := strconv.Atoi(arr[1]) 269 | if err != nil { 270 | log.Warn("invalid ticdc node address in etcd", 271 | zap.Error(err), 272 | zap.String("address", item.Address)) 273 | continue 274 | } 275 | components = append(components, Component{ 276 | Name: ComponentTiCDC, 277 | IP: ip, 278 | Port: uint(port), 279 | StatusPort: uint(port), 280 | }) 281 | } 282 | return components, nil 283 | } 284 | -------------------------------------------------------------------------------- /component/topology/syncer.go: -------------------------------------------------------------------------------- 1 | package topology 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "net" 8 | "strconv" 9 | "time" 10 | 11 | "github.com/pingcap/ng-monitoring/component/domain" 12 | "github.com/pingcap/ng-monitoring/config" 13 | "github.com/pingcap/ng-monitoring/utils" 14 | "github.com/pingcap/ng-monitoring/utils/printer" 15 | 16 | "github.com/pingcap/log" 17 | clientv3 "go.etcd.io/etcd/client/v3" 18 | "go.etcd.io/etcd/client/v3/concurrency" 19 | "go.uber.org/zap" 20 | ) 21 | 22 | const ( 23 | topologyPrefix = "/topology/ng-monitoring" 24 | defaultRetryCnt = 3 25 | defaultTimeout = 2 * time.Second 26 | defRetryInterval = 30 * time.Millisecond 27 | newSessionRetryInterval = 200 * time.Millisecond 28 | logIntervalCnt = int(3 * time.Second / newSessionRetryInterval) 29 | topologySessionTTL = 45 30 | ) 31 | 32 | var ( 33 | topologyTimeToRefresh = 30 * time.Second 34 | ) 35 | 36 | type TopologySyncer struct { 37 | do *domain.Domain 38 | topologySession *concurrency.Session 39 | serverInfo *ServerInfo 40 | ctx context.Context 41 | cancel 
context.CancelFunc 42 | } 43 | 44 | func NewTopologySyncer(do *domain.Domain) *TopologySyncer { 45 | syncer := &TopologySyncer{ 46 | do: do, 47 | serverInfo: getServerInfo(), 48 | } 49 | syncer.ctx, syncer.cancel = context.WithCancel(context.Background()) 50 | return syncer 51 | } 52 | 53 | func (s *TopologySyncer) Start() { 54 | go utils.GoWithRecovery(s.topologyInfoKeeperLoop, nil) 55 | } 56 | 57 | func (s *TopologySyncer) topologyInfoKeeperLoop() { 58 | err := syncer.newTopologySessionAndStoreServerInfo() 59 | if err != nil { 60 | log.Error("store topology into etcd failed", zap.Error(err)) 61 | } 62 | ticker := time.NewTicker(topologyTimeToRefresh) 63 | defer ticker.Stop() 64 | for { 65 | select { 66 | case <-ticker.C: 67 | err := s.storeTopologyInfo() 68 | if err != nil { 69 | log.Error("refresh topology in loop failed", zap.Error(err)) 70 | } 71 | case <-s.topologySessionDone(): 72 | log.Info("server topology syncer need to restart") 73 | if err := s.newTopologySessionAndStoreServerInfo(); err != nil { 74 | log.Error("server topology syncer restart failed", zap.Error(err)) 75 | } else { 76 | log.Info("server topology syncer restarted") 77 | } 78 | case <-s.ctx.Done(): 79 | return 80 | } 81 | } 82 | } 83 | 84 | func (s *TopologySyncer) newTopologySessionAndStoreServerInfo() error { 85 | etcdCli, err := s.do.GetEtcdClient() 86 | if err != nil { 87 | return err 88 | } 89 | session, err := newEtcdSession(s.ctx, etcdCli, defaultRetryCnt, topologySessionTTL) 90 | if err != nil { 91 | return err 92 | } 93 | s.topologySession = session 94 | 95 | err = s.storeServerInfo(etcdCli) 96 | if err != nil { 97 | return err 98 | } 99 | 100 | return s.storeTopologyInfo() 101 | } 102 | 103 | func (s *TopologySyncer) storeServerInfo(etcdCli *clientv3.Client) error { 104 | cfg := config.GetGlobalConfig() 105 | key := fmt.Sprintf("%s/%s/info", topologyPrefix, cfg.AdvertiseAddress) 106 | infoBuf, err := json.Marshal(s.serverInfo) 107 | if err != nil { 108 | return err 109 | } 110 | value := string(infoBuf) 111 | // Note: no lease is required here. 112 | err = putKVToEtcd(s.ctx, etcdCli, defaultRetryCnt, key, value) 113 | return err 114 | } 115 | 116 | func (s *TopologySyncer) storeTopologyInfo() error { 117 | cfg := config.GetGlobalConfig() 118 | 119 | key := fmt.Sprintf("%s/%s/ttl", topologyPrefix, cfg.AdvertiseAddress) 120 | 121 | etcdCli, err := s.do.GetEtcdClient() 122 | if err != nil { 123 | return err 124 | } 125 | return putKVToEtcd(s.ctx, etcdCli, defaultRetryCnt, key, 126 | fmt.Sprintf("%v", time.Now().UnixNano()), 127 | clientv3.WithLease(s.topologySession.Lease())) 128 | } 129 | 130 | func (s *TopologySyncer) topologySessionDone() <-chan struct{} { 131 | if s.topologySession == nil { 132 | return make(chan struct{}, 1) 133 | } 134 | return s.topologySession.Done() 135 | } 136 | 137 | func (s *TopologySyncer) Stop() { 138 | s.cancel() 139 | } 140 | 141 | func putKVToEtcd(ctx context.Context, etcdCli *clientv3.Client, retryCnt int, key, val string, 142 | opts ...clientv3.OpOption) error { 143 | var err error 144 | for i := 0; i < retryCnt; i++ { 145 | if isContextDone(ctx) { 146 | return ctx.Err() 147 | } 148 | 149 | childCtx, cancel := context.WithTimeout(ctx, defaultTimeout) 150 | _, err = etcdCli.Put(childCtx, key, val, opts...) 
151 | cancel() 152 | if err == nil { 153 | return nil 154 | } 155 | log.Warn("[syncer] etcd-cli put kv failed", zap.String("key", key), zap.String("value", val), zap.Error(err), zap.Int("retryCnt", i)) 156 | time.Sleep(defRetryInterval) 157 | } 158 | return err 159 | } 160 | 161 | func newEtcdSession(ctx context.Context, etcdCli *clientv3.Client, retryCnt, ttl int) (*concurrency.Session, error) { 162 | var err error 163 | var etcdSession *concurrency.Session 164 | failedCnt := 0 165 | for i := 0; i < retryCnt; i++ { 166 | if isContextDone(ctx) { 167 | return etcdSession, ctx.Err() 168 | } 169 | 170 | etcdSession, err = concurrency.NewSession(etcdCli, 171 | concurrency.WithTTL(ttl), concurrency.WithContext(ctx)) 172 | if err == nil { 173 | break 174 | } 175 | if failedCnt%logIntervalCnt == 0 { 176 | log.Warn("failed to new session to etcd", zap.Error(err)) 177 | } 178 | 179 | time.Sleep(newSessionRetryInterval) 180 | failedCnt++ 181 | } 182 | return etcdSession, err 183 | } 184 | 185 | func isContextDone(ctx context.Context) bool { 186 | select { 187 | case <-ctx.Done(): 188 | return true 189 | default: 190 | } 191 | return false 192 | } 193 | 194 | // ServerInfo is server static information. 195 | // It will not be updated when server running. 196 | // So please only put static information in ServerInfo struct. 197 | type ServerInfo struct { 198 | GitHash string `json:"git_hash"` 199 | IP string `json:"ip"` 200 | Port uint64 `json:"listening_port"` 201 | StartTimestamp int64 `json:"start_timestamp"` 202 | } 203 | 204 | func getServerInfo() *ServerInfo { 205 | info := &ServerInfo{ 206 | GitHash: printer.NGMGitHash, 207 | IP: "", 208 | Port: 0, 209 | StartTimestamp: time.Now().Unix(), 210 | } 211 | cfg := config.GetGlobalConfig() 212 | host, port, err := net.SplitHostPort(cfg.AdvertiseAddress) 213 | if err == nil { 214 | info.IP = host 215 | info.Port, _ = strconv.ParseUint(port, 10, 64) 216 | } 217 | return info 218 | } 219 | -------------------------------------------------------------------------------- /component/topology/topology.go: -------------------------------------------------------------------------------- 1 | package topology 2 | 3 | import ( 4 | "github.com/pingcap/ng-monitoring/component/domain" 5 | ) 6 | 7 | var ( 8 | discover *TopologyDiscoverer 9 | syncer *TopologySyncer 10 | ) 11 | 12 | func Init(do *domain.Domain) error { 13 | var err error 14 | discover, err = NewTopologyDiscoverer(do) 15 | if err != nil { 16 | return err 17 | } 18 | syncer = NewTopologySyncer(do) 19 | syncer.Start() 20 | discover.Start() 21 | return err 22 | } 23 | 24 | func InitForTest(comps []Component) { 25 | discover = &TopologyDiscoverer{} 26 | discover.components.Store(comps) 27 | } 28 | 29 | func GetCurrentComponent() []Component { 30 | if discover == nil { 31 | return nil 32 | } 33 | return discover.load() 34 | } 35 | 36 | func Subscribe() Subscriber { 37 | return discover.Subscribe() 38 | } 39 | 40 | func Stop() { 41 | if syncer != nil { 42 | syncer.Stop() 43 | } 44 | if discover != nil { 45 | _ = discover.Close() 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /component/topology/topology_test.go: -------------------------------------------------------------------------------- 1 | package topology 2 | 3 | import ( 4 | "context" 5 | "runtime" 6 | "strconv" 7 | "testing" 8 | "time" 9 | 10 | "github.com/pingcap/ng-monitoring/component/domain" 11 | "github.com/pingcap/ng-monitoring/config" 12 | "github.com/pingcap/ng-monitoring/utils/printer" 13 
| "github.com/pingcap/ng-monitoring/utils/testutil" 14 | 15 | "github.com/stretchr/testify/require" 16 | clientv3 "go.etcd.io/etcd/client/v3" 17 | "go.etcd.io/etcd/tests/v3/integration" 18 | ) 19 | 20 | func TestBasic(t *testing.T) { 21 | cfg := config.GetDefaultConfig() 22 | config.StoreGlobalConfig(cfg) 23 | do := domain.NewDomain() 24 | defer do.Close() 25 | err := Init(do) 26 | require.NoError(t, err) 27 | comps := GetCurrentComponent() 28 | require.Equal(t, len(comps), 0) 29 | 30 | InitForTest([]Component{ 31 | {Name: ComponentTiDB}, 32 | }) 33 | comps = GetCurrentComponent() 34 | require.Equal(t, len(comps), 1) 35 | require.Equal(t, comps[0].Name, ComponentTiDB) 36 | } 37 | 38 | func TestTopology(t *testing.T) { 39 | if runtime.GOOS == "windows" { 40 | t.Skip("integration.NewClusterV3 will create file contains a colon which is not allowed on Windows") 41 | } 42 | 43 | integration.BeforeTestExternal(t) 44 | cluster := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 1}) 45 | defer cluster.Terminate(t) 46 | mockPD := testutil.MockPDHTTPServer{} 47 | mockPD.Setup(t) 48 | defer mockPD.Close(t) 49 | 50 | cfg := config.GetDefaultConfig() 51 | cfg.PD.Endpoints = nil 52 | cfg.AdvertiseAddress = "10.0.1.8:12020" 53 | _, err := domain.CreatePDClient(&cfg) 54 | require.NotNil(t, err) 55 | require.Equal(t, "need specify pd endpoints", err.Error()) 56 | cfg.PD.Endpoints = []string{mockPD.Addr} 57 | config.StoreGlobalConfig(cfg) 58 | mockPD.Health = false 59 | _, err = domain.CreatePDClient(&cfg) 60 | require.NotNil(t, err) 61 | require.Contains(t, err.Error(), "Response status 503") 62 | mockPD.Health = true 63 | pdCli, err := domain.CreatePDClient(&cfg) 64 | require.NoError(t, err) 65 | 66 | do := domain.NewDomainForTest(pdCli, cluster.RandClient()) 67 | discover = &TopologyDiscoverer{ 68 | do: do, 69 | closed: make(chan struct{}), 70 | } 71 | err = discover.fetchTopology() 72 | require.NoError(t, err) 73 | 74 | sub := discover.Subscribe() 75 | discoverInterval = time.Millisecond * 100 76 | discover.Start() 77 | 78 | getComponents := <-sub 79 | components := getComponents() 80 | require.Equal(t, len(components), 2) 81 | require.Equal(t, components[0].Name, "pd") 82 | require.Equal(t, components[1].Name, "tikv") 83 | 84 | // test syncer 85 | printer.NGMGitHash = "b225682e6660cb617b8f4ccc77da252f845f411c" 86 | syncer = NewTopologySyncer(do) 87 | require.Equal(t, "10.0.1.8", syncer.serverInfo.IP) 88 | require.Equal(t, uint64(12020), syncer.serverInfo.Port) 89 | require.Equal(t, "b225682e6660cb617b8f4ccc77da252f845f411c", syncer.serverInfo.GitHash) 90 | require.True(t, syncer.serverInfo.StartTimestamp > 0) 91 | syncer.serverInfo.StartTimestamp = 1639643120 92 | err = syncer.newTopologySessionAndStoreServerInfo() 93 | require.NoError(t, err) 94 | err = syncer.storeTopologyInfo() 95 | require.NoError(t, err) 96 | 97 | etcdCli := cluster.RandClient() 98 | check := func() { 99 | // get ngm topology from etcd. 100 | resp, err := etcdCli.Get(context.Background(), "/topology/ng-monitoring/10.0.1.8:12020/ttl") 101 | require.NoError(t, err) 102 | require.Equal(t, 1, int(resp.Count)) 103 | require.Equal(t, "/topology/ng-monitoring/10.0.1.8:12020/ttl", string(resp.Kvs[0].Key)) 104 | ts, err := strconv.Atoi(string(resp.Kvs[0].Value)) 105 | require.NoError(t, err) 106 | require.True(t, ts > 0) 107 | 108 | // get ngm server info from etcd. 
109 | resp, err = etcdCli.Get(context.Background(), "/topology/ng-monitoring/10.0.1.8:12020/info") 110 | require.NoError(t, err) 111 | require.Equal(t, 1, int(resp.Count)) 112 | require.Equal(t, "/topology/ng-monitoring/10.0.1.8:12020/info", string(resp.Kvs[0].Key)) 113 | require.Equal(t, `{"git_hash":"b225682e6660cb617b8f4ccc77da252f845f411c","ip":"10.0.1.8","listening_port":12020,"start_timestamp":1639643120}`, string(resp.Kvs[0].Value)) 114 | } 115 | check() 116 | 117 | // test syncer sync. 118 | topologyTimeToRefresh = time.Millisecond * 10 119 | syncer.Start() 120 | respd, err := etcdCli.Delete(context.Background(), topologyPrefix, clientv3.WithPrefix()) 121 | require.NoError(t, err) 122 | require.Equal(t, 2, int(respd.Deleted)) 123 | // wait syncer to restore the info. 124 | time.Sleep(time.Millisecond * 100) 125 | resp, err := etcdCli.Get(context.Background(), topologyPrefix, clientv3.WithPrefix()) 126 | require.NoError(t, err) 127 | require.Equal(t, 2, int(resp.Count)) 128 | check() 129 | Stop() 130 | 131 | // Test invalid address 132 | cfg.AdvertiseAddress = "abcd" 133 | config.StoreGlobalConfig(cfg) 134 | serverInfo := getServerInfo() 135 | require.Equal(t, "", serverInfo.IP) 136 | require.Equal(t, uint64(0), serverInfo.Port) 137 | cfg.AdvertiseAddress = "abcd:x" 138 | config.StoreGlobalConfig(cfg) 139 | serverInfo = getServerInfo() 140 | require.Equal(t, "abcd", serverInfo.IP) 141 | require.Equal(t, uint64(0), serverInfo.Port) 142 | } 143 | -------------------------------------------------------------------------------- /component/topsql/codec/plan/plan.go: -------------------------------------------------------------------------------- 1 | package plan 2 | 3 | import ( 4 | "encoding/base64" 5 | 6 | "github.com/golang/snappy" 7 | "github.com/pingcap/tidb/pkg/util/plancodec" 8 | ) 9 | 10 | func Decode(planString string) (string, error) { 11 | binaryPlan, err := decompress(planString) 12 | if err != nil { 13 | return "", err 14 | } 15 | 16 | return plancodec.DecodeNormalizedPlan(binaryPlan) 17 | } 18 | 19 | func decompress(str string) (string, error) { 20 | decodeBytes, err := base64.StdEncoding.DecodeString(str) 21 | if err != nil { 22 | return "", err 23 | } 24 | 25 | bs, err := snappy.Decode(nil, decodeBytes) 26 | if err != nil { 27 | return "", err 28 | } 29 | return string(bs), nil 30 | } 31 | -------------------------------------------------------------------------------- /component/topsql/codec/plan/plan_test.go: -------------------------------------------------------------------------------- 1 | package plan_test 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | 7 | "github.com/pingcap/ng-monitoring/component/topsql/codec/plan" 8 | 9 | "github.com/stretchr/testify/require" 10 | ) 11 | 12 | func TestBigPlan(t *testing.T) { 13 | t.Parallel() 14 | 15 | bigEncodedPlan, err := os.ReadFile("testdata/big_encoded_plan.txt") 16 | require.NoError(t, err) 17 | 18 | planText, err := plan.Decode(string(bigEncodedPlan)) 19 | require.NoError(t, err) 20 | 21 | bigDecodedPlan, err := os.ReadFile("testdata/big_decoded_plan.txt") 22 | require.NoError(t, err) 23 | require.Equal(t, planText, string(bigDecodedPlan)) 24 | } 25 | 26 | func TestSmallPlan(t *testing.T) { 27 | t.Parallel() 28 | 29 | encodedPlan := "WrAwCTM4CTAJdGFibGU6R0xPQkFMX1ZBUklBQkxFUywgaW5kZXg6UFJJTUFSWSgRGZBfTkFNRSksIGtlZXAgb3JkZXI6ZmFsc2UsIGRlc2M6ZmFsc2UK" 30 | expectedPlanText := "\tBatch_Point_Get\troot\ttable:GLOBAL_VARIABLES, index:PRIMARY(VARIABLE_NAME), keep order:false, desc:false" 31 | 32 | planText, err := 
plan.Decode(encodedPlan) 33 | require.NoError(t, err) 34 | require.Equal(t, planText, expectedPlanText) 35 | } 36 | -------------------------------------------------------------------------------- /component/topsql/codec/plan/testdata/big_encoded_plan.txt: -------------------------------------------------------------------------------- 1 | j+ICWDAJNgkwCWZ1bmNzOnN1bSg/KS0+PywgfhEAsAoxCTMJMAljYXN0KHRlc3QudC5hLCBkZWNpbWFsKDIwLDApIEJJTkFSWSksIP4mABkmdAoyCTE3CTAJQ0FSVEVTSUFOIGlubmVyIGpvaW4KM2ocAAA0ahwACDUJNQHMQAo2CTMxCTAJCjcJMQkxCW5lHdnwPj8pCjgJNDMJMQl0YWJsZTp0MSwgcGFydGl0aW9uOj8sIHJhbmdlOls/LD9dLCBrZWVwIG9yZGVyOmZhbHNlCv5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aADZaAM4wIwAy/jAjYjAj/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loA/loAhloA/nxGPnxGADP+TCNiTCP+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgCGWgDOfEYANP4wI2IwI/5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAP5aAIZaAAAzGtyMADQa3IwANVLcjAA2MtyMADWyMCP+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgD+WgA2WgA= 2 | 
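Note: the fixture above, together with plan.Decode and decompress earlier in this package, fixes the outer wire format of encoded plans: a snappy-compressed payload wrapped in standard base64, where the payload itself is the normalized plan understood by plancodec. The standalone sketch below (not part of the repository; the fixture path and working directory are assumptions) unpacks the fixture and checks that the framing layer round-trips. It illustrates only the base64+snappy framing, not a way to build payloads that DecodeNormalizedPlan accepts.

package main

import (
	"bytes"
	"encoding/base64"
	"fmt"
	"os"

	"github.com/golang/snappy"
)

// unwrap mirrors decompress in plan.go: standard base64 decode, then snappy decode.
func unwrap(encoded string) ([]byte, error) {
	raw, err := base64.StdEncoding.DecodeString(encoded) // the base64 decoder skips \r and \n
	if err != nil {
		return nil, err
	}
	return snappy.Decode(nil, raw)
}

// wrap applies the inverse framing: snappy encode, then standard base64 encode.
func wrap(payload []byte) string {
	return base64.StdEncoding.EncodeToString(snappy.Encode(nil, payload))
}

func main() {
	// Assumes it is run from component/topsql/codec/plan so the fixture path resolves.
	data, err := os.ReadFile("testdata/big_encoded_plan.txt")
	if err != nil {
		fmt.Println(err)
		return
	}
	payload, err := unwrap(string(data))
	if err != nil {
		fmt.Println(err)
		return
	}
	again, err := unwrap(wrap(payload))
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println("payload bytes:", len(payload), "round-trip ok:", bytes.Equal(payload, again))
}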
-------------------------------------------------------------------------------- /component/topsql/codec/resource_group_tag/resource_group_tag.go: -------------------------------------------------------------------------------- 1 | package resource_group_tag 2 | 3 | import "github.com/pingcap/tipb/go-tipb" 4 | 5 | func Decode(encoded []byte) (tipb.ResourceGroupTag, error) { 6 | tag := tipb.ResourceGroupTag{} 7 | err := tag.Unmarshal(encoded) 8 | return tag, err 9 | } 10 | -------------------------------------------------------------------------------- /component/topsql/mock/mem_store.go: -------------------------------------------------------------------------------- 1 | package mock 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | 7 | "github.com/pingcap/ng-monitoring/component/topsql/store" 8 | 9 | rsmetering "github.com/pingcap/kvproto/pkg/resource_usage_agent" 10 | "github.com/pingcap/tipb/go-tipb" 11 | ) 12 | 13 | type Component struct { 14 | Name string 15 | Addr string 16 | } 17 | 18 | type MemStore struct { 19 | sync.Mutex 20 | 21 | // instance -> value 22 | InstanceStore map[Component]struct{} 23 | 24 | // instance -> sql digest -> plan digest -> records 25 | TopSQLRecords map[string]map[string]map[string]*tipb.TopSQLRecord 26 | 27 | // instance -> resource tag -> records 28 | ResourceMeteringRecords map[string]map[string]*rsmetering.ResourceUsageRecord 29 | 30 | // SQL digest -> meta 31 | SQLMetas map[string]struct { 32 | Meta *tipb.SQLMeta 33 | } 34 | 35 | // plan digest -> meta 36 | PlanMetas map[string]struct { 37 | Meta *tipb.PlanMeta 38 | } 39 | } 40 | 41 | func NewMemStore() *MemStore { 42 | return &MemStore{ 43 | InstanceStore: make(map[Component]struct{}), 44 | TopSQLRecords: make(map[string]map[string]map[string]*tipb.TopSQLRecord), 45 | ResourceMeteringRecords: make(map[string]map[string]*rsmetering.ResourceUsageRecord), 46 | SQLMetas: make(map[string]struct { 47 | Meta *tipb.SQLMeta 48 | }), 49 | PlanMetas: make(map[string]struct { 50 | Meta *tipb.PlanMeta 51 | }), 52 | } 53 | } 54 | 55 | func (m *MemStore) Pred(pred func(*MemStore) bool, beginWaitTime time.Duration, maxWaitTime time.Duration) bool { 56 | begin := time.Now() 57 | timeToWait := beginWaitTime 58 | 59 | for { 60 | passed := func() bool { 61 | m.Lock() 62 | defer m.Unlock() 63 | 64 | return pred(m) 65 | }() 66 | 67 | waitedTime := time.Since(begin) 68 | if passed { 69 | return true 70 | } else if waitedTime >= maxWaitTime { 71 | return false 72 | } 73 | 74 | if waitedTime+timeToWait > maxWaitTime { 75 | timeToWait = maxWaitTime - waitedTime 76 | } 77 | time.Sleep(timeToWait) 78 | timeToWait *= 2 79 | } 80 | } 81 | 82 | var _ store.Store = &MemStore{} 83 | 84 | func (m *MemStore) Instances(items []store.InstanceItem) error { 85 | m.Lock() 86 | for _, item := range items { 87 | m.InstanceStore[Component{ 88 | Name: item.InstanceType, 89 | Addr: item.Instance, 90 | }] = struct{}{} 91 | } 92 | m.Unlock() 93 | 94 | return nil 95 | } 96 | 97 | func (m *MemStore) TopSQLRecord(instance, _ string, record *tipb.TopSQLRecord) error { 98 | m.Lock() 99 | defer m.Unlock() 100 | 101 | if _, ok := m.TopSQLRecords[instance]; !ok { 102 | m.TopSQLRecords[instance] = make(map[string]map[string]*tipb.TopSQLRecord) 103 | } 104 | if _, ok := m.TopSQLRecords[instance][string(record.SqlDigest)]; !ok { 105 | m.TopSQLRecords[instance][string(record.SqlDigest)] = make(map[string]*tipb.TopSQLRecord) 106 | } 107 | if _, ok := m.TopSQLRecords[instance][string(record.SqlDigest)][string(record.PlanDigest)]; !ok { 108 | 
m.TopSQLRecords[instance][string(record.SqlDigest)][string(record.PlanDigest)] = &tipb.TopSQLRecord{ 109 | SqlDigest: record.SqlDigest, 110 | PlanDigest: record.PlanDigest, 111 | } 112 | } 113 | r := m.TopSQLRecords[instance][string(record.SqlDigest)][string(record.PlanDigest)] 114 | r.Items = append(r.Items, record.Items...) 115 | 116 | return nil 117 | } 118 | 119 | func (m *MemStore) ResourceMeteringRecord(instance, _ string, record *rsmetering.ResourceUsageRecord, _ *sync.Map) error { 120 | m.Lock() 121 | defer m.Unlock() 122 | if _, ok := m.ResourceMeteringRecords[instance]; !ok { 123 | m.ResourceMeteringRecords[instance] = make(map[string]*rsmetering.ResourceUsageRecord) 124 | } 125 | if _, ok := m.ResourceMeteringRecords[instance][string(record.GetRecord().ResourceGroupTag)]; !ok { 126 | m.ResourceMeteringRecords[instance][string(record.GetRecord().ResourceGroupTag)] = &rsmetering.ResourceUsageRecord{ 127 | RecordOneof: &rsmetering.ResourceUsageRecord_Record{ 128 | Record: &rsmetering.GroupTagRecord{ 129 | ResourceGroupTag: record.GetRecord().ResourceGroupTag, 130 | }, 131 | }, 132 | } 133 | } 134 | r := m.ResourceMeteringRecords[instance][string(record.GetRecord().ResourceGroupTag)] 135 | r.GetRecord().Items = append(r.GetRecord().Items, record.GetRecord().GetItems()...) 136 | 137 | return nil 138 | } 139 | 140 | func (m *MemStore) SQLMeta(meta *tipb.SQLMeta) error { 141 | m.Lock() 142 | m.SQLMetas[string(meta.SqlDigest)] = struct{ Meta *tipb.SQLMeta }{Meta: meta} 143 | m.Unlock() 144 | 145 | return nil 146 | } 147 | 148 | func (m *MemStore) PlanMeta(meta *tipb.PlanMeta) error { 149 | m.Lock() 150 | m.PlanMetas[string(meta.PlanDigest)] = struct{ Meta *tipb.PlanMeta }{Meta: meta} 151 | m.Unlock() 152 | 153 | return nil 154 | } 155 | 156 | func (m *MemStore) Close() { 157 | } 158 | -------------------------------------------------------------------------------- /component/topsql/mock/pubsub.go: -------------------------------------------------------------------------------- 1 | package mock 2 | 3 | import ( 4 | "context" 5 | "crypto/tls" 6 | "net" 7 | "time" 8 | 9 | rua "github.com/pingcap/kvproto/pkg/resource_usage_agent" 10 | "github.com/pingcap/tipb/go-tipb" 11 | "google.golang.org/grpc" 12 | "google.golang.org/grpc/credentials" 13 | "google.golang.org/grpc/keepalive" 14 | ) 15 | 16 | type tidbStreamAccessor = func(_ tipb.TopSQLPubSub_SubscribeServer) error 17 | type tikvStreamAccessor = func(_ rua.ResourceMeteringPubSub_SubscribeServer) error 18 | 19 | type MockPubSub struct { 20 | ctx context.Context 21 | cancel context.CancelFunc 22 | 23 | listener net.Listener 24 | server *grpc.Server 25 | 26 | tidbAccessor chan tidbStreamAccessor 27 | tikvAccessor chan tikvStreamAccessor 28 | } 29 | 30 | func NewMockPubSub() *MockPubSub { 31 | ctx, cancel := context.WithCancel(context.Background()) 32 | 33 | return &MockPubSub{ 34 | ctx: ctx, 35 | cancel: cancel, 36 | tidbAccessor: make(chan tidbStreamAccessor), 37 | tikvAccessor: make(chan tikvStreamAccessor), 38 | } 39 | } 40 | 41 | func (s *MockPubSub) Listen(addr string, tls *tls.Config) (ip string, port uint, err error) { 42 | s.listener, err = net.Listen("tcp", addr) 43 | if err != nil { 44 | return 45 | } 46 | 47 | var opts []grpc.ServerOption 48 | if tls != nil { 49 | opts = append(opts, grpc.Creds(credentials.NewTLS(tls))) 50 | } 51 | 52 | opts = append(opts, grpc.KeepaliveParams(keepalive.ServerParameters{ 53 | Time: 10 * time.Second, 54 | Timeout: 3 * time.Second, 55 | })) 56 | 57 | s.server = grpc.NewServer(opts...) 
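// Expose both mock services on the same gRPC server: the TiDB Top SQL pub/sub
// and the TiKV resource-metering pub/sub. Incoming Subscribe streams are handed
// to the test through the tidbAccessor/tikvAccessor channels.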
58 | tipb.RegisterTopSQLPubSubServer(s.server, &tidbService{ 59 | ctx: s.ctx, 60 | accessor: s.tidbAccessor, 61 | }) 62 | rua.RegisterResourceMeteringPubSubServer(s.server, &tikvService{ 63 | ctx: s.ctx, 64 | accessor: s.tikvAccessor, 65 | }) 66 | 67 | adr := s.listener.Addr().(*net.TCPAddr) 68 | return adr.IP.String(), uint(adr.Port), nil 69 | } 70 | 71 | func (s *MockPubSub) Serve() error { 72 | return s.server.Serve(s.listener) 73 | } 74 | 75 | func (s *MockPubSub) AccessTiDBStream(fn func(_ tipb.TopSQLPubSub_SubscribeServer) error) { 76 | s.tidbAccessor <- fn 77 | } 78 | 79 | func (s *MockPubSub) AccessTiKVStream(fn func(_ rua.ResourceMeteringPubSub_SubscribeServer) error) { 80 | s.tikvAccessor <- fn 81 | } 82 | 83 | func (s *MockPubSub) Stop() { 84 | s.server.Stop() 85 | s.cancel() 86 | } 87 | 88 | type tidbService struct { 89 | ctx context.Context 90 | accessor chan tidbStreamAccessor 91 | } 92 | 93 | var _ tipb.TopSQLPubSubServer = &tidbService{} 94 | 95 | func (s *tidbService) Subscribe( 96 | _ *tipb.TopSQLSubRequest, 97 | stream tipb.TopSQLPubSub_SubscribeServer, 98 | ) error { 99 | for { 100 | select { 101 | case <-stream.Context().Done(): 102 | return nil 103 | case <-s.ctx.Done(): 104 | return nil 105 | case accessor := <-s.accessor: 106 | if err := accessor(stream); err != nil { 107 | return err 108 | } 109 | } 110 | } 111 | } 112 | 113 | type tikvService struct { 114 | ctx context.Context 115 | accessor chan tikvStreamAccessor 116 | } 117 | 118 | var _ rua.ResourceMeteringPubSubServer = &tikvService{} 119 | 120 | func (s *tikvService) Subscribe( 121 | _ *rua.ResourceMeteringRequest, 122 | stream rua.ResourceMeteringPubSub_SubscribeServer, 123 | ) error { 124 | for { 125 | select { 126 | case <-stream.Context().Done(): 127 | return nil 128 | case <-s.ctx.Done(): 129 | return nil 130 | case accessor := <-s.accessor: 131 | if err := accessor(stream); err != nil { 132 | return err 133 | } 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /component/topsql/query/model.go: -------------------------------------------------------------------------------- 1 | package query 2 | 3 | type RecordKey struct { 4 | SQLDigest string 5 | PlanDigest string 6 | } 7 | 8 | type RecordItem struct { 9 | SQLDigest string `json:"sql_digest"` 10 | SQLText string `json:"sql_text"` 11 | IsOther bool `json:"is_other"` 12 | Plans []RecordPlanItem `json:"plans"` 13 | } 14 | 15 | type RecordPlanItem struct { 16 | PlanDigest string `json:"plan_digest"` 17 | PlanText string `json:"plan_text"` 18 | TimestampSec []uint64 `json:"timestamp_sec"` 19 | CPUTimeMs []uint64 `json:"cpu_time_ms,omitempty"` 20 | ReadRows []uint64 `json:"read_rows,omitempty"` 21 | ReadIndexes []uint64 `json:"read_indexes,omitempty"` 22 | WriteRows []uint64 `json:"write_rows,omitempty"` 23 | WriteIndexes []uint64 `json:"write_indexes,omitempty"` 24 | SQLExecCount []uint64 `json:"sql_exec_count,omitempty"` 25 | SQLDurationSum []uint64 `json:"sql_duration_sum,omitempty"` 26 | SQLDurationCount []uint64 `json:"sql_duration_count,omitempty"` 27 | } 28 | 29 | type SummaryByItem struct { 30 | Text string `json:"text"` 31 | TimestampSec []uint64 `json:"timestamp_sec"` 32 | CPUTimeMs []uint64 `json:"cpu_time_ms,omitempty"` 33 | CPUTimeMsSum uint64 `json:"cpu_time_ms_sum"` 34 | IsOther bool `json:"is_other"` 35 | } 36 | 37 | type SummaryItem struct { 38 | SQLDigest string `json:"sql_digest"` 39 | SQLText string `json:"sql_text"` 40 | IsOther bool `json:"is_other"` 41 | CPUTimeMs uint64 
`json:"cpu_time_ms"` 42 | ExecCountPerSec float64 `json:"exec_count_per_sec"` 43 | DurationPerExecMs float64 `json:"duration_per_exec_ms"` 44 | ScanRecordsPerSec float64 `json:"scan_records_per_sec"` 45 | ScanIndexesPerSec float64 `json:"scan_indexes_per_sec"` 46 | Plans []SummaryPlanItem `json:"plans"` 47 | } 48 | 49 | type SummaryPlanItem struct { 50 | PlanDigest string `json:"plan_digest"` 51 | PlanText string `json:"plan_text"` 52 | TimestampSec []uint64 `json:"timestamp_sec"` 53 | CPUTimeMs []uint64 `json:"cpu_time_ms,omitempty"` 54 | ExecCountPerSec float64 `json:"exec_count_per_sec"` 55 | DurationPerExecMs float64 `json:"duration_per_exec_ms"` 56 | ScanRecordsPerSec float64 `json:"scan_records_per_sec"` 57 | ScanIndexesPerSec float64 `json:"scan_indexes_per_sec"` 58 | } 59 | 60 | type InstanceItem struct { 61 | Instance string `json:"instance"` 62 | InstanceType string `json:"instance_type"` 63 | } 64 | 65 | type recordsMetricResp struct { 66 | Status string `json:"status"` 67 | Data recordsMetricRespData `json:"data"` 68 | } 69 | 70 | type recordsMetricRespData struct { 71 | ResultType string `json:"resultType"` 72 | Results []recordsMetricRespDataResult `json:"result"` 73 | } 74 | 75 | type recordsMetricRespDataResult struct { 76 | Metric recordsMetricRespDataResultMetric `json:"metric"` 77 | Values []recordsMetricRespDataResultValue `json:"values"` 78 | } 79 | 80 | type recordsMetricRespDataResultMetric struct { 81 | Instance string `json:"instance"` 82 | InstanceType string `json:"instance_type"` 83 | SQLDigest string `json:"sql_digest"` 84 | PlanDigest string `json:"plan_digest"` 85 | } 86 | 87 | type recordsMetricRespV2 struct { 88 | Status string `json:"status"` 89 | Data recordsMetricRespDataV2 `json:"data"` 90 | } 91 | 92 | type recordsMetricRespDataV2 struct { 93 | ResultType string `json:"resultType"` 94 | Results []recordsMetricRespDataResultV2 `json:"result"` 95 | } 96 | 97 | type recordsMetricRespDataResultV2 struct { 98 | Metric map[string]interface{} `json:"metric"` 99 | Values []recordsMetricRespDataResultValue `json:"values"` 100 | } 101 | 102 | type recordsMetricRespDataResultValue = []interface{} 103 | 104 | type instancesMetricResp struct { 105 | Status string `json:"status"` 106 | Data instancesMetricRespData `json:"data"` 107 | } 108 | 109 | type instancesMetricRespData struct { 110 | ResultType string `json:"resultType"` 111 | Results []instancesMetricRespDataResult `json:"result"` 112 | } 113 | 114 | type instancesMetricRespDataResult struct { 115 | Metric instancesMetricRespDataResultMetric `json:"metric"` 116 | } 117 | 118 | type instancesMetricRespDataResultMetric struct { 119 | Instance string `json:"instance"` 120 | InstanceType string `json:"instance_type"` 121 | } 122 | 123 | type sumMetricResp struct { 124 | Status string `json:"status"` 125 | Data sumMetricRespData `json:"data"` 126 | } 127 | 128 | type sumMetricRespData struct { 129 | ResultType string `json:"resultType"` 130 | Results []sumMetricRespDataResult `json:"result"` 131 | } 132 | 133 | type sumMetricRespDataResult struct { 134 | Metric sumMetricRespDataResultMetric `json:"metric"` 135 | Value []interface{} `json:"value"` 136 | } 137 | 138 | type sumMetricRespDataResultMetric struct { 139 | SQLDigest string `json:"sql_digest"` 140 | PlanDigest string `json:"plan_digest"` 141 | } 142 | -------------------------------------------------------------------------------- /component/topsql/query/pools.go: -------------------------------------------------------------------------------- 1 | package 
query 2 | 3 | import ( 4 | "sync" 5 | ) 6 | 7 | type sqlGroupSlicePool struct { 8 | p sync.Pool 9 | } 10 | 11 | func (ssp *sqlGroupSlicePool) Get() *[]sqlGroup { 12 | ssv := ssp.p.Get() 13 | if ssv == nil { 14 | return &[]sqlGroup{} 15 | } 16 | return ssv.(*[]sqlGroup) 17 | } 18 | 19 | func (ssp *sqlGroupSlicePool) Put(ssv *[]sqlGroup) { 20 | *ssv = (*ssv)[:0] 21 | ssp.p.Put(ssv) 22 | } 23 | 24 | type sqlDigestMapPool struct { 25 | p sync.Pool 26 | } 27 | 28 | func (smp *sqlDigestMapPool) Get() map[string]sqlGroup { 29 | smv := smp.p.Get() 30 | if smv == nil { 31 | return make(map[string]sqlGroup) 32 | } 33 | return smv.(map[string]sqlGroup) 34 | } 35 | 36 | func (smp *sqlDigestMapPool) Put(smv map[string]sqlGroup) { 37 | for key := range smv { 38 | delete(smv, key) 39 | } 40 | smp.p.Put(smv) 41 | } 42 | 43 | type sumMapPool struct { 44 | p sync.Pool 45 | } 46 | 47 | func (smp *sumMapPool) Get() map[RecordKey]float64 { 48 | smv := smp.p.Get() 49 | if smv == nil { 50 | return make(map[RecordKey]float64) 51 | } 52 | return smv.(map[RecordKey]float64) 53 | } 54 | 55 | func (smp *sumMapPool) Put(smv map[RecordKey]float64) { 56 | for key := range smv { 57 | delete(smv, key) 58 | } 59 | smp.p.Put(smv) 60 | } 61 | -------------------------------------------------------------------------------- /component/topsql/query/query.go: -------------------------------------------------------------------------------- 1 | package query 2 | 3 | type Query interface { 4 | Records(name string, startSecs, endSecs, windowSecs, top int, instance, instanceType string, fill *[]RecordItem) error 5 | Summary(startSecs, endSecs, windowSecs, top int, instance, instanceType string, fill *[]SummaryItem) error 6 | SummaryBy(startSecs, endSecs, windowSecs, top int, instance, instanceType, by string, fill *[]SummaryByItem) error 7 | Instances(startSecs, endSecs int, fill *[]InstanceItem) error 8 | Close() 9 | } 10 | -------------------------------------------------------------------------------- /component/topsql/service/http.go: -------------------------------------------------------------------------------- 1 | package service 2 | 3 | import ( 4 | "errors" 5 | "net/http" 6 | "strconv" 7 | "time" 8 | 9 | "github.com/pingcap/ng-monitoring/component/topsql/query" 10 | "github.com/pingcap/ng-monitoring/component/topsql/store" 11 | 12 | "github.com/gin-gonic/gin" 13 | ) 14 | 15 | var ( 16 | recordsP = recordsPool{} 17 | summaryBySqlP = summarySQLPool{} 18 | summaryByItemP = summaryByItemPool{} 19 | instanceItemsP = InstanceItemsPool{} 20 | 21 | metricNames = []string{ 22 | store.MetricNameCPUTime, 23 | store.MetricNameReadRow, 24 | store.MetricNameReadIndex, 25 | store.MetricNameWriteRow, 26 | store.MetricNameWriteIndex, 27 | store.MetricNameSQLExecCount, 28 | store.MetricNameSQLDurationSum, 29 | store.MetricNameSQLDurationCount, 30 | } 31 | ) 32 | 33 | type Service struct { 34 | query query.Query 35 | } 36 | 37 | func NewService(query query.Query) *Service { 38 | return &Service{query: query} 39 | } 40 | 41 | func (s *Service) HTTPService(g *gin.RouterGroup) { 42 | g.GET("/v1/instances", s.instancesHandler) 43 | for _, name := range metricNames { 44 | g.GET("/v1/"+name, s.metricHandler(name)) 45 | } 46 | g.GET("/v1/summary", s.summaryHandler) 47 | } 48 | 49 | func (s *Service) instancesHandler(c *gin.Context) { 50 | instances := instanceItemsP.Get() 51 | defer instanceItemsP.Put(instances) 52 | 53 | start, end, err := parseStartEnd(c) 54 | if err != nil { 55 | c.JSON(http.StatusBadRequest, gin.H{ 56 | "status": "error", 57 | 
"message": err.Error(), 58 | }) 59 | return 60 | } 61 | 62 | if err = s.query.Instances(start, end, instances); err != nil { 63 | c.JSON(http.StatusServiceUnavailable, gin.H{ 64 | "status": "error", 65 | "message": err.Error(), 66 | }) 67 | return 68 | } 69 | 70 | c.JSON(http.StatusOK, gin.H{ 71 | "status": "ok", 72 | "data": instances, 73 | }) 74 | } 75 | 76 | func (s *Service) metricHandler(name string) gin.HandlerFunc { 77 | return func(c *gin.Context) { 78 | s.queryMetric(c, name) 79 | } 80 | } 81 | 82 | func (s *Service) summaryHandler(c *gin.Context) { 83 | start, end, windowSecs, top, instance, instanceType, groupBy, err := parseAllParams(c) 84 | if err != nil { 85 | c.JSON(http.StatusBadRequest, gin.H{ 86 | "status": "error", 87 | "message": err.Error(), 88 | }) 89 | return 90 | } 91 | switch groupBy { 92 | case query.AggLevelTable: 93 | if instanceType == "tidb" { 94 | c.JSON(http.StatusBadRequest, gin.H{ 95 | "status": "error", 96 | "message": "table summary is not supported for tidb", 97 | }) 98 | return 99 | } 100 | items := summaryByItemP.Get() 101 | defer summaryByItemP.Put(items) 102 | err = s.query.SummaryBy(start, end, windowSecs, top, instance, instanceType, query.AggLevelTable, items) 103 | if err != nil { 104 | c.JSON(http.StatusServiceUnavailable, gin.H{ 105 | "status": "error", 106 | "message": err.Error(), 107 | }) 108 | return 109 | } 110 | c.JSON(http.StatusOK, gin.H{ 111 | "status": "ok", 112 | "data_by": items, 113 | }) 114 | case query.AggLevelDB: 115 | if instanceType == "tidb" { 116 | c.JSON(http.StatusBadRequest, gin.H{ 117 | "status": "error", 118 | "message": "db summary is not supported for tidb", 119 | }) 120 | return 121 | } 122 | items := summaryByItemP.Get() 123 | defer summaryByItemP.Put(items) 124 | err = s.query.SummaryBy(start, end, windowSecs, top, instance, instanceType, query.AggLevelDB, items) 125 | if err != nil { 126 | c.JSON(http.StatusServiceUnavailable, gin.H{ 127 | "status": "error", 128 | "message": err.Error(), 129 | }) 130 | return 131 | } 132 | c.JSON(http.StatusOK, gin.H{ 133 | "status": "ok", 134 | "data_by": items, 135 | }) 136 | default: 137 | items := summaryBySqlP.Get() 138 | defer summaryBySqlP.Put(items) 139 | err = s.query.Summary(start, end, windowSecs, top, instance, instanceType, items) 140 | if err != nil { 141 | c.JSON(http.StatusServiceUnavailable, gin.H{ 142 | "status": "error", 143 | "message": err.Error(), 144 | }) 145 | return 146 | } 147 | c.JSON(http.StatusOK, gin.H{ 148 | "status": "ok", 149 | "data": items, 150 | }) 151 | } 152 | 153 | } 154 | func (s *Service) queryMetric(c *gin.Context, name string) { 155 | start, end, windowSecs, top, instance, instanceType, _, err := parseAllParams(c) 156 | if err != nil { 157 | c.JSON(http.StatusBadRequest, gin.H{ 158 | "status": "error", 159 | "message": err.Error(), 160 | }) 161 | return 162 | } 163 | 164 | items := recordsP.Get() 165 | defer recordsP.Put(items) 166 | err = s.query.Records(name, start, end, windowSecs, top, instance, instanceType, items) 167 | if err != nil { 168 | c.JSON(http.StatusServiceUnavailable, gin.H{ 169 | "status": "error", 170 | "message": err.Error(), 171 | }) 172 | return 173 | } 174 | 175 | c.JSON(http.StatusOK, gin.H{ 176 | "status": "ok", 177 | "data": items, 178 | }) 179 | } 180 | 181 | func parseAllParams(c *gin.Context) (start, end, windowSecs, top int, instance, instanceType string, groupBy string, err error) { 182 | instance = c.Query("instance") 183 | if len(instance) == 0 { 184 | err = errors.New("no instance") 185 | return 186 | } 
187 | 188 | instanceType = c.Query("instance_type") 189 | if len(instanceType) == 0 { 190 | err = errors.New("no instance_type") 191 | return 192 | } 193 | 194 | start, end, err = parseStartEnd(c) 195 | if err != nil { 196 | return 197 | } 198 | 199 | defaultTop := "-1" 200 | defaultWindow := "1m" 201 | raw := c.DefaultQuery("top", "-1") 202 | if len(raw) == 0 { 203 | raw = defaultTop 204 | } 205 | topInt64, err1 := strconv.ParseInt(raw, 10, 64) 206 | if err1 != nil { 207 | err = err1 208 | return 209 | } 210 | top = int(topInt64) 211 | 212 | raw = c.DefaultQuery("window", "1m") 213 | if len(raw) == 0 { 214 | raw = defaultWindow 215 | } 216 | duration, err1 := time.ParseDuration(raw) 217 | if err1 != nil { 218 | err = err1 219 | return 220 | } 221 | windowSecs = int(duration.Seconds()) 222 | 223 | groupBy = c.Query("group_by") 224 | return 225 | } 226 | 227 | func parseStartEnd(c *gin.Context) (start, end int, err error) { 228 | now := time.Now().Unix() 229 | 230 | var startSecs float64 231 | var endSecs float64 232 | 233 | const weekSecs = 7 * 24 * 60 * 60 234 | defaultStart := strconv.Itoa(int(now - 2*weekSecs)) 235 | defaultEnd := strconv.Itoa(int(now)) 236 | 237 | raw := c.DefaultQuery("start", defaultStart) 238 | if len(raw) == 0 { 239 | raw = defaultStart 240 | } 241 | startSecs, err = strconv.ParseFloat(raw, 64) 242 | if err != nil { 243 | return 244 | } 245 | 246 | raw = c.DefaultQuery("end", strconv.Itoa(int(now))) 247 | if len(raw) == 0 { 248 | raw = defaultEnd 249 | } 250 | endSecs, err = strconv.ParseFloat(raw, 64) 251 | if err != nil { 252 | return 253 | } 254 | 255 | return int(startSecs), int(endSecs), nil 256 | } 257 | -------------------------------------------------------------------------------- /component/topsql/service/pools.go: -------------------------------------------------------------------------------- 1 | package service 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/pingcap/ng-monitoring/component/topsql/query" 7 | 8 | "github.com/pingcap/kvproto/pkg/resource_usage_agent" 9 | "github.com/pingcap/tipb/go-tipb" 10 | ) 11 | 12 | type SQLMetaSlicePool struct { 13 | p sync.Pool 14 | } 15 | 16 | func (ssp *SQLMetaSlicePool) Get() *[]*tipb.SQLMeta { 17 | ssv := ssp.p.Get() 18 | if ssv == nil { 19 | return &[]*tipb.SQLMeta{} 20 | } 21 | return ssv.(*[]*tipb.SQLMeta) 22 | } 23 | 24 | func (ssp *SQLMetaSlicePool) Put(ss *[]*tipb.SQLMeta) { 25 | *ss = (*ss)[:0] 26 | ssp.p.Put(ss) 27 | } 28 | 29 | type PlanMetaSlicePool struct { 30 | p sync.Pool 31 | } 32 | 33 | func (psp *PlanMetaSlicePool) Get() *[]*tipb.PlanMeta { 34 | ps := psp.p.Get() 35 | if ps == nil { 36 | return &[]*tipb.PlanMeta{} 37 | } 38 | return ps.(*[]*tipb.PlanMeta) 39 | } 40 | 41 | func (psp *PlanMetaSlicePool) Put(ps *[]*tipb.PlanMeta) { 42 | *ps = (*ps)[:0] 43 | psp.p.Put(ps) 44 | } 45 | 46 | type ResourceCPUTimeSlicePool struct { 47 | p sync.Pool 48 | } 49 | 50 | func (rsp *ResourceCPUTimeSlicePool) Get() *[]*resource_usage_agent.ResourceUsageRecord { 51 | rs := rsp.p.Get() 52 | if rs == nil { 53 | return &[]*resource_usage_agent.ResourceUsageRecord{} 54 | } 55 | return rs.(*[]*resource_usage_agent.ResourceUsageRecord) 56 | } 57 | 58 | func (rsp *ResourceCPUTimeSlicePool) Put(rs *[]*resource_usage_agent.ResourceUsageRecord) { 59 | *rs = (*rs)[:0] 60 | rsp.p.Put(rs) 61 | } 62 | 63 | type recordsPool struct { 64 | p sync.Pool 65 | } 66 | 67 | func (tip *recordsPool) Get() *[]query.RecordItem { 68 | tiv := tip.p.Get() 69 | if tiv == nil { 70 | return &[]query.RecordItem{} 71 | } 72 | return 
tiv.(*[]query.RecordItem) 73 | } 74 | 75 | func (tip *recordsPool) Put(ti *[]query.RecordItem) { 76 | *ti = (*ti)[:0] 77 | tip.p.Put(ti) 78 | } 79 | 80 | type summarySQLPool struct { 81 | p sync.Pool 82 | } 83 | 84 | func (sp *summarySQLPool) Get() *[]query.SummaryItem { 85 | sv := sp.p.Get() 86 | if sv == nil { 87 | return &[]query.SummaryItem{} 88 | } 89 | return sv.(*[]query.SummaryItem) 90 | } 91 | 92 | func (sp *summarySQLPool) Put(s *[]query.SummaryItem) { 93 | *s = (*s)[:0] 94 | sp.p.Put(s) 95 | } 96 | 97 | type summaryByItemPool struct { 98 | p sync.Pool 99 | } 100 | 101 | func (tp *summaryByItemPool) Get() *[]query.SummaryByItem { 102 | tv := tp.p.Get() 103 | if tv == nil { 104 | return &[]query.SummaryByItem{} 105 | } 106 | return tv.(*[]query.SummaryByItem) 107 | } 108 | 109 | func (tp *summaryByItemPool) Put(t *[]query.SummaryByItem) { 110 | *t = (*t)[:0] 111 | tp.p.Put(t) 112 | } 113 | 114 | type InstanceItemsPool struct { 115 | p sync.Pool 116 | } 117 | 118 | func (iip *InstanceItemsPool) Get() *[]query.InstanceItem { 119 | iiv := iip.p.Get() 120 | if iiv == nil { 121 | return &[]query.InstanceItem{} 122 | } 123 | return iiv.(*[]query.InstanceItem) 124 | } 125 | 126 | func (iip *InstanceItemsPool) Put(iiv *[]query.InstanceItem) { 127 | *iiv = (*iiv)[:0] 128 | iip.p.Put(iiv) 129 | } 130 | -------------------------------------------------------------------------------- /component/topsql/store/model.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | const ( 4 | MetricNameInstance = "instance" 5 | MetricNameCPUTime = "cpu_time" 6 | MetricNameReadRow = "read_row" 7 | MetricNameReadIndex = "read_index" 8 | MetricNameWriteRow = "write_row" 9 | MetricNameWriteIndex = "write_index" 10 | MetricNameSQLExecCount = "sql_exec_count" 11 | MetricNameSQLDurationSum = "sql_duration_sum" 12 | MetricNameSQLDurationCount = "sql_duration_count" 13 | ) 14 | 15 | type Metric struct { 16 | Metric interface{} `json:"metric"` 17 | TimestampMs []uint64 `json:"timestamps"` 18 | Values []uint64 `json:"values"` 19 | } 20 | 21 | type InstanceItem struct { 22 | Instance string `json:"instance"` 23 | InstanceType string `json:"instance_type"` 24 | TimestampSec uint64 `json:"timestamp"` 25 | } 26 | 27 | type recordTags struct { 28 | Name string `json:"__name__"` 29 | Instance string `json:"instance"` 30 | InstanceType string `json:"instance_type"` 31 | SQLDigest string `json:"sql_digest"` 32 | PlanDigest string `json:"plan_digest"` 33 | DB string `json:"db"` 34 | Table string `json:"table"` 35 | } 36 | 37 | type instanceTags struct { 38 | Name string `json:"__name__"` 39 | Instance string `json:"instance"` 40 | InstanceType string `json:"instance_type"` 41 | } 42 | -------------------------------------------------------------------------------- /component/topsql/store/pools.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "strings" 5 | "sync" 6 | ) 7 | 8 | type StringBuilderPool struct { 9 | p sync.Pool 10 | } 11 | 12 | func (sbp *StringBuilderPool) Get() *strings.Builder { 13 | sbv := sbp.p.Get() 14 | if sbv == nil { 15 | return &strings.Builder{} 16 | } 17 | return sbv.(*strings.Builder) 18 | } 19 | 20 | func (sbp *StringBuilderPool) Put(sb *strings.Builder) { 21 | sb.Reset() 22 | sbp.p.Put(sb) 23 | } 24 | 25 | type PrepareSlicePool struct { 26 | p sync.Pool 27 | } 28 | 29 | func (psp *PrepareSlicePool) Get() *[]interface{} { 30 | psv := psp.p.Get() 31 | if psv == nil { 32 | 
return &[]interface{}{} 33 | } 34 | return psv.(*[]interface{}) 35 | } 36 | 37 | func (psp *PrepareSlicePool) Put(ps *[]interface{}) { 38 | *ps = (*ps)[:0] 39 | psp.p.Put(ps) 40 | } 41 | -------------------------------------------------------------------------------- /component/topsql/store/store.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "sync" 5 | 6 | rsmetering "github.com/pingcap/kvproto/pkg/resource_usage_agent" 7 | "github.com/pingcap/tipb/go-tipb" 8 | ) 9 | 10 | type Store interface { 11 | Instances(items []InstanceItem) error 12 | TopSQLRecord(instance, instanceType string, record *tipb.TopSQLRecord) error 13 | ResourceMeteringRecord(instance, instanceType string, record *rsmetering.ResourceUsageRecord, schemaInfo *sync.Map) error 14 | SQLMeta(meta *tipb.SQLMeta) error 15 | PlanMeta(meta *tipb.PlanMeta) error 16 | Close() 17 | } 18 | -------------------------------------------------------------------------------- /component/topsql/subscriber/main_test.go: -------------------------------------------------------------------------------- 1 | package subscriber_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "go.uber.org/goleak" 7 | ) 8 | 9 | func TestMain(m *testing.M) { 10 | opts := []goleak.Option{ 11 | goleak.IgnoreTopFunction("github.com/golang/glog.(*loggingT).flushDaemon"), 12 | goleak.IgnoreTopFunction("github.com/golang/glog.(*fileSink).flushDaemon"), 13 | goleak.IgnoreTopFunction("github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime.init.0.func1"), 14 | } 15 | 16 | goleak.VerifyTestMain(m, opts...) 17 | } 18 | -------------------------------------------------------------------------------- /component/topsql/subscriber/pools.go: -------------------------------------------------------------------------------- 1 | package subscriber 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/pingcap/ng-monitoring/component/topsql/store" 7 | ) 8 | 9 | type instancesItemSlicePool struct { 10 | p sync.Pool 11 | } 12 | 13 | func (isp *instancesItemSlicePool) Get() *[]store.InstanceItem { 14 | is := isp.p.Get() 15 | if is == nil { 16 | return &[]store.InstanceItem{} 17 | } 18 | return is.(*[]store.InstanceItem) 19 | } 20 | 21 | func (isp *instancesItemSlicePool) Put(is *[]store.InstanceItem) { 22 | *is = (*is)[:0] 23 | isp.p.Put(is) 24 | } 25 | -------------------------------------------------------------------------------- /component/topsql/subscriber/scraper.go: -------------------------------------------------------------------------------- 1 | package subscriber 2 | 3 | import ( 4 | "context" 5 | "crypto/tls" 6 | "fmt" 7 | "sync" 8 | "time" 9 | 10 | "github.com/pingcap/ng-monitoring/component/subscriber" 11 | "github.com/pingcap/ng-monitoring/component/topology" 12 | "github.com/pingcap/ng-monitoring/component/topsql/store" 13 | "github.com/pingcap/ng-monitoring/utils" 14 | 15 | "github.com/pingcap/kvproto/pkg/resource_usage_agent" 16 | "github.com/pingcap/log" 17 | "github.com/pingcap/tipb/go-tipb" 18 | "go.uber.org/zap" 19 | "google.golang.org/grpc" 20 | "google.golang.org/grpc/backoff" 21 | "google.golang.org/grpc/credentials" 22 | "google.golang.org/grpc/keepalive" 23 | ) 24 | 25 | var ( 26 | dialTimeout = 5 * time.Second 27 | ) 28 | 29 | type Scraper struct { 30 | ctx context.Context 31 | cancel context.CancelFunc 32 | tlsConfig *tls.Config 33 | component topology.Component 34 | store store.Store 35 | // schemaInfo is used to store the schema information. 
tableID -> schema 36 | schemaInfo *sync.Map 37 | } 38 | 39 | func NewScraper(ctx context.Context, schemaInfo *sync.Map, component topology.Component, store store.Store, tlsConfig *tls.Config) *Scraper { 40 | switch component.Name { 41 | case topology.ComponentTiDB, topology.ComponentTiKV: 42 | ctx, cancel := context.WithCancel(ctx) 43 | return &Scraper{ 44 | ctx: ctx, 45 | cancel: cancel, 46 | tlsConfig: tlsConfig, 47 | component: component, 48 | store: store, 49 | schemaInfo: schemaInfo, 50 | } 51 | default: 52 | return nil 53 | } 54 | } 55 | 56 | var _ subscriber.Scraper = &Scraper{} 57 | 58 | func (s *Scraper) IsDown() bool { 59 | select { 60 | case <-s.ctx.Done(): 61 | return true 62 | default: 63 | return false 64 | } 65 | } 66 | 67 | func (s *Scraper) Close() { 68 | s.cancel() 69 | } 70 | 71 | func (s *Scraper) Run() { 72 | log.Info("starting to scrape Top SQL from the component", zap.Any("component", s.component)) 73 | defer func() { 74 | s.cancel() 75 | log.Info("stop scraping Top SQL from the component", zap.Any("component", s.component)) 76 | }() 77 | 78 | switch s.component.Name { 79 | case topology.ComponentTiDB: 80 | s.scrapeTiDB() 81 | case topology.ComponentTiKV: 82 | s.scrapeTiKV() 83 | default: 84 | log.Error("unexpected scrape target", zap.String("component", s.component.Name)) 85 | } 86 | } 87 | 88 | func (s *Scraper) scrapeTiDB() { 89 | addr := fmt.Sprintf("%s:%d", s.component.IP, s.component.StatusPort) 90 | bo := newBackoffScrape(s.ctx, s.tlsConfig, addr, s.component) 91 | defer bo.close() 92 | 93 | for { 94 | record := bo.scrapeTiDBRecord() 95 | if record == nil { 96 | return 97 | } 98 | 99 | if r := record.GetRecord(); r != nil { 100 | err := s.store.TopSQLRecord(addr, topology.ComponentTiDB, r) 101 | if err != nil { 102 | log.Warn("failed to store Top SQL records", zap.Error(err)) 103 | } 104 | continue 105 | } 106 | 107 | if meta := record.GetSqlMeta(); meta != nil { 108 | err := s.store.SQLMeta(meta) 109 | if err != nil { 110 | log.Warn("failed to store SQL meta", zap.Error(err)) 111 | } 112 | continue 113 | } 114 | 115 | if meta := record.GetPlanMeta(); meta != nil { 116 | err := s.store.PlanMeta(meta) 117 | if err != nil { 118 | log.Warn("failed to store plan meta", zap.Error(err)) 119 | } 120 | } 121 | } 122 | } 123 | 124 | func (s *Scraper) scrapeTiKV() { 125 | addr := fmt.Sprintf("%s:%d", s.component.IP, s.component.Port) 126 | bo := newBackoffScrape(s.ctx, s.tlsConfig, addr, s.component) 127 | defer bo.close() 128 | 129 | for { 130 | record := bo.scrapeTiKVRecord() 131 | if record == nil { 132 | return 133 | } 134 | 135 | err := s.store.ResourceMeteringRecord(addr, topology.ComponentTiKV, record, s.schemaInfo) 136 | if err != nil { 137 | log.Warn("failed to store resource metering records", zap.Error(err)) 138 | } 139 | } 140 | 141 | } 142 | 143 | func dial(ctx context.Context, tlsConfig *tls.Config, addr string) (*grpc.ClientConn, error) { 144 | var tlsOption grpc.DialOption 145 | if tlsConfig == nil { 146 | tlsOption = grpc.WithInsecure() 147 | } else { 148 | tlsOption = grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)) 149 | } 150 | 151 | dialCtx, cancel := context.WithTimeout(ctx, dialTimeout) 152 | defer cancel() 153 | 154 | return grpc.DialContext( 155 | dialCtx, 156 | addr, 157 | tlsOption, 158 | grpc.WithBlock(), 159 | grpc.WithKeepaliveParams(keepalive.ClientParameters{ 160 | Time: 10 * time.Second, 161 | Timeout: 3 * time.Second, 162 | }), 163 | grpc.WithConnectParams(grpc.ConnectParams{ 164 | Backoff: backoff.Config{ 165 | BaseDelay:
100 * time.Millisecond, // Default was 1s. 166 | Multiplier: 1.6, // Default 167 | Jitter: 0.2, // Default 168 | MaxDelay: 3 * time.Second, // Default was 120s. 169 | }, 170 | }), 171 | ) 172 | } 173 | 174 | type backoffScrape struct { 175 | ctx context.Context 176 | tlsCfg *tls.Config 177 | address string 178 | component topology.Component 179 | 180 | conn *grpc.ClientConn 181 | client interface{} 182 | stream interface{} 183 | 184 | firstWaitTime time.Duration 185 | maxRetryTimes uint 186 | } 187 | 188 | func newBackoffScrape(ctx context.Context, tlsCfg *tls.Config, address string, component topology.Component) *backoffScrape { 189 | return &backoffScrape{ 190 | ctx: ctx, 191 | tlsCfg: tlsCfg, 192 | address: address, 193 | component: component, 194 | 195 | firstWaitTime: 2 * time.Second, 196 | maxRetryTimes: 8, 197 | } 198 | } 199 | 200 | func (bo *backoffScrape) scrapeTiDBRecord() *tipb.TopSQLSubResponse { 201 | if record := bo.scrape(); record != nil { 202 | if res, ok := record.(*tipb.TopSQLSubResponse); ok { 203 | return res 204 | } 205 | } 206 | 207 | return nil 208 | } 209 | 210 | func (bo *backoffScrape) scrapeTiKVRecord() *resource_usage_agent.ResourceUsageRecord { 211 | if record := bo.scrape(); record != nil { 212 | if res, ok := record.(*resource_usage_agent.ResourceUsageRecord); ok { 213 | return res 214 | } 215 | } 216 | 217 | return nil 218 | } 219 | 220 | func (bo *backoffScrape) scrape() interface{} { 221 | if bo.stream != nil { 222 | switch s := bo.stream.(type) { 223 | case tipb.TopSQLPubSub_SubscribeClient: 224 | if record, _ := s.Recv(); record != nil { 225 | return record 226 | } 227 | case resource_usage_agent.ResourceMeteringPubSub_SubscribeClient: 228 | if record, _ := s.Recv(); record != nil { 229 | return record 230 | } 231 | } 232 | } 233 | 234 | return bo.backoffScrape() 235 | } 236 | 237 | func (bo *backoffScrape) backoffScrape() (record interface{}) { 238 | utils.WithRetryBackoff(bo.ctx, bo.maxRetryTimes, bo.firstWaitTime, func(retried uint) bool { 239 | if retried != 0 { 240 | log.Warn("retry to scrape component", zap.Any("component", bo.component), zap.Uint("retried", retried)) 241 | } 242 | 243 | if bo.conn != nil { 244 | _ = bo.conn.Close() 245 | bo.conn = nil 246 | bo.client = nil 247 | bo.stream = nil 248 | } 249 | 250 | conn, err := dial(bo.ctx, bo.tlsCfg, bo.address) 251 | if err != nil { 252 | log.Warn("failed to dial scrape target", zap.Any("component", bo.component), zap.Error(err)) 253 | return false 254 | } 255 | 256 | bo.conn = conn 257 | switch bo.component.Name { 258 | case topology.ComponentTiDB: 259 | client := tipb.NewTopSQLPubSubClient(conn) 260 | bo.client = client 261 | stream, err := client.Subscribe(bo.ctx, &tipb.TopSQLSubRequest{}) 262 | if err != nil { 263 | log.Warn("failed to call Subscribe", zap.Any("component", bo.component), zap.Error(err)) 264 | return false 265 | } 266 | bo.stream = stream 267 | record, err = stream.Recv() 268 | if err != nil { 269 | log.Warn("failed to call Subscribe", zap.Any("component", bo.component), zap.Error(err)) 270 | return false 271 | } 272 | 273 | return true 274 | 275 | case topology.ComponentTiKV: 276 | client := resource_usage_agent.NewResourceMeteringPubSubClient(conn) 277 | bo.client = client 278 | stream, err := client.Subscribe(bo.ctx, &resource_usage_agent.ResourceMeteringRequest{}) 279 | if err != nil { 280 | log.Warn("failed to call Subscribe", zap.Any("component", bo.component), zap.Error(err)) 281 | return false 282 | } 283 | bo.stream = stream 284 | record, err = stream.Recv() 285 | 
if err != nil { 286 | log.Warn("failed to call Subscribe", zap.Any("component", bo.component), zap.Error(err)) 287 | return false 288 | } 289 | 290 | return true 291 | default: 292 | return true 293 | } 294 | }) 295 | 296 | return 297 | } 298 | 299 | func (bo *backoffScrape) close() { 300 | if bo.conn != nil { 301 | _ = bo.conn.Close() 302 | bo.conn = nil 303 | bo.client = nil 304 | bo.stream = nil 305 | } 306 | } 307 | -------------------------------------------------------------------------------- /component/topsql/subscriber/scraper_test.go: -------------------------------------------------------------------------------- 1 | package subscriber_test 2 | 3 | import ( 4 | "context" 5 | "crypto/tls" 6 | "fmt" 7 | "math/rand" 8 | "testing" 9 | "time" 10 | 11 | "github.com/pingcap/ng-monitoring/component/topology" 12 | "github.com/pingcap/ng-monitoring/component/topsql/mock" 13 | "github.com/pingcap/ng-monitoring/component/topsql/subscriber" 14 | "github.com/pingcap/ng-monitoring/utils/testutil" 15 | 16 | rua "github.com/pingcap/kvproto/pkg/resource_usage_agent" 17 | "github.com/pingcap/tipb/go-tipb" 18 | "github.com/stretchr/testify/require" 19 | ) 20 | 21 | func TestScraperTiDBBasic(t *testing.T) { 22 | t.Parallel() 23 | 24 | // insecure 25 | testScraperTiDBBasic(t, nil, nil) 26 | 27 | // tls 28 | serverTLS, clientTLS, err := testutil.SetupCert() 29 | require.NoError(t, err) 30 | testScraperTiDBBasic(t, serverTLS, clientTLS) 31 | } 32 | 33 | func TestScraperTiKVBasic(t *testing.T) { 34 | t.Parallel() 35 | 36 | // insecure 37 | testScraperTiKVBasic(t, nil, nil) 38 | 39 | // tls 40 | serverTLS, clientTLS, err := testutil.SetupCert() 41 | require.NoError(t, err) 42 | testScraperTiKVBasic(t, serverTLS, clientTLS) 43 | } 44 | 45 | func testScraperTiDBBasic(t *testing.T, serverTLS *tls.Config, clientTLS *tls.Config) { 46 | store := mock.NewMemStore() 47 | defer store.Close() 48 | 49 | pubsub := mock.NewMockPubSub() 50 | ip, port, err := pubsub.Listen("127.0.0.1:0", serverTLS) 51 | require.NoError(t, err) 52 | go pubsub.Serve() 53 | defer pubsub.Stop() 54 | 55 | component := topology.Component{ 56 | Name: topology.ComponentTiDB, 57 | IP: ip, 58 | StatusPort: port, 59 | } 60 | scraper := subscriber.NewScraper(context.Background(), nil, component, store, clientTLS) 61 | go scraper.Run() 62 | defer scraper.Close() 63 | 64 | checkTiDBScrape(t, fmt.Sprintf("%s:%d", ip, port), pubsub, store) 65 | } 66 | 67 | func testScraperTiKVBasic(t *testing.T, serverTLS *tls.Config, clientTLS *tls.Config) { 68 | store := mock.NewMemStore() 69 | defer store.Close() 70 | 71 | pubsub := mock.NewMockPubSub() 72 | ip, port, err := pubsub.Listen("127.0.0.1:0", serverTLS) 73 | require.NoError(t, err) 74 | go pubsub.Serve() 75 | defer pubsub.Stop() 76 | 77 | component := topology.Component{ 78 | Name: topology.ComponentTiKV, 79 | IP: ip, 80 | Port: port, 81 | } 82 | scraper := subscriber.NewScraper(context.Background(), nil, component, store, clientTLS) 83 | go scraper.Run() 84 | defer scraper.Close() 85 | 86 | checkTiKVScrape(t, fmt.Sprintf("%s:%d", ip, port), pubsub, store) 87 | } 88 | 89 | func TestScraperCloseFirst(t *testing.T) { 90 | t.Parallel() 91 | 92 | store := mock.NewMemStore() 93 | defer store.Close() 94 | 95 | pubsub := mock.NewMockPubSub() 96 | ip, port, err := pubsub.Listen("127.0.0.1:0", nil) 97 | require.NoError(t, err) 98 | go pubsub.Serve() 99 | defer pubsub.Stop() 100 | 101 | component := topology.Component{ 102 | Name: topology.ComponentTiDB, 103 | IP: ip, 104 | Port: port, 105 | } 106 | scraper := 
subscriber.NewScraper(context.Background(), nil, component, store, nil) 107 | scraper.Close() 108 | scraper.Run() 109 | } 110 | 111 | func TestScraperOtherComponent(t *testing.T) { 112 | t.Parallel() 113 | 114 | store := mock.NewMemStore() 115 | defer store.Close() 116 | 117 | component := topology.Component{ 118 | Name: topology.ComponentPD, 119 | } 120 | scraper := subscriber.NewScraper(context.Background(), nil, component, store, nil) 121 | require.Nil(t, scraper) 122 | } 123 | 124 | func TestScraperTiDBRestart(t *testing.T) { 125 | t.Parallel() 126 | 127 | store := mock.NewMemStore() 128 | defer store.Close() 129 | 130 | pubsub := mock.NewMockPubSub() 131 | ip, port, err := pubsub.Listen("127.0.0.1:0", nil) 132 | require.NoError(t, err) 133 | go pubsub.Serve() 134 | 135 | component := topology.Component{ 136 | Name: topology.ComponentTiDB, 137 | IP: ip, 138 | StatusPort: port, 139 | } 140 | scraper := subscriber.NewScraper(context.Background(), nil, component, store, nil) 141 | go scraper.Run() 142 | defer scraper.Close() 143 | 144 | addr := fmt.Sprintf("%s:%d", ip, port) 145 | checkTiDBScrape(t, addr, pubsub, store) 146 | 147 | pubsub.Stop() 148 | time.Sleep(5 * time.Second) 149 | 150 | pubsub = mock.NewMockPubSub() 151 | _, _, err = pubsub.Listen(addr, nil) 152 | require.NoError(t, err) 153 | go pubsub.Serve() 154 | defer pubsub.Stop() 155 | checkTiDBScrape(t, addr, pubsub, store) 156 | } 157 | 158 | func TestScraperTiKVRestart(t *testing.T) { 159 | t.Parallel() 160 | 161 | store := mock.NewMemStore() 162 | defer store.Close() 163 | 164 | pubsub := mock.NewMockPubSub() 165 | ip, port, err := pubsub.Listen("127.0.0.1:0", nil) 166 | require.NoError(t, err) 167 | go pubsub.Serve() 168 | 169 | component := topology.Component{ 170 | Name: topology.ComponentTiKV, 171 | IP: ip, 172 | Port: port, 173 | } 174 | scraper := subscriber.NewScraper(context.Background(), nil, component, store, nil) 175 | go scraper.Run() 176 | defer scraper.Close() 177 | 178 | addr := fmt.Sprintf("%s:%d", ip, port) 179 | checkTiKVScrape(t, addr, pubsub, store) 180 | 181 | pubsub.Stop() 182 | time.Sleep(5 * time.Second) 183 | 184 | pubsub = mock.NewMockPubSub() 185 | _, _, err = pubsub.Listen(addr, nil) 186 | require.NoError(t, err) 187 | go pubsub.Serve() 188 | defer pubsub.Stop() 189 | checkTiKVScrape(t, addr, pubsub, store) 190 | } 191 | 192 | func checkTiDBScrape(t *testing.T, addr string, pubsub *mock.MockPubSub, store *mock.MemStore) { 193 | rand.Seed(time.Now().Unix()) 194 | tsSec := rand.Uint64() 195 | cpuTimeMs := rand.Uint32() 196 | meta := rand.Int() 197 | sqlDigest := fmt.Sprintf("mock_sql_digest_%d", meta) 198 | sqlText := fmt.Sprintf("mock_normalized_sql_%d", meta) 199 | planDigest := fmt.Sprintf("mock_plan_digest_%d", meta) 200 | planText := fmt.Sprintf("mock__normalized_plan_%d", meta) 201 | 202 | pubsub.AccessTiDBStream(func(stream tipb.TopSQLPubSub_SubscribeServer) error { 203 | require.NoError(t, stream.Send(&tipb.TopSQLSubResponse{RespOneof: &tipb.TopSQLSubResponse_Record{ 204 | Record: &tipb.TopSQLRecord{ 205 | SqlDigest: []byte(sqlDigest), 206 | PlanDigest: []byte(planDigest), 207 | Items: []*tipb.TopSQLRecordItem{{TimestampSec: tsSec, CpuTimeMs: cpuTimeMs}}, 208 | }, 209 | }})) 210 | 211 | require.NoError(t, stream.Send(&tipb.TopSQLSubResponse{RespOneof: &tipb.TopSQLSubResponse_SqlMeta{ 212 | SqlMeta: &tipb.SQLMeta{ 213 | SqlDigest: []byte(sqlDigest), 214 | NormalizedSql: sqlText, 215 | }, 216 | }})) 217 | 218 | require.NoError(t, stream.Send(&tipb.TopSQLSubResponse{RespOneof: 
&tipb.TopSQLSubResponse_PlanMeta{ 219 | PlanMeta: &tipb.PlanMeta{ 220 | PlanDigest: []byte(planDigest), 221 | NormalizedPlan: planText, 222 | }, 223 | }})) 224 | return nil 225 | }) 226 | 227 | require.True(t, store.Pred(func(store *mock.MemStore) bool { 228 | if _, ok := store.TopSQLRecords[addr]; !ok { 229 | return false 230 | } 231 | if _, ok := store.TopSQLRecords[addr][sqlDigest]; !ok { 232 | return false 233 | } 234 | if _, ok := store.TopSQLRecords[addr][sqlDigest][planDigest]; !ok { 235 | return false 236 | } 237 | if _, ok := store.SQLMetas[sqlDigest]; !ok { 238 | return false 239 | } 240 | if _, ok := store.PlanMetas[planDigest]; !ok { 241 | return false 242 | } 243 | 244 | require.Equal(t, store.SQLMetas[sqlDigest].Meta.NormalizedSql, sqlText) 245 | require.Equal(t, store.PlanMetas[planDigest].Meta.NormalizedPlan, planText) 246 | record := store.TopSQLRecords[addr][sqlDigest][planDigest] 247 | got := false 248 | for _, i := range record.Items { 249 | if i.TimestampSec == tsSec { 250 | got = true 251 | require.Equal(t, cpuTimeMs, i.CpuTimeMs) 252 | } 253 | } 254 | require.True(t, got) 255 | return true 256 | }, 10*time.Millisecond, 1*time.Second)) 257 | } 258 | 259 | func checkTiKVScrape(t *testing.T, addr string, pubsub *mock.MockPubSub, store *mock.MemStore) { 260 | rand.Seed(time.Now().Unix()) 261 | tsSec := rand.Uint64() 262 | cpuMs := rand.Uint32() 263 | rdKeys := rand.Uint32() 264 | wtKeys := rand.Uint32() 265 | tag := fmt.Sprintf("mock_resource_group_tag_%d", rand.Int()) 266 | 267 | pubsub.AccessTiKVStream(func(stream rua.ResourceMeteringPubSub_SubscribeServer) error { 268 | return stream.Send(&rua.ResourceUsageRecord{ 269 | RecordOneof: &rua.ResourceUsageRecord_Record{ 270 | Record: &rua.GroupTagRecord{ 271 | ResourceGroupTag: []byte(tag), 272 | Items: []*rua.GroupTagRecordItem{{ 273 | TimestampSec: tsSec, 274 | CpuTimeMs: cpuMs, 275 | ReadKeys: rdKeys, 276 | WriteKeys: wtKeys, 277 | }}, 278 | }, 279 | }, 280 | }) 281 | }) 282 | 283 | require.True(t, store.Pred(func(store *mock.MemStore) bool { 284 | if _, ok := store.ResourceMeteringRecords[addr]; !ok { 285 | return false 286 | } 287 | if _, ok := store.ResourceMeteringRecords[addr][tag]; !ok { 288 | return false 289 | } 290 | 291 | record := store.ResourceMeteringRecords[addr][tag] 292 | got := false 293 | for _, item := range record.GetRecord().GetItems() { 294 | if item.TimestampSec == tsSec { 295 | got = true 296 | require.Equal(t, item.CpuTimeMs, cpuMs) 297 | require.Equal(t, item.ReadKeys, rdKeys) 298 | require.Equal(t, item.WriteKeys, wtKeys) 299 | } 300 | } 301 | require.True(t, got) 302 | return true 303 | }, 10*time.Millisecond, 1*time.Second)) 304 | } 305 | -------------------------------------------------------------------------------- /component/topsql/subscriber/subscriber.go: -------------------------------------------------------------------------------- 1 | package subscriber 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net" 7 | "net/http" 8 | "runtime" 9 | "sync" 10 | "time" 11 | 12 | "github.com/pingcap/ng-monitoring/component/domain" 13 | "github.com/pingcap/ng-monitoring/component/subscriber" 14 | "github.com/pingcap/ng-monitoring/component/topology" 15 | "github.com/pingcap/ng-monitoring/component/topsql/store" 16 | "github.com/pingcap/ng-monitoring/config" 17 | "github.com/pingcap/ng-monitoring/config/pdvariable" 18 | 19 | "github.com/pingcap/log" 20 | "go.uber.org/zap" 21 | ) 22 | 23 | var ( 24 | instancesItemSliceP = &instancesItemSlicePool{} 25 | ) 26 | 27 | func NewSubscriber( 28 | 
topoSubscriber topology.Subscriber, 29 | varSubscriber pdvariable.Subscriber, 30 | cfgSubscriber config.Subscriber, 31 | domain *domain.Domain, 32 | store store.Store, 33 | ) *subscriber.Subscriber { 34 | controller := NewSubscriberController(store) 35 | return subscriber.NewSubscriber( 36 | domain, 37 | topoSubscriber, 38 | varSubscriber, 39 | cfgSubscriber, 40 | controller, 41 | ) 42 | } 43 | 44 | type SubscriberController struct { 45 | store store.Store 46 | 47 | config *config.Config 48 | variable *pdvariable.PDVariable 49 | components []topology.Component 50 | } 51 | 52 | func NewSubscriberController(store store.Store) *SubscriberController { 53 | cfg := config.GetDefaultConfig() 54 | variable := pdvariable.DefaultPDVariable() 55 | return &SubscriberController{ 56 | store: store, 57 | config: &cfg, 58 | variable: variable, 59 | } 60 | } 61 | 62 | var _ subscriber.SubscribeController = &SubscriberController{} 63 | 64 | func (sc *SubscriberController) NewScraper(ctx context.Context, component topology.Component, schemaInfo *sync.Map) subscriber.Scraper { 65 | return NewScraper(ctx, schemaInfo, component, sc.store, sc.config.Security.GetTLSConfig()) 66 | } 67 | 68 | func (sc *SubscriberController) NewHTTPClient() *http.Client { 69 | dialer := &net.Dialer{ 70 | Timeout: 10 * time.Second, 71 | } 72 | tr := &http.Transport{ 73 | Proxy: http.ProxyFromEnvironment, 74 | DialContext: dialer.DialContext, 75 | MaxIdleConns: 100, 76 | IdleConnTimeout: 90 * time.Second, 77 | TLSHandshakeTimeout: 10 * time.Second, 78 | ExpectContinueTimeout: 1 * time.Second, 79 | MaxIdleConnsPerHost: runtime.GOMAXPROCS(0) + 1, 80 | TLSClientConfig: sc.config.Security.GetTLSConfig(), 81 | } 82 | return &http.Client{ 83 | Transport: tr, 84 | } 85 | } 86 | 87 | func (sc *SubscriberController) Name() string { 88 | return "Top SQL" 89 | } 90 | 91 | func (sc *SubscriberController) GetConfig() *config.Config { 92 | return sc.config 93 | } 94 | 95 | func (sc *SubscriberController) IsEnabled() bool { 96 | return sc.variable.EnableTopSQL 97 | } 98 | 99 | func (sc *SubscriberController) UpdatePDVariable(variable pdvariable.PDVariable) { 100 | sc.variable = &variable 101 | } 102 | 103 | func (sc *SubscriberController) UpdateConfig(cfg config.Config) { 104 | sc.config = &cfg 105 | } 106 | 107 | func (sc *SubscriberController) UpdateTopology(components []topology.Component) { 108 | sc.components = components 109 | 110 | if sc.variable.EnableTopSQL { 111 | if err := sc.storeTopology(); err != nil { 112 | log.Warn("failed to store topology", zap.Error(err)) 113 | } 114 | } 115 | } 116 | 117 | func (sc *SubscriberController) storeTopology() error { 118 | if len(sc.components) == 0 { 119 | return nil 120 | } 121 | 122 | items := instancesItemSliceP.Get() 123 | defer instancesItemSliceP.Put(items) 124 | 125 | now := time.Now().Unix() 126 | for _, com := range sc.components { 127 | switch com.Name { 128 | case topology.ComponentTiDB: 129 | *items = append(*items, store.InstanceItem{ 130 | Instance: fmt.Sprintf("%s:%d", com.IP, com.StatusPort), 131 | InstanceType: topology.ComponentTiDB, 132 | TimestampSec: uint64(now), 133 | }) 134 | case topology.ComponentTiKV: 135 | *items = append(*items, store.InstanceItem{ 136 | Instance: fmt.Sprintf("%s:%d", com.IP, com.Port), 137 | InstanceType: topology.ComponentTiKV, 138 | TimestampSec: uint64(now), 139 | }) 140 | } 141 | } 142 | return sc.store.Instances(*items) 143 | } 144 | -------------------------------------------------------------------------------- 
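A brief note on the *[]T pools used throughout this component (instancesItemSliceP above, plus the equivalents under query, service, and store): Get returns a pointer to a slice whose backing array may be left over from an earlier caller, and Put truncates it to length zero, so capacity is reused while stale elements are dropped. A pointer to the slice is pooled rather than the slice header itself so that handing the value back to sync.Pool's interface{} slot does not cost an extra allocation. Below is a minimal usage sketch, not part of the repository; it assumes the same subscriber package, and the helper name storeOneInstance and the sample address are made up for illustration.

// storeOneInstance is a hypothetical helper illustrating the intended
// lifecycle: borrow the slice, fill it, hand it to the consumer, then return
// it via defer so the backing array can be reused. The borrowed slice must
// not be retained after Put.
func storeOneInstance(s store.Store, now uint64) error {
	items := instancesItemSliceP.Get()
	defer instancesItemSliceP.Put(items)

	*items = append(*items, store.InstanceItem{
		Instance:     "127.0.0.1:20160",
		InstanceType: topology.ComponentTiKV,
		TimestampSec: now,
	})
	return s.Instances(*items)
}

--------------------------------------------------------------------------------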
/component/topsql/subscriber/subscriber_test.go: -------------------------------------------------------------------------------- 1 | package subscriber_test 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "testing" 7 | "time" 8 | 9 | "github.com/pingcap/ng-monitoring/component/subscriber" 10 | "github.com/pingcap/ng-monitoring/component/topology" 11 | "github.com/pingcap/ng-monitoring/component/topsql/mock" 12 | sub "github.com/pingcap/ng-monitoring/component/topsql/subscriber" 13 | "github.com/pingcap/ng-monitoring/config" 14 | "github.com/pingcap/ng-monitoring/config/pdvariable" 15 | 16 | "github.com/pingcap/tipb/go-tipb" 17 | "github.com/stretchr/testify/require" 18 | ) 19 | 20 | type testSuite struct { 21 | subscriber *subscriber.Subscriber 22 | 23 | varSubscriber pdvariable.Subscriber 24 | topoSubscriber topology.Subscriber 25 | cfgSubscriber config.Subscriber 26 | 27 | store *mock.MemStore 28 | service *mock.MockPubSub 29 | 30 | ip string 31 | port uint 32 | } 33 | 34 | func newTestSuite() *testSuite { 35 | ts := &testSuite{} 36 | 37 | ts.varSubscriber = make(pdvariable.Subscriber) 38 | ts.topoSubscriber = make(topology.Subscriber) 39 | ts.cfgSubscriber = make(config.Subscriber) 40 | ts.store = mock.NewMemStore() 41 | 42 | controller := sub.NewSubscriberController(ts.store) 43 | ts.subscriber = subscriber.NewSubscriber(nil, ts.topoSubscriber, ts.varSubscriber, ts.cfgSubscriber, controller) 44 | 45 | ts.service = mock.NewMockPubSub() 46 | ts.ip, ts.port, _ = ts.service.Listen("127.0.0.1:0", nil) 47 | go ts.service.Serve() 48 | return ts 49 | } 50 | 51 | func (ts *testSuite) checkTiDBScrape(t *testing.T) { 52 | checkTiDBScrape(t, fmt.Sprintf("%s:%d", ts.ip, ts.port), ts.service, ts.store) 53 | } 54 | 55 | func (ts *testSuite) checkTiKVScrape(t *testing.T) { 56 | checkTiKVScrape(t, fmt.Sprintf("%s:%d", ts.ip, ts.port), ts.service, ts.store) 57 | } 58 | 59 | func (ts *testSuite) Close() { 60 | ts.service.Stop() 61 | ts.subscriber.Close() 62 | ts.store.Close() 63 | } 64 | 65 | func TestSubscriberBasic(t *testing.T) { 66 | t.Parallel() 67 | 68 | ts := newTestSuite() 69 | defer ts.Close() 70 | 71 | ts.varSubscriber <- enable 72 | topo := []topology.Component{{ 73 | Name: topology.ComponentTiDB, 74 | IP: ts.ip, 75 | StatusPort: ts.port, 76 | }, { 77 | Name: topology.ComponentPD, 78 | }, { 79 | Name: topology.ComponentTiFlash, 80 | }} 81 | ts.topoSubscriber <- topoGetter(topo) 82 | ts.checkTiDBScrape(t) 83 | 84 | topo = append(topo, topology.Component{ 85 | Name: topology.ComponentTiKV, 86 | IP: ts.ip, 87 | Port: ts.port, 88 | }) 89 | ts.topoSubscriber <- topoGetter(topo) 90 | ts.checkTiKVScrape(t) 91 | } 92 | 93 | func TestSubscriberEnableAfterTopoIsReady(t *testing.T) { 94 | t.Parallel() 95 | 96 | ts := newTestSuite() 97 | defer ts.Close() 98 | 99 | topo := []topology.Component{{ 100 | Name: topology.ComponentTiDB, 101 | IP: ts.ip, 102 | StatusPort: ts.port, 103 | }} 104 | ts.topoSubscriber <- topoGetter(topo) 105 | ts.varSubscriber <- enable 106 | ts.checkTiDBScrape(t) 107 | } 108 | 109 | func TestSubscriberTopoChange(t *testing.T) { 110 | t.Parallel() 111 | 112 | ts := newTestSuite() 113 | defer ts.Close() 114 | 115 | ts.varSubscriber <- enable 116 | topo := []topology.Component{{ 117 | Name: topology.ComponentTiDB, 118 | IP: ts.ip, 119 | StatusPort: ts.port, 120 | }, { 121 | Name: topology.ComponentTiKV, 122 | IP: ts.ip, 123 | Port: ts.port, 124 | }} 125 | ts.topoSubscriber <- topoGetter(topo) 126 | 127 | ts.checkTiDBScrape(t) 128 | ts.checkTiKVScrape(t) 129 | 130 | // tidb component is 
out 131 | ts.service.AccessTiDBStream(func(s tipb.TopSQLPubSub_SubscribeServer) error { 132 | retry := 0 133 | for { 134 | err := s.Send(&tipb.TopSQLSubResponse{}) 135 | if err != nil && strings.Contains(err.Error(), "transport is closing") { 136 | return nil 137 | } 138 | 139 | if retry > 5 { 140 | require.Fail(t, "err should not be nil due to scraper should be closed") 141 | } 142 | retry += 1 143 | time.Sleep(10 * time.Millisecond) 144 | } 145 | }) 146 | ts.topoSubscriber <- topoGetter(topo[1:]) 147 | } 148 | 149 | func TestSubscriberDisable(t *testing.T) { 150 | t.Parallel() 151 | 152 | ts := newTestSuite() 153 | defer ts.Close() 154 | 155 | ts.varSubscriber <- enable 156 | topo := []topology.Component{{ 157 | Name: topology.ComponentTiDB, 158 | IP: ts.ip, 159 | StatusPort: ts.port, 160 | }} 161 | ts.topoSubscriber <- topoGetter(topo) 162 | ts.checkTiDBScrape(t) 163 | 164 | // disable 165 | ts.service.AccessTiDBStream(func(s tipb.TopSQLPubSub_SubscribeServer) error { 166 | retry := 0 167 | for { 168 | err := s.Send(&tipb.TopSQLSubResponse{}) 169 | if err != nil && strings.Contains(err.Error(), "transport is closing") { 170 | return nil 171 | } 172 | 173 | if retry > 5 { 174 | require.Fail(t, "err should not be nil due to scraper should be closed") 175 | } 176 | retry += 1 177 | time.Sleep(10 * time.Millisecond) 178 | } 179 | }) 180 | ts.varSubscriber <- disable 181 | } 182 | 183 | func enable() pdvariable.PDVariable { 184 | return pdvariable.PDVariable{EnableTopSQL: true} 185 | } 186 | 187 | func disable() pdvariable.PDVariable { 188 | return pdvariable.PDVariable{EnableTopSQL: false} 189 | } 190 | 191 | func topoGetter(topo []topology.Component) topology.GetLatestTopology { 192 | return func() []topology.Component { 193 | return topo 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /component/topsql/topsql.go: -------------------------------------------------------------------------------- 1 | package topsql 2 | 3 | import ( 4 | "net/http" 5 | 6 | "github.com/pingcap/ng-monitoring/component/domain" 7 | "github.com/pingcap/ng-monitoring/component/subscriber" 8 | "github.com/pingcap/ng-monitoring/component/topology" 9 | "github.com/pingcap/ng-monitoring/component/topsql/query" 10 | "github.com/pingcap/ng-monitoring/component/topsql/service" 11 | "github.com/pingcap/ng-monitoring/component/topsql/store" 12 | sub "github.com/pingcap/ng-monitoring/component/topsql/subscriber" 13 | "github.com/pingcap/ng-monitoring/config" 14 | "github.com/pingcap/ng-monitoring/config/pdvariable" 15 | "github.com/pingcap/ng-monitoring/database/docdb" 16 | 17 | "github.com/gin-gonic/gin" 18 | ) 19 | 20 | var ( 21 | defStore store.Store 22 | defQuery query.Query 23 | defSubscriber *subscriber.Subscriber 24 | defService *service.Service 25 | ) 26 | 27 | func Init( 28 | do *domain.Domain, 29 | cfgSub config.Subscriber, 30 | docDB docdb.DocDB, 31 | insertHdr, selectHdr http.HandlerFunc, 32 | topSub topology.Subscriber, 33 | varSub pdvariable.Subscriber, 34 | metaRetentionSecs int64, 35 | ) (err error) { 36 | defStore = store.NewDefaultStore(insertHdr, docDB, metaRetentionSecs) 37 | defQuery = query.NewDefaultQuery(selectHdr, docDB) 38 | defSubscriber = sub.NewSubscriber(topSub, varSub, cfgSub, do, defStore) 39 | defService = service.NewService(defQuery) 40 | return nil 41 | } 42 | 43 | func HTTPService(g *gin.RouterGroup) { 44 | defService.HTTPService(g) 45 | } 46 | 47 | func Stop() { 48 | defSubscriber.Close() 49 | defQuery.Close() 50 | defStore.Close() 51 | 
} 52 | -------------------------------------------------------------------------------- /config/config.toml.example: -------------------------------------------------------------------------------- 1 | # NG Monitoring Server Configuration. 2 | 3 | # Server address. 4 | address = "0.0.0.0:12020" 5 | 6 | advertise-address = "0.0.0.0:12020" 7 | 8 | [log] 9 | # Log path 10 | path = "log" 11 | 12 | # Log level: DEBUG, INFO, WARN, ERROR 13 | level = "INFO" 14 | 15 | [pd] 16 | # Addresses of PD instances within the TiDB cluster. Multiple addresses are separated by commas, e.g. ["10.0.0.1:2379","10.0.0.2:2379"] 17 | endpoints = ["0.0.0.0:2379"] 18 | 19 | [storage] 20 | # Storage path of ng monitoring server 21 | path = "data" 22 | docdb-backend = "sqlite" 23 | 24 | [security] 25 | ca-path = "" 26 | cert-path = "" 27 | key-path = "" 28 | 29 | [tsdb] 30 | # Data with timestamps outside the retentionPeriod is automatically deleted 31 | # The following optional suffixes are supported: h (hour), d (day), w (week), y (year). 32 | # If suffix isn't set, then the duration is counted in months. 33 | retention-period = "1" 34 | # `search-max-unique-timeseries` limits the number of unique time series a single query can find and process. 35 | # VictoriaMetrics(tsdb) keeps in memory some metainformation about the time series located by each query 36 | # and spends some CPU time for processing the found time series. This means that the maximum memory usage 37 | # and CPU usage a single query can use is proportional to `search-max-unique-timeseries`. 38 | search-max-unique-timeseries = 300000 39 | 40 | [docdb] 41 | lsm-only = false 42 | -------------------------------------------------------------------------------- /config/pdvariable/pdvariable.go: -------------------------------------------------------------------------------- 1 | package pdvariable 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "strconv" 7 | "strings" 8 | "sync" 9 | "sync/atomic" 10 | "time" 11 | 12 | "github.com/pingcap/ng-monitoring/component/domain" 13 | "github.com/pingcap/ng-monitoring/utils" 14 | 15 | "github.com/pingcap/log" 16 | "go.etcd.io/etcd/api/v3/mvccpb" 17 | clientv3 "go.etcd.io/etcd/client/v3" 18 | "go.uber.org/zap" 19 | ) 20 | 21 | var ( 22 | GlobalConfigPath = "/global/config/" 23 | defaultRetryCnt = 5 24 | defaultTimeout = time.Second 25 | defaultRetryInterval = time.Millisecond * 200 26 | ) 27 | 28 | func Init(do *domain.Domain) { 29 | loader = &variableLoader{do: do} 30 | defVar := DefaultPDVariable() 31 | loader.variable.Store(*defVar) 32 | go utils.GoWithRecovery(loader.start, nil) 33 | } 34 | 35 | type PDVariable struct { 36 | EnableTopSQL bool 37 | } 38 | 39 | func DefaultPDVariable() *PDVariable { 40 | return &PDVariable{EnableTopSQL: false} 41 | } 42 | 43 | type Subscriber = chan GetLatestPDVariable 44 | type GetLatestPDVariable = func() PDVariable 45 | 46 | func Subscribe() Subscriber { 47 | return loader.subscribe() 48 | } 49 | 50 | var loader *variableLoader 51 | 52 | type variableLoader struct { 53 | do *domain.Domain 54 | cancel context.CancelFunc 55 | 56 | variable atomic.Value 57 | 58 | sync.Mutex 59 | subscribers []Subscriber 60 | } 61 | 62 | func (l *variableLoader) start() { 63 | ctx, cancel := context.WithCancel(context.Background()) 64 | l.cancel = cancel 65 | l.loadGlobalConfigLoop(ctx) 66 | } 67 | 68 | func Stop() { 69 | if loader != nil && loader.cancel != nil { 70 | loader.cancel() 71 | 72 | loader.Lock() 73 | for _, ch := range loader.subscribers { 74 | close(ch) 75 | } 76 | loader.Unlock() 77 | } 78 
| } 79 | 80 | func (l *variableLoader) GetEtcdClient() (*clientv3.Client, error) { 81 | return l.do.GetEtcdClient() 82 | } 83 | 84 | func (l *variableLoader) loadGlobalConfigLoop(ctx context.Context) { 85 | etcdCli, err := l.do.GetEtcdClient() 86 | if err != nil { 87 | return 88 | } 89 | 90 | ticker := time.NewTicker(time.Minute) 91 | watchCh := etcdCli.Watch(ctx, GlobalConfigPath, clientv3.WithPrefix()) 92 | 93 | cfg, err := l.loadAllGlobalConfig(ctx) 94 | if err != nil { 95 | log.Error("first load global config failed", zap.Error(err)) 96 | } else { 97 | log.Info("first load global config", zap.Reflect("global-config", cfg)) 98 | l.variable.Store(*cfg) 99 | l.notifySubscriber() 100 | } 101 | 102 | for { 103 | select { 104 | case <-ctx.Done(): 105 | return 106 | case <-ticker.C: 107 | newCfg, err := l.loadAllGlobalConfig(ctx) 108 | if err != nil || newCfg == nil { 109 | log.Error("load global config failed", zap.Error(err)) 110 | } else if cfg != nil { 111 | if newCfg != cfg { 112 | l.variable.Store(*newCfg) 113 | cfg = newCfg 114 | log.Info("load global config", zap.Reflect("cfg", cfg)) 115 | l.notifySubscriber() 116 | } 117 | } 118 | case e, ok := <-watchCh: 119 | if !ok { 120 | log.Info("global config watch channel closed") 121 | etcdCli, err = l.do.GetEtcdClient() 122 | if err != nil { 123 | return 124 | } 125 | watchCh = etcdCli.Watch(ctx, GlobalConfigPath, clientv3.WithPrefix()) 126 | // sleep a while to avoid too often. 127 | time.Sleep(time.Second) 128 | } else { 129 | newCfg := *cfg 130 | for _, event := range e.Events { 131 | if event.Type != mvccpb.PUT { 132 | continue 133 | } 134 | err = l.parseGlobalConfig(string(event.Kv.Key), string(event.Kv.Value), &newCfg) 135 | if err != nil { 136 | log.Error("load global config failed", zap.Error(err)) 137 | } 138 | log.Info("watch global config changed", zap.Reflect("cfg", newCfg)) 139 | } 140 | if newCfg != *cfg { 141 | l.variable.Store(newCfg) 142 | *cfg = newCfg 143 | l.notifySubscriber() 144 | } 145 | } 146 | } 147 | } 148 | } 149 | 150 | func (l *variableLoader) loadAllGlobalConfig(ctx context.Context) (*PDVariable, error) { 151 | var err error 152 | var resp *clientv3.GetResponse 153 | for i := 0; i < defaultRetryCnt; i++ { 154 | select { 155 | case <-ctx.Done(): 156 | return nil, ctx.Err() 157 | default: 158 | } 159 | var etcdCli *clientv3.Client 160 | etcdCli, err = l.do.GetEtcdClient() 161 | if err != nil { 162 | return nil, err 163 | } 164 | childCtx, cancel := context.WithTimeout(ctx, defaultTimeout) 165 | resp, err = etcdCli.Get(childCtx, GlobalConfigPath, clientv3.WithPrefix()) 166 | cancel() 167 | if err != nil { 168 | log.Debug("load global config failed.", zap.Error(err)) 169 | time.Sleep(defaultRetryInterval) 170 | continue 171 | } 172 | cfg := DefaultPDVariable() 173 | if len(resp.Kvs) == 0 { 174 | return cfg, nil 175 | } 176 | for _, kv := range resp.Kvs { 177 | err = l.parseGlobalConfig(string(kv.Key), string(kv.Value), cfg) 178 | if err != nil { 179 | return nil, err 180 | } 181 | } 182 | return cfg, nil 183 | } 184 | return nil, err 185 | } 186 | 187 | func (l *variableLoader) parseGlobalConfig(key, value string, cfg *PDVariable) error { 188 | key = strings.TrimPrefix(key, GlobalConfigPath) 189 | switch key { 190 | case "enable_resource_metering": 191 | v, err := strconv.ParseBool(value) 192 | if err != nil { 193 | return fmt.Errorf("global config %v has invalid value: %v", 194 | "enable_resource_metering", value) 195 | } 196 | cfg.EnableTopSQL = v 197 | } 198 | return nil 199 | } 200 | 201 | func (l 
*variableLoader) subscribe() Subscriber { 202 | ch := make(Subscriber, 1) 203 | l.Lock() 204 | l.subscribers = append(l.subscribers, ch) 205 | ch <- l.load 206 | l.Unlock() 207 | return ch 208 | } 209 | 210 | func (l *variableLoader) load() PDVariable { 211 | return l.variable.Load().(PDVariable) 212 | } 213 | 214 | func (l *variableLoader) notifySubscriber() { 215 | l.Lock() 216 | 217 | for _, ch := range l.subscribers { 218 | select { 219 | case ch <- l.load: 220 | default: 221 | } 222 | } 223 | 224 | l.Unlock() 225 | } 226 | -------------------------------------------------------------------------------- /config/pdvariable/pdvariable_test.go: -------------------------------------------------------------------------------- 1 | package pdvariable_test 2 | 3 | import ( 4 | "context" 5 | "runtime" 6 | "testing" 7 | "time" 8 | 9 | "github.com/pingcap/ng-monitoring/component/domain" 10 | "github.com/pingcap/ng-monitoring/config" 11 | "github.com/pingcap/ng-monitoring/config/pdvariable" 12 | 13 | "github.com/stretchr/testify/require" 14 | "go.etcd.io/etcd/tests/v3/integration" 15 | ) 16 | 17 | func TestPDVariableSubscribe(t *testing.T) { 18 | if runtime.GOOS == "windows" { 19 | t.Skip("integration.NewClusterV3 will create file contains a colon which is not allowed on Windows") 20 | } 21 | 22 | integration.BeforeTestExternal(t) 23 | for i := 0; i < 2; i++ { 24 | testPDVariableSubscribe(t, i%2 == 0) 25 | } 26 | } 27 | 28 | func testPDVariableSubscribe(t *testing.T, init bool) { 29 | cfg := config.GetDefaultConfig() 30 | config.StoreGlobalConfig(cfg) 31 | 32 | cluster := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 1}) 33 | defer cluster.Terminate(t) 34 | 35 | if init { 36 | cli := cluster.RandClient() 37 | _, err := cli.Put(context.Background(), pdvariable.GlobalConfigPath+"enable_resource_metering", "false") 38 | require.NoError(t, err) 39 | } 40 | 41 | do := domain.NewDomainForTest(nil, cluster.RandClient()) 42 | pdvariable.Init(do) 43 | defer pdvariable.Stop() 44 | 45 | // wait for first load finish 46 | time.Sleep(time.Millisecond * 100) 47 | 48 | sub := pdvariable.Subscribe() 49 | getVars := <-sub 50 | require.Equal(t, false, getVars().EnableTopSQL) 51 | 52 | cli := cluster.RandClient() 53 | _, err := cli.Put(context.Background(), pdvariable.GlobalConfigPath+"unknown", "false") 54 | require.NoError(t, err) 55 | _, err = cli.Put(context.Background(), pdvariable.GlobalConfigPath+"unknown", "abcd") 56 | require.NoError(t, err) 57 | _, err = cli.Put(context.Background(), pdvariable.GlobalConfigPath+"enable_resource_metering", "true") 58 | require.NoError(t, err) 59 | 60 | time.Sleep(time.Millisecond * 100) 61 | getVars = <-sub 62 | require.Equal(t, true, getVars().EnableTopSQL) 63 | 64 | cli = cluster.RandClient() 65 | _, err = cli.Put(context.Background(), pdvariable.GlobalConfigPath+"enable_resource_metering", "false") 66 | require.NoError(t, err) 67 | _, err = cli.Put(context.Background(), pdvariable.GlobalConfigPath+"enable_resource_metering", "false") 68 | require.NoError(t, err) 69 | _, err = cli.Put(context.Background(), pdvariable.GlobalConfigPath+"enable_resource_metering", "true") 70 | require.NoError(t, err) 71 | 72 | time.Sleep(time.Millisecond * 100) 73 | getVars = <-sub 74 | require.Equal(t, true, getVars().EnableTopSQL) 75 | 76 | cli = cluster.RandClient() 77 | _, err = cli.Put(context.Background(), pdvariable.GlobalConfigPath+"enable_resource_metering", "true") 78 | require.NoError(t, err) 79 | _, err = cli.Put(context.Background(), 
pdvariable.GlobalConfigPath+"enable_resource_metering", "false") 80 | require.NoError(t, err) 81 | _, err = cli.Put(context.Background(), pdvariable.GlobalConfigPath+"enable_resource_metering", "true") 82 | require.NoError(t, err) 83 | 84 | time.Sleep(time.Millisecond * 100) 85 | getVars = <-sub 86 | require.Equal(t, true, getVars().EnableTopSQL) 87 | 88 | cli = cluster.RandClient() 89 | _, err = cli.Put(context.Background(), pdvariable.GlobalConfigPath+"enable_resource_metering", "false") 90 | require.NoError(t, err) 91 | _, err = cli.Put(context.Background(), pdvariable.GlobalConfigPath+"enable_resource_metering", "true") 92 | require.NoError(t, err) 93 | _, err = cli.Put(context.Background(), pdvariable.GlobalConfigPath+"enable_resource_metering", "false") 94 | require.NoError(t, err) 95 | 96 | time.Sleep(time.Millisecond * 100) 97 | getVars = <-sub 98 | require.Equal(t, false, getVars().EnableTopSQL) 99 | } 100 | -------------------------------------------------------------------------------- /config/persist.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "encoding/json" 7 | "fmt" 8 | 9 | "github.com/pingcap/log" 10 | "github.com/pingcap/ng-monitoring/database/docdb" 11 | "go.uber.org/zap" 12 | ) 13 | 14 | const ( 15 | continuousProfilingModule = "continuous_profiling" 16 | ) 17 | 18 | func LoadConfigFromStorage(ctx context.Context, db docdb.DocDB) error { 19 | cfgMap, err := db.LoadConfig(ctx) 20 | if err != nil { 21 | return err 22 | } 23 | UpdateGlobalConfig(func(curCfg Config) (res Config) { 24 | res = curCfg 25 | for module, cfgStr := range cfgMap { 26 | switch module { 27 | case continuousProfilingModule: 28 | var newCfg ContinueProfilingConfig 29 | if err = json.NewDecoder(bytes.NewReader([]byte(cfgStr))).Decode(&newCfg); err != nil { 30 | return 31 | } 32 | if newCfg.Valid() { 33 | res.ContinueProfiling = newCfg 34 | } else { 35 | log.Info("load invalid config", 36 | zap.String("module", module), 37 | zap.Reflect("module-config", newCfg)) 38 | } 39 | default: 40 | err = fmt.Errorf("unknow module config in storage, module: %v, config: %v", module, cfgStr) 41 | return 42 | } 43 | log.Info("load config from storage", 44 | zap.String("module", module), 45 | zap.String("module-config", cfgStr), 46 | zap.Reflect("global-config", res)) 47 | } 48 | return 49 | }) 50 | return err 51 | } 52 | 53 | func saveConfigIntoStorage(db docdb.DocDB) error { 54 | cfg := GetGlobalConfig() 55 | continuousProfilingCfg := cfg.ContinueProfiling 56 | data, err := json.Marshal(continuousProfilingCfg) 57 | if err != nil { 58 | return err 59 | } 60 | return db.SaveConfig(context.Background(), map[string]string{ 61 | continuousProfilingModule: string(data), 62 | }) 63 | } 64 | -------------------------------------------------------------------------------- /config/service.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "fmt" 7 | "net/http" 8 | 9 | "github.com/pingcap/ng-monitoring/database/docdb" 10 | 11 | "github.com/gin-gonic/gin" 12 | "github.com/pingcap/log" 13 | "go.uber.org/zap" 14 | ) 15 | 16 | func HTTPService(g *gin.RouterGroup, docDB docdb.DocDB) { 17 | g.GET("", handleGetConfig(docDB)) 18 | g.POST("", handlePostConfig(docDB)) 19 | } 20 | 21 | func handleGetConfig(docDB docdb.DocDB) gin.HandlerFunc { 22 | return func(c *gin.Context) { 23 | cfg := GetGlobalConfig() 24 | c.JSON(http.StatusOK, 
cfg) 25 | } 26 | } 27 | 28 | func handlePostConfig(docDB docdb.DocDB) gin.HandlerFunc { 29 | return func(c *gin.Context) { 30 | err := handleModifyConfig(c, docDB) 31 | if err != nil { 32 | c.JSON(http.StatusServiceUnavailable, gin.H{ 33 | "status": "error", 34 | "message": err.Error(), 35 | }) 36 | return 37 | } 38 | c.JSON(http.StatusOK, gin.H{ 39 | "status": "ok", 40 | }) 41 | } 42 | } 43 | 44 | func handleModifyConfig(c *gin.Context, docDB docdb.DocDB) error { 45 | var reqNested map[string]interface{} 46 | if err := json.NewDecoder(c.Request.Body).Decode(&reqNested); err != nil { 47 | return err 48 | } 49 | for k, v := range reqNested { 50 | switch k { 51 | case "continuous_profiling": 52 | m, ok := v.(map[string]interface{}) 53 | if !ok { 54 | return fmt.Errorf("%v config value is invalid: %v", k, v) 55 | } 56 | err := handleContinueProfilingConfigModify(m, docDB) 57 | if err != nil { 58 | return err 59 | } 60 | default: 61 | return fmt.Errorf("config %v not support modify or unknow", k) 62 | } 63 | } 64 | return nil 65 | } 66 | 67 | func handleContinueProfilingConfigModify(reqNested map[string]interface{}, docDB docdb.DocDB) (err error) { 68 | UpdateGlobalConfig(func(curCfg Config) (res Config) { 69 | res = curCfg 70 | var current []byte 71 | current, err = json.Marshal(curCfg.ContinueProfiling) 72 | if err != nil { 73 | return 74 | } 75 | 76 | var currentNested map[string]interface{} 77 | if err = json.NewDecoder(bytes.NewReader(current)).Decode(¤tNested); err != nil { 78 | return 79 | } 80 | 81 | for k, newValue := range reqNested { 82 | oldValue, ok := currentNested[k] 83 | if !ok { 84 | err = fmt.Errorf("unknown config `%v`", k) 85 | return 86 | } 87 | if oldValue == newValue { 88 | continue 89 | } 90 | currentNested[k] = newValue 91 | log.Info("handle continuous profiling config modify", 92 | zap.String("name", k), 93 | zap.Reflect("old-value", oldValue), 94 | zap.Reflect("new-value", newValue)) 95 | } 96 | 97 | var data []byte 98 | data, err = json.Marshal(currentNested) 99 | if err != nil { 100 | return 101 | } 102 | var newCfg ContinueProfilingConfig 103 | err = json.NewDecoder(bytes.NewReader(data)).Decode(&newCfg) 104 | if err != nil { 105 | return 106 | } 107 | 108 | if !newCfg.Valid() { 109 | err = fmt.Errorf("new config is invalid: %v", string(data)) 110 | return 111 | } 112 | res.ContinueProfiling = newCfg 113 | return 114 | }) 115 | 116 | if err != nil { 117 | return err 118 | } 119 | 120 | return saveConfigIntoStorage(docDB) 121 | } 122 | -------------------------------------------------------------------------------- /config/service_test.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "encoding/json" 7 | "io" 8 | "net" 9 | "net/http" 10 | "os" 11 | "testing" 12 | "time" 13 | 14 | "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil" 15 | "github.com/gin-gonic/gin" 16 | "github.com/pingcap/ng-monitoring/database/docdb" 17 | "github.com/pingcap/ng-monitoring/utils/testutil" 18 | "github.com/stretchr/testify/require" 19 | ) 20 | 21 | type testSuite struct { 22 | tmpDir string 23 | db docdb.DocDB 24 | } 25 | 26 | func (ts *testSuite) setup(t *testing.T) { 27 | var err error 28 | ts.tmpDir, err = os.MkdirTemp(os.TempDir(), "ngm-test-.*") 29 | require.NoError(t, err) 30 | ts.db, err = docdb.NewGenjiDBFromGenji(testutil.NewGenjiDB(t, ts.tmpDir)) 31 | require.NoError(t, err) 32 | def := GetDefaultConfig() 33 | StoreGlobalConfig(def) 34 | err = 
LoadConfigFromStorage(context.Background(), ts.db) 35 | require.NoError(t, err) 36 | } 37 | 38 | func (ts *testSuite) close(t *testing.T) { 39 | err := ts.db.Close() 40 | require.NoError(t, err) 41 | err = os.RemoveAll(ts.tmpDir) 42 | require.NoError(t, err) 43 | } 44 | 45 | func TestHTTPService(t *testing.T) { 46 | ts := testSuite{} 47 | ts.setup(t) 48 | defer ts.close(t) 49 | addr := setupHTTPService(t, ts.db) 50 | resp, err := http.Get("http://" + addr + "/config") 51 | require.NoError(t, err) 52 | data, err := io.ReadAll(resp.Body) 53 | require.NoError(t, err) 54 | err = resp.Body.Close() 55 | require.NoError(t, err) 56 | cfg := Config{} 57 | require.Equal(t, len(data) > 10, true) 58 | err = json.Unmarshal(data, &cfg) 59 | require.NoError(t, err) 60 | 61 | res, err := http.Post("http://"+addr+"/config", "application/json", bytes.NewReader([]byte(`{"continuous_profiling": {"enable": true,"profile_seconds":6,"interval_seconds":11}}`))) 62 | require.NoError(t, err) 63 | require.Equal(t, 200, res.StatusCode) 64 | globalCfg := GetGlobalConfig() 65 | require.Equal(t, true, globalCfg.ContinueProfiling.Enable) 66 | require.Equal(t, 6, globalCfg.ContinueProfiling.ProfileSeconds) 67 | require.Equal(t, 11, globalCfg.ContinueProfiling.IntervalSeconds) 68 | err = res.Body.Close() 69 | require.NoError(t, err) 70 | 71 | // test for post invalid config 72 | res, err = http.Post("http://"+addr+"/config", "application/json", bytes.NewReader([]byte(`{"continuous_profiling": {"enable": true,"profile_seconds":1000,"interval_seconds":11}}`))) 73 | require.NoError(t, err) 74 | require.Equal(t, 503, res.StatusCode) 75 | body, err := io.ReadAll(res.Body) 76 | require.NoError(t, err) 77 | require.Equal(t, `{"message":"new config is invalid: {\"data_retention_seconds\":259200,\"enable\":true,\"interval_seconds\":11,\"profile_seconds\":1000,\"timeout_seconds\":120}","status":"error"}`, string(body)) 78 | err = res.Body.Close() 79 | require.NoError(t, err) 80 | 81 | // test empty body config 82 | res, err = http.Post("http://"+addr+"/config", "application/json", bytes.NewReader([]byte(``))) 83 | require.NoError(t, err) 84 | require.Equal(t, 503, res.StatusCode) 85 | body, err = io.ReadAll(res.Body) 86 | require.NoError(t, err) 87 | require.Equal(t, `{"message":"EOF","status":"error"}`, string(body)) 88 | err = res.Body.Close() 89 | require.NoError(t, err) 90 | 91 | // test unknown config 92 | res, err = http.Post("http://"+addr+"/config", "application/json", bytes.NewReader([]byte(`{"unknown_module": {"enable": true}}`))) 93 | require.NoError(t, err) 94 | require.Equal(t, 503, res.StatusCode) 95 | body, err = io.ReadAll(res.Body) 96 | require.NoError(t, err) 97 | require.Equal(t, `{"message":"config unknown_module not support modify or unknow","status":"error"}`, string(body)) 98 | err = res.Body.Close() 99 | require.NoError(t, err) 100 | 101 | globalCfg = GetGlobalConfig() 102 | require.Equal(t, true, globalCfg.ContinueProfiling.Enable) 103 | require.Equal(t, 6, globalCfg.ContinueProfiling.ProfileSeconds) 104 | require.Equal(t, 11, globalCfg.ContinueProfiling.IntervalSeconds) 105 | } 106 | 107 | func TestCombineHTTPWithFile(t *testing.T) { 108 | ts := testSuite{} 109 | ts.setup(t) 110 | defer ts.close(t) 111 | addr := setupHTTPService(t, ts.db) 112 | 113 | cfgFileName := "test-cfg.toml" 114 | err := os.WriteFile(cfgFileName, []byte(""), 0666) 115 | require.NoError(t, err) 116 | defer os.Remove(cfgFileName) 117 | 118 | ctx, cancel := context.WithCancel(context.Background()) 119 | defer cancel() 120 | go 
ReloadRoutine(ctx, cfgFileName) 121 | 122 | res, err := http.Post("http://"+addr+"/config", "application/json", bytes.NewReader([]byte(`{"continuous_profiling": {"enable": true}}`))) 123 | require.NoError(t, err) 124 | require.NoError(t, res.Body.Close()) 125 | 126 | time.Sleep(100 * time.Millisecond) 127 | cfg := GetGlobalConfig() 128 | require.Equal(t, cfg.ContinueProfiling.Enable, true) 129 | 130 | err = os.WriteFile(cfgFileName, []byte("[pd]\nendpoints = [\"10.0.1.8:2379\"]"), 0666) 131 | require.NoError(t, err) 132 | procutil.SelfSIGHUP() 133 | 134 | time.Sleep(100 * time.Millisecond) 135 | cfg = GetGlobalConfig() 136 | require.Equal(t, cfg.ContinueProfiling.Enable, true) 137 | require.Equal(t, cfg.PD.Endpoints, []string{"10.0.1.8:2379"}) 138 | 139 | res, err = http.Post("http://"+addr+"/config", "application/json", bytes.NewReader([]byte(`{"continuous_profiling": {"enable": false}}`))) 140 | require.NoError(t, err) 141 | require.NoError(t, res.Body.Close()) 142 | 143 | time.Sleep(100 * time.Millisecond) 144 | cfg = GetGlobalConfig() 145 | require.Equal(t, cfg.ContinueProfiling.Enable, false) 146 | require.Equal(t, cfg.PD.Endpoints, []string{"10.0.1.8:2379"}) 147 | 148 | err = os.WriteFile(cfgFileName, []byte("[pd]\nendpoints = [\"10.0.1.8:2479\"]"), 0666) 149 | require.NoError(t, err) 150 | procutil.SelfSIGHUP() 151 | 152 | time.Sleep(100 * time.Millisecond) 153 | cfg = GetGlobalConfig() 154 | require.Equal(t, cfg.ContinueProfiling.Enable, false) 155 | require.Equal(t, cfg.PD.Endpoints, []string{"10.0.1.8:2479"}) 156 | } 157 | 158 | func setupHTTPService(t *testing.T, docDB docdb.DocDB) string { 159 | listener, err := net.Listen("tcp", "127.0.0.1:0") 160 | require.NoError(t, err) 161 | 162 | gin.SetMode(gin.ReleaseMode) 163 | ng := gin.New() 164 | 165 | ng.Use(gin.Recovery()) 166 | configGroup := ng.Group("/config") 167 | HTTPService(configGroup, docDB) 168 | httpServer := &http.Server{Handler: ng} 169 | 170 | go func() { 171 | if err = httpServer.Serve(listener); err != nil && err != http.ErrServerClosed { 172 | require.NoError(t, err) 173 | } 174 | }() 175 | return listener.Addr().String() 176 | } 177 | -------------------------------------------------------------------------------- /database/database.go: -------------------------------------------------------------------------------- 1 | package database 2 | 3 | import ( 4 | "github.com/pingcap/ng-monitoring/config" 5 | "github.com/pingcap/ng-monitoring/database/timeseries" 6 | 7 | "github.com/pingcap/log" 8 | "go.uber.org/zap" 9 | ) 10 | 11 | func Init(cfg *config.Config) { 12 | timeseries.Init(cfg) 13 | // document.Init(cfg) 14 | 15 | log.Info("Initialize database successfully", zap.String("path", cfg.Storage.Path)) 16 | } 17 | 18 | func Stop() { 19 | log.Info("Stopping timeseries database") 20 | timeseries.Stop() 21 | log.Info("Stop timeseries database successfully") 22 | 23 | // log.Info("Stopping document database") 24 | // document.Stop() 25 | // log.Info("Stop document database successfully") 26 | } 27 | -------------------------------------------------------------------------------- /database/docdb/docdb.go: -------------------------------------------------------------------------------- 1 | package docdb 2 | 3 | import ( 4 | "context" 5 | "io" 6 | 7 | "github.com/pingcap/ng-monitoring/component/conprof/meta" 8 | "github.com/pingcap/tipb/go-tipb" 9 | ) 10 | 11 | type DocDB interface { 12 | io.Closer 13 | 14 | SaveConfig(ctx context.Context, cfg map[string]string) error 15 | LoadConfig(ctx context.Context) 
(map[string]string, error) 16 | 17 | WriteSQLMeta(ctx context.Context, meta *tipb.SQLMeta) error 18 | QuerySQLMeta(ctx context.Context, digest string) (string, error) 19 | DeleteSQLMetaBeforeTs(ctx context.Context, ts int64) error 20 | WritePlanMeta(ctx context.Context, meta *tipb.PlanMeta) error 21 | QueryPlanMeta(ctx context.Context, digest string) (string, string, error) 22 | DeletePlanMetaBeforeTs(ctx context.Context, ts int64) error 23 | 24 | ConprofCreateProfileTables(ctx context.Context, id int64) error 25 | ConprofDeleteProfileTables(ctx context.Context, id int64) error 26 | ConprofCreateTargetInfo(ctx context.Context, target meta.ProfileTarget, info meta.TargetInfo) error 27 | ConprofUpdateTargetInfo(ctx context.Context, info meta.TargetInfo) error 28 | ConprofQueryTargetInfo(ctx context.Context, target meta.ProfileTarget, f func(info meta.TargetInfo) error) error 29 | ConprofQueryAllProfileTargets(ctx context.Context, f func(target meta.ProfileTarget, info meta.TargetInfo) error) error 30 | ConprofWriteProfileData(ctx context.Context, id, ts int64, data []byte) error 31 | ConprofQueryProfileData(ctx context.Context, id, begin, end int64, f func(ts int64, data []byte) error) error 32 | ConprofDeleteProfileDataBeforeTs(ctx context.Context, id, ts int64) error 33 | ConprofWriteProfileMeta(ctx context.Context, id, ts int64, err string) error 34 | ConprofQueryProfileMeta(ctx context.Context, id, begin, end int64, f func(ts int64, verr string) error) error 35 | ConprofDeleteProfileMetaBeforeTs(ctx context.Context, id, ts int64) error 36 | } 37 | -------------------------------------------------------------------------------- /database/docdb/docdb_test.go: -------------------------------------------------------------------------------- 1 | package docdb 2 | 3 | import ( 4 | "context" 5 | "encoding/hex" 6 | "testing" 7 | "time" 8 | 9 | "github.com/pingcap/ng-monitoring/component/conprof/meta" 10 | "github.com/pingcap/tipb/go-tipb" 11 | "github.com/stretchr/testify/require" 12 | ) 13 | 14 | func testDocDB(t *testing.T, db DocDB) { 15 | ctx := context.Background() 16 | if deadline, ok := t.Deadline(); ok { 17 | var cancel context.CancelFunc 18 | ctx, cancel = context.WithDeadline(ctx, deadline) 19 | defer cancel() 20 | } 21 | defer func() { 22 | err := db.Close() 23 | require.NoError(t, err) 24 | }() 25 | 26 | err := db.SaveConfig(ctx, map[string]string{"test_k": "test_v"}) 27 | require.NoError(t, err) 28 | 29 | cfgs, err := db.LoadConfig(ctx) 30 | require.NoError(t, err) 31 | require.Equal(t, map[string]string{"test_k": "test_v"}, cfgs) 32 | 33 | err = db.WriteSQLMeta(ctx, &tipb.SQLMeta{ 34 | SqlDigest: []byte("test_digest"), 35 | NormalizedSql: "test_sql", 36 | IsInternalSql: true, 37 | }) 38 | require.NoError(t, err) 39 | err = db.WritePlanMeta(ctx, &tipb.PlanMeta{ 40 | PlanDigest: []byte("test_digest"), 41 | NormalizedPlan: "test_plan", 42 | EncodedNormalizedPlan: "test_encoded_plan", 43 | }) 44 | require.NoError(t, err) 45 | 46 | sqlDigest, err := db.QuerySQLMeta(ctx, hex.EncodeToString([]byte("test_digest"))) 47 | require.NoError(t, err) 48 | require.Equal(t, "test_sql", sqlDigest) 49 | planDigest, encodedPlan, err := db.QueryPlanMeta(ctx, hex.EncodeToString([]byte("test_digest"))) 50 | require.NoError(t, err) 51 | require.Equal(t, "test_plan", planDigest) 52 | require.Equal(t, "test_encoded_plan", encodedPlan) 53 | 54 | // genjiDB does not support DeleteSQLMetaBeforeTs and DeletePlanMetaBeforeTs 55 | if _, ok := db.(*genjiDB); !ok { 56 | safePointTs := time.Now().Unix() + 100 57 | 
err = db.DeleteSQLMetaBeforeTs(ctx, safePointTs) 58 | require.NoError(t, err) 59 | err = db.DeletePlanMetaBeforeTs(ctx, safePointTs) 60 | require.NoError(t, err) 61 | sqlDigest, err = db.QuerySQLMeta(ctx, hex.EncodeToString([]byte("test_digest"))) 62 | require.NoError(t, err) 63 | require.Equal(t, "", sqlDigest) 64 | planDigest, encodedPlan, err = db.QueryPlanMeta(ctx, hex.EncodeToString([]byte("test_digest"))) 65 | require.NoError(t, err) 66 | require.Equal(t, "", planDigest) 67 | require.Equal(t, "", encodedPlan) 68 | } 69 | 70 | err = db.ConprofCreateProfileTables(ctx, 1) 71 | require.NoError(t, err) 72 | 73 | err = db.ConprofCreateTargetInfo(ctx, meta.ProfileTarget{ 74 | Kind: "test_kind", 75 | Component: "test_component", 76 | Address: "test_address", 77 | }, meta.TargetInfo{ 78 | ID: 1, 79 | LastScrapeTs: 2, 80 | }) 81 | require.NoError(t, err) 82 | 83 | targets := []meta.ProfileTarget{} 84 | infos := []meta.TargetInfo{} 85 | db.ConprofQueryAllProfileTargets(ctx, func(target meta.ProfileTarget, info meta.TargetInfo) error { 86 | targets = append(targets, target) 87 | infos = append(infos, info) 88 | return nil 89 | }) 90 | require.Len(t, targets, 1) 91 | require.Len(t, infos, 1) 92 | require.Equal(t, meta.ProfileTarget{ 93 | Kind: "test_kind", 94 | Component: "test_component", 95 | Address: "test_address", 96 | }, targets[0]) 97 | require.Equal(t, meta.TargetInfo{ 98 | ID: 1, 99 | LastScrapeTs: 2, 100 | }, infos[0]) 101 | 102 | infos = []meta.TargetInfo{} 103 | db.ConprofQueryTargetInfo(ctx, meta.ProfileTarget{ 104 | Kind: "test_kind", 105 | Component: "test_component", 106 | Address: "test_address", 107 | }, func(info meta.TargetInfo) error { 108 | infos = append(infos, info) 109 | return nil 110 | }) 111 | require.Len(t, infos, 1) 112 | require.Equal(t, meta.TargetInfo{ 113 | ID: 1, 114 | LastScrapeTs: 2, 115 | }, infos[0]) 116 | 117 | err = db.ConprofUpdateTargetInfo(ctx, meta.TargetInfo{ 118 | ID: 1, 119 | LastScrapeTs: 3, 120 | }) 121 | require.NoError(t, err) 122 | 123 | infos = []meta.TargetInfo{} 124 | db.ConprofQueryTargetInfo(ctx, meta.ProfileTarget{ 125 | Kind: "test_kind", 126 | Component: "test_component", 127 | Address: "test_address", 128 | }, func(info meta.TargetInfo) error { 129 | infos = append(infos, info) 130 | return nil 131 | }) 132 | require.Len(t, infos, 1) 133 | require.Equal(t, meta.TargetInfo{ 134 | ID: 1, 135 | LastScrapeTs: 3, 136 | }, infos[0]) 137 | 138 | err = db.ConprofWriteProfileData(ctx, 1, 2, []byte("test_data")) 139 | require.NoError(t, err) 140 | 141 | tss := []int64{} 142 | datas := [][]byte{} 143 | err = db.ConprofQueryProfileData(ctx, 1, 1, 3, func(ts int64, data []byte) error { 144 | tss = append(tss, ts) 145 | datas = append(datas, data) 146 | return nil 147 | }) 148 | require.NoError(t, err) 149 | require.Len(t, tss, 1) 150 | require.Len(t, datas, 1) 151 | require.Equal(t, int64(2), tss[0]) 152 | require.Equal(t, []byte("test_data"), datas[0]) 153 | 154 | err = db.ConprofDeleteProfileDataBeforeTs(ctx, 1, 4) 155 | require.NoError(t, err) 156 | 157 | tss = []int64{} 158 | datas = [][]byte{} 159 | err = db.ConprofQueryProfileData(ctx, 1, 1, 3, func(ts int64, data []byte) error { 160 | tss = append(tss, ts) 161 | datas = append(datas, data) 162 | return nil 163 | }) 164 | require.NoError(t, err) 165 | require.Empty(t, tss) 166 | require.Empty(t, datas) 167 | 168 | err = db.ConprofWriteProfileMeta(ctx, 1, 2, "test_err") 169 | require.NoError(t, err) 170 | 171 | tss = []int64{} 172 | verrs := []string{} 173 | err = 
db.ConprofQueryProfileMeta(ctx, 1, 1, 3, func(ts int64, verr string) error { 174 | tss = append(tss, ts) 175 | verrs = append(verrs, verr) 176 | return nil 177 | }) 178 | require.NoError(t, err) 179 | require.Len(t, tss, 1) 180 | require.Len(t, verrs, 1) 181 | require.Equal(t, int64(2), tss[0]) 182 | require.Equal(t, "test_err", verrs[0]) 183 | 184 | err = db.ConprofDeleteProfileMetaBeforeTs(ctx, 1, 4) 185 | require.NoError(t, err) 186 | 187 | tss = []int64{} 188 | verrs = []string{} 189 | err = db.ConprofQueryProfileMeta(ctx, 1, 1, 3, func(ts int64, verr string) error { 190 | tss = append(tss, ts) 191 | verrs = append(verrs, verr) 192 | return nil 193 | }) 194 | require.NoError(t, err) 195 | require.Empty(t, tss) 196 | require.Empty(t, verrs) 197 | 198 | err = db.ConprofDeleteProfileTables(ctx, 1) 199 | require.NoError(t, err) 200 | 201 | targets = []meta.ProfileTarget{} 202 | infos = []meta.TargetInfo{} 203 | db.ConprofQueryAllProfileTargets(ctx, func(target meta.ProfileTarget, info meta.TargetInfo) error { 204 | targets = append(targets, target) 205 | infos = append(infos, info) 206 | return nil 207 | }) 208 | require.Empty(t, targets) 209 | require.Empty(t, infos) 210 | } 211 | -------------------------------------------------------------------------------- /database/docdb/genji_logger.go: -------------------------------------------------------------------------------- 1 | package docdb 2 | 3 | import ( 4 | stdlog "log" 5 | "os" 6 | "path" 7 | 8 | "github.com/pingcap/log" 9 | "go.uber.org/zap" 10 | ) 11 | 12 | type loggingLevel int 13 | 14 | const ( 15 | DEBUG loggingLevel = iota 16 | INFO 17 | WARN 18 | ERROR 19 | ) 20 | 21 | const ( 22 | LevelDebug = "DEBUG" 23 | LevelInfo = "INFO" 24 | LevelWarn = "WARN" 25 | LevelError = "ERROR" 26 | ) 27 | 28 | type logger struct { 29 | *stdlog.Logger 30 | level loggingLevel 31 | } 32 | 33 | func initLogger(logPath, logLevel string) (*logger, error) { 34 | var err error 35 | var logDir string 36 | if logPath != "" { 37 | logDir = logPath 38 | } else { 39 | logDir = path.Join(logPath, "docdb-log") 40 | err := os.MkdirAll(logDir, os.ModePerm) 41 | if err != nil { 42 | return nil, err 43 | } 44 | } 45 | logFileName := path.Join(logDir, "docdb.log") 46 | logFile, err := os.OpenFile(logFileName, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) 47 | if err != nil { 48 | // Need to log via the default logger due to `l` is not initialized. 49 | log.Warn("Failed to init logger", zap.String("filename", logFileName)) 50 | return nil, err 51 | } 52 | var level loggingLevel 53 | switch logLevel { 54 | case LevelDebug: 55 | level = DEBUG 56 | case LevelInfo: 57 | level = INFO 58 | case LevelWarn: 59 | level = WARN 60 | case LevelError: 61 | level = ERROR 62 | default: 63 | log.Fatal("Unsupported log level", zap.String("level", logLevel)) 64 | } 65 | return &logger{Logger: stdlog.New(logFile, "badger ", stdlog.LstdFlags), level: level}, nil 66 | } 67 | 68 | func (l *logger) Errorf(f string, v ...interface{}) { 69 | if l.level <= ERROR { 70 | l.Printf("ERROR: "+f, v...) 71 | } 72 | } 73 | 74 | func (l *logger) Warningf(f string, v ...interface{}) { 75 | if l.level <= WARN { 76 | l.Printf("WARN: "+f, v...) 77 | } 78 | } 79 | 80 | func (l *logger) Infof(f string, v ...interface{}) { 81 | if l.level <= INFO { 82 | l.Printf("INFO: "+f, v...) 83 | } 84 | } 85 | 86 | func (l *logger) Debugf(f string, v ...interface{}) { 87 | if l.level <= DEBUG { 88 | l.Printf("DEBUG: "+f, v...) 
89 | } 90 | } 91 | -------------------------------------------------------------------------------- /database/docdb/genji_test.go: -------------------------------------------------------------------------------- 1 | package docdb 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | "time" 7 | 8 | "github.com/pingcap/ng-monitoring/utils/testutil" 9 | 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | func TestGenji(t *testing.T) { 14 | dir, err := os.MkdirTemp(os.TempDir(), "ngm-test-.*") 15 | require.NoError(t, err) 16 | db, err := NewGenjiDBFromGenji(testutil.NewGenjiDB(t, dir)) 17 | require.NoError(t, err) 18 | testDocDB(t, db) 19 | } 20 | 21 | func TestGC(t *testing.T) { 22 | tmpDir, err := os.MkdirTemp(os.TempDir(), "ngm-test-.*") 23 | require.NoError(t, err) 24 | defer func() { 25 | err := os.RemoveAll(tmpDir) 26 | require.NoError(t, err) 27 | }() 28 | 29 | db := testutil.NewBadgerDB(t, tmpDir) 30 | ts, err := getLastFlattenTs(db) 31 | require.NoError(t, err) 32 | require.Equal(t, int64(0), ts) 33 | 34 | ts = time.Now().Unix() 35 | err = storeLastFlattenTs(db, ts) 36 | require.NoError(t, err) 37 | 38 | lastTs, err := getLastFlattenTs(db) 39 | require.NoError(t, err) 40 | require.Equal(t, ts, lastTs) 41 | 42 | require.False(t, needFlatten(db)) 43 | runGC(db) 44 | 45 | lastTs = ts - int64(flattenInterval/time.Second) 46 | err = storeLastFlattenTs(db, lastTs) 47 | require.NoError(t, err) 48 | require.True(t, needFlatten(db)) 49 | 50 | runGC(db) 51 | lastFlattenTs, err := getLastFlattenTs(db) 52 | require.NoError(t, err) 53 | require.NotEqual(t, lastTs, lastFlattenTs) 54 | require.Less(t, time.Now().Unix()-lastFlattenTs, int64(10)) 55 | } 56 | -------------------------------------------------------------------------------- /database/docdb/sqlite_test.go: -------------------------------------------------------------------------------- 1 | package docdb 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/require" 8 | ) 9 | 10 | func TestSQLite(t *testing.T) { 11 | dir, err := os.MkdirTemp(os.TempDir(), "ngm-test-.*") 12 | require.NoError(t, err) 13 | db, err := NewSQLiteDB(dir, true) 14 | require.NoError(t, err) 15 | testDocDB(t, db) 16 | } 17 | -------------------------------------------------------------------------------- /database/timeseries/handler.go: -------------------------------------------------------------------------------- 1 | package timeseries 2 | 3 | import ( 4 | "net/http" 5 | 6 | "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert" 7 | "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect" 8 | ) 9 | 10 | var _ http.HandlerFunc = InsertHandler 11 | var _ http.HandlerFunc = SelectHandler 12 | 13 | func InsertHandler(writer http.ResponseWriter, request *http.Request) { 14 | vminsert.RequestHandler(writer, request) 15 | } 16 | 17 | func SelectHandler(writer http.ResponseWriter, request *http.Request) { 18 | vmselect.RequestHandler(writer, request) 19 | } 20 | -------------------------------------------------------------------------------- /database/timeseries/syscall_linux.go: -------------------------------------------------------------------------------- 1 | package timeseries 2 | 3 | import ( 4 | "syscall" 5 | ) 6 | 7 | func dup2(oldfd int, newfd int) error { 8 | return syscall.Dup3(oldfd, newfd, 0) 9 | } 10 | -------------------------------------------------------------------------------- /database/timeseries/syscall_not_linux_unix.go: -------------------------------------------------------------------------------- 1 | //go:build !linux 2 
| // +build !linux 3 | 4 | package timeseries 5 | 6 | import ( 7 | "syscall" 8 | ) 9 | 10 | func dup2(oldfd int, newfd int) error { 11 | return syscall.Dup2(oldfd, newfd) 12 | } 13 | -------------------------------------------------------------------------------- /database/timeseries/vm.go: -------------------------------------------------------------------------------- 1 | package timeseries 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "os" 7 | "path" 8 | "time" 9 | 10 | "github.com/pingcap/ng-monitoring/config" 11 | 12 | "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert" 13 | "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect" 14 | "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect/promql" 15 | "github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage" 16 | "github.com/VictoriaMetrics/VictoriaMetrics/lib/fs" 17 | "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" 18 | "github.com/pingcap/log" 19 | "go.uber.org/zap" 20 | ) 21 | 22 | func Init(cfg *config.Config) { 23 | if err := initLogger(cfg); err != nil { 24 | log.Fatal("Failed to open log file", zap.Error(err)) 25 | } 26 | initDataDir(path.Join(cfg.Storage.Path, "tsdb")) 27 | 28 | _ = flag.Set("retentionPeriod", cfg.TSDB.RetentionPeriod) 29 | _ = flag.Set("search.maxStepForPointsAdjustment", "1s") 30 | _ = flag.Set("search.maxUniqueTimeseries", fmt.Sprintf("%d", cfg.TSDB.SearchMaxUniqueTimeseries)) 31 | if cfg.TSDB.MemoryAllowedBytes > 0 { 32 | _ = flag.Set("memory.allowedBytes", fmt.Sprintf("%d", cfg.TSDB.MemoryAllowedBytes)) 33 | } 34 | if cfg.TSDB.MemoryAllowedPercent > 0 { 35 | _ = flag.Set("memory.allowedPercent", fmt.Sprintf("%f", cfg.TSDB.MemoryAllowedPercent)) 36 | } 37 | if cfg.TSDB.CacheSizeIndexDBDataBlocks != "" { 38 | _ = flag.Set("storage.cacheSizeIndexDBDataBlocks", cfg.TSDB.CacheSizeIndexDBDataBlocks) 39 | } 40 | if cfg.TSDB.CacheSizeIndexDBDataBlocksSparse != "" { 41 | _ = flag.Set("storage.cacheSizeIndexDBDataBlocksSparse", cfg.TSDB.CacheSizeIndexDBDataBlocksSparse) 42 | } 43 | if cfg.TSDB.CacheSizeIndexDBIndexBlocks != "" { 44 | _ = flag.Set("storage.cacheSizeIndexDBIndexBlocks", cfg.TSDB.CacheSizeIndexDBIndexBlocks) 45 | } 46 | if cfg.TSDB.CacheSizeIndexDBTagFilters != "" { 47 | _ = flag.Set("storage.cacheSizeIndexDBTagFilters", cfg.TSDB.CacheSizeIndexDBTagFilters) 48 | } 49 | if cfg.TSDB.CacheSizeMetricNamesStats != "" { 50 | _ = flag.Set("storage.cacheSizeMetricNamesStats", cfg.TSDB.CacheSizeMetricNamesStats) 51 | } 52 | if cfg.TSDB.CacheSizeStorageTSID != "" { 53 | _ = flag.Set("storage.cacheSizeStorageTSID", cfg.TSDB.CacheSizeStorageTSID) 54 | } 55 | 56 | // Some components in VictoriaMetrics want parsed arguments, i.e. assert `flag.Parsed()`. Make them happy. 
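// Parsing an empty argument list only marks the default flag set as parsed;
// it does not read os.Args, so the values applied via flag.Set above are kept.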
57 | _ = flag.CommandLine.Parse(nil) 58 | 59 | startTime := time.Now() 60 | vmstorage.Init(promql.ResetRollupResultCacheIfNeeded) 61 | vmselect.Init() 62 | vminsert.Init() 63 | 64 | logger.Infof("started VictoriaMetrics in %.3f seconds", time.Since(startTime).Seconds()) 65 | } 66 | 67 | func Stop() { 68 | startTime := time.Now() 69 | vminsert.Stop() 70 | logger.Infof("successfully shut down the webservice in %.3f seconds", time.Since(startTime).Seconds()) 71 | 72 | vmstorage.Stop() 73 | vmselect.Stop() 74 | 75 | fs.MustStopDirRemover() 76 | 77 | logger.Infof("the VictoriaMetrics has been stopped in %.3f seconds", time.Since(startTime).Seconds()) 78 | } 79 | 80 | func initLogger(cfg *config.Config) error { 81 | _ = flag.Set("loggerOutput", "stderr") 82 | _ = flag.Set("loggerLevel", mapLogLevel(cfg.Log.Level)) 83 | 84 | var logDir string 85 | if cfg.Log.Path != "" { 86 | logDir = cfg.Log.Path 87 | } else { 88 | // create tsdb log dir 89 | logDir = path.Join(cfg.Storage.Path, "tsdb-log") 90 | err := os.MkdirAll(logDir, os.ModePerm) 91 | if err != nil { 92 | return err 93 | } 94 | } 95 | 96 | // VictoriaMetrics only supports stdout or stderr as log output. 97 | // To output the log to the specified file, redirect stderr to that file. 98 | logFileName := path.Join(logDir, "tsdb.log") 99 | file, err := os.OpenFile(logFileName, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) 100 | if err != nil { 101 | return err 102 | } 103 | if err = dup2(int(file.Fd()), int(os.Stderr.Fd())); err != nil { 104 | return err 105 | } 106 | logger.Init() 107 | 108 | return nil 109 | } 110 | 111 | func initDataDir(dataPath string) { 112 | _ = flag.Set("storageDataPath", dataPath) 113 | } 114 | 115 | func mapLogLevel(level string) string { 116 | switch level { 117 | case config.LevelDebug, config.LevelInfo: 118 | return "INFO" 119 | case config.LevelWarn: 120 | return "WARN" 121 | case config.LevelError: 122 | return "ERROR" 123 | default: 124 | return "INFO" 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | stdlog "log" 7 | "os" 8 | "runtime/debug" 9 | 10 | "github.com/pingcap/ng-monitoring/component/conprof" 11 | "github.com/pingcap/ng-monitoring/component/domain" 12 | "github.com/pingcap/ng-monitoring/component/topology" 13 | "github.com/pingcap/ng-monitoring/component/topsql" 14 | "github.com/pingcap/ng-monitoring/config" 15 | "github.com/pingcap/ng-monitoring/config/pdvariable" 16 | "github.com/pingcap/ng-monitoring/database" 17 | "github.com/pingcap/ng-monitoring/database/docdb" 18 | "github.com/pingcap/ng-monitoring/database/timeseries" 19 | "github.com/pingcap/ng-monitoring/service" 20 | "github.com/pingcap/ng-monitoring/utils/printer" 21 | 22 | "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil" 23 | "github.com/pingcap/log" 24 | "github.com/spf13/pflag" 25 | "go.uber.org/zap" 26 | ) 27 | 28 | const ( 29 | nmVersion = "version" 30 | nmAddr = "address" 31 | nmPdEndpoints = "pd.endpoints" 32 | nmLogPath = "log.path" 33 | nmStoragePath = "storage.path" 34 | nmConfig = "config" 35 | nmAdvertiseAddress = "advertise-address" 36 | nmRetentionPeriod = "retention-period" 37 | ) 38 | 39 | var ( 40 | version = pflag.BoolP(nmVersion, "V", false, "print version information and exit") 41 | listenAddr = pflag.String(nmAddr, "", "TCP address to listen for http connections") 42 | pdEndpoints = 
pflag.StringSlice(nmPdEndpoints, nil, "Addresses of PD instances within the TiDB cluster. Multiple addresses are separated by commas, e.g. --pd.endpoints 10.0.0.1:2379,10.0.0.2:2379") 43 | logPath = pflag.String(nmLogPath, "", "Log path of ng monitoring server") 44 | storagePath = pflag.String(nmStoragePath, "", "Storage path of ng monitoring server") 45 | configPath = pflag.String(nmConfig, "", "config file path") 46 | advertiseAddress = pflag.String(nmAdvertiseAddress, "", "ngm server advertise IP:PORT") 47 | retentionPeriod = pflag.String(nmRetentionPeriod, "", "Data with timestamps outside the retentionPeriod is automatically deleted\nThe following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months") 48 | ) 49 | 50 | func main() { 51 | // There are dependencies that use `flag`. 52 | // For isolation and avoiding conflict, we use another command line parser package `pflag`. 53 | pflag.Parse() 54 | 55 | if *version { 56 | fmt.Println(printer.GetNGMInfo()) 57 | return 58 | } 59 | 60 | cfg, err := config.InitConfig(*configPath, overrideConfig) 61 | if err != nil { 62 | stdlog.Fatalf("Failed to initialize config, err: %s", err.Error()) 63 | } 64 | 65 | if cfg.Go.GCPercent > 0 { 66 | debug.SetGCPercent(cfg.Go.GCPercent) 67 | } 68 | if cfg.Go.MemoryLimit > 0 { 69 | debug.SetMemoryLimit(cfg.Go.MemoryLimit) 70 | } 71 | 72 | cfg.Log.InitDefaultLogger() 73 | printer.PrintNGMInfo() 74 | log.Info("config", zap.Any("config", cfg)) 75 | 76 | mustCreateDirs(cfg) 77 | 78 | database.Init(cfg) 79 | defer database.Stop() 80 | 81 | var docDB docdb.DocDB 82 | switch cfg.Storage.DocDBBackend { 83 | case "sqlite": 84 | docDB, err = docdb.NewSQLiteDB(cfg.Storage.Path, cfg.Storage.SQLiteUseWAL) 85 | default: 86 | docDB, err = docdb.NewGenjiDB(context.Background(), &docdb.GenjiConfig{ 87 | Path: cfg.Storage.Path, 88 | LogPath: cfg.Log.Path, 89 | LogLevel: cfg.Log.Level, 90 | BadgerConfig: cfg.DocDB, 91 | }) 92 | } 93 | if err != nil { 94 | stdlog.Fatalf("Failed to create docdb err: %s", err.Error()) 95 | } 96 | defer func() { 97 | if err := docDB.Close(); err != nil { 98 | stdlog.Fatalf("Failed to close docdb err: %s", err.Error()) 99 | } 100 | }() 101 | 102 | err = config.LoadConfigFromStorage(context.Background(), docDB) 103 | if err != nil { 104 | stdlog.Fatalf("Failed to load config from storage, err: %s", err.Error()) 105 | } 106 | 107 | do := domain.NewDomain() 108 | defer do.Close() 109 | 110 | err = topology.Init(do) 111 | if err != nil { 112 | log.Fatal("Failed to initialize topology", zap.Error(err)) 113 | } 114 | defer topology.Stop() 115 | 116 | pdvariable.Init(do) 117 | defer pdvariable.Stop() 118 | 119 | err = topsql.Init(do, config.Subscribe(), docDB, timeseries.InsertHandler, timeseries.SelectHandler, topology.Subscribe(), pdvariable.Subscribe(), cfg.Storage.MetaRetentionSecs) 120 | if err != nil { 121 | log.Fatal("Failed to initialize topsql", zap.Error(err)) 122 | } 123 | defer topsql.Stop() 124 | 125 | err = conprof.Init(docDB, topology.Subscribe()) 126 | if err != nil { 127 | log.Fatal("Failed to initialize continuous profiling", zap.Error(err)) 128 | } 129 | defer conprof.Stop() 130 | 131 | service.Init(cfg, docDB) 132 | defer service.Stop() 133 | 134 | ctx, cancel := context.WithCancel(context.Background()) 135 | defer cancel() 136 | 137 | go config.ReloadRoutine(ctx, *configPath) 138 | sig := procutil.WaitForSigterm() 139 | log.Info("received signal", zap.String("sig", sig.String())) 140 | } 141 | 142 | func 
overrideConfig(config *config.Config) { 143 | pflag.Visit(func(f *pflag.Flag) { 144 | switch f.Name { 145 | case nmAddr: 146 | config.Address = *listenAddr 147 | case nmPdEndpoints: 148 | config.PD.Endpoints = *pdEndpoints 149 | case nmLogPath: 150 | config.Log.Path = *logPath 151 | case nmStoragePath: 152 | config.Storage.Path = *storagePath 153 | case nmAdvertiseAddress: 154 | config.AdvertiseAddress = *advertiseAddress 155 | case nmRetentionPeriod: 156 | config.TSDB.RetentionPeriod = *retentionPeriod 157 | } 158 | }) 159 | } 160 | 161 | func mustCreateDirs(config *config.Config) { 162 | if config.Log.Path != "" { 163 | if err := os.MkdirAll(config.Log.Path, os.ModePerm); err != nil { 164 | log.Fatal("failed to init log path", zap.Error(err)) 165 | } 166 | } 167 | 168 | if err := os.MkdirAll(config.Storage.Path, os.ModePerm); err != nil { 169 | log.Fatal("failed to init storage path", zap.Error(err)) 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /service/http/http.go: -------------------------------------------------------------------------------- 1 | package http 2 | 3 | import ( 4 | "net" 5 | "net/http" 6 | "os" 7 | "path" 8 | "time" 9 | 10 | "github.com/VictoriaMetrics/VictoriaMetrics/app/vminsert" 11 | "github.com/VictoriaMetrics/VictoriaMetrics/app/vmselect" 12 | "github.com/VictoriaMetrics/VictoriaMetrics/app/vmstorage" 13 | conprofhttp "github.com/pingcap/ng-monitoring/component/conprof/http" 14 | "github.com/pingcap/ng-monitoring/component/topsql" 15 | "github.com/pingcap/ng-monitoring/config" 16 | "github.com/pingcap/ng-monitoring/database/docdb" 17 | 18 | "github.com/gin-contrib/pprof" 19 | "github.com/gin-gonic/gin" 20 | "github.com/pingcap/log" 21 | "github.com/prometheus/client_golang/prometheus/promhttp" 22 | "go.uber.org/zap" 23 | ) 24 | 25 | var ( 26 | httpServer *http.Server = nil 27 | ) 28 | 29 | func ServeHTTP(l *config.Log, listener net.Listener, docDB docdb.DocDB) { 30 | gin.SetMode(gin.ReleaseMode) 31 | ng := gin.New() 32 | 33 | var logFile *os.File 34 | var err error 35 | if l.Path != "" { 36 | logFileName := path.Join(l.Path, "service.log") 37 | logFile, err = os.OpenFile(logFileName, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) 38 | if err != nil { 39 | log.Fatal("Failed to open the log file", zap.String("filename", logFileName)) 40 | } 41 | } else { 42 | logFile = os.Stdout 43 | } 44 | ng.Use(gin.LoggerWithWriter(logFile)) 45 | 46 | // recovery 47 | ng.Use(gin.Recovery()) 48 | 49 | ng.Handle(http.MethodGet, "/health", func(g *gin.Context) { 50 | g.JSON(http.StatusOK, Status{Health: true}) 51 | }) 52 | 53 | // route 54 | configGroup := ng.Group("/config") 55 | config.HTTPService(configGroup, docDB) 56 | topSQLGroup := ng.Group("/topsql") 57 | topsql.HTTPService(topSQLGroup) 58 | // register pprof http api 59 | pprof.Register(ng) 60 | 61 | continuousProfilingGroup := ng.Group("/continuous_profiling") 62 | conprofhttp.HTTPService(continuousProfilingGroup) 63 | 64 | promHandler := promhttp.Handler() 65 | promGroup := ng.Group("/metrics") 66 | promGroup.Any("", func(c *gin.Context) { 67 | promHandler.ServeHTTP(c.Writer, c.Request) 68 | }) 69 | // compatible with victoria-metrics handlers 70 | ng.NoRoute(func(c *gin.Context) { 71 | handlerNoRouter(c) 72 | }) 73 | httpServer = &http.Server{ 74 | Handler: ng, 75 | ReadHeaderTimeout: 5 * time.Second, 76 | } 77 | if err = httpServer.Serve(listener); err != nil && err != http.ErrServerClosed { 78 | log.Warn("failed to serve http service", zap.Error(err)) 79 | } 80 | } 
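// For reference, the routes registered by ServeHTTP above are, in summary
// (see the registrations in the function body and each component's
// HTTPService for the authoritative sub-routes):
//
//   GET  /health                    liveness probe, returns {"health":true}
//   GET  /config                    current effective configuration
//   POST /config                    modify a module's configuration (e.g. continuous_profiling)
//   ANY  /metrics                   Prometheus metrics of ng-monitoring itself
//        /topsql/...                Top SQL HTTP API
//        /continuous_profiling/...  continuous profiling HTTP API
//        /debug/pprof/...           Go pprof endpoints (gin-contrib/pprof default prefix)
//        anything else              falls through to the VictoriaMetrics handlers below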
81 | 82 | // Try Victoria-Metrics' handlers first. If not handled, then return a 404 error. 83 | func handlerNoRouter(c *gin.Context) { 84 | //reset to default 85 | c.Writer.WriteHeader(http.StatusOK) 86 | if vminsert.RequestHandler(c.Writer, c.Request) { 87 | return 88 | } 89 | 90 | if vmselect.RequestHandler(c.Writer, c.Request) { 91 | return 92 | } 93 | 94 | if vmstorage.RequestHandler(c.Writer, c.Request) { 95 | return 96 | } 97 | 98 | c.String(http.StatusNotFound, "404 page not found") 99 | } 100 | 101 | type Status struct { 102 | Health bool `json:"health"` 103 | } 104 | 105 | func StopHTTP() { 106 | if httpServer == nil { 107 | return 108 | } 109 | 110 | log.Info("shutting down http server") 111 | _ = httpServer.Close() 112 | log.Info("http server is down") 113 | } 114 | -------------------------------------------------------------------------------- /service/service.go: -------------------------------------------------------------------------------- 1 | package service 2 | 3 | import ( 4 | "net" 5 | 6 | "github.com/pingcap/ng-monitoring/config" 7 | "github.com/pingcap/ng-monitoring/database/docdb" 8 | "github.com/pingcap/ng-monitoring/service/http" 9 | "github.com/pingcap/ng-monitoring/utils" 10 | 11 | "github.com/pingcap/log" 12 | "go.uber.org/zap" 13 | ) 14 | 15 | func Init(cfg *config.Config, docDB docdb.DocDB) { 16 | listener, err := net.Listen("tcp", cfg.Address) 17 | if err != nil { 18 | log.Fatal("failed to listen", 19 | zap.String("address", cfg.Address), 20 | zap.Error(err), 21 | ) 22 | } 23 | 24 | go utils.GoWithRecovery(func() { 25 | http.ServeHTTP(&cfg.Log, listener, docDB) 26 | }, nil) 27 | 28 | log.Info( 29 | "starting http service", 30 | zap.String("address", cfg.Address), 31 | ) 32 | } 33 | 34 | func Stop() { 35 | http.StopHTTP() 36 | } 37 | -------------------------------------------------------------------------------- /tests/mock.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "net" 5 | "net/http" 6 | "sync" 7 | "time" 8 | 9 | rua "github.com/pingcap/kvproto/pkg/resource_usage_agent" 10 | "github.com/pingcap/tipb/go-tipb" 11 | "google.golang.org/grpc" 12 | ) 13 | 14 | var _ tipb.TopSQLPubSubServer = &MockTiDBServer{} 15 | 16 | type MockTiDBServer struct { 17 | listener net.Listener 18 | server *grpc.Server 19 | records []tipb.TopSQLRecord 20 | changed bool 21 | mu sync.Mutex 22 | } 23 | 24 | func NewMockTiDBServer() *MockTiDBServer { 25 | return &MockTiDBServer{} 26 | } 27 | 28 | func (s *MockTiDBServer) PushRecords(records []tipb.TopSQLRecord) { 29 | s.mu.Lock() 30 | defer s.mu.Unlock() 31 | s.records = append(s.records, records...) 
32 | s.changed = true 33 | } 34 | 35 | func (s *MockTiDBServer) Listen() (addr string, err error) { 36 | s.listener, err = net.Listen("tcp", "127.0.0.1:0") 37 | if err != nil { 38 | return 39 | } 40 | return s.listener.Addr().String(), nil 41 | } 42 | 43 | func (s *MockTiDBServer) Serve() error { 44 | s.server = grpc.NewServer() 45 | tipb.RegisterTopSQLPubSubServer(s.server, s) 46 | return s.server.Serve(s.listener) 47 | } 48 | 49 | func (s *MockTiDBServer) Stop() { 50 | s.server.Stop() 51 | } 52 | 53 | func (s *MockTiDBServer) Subscribe(req *tipb.TopSQLSubRequest, stream tipb.TopSQLPubSub_SubscribeServer) error { 54 | ticker := time.NewTicker(50 * time.Millisecond) 55 | defer ticker.Stop() 56 | for { 57 | select { 58 | case <-stream.Context().Done(): 59 | return nil 60 | case <-ticker.C: 61 | s.mu.Lock() 62 | records := s.records 63 | changed := s.changed 64 | s.records = nil 65 | s.mu.Unlock() 66 | if !changed { 67 | continue 68 | } 69 | for i := range records { 70 | record := &records[i] 71 | if err := stream.Send(&tipb.TopSQLSubResponse{ 72 | RespOneof: &tipb.TopSQLSubResponse_Record{ 73 | Record: record, 74 | }, 75 | }); err != nil { 76 | panic(err) 77 | } 78 | } 79 | } 80 | } 81 | } 82 | 83 | var _ rua.ResourceMeteringPubSubServer = &MockTiKVServer{} 84 | 85 | type MockTiKVServer struct { 86 | listener net.Listener 87 | server *grpc.Server 88 | records []*rua.ResourceUsageRecord 89 | changed bool 90 | mu sync.Mutex 91 | } 92 | 93 | func NewMockTiKVServer() *MockTiKVServer { 94 | return &MockTiKVServer{} 95 | } 96 | 97 | func (s *MockTiKVServer) PushRecords(records []*rua.ResourceUsageRecord) { 98 | s.mu.Lock() 99 | defer s.mu.Unlock() 100 | s.records = append(s.records, records...) 101 | s.changed = true 102 | } 103 | 104 | func (s *MockTiKVServer) Listen() (addr string, err error) { 105 | s.listener, err = net.Listen("tcp", "127.0.0.1:0") 106 | if err != nil { 107 | return 108 | } 109 | return s.listener.Addr().String(), nil 110 | } 111 | 112 | func (s *MockTiKVServer) Serve() error { 113 | s.server = grpc.NewServer() 114 | rua.RegisterResourceMeteringPubSubServer(s.server, s) 115 | return s.server.Serve(s.listener) 116 | } 117 | 118 | func (s *MockTiKVServer) Stop() { 119 | s.server.Stop() 120 | } 121 | 122 | func (s *MockTiKVServer) Subscribe(req *rua.ResourceMeteringRequest, stream rua.ResourceMeteringPubSub_SubscribeServer) error { 123 | ticker := time.NewTicker(50 * time.Millisecond) 124 | defer ticker.Stop() 125 | for { 126 | select { 127 | case <-stream.Context().Done(): 128 | return nil 129 | case <-ticker.C: 130 | s.mu.Lock() 131 | records := s.records 132 | changed := s.changed 133 | s.records = nil 134 | s.mu.Unlock() 135 | if !changed { 136 | continue 137 | } 138 | for _, record := range records { 139 | if err := stream.Send(record); err != nil { 140 | panic(err) 141 | } 142 | } 143 | } 144 | } 145 | } 146 | 147 | var _ http.ResponseWriter = &MockResponseWriter{} 148 | 149 | type MockResponseWriter struct { 150 | StatusCode int 151 | Body []byte 152 | header http.Header 153 | } 154 | 155 | func NewMockResponseWriter() *MockResponseWriter { 156 | return &MockResponseWriter{ 157 | header: map[string][]string{}, 158 | } 159 | } 160 | 161 | func (w *MockResponseWriter) Header() http.Header { 162 | return w.header 163 | } 164 | 165 | func (w *MockResponseWriter) Write(buf []byte) (int, error) { 166 | w.Body = buf 167 | return len(buf), nil 168 | } 169 | 170 | func (w *MockResponseWriter) WriteHeader(statusCode int) { 171 | w.StatusCode = statusCode 172 | } 173 | 
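// startMockTiDB is a hypothetical helper, sketched here for illustration only
// (it is not referenced by the tests in this repository). It shows how
// MockTiDBServer can be wired up: listen on a random local port, serve the
// TopSQL pub-sub endpoint in the background, push a placeholder (empty)
// record, and return the server's Stop method for cleanup.
func startMockTiDB() (addr string, stop func(), err error) {
	srv := NewMockTiDBServer()
	addr, err = srv.Listen()
	if err != nil {
		return "", nil, err
	}
	go func() {
		// Serve blocks until Stop is called.
		_ = srv.Serve()
	}()
	// Real tests would fill in the tipb.TopSQLRecord payload here.
	srv.PushRecords([]tipb.TopSQLRecord{{}})
	return addr, srv.Stop, nil
}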
-------------------------------------------------------------------------------- /utils/limiter.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | // RateLimit wraps a fix sized channel to control concurrency. 4 | type RateLimit struct { 5 | capacity int 6 | token chan struct{} 7 | } 8 | 9 | // NewRateLimit creates a limit controller with capacity n. 10 | func NewRateLimit(n int) *RateLimit { 11 | return &RateLimit{ 12 | capacity: n, 13 | token: make(chan struct{}, n), 14 | } 15 | } 16 | 17 | // GetToken acquires a token. 18 | func (r *RateLimit) GetToken(done <-chan struct{}) (exit bool) { 19 | select { 20 | case <-done: 21 | return true 22 | case r.token <- struct{}{}: 23 | return false 24 | } 25 | } 26 | 27 | // PutToken puts a token back. 28 | func (r *RateLimit) PutToken() { 29 | select { 30 | case <-r.token: 31 | default: 32 | panic("put a redundant token") 33 | } 34 | } 35 | 36 | // GetCapacity returns the token capacity. 37 | func (r *RateLimit) GetCapacity() int { 38 | return r.capacity 39 | } 40 | -------------------------------------------------------------------------------- /utils/misc.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "net" 5 | 6 | "github.com/pingcap/log" 7 | "go.uber.org/zap" 8 | ) 9 | 10 | // GoWithRecovery wraps goroutine startup call with force recovery. 11 | // it will dump current goroutine stack into log if catch any recover result. 12 | // 13 | // exec: execute logic function. 14 | // recoverFn: handler will be called after recover and before dump stack, passing `nil` means noop. 15 | func GoWithRecovery(exec func(), recoverFn func(r interface{})) { 16 | defer func() { 17 | r := recover() 18 | if recoverFn != nil { 19 | recoverFn(r) 20 | } 21 | if r != nil { 22 | log.Error("panic in the recoverable goroutine", 23 | zap.Reflect("r", r), 24 | zap.Stack("stack trace")) 25 | } 26 | }() 27 | exec() 28 | } 29 | 30 | // GetLocalIP will return a local IP(non-loopback, non 0.0.0.0), if there is one 31 | func GetLocalIP() string { 32 | addrs, err := net.InterfaceAddrs() 33 | if err == nil { 34 | for _, address := range addrs { 35 | ipnet, ok := address.(*net.IPNet) 36 | if ok && ipnet.IP.IsGlobalUnicast() { 37 | return ipnet.IP.String() 38 | } 39 | } 40 | } 41 | return "" 42 | } 43 | -------------------------------------------------------------------------------- /utils/pools.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "bytes" 5 | "net/http" 6 | "sync" 7 | ) 8 | 9 | type BytesBufferPool struct { 10 | p sync.Pool 11 | } 12 | 13 | func (bbp *BytesBufferPool) Get() *bytes.Buffer { 14 | bbv := bbp.p.Get() 15 | if bbv == nil { 16 | return &bytes.Buffer{} 17 | } 18 | return bbv.(*bytes.Buffer) 19 | } 20 | 21 | func (bbp *BytesBufferPool) Put(bb *bytes.Buffer) { 22 | bb.Reset() 23 | bbp.p.Put(bb) 24 | } 25 | 26 | type HeaderPool struct { 27 | p sync.Pool 28 | } 29 | 30 | func (hdp *HeaderPool) Get() http.Header { 31 | hdv := hdp.p.Get() 32 | if hdv == nil { 33 | return make(http.Header) 34 | } 35 | return hdv.(http.Header) 36 | } 37 | 38 | func (hdp *HeaderPool) Put(hdv http.Header) { 39 | for key := range hdv { 40 | delete(hdv, key) 41 | } 42 | hdp.p.Put(hdv) 43 | } 44 | -------------------------------------------------------------------------------- /utils/printer/fips.go: -------------------------------------------------------------------------------- 1 | 
//go:build boringcrypto 2 | // +build boringcrypto 3 | 4 | package printer 5 | 6 | import _ "crypto/tls/fipsonly" 7 | -------------------------------------------------------------------------------- /utils/printer/printer.go: -------------------------------------------------------------------------------- 1 | package printer 2 | 3 | import ( 4 | "fmt" 5 | "runtime" 6 | 7 | "github.com/pingcap/log" 8 | "go.uber.org/zap" 9 | 10 | _ "unsafe" // required by go:linkname 11 | ) 12 | 13 | // Version information. 14 | var ( 15 | NGMBuildTS = "None" 16 | NGMGitHash = "None" 17 | NGMGitBranch = "None" 18 | ) 19 | 20 | var buildVersion string 21 | 22 | func init() { 23 | buildVersion = runtime.Version() 24 | } 25 | 26 | // PrintNGMInfo prints the NGM version information. 27 | func PrintNGMInfo() { 28 | log.Info("Welcome to ng-monitoring.", 29 | zap.String("Git Commit Hash", NGMGitHash), 30 | zap.String("Git Branch", NGMGitBranch), 31 | zap.String("UTC Build Time", NGMBuildTS), 32 | zap.String("GoVersion", buildVersion)) 33 | } 34 | 35 | func GetNGMInfo() string { 36 | return fmt.Sprintf("Git Commit Hash: %s\n"+ 37 | "Git Branch: %s\n"+ 38 | "UTC Build Time: %s\n"+ 39 | "GoVersion: %s", 40 | NGMGitHash, 41 | NGMGitBranch, 42 | NGMBuildTS, 43 | buildVersion) 44 | } 45 | -------------------------------------------------------------------------------- /utils/resp_writer.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "bytes" 5 | "net/http" 6 | ) 7 | 8 | var _ http.ResponseWriter = &ResponseWriter{} 9 | 10 | type ResponseWriter struct { 11 | Body *bytes.Buffer 12 | Headers http.Header 13 | Code int 14 | } 15 | 16 | func NewRespWriter(body *bytes.Buffer, header http.Header) ResponseWriter { 17 | return ResponseWriter{ 18 | Body: body, 19 | Headers: header, 20 | Code: 200, 21 | } 22 | } 23 | 24 | func (r *ResponseWriter) Header() http.Header { 25 | return r.Headers 26 | } 27 | 28 | func (r *ResponseWriter) Write(b []byte) (int, error) { 29 | return r.Body.Write(b) 30 | } 31 | 32 | func (r *ResponseWriter) WriteHeader(statusCode int) { 33 | r.Code = statusCode 34 | } 35 | -------------------------------------------------------------------------------- /utils/retry.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "context" 5 | "time" 6 | ) 7 | 8 | // WithRetry provides a general retry logic. 9 | // 10 | // The given f will keep running, until: 11 | // - f returns true. It means work is done. 12 | // - retry times is greater than the given times 13 | // - ctx is done. 14 | // 15 | // Otherwise, this function will wait a time of the given duration 16 | // and continue to execute f. 17 | // 18 | // The argument provided for f is the retried times. 19 | func WithRetry(ctx context.Context, maxRetryTimes uint, duration time.Duration, f func(uint) bool) { 20 | for retried := uint(0); retried <= maxRetryTimes; retried++ { 21 | if done := f(retried); done { 22 | return 23 | } 24 | if retried < maxRetryTimes { 25 | select { 26 | case <-time.After(duration): 27 | case <-ctx.Done(): 28 | return 29 | } 30 | } 31 | } 32 | } 33 | 34 | // WithRetryBackoff provides a general retry logic. 35 | // 36 | // The given f will keep running, until: 37 | // - f returns true. It means work is done. 38 | // - retry times is greater than the given times 39 | // - ctx is done. 
40 | // 41 | // Otherwise, this function will wait a time of the given duration 42 | // in a backoff way, and continue to execute f. 43 | // 44 | // The argument provided for f is the retried times. 45 | func WithRetryBackoff(ctx context.Context, maxRetryTimes uint, firstDuration time.Duration, f func(uint) bool) { 46 | duration := firstDuration 47 | for retried := uint(0); retried <= maxRetryTimes; retried++ { 48 | if done := f(retried); done { 49 | return 50 | } 51 | if retried < maxRetryTimes { 52 | select { 53 | case <-time.After(duration): 54 | case <-ctx.Done(): 55 | return 56 | } 57 | duration *= 2 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /utils/retry_test.go: -------------------------------------------------------------------------------- 1 | package utils_test 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | "time" 7 | 8 | "github.com/pingcap/ng-monitoring/utils" 9 | 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | func TestWithRetry(t *testing.T) { 14 | t.Parallel() 15 | 16 | maxRetryTimes := uint(10) 17 | executed := uint(0) 18 | utils.WithRetry(context.Background(), maxRetryTimes, 1*time.Millisecond, func(u uint) bool { 19 | require.Equal(t, u, executed) 20 | executed += 1 21 | return false 22 | }) 23 | require.Equal(t, maxRetryTimes+1, executed) 24 | } 25 | 26 | func TestWithRetryCtxDone(t *testing.T) { 27 | t.Parallel() 28 | 29 | ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond) 30 | defer cancel() 31 | 32 | maxRetryTimes := uint(10) 33 | executed := uint(0) 34 | utils.WithRetry(ctx, maxRetryTimes, 1*time.Minute, func(u uint) bool { 35 | require.Equal(t, u, executed) 36 | executed += 1 37 | return false 38 | }) 39 | 40 | require.Equal(t, executed, uint(1)) 41 | } 42 | 43 | func TestWithRetryDoneMidway(t *testing.T) { 44 | t.Parallel() 45 | 46 | maxRetryTimes := uint(10) 47 | executed := uint(0) 48 | utils.WithRetry(context.Background(), maxRetryTimes, 1*time.Millisecond, func(u uint) bool { 49 | require.Equal(t, u, executed) 50 | executed += 1 51 | 52 | return u == 3 53 | }) 54 | require.Equal(t, executed, uint(4)) 55 | } 56 | 57 | func TestWithRetryBackoff(t *testing.T) { 58 | t.Parallel() 59 | 60 | now := time.Now() 61 | maxRetryTimes := uint(10) 62 | executed := uint(0) 63 | utils.WithRetryBackoff(context.Background(), maxRetryTimes, 1*time.Millisecond, func(u uint) bool { 64 | require.Equal(t, u, executed) 65 | executed += 1 66 | return false 67 | }) 68 | require.Equal(t, maxRetryTimes+1, executed) 69 | 70 | // wait about 1023 millis 71 | require.Greater(t, time.Since(now), 1*time.Second) 72 | } 73 | 74 | func TestWithRetryBackoffCtxDone(t *testing.T) { 75 | t.Parallel() 76 | 77 | ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond) 78 | defer cancel() 79 | 80 | maxRetryTimes := uint(10) 81 | executed := uint(0) 82 | utils.WithRetryBackoff(ctx, maxRetryTimes, 1*time.Minute, func(u uint) bool { 83 | require.Equal(t, u, executed) 84 | executed += 1 85 | return false 86 | }) 87 | 88 | require.Equal(t, executed, uint(1)) 89 | } 90 | 91 | func TestWithRetryBackoffDoneMidway(t *testing.T) { 92 | t.Parallel() 93 | 94 | maxRetryTimes := uint(10) 95 | executed := uint(0) 96 | utils.WithRetryBackoff(context.Background(), maxRetryTimes, 1*time.Millisecond, func(u uint) bool { 97 | require.Equal(t, u, executed) 98 | executed += 1 99 | 100 | return u == 3 101 | }) 102 | require.Equal(t, executed, uint(4)) 103 | } 104 | 
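// exampleBackoffUsage is a hypothetical, non-test sketch of how WithRetryBackoff
// is meant to be used: run doWork with at most 5 retries (so at most 6 attempts),
// sleeping 100ms, 200ms, 400ms, ... between attempts, and stop early once doWork
// succeeds or ctx is cancelled. Both the function name and doWork are placeholders.
func exampleBackoffUsage(ctx context.Context, doWork func() error) (err error) {
	utils.WithRetryBackoff(ctx, 5, 100*time.Millisecond, func(retried uint) bool {
		err = doWork()
		// Returning true stops the retry loop; returning false waits and tries again.
		return err == nil
	})
	return err
}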
--------------------------------------------------------------------------------