├── .gitignore ├── README.md ├── build.sh ├── checks └── check_kafka_connect.go ├── cloudwatch └── healthy_task_count.go ├── common ├── kafka_connect.go └── schema_registry.go └── prometheus └── metrics_exporter.go /.gitignore: -------------------------------------------------------------------------------- 1 | build/* 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Kafka Connect Monitoring Tools 2 | 3 | These tools utilize the `/connectors/$connector_name/status` endpoint of http://docs.confluent.io/current/connect/ to verify there are healthy tasks running. 4 | 5 | #### Installation 6 | 7 | Download the appropriate binary from the release page: https://github.com/jesseadams/kafka-connect-monitoring-tools/releases 8 | 9 | #### Usage 10 | 11 | ##### checks/check_kafka_connect 12 | 13 | This was designed to be a Nagios/Sensu like check. You can use exit codes as you would expect to determine success. 14 | 15 | ###### Parameters 16 | 17 | ```go 18 | Host string `arg:"required"` 19 | Connector string `arg:"required"` 20 | DontValidateSsl bool `arg:"--dont-validate-ssl"` 21 | TaskCount int 22 | Port int 23 | Insecure bool 24 | ProtocolString string 25 | ``` 26 | 27 | Example: 28 | 29 | `./check_kafka_connect --host foo.example.com --connector my-connector-name --taskcount 1 --dont-validate-ssl` 30 | 31 | ##### cloudwatch/healthy_task_count 32 | 33 | This is used to feed a HealthyTaskCount metric into AWS CloudWatch. By default, it will call a PutMetricData for Namespace: KafkaConnect, Metric: HealthyTaskCount, Unit: Count. The dimension defaults to the hostname of the server. You'll likely want to setup a cron job that runs every 5 minutes on the servers. 34 | 35 | ```go 36 | Host string `arg:"required"` 37 | Connector string `arg:"required"` 38 | DontValidateSsl bool `arg:"--dont-validate-ssl"` 39 | DimensionName string 40 | DimensionValue string 41 | Namespace string 42 | Port int 43 | Insecure bool 44 | ProtocolString string 45 | ``` 46 | 47 | Example: 48 | 49 | `./healthy_task_count --host foo.example.com --connector my-connector-name --dont-validate-ssl` 50 | 51 | ##### prometheus/metrics_exporter 52 | 53 | This is a lightweight HTTP service that polls the kafka connect and schema registry APIs and publishes an endpoint that provides Prometheus-friendly metrics. So far the following metrics are supported. 54 | 55 | * kafka_connect_connectorcount 56 | * kafka_connect_runningtaskscount (per connector) 57 | * schema_registry_subjectcount 58 | * schema_registry_versioncount 59 | 60 | Configuration is achieved through setting the following environment variables. 61 | 62 | * KAFKA_CONNECT_URL - Required, example: https://example.com 63 | * SCHEMA_REGISTRY_URL - Required, example: http://example.com 64 | * METRICS_REFRESH_RATE - Optional, in seconds, defaults to 10 65 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export GOARCH=amd64 4 | os_list="linux darwin windows" 5 | rm -f build/* 6 | 7 | cd checks 8 | go get 9 | for os in $os_list; do 10 | echo "Building check_kafka_connect for $os" 11 | GOOS=$os go build -o ../build/check_kafka_connect.${os}.amd64 12 | done 13 | cd .. 14 | 15 | cd cloudwatch 16 | go get 17 | for os in $os_list; do 18 | echo "Building healthy_task_count for $os" 19 | GOOS=$os go build -o ../build/healthy_task_count.${os}.amd64 20 | done 21 | cd .. 22 | 23 | cd prometheus 24 | go get 25 | for os in $os_list; do 26 | echo "Building metrics_exporter for $os" 27 | GOOS=$os go build -o ../build/metrics_exporter.${os}.amd64 28 | done 29 | cd .. 30 | 31 | echo "Done" 32 | -------------------------------------------------------------------------------- /checks/check_kafka_connect.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import( 4 | "fmt" 5 | "os" 6 | "strconv" 7 | "github.com/alexflint/go-arg" 8 | "github.com/jesseadams/kafka-connect-monitoring-tools/common" 9 | ) 10 | 11 | var args struct { 12 | Host string `arg:"required"` 13 | Connector string `arg:"required"` 14 | DontValidateSsl bool `arg:"--dont-validate-ssl"` 15 | TaskCount int 16 | Port int 17 | Insecure bool 18 | ProtocolString string 19 | } 20 | 21 | func main() { 22 | args.TaskCount = 1 23 | arg.MustParse(&args) 24 | 25 | if args.Port == 0 { 26 | if args.Insecure { 27 | args.Port = 80 28 | args.ProtocolString = "http" 29 | } else { 30 | args.Port = 443 31 | args.ProtocolString = "https" 32 | } 33 | } 34 | 35 | hostString := args.ProtocolString + "://" + args.Host + ":" + strconv.Itoa(args.Port) 36 | status := new(kafka_connect.KafkaConnectorStatus) 37 | err := kafka_connect.CheckStatus(hostString, args.Connector, status, args.DontValidateSsl) 38 | 39 | if err != nil { 40 | fmt.Println(err) 41 | os.Exit(1) 42 | } 43 | 44 | tasksCount := len(status.Tasks) 45 | if tasksCount != args.TaskCount { 46 | fmt.Printf("Task count is off! Wanted: %d, Actual: %d\n", args.TaskCount, tasksCount) 47 | os.Exit(1) 48 | } else { 49 | fmt.Println("Task count OK") 50 | } 51 | 52 | failure := false 53 | for _, task := range status.Tasks { 54 | fmt.Printf("Task ID %d is %s\n", task.Id, task.State) 55 | if task.State != "RUNNING" { 56 | failure = true 57 | } 58 | } 59 | 60 | if failure { 61 | fmt.Println("One more more tasks are not running!") 62 | os.Exit(1) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /cloudwatch/healthy_task_count.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import( 4 | "fmt" 5 | "os" 6 | "strconv" 7 | "time" 8 | "github.com/alexflint/go-arg" 9 | "github.com/jesseadams/kafka-connect-monitoring-tools/common" 10 | "github.com/aws/aws-sdk-go/aws" 11 | "github.com/aws/aws-sdk-go/aws/session" 12 | "github.com/aws/aws-sdk-go/service/cloudwatch" 13 | ) 14 | 15 | var args struct { 16 | Host string `arg:"required"` 17 | Connector string `arg:"required"` 18 | DontValidateSsl bool `arg:"--dont-validate-ssl"` 19 | DimensionName string 20 | DimensionValue string 21 | Namespace string 22 | Port int 23 | Insecure bool 24 | ProtocolString string 25 | } 26 | 27 | func main() { 28 | args.Namespace = "KafkaConnect" 29 | args.DimensionName = "Host" 30 | args.DimensionValue, _ = os.Hostname() 31 | 32 | arg.MustParse(&args) 33 | 34 | if args.Port == 0 { 35 | if args.Insecure { 36 | args.Port = 80 37 | args.ProtocolString = "http" 38 | } else { 39 | args.Port = 443 40 | args.ProtocolString = "https" 41 | } 42 | } 43 | 44 | hostString := args.ProtocolString + "://" + args.Host + ":" + strconv.Itoa(args.Port) 45 | status := new(kafka_connect.KafkaConnectorStatus) 46 | err := kafka_connect.CheckStatus(hostString, args.Connector, status, args.DontValidateSsl) 47 | 48 | if err != nil { 49 | fmt.Println(err) 50 | os.Exit(1) 51 | } 52 | 53 | tasksCount := len(status.Tasks) 54 | fmt.Println(tasksCount) 55 | 56 | sess := session.Must(session.NewSession()) 57 | svc := cloudwatch.New(sess) 58 | params := &cloudwatch.PutMetricDataInput{ 59 | MetricData: []*cloudwatch.MetricDatum{ 60 | { 61 | MetricName: aws.String("HealthyTaskCount"), 62 | Dimensions: []*cloudwatch.Dimension{ 63 | { 64 | Name: aws.String(args.DimensionName), 65 | Value: aws.String(args.DimensionValue), 66 | }, 67 | }, 68 | Timestamp: aws.Time(time.Now()), 69 | Unit: aws.String("Count"), 70 | Value: aws.Float64(float64(tasksCount)), 71 | }, 72 | }, 73 | Namespace: aws.String(args.Namespace), 74 | } 75 | _, err = svc.PutMetricData(params) 76 | 77 | if err != nil { 78 | fmt.Println(err.Error()) 79 | return 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /common/kafka_connect.go: -------------------------------------------------------------------------------- 1 | package kafka_connect 2 | 3 | import( 4 | "net/http" 5 | "encoding/json" 6 | "crypto/tls" 7 | "time" 8 | ) 9 | 10 | type KafkaConnector struct { 11 | State string 12 | WorkerId string 13 | } 14 | 15 | type KafkaConnectorTask struct { 16 | State string 17 | Id int 18 | WorkerId string 19 | } 20 | 21 | type KafkaConnectorStatus struct { 22 | Name string 23 | Connector KafkaConnector 24 | Tasks []KafkaConnectorTask 25 | } 26 | 27 | func CheckStatus(baseUrl string, connector string, target interface{}, validateSsl bool) error { 28 | endpoint := "/connectors/" + connector + "/status" 29 | transport := &http.Transport{ 30 | TLSClientConfig: &tls.Config{InsecureSkipVerify: validateSsl}, 31 | } 32 | var client = &http.Client{Transport: transport, Timeout: 10 * time.Second} 33 | response, err := client.Get(baseUrl + endpoint) 34 | 35 | if err != nil { 36 | return err 37 | } 38 | defer response.Body.Close() 39 | 40 | return json.NewDecoder(response.Body).Decode(target) 41 | } 42 | 43 | func ListConnectors(baseUrl string, validateSsl bool) ([]string, error) { 44 | endpoint := "/connectors" 45 | transport := &http.Transport{ 46 | TLSClientConfig: &tls.Config{InsecureSkipVerify: validateSsl}, 47 | } 48 | var client = &http.Client{Transport: transport, Timeout: 10 * time.Second} 49 | response, err := client.Get(baseUrl + endpoint) 50 | 51 | if err != nil { 52 | return nil, err 53 | } 54 | defer response.Body.Close() 55 | 56 | var connectors []string 57 | errors := json.NewDecoder(response.Body).Decode(&connectors) 58 | 59 | return connectors, errors 60 | } 61 | -------------------------------------------------------------------------------- /common/schema_registry.go: -------------------------------------------------------------------------------- 1 | package kafka_connect 2 | 3 | import( 4 | "net/http" 5 | "encoding/json" 6 | "crypto/tls" 7 | "time" 8 | ) 9 | 10 | func ListVersions(baseUrl string, subject string, validateSsl bool) ([]string, error) { 11 | endpoint := "/subjects/" + subject + "/versions" 12 | transport := &http.Transport{ 13 | TLSClientConfig: &tls.Config{InsecureSkipVerify: validateSsl}, 14 | } 15 | var client = &http.Client{Transport: transport, Timeout: 10 * time.Second} 16 | response, err := client.Get(baseUrl + endpoint) 17 | 18 | if err != nil { 19 | return nil, err 20 | } 21 | defer response.Body.Close() 22 | 23 | var versions []string 24 | errors := json.NewDecoder(response.Body).Decode(&versions) 25 | 26 | return versions, errors 27 | } 28 | 29 | func ListSubjects(baseUrl string, validateSsl bool) ([]string, error) { 30 | endpoint := "/subjects" 31 | transport := &http.Transport{ 32 | TLSClientConfig: &tls.Config{InsecureSkipVerify: validateSsl}, 33 | } 34 | var client = &http.Client{Transport: transport, Timeout: 10 * time.Second} 35 | response, err := client.Get(baseUrl + endpoint) 36 | 37 | if err != nil { 38 | return nil, err 39 | } 40 | defer response.Body.Close() 41 | 42 | var subjects []string 43 | errors := json.NewDecoder(response.Body).Decode(&subjects) 44 | 45 | return subjects, errors 46 | } 47 | -------------------------------------------------------------------------------- /prometheus/metrics_exporter.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "io" 5 | "net/http" 6 | "log" 7 | "os" 8 | "fmt" 9 | "time" 10 | "strconv" 11 | "github.com/jesseadams/kafka-connect-monitoring-tools/common" 12 | ) 13 | 14 | var ResponseString = "Initializing..." 15 | 16 | func RetrieveKafkaConnectMetrics(hostString string) string { 17 | var output string 18 | 19 | connectors, err := kafka_connect.ListConnectors(hostString, true) 20 | if err != nil { 21 | fmt.Println(err) 22 | } 23 | 24 | connectorCount := len(connectors) 25 | output += "# TYPE kafka_connect_connectorcount gauge\n" 26 | line := fmt.Sprintf("kafka_connect_connectorcount %.1f\n", float64(connectorCount)) 27 | output += line 28 | 29 | output += "# TYPE kafka_connect_runningtaskscount gauge\n" 30 | for _, connector := range connectors { 31 | status := new(kafka_connect.KafkaConnectorStatus) 32 | err = kafka_connect.CheckStatus(hostString, connector, status, true) 33 | 34 | if err != nil { 35 | fmt.Println(err) 36 | } 37 | 38 | runningTasksCount := 0.0 39 | for _, task := range status.Tasks { 40 | if task.State == "RUNNING" { 41 | runningTasksCount += 1.0 42 | } 43 | } 44 | line := fmt.Sprintf("kafka_connect_runningtaskscount{connector=\"%s\"} %.1f\n", connector, runningTasksCount) 45 | output += line 46 | } 47 | 48 | return output 49 | } 50 | 51 | func RetrieveSchemaRegistryMetrics(hostString string) string { 52 | var output string 53 | 54 | subjects, err := kafka_connect.ListSubjects(hostString, true) 55 | if err != nil { 56 | fmt.Println(err) 57 | } 58 | 59 | subjectCount := len(subjects) 60 | output += "# TYPE schema_registry_subjectcount gauge\n" 61 | line := fmt.Sprintf("schema_registry_subjectcount %.1f\n", float64(subjectCount)) 62 | output += line 63 | 64 | output += "# TYPE schema_registry_versioncount gauge\n" 65 | for _, subject := range subjects { 66 | versions, err := kafka_connect.ListVersions(hostString, subject, true) 67 | 68 | if err != nil { 69 | fmt.Println(err) 70 | } 71 | 72 | versionCount := len(versions) 73 | line := fmt.Sprintf("schema_registry_versioncount{subject=\"%s\"} %.1f\n", subject, float64(versionCount)) 74 | output += line 75 | } 76 | 77 | return output 78 | } 79 | 80 | func PublishPrometheusMetrics(writer http.ResponseWriter, req *http.Request) { 81 | io.WriteString(writer, ResponseString) 82 | } 83 | 84 | func RetrievePrometheusMetrics() { 85 | kafkaConnectHostString := os.Getenv("KAFKA_CONNECT_URL") 86 | schemaRegistryHostString := os.Getenv("SCHEMA_REGISTRY_URL") 87 | metricsRefreshRate, err := strconv.ParseInt(os.Getenv("METRICS_REFRESH_RATE"), 10, 32) 88 | 89 | if err != nil { 90 | fmt.Println("Unable to parse refresh interval from METRICS_REFRESH_RATE") 91 | fmt.Println(err) 92 | } 93 | 94 | if metricsRefreshRate < 10 { 95 | metricsRefreshRate = 60 96 | } 97 | 98 | fmt.Printf("Metrics Refresh Rate: %d seconds\n", metricsRefreshRate) 99 | for true { 100 | fmt.Println("Refreshing metrics...") 101 | kafkaConnectOutput := RetrieveKafkaConnectMetrics(kafkaConnectHostString) 102 | schemaRegistryOutput := RetrieveSchemaRegistryMetrics(schemaRegistryHostString) 103 | ResponseString = kafkaConnectOutput + schemaRegistryOutput 104 | fmt.Println("Metrics refresh complete!") 105 | 106 | time.Sleep(time.Duration(metricsRefreshRate) * time.Second) 107 | } 108 | } 109 | 110 | func main() { 111 | go RetrievePrometheusMetrics() 112 | http.HandleFunc("/metrics", PublishPrometheusMetrics) 113 | log.Fatal(http.ListenAndServe(":7071", nil)) 114 | } 115 | --------------------------------------------------------------------------------