├── runtime_metrics ├── fds_darwin.go ├── fds_linux.go ├── runtime_metrics_test.go └── runtime_metrics.go ├── identifier.go ├── TODO ├── error.go ├── healthd ├── debouncer.go ├── mock_time.go ├── debouncer_test.go ├── poll.go ├── poll_test.go ├── healthd_test.go ├── api.go └── healthd.go ├── stack ├── stack.go ├── frame.go └── stack_test.go ├── LICENSE ├── json_polling_sink_test.go ├── sinks ├── bugsnag │ ├── sink_test.go │ ├── api_test.go │ ├── sink.go │ └── api.go └── librato │ ├── sink_test.go │ └── sink.go ├── json_polling_sink_http.go ├── json_polling_sink_http_test.go ├── interval_aggregation_clone.go ├── cmd ├── healthtop │ ├── main.go │ ├── hosts.go │ └── jobs.go └── healthd │ └── main.go ├── json_writer_sink.go ├── interval_aggregation_merge.go ├── json_polling_sink.go ├── error_test.go ├── interval_aggregation_merge_test.go ├── health_test.go ├── writer_sink.go ├── json_writer_sink_test.go ├── interval_aggregation_clone_test.go ├── interval_aggregation.go ├── aggregator.go ├── health.go ├── writer_sink_test.go ├── aggregator_test.go ├── statsd_sink.go ├── statsd_sink_test.go └── README.md /runtime_metrics/fds_darwin.go: -------------------------------------------------------------------------------- 1 | package runtime_metrics 2 | 3 | func getFDUsage() (uint64, error) { 4 | return 0, nil 5 | } 6 | -------------------------------------------------------------------------------- /runtime_metrics/fds_linux.go: -------------------------------------------------------------------------------- 1 | package runtime_metrics 2 | 3 | import ( 4 | "io/ioutil" 5 | ) 6 | 7 | func getFDUsage() (uint64, error) { 8 | fds, err := ioutil.ReadDir("/proc/self/fd") 9 | if err != nil { 10 | return 0, err 11 | } 12 | return uint64(len(fds)), nil 13 | } 14 | -------------------------------------------------------------------------------- /identifier.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | ) 7 | 8 | var Identifier = getIdentifier() 9 | 10 | func getIdentifier() string { 11 | pid := os.Getpid() 12 | host, err := os.Hostname() 13 | if err != nil { 14 | host = "hostname_errored" 15 | } 16 | 17 | return fmt.Sprintf("%s.%d", host, pid) 18 | } 19 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | - make sure bugsnag works w/o Hostname and ReleaseStage (eg defaults work) 2 | - make sure healthtop default sort works or at least figure out what it is 3 | - in readme make sure I have the right syntax for http.Handle. 4 | - in readme make sure I get samples of how everything is logged. 
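  (for the http.Handle / logging-sample items above, a minimal sketch assuming the polling sink in this repo; double-check against the real API before it goes in the readme:
      stream := health.NewStream()
      sink := health.NewJsonPollingSink(time.Minute, time.Minute*5)
      stream.AddSink(sink)
      sink.StartServer(":5020")   // serves GET /health as JSON
      // or mount it on an existing mux: http.Handle("/health", sink)
  )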
5 | - screenshots for healthtop 6 | - remove self-logging in healthd -------------------------------------------------------------------------------- /error.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "github.com/gocraft/health/stack" 5 | ) 6 | 7 | type MutedError struct { 8 | Err error 9 | } 10 | 11 | type UnmutedError struct { 12 | Err error 13 | Stack *stack.Trace 14 | Emitted bool 15 | } 16 | 17 | func (e *MutedError) Error() string { 18 | return e.Err.Error() 19 | } 20 | 21 | func (e *UnmutedError) Error() string { 22 | return e.Err.Error() 23 | } 24 | 25 | func Mute(err error) *MutedError { 26 | return &MutedError{Err: err} 27 | } 28 | 29 | func wrapErr(err error) error { 30 | switch err := err.(type) { 31 | case *MutedError, *UnmutedError: 32 | return err 33 | default: 34 | return &UnmutedError{Err: err, Stack: stack.NewTrace(2)} 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /healthd/debouncer.go: -------------------------------------------------------------------------------- 1 | package healthd 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | // A) don't fire more than every 2 seconds B) the time between an input and output should be at most 2 seconds 8 | func debouncer(doit chan<- struct{}, needitdone <-chan struct{}, threshold time.Duration, sleepTime time.Duration) { 9 | var oldestNeedItDone time.Time 10 | 11 | for { 12 | select { 13 | case <-needitdone: 14 | if oldestNeedItDone.IsZero() { 15 | oldestNeedItDone = now() 16 | } 17 | default: 18 | // This sleep time is the max error that we'll be off by. 19 | time.Sleep(sleepTime) 20 | } 21 | 22 | if !oldestNeedItDone.IsZero() && (now().Sub(oldestNeedItDone) > threshold) { 23 | doit <- struct{}{} 24 | oldestNeedItDone = time.Time{} // Zero the object 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /healthd/mock_time.go: -------------------------------------------------------------------------------- 1 | package healthd 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | ) 7 | 8 | var nowMock time.Time 9 | var nowMut sync.RWMutex 10 | 11 | func now() time.Time { 12 | nowMut.RLock() 13 | defer nowMut.RUnlock() 14 | if nowMock.IsZero() { 15 | return time.Now() 16 | } 17 | return nowMock 18 | } 19 | 20 | func setNowMock(t string) { 21 | var err error 22 | nowMut.Lock() 23 | defer nowMut.Unlock() 24 | nowMock, err = time.Parse(time.RFC3339, t) 25 | if err != nil { 26 | panic(err) 27 | } 28 | } 29 | 30 | func advanceNowMock(dur time.Duration) { 31 | nowMut.Lock() 32 | defer nowMut.Unlock() 33 | if nowMock.IsZero() { 34 | panic("nowMock is not set") 35 | } 36 | nowMock = nowMock.Add(dur) 37 | } 38 | 39 | func resetNowMock() { 40 | nowMut.Lock() 41 | defer nowMut.Unlock() 42 | nowMock = time.Time{} 43 | } 44 | -------------------------------------------------------------------------------- /stack/stack.go: -------------------------------------------------------------------------------- 1 | package stack 2 | 3 | import ( 4 | "bytes" 5 | "runtime" 6 | ) 7 | 8 | // MaxStackDepth is the maximum number of stackframes on any error. 
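// Traces deeper than MaxStackDepth are silently truncated, since NewTrace only hands runtime.Callers a slice of this length.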
9 | var MaxStackDepth = 50 10 | 11 | type Trace struct { 12 | stack []uintptr 13 | frames []Frame 14 | } 15 | 16 | func NewTrace(skip int) *Trace { 17 | stack := make([]uintptr, MaxStackDepth) 18 | length := runtime.Callers(2+skip, stack) 19 | return &Trace{ 20 | stack: stack[:length], 21 | } 22 | } 23 | 24 | // StackFrames returns an array of frames containing information about the stack. 25 | func (t *Trace) Frames() []Frame { 26 | if t.frames == nil { 27 | t.frames = make([]Frame, len(t.stack)) 28 | 29 | for i, pc := range t.stack { 30 | t.frames[i] = NewFrame(pc) 31 | } 32 | } 33 | 34 | return t.frames 35 | } 36 | 37 | // Stack returns a formatted callstack. 38 | func (t *Trace) Stack() []byte { 39 | buf := bytes.Buffer{} 40 | 41 | for _, frame := range t.Frames() { 42 | buf.WriteString(frame.String()) 43 | buf.WriteRune('\n') 44 | } 45 | 46 | return buf.Bytes() 47 | } 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Jonathan Novak 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /json_polling_sink_test.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | "time" 7 | 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func TestJsonPollingSink(t *testing.T) { 12 | setNowMock("2011-09-09T23:36:13Z") 13 | defer resetNowMock() 14 | 15 | sink := NewJsonPollingSink(time.Minute, time.Minute*5) 16 | 17 | sink.EmitEvent("myjob", "myevent", nil) 18 | sink.EmitEventErr("myjob", "myevent", errors.New("myerr"), nil) 19 | sink.EmitTiming("myjob", "myevent", 100, nil) 20 | sink.EmitGauge("myjob", "myevent", 3.14, nil) 21 | sink.EmitComplete("myjob", Success, 9, nil) 22 | 23 | time.Sleep(10 * time.Millisecond) // we need to make sure we process the above metrics before we get the metrics. 
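// (The Emit* calls above only enqueue commands on the sink's buffered cmdChan; the aggregator goroutine applies them asynchronously, so without this pause GetMetrics could run before the emits are folded in.)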
24 | intervals := sink.GetMetrics() 25 | 26 | sink.ShutdownServer() 27 | 28 | assert.Equal(t, 1, len(intervals)) 29 | 30 | intAgg := intervals[0] 31 | assert.EqualValues(t, 1, intAgg.Events["myevent"]) 32 | assert.EqualValues(t, 3.14, intAgg.Gauges["myevent"]) 33 | assert.EqualValues(t, 1, intAgg.EventErrs["myevent"].Count) 34 | assert.EqualValues(t, 1, intAgg.Timers["myevent"].Count) 35 | assert.EqualValues(t, 1, intAgg.Jobs["myjob"].Count) 36 | } 37 | -------------------------------------------------------------------------------- /healthd/debouncer_test.go: -------------------------------------------------------------------------------- 1 | package healthd 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestDebouncer(t *testing.T) { 9 | doit := make(chan struct{}) 10 | needitdone := make(chan struct{}) 11 | 12 | setNowMock("2011-09-09T23:36:13Z") 13 | defer resetNowMock() 14 | 15 | go debouncer(doit, needitdone, time.Second*2, time.Millisecond) 16 | 17 | needitdone <- struct{}{} 18 | needitdone <- struct{}{} 19 | 20 | time.Sleep(time.Millisecond * 2) 21 | 22 | select { 23 | case <-doit: 24 | t.Error("Did it too soon") 25 | default: 26 | // cool 27 | } 28 | 29 | advanceNowMock(time.Second * 1) 30 | time.Sleep(time.Millisecond * 2) // Need the goroutine to wake up 31 | 32 | select { 33 | case <-doit: 34 | t.Error("Did it too soon") 35 | default: 36 | // cool 37 | } 38 | 39 | advanceNowMock(time.Second * 2) 40 | time.Sleep(time.Millisecond * 2) // Need the goroutine to wake up 41 | 42 | select { 43 | case <-doit: 44 | // cool 45 | default: 46 | t.Error("never did it") 47 | } 48 | 49 | time.Sleep(time.Millisecond * 2) // Need the goroutine to wake up 50 | 51 | select { 52 | case <-doit: 53 | t.Error("should only do it once") 54 | default: 55 | // cool 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /sinks/bugsnag/sink_test.go: -------------------------------------------------------------------------------- 1 | package bugsnag 2 | 3 | import ( 4 | "fmt" 5 | "github.com/gocraft/health" 6 | "github.com/gocraft/health/stack" 7 | "github.com/stretchr/testify/assert" 8 | "net/http" 9 | "testing" 10 | "time" 11 | ) 12 | 13 | func TestSink(t *testing.T) { 14 | config := &Config{ 15 | APIKey: "abcd", 16 | Endpoint: "http://localhost:5052/", 17 | ReleaseStage: "staging", 18 | AppVersion: "1.0", 19 | Hostname: "", 20 | } 21 | 22 | s := NewSink(config) 23 | defer s.ShutdownServer() 24 | 25 | n := notifyHandler{ 26 | PayloadChan: make(chan *payload, 2), 27 | } 28 | 29 | go http.ListenAndServe(":5052", n) 30 | 31 | err := &health.UnmutedError{Err: fmt.Errorf("err str"), Stack: stack.NewTrace(2)} 32 | s.EmitEventErr("thejob", "theevent", err, nil) 33 | 34 | p := <-n.PayloadChan 35 | evt := p.Events[0] 36 | assert.Equal(t, evt.Context, "thejob") 37 | 38 | ex := evt.Exceptions[0] 39 | assert.Equal(t, ex.ErrorClass, "theevent") 40 | assert.Equal(t, ex.Message, "err str") 41 | 42 | err.Emitted = true 43 | s.EmitEventErr("thejob", "theevent2", err, nil) 44 | 45 | time.Sleep(1 * time.Millisecond) 46 | 47 | select { 48 | case <-n.PayloadChan: 49 | t.Errorf("did not expect payload") 50 | default: 51 | // yay 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /json_polling_sink_http.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "net/http" 7 | "time" 8 | ) 9 | 10 | type HealthAggregationsResponse 
struct { 11 | InstanceId string `json:"instance_id"` 12 | IntervalDuration time.Duration `json:"interval_duration"` 13 | IntervalAggregations []*IntervalAggregation `json:"aggregations"` 14 | } 15 | 16 | func (s *JsonPollingSink) StartServer(addr string) { 17 | go http.ListenAndServe(addr, s) 18 | } 19 | 20 | func (s *JsonPollingSink) ServeHTTP(rw http.ResponseWriter, r *http.Request) { 21 | rw.Header().Set("Content-Type", "application/json; charset=utf-8") 22 | if r.URL.Path == "/health" { 23 | metrics := s.GetMetrics() 24 | response := &HealthAggregationsResponse{ 25 | InstanceId: Identifier, 26 | IntervalDuration: s.intervalDuration, 27 | IntervalAggregations: metrics, 28 | } 29 | jsonData, err := json.MarshalIndent(response, "", "\t") 30 | if err != nil { 31 | renderError(rw, err) 32 | return 33 | } 34 | fmt.Fprintf(rw, string(jsonData)) 35 | } else { 36 | renderNotFound(rw) 37 | } 38 | } 39 | 40 | func renderNotFound(rw http.ResponseWriter) { 41 | rw.WriteHeader(404) 42 | fmt.Fprintf(rw, `{"error": "not_found"}`) 43 | } 44 | 45 | func renderError(rw http.ResponseWriter, err error) { 46 | rw.WriteHeader(500) 47 | fmt.Fprintf(rw, `{"error": "%s"}`, err.Error()) 48 | } 49 | -------------------------------------------------------------------------------- /json_polling_sink_http_test.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "github.com/stretchr/testify/assert" 7 | "net/http" 8 | "net/http/httptest" 9 | "testing" 10 | "time" 11 | ) 12 | 13 | func TestJsonPollingSinkServerSuccess(t *testing.T) { 14 | sink := NewJsonPollingSink(time.Minute, time.Minute*5) 15 | defer sink.ShutdownServer() 16 | 17 | sink.EmitEvent("myjob", "myevent", nil) 18 | sink.EmitEventErr("myjob", "myevent", fmt.Errorf("myerr"), nil) 19 | sink.EmitTiming("myjob", "myevent", 100, nil) 20 | sink.EmitGauge("myjob", "myevent", 3.14, nil) 21 | sink.EmitComplete("myjob", Success, 9, nil) 22 | 23 | time.Sleep(10 * time.Millisecond) 24 | 25 | recorder := httptest.NewRecorder() 26 | request, _ := http.NewRequest("GET", "/health", nil) 27 | 28 | sink.ServeHTTP(recorder, request) 29 | 30 | assert.Equal(t, 200, recorder.Code) 31 | 32 | var resp HealthAggregationsResponse 33 | err := json.Unmarshal(recorder.Body.Bytes(), &resp) 34 | assert.NoError(t, err) 35 | assert.Equal(t, 1, len(resp.IntervalAggregations)) 36 | assert.Equal(t, map[string]int64{"myevent": 1}, resp.IntervalAggregations[0].Events) 37 | } 38 | 39 | func TestJsonPollingSinkServerNotFound(t *testing.T) { 40 | sink := NewJsonPollingSink(time.Minute, time.Minute*5) 41 | defer sink.ShutdownServer() 42 | 43 | recorder := httptest.NewRecorder() 44 | request, _ := http.NewRequest("GET", "/wat", nil) 45 | sink.ServeHTTP(recorder, request) 46 | assert.Equal(t, 404, recorder.Code) 47 | } 48 | -------------------------------------------------------------------------------- /runtime_metrics/runtime_metrics_test.go: -------------------------------------------------------------------------------- 1 | package runtime_metrics 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | type testReceiver struct { 8 | gauges map[string]float64 9 | } 10 | 11 | func TestRuntimeMetrics(t *testing.T) { 12 | tr := &testReceiver{ 13 | gauges: make(map[string]float64), 14 | } 15 | m := NewRuntimeMetrics(tr, nil) 16 | m.Start() 17 | defer m.Stop() 18 | m.Report() 19 | 20 | expectedKeys := []string{"heap_objects", "alloc", "num_gc", "next_gc", "gc_cpu_fraction", "pause_total_ns", 
"gc_pause_quantile_50", "gc_pause_quantile_max", "num_cgo_call", "num_goroutines", "num_fds_used"} 21 | 22 | for _, k := range expectedKeys { 23 | if _, ok := tr.gauges[k]; !ok { 24 | t.Errorf("expected to have key %s but didn't. map=%v", k, tr.gauges) 25 | } 26 | } 27 | } 28 | 29 | func (t *testReceiver) Event(eventName string) { 30 | 31 | } 32 | 33 | func (t *testReceiver) EventKv(eventName string, kvs map[string]string) { 34 | 35 | } 36 | 37 | func (t *testReceiver) EventErr(eventName string, err error) error { 38 | return nil 39 | } 40 | 41 | func (t *testReceiver) EventErrKv(eventName string, err error, kvs map[string]string) error { 42 | return nil 43 | } 44 | 45 | func (t *testReceiver) Timing(eventName string, nanoseconds int64) { 46 | 47 | } 48 | 49 | func (t *testReceiver) TimingKv(eventName string, nanoseconds int64, kvs map[string]string) { 50 | 51 | } 52 | 53 | func (t *testReceiver) Gauge(eventName string, value float64) { 54 | t.gauges[eventName] = value 55 | } 56 | 57 | func (t *testReceiver) GaugeKv(eventName string, value float64, kvs map[string]string) { 58 | 59 | } 60 | -------------------------------------------------------------------------------- /interval_aggregation_clone.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | // Clone does a deep clone of ia, duplicating all maps and whatnot. 4 | func (ia *IntervalAggregation) Clone() *IntervalAggregation { 5 | dup := &IntervalAggregation{} 6 | dup.IntervalStart = ia.IntervalStart 7 | dup.SerialNumber = ia.SerialNumber 8 | dup.aggregationMaps = *ia.aggregationMaps.Clone() 9 | 10 | dup.Jobs = make(map[string]*JobAggregation) 11 | for k, v := range ia.Jobs { 12 | dup.Jobs[k] = v.Clone() 13 | } 14 | 15 | return dup 16 | } 17 | 18 | func (am *aggregationMaps) Clone() *aggregationMaps { 19 | dup := &aggregationMaps{} 20 | 21 | dup.initAggregationMaps() 22 | 23 | for k, v := range am.Events { 24 | dup.Events[k] = v 25 | } 26 | 27 | for k, v := range am.Gauges { 28 | dup.Gauges[k] = v 29 | } 30 | 31 | for k, v := range am.Timers { 32 | dup.Timers[k] = v.Clone() 33 | } 34 | 35 | for k, v := range am.EventErrs { 36 | dup.EventErrs[k] = v.Clone() 37 | } 38 | 39 | return dup 40 | } 41 | 42 | func (ta *TimerAggregation) Clone() *TimerAggregation { 43 | var dup = *ta 44 | return &dup 45 | } 46 | 47 | func (ec *ErrorCounter) Clone() *ErrorCounter { 48 | var dup = *ec 49 | return &dup 50 | } 51 | 52 | func (ja *JobAggregation) Clone() *JobAggregation { 53 | dup := &JobAggregation{ 54 | CountSuccess: ja.CountSuccess, 55 | CountValidationError: ja.CountValidationError, 56 | CountPanic: ja.CountPanic, 57 | CountError: ja.CountError, 58 | CountJunk: ja.CountJunk, 59 | } 60 | 61 | dup.aggregationMaps = *ja.aggregationMaps.Clone() 62 | dup.TimerAggregation = ja.TimerAggregation 63 | 64 | return dup 65 | } 66 | -------------------------------------------------------------------------------- /healthd/poll.go: -------------------------------------------------------------------------------- 1 | package healthd 2 | 3 | import ( 4 | "encoding/json" 5 | "github.com/gocraft/health" 6 | "io/ioutil" 7 | "net/http" 8 | "time" 9 | ) 10 | 11 | type pollResponse struct { 12 | HostPort string 13 | Timestamp time.Time 14 | 15 | Err error 16 | Code int 17 | Nanos int64 18 | 19 | health.HealthAggregationsResponse 20 | } 21 | 22 | // poll checks a server 23 | func poll(stream *health.Stream, hostPort string, responses chan<- *pollResponse) { 24 | job := stream.NewJob("poll") 25 | 26 | var body 
[]byte 27 | var err error 28 | 29 | response := &pollResponse{ 30 | HostPort: hostPort, 31 | Timestamp: now(), 32 | } 33 | 34 | start := time.Now() 35 | 36 | client := &http.Client{ 37 | Timeout: 5 * time.Second, 38 | } 39 | 40 | resp, err := client.Get(metricsUrl(hostPort)) 41 | if err != nil { 42 | response.Err = job.EventErr("poll.client.get", err) 43 | goto POLL_FINISH 44 | } 45 | defer resp.Body.Close() 46 | body, err = ioutil.ReadAll(resp.Body) 47 | 48 | response.Nanos = time.Since(start).Nanoseconds() // don't mock b/c we need duration 49 | response.Code = resp.StatusCode 50 | 51 | if err != nil { // ioutil.ReadAll. We're checking here b/c we still want to capture nanos/code 52 | response.Err = job.EventErr("poll.ioutil.read_all", err) 53 | goto POLL_FINISH 54 | } 55 | 56 | if err := json.Unmarshal(body, &response.HealthAggregationsResponse); err != nil { 57 | response.Err = job.EventErr("poll.json.unmarshall", err) 58 | goto POLL_FINISH 59 | } 60 | 61 | POLL_FINISH: 62 | 63 | if response.Err != nil { 64 | job.CompleteKv(health.Error, health.Kvs{"host_port": hostPort}) 65 | } else { 66 | job.CompleteKv(health.Success, health.Kvs{"host_port": hostPort}) 67 | } 68 | 69 | responses <- response 70 | } 71 | 72 | func metricsUrl(hostPort string) string { 73 | return "http://" + hostPort + "/health" 74 | } 75 | -------------------------------------------------------------------------------- /cmd/healthtop/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "github.com/spf13/cobra" 6 | "time" 7 | ) 8 | 9 | // v2: 10 | // jobs/min vs jobs total vs jobs/sec (eg normalization) 11 | // - errors ?????? 12 | // - tests 13 | // - switch views/sorts while inside healthd 14 | 15 | type healthdStatus struct { 16 | lastSuccessAt time.Time 17 | lastErrorAt time.Time 18 | lastError error 19 | } 20 | 21 | func (s *healthdStatus) FmtNow() string { 22 | return time.Now().Format(time.RFC1123) 23 | } 24 | 25 | func (s *healthdStatus) FmtStatus() string { 26 | if s.lastErrorAt.IsZero() && s.lastSuccessAt.IsZero() { 27 | return "[starting...]" 28 | } else if s.lastErrorAt.After(s.lastSuccessAt) { 29 | return fmt.Sprint("[error: '", s.lastError.Error(), "' LastErrorAt: ", s.lastErrorAt.Format(time.RFC1123), "]") 30 | } else { 31 | return "[success]" 32 | } 33 | } 34 | 35 | var sourceHostPort string 36 | 37 | func main() { 38 | var cmdRoot = &cobra.Command{ 39 | Use: "healthtop [command]", 40 | } 41 | cmdRoot.PersistentFlags().StringVar(&sourceHostPort, "source", "localhost:5032", "source is the host:port of the healthd to query. 
ex: localhost:5031") 42 | 43 | var sort string 44 | var name string 45 | 46 | var cmdJobs = &cobra.Command{ 47 | Use: "jobs", 48 | Short: "list jobs", 49 | Run: func(cmd *cobra.Command, args []string) { 50 | jobsLoop(&jobOptions{Name: name, Sort: sort}) 51 | }, 52 | } 53 | 54 | cmdJobs.Flags().StringVar(&sort, "sort", "name", "sort ∈ {name, count, count_success, count_XXX, min, max, avg}") 55 | cmdJobs.Flags().StringVar(&name, "name", "", "name is a partial match on the name") 56 | 57 | var cmdHosts = &cobra.Command{ 58 | Use: "hosts", 59 | Short: "list hosts", 60 | Run: func(cmd *cobra.Command, args []string) { 61 | hostsLoop() 62 | }, 63 | } 64 | 65 | cmdRoot.AddCommand(cmdJobs) 66 | cmdRoot.AddCommand(cmdHosts) 67 | cmdRoot.Execute() 68 | } 69 | -------------------------------------------------------------------------------- /healthd/poll_test.go: -------------------------------------------------------------------------------- 1 | package healthd 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "net/http" 7 | "testing" 8 | "time" 9 | 10 | "github.com/braintree/manners" 11 | "github.com/gocraft/health" 12 | "github.com/stretchr/testify/assert" 13 | ) 14 | 15 | func TestPoll(t *testing.T) { 16 | setNowMock("2011-09-09T23:36:13Z") 17 | defer resetNowMock() 18 | 19 | intAgg := health.NewIntervalAggregation(now()) 20 | data := &health.HealthAggregationsResponse{ 21 | InstanceId: "web22.12345", 22 | IntervalDuration: time.Minute, 23 | IntervalAggregations: []*health.IntervalAggregation{intAgg}, 24 | } 25 | stop := serveJson(":5050", data) 26 | defer func() { 27 | stop() 28 | }() 29 | 30 | responses := make(chan *pollResponse, 2) 31 | poll(health.NewStream(), ":5050", responses) 32 | response := <-responses 33 | 34 | assert.NotNil(t, response) 35 | assert.Equal(t, response.HostPort, ":5050") 36 | assert.Equal(t, response.Timestamp, now()) 37 | assert.Nil(t, response.Err) 38 | assert.Equal(t, response.Code, 200) 39 | assert.True(t, response.Nanos > 0 && response.Nanos < int64(time.Second)) 40 | assert.Equal(t, response.InstanceId, "web22.12345") 41 | // we'll just "trust" that the other stuff gets unmarshalled correctly. We didn't really put anything in there anyway in this test. 42 | } 43 | 44 | // serveJson will start a server on the hostPort and serve any path the Jsonified data. 45 | // Each successive HTTP request will return the next data. 46 | // If there is only one data, it will be returned on each request. 
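// The returned func() bool is manners.Close, so callers can stop the test server when they are done (the test above defers it).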
47 | func serveJson(hostPort string, data ...interface{}) func() bool { 48 | var curData = 0 49 | 50 | var f http.HandlerFunc 51 | f = func(rw http.ResponseWriter, r *http.Request) { 52 | d := data[curData] 53 | curData = (curData + 1) % len(data) 54 | jsonData, err := json.MarshalIndent(d, "", "\t") 55 | if err != nil { 56 | panic(err) 57 | } 58 | fmt.Fprintf(rw, string(jsonData)) 59 | } 60 | 61 | go manners.ListenAndServe(hostPort, f) 62 | time.Sleep(10 * time.Millisecond) 63 | 64 | return manners.Close 65 | } 66 | -------------------------------------------------------------------------------- /sinks/bugsnag/api_test.go: -------------------------------------------------------------------------------- 1 | package bugsnag 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "github.com/gocraft/health/stack" 7 | "github.com/stretchr/testify/assert" 8 | "io/ioutil" 9 | "net/http" 10 | "strings" 11 | "testing" 12 | "time" 13 | ) 14 | 15 | func TestNotify(t *testing.T) { 16 | config := &Config{ 17 | APIKey: "abcd", 18 | Endpoint: "http://localhost:5051/", 19 | ReleaseStage: "staging", 20 | AppVersion: "1.0", 21 | Hostname: "", 22 | } 23 | 24 | n := notifyHandler{ 25 | PayloadChan: make(chan *payload, 1), 26 | } 27 | 28 | go http.ListenAndServe(":5051", n) 29 | time.Sleep(10 * time.Millisecond) 30 | 31 | err := Notify(config, "users/get", "foo.bar", fmt.Errorf("imanerror"), stack.NewTrace(0), make(map[string]string)) 32 | if err != nil { 33 | t.Fatalf("expected no error, got %v", err) 34 | } 35 | 36 | p := <-n.PayloadChan 37 | 38 | assert.NotNil(t, p) 39 | assert.Equal(t, p.APIKey, "abcd") 40 | assert.Equal(t, p.Notifier.Name, "health") 41 | assert.Equal(t, len(p.Events), 1) 42 | 43 | evt := p.Events[0] 44 | assert.Equal(t, evt.Context, "users/get") 45 | assert.Equal(t, evt.App.ReleaseStage, "staging") 46 | assert.Equal(t, len(evt.Exceptions), 1) 47 | 48 | ex := evt.Exceptions[0] 49 | assert.Equal(t, ex.ErrorClass, "foo.bar") 50 | assert.Equal(t, ex.Message, "imanerror") 51 | 52 | frame := ex.Stacktrace[0] 53 | assert.True(t, strings.HasSuffix(frame.File, "api_test.go")) 54 | assert.Equal(t, frame.Method, "github.com/gocraft/health/sinks/bugsnag:TestNotify") 55 | 56 | } 57 | 58 | type notifyHandler struct { 59 | PayloadChan chan *payload 60 | } 61 | 62 | func (h notifyHandler) ServeHTTP(rw http.ResponseWriter, r *http.Request) { 63 | body, err := ioutil.ReadAll(r.Body) 64 | if err != nil { 65 | fmt.Fprintf(rw, "got error in ready body: %v", err) 66 | return 67 | } 68 | 69 | var resp payload 70 | err = json.Unmarshal(body, &resp) 71 | if err != nil { 72 | fmt.Fprintf(rw, "got error in unmarshal: %v", err) 73 | return 74 | } 75 | 76 | h.PayloadChan <- &resp 77 | 78 | fmt.Fprintf(rw, "OK") 79 | } 80 | -------------------------------------------------------------------------------- /stack/frame.go: -------------------------------------------------------------------------------- 1 | package stack 2 | 3 | import ( 4 | // "bytes" 5 | "fmt" 6 | // "io/ioutil" 7 | "runtime" 8 | "strings" 9 | ) 10 | 11 | // Frame contains all necessary information about to generate a line in a callstack. 12 | type Frame struct { 13 | File string 14 | LineNumber int 15 | Name string 16 | Package string 17 | IsSystemPackage bool 18 | ProgramCounter uintptr 19 | } 20 | 21 | // NewFrame popoulates a stack frame object from the program counter. 
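// If the pc cannot be resolved to a function, a Frame with only ProgramCounter set is returned.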
22 | func NewFrame(pc uintptr) Frame { 23 | frame := Frame{ProgramCounter: pc} 24 | if frame.Func() == nil { 25 | return frame 26 | } 27 | frame.Package, frame.Name = packageAndName(frame.Func()) 28 | 29 | // pc -1 because the program counters we use are usually return addresses, 30 | // and we want to show the line that corresponds to the function call 31 | frame.File, frame.LineNumber = frame.Func().FileLine(pc - 1) 32 | frame.IsSystemPackage = isSystemPackage(frame.File, frame.Package) 33 | 34 | return frame 35 | } 36 | 37 | // Func returns the function that this stackframe corresponds to 38 | func (frame *Frame) Func() *runtime.Func { 39 | if frame.ProgramCounter == 0 { 40 | return nil 41 | } 42 | return runtime.FuncForPC(frame.ProgramCounter) 43 | } 44 | 45 | func (frame *Frame) String() string { 46 | return fmt.Sprintf("%s:%d %s", frame.File, frame.LineNumber, frame.Name) 47 | } 48 | 49 | func packageAndName(fn *runtime.Func) (string, string) { 50 | name := fn.Name() 51 | pkg := "" 52 | 53 | // we first remove the path prefix if there is one. 54 | if lastslash := strings.LastIndex(name, "/"); lastslash >= 0 { 55 | pkg += name[:lastslash] + "/" 56 | name = name[lastslash+1:] 57 | } 58 | if period := strings.Index(name, "."); period >= 0 { 59 | pkg += name[:period] 60 | name = name[period+1:] 61 | } 62 | 63 | return pkg, name 64 | } 65 | 66 | var goroot = runtime.GOROOT() 67 | 68 | // isSystemPackage returns true iff the package is a system package like 'runtime' or 'net/http' 69 | func isSystemPackage(file, pkg string) bool { 70 | return strings.HasPrefix(file, goroot) 71 | } 72 | -------------------------------------------------------------------------------- /json_writer_sink.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io" 7 | ) 8 | 9 | type JsonWriterSink struct { 10 | io.Writer 11 | } 12 | 13 | func (j *JsonWriterSink) EmitEvent(job string, event string, kvs map[string]string) { 14 | 15 | b, err := json.Marshal(struct { 16 | Job string 17 | Event string 18 | Timestamp string 19 | Kvs map[string]string 20 | }{job, event, timestamp(), kvs}) 21 | 22 | if err != nil { 23 | return 24 | } 25 | j.Write(b) 26 | } 27 | 28 | func (j *JsonWriterSink) EmitEventErr(job string, event string, err error, kvs map[string]string) { 29 | 30 | b, err := json.Marshal(struct { 31 | Job string 32 | Event string 33 | Timestamp string 34 | Err string 35 | Kvs map[string]string 36 | }{job, event, timestamp(), fmt.Sprint(err), kvs}) 37 | 38 | if err != nil { 39 | return 40 | } 41 | j.Write(b) 42 | } 43 | 44 | func (j *JsonWriterSink) EmitTiming(job string, event string, nanoseconds int64, kvs map[string]string) { 45 | 46 | b, err := json.Marshal(struct { 47 | Job string 48 | Event string 49 | Timestamp string 50 | Nanoseconds int64 51 | Kvs map[string]string 52 | }{job, event, timestamp(), nanoseconds, kvs}) 53 | 54 | if err != nil { 55 | return 56 | } 57 | j.Write(b) 58 | } 59 | 60 | func (j *JsonWriterSink) EmitGauge(job string, event string, value float64, kvs map[string]string) { 61 | 62 | b, err := json.Marshal(struct { 63 | Job string 64 | Event string 65 | Timestamp string 66 | Value float64 67 | Kvs map[string]string 68 | }{job, event, timestamp(), value, kvs}) 69 | 70 | if err != nil { 71 | return 72 | } 73 | j.Write(b) 74 | } 75 | 76 | func (j *JsonWriterSink) EmitComplete(job string, status CompletionStatus, nanoseconds int64, kvs map[string]string) { 77 | 78 | b, err := 
json.Marshal(struct { 79 | Job string 80 | Status string 81 | Timestamp string 82 | Nanoseconds int64 83 | Kvs map[string]string 84 | }{job, status.String(), timestamp(), nanoseconds, kvs}) 85 | 86 | if err != nil { 87 | return 88 | } 89 | j.Write(b) 90 | } 91 | -------------------------------------------------------------------------------- /interval_aggregation_merge.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | // Merge merges intAgg into ia, mutating ia. 4 | // Requires that ia and intAgg are a fully valid with no nil maps. 5 | func (ia *IntervalAggregation) Merge(intAgg *IntervalAggregation) { 6 | ia.aggregationMaps.merge(&intAgg.aggregationMaps) 7 | 8 | for k, v := range intAgg.Jobs { 9 | if existingJob, ok := ia.Jobs[k]; ok { 10 | existingJob.merge(v) 11 | } else { 12 | ia.Jobs[k] = v.Clone() 13 | } 14 | } 15 | 16 | ia.SerialNumber++ 17 | } 18 | 19 | func (intoJob *JobAggregation) merge(fromJob *JobAggregation) { 20 | intoJob.aggregationMaps.merge(&fromJob.aggregationMaps) 21 | intoJob.TimerAggregation.merge(&fromJob.TimerAggregation) 22 | intoJob.CountSuccess += fromJob.CountSuccess 23 | intoJob.CountValidationError += fromJob.CountValidationError 24 | intoJob.CountPanic += fromJob.CountPanic 25 | intoJob.CountError += fromJob.CountError 26 | intoJob.CountJunk += fromJob.CountJunk 27 | } 28 | 29 | func (intoTa *TimerAggregation) merge(fromTa *TimerAggregation) { 30 | intoTa.Count += fromTa.Count 31 | intoTa.NanosSum += fromTa.NanosSum 32 | intoTa.NanosSumSquares += fromTa.NanosSumSquares 33 | if fromTa.NanosMin < intoTa.NanosMin { 34 | intoTa.NanosMin = fromTa.NanosMin 35 | } 36 | if fromTa.NanosMax > intoTa.NanosMax { 37 | intoTa.NanosMax = fromTa.NanosMax 38 | } 39 | } 40 | 41 | func (intoAm *aggregationMaps) merge(fromAm *aggregationMaps) { 42 | for k, v := range fromAm.Events { 43 | intoAm.Events[k] += v 44 | } 45 | 46 | for k, v := range fromAm.Gauges { 47 | intoAm.Gauges[k] = v 48 | } 49 | 50 | for k, v := range fromAm.Timers { 51 | if existingTimer, ok := intoAm.Timers[k]; ok { 52 | existingTimer.merge(v) 53 | } else { 54 | intoAm.Timers[k] = v.Clone() 55 | } 56 | } 57 | 58 | for k, v := range fromAm.EventErrs { 59 | if existingErrCounter, ok := intoAm.EventErrs[k]; ok { 60 | existingErrCounter.Count += v.Count 61 | 62 | // merging two ring buffers given our shitty implementation is problematic. 63 | for _, err := range v.errorSamples { 64 | if err != nil { 65 | existingErrCounter.addError(err) 66 | } 67 | } 68 | } else { 69 | intoAm.EventErrs[k] = v.Clone() 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /sinks/bugsnag/sink.go: -------------------------------------------------------------------------------- 1 | package bugsnag 2 | 3 | import ( 4 | "fmt" 5 | "github.com/gocraft/health" 6 | "os" 7 | ) 8 | 9 | // This sink emits to a StatsD deaemon by sending it a UDP packet. 
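// In practice this sink queues unmuted, not-yet-emitted event errors on a channel, and a background goroutine reports each one to the Bugsnag notify API.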
10 | type Sink struct { 11 | *Config 12 | cmdChan chan *cmdEventErr 13 | doneChan chan int 14 | } 15 | 16 | type cmdEventErr struct { 17 | Job string 18 | Event string 19 | Err *health.UnmutedError 20 | Kvs map[string]string 21 | } 22 | 23 | func NewSink(config *Config) *Sink { 24 | const maxChanSize = 25 25 | 26 | if config.Endpoint == "" { 27 | config.Endpoint = "https://notify.bugsnag.com/" 28 | } 29 | 30 | s := &Sink{ 31 | Config: config, 32 | cmdChan: make(chan *cmdEventErr, maxChanSize), 33 | doneChan: make(chan int), 34 | } 35 | 36 | go errorProcessingLoop(s) 37 | 38 | return s 39 | } 40 | 41 | func (s *Sink) EmitEvent(job string, event string, kvs map[string]string) { 42 | // no-op 43 | } 44 | 45 | func (s *Sink) EmitEventErr(job string, event string, inputErr error, kvs map[string]string) { 46 | switch inputErr := inputErr.(type) { 47 | case *health.UnmutedError: 48 | if !inputErr.Emitted { 49 | s.cmdChan <- &cmdEventErr{Job: job, Event: event, Err: inputErr, Kvs: kvs} 50 | } 51 | case *health.MutedError: 52 | // Do nothing! 53 | default: // eg, case error: 54 | // This shouldn't happen, all errors passed in here should be wrapped. 55 | } 56 | } 57 | 58 | func (s *Sink) EmitTiming(job string, event string, nanos int64, kvs map[string]string) { 59 | // no-op 60 | } 61 | 62 | func (s *Sink) EmitGauge(job string, event string, value float64, kvs map[string]string) { 63 | // no-op 64 | } 65 | 66 | func (s *Sink) EmitComplete(job string, status health.CompletionStatus, nanos int64, kvs map[string]string) { 67 | // no-op 68 | } 69 | 70 | func (s *Sink) ShutdownServer() { 71 | s.doneChan <- 1 72 | } 73 | 74 | func errorProcessingLoop(sink *Sink) { 75 | cmdChan := sink.cmdChan 76 | doneChan := sink.doneChan 77 | 78 | PROCESSING_LOOP: 79 | for { 80 | select { 81 | case <-doneChan: 82 | break PROCESSING_LOOP 83 | case cmd := <-cmdChan: 84 | if err := Notify(sink.Config, cmd.Job, cmd.Event, cmd.Err, cmd.Err.Stack, cmd.Kvs); err != nil { 85 | fmt.Fprintf(os.Stderr, "bugsnag.Notify: could not notify bugsnag. err=%v\n", err) 86 | } 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /stack/stack_test.go: -------------------------------------------------------------------------------- 1 | package stack 2 | 3 | import ( 4 | "fmt" 5 | "regexp" 6 | "testing" 7 | // "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func level2() *Trace { 11 | return NewTrace(0) 12 | } 13 | 14 | func level1() *Trace { 15 | return level2() 16 | } 17 | 18 | func level0() *Trace { 19 | return level1() 20 | } 21 | 22 | func assertFrame(t *testing.T, frame *Frame, file string, line int, fun string) { 23 | testName := fmt.Sprintf("[file: %s line: %d fun: %s]", file, line, fun) 24 | 25 | if !regexp.MustCompile(file).MatchString(frame.File) { 26 | t.Errorf("assertFrame: %s didn't match file in %v", testName, frame) 27 | } 28 | 29 | if frame.LineNumber != line { 30 | t.Errorf("assertFrame: %s didn't match line in %v", testName, frame) 31 | } 32 | 33 | if frame.Name != fun { 34 | t.Errorf("assertFrame: %s didn't match function name in %v", testName, frame) 35 | } 36 | } 37 | 38 | func TestNewTrace(t *testing.T) { 39 | trace := level0() 40 | 41 | frames := trace.Frames() 42 | 43 | // Yes, this is a persnickety test that will fail as the file is modified. Sorry guise. 
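// The expected line numbers are the lines in this file where level2/level1/level0 make their calls, plus the level0() call in TestNewTrace.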
44 | assertFrame(t, &frames[0], "stack_test\\.go", 11, "level2") 45 | assertFrame(t, &frames[1], "stack_test\\.go", 15, "level1") 46 | assertFrame(t, &frames[2], "stack_test\\.go", 19, "level0") 47 | assertFrame(t, &frames[3], "stack_test\\.go", 39, "TestNewTrace") 48 | } 49 | 50 | type someT struct{} 51 | 52 | func (s someT) level2() *Trace { 53 | return NewTrace(0) 54 | } 55 | 56 | func (s someT) level1() *Trace { 57 | return s.level2() 58 | } 59 | 60 | func (s someT) level0() *Trace { 61 | return s.level1() 62 | } 63 | 64 | func TestNewTraceWithTypes(t *testing.T) { 65 | obj := &someT{} 66 | trace := obj.level0() 67 | 68 | frames := trace.Frames() 69 | 70 | // Yes, this is a persnickety test that will fail as the file is modified. Sorry guise. 71 | assertFrame(t, &frames[0], "stack_test\\.go", 53, "someT.level2") 72 | assertFrame(t, &frames[1], "stack_test\\.go", 57, "someT.level1") 73 | assertFrame(t, &frames[2], "stack_test\\.go", 61, "someT.level0") 74 | assertFrame(t, &frames[3], "stack_test\\.go", 66, "TestNewTraceWithTypes") 75 | } 76 | 77 | func TestStackPrint(t *testing.T) { 78 | trace := level0() 79 | stack := trace.Stack() 80 | reg := regexp.MustCompile("stack_test\\.go:11 level2\n.+stack_test\\.go:15 level1\n.+stack_test\\.go:19 level0") 81 | 82 | if !reg.Match(trace.Stack()) { 83 | t.Errorf("trace didn't match. Got:\n%s\n", string(stack)) 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /sinks/librato/sink_test.go: -------------------------------------------------------------------------------- 1 | package librato 2 | 3 | import ( 4 | "fmt" 5 | "github.com/gocraft/health" 6 | "github.com/stretchr/testify/assert" 7 | "testing" 8 | "time" 9 | ) 10 | 11 | func TestNewShutdown(t *testing.T) { 12 | s := New("a", "b", "c") 13 | defer s.Stop() 14 | 15 | assert.Equal(t, "a", s.libratoUser) 16 | assert.Equal(t, "b", s.libratoApiKey) 17 | assert.Equal(t, "c", s.prefix) 18 | } 19 | 20 | func TestEmit(t *testing.T) { 21 | s := New("a", "b", "c") 22 | 23 | s.EmitEvent("cool", "story", nil) 24 | s.EmitEvent("cool", "story", nil) 25 | s.EmitEvent("cool", "story", nil) 26 | 27 | s.EmitEventErr("sad", "day", fmt.Errorf("ok"), nil) 28 | s.EmitEventErr("sad", "day", fmt.Errorf("ok"), nil) 29 | 30 | s.EmitTiming("rad", "dino", 6000000, nil) 31 | s.EmitTiming("bad", "dino", 12000000, nil) 32 | 33 | s.EmitComplete("tylersmith", health.Success, 22000000, nil) 34 | s.EmitComplete("tylersmart", health.Junk, 8000000, nil) 35 | 36 | time.Sleep(3 * time.Millisecond) 37 | s.Stop() 38 | 39 | assert.Equal(t, int64(3), s.counters["c.story.count"]) 40 | assert.Equal(t, int64(3), s.counters["c.cool.story.count"]) 41 | assert.Equal(t, int64(2), s.counters["c.day.error.count"]) 42 | assert.Equal(t, int64(2), s.counters["c.sad.day.error.count"]) 43 | 44 | g := s.timers["c.dino.timing"] 45 | assert.Equal(t, int64(2), g.Count) 46 | assert.Equal(t, 18.0, g.Sum) 47 | assert.Equal(t, 6.0, g.Min) 48 | assert.Equal(t, 12.0, g.Max) 49 | assert.Equal(t, 180.0, g.SumSquares) 50 | assert.Equal(t, defaultTimerAttributes, g.Attributes) 51 | 52 | g = s.timers["c.rad.dino.timing"] 53 | assert.Equal(t, int64(1), g.Count) 54 | assert.Equal(t, 6.0, g.Sum) 55 | assert.Equal(t, 6.0, g.Min) 56 | assert.Equal(t, 6.0, g.Max) 57 | assert.Equal(t, 36.0, g.SumSquares) 58 | 59 | g = s.timers["c.bad.dino.timing"] 60 | assert.Equal(t, int64(1), g.Count) 61 | assert.Equal(t, 12.0, g.Sum) 62 | assert.Equal(t, 12.0, g.Min) 63 | assert.Equal(t, 12.0, g.Max) 64 | assert.Equal(t, 144.0, 
g.SumSquares) 65 | 66 | g = s.timers["c.tylersmith.success.timing"] 67 | assert.Equal(t, int64(1), g.Count) 68 | assert.Equal(t, 22.0, g.Sum) 69 | assert.Equal(t, 22.0, g.Min) 70 | assert.Equal(t, 22.0, g.Max) 71 | assert.Equal(t, 484.0, g.SumSquares) 72 | 73 | g = s.timers["c.tylersmart.junk.timing"] 74 | assert.Equal(t, int64(1), g.Count) 75 | assert.Equal(t, 8.0, g.Sum) 76 | assert.Equal(t, 8.0, g.Min) 77 | assert.Equal(t, 8.0, g.Max) 78 | assert.Equal(t, 64.0, g.SumSquares) 79 | 80 | } 81 | -------------------------------------------------------------------------------- /json_polling_sink.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | type JsonPollingSink struct { 8 | intervalDuration time.Duration 9 | cmdChan chan *emitCmd 10 | doneChan chan int 11 | doneDoneChan chan int 12 | intervalsChanChan chan chan []*IntervalAggregation 13 | } 14 | 15 | type cmdKind int 16 | 17 | const ( 18 | cmdKindEvent cmdKind = iota 19 | cmdKindEventErr 20 | cmdKindTiming 21 | cmdKindGauge 22 | cmdKindComplete 23 | ) 24 | 25 | type emitCmd struct { 26 | Kind cmdKind 27 | Job string 28 | Event string 29 | Err error 30 | Nanos int64 31 | Value float64 32 | Status CompletionStatus 33 | } 34 | 35 | func NewJsonPollingSink(intervalDuration time.Duration, retain time.Duration) *JsonPollingSink { 36 | const buffSize = 4096 // random-ass-guess 37 | 38 | s := &JsonPollingSink{ 39 | intervalDuration: intervalDuration, 40 | cmdChan: make(chan *emitCmd, buffSize), 41 | doneChan: make(chan int), 42 | doneDoneChan: make(chan int), 43 | intervalsChanChan: make(chan chan []*IntervalAggregation), 44 | } 45 | 46 | go startAggregator(intervalDuration, retain, s) 47 | 48 | return s 49 | } 50 | 51 | func (s *JsonPollingSink) ShutdownServer() { 52 | s.doneChan <- 1 53 | <-s.doneDoneChan 54 | } 55 | 56 | func (s *JsonPollingSink) EmitEvent(job string, event string, kvs map[string]string) { 57 | s.cmdChan <- &emitCmd{Kind: cmdKindEvent, Job: job, Event: event} 58 | } 59 | 60 | func (s *JsonPollingSink) EmitEventErr(job string, event string, inputErr error, kvs map[string]string) { 61 | s.cmdChan <- &emitCmd{Kind: cmdKindEventErr, Job: job, Event: event, Err: inputErr} 62 | } 63 | 64 | func (s *JsonPollingSink) EmitTiming(job string, event string, nanos int64, kvs map[string]string) { 65 | s.cmdChan <- &emitCmd{Kind: cmdKindTiming, Job: job, Event: event, Nanos: nanos} 66 | } 67 | 68 | func (s *JsonPollingSink) EmitGauge(job string, event string, value float64, kvs map[string]string) { 69 | s.cmdChan <- &emitCmd{Kind: cmdKindGauge, Job: job, Event: event, Value: value} 70 | } 71 | 72 | func (s *JsonPollingSink) EmitComplete(job string, status CompletionStatus, nanos int64, kvs map[string]string) { 73 | s.cmdChan <- &emitCmd{Kind: cmdKindComplete, Job: job, Status: status, Nanos: nanos} 74 | } 75 | 76 | func (s *JsonPollingSink) GetMetrics() []*IntervalAggregation { 77 | intervalsChan := make(chan []*IntervalAggregation) 78 | s.intervalsChanChan <- intervalsChan 79 | return <-intervalsChan 80 | } 81 | -------------------------------------------------------------------------------- /error_test.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "fmt" 5 | "github.com/stretchr/testify/assert" 6 | "testing" 7 | ) 8 | 9 | func TestUnmutedErrors(t *testing.T) { 10 | stream := NewStream() 11 | sink := &testSink{} 12 | stream.AddSink(sink) 13 | job := 
stream.NewJob("myjob") 14 | 15 | origErr := fmt.Errorf("wat") 16 | retErr := job.EventErr("abcd", origErr) 17 | 18 | // retErr is an UnmutedError with Emitted=true 19 | if retErr, ok := retErr.(*UnmutedError); ok { 20 | assert.True(t, retErr.Emitted) 21 | assert.Equal(t, retErr.Err, origErr) 22 | } else { 23 | t.Errorf("expected retErr to be an *UnmutedError") 24 | } 25 | 26 | // LastErr has Emitted=false, WasUnmuted=true 27 | assert.NotNil(t, sink.LastErr) 28 | assert.True(t, sink.LastErrUnmuted) 29 | assert.False(t, sink.LastErrEmitted) 30 | 31 | // Log it again! 32 | retErr2 := job.EventErr("abcdefg", retErr) 33 | 34 | // retErr is an UnmutedError with Emitted=true 35 | if retErr2, ok := retErr2.(*UnmutedError); ok { 36 | assert.True(t, retErr2.Emitted) 37 | assert.Equal(t, retErr2.Err, origErr) // We don't endlessly wrap UnmutedErrors inside UnmutedErrors 38 | } else { 39 | t.Errorf("expected retErr to be an *UnmutedError") 40 | } 41 | 42 | // LastErr has Emitted=false, WasUnmuted=true 43 | assert.NotNil(t, sink.LastErr) 44 | assert.True(t, sink.LastErrUnmuted) 45 | assert.True(t, sink.LastErrEmitted) 46 | } 47 | 48 | func TestMutedErrors(t *testing.T) { 49 | stream := NewStream() 50 | sink := &testSink{} 51 | stream.AddSink(sink) 52 | job := stream.NewJob("myjob") 53 | 54 | origErr := fmt.Errorf("wat") 55 | mutedOrig := Mute(origErr) 56 | retErr := job.EventErr("abcd", mutedOrig) 57 | 58 | // retErr is an UnmutedError with Emitted=true 59 | if retErr, ok := retErr.(*MutedError); ok { 60 | assert.Equal(t, retErr.Err, origErr) 61 | } else { 62 | t.Errorf("expected retErr to be an *MutedError") 63 | } 64 | 65 | // LastErr has Emitted=false, WasUnmuted=true 66 | assert.NotNil(t, sink.LastErr) 67 | assert.True(t, sink.LastErrMuted) 68 | 69 | // Log it again! 70 | retErr2 := job.EventErr("abcdefg", retErr) 71 | 72 | // retErr is an UnmutedError with Emitted=true 73 | if retErr2, ok := retErr2.(*MutedError); ok { 74 | assert.Equal(t, retErr2.Err, origErr) // We don't endlessly wrap MutedErrors inside MutedErrors 75 | } else { 76 | t.Errorf("expected retErr to be an *MutedError") 77 | } 78 | 79 | // LastErr has Emitted=false, WasUnmuted=true 80 | assert.NotNil(t, sink.LastErr) 81 | assert.True(t, sink.LastErrMuted) 82 | } 83 | -------------------------------------------------------------------------------- /interval_aggregation_merge_test.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | // Let's leverage clone's fixture data and make sure we can merge into a new blank aggregation to get the same data. 
11 | func TestMergeBasic(t *testing.T) { 12 | setNowMock("2011-09-09T23:36:13Z") 13 | defer resetNowMock() 14 | 15 | a := aggregatorWithData() 16 | intAgg := a.intervalAggregations[0] 17 | assertAggregationData(t, intAgg) 18 | newAgg := NewIntervalAggregation(intAgg.IntervalStart) 19 | newAgg.Merge(intAgg) 20 | assertAggregationData(t, newAgg) 21 | } 22 | 23 | func TestMerge(t *testing.T) { 24 | setNowMock("2011-09-09T23:36:13Z") 25 | defer resetNowMock() 26 | 27 | // Make two aggregations, merge together: 28 | a := aggregatorWithData() 29 | intAgg := a.intervalAggregations[0] 30 | a2 := aggregatorWithData() 31 | intAgg2 := a2.intervalAggregations[0] 32 | 33 | // Modify a gauge: 34 | a2.EmitGauge("job0", "gauge1", 5.5) 35 | 36 | intAgg.Merge(intAgg2) 37 | 38 | // same number of events: 39 | assert.Equal(t, 300, len(intAgg.Jobs)) 40 | assert.Equal(t, 1200, len(intAgg.Events)) 41 | assert.Equal(t, 1200, len(intAgg.Timers)) 42 | assert.Equal(t, 1200, len(intAgg.Gauges)) 43 | assert.Equal(t, 1200, len(intAgg.EventErrs)) 44 | 45 | // Spot-check events: 46 | assert.EqualValues(t, 2, intAgg.Events["event0"]) 47 | 48 | // Spot-check gauges: 49 | assert.EqualValues(t, 3.14, intAgg.Gauges["gauge0"]) 50 | assert.EqualValues(t, 5.5, intAgg.Gauges["gauge1"]) // 5.5 takes precedence over 3.14 (argument to merge takes precedence.) 51 | 52 | // Spot-check timings: 53 | assert.EqualValues(t, 2, intAgg.Timers["timing0"].Count) 54 | assert.EqualValues(t, 24, intAgg.Timers["timing0"].NanosSum) 55 | 56 | // Spot-check event-errs: 57 | assert.EqualValues(t, 2, intAgg.EventErrs["err0"].Count) 58 | assert.EqualValues(t, []error{fmt.Errorf("wat")}, intAgg.EventErrs["err0"].getErrorSamples()) 59 | 60 | // Spot-check jobs: 61 | job := intAgg.Jobs["job0"] 62 | assert.EqualValues(t, 2, job.CountSuccess) 63 | assert.EqualValues(t, 0, job.CountError) 64 | assert.EqualValues(t, 2, job.Events["event0"]) 65 | assert.EqualValues(t, 0, job.Events["event4"]) 66 | assert.EqualValues(t, 3.14, job.Gauges["gauge0"]) 67 | assert.EqualValues(t, 2, job.Timers["timing0"].Count) 68 | assert.EqualValues(t, 24, job.Timers["timing0"].NanosSum) 69 | assert.EqualValues(t, 2, job.EventErrs["err0"].Count) 70 | assert.Equal(t, []error{fmt.Errorf("wat")}, job.EventErrs["err0"].getErrorSamples()) 71 | } 72 | -------------------------------------------------------------------------------- /cmd/healthd/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "github.com/gocraft/health" 6 | "github.com/gocraft/health/healthd" 7 | "os" 8 | "strings" 9 | "time" 10 | ) 11 | 12 | // TODO's: 13 | // - in /overall, make interval_start and interval_duration make sense. 14 | // - maybe add interval_end? 15 | // - some easy way of exposing errors 16 | // - add to health the concept of a daemon/service name. 17 | // - so getting /health gives you: 18 | // - name (eg, metroid_api), version?, git sha?, process, time ranges, and then the metrics. 19 | 20 | // nice to have's 21 | // - handle the case when time goes backwards. 22 | // - we need some way to not return all data from clients on every request. Necessary? 23 | func main() { 24 | // Get inputs. Read from env variables for now (command line options?) 25 | monitoredHostPorts := getMonitoredHostPorts() 26 | serverHostPort := getServerHostPort() 27 | healthHostPort := getHealthHostPort() 28 | 29 | // Monitor ourselves. 
This will make our own instrumentation show up in the healthd output 30 | // I'm not totally sure we want to do this, but (shrug) seems reasonable right now. 31 | monitoredHostPorts = append(monitoredHostPorts, healthHostPort) 32 | 33 | // Setup our health stream. 34 | // Log to stdout and a setup an polling sink 35 | stream := health.NewStream() 36 | stream.AddSink(&health.WriterSink{os.Stdout}) 37 | jsonPollingSink := health.NewJsonPollingSink(time.Minute, time.Minute*5) 38 | jsonPollingSink.StartServer(healthHostPort) 39 | stream.AddSink(jsonPollingSink) 40 | 41 | // Say we're starting! 42 | stream.EventKv("starting", health.Kvs{ 43 | "monitored_host_ports": strings.Join(monitoredHostPorts, ","), 44 | "server_host_port": serverHostPort, 45 | "health_host_port": healthHostPort, 46 | }) 47 | 48 | // Start the healthd aggregators in a goroutine(s) 49 | healthd.StartNewHealthD(monitoredHostPorts, serverHostPort, stream) 50 | 51 | // Block 52 | select {} 53 | } 54 | 55 | func getHealthHostPort() string { 56 | ret := os.Getenv("HEALTH_HOSTPORT") 57 | if ret == "" { 58 | ret = ":5030" 59 | } 60 | return ret 61 | } 62 | 63 | func getMonitoredHostPorts() []string { 64 | hps := os.Getenv("HEALTHD_MONITORED_HOSTPORTS") 65 | if hps == "" { 66 | fmt.Println("no hosts to monitor. Pass them in with the environment variable HEALTHD_MONITORED_HOSTPORTS") 67 | fmt.Println("example: $ HEALTHD_MONITORED_HOSTPORTS=web31:5020,web32:5020 ./healthd") 68 | os.Exit(1) 69 | } 70 | return strings.Split(hps, ",") 71 | } 72 | 73 | func getServerHostPort() string { 74 | ret := os.Getenv("HEALTHD_SERVER_HOSTPORT") 75 | if ret == "" { 76 | ret = ":5031" 77 | } 78 | return ret 79 | } 80 | -------------------------------------------------------------------------------- /runtime_metrics/runtime_metrics.go: -------------------------------------------------------------------------------- 1 | package runtime_metrics 2 | 3 | import ( 4 | "github.com/gocraft/health" 5 | "runtime" 6 | "runtime/debug" 7 | "time" 8 | ) 9 | 10 | type RuntimeMetrics struct { 11 | stream health.EventReceiver 12 | options Options 13 | stopChan chan bool 14 | stopStopChan chan bool 15 | } 16 | 17 | type Options struct { 18 | Interval time.Duration 19 | 20 | Memory bool 21 | GC bool 22 | GCQuantile bool 23 | Goroutines bool 24 | Cgo bool 25 | FDs bool 26 | } 27 | 28 | func NewRuntimeMetrics(stream health.EventReceiver, options *Options) *RuntimeMetrics { 29 | rm := &RuntimeMetrics{ 30 | stream: stream, 31 | stopChan: make(chan bool), 32 | stopStopChan: make(chan bool), 33 | } 34 | 35 | if options != nil { 36 | rm.options = *options 37 | } else { 38 | rm.options = Options{time.Second * 5, true, true, true, true, true, true} 39 | } 40 | 41 | return rm 42 | } 43 | 44 | func (rm *RuntimeMetrics) Start() { 45 | go rm.metricsPoller() 46 | } 47 | 48 | func (rm *RuntimeMetrics) Stop() { 49 | rm.stopChan <- true 50 | <-rm.stopStopChan 51 | } 52 | 53 | func (rm *RuntimeMetrics) metricsPoller() { 54 | ticker := time.NewTicker(rm.options.Interval) 55 | 56 | METRICS_POOLER_LOOP: 57 | for { 58 | select { 59 | case <-rm.stopChan: 60 | break METRICS_POOLER_LOOP 61 | case <-ticker.C: 62 | rm.Report() 63 | } 64 | } 65 | 66 | ticker.Stop() 67 | rm.stopStopChan <- true 68 | } 69 | 70 | func (rm *RuntimeMetrics) Report() { 71 | var mem runtime.MemStats 72 | runtime.ReadMemStats(&mem) 73 | 74 | if rm.options.Memory { 75 | // bytes allocated and not yet freed 76 | rm.reportGauge("alloc", float64(mem.Alloc)) 77 | 78 | // total number of allocated objects 79 | 
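// (HeapObjects counts live heap objects, i.e. objects allocated and not yet freed)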
rm.reportGauge("heap_objects", float64(mem.HeapObjects)) 80 | } 81 | 82 | if rm.options.GC { 83 | rm.reportGauge("pause_total_ns", float64(mem.PauseTotalNs)) 84 | rm.reportGauge("num_gc", float64(mem.NumGC)) 85 | rm.reportGauge("next_gc", float64(mem.NextGC)) 86 | rm.reportGauge("gc_cpu_fraction", mem.GCCPUFraction) 87 | } 88 | 89 | if rm.options.GCQuantile { 90 | var gc debug.GCStats 91 | gc.PauseQuantiles = make([]time.Duration, 3) 92 | debug.ReadGCStats(&gc) 93 | rm.reportGauge("gc_pause_quantile_50", float64(gc.PauseQuantiles[1]/1000)/1000.0) 94 | rm.reportGauge("gc_pause_quantile_max", float64(gc.PauseQuantiles[2]/1000)/1000.0) 95 | } 96 | 97 | if rm.options.Goroutines { 98 | rm.reportGauge("num_goroutines", float64(runtime.NumGoroutine())) 99 | } 100 | 101 | if rm.options.Cgo { 102 | rm.reportGauge("num_cgo_call", float64(runtime.NumCgoCall())) 103 | } 104 | 105 | if rm.options.FDs { 106 | if num, err := getFDUsage(); err == nil { 107 | rm.reportGauge("num_fds_used", float64(num)) 108 | } 109 | } 110 | } 111 | 112 | func (rm *RuntimeMetrics) reportGauge(key string, val float64) { 113 | rm.stream.Gauge(key, val) 114 | } 115 | -------------------------------------------------------------------------------- /cmd/healthtop/hosts.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "github.com/buger/goterm" 7 | "github.com/gocraft/health/healthd" 8 | "io/ioutil" 9 | "net/http" 10 | "strings" 11 | "time" 12 | ) 13 | 14 | func hostsLoop() { 15 | secondTicker := time.Tick(1 * time.Second) 16 | 17 | var lastApiResponse *healthd.ApiResponseHosts 18 | var hStatus healthdStatus 19 | 20 | responses := make(chan *healthd.ApiResponseHosts) 21 | errors := make(chan error) 22 | 23 | go pollHealthDHosts(responses, errors) 24 | for { 25 | select { 26 | case <-secondTicker: 27 | go pollHealthDHosts(responses, errors) 28 | printHosts(lastApiResponse, &hStatus) 29 | case resp := <-responses: 30 | lastApiResponse = resp 31 | hStatus.lastSuccessAt = time.Now() 32 | printHosts(lastApiResponse, &hStatus) 33 | case err := <-errors: 34 | hStatus.lastErrorAt = time.Now() 35 | hStatus.lastError = err 36 | } 37 | } 38 | } 39 | 40 | func pollHealthDHosts(responses chan *healthd.ApiResponseHosts, errors chan error) { 41 | var body []byte 42 | 43 | uri := "http://" + sourceHostPort + "/healthd/hosts" 44 | 45 | resp, err := http.Get(uri) 46 | if err != nil { 47 | errors <- err 48 | return 49 | } 50 | defer resp.Body.Close() 51 | body, err = ioutil.ReadAll(resp.Body) 52 | if err != nil { 53 | errors <- err 54 | return 55 | } 56 | 57 | var response healthd.ApiResponseHosts 58 | if err := json.Unmarshal(body, &response); err != nil { 59 | errors <- err 60 | return 61 | } 62 | 63 | responses <- &response 64 | } 65 | 66 | func printHosts(lastApiResponse *healthd.ApiResponseHosts, status *healthdStatus) { 67 | goterm.Clear() // Clear current screen 68 | goterm.MoveCursor(1, 1) 69 | defer goterm.Flush() 70 | goterm.Println("Current Time:", status.FmtNow(), " Status:", status.FmtStatus()) 71 | 72 | // 73 | if lastApiResponse == nil { 74 | goterm.Println("no data yet") 75 | return 76 | } 77 | 78 | columns := []string{ 79 | "Host:Port", 80 | "Status", 81 | "Last Checked", 82 | "Last Response Time", 83 | } 84 | 85 | for i, s := range columns { 86 | columns[i] = goterm.Bold(goterm.Color(s, goterm.BLACK)) 87 | } 88 | 89 | table := goterm.NewTable(0, goterm.Width()-1, 5, ' ', 0) 90 | fmt.Fprintf(table, "%s\n", strings.Join(columns, 
"\t")) 91 | 92 | for _, host := range lastApiResponse.Hosts { 93 | printHost(table, host) 94 | } 95 | 96 | goterm.Println(table) 97 | } 98 | 99 | func printHost(table *goterm.Table, host *healthd.HostStatus) { 100 | success := host.LastCode == 200 && host.LastErr == "" 101 | var status string 102 | if success { 103 | status = "Success" 104 | } else if host.LastCheckTime.IsZero() { 105 | status = "Unknown" 106 | } else { 107 | status = "Failure: " + host.LastErr 108 | } 109 | 110 | printCellString(host.HostPort, table, true, false, false) 111 | printCellString(status, table, false, success, !success) 112 | printCellString(host.LastCheckTime.Format(time.RFC1123), table, false, false, false) 113 | printCellNanos(int64(host.LastNanos), table, false, false, false) 114 | fmt.Fprintf(table, "\n") 115 | } 116 | -------------------------------------------------------------------------------- /health_test.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "errors" 5 | "github.com/stretchr/testify/assert" 6 | "testing" 7 | ) 8 | 9 | type testSink struct { 10 | LastEmitKind string // "Event", "EventErr", ..., "Complete" 11 | LastJob string 12 | LastEvent string 13 | 14 | LastErr error 15 | LastErrEmitted bool 16 | LastErrUnmuted bool 17 | LastErrMuted bool 18 | LastErrRaw bool 19 | 20 | LastNanos int64 21 | LastValue float64 22 | LastKvs map[string]string 23 | LastStatus CompletionStatus 24 | } 25 | 26 | func (s *testSink) EmitEvent(job string, event string, kvs map[string]string) { 27 | s.LastEmitKind = "Event" 28 | s.LastJob = job 29 | s.LastEvent = event 30 | s.LastKvs = kvs 31 | } 32 | 33 | func (s *testSink) EmitEventErr(job string, event string, inputErr error, kvs map[string]string) { 34 | s.LastEmitKind = "EventErr" 35 | s.LastJob = job 36 | s.LastEvent = event 37 | s.LastKvs = kvs 38 | s.LastErr = inputErr 39 | 40 | switch inputErr := inputErr.(type) { 41 | case *UnmutedError: 42 | s.LastErrUnmuted = true 43 | s.LastErrEmitted = inputErr.Emitted 44 | case *MutedError: 45 | s.LastErrMuted = true 46 | default: // eg, case error: 47 | s.LastErrRaw = true 48 | } 49 | } 50 | func (s *testSink) EmitTiming(job string, event string, nanos int64, kvs map[string]string) { 51 | s.LastEmitKind = "Timing" 52 | s.LastJob = job 53 | s.LastEvent = event 54 | s.LastKvs = kvs 55 | s.LastNanos = nanos 56 | } 57 | func (s *testSink) EmitGauge(job string, event string, value float64, kvs map[string]string) { 58 | s.LastEmitKind = "Gauge" 59 | s.LastJob = job 60 | s.LastEvent = event 61 | s.LastKvs = kvs 62 | s.LastValue = value 63 | } 64 | func (s *testSink) EmitComplete(job string, status CompletionStatus, nanos int64, kvs map[string]string) { 65 | s.LastEmitKind = "Complete" 66 | s.LastJob = job 67 | s.LastKvs = kvs 68 | s.LastNanos = nanos 69 | s.LastStatus = status 70 | } 71 | 72 | func successFunc() error { 73 | return nil 74 | } 75 | 76 | func errorFunc() error { 77 | return errors.New("sad_day") 78 | } 79 | 80 | func panicFunc() error { 81 | panic("wat") 82 | return nil 83 | } 84 | 85 | func TestRun(t *testing.T) { 86 | s := NewStream() 87 | 88 | ts := &testSink{} 89 | s.AddSink(ts) 90 | 91 | err := s.Run("foo", successFunc) 92 | assert.NoError(t, err) 93 | 94 | assert.Equal(t, "Complete", ts.LastEmitKind) 95 | assert.Equal(t, "foo", ts.LastJob) 96 | assert.Equal(t, Success, ts.LastStatus) 97 | 98 | err = s.Run("foo", errorFunc) 99 | assert.Equal(t, "sad_day", err.Error()) 100 | 101 | assert.Equal(t, "Complete", ts.LastEmitKind) 102 
| assert.Equal(t, "foo", ts.LastJob) 103 | assert.Equal(t, Error, ts.LastStatus) 104 | 105 | err = s.Run("foo", panicFunc) 106 | assert.Equal(t, "wat", err.Error()) 107 | 108 | assert.Equal(t, "Complete", ts.LastEmitKind) 109 | assert.Equal(t, "foo", ts.LastJob) 110 | assert.Equal(t, Panic, ts.LastStatus) 111 | 112 | // Panicing will fire an EventErr and then a Complete(Panic) 113 | // This test relies on the fact that LastErr isn't cleared when a Complete comes in 114 | assert.Equal(t, "wat", ts.LastErr.Error()) 115 | 116 | // Now just make sure that job also has a similar Run function: 117 | j := s.NewJob("bob") 118 | err = j.Run(successFunc) 119 | assert.NoError(t, err) 120 | 121 | assert.Equal(t, "Complete", ts.LastEmitKind) 122 | assert.Equal(t, "bob", ts.LastJob) 123 | assert.Equal(t, Success, ts.LastStatus) 124 | } 125 | -------------------------------------------------------------------------------- /writer_sink.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "sort" 8 | "time" 9 | ) 10 | 11 | // This sink writes bytes in a format that a human might like to read in a logfile 12 | // This can be used to log to Stdout: 13 | // .AddSink(&WriterSink{os.Stdout}) 14 | // And to a file: 15 | // f, err := os.OpenFile(fname, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) 16 | // .AddSink(&WriterSink{f}) 17 | // And to syslog: 18 | // w, err := syslog.New(LOG_INFO, "wat") 19 | // .AddSink(&WriterSink{w}) 20 | type WriterSink struct { 21 | io.Writer 22 | } 23 | 24 | func (s *WriterSink) EmitEvent(job string, event string, kvs map[string]string) { 25 | var b bytes.Buffer 26 | b.WriteRune('[') 27 | b.WriteString(timestamp()) 28 | b.WriteString("]: job:") 29 | b.WriteString(job) 30 | b.WriteString(" event:") 31 | b.WriteString(event) 32 | writeMapConsistently(&b, kvs) 33 | b.WriteRune('\n') 34 | s.Writer.Write(b.Bytes()) 35 | } 36 | 37 | func (s *WriterSink) EmitEventErr(job string, event string, inputErr error, kvs map[string]string) { 38 | var b bytes.Buffer 39 | b.WriteRune('[') 40 | b.WriteString(timestamp()) 41 | b.WriteString("]: job:") 42 | b.WriteString(job) 43 | b.WriteString(" event:") 44 | b.WriteString(event) 45 | b.WriteString(" err:") 46 | b.WriteString(inputErr.Error()) 47 | writeMapConsistently(&b, kvs) 48 | b.WriteRune('\n') 49 | s.Writer.Write(b.Bytes()) 50 | } 51 | 52 | func (s *WriterSink) EmitTiming(job string, event string, nanos int64, kvs map[string]string) { 53 | var b bytes.Buffer 54 | b.WriteRune('[') 55 | b.WriteString(timestamp()) 56 | b.WriteString("]: job:") 57 | b.WriteString(job) 58 | b.WriteString(" event:") 59 | b.WriteString(event) 60 | b.WriteString(" time:") 61 | writeNanoseconds(&b, nanos) 62 | writeMapConsistently(&b, kvs) 63 | b.WriteRune('\n') 64 | s.Writer.Write(b.Bytes()) 65 | } 66 | 67 | func (s *WriterSink) EmitGauge(job string, event string, value float64, kvs map[string]string) { 68 | var b bytes.Buffer 69 | b.WriteRune('[') 70 | b.WriteString(timestamp()) 71 | b.WriteString("]: job:") 72 | b.WriteString(job) 73 | b.WriteString(" event:") 74 | b.WriteString(event) 75 | b.WriteString(" gauge:") 76 | fmt.Fprintf(&b, "%g", value) 77 | writeMapConsistently(&b, kvs) 78 | b.WriteRune('\n') 79 | s.Writer.Write(b.Bytes()) 80 | } 81 | 82 | func (s *WriterSink) EmitComplete(job string, status CompletionStatus, nanos int64, kvs map[string]string) { 83 | var b bytes.Buffer 84 | b.WriteRune('[') 85 | b.WriteString(timestamp()) 86 | b.WriteString("]: job:") 87 | 
b.WriteString(job) 88 | b.WriteString(" status:") 89 | b.WriteString(status.String()) 90 | b.WriteString(" time:") 91 | writeNanoseconds(&b, nanos) 92 | writeMapConsistently(&b, kvs) 93 | b.WriteRune('\n') 94 | s.Writer.Write(b.Bytes()) 95 | } 96 | 97 | func timestamp() string { 98 | return time.Now().UTC().Format(time.RFC3339Nano) 99 | } 100 | 101 | func writeMapConsistently(b *bytes.Buffer, kvs map[string]string) { 102 | if kvs == nil { 103 | return 104 | } 105 | keys := make([]string, 0, len(kvs)) 106 | for k := range kvs { 107 | keys = append(keys, k) 108 | } 109 | sort.Strings(keys) 110 | keysLenMinusOne := len(keys) - 1 111 | 112 | b.WriteString(" kvs:[") 113 | for i, k := range keys { 114 | b.WriteString(k) 115 | b.WriteRune(':') 116 | b.WriteString(kvs[k]) 117 | 118 | if i != keysLenMinusOne { 119 | b.WriteRune(' ') 120 | } 121 | } 122 | b.WriteRune(']') 123 | } 124 | 125 | func writeNanoseconds(b *bytes.Buffer, nanos int64) { 126 | switch { 127 | case nanos > 2000000: 128 | fmt.Fprintf(b, "%d ms", nanos/1000000) 129 | case nanos > 2000: 130 | fmt.Fprintf(b, "%d μs", nanos/1000) 131 | default: 132 | fmt.Fprintf(b, "%d ns", nanos) 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /json_writer_sink_test.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "errors" 7 | "testing" 8 | 9 | "github.com/stretchr/testify/assert" 10 | ) 11 | 12 | type testJsonEvent struct { 13 | Job string 14 | Event string 15 | Timestamp string 16 | Err string 17 | Nanoseconds int64 18 | Value float64 19 | Status string 20 | Kvs map[string]string 21 | } 22 | 23 | func TestJsonWriterSinkEvent(t *testing.T) { 24 | var buf bytes.Buffer 25 | someKvs := map[string]string{"foo": "bar", "qux": "dog"} 26 | sink := JsonWriterSink{&buf} 27 | sink.EmitEvent("myjob", "myevent", someKvs) 28 | 29 | dec := json.NewDecoder(&buf) 30 | event := &testJsonEvent{} 31 | err := dec.Decode(event) 32 | 33 | assert.NoError(t, err) 34 | assert.Equal(t, "bar", event.Kvs["foo"]) 35 | assert.Equal(t, "myjob", event.Job) 36 | assert.Equal(t, "myevent", event.Event) 37 | } 38 | 39 | func TestJsonWriterSinkEventErr(t *testing.T) { 40 | var buf bytes.Buffer 41 | sink := JsonWriterSink{&buf} 42 | someKvs := map[string]string{"foo": "bar", "qux": "dog"} 43 | sink.EmitEventErr("myjob", "myevent", errors.New("test err"), someKvs) 44 | 45 | dec := json.NewDecoder(&buf) 46 | event := &testJsonEvent{} 47 | err := dec.Decode(event) 48 | 49 | assert.NoError(t, err) 50 | assert.Equal(t, "bar", event.Kvs["foo"]) 51 | assert.Equal(t, "myjob", event.Job) 52 | assert.Equal(t, "myevent", event.Event) 53 | assert.Equal(t, "test err", event.Err) 54 | } 55 | 56 | func TestJsonWriterSinkEventTiming(t *testing.T) { 57 | var buf bytes.Buffer 58 | sink := JsonWriterSink{&buf} 59 | someKvs := map[string]string{"foo": "bar", "qux": "dog"} 60 | sink.EmitTiming("myjob", "myevent", 34567890, someKvs) 61 | 62 | event := &testJsonEvent{} 63 | dec := json.NewDecoder(&buf) 64 | err := dec.Decode(event) 65 | 66 | assert.NoError(t, err) 67 | assert.Equal(t, "bar", event.Kvs["foo"]) 68 | assert.Equal(t, "myjob", event.Job) 69 | assert.Equal(t, "myevent", event.Event) 70 | assert.EqualValues(t, 34567890, event.Nanoseconds) 71 | } 72 | 73 | func TestJsonWriterSinkEventGauge(t *testing.T) { 74 | var buf bytes.Buffer 75 | sink := JsonWriterSink{&buf} 76 | someKvs := map[string]string{"foo": "bar", "qux": "dog"} 77 | 
sink.EmitGauge("myjob", "myevent", 3.14, someKvs) 78 | 79 | event := &testJsonEvent{} 80 | dec := json.NewDecoder(&buf) 81 | err := dec.Decode(event) 82 | 83 | assert.NoError(t, err) 84 | assert.Equal(t, "bar", event.Kvs["foo"]) 85 | assert.Equal(t, "myjob", event.Job) 86 | assert.Equal(t, "myevent", event.Event) 87 | assert.EqualValues(t, 3.14, event.Value) 88 | } 89 | 90 | func TestJsonWriterSinkEventComplete(t *testing.T) { 91 | var buf bytes.Buffer 92 | dec := json.NewDecoder(&buf) 93 | for kind, kindStr := range completionStatusToString { 94 | sink := JsonWriterSink{&buf} 95 | sink.EmitComplete("myjob", kind, 1204000, nil) 96 | 97 | event := &testJsonEvent{} 98 | err := dec.Decode(event) 99 | 100 | assert.NoError(t, err) 101 | 102 | assert.Equal(t, "myjob", event.Job) 103 | assert.Equal(t, kindStr, event.Status) 104 | assert.EqualValues(t, 1204000, event.Nanoseconds) 105 | buf.Reset() 106 | } 107 | } 108 | 109 | func BenchmarkJsonWriterSinkEmitBlankEvent(b *testing.B) { 110 | var buf bytes.Buffer 111 | sink := JsonWriterSink{&buf} 112 | b.ResetTimer() 113 | for i := 0; i < b.N; i++ { 114 | buf.Reset() 115 | sink.EmitEvent("myjob", "myevent", nil) 116 | } 117 | b.ReportAllocs() 118 | } 119 | 120 | func BenchmarkJsonWriterSinkEmitSmallEvent(b *testing.B) { 121 | var buf bytes.Buffer 122 | someKvs := map[string]string{"foo": "bar", "qux": "dog"} 123 | sink := JsonWriterSink{&buf} 124 | b.ResetTimer() 125 | for i := 0; i < b.N; i++ { 126 | buf.Reset() 127 | sink.EmitEvent("myjob", "myevent", someKvs) 128 | } 129 | b.ReportAllocs() 130 | } 131 | -------------------------------------------------------------------------------- /healthd/healthd_test.go: -------------------------------------------------------------------------------- 1 | package healthd 2 | 3 | import ( 4 | "encoding/json" 5 | "net/http" 6 | "net/http/httptest" 7 | "testing" 8 | "time" 9 | 10 | "github.com/gocraft/health" 11 | "github.com/stretchr/testify/assert" 12 | ) 13 | 14 | func TestHealthD(t *testing.T) { 15 | // Make two sinks: 16 | sink := health.NewJsonPollingSink(time.Minute, time.Minute*5) 17 | sink.StartServer(":6050") 18 | sink.EmitEvent("foo", "bar", nil) 19 | sink.EmitTiming("foo", "baz", 1234, nil) 20 | sink.EmitComplete("foo", health.Success, 5678, nil) 21 | 22 | sink2 := health.NewJsonPollingSink(time.Minute, time.Minute*5) 23 | sink2.StartServer(":6051") 24 | sink2.EmitEvent("foo", "bar", nil) 25 | sink2.EmitTiming("foo", "baz", 4321, nil) 26 | sink2.EmitComplete("foo", health.ValidationError, 8765, nil) 27 | 28 | hd := StartNewHealthD([]string{":6050", ":6051"}, ":6060", health.NewStream()) 29 | 30 | defer func() { 31 | hd.Stop() 32 | time.Sleep(time.Millisecond) 33 | }() 34 | 35 | time.Sleep(time.Millisecond * 15) 36 | 37 | testAggregations(t, hd) 38 | testAggregationsOverall(t, hd) 39 | testJobs(t, hd) 40 | testHosts(t, hd) 41 | 42 | } 43 | 44 | func testAggregations(t *testing.T, hd *HealthD) { 45 | recorder := httptest.NewRecorder() 46 | request, _ := http.NewRequest("GET", "/healthd/aggregations", nil) 47 | hd.apiRouter().ServeHTTP(recorder, request) 48 | assert.Equal(t, 200, recorder.Code) 49 | 50 | var resp ApiResponseAggregations 51 | err := json.Unmarshal(recorder.Body.Bytes(), &resp) 52 | 53 | assert.NoError(t, err) 54 | assert.Equal(t, len(resp.Aggregations), 1) 55 | assertFooBarAggregation(t, resp.Aggregations[0]) 56 | } 57 | 58 | func testAggregationsOverall(t *testing.T, hd *HealthD) { 59 | recorder := httptest.NewRecorder() 60 | request, _ := http.NewRequest("GET", 
"/healthd/aggregations/overall", nil) 61 | hd.apiRouter().ServeHTTP(recorder, request) 62 | assert.Equal(t, 200, recorder.Code) 63 | 64 | var resp ApiResponseAggregationsOverall 65 | err := json.Unmarshal(recorder.Body.Bytes(), &resp) 66 | 67 | assert.NoError(t, err) 68 | assert.NotNil(t, resp.Overall) 69 | assertFooBarAggregation(t, resp.Overall) 70 | } 71 | 72 | func testJobs(t *testing.T, hd *HealthD) { 73 | recorder := httptest.NewRecorder() 74 | request, _ := http.NewRequest("GET", "/healthd/jobs", nil) 75 | hd.apiRouter().ServeHTTP(recorder, request) 76 | assert.Equal(t, 200, recorder.Code) 77 | 78 | var resp ApiResponseJobs 79 | err := json.Unmarshal(recorder.Body.Bytes(), &resp) 80 | 81 | assert.NoError(t, err) 82 | assert.Equal(t, len(resp.Jobs), 1) 83 | job := resp.Jobs[0] 84 | assert.Equal(t, job.Name, "foo") 85 | assert.EqualValues(t, job.Count, 2) 86 | assert.EqualValues(t, job.CountSuccess, 1) 87 | assert.EqualValues(t, job.CountValidationError, 1) 88 | assert.EqualValues(t, job.CountError, 0) 89 | assert.EqualValues(t, job.CountPanic, 0) 90 | assert.EqualValues(t, job.CountJunk, 0) 91 | assert.EqualValues(t, job.NanosSum, 14443) 92 | assert.EqualValues(t, job.NanosMin, 5678) 93 | assert.EqualValues(t, job.NanosMax, 8765) 94 | assert.InDelta(t, job.NanosAvg, 7221.5, 0.01) 95 | assert.InDelta(t, job.NanosSumSquares, 1.09064909e+08, 0.01) 96 | assert.InDelta(t, job.NanosStdDev, 2182.8386, 0.01) 97 | } 98 | 99 | func testHosts(t *testing.T, hd *HealthD) { 100 | recorder := httptest.NewRecorder() 101 | request, _ := http.NewRequest("GET", "/healthd/hosts", nil) 102 | hd.apiRouter().ServeHTTP(recorder, request) 103 | assert.Equal(t, 200, recorder.Code) 104 | 105 | var resp ApiResponseHosts 106 | err := json.Unmarshal(recorder.Body.Bytes(), &resp) 107 | 108 | assert.NoError(t, err) 109 | assert.Equal(t, len(resp.Hosts), 2) 110 | assert.Equal(t, resp.Hosts[0].HostPort, ":6050") 111 | assert.Equal(t, resp.Hosts[1].HostPort, ":6051") 112 | 113 | for _, hs := range resp.Hosts { 114 | assert.WithinDuration(t, hs.LastCheckTime, time.Now(), time.Second*2) 115 | assert.WithinDuration(t, hs.FirstSuccessfulResponse, time.Now(), time.Second*2) 116 | assert.WithinDuration(t, hs.LastSuccessfulResponse, time.Now(), time.Second*2) 117 | assert.EqualValues(t, hs.LastInstanceId, health.Identifier) 118 | assert.EqualValues(t, hs.LastIntervalDuration, time.Minute) 119 | assert.EqualValues(t, hs.LastCode, 200) 120 | assert.Equal(t, hs.LastErr, "") 121 | } 122 | } 123 | 124 | // assertFooBarAggregation asserts that intAgg is the aggregation (generally) of the stuff created in TestHealthD 125 | func assertFooBarAggregation(t *testing.T, intAgg *health.IntervalAggregation) { 126 | assert.EqualValues(t, intAgg.Events["bar"], 2) 127 | assert.EqualValues(t, intAgg.Timers["baz"].Count, 2) 128 | 129 | job := intAgg.Jobs["foo"] 130 | assert.NotNil(t, job) 131 | assert.EqualValues(t, job.Count, 2) 132 | assert.EqualValues(t, job.CountSuccess, 1) 133 | assert.EqualValues(t, job.CountValidationError, 1) 134 | } 135 | -------------------------------------------------------------------------------- /interval_aggregation_clone_test.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | "time" 7 | 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | type eventErr struct { 12 | event string 13 | err error 14 | } 15 | 16 | func TestClone(t *testing.T) { 17 | setNowMock("2011-09-09T23:36:13Z") 18 | defer resetNowMock() 19 | 
20 | a := aggregatorWithData() 21 | intAgg := a.intervalAggregations[0] 22 | assertAggregationData(t, intAgg) 23 | clonedAgg := intAgg.Clone() 24 | assertAggregationData(t, clonedAgg) 25 | 26 | // Let's add some data to intAgg and make sure it doesn't propagate to clonedAgg 27 | a.EmitEvent("foo", "bar") 28 | a.EmitTiming("foo", "bar", 100) 29 | a.EmitEventErr("foo", "bar", fmt.Errorf("hi")) 30 | a.EmitGauge("foo", "bar", 3.14) 31 | a.EmitComplete("foo", Error, 99) 32 | 33 | assert.Equal(t, 301, len(intAgg.Jobs)) 34 | 35 | assertAggregationData(t, clonedAgg) 36 | } 37 | 38 | func BenchmarkClone(b *testing.B) { 39 | setNowMock("2011-09-09T23:36:13Z") 40 | defer resetNowMock() 41 | 42 | a := aggregatorWithData() 43 | intAgg := a.intervalAggregations[0] 44 | 45 | b.ResetTimer() 46 | for i := 0; i < b.N; i++ { 47 | intAgg.Clone() 48 | } 49 | } 50 | 51 | func assertAggregationData(t *testing.T, intAgg *IntervalAggregation) { 52 | assert.Equal(t, 300, len(intAgg.Jobs)) 53 | assert.Equal(t, 1200, len(intAgg.Events)) 54 | assert.Equal(t, 1200, len(intAgg.Timers)) 55 | assert.Equal(t, 1200, len(intAgg.Gauges)) 56 | assert.Equal(t, 1200, len(intAgg.EventErrs)) 57 | 58 | // Spot-check events: 59 | assert.EqualValues(t, 1, intAgg.Events["event0"]) 60 | 61 | // Spot check gauges: 62 | assert.EqualValues(t, 3.14, intAgg.Gauges["gauge0"]) 63 | 64 | // Spot-check timings: 65 | assert.EqualValues(t, 1, intAgg.Timers["timing0"].Count) 66 | assert.EqualValues(t, 12, intAgg.Timers["timing0"].NanosSum) 67 | 68 | // Spot-check event-errs: 69 | assert.EqualValues(t, 1, intAgg.EventErrs["err0"].Count) 70 | assert.Equal(t, []error{fmt.Errorf("wat")}, intAgg.EventErrs["err0"].getErrorSamples()) 71 | 72 | // Spot-check jobs: 73 | job := intAgg.Jobs["job0"] 74 | assert.EqualValues(t, 1, job.CountSuccess) 75 | assert.EqualValues(t, 0, job.CountError) 76 | assert.EqualValues(t, 1, job.Events["event0"]) 77 | assert.EqualValues(t, 0, job.Events["event4"]) 78 | assert.EqualValues(t, 3.14, job.Gauges["gauge0"]) 79 | assert.EqualValues(t, 0.0, job.Gauges["gauge4"]) 80 | assert.EqualValues(t, 1, job.Timers["timing0"].Count) 81 | assert.EqualValues(t, 12, job.Timers["timing0"].NanosSum) 82 | assert.EqualValues(t, 1, job.EventErrs["err0"].Count) 83 | assert.Equal(t, []error{fmt.Errorf("wat")}, job.EventErrs["err0"].getErrorSamples()) 84 | 85 | // Nothing foo or bar related 86 | _, ok := intAgg.Jobs["foo"] 87 | assert.False(t, ok) 88 | assert.EqualValues(t, 0, intAgg.Events["bar"]) 89 | assert.Nil(t, intAgg.Timers["bar"]) 90 | assert.Nil(t, intAgg.EventErrs["bar"]) 91 | 92 | } 93 | 94 | func aggregatorWithData() *aggregator { 95 | a := newAggregator(time.Minute, time.Minute*5) 96 | 97 | // We want 300 jobs 98 | // Each job will have 5 events, but we want 1200 events total 99 | // Each job will have 5 timers, but we want 1200 timers total 100 | // Each job will have 5 gauges, but we want 1200 gauges total 101 | // Each job will have 5 errs, but we want 1200 errs total 102 | // Given this 300/1200 dichotomy, 103 | // - the first job will have 4 events, the next job 4 events, etc. 
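// (In the loops below each job therefore emits 4 of each kind — the 1200 names are
// spread evenly over the 300 jobs — plus one Complete per job.)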
104 | 105 | jobs := []string{} 106 | for i := 0; i < 300; i++ { 107 | jobs = append(jobs, fmt.Sprintf("job%d", i)) 108 | } 109 | 110 | events := []string{} 111 | for i := 0; i < 1200; i++ { 112 | events = append(events, fmt.Sprintf("event%d", i)) 113 | } 114 | 115 | timings := []string{} 116 | for i := 0; i < 1200; i++ { 117 | timings = append(timings, fmt.Sprintf("timing%d", i)) 118 | } 119 | 120 | gauges := []string{} 121 | for i := 0; i < 1200; i++ { 122 | gauges = append(gauges, fmt.Sprintf("gauge%d", i)) 123 | } 124 | 125 | eventErrs := []eventErr{} 126 | for i := 0; i < 1200; i++ { 127 | eventErrs = append(eventErrs, eventErr{ 128 | event: fmt.Sprintf("err%d", i), 129 | err: fmt.Errorf("wat"), 130 | }) 131 | } 132 | 133 | cur := 0 134 | for _, j := range jobs { 135 | for i := 0; i < 4; i++ { 136 | a.EmitEvent(j, events[cur]) 137 | cur++ 138 | } 139 | } 140 | 141 | cur = 0 142 | for _, j := range jobs { 143 | for i := 0; i < 4; i++ { 144 | a.EmitEventErr(j, eventErrs[cur].event, eventErrs[cur].err) 145 | cur++ 146 | } 147 | } 148 | 149 | cur = 0 150 | for _, j := range jobs { 151 | for i := 0; i < 4; i++ { 152 | a.EmitTiming(j, timings[cur], 12) 153 | cur++ 154 | } 155 | } 156 | 157 | cur = 0 158 | for _, j := range jobs { 159 | for i := 0; i < 4; i++ { 160 | a.EmitGauge(j, gauges[cur], 3.14) 161 | cur++ 162 | } 163 | } 164 | 165 | for _, j := range jobs { 166 | a.EmitComplete(j, Success, 12) 167 | } 168 | 169 | return a 170 | } 171 | -------------------------------------------------------------------------------- /sinks/bugsnag/api.go: -------------------------------------------------------------------------------- 1 | package bugsnag 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "fmt" 7 | "github.com/gocraft/health/stack" 8 | "io/ioutil" 9 | "net/http" 10 | ) 11 | 12 | type Config struct { 13 | // Your Bugsnag API key, e.g. "c9d60ae4c7e70c4b6c4ebd3e8056d2b8". You can 14 | // find this by clicking Settings on https://bugsnag.com/. 15 | APIKey string 16 | 17 | // The Endpoint to notify about crashes. This defaults to 18 | // "https://notify.bugsnag.com/", if you're using Bugsnag Enterprise then 19 | // set it to your internal Bugsnag endpoint. 20 | Endpoint string 21 | 22 | // The current release stage. This defaults to "production" and is used to 23 | // filter errors in the Bugsnag dashboard. 24 | ReleaseStage string 25 | 26 | // The currently running version of the app. This is used to filter errors 27 | // in the Bugsnag dasboard. If you set this then Bugsnag will only re-open 28 | // resolved errors if they happen in different app versions. 29 | AppVersion string 30 | 31 | // The hostname of the current server. This defaults to the return value of 32 | // os.Hostname() and is graphed in the Bugsnag dashboard. 
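// newPayload copies this value into each event's device.hostname field.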
33 | Hostname string 34 | } 35 | 36 | type payload struct { 37 | APIKey string `json:"apiKey"` 38 | 39 | Notifier struct { 40 | Name string `json:"name"` 41 | Version string `json:"version"` 42 | URL string `json:"url"` 43 | } `json:"notifier"` 44 | 45 | Events []payloadEvent `json:"events"` 46 | } 47 | 48 | type payloadEvent struct { 49 | PayloadVersion string `json:"payloadVersion"` 50 | Exceptions []payloadException `json:"exceptions"` 51 | 52 | // threads 53 | 54 | Context string `json:"context"` 55 | 56 | // groupingHash 57 | // severity 58 | // user 59 | 60 | App struct { 61 | // version 62 | ReleaseStage string `json:"releaseStage"` 63 | } `json:"app"` 64 | 65 | Device struct { 66 | //osVersion 67 | Hostname string `json:"hostname"` 68 | } `json:"device"` 69 | 70 | // meta data 71 | 72 | Metadata struct { 73 | Request request `json:"request"` 74 | Kvs map[string]string `json:"kvs"` 75 | } `json:"metaData"` 76 | } 77 | 78 | type payloadException struct { 79 | ErrorClass string `json:"errorClass"` 80 | Message string `json:"message"` 81 | Stacktrace []payloadFrame `json:"stacktrace"` 82 | } 83 | 84 | type payloadFrame struct { 85 | File string `json:"file"` 86 | LineNumber int `json:"lineNumber"` 87 | Method string `json:"method"` 88 | InProject bool `json:"inProject"` 89 | //code 90 | } 91 | 92 | type request struct { 93 | Url string `json:"url"` 94 | Parameters string `json:"parameters"` 95 | } 96 | 97 | // Notify will send the error and stack trace to Bugsnag. Note that this doesn't take advantage of all of Bugsnag's capabilities. 98 | func Notify(config *Config, jobName string, eventName string, err error, trace *stack.Trace, kvs map[string]string) error { 99 | 100 | // Make a struct that serializes to the JSON needed for the API request to bugsnag 101 | p := newPayload(config, jobName, eventName, err, trace, kvs) 102 | 103 | // JSON serialize it 104 | data, err := json.MarshalIndent(p, "", "\t") 105 | if err != nil { 106 | return err 107 | } 108 | 109 | // Post it to the server: 110 | client := http.Client{} 111 | resp, err := client.Post(config.Endpoint, "application/json", bytes.NewBuffer(data)) 112 | if err != nil { 113 | return err 114 | } 115 | body, err := ioutil.ReadAll(resp.Body) 116 | if err != nil { 117 | return err 118 | } 119 | if string(body) != "OK" { 120 | return fmt.Errorf("response from bugsnag wasn't 'OK'") 121 | } 122 | 123 | return nil 124 | } 125 | 126 | func newPayload(config *Config, jobName string, eventName string, err error, trace *stack.Trace, kvs map[string]string) *payload { 127 | except := payloadException{ 128 | ErrorClass: eventName, 129 | Message: err.Error(), 130 | } 131 | for _, frame := range trace.Frames() { 132 | pf := payloadFrame{ 133 | File: frame.File, 134 | LineNumber: frame.LineNumber, 135 | Method: frame.Package + ":" + frame.Name, 136 | InProject: !frame.IsSystemPackage, 137 | } 138 | except.Stacktrace = append(except.Stacktrace, pf) 139 | } 140 | 141 | evt := payloadEvent{ 142 | PayloadVersion: "2", 143 | Exceptions: []payloadException{except}, 144 | Context: jobName, 145 | } 146 | evt.App.ReleaseStage = config.ReleaseStage 147 | evt.Device.Hostname = config.Hostname 148 | evt.Metadata.Kvs = kvs 149 | 150 | if requestUrl, requestUrlExists := kvs["request"]; requestUrlExists { 151 | evt.Metadata.Request.Url = requestUrl 152 | } 153 | 154 | if formData, formDataExists := kvs["formdata"]; formDataExists { 155 | evt.Metadata.Request.Parameters = formData 156 | } 157 | 158 | p := payload{ 159 | APIKey: config.APIKey, 160 | Events: 
[]payloadEvent{evt}, 161 | } 162 | p.Notifier.Name = "health" 163 | p.Notifier.Version = "1.0" 164 | p.Notifier.URL = "https://www.github.com/gocraft/health" 165 | 166 | return &p 167 | } 168 | -------------------------------------------------------------------------------- /interval_aggregation.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "reflect" 5 | "time" 6 | ) 7 | 8 | // IntervalAggregation will hold data for a given aggregation interval. 9 | type IntervalAggregation struct { 10 | // The start time of the interval 11 | IntervalStart time.Time `json:"interval_start"` 12 | 13 | // SerialNumber increments every time the aggregation changes. It does not increment if the aggregation does not change. 14 | SerialNumber int64 `json:"serial_number"` 15 | 16 | // Jobs hold a map of job name -> data about that job. 17 | // This includes both primary-job information (success vs error, et all) as well as 18 | // scoping timers/counters by the job. 19 | Jobs map[string]*JobAggregation `json:"jobs"` 20 | 21 | // aggregationMaps will hold event/timer information that is not nested per-job. 22 | aggregationMaps 23 | } 24 | 25 | type aggregationMaps struct { 26 | Timers map[string]*TimerAggregation `json:"timers"` 27 | Gauges map[string]float64 `json:"gauges"` 28 | Events map[string]int64 `json:"events"` 29 | EventErrs map[string]*ErrorCounter `json:"event_errs"` 30 | } 31 | 32 | type JobAggregation struct { 33 | aggregationMaps 34 | TimerAggregation 35 | 36 | CountSuccess int64 `json:"count_success"` 37 | CountValidationError int64 `json:"count_validation_error"` 38 | CountPanic int64 `json:"count_panic"` 39 | CountError int64 `json:"count_error"` 40 | CountJunk int64 `json:"count_junk"` 41 | } 42 | 43 | type TimerAggregation struct { 44 | Count int64 `json:"count"` 45 | NanosSum int64 `json:"nanos_sum"` 46 | NanosSumSquares float64 `json:"nanos_sum_squares"` // 3seconds^2 overflows an int64 47 | NanosMin int64 `json:"nanos_min"` 48 | NanosMax int64 `json:"nanos_max"` 49 | } 50 | 51 | type ErrorCounter struct { 52 | Count int64 `json:"count"` 53 | 54 | // Let's keep a ring buffer of some errors. I feel like this isn't the best data structure / plan of attack here but works for now. 
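// addError only advances the ring index when the incoming error differs
// (via reflect.DeepEqual) from the most recent sample, so a burst of identical
// errors occupies a single slot instead of evicting older, distinct samples.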
55 | errorSamples [5]error 56 | errorSampleIndex int 57 | } 58 | 59 | func NewIntervalAggregation(intervalStart time.Time) *IntervalAggregation { 60 | intAgg := &IntervalAggregation{ 61 | IntervalStart: intervalStart, 62 | Jobs: make(map[string]*JobAggregation), 63 | } 64 | intAgg.initAggregationMaps() 65 | 66 | return intAgg 67 | } 68 | 69 | func (am *aggregationMaps) initAggregationMaps() { 70 | am.Timers = make(map[string]*TimerAggregation) 71 | am.Gauges = make(map[string]float64) 72 | am.Events = make(map[string]int64) 73 | am.EventErrs = make(map[string]*ErrorCounter) 74 | } 75 | 76 | func (am *aggregationMaps) getCounterErrs(event string) *ErrorCounter { 77 | ce := am.EventErrs[event] 78 | if ce == nil { 79 | ce = &ErrorCounter{} 80 | am.EventErrs[event] = ce 81 | } 82 | return ce 83 | } 84 | 85 | func (am *aggregationMaps) getTimers(event string) *TimerAggregation { 86 | t := am.Timers[event] 87 | if t == nil { 88 | t = &TimerAggregation{} 89 | am.Timers[event] = t 90 | } 91 | return t 92 | } 93 | 94 | func (ec *ErrorCounter) incrementAndAddError(inputErr error) { 95 | ec.Count++ 96 | ec.addError(inputErr) 97 | } 98 | 99 | func (ec *ErrorCounter) addError(inputErr error) { 100 | lastErr := ec.errorSamples[ec.errorSampleIndex] 101 | if lastErr == nil { 102 | ec.errorSamples[ec.errorSampleIndex] = inputErr 103 | } else if !reflect.DeepEqual(lastErr, inputErr) { 104 | n := len(ec.errorSamples) 105 | ec.errorSampleIndex = (ec.errorSampleIndex + 1) % n 106 | ec.errorSamples[ec.errorSampleIndex] = inputErr 107 | } 108 | } 109 | 110 | func (ec *ErrorCounter) getErrorSamples() []error { 111 | // Count how many non-nil errors are there so we can make a slice of the right size 112 | count := 0 113 | for _, e := range ec.errorSamples { 114 | if e != nil { 115 | count++ 116 | } 117 | } 118 | ret := make([]error, 0, count) 119 | 120 | // Put non-nil errors in slice 121 | for _, e := range ec.errorSamples { 122 | if e != nil { 123 | ret = append(ret, e) 124 | } 125 | } 126 | return ret 127 | } 128 | 129 | func (ia *IntervalAggregation) getJobAggregation(job string) *JobAggregation { 130 | jobAgg := ia.Jobs[job] 131 | if jobAgg == nil { 132 | jobAgg = &JobAggregation{} 133 | jobAgg.initAggregationMaps() 134 | ia.Jobs[job] = jobAgg 135 | } 136 | return jobAgg 137 | } 138 | 139 | func (a *TimerAggregation) ingest(nanos int64) { 140 | a.Count++ 141 | a.NanosSum += nanos 142 | floatNano := float64(nanos) 143 | a.NanosSumSquares += (floatNano * floatNano) 144 | if a.Count == 1 || nanos < a.NanosMin { 145 | a.NanosMin = nanos 146 | } 147 | if a.Count == 1 || nanos > a.NanosMax { 148 | a.NanosMax = nanos 149 | } 150 | } 151 | 152 | func (a *JobAggregation) ingest(status CompletionStatus, nanos int64) { 153 | a.TimerAggregation.ingest(nanos) 154 | if status == Success { 155 | a.CountSuccess++ 156 | } else if status == ValidationError { 157 | a.CountValidationError++ 158 | } else if status == Panic { 159 | a.CountPanic++ 160 | } else if status == Error { 161 | a.CountError++ 162 | } else if status == Junk { 163 | a.CountJunk++ 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /aggregator.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "time" 5 | ) 6 | 7 | type aggregator struct { 8 | // How long is each aggregation interval. Eg, 1 minute 9 | intervalDuration time.Duration 10 | 11 | // Retain controls how many metrics interval we keep. 
Eg, 5 minutes 12 | retain time.Duration 13 | 14 | // maxIntervals is the maximum length of intervals. 15 | // It is retain / interval. 16 | maxIntervals int 17 | 18 | // intervals is a slice of the retained intervals 19 | intervalAggregations []*IntervalAggregation 20 | } 21 | 22 | func startAggregator(intervalDuration time.Duration, retain time.Duration, sink *JsonPollingSink) { 23 | cmdChan := sink.cmdChan 24 | doneChan := sink.doneChan 25 | intervalsChanChan := sink.intervalsChanChan 26 | ticker := time.Tick(1 * time.Second) 27 | 28 | agg := newAggregator(intervalDuration, retain) 29 | 30 | AGGREGATE_LOOP: 31 | for { 32 | select { 33 | case <-doneChan: 34 | sink.doneDoneChan <- 1 35 | break AGGREGATE_LOOP 36 | case cmd := <-cmdChan: 37 | if cmd.Kind == cmdKindEvent { 38 | agg.EmitEvent(cmd.Job, cmd.Event) 39 | } else if cmd.Kind == cmdKindEventErr { 40 | agg.EmitEventErr(cmd.Job, cmd.Event, cmd.Err) 41 | } else if cmd.Kind == cmdKindTiming { 42 | agg.EmitTiming(cmd.Job, cmd.Event, cmd.Nanos) 43 | } else if cmd.Kind == cmdKindGauge { 44 | agg.EmitGauge(cmd.Job, cmd.Event, cmd.Value) 45 | } else if cmd.Kind == cmdKindComplete { 46 | agg.EmitComplete(cmd.Job, cmd.Status, cmd.Nanos) 47 | } 48 | case <-ticker: 49 | agg.getIntervalAggregation() // this has the side effect of sliding the interval window if necessary. 50 | case intervalsChan := <-intervalsChanChan: 51 | intervalsChan <- agg.memorySafeIntervals() 52 | } 53 | } 54 | } 55 | 56 | func newAggregator(intervalDuration time.Duration, retain time.Duration) *aggregator { 57 | maxIntervals := int(retain / intervalDuration) 58 | return &aggregator{ 59 | intervalDuration: intervalDuration, 60 | retain: retain, 61 | maxIntervals: maxIntervals, 62 | intervalAggregations: make([]*IntervalAggregation, 0, maxIntervals), 63 | } 64 | } 65 | 66 | func (a *aggregator) memorySafeIntervals() []*IntervalAggregation { 67 | ret := make([]*IntervalAggregation, 0, len(a.intervalAggregations)) 68 | curAgg := a.getIntervalAggregation() 69 | 70 | for _, intAgg := range a.intervalAggregations { 71 | if intAgg == curAgg { 72 | ret = append(ret, intAgg.Clone()) 73 | } else { 74 | ret = append(ret, intAgg) 75 | } 76 | } 77 | 78 | return ret 79 | } 80 | 81 | func (a *aggregator) EmitEvent(job string, event string) { 82 | intAgg := a.getIntervalAggregation() 83 | intAgg.Events[event] = intAgg.Events[event] + 1 84 | jobAgg := intAgg.getJobAggregation(job) 85 | jobAgg.Events[event] = jobAgg.Events[event] + 1 86 | intAgg.SerialNumber++ 87 | } 88 | 89 | func (a *aggregator) EmitEventErr(job string, event string, inputErr error) { 90 | intAgg := a.getIntervalAggregation() 91 | errc := intAgg.getCounterErrs(event) 92 | errc.incrementAndAddError(inputErr) 93 | jobAgg := intAgg.getJobAggregation(job) 94 | jerrc := jobAgg.getCounterErrs(event) 95 | jerrc.incrementAndAddError(inputErr) 96 | intAgg.SerialNumber++ 97 | } 98 | 99 | func (a *aggregator) EmitTiming(job string, event string, nanos int64) { 100 | intAgg := a.getIntervalAggregation() 101 | t := intAgg.getTimers(event) 102 | t.ingest(nanos) 103 | jobAgg := intAgg.getJobAggregation(job) 104 | jt := jobAgg.getTimers(event) 105 | jt.ingest(nanos) 106 | intAgg.SerialNumber++ 107 | } 108 | 109 | func (a *aggregator) EmitGauge(job string, event string, value float64) { 110 | intAgg := a.getIntervalAggregation() 111 | intAgg.Gauges[event] = value 112 | jobAgg := intAgg.getJobAggregation(job) 113 | jobAgg.Gauges[event] = value 114 | intAgg.SerialNumber++ 115 | } 116 | 117 | func (a *aggregator) EmitComplete(job string, 
status CompletionStatus, nanos int64) { 118 | intAgg := a.getIntervalAggregation() 119 | jobAgg := intAgg.getJobAggregation(job) 120 | jobAgg.ingest(status, nanos) 121 | intAgg.SerialNumber++ 122 | } 123 | 124 | func (a *aggregator) getIntervalAggregation() *IntervalAggregation { 125 | intervalStart := now().Truncate(a.intervalDuration) 126 | 127 | n := len(a.intervalAggregations) 128 | if n > 0 && a.intervalAggregations[n-1].IntervalStart == intervalStart { 129 | return a.intervalAggregations[n-1] 130 | } 131 | 132 | return a.createIntervalAggregation(intervalStart) 133 | } 134 | 135 | func (a *aggregator) createIntervalAggregation(interval time.Time) *IntervalAggregation { 136 | // Make new interval: 137 | current := NewIntervalAggregation(interval) 138 | 139 | // If we've reached our max intervals, and we're going to shift everything down, then set the last one 140 | n := len(a.intervalAggregations) 141 | if n == a.maxIntervals { 142 | for i := 1; i < n; i++ { 143 | a.intervalAggregations[i-1] = a.intervalAggregations[i] 144 | } 145 | a.intervalAggregations[n-1] = current 146 | } else { 147 | a.intervalAggregations = append(a.intervalAggregations, current) 148 | } 149 | 150 | return current 151 | } 152 | 153 | var nowMock time.Time 154 | 155 | func now() time.Time { 156 | if nowMock.IsZero() { 157 | return time.Now() 158 | } 159 | return nowMock 160 | } 161 | 162 | func setNowMock(t string) { 163 | var err error 164 | nowMock, err = time.Parse(time.RFC3339, t) 165 | if err != nil { 166 | panic(err) 167 | } 168 | } 169 | 170 | func resetNowMock() { 171 | nowMock = time.Time{} 172 | } 173 | -------------------------------------------------------------------------------- /cmd/healthtop/jobs.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "github.com/buger/goterm" 7 | "github.com/gocraft/health/healthd" 8 | "io/ioutil" 9 | "net/http" 10 | "net/url" 11 | "strings" 12 | "time" 13 | ) 14 | 15 | type jobOptions struct { 16 | Sort string 17 | Name string 18 | } 19 | 20 | func jobsLoop(opts *jobOptions) { 21 | secondTicker := time.Tick(1 * time.Second) 22 | 23 | var lastApiResponse *healthd.ApiResponseJobs 24 | var hStatus healthdStatus 25 | 26 | responses := make(chan *healthd.ApiResponseJobs) 27 | errors := make(chan error) 28 | 29 | go pollHealthDJobs(opts, responses, errors) 30 | for { 31 | select { 32 | case <-secondTicker: 33 | go pollHealthDJobs(opts, responses, errors) 34 | printJobs(lastApiResponse, &hStatus) 35 | case resp := <-responses: 36 | lastApiResponse = resp 37 | hStatus.lastSuccessAt = time.Now() 38 | printJobs(lastApiResponse, &hStatus) 39 | case err := <-errors: 40 | hStatus.lastErrorAt = time.Now() 41 | hStatus.lastError = err 42 | } 43 | } 44 | } 45 | 46 | func pollHealthDJobs(opts *jobOptions, responses chan *healthd.ApiResponseJobs, errors chan error) { 47 | var body []byte 48 | 49 | // limit. If name is not set, then limit it to the terminal height. 50 | // if name IS set, then don't limit it b/c we will currently filter in-memory 51 | var limit uint 52 | if opts.Name == "" { 53 | limit = maxRows() 54 | } 55 | 56 | values := url.Values{} 57 | if opts.Sort != "" { 58 | values.Add("sort", opts.Sort) 59 | } 60 | if limit != 0 { 61 | values.Add("limit", fmt.Sprint(limit)) 62 | } 63 | 64 | uri := "http://" + sourceHostPort + "/healthd/jobs" 65 | params := values.Encode() 66 | if params != "" { 67 | uri = uri + "?" 
+ params 68 | } 69 | 70 | resp, err := http.Get(uri) 71 | if err != nil { 72 | errors <- err 73 | return 74 | } 75 | defer resp.Body.Close() 76 | body, err = ioutil.ReadAll(resp.Body) 77 | if err != nil { 78 | errors <- err 79 | return 80 | } 81 | 82 | var response healthd.ApiResponseJobs 83 | if err := json.Unmarshal(body, &response); err != nil { 84 | errors <- err 85 | return 86 | } 87 | 88 | if opts.Name != "" { 89 | filterJobsByName(&response, opts.Name) 90 | } 91 | 92 | responses <- &response 93 | } 94 | 95 | // Given the api response, remove any job entries that don't have 'name' in them. 96 | func filterJobsByName(resp *healthd.ApiResponseJobs, name string) { 97 | filteredSlice := []*healthd.Job{} 98 | 99 | for _, job := range resp.Jobs { 100 | if strings.Contains(job.Name, name) { 101 | filteredSlice = append(filteredSlice, job) 102 | } 103 | } 104 | 105 | resp.Jobs = filteredSlice 106 | } 107 | 108 | func printJobs(lastApiResponse *healthd.ApiResponseJobs, status *healthdStatus) { 109 | goterm.Clear() // Clear current screen 110 | goterm.MoveCursor(1, 1) 111 | defer goterm.Flush() 112 | goterm.Println("Current Time:", status.FmtNow(), " Status:", status.FmtStatus()) 113 | 114 | if lastApiResponse == nil { 115 | goterm.Println("no data yet") 116 | return 117 | } 118 | 119 | columns := []string{ 120 | "Job", 121 | // "Jobs/Second", //minute? flag? 122 | "Total Count", 123 | "Success", 124 | "ValidationError", 125 | "Panic", 126 | "Error", 127 | "Junk", 128 | "Avg Response Time", 129 | "Stddev", 130 | "Min", 131 | "Max", 132 | "Total", 133 | } 134 | 135 | for i, s := range columns { 136 | columns[i] = goterm.Bold(goterm.Color(s, goterm.BLACK)) 137 | } 138 | 139 | table := goterm.NewTable(0, goterm.Width()-1, 5, ' ', 0) 140 | fmt.Fprintf(table, "%s\n", strings.Join(columns, "\t")) 141 | 142 | for _, job := range lastApiResponse.Jobs { 143 | printJob(table, job) 144 | } 145 | 146 | goterm.Println(table) 147 | } 148 | 149 | func printJob(table *goterm.Table, job *healthd.Job) { 150 | fullSuccess := job.Count == job.CountSuccess 151 | printCellString(job.Name, table, true, false, false) 152 | printCellInt64(job.Count, table, false, fullSuccess, false) 153 | printCellInt64(job.CountSuccess, table, fullSuccess, fullSuccess, false) 154 | printCellInt64(job.CountValidationError, table, job.CountValidationError > 0, false, job.CountValidationError > 0) 155 | printCellInt64(job.CountPanic, table, job.CountPanic > 0, false, job.CountPanic > 0) 156 | printCellInt64(job.CountError, table, job.CountError > 0, false, job.CountError > 0) 157 | printCellInt64(job.CountJunk, table, job.CountJunk > 0, false, job.CountJunk > 0) 158 | printCellNanos(int64(job.NanosAvg), table, true, false, false) 159 | printCellNanos(int64(job.NanosStdDev), table, false, false, false) 160 | printCellNanos(job.NanosMin, table, false, false, false) 161 | printCellNanos(job.NanosMax, table, false, false, false) 162 | printCellNanos(job.NanosSum, table, false, false, false) 163 | fmt.Fprintf(table, "\n") 164 | } 165 | 166 | func printCellNanos(nanos int64, table *goterm.Table, isBold, isGreen, isRed bool) { 167 | var units string 168 | switch { 169 | case nanos > 2000000: 170 | units = "ms" 171 | nanos /= 1000000 172 | case nanos > 1000: 173 | units = "μs" 174 | nanos /= 1000 175 | default: 176 | units = "ns" 177 | } 178 | 179 | printCellString(fmt.Sprintf("%d %s", nanos, units), table, isBold, isGreen, isRed) 180 | } 181 | 182 | func printCellInt64(val int64, table *goterm.Table, isBold, isGreen, isRed bool) { 183 | 
printCellString(fmt.Sprint(val), table, isBold, isGreen, isRed) 184 | } 185 | 186 | func printCellString(text string, table *goterm.Table, isBold, isGreen, isRed bool) { 187 | color := goterm.BLACK 188 | if isGreen { 189 | color = goterm.GREEN 190 | } else if isRed { 191 | color = goterm.RED 192 | } 193 | 194 | fmt.Fprintf(table, "%s\t", format(text, color, isBold)) 195 | } 196 | 197 | func format(text string, color int, isBold bool) string { 198 | if isBold { 199 | return goterm.Bold(goterm.Color(text, color)) 200 | } else { 201 | return normal(goterm.Color(text, color)) 202 | } 203 | 204 | } 205 | 206 | func normal(text string) string { 207 | return fmt.Sprintf("\033[0m%s\033[0m", text) 208 | } 209 | 210 | // Returns the max amount of metrics/rows we can display 211 | // This is the # of rows in the terminal minus 2 (for time / stats + grid header) 212 | // To elimate any weird cases where the terminal is super short, we'll return a min rows of 3 213 | func maxRows() uint { 214 | n := goterm.Height() - 2 215 | if n < 3 { 216 | n = 3 217 | } 218 | return uint(n) 219 | } 220 | -------------------------------------------------------------------------------- /health.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "runtime" 7 | "time" 8 | ) 9 | 10 | // This is primarily used as syntactic sugar for libs outside this app for passing in maps easily. 11 | // We don't rely on it internally b/c I don't want to tie interfaces to the 'health' package. 12 | type Kvs map[string]string 13 | 14 | type EventReceiver interface { 15 | Event(eventName string) 16 | EventKv(eventName string, kvs map[string]string) 17 | EventErr(eventName string, err error) error 18 | EventErrKv(eventName string, err error, kvs map[string]string) error 19 | Timing(eventName string, nanoseconds int64) 20 | TimingKv(eventName string, nanoseconds int64, kvs map[string]string) 21 | Gauge(eventName string, value float64) 22 | GaugeKv(eventName string, value float64, kvs map[string]string) 23 | } 24 | 25 | type Stream struct { 26 | Sinks []Sink 27 | KeyValues map[string]string 28 | *Job 29 | } 30 | 31 | type Job struct { 32 | Stream *Stream 33 | JobName string 34 | KeyValues map[string]string 35 | Start time.Time 36 | } 37 | 38 | type CompletionStatus int 39 | 40 | const ( 41 | Success CompletionStatus = iota 42 | ValidationError 43 | Panic 44 | Error 45 | Junk 46 | ) 47 | 48 | var completionStatusToString = map[CompletionStatus]string{ 49 | Success: "success", 50 | ValidationError: "validation_error", 51 | Panic: "panic", 52 | Error: "error", 53 | Junk: "junk", 54 | } 55 | 56 | func (cs CompletionStatus) String() string { 57 | return completionStatusToString[cs] 58 | } 59 | 60 | type Sink interface { 61 | EmitEvent(job string, event string, kvs map[string]string) 62 | EmitEventErr(job string, event string, err error, kvs map[string]string) 63 | EmitTiming(job string, event string, nanoseconds int64, kvs map[string]string) 64 | EmitComplete(job string, status CompletionStatus, nanoseconds int64, kvs map[string]string) 65 | EmitGauge(job string, event string, value float64, kvs map[string]string) 66 | } 67 | 68 | func NewStream() *Stream { 69 | s := &Stream{} 70 | s.Job = s.NewJob("general") 71 | return s 72 | } 73 | 74 | func (s *Stream) AddSink(sink Sink) *Stream { 75 | s.Sinks = append(s.Sinks, sink) 76 | return s 77 | } 78 | 79 | func (s *Stream) KeyValue(key string, value string) *Stream { 80 | if s.KeyValues == nil { 81 | 
s.KeyValues = make(map[string]string) 82 | } 83 | s.KeyValues[key] = value 84 | return s 85 | } 86 | 87 | func (s *Stream) NewJob(name string) *Job { 88 | return &Job{ 89 | Stream: s, 90 | JobName: name, 91 | Start: time.Now(), 92 | } 93 | } 94 | 95 | func (j *Job) KeyValue(key string, value string) *Job { 96 | if j.KeyValues == nil { 97 | j.KeyValues = make(map[string]string) 98 | } 99 | j.KeyValues[key] = value 100 | return j 101 | } 102 | 103 | func (j *Job) Event(eventName string) { 104 | allKvs := j.mergedKeyValues(nil) 105 | for _, sink := range j.Stream.Sinks { 106 | sink.EmitEvent(j.JobName, eventName, allKvs) 107 | } 108 | } 109 | 110 | func (j *Job) EventKv(eventName string, kvs map[string]string) { 111 | allKvs := j.mergedKeyValues(kvs) 112 | for _, sink := range j.Stream.Sinks { 113 | sink.EmitEvent(j.JobName, eventName, allKvs) 114 | } 115 | } 116 | 117 | func (j *Job) EventErr(eventName string, err error) error { 118 | err = wrapErr(err) 119 | allKvs := j.mergedKeyValues(nil) 120 | for _, sink := range j.Stream.Sinks { 121 | sink.EmitEventErr(j.JobName, eventName, err, allKvs) 122 | } 123 | if err, ok := err.(*UnmutedError); ok { 124 | err.Emitted = true 125 | } 126 | return err 127 | } 128 | 129 | func (j *Job) EventErrKv(eventName string, err error, kvs map[string]string) error { 130 | err = wrapErr(err) 131 | allKvs := j.mergedKeyValues(kvs) 132 | for _, sink := range j.Stream.Sinks { 133 | sink.EmitEventErr(j.JobName, eventName, err, allKvs) 134 | } 135 | if err, ok := err.(*UnmutedError); ok { 136 | err.Emitted = true 137 | } 138 | return err 139 | } 140 | 141 | func (j *Job) Timing(eventName string, nanoseconds int64) { 142 | allKvs := j.mergedKeyValues(nil) 143 | for _, sink := range j.Stream.Sinks { 144 | sink.EmitTiming(j.JobName, eventName, nanoseconds, allKvs) 145 | } 146 | } 147 | 148 | func (j *Job) TimingKv(eventName string, nanoseconds int64, kvs map[string]string) { 149 | allKvs := j.mergedKeyValues(kvs) 150 | for _, sink := range j.Stream.Sinks { 151 | sink.EmitTiming(j.JobName, eventName, nanoseconds, allKvs) 152 | } 153 | } 154 | 155 | func (j *Job) Gauge(eventName string, value float64) { 156 | allKvs := j.mergedKeyValues(nil) 157 | for _, sink := range j.Stream.Sinks { 158 | sink.EmitGauge(j.JobName, eventName, value, allKvs) 159 | } 160 | } 161 | 162 | func (j *Job) GaugeKv(eventName string, value float64, kvs map[string]string) { 163 | allKvs := j.mergedKeyValues(kvs) 164 | for _, sink := range j.Stream.Sinks { 165 | sink.EmitGauge(j.JobName, eventName, value, allKvs) 166 | } 167 | } 168 | 169 | func (j *Job) Complete(status CompletionStatus) { 170 | allKvs := j.mergedKeyValues(nil) 171 | for _, sink := range j.Stream.Sinks { 172 | sink.EmitComplete(j.JobName, status, time.Since(j.Start).Nanoseconds(), allKvs) 173 | } 174 | } 175 | 176 | func (j *Job) CompleteKv(status CompletionStatus, kvs map[string]string) { 177 | allKvs := j.mergedKeyValues(kvs) 178 | for _, sink := range j.Stream.Sinks { 179 | sink.EmitComplete(j.JobName, status, time.Since(j.Start).Nanoseconds(), allKvs) 180 | } 181 | } 182 | 183 | func (j *Job) mergedKeyValues(instanceKvs map[string]string) map[string]string { 184 | var allKvs map[string]string 185 | 186 | // Count how many maps actually have contents in them. If it's 0 or 1, we won't allocate a new map. 187 | // Also, optimistically set allKvs. We might use it or we might overwrite the value with a newly made map. 
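// When more than one source has data we build a fresh map below; later writes win,
// so per-call kvs override Job kvs, which override Stream kvs. For example, with
// Stream kvs {"env":"prod"}, Job kvs {"env":"staging"}, and instance kvs {"req":"1"},
// the merged result is {"env":"staging", "req":"1"}.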
188 | var kvCount = 0 189 | if len(j.KeyValues) > 0 { 190 | kvCount += 1 191 | allKvs = j.KeyValues 192 | } 193 | if len(j.Stream.KeyValues) > 0 { 194 | kvCount += 1 195 | allKvs = j.Stream.KeyValues 196 | } 197 | if len(instanceKvs) > 0 { 198 | kvCount += 1 199 | allKvs = instanceKvs 200 | } 201 | 202 | if kvCount > 1 { 203 | allKvs = make(map[string]string) 204 | for k, v := range j.Stream.KeyValues { 205 | allKvs[k] = v 206 | } 207 | for k, v := range j.KeyValues { 208 | allKvs[k] = v 209 | } 210 | for k, v := range instanceKvs { 211 | allKvs[k] = v 212 | } 213 | } 214 | 215 | return allKvs 216 | } 217 | 218 | func (s *Stream) Run(jobName string, f func() error) error { 219 | j := s.NewJob(jobName) 220 | return j.Run(f) 221 | } 222 | 223 | func (j *Job) Run(f func() error) (err error) { 224 | defer func() { 225 | if r := recover(); r != nil { 226 | stack := make([]byte, 4096) 227 | stack = stack[:runtime.Stack(stack, false)] 228 | 229 | // recovered value from panic() is an interface{}, and it might not be `error` 230 | // do not simply type-assert here 231 | err = errors.New(fmt.Sprint(r)) 232 | j.EventErrKv("panic", err, Kvs{"stack": string(stack)}) 233 | j.Complete(Panic) 234 | } 235 | }() 236 | 237 | err = f() 238 | if err != nil { 239 | j.Complete(Error) 240 | } else { 241 | j.Complete(Success) 242 | } 243 | 244 | return 245 | } 246 | -------------------------------------------------------------------------------- /writer_sink_test.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "github.com/stretchr/testify/assert" 7 | "regexp" 8 | "testing" 9 | ) 10 | 11 | var basicEventRegexp = regexp.MustCompile("\\[[^\\]]+\\]: job:(.+) event:(.+)") 12 | var kvsEventRegexp = regexp.MustCompile("\\[[^\\]]+\\]: job:(.+) event:(.+) kvs:\\[(.+)\\]") 13 | var basicEventErrRegexp = regexp.MustCompile("\\[[^\\]]+\\]: job:(.+) event:(.+) err:(.+)") 14 | var kvsEventErrRegexp = regexp.MustCompile("\\[[^\\]]+\\]: job:(.+) event:(.+) err:(.+) kvs:\\[(.+)\\]") 15 | var basicTimingRegexp = regexp.MustCompile("\\[[^\\]]+\\]: job:(.+) event:(.+) time:(.+)") 16 | var kvsTimingRegexp = regexp.MustCompile("\\[[^\\]]+\\]: job:(.+) event:(.+) time:(.+) kvs:\\[(.+)\\]") 17 | var basicGaugeRegexp = regexp.MustCompile("\\[[^\\]]+\\]: job:(.+) event:(.+) gauge:(.+)") 18 | var kvsGaugeRegexp = regexp.MustCompile("\\[[^\\]]+\\]: job:(.+) event:(.+) gauge:(.+) kvs:\\[(.+)\\]") 19 | var basicCompletionRegexp = regexp.MustCompile("\\[[^\\]]+\\]: job:(.+) status:(.+) time:(.+)") 20 | var kvsCompletionRegexp = regexp.MustCompile("\\[[^\\]]+\\]: job:(.+) status:(.+) time:(.+) kvs:\\[(.+)\\]") 21 | 22 | var testErr = errors.New("my test error") 23 | 24 | func BenchmarkWriterSinkEmitEvent(b *testing.B) { 25 | var by bytes.Buffer 26 | someKvs := map[string]string{"foo": "bar", "qux": "dog"} 27 | sink := WriterSink{&by} 28 | b.ResetTimer() 29 | for i := 0; i < b.N; i++ { 30 | by.Reset() 31 | sink.EmitEvent("myjob", "myevent", someKvs) 32 | } 33 | } 34 | 35 | func BenchmarkWriterSinkEmitEventErr(b *testing.B) { 36 | var by bytes.Buffer 37 | someKvs := map[string]string{"foo": "bar", "qux": "dog"} 38 | sink := WriterSink{&by} 39 | b.ResetTimer() 40 | for i := 0; i < b.N; i++ { 41 | by.Reset() 42 | sink.EmitEventErr("myjob", "myevent", testErr, someKvs) 43 | } 44 | } 45 | 46 | func BenchmarkWriterSinkEmitTiming(b *testing.B) { 47 | var by bytes.Buffer 48 | someKvs := map[string]string{"foo": "bar", "qux": "dog"} 49 | sink := 
WriterSink{&by} 50 | b.ResetTimer() 51 | for i := 0; i < b.N; i++ { 52 | by.Reset() 53 | sink.EmitTiming("myjob", "myevent", 234203, someKvs) 54 | } 55 | } 56 | 57 | func BenchmarkWriterSinkEmitComplete(b *testing.B) { 58 | var by bytes.Buffer 59 | someKvs := map[string]string{"foo": "bar", "qux": "dog"} 60 | sink := WriterSink{&by} 61 | b.ResetTimer() 62 | for i := 0; i < b.N; i++ { 63 | by.Reset() 64 | sink.EmitComplete("myjob", Success, 234203, someKvs) 65 | } 66 | } 67 | 68 | func TestWriterSinkEmitEventBasic(t *testing.T) { 69 | var b bytes.Buffer 70 | sink := WriterSink{&b} 71 | sink.EmitEvent("myjob", "myevent", nil) 72 | 73 | str := b.String() 74 | 75 | result := basicEventRegexp.FindStringSubmatch(str) 76 | assert.Equal(t, 3, len(result)) 77 | assert.Equal(t, "myjob", result[1]) 78 | assert.Equal(t, "myevent", result[2]) 79 | } 80 | 81 | func TestWriterSinkEmitEventKvs(t *testing.T) { 82 | var b bytes.Buffer 83 | sink := WriterSink{&b} 84 | sink.EmitEvent("myjob", "myevent", map[string]string{"wat": "ok", "another": "thing"}) 85 | 86 | str := b.String() 87 | 88 | result := kvsEventRegexp.FindStringSubmatch(str) 89 | assert.Equal(t, 4, len(result)) 90 | assert.Equal(t, "myjob", result[1]) 91 | assert.Equal(t, "myevent", result[2]) 92 | assert.Equal(t, "another:thing wat:ok", result[3]) 93 | } 94 | 95 | func TestWriterSinkEmitEventErrBasic(t *testing.T) { 96 | var b bytes.Buffer 97 | sink := WriterSink{&b} 98 | sink.EmitEventErr("myjob", "myevent", testErr, nil) 99 | 100 | str := b.String() 101 | 102 | result := basicEventErrRegexp.FindStringSubmatch(str) 103 | assert.Equal(t, 4, len(result)) 104 | assert.Equal(t, "myjob", result[1]) 105 | assert.Equal(t, "myevent", result[2]) 106 | assert.Equal(t, testErr.Error(), result[3]) 107 | } 108 | 109 | func TestWriterSinkEmitEventErrKvs(t *testing.T) { 110 | var b bytes.Buffer 111 | sink := WriterSink{&b} 112 | sink.EmitEventErr("myjob", "myevent", testErr, map[string]string{"wat": "ok", "another": "thing"}) 113 | 114 | str := b.String() 115 | 116 | result := kvsEventErrRegexp.FindStringSubmatch(str) 117 | assert.Equal(t, 5, len(result)) 118 | assert.Equal(t, "myjob", result[1]) 119 | assert.Equal(t, "myevent", result[2]) 120 | assert.Equal(t, testErr.Error(), result[3]) 121 | assert.Equal(t, "another:thing wat:ok", result[4]) 122 | } 123 | 124 | func TestWriterSinkEmitTimingBasic(t *testing.T) { 125 | var b bytes.Buffer 126 | sink := WriterSink{&b} 127 | sink.EmitTiming("myjob", "myevent", 1204000, nil) 128 | 129 | str := b.String() 130 | 131 | result := basicTimingRegexp.FindStringSubmatch(str) 132 | assert.Equal(t, 4, len(result)) 133 | assert.Equal(t, "myjob", result[1]) 134 | assert.Equal(t, "myevent", result[2]) 135 | assert.Equal(t, "1204 μs", result[3]) 136 | } 137 | 138 | func TestWriterSinkEmitTimingKvs(t *testing.T) { 139 | var b bytes.Buffer 140 | sink := WriterSink{&b} 141 | sink.EmitTiming("myjob", "myevent", 34567890, map[string]string{"wat": "ok", "another": "thing"}) 142 | 143 | str := b.String() 144 | 145 | result := kvsTimingRegexp.FindStringSubmatch(str) 146 | assert.Equal(t, 5, len(result)) 147 | assert.Equal(t, "myjob", result[1]) 148 | assert.Equal(t, "myevent", result[2]) 149 | assert.Equal(t, "34 ms", result[3]) 150 | assert.Equal(t, "another:thing wat:ok", result[4]) 151 | } 152 | 153 | func TestWriterSinkEmitGaugeBasic(t *testing.T) { 154 | var b bytes.Buffer 155 | sink := WriterSink{&b} 156 | sink.EmitGauge("myjob", "myevent", 3.14, nil) 157 | 158 | str := b.String() 159 | 160 | result := 
basicGaugeRegexp.FindStringSubmatch(str) 161 | assert.Equal(t, 4, len(result)) 162 | assert.Equal(t, "myjob", result[1]) 163 | assert.Equal(t, "myevent", result[2]) 164 | assert.Equal(t, "3.14", result[3]) 165 | } 166 | 167 | func TestWriterSinkEmitGaugeKvs(t *testing.T) { 168 | var b bytes.Buffer 169 | sink := WriterSink{&b} 170 | sink.EmitGauge("myjob", "myevent", 0.11, map[string]string{"wat": "ok", "another": "thing"}) 171 | 172 | str := b.String() 173 | 174 | result := kvsGaugeRegexp.FindStringSubmatch(str) 175 | assert.Equal(t, 5, len(result)) 176 | assert.Equal(t, "myjob", result[1]) 177 | assert.Equal(t, "myevent", result[2]) 178 | assert.Equal(t, "0.11", result[3]) 179 | assert.Equal(t, "another:thing wat:ok", result[4]) 180 | } 181 | 182 | func TestWriterSinkEmitCompleteBasic(t *testing.T) { 183 | for kind, kindStr := range completionStatusToString { 184 | var b bytes.Buffer 185 | sink := WriterSink{&b} 186 | sink.EmitComplete("myjob", kind, 1204000, nil) 187 | 188 | str := b.String() 189 | 190 | result := basicCompletionRegexp.FindStringSubmatch(str) 191 | assert.Equal(t, 4, len(result)) 192 | assert.Equal(t, "myjob", result[1]) 193 | assert.Equal(t, kindStr, result[2]) 194 | assert.Equal(t, "1204 μs", result[3]) 195 | } 196 | } 197 | 198 | func TestWriterSinkEmitCompleteKvs(t *testing.T) { 199 | for kind, kindStr := range completionStatusToString { 200 | var b bytes.Buffer 201 | sink := WriterSink{&b} 202 | sink.EmitComplete("myjob", kind, 34567890, map[string]string{"wat": "ok", "another": "thing"}) 203 | 204 | str := b.String() 205 | 206 | result := kvsCompletionRegexp.FindStringSubmatch(str) 207 | assert.Equal(t, 5, len(result)) 208 | assert.Equal(t, "myjob", result[1]) 209 | assert.Equal(t, kindStr, result[2]) 210 | assert.Equal(t, "34 ms", result[3]) 211 | assert.Equal(t, "another:thing wat:ok", result[4]) 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /aggregator_test.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | "time" 7 | 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func TestNewAggregator(t *testing.T) { 12 | a := newAggregator(time.Minute, time.Minute*5) 13 | assert.Equal(t, time.Minute, a.intervalDuration) 14 | assert.Equal(t, time.Minute*5, a.retain) 15 | assert.Equal(t, 5, a.maxIntervals) 16 | assert.Equal(t, 0, len(a.intervalAggregations)) 17 | assert.NotNil(t, a.intervalAggregations) 18 | } 19 | 20 | func TestEmitEvent(t *testing.T) { 21 | // Set time, and do a single event 22 | setNowMock("2011-09-09T23:36:13Z") 23 | defer resetNowMock() 24 | a := newAggregator(time.Minute, time.Minute*5) 25 | a.EmitEvent("foo", "bar") 26 | 27 | assert.Equal(t, 1, len(a.intervalAggregations)) 28 | 29 | intAgg := a.intervalAggregations[0] 30 | assert.NotNil(t, intAgg.Events) 31 | assert.EqualValues(t, 1, intAgg.Events["bar"]) 32 | assert.EqualValues(t, 1, intAgg.SerialNumber) 33 | 34 | assert.NotNil(t, intAgg.Jobs) 35 | jobAgg := intAgg.Jobs["foo"] 36 | assert.NotNil(t, jobAgg) 37 | assert.NotNil(t, jobAgg.Events) 38 | assert.EqualValues(t, 1, jobAgg.Events["bar"]) 39 | 40 | // Now, without changing the time, we'll do 3 more events: 41 | a.EmitEvent("foo", "bar") // duplicate to above 42 | a.EmitEvent("foo", "baz") // same job, diff event 43 | a.EmitEvent("wat", "bar") // diff job, same event 44 | 45 | assert.Equal(t, 1, len(a.intervalAggregations)) 46 | 47 | intAgg = a.intervalAggregations[0] 48 | 
assert.EqualValues(t, 3, intAgg.Events["bar"]) 49 | assert.EqualValues(t, 4, intAgg.SerialNumber) 50 | 51 | jobAgg = intAgg.Jobs["foo"] 52 | assert.EqualValues(t, 2, jobAgg.Events["bar"]) 53 | assert.EqualValues(t, 1, jobAgg.Events["baz"]) 54 | 55 | jobAgg = intAgg.Jobs["wat"] 56 | assert.NotNil(t, jobAgg) 57 | assert.EqualValues(t, 1, jobAgg.Events["bar"]) 58 | 59 | // Now we'll increment time and do one more event: 60 | setNowMock("2011-09-09T23:37:01Z") 61 | a.EmitEvent("foo", "bar") 62 | 63 | assert.Equal(t, 2, len(a.intervalAggregations)) 64 | 65 | // make sure old values don't change: 66 | intAgg = a.intervalAggregations[0] 67 | assert.EqualValues(t, 3, intAgg.Events["bar"]) 68 | assert.EqualValues(t, 4, intAgg.SerialNumber) 69 | 70 | intAgg = a.intervalAggregations[1] 71 | assert.EqualValues(t, 1, intAgg.Events["bar"]) 72 | assert.EqualValues(t, 1, intAgg.SerialNumber) 73 | } 74 | 75 | func TestEmitEventErr(t *testing.T) { 76 | setNowMock("2011-09-09T23:36:13Z") 77 | defer resetNowMock() 78 | a := newAggregator(time.Minute, time.Minute*5) 79 | a.EmitEventErr("foo", "bar", errors.New("wat")) 80 | 81 | assert.Equal(t, 1, len(a.intervalAggregations)) 82 | 83 | intAgg := a.intervalAggregations[0] 84 | assert.NotNil(t, intAgg.EventErrs) 85 | ce := intAgg.EventErrs["bar"] 86 | assert.NotNil(t, ce) 87 | assert.EqualValues(t, 1, ce.Count) 88 | assert.Equal(t, []error{errors.New("wat")}, ce.getErrorSamples()) 89 | assert.EqualValues(t, 1, intAgg.SerialNumber) 90 | 91 | assert.NotNil(t, intAgg.Jobs) 92 | jobAgg := intAgg.Jobs["foo"] 93 | assert.NotNil(t, jobAgg) 94 | assert.NotNil(t, jobAgg.EventErrs) 95 | ce = jobAgg.EventErrs["bar"] 96 | assert.EqualValues(t, 1, ce.Count) 97 | assert.Equal(t, []error{errors.New("wat")}, ce.getErrorSamples()) 98 | 99 | // One more event with the same error: 100 | a.EmitEventErr("foo", "bar", errors.New("wat")) 101 | 102 | intAgg = a.intervalAggregations[0] 103 | ce = intAgg.EventErrs["bar"] 104 | assert.EqualValues(t, 2, ce.Count) 105 | assert.Equal(t, []error{errors.New("wat")}, ce.getErrorSamples()) // doesn't change 106 | 107 | // One more event with diff error: 108 | a.EmitEventErr("foo", "bar", errors.New("lol")) 109 | 110 | intAgg = a.intervalAggregations[0] 111 | ce = intAgg.EventErrs["bar"] 112 | assert.EqualValues(t, 3, ce.Count) 113 | assert.Equal(t, []error{errors.New("wat"), errors.New("lol")}, ce.getErrorSamples()) // new error added 114 | } 115 | 116 | func TestEmitTiming(t *testing.T) { 117 | setNowMock("2011-09-09T23:36:13Z") 118 | defer resetNowMock() 119 | a := newAggregator(time.Minute, time.Minute*5) 120 | a.EmitTiming("foo", "bar", 100) 121 | 122 | assert.Equal(t, 1, len(a.intervalAggregations)) 123 | 124 | intAgg := a.intervalAggregations[0] 125 | assert.NotNil(t, intAgg.Timers) 126 | assert.EqualValues(t, 1, intAgg.SerialNumber) 127 | tAgg := intAgg.Timers["bar"] 128 | assert.NotNil(t, tAgg) 129 | assert.EqualValues(t, 1, tAgg.Count) 130 | assert.EqualValues(t, 100, tAgg.NanosSum) 131 | assert.EqualValues(t, 10000, tAgg.NanosSumSquares) 132 | assert.EqualValues(t, 100, tAgg.NanosMin) 133 | assert.EqualValues(t, 100, tAgg.NanosMax) 134 | 135 | assert.NotNil(t, intAgg.Jobs) 136 | jobAgg := intAgg.Jobs["foo"] 137 | assert.NotNil(t, jobAgg) 138 | assert.NotNil(t, jobAgg.Timers) 139 | tAgg = jobAgg.Timers["bar"] 140 | assert.EqualValues(t, 1, tAgg.Count) 141 | assert.EqualValues(t, 100, tAgg.NanosSum) 142 | assert.EqualValues(t, 10000, tAgg.NanosSumSquares) 143 | assert.EqualValues(t, 100, tAgg.NanosMin) 144 | assert.EqualValues(t, 100, 
tAgg.NanosMax) 145 | 146 | // Another timing: 147 | a.EmitTiming("baz", "bar", 9) // note: diff job 148 | 149 | intAgg = a.intervalAggregations[0] 150 | tAgg = intAgg.Timers["bar"] 151 | assert.NotNil(t, tAgg) 152 | assert.EqualValues(t, 2, tAgg.Count) 153 | assert.EqualValues(t, 109, tAgg.NanosSum) 154 | assert.EqualValues(t, 10081, tAgg.NanosSumSquares) 155 | assert.EqualValues(t, 9, tAgg.NanosMin) 156 | assert.EqualValues(t, 100, tAgg.NanosMax) 157 | 158 | jobAgg = intAgg.Jobs["baz"] 159 | tAgg = jobAgg.Timers["bar"] 160 | assert.EqualValues(t, 1, tAgg.Count) 161 | assert.EqualValues(t, 9, tAgg.NanosSum) 162 | assert.EqualValues(t, 81, tAgg.NanosSumSquares) 163 | assert.EqualValues(t, 9, tAgg.NanosMin) 164 | assert.EqualValues(t, 9, tAgg.NanosMax) 165 | } 166 | 167 | func TestEmitGauge(t *testing.T) { 168 | setNowMock("2011-09-09T23:36:13Z") 169 | defer resetNowMock() 170 | a := newAggregator(time.Minute, time.Minute*5) 171 | a.EmitGauge("foo", "bar", 100) 172 | 173 | assert.Equal(t, 1, len(a.intervalAggregations)) 174 | 175 | intAgg := a.intervalAggregations[0] 176 | assert.NotNil(t, intAgg.Gauges) 177 | assert.EqualValues(t, 1, intAgg.SerialNumber) 178 | v, ok := intAgg.Gauges["bar"] 179 | assert.True(t, ok) 180 | assert.Equal(t, 100.0, v) 181 | 182 | assert.NotNil(t, intAgg.Jobs) 183 | jobAgg := intAgg.Jobs["foo"] 184 | assert.NotNil(t, jobAgg) 185 | assert.NotNil(t, jobAgg.Gauges) 186 | v, ok = intAgg.Gauges["bar"] 187 | assert.True(t, ok) 188 | assert.Equal(t, 100.0, v) 189 | 190 | // Another gauge: 191 | a.EmitGauge("baz", "bar", 3.14) // note: diff job 192 | 193 | intAgg = a.intervalAggregations[0] 194 | v, ok = intAgg.Gauges["bar"] 195 | assert.True(t, ok) 196 | assert.Equal(t, 3.14, v) 197 | 198 | jobAgg = intAgg.Jobs["baz"] 199 | v, ok = intAgg.Gauges["bar"] 200 | assert.True(t, ok) 201 | assert.Equal(t, 3.14, v) 202 | } 203 | 204 | func TestEmitComplete(t *testing.T) { 205 | setNowMock("2011-09-09T23:36:13Z") 206 | defer resetNowMock() 207 | a := newAggregator(time.Minute, time.Minute*5) 208 | a.EmitComplete("foo", Success, 100) 209 | a.EmitComplete("foo", ValidationError, 5) 210 | a.EmitComplete("foo", Panic, 9) 211 | a.EmitComplete("foo", Error, 7) 212 | a.EmitComplete("foo", Junk, 11) 213 | 214 | assert.Equal(t, 1, len(a.intervalAggregations)) 215 | 216 | intAgg := a.intervalAggregations[0] 217 | assert.EqualValues(t, 5, intAgg.SerialNumber) 218 | jobAgg := intAgg.Jobs["foo"] 219 | assert.NotNil(t, jobAgg) 220 | 221 | assert.EqualValues(t, 5, jobAgg.Count) 222 | assert.EqualValues(t, 1, jobAgg.CountSuccess) 223 | assert.EqualValues(t, 1, jobAgg.CountValidationError) 224 | assert.EqualValues(t, 1, jobAgg.CountPanic) 225 | assert.EqualValues(t, 1, jobAgg.CountError) 226 | assert.EqualValues(t, 1, jobAgg.CountJunk) 227 | assert.EqualValues(t, 132, jobAgg.NanosSum) 228 | assert.EqualValues(t, 10276, jobAgg.NanosSumSquares) 229 | assert.EqualValues(t, 5, jobAgg.NanosMin) 230 | assert.EqualValues(t, 100, jobAgg.NanosMax) 231 | } 232 | 233 | func TestRotation(t *testing.T) { 234 | defer resetNowMock() 235 | a := newAggregator(time.Minute, time.Minute*5) 236 | setNowMock("2011-09-09T23:36:13Z") 237 | a.EmitEvent("foo", "bar") 238 | 239 | setNowMock("2011-09-09T23:37:13Z") 240 | a.EmitEvent("foo", "bar") 241 | a.EmitEvent("foo", "bar") 242 | 243 | setNowMock("2011-09-09T23:38:13Z") 244 | a.EmitEvent("foo", "bar") 245 | a.EmitEvent("foo", "bar") 246 | a.EmitEvent("foo", "bar") 247 | 248 | setNowMock("2011-09-09T23:39:13Z") 249 | a.EmitEvent("foo", "bar") 250 | a.EmitEvent("foo", 
"bar") 251 | a.EmitEvent("foo", "bar") 252 | a.EmitEvent("foo", "bar") 253 | 254 | setNowMock("2011-09-09T23:40:13Z") 255 | a.EmitEvent("foo", "bar") 256 | a.EmitEvent("foo", "bar") 257 | a.EmitEvent("foo", "bar") 258 | a.EmitEvent("foo", "bar") 259 | a.EmitEvent("foo", "bar") 260 | 261 | assert.Equal(t, 5, len(a.intervalAggregations)) 262 | 263 | for i := 0; i < 5; i++ { 264 | intAgg := a.intervalAggregations[i] 265 | assert.EqualValues(t, i+1, intAgg.Events["bar"]) 266 | } 267 | 268 | setNowMock("2011-09-09T23:41:13Z") 269 | a.EmitEvent("foo", "ok") 270 | 271 | assert.Equal(t, 5, len(a.intervalAggregations)) 272 | 273 | for i := 0; i < 4; i++ { 274 | intAgg := a.intervalAggregations[i] 275 | assert.EqualValues(t, i+2, intAgg.Events["bar"]) 276 | } 277 | intAgg := a.intervalAggregations[4] 278 | assert.EqualValues(t, 0, intAgg.Events["bar"]) 279 | assert.EqualValues(t, 1, intAgg.Events["ok"]) 280 | 281 | } 282 | -------------------------------------------------------------------------------- /sinks/librato/sink.go: -------------------------------------------------------------------------------- 1 | package librato 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "fmt" 7 | "net/http" 8 | "os" 9 | "time" 10 | 11 | "github.com/gocraft/health" 12 | ) 13 | 14 | type SanitizationFunc func(string) string 15 | 16 | type Sink struct { 17 | SanitizationFunc 18 | Source string 19 | FlushPeriod time.Duration 20 | 21 | cmdChan chan *emitCmd 22 | doneChan chan int 23 | doneDoneChan chan int 24 | httpClient *http.Client 25 | 26 | libratoUser string 27 | libratoApiKey string 28 | 29 | // Prefix is something like "metroid" 30 | // Events emitted to StatsD would be metroid.myevent.wat 31 | // Eg, don't include a trailing dot in the prefix. 32 | // It can be "", that's fine. 
33 | prefix string 34 | 35 | timers map[string]*gauge 36 | counters map[string]int64 37 | } 38 | 39 | type gauge struct { 40 | Count int64 `json:"count"` 41 | Sum float64 `json:"sum"` 42 | Min float64 `json:"min"` 43 | Max float64 `json:"max"` 44 | SumSquares float64 `json:"sum_squares"` 45 | Attributes gaugeAttributes `json:"attributes"` 46 | } 47 | 48 | type gaugeAttributes struct { 49 | Aggregate bool `json:"aggregate"` 50 | DisplayUnitsShort string `json:"display_units_long"` 51 | } 52 | 53 | type libratoCounterValue struct { 54 | Value int64 `json:"value"` 55 | Attributes gaugeAttributes `json:"attributes"` 56 | } 57 | 58 | type libratoMetricsPost struct { 59 | MeasureTime int64 `json:"measure_time"` 60 | Period int64 `json:"period"` 61 | Source string `json:"source,omitempty"` 62 | Gauges map[string]interface{} `json:"gauges,omitempty"` 63 | } 64 | 65 | var defaultTimerAttributes gaugeAttributes = gaugeAttributes{true, "ms"} 66 | var defaultCounterAttributes gaugeAttributes = gaugeAttributes{true, "count"} 67 | var libratoRequestPath string = "https://metrics-api.librato.com/v1/metrics" 68 | 69 | type cmdKind int 70 | 71 | const ( 72 | cmdKindEvent cmdKind = iota 73 | cmdKindEventErr 74 | cmdKindTiming 75 | cmdKindGauge 76 | cmdKindComplete 77 | ) 78 | 79 | type emitCmd struct { 80 | Kind cmdKind 81 | Job string 82 | Event string 83 | Err error 84 | Nanos int64 85 | Value float64 86 | Status health.CompletionStatus 87 | } 88 | 89 | func New(user, apiKey, prefix string) *Sink { 90 | const buffSize = 4096 // random-ass-guess 91 | 92 | s := &Sink{ 93 | SanitizationFunc: sanitizeKey, 94 | FlushPeriod: 15 * time.Second, 95 | cmdChan: make(chan *emitCmd, buffSize), 96 | doneChan: make(chan int), 97 | doneDoneChan: make(chan int), 98 | httpClient: &http.Client{}, 99 | libratoUser: user, 100 | libratoApiKey: apiKey, 101 | prefix: prefix, 102 | timers: make(map[string]*gauge), 103 | counters: make(map[string]int64), 104 | } 105 | 106 | s.Source, _ = os.Hostname() 107 | 108 | go s.start() 109 | 110 | return s 111 | } 112 | 113 | func (s *Sink) Stop() { 114 | s.doneChan <- 1 115 | <-s.doneDoneChan 116 | } 117 | 118 | func (s *Sink) EmitEvent(job string, event string, kvs map[string]string) { 119 | s.cmdChan <- &emitCmd{Kind: cmdKindEvent, Job: job, Event: event} 120 | } 121 | 122 | func (s *Sink) EmitEventErr(job string, event string, inputErr error, kvs map[string]string) { 123 | s.cmdChan <- &emitCmd{Kind: cmdKindEventErr, Job: job, Event: event, Err: inputErr} 124 | } 125 | 126 | func (s *Sink) EmitTiming(job string, event string, nanos int64, kvs map[string]string) { 127 | s.cmdChan <- &emitCmd{Kind: cmdKindTiming, Job: job, Event: event, Nanos: nanos} 128 | } 129 | 130 | func (s *Sink) EmitGauge(job string, event string, value float64, kvs map[string]string) { 131 | s.cmdChan <- &emitCmd{Kind: cmdKindGauge, Job: job, Event: event, Value: value} 132 | } 133 | 134 | func (s *Sink) EmitComplete(job string, status health.CompletionStatus, nanos int64, kvs map[string]string) { 135 | s.cmdChan <- &emitCmd{Kind: cmdKindComplete, Job: job, Status: status, Nanos: nanos} 136 | } 137 | 138 | func (s *Sink) start() { 139 | cmdChan := s.cmdChan 140 | doneChan := s.doneChan 141 | ticker := time.Tick(s.FlushPeriod) 142 | 143 | LIBRATO_LOOP: 144 | for { 145 | select { 146 | case <-doneChan: 147 | s.doneDoneChan <- 1 148 | break LIBRATO_LOOP 149 | case cmd := <-cmdChan: 150 | if cmd.Kind == cmdKindEvent { 151 | s.processEvent(cmd.Job, cmd.Event) 152 | } else if cmd.Kind == cmdKindEventErr { 153 | 
s.processEventErr(cmd.Job, cmd.Event, cmd.Err) 154 | } else if cmd.Kind == cmdKindTiming { 155 | s.processTiming(cmd.Job, cmd.Event, cmd.Nanos) 156 | } else if cmd.Kind == cmdKindGauge { 157 | s.processGauge(cmd.Job, cmd.Event, cmd.Value) 158 | } else if cmd.Kind == cmdKindComplete { 159 | s.processComplete(cmd.Job, cmd.Status, cmd.Nanos) 160 | } 161 | case <-ticker: 162 | s.purge() 163 | } 164 | } 165 | } 166 | 167 | func (s *Sink) processEvent(job string, event string) { 168 | key1, key2 := s.eventKeys(job, event, "count") 169 | s.inc(key1) 170 | s.inc(key2) 171 | } 172 | 173 | func (s *Sink) processEventErr(job string, event string, err error) { 174 | key1, key2 := s.eventKeys(job, event, "error.count") 175 | s.inc(key1) 176 | s.inc(key2) 177 | } 178 | 179 | func (s *Sink) processTiming(job string, event string, nanos int64) { 180 | key1, key2 := s.eventKeys(job, event, "timing") 181 | ms := float64(nanos) / float64(time.Millisecond) 182 | s.measure(key1, ms) 183 | s.measure(key2, ms) 184 | } 185 | 186 | func (s *Sink) processGauge(job string, event string, value float64) { 187 | key1, key2 := s.eventKeys(job, event, "gauge") 188 | s.measure(key1, value) 189 | s.measure(key2, value) 190 | } 191 | 192 | func (s *Sink) processComplete(job string, status health.CompletionStatus, nanos int64) { 193 | var b bytes.Buffer 194 | 195 | if s.prefix != "" { 196 | b.WriteString(s.prefix) 197 | b.WriteRune('.') 198 | } 199 | b.WriteString(s.SanitizationFunc(job)) 200 | b.WriteRune('.') 201 | b.WriteString(status.String()) 202 | b.WriteString(".timing") 203 | 204 | ms := float64(nanos) / float64(time.Millisecond) 205 | s.measure(b.String(), ms) 206 | } 207 | 208 | func (s *Sink) eventKeys(job, event, suffix string) (string, string) { 209 | var key1 bytes.Buffer // event 210 | var key2 bytes.Buffer // job.event 211 | 212 | if s.prefix != "" { 213 | key1.WriteString(s.prefix) 214 | key1.WriteRune('.') 215 | key2.WriteString(s.prefix) 216 | key2.WriteRune('.') 217 | } 218 | 219 | key1.WriteString(s.SanitizationFunc(event)) 220 | key2.WriteString(s.SanitizationFunc(job)) 221 | key2.WriteRune('.') 222 | key2.WriteString(s.SanitizationFunc(event)) 223 | 224 | if suffix != "" { 225 | key1.WriteRune('.') 226 | key1.WriteString(suffix) 227 | key2.WriteRune('.') 228 | key2.WriteString(suffix) 229 | } 230 | 231 | return key1.String(), key2.String() 232 | } 233 | 234 | func (s *Sink) inc(key string) { 235 | s.counters[key] += 1 236 | } 237 | 238 | func (s *Sink) measure(key string, value float64) { 239 | g, ok := s.timers[key] 240 | if !ok { 241 | g = &gauge{Min: value, Max: value, Sum: value, Count: 1, SumSquares: value * value, Attributes: defaultTimerAttributes} 242 | s.timers[key] = g 243 | } else { 244 | g.Count++ 245 | g.Sum += value 246 | g.SumSquares += value * value 247 | 248 | if value < g.Min { 249 | g.Min = value 250 | } 251 | if value > g.Max { 252 | g.Max = value 253 | } 254 | } 255 | } 256 | 257 | func (s *Sink) purge() { 258 | if err := s.send(); err != nil { 259 | fmt.Println("Error sending to librato: ", err) 260 | } 261 | s.timers = make(map[string]*gauge) 262 | s.counters = make(map[string]int64) 263 | } 264 | 265 | func (s *Sink) send() error { 266 | 267 | // no data? 
don't send anything to librato 268 | if len(s.timers) == 0 && len(s.counters) == 0 { 269 | return nil 270 | } 271 | 272 | body := libratoMetricsPost{ 273 | MeasureTime: time.Now().Unix(), 274 | Period: int64(s.FlushPeriod / time.Second), 275 | Source: s.Source, 276 | } 277 | 278 | gauges := make(map[string]interface{}) 279 | 280 | for k, v := range s.timers { 281 | gauges[k] = v 282 | } 283 | 284 | for k, v := range s.counters { 285 | gauges[k] = libratoCounterValue{v, defaultCounterAttributes} 286 | } 287 | body.Gauges = gauges 288 | 289 | b, err := json.Marshal(body) 290 | if nil != err { 291 | return err 292 | } 293 | 294 | fmt.Println(string(b)) 295 | 296 | req, err := http.NewRequest( 297 | "POST", 298 | libratoRequestPath, 299 | bytes.NewBuffer(b), 300 | ) 301 | if nil != err { 302 | return err 303 | } 304 | req.Header.Add("Content-Type", "application/json") 305 | req.SetBasicAuth(s.libratoUser, s.libratoApiKey) 306 | _, err = s.httpClient.Do(req) 307 | 308 | //fmt.Println(resp.Status) 309 | 310 | return err 311 | } 312 | 313 | // valid librato charactors: A-Za-z0-9.:-_ 314 | func shouldSanitize(r rune) bool { 315 | switch { 316 | case 'A' <= r && r <= 'Z': 317 | fallthrough 318 | case 'a' <= r && r <= 'z': 319 | fallthrough 320 | case '0' <= r && r <= '9': 321 | fallthrough 322 | case r == '.': 323 | fallthrough 324 | case r == ':': 325 | fallthrough 326 | case r == '-': 327 | fallthrough 328 | case r == '_': 329 | return false 330 | } 331 | return true 332 | } 333 | 334 | func sanitizeKey(k string) string { 335 | for _, r := range k { 336 | if shouldSanitize(r) { 337 | goto SANITIZE 338 | } 339 | } 340 | return k 341 | SANITIZE: 342 | var key bytes.Buffer 343 | for _, r := range k { 344 | if shouldSanitize(r) { 345 | key.WriteRune('_') 346 | } else { 347 | key.WriteRune(r) 348 | } 349 | } 350 | return key.String() 351 | } 352 | -------------------------------------------------------------------------------- /statsd_sink.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "bytes" 5 | "net" 6 | "strconv" 7 | "time" 8 | ) 9 | 10 | type StatsDSinkSanitizationFunc func(*bytes.Buffer, string) 11 | 12 | type eventKey struct { 13 | job string 14 | event string 15 | suffix string 16 | } 17 | 18 | type prefixBuffer struct { 19 | *bytes.Buffer 20 | prefixLen int 21 | } 22 | 23 | type StatsDSinkOptions struct { 24 | // Prefix is something like "metroid" 25 | // Events emitted to StatsD would be metroid.myevent.wat 26 | // Eg, don't include a trailing dot in the prefix. 27 | // It can be "", that's fine. 28 | Prefix string 29 | 30 | // SanitizationFunc sanitizes jobs and events before sending them to statsd 31 | SanitizationFunc StatsDSinkSanitizationFunc 32 | 33 | // SkipNestedEvents will skip {events,timers,gauges} from sending the job.event version 34 | // and will only send the event version. 35 | SkipNestedEvents bool 36 | 37 | // SkipTopLevelEvents will skip {events,timers,gauges} from sending the event version 38 | // and will only send the job.event version. 
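// If both SkipNestedEvents and SkipTopLevelEvents are set, events, timers, and
// gauges are dropped entirely; completions are unaffected and are always emitted
// as job.status timings (see processComplete below).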
39 | SkipTopLevelEvents bool 40 | } 41 | 42 | var defaultStatsDOptions = StatsDSinkOptions{SanitizationFunc: sanitizeKey} 43 | 44 | type StatsDSink struct { 45 | options StatsDSinkOptions 46 | 47 | cmdChan chan statsdEmitCmd 48 | drainDoneChan chan struct{} 49 | stopDoneChan chan struct{} 50 | 51 | flushPeriod time.Duration 52 | 53 | udpBuf bytes.Buffer 54 | timingBuf []byte 55 | 56 | udpConn *net.UDPConn 57 | udpAddr *net.UDPAddr 58 | 59 | // map of {job,event,suffix} to a re-usable buffer prefixed with the key. 60 | // Since each timing/gauge has a unique component (the time), we'll truncate to the prefix, write the timing, 61 | // and write the statsD suffix (eg, "|ms\n"). Then copy that to the UDP buffer. 62 | prefixBuffers map[eventKey]prefixBuffer 63 | } 64 | 65 | type statsdCmdKind int 66 | 67 | const ( 68 | statsdCmdKindEvent statsdCmdKind = iota 69 | statsdCmdKindEventErr 70 | statsdCmdKindTiming 71 | statsdCmdKindGauge 72 | statsdCmdKindComplete 73 | statsdCmdKindFlush 74 | statsdCmdKindDrain 75 | statsdCmdKindStop 76 | ) 77 | 78 | type statsdEmitCmd struct { 79 | Kind statsdCmdKind 80 | Job string 81 | Event string 82 | Nanos int64 83 | Value float64 84 | Status CompletionStatus 85 | } 86 | 87 | const cmdChanBuffSize = 8192 // random-ass-guess 88 | const maxUdpBytes = 1440 // 1500(Ethernet MTU) - 60(Max UDP header size 89 | 90 | func NewStatsDSink(addr string, options *StatsDSinkOptions) (*StatsDSink, error) { 91 | c, err := net.ListenPacket("udp", ":0") 92 | if err != nil { 93 | return nil, err 94 | } 95 | 96 | ra, err := net.ResolveUDPAddr("udp", addr) 97 | if err != nil { 98 | return nil, err 99 | } 100 | 101 | s := &StatsDSink{ 102 | udpConn: c.(*net.UDPConn), 103 | udpAddr: ra, 104 | cmdChan: make(chan statsdEmitCmd, cmdChanBuffSize), 105 | drainDoneChan: make(chan struct{}), 106 | stopDoneChan: make(chan struct{}), 107 | flushPeriod: 100 * time.Millisecond, 108 | prefixBuffers: make(map[eventKey]prefixBuffer), 109 | } 110 | 111 | if options != nil { 112 | s.options = *options 113 | if s.options.SanitizationFunc == nil { 114 | s.options.SanitizationFunc = sanitizeKey 115 | } 116 | } else { 117 | s.options = defaultStatsDOptions 118 | } 119 | 120 | go s.loop() 121 | 122 | return s, nil 123 | } 124 | 125 | func (s *StatsDSink) Stop() { 126 | s.cmdChan <- statsdEmitCmd{Kind: statsdCmdKindStop} 127 | <-s.stopDoneChan 128 | } 129 | 130 | func (s *StatsDSink) Drain() { 131 | s.cmdChan <- statsdEmitCmd{Kind: statsdCmdKindDrain} 132 | <-s.drainDoneChan 133 | } 134 | 135 | func (s *StatsDSink) EmitEvent(job string, event string, kvs map[string]string) { 136 | s.cmdChan <- statsdEmitCmd{Kind: statsdCmdKindEvent, Job: job, Event: event} 137 | } 138 | 139 | func (s *StatsDSink) EmitEventErr(job string, event string, inputErr error, kvs map[string]string) { 140 | s.cmdChan <- statsdEmitCmd{Kind: statsdCmdKindEventErr, Job: job, Event: event} 141 | } 142 | 143 | func (s *StatsDSink) EmitTiming(job string, event string, nanos int64, kvs map[string]string) { 144 | s.cmdChan <- statsdEmitCmd{Kind: statsdCmdKindTiming, Job: job, Event: event, Nanos: nanos} 145 | } 146 | 147 | func (s *StatsDSink) EmitGauge(job string, event string, value float64, kvs map[string]string) { 148 | s.cmdChan <- statsdEmitCmd{Kind: statsdCmdKindGauge, Job: job, Event: event, Value: value} 149 | } 150 | 151 | func (s *StatsDSink) EmitComplete(job string, status CompletionStatus, nanos int64, kvs map[string]string) { 152 | s.cmdChan <- statsdEmitCmd{Kind: statsdCmdKindComplete, Job: job, Status: status, Nanos: 
nanos} 153 | } 154 | 155 | func (s *StatsDSink) loop() { 156 | cmdChan := s.cmdChan 157 | 158 | ticker := time.NewTicker(s.flushPeriod) 159 | go func() { 160 | for _ = range ticker.C { 161 | cmdChan <- statsdEmitCmd{Kind: statsdCmdKindFlush} 162 | } 163 | }() 164 | 165 | LOOP: 166 | for cmd := range cmdChan { 167 | switch cmd.Kind { 168 | case statsdCmdKindDrain: 169 | DRAIN_LOOP: 170 | for { 171 | select { 172 | case cmd := <-cmdChan: 173 | s.processCmd(&cmd) 174 | default: 175 | s.flush() 176 | s.drainDoneChan <- struct{}{} 177 | break DRAIN_LOOP 178 | } 179 | } 180 | case statsdCmdKindStop: 181 | s.stopDoneChan <- struct{}{} 182 | break LOOP 183 | case statsdCmdKindFlush: 184 | s.flush() 185 | default: 186 | s.processCmd(&cmd) 187 | } 188 | } 189 | 190 | ticker.Stop() 191 | } 192 | 193 | func (s *StatsDSink) processCmd(cmd *statsdEmitCmd) { 194 | switch cmd.Kind { 195 | case statsdCmdKindEvent: 196 | s.processEvent(cmd.Job, cmd.Event) 197 | case statsdCmdKindEventErr: 198 | s.processEventErr(cmd.Job, cmd.Event) 199 | case statsdCmdKindTiming: 200 | s.processTiming(cmd.Job, cmd.Event, cmd.Nanos) 201 | case statsdCmdKindGauge: 202 | s.processGauge(cmd.Job, cmd.Event, cmd.Value) 203 | case statsdCmdKindComplete: 204 | s.processComplete(cmd.Job, cmd.Status, cmd.Nanos) 205 | } 206 | } 207 | 208 | func (s *StatsDSink) processEvent(job string, event string) { 209 | if !s.options.SkipTopLevelEvents { 210 | pb := s.getPrefixBuffer("", event, "") 211 | pb.WriteString("1|c\n") 212 | s.writeStatsDMetric(pb.Bytes()) 213 | } 214 | 215 | if !s.options.SkipNestedEvents { 216 | pb := s.getPrefixBuffer(job, event, "") 217 | pb.WriteString("1|c\n") 218 | s.writeStatsDMetric(pb.Bytes()) 219 | } 220 | } 221 | 222 | func (s *StatsDSink) processEventErr(job string, event string) { 223 | if !s.options.SkipTopLevelEvents { 224 | pb := s.getPrefixBuffer("", event, "error") 225 | pb.WriteString("1|c\n") 226 | s.writeStatsDMetric(pb.Bytes()) 227 | } 228 | 229 | if !s.options.SkipNestedEvents { 230 | pb := s.getPrefixBuffer(job, event, "error") 231 | pb.WriteString("1|c\n") 232 | s.writeStatsDMetric(pb.Bytes()) 233 | } 234 | } 235 | 236 | func (s *StatsDSink) processTiming(job string, event string, nanos int64) { 237 | s.writeNanosToTimingBuf(nanos) 238 | 239 | if !s.options.SkipTopLevelEvents { 240 | pb := s.getPrefixBuffer("", event, "") 241 | pb.Write(s.timingBuf) 242 | pb.WriteString("|ms\n") 243 | s.writeStatsDMetric(pb.Bytes()) 244 | } 245 | 246 | if !s.options.SkipNestedEvents { 247 | pb := s.getPrefixBuffer(job, event, "") 248 | pb.Write(s.timingBuf) 249 | pb.WriteString("|ms\n") 250 | s.writeStatsDMetric(pb.Bytes()) 251 | } 252 | } 253 | 254 | func (s *StatsDSink) processGauge(job string, event string, value float64) { 255 | s.timingBuf = s.timingBuf[0:0] 256 | prec := 2 257 | if (value < 0.1) && (value > -0.1) { 258 | prec = -1 259 | } 260 | s.timingBuf = strconv.AppendFloat(s.timingBuf, value, 'f', prec, 64) 261 | 262 | if !s.options.SkipTopLevelEvents { 263 | pb := s.getPrefixBuffer("", event, "") 264 | pb.Write(s.timingBuf) 265 | pb.WriteString("|g\n") 266 | s.writeStatsDMetric(pb.Bytes()) 267 | } 268 | 269 | if !s.options.SkipNestedEvents { 270 | pb := s.getPrefixBuffer(job, event, "") 271 | pb.Write(s.timingBuf) 272 | pb.WriteString("|g\n") 273 | s.writeStatsDMetric(pb.Bytes()) 274 | } 275 | } 276 | 277 | func (s *StatsDSink) processComplete(job string, status CompletionStatus, nanos int64) { 278 | s.writeNanosToTimingBuf(nanos) 279 | statusString := status.String() 280 | 281 | pb := 
s.getPrefixBuffer(job, "", statusString) 282 | pb.Write(s.timingBuf) 283 | pb.WriteString("|ms\n") 284 | s.writeStatsDMetric(pb.Bytes()) 285 | } 286 | 287 | func (s *StatsDSink) flush() { 288 | if s.udpBuf.Len() > 0 { 289 | s.udpConn.WriteToUDP(s.udpBuf.Bytes(), s.udpAddr) 290 | s.udpBuf.Truncate(0) 291 | } 292 | } 293 | 294 | // assumes b is a well-formed statsd metric like "job.event:1|c\n" (including newline) 295 | func (s *StatsDSink) writeStatsDMetric(b []byte) { 296 | lenb := len(b) 297 | 298 | if lenb == 0 { 299 | return 300 | } 301 | 302 | // single metric exceeds limit. sad day. 303 | if lenb > maxUdpBytes { 304 | return 305 | } 306 | 307 | lenUdpBuf := s.udpBuf.Len() 308 | 309 | if (lenb + lenUdpBuf) > maxUdpBytes { 310 | s.udpConn.WriteToUDP(s.udpBuf.Bytes(), s.udpAddr) 311 | s.udpBuf.Truncate(0) 312 | } 313 | 314 | s.udpBuf.Write(b) 315 | } 316 | 317 | func (s *StatsDSink) getPrefixBuffer(job, event, suffix string) prefixBuffer { 318 | key := eventKey{job, event, suffix} 319 | 320 | b, ok := s.prefixBuffers[key] 321 | if !ok { 322 | b.Buffer = &bytes.Buffer{} 323 | s.writeSanitizedKeys(b.Buffer, s.options.Prefix, job, event, suffix) 324 | b.WriteByte(':') 325 | b.prefixLen = b.Len() 326 | 327 | // 123456789.99|ms\n 16 bytes. timing value represents 11 days max 328 | b.Grow(16) 329 | s.prefixBuffers[key] = b 330 | } else { 331 | b.Truncate(b.prefixLen) 332 | } 333 | 334 | return b 335 | } 336 | 337 | func (s *StatsDSink) writeSanitizedKeys(b *bytes.Buffer, keys ...string) { 338 | needDot := false 339 | for _, k := range keys { 340 | if k != "" { 341 | if needDot { 342 | b.WriteByte('.') 343 | } 344 | s.options.SanitizationFunc(b, k) 345 | needDot = true 346 | } 347 | } 348 | } 349 | 350 | func (s *StatsDSink) writeNanosToTimingBuf(nanos int64) { 351 | s.timingBuf = s.timingBuf[0:0] 352 | if nanos >= 10e6 { 353 | // More than 10 milliseconds. We'll just print as an integer 354 | s.timingBuf = strconv.AppendInt(s.timingBuf, nanos/1e6, 10) 355 | } else { 356 | s.timingBuf = strconv.AppendFloat(s.timingBuf, float64(nanos)/float64(time.Millisecond), 'f', 2, 64) 357 | } 358 | } 359 | 360 | func sanitizeKey(b *bytes.Buffer, s string) { 361 | b.Grow(len(s) + 1) 362 | for i := 0; i < len(s); i++ { 363 | si := s[i] 364 | if ('A' <= si && si <= 'Z') || ('a' <= si && si <= 'z') || ('0' <= si && s[i] <= '9') || si == '_' || si == '.' 
{ 365 | b.WriteByte(si) 366 | } else { 367 | b.WriteByte('$') 368 | } 369 | } 370 | } 371 | -------------------------------------------------------------------------------- /healthd/api.go: -------------------------------------------------------------------------------- 1 | package healthd 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "github.com/braintree/manners" 7 | "github.com/gocraft/health" 8 | "github.com/gocraft/web" 9 | "math" 10 | "net/http" 11 | "sort" 12 | "strconv" 13 | "time" 14 | ) 15 | 16 | // Job represents a health.JobAggregation, but designed for JSON-ization without all the nested counters/timers 17 | type Job struct { 18 | Name string `json:"name"` 19 | Count int64 `json:"count"` 20 | CountSuccess int64 `json:"count_success"` 21 | CountValidationError int64 `json:"count_validation_error"` 22 | CountPanic int64 `json:"count_panic"` 23 | CountError int64 `json:"count_error"` 24 | CountJunk int64 `json:"count_junk"` 25 | 26 | NanosSum int64 `json:"nanos_sum"` 27 | NanosSumSquares float64 `json:"nanos_sum_squares"` 28 | NanosMin int64 `json:"nanos_min"` 29 | NanosMax int64 `json:"nanos_max"` 30 | NanosAvg float64 `json:"nanos_avg"` 31 | NanosStdDev float64 `json:"nanos_std_dev"` 32 | } 33 | 34 | type apiResponse struct { 35 | InstanceId string `json:"instance_id"` 36 | IntervalDuration time.Duration `json:"interval_duration"` 37 | } 38 | 39 | type ApiResponseJobs struct { 40 | apiResponse 41 | Jobs []*Job `json:"jobs"` 42 | } 43 | 44 | type ApiResponseAggregations struct { 45 | apiResponse 46 | Aggregations []*health.IntervalAggregation `json:"aggregations"` 47 | } 48 | 49 | type ApiResponseAggregationsOverall struct { 50 | apiResponse 51 | Overall *health.IntervalAggregation `json:"overall"` 52 | } 53 | 54 | type ApiResponseHosts struct { 55 | apiResponse 56 | Hosts []*HostStatus `json:"hosts"` 57 | } 58 | 59 | type apiContext struct { 60 | hd *HealthD 61 | *health.Job 62 | } 63 | 64 | func (hd *HealthD) apiRouter() http.Handler { 65 | router := web.New(apiContext{}) 66 | router.NotFound(func(rw web.ResponseWriter, req *web.Request) { 67 | renderNotFound(rw) 68 | }) 69 | 70 | healthdRouter := router.Subrouter(apiContext{}, "/healthd") 71 | 72 | healthdRouter.Middleware(func(c *apiContext, rw web.ResponseWriter, req *web.Request, next web.NextMiddlewareFunc) { 73 | c.hd = hd 74 | next(rw, req) 75 | }) 76 | 77 | healthdRouter.Middleware((*apiContext).SetContentType). 78 | Middleware((*apiContext).HealthMiddleware). 79 | Get("/aggregations", (*apiContext).Aggregations). 80 | Get("/aggregations/overall", (*apiContext).Overall). 81 | Get("/jobs", (*apiContext).Jobs). 
82 | Get("/hosts", (*apiContext).Hosts) 83 | 84 | return router 85 | } 86 | 87 | func (hd *HealthD) startHttpServer(hostPort string, done chan bool) { 88 | server := manners.NewWithServer(&http.Server{ 89 | Addr: hostPort, 90 | Handler: hd.apiRouter(), 91 | }) 92 | hd.stopHTTP = server.Close 93 | done <- true 94 | server.ListenAndServe() 95 | } 96 | 97 | func (c *apiContext) SetContentType(rw web.ResponseWriter, req *web.Request, next web.NextMiddlewareFunc) { 98 | rw.Header().Set("Content-Type", "application/json; charset=utf-8") 99 | next(rw, req) 100 | } 101 | 102 | func (c *apiContext) HealthMiddleware(rw web.ResponseWriter, r *web.Request, next web.NextMiddlewareFunc) { 103 | c.Job = c.hd.stream.NewJob(r.RoutePath()) 104 | 105 | path := r.URL.Path 106 | c.EventKv("starting_request", health.Kvs{"path": path}) 107 | 108 | next(rw, r) 109 | 110 | code := rw.StatusCode() 111 | kvs := health.Kvs{ 112 | "code": fmt.Sprint(code), 113 | "path": path, 114 | } 115 | 116 | // Map HTTP status code to category. 117 | var status health.CompletionStatus 118 | // if c.Panic { 119 | // status = health.Panic 120 | // } else 121 | if code < 400 { 122 | status = health.Success 123 | } else if code == 422 { 124 | status = health.ValidationError 125 | } else if code < 500 { 126 | status = health.Junk // 404, 401 127 | } else { 128 | status = health.Error 129 | } 130 | c.CompleteKv(status, kvs) 131 | } 132 | 133 | func (c *apiContext) Aggregations(rw web.ResponseWriter, r *web.Request) { 134 | aggregations := c.hd.getAggregationSequence() 135 | resp := &ApiResponseAggregations{ 136 | apiResponse: getApiResponse(c.hd.intervalDuration), 137 | Aggregations: aggregations, 138 | } 139 | renderJson(rw, resp) 140 | } 141 | 142 | func (c *apiContext) Overall(rw web.ResponseWriter, r *web.Request) { 143 | aggregations := c.hd.getAggregationSequence() 144 | overall := combineAggregations(aggregations) 145 | resp := &ApiResponseAggregationsOverall{ 146 | apiResponse: getApiResponse(c.hd.intervalDuration), 147 | Overall: overall, 148 | } 149 | renderJson(rw, resp) 150 | } 151 | 152 | func (c *apiContext) Jobs(rw web.ResponseWriter, r *web.Request) { 153 | sort := getSort(r) 154 | limit := getLimit(r) 155 | aggregations := c.hd.getAggregationSequence() 156 | overall := combineAggregations(aggregations) 157 | jobs := filterJobs(overall, sort, limit) 158 | resp := &ApiResponseJobs{ 159 | apiResponse: getApiResponse(c.hd.intervalDuration), 160 | Jobs: jobs, 161 | } 162 | renderJson(rw, resp) 163 | } 164 | 165 | func (c *apiContext) Hosts(rw web.ResponseWriter, r *web.Request) { 166 | hosts := c.hd.getHosts() 167 | sort.Sort(HostStatusByHostPort(hosts)) 168 | resp := &ApiResponseHosts{ 169 | apiResponse: getApiResponse(c.hd.intervalDuration), 170 | Hosts: hosts, 171 | } 172 | renderJson(rw, resp) 173 | } 174 | 175 | func getApiResponse(duration time.Duration) apiResponse { 176 | return apiResponse{ 177 | InstanceId: health.Identifier, 178 | IntervalDuration: duration, 179 | } 180 | } 181 | 182 | func renderJson(rw http.ResponseWriter, data interface{}) { 183 | jsonData, err := json.MarshalIndent(data, "", "\t") 184 | if err != nil { 185 | renderError(rw, err) 186 | return 187 | } 188 | fmt.Fprintf(rw, string(jsonData)) 189 | } 190 | 191 | func renderNotFound(rw http.ResponseWriter) { 192 | rw.WriteHeader(404) 193 | fmt.Fprintf(rw, `{"error": "not_found"}`) 194 | } 195 | 196 | func renderError(rw http.ResponseWriter, err error) { 197 | rw.WriteHeader(500) 198 | fmt.Fprintf(rw, `{"error": "%s"}`, err.Error()) 199 | } 200 | 
201 | func combineAggregations(aggregations []*health.IntervalAggregation) *health.IntervalAggregation { 202 | if len(aggregations) == 0 { 203 | return nil 204 | } 205 | 206 | overallAgg := health.NewIntervalAggregation(aggregations[0].IntervalStart) 207 | for _, ia := range aggregations { 208 | overallAgg.Merge(ia) 209 | } 210 | return overallAgg 211 | } 212 | 213 | func getSort(r *web.Request) string { 214 | return r.URL.Query().Get("sort") 215 | } 216 | 217 | func getLimit(r *web.Request) int { 218 | limit := r.URL.Query().Get("limit") 219 | if limit == "" { 220 | return 0 221 | } 222 | 223 | n, err := strconv.ParseInt(limit, 10, 0) 224 | if err != nil { 225 | return 0 226 | } 227 | return int(n) 228 | } 229 | 230 | // By is the type of a "less" function that defines the ordering of its Planet arguments. 231 | type By func(j1, j2 *Job) bool 232 | 233 | // Sort is a method on the function type, By, that sorts the argument slice according to the function. 234 | func (by By) Sort(jobs []*Job) { 235 | js := &jobSorter{ 236 | jobs: jobs, 237 | by: by, // The Sort method's receiver is the function (closure) that defines the sort order. 238 | } 239 | sort.Sort(js) 240 | } 241 | 242 | // planetSorter joins a By function and a slice of Planets to be sorted. 243 | type jobSorter struct { 244 | jobs []*Job 245 | by By 246 | } 247 | 248 | // Len is part of sort.Interface. 249 | func (s *jobSorter) Len() int { 250 | return len(s.jobs) 251 | } 252 | 253 | // Swap is part of sort.Interface. 254 | func (s *jobSorter) Swap(i, j int) { 255 | s.jobs[i], s.jobs[j] = s.jobs[j], s.jobs[i] 256 | } 257 | 258 | // Less is part of sort.Interface. It is implemented by calling the "by" closure in the sorter. 259 | func (s *jobSorter) Less(i, j int) bool { 260 | return s.by(s.jobs[i], s.jobs[j]) 261 | } 262 | 263 | var jobSorters = map[string]By{ 264 | "name": func(j1, j2 *Job) bool { 265 | return j1.Name < j2.Name 266 | }, 267 | "count": func(j1, j2 *Job) bool { 268 | return j1.Count > j2.Count 269 | }, 270 | "count_success": func(j1, j2 *Job) bool { 271 | return j1.CountSuccess > j2.CountSuccess 272 | }, 273 | "count_validation_error": func(j1, j2 *Job) bool { 274 | return j1.CountValidationError > j2.CountValidationError 275 | }, 276 | "count_panic": func(j1, j2 *Job) bool { 277 | return j1.CountPanic > j2.CountPanic 278 | }, 279 | "count_error": func(j1, j2 *Job) bool { 280 | return j1.CountError > j2.CountError 281 | }, 282 | "count_junk": func(j1, j2 *Job) bool { 283 | return j1.CountJunk > j2.CountJunk 284 | }, 285 | "total_time": func(j1, j2 *Job) bool { 286 | return j1.NanosSum > j2.NanosSum 287 | }, 288 | "avg": func(j1, j2 *Job) bool { 289 | return j1.NanosAvg > j2.NanosAvg 290 | }, 291 | "min": func(j1, j2 *Job) bool { 292 | return j1.NanosMin > j2.NanosMin 293 | }, 294 | "max": func(j1, j2 *Job) bool { 295 | return j1.NanosMax > j2.NanosMax 296 | }, 297 | "stddev": func(j1, j2 *Job) bool { 298 | return j1.NanosStdDev > j2.NanosStdDev 299 | }, 300 | } 301 | 302 | func sortJobs(jobs []*Job, sort string) { 303 | if by, ok := jobSorters[sort]; ok { 304 | by.Sort(jobs) 305 | } 306 | } 307 | 308 | func filterJobs(overall *health.IntervalAggregation, sort string, limit int) []*Job { 309 | if overall == nil { 310 | return nil 311 | } 312 | jobs := make([]*Job, 0, len(overall.Jobs)) 313 | 314 | for k, j := range overall.Jobs { 315 | var avg, stddev float64 316 | if j.Count == 0 { 317 | avg = 0 318 | stddev = 0 319 | } else { 320 | avg = float64(j.NanosSum) / float64(j.Count) 321 | if j.Count == 1 { 322 | stddev 
= 0 323 | } else { 324 | num := (float64(j.Count) * j.NanosSumSquares) - math.Pow(float64(j.NanosSum), 2) 325 | div := float64(j.Count * (j.Count - 1)) 326 | stddev = math.Sqrt(num / div) 327 | } 328 | } 329 | job := &Job{ 330 | Name: k, 331 | Count: j.Count, 332 | CountSuccess: j.CountSuccess, 333 | CountValidationError: j.CountValidationError, 334 | CountPanic: j.CountPanic, 335 | CountError: j.CountError, 336 | CountJunk: j.CountJunk, 337 | NanosSum: j.NanosSum, 338 | NanosSumSquares: j.NanosSumSquares, 339 | NanosMin: j.NanosMin, 340 | NanosMax: j.NanosMax, 341 | NanosAvg: avg, 342 | NanosStdDev: stddev, 343 | } 344 | jobs = append(jobs, job) 345 | } 346 | 347 | sortJobs(jobs, sort) 348 | 349 | if limit > 0 { 350 | max := len(jobs) 351 | if limit > max { 352 | limit = max 353 | } 354 | jobs = jobs[0:limit] 355 | } 356 | 357 | return jobs 358 | } 359 | 360 | type HostStatusByHostPort []*HostStatus 361 | 362 | func (a HostStatusByHostPort) Len() int { return len(a) } 363 | func (a HostStatusByHostPort) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 364 | func (a HostStatusByHostPort) Less(i, j int) bool { return a[i].HostPort < a[j].HostPort } 365 | -------------------------------------------------------------------------------- /healthd/healthd.go: -------------------------------------------------------------------------------- 1 | package healthd 2 | 3 | import ( 4 | "fmt" 5 | "github.com/gocraft/health" 6 | "sort" 7 | "sync" 8 | "sync/atomic" 9 | "time" 10 | ) 11 | 12 | type HealthD struct { 13 | stream *health.Stream 14 | 15 | // How long is each aggregation interval. Eg, 1 minute 16 | intervalDuration time.Duration 17 | 18 | // Retain controls how many metrics interval we keep. Eg, 5 minutes 19 | retain time.Duration 20 | 21 | // maxIntervals is the maximum length of intervals. 22 | // It is retain / interval. 23 | maxIntervals int 24 | 25 | // These guys are the real aggregated deal 26 | intervalAggregations []*health.IntervalAggregation 27 | 28 | // let's keep the last 5 minutes worth of data from each host 29 | hostAggregations map[hostAggregationKey]*health.IntervalAggregation 30 | 31 | // intervalsNeedingRecalculation is a set of intervals that need to be recalculated. It is cleared when they are recalculated. 
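// It is populated in consumePollResponse whenever a host reports a new or changed
// interval (detected via SerialNumber), and drained by recalculateIntervals, which
// runs either once every host has reported in or when the 2-second debouncer fires.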
32 | intervalsNeedingRecalculation map[time.Time]struct{} 33 | 34 | // map from HostPort to status 35 | hostStatus map[string]*HostStatus 36 | 37 | intervalsChanChan chan chan []*health.IntervalAggregation 38 | hostsChanChan chan chan []*HostStatus 39 | 40 | stopFlag int64 41 | stopAggregator chan bool 42 | stopStopAggregator chan bool 43 | stopHTTP func() bool 44 | } 45 | 46 | type HostStatus struct { 47 | HostPort string `json:"host_port"` 48 | 49 | LastCheckTime time.Time `json:"last_check_time"` 50 | LastInstanceId string `json:"last_instance_id"` 51 | LastIntervalDuration time.Duration `json:"last_interval_duration"` 52 | LastErr string `json:"last_err"` 53 | LastNanos int64 `json:"last_nanos"` 54 | LastCode int `json:"last_code"` // http status code of last response 55 | 56 | FirstSuccessfulResponse time.Time `json:"first_successful_response"` 57 | LastSuccessfulResponse time.Time `json:"last_successful_response"` 58 | } 59 | 60 | type hostAggregationKey struct { 61 | Time time.Time 62 | InstanceId string 63 | HostPort string 64 | } 65 | 66 | func StartNewHealthD(monitoredHostPorts []string, serverHostPort string, stream *health.Stream) *HealthD { 67 | hd := &HealthD{} 68 | hd.stream = stream 69 | hd.intervalsChanChan = make(chan chan []*health.IntervalAggregation, 16) 70 | hd.hostsChanChan = make(chan chan []*HostStatus, 16) 71 | hd.hostStatus = make(map[string]*HostStatus) 72 | hd.hostAggregations = make(map[hostAggregationKey]*health.IntervalAggregation) 73 | hd.intervalsNeedingRecalculation = make(map[time.Time]struct{}) 74 | hd.retain = time.Hour * 2 // In the future this should be configurable 75 | hd.intervalDuration = 0 // We don't know this yet. Will be configured from polled hosts. 76 | hd.maxIntervals = 0 // We don't know this yet. See above. 77 | hd.stopAggregator = make(chan bool) 78 | hd.stopStopAggregator = make(chan bool) 79 | 80 | for _, hp := range monitoredHostPorts { 81 | hd.hostStatus[hp] = &HostStatus{ 82 | HostPort: hp, 83 | } 84 | } 85 | 86 | go hd.pollAndAggregate() 87 | 88 | httpStarted := make(chan bool) 89 | go hd.startHttpServer(serverHostPort, httpStarted) 90 | <-httpStarted 91 | 92 | return hd 93 | } 94 | 95 | func (hd *HealthD) Stop() { 96 | atomic.StoreInt64(&hd.stopFlag, 1) 97 | hd.stopAggregator <- true 98 | <-hd.stopStopAggregator 99 | hd.stopHTTP() 100 | } 101 | 102 | // shouldStop returns true if we've been flagged to stop 103 | func (hd *HealthD) shouldStop() bool { 104 | v := atomic.LoadInt64(&hd.stopFlag) 105 | return v == 1 106 | } 107 | 108 | func (hd *HealthD) pollAndAggregate() { 109 | ticker := time.Tick(10 * time.Second) 110 | 111 | responses := make(chan *pollResponse, 64) 112 | recalcIntervals := make(chan struct{}) 113 | recalcIntervalsRequest := make(chan struct{}, 64) 114 | intervalsChanChan := hd.intervalsChanChan 115 | hostsChanChan := hd.hostsChanChan 116 | 117 | go debouncer(recalcIntervals, recalcIntervalsRequest, time.Second*2, time.Millisecond*300) 118 | 119 | // Immediately poll for servers on healthd startup 120 | go hd.poll(responses) 121 | 122 | AGGREGATE_LOOP: 123 | for { 124 | // Usual flow: 125 | // 1. ticker ticks. Poll each host. 126 | // 2. Get responses in. Trigger debouncer 127 | // 3. If we get all responses quickly, we'll get a nil, and then recalc. 128 | // 4. The debouncer will fire in 2 seconds and do a partial calc or full recalc. 129 | // 5. Repeat 2-4 until all resonses are in and everything settles down. 130 | // At any time, we could get: 131 | // - A requset for metrics. 
We'll get a channel and send response back on that channel. 132 | // - A requset to shut down. 133 | select { 134 | case <-ticker: 135 | go hd.poll(responses) 136 | hd.purge() 137 | case resp := <-responses: 138 | if resp == nil { 139 | // nil is a sentinel value that is sent when all hosts have reported in. 140 | hd.recalculateIntervals() 141 | } else { 142 | hd.consumePollResponse(resp) 143 | recalcIntervalsRequest <- struct{}{} 144 | } 145 | case <-recalcIntervals: 146 | hd.recalculateIntervals() 147 | case intervalsChan := <-intervalsChanChan: 148 | intervalsChan <- hd.memorySafeIntervals() 149 | case hostsChan := <-hostsChanChan: 150 | hostsChan <- hd.memorySafeHosts() 151 | case <-hd.stopAggregator: 152 | hd.stopStopAggregator <- true 153 | break AGGREGATE_LOOP 154 | } 155 | } 156 | } 157 | 158 | // poll is meant to be alled in a new goroutine. 159 | // It will poll each managed host in a new goroutine. 160 | // When everything has finished, it will send nil to responses to signal that we have all data. 161 | func (hd *HealthD) poll(responses chan *pollResponse) { 162 | var wg sync.WaitGroup 163 | for _, hs := range hd.hostStatus { 164 | wg.Add(1) 165 | go func(hs *HostStatus) { 166 | defer wg.Done() 167 | poll(hd.stream, hs.HostPort, responses) 168 | }(hs) 169 | } 170 | wg.Wait() 171 | responses <- nil 172 | } 173 | 174 | func (hd *HealthD) getAggregationSequence() []*health.IntervalAggregation { 175 | if hd.shouldStop() { 176 | return nil 177 | } 178 | intervalsChan := make(chan []*health.IntervalAggregation) 179 | hd.intervalsChanChan <- intervalsChan 180 | return <-intervalsChan 181 | } 182 | 183 | func (hd *HealthD) getHosts() []*HostStatus { 184 | if hd.shouldStop() { 185 | return nil 186 | } 187 | hostsChan := make(chan []*HostStatus) 188 | hd.hostsChanChan <- hostsChan 189 | return <-hostsChan 190 | } 191 | 192 | func (agg *HealthD) memorySafeIntervals() []*health.IntervalAggregation { 193 | ret := make([]*health.IntervalAggregation, 0, len(agg.intervalAggregations)) 194 | 195 | for _, intAgg := range agg.intervalAggregations { 196 | ret = append(ret, intAgg.Clone()) 197 | } 198 | 199 | return ret 200 | } 201 | 202 | func (hd *HealthD) memorySafeHosts() []*HostStatus { 203 | ret := make([]*HostStatus, 0, len(hd.hostStatus)) 204 | 205 | for _, hs := range hd.hostStatus { 206 | var host = *hs // copy mem 207 | ret = append(ret, &host) 208 | } 209 | 210 | return ret 211 | } 212 | 213 | func (hd *HealthD) consumePollResponse(resp *pollResponse) { 214 | if hs, ok := hd.hostStatus[resp.HostPort]; ok { 215 | hs.LastCheckTime = resp.Timestamp 216 | hs.LastNanos = resp.Nanos 217 | hs.LastInstanceId = resp.InstanceId 218 | hs.LastIntervalDuration = resp.IntervalDuration 219 | hs.LastCode = resp.Code 220 | if resp.Err == nil { 221 | hs.LastErr = "" 222 | } else { 223 | hs.LastErr = resp.Err.Error() 224 | } 225 | 226 | if resp.Code == 200 && resp.Err == nil { 227 | if hs.FirstSuccessfulResponse.IsZero() { 228 | hs.FirstSuccessfulResponse = now() 229 | } 230 | hs.LastSuccessfulResponse = now() 231 | } 232 | } else { 233 | // BUG 234 | // TODO: log that we got an unknown hostPort. 
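// One possible shape for that logging (a sketch; it assumes *health.Stream
// exposes EventKv the way Job does):
//   hd.stream.EventKv("healthd.unknown_host_port", health.Kvs{"host_port": resp.HostPort})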
235 | } 236 | 237 | // Add resp to hostAggregations 238 | if resp.Code == 200 && resp.Err == nil { 239 | if hd.intervalDuration == 0 { 240 | hd.intervalDuration = resp.IntervalDuration // TODO: validate this 241 | hd.maxIntervals = int(hd.retain / hd.intervalDuration) 242 | } else if hd.intervalDuration != resp.IntervalDuration { 243 | fmt.Println("interval duration mismatch: agg.intervalDuration=", hd.intervalDuration, " but resp.IntervalDuration=", resp.IntervalDuration) 244 | return 245 | } 246 | 247 | for _, intAgg := range resp.IntervalAggregations { 248 | key := hostAggregationKey{ 249 | Time: intAgg.IntervalStart, 250 | InstanceId: resp.InstanceId, 251 | HostPort: resp.HostPort, 252 | } 253 | 254 | existingIntAgg, ok := hd.hostAggregations[key] 255 | if ok && existingIntAgg.SerialNumber == intAgg.SerialNumber { 256 | // ignore; we already have this data 257 | } else { 258 | hd.hostAggregations[key] = intAgg 259 | hd.intervalsNeedingRecalculation[intAgg.IntervalStart] = struct{}{} 260 | } 261 | } 262 | } 263 | } 264 | 265 | // purge purges old hostAggregations older than 5 intervals 266 | func (agg *HealthD) purge() { 267 | var threshold = agg.intervalDuration * 5 // NOTE: this is arbitrary. 268 | for k, _ := range agg.hostAggregations { 269 | if time.Since(k.Time) > threshold { 270 | delete(agg.hostAggregations, k) 271 | } 272 | } 273 | 274 | n := len(agg.intervalAggregations) 275 | if n > agg.maxIntervals { 276 | agg.intervalAggregations = agg.intervalAggregations[(n - agg.maxIntervals):] 277 | } 278 | } 279 | 280 | func (hd *HealthD) recalculateIntervals() { 281 | job := hd.stream.NewJob("recalculate") 282 | 283 | for k, _ := range hd.intervalsNeedingRecalculation { 284 | intAggsAtTime := []*health.IntervalAggregation{} 285 | 286 | for key, intAgg := range hd.hostAggregations { 287 | if key.Time == k { 288 | intAggsAtTime = append(intAggsAtTime, intAgg) 289 | } 290 | } 291 | 292 | overallAgg := health.NewIntervalAggregation(k) 293 | for _, ia := range intAggsAtTime { 294 | overallAgg.Merge(ia) 295 | } 296 | hd.setAggregation(overallAgg) 297 | } 298 | 299 | // Reset everything: 300 | hd.intervalsNeedingRecalculation = make(map[time.Time]struct{}) 301 | 302 | job.Complete(health.Success) 303 | } 304 | 305 | type ByInterval []*health.IntervalAggregation 306 | 307 | func (a ByInterval) Len() int { return len(a) } 308 | func (a ByInterval) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 309 | func (a ByInterval) Less(i, j int) bool { return a[i].IntervalStart.Before(a[j].IntervalStart) } 310 | 311 | func (agg *HealthD) setAggregation(intAgg *health.IntervalAggregation) { 312 | // If we already have the intAgg, replace it. 313 | for i, existingAgg := range agg.intervalAggregations { 314 | if existingAgg.IntervalStart == intAgg.IntervalStart { 315 | agg.intervalAggregations[i] = intAgg 316 | return 317 | } 318 | } 319 | 320 | // Otherwise, just append it and sort to get ordering right. 
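// (The slice is bounded by maxIntervals, see the truncation below, so sorting on
// every insert stays cheap.)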
321 | agg.intervalAggregations = append(agg.intervalAggregations, intAgg) 322 | sort.Sort(ByInterval(agg.intervalAggregations)) 323 | 324 | // If we have too many aggregations, truncate 325 | n := len(agg.intervalAggregations) 326 | if n > agg.maxIntervals { 327 | agg.intervalAggregations = agg.intervalAggregations[(n - agg.maxIntervals):] 328 | } 329 | } 330 | -------------------------------------------------------------------------------- /statsd_sink_test.go: -------------------------------------------------------------------------------- 1 | package health 2 | 3 | import ( 4 | "fmt" 5 | "github.com/stretchr/testify/assert" 6 | "net" 7 | "runtime" 8 | "strings" 9 | "sync" 10 | "testing" 11 | "time" 12 | ) 13 | 14 | var testAddr = "127.0.0.1:7890" 15 | 16 | func callerInfo() string { 17 | _, file, line, ok := runtime.Caller(2) 18 | if !ok { 19 | return "" 20 | } 21 | parts := strings.Split(file, "/") 22 | file = parts[len(parts)-1] 23 | return fmt.Sprintf("%s:%d", file, line) 24 | } 25 | 26 | func listenFor(t *testing.T, msgs []string, f func()) { 27 | c, err := net.ListenPacket("udp", testAddr) 28 | defer c.Close() 29 | assert.NoError(t, err) 30 | 31 | f() 32 | 33 | buf := make([]byte, 10000) 34 | for _, msg := range msgs { 35 | err = c.SetReadDeadline(time.Now().Add(1 * time.Millisecond)) 36 | assert.NoError(t, err) 37 | nbytes, _, err := c.ReadFrom(buf) 38 | assert.NoError(t, err) 39 | if err == nil { 40 | gotMsg := string(buf[0:nbytes]) 41 | if gotMsg != msg { 42 | t.Errorf("Expected UPD packet %s but got %s\n", msg, gotMsg) 43 | } 44 | } 45 | } 46 | } 47 | 48 | func TestStatsDSinkPeriodicPurge(t *testing.T) { 49 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 50 | assert.NoError(t, err) 51 | 52 | // Stop the sink, set a smaller flush period, and start it agian 53 | sink.Stop() 54 | sink.flushPeriod = 1 * time.Millisecond 55 | go sink.loop() 56 | defer sink.Stop() 57 | 58 | listenFor(t, []string{"metroid.my.event:1|c\nmetroid.my.job.my.event:1|c\n"}, func() { 59 | sink.EmitEvent("my.job", "my.event", nil) 60 | time.Sleep(10 * time.Millisecond) 61 | }) 62 | } 63 | 64 | func TestStatsDSinkPacketLimit(t *testing.T) { 65 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid", SkipNestedEvents: true}) 66 | assert.NoError(t, err) 67 | 68 | // s is 101 bytes 69 | s := "metroid." 
+ strings.Repeat("a", 88) + ":1|c\n" 70 | 71 | // expect 1 packet that is 14*101=1414 bytes, and the next one to be 101 bytes 72 | listenFor(t, []string{strings.Repeat(s, 14), s}, func() { 73 | for i := 0; i < 15; i++ { 74 | sink.EmitEvent("my.job", strings.Repeat("a", 88), nil) 75 | } 76 | 77 | sink.Drain() 78 | }) 79 | } 80 | 81 | func TestStatsDSinkEmitEventPrefix(t *testing.T) { 82 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 83 | defer sink.Stop() 84 | assert.NoError(t, err) 85 | listenFor(t, []string{"metroid.my.event:1|c\nmetroid.my.job.my.event:1|c\n"}, func() { 86 | sink.EmitEvent("my.job", "my.event", nil) 87 | sink.Drain() 88 | }) 89 | } 90 | 91 | func TestStatsDSinkEmitEventShouldSanitize(t *testing.T) { 92 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 93 | defer sink.Stop() 94 | assert.NoError(t, err) 95 | listenFor(t, []string{"metroid.my$event:1|c\nmetroid.my$job.my$event:1|c\n"}, func() { 96 | sink.EmitEvent("my|job", "my:event", nil) 97 | sink.Drain() 98 | }) 99 | } 100 | 101 | func TestStatsDSinkEmitEventNoPrefix(t *testing.T) { 102 | sink, err := NewStatsDSink(testAddr, nil) 103 | defer sink.Stop() 104 | assert.NoError(t, err) 105 | listenFor(t, []string{"my.event:1|c\nmy.job.my.event:1|c\n"}, func() { 106 | sink.EmitEvent("my.job", "my.event", nil) 107 | sink.Drain() 108 | }) 109 | } 110 | 111 | func TestStatsDSinkEmitEventSkipNested(t *testing.T) { 112 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{SkipNestedEvents: true}) 113 | defer sink.Stop() 114 | assert.NoError(t, err) 115 | listenFor(t, []string{"my.event:1|c\n"}, func() { 116 | sink.EmitEvent("my.job", "my.event", nil) 117 | sink.Drain() 118 | }) 119 | } 120 | 121 | func TestStatsDSinkEmitEventSkipTopLevel(t *testing.T) { 122 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{SkipTopLevelEvents: true}) 123 | defer sink.Stop() 124 | assert.NoError(t, err) 125 | listenFor(t, []string{"my.job.my.event:1|c\n"}, func() { 126 | sink.EmitEvent("my.job", "my.event", nil) 127 | sink.Drain() 128 | }) 129 | } 130 | 131 | func TestStatsDSinkEmitEventErrPrefix(t *testing.T) { 132 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 133 | defer sink.Stop() 134 | assert.NoError(t, err) 135 | listenFor(t, []string{"metroid.my.event.error:1|c\nmetroid.my.job.my.event.error:1|c\n"}, func() { 136 | sink.EmitEventErr("my.job", "my.event", testErr, nil) 137 | sink.Drain() 138 | }) 139 | } 140 | 141 | func TestStatsDSinkEmitEventErrNoPrefix(t *testing.T) { 142 | sink, err := NewStatsDSink(testAddr, nil) 143 | defer sink.Stop() 144 | assert.NoError(t, err) 145 | listenFor(t, []string{"my.event.error:1|c\nmy.job.my.event.error:1|c\n"}, func() { 146 | sink.EmitEventErr("my.job", "my.event", testErr, nil) 147 | sink.Drain() 148 | }) 149 | } 150 | 151 | func TestStatsDSinkEmitEventErrSkipNested(t *testing.T) { 152 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{SkipNestedEvents: true}) 153 | defer sink.Stop() 154 | assert.NoError(t, err) 155 | listenFor(t, []string{"my.event.error:1|c\n"}, func() { 156 | sink.EmitEventErr("my.job", "my.event", testErr, nil) 157 | sink.Drain() 158 | }) 159 | } 160 | 161 | func TestStatsDSinkEmitEventErrSkipTopLevel(t *testing.T) { 162 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{SkipTopLevelEvents: true}) 163 | defer sink.Stop() 164 | assert.NoError(t, err) 165 | listenFor(t, []string{"my.job.my.event.error:1|c\n"}, func() { 166 | sink.EmitEventErr("my.job", "my.event", 
testErr, nil) 167 | sink.Drain() 168 | }) 169 | } 170 | 171 | func TestStatsDSinkEmitTimingPrefix(t *testing.T) { 172 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 173 | defer sink.Stop() 174 | assert.NoError(t, err) 175 | listenFor(t, []string{"metroid.my.event:123|ms\nmetroid.my.job.my.event:123|ms\n"}, func() { 176 | sink.EmitTiming("my.job", "my.event", 123456789, nil) 177 | sink.Drain() 178 | }) 179 | } 180 | 181 | func TestStatsDSinkEmitTimingNoPrefix(t *testing.T) { 182 | sink, err := NewStatsDSink(testAddr, nil) 183 | defer sink.Stop() 184 | assert.NoError(t, err) 185 | listenFor(t, []string{"my.event:123|ms\nmy.job.my.event:123|ms\n"}, func() { 186 | sink.EmitTiming("my.job", "my.event", 123456789, nil) 187 | sink.Drain() 188 | }) 189 | } 190 | 191 | func TestStatsDSinkEmitTimingSkipNested(t *testing.T) { 192 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{SkipNestedEvents: true}) 193 | defer sink.Stop() 194 | assert.NoError(t, err) 195 | listenFor(t, []string{"my.event:123|ms\n"}, func() { 196 | sink.EmitTiming("my.job", "my.event", 123456789, nil) 197 | sink.Drain() 198 | }) 199 | } 200 | 201 | func TestStatsDSinkEmitTimingSkipTopLevel(t *testing.T) { 202 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{SkipTopLevelEvents: true}) 203 | defer sink.Stop() 204 | assert.NoError(t, err) 205 | listenFor(t, []string{"my.job.my.event:123|ms\n"}, func() { 206 | sink.EmitTiming("my.job", "my.event", 123456789, nil) 207 | sink.Drain() 208 | }) 209 | } 210 | 211 | func TestStatsDSinkEmitTimingShort(t *testing.T) { 212 | sink, err := NewStatsDSink(testAddr, nil) 213 | defer sink.Stop() 214 | assert.NoError(t, err) 215 | listenFor(t, []string{"my.event:1.23|ms\nmy.job.my.event:1.23|ms\n"}, func() { 216 | sink.EmitTiming("my.job", "my.event", 1234567, nil) 217 | sink.Drain() 218 | }) 219 | } 220 | 221 | func TestStatsDSinkEmitGaugePrefix(t *testing.T) { 222 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 223 | defer sink.Stop() 224 | assert.NoError(t, err) 225 | listenFor(t, []string{"metroid.my.event:3.14|g\nmetroid.my.job.my.event:3.14|g\n"}, func() { 226 | sink.EmitGauge("my.job", "my.event", 3.14, nil) 227 | sink.Drain() 228 | }) 229 | } 230 | 231 | func TestStatsDSinkEmitGaugeSmall(t *testing.T) { 232 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid", SkipNestedEvents: true}) 233 | defer sink.Stop() 234 | assert.NoError(t, err) 235 | listenFor(t, []string{"metroid.my.event:0.14|g\nmetroid.my.event:0.0401|g\nmetroid.my.event:-0.0001|g\n"}, func() { 236 | sink.EmitGauge("my.job", "my.event", 0.1401, nil) 237 | sink.EmitGauge("my.job", "my.event", 0.0401, nil) 238 | sink.EmitGauge("my.job", "my.event", -0.0001, nil) 239 | sink.Drain() 240 | }) 241 | } 242 | 243 | func TestStatsDSinkEmitGaugeNoPrefix(t *testing.T) { 244 | sink, err := NewStatsDSink(testAddr, nil) 245 | defer sink.Stop() 246 | assert.NoError(t, err) 247 | listenFor(t, []string{"my.event:3.00|g\nmy.job.my.event:3.00|g\n"}, func() { 248 | sink.EmitGauge("my.job", "my.event", 3, nil) 249 | sink.Drain() 250 | }) 251 | } 252 | 253 | func TestStatsDSinkEmitGaugeSkipNested(t *testing.T) { 254 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{SkipNestedEvents: true}) 255 | defer sink.Stop() 256 | assert.NoError(t, err) 257 | listenFor(t, []string{"my.event:3.00|g\n"}, func() { 258 | sink.EmitGauge("my.job", "my.event", 3, nil) 259 | sink.Drain() 260 | }) 261 | } 262 | 263 | func TestStatsDSinkEmitGaugeSkipTopLevel(t 
*testing.T) { 264 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{SkipTopLevelEvents: true}) 265 | defer sink.Stop() 266 | assert.NoError(t, err) 267 | listenFor(t, []string{"my.job.my.event:3.00|g\n"}, func() { 268 | sink.EmitGauge("my.job", "my.event", 3, nil) 269 | sink.Drain() 270 | }) 271 | } 272 | 273 | func TestStatsDSinkEmitCompletePrefix(t *testing.T) { 274 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 275 | defer sink.Stop() 276 | assert.NoError(t, err) 277 | for kind, kindStr := range completionStatusToString { 278 | str := fmt.Sprintf("metroid.my.job.%s:129|ms\n", kindStr) 279 | listenFor(t, []string{str}, func() { 280 | sink.EmitComplete("my.job", kind, 129456789, nil) 281 | sink.Drain() 282 | }) 283 | } 284 | } 285 | 286 | func TestStatsDSinkEmitCompleteNoPrefix(t *testing.T) { 287 | sink, err := NewStatsDSink(testAddr, nil) 288 | defer sink.Stop() 289 | assert.NoError(t, err) 290 | for kind, kindStr := range completionStatusToString { 291 | str := fmt.Sprintf("my.job.%s:129|ms\n", kindStr) 292 | listenFor(t, []string{str}, func() { 293 | sink.EmitComplete("my.job", kind, 129456789, nil) 294 | sink.Drain() 295 | }) 296 | } 297 | } 298 | 299 | func TestStatsDSinkEmitTimingSubMillisecond(t *testing.T) { 300 | sink, err := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 301 | defer sink.Stop() 302 | assert.NoError(t, err) 303 | listenFor(t, []string{"metroid.my.event:0.46|ms\nmetroid.my.job.my.event:0.46|ms\n"}, func() { 304 | sink.EmitTiming("my.job", "my.event", 456789, nil) 305 | sink.Drain() 306 | }) 307 | } 308 | 309 | func BenchmarkStatsDSinkProcessEvent(b *testing.B) { 310 | sink, _ := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 311 | sink.Stop() // Don't do periodic things while we're benching 312 | 313 | b.ResetTimer() 314 | for i := 0; i < b.N; i++ { 315 | sink.processEvent("myjob", "myevent") 316 | } 317 | } 318 | 319 | func BenchmarkStatsDSinkProcessEventErr(b *testing.B) { 320 | sink, _ := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 321 | sink.Stop() // Don't do periodic things while we're benching 322 | 323 | b.ResetTimer() 324 | for i := 0; i < b.N; i++ { 325 | sink.processEventErr("myjob", "myevent") 326 | } 327 | } 328 | 329 | func BenchmarkStatsDSinkProcessTimingBig(b *testing.B) { 330 | sink, _ := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 331 | sink.Stop() // Don't do periodic things while we're benching 332 | 333 | b.ResetTimer() 334 | for i := 0; i < b.N; i++ { 335 | sink.processTiming("myjob", "myevent", 30000000) 336 | } 337 | } 338 | 339 | func BenchmarkStatsDSinkProcessTimingSmall(b *testing.B) { 340 | sink, _ := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 341 | sink.Stop() // Don't do periodic things while we're benching 342 | 343 | b.ResetTimer() 344 | for i := 0; i < b.N; i++ { 345 | sink.processTiming("myjob", "myevent", 1230000) 346 | } 347 | } 348 | 349 | func BenchmarkStatsDSinkProcessGauge(b *testing.B) { 350 | sink, _ := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 351 | sink.Stop() // Don't do periodic things while we're benching 352 | 353 | b.ResetTimer() 354 | for i := 0; i < b.N; i++ { 355 | sink.processGauge("myjob", "myevent", 3.14) 356 | } 357 | } 358 | 359 | func BenchmarkStatsDSinkProcessComplete(b *testing.B) { 360 | sink, _ := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 361 | sink.Stop() // Don't do periodic things while we're benching 362 | 363 | b.ResetTimer() 364 
| for i := 0; i < b.N; i++ { 365 | sink.processComplete("myjob", Success, 1230000) 366 | } 367 | } 368 | 369 | func BenchmarkStatsDSinkOverall(b *testing.B) { 370 | const numGoroutines = 100 371 | var requestsPerGoroutine = b.N / numGoroutines 372 | 373 | stream := NewStream() 374 | sink, _ := NewStatsDSink(testAddr, &StatsDSinkOptions{Prefix: "metroid"}) 375 | stream.AddSink(sink) 376 | job := stream.NewJob("foo") 377 | 378 | wg := sync.WaitGroup{} 379 | for i := 0; i < numGoroutines; i++ { 380 | wg.Add(1) 381 | go func() { 382 | for j := 0; j < requestsPerGoroutine; j++ { 383 | job.Event("evt") 384 | } 385 | wg.Done() 386 | }() 387 | } 388 | 389 | wg.Wait() 390 | sink.Drain() 391 | } 392 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gocraft/health [![GoDoc](https://godoc.org/github.com/gocraft/health?status.png)](https://godoc.org/github.com/gocraft/health) 2 | 3 | gocraft/health allows you to instrument your service for logging and metrics, and then send that instrumentation to log files, StatsD, Bugsnag, or to be polled and aggregated via a JSON API. 4 | 5 | gocraft/health also ships with a New Relic-like aggregator (called healthd) that shows you your slowest endpoints, top error producers, top throughput endpoints, and so on. 6 | 7 | ## Instrumenting your service 8 | 9 | ### Make a new stream with sinks 10 | 11 | First, you'll want to make a new Stream and attach your sinks to it. Streams are commonly saved in a global variable. 12 | 13 | ```go 14 | import ( 15 | "github.com/gocraft/health" 16 | "github.com/gocraft/health/sinks/bugsnag" 17 | "os" 18 | ) 19 | 20 | // Save the stream as a global variable 21 | var stream = health.NewStream() 22 | 23 | // In your main func, initialize the stream with your sinks. 24 | func main() { 25 | // Log to stdout! (can also use WriterSink to write to a log file, Syslog, etc) 26 | stream.AddSink(&health.WriterSink{os.Stdout}) 27 | 28 | // Log to StatsD! 29 | statsdSink, err := health.NewStatsDSink("127.0.0.1:8125", &health.StatsDSinkOptions{Prefix: "myapp"}) 30 | if err != nil { 31 | stream.EventErr("new_statsd_sink", err) 32 | return 33 | } 34 | stream.AddSink(statsdSink) 35 | 36 | // Expose instrumentation in this app on a JSON endpoint that healthd can poll! 37 | sink := health.NewJsonPollingSink(time.Minute, time.Minute*5) 38 | stream.AddSink(sink) 39 | sink.StartServer(addr) 40 | 41 | // Send errors to bugsnag! 42 | stream.AddSink(bugsnag.NewSink(&bugsnag.Config{APIKey: "myApiKey"})) 43 | 44 | // Now that your stream is set up, start a web server or something... 45 | } 46 | ``` 47 | 48 | ### Jobs 49 | 50 | gocraft/health excels at instrumenting services that perform *jobs*. Examples of jobs: serving an HTTP request, serving an RPC request, or processing a message from a work queue. Jobs are encoded semantically into gocraft/health in order to provide out-of-the-box answers to questions like, "what is my slowest endpoint?" 51 | 52 | Jobs serve three functions: 53 | * Jobs record a timing (eg, it took 21ms to complete this job) 54 | * Jobs record a status (eg, did the job complete successfully or was there an error?) 55 | * Jobs group instrumentation inside that job together so that you can analyze it later. 56 | 57 | Let's say you're writing a web service that processes JSON requests/responses.
You might write something like this: 58 | 59 | ```go 60 | import ( 61 | "github.com/gocraft/health" 62 | "net/http" 63 | ) 64 | var stream = health.NewStream() 65 | func main() { 66 | // set up the stream with sinks 67 | stream.AddSink(&health.WriterSink{os.Stdout}) 68 | http.HandleFunc("/users", getUsers) 69 | } 70 | 71 | func getUsers(rw http.ResponseWriter, r *http.Request) { 72 | // All logging and instrumentation should be within the context of a job! 73 | job := stream.NewJob("get_users") 74 | 75 | err := fetchUsersFromDatabase(r) 76 | if err != nil { 77 | // When in your job's context, you can log errors, events, timings, etc. 78 | job.EventErr("fetch_user_from_database", err) 79 | } 80 | 81 | // When done with the job, call job.Complete with a completion status. 82 | if err == nil { 83 | job.Complete(health.Success) 84 | } else { 85 | job.Complete(health.Error) 86 | } 87 | } 88 | 89 | ``` 90 | 91 | (This example is just used for illustration -- in practice, you'll probably want to use middleware to create your job if you have more than a few endpoints. A minimal middleware sketch follows the status list below.) 92 | 93 | There are five types of completion statuses: 94 | * **Success** - Your job completed successfully. 95 | * **Error** - Some library call resulted in an error that prevented you from successfully completing your job. 96 | * **Panic** - Some code panicked! 97 | * **ValidationError** - Your code was fine, but the user passed in bad inputs, and so the job wasn't completed successfully. 98 | * **Junk** - The job wasn't completed successfully, but not really because of an Error or ValidationError. For instance, maybe there's just a 404 (not found) or 401 (unauthorized) request to your app. This status code might not apply to all apps. 99 |
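As mentioned above, most apps create jobs in middleware rather than in each handler. Here is a minimal sketch of what that can look like with net/http. The `withJob` wrapper, the route-to-job naming, and the panic handling are illustrative assumptions, not part of gocraft/health itself:

```go
package main

import (
	"fmt"
	"net/http"
	"os"

	"github.com/gocraft/health"
)

var stream = health.NewStream()

// withJob wraps a handler so every request runs inside its own job.
// (Hypothetical helper for illustration only.)
func withJob(jobName string, h http.HandlerFunc) http.HandlerFunc {
	return func(rw http.ResponseWriter, r *http.Request) {
		job := stream.NewJob(jobName)
		defer func() {
			if p := recover(); p != nil {
				job.EventErr("panic", fmt.Errorf("%v", p))
				job.Complete(health.Panic)
				rw.WriteHeader(http.StatusInternalServerError)
			}
		}()
		h(rw, r)
		// A real middleware would pick Error/ValidationError/Junk based on the
		// response or a shared context; Success keeps the sketch short.
		job.Complete(health.Success)
	}
}

func main() {
	stream.AddSink(&health.WriterSink{os.Stdout})
	http.HandleFunc("/users", withJob("get_users", getUsers))
	http.ListenAndServe(":8080", nil)
}

func getUsers(rw http.ResponseWriter, r *http.Request) {
	// Handlers only log events/errors; timing and completion come from the middleware.
	rw.Write([]byte("[]"))
}
```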
100 | ### Events, Timings, Gauges, and Errors 101 | 102 | Within jobs, you can emit events, timings, gauges, and errors. The first argument of each of these methods is supposed to be a *key*. Camel case with dots is good because it works with other metrics stores like StatsD. Each method has a basic version as well as a version that accepts keys/values. 103 | 104 | #### Events 105 | 106 | ```go 107 | // Events. Notice the camel case with dots. 108 | // (This is helpful when you want to use StatsD sinks) 109 | job.Event("starting_server") 110 | job.Event("process_user.by_email.gmail") 111 | 112 | // Event with keys and values: 113 | job.EventKv("failover.started", health.Kvs{"from_ip": fmt.Sprint(currentIP)}) 114 | ``` 115 | 116 | * For the WriterSink, an event is just like logging to a file: 117 | ``` 118 | [2015-03-11T22:53:22.115855203Z]: job:/api/v2/user_stories event:starting_request kvs:[path:/api/v2/user_stories request-id:F8a8bQOWmRpO6ky] 119 | ``` 120 | 121 | * For the StatsD sink (and other metrics sinks), an event is like incrementing a counter. 122 | 123 | #### Timings 124 | 125 | ```go 126 | // Timings: 127 | startTime := time.Now() 128 | // Do something... 129 | job.Timing("fetch_user", time.Since(startTime).Nanoseconds()) // NOTE: Nanoseconds! 130 | 131 | // Timings also support keys/values: 132 | job.TimingKv("fetch_user", time.Since(startTime).Nanoseconds(), 133 | health.Kvs{"user_email": userEmail}) 134 | ``` 135 | 136 | * NOTE: All timing values are in nanoseconds. 137 | * For the WriterSink, a timing is just like logging to a file: 138 | ``` 139 | [2014-12-17T20:36:24.136663759Z]: job:/api/v2/user_stories event:dbr.select time:371 μs kvs:[request-id:F8a8bQOWmRpO6ky sql:SELECT COUNT(*) FROM user_stories WHERE (subdomain_id = 1221) AND (deleted_at IS NULL) AND (ticket_id IN (38327))] 140 | ``` 141 | 142 | * For the StatsD sink, we'll send it to StatsD as a timing. 143 | * The JSON polling sink will compute a summary of your timings: min, max, avg, stddev, count, sum. 144 |
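If you want the timing recorded on every return path (including early error returns), you can defer it. This is plain Go rather than a health feature; `fetch_user` is just the example key from above:

```go
// Deferred timing: recorded no matter how the function returns.
startTime := time.Now()
defer func() {
	job.Timing("fetch_user", time.Since(startTime).Nanoseconds())
}()

// ... do the work; an early return still records the timing ...
```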
145 | #### Gauges 146 | 147 | ```go 148 | // Gauges: 149 | job.Gauge("num_goroutines", numRunningGoroutines()) 150 | 151 | // Gauges also support keys/values: 152 | job.GaugeKv("num_goroutines", numRunningGoroutines(), 153 | health.Kvs{"dispatcher": dispatcherStatus()}) 154 | ``` 155 | 156 | * For the WriterSink, a gauge is just like logging to a file: 157 | ``` 158 | [2014-12-17T20:36:24.136663759Z]: job:/api/v2/user_stories event:num_goroutines gauge:17 kvs:[request-id:F8a8bQOWmRpO6ky dispatcher:running] 159 | ``` 160 | 161 | * For the StatsD sink, we'll send it to StatsD as a gauge. 162 | 163 | #### Errors 164 | 165 | ```go 166 | // Errors: 167 | err := someFunc(user.Email) 168 | if err != nil { 169 | return job.EventErr("some_func", err) 170 | } 171 | 172 | // And with keys/values: 173 | job.EventErrKv("some_func", err, health.Kvs{"email": user.Email}) 174 | ``` 175 | 176 | * For the WriterSink, an error will just be logged to the file with the error: 177 | ``` 178 | job:/api/v2/user_stories event:load_session.populate err:not_found kvs:[request-id:F8a8bQOWmRpO6ky] 179 | ``` 180 | 181 | * For metrics sinks, errors are just like events. 182 | * The JSON polling sink and healthd will let you see which errors are trending. 183 | * For the Bugsnag sink, we'll push each error to bugsnag. 184 | 185 | Errors will capture a stacktrace by default so that you can diagnose it in things like Bugsnag. If an error is common or not worth sending to something like Bugsnag, you can mute it. This will cause health to not capture a stack trace or send it to bugsnag: 186 | 187 | ```go 188 | i, err := strconv.ParseInt(userInput, 10, 0) 189 | if err != nil { 190 | // Mute this error! It's pretty common and 191 | // does not indicate a problem with our code! 192 | job.EventErr("myfunc.parse_int", health.Mute(err)) 193 | i = 2 // We have a default anyway. No big deal. 194 | } 195 | ``` 196 | 197 | Since error handling is so prevalent in Go code, you'll have situations where multiple functions have the option of logging the same root error. The best practice that we've identified is to just not think about it and log it on every level of the call stack. Keep in mind that gocraft/health will handle this intelligently and only send one error to Bugsnag, have a correct root backtrace, and so on. 198 | 199 | ```go 200 | func showUser(ctx *Context) error { 201 | user, err := ctx.getUser() 202 | if err != nil { 203 | // But we'll just log it here too! 204 | return ctx.EventErr("show_user.get_user", err) 205 | } 206 | } 207 | 208 | func getUser(ctx *Context) (*User, error) { 209 | var u User 210 | err := ctx.db.Select("SELECT * FROM users WHERE id = ?", ctx.userID).LoadStruct(&u) 211 | if err != nil { 212 | // Original error is here: 213 | return nil, ctx.EventErr("get_user.select", err) 214 | } 215 | return &u, nil 216 | } 217 | ``` 218 | 219 | ### Keys and Values 220 | 221 | Most objects and methods in health work with key/value pairs. Key/value pairs are just maps of strings to strings. Keys and values are only relevant right now for logging sinks: the keys and values will be printed on each line written. 222 | 223 | You can add keys/values to a stream. This is useful for things like hostname or pid. The keys/values will show up on every future event/timing/error. 224 | ```go 225 | stream := health.NewStream() 226 | stream.KeyValue("hostname", hostname) 227 | stream.KeyValue("pid", pid) 228 | ``` 229 | 230 | You can add keys/values to a job. This is useful for things like a request-id or the current user id: 231 | ```go 232 | job.KeyValue("request_id", makeRequestID()) 233 | if user != nil { 234 | job.KeyValue("user_id", fmt.Sprint(user.ID)) 235 | } 236 | ``` 237 | 238 | And as previously discussed, each individual event/timing/error can have its own keys and values. 239 | 240 | ### Writing your own Sink 241 | 242 | If you need a custom sink, you can just implement the Sink interface: 243 | 244 | ```go 245 | type Sink interface { 246 | EmitEvent(job string, event string, kvs map[string]string) 247 | EmitEventErr(job string, event string, err error, kvs map[string]string) 248 | EmitTiming(job string, event string, nanoseconds int64, kvs map[string]string) 249 | EmitGauge(job string, event string, value float64, kvs map[string]string) 250 | EmitComplete(job string, status CompletionStatus, nanoseconds int64, kvs map[string]string) 251 | } 252 | ``` 253 |
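For example, here is a minimal sketch of a custom sink that just counts everything emitted to it. The `CountingSink` name and the atomic counter are illustrative choices, not part of health:

```go
import (
	"sync/atomic"

	"github.com/gocraft/health"
)

// CountingSink counts every emission it receives.
type CountingSink struct {
	count int64
}

func (s *CountingSink) EmitEvent(job string, event string, kvs map[string]string) {
	atomic.AddInt64(&s.count, 1)
}

func (s *CountingSink) EmitEventErr(job string, event string, err error, kvs map[string]string) {
	atomic.AddInt64(&s.count, 1)
}

func (s *CountingSink) EmitTiming(job string, event string, nanoseconds int64, kvs map[string]string) {
	atomic.AddInt64(&s.count, 1)
}

func (s *CountingSink) EmitGauge(job string, event string, value float64, kvs map[string]string) {
	atomic.AddInt64(&s.count, 1)
}

func (s *CountingSink) EmitComplete(job string, status health.CompletionStatus, nanoseconds int64, kvs map[string]string) {
	atomic.AddInt64(&s.count, 1)
}
```

Attach it like any other sink: `stream.AddSink(&CountingSink{})`.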
254 | If you do implement a custom sink that you think would be valuable to other people, I'd be interested in including it in this package. Get in touch via an issue or send a pull request. 255 | 256 | ### Miscellaneous logging 257 | 258 | If you need to, you can log via a stream directly without creating a job. This will emit events under a job named 'general'. This is useful during application initialization: 259 | 260 | ```go 261 | stream := health.NewStream() 262 | stream.EventKv("starting_app", health.Kvs{"listen_ip": listenIP}) 263 | ``` 264 | 265 | ## healthd and healthtop 266 | 267 | We've built a set of tools to give you New Relic-like application performance monitoring for your Go app. It can show you things like your slowest endpoints, top error producers, top throughput endpoints, and so on. 268 | 269 | These tools are completely optional -- health is super useful without them. But with them, it becomes even better. 270 | 271 | 272 | ![Healthtop Screenshot](https://gocraft.github.io/health/images/healthtop.png) 273 | 274 | ### Add a JsonPollingSink to your stream 275 | 276 | ```go 277 | // Make sink and add it to stream: 278 | sink := health.NewJsonPollingSink(time.Minute, time.Minute*5) 279 | stream.AddSink(sink) 280 | 281 | // Start the HTTP server! This will expose metrics via a JSON API. 282 | // NOTE: this won't interfere with your main app (if it also serves HTTP), 283 | // since it starts a separate net/http server. 284 | // In prod, addr should be a private network interface and port, like "10.2.1.4:5020" 285 | // In local dev, it can be something like "127.0.0.1:5020" 286 | sink.StartServer(addr) 287 | ``` 288 | 289 | Once you start your app, you can browse to the ```/health``` endpoint (eg, ```127.0.0.1:5020/health```) to see your metrics. Per the initialization options above, your metrics are aggregated in 1-minute chunks. We'll keep 5 minutes worth of data in memory. Nothing is ever persisted to disk. 290 | 291 | 292 | ### Start healthd 293 | 294 | healthd will poll multiple services that are exposing a ```/health``` endpoint and aggregate that data. It will then expose that data via its own JSON API. You can query the healthd API to answer questions like 'what are my slowest endpoints?' 295 | 296 | Install the healthd binary: 297 | 298 | ```bash 299 | go get github.com/gocraft/health/cmd/healthd 300 | ``` 301 | 302 | Now you can run it. It accepts two main inputs as environment variables: 303 | 304 | * **HEALTHD_MONITORED_HOSTPORTS**: comma separated list of hostports that represent your services running the JsonPollingSink. Example: ```HEALTHD_MONITORED_HOSTPORTS=10.18.23.130:5020,10.18.23.131:5020``` 305 | * **HEALTHD_SERVER_HOSTPORT**: interface and port where you want to expose the healthd endpoints. Example: ```HEALTHD_SERVER_HOSTPORT=10.18.23.132:5032``` 306 | 307 | Putting those together: 308 | ```bash 309 | HEALTHD_MONITORED_HOSTPORTS=10.18.23.130:5020,10.18.23.131:5020 HEALTHD_SERVER_HOSTPORT=10.18.23.132:5032 healthd 310 | ``` 311 | 312 | Of course, in local development mode, you can do something like this: 313 | ```bash 314 | HEALTHD_MONITORED_HOSTPORTS=:5020 HEALTHD_SERVER_HOSTPORT=:5032 healthd 315 | ``` 316 | 317 | Great! To get a sense of the type of data healthd serves, you can manually navigate to: 318 | 319 | * ```/jobs```: Lists top jobs 320 | * ```/aggregations```: Provides a time series of aggregations 321 | * ```/aggregations/overall```: Squishes all time series aggregations into one aggregation. 322 | * ```/hosts```: Lists all monitored hosts and their statuses. 323 | 324 | However, viewing raw JSON is just to give you a sense of the data. See the next section... 325 | 326 | ### Use healthtop to query healthd 327 | 328 | healthtop is a command-line tool that repeatedly queries a healthd and displays the results. 329 | 330 | Install the healthtop binary: 331 | 332 | ```bash 333 | go get github.com/gocraft/health/cmd/healthtop 334 | ``` 335 | 336 | See your top jobs: 337 | 338 | ```bash 339 | healthtop jobs 340 | ``` 341 | 342 | ![Healthtop Screenshot](https://gocraft.github.io/health/images/healthtop.png) 343 | 344 | (By default, healthtop will query healthd on localhost:5032 -- if this is not the case, you can use the source option: ```healthtop --source=10.28.3.132:5032 jobs```) 345 | 346 | You can sort your top jobs by a variety of things: 347 | 348 | ```bash 349 | $ healthtop jobs --sort 350 | Error: flag needs an argument: --sort 351 | Usage of jobs: 352 | -h, --help=false: help for jobs 353 | --name="": name is a partial match on the name 354 | --sort="name": sort ∈ {name, count, count_success, count_XXX, min, max, avg} 355 | --source="localhost:5032": source is the host:port of the healthd to query. ex: localhost:5031 356 | 357 | $ healthtop jobs --sort=count_error 358 | ``` 359 | 360 | 361 | See your hosts: 362 | 363 | ```bash 364 | healthtop hosts 365 | ``` 366 | 367 | ![Healthtop Screenshot](https://gocraft.github.io/health/images/healthtop_hosts.png) 368 | 369 | To get help: 370 | 371 | ```bash 372 | healthtop help 373 | ``` 374 | 375 | ## Current Status and Contributing 376 | 377 | Currently, the core instrumentation component is very solid. Healthd is good. healthtop is functional but could use some love. 378 | 379 | Request for contributions: 380 | 381 | health core: 382 | 383 | * A way to do fine-grained histograms with variable binning.
384 | 385 | healthd & healthtop 386 | 387 | * A web UI that is built into healthd 388 | * Keep track of multiple service types so that we can use one healthd to monitor multiple types of applications 389 | * Ability to drill into specific jobs to see top errors 390 | * tests 391 | * general love 392 | 393 | If anything here interests you, let me know by opening an issue and we can collaborate on it. 394 | 395 | ## gocraft 396 | 397 | gocraft offers a toolkit for building web apps. Currently these packages are available: 398 | 399 | * [gocraft/web](https://github.com/gocraft/web) - Go Router + Middleware. Your Contexts. 400 | * [gocraft/dbr](https://github.com/gocraft/dbr) - Additions to Go's database/sql for super fast performance and convenience. 401 | * [gocraft/health](https://github.com/gocraft/health) - Instrument your web apps with logging and metrics. 402 | * [gocraft/work](https://github.com/gocraft/work) - Process background jobs in Go. 403 | 404 | These packages were developed by the [engineering team](https://eng.uservoice.com) at [UserVoice](https://www.uservoice.com) and currently power much of its infrastructure and tech stack. 405 | 406 | ## Authors 407 | 408 | * Jonathan Novak -- [https://github.com/cypriss](https://github.com/cypriss) 409 | * Sponsored by [UserVoice](https://eng.uservoice.com) 410 | --------------------------------------------------------------------------------