├── .travis.yml ├── LICENSE ├── graphite.go ├── graphite_test.go ├── metrics.go ├── metrics_test.go ├── opentsdb.go ├── opentsdb_test.go ├── print_benchmark.go ├── readme.md └── submitter.go /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - 1.4 5 | - tip 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2014 The Cockroach Authors. 191 | Copyright 2016 Tyler Neely 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 204 | -------------------------------------------------------------------------------- /graphite.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Cockroach Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | // implied. See the License for the specific language governing 13 | // permissions and limitations under the License. See the AUTHORS file 14 | // for names of contributors. 
15 | // 16 | // Author: Tyler Neely (t@jujit.su) 17 | 18 | package loghisto 19 | 20 | import ( 21 | "bytes" 22 | "fmt" 23 | "os" 24 | "strings" 25 | 26 | ) 27 | 28 | type graphiteStat struct { 29 | Metric string 30 | Time int64 31 | Value float64 32 | Host string 33 | } 34 | 35 | type graphiteStatArray []*graphiteStat 36 | 37 | func (stats graphiteStatArray) ToRequest() []byte { 38 | var request bytes.Buffer 39 | for _, stat := range stats { 40 | request.Write([]byte(fmt.Sprintf("cockroach.%s.%s %f %d\n", 41 | stat.Host, 42 | strings.Replace(stat.Metric, "_", ".", -1), 43 | stat.Value, 44 | stat.Time, 45 | ))) 46 | } 47 | return []byte(request.String()) 48 | } 49 | 50 | func (metricSet *ProcessedMetricSet) tographiteStats() graphiteStatArray { 51 | hostname, err := os.Hostname() 52 | if err != nil { 53 | hostname = "unknown" 54 | } 55 | 56 | stats := make([]*graphiteStat, 0, len(metricSet.Metrics)) 57 | i := 0 58 | for metric, value := range metricSet.Metrics { 59 | //TODO(tyler) custom tags 60 | stats = append(stats, &graphiteStat{ 61 | Metric: metric, 62 | Time: metricSet.Time.Unix(), 63 | Value: value, 64 | Host: hostname, 65 | }) 66 | i++ 67 | } 68 | return stats 69 | } 70 | 71 | // GraphiteProtocol generates a wire representation of a ProcessedMetricSet 72 | // for submission to a Graphite Carbon instance using the plaintext protocol. 
73 | func GraphiteProtocol(ms *ProcessedMetricSet) []byte { 74 | return ms.tographiteStats().ToRequest() 75 | } 76 | -------------------------------------------------------------------------------- /graphite_test.go: -------------------------------------------------------------------------------- 1 | package loghisto 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestGraphite(t *testing.T) { 9 | ms := NewMetricSystem(time.Second, true) 10 | s := NewSubmitter(ms, GraphiteProtocol, "tcp", "localhost:7777") 11 | s.Start() 12 | 13 | metrics := &ProcessedMetricSet{ 14 | Time: time.Now(), 15 | Metrics: map[string]float64{ 16 | "test.3": 50.54, 17 | "test.4": 10.21, 18 | }, 19 | } 20 | request := s.serializer(metrics) 21 | s.submit(request) 22 | s.Shutdown() 23 | } 24 | -------------------------------------------------------------------------------- /metrics.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Cockroach Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | // implied. See the License for the specific language governing 13 | // permissions and limitations under the License. See the AUTHORS file 14 | // for names of contributors. 
15 | // 16 | // Author: Tyler Neely (t@jujit.su) 17 | 18 | // IMPORTANT: only subscribe to the metric stream 19 | // using buffered channels that are regularly 20 | // flushed, as reaper will NOT block while trying 21 | // to send metrics to a subscriber, and will ignore 22 | // a subscriber if they fail to clear their channel 23 | // 3 times in a row! 24 | 25 | package loghisto 26 | 27 | import ( 28 | "errors" 29 | "fmt" 30 | "math" 31 | "runtime" 32 | "sort" 33 | "sync" 34 | "sync/atomic" 35 | "time" 36 | 37 | "github.com/golang/glog" 38 | ) 39 | 40 | const ( 41 | // precision affects the bucketing used during histogram value compression. 42 | precision = 100 43 | ) 44 | 45 | // ProcessedMetricSet contains human-readable metrics that may also be 46 | // suitable for storage in time-series databases. 47 | type ProcessedMetricSet struct { 48 | Time time.Time 49 | Metrics map[string]float64 50 | } 51 | 52 | // RawMetricSet contains metrics in a form that supports generation of 53 | // percentiles and other rich statistics. 54 | type RawMetricSet struct { 55 | Time time.Time 56 | Counters map[string]uint64 57 | Rates map[string]uint64 58 | Histograms map[string]map[int16]*uint64 59 | Gauges map[string]float64 60 | } 61 | 62 | // TimerToken facilitates concurrent timings of durations of the same label. 63 | type TimerToken struct { 64 | Name string 65 | Start time.Time 66 | MetricSystem *MetricSystem 67 | } 68 | 69 | // proportion is a compact value with a corresponding count of 70 | // occurrences in this interval. 71 | type proportion struct { 72 | Value float64 73 | Count uint64 74 | } 75 | 76 | // proportionArray is a sortable collection of proportion types. 77 | type proportionArray []proportion 78 | 79 | // MetricSystem facilitates the collection and distribution of metrics. 
80 | type MetricSystem struct { 81 | // percentiles is a mapping from labels to desired percentiles to be 82 | // calculated by the MetricSystem 83 | percentiles map[string]float64 84 | // interval is the duration between collections and broadcasts of metrics 85 | // to subscribers. 86 | interval time.Duration 87 | // subscribeToRawMetrics allows subscription to a RawMetricSet generated 88 | // by reaper at the end of each interval on a sent channel. 89 | subscribeToRawMetrics chan chan *RawMetricSet 90 | // unsubscribeFromRawMetrics allows subscribers to unsubscribe from 91 | // receiving a RawMetricSet on the sent channel. 92 | unsubscribeFromRawMetrics chan chan *RawMetricSet 93 | // subscribeToProcessedMetrics allows subscription to a ProcessedMetricSet 94 | // generated by reaper at the end of each interval on a sent channel. 95 | subscribeToProcessedMetrics chan chan *ProcessedMetricSet 96 | // unsubscribeFromProcessedMetrics allows subscribers to unsubscribe from 97 | // receiving a ProcessedMetricSet on the sent channel. 98 | unsubscribeFromProcessedMetrics chan chan *ProcessedMetricSet 99 | // rawSubscribers stores current subscribers to RawMetrics 100 | rawSubscribers map[chan *RawMetricSet]struct{} 101 | // rawBadSubscribers tracks misbehaving subscribers who do not clear their 102 | // subscription channels regularly. 103 | rawBadSubscribers map[chan *RawMetricSet]int 104 | // processedSubscribers stores current subscribers to ProcessedMetrics 105 | processedSubscribers map[chan *ProcessedMetricSet]struct{} 106 | // processedBadSubscribers tracks misbehaving subscribers who do not clear 107 | // their subscription channels regularly. 108 | processedBadSubscribers map[chan *ProcessedMetricSet]int 109 | // subscribersMu controls access to subscription structures 110 | subscribersMu sync.RWMutex 111 | // counterStore maintains the total counts of counters. 
112 | counterStore map[string]*uint64 113 | counterStoreMu sync.RWMutex 114 | // counterCache aggregates new Counters until they are collected by reaper(). 115 | counterCache map[string]*uint64 116 | // counterMu controls access to counterCache. 117 | counterMu sync.RWMutex 118 | // histogramCache aggregates Histograms until they are collected by reaper(). 119 | histogramCache map[string]map[int16]*uint64 120 | // histogramMu controls access to histogramCache. 121 | histogramMu sync.RWMutex 122 | // histogramCountStore keeps track of aggregate counts and sums for aggregate 123 | // mean calculation. 124 | histogramCountStore map[string]*uint64 125 | // histogramCountMu controls access to the histogramCountStore. 126 | histogramCountMu sync.RWMutex 127 | // gaugeFuncs maps metrics to functions used for calculating their value 128 | gaugeFuncs map[string]func() float64 129 | // gaugeFuncsMu controls access to the gaugeFuncs map. 130 | gaugeFuncsMu sync.Mutex 131 | // Has reaper() been started? 132 | reaping bool 133 | // Close this to bring down this MetricSystem 134 | shutdownChan chan struct{} 135 | } 136 | 137 | // Metrics is the default metric system, which collects and broadcasts metrics 138 | // to subscribers once every 60 seconds. Also includes default system stats. 139 | var Metrics = NewMetricSystem(60*time.Second, true) 140 | 141 | // NewMetricSystem returns a new metric system that collects and broadcasts 142 | // metrics after each interval. 
143 | func NewMetricSystem(interval time.Duration, sysStats bool) *MetricSystem { 144 | ms := &MetricSystem{ 145 | percentiles: map[string]float64{ 146 | "%s_min": 0, 147 | "%s_50": .5, 148 | "%s_75": .75, 149 | "%s_90": .9, 150 | "%s_95": .95, 151 | "%s_99": .99, 152 | "%s_99.9": .999, 153 | "%s_99.99": .9999, 154 | "%s_max": 1, 155 | }, 156 | interval: interval, 157 | subscribeToRawMetrics: make(chan chan *RawMetricSet, 64), 158 | unsubscribeFromRawMetrics: make(chan chan *RawMetricSet, 64), 159 | subscribeToProcessedMetrics: make(chan chan *ProcessedMetricSet, 64), 160 | unsubscribeFromProcessedMetrics: make(chan chan *ProcessedMetricSet, 64), 161 | rawSubscribers: make(map[chan *RawMetricSet]struct{}), 162 | rawBadSubscribers: make(map[chan *RawMetricSet]int), 163 | processedSubscribers: make(map[chan *ProcessedMetricSet]struct{}), 164 | processedBadSubscribers: make(map[chan *ProcessedMetricSet]int), 165 | counterStore: make(map[string]*uint64), 166 | counterCache: make(map[string]*uint64), 167 | histogramCache: make(map[string]map[int16]*uint64), 168 | histogramCountStore: make(map[string]*uint64), 169 | gaugeFuncs: make(map[string]func() float64), 170 | shutdownChan: make(chan struct{}), 171 | } 172 | if sysStats { 173 | ms.gaugeFuncsMu.Lock() 174 | ms.gaugeFuncs["sys.Alloc"] = func() float64 { 175 | memStats := new(runtime.MemStats) 176 | runtime.ReadMemStats(memStats) 177 | return float64(memStats.Alloc) 178 | } 179 | ms.gaugeFuncs["sys.NumGC"] = func() float64 { 180 | memStats := new(runtime.MemStats) 181 | runtime.ReadMemStats(memStats) 182 | return float64(memStats.NumGC) 183 | } 184 | ms.gaugeFuncs["sys.PauseTotalNs"] = func() float64 { 185 | memStats := new(runtime.MemStats) 186 | runtime.ReadMemStats(memStats) 187 | return float64(memStats.PauseTotalNs) 188 | } 189 | ms.gaugeFuncs["sys.NumGoroutine"] = func() float64 { 190 | return float64(runtime.NumGoroutine()) 191 | } 192 | ms.gaugeFuncsMu.Unlock() 193 | } 194 | return ms 195 | } 196 | 197 | // 
SpecifyPercentiles allows users to override the default collected 198 | // and reported percentiles. 199 | func (ms *MetricSystem) SpecifyPercentiles(percentiles map[string]float64) { 200 | ms.percentiles = percentiles 201 | } 202 | 203 | // SubscribeToRawMetrics registers a channel to receive RawMetricSets 204 | // periodically generated by reaper at each interval. 205 | func (ms *MetricSystem) SubscribeToRawMetrics(metricStream chan *RawMetricSet) { 206 | ms.subscribeToRawMetrics <- metricStream 207 | } 208 | 209 | // UnsubscribeFromRawMetrics registers a channel to receive RawMetricSets 210 | // periodically generated by reaper at each interval. 211 | func (ms *MetricSystem) UnsubscribeFromRawMetrics( 212 | metricStream chan *RawMetricSet) { 213 | ms.unsubscribeFromRawMetrics <- metricStream 214 | } 215 | 216 | // SubscribeToProcessedMetrics registers a channel to receive 217 | // ProcessedMetricSets periodically generated by reaper at each interval. 218 | func (ms *MetricSystem) SubscribeToProcessedMetrics( 219 | metricStream chan *ProcessedMetricSet) { 220 | ms.subscribeToProcessedMetrics <- metricStream 221 | } 222 | 223 | // UnsubscribeFromProcessedMetrics registers a channel to receive 224 | // ProcessedMetricSets periodically generated by reaper at each interval. 225 | func (ms *MetricSystem) UnsubscribeFromProcessedMetrics( 226 | metricStream chan *ProcessedMetricSet) { 227 | ms.unsubscribeFromProcessedMetrics <- metricStream 228 | } 229 | 230 | // StartTimer begins a timer and returns a token which is required for halting 231 | // the timer. This allows for concurrent timings under the same name. 232 | func (ms *MetricSystem) StartTimer(name string) TimerToken { 233 | return TimerToken{ 234 | Name: name, 235 | Start: time.Now(), 236 | MetricSystem: ms, 237 | } 238 | } 239 | 240 | // Stop stops a timer given by StartTimer, submits a Histogram of its duration 241 | // in nanoseconds, and returns its duration in nanoseconds. 
242 | func (tt *TimerToken) Stop() time.Duration { 243 | duration := time.Since(tt.Start) 244 | tt.MetricSystem.Histogram(tt.Name, float64(duration.Nanoseconds())) 245 | return duration 246 | } 247 | 248 | // Counter is used for recording a running count of the total occurrences of 249 | // a particular event. A rate is also exported for the amount that a counter 250 | // has increased during an interval of this MetricSystem. 251 | func (ms *MetricSystem) Counter(name string, amount uint64) { 252 | ms.counterMu.RLock() 253 | _, exists := ms.counterCache[name] 254 | // perform lock promotion when we need more control 255 | if exists { 256 | atomic.AddUint64(ms.counterCache[name], amount) 257 | ms.counterMu.RUnlock() 258 | } else { 259 | ms.counterMu.RUnlock() 260 | ms.counterMu.Lock() 261 | _, syncExists := ms.counterCache[name] 262 | if !syncExists { 263 | var z uint64 264 | ms.counterCache[name] = &z 265 | } 266 | atomic.AddUint64(ms.counterCache[name], amount) 267 | ms.counterMu.Unlock() 268 | } 269 | } 270 | 271 | // Histogram is used for generating rich metrics, such as percentiles, from 272 | // periodically occurring continuous values. 
273 | func (ms *MetricSystem) Histogram(name string, value float64) { 274 | compressedValue := compress(value) 275 | ms.histogramMu.RLock() 276 | _, present := ms.histogramCache[name][compressedValue] 277 | if present { 278 | atomic.AddUint64(ms.histogramCache[name][compressedValue], 1) 279 | ms.histogramMu.RUnlock() 280 | } else { 281 | ms.histogramMu.RUnlock() 282 | ms.histogramMu.Lock() 283 | _, syncPresent := ms.histogramCache[name][compressedValue] 284 | if !syncPresent { 285 | var z uint64 286 | _, mapPresent := ms.histogramCache[name] 287 | if !mapPresent { 288 | ms.histogramCache[name] = make(map[int16]*uint64) 289 | } 290 | ms.histogramCache[name][compressedValue] = &z 291 | } 292 | atomic.AddUint64(ms.histogramCache[name][compressedValue], 1) 293 | ms.histogramMu.Unlock() 294 | } 295 | } 296 | 297 | // RegisterGaugeFunc registers a function to be called at each interval 298 | // whose return value will be used to populate the metric. 299 | func (ms *MetricSystem) RegisterGaugeFunc(name string, f func() float64) { 300 | ms.gaugeFuncsMu.Lock() 301 | ms.gaugeFuncs[name] = f 302 | ms.gaugeFuncsMu.Unlock() 303 | } 304 | 305 | // DeregisterGaugeFunc deregisters a function for the metric. 306 | func (ms *MetricSystem) DeregisterGaugeFunc(name string) { 307 | ms.gaugeFuncsMu.Lock() 308 | delete(ms.gaugeFuncs, name) 309 | ms.gaugeFuncsMu.Unlock() 310 | } 311 | 312 | // compress takes a float64 and lossily shrinks it to an int16 to facilitate 313 | // bucketing of histogram values, staying within 1% of the true value. This 314 | // fails for large values of 1e142 and above, and is inaccurate for values 315 | // closer to 0 than +/- 0.51 or +/- math.Inf. 
316 | func compress(value float64) int16 { 317 | i := int16(precision*math.Log(1.0+math.Abs(value)) + 0.5) 318 | if value < 0 { 319 | return -1 * i 320 | } 321 | return i 322 | } 323 | 324 | // decompress takes a lossily shrunk int16 and returns a float64 within 1% of 325 | // the original float64 passed to compress. 326 | func decompress(compressedValue int16) float64 { 327 | f := math.Exp(math.Abs(float64(compressedValue))/precision) - 1.0 328 | if compressedValue < 0 { 329 | return -1.0 * f 330 | } 331 | return f 332 | } 333 | 334 | // processHistograms derives rich metrics from histograms, currently 335 | // percentiles, sum, count, and mean. 336 | func (ms *MetricSystem) processHistograms(name string, 337 | valuesToCounts map[int16]*uint64) map[string]float64 { 338 | output := make(map[string]float64) 339 | totalSum := float64(0) 340 | totalCount := uint64(0) 341 | proportions := make([]proportion, 0, len(valuesToCounts)) 342 | for compressedValue, count := range valuesToCounts { 343 | value := decompress(compressedValue) 344 | totalSum += value * float64(*count) 345 | totalCount += *count 346 | proportions = append(proportions, proportion{Value: value, Count: *count}) 347 | } 348 | 349 | sumName := fmt.Sprintf("%s_sum", name) 350 | countName := fmt.Sprintf("%s_count", name) 351 | avgName := fmt.Sprintf("%s_avg", name) 352 | 353 | // increment interval sum and count 354 | output[countName] = float64(totalCount) 355 | output[sumName] = totalSum 356 | output[avgName] = totalSum / float64(totalCount) 357 | 358 | // increment aggregate sum and count 359 | ms.histogramCountMu.RLock() 360 | _, present := ms.histogramCountStore[sumName] 361 | if !present { 362 | ms.histogramCountMu.RUnlock() 363 | ms.histogramCountMu.Lock() 364 | _, syncPresent := ms.histogramCountStore[sumName] 365 | if !syncPresent { 366 | var x uint64 367 | ms.histogramCountStore[sumName] = &x 368 | var z uint64 369 | ms.histogramCountStore[countName] = &z 370 | } 371 | 
ms.histogramCountMu.Unlock() 372 | ms.histogramCountMu.RLock() 373 | } 374 | atomic.AddUint64(ms.histogramCountStore[sumName], uint64(totalSum)) 375 | atomic.AddUint64(ms.histogramCountStore[countName], totalCount) 376 | ms.histogramCountMu.RUnlock() 377 | 378 | for label, p := range ms.percentiles { 379 | value, err := percentile(totalCount, proportions, p) 380 | if err != nil { 381 | glog.Errorf("unable to calculate percentile: %s", err) 382 | } else { 383 | output[fmt.Sprintf(label, name)] = value 384 | } 385 | } 386 | return output 387 | } 388 | 389 | // These next 3 methods are for the implementation of sort.Interface 390 | 391 | func (s proportionArray) Len() int { 392 | return len(s) 393 | } 394 | 395 | func (s proportionArray) Less(i, j int) bool { 396 | return s[i].Value < s[j].Value 397 | } 398 | 399 | func (s proportionArray) Swap(i, j int) { 400 | s[i], s[j] = s[j], s[i] 401 | } 402 | 403 | // percentile calculates a percentile represented as a float64 between 0 and 1 404 | // inclusive from a proportionArray. totalCount is the sum of all counts of 405 | // elements in the proportionArray. 406 | func percentile(totalCount uint64, proportions proportionArray, 407 | percentile float64) (float64, error) { 408 | //TODO(tyler) handle multiple percentiles at once for efficiency 409 | sort.Sort(proportions) 410 | sofar := uint64(0) 411 | for _, proportion := range proportions { 412 | sofar += proportion.Count 413 | if float64(sofar)/float64(totalCount) >= percentile { 414 | return proportion.Value, nil 415 | } 416 | } 417 | return 0, errors.New("Invalid percentile. 
Should be between 0 and 1.") 418 | } 419 | 420 | func (ms *MetricSystem) collectRawMetrics() *RawMetricSet { 421 | normalizedInterval := time.Unix(0, time.Now().UnixNano()/ 422 | ms.interval.Nanoseconds()* 423 | ms.interval.Nanoseconds()) 424 | 425 | ms.counterMu.Lock() 426 | freshCounters := ms.counterCache 427 | ms.counterCache = make(map[string]*uint64) 428 | ms.counterMu.Unlock() 429 | 430 | rates := make(map[string]uint64) 431 | for name, count := range freshCounters { 432 | rates[name] = *count 433 | } 434 | 435 | counters := make(map[string]uint64) 436 | ms.counterStoreMu.RLock() 437 | // update counters 438 | for name, count := range freshCounters { 439 | _, exists := ms.counterStore[name] 440 | // only take a write lock when it's a totally new counter 441 | if !exists { 442 | ms.counterStoreMu.RUnlock() 443 | ms.counterStoreMu.Lock() 444 | _, syncExists := ms.counterStore[name] 445 | if !syncExists { 446 | var z uint64 447 | ms.counterStore[name] = &z 448 | } 449 | ms.counterStoreMu.Unlock() 450 | ms.counterStoreMu.RLock() 451 | } 452 | atomic.AddUint64(ms.counterStore[name], *count) 453 | } 454 | // copy counters for export 455 | for name, count := range ms.counterStore { 456 | counters[name] = *count 457 | } 458 | ms.counterStoreMu.RUnlock() 459 | 460 | ms.histogramMu.Lock() 461 | histograms := ms.histogramCache 462 | ms.histogramCache = make(map[string]map[int16]*uint64) 463 | ms.histogramMu.Unlock() 464 | 465 | ms.gaugeFuncsMu.Lock() 466 | gauges := make(map[string]float64) 467 | for name, f := range ms.gaugeFuncs { 468 | gauges[name] = f() 469 | } 470 | ms.gaugeFuncsMu.Unlock() 471 | 472 | return &RawMetricSet{ 473 | Time: normalizedInterval, 474 | Counters: counters, 475 | Rates: rates, 476 | Histograms: histograms, 477 | Gauges: gauges, 478 | } 479 | } 480 | 481 | // processMetrics (potentially slowly) creates human consumable metrics from a 482 | // RawMetricSet, deriving rich statistics from histograms such as percentiles. 
483 | func (ms *MetricSystem) processMetrics( 484 | rawMetrics *RawMetricSet) *ProcessedMetricSet { 485 | metrics := make(map[string]float64) 486 | 487 | for name, count := range rawMetrics.Counters { 488 | metrics[name] = float64(count) 489 | } 490 | 491 | for name, count := range rawMetrics.Rates { 492 | metrics[fmt.Sprintf("%s_rate", name)] = float64(count) 493 | } 494 | 495 | for name, valuesToCounts := range rawMetrics.Histograms { 496 | for histoName, histoValue := range ms.processHistograms(name, valuesToCounts) { 497 | metrics[histoName] = histoValue 498 | } 499 | } 500 | 501 | for name, value := range rawMetrics.Gauges { 502 | metrics[name] = value 503 | } 504 | 505 | return &ProcessedMetricSet{Time: rawMetrics.Time, Metrics: metrics} 506 | } 507 | 508 | func (ms *MetricSystem) updateSubscribers() { 509 | ms.subscribersMu.Lock() 510 | defer ms.subscribersMu.Unlock() 511 | for { 512 | select { 513 | case subscriber := <-ms.subscribeToRawMetrics: 514 | ms.rawSubscribers[subscriber] = struct{}{} 515 | case unsubscriber := <-ms.unsubscribeFromRawMetrics: 516 | delete(ms.rawSubscribers, unsubscriber) 517 | case subscriber := <-ms.subscribeToProcessedMetrics: 518 | ms.processedSubscribers[subscriber] = struct{}{} 519 | case unsubscriber := <-ms.unsubscribeFromProcessedMetrics: 520 | delete(ms.processedSubscribers, unsubscriber) 521 | default: // no changes in subscribers 522 | return 523 | } 524 | } 525 | } 526 | 527 | // reaper wakes up every seconds, 528 | // collects and processes metrics, and pushes 529 | // them to the corresponding subscribing channels. 
530 | func (ms *MetricSystem) reaper() { 531 | ms.reaping = true 532 | 533 | // create goroutine pool to handle multiple processing tasks at once 534 | processChan := make(chan func(), 16) 535 | for i := 0; i < int(math.Max(float64(runtime.NumCPU()/4), 4)); i++ { 536 | go func() { 537 | for { 538 | c, ok := <-processChan 539 | if !ok { 540 | return 541 | } 542 | c() 543 | } 544 | }() 545 | } 546 | 547 | // begin reaper main loop 548 | for { 549 | // sleep until the next interval, or die if shutdownChan is closed 550 | tts := ms.interval.Nanoseconds() - 551 | (time.Now().UnixNano() % ms.interval.Nanoseconds()) 552 | select { 553 | case <-time.After(time.Duration(tts)): 554 | case <-ms.shutdownChan: 555 | ms.reaping = false 556 | close(processChan) 557 | return 558 | } 559 | 560 | rawMetrics := ms.collectRawMetrics() 561 | 562 | ms.updateSubscribers() 563 | 564 | // broadcast raw metrics 565 | for subscriber := range ms.rawSubscribers { 566 | // new subscribers get all counters, otherwise just the new diffs 567 | select { 568 | case subscriber <- rawMetrics: 569 | delete(ms.rawBadSubscribers, subscriber) 570 | default: 571 | ms.rawBadSubscribers[subscriber]++ 572 | glog.Error("a raw subscriber has allowed their channel to fill up. ", 573 | "dropping their metrics on the floor rather than blocking.") 574 | if ms.rawBadSubscribers[subscriber] >= 2 { 575 | glog.Error("this raw subscriber has caused dropped metrics at ", 576 | "least 3 times in a row. closing the channel.") 577 | delete(ms.rawSubscribers, subscriber) 578 | close(subscriber) 579 | } 580 | } 581 | } 582 | 583 | // Perform the rest in another goroutine since processing is not 584 | // guaranteed to complete before the interval is up. 
585 | sendProcessed := func() { 586 | // this is potentially expensive if there is a massive number of metrics 587 | processedMetrics := ms.processMetrics(rawMetrics) 588 | 589 | // add aggregate mean 590 | for name := range rawMetrics.Histograms { 591 | ms.histogramCountMu.RLock() 592 | aggCountPtr, countPresent := 593 | ms.histogramCountStore[fmt.Sprintf("%s_count", name)] 594 | aggCount := atomic.LoadUint64(aggCountPtr) 595 | aggSumPtr, sumPresent := 596 | ms.histogramCountStore[fmt.Sprintf("%s_sum", name)] 597 | aggSum := atomic.LoadUint64(aggSumPtr) 598 | ms.histogramCountMu.RUnlock() 599 | 600 | if countPresent && sumPresent && aggCount > 0 { 601 | processedMetrics.Metrics[fmt.Sprintf("%s_agg_avg", name)] = 602 | float64(aggSum / aggCount) 603 | processedMetrics.Metrics[fmt.Sprintf("%s_agg_count", name)] = 604 | float64(aggCount) 605 | processedMetrics.Metrics[fmt.Sprintf("%s_agg_sum", name)] = 606 | float64(aggSum) 607 | } 608 | } 609 | 610 | // broadcast processed metrics 611 | ms.subscribersMu.Lock() 612 | for subscriber := range ms.processedSubscribers { 613 | select { 614 | case subscriber <- processedMetrics: 615 | delete(ms.processedBadSubscribers, subscriber) 616 | default: 617 | ms.processedBadSubscribers[subscriber]++ 618 | glog.Error("a subscriber has allowed their channel to fill up. ", 619 | "dropping their metrics on the floor rather than blocking.") 620 | if ms.processedBadSubscribers[subscriber] >= 2 { 621 | glog.Error("this subscriber has caused dropped metrics at ", 622 | "least 3 times in a row. closing the channel.") 623 | delete(ms.processedSubscribers, subscriber) 624 | close(subscriber) 625 | } 626 | } 627 | } 628 | ms.subscribersMu.Unlock() 629 | } 630 | select { 631 | case processChan <- sendProcessed: 632 | default: 633 | // processChan has filled up, this metric load is not sustainable 634 | glog.Errorf("processing of metrics is taking longer than this node can "+ 635 | "handle. 
dropping this entire interval of %s metrics on the "+ 636 | "floor rather than blocking the reaper.", rawMetrics.Time) 637 | } 638 | } // end main reaper loop 639 | } 640 | 641 | // Start spawns a goroutine for merging metrics into caches from 642 | // metric submitters, and a reaper goroutine that harvests metrics at the 643 | // default interval of every 60 seconds. 644 | func (ms *MetricSystem) Start() { 645 | if !ms.reaping { 646 | go ms.reaper() 647 | } 648 | } 649 | 650 | // Stop shuts down a MetricSystem 651 | func (ms *MetricSystem) Stop() { 652 | close(ms.shutdownChan) 653 | } 654 | -------------------------------------------------------------------------------- /metrics_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Cockroach Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | // implied. See the License for the specific language governing 13 | // permissions and limitations under the License. See the AUTHORS file 14 | // for names of contributors. 
15 | // 16 | // Author: Tyler Neely (t@jujit.su) 17 | 18 | package loghisto 19 | 20 | import ( 21 | "fmt" 22 | "math" 23 | "runtime" 24 | "testing" 25 | "time" 26 | ) 27 | 28 | func ExampleMetricSystem() { 29 | ms := NewMetricSystem(time.Microsecond, true) 30 | ms.Start() 31 | myMetricStream := make(chan *ProcessedMetricSet, 2) 32 | ms.SubscribeToProcessedMetrics(myMetricStream) 33 | 34 | timeToken := ms.StartTimer("submit_metrics") 35 | ms.Counter("range_splits", 1) 36 | ms.Histogram("some_ipc_latency", 123) 37 | timeToken.Stop() 38 | 39 | processedMetricSet := <-myMetricStream 40 | ms.UnsubscribeFromProcessedMetrics(myMetricStream) 41 | 42 | m := processedMetricSet.Metrics 43 | 44 | example := []struct { 45 | Name string 46 | Value float64 47 | }{ 48 | { 49 | "total range splits during the process lifetime", 50 | m["range_splits"], 51 | }, { 52 | "range splits in this period", 53 | m["range_splits_rate"], 54 | }, { 55 | "some_ipc 99.9th percentile", 56 | m["some_ipc_latency_99.9"], 57 | }, { 58 | "some_ipc max", 59 | m["some_ipc_latency_max"], 60 | }, { 61 | "some_ipc calls this period", 62 | m["some_ipc_latency_count"], 63 | }, { 64 | "some_ipc calls during the process lifetime", 65 | m["some_ipc_latency_agg_count"], 66 | }, { 67 | "some_ipc total latency this period", 68 | m["some_ipc_latency_sum"], 69 | }, { 70 | "some_ipc mean this period", 71 | m["some_ipc_latency_avg"], 72 | }, { 73 | "some_ipc aggregate man", 74 | m["some_ipc_latency_agg_avg"], 75 | }, { 76 | "time spent submitting metrics this period", 77 | m["submit_metrics_sum"], 78 | }, { 79 | "number of goroutines", 80 | m["sys.NumGoroutine"], 81 | }, { 82 | "time spent in GC", 83 | m["sys.PauseTotalNs"], 84 | }, 85 | } 86 | for _, nameValue := range example { 87 | var result string 88 | if nameValue.Value == float64(0) { 89 | result = "NOT present" 90 | } else { 91 | result = "present" 92 | } 93 | fmt.Println(nameValue.Name, result) 94 | } 95 | ms.Stop() 96 | // Output: 97 | // total range splits 
during the process lifetime present 98 | // range splits in this period present 99 | // some_ipc 99.9th percentile present 100 | // some_ipc max present 101 | // some_ipc calls this period present 102 | // some_ipc calls during the process lifetime present 103 | // some_ipc total latency this period present 104 | // some_ipc mean this period present 105 | // some_ipc aggregate man present 106 | // time spent submitting metrics this period present 107 | // number of goroutines present 108 | // time spent in GC present 109 | } 110 | 111 | func TestPercentile(t *testing.T) { 112 | metrics := map[float64]uint64{ 113 | 10: 9000, 114 | 25: 900, 115 | 33: 90, 116 | 47: 9, 117 | 500: 1, 118 | } 119 | 120 | percentileToExpected := map[float64]float64{ 121 | 0: 10, 122 | .99: 25, 123 | .999: 33, 124 | .9991: 47, 125 | .9999: 47, 126 | 1: 500, 127 | } 128 | 129 | totalcount := uint64(0) 130 | proportions := make([]proportion, 0, len(metrics)) 131 | for value, count := range metrics { 132 | totalcount += count 133 | proportions = append(proportions, proportion{Value: value, Count: count}) 134 | } 135 | 136 | for p, expected := range percentileToExpected { 137 | result, err := percentile(totalcount, proportions, p) 138 | if err != nil { 139 | t.Error("error:", err) 140 | } 141 | 142 | // results must be within 1% of their expected values. 
143 | diff := math.Abs(expected/result - 1) 144 | if diff > .01 { 145 | t.Errorf("percentile: %.04f, expected: %.04f, actual: %.04f, %% off: %.04f\n", 146 | p, expected, result, diff*100) 147 | } 148 | } 149 | } 150 | 151 | func TestCompress(t *testing.T) { 152 | toTest := []float64{ 153 | -421408208120481, 154 | -1, 155 | 0, 156 | 1, 157 | 214141241241241, 158 | } 159 | for _, f := range toTest { 160 | result := decompress(compress(f)) 161 | var diff float64 162 | if result == 0 { 163 | diff = math.Abs(f - result) 164 | } else { 165 | diff = math.Abs(f/result - 1) 166 | } 167 | if diff > .01 { 168 | t.Errorf("expected: %f, actual: %f, %% off: %.04f\n", 169 | f, result, diff*100) 170 | } 171 | } 172 | } 173 | 174 | func TestSysStats(t *testing.T) { 175 | metricSystem := NewMetricSystem(time.Microsecond, true) 176 | gauges := metricSystem.collectRawMetrics().Gauges 177 | v, present := gauges["sys.Alloc"] 178 | if v <= 0 || !present { 179 | t.Errorf("expected positive reported allocated bytes, got %f\n", v) 180 | } 181 | } 182 | 183 | func TestTimer(t *testing.T) { 184 | metricSystem := NewMetricSystem(time.Microsecond, false) 185 | token1 := metricSystem.StartTimer("timer1") 186 | token2 := metricSystem.StartTimer("timer1") 187 | time.Sleep(50 & time.Microsecond) 188 | token1.Stop() 189 | time.Sleep(5 * time.Microsecond) 190 | token2.Stop() 191 | token3 := metricSystem.StartTimer("timer1") 192 | time.Sleep(10 * time.Microsecond) 193 | token3.Stop() 194 | result := metricSystem.processMetrics(metricSystem.collectRawMetrics()).Metrics 195 | 196 | if result["timer1_min"] > result["timer1_50"] || 197 | result["timer1_50"] > result["timer1_max"] { 198 | t.Error("bad result map:", result) 199 | } 200 | } 201 | 202 | func TestRate(t *testing.T) { 203 | metricSystem := NewMetricSystem(time.Microsecond, false) 204 | metricSystem.Counter("rate1", 777) 205 | time.Sleep(20 * time.Millisecond) 206 | metrics := metricSystem.processMetrics(metricSystem.collectRawMetrics()).Metrics 
207 | if metrics["rate1_rate"] != 777 { 208 | t.Error("count one value") 209 | } 210 | metricSystem.Counter("rate1", 1223) 211 | time.Sleep(20 * time.Millisecond) 212 | metrics = metricSystem.processMetrics(metricSystem.collectRawMetrics()).Metrics 213 | if metrics["rate1_rate"] != 1223 { 214 | t.Errorf("expected rate: 1223, actual: %f", metrics["rate1_rate"]) 215 | } 216 | metricSystem.Counter("rate1", 1223) 217 | metricSystem.Counter("rate1", 1223) 218 | time.Sleep(20 * time.Millisecond) 219 | metrics = metricSystem.processMetrics(metricSystem.collectRawMetrics()).Metrics 220 | if metrics["rate1_rate"] != 2446 { 221 | t.Errorf("expected rate: 2446, actual: %f", metrics["rate1_rate"]) 222 | } 223 | } 224 | 225 | func TestCounter(t *testing.T) { 226 | metricSystem := NewMetricSystem(time.Microsecond, false) 227 | metricSystem.Counter("counter1", 3290) 228 | time.Sleep(20 * time.Millisecond) 229 | metrics := metricSystem.processMetrics(metricSystem.collectRawMetrics()).Metrics 230 | if metrics["counter1"] != 3290 { 231 | t.Error("count one value", metrics) 232 | } 233 | metricSystem.Counter("counter1", 10000) 234 | time.Sleep(20 * time.Millisecond) 235 | metrics = metricSystem.processMetrics(metricSystem.collectRawMetrics()).Metrics 236 | if metrics["counter1"] != 13290 { 237 | t.Error("accumulate counts across broadcasts") 238 | } 239 | 240 | } 241 | 242 | func TestUpdateSubscribers(t *testing.T) { 243 | rawMetricStream := make(chan *RawMetricSet) 244 | processedMetricStream := make(chan *ProcessedMetricSet) 245 | 246 | metricSystem := NewMetricSystem(2*time.Microsecond, false) 247 | metricSystem.SubscribeToRawMetrics(rawMetricStream) 248 | metricSystem.SubscribeToProcessedMetrics(processedMetricStream) 249 | 250 | metricSystem.Counter("counter5", 33) 251 | 252 | go func() { 253 | select { 254 | case <-rawMetricStream: 255 | case <-time.After(20 * time.Millisecond): 256 | t.Error("received no raw metrics from the MetricSystem after 2 milliseconds.") 257 | } 258 | 
metricSystem.UnsubscribeFromRawMetrics(rawMetricStream) 259 | }() 260 | go func() { 261 | select { 262 | case <-processedMetricStream: 263 | case <-time.After(20 * time.Millisecond): 264 | t.Error("received no processed metrics from the MetricSystem after 2 milliseconds.") 265 | } 266 | metricSystem.UnsubscribeFromProcessedMetrics(processedMetricStream) 267 | }() 268 | 269 | metricSystem.Start() 270 | time.Sleep(20 * time.Millisecond) 271 | 272 | go func() { 273 | select { 274 | case <-rawMetricStream: 275 | t.Error("received raw metrics from the MetricSystem after unsubscribing.") 276 | default: 277 | } 278 | }() 279 | go func() { 280 | select { 281 | case <-processedMetricStream: 282 | t.Error("received processed metrics from the MetricSystem after unsubscribing.") 283 | default: 284 | } 285 | }() 286 | time.Sleep(20 * time.Millisecond) 287 | } 288 | 289 | func TestProcessedBroadcast(t *testing.T) { 290 | processedMetricStream := make(chan *ProcessedMetricSet, 128) 291 | metricSystem := NewMetricSystem(time.Microsecond, false) 292 | metricSystem.SubscribeToProcessedMetrics(processedMetricStream) 293 | 294 | metricSystem.Histogram("histogram1", 33) 295 | metricSystem.Histogram("histogram1", 59) 296 | metricSystem.Histogram("histogram1", 330000) 297 | metricSystem.Start() 298 | 299 | select { 300 | case processedMetrics := <-processedMetricStream: 301 | if int(processedMetrics.Metrics["histogram1_sum"]) != 331132 { 302 | t.Error("expected histogram1_sum to be 331132, instead was", 303 | processedMetrics.Metrics["histogram1_sum"]) 304 | } 305 | if int(processedMetrics.Metrics["histogram1_agg_avg"]) != 110377 { 306 | t.Error("expected histogram1_agg_avg to be 110377, instead was", 307 | processedMetrics.Metrics["histogram1_agg_avg"]) 308 | } 309 | if int(processedMetrics.Metrics["histogram1_count"]) != 3 { 310 | t.Error("expected histogram1_count to be 3, instead was", 311 | processedMetrics.Metrics["histogram1_count"]) 312 | } 313 | case <-time.After(20 * 
time.Millisecond): 314 | t.Error("received no metrics from the MetricSystem after 2 milliseconds.") 315 | } 316 | 317 | metricSystem.UnsubscribeFromProcessedMetrics(processedMetricStream) 318 | metricSystem.Stop() 319 | } 320 | 321 | func TestRawBroadcast(t *testing.T) { 322 | rawMetricStream := make(chan *RawMetricSet, 128) 323 | metricSystem := NewMetricSystem(time.Microsecond, false) 324 | metricSystem.SubscribeToRawMetrics(rawMetricStream) 325 | 326 | metricSystem.Counter("counter2", 10) 327 | metricSystem.Counter("counter2", 111) 328 | metricSystem.Start() 329 | 330 | select { 331 | case rawMetrics := <-rawMetricStream: 332 | if rawMetrics.Counters["counter2"] != 121 { 333 | t.Error("expected counter2 to be 121, instead was", 334 | rawMetrics.Counters["counter2"]) 335 | } 336 | if rawMetrics.Rates["counter2"] != 121 { 337 | t.Error("expected counter2 rate to be 121, instead was", 338 | rawMetrics.Counters["counter2"]) 339 | } 340 | case <-time.After(20 * time.Millisecond): 341 | t.Error("received no metrics from the MetricSystem after 2 milliseconds.") 342 | } 343 | 344 | metricSystem.UnsubscribeFromRawMetrics(rawMetricStream) 345 | metricSystem.Stop() 346 | } 347 | 348 | func TestMetricSystemStop(t *testing.T) { 349 | metricSystem := NewMetricSystem(time.Microsecond, false) 350 | 351 | startingRoutines := runtime.NumGoroutine() 352 | 353 | metricSystem.Start() 354 | metricSystem.Stop() 355 | 356 | time.Sleep(20 * time.Millisecond) 357 | 358 | endRoutines := runtime.NumGoroutine() 359 | if startingRoutines < endRoutines { 360 | t.Errorf("lingering goroutines have not been cleaned up: "+ 361 | "before: %d, after: %d\n", startingRoutines, endRoutines) 362 | } 363 | } 364 | -------------------------------------------------------------------------------- /opentsdb.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Cockroach Authors. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | // implied. See the License for the specific language governing 13 | // permissions and limitations under the License. See the AUTHORS file 14 | // for names of contributors. 15 | // 16 | // Author: Tyler Neely (t@jujit.su) 17 | 18 | package loghisto 19 | 20 | import ( 21 | "bytes" 22 | "fmt" 23 | "os" 24 | "strings" 25 | 26 | ) 27 | 28 | type openTSDBStat struct { 29 | Metric string 30 | Time int64 31 | Value float64 32 | Tags map[string]string 33 | } 34 | 35 | type openTSDBStatArray []*openTSDBStat 36 | 37 | func mapToTSDProtocolTags(tagMap map[string]string) string { 38 | tags := make([]string, 0, len(tagMap)) 39 | for tag, value := range tagMap { 40 | tags = append(tags, fmt.Sprintf("%s=%s", tag, value)) 41 | } 42 | return strings.Join(tags, " ") 43 | } 44 | 45 | func (stats openTSDBStatArray) ToRequest() []byte { 46 | var request bytes.Buffer 47 | for _, stat := range stats { 48 | request.Write([]byte(fmt.Sprintf("put %s %d %f %s\n", 49 | stat.Metric, 50 | stat.Time, 51 | stat.Value, 52 | mapToTSDProtocolTags(stat.Tags)))) 53 | } 54 | return []byte(request.String()) 55 | } 56 | 57 | func (metricSet *ProcessedMetricSet) toopenTSDBStats() openTSDBStatArray { 58 | hostname, err := os.Hostname() 59 | if err != nil { 60 | hostname = "unknown" 61 | } 62 | 63 | stats := make([]*openTSDBStat, 0, len(metricSet.Metrics)) 64 | i := 0 65 | for metric, value := range metricSet.Metrics { 66 | var tags = map[string]string{ 67 | "host": hostname, 68 | } 69 | //TODO(tyler) custom tags 70 | stats = 
append(stats, &openTSDBStat{ 71 | Metric: metric, 72 | Time: metricSet.Time.Unix(), 73 | Value: value, 74 | Tags: tags, 75 | }) 76 | i++ 77 | } 78 | return stats 79 | } 80 | 81 | // OpenTSDBProtocol generates a wire representation of a ProcessedMetricSet 82 | // for submission to an OpenTSDB instance. 83 | func OpenTSDBProtocol(ms *ProcessedMetricSet) []byte { 84 | return ms.toopenTSDBStats().ToRequest() 85 | } 86 | -------------------------------------------------------------------------------- /opentsdb_test.go: -------------------------------------------------------------------------------- 1 | package loghisto 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestOpenTSDB(t *testing.T) { 9 | ms := NewMetricSystem(time.Second, true) 10 | s := NewSubmitter(ms, OpenTSDBProtocol, "tcp", "localhost:7777") 11 | s.Start() 12 | 13 | metrics := &ProcessedMetricSet{ 14 | Time: time.Now(), 15 | Metrics: map[string]float64{ 16 | "test.1": 43.32, 17 | "test.2": 12.3, 18 | }, 19 | } 20 | request := s.serializer(metrics) 21 | s.submit(request) 22 | s.Shutdown() 23 | } 24 | -------------------------------------------------------------------------------- /print_benchmark.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Cockroach Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | // implied. See the License for the specific language governing 13 | // permissions and limitations under the License. See the AUTHORS file 14 | // for names of contributors. 
15 | // 16 | // Author: Tyler Neely (t@jujit.su) 17 | 18 | package loghisto 19 | 20 | import ( 21 | "fmt" 22 | "os" 23 | "runtime" 24 | "text/tabwriter" 25 | "time" 26 | ) 27 | 28 | // PrintBenchmark will run the provided function at the specified 29 | // concurrency, time the operation, and once per second write the 30 | // following information to standard out: 31 | // 32 | // 2014-08-09 17:44:57 -0400 EDT 33 | // raft_AppendLogEntries_count: 16488 34 | // raft_AppendLogEntries_max: 3.982478339757623e+07 35 | // raft_AppendLogEntries_99.99: 3.864778314316012e+07 36 | // raft_AppendLogEntries_99.9: 3.4366224772310276e+06 37 | // raft_AppendLogEntries_99: 2.0228126576114902e+06 38 | // raft_AppendLogEntries_50: 469769.7083161708 39 | // raft_AppendLogEntries_min: 129313.15075081984 40 | // raft_AppendLogEntries_sum: 9.975892639594093e+09 41 | // raft_AppendLogEntries_avg: 605039.5827022133 42 | // raft_AppendLogEntries_agg_avg: 618937 43 | // raft_AppendLogEntries_agg_count: 121095 44 | // raft_AppendLogEntries_agg_sum: 7.4950269894e+10 45 | // sys.Alloc: 997328 46 | // sys.NumGC: 1115 47 | // sys.PauseTotalNs: 2.94946542e+08 48 | // sys.NumGoroutine: 26 49 | func PrintBenchmark(name string, concurrency uint, op func()) { 50 | runtime.GOMAXPROCS(runtime.NumCPU()) 51 | var ms = NewMetricSystem(time.Second, true) 52 | mc := make(chan *ProcessedMetricSet, 1) 53 | ms.SubscribeToProcessedMetrics(mc) 54 | ms.Start() 55 | defer ms.Stop() 56 | 57 | go receiver(name, mc) 58 | 59 | for i := uint(0); i < concurrency; i++ { 60 | go func() { 61 | for { 62 | timer := ms.StartTimer(name) 63 | op() 64 | timer.Stop() 65 | } 66 | }() 67 | } 68 | 69 | <-make(chan struct{}) 70 | } 71 | 72 | func receiver(name string, mc chan *ProcessedMetricSet) { 73 | interesting := []string{ 74 | fmt.Sprintf("%s_count", name), 75 | fmt.Sprintf("%s_max", name), 76 | fmt.Sprintf("%s_99.99", name), 77 | fmt.Sprintf("%s_99.9", name), 78 | fmt.Sprintf("%s_99", name), 79 | fmt.Sprintf("%s_95", name), 80 | 
fmt.Sprintf("%s_90", name), 81 | fmt.Sprintf("%s_75", name), 82 | fmt.Sprintf("%s_50", name), 83 | fmt.Sprintf("%s_min", name), 84 | fmt.Sprintf("%s_sum", name), 85 | fmt.Sprintf("%s_avg", name), 86 | fmt.Sprintf("%s_agg_avg", name), 87 | fmt.Sprintf("%s_agg_count", name), 88 | fmt.Sprintf("%s_agg_sum", name), 89 | "sys.Alloc", 90 | "sys.NumGC", 91 | "sys.PauseTotalNs", 92 | "sys.NumGoroutine", 93 | } 94 | 95 | w := new(tabwriter.Writer) 96 | w.Init(os.Stdout, 0, 8, 0, '\t', 0) 97 | 98 | for m := range mc { 99 | fmt.Fprintln(w, m.Time) 100 | for _, e := range interesting { 101 | fmt.Fprintln(w, fmt.Sprintf("%s:\t", e), m.Metrics[e]) 102 | } 103 | fmt.Fprintln(w) 104 | w.Flush() 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | loghisto 2 | ============ 3 | [![Build Status](https://travis-ci.org/spacejam/loghisto.svg)](https://travis-ci.org/spacejam/loghisto) 4 | 5 | A metric system for high performance counters and histograms. Unlike popular metric systems today, this does not destroy the accuracy of histograms by sampling. Instead, a logarithmic bucketing function compresses values, generally within 1% of their true value (although between 0 and 1 the precision loss may not be within this boundary). This allows for extreme compression, which allows us to calculate arbitrarily high percentiles with no loss of accuracy - just a small amount of precision. This is particularly useful for highly-clustered events that are tolerant of a small precision loss, but for which you REALLY care about what the tail looks like, such as measuring latency across a distributed system. 6 | 7 | Copied out of my work for the CockroachDB metrics system. Based on an algorithm created by Keith Frost. 
8 | 9 | 10 | ### running a print benchmark for quick analysis 11 | ```go 12 | package main 13 | 14 | import ( 15 | "runtime" 16 | "github.com/spacejam/loghisto" 17 | ) 18 | 19 | func benchmark() { 20 | // do some stuff 21 | } 22 | 23 | func main() { 24 | numCPU := runtime.NumCPU() 25 | runtime.GOMAXPROCS(numCPU) 26 | 27 | desiredConcurrency := uint(100) 28 | loghisto.PrintBenchmark("benchmark1234", desiredConcurrency, benchmark) 29 | } 30 | ``` 31 | results in something like this printed to stdout each second: 32 | ``` 33 | 2014-12-11 21:41:45 -0500 EST 34 | benchmark1234_count: 2.0171025e+07 35 | benchmark1234_max: 2.4642914167480484e+07 36 | benchmark1234_99.99: 4913.768840299134 37 | benchmark1234_99.9: 1001.2472422902518 38 | benchmark1234_99: 71.24044000732538 39 | benchmark1234_95: 67.03348428941965 40 | benchmark1234_90: 65.68633104092515 41 | benchmark1234_75: 63.07152259993664 42 | benchmark1234_50: 58.739891704145194 43 | benchmark1234_min: -657.5233632152207 // Corollary: time.Since(time.Now()) is often < 0 44 | benchmark1234_sum: 1.648051169322668e+09 45 | benchmark1234_avg: 81.70388809307748 46 | benchmark1234_agg_avg: 89 47 | benchmark1234_agg_count: 6.0962226e+07 48 | benchmark1234_agg_sum: 5.454779078e+09 49 | sys.Alloc: 1.132672e+06 50 | sys.NumGC: 5741 51 | sys.PauseTotalNs: 1.569390954e+09 52 | sys.NumGoroutine: 113 53 | ``` 54 | ### adding an embedded metric system to your code 55 | ```go 56 | import ( 57 | "time" 58 | "fmt" 59 | "github.com/spacejam/loghisto" 60 | ) 61 | func ExampleMetricSystem() { 62 | // Create metric system that reports once a minute, and includes stats 63 | // about goroutines, memory usage and GC. 64 | includeGoProcessStats := true 65 | ms := loghisto.NewMetricSystem(time.Minute, includeGoProcessStats) 66 | ms.Start() 67 | 68 | // create a channel that subscribes to metrics as they are produced once 69 | // per minute. 
70 | // NOTE: if you allow this channel to fill up, the metric system will NOT 71 | // block, and will FORGET about your channel if you fail to unblock the 72 | // channel after 3 configured intervals (in this case 3 minutes) rather 73 | // than causing a memory leak. 74 | myMetricStream := make(chan *loghisto.ProcessedMetricSet, 2) 75 | ms.SubscribeToProcessedMetrics(myMetricStream) 76 | 77 | // create some metrics 78 | timeToken := ms.StartTimer("time for creating a counter and histo") 79 | ms.Counter("some event", 1) 80 | ms.Histogram("some measured thing", 123) 81 | timeToken.Stop() 82 | 83 | for m := range myMetricStream { 84 | fmt.Printf("number of goroutines: %f\n", m.Metrics["sys.NumGoroutine"]) 85 | } 86 | 87 | // if you want to manually unsubscribe from the metric stream 88 | ms.UnsubscribeFromProcessedMetrics(myMetricStream) 89 | 90 | // to stop and clean up your metric system 91 | ms.Stop() 92 | } 93 | ``` 94 | ### automatically sending your metrics to OpenTSDB, KairosDB or Graphite 95 | ```go 96 | func ExampleExternalSubmitter() { 97 | includeGoProcessStats := true 98 | ms := NewMetricSystem(time.Minute, includeGoProcessStats) 99 | ms.Start() 100 | // graphite 101 | s := NewSubmitter(ms, GraphiteProtocol, "tcp", "localhost:7777") 102 | s.Start() 103 | 104 | // or, alternatively: opentsdb / kairosdb 105 | s = NewSubmitter(ms, OpenTSDBProtocol, "tcp", "localhost:7777") 106 | s.Start() 107 | 108 | // to tear down: 109 | s.Shutdown() 110 | } 111 | ``` 112 | 113 | See the code for the Graphite/OpenTSDB protocols if you want to add your own output plugins; it's pretty simple. 114 | -------------------------------------------------------------------------------- /submitter.go: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Cockroach Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License. See the AUTHORS file
// for names of contributors.
//
// Author: Tyler Neely (t@jujit.su)

package loghisto

import (
	"net"
	"sync"
	"time"
)

// requestable is an empty interface.
// NOTE(review): it is not referenced anywhere in the visible sources -
// confirm whether it is still needed.
type requestable interface{}

// requestableArray is anything that can serialize itself into the bytes of a
// single request.
type requestableArray interface {
	ToRequest() []byte
}

// Submitter encapsulates the state of a metric submitter.
type Submitter struct {
	// backlog works as an evicting queue: a fixed ring of serialized requests
	// in which the oldest entry is dropped once the ring is full.
	backlog [60][]byte
	// backlogHead is the index of the oldest request that is still unsent.
	backlogHead uint
	// backlogTail is the index the next request will be written to; the
	// backlog is empty when backlogHead == backlogTail.
	backlogTail uint
	// backlogMu guards backlog, backlogHead and backlogTail.
	backlogMu sync.Mutex
	// serializer turns a processed metric set into the bytes of one request.
	serializer func(*ProcessedMetricSet) []byte
	// DestinationNetwork and DestinationAddress are passed to
	// net.DialTimeout for every submission.
	DestinationNetwork string
	DestinationAddress string
	// metricSystem supplies the interval that paces submission attempts.
	metricSystem *MetricSystem
	// metricChan is the channel subscribed to the metric system's processed
	// metrics.
	metricChan chan *ProcessedMetricSet
	// shutdownChan is closed by Shutdown to stop the submitter's goroutines.
	shutdownChan chan struct{}
}

// NewSubmitter creates a Submitter that receives metrics off of a
// specified metric channel, serializes them using the provided
// serialization function, and attempts to send them to the
// specified destination.
52 | func NewSubmitter(metricSystem *MetricSystem, 53 | serializer func(*ProcessedMetricSet) []byte, destinationNetwork string, 54 | destinationAddress string) *Submitter { 55 | metricChan := make(chan *ProcessedMetricSet, 60) 56 | metricSystem.SubscribeToProcessedMetrics(metricChan) 57 | return &Submitter{ 58 | backlog: [60][]byte{}, 59 | backlogHead: 0, 60 | backlogTail: 0, 61 | serializer: serializer, 62 | DestinationNetwork: destinationNetwork, 63 | DestinationAddress: destinationAddress, 64 | metricSystem: metricSystem, 65 | metricChan: metricChan, 66 | shutdownChan: make(chan struct{}), 67 | } 68 | } 69 | 70 | func (s *Submitter) retryBacklog() error { 71 | var request []byte 72 | for { 73 | s.backlogMu.Lock() 74 | head := s.backlogHead 75 | tail := s.backlogTail 76 | if head != tail { 77 | request = s.backlog[head] 78 | } 79 | s.backlogMu.Unlock() 80 | 81 | if head == tail { 82 | return nil 83 | } 84 | 85 | err := s.submit(request) 86 | if err != nil { 87 | return err 88 | } 89 | s.backlogMu.Lock() 90 | s.backlogHead = (s.backlogHead + 1) % 60 91 | s.backlogMu.Unlock() 92 | } 93 | } 94 | 95 | func (s *Submitter) appendToBacklog(request []byte) { 96 | s.backlogMu.Lock() 97 | s.backlog[s.backlogTail] = request 98 | s.backlogTail = (s.backlogTail + 1) % 60 99 | // if we've run into the head, evict it 100 | if s.backlogHead == s.backlogTail { 101 | s.backlogHead = (s.backlogHead + 1) % 60 102 | } 103 | s.backlogMu.Unlock() 104 | } 105 | 106 | func (s *Submitter) submit(request []byte) error { 107 | conn, err := net.DialTimeout(s.DestinationNetwork, s.DestinationAddress, 108 | 5*time.Second) 109 | if err != nil { 110 | return err 111 | } 112 | conn.SetDeadline(time.Now().Add(5 * time.Second)) 113 | _, err = conn.Write(request) 114 | conn.Close() 115 | return err 116 | } 117 | 118 | // Start creates the goroutines that receive, serialize, and send metrics. 
119 | func (s *Submitter) Start() { 120 | go func() { 121 | for { 122 | select { 123 | case metrics, ok := <-s.metricChan: 124 | if !ok { 125 | // We can no longer make progress. 126 | return 127 | } 128 | request := s.serializer(metrics) 129 | s.appendToBacklog(request) 130 | case <-s.shutdownChan: 131 | return 132 | } 133 | } 134 | }() 135 | 136 | go func() { 137 | for { 138 | select { 139 | case <-s.shutdownChan: 140 | return 141 | default: 142 | s.retryBacklog() 143 | tts := s.metricSystem.interval.Nanoseconds() - 144 | (time.Now().UnixNano() % s.metricSystem.interval.Nanoseconds()) 145 | time.Sleep(time.Duration(tts)) 146 | } 147 | } 148 | }() 149 | } 150 | 151 | // Shutdown shuts down a submitter 152 | func (s *Submitter) Shutdown() { 153 | select { 154 | case <-s.shutdownChan: 155 | // already closed 156 | default: 157 | close(s.shutdownChan) 158 | } 159 | } 160 | --------------------------------------------------------------------------------