├── .gitignore ├── LICENSE.txt ├── README.txt └── src ├── estatsd.app.src ├── estatsd.erl ├── estatsd_app.erl ├── estatsd_server.erl └── estatsd_sup.erl /.gitignore: -------------------------------------------------------------------------------- 1 | ebin 2 | .* 3 | *.beam 4 | *~ 5 | erl_crash.dump 6 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 Richard Jones 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated documentation 5 | files (the "Software"), to deal in the Software without 6 | restriction, including without limitation the rights to use, 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the 9 | Software is furnished to do so, subject to the following 10 | conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 | 24 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | estatsd is a simple stats aggregation service that periodically dumps data to 2 | Graphite: http://graphite.wikidot.com/ 3 | 4 | NB: Graphite is good, despite the website being a bit ghetto. 5 | 6 | Inspired heavily by etsy statsd: 7 | http://codeascraft.etsy.com/2011/02/15/measure-anything-measure-everything/ 8 | 9 | QUICK DEMO 10 | ========== 11 | 12 | 1) Install and configure graphite (quick-ish) 13 | 2) Install rebar, have it in your path 14 | 3) rebar compile 15 | 4) erl -pa ebin 16 | 5) > application:start(estatsd). 17 | > estatsd:increment(foo, 123). 18 | 6) Observe graphite now has 1 data point. 19 | 20 | USAGE 21 | ===== 22 | 23 | Add this app to your rebar deps, and make sure it's started somehow 24 | eg: application:start(estatsd). 25 | 26 | You can configure custom graphite host/port and flush interval using 27 | application environment vars. See estatsd_sup for details. 28 | 29 | The following calls to estatsd are all gen_server:cast, ie non-blocking. 30 | 31 | Gauges 32 | -------- 33 | 34 | estatsd:gauge(temperature, 45). %% set temperature to 45 35 | 36 | Counters 37 | -------- 38 | 39 | estatsd:increment(num_foos). %% increment num_foos by one 40 | 41 | estatsd:decrement(<<"num_bars">>, 3). %% decrement num_bars by 3 42 | 43 | estatsd:increment("tcp.bytes_in", 512). %% increment tcp.bytes_in by 512 44 | 45 | Timers 46 | ------ 47 | 48 | estatsd:timing(sometask, 1534). %% report that sometask took 1534ms 49 | 50 | Or for your convenience: 51 | 52 | Start = erlang:now(), 53 | do_sometask(), 54 | estatsd:timing(sometask, Start). %% the elapsed ms since Start is computed for you 55 | 56 | 57 | 58 | NOTES 59 | ===== 60 | 61 | This could be extended to take a callback for reporting mechanisms. 62 | Right now it's hardcoded to stick data into graphite. 

I've been running this since May 2011 in production for irccloud.


Richard Jones
@metabrew

--------------------------------------------------------------------------------
/src/estatsd.app.src:
--------------------------------------------------------------------------------
{application, estatsd,
 [
  {description, "Stats aggregation service that writes to graphite"},
  {vsn, "1.0"},
  {registered, []},
  {applications, [
                  kernel,
                  stdlib
                 ]},
  {mod, { estatsd_app, []}},
  {env, []}
 ]}.

--------------------------------------------------------------------------------
/src/estatsd.erl:
--------------------------------------------------------------------------------
%% Client API for estatsd.
%%
%% Every function here is a gen_server:cast to the locally registered
%% estatsd_server process, so calls never block the caller.
-module(estatsd).

-export([
         gauge/2,
         increment/1, increment/2, increment/3,
         decrement/1, decrement/2, decrement/3,
         timing/2
        ]).

-define(SERVER, estatsd_server).

%% Report a timing for Key.
%%
%% The second argument is one of:
%%   * a {MegaSecs, Secs, MicroSecs} timestamp taken when the work started
%%     (as returned by os:timestamp/0 or erlang:now/0); the elapsed time up
%%     to this call is computed and reported in milliseconds;
%%   * an integer duration in milliseconds;
%%   * any other number, which is rounded to the nearest millisecond.
timing(Key, StartTime = {_,_,_}) ->
    %% os:timestamp/0 returns the same tuple shape as the deprecated
    %% erlang:now/0, so start times taken with either keep working.
    ElapsedMicros = timer:now_diff(os:timestamp(), StartTime),
    timing(Key, erlang:round(ElapsedMicros / 1000));
timing(Key, Duration) when is_integer(Duration) ->
    gen_server:cast(?SERVER, {timing, Key, Duration});
timing(Key, Duration) ->
    gen_server:cast(?SERVER, {timing, Key, erlang:round(Duration)}).

%% Increment the counter Key by Amount (default 1).
%% Sample is the sample rate passed along to the server (default 1).
increment(Key) -> increment(Key, 1, 1).
increment(Key, Amount) -> increment(Key, Amount, 1).
increment(Key, Amount, Sample) ->
    gen_server:cast(?SERVER, {increment, Key, Amount, Sample}).

%% Decrement the counter Key by Amount (default 1).
%% decrement/3 negates Amount, so callers always pass a positive amount;
%% decrement/1 must therefore default to 1, not -1 (which would increment).
decrement(Key) -> decrement(Key, 1, 1).
decrement(Key, Amount) -> decrement(Key, Amount, 1).
decrement(Key, Amount, Sample) ->
    increment(Key, 0 - Amount, Sample).

%% Record the current value of the gauge Key.
%% Only numeric values are accepted; the request is cast to the server.
gauge(Key, Value) when is_number(Value) ->
    Request = {gauge, Key, Value},
    gen_server:cast(?SERVER, Request).

--------------------------------------------------------------------------------
/src/estatsd_app.erl:
--------------------------------------------------------------------------------
%% OTP application callback module for estatsd.
-module(estatsd_app).

-behaviour(application).

%% Application callbacks
-export([start/2, stop/1]).

%% ===================================================================
%% Application callbacks
%% ===================================================================

%% Starting the application starts the top-level supervisor.
start(_StartType, _StartArgs) ->
    estatsd_sup:start_link().

%% Nothing to clean up on stop.
stop(_State) ->
    ok.

--------------------------------------------------------------------------------
/src/estatsd_server.erl:
--------------------------------------------------------------------------------
%% Stats aggregation process that periodically dumps data to graphite
%% Will calculate 90th percentile etc.
%% Inspired by etsy statsd:
%% http://codeascraft.etsy.com/2011/02/15/measure-anything-measure-everything/
%%
%% This could be extended to take a callback for reporting mechanisms.
%% Right now it's hardcoded to stick data into graphite.
%%
%% Richard Jones
%%
-module(estatsd_server).
-behaviour(gen_server).

-export([start_link/4]).

%-export([key2str/1,flush/0]). %% export for debugging

-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
         terminate/2, code_change/3]).

%% Server state. Counters and gauges are kept in ETS tables, not here.
-record(state, {
          %% gb_tree of timer data
          timers,
          %% ms interval between stats flushing
          flush_interval,
          %% TRef of interval timer
          flush_timer,
          %% graphite server host
          graphite_host,
          %% graphite server port
          graphite_port,
          %% flag to enable sending VM metrics on flush
          vm_metrics
         }).

%% Start the aggregation server, registered locally under the module name.
%%   FlushIntervalMs            - milliseconds between two flushes
%%   GraphiteHost, GraphitePort - handed to gen_tcp:connect/3 on every flush
%%   VmMetrics                  - true to append VM metrics to each report,
%%                                false to leave them out
start_link(FlushIntervalMs, GraphiteHost, GraphitePort, VmMetrics) ->
    gen_server:start_link({local, ?MODULE},
                          ?MODULE,
                          [FlushIntervalMs, GraphiteHost, GraphitePort, VmMetrics],
                          []).

%%
%% gen_server callbacks
%%

%% Create the counter and gauge tables and schedule the periodic flush,
%% which is delivered to this server as a `flush' cast.
init([FlushIntervalMs, GraphiteHost, GraphitePort, VmMetrics]) ->
    error_logger:info_msg("estatsd will flush stats to ~p:~w every ~wms\n",
                          [ GraphiteHost, GraphitePort, FlushIntervalMs ]),
    ets:new(statsd,      [named_table, set]),
    ets:new(statsdgauge, [named_table, set]),
    %% Flush out stats to graphite periodically
    {ok, Tref} = timer:apply_interval(FlushIntervalMs, gen_server, cast,
                                      [?MODULE, flush]),
    State = #state{ timers          = gb_trees:empty(),
                    flush_interval  = FlushIntervalMs,
                    flush_timer     = Tref,
                    graphite_host   = GraphiteHost,
                    graphite_port   = GraphitePort,
                    vm_metrics      = VmMetrics
                  },
    {ok, State}.

%% Gauge: remember every reported value (newest first) together with the
%% unix time, as a string, at which it arrived.
handle_cast({gauge, Key, Value0}, State) ->
    Value = {Value0, num2str(unixtime())},
    case ets:lookup(statsdgauge, Key) of
        [] ->
            ets:insert(statsdgauge, {Key, [Value]});
        [{Key, Values}] ->
            ets:insert(statsdgauge, {Key, [Value | Values]})
    end,
    {noreply, State};

%% Counter: accumulate {Total, NumberOfUpdates} per key.
%% The sample rate must lie in (0, 1]; a rate of 0 used to pass the guard
%% and crash the server with badarith on 1 / Sample.
handle_cast({increment, Key, Delta0, Sample}, State)
  when is_number(Delta0), is_number(Sample), Sample > 0, Sample =< 1 ->
    Delta = Delta0 * ( 1 / Sample ), %% account for sample rates < 1.0
    case ets:lookup(statsd, Key) of
        [] ->
            ets:insert(statsd, {Key, {Delta,1}});
        [{Key,{Tot,Times}}] ->
            ets:insert(statsd, {Key,{Tot+Delta, Times+1}})
    end,
    {noreply, State};

%% Invalid increment: drop it instead of crashing, because a crash would
%% throw away every stat aggregated since the last flush.
handle_cast({increment, Key, Delta0, Sample}, State) ->
    error_logger:warning_msg("estatsd dropping increment for ~p: "
                             "invalid amount ~p or sample rate ~p\n",
                             [Key, Delta0, Sample]),
    {noreply, State};

%% Timer: collect the raw durations per key until the next flush.
handle_cast({timing, Key, Duration}, State) ->
    Timers = State#state.timers,
    case gb_trees:lookup(Key, Timers) of
        none ->
            {noreply, State#state{timers = gb_trees:insert(Key, [Duration], Timers)}};
        {value, Val} ->
            {noreply, State#state{timers = gb_trees:update(Key, [Duration|Val], Timers)}}
    end;

%% Flush: snapshot counters and gauges, hand the snapshot (plus the timers
%% still held in State) to a separate process, then wipe everything.
%% The reporter is spawned without a link, so its exit does not reach
%% this server.
handle_cast(flush, State) ->
    All = ets:tab2list(statsd),
    Gauges = ets:tab2list(statsdgauge),
    spawn( fun() -> do_report(All, Gauges, State) end ),
    %% WIPE ALL
    ets:delete_all_objects(statsd),
    ets:delete_all_objects(statsdgauge),
    NewState = State#state{timers = gb_trees:empty()},
    {noreply, NewState}.

handle_call(_,_,State)      -> {reply, ok, State}.

handle_info(_Msg, State)    -> {noreply, State}.

%% gen_server expects {ok, NewState} from code_change/3; the previous
%% {noreply, State} return value is not a valid result for this callback.
code_change(_, _, State)    -> {ok, State}.

terminate(_, _)             -> ok.

%% INTERNAL STUFF

%% Open a fresh TCP connection to graphite, send Msg (an iolist) and close
%% the socket again. Returns the result of gen_tcp:send/2, or the error
%% returned by gen_tcp:connect/3.
send_to_graphite(Msg, State) ->
    % io:format("SENDING: ~s\n", [Msg]),
    case gen_tcp:connect(State#state.graphite_host,
                         State#state.graphite_port,
                         [list, {packet, 0}]) of
        {ok, Sock} ->
            SendResult = gen_tcp:send(Sock, Msg),
            gen_tcp:close(Sock),
            SendResult;
        E ->
            %error_logger:error_msg("Failed to connect to graphite: ~p", [E]),
            E
    end.

%% Turn a key (atom, binary or string) into a graphite-safe string:
%% runs of whitespace become "_", "/" becomes "-", and every remaining
%% character other than a letter, digit, "_", "-" or "." is removed.
%% Atom keys are sanitised like the other key types; they used to be
%% returned verbatim.
key2str(K) when is_atom(K) ->
    key2str(atom_to_list(K));
key2str(K) when is_binary(K) ->
    key2str(binary_to_list(K));
key2str(K) when is_list(K) ->
    Opts = [global, {return, list}],
    S1 = re:replace(K,  "\\s+", "_", Opts),
    S2 = re:replace(S1, "/", "-", Opts),
    re:replace(S2, "[^a-zA-Z_\\-0-9\\.]", "", Opts).

%% Format any term with ~w and return it as a flat string.
num2str(NN) -> lists:flatten(io_lib:format("~w",[NN])).

%% Seconds since the unix epoch. os:timestamp/0 replaces the deprecated
%% erlang:now/0 and returns the same {MegaSecs, Secs, MicroSecs} tuple.
unixtime()  -> {Meg,S,_Mic} = os:timestamp(), Meg*1000000 + S.

%% Build the complete report for one flush and, unless there is nothing
%% to say, send it to graphite. Returns nothing_to_report or whatever
%% send_to_graphite/2 returns.
do_report(All, Gauges, State) ->
    %% A single timestamp string is shared by every line except the gauge
    %% lines, which carry their own.
    Timestamp = num2str(unixtime()),
    {CounterLines, CounterCount} = do_report_counters(All, Timestamp, State),
    {TimerLines,   TimerCount}   = do_report_timers(Timestamp, State),
    {GaugeLines,   GaugeCount}   = do_report_gauges(Gauges),
    {VmLines,      VmCount}      = do_report_vm_metrics(Timestamp, State),
    Total = TimerCount + CounterCount + GaugeCount + VmCount,
    %% REPORT TO GRAPHITE
    if
        Total =:= 0 ->
            nothing_to_report;
        true ->
            %% Also graph the number of graphs we're graphing:
            Summary = ["stats.num_stats ", num2str(Total), " ", Timestamp, "\n"],
            send_to_graphite([CounterLines, TimerLines, GaugeLines, VmLines | Summary],
                             State)
    end.

%% Render every counter as two graphite lines: the per-second rate over
%% the flush interval, and the number of updates received.
%% Returns {IoList, NumberOfCounters}.
do_report_counters(All, TsStr, State) ->
    Render = fun({Key, {Total, Updates}}) ->
                     counter_lines(key2str(Key), Total, Updates, TsStr, State)
             end,
    %% Reversed so the fragments come out in the same order as when they
    %% are prepended one at a time.
    {lists:reverse(lists:map(Render, All)), length(All)}.

%% The two graphite lines for one counter.
counter_lines(KeyS, Total, Updates, TsStr, State) ->
    PerSecond = Total / (State#state.flush_interval/1000),
    RateLine  = [ "stats.counters.", KeyS, " ",
                  io_lib:format("~w", [PerSecond]), " ",
                  TsStr, "\n" ],
    CountLine = [ "stats.counters.counts.", KeyS, " ",
                  io_lib:format("~w",[Updates]), " ",
                  TsStr, "\n" ],
    RateLine ++ CountLine.

%% Render the timers held in State: mean, upper, upper_90, lower and
%% count for every key. Returns {IoList, NumberOfTimerKeys}.
do_report_timers(TsStr, State) ->
    Render = fun({Key, Durations}) -> timer_lines(Key, Durations, TsStr) end,
    PerKey = lists:map(Render, gb_trees:to_list(State#state.timers)),
    %% Reversed so the fragments come out in the same order as when they
    %% are prepended one at a time.
    Msg = lists:reverse(PerKey),
    {Msg, length(Msg)}.

%% The five graphite lines for a single timer key.
timer_lines(Key, Durations, TsStr) ->
    KeyS   = key2str(Key),
    Sorted = lists:sort(Durations),
    Count  = length(Sorted),
    Lowest  = hd(Sorted),
    Highest = lists:last(Sorted),
    PctThreshold = 90,
    %% The slowest (100 - PctThreshold)% of the samples are left out of
    %% the mean and of the upper_<pct> value.
    Dropped = erlang:round(((100-PctThreshold)/100)*Count),
    Kept    = Count - Dropped,
    UpperAtPct = lists:nth(Kept, Sorted),
    Mean = lists:sum(lists:sublist(Sorted, Kept)) / Kept,
    %% Build stats string for graphite
    Prefix = [ "stats.timers.", KeyS, "." ],
    Suffix = [" ", TsStr, "\n"],
    Line = fun({Name, Val}) -> [Prefix, Name, " ", num2str(Val), Suffix] end,
    lists:map(Line, [ {"mean",  Mean},
                      {"upper", Highest},
                      {"upper_"++num2str(PctThreshold), UpperAtPct},
                      {"lower", Lowest},
                      {"count", Count} ]).

%% Render every stored gauge sample as one graphite line, using the
%% timestamp string stored with the sample.
%% Returns {IoList, NumberOfGaugeKeys}.
do_report_gauges(Gauges) ->
    Render = fun({Key, Samples}) -> gauge_lines(key2str(Key), Samples) end,
    {lists:reverse(lists:map(Render, Gauges)), length(Gauges)}.

%% One graphite line per sample of a single gauge key, in the reverse of
%% the order in which the samples are stored.
gauge_lines(KeyS, Samples) ->
    Line = fun({Val, TsStr}) ->
                   %% Build stats string for graphite
                   [ "stats.gauges.", KeyS, " ",
                     io_lib:format("~w", [Val]), " ",
                     TsStr, "\n" ]
           end,
    lists:reverse(lists:map(Line, Samples)).

%% Render the VM metrics when the vm_metrics flag in State is true;
%% produce nothing when it is false.
%% Returns {ListOfLines, NumberOfLines}.
do_report_vm_metrics(TsStr, State) ->
    Msg = case State#state.vm_metrics of
              true  -> vm_metric_lines(TsStr);
              false -> []
          end,
    {Msg, length(Msg)}.

%% One graphite line per VM statistic and per erlang:memory/0 entry,
%% all namespaced under this node's key.
vm_metric_lines(TsStr) ->
    NodeKey = node_key(),
    {TotalReductions, Reductions} = erlang:statistics(reductions),
    {NumberOfGCs, WordsReclaimed, _} = erlang:statistics(garbage_collection),
    {{input, Input}, {output, Output}} = erlang:statistics(io),
    RunQueue = erlang:statistics(run_queue),
    StatsData = [
                 {process_count, erlang:system_info(process_count)},
                 {reductions, Reductions},
                 {total_reductions, TotalReductions},
                 {number_of_gcs, NumberOfGCs},
                 {words_reclaimed, WordsReclaimed},
                 {input, Input},
                 {output, Output},
                 {run_queue, RunQueue}
                ],
    Line = fun(Section, {Key, Val}) ->
                   [ "stats.vm.", NodeKey, Section, key2str(Key), " ",
                     io_lib:format("~w", [Val]), " ",
                     TsStr, "\n" ]
           end,
    StatsMsg  = [ Line(".stats.", Stat) || Stat <- StatsData ],
    MemoryMsg = [ Line(".memory.", Mem) || Mem <- erlang:memory() ],
    StatsMsg ++ MemoryMsg.

%% The node name with "@" and "." replaced by "_", then passed through
%% key2str/1.
node_key() ->
    NodeName = atom_to_list(node()),
    key2str(re:replace(NodeName, "[\@\.]", "_", [global, {return, list}])).

--------------------------------------------------------------------------------
/src/estatsd_sup.erl:
--------------------------------------------------------------------------------
%% Top-level supervisor for estatsd; supervises estatsd_server.
-module(estatsd_sup).

-behaviour(supervisor).

%% API
-export([start_link/0, start_link/1, start_link/3]).

%% Supervisor callbacks
-export([init/1]).

%% Defaults, each overridable through the estatsd application environment.
-define(FLUSH_INTERVAL, appvar(flush_interval, 10000)).
-define(GRAPHITE_HOST,  appvar(graphite_host,  "127.0.0.1")).
-define(GRAPHITE_PORT,  appvar(graphite_port,  2003)).
-define(VM_METRICS,     appvar(vm_metrics,     true)).

%% ===================================================================
%% API functions
%% ===================================================================

%% Start the supervisor. Every argument that is left out is read from the
%% estatsd application environment, falling back to the defaults above.
start_link() ->
    start_link(?FLUSH_INTERVAL, ?GRAPHITE_HOST, ?GRAPHITE_PORT, ?VM_METRICS).

start_link(FlushIntervalMs) ->
    start_link(FlushIntervalMs, ?GRAPHITE_HOST, ?GRAPHITE_PORT, ?VM_METRICS).

start_link(FlushIntervalMs, GraphiteHost, GraphitePort) ->
    start_link(FlushIntervalMs, GraphiteHost, GraphitePort, ?VM_METRICS).

start_link(FlushIntervalMs, GraphiteHost, GraphitePort, VmMetrics) ->
    InitArgs = [FlushIntervalMs, GraphiteHost, GraphitePort, VmMetrics],
    supervisor:start_link({local, ?MODULE}, ?MODULE, InitArgs).

%% ===================================================================
%% Supervisor callbacks
%% ===================================================================

%% A single permanent worker, estatsd_server, started with the same four
%% arguments that were given to the supervisor.
init([FlushIntervalMs, GraphiteHost, GraphitePort, VmMetrics]) ->
    ServerArgs = [FlushIntervalMs, GraphiteHost, GraphitePort, VmMetrics],
    ServerSpec = {estatsd_server,
                  {estatsd_server, start_link, ServerArgs},
                  permanent, 5000, worker, [estatsd_server]},
    %% NOTE(review): the tuple is {Strategy, MaxRestarts, MaxSeconds}, so this
    %% allows 10000 restarts within 10 seconds. Possibly the two numbers were
    %% meant the other way round -- confirm before changing.
    RestartPolicy = {one_for_one, 10000, 10},
    {ok, {RestartPolicy, [ServerSpec]}}.

%% Value of K in the estatsd application environment, or Def when unset.
appvar(K, Def) ->
    env_or_default(application:get_env(estatsd, K), Def).

env_or_default({ok, Val}, _Def) -> Val;
env_or_default(undefined, Def)  -> Def.

--------------------------------------------------------------------------------