├── test ├── test_helper.exs └── gen_metrics_test.exs ├── .gitignore ├── config └── config.exs ├── PITCHME.yaml ├── priv └── assets │ └── img │ └── elixir-logo.png ├── lib ├── gen_server │ ├── supervisor.ex │ ├── window.ex │ ├── server.ex │ ├── metric.ex │ ├── stats.ex │ ├── summary.ex │ ├── cluster.ex │ ├── manager.ex │ └── monitor.ex ├── gen_stage │ ├── supervisor.ex │ ├── stage.ex │ ├── window.ex │ ├── stats.ex │ ├── summary.ex │ ├── metric.ex │ ├── pipeline.ex │ ├── manager.ex │ └── monitor.ex ├── reporter_supervisor.ex ├── application.ex ├── reporter.ex ├── utils │ ├── math.ex │ ├── stats_push.ex │ └── runtime.ex └── gen_metrics.ex ├── LICENSE ├── bench ├── infinite_server.exs ├── infinite_pipeline.exs ├── trace_server.exs ├── sample_server.exs ├── trace_pipeline.exs ├── sample_pipeline.exs ├── support │ ├── server.exs │ └── stages.exs └── README.md ├── mix.lock ├── mix.exs ├── examples ├── genserver_events.exs ├── genstage_producer_consumer.exs ├── genstage_rate_limiter.exs └── genstage_gen_event.exs ├── README.md └── PITCHME.md /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | ExUnit.start() 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /_build 2 | /cover 3 | /deps 4 | /doc 5 | erl_crash.dump 6 | *.ez 7 | -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 3 | config :gen_metrics, sample_rate: 1.0 4 | -------------------------------------------------------------------------------- /PITCHME.yaml: -------------------------------------------------------------------------------- 1 | theme : sky 2 | logo : priv/assets/img/elixir-logo.png 3 | revealjs: 3.4.1 4 | 
-------------------------------------------------------------------------------- /priv/assets/img/elixir-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onetapbeyond/gen_metrics/HEAD/priv/assets/img/elixir-logo.png -------------------------------------------------------------------------------- /test/gen_metrics_test.exs: -------------------------------------------------------------------------------- 1 | defmodule GenMetricsTest do 2 | use ExUnit.Case 3 | doctest GenMetrics 4 | 5 | test "the truth" do 6 | assert 1 + 1 == 2 7 | end 8 | end 9 | -------------------------------------------------------------------------------- /lib/gen_server/supervisor.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenServer.Supervisor do 2 | @moduledoc false 3 | use Supervisor 4 | alias GenMetrics.GenServer.Monitor 5 | 6 | def start_link do 7 | Supervisor.start_link(__MODULE__, [], [name: __MODULE__]) 8 | end 9 | 10 | def init(_) do 11 | 12 | children = [ 13 | worker(Monitor, [], restart: :transient) 14 | ] 15 | 16 | supervise(children, strategy: :simple_one_for_one) 17 | end 18 | 19 | end 20 | -------------------------------------------------------------------------------- /lib/gen_stage/supervisor.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenStage.Supervisor do 2 | @moduledoc false 3 | use Supervisor 4 | alias GenMetrics.GenStage.Monitor 5 | 6 | def start_link do 7 | Supervisor.start_link(__MODULE__, [], [name: __MODULE__]) 8 | end 9 | 10 | def init(_) do 11 | 12 | children = [ 13 | worker(Monitor, [], restart: :transient) 14 | ] 15 | 16 | supervise(children, strategy: :simple_one_for_one) 17 | end 18 | 19 | end 20 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright 2017, David Russell 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /lib/reporter_supervisor.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.Reporter.Supervisor do 2 | use Supervisor 3 | 4 | @moduledoc false 5 | 6 | def start_link do 7 | Supervisor.start_link(__MODULE__, [], [name: __MODULE__]) 8 | end 9 | 10 | def init(_) do 11 | 12 | children = [ 13 | worker(GenMetrics.Reporter, 14 | [GenMetrics.GenServer.Reporter], 15 | [id: GenMetrics.GenServer.Reporter]), 16 | worker(GenMetrics.Reporter, 17 | [GenMetrics.GenStage.Reporter], 18 | [id: GenMetrics.GenStage.Reporter]) 19 | ] 20 | 21 | supervise(children, strategy: :one_for_one) 22 | end 23 | 24 | end 25 | -------------------------------------------------------------------------------- /lib/gen_stage/stage.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenStage.Stage do 2 | alias GenMetrics.GenStage.Stats 3 | 4 | @moduledoc """ 5 | A struct used to aggregate statistical metrics data for a GenStage process. 
6 | 7 | The fields are: 8 | * `name` - the module name for the GenStage process 9 | 10 | * `pid` - the `pid` for the GenStage process 11 | 12 | * `demand` - `GenMetrics.GenStage.Stats` for upstream demand 13 | 14 | * `events` - `GenMetrics.GenStage.Stats` for emitted events 15 | 16 | * `timings` - `GenMetrics.GenStage.Stats` for time on GenStage callbacks 17 | """ 18 | 19 | defstruct name: nil, pid: nil, 20 | demand: %Stats{}, events: %Stats{}, timings: %Stats{} 21 | 22 | end 23 | -------------------------------------------------------------------------------- /lib/application.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.Application do 2 | @moduledoc false 3 | use Application 4 | 5 | alias GenMetrics.GenServer 6 | alias GenMetrics.GenStage 7 | alias GenMetrics.Reporter 8 | alias GenMetrics.Utils.StatsPush 9 | 10 | def start(_type, _args) do 11 | import Supervisor.Spec, warn: false 12 | 13 | # Activate Statix (statsd) connection on startup. 14 | :ok = StatsPush.connect 15 | 16 | children = [ 17 | supervisor(GenServer.Supervisor, []), 18 | supervisor(GenStage.Supervisor, []), 19 | supervisor(Reporter.Supervisor, []) 20 | ] 21 | 22 | opts = [strategy: :one_for_one, name: GenMetrics.Supervisor] 23 | Supervisor.start_link(children, opts) 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /lib/gen_server/window.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenServer.Window do 2 | 3 | @moduledoc """ 4 | A struct used by the GenMetrics reporting process to periodically 5 | publish metrics data for a GenServer cluster. 
6 | 7 | The fields are: 8 | 9 | * `cluster` - the associated `GenMetrics.GenServer.Cluster` 10 | 11 | * `start` - the start time for the current metrics window interval 12 | 13 | * `duration` - the length (ms) of the current metrics window interval 14 | 15 | * `summary` - a list of `GenMetrics.GenServer.Summary`, one item per process 16 | in the cluster 17 | 18 | * `stats` - (optional) a list of `GenMetrics.GenServer.Server` 19 | """ 20 | 21 | defstruct cluster: nil, start: 0, duration: 0, stats: [], summary: [] 22 | end 23 | -------------------------------------------------------------------------------- /lib/gen_server/server.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenServer.Server do 2 | alias GenMetrics.GenServer.Stats 3 | 4 | @moduledoc """ 5 | A struct used to aggregate statistical metrics data for a GenServer process. 6 | 7 | The fields are: 8 | * `name` - the module name for the GenServer process 9 | 10 | * `pid` - the `pid` for the GenServer process 11 | 12 | * `calls` - `GenMetrics.GenServer.Stats` for `GenServer.handle_call/3` callbacks 13 | 14 | * `casts` - `GenMetrics.GenServer.Stats` for `GenServer.handle_cast/2` callbacks 15 | 16 | * `infos` - `GenMetrics.GenServer.Stats` for `GenServer.handle_info/2` callbacks 17 | """ 18 | 19 | defstruct name: nil, pid: nil, 20 | calls: %Stats{}, casts: %Stats{}, infos: %Stats{} 21 | 22 | end 23 | -------------------------------------------------------------------------------- /lib/gen_stage/window.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenStage.Window do 2 | 3 | @moduledoc """ 4 | A struct used by the GenMetrics reporting process to periodically 5 | publish metrics data for a GenStage pipeline. 
6 | 7 | The fields are: 8 | 9 | * `pipeline` - the associated `GenMetrics.GenStage.Pipeline` 10 | 11 | * `start` - the start time for the current metrics window interval 12 | 13 | * `duration` - the length (ms) of the current metrics window interval 14 | 15 | * `summary` - a list of `GenMetrics.GenStage.Summary`, item per process 16 | on the pipeline 17 | 18 | * `stats` - (optional) a list of `GenMetrics.GenStage.Stage` 19 | """ 20 | 21 | defstruct pipeline: nil, start: 0, duration: 0, stats: [], summary: [] 22 | 23 | end 24 | -------------------------------------------------------------------------------- /lib/gen_server/metric.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenServer.Metric do 2 | @moduledoc false 3 | alias GenMetrics.GenServer.Metric 4 | alias GenMetrics.Utils.Runtime 5 | 6 | @nano_to_micro 1000 7 | 8 | defstruct start: 0, duration: 0 9 | 10 | def partial(ts) do 11 | ts 12 | end 13 | 14 | def pair(summary_paired, mkey, ts, partial) do 15 | start_mkey = {1, ts - partial} 16 | Map.update(summary_paired, mkey, start_mkey, 17 | fn {calls, toc} -> {calls + 1, toc + (ts - partial)} end) 18 | end 19 | 20 | def no_pair do 21 | {0, 0} 22 | end 23 | 24 | def start(ts) do 25 | %Metric{start: ts} 26 | end 27 | 28 | def stop(partial, ts) do 29 | %Metric{partial | 30 | duration: Runtime.safe_div(ts - partial.start, @nano_to_micro)} 31 | end 32 | 33 | end 34 | -------------------------------------------------------------------------------- /lib/gen_stage/stats.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenStage.Stats do 2 | 3 | @moduledoc """ 4 | A struct used to report statistical metrics data for a GenStage process. 
5 | 6 | The fields are: 7 | 8 | * `callbacks` - the total number of callbacks handled by the process 9 | 10 | * `total` - the total time spent (µs) on all callbacks 11 | 12 | * `max` - the maximum time spent (µs) on any callback 13 | 14 | * `min` - the minimum time spent (µs) on any callback 15 | 16 | * `mean` - the mean time spent (µs) on any callback 17 | 18 | * `stdev` - the standard deviation around the mean time spent (µs) on 19 | any callback 20 | 21 | * `range` - the difference between max and min time spent (µs) on all callbacks 22 | """ 23 | 24 | defstruct callbacks: 0, min: 0, max: 0, total: 0, 25 | mean: 0, stdev: 0, range: 0 26 | end 27 | -------------------------------------------------------------------------------- /lib/gen_stage/summary.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenStage.Summary do 2 | 3 | @moduledoc """ 4 | A struct used to report summary metrics data for a GenStage process. 5 | The numbers reported reflect totals during a given metrics collection 6 | window interval. 7 | 8 | The fields are: 9 | 10 | * `name` - the module name for the GenStage process 11 | 12 | * `pid` - the `pid` for the GenStage process 13 | 14 | * `callbacks` - the number of callbacks on the GenStage process 15 | 16 | * `time_on_callbacks` - the number of milliseconds spent on callbacks 17 | 18 | * `demand` - the upstream demand on the GenStage process 19 | 20 | * `events` - the number of events emitted by the GenStage process 21 | """ 22 | 23 | defstruct name: nil, pid: nil, 24 | callbacks: 0, time_on_callbacks: 0, demand: 0, events: 0 25 | 26 | end 27 | -------------------------------------------------------------------------------- /lib/gen_server/stats.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenServer.Stats do 2 | 3 | @moduledoc """ 4 | A struct used to report statistical metrics data for a GenServer process. 
5 | 6 | The fields are: 7 | 8 | * `callbacks` - the total number of callbacks handled by the process 9 | 10 | * `total` - the total time spent (µs) on all callbacks 11 | 12 | * `max` - the maximum time spent (µs) on any callback 13 | 14 | * `min` - the minimum time spent (µs) on any callback 15 | 16 | * `mean` - the mean time spent (µs) on any callback 17 | 18 | * `stdev` - the standard deviation around the mean time spent (µs) on 19 | any callback 20 | 21 | * `range` - the difference between max and min time spent (µs) on all callbacks 22 | """ 23 | 24 | defstruct callbacks: 0, min: 0, max: 0, total: 0, 25 | mean: 0, stdev: 0, range: 0 26 | 27 | end 28 | -------------------------------------------------------------------------------- /lib/reporter.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.Reporter do 2 | use GenStage 3 | 4 | @moduledoc false 5 | 6 | def start_link(name) do 7 | GenStage.start_link(__MODULE__, 0, name: name) 8 | end 9 | 10 | def push(reporter, window) do 11 | GenStage.cast(reporter, {:monitor_metrics, window}) 12 | end 13 | 14 | def init(state) do 15 | {:producer, state, dispatcher: GenStage.BroadcastDispatcher} 16 | end 17 | 18 | def handle_subscribe(_, _, _, state) do 19 | {:automatic, state + 1} 20 | end 21 | 22 | def handle_cancel(_, _, state) do 23 | {:noreply, [], max(state - 1, 0)} 24 | end 25 | 26 | def handle_cast({:monitor_metrics, window}, subscriber_count) do 27 | if subscriber_count == 0 do 28 | {:noreply, [], subscriber_count} 29 | else 30 | {:noreply, [window], subscriber_count} 31 | end 32 | end 33 | 34 | def handle_demand(_demand, state) do 35 | {:noreply, [], state} 36 | end 37 | 38 | end 39 | -------------------------------------------------------------------------------- /lib/gen_stage/metric.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenStage.Metric do 2 | @moduledoc false 3 | alias 
GenMetrics.GenStage.Metric 4 | alias GenMetrics.Utils.Runtime 5 | 6 | @nano_to_micro 1000 7 | 8 | defstruct demand: 0, events: 0, duration: 0 9 | 10 | def demand(demand, start) do 11 | %Metric{demand: demand, duration: start} 12 | end 13 | 14 | def event(partial, events, ts) do 15 | %Metric{partial | events: events, 16 | duration: Runtime.safe_div(ts - partial.duration, @nano_to_micro)} 17 | end 18 | 19 | def pair(summary_paired, pid, events, ts, partial) do 20 | start_pid = {1, partial.demand, events, ts - partial.duration} 21 | Map.update(summary_paired, pid, start_pid, 22 | fn {calls, dmd, evts, toc} -> 23 | {calls + 1, dmd + partial.demand, evts + events, 24 | toc + (ts - partial.duration)} 25 | end) 26 | end 27 | 28 | def no_pair do 29 | {0, 0, 0, 0} 30 | end 31 | 32 | end 33 | -------------------------------------------------------------------------------- /bench/infinite_server.exs: -------------------------------------------------------------------------------- 1 | Code.require_file("server.exs", "./bench/support") 2 | Application.ensure_all_started(:gen_metrics) 3 | 4 | {:ok, _untraced} = UntracedServer.start_link(99999999999999) 5 | {:ok, _sampled} = SampledServer.start_link(99999999999999) 6 | 7 | alias GenMetrics.GenServer.Cluster 8 | cluster = %Cluster{name: "infinite_sampled_server", 9 | servers: [SampledServer], 10 | opts: [sample_rate: 0.1]} 11 | 12 | GenMetrics.monitor_cluster cluster 13 | 14 | :observer.start 15 | 16 | Benchee.run(%{time: 30, warmup: 5}, %{ 17 | "infinite-sampled-server" => fn -> 18 | SampledServer.init_state(99999999999999) 19 | data = %{id: self(), data: String.duplicate("a", 100)} 20 | stream = Stream.cycle([data]) 21 | for _ <- stream, do: SampledServer.do_call(data) 22 | receive do 23 | :benchmark_completed -> :ok 24 | end 25 | end}) 26 | -------------------------------------------------------------------------------- /lib/gen_server/summary.ex: -------------------------------------------------------------------------------- 
1 | defmodule GenMetrics.GenServer.Summary do 2 | 3 | @moduledoc """ 4 | A struct used to report summary metrics data for a GenServer process. 5 | The numbers reported reflect totals during a given metrics collection 6 | window interval. 7 | 8 | The fields are: 9 | 10 | * `name` - the module name for the GenServer process 11 | 12 | * `pid` - the `pid` for the GenServer process 13 | 14 | * `calls` - the number of `GenServer.handle_call/3` calls 15 | 16 | * `casts` - the number of `GenServer.handle_cast/2` calls 17 | 18 | * `infos` - the number of `GenServer.handle_info/2` calls 19 | 20 | * `time_on_calls` - the number of milliseconds spent on calls 21 | 22 | * `time_on_casts` - the number of milliseconds spent on casts 23 | 24 | * `time_on_infos` - the number of milliseconds spent on infos 25 | """ 26 | 27 | defstruct name: nil, pid: nil, 28 | calls: 0, casts: 0, infos: 0, 29 | time_on_calls: 0, time_on_casts: 0, time_on_infos: 0 30 | 31 | end 32 | -------------------------------------------------------------------------------- /lib/utils/math.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.Utils.Math do 2 | @moduledoc false 3 | 4 | # For runtime performance reasons this library requires 5 | # the input data length to be provided, not calculated. 
6 | 7 | def sum(data), do: Enum.sum data 8 | 9 | def sort(data), do: Enum.sort data 10 | 11 | def max([]), do: 0 12 | def max(data), do: Enum.max data 13 | 14 | def min([]), do: 0 15 | def min(data), do: Enum.min data 16 | 17 | def mean([], _), do: 0 18 | def mean(data, length), do: round(sum(data) / length) 19 | 20 | def variance([], _), do: 0 21 | def variance(data, length) do 22 | mean = mean(data, length) 23 | round(sum(Enum.map(data, &((mean - &1) * (mean - &1)))) / length) 24 | end 25 | 26 | def stdev([], _), do: 0 27 | def stdev(data, length) do 28 | round(:math.sqrt(variance(data, length))) 29 | end 30 | 31 | def range([]), do: 0 32 | def range(data) do 33 | sorted = sort(data) 34 | List.last(sorted) - List.first(sorted) 35 | end 36 | 37 | end 38 | -------------------------------------------------------------------------------- /bench/infinite_pipeline.exs: -------------------------------------------------------------------------------- 1 | Code.require_file("stages.exs", "./bench/support") 2 | Application.ensure_all_started(:gen_metrics) 3 | alias GenMetrics.GenStage.Pipeline 4 | 5 | {:ok, _sampledp} = SampledProducer.start_link() 6 | {:ok, _sampledc} = SampledConsumer.start_link() 7 | 8 | infinite_pipeline = %Pipeline{name: "infinite_pipeline", 9 | producer: [SampledProducer], 10 | consumer: [SampledConsumer], 11 | opts: [statistics: false, 12 | synchronous: true, 13 | sample_rate: 0.05]} 14 | 15 | {:ok, _imon} = GenMetrics.monitor_pipeline infinite_pipeline 16 | 17 | :observer.start 18 | 19 | Benchee.run(%{time: 30, warmup: 5}, %{ 20 | "infinite-sampled-pipeline" => fn -> 21 | data = %{id: self(), data: String.duplicate("a", 100)} 22 | stream = Stream.cycle([data]) 23 | for _ <- stream, do: SampledProducer.emit(data) 24 | receive do 25 | :benchmark_completed -> :ok 26 | end 27 | 28 | end}) 29 | -------------------------------------------------------------------------------- /mix.lock: 
-------------------------------------------------------------------------------- 1 | %{"benchee": {:hex, :benchee, "0.7.0", "98e4ed2c86b633df9b0190d6b3bf38bc2e385ba6200f68201fb575d39909816c", [:mix], [{:deep_merge, "~> 0.1", [hex: :deep_merge, optional: false]}]}, 2 | "bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], []}, 3 | "credo": {:hex, :credo, "0.7.3", "9827ab04002186af1aec014a811839a06f72aaae6cd5eed3919b248c8767dbf3", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, optional: false]}]}, 4 | "deep_merge": {:hex, :deep_merge, "0.1.1", "c27866a7524a337b6a039eeb8dd4f17d458fd40fbbcb8c54661b71a22fffe846", [:mix], []}, 5 | "earmark": {:hex, :earmark, "1.2.0", "bf1ce17aea43ab62f6943b97bd6e3dc032ce45d4f787504e3adf738e54b42f3a", [:mix], []}, 6 | "ex_doc": {:hex, :ex_doc, "0.15.1", "d5f9d588fd802152516fccfdb96d6073753f77314fcfee892b15b6724ca0d596", [:mix], [{:earmark, "~> 1.1", [hex: :earmark, optional: false]}]}, 7 | "ex_statsd": {:hex, :ex_statsd, "0.5.3", "e86dd97e25dbc80786e7d22b3c5537f2052a7e12daaaa7e6f2b9c34d03dbbd44", [:mix], []}, 8 | "gen_stage": {:hex, :gen_stage, "0.11.0", "943bdfa85c75fa624e0a36a9d135baad20a523be040178f5a215444b45c66ea4", [:mix], []}, 9 | "statix": {:hex, :statix, "1.0.0", "836c0752ad2b568dcdc9b1e67df0df91ad491ea1e19965ac219a9a0569e7e338", [:mix], []}} 10 | -------------------------------------------------------------------------------- /lib/gen_server/cluster.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenServer.Cluster do 2 | 3 | @moduledoc """ 4 | A struct used to identify one or more GenServer modules that become 5 | candidates for metrics collection. 
6 | 7 | The fields are: 8 | 9 | * `name` - a `String.t` used to identify the cluster 10 | 11 | * `servers` - a list of one or more GenServer modules 12 | 13 | * `opts` - a keyword list of options that alter GenMetrics behaviour 14 | for the cluster 15 | 16 | The `name` can be used to filter metrics events from the GenMetrics 17 | reporting process as well as providing context when logging metrics data. 18 | 19 | The following `opts` are supported: 20 | 21 | * `statistics` - when `true`, statistical metrics are generated for 22 | the cluster, defaults to `false` 23 | * `window_interval` - metrics collection interval in `ms`, defaults to `1000 ms` 24 | 25 | ### Usage: 26 | 27 | Assuming your application has a `Session.Server` and a `Logging.Server`, 28 | you can activate metrics collection on both GenServers as follows: 29 | 30 | ``` 31 | alias GenMetrics.GenServer.Cluster 32 | cluster = %Cluster{name: "demo", servers: [Session.Server, Logging.Server]} 33 | GenMetrics.monitor_cluster(cluster) 34 | ``` 35 | 36 | The *cluster* in this context is simply a named set of one or more GenServer 37 | modules about which you would like to collect metrics data. Metrics data 38 | are collected on server processes executing on the local node. 
39 | """ 40 | 41 | defstruct name: nil, servers: [], opts: [] 42 | 43 | end 44 | -------------------------------------------------------------------------------- /bench/trace_server.exs: -------------------------------------------------------------------------------- 1 | Code.require_file("server.exs", "./bench/support") 2 | Application.ensure_all_started(:gen_metrics) 3 | 4 | data = Enum.map(1..500_000, fn i -> %{id: i, data: String.duplicate("a", 100)} end) 5 | 6 | {:ok, _untraced} = UntracedServer.start_link(length(data)) 7 | {:ok, _traced} = TracedServer.start_link(length(data)) 8 | 9 | alias GenMetrics.GenServer.Cluster 10 | traced_cluster = %Cluster{name: "traced_cluster", 11 | servers: [TracedServer], 12 | opts: [statistics: false, 13 | sample_rate: 1.0, 14 | synchronous: true]} 15 | 16 | {:ok, _tmon} = GenMetrics.monitor_cluster(traced_cluster) 17 | 18 | # :observer.start 19 | 20 | Benchee.run(%{time: 30, warmup: 5}, %{ 21 | "1-untraced-server [ repeat 500k callbacks N times within ~30s ]" => fn -> 22 | UntracedServer.init_state(length(data)) 23 | pid = self() 24 | for item <- data do 25 | UntracedServer.do_call(%{item | id: pid}) 26 | end 27 | receive do 28 | :benchmark_completed -> :ok 29 | end 30 | IO.puts "1-untraced-server 500k callbacks completed" 31 | end, 32 | "2-traced---server [ repeat 500k callbacks N times within ~30s ]" => fn -> 33 | TracedServer.init_state(length(data)) 34 | pid = self() 35 | for item <- data do 36 | TracedServer.do_call(%{item | id: pid}) 37 | end 38 | receive do 39 | :benchmark_completed -> :ok 40 | end 41 | IO.puts "2-traced--server 500k callbacks completed" 42 | end 43 | }) 44 | -------------------------------------------------------------------------------- /bench/sample_server.exs: -------------------------------------------------------------------------------- 1 | Code.require_file("server.exs", "./bench/support") 2 | Application.ensure_all_started(:gen_metrics) 3 | 4 | data = Enum.map(1..500_000, fn i -> %{id: i, data: 
String.duplicate("a", 100)} end) 5 | 6 | {:ok, _untraced} = UntracedServer.start_link(length(data)) 7 | {:ok, _sampled} = SampledServer.start_link(length(data)) 8 | 9 | alias GenMetrics.GenServer.Cluster 10 | sampled_cluster = %Cluster{name: "sampled_cluster", 11 | servers: [SampledServer], 12 | opts: [statistics: false, 13 | sample_rate: 0.1, 14 | synchronous: true]} 15 | 16 | {:ok, _smon} = GenMetrics.monitor_cluster(sampled_cluster) 17 | 18 | # :observer.start 19 | 20 | Benchee.run(%{time: 30, warmup: 5}, %{ 21 | "1-untraced-server [ repeat 500k callbacks N times within ~30s ]" => fn -> 22 | UntracedServer.init_state(length(data)) 23 | pid = self() 24 | for item <- data do 25 | UntracedServer.do_call(%{item | id: pid}) 26 | end 27 | receive do 28 | :benchmark_completed -> :ok 29 | end 30 | IO.puts "1-untraced-server 500k callbacks completed" 31 | end, 32 | "2-sampled--server [ repeat 500k callbacks N times within ~30s ]" => fn -> 33 | SampledServer.init_state(length(data)) 34 | pid = self() 35 | for item <- data do 36 | SampledServer.do_call(%{item | id: pid}) 37 | end 38 | receive do 39 | :benchmark_completed -> :ok 40 | end 41 | IO.puts "2-sampled--server 500k callbacks completed" 42 | end 43 | }) 44 | -------------------------------------------------------------------------------- /bench/trace_pipeline.exs: -------------------------------------------------------------------------------- 1 | Code.require_file("stages.exs", "./bench/support") 2 | Application.ensure_all_started(:gen_metrics) 3 | alias GenMetrics.GenStage.Pipeline 4 | 5 | data = Enum.map(1..500_000, fn i -> %{id: i, data: String.duplicate("a", 100)} end) 6 | 7 | {:ok, _untracedp} = UntracedProducer.start_link() 8 | {:ok, _untracedc} = UntracedConsumer.start_link() 9 | {:ok, _tracedp} = TracedProducer.start_link() 10 | {:ok, _tracedc} = TracedConsumer.start_link() 11 | 12 | traced_pipeline = %Pipeline{name: "traced_pipeline", 13 | producer: [TracedProducer], 14 | consumer: [TracedConsumer], 15 | 
opts: [statistics: false, 16 | synchronous: true, 17 | sample_rate: 1.0]} 18 | 19 | {:ok, _tmon} = GenMetrics.monitor_pipeline(traced_pipeline) 20 | 21 | # :observer.start 22 | 23 | Benchee.run(%{time: 30, warmup: 5}, %{ 24 | "1-untraced-pipeline [ repeat 500k msgs N times within ~30s ]" => fn -> 25 | for %{id: id} = item <- data do 26 | {:ok, ^id} = UntracedProducer.emit(item) 27 | end 28 | for i <- 1..length(data) do 29 | receive do 30 | ^i -> :ok 31 | end 32 | end 33 | IO.puts "1-untraced-pipeline 500k msgs completed" 34 | end, 35 | "2-traced---pipeline [ repeat 500k msgs N times within ~30s ]" => fn -> 36 | for %{id: id} = item <- data do 37 | {:ok, ^id} = TracedProducer.emit(item) 38 | end 39 | for i <- 1..length(data) do 40 | receive do 41 | ^i -> :ok 42 | end 43 | end 44 | IO.puts "2-traced---pipeline 500k msgs completed" 45 | end 46 | }) 47 | -------------------------------------------------------------------------------- /bench/sample_pipeline.exs: -------------------------------------------------------------------------------- 1 | Code.require_file("stages.exs", "./bench/support") 2 | Application.ensure_all_started(:gen_metrics) 3 | alias GenMetrics.GenStage.Pipeline 4 | 5 | data = Enum.map(1..500_000, fn i -> %{id: i, data: String.duplicate("a", 100)} end) 6 | 7 | {:ok, _untracedp} = UntracedProducer.start_link() 8 | {:ok, _untracedc} = UntracedConsumer.start_link() 9 | {:ok, _sampledp} = SampledProducer.start_link() 10 | {:ok, _sampledc} = SampledConsumer.start_link() 11 | 12 | sampled_pipeline = %Pipeline{name: "traced_pipeline", 13 | producer: [SampledProducer], 14 | consumer: [SampledConsumer], 15 | opts: [statistics: false, 16 | synchronous: true, 17 | sample_rate: 0.1]} 18 | 19 | {:ok, _smon} = GenMetrics.monitor_pipeline(sampled_pipeline) 20 | 21 | # :observer.start 22 | 23 | Benchee.run(%{time: 30, warmup: 5}, %{ 24 | "1-untraced-pipeline [ repeat 500k msgs N times within ~30s ]" => fn -> 25 | for %{id: id} = item <- data do 26 | {:ok, ^id} = 
UntracedProducer.emit(item) 27 | end 28 | for i <- 1..length(data) do 29 | receive do 30 | ^i -> :ok 31 | end 32 | end 33 | IO.puts "1-untraced-pipeline 500k msgs completed" 34 | end, 35 | "2-sampled--pipeline [ repeat 500k msgs N times within ~30s ]" => fn -> 36 | for %{id: id} = item <- data do 37 | {:ok, ^id} = SampledProducer.emit(item) 38 | end 39 | for i <- 1..length(data) do 40 | receive do 41 | ^i -> :ok 42 | end 43 | end 44 | IO.puts "2-sampled--pipeline 500k msgs completed" 45 | end 46 | }) 47 | -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.Mixfile do 2 | use Mix.Project 3 | 4 | def project do 5 | [app: :gen_metrics, 6 | version: "0.3.0", 7 | elixir: "~> 1.4", 8 | build_embedded: Mix.env == :prod, 9 | start_permanent: Mix.env == :prod, 10 | description: description(), 11 | package: package(), 12 | deps: deps(), 13 | aliases: aliases(), 14 | docs: [main: "GenMetrics", source_url: "https://github.com/onetapbeyond/gen_metrics"]] 15 | end 16 | 17 | # Configuration for the OTP application 18 | # 19 | # Type "mix help compile.app" for more information 20 | def application do 21 | # Specify extra applications you'll use from Erlang/Elixir 22 | [extra_applications: [:logger], 23 | mod: {GenMetrics.Application, []}] 24 | end 25 | 26 | # Dependencies can be Hex packages: 27 | # 28 | # {:my_dep, "~> 0.3.0"} 29 | # 30 | # Or git/path repositories: 31 | # 32 | # {:my_dep, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"} 33 | # 34 | # Type "mix help deps" for more examples and options 35 | defp deps do 36 | [{:gen_stage, "~> 0.11"}, 37 | {:statix, ">= 0.0.0"}, 38 | {:ex_doc, "~> 0.14", only: :dev, runtime: false}, 39 | {:credo, "~> 0.7", only: [:dev, :test]}, 40 | {:benchee, "~> 0.7", only: :dev}] 41 | end 42 | 43 | defp aliases do 44 | [trace_cluster: "run ./bench/trace_server.exs", 45 | sample_cluster: 
"run ./bench/sample_server.exs", 46 | trace_pipeline: "run ./bench/trace_pipeline.exs", 47 | sample_pipeline: "run ./bench/sample_pipeline.exs", 48 | infinite_server: "run ./bench/infinite_server.exs", 49 | infinite_pipeline: "run ./bench/infinite_pipeline.exs"] 50 | end 51 | 52 | defp description do 53 | """ 54 | Elixir GenServer and GenStage runtime metrics. 55 | """ 56 | end 57 | 58 | defp package do 59 | [ 60 | name: :gen_metrics, 61 | maintainers: ["David Russell"], 62 | licenses: ["Apache 2.0"], 63 | links: %{"GitHub" => "https://github.com/onetapbeyond/gen_metrics"} 64 | ] 65 | end 66 | 67 | end 68 | -------------------------------------------------------------------------------- /lib/gen_stage/pipeline.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenStage.Pipeline do 2 | 3 | @moduledoc """ 4 | A struct used to identify one or more GenStages that become candidates 5 | for metrics collection. 6 | 7 | The fields are: 8 | 9 | * `name` - a `String.t` used to identify the pipeline 10 | 11 | * `producer` - a list of one or more GenStage `:producer` modules 12 | 13 | * `producer_consumer` - a list of one or more GenStage 14 | `:producer_consumer` modules 15 | 16 | * `consumer` - a list of one or more GenStage `:consumer` modules 17 | 18 | * `opts` - a keyword list of options that alter GenMetrics behaviour 19 | for the pipeline 20 | 21 | The `name` can be used to filter metrics events from the GenMetrics 22 | reporting process as well as providing context when logging metrics data. 
23 | 24 | The following `opts` are supported: 25 | 26 | * `statistics` - when `true`, statistical metrics are generated, 27 | defaults to `false` 28 | * `window_interval` - metrics collection interval in `ms`, defaults to `1000 ms` 29 | 30 | ### Usage 31 | 32 | Assuming your GenStage application has a `Data.Producer`, a `Data.Scrubber`, 33 | a `Data.Analyzer` and a `Data.Consumer` you can activate metrics collection 34 | for the entire pipeline as follows: 35 | 36 | ``` 37 | alias GenMetrics.GenStage.Pipeline 38 | pipeline = %Pipeline{name: "demo", 39 | producer: [Data.Producer], 40 | producer_consumer: [Data.Scrubber, Data.Analyzer], 41 | consumer: [Data.Consumer]} 42 | GenMetrics.monitor_pipeline(pipeline) 43 | ``` 44 | 45 | Alternatively, if you only wanted to activate metrics collection for the 46 | `:producer_consumer` stages within the pipeline you can do the following: 47 | 48 | ``` 49 | alias GenMetrics.GenStage.Pipeline 50 | pipeline = %Pipeline{name: "demo", 51 | producer_consumer: [Data.Scrubber, Data.Analyzer]} 52 | GenMetrics.monitor_pipeline(pipeline) 53 | ``` 54 | 55 | The *pipeline* in this context is simply a named set of one or more GenStage 56 | modules about which you would like to collect metrics data. Metrics data are 57 | collected on stage processes executing on the local node. 58 | """ 59 | 60 | defstruct name: nil, producer: [], 61 | producer_consumer: [], consumer: [], opts: [] 62 | end 63 | -------------------------------------------------------------------------------- /examples/genserver_events.exs: -------------------------------------------------------------------------------- 1 | # Usage: mix run examples/genserver_events.exs 2 | # 3 | # This basic example demonstrates the collection and 4 | # reporting of metrics data for a simple GenServer cluster. 5 | # 6 | # The sample Metrics.Consumer module simply prints the metrics 7 | # data reported by the GenMetrics library to standard out. 
8 | # 9 | defmodule Demo.Server do 10 | use GenServer 11 | 12 | def start_link do 13 | GenServer.start_link(__MODULE__, []) 14 | end 15 | 16 | def init(state) do 17 | {:ok, state} 18 | end 19 | 20 | def handle_call(_msg, _from, state) do 21 | {:reply, :ok, state} 22 | end 23 | 24 | def handle_cast(_msg, state) do 25 | {:noreply, state} 26 | end 27 | 28 | def handle_info(_msg, state) do 29 | {:noreply, state} 30 | end 31 | end 32 | 33 | defmodule Metrics.Consumer do 34 | use GenStage 35 | 36 | def start_link do 37 | GenStage.start_link(__MODULE__, []) 38 | end 39 | 40 | def init(_state) do 41 | {:consumer, :state_does_not_matter, 42 | subscribe_to: [{GenMetrics.GenServer.Reporter, max_demand: 1}]} 43 | end 44 | 45 | def handle_events([window | _], _from, state) do 46 | IO.puts "\n\nGenServer Cluster: #{inspect window.cluster.name}" 47 | IO.puts "Metrics-Window: Start:=#{inspect window.start}, Duration=#{inspect window.duration}" 48 | IO.puts "Summary Metrics" 49 | for summary <- window.summary do 50 | IO.puts "#{inspect summary}" 51 | end 52 | IO.puts "Statistical Metrics" 53 | for server <- window.stats do 54 | IO.puts "Server:=#{inspect server.name} [ #{inspect server.pid} ]" 55 | IO.puts "Calls:=#{inspect server.calls}" 56 | IO.puts "Casts:=#{inspect server.casts}" 57 | IO.puts "Infos:=#{inspect server.infos}" 58 | end 59 | IO.puts "\n" 60 | {:noreply, [], state} 61 | end 62 | end 63 | 64 | 65 | # 66 | # Initialize GenMetrics Monitoring for GenServer Cluster 67 | # 68 | alias GenMetrics.GenServer.Cluster 69 | 70 | Application.start(GenMetrics.Application) 71 | Metrics.Consumer.start_link 72 | 73 | cluster = %Cluster{name: "demo", 74 | servers: [Demo.Server], 75 | opts: [statistics: true, 76 | sample_rate: 0.95, 77 | window_interval: 2000, 78 | synchronous: true]} 79 | 80 | {:ok, _pid} = GenMetrics.monitor_cluster(cluster) 81 | 82 | # 83 | # Start Sample GenServer To Handle Events 84 | # 85 | {:ok, pid} = GenServer.start_link(Demo.Server, []) 86 | spawn fn -> 87 | 
for _ <- 1..3500 do 88 | GenServer.call(pid, :demo) 89 | GenServer.cast(pid, :demo) 90 | Kernel.send(pid, :demo) 91 | end 92 | end 93 | GenServer.call(pid, :demo) 94 | Process.sleep(5000) 95 | -------------------------------------------------------------------------------- /lib/utils/stats_push.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.Utils.StatsPush do 2 | use Statix, runtime_config: true 3 | 4 | @moduledoc false 5 | 6 | alias GenMetrics.GenServer 7 | alias GenMetrics.GenStage 8 | alias GenMetrics.Utils.Runtime 9 | 10 | @genserver_prefix "GenMetrics.GenServer.Cluster" 11 | @genstage_prefix "GenMetrics.GenStage.Pipeline" 12 | @genserver_dogtag "genserver" 13 | @genstage_dogtag "genstage" 14 | @count ".count" 15 | @demand ".demand" 16 | @events ".events" 17 | @stats ".stats" 18 | @timing ".timing" 19 | @total ".total" 20 | @sample_rate 1.0 21 | 22 | def statsd(context, mod, pid, fun, %GenServer.Metric{} = metric) do 23 | base = as_label(@genserver_prefix, context, mod, pid, fun) 24 | __MODULE__.increment(base <> @count) 25 | __MODULE__.timing(base <> @stats, Runtime.nano_to_milli(metric.duration)) 26 | end 27 | 28 | def statsd(context, mod, pid, _fun, %GenStage.Metric{} = metric) do 29 | base = as_label(@genstage_prefix, context, mod, pid) 30 | __MODULE__.increment(base <> @count) 31 | __MODULE__.increment(base <> @demand <> @total, metric.demand) 32 | __MODULE__.increment(base <> @events <> @total, metric.events) 33 | __MODULE__.timing(base <> @timing, Runtime.nano_to_milli(metric.duration)) 34 | end 35 | 36 | def datadog(context, mod, pid, fun, %GenServer.Metric{} = metric) do 37 | base = as_label(@genserver_prefix, context, mod, pid, fun) 38 | dogtag = as_dogtag(@genserver_dogtag, context) 39 | __MODULE__.increment(base <> @count, 1, 40 | tags: [dogtag], sample_rate: @sample_rate) 41 | __MODULE__.histogram(base <> @stats, 42 | Runtime.nano_to_milli(metric.duration), 43 | tags: [dogtag], 
sample_rate: @sample_rate) 44 | end 45 | 46 | def datadog(context, mod, pid, _fun, %GenStage.Metric{} = metric) do 47 | base = as_label(@genstage_prefix, context, mod, pid) 48 | dogtag = as_dogtag(@genstage_dogtag, context) 49 | __MODULE__.increment(base <> @count, 1, 50 | tags: [dogtag], sample_rate: @sample_rate) 51 | __MODULE__.increment(base <> @demand <> @total, metric.demand, 52 | tags: [dogtag], sample_rate: @sample_rate) 53 | __MODULE__.increment(base <> @events <> @total, metric.events, 54 | tags: [dogtag], sample_rate: @sample_rate) 55 | __MODULE__.histogram(base <> @demand, metric.demand, 56 | tags: [dogtag], sample_rate: @sample_rate) 57 | __MODULE__.histogram(base <> @events, metric.events, 58 | tags: [dogtag], sample_rate: @sample_rate) 59 | __MODULE__.histogram(base <> @timing, 60 | Runtime.nano_to_milli(metric.duration), 61 | tags: [dogtag], sample_rate: @sample_rate) 62 | end 63 | 64 | defp as_label(prefix, cluster, mod, _pid, fun \\ nil) do 65 | if fun do 66 | [prefix, cluster, as_mod_label(mod), as_fun_label(fun)] 67 | |> build_label 68 | else 69 | [prefix, cluster, as_mod_label(mod)] |> build_label 70 | end 71 | end 72 | 73 | # defp as_pid_label(pid) when is_pid(pid) do 74 | # Regex.replace(~r/\D/, "#{inspect pid}", "") 75 | # end 76 | 77 | defp as_mod_label(mod) when is_atom(mod) do 78 | "#{inspect mod}" |> String.split(".") |> Enum.reverse() |> Enum.fetch!(0) 79 | end 80 | 81 | defp as_fun_label(fun) when is_atom(fun) do 82 | Atom.to_string fun 83 | end 84 | defp as_fun_label(fun), do: fun 85 | 86 | defp build_label(fragments) do 87 | label = fragments |> Enum.join(".") 88 | Regex.replace(~r/\.\./, label, ".") 89 | end 90 | 91 | defp as_dogtag(prefix, context) do 92 | [prefix, context] |> build_label 93 | end 94 | 95 | end 96 | -------------------------------------------------------------------------------- /examples/genstage_producer_consumer.exs: -------------------------------------------------------------------------------- 1 | # Usage: 
mix run examples/genstage_producer_consumer.exs 2 | # 3 | # Hit Ctrl+C twice to stop it. 4 | # 5 | # This basic example demonstrates the collection and 6 | # reporting of metrics data for a simple GenStage pipeline. 7 | # 8 | # The sample Metrics.Consumer module simply prints the metrics 9 | # data reported by the GenMetrics library to standard out. 10 | # 11 | # The simple GenStage pipeline used in this example is a copy 12 | # of the ProducerConsumer example pipeline found in the 13 | # GenStage project repository: 14 | # 15 | # https://github.com/elixir-lang/gen_stage. 16 | # 17 | defmodule A do 18 | use GenStage 19 | 20 | def init(counter) do 21 | {:producer, counter} 22 | end 23 | 24 | def handle_demand(demand, counter) when demand > 0 do 25 | events = Enum.to_list(counter..counter+demand-1) 26 | {:noreply, events, counter + demand} 27 | end 28 | end 29 | 30 | defmodule B do 31 | use GenStage 32 | 33 | def init(number) do 34 | {:producer_consumer, number} 35 | end 36 | 37 | def handle_events(events, _from, number) do 38 | events = 39 | for event <- events, 40 | entry <- event..event+number, 41 | do: entry 42 | {:noreply, events, number} 43 | end 44 | end 45 | 46 | defmodule C do 47 | use GenStage 48 | 49 | def init(:ok) do 50 | {:consumer, :the_state_does_not_matter} 51 | end 52 | 53 | def handle_events(_events, _from, state) do 54 | :timer.sleep(1000) 55 | {:noreply, [], state} 56 | end 57 | end 58 | 59 | defmodule Metrics.Consumer do 60 | use GenStage 61 | 62 | def start_link do 63 | GenStage.start_link(__MODULE__, []) 64 | end 65 | 66 | def init(_state) do 67 | {:consumer, :state_does_not_matter, 68 | subscribe_to: [{GenMetrics.GenStage.Reporter, max_demand: 1}]} 69 | end 70 | 71 | def handle_events([window | _], _from, state) do 72 | IO.puts "\n\nGenStage Pipeline: #{inspect window.pipeline.name}" 73 | IO.puts "Metrics-Window: Start:=#{inspect window.start}, Duration=#{inspect window.duration}" 74 | IO.puts "Summary Metrics" 75 | for summary <- 
window.summary do 76 | IO.puts "#{inspect summary}" 77 | end 78 | IO.puts "Statistical Metrics" 79 | for stage <- window.stats do 80 | IO.puts "Stage:=#{inspect stage.name} [ #{inspect stage.pid} ]" 81 | IO.puts "Demand:=#{inspect stage.demand}" 82 | IO.puts "Events:=#{inspect stage.events}" 83 | IO.puts "Timings:=#{inspect stage.timings}" 84 | end 85 | IO.puts "\n" 86 | {:noreply, [], state} 87 | end 88 | end 89 | 90 | # 91 | # Initialize GenMetrics Monitoring for GenStage Pipeline 92 | # 93 | alias GenMetrics.GenStage.Pipeline 94 | 95 | Application.start(GenMetrics.Application) 96 | Metrics.Consumer.start_link 97 | 98 | pipeline = %Pipeline{name: "demo", 99 | producer: [A], 100 | producer_consumer: [B], 101 | consumer: [C], 102 | opts: [statistics: true, window_interval: 3000]} 103 | 104 | {:ok, _pid} = GenMetrics.monitor_pipeline(pipeline) 105 | 106 | # 107 | # Start Sample GenStage ProducerConsumer Pipeline 108 | # 109 | {:ok, a} = GenStage.start_link(A, 0) # starting from zero 110 | {:ok, b} = GenStage.start_link(B, 2) # expand by 2 111 | {:ok, c} = GenStage.start_link(C, :ok) # state does not matter 112 | 113 | GenStage.sync_subscribe(b, to: a) 114 | GenStage.sync_subscribe(c, to: b) 115 | Process.sleep(:infinity) 116 | -------------------------------------------------------------------------------- /bench/support/server.exs: -------------------------------------------------------------------------------- 1 | defmodule UntracedServer do 2 | use GenServer 3 | 4 | def start_link(target) do 5 | GenServer.start_link(__MODULE__, target, name: __MODULE__) 6 | end 7 | def init(target) do 8 | {:ok, {target, 1}} 9 | end 10 | 11 | def init_state(target) do 12 | GenServer.call(__MODULE__, {:init_state, target}) 13 | end 14 | def do_call(item) do 15 | GenServer.call(__MODULE__, {:do_call, item}) 16 | end 17 | 18 | def do_cast(item) do 19 | GenServer.cast(__MODULE__, {:do_cast, item}) 20 | end 21 | 22 | def do_info(item) do 23 | send(__MODULE__, {:do_info, item}) 24 | end 
25 | 26 | def handle_call({:init_state, target}, _from, _) do 27 | {:reply, :ok, {target, 1}} 28 | end 29 | def handle_call({:do_call, %{id: id}}, _from, {target, count}) do 30 | if count >= target, do: send(id, :benchmark_completed) 31 | {:reply, {:ok, id}, {target, count + 1}} 32 | end 33 | # Signal completion once the target-th message is handled, same rule as handle_call/handle_info. 34 | def handle_cast({:do_cast, %{id: id}}, {target, count}) do 35 | if count >= target, do: send(id, :benchmark_completed) 36 | {:noreply, {target, count + 1}} 37 | end 38 | 39 | def handle_info({:do_info, %{id: id}}, {target, count}) do 40 | if count >= target, do: send(id, :benchmark_completed) 41 | {:noreply, {target, count + 1}} 42 | end 43 | 44 | end 45 | 46 | defmodule TracedServer do 47 | use GenServer 48 | 49 | def start_link(target) do 50 | GenServer.start_link(__MODULE__, target, name: __MODULE__) 51 | end 52 | def init(target) do 53 | {:ok, {target, 1}} 54 | end 55 | 56 | def init_state(target) do 57 | GenServer.call(__MODULE__, {:init_state, target}) 58 | end 59 | 60 | def do_call(item) do 61 | GenServer.call(__MODULE__, {:do_call, item}) 62 | end 63 | 64 | def do_cast(item) do 65 | GenServer.cast(__MODULE__, {:do_cast, item}) 66 | end 67 | 68 | def do_info(item) do 69 | send(__MODULE__, {:do_info, item}) 70 | end 71 | 72 | def handle_call({:init_state, target}, _from, _) do 73 | {:reply, :ok, {target, 1}} 74 | end 75 | 76 | def handle_call({:do_call, %{id: id}}, _from, {target, count}) do 77 | if count >= target, do: send(id, :benchmark_completed) 78 | {:reply, {:ok, id}, {target, count + 1}} 79 | end 80 | 81 | def handle_cast({:do_cast, %{id: id}}, {target, count}) do 82 | if count >= target, do: send(id, :benchmark_completed) 83 | {:noreply, {target, count + 1}} 84 | end 85 | 86 | def handle_info({:do_info, %{id: id}}, {target, count}) do 87 | if count >= target, do: send(id, :benchmark_completed) 88 | {:noreply, {target, count + 1}} 89 | end 90 | 91 | end 92 | 93 | defmodule SampledServer do 94 | use GenServer 95 | 96 | def start_link(target) do 97 | 
GenServer.start_link(__MODULE__, target, name: __MODULE__) 98 | end 99 | def init(target) do 100 | {:ok, {target, 1}} 101 | end 102 | 103 | def init_state(target) do 104 | GenServer.call(__MODULE__, {:init_state, target}) 105 | end 106 | 107 | def do_call(item) do 108 | GenServer.call(__MODULE__, {:do_call, item}) 109 | end 110 | 111 | def do_cast(item) do 112 | GenServer.cast(__MODULE__, {:do_cast, item}) 113 | end 114 | 115 | def do_info(item) do 116 | send(__MODULE__, {:do_info, item}) 117 | end 118 | 119 | def handle_call({:init_state, target}, _from, _) do 120 | {:reply, :ok, {target, 1}} 121 | end 122 | 123 | def handle_call({:do_call, %{id: id}}, _from, {target, count}) do 124 | if count >= target, do: send(id, :benchmark_completed) 125 | {:reply, {:ok, id}, {target, count + 1}} 126 | end 127 | 128 | def handle_cast({:do_cast, %{id: id}}, {target, count}) do 129 | if count >= target, do: send(id, :benchmark_completed) 130 | {:noreply, {target, count + 1}} 131 | end 132 | 133 | def handle_info({:do_info, %{id: id}}, {target, count}) do 134 | if count >= target, do: send(id, :benchmark_completed) 135 | {:noreply, {target, count + 1}} 136 | end 137 | 138 | end 139 | -------------------------------------------------------------------------------- /lib/utils/runtime.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.Utils.Runtime do 2 | 3 | @window_interval_default 1000 4 | @sample_rate_default 1.0 5 | 6 | @moduledoc false 7 | 8 | @doc """ 9 | Verify modules are compiled and loaded. 10 | 11 | Returns an empty list if all modules are 12 | successfully compiled and loaded. 13 | 14 | Returns a non-empty list of error messages describing 15 | each module that fails to compile or load. 
16 | """ 17 | @spec require_modules([module]) :: [String.t] 18 | def require_modules(module_list) do 19 | module_list 20 | |> Enum.uniq 21 | |> Enum.reduce([], fn(module, acc) -> 22 | try do 23 | Code.eval_string("require #{inspect module}") 24 | acc 25 | rescue 26 | _ -> ["Module #{inspect module} not loaded and could not be found." | acc] 27 | end 28 | end) 29 | end 30 | 31 | @doc """ 32 | Verify modules implement a required behaviour. 33 | 34 | Returns an empty list if all modules successfully 35 | implement the required behaviour. 36 | 37 | Returns a non-empty list of error messages describing 38 | each module that fails to implement the required behaviour. 39 | """ 40 | @spec require_behaviour([module], module) :: [String.t] 41 | def require_behaviour(module_list, behaviour) do 42 | module_list 43 | |> Enum.uniq 44 | |> Enum.reduce([], fn(module, acc) -> 45 | try do 46 | attrs = apply(module, :__info__, [:attributes]) 47 | behaviours = get_in(attrs, [:behaviour]) 48 | if behaviour in behaviours do 49 | acc 50 | else 51 | ["Module #{inspect module} does not implement #{inspect behaviour}." | acc] 52 | end 53 | rescue 54 | _ -> ["Module #{inspect module} does not implement #{inspect behaviour}." | acc] 55 | end 56 | end) 57 | end 58 | 59 | # Return interval for monitor window rollover. 60 | def window_interval(monitor) do 61 | window_interval = monitor.opts[:window_interval] || @window_interval_default 62 | round(window_interval) 63 | end 64 | 65 | # Return interval for sampling within window. 66 | def sample_interval(monitor) do 67 | window_interval = 68 | monitor.opts[:window_interval] || @window_interval_default 69 | sample_interval = round(window_interval * sample_rate(monitor)) 70 | if sample_interval == window_interval do 71 | # adjust sample interval to fit inside window_interval 72 | round(sample_interval * 0.90) 73 | else 74 | round(sample_interval) 75 | end 76 | end 77 | 78 | # Return active metrics sampling rate. 
79 | def sample_rate(monitor) do 80 | if sampling?(monitor) do 81 | sample_rate = monitor.opts[:sample_rate] 82 | if sample_rate > 0.9 do 83 | # Enforce upper limit on sampling rate. Rate must 84 | # be either 1.0 (no sampling) or <= 0.9. 85 | 0.9 86 | else 87 | sample_rate 88 | end 89 | else 90 | @sample_rate_default 91 | end 92 | end 93 | 94 | # Return true if sampling rate below 1.0 in use. 95 | def sampling?(monitor) do 96 | sample_rate = monitor.opts[:sample_rate] 97 | if sample_rate == nil || sample_rate == 1.0 do 98 | false 99 | else 100 | true 101 | end 102 | end 103 | 104 | # Return true if monitor is required to generate optional statistics. 105 | def statistics?(monitor), do: monitor.opts[:statistics] || false 106 | 107 | # Return true unless tracing of synchronous calls is explicitly disabled (`synchronous: false`); defaults to true. 108 | def synchronous?(monitor), do: monitor.opts[:synchronous] != false 109 | 110 | def safe_div(0, _), do: 0 111 | def safe_div(num, d), do: div(num, d) 112 | 113 | def micro_to_milli(0), do: 0 114 | def micro_to_milli(milli), do: safe_div(milli, 1000) 115 | 116 | def nano_to_micro(0), do: 0 117 | def nano_to_micro(nano), do: safe_div(nano, 1000) 118 | def nano_to_milli(0), do: 0 119 | def nano_to_milli(nano), do: safe_div(nano, 1_000_000) 120 | 121 | end 122 | -------------------------------------------------------------------------------- /examples/genstage_rate_limiter.exs: -------------------------------------------------------------------------------- 1 | # Usage: mix run examples/genstage_rate_limiter.exs 2 | # 3 | # Hit Ctrl+C twice to stop it. 4 | # 5 | # This example demonstrates the collection and reporting of 6 | # metrics data for a GenStage pipeline implemented to enforce 7 | # rate limiting work on a consumer. 8 | # 9 | # The sample Metrics.Consumer module simply prints the metrics 10 | # data reported by the GenMetrics library to standard out. 
11 | # 12 | # The GenStage pipeline used in this example is a copy of the 13 | # RateLimiter example pipeline found in the GenStage project repo: 14 | # 15 | # https://github.com/elixir-lang/gen_stage. 16 | # 17 | defmodule Producer do 18 | use GenStage 19 | 20 | def init(counter) do 21 | {:producer, counter} 22 | end 23 | 24 | def handle_demand(demand, counter) when demand > 0 do 25 | events = Enum.to_list(counter..counter+demand-1) 26 | {:noreply, events, counter + demand} 27 | end 28 | end 29 | 30 | defmodule RateLimiter do 31 | use GenStage 32 | 33 | def init(_) do 34 | {:consumer, %{}} 35 | end 36 | 37 | def handle_subscribe(:producer, opts, from, producers) do 38 | pending = opts[:max_demand] || 1000 39 | interval = opts[:interval] || 5000 40 | producers = Map.put(producers, from, {pending, interval}) 41 | producers = ask_and_schedule(producers, from) 42 | {:manual, producers} 43 | end 44 | 45 | def handle_cancel(_, from, producers) do 46 | {:noreply, [], Map.delete(producers, from)} 47 | end 48 | 49 | def handle_events(events, from, producers) do 50 | producers = Map.update!(producers, from, fn {pending, interval} -> 51 | {pending + length(events), interval} 52 | end) 53 | {:noreply, [], producers} 54 | end 55 | 56 | def handle_info({:ask, from}, producers) do 57 | {:noreply, [], ask_and_schedule(producers, from)} 58 | end 59 | 60 | defp ask_and_schedule(producers, from) do 61 | case producers do 62 | %{^from => {pending, interval}} -> 63 | GenStage.ask(from, pending) 64 | Process.send_after(self(), {:ask, from}, interval) 65 | Map.put(producers, from, {0, interval}) 66 | %{} -> 67 | producers 68 | end 69 | end 70 | end 71 | 72 | defmodule Metrics.Consumer do 73 | use GenStage 74 | 75 | def start_link do 76 | GenStage.start_link(__MODULE__, []) 77 | end 78 | 79 | def init(_state) do 80 | {:consumer, :state_does_not_matter, 81 | subscribe_to: [{GenMetrics.GenStage.Reporter, max_demand: 1}]} 82 | end 83 | 84 | def handle_events([window | _], _from, state) do 
85 | IO.puts "\n\nGenStage Pipeline: #{inspect window.pipeline.name}" 86 | IO.puts "Metrics-Window: Start:=#{inspect window.start},Duration=#{inspect window.duration}" 87 | IO.puts "Summary Metrics" 88 | for summary <- window.summary do 89 | IO.puts "#{inspect summary}" 90 | end 91 | IO.puts "Statistical Metrics" 92 | for stage <- window.stats do 93 | IO.puts "Stage:=#{inspect stage.name} [ #{inspect stage.pid} ]" 94 | IO.puts "Demand:=#{inspect stage.demand}" 95 | IO.puts "Events:=#{inspect stage.events}" 96 | IO.puts "Timings:=#{inspect stage.timings}" 97 | end 98 | IO.puts "\n" 99 | {:noreply, [], state} 100 | end 101 | end 102 | 103 | # 104 | # Initialize GenMetrics Monitoring for GenStage Pipeline 105 | # 106 | alias GenMetrics.GenStage.Pipeline 107 | 108 | Application.start(GenMetrics.Application) 109 | Metrics.Consumer.start_link 110 | 111 | pipeline = %Pipeline{name: "demo", 112 | producer: [Producer], 113 | consumer: [RateLimiter], 114 | opts: [statistics: true, window_interval: 2000]} 115 | 116 | {:ok, _pid} = GenMetrics.monitor_pipeline(pipeline) 117 | 118 | # 119 | # Start Sample GenStage RateLimiter Pipeline 120 | # 121 | {:ok, a} = GenStage.start_link(Producer, 0) # starting from zero 122 | {:ok, b} = GenStage.start_link(RateLimiter, :ok) # init argument is ignored 123 | GenStage.sync_subscribe(b, to: a, max_demand: 10, interval: 2000) 124 | Process.sleep(:infinity) 125 | -------------------------------------------------------------------------------- /examples/genstage_gen_event.exs: -------------------------------------------------------------------------------- 1 | # Usage: mix run examples/genstage_gen_event.exs 2 | # 3 | # This example demonstrates the collection and reporting of 4 | # metrics data for a GenStage pipeline implemented as a 5 | # replacement for GenEvent. 6 | # 7 | # The sample Metrics.Consumer module simply prints the metrics 8 | # data reported by the GenMetrics library to standard out. 
9 | # 10 | # The GenStage pipeline used in this example is a copy of the 11 | # GenEvent example pipeline found in the GenStage project repo: 12 | # 13 | # https://github.com/elixir-lang/gen_stage. 14 | # 15 | defmodule Broadcaster do 16 | use GenStage 17 | 18 | def start_link() do 19 | GenStage.start_link(__MODULE__, :ok, name: __MODULE__) 20 | end 21 | 22 | def sync_notify(event, timeout \\ 5000) do 23 | GenStage.call(__MODULE__, {:notify, event}, timeout) 24 | end 25 | 26 | def init(:ok) do 27 | {:producer, {:queue.new, 0}, dispatcher: GenStage.BroadcastDispatcher} 28 | end 29 | 30 | def handle_call({:notify, event}, from, {queue, demand}) do 31 | dispatch_events(:queue.in({from, event}, queue), demand, []) 32 | end 33 | 34 | def handle_demand(incoming_demand, {queue, demand}) do 35 | dispatch_events(queue, incoming_demand + demand, []) 36 | end 37 | 38 | defp dispatch_events(queue, demand, events) do 39 | with d when d > 0 <- demand, 40 | {{:value, {from, event}}, queue} <- :queue.out(queue) do 41 | GenStage.reply(from, :ok) 42 | dispatch_events(queue, demand - 1, [event | events]) 43 | else 44 | _ -> {:noreply, Enum.reverse(events), {queue, demand}} 45 | end 46 | end 47 | end 48 | 49 | defmodule Consumer do 50 | use GenStage 51 | 52 | def start_link() do 53 | GenStage.start_link(__MODULE__, :ok) 54 | end 55 | 56 | def init(:ok) do 57 | {:consumer, :ok, subscribe_to: [Broadcaster]} 58 | end 59 | 60 | def handle_events(_events, _from, state) do 61 | {:noreply, [], state} 62 | end 63 | end 64 | 65 | defmodule App do 66 | 67 | def start do 68 | import Supervisor.Spec 69 | 70 | children = [ 71 | worker(Broadcaster, []), 72 | worker(Consumer, [], id: 1), 73 | worker(Consumer, [], id: 2), 74 | worker(Consumer, [], id: 3), 75 | worker(Consumer, [], id: 4) 76 | ] 77 | 78 | Supervisor.start_link(children, strategy: :one_for_one) 79 | end 80 | end 81 | 82 | defmodule Metrics.Consumer do 83 | use GenStage 84 | 85 | def start_link do 86 | GenStage.start_link(__MODULE__, 
[]) 87 | end 88 | 89 | def init(_state) do 90 | {:consumer, :state_does_not_matter, 91 | subscribe_to: [{GenMetrics.GenStage.Reporter, max_demand: 1}]} 92 | end 93 | 94 | def handle_events([window | _], _from, state) do 95 | IO.puts "\n\nGenStage Pipeline: #{inspect window.pipeline.name}" 96 | IO.puts "Metrics-Window: Start:=#{inspect window.start},Duration=#{inspect window.duration}" 97 | IO.puts "Summary Metrics" 98 | for summary <- window.summary do 99 | IO.puts "#{inspect summary}" 100 | end 101 | IO.puts "Statistical Metrics" 102 | for stage <- window.stats do 103 | IO.puts "Stage:=#{inspect stage.name} [ #{inspect stage.pid} ]" 104 | IO.puts "Demand:=#{inspect stage.demand}" 105 | IO.puts "Events:=#{inspect stage.events}" 106 | IO.puts "Timings:=#{inspect stage.timings}" 107 | end 108 | IO.puts "\n" 109 | {:noreply, [], state} 110 | end 111 | end 112 | 113 | # 114 | # Initialize GenMetrics Monitoring for GenStage Pipeline 115 | # 116 | alias GenMetrics.GenStage.Pipeline 117 | 118 | Application.start(GenMetrics.Application) 119 | Metrics.Consumer.start_link 120 | 121 | pipeline = %Pipeline{name: "demo", 122 | producer: [Broadcaster], 123 | consumer: [Consumer], 124 | opts: [statistics: true]} 125 | 126 | {:ok, _pid} = GenMetrics.monitor_pipeline(pipeline) 127 | 128 | # 129 | # Start Sample GenStage GenEvent-Replacement Pipeline 130 | # 131 | App.start 132 | Broadcaster.sync_notify(1) 133 | Broadcaster.sync_notify(2) 134 | Broadcaster.sync_notify(3) 135 | Broadcaster.sync_notify(4) 136 | Broadcaster.sync_notify(5) 137 | Process.sleep(2000) 138 | -------------------------------------------------------------------------------- /bench/support/stages.exs: -------------------------------------------------------------------------------- 1 | defmodule UntracedProducer do 2 | use GenStage 3 | 4 | def start_link do 5 | GenStage.start_link(__MODULE__, [], name: __MODULE__) 6 | end 7 | def init(_) do 8 | {:producer, {:queue.new, 0}} 9 | end 10 | 11 | def emit(item) do 12 
| GenStage.call(__MODULE__, {:emit, item}) 13 | end 14 | 15 | def handle_call({:emit, item}, {pid, _ref} = from, {queue, demand}) do 16 | event = Map.put(item, :pid, pid) 17 | dispatch_events(:queue.in({from, event}, queue), demand, []) 18 | end 19 | def handle_demand(incoming_demand, {queue, demand}) do 20 | dispatch_events(queue, incoming_demand + demand, []) 21 | end 22 | 23 | defp dispatch_events(queue, demand, events) do 24 | with d when d > 0 <- demand, 25 | {{:value, {from, event}}, queue} <- :queue.out(queue) do 26 | GenStage.reply(from, {:ok, event.id}) 27 | dispatch_events(queue, demand - 1, [event | events]) 28 | else 29 | _ -> {:noreply, Enum.reverse(events), {queue, demand}} 30 | end 31 | end 32 | end 33 | 34 | defmodule UntracedConsumer do 35 | use GenStage 36 | 37 | def start_link do 38 | GenStage.start_link(__MODULE__, [], name: __MODULE__) 39 | end 40 | def init(_) do 41 | {:consumer, nil, subscribe_to: [{UntracedProducer, max_demand: 1}]} 42 | end 43 | 44 | def handle_events([%{id: id, pid: pid} | _], _from, state) do 45 | send(pid, id) 46 | {:noreply, [], state} 47 | end 48 | end 49 | 50 | defmodule TracedProducer do 51 | use GenStage 52 | 53 | def start_link do 54 | GenStage.start_link(__MODULE__, [], name: __MODULE__) 55 | end 56 | def init(_) do 57 | {:producer, {:queue.new, 0}} 58 | end 59 | 60 | def emit(item) do 61 | GenStage.call(__MODULE__, {:emit, item}) 62 | end 63 | 64 | def handle_call({:emit, item}, {pid, _ref} = from, {queue, demand}) do 65 | event = Map.put(item, :pid, pid) 66 | dispatch_events(:queue.in({from, event}, queue), demand, []) 67 | end 68 | def handle_demand(incoming_demand, {queue, demand}) do 69 | dispatch_events(queue, incoming_demand + demand, []) 70 | end 71 | 72 | defp dispatch_events(queue, demand, events) do 73 | with d when d > 0 <- demand, 74 | {{:value, {from, event}}, queue} <- :queue.out(queue) do 75 | GenStage.reply(from, {:ok, event.id}) 76 | dispatch_events(queue, demand - 1, [event | events]) 77 | else 
78 | _ -> {:noreply, Enum.reverse(events), {queue, demand}} 79 | end 80 | end 81 | end 82 | 83 | defmodule TracedConsumer do 84 | use GenStage 85 | 86 | def start_link do 87 | GenStage.start_link(__MODULE__, [], name: __MODULE__) 88 | end 89 | def init(_) do 90 | {:consumer, nil, subscribe_to: [{TracedProducer, max_demand: 1}]} 91 | end 92 | 93 | def handle_events([%{id: id, pid: pid} | _], _from, state) do 94 | send(pid, id) 95 | {:noreply, [], state} 96 | end 97 | end 98 | 99 | defmodule SampledProducer do 100 | use GenStage 101 | 102 | def start_link do 103 | GenStage.start_link(__MODULE__, [], name: __MODULE__) 104 | end 105 | def init(_) do 106 | {:producer, {:queue.new, 0}} 107 | end 108 | 109 | def emit(item) do 110 | GenStage.call(__MODULE__, {:emit, item}) 111 | end 112 | 113 | def handle_call({:emit, item}, {pid, _ref} = from, {queue, demand}) do 114 | event = Map.put(item, :pid, pid) 115 | dispatch_events(:queue.in({from, event}, queue), demand, []) 116 | end 117 | def handle_demand(incoming_demand, {queue, demand}) do 118 | dispatch_events(queue, incoming_demand + demand, []) 119 | end 120 | 121 | defp dispatch_events(queue, demand, events) do 122 | with d when d > 0 <- demand, 123 | {{:value, {from, event}}, queue} <- :queue.out(queue) do 124 | GenStage.reply(from, {:ok, event.id}) 125 | dispatch_events(queue, demand - 1, [event | events]) 126 | else 127 | _ -> {:noreply, Enum.reverse(events), {queue, demand}} 128 | end 129 | end 130 | end 131 | 132 | defmodule SampledConsumer do 133 | use GenStage 134 | 135 | def start_link do 136 | GenStage.start_link(__MODULE__, [], name: __MODULE__) 137 | end 138 | def init(_) do 139 | {:consumer, nil, subscribe_to: [{SampledProducer, max_demand: 1}]} 140 | end 141 | 142 | def handle_events([%{id: id, pid: pid} | _], _from, state) do 143 | send(pid, id) 144 | {:noreply, [], state} 145 | end 146 | end 147 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | [![GitPitch](https://gitpitch.com/assets/badge.svg)](https://gitpitch.com/onetapbeyond/gen_metrics) 2 | [![Hex Version](https://img.shields.io/hexpm/v/gen_metrics.svg "Hex Version")](https://hex.pm/packages/gen_metrics) 3 | 4 | # GenMetrics 5 | 6 | Runtime metrics for GenServer and GenStage applications. 7 | 8 | > Important! The GenMetrics library is not suitable for use within long-running production environments. For further details, see the [benchmarks performance guide](bench/README.md). 9 | 10 | This library supports the collection and publication of GenServer and GenStage runtime metrics. Metrics data are generated by an introspection agent. No instrumentation is required within the GenServer or GenStage library or within your application source code. 11 | 12 | By default, metrics are published by a dedicated GenMetrics reporting process. Any application can subscribe to this process in order to handle metrics data at runtime. Metrics data can also be pushed directly to a `statsd` agent which makes it possible to analyze and visualize the metrics within existing tools and services like `Grafana` and `Datadog`. 13 | 14 | ## Quick Look: GenServer Metrics 15 | 16 | Given an application with the following GenServers: `Session.Server`, `Logging.Server`, activate metrics collection for the server cluster as follows: 17 | 18 | ```elixir 19 | alias GenMetrics.GenServer.Cluster 20 | cluster = %Cluster{name: "demo", 21 | servers: [Session.Server, Logging.Server], 22 | opts: [window_interval: 5000]} 23 | GenMetrics.monitor_cluster(cluster) 24 | ``` 25 | 26 | Metrics are published by a dedicated GenMetrics reporting process. Any application can subscribe to this process in order to receive metrics data. 
Sample summary metrics data for a GenServer process looks as follows: 27 | 28 | ``` 29 | # Server Name: Demo.Server, PID<0.176.0> 30 | 31 | %GenMetrics.GenServer.Summary{name: Demo.Server, 32 | pid: #PID<0.176.0>, 33 | calls: 8000, 34 | casts: 34500, 35 | infos: 3333, 36 | time_on_calls: 28, 37 | time_on_casts: 161, 38 | time_on_infos: 15} 39 | 40 | # Summary timings measured in milliseconds (ms). 41 | ``` 42 | 43 | Detailed statistical metrics data per process are also available. See the [documentation](https://hexdocs.pm/gen_metrics) for details. 44 | 45 | ## Quick Look: GenStage Metrics 46 | 47 | Given a GenStage application with the following stages: `Data.Producer`, `Data.Scrubber`, `Data.Analyzer` and a `Data.Consumer`, activate metrics collection for the entire pipeline as follows: 48 | 49 | ```elixir 50 | alias GenMetrics.GenStage.Pipeline 51 | pipeline = %Pipeline{name: "demo", 52 | producer: [Data.Producer], 53 | producer_consumer: [Data.Scrubber, Data.Analyzer], 54 | consumer: [Data.Consumer]} 55 | GenMetrics.monitor_pipeline(pipeline) 56 | ``` 57 | 58 | Metrics are published by a dedicated GenMetrics reporting process. Any application can subscribe to this process in order to receive metrics data. Sample summary metrics data for a GenStage process looks as follows: 59 | 60 | ``` 61 | # Stage Name: Data.Producer, PID<0.195.0> 62 | 63 | %GenMetrics.GenStage.Summary{stage: Data.Producer, 64 | pid: #PID<0.195.0>, 65 | callbacks: 9536, 66 | time_on_callbacks: 407, 67 | demand: 4768000, 68 | events: 4768000} 69 | 70 | # Summary timings measured in milliseconds (ms). 71 | ``` 72 | 73 | Detailed statistical metrics data per process are also available. See the [documentation](https://hexdocs.pm/gen_metrics) for details. 
74 | 75 | ## Quick Look: GenMetrics Sampling 76 | 77 | Given an application with the following GenServers: `Session.Server`, `Logging.Server`, activate metrics-sampling for the server cluster as follows: 78 | 79 | ```elixir 80 | alias GenMetrics.GenServer.Cluster 81 | cluster = %Cluster{name: "demo", 82 | servers: [Session.Server, Logging.Server], 83 | opts: [sample_rate: 0.3]} 84 | GenMetrics.monitor_cluster(cluster) 85 | ``` 86 | 87 | Given a GenStage application with the following stages: `Data.Producer`, `Data.Scrubber`, `Data.Analyzer` and a `Data.Consumer`, activate metrics-sampling for the entire pipeline as follows: 88 | 89 | ```elixir 90 | alias GenMetrics.GenStage.Pipeline 91 | pipeline = %Pipeline{name: "demo", 92 | producer: [Data.Producer], 93 | producer_consumer: [Data.Scrubber, Data.Analyzer], 94 | consumer: [Data.Consumer], 95 | opts: [sample_rate: 0.1]} 96 | GenMetrics.monitor_pipeline(pipeline) 97 | ``` 98 | 99 | ## Quick Look: Metrics Reporting 100 | 101 | Redirect your GenServer cluster metrics data to the Datadog service as follows: 102 | 103 | ```elixir 104 | alias GenMetrics.GenServer.Cluster 105 | cluster = %Cluster{name: "demo", 106 | servers: [Session.Server, Logging.Server], 107 | opts: [statistics: :datadog]} 108 | GenMetrics.monitor_cluster(cluster) 109 | ``` 110 | 111 | Redirect your GenStage pipeline metrics data to a `statsd` agent as follows: 112 | 113 | ```elixir 114 | alias GenMetrics.GenStage.Pipeline 115 | pipeline = %Pipeline{name: "demo", 116 | producer: [Data.Producer], 117 | producer_consumer: [Data.Scrubber, Data.Analyzer], 118 | consumer: [Data.Consumer], 119 | opts: [statistics: :statsd]} 120 | GenMetrics.monitor_pipeline(pipeline) 121 | ``` 122 | 123 | ## Documentation 124 | 125 | Find detailed documentation for the GenMetrics library on [HexDocs](https://hexdocs.pm/gen_metrics). 126 | 127 | ## Installation 128 | 129 | GenStage requires Elixir v1.4. 
Just add `:gen_metrics` to your list of dependencies in mix.exs: 130 | 131 | ```elixir 132 | def deps do 133 | [{:gen_metrics, "~> 0.3.0"}] 134 | end 135 | ``` 136 | 137 | ## Benchmarks 138 | 139 | For those of you curious about the performance impact `gen_metrics` has on the servers and pipelines it is monitoring, we've put together a number of benchmarks along with a detailed performance analysis which you can [find here](bench/README.md). 140 | 141 | ## Examples 142 | 143 | Examples using GenMetrics to collect and report runtime metrics for GenServer applications can be found in the [examples](examples) directory: 144 | 145 | * [genserver_events](examples/genserver_events.exs) 146 | 147 | Examples using GenMetrics to collect and report runtime metrics for GenStage applications can also be found in the [examples](examples) directory: 148 | 149 | * [genstage_producer_consumer](examples/genstage_producer_consumer.exs) 150 | 151 | * [genstage_gen_event](examples/genstage_gen_event.exs) 152 | 153 | * [genstage_rate_limiter](examples/genstage_rate_limiter.exs) 154 | 155 | All of these GenStage example applications are clones of the example applications provided in the [GenStage](http://github.com/elixir-lang/gen_stage) project repository. 156 | 157 | ## License 158 | 159 | See the [LICENSE](LICENSE) file for license rights and limitations (Apache License 2.0). 
160 | -------------------------------------------------------------------------------- /lib/gen_stage/manager.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenStage.Manager do 2 | alias GenMetrics.GenStage.Manager 3 | alias GenMetrics.GenStage.Stage 4 | alias GenMetrics.GenStage.Summary 5 | alias GenMetrics.GenStage.Stats 6 | alias GenMetrics.GenStage.Window 7 | alias GenMetrics.GenStage.Metric 8 | alias GenMetrics.Utils.Math 9 | alias GenMetrics.Utils.Runtime 10 | alias GenMetrics.Utils.StatsPush 11 | 12 | @moduledoc false 13 | 14 | defstruct stages: %{}, summary_partials: %{}, summary_paired: %{}, 15 | stats_partials: %{}, stats_paired: %{} 16 | 17 | def initialize do 18 | %Manager{} 19 | end 20 | 21 | def reinitialize(metrics) do 22 | %Manager{stages: metrics.stages, 23 | summary_partials: metrics.summary_partials, 24 | stats_partials: metrics.stats_partials} 25 | end 26 | 27 | def open_summary_metric(metrics, mod, pid, demand, ts) do 28 | metrics = register_pid_on_stage(metrics, mod, pid) 29 | do_open_summary_metric(metrics, mod, pid, demand, ts) 30 | end 31 | 32 | def close_summary_metric(metrics, mod, pid, events, ts) do 33 | do_close_summary_metric(metrics, mod, pid, events, ts) 34 | end 35 | 36 | def open_stats_metric(metrics, {mod, pid, demand, ts}) do 37 | metrics = register_pid_on_stage(metrics, mod, pid) 38 | do_open_stats_metric(metrics, {mod, pid, demand, ts}) 39 | end 40 | 41 | def close_stats_metric(pipeline, metrics, {mod, pid, events, ts}) do 42 | do_close_stats_metric(pipeline, metrics, {mod, pid, events, ts}) 43 | end 44 | 45 | def as_window(metrics, gen_stats, sample_rate) do 46 | window = %Window{summary: build_stage_summary(metrics, sample_rate)} 47 | if gen_stats do 48 | with stage_metrics <- build_stage_metrics(metrics), 49 | stage_stats <- build_stage_stats(stage_metrics, sample_rate), 50 | do: %Window{window | stats: stage_stats} 51 | else 52 | window 53 | end 54 | end 55 | 
56 | # 57 | # Metrics manager private utility functions follow. 58 | # 59 | 60 | defp register_pid_on_stage(metrics, stage, pid) do 61 | stages = Map.update(metrics.stages, stage, 62 | MapSet.new |> MapSet.put(pid), & MapSet.put(&1, pid)) 63 | %Manager{metrics | stages: stages} 64 | end 65 | 66 | defp do_open_summary_metric(metrics, _mod, pid, demand, ts) do 67 | mdemand = Metric.demand(demand, ts) 68 | summary_partials = Map.put(metrics.summary_partials, pid, mdemand) 69 | %Manager{metrics | summary_partials: summary_partials} 70 | end 71 | 72 | defp do_close_summary_metric(metrics, _mod, pid, events, ts) do 73 | if Map.has_key?(metrics.summary_partials, pid) do 74 | {partial, summary_partials} = Map.pop(metrics.summary_partials, pid) 75 | summary_paired = 76 | Metric.pair(metrics.summary_paired, pid, events, ts, partial) 77 | %Manager{metrics | summary_partials: summary_partials, 78 | summary_paired: summary_paired} 79 | else 80 | metrics 81 | end 82 | end 83 | 84 | defp do_open_stats_metric(metrics, {_mod, pid, demand, ts}) do 85 | mdemand = Metric.demand(demand, ts) 86 | stats_partials = Map.put(metrics.stats_partials, pid, mdemand) 87 | %Manager{metrics | stats_partials: stats_partials} 88 | end 89 | 90 | defp do_close_stats_metric(pipeline, metrics, {mod, pid, events, ts}) do 91 | if Map.has_key?(metrics.stats_partials, pid) do 92 | {partial, partials} = Map.pop(metrics.stats_partials, pid) 93 | mevent = Metric.event(partial, events, ts) 94 | statsd_args = {mod, pid, mevent, partials} 95 | case pipeline.opts[:statistics] do 96 | :statsd -> 97 | push_metric_to_statsd(pipeline, metrics, statsd_args) 98 | :datadog -> 99 | push_metric_to_datadog(pipeline, metrics, statsd_args) 100 | _ -> 101 | push_metric_in_memory(pipeline, metrics, pid, mevent, partials) 102 | end 103 | else 104 | metrics 105 | end 106 | end 107 | 108 | defp build_stage_summary(metrics, sample_rate) do 109 | for {stage, pids} <- metrics.stages, pid <- pids, into: [] do 110 | summary = 
generate_stage_summary(Map.get(metrics.summary_paired, 111 | pid, Metric.no_pair), sample_rate) 112 | %Summary{summary | name: stage, pid: pid} 113 | end 114 | end 115 | 116 | defp build_stage_metrics(metrics) do 117 | for {stage, pids} <- metrics.stages, pid <- pids, into: [] do 118 | {stage, pid, Map.get(metrics.stats_paired, pid, [])} 119 | end 120 | end 121 | 122 | defp build_stage_stats([], _), do: [] 123 | defp build_stage_stats(stage_metrics, sample_rate) do 124 | for {module, pid, metrics} <- stage_metrics do 125 | len = length(metrics) 126 | %Stage{name: module, pid: pid, 127 | demand: generate_demand_stats(metrics, len, sample_rate), 128 | events: generate_events_stats(metrics, len, sample_rate), 129 | timings: generate_timings_stats(metrics, len, sample_rate)} 130 | end 131 | end 132 | 133 | defp generate_stage_summary({calls, demand, events, time_on_callbacks}, 134 | sample_rate) do 135 | do_generate_stage_summary(calls, demand, events, 136 | Runtime.nano_to_milli(time_on_callbacks), sample_rate) 137 | end 138 | 139 | defp generate_stage_summary(stage = %Stage{}, sample_rate) do 140 | do_generate_stage_summary(stage.demand.calls, 141 | stage.demand.total, stage.events.total, 142 | Runtime.micro_to_milli(stage.timings.total), sample_rate) 143 | end 144 | 145 | defp do_generate_stage_summary(calls, demand, events, 146 | time_on_callbacks, sample_rate) do 147 | srate_multiplier = 1 / sample_rate 148 | %Summary{callbacks: round(calls * srate_multiplier), 149 | demand: round(demand * srate_multiplier), 150 | events: round(events * srate_multiplier), 151 | time_on_callbacks: round(time_on_callbacks * srate_multiplier)} 152 | end 153 | 154 | defp generate_demand_stats(metrics, len, sample_rate) do 155 | demand = metrics |> Enum.map(& &1.demand) |> Enum.sort 156 | generate_stats(demand, len, sample_rate) 157 | end 158 | 159 | defp generate_events_stats(metrics, len, sample_rate) do 160 | events = metrics |> Enum.map(& &1.events) |> Enum.sort 161 | 
generate_stats(events, len, sample_rate) 162 | end 163 | 164 | defp generate_timings_stats(metrics, len, sample_rate) do 165 | durations = metrics |> Enum.map(& &1.duration) |> Enum.sort 166 | generate_stats(durations, len, sample_rate) 167 | end 168 | 169 | defp generate_stats(data, len, sample_rate) do 170 | srate_multiplier = 1 / sample_rate 171 | %Stats{callbacks: round(len * srate_multiplier), 172 | min: Math.min(data), max: Math.max(data), 173 | total: round(Math.sum(data) * srate_multiplier), 174 | mean: Math.mean(data, len), 175 | stdev: Math.stdev(data, len), range: Math.range(data)} 176 | end 177 | 178 | defp push_metric_in_memory(_cluster, metrics, pid, mevent, stats_partials) do 179 | stats_paired = 180 | Map.update(metrics.stats_paired, pid, [mevent], & [mevent | &1]) 181 | %Manager{metrics | stats_partials: stats_partials, 182 | stats_paired: stats_paired} 183 | end 184 | 185 | defp push_metric_to_statsd(pipeline, metrics, {mod, pid, mevent, partials}) do 186 | StatsPush.statsd(pipeline.name, mod, pid, nil, mevent) 187 | %Manager{metrics | stats_partials: partials} 188 | end 189 | 190 | defp push_metric_to_datadog(pipeline, metrics, {mod, pid, mevent, partials}) do 191 | StatsPush.datadog(pipeline.name, mod, pid, nil, mevent) 192 | %Manager{metrics | stats_partials: partials} 193 | end 194 | 195 | end 196 | -------------------------------------------------------------------------------- /lib/gen_server/manager.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenServer.Manager do 2 | alias GenMetrics.GenServer.Manager 3 | alias GenMetrics.GenServer.Server 4 | alias GenMetrics.GenServer.Summary 5 | alias GenMetrics.GenServer.Stats 6 | alias GenMetrics.GenServer.Window 7 | alias GenMetrics.GenServer.Metric 8 | alias GenMetrics.Utils.Math 9 | alias GenMetrics.Utils.Runtime 10 | alias GenMetrics.Utils.StatsPush 11 | 12 | @moduledoc false 13 | 14 | @call_cast_info [:handle_call, :handle_cast, 
:handle_info] 15 | 16 | defstruct servers: %{}, summary_partials: %{}, summary_paired: %{}, 17 | stats_partials: %{}, stats_paired: %{} 18 | 19 | def initialize do 20 | %Manager{} 21 | end 22 | 23 | def reinitialize(metrics) do 24 | %Manager{servers: metrics.servers, 25 | summary_partials: metrics.summary_partials, 26 | stats_partials: metrics.stats_partials} 27 | end 28 | 29 | def open_summary_metric(metrics, mod, pid, fun, ts) do 30 | metrics = register_pid_on_server(metrics, mod, pid) 31 | do_open_summary_metric(metrics, mod, pid, fun, ts) 32 | end 33 | 34 | def close_summary_metric(metrics, pid, events, ts) do 35 | do_close_summary_metric(metrics, pid, events, ts) 36 | end 37 | 38 | def open_stats_metric(metrics, {mod, pid, fun, ts}) do 39 | metrics = register_pid_on_server(metrics, mod, pid) 40 | do_open_stats_metric(metrics, {pid, fun, ts}) 41 | end 42 | 43 | def close_stats_metric(cluster, metrics, {mod, pid, events, ts}) do 44 | do_close_stats_metric(cluster, metrics, {mod, pid, events, ts}) 45 | end 46 | 47 | def as_window(metrics, gen_stats, sample_rate) do 48 | window = %Window{summary: build_server_summary(metrics, sample_rate)} 49 | if gen_stats do 50 | with server_metrics <- build_server_metrics(metrics), 51 | server_stats <- build_server_stats(server_metrics, sample_rate), 52 | do: %Window{window | stats: server_stats} 53 | else 54 | window 55 | end 56 | end 57 | 58 | # 59 | # Metrics manager private utility functions follow. 
60 | # 61 | 62 | defp register_pid_on_server(metrics, server, pid) do 63 | servers = Map.update(metrics.servers, server, 64 | MapSet.new |> MapSet.put(pid), & MapSet.put(&1, pid)) 65 | %Manager{metrics | servers: servers} 66 | end 67 | 68 | defp do_open_summary_metric(metrics, _mod, pid, fun, ts) do 69 | mkey = as_metric_key(pid, fun) 70 | mevent = Metric.partial(ts) 71 | summary_partials = Map.put(metrics.summary_partials, mkey, mevent) 72 | %Manager{metrics | summary_partials: summary_partials} 73 | end 74 | 75 | defp do_close_summary_metric(metrics, pid, fun, ts) do 76 | mkey = as_metric_key(pid, fun) 77 | if Map.has_key?(metrics.summary_partials, mkey) do 78 | {partial, summary_partials} = Map.pop(metrics.summary_partials, mkey) 79 | summary_paired = Metric.pair(metrics.summary_paired, mkey, ts, partial) 80 | %Manager{metrics | summary_partials: summary_partials, 81 | summary_paired: summary_paired} 82 | else 83 | metrics 84 | end 85 | end 86 | 87 | defp do_open_stats_metric(metrics, {pid, fun, ts}) do 88 | mkey = as_metric_key(pid, fun) 89 | mevent = Metric.start(ts) 90 | stats_partials = Map.put(metrics.stats_partials, mkey, mevent) 91 | %Manager{metrics | stats_partials: stats_partials} 92 | end 93 | 94 | defp do_close_stats_metric(cluster, metrics, {mod, pid, fun, ts}) do 95 | mkey = as_metric_key(pid, fun) 96 | if Map.has_key?(metrics.stats_partials, mkey) do 97 | {partial, partials} = Map.pop(metrics.stats_partials, mkey) 98 | mevent = Metric.stop(partial, ts) 99 | statsd_args = {mod, pid, fun, mevent, partials} 100 | case cluster.opts[:statistics] do 101 | :statsd -> 102 | push_metric_to_statsd(cluster, metrics, statsd_args) 103 | :datadog -> 104 | push_metric_to_datadog(cluster, metrics, statsd_args) 105 | _ -> 106 | push_metric_in_memory(cluster, metrics, mkey, mevent, partials) 107 | end 108 | else 109 | metrics 110 | end 111 | end 112 | 113 | defp build_server_summary(metrics, sample_rate) do 114 | for {server, pids} <- metrics.servers, pid <- pids, 
into: [] do 115 | mkeys = for key <- @call_cast_info, do: as_metric_key(pid, key) 116 | metrics_on_pid = for mkey <- mkeys do 117 | Map.get(metrics.summary_paired, mkey, Metric.no_pair) 118 | end 119 | summary = generate_server_summary(metrics_on_pid, sample_rate) 120 | %Summary{summary | name: server, pid: pid} 121 | end 122 | end 123 | 124 | defp build_server_metrics(metrics) do 125 | for {server, pids} <- metrics.servers, pid <- pids, into: [] do 126 | mkeys = for key <- @call_cast_info, do: as_metric_key(pid, key) 127 | {server, pid, 128 | (for mkey <- mkeys, do: Map.get(metrics.stats_paired, mkey, []))} 129 | end 130 | end 131 | 132 | defp build_server_stats([], _), do: [] 133 | defp build_server_stats(server_metrics, sample_rate) do 134 | for {module, pid, [calls, casts, infos]} <- server_metrics do 135 | %Server{name: module, pid: pid, 136 | calls: generate_metric_stats(calls, length(calls), sample_rate), 137 | casts: generate_metric_stats(casts, length(casts), sample_rate), 138 | infos: generate_metric_stats(infos, length(infos), sample_rate)} 139 | end 140 | end 141 | 142 | defp generate_server_summary([calls, casts, infos], sample_rate) do 143 | do_generate_server_summary(calls, casts, infos, sample_rate) 144 | end 145 | 146 | defp generate_server_summary(server = %Server{}, sample_rate) do 147 | calls = {server.calls.calls, server.calls.total, 0} 148 | casts = {server.casts.calls, server.casts.total, 0} 149 | infos = {server.infos.calls, server.infos.total, 0} 150 | do_generate_server_summary(calls, casts, infos, sample_rate) 151 | end 152 | 153 | defp do_generate_server_summary({calls, tcalls}, {casts, tcasts}, 154 | {infos, tinfos}, sample_rate) do 155 | srate_multiplier = 1 / sample_rate 156 | %Summary{calls: round(calls * srate_multiplier), 157 | casts: round(casts * srate_multiplier), 158 | infos: round(infos * srate_multiplier), 159 | time_on_calls: Runtime.nano_to_milli(round(tcalls * srate_multiplier)), 160 | time_on_casts: 
Runtime.nano_to_milli(round(tcasts * srate_multiplier)), 161 | time_on_infos: Runtime.nano_to_milli(round(tinfos * srate_multiplier))} 162 | end 163 | 164 | defp generate_metric_stats([], _, sample_rate), do: generate_stats([], 0, sample_rate) 165 | defp generate_metric_stats(metrics, len, sample_rate) do 166 | metric_durations = 167 | metrics |> Enum.map(fn metric -> metric.duration end) |> Enum.sort 168 | generate_stats(metric_durations, len, sample_rate) 169 | end 170 | 171 | defp generate_stats(data, len, sample_rate) do 172 | srate_multiplier = 1 / sample_rate 173 | %Stats{callbacks: round(len * srate_multiplier), 174 | min: Math.min(data), max: Math.max(data), 175 | total: round(Math.sum(data) * srate_multiplier), 176 | mean: Math.mean(data, len), 177 | stdev: Math.stdev(data, len), range: Math.range(data)} 178 | end 179 | 180 | defp push_metric_in_memory(_cluster, metrics, mkey, mevent, stats_partials) do 181 | stats_paired = 182 | Map.update(metrics.stats_paired, mkey, [mevent], & [mevent | &1]) 183 | %Manager{metrics | stats_partials: stats_partials, 184 | stats_paired: stats_paired} 185 | end 186 | 187 | defp push_metric_to_statsd(cluster, metrics, {mod, pid, fun, mevent, partials}) do 188 | StatsPush.statsd(cluster.name, mod, pid, fun, mevent) 189 | %Manager{metrics | stats_partials: partials} 190 | end 191 | 192 | defp push_metric_to_datadog(cluster, metrics, {mod, pid, fun, mevent, partials}) do 193 | StatsPush.datadog(cluster.name, mod, pid, fun, mevent) 194 | %Manager{metrics | stats_partials: partials} 195 | end 196 | 197 | defp as_metric_key(pid, fun) do 198 | "#{inspect pid}-#{inspect fun}" 199 | end 200 | 201 | end 202 | -------------------------------------------------------------------------------- /lib/gen_server/monitor.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenServer.Monitor do 2 | use GenServer 3 | alias GenMetrics.GenServer.Manager 4 | alias GenMetrics.GenServer.Monitor 
5 | alias GenMetrics.GenServer.Cluster 6 | alias GenMetrics.GenServer.Window 7 | alias GenMetrics.Reporter 8 | alias GenMetrics.Utils.Runtime 9 | 10 | @moduledoc false 11 | @call_cast_info [:handle_call, :handle_cast, :handle_info] 12 | 13 | defstruct cluster: %Cluster{}, metrics: nil, start: 0, duration: 0 14 | 15 | def start_link(cluster) do 16 | GenServer.start_link(__MODULE__, cluster) 17 | end 18 | 19 | def init(cluster) do 20 | with {:ok, _} <- validate_modules(cluster), 21 | {:ok, _} <- validate_behaviours(cluster), 22 | {:ok, _} <- activate_tracing(cluster), 23 | state <- initialize_monitor(cluster), 24 | do: start_monitor(state) 25 | end 26 | 27 | # 28 | # Handlers for intercepting :erlang.trace/3 and :erlang.trace_pattern/2 29 | # callbacks for modules registered on the cluster. 30 | # 31 | 32 | def handle_info({:trace_ts, pid, :call, {mod, fun, _args}, ts}, state) do 33 | {:noreply, 34 | do_intercept_call_request(state, mod, pid, fun, ts)} 35 | end 36 | 37 | # Intercept {:reply, reply, new_state} 38 | def handle_info({:trace_ts, pid, :return_from, {mod, fun, _arity}, 39 | {:reply, _, _}, ts}, state) do 40 | {:noreply, 41 | do_intercept_call_response(state, mod, pid, fun, ts)} 42 | end 43 | 44 | # Intercept {:reply, reply, new_state, timeout | :hibernate} 45 | def handle_info({:trace_ts, pid, :return_from, {mod, fun, _arity}, 46 | {:reply, _, _, _}, ts}, state) do 47 | {:noreply, 48 | do_intercept_call_response(state, mod, pid, fun, ts)} 49 | end 50 | 51 | # Intercept {:noreply, new_state} 52 | def handle_info({:trace_ts, pid, :return_from, {mod, fun, _arity}, 53 | {:noreply, _}, ts}, state) do 54 | {:noreply, 55 | do_intercept_call_response(state, mod, pid, fun, ts)} 56 | end 57 | 58 | # Intercept {:noreply, new_state, timeout | :hibernate} 59 | def handle_info({:trace_ts, pid, :return_from, {mod, fun, _arity}, 60 | {:noreply, _, _}, ts}, state) do 61 | {:noreply, 62 | do_intercept_call_response(state, mod, pid, fun, ts)} 63 | end 64 | 65 | # Intercept 
{:stop, reason, reply, new_state} 66 | def handle_info({:trace_ts, pid, :return_from, {mod, fun, _arity}, 67 | {:stop, _, _, _}, ts}, state) do 68 | {:noreply, 69 | do_intercept_call_response(state, mod, pid, fun, ts)} 70 | end 71 | 72 | # Intercept {:stop, reason, new_state} 73 | def handle_info({:trace_ts, pid, :return_from, {mod, fun, _arity}, 74 | {:stop, _, _}, ts}, state) do 75 | {:noreply, 76 | do_intercept_call_response(state, mod, pid, fun, ts)} 77 | end 78 | 79 | # Report and rollover metrics window. 80 | def handle_info(:rollover_metrics_window, state) do 81 | now = :erlang.system_time 82 | state = %Monitor{state | duration: Runtime.nano_to_milli(now - state.start)} 83 | window = Manager.as_window(state.metrics, 84 | Runtime.statistics?(state.cluster), Runtime.sample_rate(state.cluster)) 85 | window = %Window{window | cluster: state.cluster, 86 | start: state.start, duration: state.duration} 87 | Reporter.push(GenMetrics.GenServer.Reporter, window) 88 | Process.send_after(self(), 89 | :rollover_metrics_window, Runtime.window_interval(state.cluster)) 90 | if Runtime.sampling?(state.cluster) do 91 | activate_tracing(state.cluster) 92 | Process.send_after(self(), 93 | :silence_metrics_window, Runtime.sample_interval(state.cluster)) 94 | end 95 | {:noreply, initialize_monitor(state.cluster, state.metrics)} 96 | end 97 | 98 | # Sampling window is closed for current metrics windows 99 | # so temporarily silence tracing. 100 | def handle_info(:silence_metrics_window, state) do 101 | activate_tracing(state.cluster, true) 102 | {:noreply, state} 103 | end 104 | 105 | # Catch-all for calls not intercepted by monitor. 106 | def handle_info(_msg, state), do: {:noreply, state} 107 | 108 | # 109 | # Private utility functions follow. 110 | # 111 | 112 | # Initialize GenServer state for monitor. 
113 | defp initialize_monitor(cluster, metrics \\ nil) do 114 | if metrics do 115 | %Monitor{cluster: cluster, 116 | metrics: Manager.reinitialize(metrics), 117 | start: :erlang.system_time} 118 | else 119 | %Monitor{cluster: cluster, 120 | metrics: Manager.initialize(), 121 | start: :erlang.system_time} 122 | end 123 | end 124 | 125 | # Initialize periodic callback for metrics reporting and window rollover. 126 | defp start_monitor(state) do 127 | Process.send_after(self(), 128 | :rollover_metrics_window, Runtime.window_interval(state.cluster)) 129 | if Runtime.sampling?(state.cluster) do 130 | Process.send_after(self(), 131 | :silence_metrics_window, Runtime.sample_interval(state.cluster)) 132 | end 133 | {:ok, state} 134 | end 135 | 136 | # Activate tracing for servers within cluster. 137 | defp activate_tracing(cluster, silent \\ false) do 138 | 139 | if silent do 140 | :erlang.trace(:processes, false, [:call, :monotonic_timestamp]) 141 | else 142 | :erlang.trace(:processes, true, [:call, :monotonic_timestamp]) 143 | for server <- cluster.servers do 144 | 145 | if Runtime.synchronous?(cluster) do 146 | :erlang.trace_pattern({server, :handle_call, 3}, 147 | [{:_, [], [{:return_trace}]}]) 148 | end 149 | :erlang.trace_pattern({server, :handle_cast, 2}, 150 | [{:_, [], [{:return_trace}]}]) 151 | :erlang.trace_pattern({server, :handle_info, 2}, 152 | [{:_, [], [{:return_trace}]}]) 153 | end 154 | end 155 | 156 | {:ok, cluster} 157 | end 158 | 159 | # Validate cluster modules can be loaded or report failures. 160 | defp validate_modules(cluster) do 161 | case require_modules(cluster) do 162 | [] -> {:ok, cluster} 163 | errs -> {:stop, {:bad_cluster, errs}} 164 | end 165 | end 166 | 167 | # Ensure cluster modules are available and can be loaded. 168 | defp require_modules(cluster) do 169 | cluster.servers 170 | |> Enum.uniq 171 | |> Runtime.require_modules 172 | end 173 | 174 | # Validate cluster modules implement GenServer or report failures. 
175 | defp validate_behaviours(cluster) do 176 | case require_behaviour(cluster, GenServer) do 177 | [] -> {:ok, cluster} 178 | errs -> {:stop, {:bad_cluster, errs}} 179 | end 180 | end 181 | 182 | # Ensure cluster modules implement GenServer behaviour. 183 | defp require_behaviour(cluster, behaviour) do 184 | cluster.servers 185 | |> Enum.uniq 186 | |> Runtime.require_behaviour(behaviour) 187 | end 188 | 189 | defp do_intercept_call_request(state, mod, pid, fun, ts) do 190 | if fun in @call_cast_info do 191 | do_open_metric(state, mod, pid, fun, ts) 192 | else 193 | state 194 | end 195 | end 196 | 197 | defp do_intercept_call_response(state, mod, pid, fun, ts) do 198 | do_close_metric(state, mod, pid, fun, ts) 199 | end 200 | 201 | # Open partial metric on handle_ function call trace. 202 | defp do_open_metric(state, mod, pid, fun, ts) do 203 | metrics = 204 | Manager.open_summary_metric(state.metrics, mod, pid, fun, ts) 205 | state = %Monitor{state | metrics: metrics} 206 | 207 | if Runtime.statistics?(state.cluster) do 208 | metrics = 209 | Manager.open_stats_metric(state.metrics, {mod, pid, fun, ts}) 210 | %Monitor{state | metrics: metrics} 211 | else 212 | state 213 | end 214 | end 215 | 216 | # Close complete metric on handle_ function return trace. 
217 | defp do_close_metric(state, mod, pid, events, ts) do 218 | metrics = Manager.close_summary_metric(state.metrics, pid, events, ts) 219 | state = %Monitor{state | metrics: metrics} 220 | 221 | if Runtime.statistics?(state.cluster) do 222 | metrics = Manager.close_stats_metric(state.cluster, 223 | state.metrics, {mod, pid, events, ts}) 224 | %Monitor{state | metrics: metrics} 225 | else 226 | state 227 | end 228 | end 229 | 230 | end 231 | -------------------------------------------------------------------------------- /lib/gen_stage/monitor.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics.GenStage.Monitor do 2 | use GenServer 3 | alias GenMetrics.GenStage.Manager 4 | alias GenMetrics.GenStage.Monitor 5 | alias GenMetrics.GenStage.Pipeline 6 | alias GenMetrics.GenStage.Window 7 | alias GenMetrics.Reporter 8 | alias GenMetrics.Utils.Runtime 9 | 10 | @moduledoc false 11 | @handle_demand :handle_demand 12 | @handle_events :handle_events 13 | @handle_call :handle_call 14 | @handle_cast :handle_cast 15 | 16 | defstruct pipeline: %Pipeline{}, metrics: nil, start: 0, duration: 0 17 | 18 | def start_link(pipeline) do 19 | GenServer.start_link(__MODULE__, pipeline) 20 | end 21 | 22 | def init(pipeline) do 23 | with {:ok, _} <- validate_modules(pipeline), 24 | {:ok, _} <- validate_behaviours(pipeline), 25 | {:ok, _} <- activate_tracing(pipeline), 26 | state <- initialize_monitor(pipeline), 27 | do: start_monitor(state) 28 | end 29 | 30 | # 31 | # Handlers for intercepting :erlang.trace/3 and :erlang.trace_pattern/2 32 | # callbacks for modules registered on the pipeline. 33 | # 34 | 35 | def handle_info({:trace_ts, pid, :call, {mod, fun, [demand | _]}, ts}, state) do 36 | {:noreply, 37 | do_intercept_call_request(state, pid, {mod, fun}, demand, ts)} 38 | end 39 | 40 | # Intercept {:noreply, [event], new_state} response. 
41 | def handle_info({:trace_ts, pid, :return_from, {mod, _, _}, 42 | {:noreply, events, _}, ts}, state) do 43 | {:noreply, 44 | do_intercept_call_response(state, mod, pid, length(events), ts)} 45 | end 46 | 47 | # Intercept {:noreply, [event], new_state, :hibernate} response. 48 | def handle_info({:trace_ts, pid, :return_from, {mod, _, _}, 49 | {:noreply, events, _, _}, ts}, state) do 50 | {:noreply, 51 | do_intercept_call_response(state, mod, pid, length(events), ts)} 52 | end 53 | 54 | # Intercept {:reply, _reply, [event], new_state} response. 55 | def handle_info({:trace_ts, pid, :return_from, {mod, _, _}, 56 | {:reply, _, events, _}, ts}, state) do 57 | {:noreply, 58 | do_intercept_call_response(state, mod, pid, length(events), ts)} 59 | end 60 | 61 | # Intercept {:reply, _reply, [event], new_state, :hibernate} response. 62 | def handle_info({:trace_ts, pid, :return_from, {mod, _, _}, 63 | {:reply, _, events, _, _}, ts}, state) do 64 | {:noreply, 65 | do_intercept_call_response(state, mod, pid, length(events), ts)} 66 | end 67 | 68 | # Intercept {:stop, reason, new_state} response. 69 | def handle_info({:trace_ts, pid, :return_from, {mod, _, _}, 70 | {:stop, _, _}, ts}, state) do 71 | {:noreply, 72 | do_intercept_call_response(state, mod, pid, 0, ts)} 73 | end 74 | 75 | # Report and rollover metrics window. 
76 | def handle_info(:rollover_metrics_window, state) do 77 | now = :erlang.system_time 78 | state = %Monitor{state | duration: Runtime.nano_to_milli(now - state.start)} 79 | window = Manager.as_window(state.metrics, 80 | Runtime.statistics?(state.pipeline), Runtime.sample_rate(state.pipeline)) 81 | window = %Window{window | pipeline: state.pipeline, 82 | start: state.start, duration: state.duration} 83 | Reporter.push(GenMetrics.GenStage.Reporter, window) 84 | Process.send_after(self(), 85 | :rollover_metrics_window, Runtime.window_interval(state.pipeline)) 86 | if Runtime.sampling?(state.pipeline) do 87 | activate_tracing(state.pipeline) 88 | Process.send_after(self(), 89 | :silence_metrics_window, Runtime.sample_interval(state.pipeline)) 90 | end 91 | {:noreply, initialize_monitor(state.pipeline, state.metrics)} 92 | end 93 | 94 | # Sampling window is closed for current metrics windows 95 | # so temporarily silence tracing. 96 | def handle_info(:silence_metrics_window, state) do 97 | activate_tracing(state.pipeline, true) 98 | {:noreply, state} 99 | end 100 | 101 | # Catch-all for calls not intercepted by monitor. 102 | def handle_info(_msg, state), do: {:noreply, state} 103 | 104 | # 105 | # Private utility functions follow. 106 | # 107 | 108 | # Initialize GenServer state for monitor. 109 | defp initialize_monitor(pipeline, metrics \\ nil) do 110 | if metrics do 111 | %Monitor{pipeline: pipeline, 112 | metrics: Manager.reinitialize(metrics), 113 | start: :erlang.system_time} 114 | else 115 | %Monitor{pipeline: pipeline, 116 | metrics: Manager.initialize(), 117 | start: :erlang.system_time} 118 | end 119 | end 120 | 121 | # Initialize periodic callback for metrics reporting and window rollover. 
defp start_monitor(state) do
  pipeline = state.pipeline

  # First rollover message for the initial metrics window.
  Process.send_after(self(),
    :rollover_metrics_window, Runtime.window_interval(pipeline))

  # When sampling, also schedule the message that silences tracing once
  # the sample interval has elapsed.
  if Runtime.sampling?(pipeline) do
    Process.send_after(self(),
      :silence_metrics_window, Runtime.sample_interval(pipeline))
  end

  {:ok, state}
end

# Switch call tracing on (default) or off (silent == true).
# When switching on, a trace pattern with a return trace is installed on the
# callbacks of every producer, producer_consumer and consumer module in the
# pipeline. handle_call/3 is only traced when Runtime.synchronous?/1 holds.
# Always returns {:ok, pipeline}.
defp activate_tracing(pipeline, silent \\ false) do
  trace_flags = [:call, :monotonic_timestamp]

  if silent do
    :erlang.trace(:processes, false, trace_flags)
  else
    :erlang.trace(:processes, true, trace_flags)

    # Match spec: match any arguments and enable return tracing.
    return_spec = [{:_, [], [{:return_trace}]}]

    watch = fn(mod, callback, arity) ->
      :erlang.trace_pattern({mod, callback, arity}, return_spec)
    end

    Enum.each(pipeline.producer, fn(pmod) ->
      watch.(pmod, :handle_demand, 2)
      watch.(pmod, :handle_cast, 2)
      if Runtime.synchronous?(pipeline) do
        watch.(pmod, :handle_call, 3)
      end
    end)

    Enum.each(pipeline.producer_consumer, fn(pcmod) ->
      watch.(pcmod, :handle_events, 3)
      watch.(pcmod, :handle_cast, 2)
      if Runtime.synchronous?(pipeline) do
        watch.(pcmod, :handle_call, 3)
      end
    end)

    Enum.each(pipeline.consumer, fn(cmod) ->
      watch.(cmod, :handle_events, 3)
    end)
  end

  {:ok, pipeline}
end

# Check that every pipeline module can be loaded. Returns {:ok, pipeline}
# when require_modules/1 reports no failures, otherwise
# {:stop, {:bad_pipeline, failures}}.
defp validate_modules(pipeline) do
  failures = require_modules(pipeline)

  if failures == [] do
    {:ok, pipeline}
  else
    {:stop, {:bad_pipeline, failures}}
  end
end

# Ensure pipeline modules are available and can be loaded.
defp require_modules(pipeline) do
  # Flatten the three stage lists into one de-duplicated module list.
  stage_groups =
    [pipeline.producer, pipeline.producer_consumer, pipeline.consumer]

  stage_groups
  |> Enum.flat_map(& &1)
  |> Enum.uniq
  |> Runtime.require_modules
end

# Check that every pipeline module implements GenStage. Returns
# {:ok, pipeline} when require_behaviour/2 reports no failures, otherwise
# {:stop, {:bad_pipeline, failures}}.
defp validate_behaviours(pipeline) do
  failures = require_behaviour(pipeline, GenStage)

  if failures == [] do
    {:ok, pipeline}
  else
    {:stop, {:bad_pipeline, failures}}
  end
end

# Pass the de-duplicated list of pipeline modules, together with the
# expected behaviour, to Runtime.require_behaviour/2.
defp require_behaviour(pipeline, behaviour) do
  stage_groups =
    [pipeline.producer, pipeline.producer_consumer, pipeline.consumer]

  stage_groups
  |> Enum.flat_map(& &1)
  |> Enum.uniq
  |> Runtime.require_behaviour(behaviour)
end

# Open a metric for a traced callback invocation. The demand recorded is
# the raw value for handle_demand, the number of events for handle_events,
# and 0 for handle_call / handle_cast. Any other function is ignored.
defp do_intercept_call_request(state, pid, {mod, fun}, demand, ts) do
  action =
    case fun do
      @handle_demand -> {:open, demand}
      @handle_events -> {:open, length(demand)}
      @handle_call -> {:open, 0}
      @handle_cast -> {:open, 0}
      _ -> :ignore
    end

  case action do
    {:open, size} -> do_open_metric(state, mod, pid, size, ts)
    :ignore -> state
  end
end

# A traced callback returned: close the metric opened for it.
defp do_intercept_call_response(state, mod, pid, events, ts) do
  do_close_metric(state, mod, pid, events, ts)
end

# Open a partial metric when a handle_ callback is entered. The summary
# metric is always opened; the stats metric only when statistics are
# enabled for the pipeline.
defp do_open_metric(state, mod, pid, demand, ts) do
  with_summary =
    %Monitor{state |
             metrics: Manager.open_summary_metric(state.metrics,
                                                  mod, pid, demand, ts)}

  if Runtime.statistics?(with_summary.pipeline) do
    %Monitor{with_summary |
             metrics: Manager.open_stats_metric(with_summary.metrics,
                                                {mod, pid, demand, ts})}
  else
    with_summary
  end
end

# Close complete metric on handle_ function return trace.
233 | defp do_close_metric(state, mod, pid, events, ts) do 234 | metrics = Manager.close_summary_metric(state.metrics, mod, pid, events, ts) 235 | state = %Monitor{state | metrics: metrics} 236 | 237 | if Runtime.statistics?(state.pipeline) do 238 | metrics = Manager.close_stats_metric(state.pipeline, 239 | state.metrics, {mod, pid, events, ts}) 240 | %Monitor{state | metrics: metrics} 241 | else 242 | state 243 | end 244 | end 245 | 246 | end 247 | -------------------------------------------------------------------------------- /bench/README.md: -------------------------------------------------------------------------------- 1 | ## GenMetrics Runtime Performance Benchmarks 2 | 3 | For those of you curious about the performance impact `gen_metrics` has on the servers and pipelines it is monitoring, we've put together a number of benchmarks to compare the overhead of *untraced* vs *traced* vs *sampled* servers and pipelines. You can tweak and run the benchmarks yourself from the project root directory. 4 | 5 | The following sections introduce each of the available benchmark tests. We examine the results and explain the implications of those results in each case. The benchmark reports that follow were generated by running the benchmarks on a 2011 Macbook Air (1.8ghz i7 [4 Core], 4GB RAM, SSD). All benchmarks are implemented and run using the [benchee benchmark](https://github.com/PragTob/benchee) library. 6 | 7 | ## GenMetrics Runtime Performance Summary 8 | 9 | When GenMetrics is activated, varying degrees of runtime overhead *may* be incurred by the application being monitored depending on the rate of GenServer or GenStage callbacks within the application. In order to prevent GenMetrics negatively impacting on your application it is strongly recommended that you activate *metrics-sampling* for high-callback applications.
10 | 11 | To activate metrics-sampling for your server or pipeline simply specify the `sample_rate` option when declaring your monitoring preferences. For example, to reduce the runtime overhead of GenMetrics by sampling just 10% of all callbacks within your server or pipeline simply specify `opts: [sample_rate: 0.1]`. 12 | 13 | It is important to understand that when sampling is disabled, metrics data reflect the exact behaviour of the processes being monitored. When sampling is enabled, metrics data reflect an approximation of the behaviour of the processes being monitored. 14 | 15 | **IMPORTANT!** 16 | 17 | GenMetrics depends on Erlang tracing to collect runtime metrics for your application. One consequence of this dependency is that tail-call optimization is automatically disabled by the tracing agent. Given this, eventual resource exhaustion due to unbounded stack growth for long-running applications is inevitable. Resource exhaustion may be significantly delayed by activating metrics-sampling. But such resource exhaustion cannot be avoided indefinitely. 18 | 19 | **DO NOT ACTIVATE GenMetrics IN LONG-RUNNING PRODUCTION APPLICATIONS.** 20 | 21 | To understand and observe resource use when GenMetrics is activated use the `mix infinite_server` or `mix infinite_pipeline` tasks which automatically launch the `:observer` tool which allows you to profile BEAM metrics. 22 | 23 | ## GenMetrics + Synchronous / Asynchronous Callbacks 24 | 25 | By default, GenMetrics monitors all synchronous and asynchronous callbacks within a server or pipeline. However, the monitoring of synchronous callbacks is optional. To disable monitoring of synchronous callbacks simply specify the `opts: [synchronous: false]` option when declaring the monitoring preferences for your server or pipeline. 26 | 27 | ## GenServer Benchmarks 28 | 29 | The following set of benchmarks are designed to test and measure the runtime impact of GenMetrics on a simple GenServer application.
Benchmark specific context is provided in each case along with an analysis of the results. 30 | 31 | ### GenServer Benchmark 1. bench/trace_server.exs 32 | 33 | ``` 34 | mix trace_server 35 | ``` 36 | 37 | This benchmark runs the following tests: 38 | 39 | 1. untraced-server [ repeat 500k callbacks N times within ~30s ] 40 | 2. traced----server [ repeat 500k callbacks N times within ~30s ] 41 | 42 | Both tests attempt to push as many messages as possible to a GenServer process using the `GenServer.call/3` function. These tests each run for approximately 30 seconds. The server process within the `untraced-server` test is not being monitored by GenMetrics. The server process within the `traced-server` test is being monitored by GenMetrics. As metrics-sampling has not been enabled for this benchmark *all* callbacks on the `traced-server` are monitored. 43 | 44 | 45 | ``` 46 | Elixir 1.4.1 47 | Erlang 19.2 48 | Benchmark suite executing with the following configuration: 49 | warmup: 5.0s 50 | time: 30.0s 51 | parallel: 1 52 | inputs: none specified 53 | Estimated total run time: 70.0s 54 | 55 | Benchmarking 1-untraced-server [ repeat 500k calls N times within ~30s ]... 56 | Benchmarking 2-traced---server [ repeat 500k calls N times within ~30s ]... 57 | 58 | Name ips average deviation median 59 | 1-untraced-server [ repeat 500k calls N times within ~30s ] 0.21 4.75 s ±0.73% 4.73 s 60 | 2-traced---server [ repeat 500k calls N times within ~30s ] 0.0878 11.39 s ±2.61% 11.38 s 61 | 62 | Comparison: 63 | 1-untraced-server [ repeat 500k calls N times within ~30s ] 0.21 64 | 2-traced---server [ repeat 500k calls N times within ~30s ] 0.0878 - 2.40x slower 65 | ``` 66 | 67 | On our test hardware, the `untraced-server` managed to push approximately 4.5 million messages to its GenServer processes within the 30 second test window. That's approximately 150k messages-per-second. The `traced-server` only managed to push approximately 2 million messages to its GenServer process.
That's approximately 67k messages-per-second. 68 | 69 | The results indicate a significant runtime overhead has been introduced by the GenMetrics library. As indicated by the results the `traced-server` test performed `2.40x slower`. We can directly attribute this slowdown to the runtime overhead introduced by the GenMetrics library. 70 | 71 | While not all applications require metrics-sampling to reduce the runtime overhead associated with GenMetrics, this result strongly suggests this test application is a good candidate for sampling. See the following benchmark to see the immediate and significant positive effects when sampling is activated. 72 | 73 | 74 | ### GenServer Benchmark 2. bench/sample_server.exs 75 | 76 | ``` 77 | mix sample_server 78 | ``` 79 | 80 | This benchmark runs the following tests: 81 | 82 | 1. untraced-server [ repeat 500k callbacks N times within ~30s ] 83 | 2. sampled-server [ repeat 500k callbacks N times within ~30s ] 84 | 85 | Both tests attempt to push as many messages as possible to a GenServer process using the `GenServer.call/3` function. These tests each run for approximately 30 seconds. The server process within the `untraced-server` test is not being monitored by GenMetrics. The server process within the `sampled-server` test is being monitored by GenMetrics. Metrics-sampling has been activated for this server using the following monitoring preferences, `opts: [sample_rate: 0.1]`. 86 | 87 | 88 | ``` 89 | Elixir 1.4.1 90 | Erlang 19.2 91 | Benchmark suite executing with the following configuration: 92 | warmup: 5.0s 93 | time: 30.0s 94 | parallel: 1 95 | inputs: none specified 96 | Estimated total run time: 70.0s 97 | 98 | Benchmarking 1-untraced-server [ repeat 500k callbacks N times within ~30s ]... 99 | Benchmarking 2-sampled--server [ repeat 500k callbacks N times within ~30s ]... 
100 | 101 | Name ips average deviation median 102 | 1-untraced-server [ repeat 500k callbacks N times within ~30s ] 0.22 4.51 s ±1.42% 4.49 s 103 | 2-sampled--server [ repeat 500k callbacks N times within ~30s ] 0.21 4.84 s ±1.83% 4.85 s 104 | 105 | Comparison: 106 | 1-untraced-server [ repeat 500k callbacks N times within ~30s ] 0.22 107 | 2-sampled--server [ repeat 500k callbacks N times within ~30s ] 0.21 - 1.07x slower 108 | ``` 109 | 110 | On our test hardware, both tests managed to push approximately 4.5 million messages to their respective GenServer processes within the 30 second test window. That's approximately 150k messages-per-second. 111 | 112 | In this benchmark, the `sampled-server` test performed just `1.07x slower` than the `untraced-server` test. Compared to the `traced-server` test in the previous benchmark that performed `2.40x slower` we can see the significant, positive impact activating metrics-sampling has on reducing the runtime overhead associated with GenMetrics. 113 | 114 | ## GenStage Benchmarks 115 | 116 | The following set of benchmarks are designed to test and measure the runtime impact of GenMetrics on a simple GenStage pipeline application. Benchmark specific context is provided in each case along with an analysis of the results. 117 | 118 | ### GenStage Benchmark 1. bench/trace_pipeline.exs 119 | 120 | ``` 121 | mix trace_pipeline 122 | ``` 123 | 124 | This benchmark runs the following tests: 125 | 126 | 1. untraced-pipeline [ repeat 500k msgs N times within ~30s ] 127 | 2. traced----pipeline [ repeat 500k msgs N times within ~30s ] 128 | 129 | Each test attempts to push as many messages as possible through a GenStage pipeline. These tests each run for approximately 30 seconds. The GenStage processes within the `untraced-pipeline` test are not being monitored by GenMetrics. The GenStage processes within the `traced-pipeline` test are being monitored by GenMetrics.
As metrics-sampling has not been enabled for this benchmark *all* callbacks within the `traced-pipeline` are monitored. 130 | 131 | ``` 132 | Elixir 1.4.1 133 | Erlang 19.2 134 | Benchmark suite executing with the following configuration: 135 | warmup: 5.0s 136 | time: 30.0s 137 | parallel: 1 138 | inputs: none specified 139 | Estimated total run time: 70.0s 140 | 141 | Benchmarking 1-untraced-pipeline [ repeat 500k msgs N times within ~30s ]... 142 | Benchmarking 2-traced---pipeline [ repeat 500k msgs N times within ~30s ]... 143 | 144 | Name ips average deviation median 145 | 1-untraced-pipeline [ repeat 500k msgs N times within ~30s ] 0.0643 15.55 s ±1.17% 15.55 s 146 | 2-traced---pipeline [ repeat 500k msgs N times within ~30s ] 0.0281 35.53 s ±0.00% 35.53 s 147 | 148 | Comparison: 149 | 1-untraced-pipeline [ repeat 500k msgs N times within ~30s ] 0.0643 150 | 2-traced---pipeline [ repeat 500k msgs N times within ~30s ] 0.0281 - 2.29x slower 151 | ``` 152 | 153 | On our test hardware, the `untraced-pipeline` managed to push approximately 1.5 million messages to its GenServer processes within the 30 second test window. That's approximately 50k messages-per-second. The `traced-pipeline` only managed to push approximately 1 million messages to its GenServer process. That's approximately 33k messages-per-second. 154 | 155 | The results indicate a significant runtime overhead has been introduced by the GenMetrics library. As indicated by the results the `traced-pipeline` test performed `2.29x slower`. We can directly attribute this slowdown to the runtime overhead introduced by the GenMetrics library. 156 | 157 | While not all applications require metrics-sampling to reduce the runtime overhead associated with GenMetrics, this result strongly suggests this test application is a good candidate for sampling. See the following benchmark to see the immediate and significant positive effects when sampling is activated. 158 | 159 | ### GenStage Benchmark 2.
bench/sample_pipeline.exs 160 | 161 | ``` 162 | mix sample_pipeline 163 | ``` 164 | 165 | This benchmark runs the following tests: 166 | 167 | 1. untraced-pipeline [ repeat 500k msgs N times within ~30s ] 168 | 2. sampled-pipeline [ repeat 500k msgs N times within ~30s ] 169 | 170 | Each test attempts to push as many messages as possible through a GenStage pipeline. These tests each run for approximately 30 seconds. The GenStage processes within the `untraced-pipeline` test are not being monitored by GenMetrics. The GenStage processes within the `sampled-pipeline` test are being monitored by GenMetrics. Metrics-sampling has been activated for this pipeline using the following monitoring preferences, `opts: [sample_rate: 0.1]`. 171 | 172 | ``` 173 | Elixir 1.4.1 174 | Erlang 19.2 175 | Benchmark suite executing with the following configuration: 176 | warmup: 5.0s 177 | time: 30.0s 178 | parallel: 1 179 | inputs: none specified 180 | Estimated total run time: 70.0s 181 | 182 | Benchmarking 1-untraced-pipeline [ repeat 500k msgs N times within ~30s ]... 183 | Benchmarking 2-sampled--pipeline [ repeat 500k msgs N times within ~30s ]... 184 | 185 | Name ips average deviation median 186 | 1-untraced-pipeline [ repeat 500k msgs N times within ~30s ] 0.0728 13.74 s ±2.06% 13.87 s 187 | 2-sampled--pipeline [ repeat 500k msgs N times within ~30s ] 0.0672 14.88 s ±0.99% 14.81 s 188 | 189 | Comparison: 190 | 1-untraced-pipeline [ repeat 500k msgs N times within ~30s ] 0.0728 191 | 2-sampled--pipeline [ repeat 500k msgs N times within ~30s ] 0.0672 - 1.08x slower 192 | ``` 193 | 194 | On our test hardware, both tests managed to push approximately 2 million messages to their respective GenServer processes within the 30 second test window. That's approximately 67k messages-per-second. 195 | 196 | In this benchmark, the `sampled-pipeline` test performed just `1.08x slower` than the `untraced-pipeline` test. 
Compared to the `traced-pipeline` test in the previous benchmark that performed `2.29x slower` we can see the significant, positive impact activating metrics-sampling has on reducing the runtime overhead associated with GenMetrics. 197 | 198 | ## GenMetrics + BEAM Garbage Collection 199 | 200 | Some final remarks about GenMetrics and its memory usage profile within the BEAM. 201 | 202 | By default, when GenMetrics is enabled it collects and reports only summary metrics data. This type of metrics data collection has very little runtime overhead in terms of memory usage and should never trigger spikes in memory usage or GC. 203 | 204 | If detailed statistical metrics are activated using the `statistics: true` option, significant amounts of metrics data are collected. Activating this feature is a lot like activating a `statsd agent` directly within the BEAM. The exact amount of data collected is directly proportional to the *rate-of-callbacks* within the server or pipeline. It is therefore strongly recommended that this feature only be enabled in environments where the *rate-of-callbacks* is known to be low. Otherwise, spiked memory usage and frequent GC will occur. 205 | 206 | If the type of insights provided by statistical metrics are needed then we strongly recommend using the existing support for redirecting metrics data to an external `statsd` agent. This can be achieved using the `opts: [statistics: :statsd]` and `opts: [statistics: :datadog]` options. Just remember that activating metrics-sampling will push only the metrics that were actually monitored by GenMetrics to these agents. Any dashboard rendering these metrics data will have to account for the sampling rate in order to display total-values-over-time for metrics data. When using these external `statsd` agents GenMetrics incurs very little runtime overhead in terms of memory usage and should never trigger spikes in memory usage or GC.
207 | -------------------------------------------------------------------------------- /PITCHME.md: -------------------------------------------------------------------------------- 1 | ## GenMetrics 2 | 3 | Elixir GenServer and GenStage Runtime Metrics 4 | 5 | Note: 6 | Provide brief background, then state agenda: GenServer + GenStage 7 | behaviours and realtime metrics collection and reporting by GenMetrics. 8 | 9 | --- 10 | 11 | ### Application Runtime Metrics 12 | 13 | - Summary Metrics 14 | - Plus optional Statistical Metrics 15 | - Delivered In-Memory, Or To STATSD Agent 16 | - For any GenServer or GenStage Application 17 | - Without requiring changes to existing code 18 | 19 | Note: 20 | Introduce GenServer, GenStage behaviours on OTP. Emphasize metrics 21 | by introspection. 22 | 23 | --- 24 | 25 | ### Hex Package Dependency 26 | 27 | ```elixir 28 | def deps do 29 | [{:gen_metrics, "~> 0.3.0"}] 30 | end 31 | ``` 32 | 33 | Note: 34 | Mention detailed HexDocs documentation available on hexdocs.pm. 35 | 36 | --- 37 | 38 | ### GenServer Metrics 39 | 40 | +++ 41 | 42 | #### GenServer Metrics Per Server Process 43 | 44 | - Number of `call`, `cast`, and `info` callbacks 45 | - Time taken on these callbacks 46 | - Plus optional detailed statistical metrics 47 | 48 | Note: 49 | Explain that *callbacks* are the *unit-of-work* in a GenServer. 50 | Also elaborate on differences between summary and statistical metrics. 51 | 52 | +++ 53 | 54 | #### GenMetrics Activation 55 | 56 | ```elixir 57 | alias GenMetrics.GenServer.Cluster 58 | 59 | cluster = %Cluster{name: "demo", 60 | servers: [Session.Server, Logging.Server]} 61 | 62 | GenMetrics.monitor_cluster(cluster) 63 | 64 | # Here Session.Server and Logging.Server are example GenServers. 65 | ``` 66 | 67 | Note: 68 | Point out that GenMetrics provides its own supervision tree.
69 | 70 | +++ 71 | 72 | #### GenMetrics Sampling 73 | 74 | ```elixir 75 | alias GenMetrics.GenServer.Cluster 76 | 77 | cluster = %Cluster{name: "demo", 78 | servers: [Session.Server, Logging.Server], 79 | opts: [sample_rate: 0.2]} 80 | 81 | GenMetrics.monitor_cluster(cluster) 82 | 83 | # Here Session.Server and Logging.Server are example GenServers. 84 | ``` 85 | 86 | Note: 87 | Sampling reduces runtime overhead of GenMetrics monitoring agent. 88 | 89 | +++ 90 | 91 | #### GenServer Summary Metrics 92 | 93 | #### Sample Metrics Data 94 | 95 | ```elixir 96 | # Server Name: Demo.Server, PID<0.176.0> 97 | 98 | %GenMetrics.GenServer.Summary{name: Demo.Server, 99 | pid: #PID<0.176.0>, 100 | calls: 8000, 101 | casts: 34500, 102 | infos: 3333, 103 | time_on_calls: 28, 104 | time_on_casts: 161, 105 | time_on_infos: 15} 106 | 107 | # Summary timings measured in milliseconds (ms). 108 | ``` 109 | 110 | Note: 111 | Provide example by explaining how *calls* and *time_on_calls* relate. 112 | +++ 113 | 114 | #### GenServer Statistical Metrics 115 | 116 | #### Optional Statsd Activation 117 | 118 | ```elixir 119 | alias GenMetrics.GenServer.Cluster 120 | 121 | cluster = %Cluster{name: "demo", 122 | servers: [Session.Server, Logging.Server], 123 | opts: [statistics: :statsd]} 124 | 125 | GenMetrics.monitor_cluster(cluster) 126 | 127 | # Here Session.Server and Logging.Server are example GenServers. 128 | ``` 129 | 130 | Note: 131 | Explain `:statsd` integration with analysis and visualization 132 | tools such as Grafana and Datadog. 133 | 134 | +++ 135 | 136 | #### GenServer Statistical Metrics 137 | 138 | #### Optional Datadog Activation 139 | 140 | ```elixir 141 | alias GenMetrics.GenServer.Cluster 142 | 143 | cluster = %Cluster{name: "demo", 144 | servers: [Session.Server, Logging.Server], 145 | opts: [statistics: :datadog]} 146 | 147 | GenMetrics.monitor_cluster(cluster) 148 | 149 | # Here Session.Server and Logging.Server are example GenServers. 
150 | ``` 151 | 152 | Note: 153 | Mention `:datadog` tagging feature is automatically activated 154 | to support filtering on individual GenServer clusters. 155 | 156 | +++ 157 | 158 | #### GenServer Statistical Metrics 159 | 160 | #### Optional In-Memory Activation 161 | 162 | ```elixir 163 | alias GenMetrics.GenServer.Cluster 164 | 165 | cluster = %Cluster{name: "demo", 166 | servers: [Session.Server, Logging.Server], 167 | opts: [statistics: true]} 168 | 169 | GenMetrics.monitor_cluster(cluster) 170 | 171 | # Here Session.Server and Logging.Server are example GenServers. 172 | ``` 173 | 174 | Note: 175 | Mention additional *opts* such as *window_interval* and how it works. 176 | 177 | +++ 178 | 179 | #### GenServer Statistical Metrics 180 | 181 | #### Sample In-Memory Metrics Data 182 | 183 | ```elixir 184 | # Server Name: Demo.Server, PID<0.176.0> 185 | 186 | # handle_call/3 187 | %GenMetrics.GenServer.Stats{callbacks: 8000, 188 | max: 149, 189 | mean: 3, 190 | min: 2, 191 | range: 147, 192 | stdev: 2, 193 | total: 25753} 194 | 195 | # Statistical timings measured in microseconds (µs). 196 | ``` 197 | 198 | Note: 199 | Briefly explain how `in-memory` statistical metrics are captured 200 | and calculated. Recommend judicious use. 201 | 202 | +++ 203 | 204 | #### GenServer Statistical Metrics 205 | 206 | #### Sample In-Memory Metrics Data 207 | 208 | ```elixir 209 | # Server Name: Demo.Server, PID<0.176.0> 210 | 211 | # handle_cast/2 212 | %GenMetrics.GenServer.Stats{callbacks: 34500, 213 | max: 3368, 214 | mean: 4, 215 | min: 2, 216 | range: 3366, 217 | stdev: 31, 218 | total: 141383} 219 | 220 | # Statistical timings measured in microseconds (µs). 
221 | ``` 222 | 223 | +++ 224 | 225 | #### GenServer Statistical Metrics 226 | 227 | #### Sample In-Memory Metrics Data 228 | 229 | ```elixir 230 | # Server Name: Demo.Server, PID<0.176.0> 231 | 232 | # handle_info/2 233 | %GenMetrics.GenServer.Stats{callbacks: 3333, 234 | max: 37, 235 | mean: 4, 236 | min: 2, 237 | range: 35, 238 | stdev: 2, 239 | total: 13510} 240 | 241 | # Statistical timings measured in microseconds (µs). 242 | ``` 243 | 244 | --- 245 | 246 | ### GenStage Metrics 247 | 248 | +++ 249 | 250 | #### GenStage Metrics Per Stage Process 251 | 252 | - Number of `demand` and `events` callbacks 253 | - Time taken on these callbacks 254 | - Size of upstream demand 255 | - Size of events emitted to meet demand 256 | - Plus optional detailed statistical metrics 257 | 258 | Note: 259 | Briefly discuss GenStage demand, events and back-pressure. 260 | 261 | +++ 262 | 263 | #### GenStage Activation 264 | 265 | ```elixir 266 | alias GenMetrics.GenStage.Pipeline 267 | 268 | pipeline = %Pipeline{name: "demo", 269 | producer: [Data.Producer], 270 | producer_consumer: 271 | [Data.Scrubber, Data.Analyzer], 272 | consumer: [Data.Consumer]} 273 | 274 | GenMetrics.monitor_pipeline(pipeline) 275 | 276 | # Here Data.* are simply example GenStages. 277 | ``` 278 | 279 | Note: 280 | Mention GenMetrics monitoring supports both complete and 281 | partial pipelines. 282 | 283 | +++ 284 | 285 | #### GenStage Sampling 286 | 287 | ```elixir 288 | alias GenMetrics.GenStage.Pipeline 289 | 290 | pipeline = %Pipeline{name: "demo", 291 | producer: [Data.Producer], 292 | producer_consumer: 293 | [Data.Scrubber, Data.Analyzer], 294 | consumer: [Data.Consumer], 295 | opts: [sample_rate: 0.1]} 296 | 297 | GenMetrics.monitor_pipeline(pipeline) 298 | 299 | # Here Data.* are simply example GenStages. 300 | ``` 301 | 302 | Note: 303 | Sampling reduces runtime overhead of the GenMetrics monitoring agent. 
304 | 305 | +++ 306 | 307 | #### GenStage Summary Metrics 308 | 309 | #### Sample Metrics Data 310 | 311 | ```elixir 312 | # Stage Name: Data.Producer, PID<0.195.0> 313 | 314 | %GenMetrics.GenStage.Summary{stage: Data.Producer, 315 | pid: #PID<0.195.0>, 316 | callbacks: 9536, 317 | time_on_callbacks: 407, 318 | demand: 4768000, 319 | events: 4768000} 320 | 321 | # Summary timings measured in milliseconds (ms). 322 | ``` 323 | 324 | Note: 325 | Explain *callbacks*, *demand*, and *events* concepts and 326 | how they are reflected in the metrics data shown. 327 | 328 | +++ 329 | 330 | #### GenStage Statistical Metrics 331 | 332 | #### Optional Statsd Activation 333 | 334 | ```elixir 335 | alias GenMetrics.GenStage.Pipeline 336 | 337 | pipeline = %Pipeline{name: "demo", 338 | producer_consumer: 339 | [Data.Scrubber, Data.Analyzer], 340 | opts: [statistics: :statsd]} 341 | 342 | GenMetrics.monitor_pipeline(pipeline) 343 | 344 | # Here Data.Scrubber and Data.Analyzer are example GenStages. 345 | ``` 346 | 347 | Note: 348 | Explain `:statsd` integration with analysis and visualization 349 | tools such as Grafana and Datadog. 350 | 351 | +++ 352 | 353 | #### GenStage Statistical Metrics 354 | 355 | #### Optional Datadog Activation 356 | 357 | ```elixir 358 | alias GenMetrics.GenStage.Pipeline 359 | 360 | pipeline = %Pipeline{name: "demo", 361 | producer_consumer: 362 | [Data.Scrubber, Data.Analyzer], 363 | opts: [statistics: :datadog]} 364 | 365 | GenMetrics.monitor_pipeline(pipeline) 366 | 367 | # Here Data.Scrubber and Data.Analyzer are example GenStages. 368 | ``` 369 | 370 | Note: 371 | Mention `:datadog` tagging feature is automatically activated 372 | to support filtering on individual GenStage pipelines. 
373 | 374 | +++ 375 | 376 | #### GenStage Statistical Metrics 377 | 378 | #### Optional In-Memory Activation 379 | 380 | ```elixir 381 | alias GenMetrics.GenStage.Pipeline 382 | 383 | pipeline = %Pipeline{name: "demo", 384 | producer_consumer: 385 | [Data.Scrubber, Data.Analyzer], 386 | opts: [statistics: true]} 387 | 388 | GenMetrics.monitor_pipeline(pipeline) 389 | 390 | # Here Data.Scrubber and Data.Analyzer are example GenStages. 391 | ``` 392 | 393 | Note: 394 | Again mention availability of *window_interval* option. 395 | 396 | +++ 397 | 398 | #### GenStage Statistical Metrics 399 | 400 | #### Sample In-Memory Metrics Data 401 | 402 | ```elixir 403 | # Stage Name: Data.Producer, PID<0.195.0> 404 | 405 | # callback demand 406 | %GenMetrics.GenStage.Stats{callbacks: 9536, 407 | max: 500, 408 | mean: 500, 409 | min: 500, 410 | range: 0, 411 | stdev: 0, 412 | total: 4768000} 413 | 414 | # Statistical timings measured in microseconds (µs). 415 | ``` 416 | 417 | Note: 418 | Note GenStage summary metrics split across *demand*, *events* 419 | and *timings* as we will see on the following slides. 420 | 421 | +++ 422 | 423 | #### GenStage Statistical Metrics 424 | 425 | #### Sample In-Memory Metrics Data 426 | 427 | ```elixir 428 | # callback events 429 | %GenMetrics.GenStage.Stats{callbacks: 9536, 430 | max: 500, 431 | mean: 500, 432 | min: 500, 433 | range: 0, 434 | stdev: 0, 435 | total: 4768000} 436 | 437 | # Statistical timings measured in microseconds (µs). 438 | ``` 439 | 440 | +++ 441 | 442 | #### GenStage Statistical Metrics 443 | 444 | #### Sample In-Memory Metrics Data 445 | 446 | ```elixir 447 | # callback timings 448 | %GenMetrics.GenStage.Stats{callbacks: 9536, 449 | max: 2979, 450 | mean: 42, 451 | min: 24, 452 | range: 2955, 453 | stdev: 38, 454 | total: 403170} 455 | 456 | # Statistical timings measured in microseconds (µs). 
457 | ``` 458 | 459 | --- 460 | 461 | ### GenMetrics Reporting 462 | 463 | - Metrics are published periodically 464 | - By a dedicated reporting process 465 | - Or by a statsd agent 466 | - Any application can subscribe for metrics events 467 | - Then aggregate, render, persist, etc metrics data 468 | 469 | Note: 470 | Emphasize separation of metrics collection, reporting, and consumption. 471 | 472 | --- 473 | 474 | ### GenServer Metrics Reporting 475 | 476 | +++ 477 | 478 | #### GenMetrics.GenServer.Reporter 479 | 480 | A GenStage Broadcasting Producer 481 | 482 | For In-Memory Metrics Data 483 | 484 | Note: 485 | Clarify that the producer name is registered by GenMetrics. 486 | 487 | +++ 488 | 489 | #### Subscribing For GenMetrics Events 490 | 491 | ```elixir 492 | def init(:ok) do 493 | 494 | {:consumer, :state_does_not_matter, 495 | subscribe_to: 496 | [{GenMetrics.GenServer.Reporter, max_demand: 1}]} 497 | 498 | end 499 | ``` 500 | 501 | Note: 502 | Mention the reporting process is a *BroadcastDispatcher* 503 | producer so there is opportunity for filtering using *selector*. 504 | 505 | +++ 506 | 507 | #### Handling GenMetrics Events 508 | 509 | ```elixir 510 | def handle_events([metrics | _], _from, state) do 511 | 512 | for summary <- metrics.summary do 513 | Logger.info "GenMetrics.Consumer: #{inspect summary}" 514 | end 515 | 516 | {:noreply, [], state} 517 | 518 | end 519 | ``` 520 | 521 | Note: 522 | Explain metrics can be analyzed or processed in any number 523 | of ways including logging, persistence, Statsd, Grafana, 524 | DataDog, etc.
525 | 526 | --- 527 | 528 | ### GenStage Metrics Reporting 529 | 530 | +++ 531 | 532 | #### GenMetrics.GenStage.Reporter 533 | 534 | A GenStage Broadcasting Producer 535 | 536 | For In-Memory Metrics Data 537 | 538 | +++ 539 | 540 | #### Subscribing For GenMetrics Events 541 | 542 | ```elixir 543 | def init(:ok) do 544 | 545 | {:consumer, :state_does_not_matter, 546 | subscribe_to: 547 | [{GenMetrics.GenStage.Reporter, max_demand: 1}]} 548 | 549 | end 550 | ``` 551 | 552 | Note: 553 | Again clarify that the producer name is registered by GenMetrics. 554 | 555 | +++ 556 | 557 | #### Handling GenMetrics Events 558 | 559 | ```elixir 560 | def handle_events([metrics | _], _from, state) do 561 | 562 | for summary <- metrics.summary do 563 | Logger.info "GenMetrics.Consumer: #{inspect summary}" 564 | end 565 | 566 | {:noreply, [], state} 567 | 568 | end 569 | ``` 570 | 571 | --- 572 | 573 | ### GenMetrics is open source 574 | 575 | - The Hex Docs 576 | - The GitHub Repo 577 | - Welcome feedback, PRs, issues, etc. 578 | 579 | Note: 580 | Encourage the audience to get involved, test, report, contribute. 581 | 582 | -------------------------------------------------------------------------------- /lib/gen_metrics.ex: -------------------------------------------------------------------------------- 1 | defmodule GenMetrics do 2 | 3 | @moduledoc """ 4 | Runtime metrics for `GenServer` and `GenStage` applications. 5 | 6 | **Important:** 7 | The GenMetrics library is not suitable for use within long-running 8 | production environments. For further details, see the [benchmarks 9 | performance guide](https://github.com/onetapbeyond/gen_metrics#benchmarks). 10 | 11 | This library supports the collection and publication of GenServer and GenStage 12 | runtime metrics. Metrics data are generated by an introspection agent. No 13 | instrumentation is required within the GenServer or GenStage library 14 | or within your application source code. 
15 | 16 | GenMetrics data can be used to reveal insights into live application 17 | performance and identify patterns of behaviour within an application over 18 | time. Metrics data can be used to drive any number of operational systems, 19 | including realtime dashboards, monitoring and alerting systems. 20 | 21 | By default, metrics are published by a dedicated GenMetrics reporting process. 22 | Any application can subscribe to this process in order to aggregate, render, 23 | persist, or generally handle metrics data. Metrics data can also be pushed 24 | directly to a `statsd` agent which makes it possible to analyze and visualize 25 | the metrics within existing tools and services like `Grafana` and `Datadog`. 26 | 27 | The metrics data collected by this library includes both summary metrics and 28 | optional detailed statistical metrics. Summary metrics and statistical 29 | metrics for GenServer and GenStage applications are described in detail below. 30 | 31 | ## GenMetrics Installation 32 | 33 | Simply add `gen_metrics` as a `deps` dependency in your Mixfile. 34 | 35 | ## GenMetrics for GenServer Applications 36 | 37 | Any application using the `GenServer` behaviour can immediately benefit from 38 | the insights afforded by GenMetrics. The following sections explain how. 39 | For `GenStage` applications, see the docs 40 | [here](#module-genmetrics-for-genstage-applications). 41 | 42 | ### GenServer Metrics Activation 43 | 44 | A `GenMetrics.GenServer.Cluster` struct is used to identify one or more 45 | GenServer modules that become candidates for metrics collection.
For example, 46 | assuming your application has a `Session.Server` and a `Logging.Server` you 47 | can activate metrics collection on both GenServers as follows: 48 | 49 | ``` 50 | alias GenMetrics.GenServer.Cluster 51 | cluster = %Cluster{name: "demo", servers: [Session.Server, Logging.Server]} 52 | GenMetrics.monitor_cluster(cluster) 53 | ``` 54 | 55 | The *cluster* in this context is simply a named set of one or more GenServer 56 | modules about which you would like to collect metrics data. Metrics data 57 | are collected on server processes executing on the local node. 58 | 59 | GenMetrics will instantly attach to running GenServer processes associated 60 | with your cluster. If there are no running server processes associated with 61 | your cluster when `GenMetrics.monitor_cluster/1` is called, GenMetrics will 62 | monitor for process activation and automatically begin metrics collection 63 | for each new process. 64 | 65 | ### GenServer Metrics Sampling 66 | 67 | Sampling metrics is an effective way to collect and report metrics for any 68 | server while minimizing the runtime overhead introduced by the GenMetrics 69 | monitoring agent. 70 | 71 | When sampling is disabled, metrics data reflect the exact behaviour of the 72 | processes being monitored. When sampling is enabled, metrics data reflect 73 | an approximation of the behaviour of the processes being monitored.
74 | 75 | Given an application with the following GenServers: `Session.Server`, 76 | `Logging.Server`, activate metrics-sampling for the server cluster as follows: 77 | 78 | ```elixir 79 | alias GenMetrics.GenServer.Cluster 80 | cluster = %Cluster{name: "demo", 81 | servers: [Session.Server, Logging.Server], 82 | opts: [sample_rate: 0.3]} 83 | GenMetrics.monitor_cluster(cluster) 84 | ``` 85 | 86 | ### GenServer Summary Metrics 87 | 88 | Summary metrics are collected for activity within the following GenServer 89 | callbacks: 90 | 91 | - `GenServer.handle_call/3` 92 | - `GenServer.handle_cast/2` 93 | - `GenServer.handle_info/2` 94 | 95 | GenMetrics collects both the number of callbacks and the time taken on 96 | those callbacks for each of the server processes within your cluster. 97 | 98 | Summary metrics are aggregated across a periodic time interval, known as a 99 | *window*. By default, the window interval is `1000 ms`. This interval may be 100 | customized using the `window_interval` option on `GenMetrics.monitor_cluster/1` 101 | as shown here: 102 | 103 | ``` 104 | alias GenMetrics.GenServer.Cluster 105 | cluster = %Cluster{name: "demo", 106 | servers: [Session.Server, Logging.Server], 107 | opts: [window_interval: 5000]} 108 | GenMetrics.monitor_cluster(cluster) 109 | ``` 110 | 111 | The following are sample summary metrics reported for a single window interval 112 | on a GenServer process: 113 | 114 | ``` 115 | # Server Name: Demo.Server, PID<0.176.0> 116 | 117 | %GenMetrics.GenServer.Summary{name: Demo.Server, 118 | pid: #PID<0.176.0>, 119 | calls: 8000, 120 | casts: 34500, 121 | infos: 3333, 122 | time_on_calls: 28, 123 | time_on_casts: 161, 124 | time_on_infos: 15} 125 | ``` 126 | 127 | All timings reported on summary metrics are reported in `milliseconds (ms)`. 128 | For example, during this sample window interval, the `handle_cast/2` callback 129 | was executed `34500` times. The total time spent processing those callbacks 130 | was just `161 ms`. 
131 | 132 | ### GenServer Statistical Metrics 133 | 134 | Summary metrics provide near-realtime insights into the runtime behaviour 135 | of any GenServer application. However, sometimes more fine grained metrics 136 | data may be required to truly understand the subtleties of your application's 137 | runtime behaviour. To cater for those cases, GenMetrics supports optional 138 | statistical metrics. 139 | 140 | Statistical metrics may be activated using the `statistics` option on 141 | `GenMetrics.monitor_cluster/1`. GenMetrics `in-memory` metrics are activated 142 | as shown here: 143 | 144 | ``` 145 | alias GenMetrics.GenServer.Cluster 146 | cluster = %Cluster{name: "demo", 147 | servers: [Session.Server, Logging.Server], 148 | opts: [statistics: true]} 149 | GenMetrics.monitor_cluster(cluster) 150 | ``` 151 | 152 | Activating in-memory statistical metrics is a lot like activating a 153 | `statsd agent` directly within the BEAM. This can impact the runtime 154 | performance of some applications so redirecting metrics to an external 155 | agent is typically recommended. 156 | 157 | Redirecting statistical metrics to a `statsd` agent simply requires the 158 | following `opts` configuration: 159 | 160 | ``` 161 | opts: [statistics: :statsd]} 162 | ``` 163 | 164 | Redirecting statistical metrics to the `Datadog` statsd-agent requires the 165 | following `opts` configuration: 166 | 167 | ``` 168 | opts: [statistics: :datadog]} 169 | ``` 170 | 171 | Metrics directed to Datadog include tagging data which makes it very easy 172 | to subset and query the metrics that you need to monitor. 
173 | 174 | The following are sample `in-memory` statistical metrics reported for a 175 | single window interval on a GenServer process: 176 | 177 | ``` 178 | # Server Name: Demo.Server, PID<0.176.0> 179 | 180 | # handle_call/3 181 | %GenMetrics.GenServer.Stats{callbacks: 8000, 182 | max: 149, 183 | mean: 3, 184 | min: 2, 185 | range: 147, 186 | stdev: 2, 187 | total: 25753} 188 | 189 | # handle_cast/2 190 | %GenMetrics.GenServer.Stats{callbacks: 34500, 191 | max: 3368, 192 | mean: 4, 193 | min: 2, 194 | range: 3366, 195 | stdev: 31, 196 | total: 141383} 197 | 198 | # handle_info/2 199 | %GenMetrics.GenServer.Stats{callbacks: 3333, 200 | max: 37, 201 | mean: 4, 202 | min: 2, 203 | range: 35, 204 | stdev: 2, 205 | total: 13510} 206 | ``` 207 | 208 | All timings reported on `in-memory` statistical metrics are reported in 209 | `microseconds (µs)`. For example, during this sample window interval, the 210 | `handle_cast/2` callback was executed `34500` times. The total time spent 211 | processing those callbacks was `141383 µs`. The `mean` time taken per 212 | callback was `4 µs` while the `standard deviation` around the mean was `31 µs`. 213 | 214 | *Note:* Under heavy load the generation of `in-memory` statistical metrics can 215 | become computationally expensive. It is therefore recommended that 216 | `in-memory` metrics be activated in production environments *judiciously*. 217 | These concerns are negligible when redirecting statistical metrics to 218 | `:statsd` or `:datadog` as custom sampling-rates may be configured. 219 | 220 | 221 | ### GenServer Reporting Metrics 222 | 223 | Runtime `in-memory` metrics for servers in your cluster are published via 224 | a dedicated reporting process. The reporting process is registered locally 225 | by the GenMetrics library at startup. This process is registered under the 226 | name `GenMetrics.GenServer.Reporter`. 227 | 228 | The reporting process is a `GenStage` producer that broadcasts metrics data. 
229 | Any number of consumers can subscribe to this process in order to handle 230 | metrics data. 231 | 232 | Note, if you are redirecting statistical metrics to `:statsd` or `:datadog` 233 | there is no need to subscribe to this reporting process. 234 | 235 | In order to subscribe, a simple GenStage `:consumer` can initialize itself 236 | to receive events from the reporting process as follows: 237 | 238 | ``` 239 | def init(:ok) do 240 | # Subscribe as consumer to the GenMetrics.GenServer.Reporter producer. 241 | {:consumer, :state_does_not_matter, 242 | subscribe_to: [{GenMetrics.GenServer.Reporter, max_demand: 1}]} 243 | end 244 | ``` 245 | 246 | On receipt of events from the reporting process, metrics data can be extracted 247 | for processing to suit any need. The following example demonstrates simple 248 | logging of summary metrics data: 249 | 250 | ``` 251 | def handle_events([metrics | _], _from, state) do 252 | # Log summary metrics for each server within the GenServer cluster. 253 | for summary <- metrics.summary do 254 | Logger.info "GenMetrics.Consumer: cluster.server summary=\#{inspect summary}" 255 | end 256 | {:noreply, [], state} 257 | end 258 | ``` 259 | 260 | ## GenMetrics for GenStage Applications 261 | 262 | Any application using the `GenStage` behaviour can immediately benefit from 263 | the insights afforded by GenMetrics. The following sections explain how. For 264 | `GenServer` applications, see the docs 265 | [here](#module-genmetrics-for-genserver-applications). 266 | 267 | ### GenStage Metrics Activation 268 | 269 | A `GenMetrics.GenStage.Pipeline` struct is used to identify one or more 270 | GenStages that become candidates for metrics collection. You can 271 | identify a complete pipeline including all `:producers`, `:producer_consumers` 272 | and `:consumers`, or any subset of stages within a pipeline. 
273 | 274 | For example, assuming your GenStage application has a `Data.Producer`, 275 | a `Data.Scrubber`, a `Data.Analyzer` and a `Data.Consumer`, you can activate 276 | metrics collection for the entire pipeline as follows: 277 | 278 | ``` 279 | alias GenMetrics.GenStage.Pipeline 280 | pipeline = %Pipeline{name: "demo", 281 | producer: [Data.Producer], 282 | producer_consumer: [Data.Scrubber, Data.Analyzer], 283 | consumer: [Data.Consumer]} 284 | GenMetrics.monitor_pipeline(pipeline) 285 | ``` 286 | 287 | Alternatively, if you only wanted to activate metrics collection for the 288 | `:producer_consumer` stages within the pipeline you can do the following: 289 | 290 | ``` 291 | alias GenMetrics.GenStage.Pipeline 292 | pipeline = %Pipeline{name: "demo", 293 | producer_consumer: [Data.Scrubber, Data.Analyzer]} 294 | GenMetrics.monitor_pipeline(pipeline) 295 | ``` 296 | 297 | The *pipeline* in this context is simply a named set of one or more GenStage 298 | modules about which you would like to collect metrics data. Metrics data are 299 | collected on stage processes executing on the local node. 300 | 301 | GenMetrics will instantly attach to running GenStage processes associated 302 | with your pipeline. If there are no running GenStage processes associated with 303 | your pipeline when `GenMetrics.monitor_pipeline/1` is called, GenMetrics will 304 | monitor for process activation and automatically begin metrics collection 305 | for each new process. 306 | 307 | 308 | ### GenStage Metrics Sampling 309 | 310 | Sampling metrics is an effective way to collect and report metrics for 311 | any pipeline while minimizing the runtime overhead introduced by 312 | the GenMetrics monitoring agent. 313 | 314 | When sampling is disabled, metrics data reflect the exact behaviour of the 315 | processes being monitored. When sampling is enabled, metrics data reflect 316 | an approximation of the behaviour of the processes being monitored.
317 | 318 | Given a GenStage application with the following stages: `Data.Producer`, 319 | `Data.Scrubber`, `Data.Analyzer` and a `Data.Consumer`, activate 320 | metrics-sampling for the entire pipeline as follows: 321 | 322 | ```elixir 323 | alias GenMetrics.GenStage.Pipeline 324 | pipeline = %Pipeline{name: "demo", 325 | producer: [Data.Producer], 326 | producer_consumer: [Data.Scrubber, Data.Analyzer], 327 | consumer: [Data.Consumer], 328 | opts: [sample_rate: 0.1]} 329 | GenMetrics.monitor_pipeline(pipeline) 330 | ``` 331 | 332 | ### GenMetrics Summary Metrics 333 | 334 | Summary metrics are collected for activity within the following GenStage 335 | callbacks: 336 | 337 | - `GenStage.handle_demand/2` 338 | - `GenStage.handle_events/3` 339 | - `GenStage.handle_call/3` 340 | - `GenStage.handle_cast/2` 341 | 342 | GenMetrics collects the number of callbacks, the time taken on those 343 | callbacks, the size of upstream demand, and the number of events generated 344 | in response to that demand, for each of the stages within your pipeline. 345 | 346 | Summary metrics are aggregated across a periodic time interval, known as a 347 | *window*. By default, the window interval is `1000 ms`. 
This interval may be 348 | customized using the `window_interval` option on 349 | `GenMetrics.monitor_pipeline/1` as shown here: 350 | 351 | ``` 352 | alias GenMetrics.GenStage.Pipeline 353 | pipeline = %Pipeline{name: "demo", 354 | producer_consumer: [Data.Scrubber, Data.Analyzer], 355 | opts: [window_interval: 5000]} 356 | GenMetrics.monitor_pipeline(pipeline) 357 | ``` 358 | 359 | The following are sample summary metrics reported for a single window interval 360 | on a GenStage process: 361 | 362 | ``` 363 | # Stage Name: Data.Producer, PID<0.195.0> 364 | 365 | %GenMetrics.GenStage.Summary{stage: Data.Producer, 366 | pid: #PID<0.195.0>, 367 | callbacks: 9536, 368 | time_on_callbacks: 407, 369 | demand: 4768000, 370 | events: 4768000} 371 | ``` 372 | 373 | All timings reported on summary metrics are reported in `milliseconds (ms)`. 374 | For example, during this sample window interval, `9536` callbacks were 375 | handled by the `Data.Producer` stage. The total time spent processing those 376 | callbacks was `407 ms`. 377 | 378 | During that time, total upstream demand on the stage was `4768000`. A total of 379 | `4768000` events were also generated and emitted by the stage. This tells us 380 | that the stage was able to fully meet upstream demand during this specific 381 | sample window interval. 382 | 383 | ### GenMetrics Statistical Metrics 384 | 385 | Summary metrics provide near-realtime insights into the runtime behaviour 386 | of any GenStage application. However, sometimes more fine grained metrics 387 | data may be required to truly understand the subtleties of your application's 388 | runtime behaviour. To cater for those cases, GenMetrics supports optional 389 | statistical metrics. 390 | 391 | Statistical metrics may be activated using the `statistics` option on 392 | `GenMetrics.monitor_pipeline/1`. 
GenMetrics `in-memory` metrics are activated 393 | as shown here: 394 | 395 | ``` 396 | alias GenMetrics.GenStage.Pipeline 397 | pipeline = %Pipeline{name: "demo", 398 | producer_consumer: [Data.Scrubber, Data.Analyzer], 399 | opts: [statistics: true]} 400 | GenMetrics.monitor_pipeline(pipeline) 401 | ``` 402 | 403 | Redirecting statistical metrics to a `statsd` agent simply requires the 404 | following `opts` configuration: 405 | 406 | ``` 407 | opts: [statistics: :statsd]} 408 | ``` 409 | 410 | Redirecting statistical metrics to the `Datadog` statsd-agent requires the 411 | following `opts` configuration: 412 | 413 | ``` 414 | opts: [statistics: :datadog]} 415 | ``` 416 | 417 | Metrics directed to Datadog include tagging data which makes it very easy 418 | to subset and query the metrics that you need to monitor. 419 | 420 | The following are sample `in-memory` statistical metrics reported for a 421 | single window interval on a GenStage process: 422 | 423 | ``` 424 | # Stage Name: Data.Producer, PID<0.195.0> 425 | 426 | # callback demand 427 | %GenMetrics.GenStage.Stats{callbacks: 9536, 428 | max: 500, 429 | mean: 500, 430 | min: 500, 431 | range: 0, 432 | stdev: 0, 433 | total: 4768000} 434 | # callback events 435 | %GenMetrics.GenStage.Stats{callbacks: 9536, 436 | max: 500, 437 | mean: 500, 438 | min: 500, 439 | range: 0, 440 | stdev: 0, 441 | total: 4768000} 442 | 443 | # callback timings 444 | %GenMetrics.GenStage.Stats{callbacks: 9536, 445 | max: 2979, 446 | mean: 42, 447 | min: 24, 448 | range: 2955, 449 | stdev: 38, 450 | total: 403170} 451 | ``` 452 | 453 | All timings reported on `in-memory` statistical metrics are reported in 454 | `microseconds (µs)`. For example, during this sample window interval, `9536` 455 | callbacks were handled by the `Data.Producer` stage. The total time spent 456 | processing those callbacks was `403170 µs`. The `mean` time taken per 457 | callback was `42 µs` while the `standard deviation` around the mean was 458 | `38 µs`. 
459 | 460 | Here, the total upstream demand of `4768000` equalled the total events emitted 461 | by the stage. This tells us that the stage was able to fully meet upstream 462 | demand during this specific sample window interval. 463 | 464 | *Note:* Under heavy load the generation of `in-memory` statistical metrics can 465 | become computationally expensive. It is therefore recommended that 466 | `in-memory` metrics be activated in production environments *judiciously*. 467 | These concerns are negligible when redirecting statistical metrics to 468 | `:statsd` or `:datadog` as custom sampling-rates may be configured. 469 | 470 | ### GenMetrics Reporting Metrics 471 | 472 | Runtime `in-memory` metrics for stages in your pipeline are published 473 | via a dedicated reporting process. The reporting process is registered 474 | locally by the GenMetrics library at startup. This process is registered 475 | under the name `GenMetrics.GenStage.Reporter`. 476 | 477 | The reporting process itself is a `GenStage` producer that broadcasts metrics 478 | data. Any number of consumers can subscribe to this process in order to handle 479 | metrics data. 480 | 481 | Note, if you are redirecting statistical metrics to `:statsd` or `:datadog` 482 | there is no need to subscribe to this reporting process. 483 | 484 | In order to subscribe, a simple GenStage `:consumer` can initialize itself 485 | to receive events from the reporting process as follows: 486 | 487 | ``` 488 | def init(:ok) do 489 | # Subscribe as consumer to the GenMetrics.GenStage.Reporter producer. 490 | {:consumer, :state_does_not_matter, 491 | subscribe_to: [{GenMetrics.GenStage.Reporter, max_demand: 1}]} 492 | end 493 | ``` 494 | 495 | On receipt of events from the reporting process, metrics data can be extracted 496 | for processing to suit any need. 
The following example demonstrates simple 497 | logging of summary metrics data: 498 | 499 | ``` 500 | def handle_events([metrics | _], _from, state) do 501 | # Log summary metrics for each stage within the GenStage pipeline. 502 | for summary <- metrics.summary do 503 | Logger.info "GenMetrics.Consumer: pipeline.stage summary=\#{inspect summary}" 504 | end 505 | {:noreply, [], state} 506 | end 507 | ``` 508 | 509 | """ 510 | 511 | alias GenMetrics.GenServer 512 | alias GenMetrics.GenStage 513 | alias GenMetrics.GenServer.Cluster 514 | alias GenMetrics.GenStage.Pipeline 515 | 516 | @doc """ 517 | Activate metrics collection and publishing for one or more GenServers. 518 | 519 | ## Example Usage 520 | 521 | Assuming an application has a `Session.Server` and a `Logging.Server` you 522 | can activate metrics collection on both GenServers as follows: 523 | 524 | ``` 525 | alias GenMetrics.GenServer.Cluster 526 | cluster = %Cluster{name: "demo", 527 | servers: [Session.Server, Logging.Server], 528 | opts: [window_interval: 5000]} 529 | GenMetrics.monitor_cluster(cluster) 530 | ``` 531 | 532 | ## Cluster Validation 533 | 534 | When this function is called the GenMetrics library checks and verifies 535 | the following conditions are met: 536 | 537 | 1. All server modules specified on the cluster can be located and loaded 538 | 1. All server modules specified on the cluster implement the GenServer 539 | behaviour 540 | 541 | If any module in the cluster does not meet these conditions the 542 | function terminates with a `:bad_cluster` response and supporting error 543 | messages. 544 | 545 | ## Metrics Reporting 546 | 547 | By default, metrics data gathered on your cluster are maintained `in-memory` 548 | and reported by a dedicated reporting process. However, metrics data can 549 | be redirected to `:statsd` or `:datadog` using the `statistics` configuration 550 | option on this call. 
551 | 552 | For example: redirect your cluster metrics data to the `Datadog` service as 553 | follows: 554 | 555 | ``` 556 | alias GenMetrics.GenServer.Cluster 557 | cluster = %Cluster{name: "demo", 558 | servers: [Session.Server, Logging.Server], 559 | opts: [statistics: :datadog]} 560 | GenMetrics.monitor_cluster(cluster) 561 | ``` 562 | 563 | """ 564 | @spec monitor_cluster(%Cluster{}) :: 565 | {:ok, pid} | {:error, :bad_cluster, [String.t]} 566 | def monitor_cluster(%Cluster{} = cluster) do 567 | Supervisor.start_child(GenServer.Supervisor, [cluster]) 568 | end 569 | 570 | @doc """ 571 | Activate metrics collection and publishing for one or more stages 572 | within a GenStage pipeline. 573 | 574 | ## Example Usage 575 | 576 | Assuming a GenStage application has a `Data.Producer`, a `Data.Scrubber`, 577 | a `Data.Analyzer` and a `Data.Consumer`, you can activate metrics 578 | collection for the entire pipeline as follows: 579 | 580 | ``` 581 | alias GenMetrics.GenStage.Pipeline 582 | pipeline = %Pipeline{name: "demo", 583 | producer: [Data.Producer], 584 | producer_consumer: [Data.Scrubber, Data.Analyzer], 585 | consumer: [Data.Consumer]} 586 | GenMetrics.monitor_pipeline(pipeline) 587 | ``` 588 | 589 | ## Pipeline Validation 590 | 591 | When this function is called the GenMetrics library checks and verifies 592 | the following conditions are met: 593 | 594 | 1. All stage modules specified on the pipeline can be located and loaded 595 | 1. All stage modules specified on the pipeline implement the GenStage behaviour 596 | 597 | If any module in the pipeline does not meet these conditions the 598 | function terminates with a `:bad_pipeline` response and supporting error 599 | messages. 600 | 601 | 602 | ## Metrics Reporting 603 | 604 | By default, metrics data gathered on your pipeline are maintained `in-memory` 605 | and reported by a dedicated reporting process.
However, metrics data can 606 | be redirected to `:statsd` or `:datadog` using the `statistics` configuration 607 | option on this call. 608 | 609 | For example: redirect your pipeline metrics data to a `statsd` agent as 610 | follows: 611 | 612 | ``` 613 | alias GenMetrics.GenStage.Pipeline 614 | pipeline = %Pipeline{name: "demo", 615 | producer: [Data.Producer], 616 | producer_consumer: [Data.Scrubber, Data.Analyzer], 617 | consumer: [Data.Consumer], 618 | opts: [statistics: :statsd]} 619 | GenMetrics.monitor_pipeline(pipeline) 620 | ``` 621 | """ 622 | @spec monitor_pipeline(%Pipeline{}) :: 623 | {:ok, pid} | {:error, :bad_pipeline, [String.t]} 624 | def monitor_pipeline(%Pipeline{} = pipeline) do 625 | Supervisor.start_child(GenStage.Supervisor, [pipeline]) 626 | end 627 | 628 | end 629 | --------------------------------------------------------------------------------