├── test
│   ├── test_helper.exs
│   └── gen_metrics_test.exs
├── .gitignore
├── config
│   └── config.exs
├── PITCHME.yaml
├── priv
│   └── assets
│       └── img
│           └── elixir-logo.png
├── lib
│   ├── gen_server
│   │   ├── supervisor.ex
│   │   ├── window.ex
│   │   ├── server.ex
│   │   ├── metric.ex
│   │   ├── stats.ex
│   │   ├── summary.ex
│   │   ├── cluster.ex
│   │   ├── manager.ex
│   │   └── monitor.ex
│   ├── gen_stage
│   │   ├── supervisor.ex
│   │   ├── stage.ex
│   │   ├── window.ex
│   │   ├── stats.ex
│   │   ├── summary.ex
│   │   ├── metric.ex
│   │   ├── pipeline.ex
│   │   ├── manager.ex
│   │   └── monitor.ex
│   ├── reporter_supervisor.ex
│   ├── application.ex
│   ├── reporter.ex
│   ├── utils
│   │   ├── math.ex
│   │   ├── stats_push.ex
│   │   └── runtime.ex
│   └── gen_metrics.ex
├── LICENSE
├── bench
│   ├── infinite_server.exs
│   ├── infinite_pipeline.exs
│   ├── trace_server.exs
│   ├── sample_server.exs
│   ├── trace_pipeline.exs
│   ├── sample_pipeline.exs
│   ├── support
│   │   ├── server.exs
│   │   └── stages.exs
│   └── README.md
├── mix.lock
├── mix.exs
├── examples
│   ├── genserver_events.exs
│   ├── genstage_producer_consumer.exs
│   ├── genstage_rate_limiter.exs
│   └── genstage_gen_event.exs
├── README.md
└── PITCHME.md
/test/test_helper.exs:
--------------------------------------------------------------------------------
1 | ExUnit.start()
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /_build
2 | /cover
3 | /deps
4 | /doc
5 | erl_crash.dump
6 | *.ez
7 |
--------------------------------------------------------------------------------
/config/config.exs:
--------------------------------------------------------------------------------
1 | use Mix.Config
2 |
3 | config :gen_metrics, sample_rate: 1.0
4 |
--------------------------------------------------------------------------------
/PITCHME.yaml:
--------------------------------------------------------------------------------
1 | theme : sky
2 | logo : priv/assets/img/elixir-logo.png
3 | revealjs: 3.4.1
4 |
--------------------------------------------------------------------------------
/priv/assets/img/elixir-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onetapbeyond/gen_metrics/HEAD/priv/assets/img/elixir-logo.png
--------------------------------------------------------------------------------
/test/gen_metrics_test.exs:
--------------------------------------------------------------------------------
1 | defmodule GenMetricsTest do
2 | use ExUnit.Case
3 | doctest GenMetrics
4 |
5 | test "the truth" do
6 | assert 1 + 1 == 2
7 | end
8 | end
9 |
--------------------------------------------------------------------------------
/lib/gen_server/supervisor.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenServer.Supervisor do
2 | @moduledoc false
3 | use Supervisor
4 | alias GenMetrics.GenServer.Monitor
5 |
6 | def start_link do
7 | Supervisor.start_link(__MODULE__, [], [name: __MODULE__])
8 | end
9 |
10 | def init(_) do
11 |
12 | children = [
13 | worker(Monitor, [], restart: :transient)
14 | ]
15 |
16 | supervise(children, strategy: :simple_one_for_one)
17 | end
18 |
19 | end
20 |
--------------------------------------------------------------------------------
/lib/gen_stage/supervisor.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenStage.Supervisor do
2 | @moduledoc false
3 | use Supervisor
4 | alias GenMetrics.GenStage.Monitor
5 |
6 | def start_link do
7 | Supervisor.start_link(__MODULE__, [], [name: __MODULE__])
8 | end
9 |
10 | def init(_) do
11 |
12 | children = [
13 | worker(Monitor, [], restart: :transient)
14 | ]
15 |
16 | supervise(children, strategy: :simple_one_for_one)
17 | end
18 |
19 | end
20 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2017, David Russell
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | http://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
--------------------------------------------------------------------------------
/lib/reporter_supervisor.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.Reporter.Supervisor do
2 | use Supervisor
3 |
4 | @moduledoc false
5 |
6 | def start_link do
7 | Supervisor.start_link(__MODULE__, [], [name: __MODULE__])
8 | end
9 |
10 | def init(_) do
11 |
12 | children = [
13 | worker(GenMetrics.Reporter,
14 | [GenMetrics.GenServer.Reporter],
15 | [id: GenMetrics.GenServer.Reporter]),
16 | worker(GenMetrics.Reporter,
17 | [GenMetrics.GenStage.Reporter],
18 | [id: GenMetrics.GenStage.Reporter])
19 | ]
20 |
21 | supervise(children, strategy: :one_for_one)
22 | end
23 |
24 | end
25 |
--------------------------------------------------------------------------------
/lib/gen_stage/stage.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenStage.Stage do
2 | alias GenMetrics.GenStage.Stats
3 |
4 | @moduledoc """
5 | A struct used to aggregate statistical metrics data for a GenStage process.
6 |
7 | The fields are:
8 | * `name` - the module name for the GenStage process
9 |
10 | * `pid` - the `pid` for the GenStage process
11 |
12 | * `demand` - `GenMetrics.GenStage.Stats` for upstream demand
13 |
14 | * `events` - `GenMetrics.GenStage.Stats` for emitted events
15 |
16 | * `timings` - `GenMetrics.GenStage.Stats` for time on GenStage callbacks
17 | """
18 |
19 | defstruct name: nil, pid: nil,
20 | demand: %Stats{}, events: %Stats{}, timings: %Stats{}
21 |
22 | end
23 |
--------------------------------------------------------------------------------
/lib/application.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.Application do
2 | @moduledoc false
3 | use Application
4 |
5 | alias GenMetrics.GenServer
6 | alias GenMetrics.GenStage
7 | alias GenMetrics.Reporter
8 | alias GenMetrics.Utils.StatsPush
9 |
10 | def start(_type, _args) do
11 | import Supervisor.Spec, warn: false
12 |
13 | # Activate Statix (statsd) connection on startup.
14 | :ok = StatsPush.connect
15 |
16 | children = [
17 | supervisor(GenServer.Supervisor, []),
18 | supervisor(GenStage.Supervisor, []),
19 | supervisor(Reporter.Supervisor, [])
20 | ]
21 |
22 | opts = [strategy: :one_for_one, name: GenMetrics.Supervisor]
23 | Supervisor.start_link(children, opts)
24 | end
25 | end
26 |
--------------------------------------------------------------------------------
/lib/gen_server/window.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenServer.Window do
2 |
3 | @moduledoc """
4 | A struct used by the GenMetrics reporting process to periodically
5 | publish metrics data for a GenServer cluster.
6 |
7 | The fields are:
8 |
9 | * `cluster` - the associated `GenMetrics.GenServer.Cluster`
10 |
11 | * `start` - the start time for the current metrics window interval
12 |
13 | * `duration` - the length (ms) of the current metrics window interval
14 |
15 | * `summary` - a list of `GenMetrics.GenServer.Summary`, one item per process
16 | in the cluster
17 |
18 | * `stats` - (optional) a list of `GenMetrics.GenServer.Server`
19 | """
20 |
21 | defstruct cluster: nil, start: 0, duration: 0, stats: [], summary: []
22 | end
23 |
--------------------------------------------------------------------------------
/lib/gen_server/server.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenServer.Server do
2 | alias GenMetrics.GenServer.Stats
3 |
4 | @moduledoc """
5 | A struct used to aggregate statistical metrics data for a GenServer process.
6 |
7 | The fields are:
8 | * `name` - the module name for the GenServer process
9 |
10 | * `pid` - the `pid` for the GenServer process
11 |
12 | * `calls` - `GenMetrics.GenServer.Stats` for `GenServer.handle_call/3` callbacks
13 |
14 | * `casts` - `GenMetrics.GenServer.Stats` for `GenServer.handle_cast/2` callbacks
15 |
16 | * `infos` - `GenMetrics.GenServer.Stats` for `GenServer.handle_info/2` callbacks
17 | """
18 |
19 | defstruct name: nil, pid: nil,
20 | calls: %Stats{}, casts: %Stats{}, infos: %Stats{}
21 |
22 | end
23 |
--------------------------------------------------------------------------------
/lib/gen_stage/window.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenStage.Window do
2 |
3 | @moduledoc """
4 | A struct used by the GenMetrics reporting process to periodically
5 | publish metrics data for a GenStage pipeline.
6 |
7 | The fields are:
8 |
9 | * `pipeline` - the associated `GenMetrics.GenStage.Pipeline`
10 |
11 | * `start` - the start time for the current metrics window interval
12 |
13 | * `duration` - the length (ms) of the current metrics window interval
14 |
15 | * `summary` - a list of `GenMetrics.GenStage.Summary`, one item per process
16 | on the pipeline
17 |
18 | * `stats` - (optional) a list of `GenMetrics.GenStage.Stage`
19 | """
20 |
21 | defstruct pipeline: nil, start: 0, duration: 0, stats: [], summary: []
22 |
23 | end
24 |
--------------------------------------------------------------------------------
/lib/gen_server/metric.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenServer.Metric do
2 |   @moduledoc false
3 |   alias GenMetrics.Utils.Runtime
4 |
5 |   # Divisor applied to a raw timestamp difference in `stop/2`; the name
6 |   # indicates a nanosecond-to-microsecond conversion.
7 |   @nano_to_micro 1000
8 |
9 |   defstruct start: 0, duration: 0
10 |
11 |   # Partial (start-only) measurement: simply the timestamp itself.
12 |   def partial(ts), do: ts
13 |
14 |   # Folds one completed callback into `summary_paired` under `mkey`.
15 |   # Each entry is `{callback_count, accumulated_time}`, where the time
16 |   # added is the gap between `ts` and the earlier `partial` timestamp.
17 |   def pair(summary_paired, mkey, ts, partial) do
18 |     elapsed = ts - partial
19 |     first_entry = {1, elapsed}
20 |
21 |     Map.update(summary_paired, mkey, first_entry, fn {count, total} ->
22 |       {count + 1, total + elapsed}
23 |     end)
24 |   end
25 |
26 |   # Summary entry for a key with no completed callbacks.
27 |   def no_pair, do: {0, 0}
28 |
29 |   # Begins a metric at timestamp `ts`; `duration` stays 0 until `stop/2`.
30 |   def start(ts), do: %__MODULE__{start: ts}
31 |
32 |   # Completes `partial` by storing the time elapsed since `partial.start`,
33 |   # scaled down by `@nano_to_micro` via `Runtime.safe_div/2`.
34 |   def stop(partial, ts) do
35 |     elapsed = Runtime.safe_div(ts - partial.start, @nano_to_micro)
36 |     %__MODULE__{partial | duration: elapsed}
37 |   end
38 | end
39 |
--------------------------------------------------------------------------------
/lib/gen_stage/stats.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenStage.Stats do
2 |
3 | @moduledoc """
4 | A struct used to report statistical metrics data for a GenStage process.
5 |
6 | The fields are:
7 |
8 | * `callbacks` - the total number of callbacks handled by the process
9 |
10 | * `total` - the total time spent (µs) on all callbacks
11 |
12 | * `max` - the maximum time spent (µs) on any callback
13 |
14 | * `min` - the minimum time spent (µs) on any callback
15 |
16 | * `mean` - the mean time spent (µs) on any callback
17 |
18 | * `stdev` - the standard deviation around the mean time spent (µs) on
19 | any callback
20 |
21 | * `range` - the difference between max and min time spent (µs) on all callbacks
22 | """
23 |
24 | defstruct callbacks: 0, min: 0, max: 0, total: 0,
25 | mean: 0, stdev: 0, range: 0
26 | end
27 |
--------------------------------------------------------------------------------
/lib/gen_stage/summary.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenStage.Summary do
2 |
3 | @moduledoc """
4 | A struct used to report summary metrics data for a GenStage process.
5 | The numbers reported reflect totals during a given metrics collection
6 | window interval.
7 |
8 | The fields are:
9 |
10 | * `name` - the module name for the GenStage process
11 |
12 | * `pid` - the `pid` for the GenStage process
13 |
14 | * `callbacks` - the number of callbacks on the GenStage process
15 |
16 | * `time_on_callbacks` - the number of milliseconds spent on callbacks
17 |
18 | * `demand` - the upstream demand on the GenStage process
19 |
20 | * `events` - the number of events emitted by the GenStage process
21 | """
22 |
23 | defstruct name: nil, pid: nil,
24 | callbacks: 0, time_on_callbacks: 0, demand: 0, events: 0
25 |
26 | end
27 |
--------------------------------------------------------------------------------
/lib/gen_server/stats.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenServer.Stats do
2 |
3 | @moduledoc """
4 | A struct used to report statistical metrics data for a GenServer process.
5 |
6 | The fields are:
7 |
8 | * `callbacks` - the total number of callbacks handled by the process
9 |
10 | * `total` - the total time spent (µs) on all callbacks
11 |
12 | * `max` - the maximum time spent (µs) on any callback
13 |
14 | * `min` - the minimum time spent (µs) on any callback
15 |
16 | * `mean` - the mean time spent (µs) on any callback
17 |
18 | * `stdev` - the standard deviation around the mean time spent (µs) on
19 | any callback
20 |
21 | * `range` - the difference between max and min time spent (µs) on all callbacks
22 | """
23 |
24 | defstruct callbacks: 0, min: 0, max: 0, total: 0,
25 | mean: 0, stdev: 0, range: 0
26 |
27 | end
28 |
--------------------------------------------------------------------------------
/lib/reporter.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.Reporter do
2 |   use GenStage
3 |
4 |   @moduledoc false
5 |
6 |   # GenStage producer that hands metrics windows to its subscribers through
7 |   # the broadcast dispatcher. The process state is the subscriber count.
8 |
9 |   # Starts the reporter registered under `name` with a subscriber count of 0.
10 |   def start_link(name), do: GenStage.start_link(__MODULE__, 0, name: name)
11 |
12 |   # Asynchronously hands `window` to `reporter` for publication.
13 |   def push(reporter, window), do: GenStage.cast(reporter, {:monitor_metrics, window})
14 |
15 |   def init(subscriber_count) do
16 |     {:producer, subscriber_count, dispatcher: GenStage.BroadcastDispatcher}
17 |   end
18 |
19 |   # Every new subscription bumps the subscriber count.
20 |   def handle_subscribe(_kind, _opts, _from, subscriber_count) do
21 |     {:automatic, subscriber_count + 1}
22 |   end
23 |
24 |   # Every cancellation lowers the subscriber count, never below zero.
25 |   def handle_cancel(_reason, _from, subscriber_count) do
26 |     {:noreply, [], max(subscriber_count - 1, 0)}
27 |   end
28 |
29 |   # With no subscribers the window is dropped instead of being emitted.
30 |   def handle_cast({:monitor_metrics, _window}, subscriber_count)
31 |       when subscriber_count == 0 do
32 |     {:noreply, [], subscriber_count}
33 |   end
34 |
35 |   def handle_cast({:monitor_metrics, window}, subscriber_count) do
36 |     {:noreply, [window], subscriber_count}
37 |   end
38 |
39 |   # Events are only produced in response to `push/2`, never on demand.
40 |   def handle_demand(_demand, subscriber_count), do: {:noreply, [], subscriber_count}
41 | end
42 |
--------------------------------------------------------------------------------
/lib/gen_stage/metric.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenStage.Metric do
2 |   @moduledoc false
3 |   alias GenMetrics.Utils.Runtime
4 |
5 |   # Divisor applied to a raw timestamp difference in `event/3`; the name
6 |   # indicates a nanosecond-to-microsecond conversion.
7 |   @nano_to_micro 1000
8 |
9 |   defstruct demand: 0, events: 0, duration: 0
10 |
11 |   # Opens a metric for `demand`. Until `event/3` completes it, the
12 |   # `duration` field temporarily carries the `start` timestamp.
13 |   def demand(demand, start), do: %__MODULE__{demand: demand, duration: start}
14 |
15 |   # Completes `partial`: records the emitted `events` and replaces the
16 |   # stored start timestamp with the elapsed time scaled by `@nano_to_micro`.
17 |   def event(partial, events, ts) do
18 |     elapsed = Runtime.safe_div(ts - partial.duration, @nano_to_micro)
19 |     %__MODULE__{partial | events: events, duration: elapsed}
20 |   end
21 |
22 |   # Folds one completed callback into `summary_paired` under `pid`.
23 |   # Each entry is `{callbacks, total_demand, total_events, total_time}`.
24 |   def pair(summary_paired, pid, events, ts, partial) do
25 |     requested = partial.demand
26 |     elapsed = ts - partial.duration
27 |     first_entry = {1, requested, events, elapsed}
28 |
29 |     Map.update(summary_paired, pid, first_entry, fn {count, dmd, evts, total} ->
30 |       {count + 1, dmd + requested, evts + events, total + elapsed}
31 |     end)
32 |   end
33 |
34 |   # Summary entry for a process with no completed callbacks.
35 |   def no_pair, do: {0, 0, 0, 0}
36 | end
37 |
--------------------------------------------------------------------------------
/bench/infinite_server.exs:
--------------------------------------------------------------------------------
1 | Code.require_file("server.exs", "./bench/support")
2 | Application.ensure_all_started(:gen_metrics)
3 |
4 | {:ok, _untraced} = UntracedServer.start_link(99999999999999)
5 | {:ok, _sampled} = SampledServer.start_link(99999999999999)
6 |
7 | alias GenMetrics.GenServer.Cluster
8 | cluster = %Cluster{name: "infinite_sampled_server",
9 | servers: [SampledServer],
10 | opts: [sample_rate: 0.1]}
11 |
12 | GenMetrics.monitor_cluster cluster
13 |
14 | :observer.start
15 |
16 | Benchee.run(%{time: 30, warmup: 5}, %{
17 | "infinite-sampled-server" => fn ->
18 | SampledServer.init_state(99999999999999)
19 | data = %{id: self(), data: String.duplicate("a", 100)}
20 | stream = Stream.cycle([data])
21 | for _ <- stream, do: SampledServer.do_call(data)
22 | receive do
23 | :benchmark_completed -> :ok
24 | end
25 | end})
26 |
--------------------------------------------------------------------------------
/lib/gen_server/summary.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenServer.Summary do
2 |
3 | @moduledoc """
4 | A struct used to report summary metrics data for a GenServer process.
5 | The numbers reported reflect totals during a given metrics collection
6 | window interval.
7 |
8 | The fields are:
9 |
10 | * `name` - the module name for the GenServer process
11 |
12 | * `pid` - the `pid` for the GenServer process
13 |
14 | * `calls` - the number of `GenServer.handle_call/3` calls
15 |
16 | * `casts` - the number of `GenServer.handle_cast/2` calls
17 |
18 | * `infos` - the number of `GenServer.handle_info/2` calls
19 |
20 | * `time_on_calls` - the number of milliseconds spent on calls
21 |
22 | * `time_on_casts` - the number of milliseconds spent on casts
23 |
24 | * `time_on_infos` - the number of milliseconds spent on infos
25 | """
26 |
27 | defstruct name: nil, pid: nil,
28 | calls: 0, casts: 0, infos: 0,
29 | time_on_calls: 0, time_on_casts: 0, time_on_infos: 0
30 |
31 | end
32 |
--------------------------------------------------------------------------------
/lib/utils/math.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.Utils.Math do
2 |   @moduledoc false
3 |
4 |   # Small numeric helpers over lists of numbers.
5 |   #
6 |   # For runtime performance reasons the functions that take a `length`
7 |   # argument require the caller to provide the number of elements in
8 |   # `data` rather than calculating it on every call.
9 |
10 |   # Sum of all values in `data`.
11 |   def sum(data), do: Enum.sum(data)
12 |
13 |   # `data` sorted in ascending order.
14 |   def sort(data), do: Enum.sort(data)
15 |
16 |   # Largest value in `data`; 0 for an empty list.
17 |   def max([]), do: 0
18 |   def max(data), do: Enum.max(data)
19 |
20 |   # Smallest value in `data`; 0 for an empty list.
21 |   def min([]), do: 0
22 |   def min(data), do: Enum.min(data)
23 |
24 |   # Arithmetic mean rounded to the nearest integer; 0 for an empty list.
25 |   def mean([], _), do: 0
26 |   def mean(data, length), do: round(sum(data) / length)
27 |
28 |   # Population variance around the (rounded) mean, rounded to the nearest
29 |   # integer; 0 for an empty list.
30 |   def variance([], _), do: 0
31 |   def variance(data, length) do
32 |     mean = mean(data, length)
33 |     round(sum(Enum.map(data, &((mean - &1) * (mean - &1)))) / length)
34 |   end
35 |
36 |   # Standard deviation, i.e. the rounded square root of `variance/2`;
37 |   # 0 for an empty list.
38 |   def stdev([], _), do: 0
39 |   def stdev(data, length) do
40 |     round(:math.sqrt(variance(data, length)))
41 |   end
42 |
43 |   # Difference between the largest and smallest value; 0 for an empty list.
44 |   # A single min/max pass replaces sorting the whole list just to read its
45 |   # first and last elements.
46 |   def range([]), do: 0
47 |   def range(data) do
48 |     {lowest, highest} = Enum.min_max(data)
49 |     highest - lowest
50 |   end
51 |
52 | end
53 |
--------------------------------------------------------------------------------
/bench/infinite_pipeline.exs:
--------------------------------------------------------------------------------
1 | Code.require_file("stages.exs", "./bench/support")
2 | Application.ensure_all_started(:gen_metrics)
3 | alias GenMetrics.GenStage.Pipeline
4 |
5 | {:ok, _sampledp} = SampledProducer.start_link()
6 | {:ok, _sampledc} = SampledConsumer.start_link()
7 |
8 | infinite_pipeline = %Pipeline{name: "infinite_pipeline",
9 | producer: [SampledProducer],
10 | consumer: [SampledConsumer],
11 | opts: [statistics: false,
12 | synchronous: true,
13 | sample_rate: 0.05]}
14 |
15 | {:ok, _imon} = GenMetrics.monitor_pipeline infinite_pipeline
16 |
17 | :observer.start
18 |
19 | Benchee.run(%{time: 30, warmup: 5}, %{
20 | "infinite-sampled-pipeline" => fn ->
21 | data = %{id: self(), data: String.duplicate("a", 100)}
22 | stream = Stream.cycle([data])
23 | for _ <- stream, do: SampledProducer.emit(data)
24 | receive do
25 | :benchmark_completed -> :ok
26 | end
27 |
28 | end})
29 |
--------------------------------------------------------------------------------
/mix.lock:
--------------------------------------------------------------------------------
1 | %{"benchee": {:hex, :benchee, "0.7.0", "98e4ed2c86b633df9b0190d6b3bf38bc2e385ba6200f68201fb575d39909816c", [:mix], [{:deep_merge, "~> 0.1", [hex: :deep_merge, optional: false]}]},
2 | "bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], []},
3 | "credo": {:hex, :credo, "0.7.3", "9827ab04002186af1aec014a811839a06f72aaae6cd5eed3919b248c8767dbf3", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, optional: false]}]},
4 | "deep_merge": {:hex, :deep_merge, "0.1.1", "c27866a7524a337b6a039eeb8dd4f17d458fd40fbbcb8c54661b71a22fffe846", [:mix], []},
5 | "earmark": {:hex, :earmark, "1.2.0", "bf1ce17aea43ab62f6943b97bd6e3dc032ce45d4f787504e3adf738e54b42f3a", [:mix], []},
6 | "ex_doc": {:hex, :ex_doc, "0.15.1", "d5f9d588fd802152516fccfdb96d6073753f77314fcfee892b15b6724ca0d596", [:mix], [{:earmark, "~> 1.1", [hex: :earmark, optional: false]}]},
7 | "ex_statsd": {:hex, :ex_statsd, "0.5.3", "e86dd97e25dbc80786e7d22b3c5537f2052a7e12daaaa7e6f2b9c34d03dbbd44", [:mix], []},
8 | "gen_stage": {:hex, :gen_stage, "0.11.0", "943bdfa85c75fa624e0a36a9d135baad20a523be040178f5a215444b45c66ea4", [:mix], []},
9 | "statix": {:hex, :statix, "1.0.0", "836c0752ad2b568dcdc9b1e67df0df91ad491ea1e19965ac219a9a0569e7e338", [:mix], []}}
10 |
--------------------------------------------------------------------------------
/lib/gen_server/cluster.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenServer.Cluster do
2 |
3 | @moduledoc """
4 | A struct used to identify one or more GenServer modules that become
5 | candidates for metrics collection.
6 |
7 | The fields are:
8 |
9 | * `name` - a `String.t` used to identify the cluster
10 |
11 | * `servers` - a list of one or more GenServer modules
12 |
13 | * `opts` - a keyword list of options that alter GenMetrics behaviour
14 | for the cluster
15 |
16 | The `name` can be used to filter metrics events from the GenMetrics
17 | reporting process as well as providing context when logging metrics data.
18 |
19 | The following `opts` are supported:
20 |
21 | * `statistics` - when `true`, statistical metrics are generated for
22 | the cluster, defaults to `false`
23 | * `window_interval` - metrics collection interval in `ms`, defaults to `1000 ms`
24 |
25 | ### Usage:
26 |
27 | Assuming your application has a `Session.Server` and a `Logging.Server`,
28 | you can activate metrics collection on both GenServers as follows:
29 |
30 | ```
31 | alias GenMetrics.GenServer.Cluster
32 | cluster = %Cluster{name: "demo", servers: [Session.Server, Logging.Server]}
33 | GenMetrics.monitor_cluster(cluster)
34 | ```
35 |
36 | The *cluster* in this context is simply a named set of one or more GenServer
37 | modules about which you would like to collect metrics data. Metrics data
38 | are collected on server processes executing on the local node.
39 | """
40 |
41 | defstruct name: nil, servers: [], opts: []
42 |
43 | end
44 |
--------------------------------------------------------------------------------
/bench/trace_server.exs:
--------------------------------------------------------------------------------
1 | Code.require_file("server.exs", "./bench/support")
2 | Application.ensure_all_started(:gen_metrics)
3 |
4 | data = Enum.map(1..500_000, fn i -> %{id: i, data: String.duplicate("a", 100)} end)
5 |
6 | {:ok, _untraced} = UntracedServer.start_link(length(data))
7 | {:ok, _traced} = TracedServer.start_link(length(data))
8 |
9 | alias GenMetrics.GenServer.Cluster
10 | traced_cluster = %Cluster{name: "traced_cluster",
11 | servers: [TracedServer],
12 | opts: [statistics: false,
13 | sample_rate: 1.0,
14 | synchronous: true]}
15 |
16 | {:ok, _tmon} = GenMetrics.monitor_cluster(traced_cluster)
17 |
18 | # :observer.start
19 |
20 | Benchee.run(%{time: 30, warmup: 5}, %{
21 | "1-untraced-server [ repeat 500k callbacks N times within ~30s ]" => fn ->
22 | UntracedServer.init_state(length(data))
23 | pid = self()
24 | for item <- data do
25 | UntracedServer.do_call(%{item | id: pid})
26 | end
27 | receive do
28 | :benchmark_completed -> :ok
29 | end
30 | IO.puts "1-untraced-server 500k callbacks completed"
31 | end,
32 | "2-traced---server [ repeat 500k callbacks N times within ~30s ]" => fn ->
33 | TracedServer.init_state(length(data))
34 | pid = self()
35 | for item <- data do
36 | TracedServer.do_call(%{item | id: pid})
37 | end
38 | receive do
39 | :benchmark_completed -> :ok
40 | end
41 | IO.puts "2-traced--server 500k callbacks completed"
42 | end
43 | })
44 |
--------------------------------------------------------------------------------
/bench/sample_server.exs:
--------------------------------------------------------------------------------
1 | Code.require_file("server.exs", "./bench/support")
2 | Application.ensure_all_started(:gen_metrics)
3 |
4 | data = Enum.map(1..500_000, fn i -> %{id: i, data: String.duplicate("a", 100)} end)
5 |
6 | {:ok, _untraced} = UntracedServer.start_link(length(data))
7 | {:ok, _sampled} = SampledServer.start_link(length(data))
8 |
9 | alias GenMetrics.GenServer.Cluster
10 | sampled_cluster = %Cluster{name: "sampled_cluster",
11 | servers: [SampledServer],
12 | opts: [statistics: false,
13 | sample_rate: 0.1,
14 | synchronous: true]}
15 |
16 | {:ok, _smon} = GenMetrics.monitor_cluster(sampled_cluster)
17 |
18 | # :observer.start
19 |
20 | Benchee.run(%{time: 30, warmup: 5}, %{
21 | "1-untraced-server [ repeat 500k callbacks N times within ~30s ]" => fn ->
22 | UntracedServer.init_state(length(data))
23 | pid = self()
24 | for item <- data do
25 | UntracedServer.do_call(%{item | id: pid})
26 | end
27 | receive do
28 | :benchmark_completed -> :ok
29 | end
30 | IO.puts "1-untraced-server 500k callbacks completed"
31 | end,
32 | "2-sampled--server [ repeat 500k callbacks N times within ~30s ]" => fn ->
33 | SampledServer.init_state(length(data))
34 | pid = self()
35 | for item <- data do
36 | SampledServer.do_call(%{item | id: pid})
37 | end
38 | receive do
39 | :benchmark_completed -> :ok
40 | end
41 | IO.puts "2-sampled--server 500k callbacks completed"
42 | end
43 | })
44 |
--------------------------------------------------------------------------------
/bench/trace_pipeline.exs:
--------------------------------------------------------------------------------
1 | Code.require_file("stages.exs", "./bench/support")
2 | Application.ensure_all_started(:gen_metrics)
3 | alias GenMetrics.GenStage.Pipeline
4 |
5 | data = Enum.map(1..500_000, fn i -> %{id: i, data: String.duplicate("a", 100)} end)
6 |
7 | {:ok, _untracedp} = UntracedProducer.start_link()
8 | {:ok, _untracedc} = UntracedConsumer.start_link()
9 | {:ok, _tracedp} = TracedProducer.start_link()
10 | {:ok, _tracedc} = TracedConsumer.start_link()
11 |
12 | traced_pipeline = %Pipeline{name: "traced_pipeline",
13 | producer: [TracedProducer],
14 | consumer: [TracedConsumer],
15 | opts: [statistics: false,
16 | synchronous: true,
17 | sample_rate: 1.0]}
18 |
19 | {:ok, _tmon} = GenMetrics.monitor_pipeline(traced_pipeline)
20 |
21 | # :observer.start
22 |
23 | Benchee.run(%{time: 30, warmup: 5}, %{
24 | "1-untraced-pipeline [ repeat 500k msgs N times within ~30s ]" => fn ->
25 | for %{id: id} = item <- data do
26 | {:ok, ^id} = UntracedProducer.emit(item)
27 | end
28 | for i <- 1..length(data) do
29 | receive do
30 | ^i -> :ok
31 | end
32 | end
33 | IO.puts "1-untraced-pipeline 500k msgs completed"
34 | end,
35 | "2-traced---pipeline [ repeat 500k msgs N times within ~30s ]" => fn ->
36 | for %{id: id} = item <- data do
37 | {:ok, ^id} = TracedProducer.emit(item)
38 | end
39 | for i <- 1..length(data) do
40 | receive do
41 | ^i -> :ok
42 | end
43 | end
44 | IO.puts "2-traced---pipeline 500k msgs completed"
45 | end
46 | })
47 |
--------------------------------------------------------------------------------
/bench/sample_pipeline.exs:
--------------------------------------------------------------------------------
1 | Code.require_file("stages.exs", "./bench/support")
2 | Application.ensure_all_started(:gen_metrics)
3 | alias GenMetrics.GenStage.Pipeline
4 |
5 | data = Enum.map(1..500_000, fn i -> %{id: i, data: String.duplicate("a", 100)} end)
6 |
7 | {:ok, _untracedp} = UntracedProducer.start_link()
8 | {:ok, _untracedc} = UntracedConsumer.start_link()
9 | {:ok, _sampledp} = SampledProducer.start_link()
10 | {:ok, _sampledc} = SampledConsumer.start_link()
11 |
12 | sampled_pipeline = %Pipeline{name: "sampled_pipeline",
13 |                              producer: [SampledProducer],
14 |                              consumer: [SampledConsumer],
15 |                              opts: [statistics: false,
16 |                                     synchronous: true,
17 |                                     sample_rate: 0.1]}
18 |
19 | {:ok, _smon} = GenMetrics.monitor_pipeline(sampled_pipeline)
20 |
21 | # :observer.start
22 |
23 | Benchee.run(%{time: 30, warmup: 5}, %{
24 | "1-untraced-pipeline [ repeat 500k msgs N times within ~30s ]" => fn ->
25 | for %{id: id} = item <- data do
26 | {:ok, ^id} = UntracedProducer.emit(item)
27 | end
28 | for i <- 1..length(data) do
29 | receive do
30 | ^i -> :ok
31 | end
32 | end
33 | IO.puts "1-untraced-pipeline 500k msgs completed"
34 | end,
35 | "2-sampled--pipeline [ repeat 500k msgs N times within ~30s ]" => fn ->
36 | for %{id: id} = item <- data do
37 | {:ok, ^id} = SampledProducer.emit(item)
38 | end
39 | for i <- 1..length(data) do
40 | receive do
41 | ^i -> :ok
42 | end
43 | end
44 | IO.puts "2-sampled--pipeline 500k msgs completed"
45 | end
46 | })
47 |
--------------------------------------------------------------------------------
/mix.exs:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.Mixfile do
2 |   use Mix.Project
3 |
4 |   # Mix project definition for the :gen_metrics library.
5 |   def project do
6 |     [app: :gen_metrics,
7 |      version: "0.3.0",
8 |      elixir: "~> 1.4",
9 |      build_embedded: Mix.env == :prod,
10 |      start_permanent: Mix.env == :prod,
11 |      description: description(),
12 |      package: package(),
13 |      deps: deps(),
14 |      aliases: aliases(),
15 |      docs: [main: "GenMetrics", source_url: "https://github.com/onetapbeyond/gen_metrics"]]
16 |   end
17 |
18 |   # Configuration for the OTP application
19 |   #
20 |   # Type "mix help compile.app" for more information
21 |   def application do
22 |     # Specify extra applications you'll use from Erlang/Elixir
23 |     [extra_applications: [:logger],
24 |      mod: {GenMetrics.Application, []}]
25 |   end
26 |
27 |   # Dependencies can be Hex packages:
28 |   #
29 |   #   {:my_dep, "~> 0.3.0"}
30 |   #
31 |   # Or git/path repositories:
32 |   #
33 |   #   {:my_dep, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"}
34 |   #
35 |   # Type "mix help deps" for more examples and options
36 |   defp deps do
37 |     [{:gen_stage, "~> 0.11"},
38 |      {:statix, ">= 0.0.0"},
39 |      {:ex_doc, "~> 0.14", only: :dev, runtime: false},
40 |      {:credo, "~> 0.7", only: [:dev, :test]},
41 |      {:benchee, "~> 0.7", only: :dev}]
42 |   end
43 |
44 |   # Shortcuts for running the benchmark scripts under ./bench.
45 |   # The `*_cluster` task names are kept so existing invocations still work;
46 |   # the GenServer benchmark scripts they run are named `*_server.exs`.
47 |   defp aliases do
48 |     [trace_cluster: "run ./bench/trace_server.exs",
49 |      sample_cluster: "run ./bench/sample_server.exs",
50 |      trace_pipeline: "run ./bench/trace_pipeline.exs",
51 |      sample_pipeline: "run ./bench/sample_pipeline.exs",
52 |      infinite_server: "run ./bench/infinite_server.exs",
53 |      infinite_pipeline: "run ./bench/infinite_pipeline.exs"]
54 |   end
55 |
56 |   # Short package description.
57 |   defp description do
58 |     """
59 |     Elixir GenServer and GenStage runtime metrics.
60 |     """
61 |   end
62 |
63 |   # Package metadata.
64 |   defp package do
65 |     [
66 |       name: :gen_metrics,
67 |       maintainers: ["David Russell"],
68 |       licenses: ["Apache 2.0"],
69 |       links: %{"GitHub" => "https://github.com/onetapbeyond/gen_metrics"}
70 |     ]
71 |   end
72 |
73 | end
74 |
--------------------------------------------------------------------------------
/lib/gen_stage/pipeline.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenStage.Pipeline do
2 |
3 |   @moduledoc """
4 |   A struct used to identify one or more GenStages that become candidates
5 |   for metrics collection.
6 |
7 |   The fields are:
8 |
9 |     * `name` - a `String.t` used to identify the pipeline
10 |
11 |     * `producer` - a list of one or more GenStage `:producer` modules
12 |
13 |     * `producer_consumer` - a list of one or more GenStage
14 |       `:producer_consumer` modules
15 |
16 |     * `consumer` - a list of one or more GenStage `:consumer` modules
17 |
18 |     * `opts` - a keyword list of options that alter GenMetrics behaviour
19 |       for the pipeline
20 |
21 |   The `name` can be used to filter metrics events from the GenMetrics
22 |   reporting process as well as providing context when logging metrics data.
23 |
24 |   The following `opts` are supported:
25 |
26 |     * `statistics` - when `true`, statistical metrics are generated,
27 |       defaults to `false`
28 |     * `window_interval` - metrics collection interval in `ms`, defaults to `1000 ms`
29 |
30 |   ### Usage
31 |
32 |   Assuming your GenStage application has a `Data.Producer`, a `Data.Scrubber`,
33 |   a `Data.Analyzer` and a `Data.Consumer` you can activate metrics collection
34 |   for the entire pipeline as follows:
35 |
36 |   ```
37 |   alias GenMetrics.GenStage.Pipeline
38 |   pipeline = %Pipeline{name: "demo",
39 |                        producer: [Data.Producer],
40 |                        producer_consumer: [Data.Scrubber, Data.Analyzer],
41 |                        consumer: [Data.Consumer]}
42 |   GenMetrics.monitor_pipeline(pipeline)
43 |   ```
44 |
45 |   Alternatively, if you only wanted to activate metrics collection for the
46 |   `:producer_consumer` stages within the pipeline you can do the following:
47 |
48 |   ```
49 |   alias GenMetrics.GenStage.Pipeline
50 |   pipeline = %Pipeline{name: "demo",
51 |                        producer_consumer: [Data.Scrubber, Data.Analyzer]}
52 |   GenMetrics.monitor_pipeline(pipeline)
53 |   ```
54 |
55 |   The *pipeline* in this context is simply a named set of one or more GenStage
56 |   modules about which you would like to collect metrics data. Metrics data are
57 |   collected on stage processes executing on the local node.
58 |   """
59 |
60 |   # Every stage list defaults to empty, so a pipeline may name any subset of stage kinds.
61 |   defstruct [
62 |     name: nil,
63 |     producer: [],
64 |     producer_consumer: [],
65 |     consumer: [],
66 |     opts: []
67 |   ]
68 | end
69 |
--------------------------------------------------------------------------------
/examples/genserver_events.exs:
--------------------------------------------------------------------------------
1 | # Usage: mix run examples/genserver_events.exs
2 | #
3 | # This basic example demonstrates the collection and
4 | # reporting of metrics data for a simple GenServer cluster.
5 | #
6 | # The sample Metrics.Consumer module simply prints the metrics
7 | # data reported by the GenMetrics library to standard out.
8 | #
9 | defmodule Demo.Server do
10 |   # Minimal GenServer monitored by this example. It keeps whatever
11 |   # state it was started with and does no real work.
12 |   use GenServer
13 |
14 |   # Starts an unnamed server whose state is an empty list.
15 |   def start_link do
16 |     GenServer.start_link(__MODULE__, [])
17 |   end
18 |
19 |   def init(initial), do: {:ok, initial}
20 |
21 |   # Every call is answered with :ok.
22 |   def handle_call(_request, _caller, state) do
23 |     {:reply, :ok, state}
24 |   end
25 |
26 |   # Casts leave the state untouched.
27 |   def handle_cast(_request, state), do: {:noreply, state}
28 |
29 |   # Plain messages leave the state untouched as well.
30 |   def handle_info(_message, state), do: {:noreply, state}
31 | end
32 |
33 | defmodule Metrics.Consumer do
34 |   # Console sink: subscribes to the GenServer reporter and prints the
35 |   # metrics window it is handed.
36 |   use GenStage
37 |
38 |   def start_link, do: GenStage.start_link(__MODULE__, [])
39 |
40 |   # Subscribes to the reporter with max_demand: 1.
41 |   def init(_args) do
42 |     subscription = {GenMetrics.GenServer.Reporter, max_demand: 1}
43 |     {:consumer, :state_does_not_matter, subscribe_to: [subscription]}
44 |   end
45 |
46 |   # Only the first window of a batch is printed.
47 |   def handle_events([window | _rest], _from, state) do
48 |     IO.puts "\n\nGenServer Cluster: #{inspect window.cluster.name}"
49 |     IO.puts "Metrics-Window: Start:=#{inspect window.start}, Duration=#{inspect window.duration}"
50 |     IO.puts "Summary Metrics"
51 |     Enum.each(window.summary, fn entry -> IO.puts "#{inspect entry}" end)
52 |     IO.puts "Statistical Metrics"
53 |     Enum.each(window.stats, fn server ->
54 |       IO.puts "Server:=#{inspect server.name} [ #{inspect server.pid} ]"
55 |       IO.puts "Calls:=#{inspect server.calls}"
56 |       IO.puts "Casts:=#{inspect server.casts}"
57 |       IO.puts "Infos:=#{inspect server.infos}"
58 |     end)
59 |     IO.puts "\n"
60 |     {:noreply, [], state}
61 |   end
62 | end
63 |
64 |
65 | #
66 | # Initialize GenMetrics Monitoring for GenServer Cluster
67 | #
68 | alias GenMetrics.GenServer.Cluster
69 |
70 | # Application.start/1 expects the OTP app name, not the callback module.
71 | {:ok, _apps} = Application.ensure_all_started(:gen_metrics)
72 | Metrics.Consumer.start_link
73 |
74 | cluster = %Cluster{name: "demo", servers: [Demo.Server],
75 |                    opts: [statistics: true,
76 |                           sample_rate: 0.95,
77 |                           window_interval: 2000,
78 |                           synchronous: true]}
79 |
80 | {:ok, _pid} = GenMetrics.monitor_cluster(cluster)
81 |
82 | #
83 | # Start Sample GenServer To Handle Events
84 | #
85 | {:ok, pid} = GenServer.start_link(Demo.Server, [])
86 | spawn fn ->
87 |   for _ <- 1..3500 do
88 |     GenServer.call(pid, :demo)
89 |     GenServer.cast(pid, :demo)
90 |     Kernel.send(pid, :demo)
91 |   end
92 | end
93 | GenServer.call(pid, :demo)
94 | Process.sleep(5000)
95 |
--------------------------------------------------------------------------------
/lib/utils/stats_push.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.Utils.StatsPush do
2 |   use Statix, runtime_config: true
3 |
4 |   @moduledoc false
5 |
6 |   # Pushes one GenServer or GenStage metric to a stats agent through the
7 |   # Statix functions (`increment`, `timing`, `histogram`) invoked on this
8 |   # module. `statsd/5` sends untagged counters and timings; `datadog/5`
9 |   # sends tagged counters and histograms at a fixed sample rate.
10 |
11 |   alias GenMetrics.GenServer
12 |   alias GenMetrics.GenStage
13 |   alias GenMetrics.Utils.Runtime
14 |
15 |   @genserver_prefix "GenMetrics.GenServer.Cluster"
16 |   @genstage_prefix "GenMetrics.GenStage.Pipeline"
17 |   @genserver_dogtag "genserver"
18 |   @genstage_dogtag "genstage"
19 |   @count ".count"
20 |   @demand ".demand"
21 |   @events ".events"
22 |   @stats ".stats"
23 |   @timing ".timing"
24 |   @total ".total"
25 |   @sample_rate 1.0
26 |
27 |   # GenServer metric: bump the per-callback counter and record the
28 |   # callback duration, converted with Runtime.nano_to_milli/1.
29 |   def statsd(context, mod, pid, fun, %GenServer.Metric{} = metric) do
30 |     base = as_label(@genserver_prefix, context, mod, pid, fun)
31 |     __MODULE__.increment(base <> @count)
32 |     __MODULE__.timing(base <> @stats, Runtime.nano_to_milli(metric.duration))
33 |   end
34 |
35 |   # GenStage metric: bump the per-stage counter, add the demand and event
36 |   # totals, and record the duration. The callback name is not used.
37 |   def statsd(context, mod, pid, _fun, %GenStage.Metric{} = metric) do
38 |     base = as_label(@genstage_prefix, context, mod, pid)
39 |     __MODULE__.increment(base <> @count)
40 |     __MODULE__.increment(base <> @demand <> @total, metric.demand)
41 |     __MODULE__.increment(base <> @events <> @total, metric.events)
42 |     __MODULE__.timing(base <> @timing, Runtime.nano_to_milli(metric.duration))
43 |   end
44 |
45 |   # Tagged variant of the GenServer push; the duration goes out as a histogram.
46 |   def datadog(context, mod, pid, fun, %GenServer.Metric{} = metric) do
47 |     base = as_label(@genserver_prefix, context, mod, pid, fun)
48 |     opts = [tags: [as_dogtag(@genserver_dogtag, context)], sample_rate: @sample_rate]
49 |     __MODULE__.increment(base <> @count, 1, opts)
50 |     __MODULE__.histogram(base <> @stats, Runtime.nano_to_milli(metric.duration), opts)
51 |   end
52 |
53 |   # Tagged variant of the GenStage push: totals as counters, plus histograms
54 |   # for demand, events and duration.
55 |   def datadog(context, mod, pid, _fun, %GenStage.Metric{} = metric) do
56 |     base = as_label(@genstage_prefix, context, mod, pid)
57 |     opts = [tags: [as_dogtag(@genstage_dogtag, context)], sample_rate: @sample_rate]
58 |     __MODULE__.increment(base <> @count, 1, opts)
59 |     __MODULE__.increment(base <> @demand <> @total, metric.demand, opts)
60 |     __MODULE__.increment(base <> @events <> @total, metric.events, opts)
61 |     __MODULE__.histogram(base <> @demand, metric.demand, opts)
62 |     __MODULE__.histogram(base <> @events, metric.events, opts)
63 |     __MODULE__.histogram(base <> @timing, Runtime.nano_to_milli(metric.duration), opts)
64 |   end
65 |
66 |   # Builds "<prefix>.<context>.<Module>" with an optional ".<fun>" suffix.
67 |   # The pid is accepted but is not part of the label (see the disabled
68 |   # as_pid_label/1 below).
69 |   defp as_label(prefix, cluster, mod, pid, fun \\ nil)
70 |   defp as_label(prefix, cluster, mod, _pid, fun) when fun in [nil, false] do
71 |     build_label([prefix, cluster, as_mod_label(mod)])
72 |   end
73 |   defp as_label(prefix, cluster, mod, _pid, fun) do
74 |     build_label([prefix, cluster, as_mod_label(mod), as_fun_label(fun)])
75 |   end
76 |
77 |   # defp as_pid_label(pid) when is_pid(pid) do
78 |   #   Regex.replace(~r/\D/, "#{inspect pid}", "")
79 |   # end
80 |
81 |   # Last segment of the module name, e.g. Data.Producer -> "Producer".
82 |   defp as_mod_label(mod) when is_atom(mod) do
83 |     mod |> inspect() |> String.split(".") |> List.last()
84 |   end
85 |
86 |   defp as_fun_label(fun) when is_atom(fun), do: Atom.to_string(fun)
87 |   defp as_fun_label(fun), do: fun
88 |
89 |   # Joins fragments with "." and collapses any run of dots (left behind by
90 |   # empty or nil fragments) into a single separator.
91 |   defp build_label(fragments) do
92 |     label = Enum.join(fragments, ".")
93 |     Regex.replace(~r/\.{2,}/, label, ".")
94 |   end
95 |
96 |   defp as_dogtag(prefix, context), do: build_label([prefix, context])
97 |
98 | end
99 |
--------------------------------------------------------------------------------
/examples/genstage_producer_consumer.exs:
--------------------------------------------------------------------------------
1 | # Usage: mix run examples/genstage_producer_consumer.exs
2 | #
3 | # Hit Ctrl+C twice to stop it.
4 | #
5 | # This basic example demonstrates the collection and
6 | # reporting of metrics data for a simple GenStage pipeline.
7 | #
8 | # The sample Metrics.Consumer module simply prints the metrics
9 | # data reported by the GenMetrics library to standard out.
10 | #
11 | # The simple GenStage pipeline used in this example is a copy
12 | # of the ProducerConsumer example pipeline found in the
13 | # GenStage project repository:
14 | #
15 | # https://github.com/elixir-lang/gen_stage.
16 | #
17 | defmodule A do
18 |   # Producer that emits consecutive integers starting at its initial counter.
19 |   use GenStage
20 |
21 |   def init(counter), do: {:producer, counter}
22 |
23 |   # Emits `demand` integers and moves the counter past them.
24 |   def handle_demand(demand, counter) when demand > 0 do
25 |     next = counter + demand
26 |     {:noreply, Enum.to_list(counter..(next - 1)), next}
27 |   end
28 | end
29 |
30 | defmodule B do
31 |   # Producer-consumer that expands every incoming integer `event`
32 |   # into the run event..event+number.
33 |   use GenStage
34 |
35 |   def init(number), do: {:producer_consumer, number}
36 |
37 |   def handle_events(events, _from, number) do
38 |     expanded =
39 |       Enum.flat_map(events, fn event ->
40 |         Enum.to_list(event..(event + number))
41 |       end)
42 |     {:noreply, expanded, number}
43 |   end
44 | end
45 |
46 | defmodule C do
47 |   # Terminal consumer: emits nothing downstream.
48 |   use GenStage
49 |
50 |   def init(:ok), do: {:consumer, :the_state_does_not_matter}
51 |
52 |   def handle_events(_batch, _from, state) do
53 |     # Pause for one second before acknowledging the batch.
54 |     Process.sleep(1000)
55 |     {:noreply, [], state}
56 |   end
57 | end
58 |
59 | defmodule Metrics.Consumer do
60 |   # Console sink: subscribes to the GenStage reporter and prints the
61 |   # metrics window it is handed.
62 |   use GenStage
63 |
64 |   def start_link, do: GenStage.start_link(__MODULE__, [])
65 |
66 |   # Subscribes to the reporter with max_demand: 1.
67 |   def init(_args) do
68 |     subscription = {GenMetrics.GenStage.Reporter, max_demand: 1}
69 |     {:consumer, :state_does_not_matter, subscribe_to: [subscription]}
70 |   end
71 |
72 |   # Only the first window of a batch is printed.
73 |   def handle_events([window | _rest], _from, state) do
74 |     IO.puts "\n\nGenStage Pipeline: #{inspect window.pipeline.name}"
75 |     IO.puts "Metrics-Window: Start:=#{inspect window.start}, Duration=#{inspect window.duration}"
76 |     IO.puts "Summary Metrics"
77 |     Enum.each(window.summary, fn entry -> IO.puts "#{inspect entry}" end)
78 |     IO.puts "Statistical Metrics"
79 |     Enum.each(window.stats, fn stage ->
80 |       IO.puts "Stage:=#{inspect stage.name} [ #{inspect stage.pid} ]"
81 |       IO.puts "Demand:=#{inspect stage.demand}"
82 |       IO.puts "Events:=#{inspect stage.events}"
83 |       IO.puts "Timings:=#{inspect stage.timings}"
84 |     end)
85 |     IO.puts "\n"
86 |     {:noreply, [], state}
87 |   end
88 | end
89 |
90 | #
91 | # Initialize GenMetrics Monitoring for GenStage Pipeline
92 | #
93 | alias GenMetrics.GenStage.Pipeline
94 |
95 | # Application.start/1 expects the OTP app name, not the callback module.
96 | {:ok, _apps} = Application.ensure_all_started(:gen_metrics)
97 | Metrics.Consumer.start_link
98 |
99 | pipeline = %Pipeline{name: "demo", producer: [A],
100 |                      producer_consumer: [B],
101 |                      consumer: [C],
102 |                      opts: [statistics: true, window_interval: 3000]}
103 |
104 | {:ok, _pid} = GenMetrics.monitor_pipeline(pipeline)
105 |
106 | #
107 | # Start Sample GenStage ProducerConsumer Pipeline
108 | #
109 | {:ok, a} = GenStage.start_link(A, 0) # starting from zero
110 | {:ok, b} = GenStage.start_link(B, 2) # expand by 2
111 | {:ok, c} = GenStage.start_link(C, :ok) # state does not matter
112 |
113 | GenStage.sync_subscribe(b, to: a)
114 | GenStage.sync_subscribe(c, to: b)
115 | Process.sleep(:infinity)
116 |
--------------------------------------------------------------------------------
/bench/support/server.exs:
--------------------------------------------------------------------------------
1 | defmodule UntracedServer do
2 |   # Benchmark GenServer registered under its own module name.
3 |   #
4 |   # State is {target, count}. Each :do_call / :do_cast / :do_info message
5 |   # bumps `count`; once `count` has reached `target` the process given as
6 |   # `id` in the message is sent :benchmark_completed.
7 |   use GenServer
8 |
9 |   def start_link(target) do
10 |     GenServer.start_link(__MODULE__, target, name: __MODULE__)
11 |   end
12 |
13 |   def init(target), do: {:ok, {target, 1}}
14 |
15 |   def init_state(target), do: GenServer.call(__MODULE__, {:init_state, target})
16 |
17 |   def do_call(item), do: GenServer.call(__MODULE__, {:do_call, item})
18 |
19 |   def do_cast(item), do: GenServer.cast(__MODULE__, {:do_cast, item})
20 |
21 |   def do_info(item), do: send(__MODULE__, {:do_info, item})
22 |
23 |   # Restarts counting from 1 with a new target.
24 |   def handle_call({:init_state, target}, _from, _) do
25 |     {:reply, :ok, {target, 1}}
26 |   end
27 |   def handle_call({:do_call, %{id: id}}, _from, {target, count}) do
28 |     if count >= target, do: send(id, :benchmark_completed)
29 |     {:reply, {:ok, id}, {target, count + 1}}
30 |   end
31 |
32 |   # Uses the same `count >= target` check as calls and infos so that all
33 |   # three message kinds complete after the same number of messages.
34 |   def handle_cast({:do_cast, %{id: id}}, {target, count}) do
35 |     if count >= target, do: send(id, :benchmark_completed)
36 |     {:noreply, {target, count + 1}}
37 |   end
38 |
39 |   def handle_info({:do_info, %{id: id}}, {target, count}) do
40 |     if count >= target, do: send(id, :benchmark_completed)
41 |     {:noreply, {target, count + 1}}
42 |   end
43 |
44 | end
45 |
46 | defmodule TracedServer do
47 |   # Benchmark GenServer registered under its own module name.
48 |   #
49 |   # State is {target, count}; see advance/2 for how each benchmark
50 |   # message moves the counter and reports completion.
51 |   use GenServer
52 |
53 |   def start_link(target) do
54 |     GenServer.start_link(__MODULE__, target, name: __MODULE__)
55 |   end
56 |
57 |   def init(target), do: {:ok, {target, 1}}
58 |
59 |   # Client helpers addressing the registered process.
60 |   def init_state(target), do: GenServer.call(__MODULE__, {:init_state, target})
61 |
62 |   def do_call(item), do: GenServer.call(__MODULE__, {:do_call, item})
63 |
64 |   def do_cast(item), do: GenServer.cast(__MODULE__, {:do_cast, item})
65 |
66 |   def do_info(item), do: send(__MODULE__, {:do_info, item})
67 |
68 |   # Restarts counting from 1 with a new target.
69 |   def handle_call({:init_state, target}, _from, _state) do
70 |     {:reply, :ok, {target, 1}}
71 |   end
72 |
73 |   def handle_call({:do_call, %{id: id}}, _from, {_, _} = state) do
74 |     {:reply, {:ok, id}, advance(id, state)}
75 |   end
76 |
77 |   def handle_cast({:do_cast, %{id: id}}, {_, _} = state) do
78 |     {:noreply, advance(id, state)}
79 |   end
80 |
81 |   def handle_info({:do_info, %{id: id}}, {_, _} = state) do
82 |     {:noreply, advance(id, state)}
83 |   end
84 |
85 |   # Sends :benchmark_completed to `id` once the counter has reached the
86 |   # target, then returns the state with the counter incremented.
87 |   defp advance(id, {target, count}) do
88 |     if count >= target, do: send(id, :benchmark_completed)
89 |     {target, count + 1}
90 |   end
91 | end
92 |
93 | defmodule SampledServer do
94 |   # Benchmark GenServer registered under its own module name.
95 |   #
96 |   # State is {target, count}; see advance/2 for how each benchmark
97 |   # message moves the counter and reports completion.
98 |   use GenServer
99 |
100 |   def start_link(target) do
101 |     GenServer.start_link(__MODULE__, target, name: __MODULE__)
102 |   end
103 |
104 |   def init(target), do: {:ok, {target, 1}}
105 |
106 |   # Client helpers addressing the registered process.
107 |   def init_state(target), do: GenServer.call(__MODULE__, {:init_state, target})
108 |
109 |   def do_call(item), do: GenServer.call(__MODULE__, {:do_call, item})
110 |
111 |   def do_cast(item), do: GenServer.cast(__MODULE__, {:do_cast, item})
112 |
113 |   def do_info(item), do: send(__MODULE__, {:do_info, item})
114 |
115 |   # Restarts counting from 1 with a new target.
116 |   def handle_call({:init_state, target}, _from, _state) do
117 |     {:reply, :ok, {target, 1}}
118 |   end
119 |
120 |   def handle_call({:do_call, %{id: id}}, _from, {_, _} = state) do
121 |     {:reply, {:ok, id}, advance(id, state)}
122 |   end
123 |
124 |   def handle_cast({:do_cast, %{id: id}}, {_, _} = state) do
125 |     {:noreply, advance(id, state)}
126 |   end
127 |
128 |   def handle_info({:do_info, %{id: id}}, {_, _} = state) do
129 |     {:noreply, advance(id, state)}
130 |   end
131 |
132 |   # Sends :benchmark_completed to `id` once the counter has reached the
133 |   # target, then returns the state with the counter incremented.
134 |   defp advance(id, {target, count}) do
135 |     if count >= target, do: send(id, :benchmark_completed)
136 |     {target, count + 1}
137 |   end
138 | end
139 |
--------------------------------------------------------------------------------
/lib/utils/runtime.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.Utils.Runtime do
2 |
3 |   @window_interval_default 1000
4 |   @sample_rate_default 1.0
5 |   @sample_rate_max 0.9
6 |
7 |   @moduledoc false
8 |
9 |   @doc """
10 |   Verify modules are compiled and loaded.
11 |
12 |   Returns an empty list if all modules are
13 |   successfully compiled and loaded.
14 |
15 |   Returns a non-empty list of error messages describing
16 |   each module that fails to compile or load.
17 |   """
18 |   @spec require_modules([module]) :: [String.t]
19 |   def require_modules(module_list) do
20 |     module_list
21 |     |> Enum.uniq
22 |     |> Enum.reduce([], fn(module, acc) ->
23 |       # Code.ensure_compiled/1 answers the availability question directly,
24 |       # without evaluating a generated `require` source string.
25 |       if is_atom(module) and match?({:module, _}, Code.ensure_compiled(module)) do
26 |         acc
27 |       else
28 |         ["Module #{inspect module} not loaded and could not be found." | acc]
29 |       end
30 |     end)
31 |   end
32 |
33 |   @doc """
34 |   Verify modules implement a required behaviour.
35 |
36 |   Returns an empty list if all modules successfully
37 |   implement the required behaviour.
38 |
39 |   Returns a non-empty list of error messages describing
40 |   each module that fails to implement the required behaviour.
41 |   """
42 |   @spec require_behaviour([module], module) :: [String.t]
43 |   def require_behaviour(module_list, behaviour) do
44 |     module_list
45 |     |> Enum.uniq
46 |     |> Enum.reduce([], fn(module, acc) ->
47 |       if behaviour in declared_behaviours(module) do
48 |         acc
49 |       else
50 |         ["Module #{inspect module} does not implement #{inspect behaviour}." | acc]
51 |       end
52 |     end)
53 |   end
54 |
55 |   # Collects every behaviour a module declares. A module may carry several
56 |   # :behaviour attribute entries, so all of them are gathered rather than
57 |   # only the first. Anything that is not a loadable module declares none.
58 |   defp declared_behaviours(module) when is_atom(module) do
59 |     if Code.ensure_loaded?(module) do
60 |       :attributes
61 |       |> module.module_info()
62 |       |> Keyword.take([:behaviour, :behavior])
63 |       |> Keyword.values()
64 |       |> List.flatten()
65 |     else
66 |       []
67 |     end
68 |   end
69 |   defp declared_behaviours(_), do: []
70 |
71 |   # Return interval for monitor window rollover.
72 |   def window_interval(monitor) do
73 |     round(monitor.opts[:window_interval] || @window_interval_default)
74 |   end
75 |
76 |   # Return interval for sampling within window.
77 |   def sample_interval(monitor) do
78 |     window_interval =
79 |       monitor.opts[:window_interval] || @window_interval_default
80 |     sample_interval = round(window_interval * sample_rate(monitor))
81 |     if sample_interval == window_interval do
82 |       # adjust sample interval to fit inside window_interval
83 |       round(sample_interval * 0.90)
84 |     else
85 |       sample_interval
86 |     end
87 |   end
88 |
89 |   # Return active metrics sampling rate.
90 |   def sample_rate(monitor) do
91 |     if sampling?(monitor) do
92 |       # Enforce upper limit on sampling rate. Rate must
93 |       # be either 1.0 (no sampling) or <= 0.9.
94 |       min(monitor.opts[:sample_rate], @sample_rate_max)
95 |     else
96 |       @sample_rate_default
97 |     end
98 |   end
99 |
100 |   # Return true if sampling rate below 1.0 in use.
101 |   def sampling?(monitor) do
102 |     sample_rate = monitor.opts[:sample_rate]
103 |     not (sample_rate == nil or sample_rate == 1.0)
104 |   end
105 |
106 |   # Return true if monitor is required to generate optional statistics.
107 |   def statistics?(monitor), do: monitor.opts[:statistics] || false
108 |
109 |   # Return true if monitor is required to trace synchronous calls.
110 |   # Defaults to true, but an explicit `synchronous: false` is honoured
111 |   # (`opts[:synchronous] || true` could never return false).
112 |   def synchronous?(monitor) do
113 |     case monitor.opts[:synchronous] do
114 |       nil -> true
115 |       value -> value
116 |     end
117 |   end
118 |
119 |   # Integer division that yields 0 instead of raising when either the
120 |   # numerator or the divisor is 0.
121 |   def safe_div(0, _), do: 0
122 |   def safe_div(_, 0), do: 0
123 |   def safe_div(num, d), do: div(num, d)
124 |
125 |   # Unit conversions; results are truncated by integer division.
126 |   def micro_to_milli(micro), do: safe_div(micro, 1000)
127 |
128 |   def nano_to_micro(nano), do: safe_div(nano, 1000)
129 |
130 |   def nano_to_milli(nano), do: safe_div(nano, 1_000_000)
131 |
132 | end
133 |
--------------------------------------------------------------------------------
/examples/genstage_rate_limiter.exs:
--------------------------------------------------------------------------------
1 | # Usage: mix run examples/genstage_rate_limiter.exs
2 | #
3 | # Hit Ctrl+C twice to stop it.
4 | #
5 | # This example demonstrates the collection and reporting of
6 | # metrics data for a GenStage pipeline implemented to enforce
7 | # rate limiting work on a consumer.
8 | #
9 | # The sample Metrics.Consumer module simply prints the metrics
10 | # data reported by the GenMetrics library to standard out.
11 | #
12 | # The GenStage pipeline used in this example is a copy of the
13 | # RateLimiter example pipeline found in the GenStage project repo:
14 | #
15 | # https://github.com/elixir-lang/gen_stage.
16 | #
17 | defmodule Producer do
18 |   # Producer that emits consecutive integers starting at its initial counter.
19 |   use GenStage
20 |
21 |   def init(counter), do: {:producer, counter}
22 |
23 |   # Emits `demand` integers and moves the counter past them.
24 |   def handle_demand(demand, counter) when demand > 0 do
25 |     next = counter + demand
26 |     {:noreply, Enum.to_list(counter..(next - 1)), next}
27 |   end
28 | end
29 |
30 | defmodule RateLimiter do
31 |   # Manual-demand consumer. State maps each subscribed producer's `from`
32 |   # to {pending, interval}: `pending` is the demand to ask for at the
33 |   # next tick, `interval` the delay in ms between ticks.
34 |   use GenStage
35 |
36 |   def init(_), do: {:consumer, %{}}
37 |
38 |   # Registers the producer and immediately asks for its first batch.
39 |   def handle_subscribe(:producer, opts, from, producers) do
40 |     pending = opts[:max_demand] || 1000
41 |     interval = opts[:interval] || 5000
42 |     registered =
43 |       producers
44 |       |> Map.put(from, {pending, interval})
45 |       |> ask_and_schedule(from)
46 |     {:manual, registered}
47 |   end
48 |
49 |   def handle_cancel(_, from, producers), do: {:noreply, [], Map.delete(producers, from)}
50 |
51 |   # Handled events are credited back as demand for the next tick.
52 |   def handle_events(events, from, producers) do
53 |     credit = fn {pending, interval} -> {pending + length(events), interval} end
54 |     {:noreply, [], Map.update!(producers, from, credit)}
55 |   end
56 |
57 |   def handle_info({:ask, from}, producers) do
58 |     {:noreply, [], ask_and_schedule(producers, from)}
59 |   end
60 |
61 |   defp ask_and_schedule(producers, from) do
62 |     case Map.fetch(producers, from) do
63 |       {:ok, {pending, interval}} ->
64 |         GenStage.ask(from, pending)
65 |         Process.send_after(self(), {:ask, from}, interval)
66 |         Map.put(producers, from, {0, interval})
67 |       _ -> producers
68 |     end
69 |   end
70 | end
71 |
72 | defmodule Metrics.Consumer do
73 |   # Console sink: subscribes to the GenStage reporter and prints the
74 |   # metrics window it is handed.
75 |   use GenStage
76 |
77 |   def start_link, do: GenStage.start_link(__MODULE__, [])
78 |
79 |   # Subscribes to the reporter with max_demand: 1.
80 |   def init(_args) do
81 |     subscription = {GenMetrics.GenStage.Reporter, max_demand: 1}
82 |     {:consumer, :state_does_not_matter, subscribe_to: [subscription]}
83 |   end
84 |
85 |   # Only the first window of a batch is printed.
86 |   def handle_events([window | _rest], _from, state) do
87 |     IO.puts "\n\nGenStage Pipeline: #{inspect window.pipeline.name}"
88 |     IO.puts "Metrics-Window: Start:=#{inspect window.start},Duration=#{inspect window.duration}"
89 |     IO.puts "Summary Metrics"
90 |     Enum.each(window.summary, fn entry -> IO.puts "#{inspect entry}" end)
91 |     IO.puts "Statistical Metrics"
92 |     Enum.each(window.stats, fn stage ->
93 |       IO.puts "Stage:=#{inspect stage.name} [ #{inspect stage.pid} ]"
94 |       IO.puts "Demand:=#{inspect stage.demand}"
95 |       IO.puts "Events:=#{inspect stage.events}"
96 |       IO.puts "Timings:=#{inspect stage.timings}"
97 |     end)
98 |     IO.puts "\n"
99 |     {:noreply, [], state}
100 |   end
101 | end
102 |
103 | #
104 | # Initialize GenMetrics Monitoring for GenStage Pipeline
105 | #
106 | alias GenMetrics.GenStage.Pipeline
107 |
108 | # Application.start/1 expects the OTP app name, not the callback module.
109 | {:ok, _apps} = Application.ensure_all_started(:gen_metrics)
110 | Metrics.Consumer.start_link
111 |
112 | pipeline = %Pipeline{name: "demo", producer: [Producer],
113 |                      consumer: [RateLimiter],
114 |                      opts: [statistics: true, window_interval: 2000]}
115 |
116 | {:ok, _pid} = GenMetrics.monitor_pipeline(pipeline)
117 |
118 | #
119 | # Start Sample GenStage RateLimiter Pipeline
120 | #
121 | {:ok, a} = GenStage.start_link(Producer, 0) # starting from zero
122 | {:ok, b} = GenStage.start_link(RateLimiter, :ok) # init/1 ignores its argument
123 | GenStage.sync_subscribe(b, to: a, max_demand: 10, interval: 2000)
124 | Process.sleep(:infinity)
125 |
--------------------------------------------------------------------------------
/examples/genstage_gen_event.exs:
--------------------------------------------------------------------------------
1 | # Usage: mix run examples/genstage_gen_event.exs
2 | #
3 | # This example demonstrates the collection and reporting of
4 | # metrics data for a GenStage pipeline implemented as a
5 | # replacement for GenEvent.
6 | #
7 | # The sample Metrics.Consumer module simply prints the metrics
8 | # data reported by the GenMetrics library to standard out.
9 | #
10 | # The GenStage pipeline used in this example is a copy of the
11 | # GenEvent example pipeline found in the GenStage project repo:
12 | #
13 | # https://github.com/elixir-lang/gen_stage.
14 | #
15 | defmodule Broadcaster do
16 |   # Producer that queues {from, event} pairs until demand arrives; state is {queue, demand}.
17 |   use GenStage
18 |
19 |   def start_link(), do: GenStage.start_link(__MODULE__, :ok, name: __MODULE__)
20 |
21 |   # The caller is replied to with :ok once its event has been dispatched.
22 |   def sync_notify(event, timeout \\ 5000) do
23 |     GenStage.call(__MODULE__, {:notify, event}, timeout)
24 |   end
25 |
26 |   def init(:ok) do
27 |     {:producer, {:queue.new, 0}, dispatcher: GenStage.BroadcastDispatcher}
28 |   end
29 |
30 |   def handle_call({:notify, event}, from, {queue, demand}) do
31 |     {from, event} |> :queue.in(queue) |> dispatch_events(demand, [])
32 |   end
33 |
34 |   def handle_demand(incoming_demand, {queue, demand}) do
35 |     dispatch_events(queue, incoming_demand + demand, [])
36 |   end
37 |
38 |   # Pops one queued event per unit of demand, replying :ok to its caller.
39 |   defp dispatch_events(queue, demand, events) do
40 |     case demand > 0 && :queue.out(queue) do
41 |       {{:value, {from, event}}, rest} ->
42 |         GenStage.reply(from, :ok)
43 |         dispatch_events(rest, demand - 1, [event | events])
44 |       _ -> {:noreply, Enum.reverse(events), {queue, demand}}
45 |     end
46 |   end
47 | end
48 |
49 | defmodule Consumer do
50 |   # Subscribes to Broadcaster on start and emits nothing for any batch.
51 |   use GenStage
52 |
53 |   def start_link(), do: GenStage.start_link(__MODULE__, :ok)
54 |
55 |   def init(:ok) do
56 |     subscriptions = [Broadcaster]
57 |     {:consumer, :ok, subscribe_to: subscriptions}
58 |   end
59 |
60 |   def handle_events(_batch, _from, state) do
61 |     {:noreply, [], state}
62 |   end
63 | end
64 |
65 | defmodule App do
66 |   # Supervises one Broadcaster and four Consumer workers (ids 1..4).
67 |   def start do
68 |     alias Supervisor.Spec
69 |
70 |     consumers =
71 |       for id <- 1..4 do
72 |         Spec.worker(Consumer, [], id: id)
73 |       end
74 |
75 |     broadcaster = Spec.worker(Broadcaster, [])
76 |     children = [broadcaster | consumers]
77 |
78 |     Supervisor.start_link(children, strategy: :one_for_one)
79 |   end
80 | end
81 |
82 | defmodule Metrics.Consumer do
83 |   # Console sink: subscribes to the GenStage reporter and prints the
84 |   # metrics window it is handed.
85 |   use GenStage
86 |
87 |   def start_link, do: GenStage.start_link(__MODULE__, [])
88 |
89 |   # Subscribes to the reporter with max_demand: 1.
90 |   def init(_args) do
91 |     subscription = {GenMetrics.GenStage.Reporter, max_demand: 1}
92 |     {:consumer, :state_does_not_matter, subscribe_to: [subscription]}
93 |   end
94 |
95 |   # Only the first window of a batch is printed.
96 |   def handle_events([window | _rest], _from, state) do
97 |     IO.puts "\n\nGenStage Pipeline: #{inspect window.pipeline.name}"
98 |     IO.puts "Metrics-Window: Start:=#{inspect window.start},Duration=#{inspect window.duration}"
99 |     IO.puts "Summary Metrics"
100 |     Enum.each(window.summary, fn entry -> IO.puts "#{inspect entry}" end)
101 |     IO.puts "Statistical Metrics"
102 |     Enum.each(window.stats, fn stage ->
103 |       IO.puts "Stage:=#{inspect stage.name} [ #{inspect stage.pid} ]"
104 |       IO.puts "Demand:=#{inspect stage.demand}"
105 |       IO.puts "Events:=#{inspect stage.events}"
106 |       IO.puts "Timings:=#{inspect stage.timings}"
107 |     end)
108 |     IO.puts "\n"
109 |     {:noreply, [], state}
110 |   end
111 | end
112 |
113 | #
114 | # Initialize GenMetrics Monitoring for GenStage Pipeline
115 | #
116 | alias GenMetrics.GenStage.Pipeline
117 |
118 | # Application.start/1 expects the OTP app name, not the callback module.
119 | {:ok, _apps} = Application.ensure_all_started(:gen_metrics)
120 | Metrics.Consumer.start_link
121 |
122 | pipeline = %Pipeline{name: "demo", producer: [Broadcaster],
123 |                      consumer: [Consumer],
124 |                      opts: [statistics: true]}
125 |
126 | {:ok, _pid} = GenMetrics.monitor_pipeline(pipeline)
127 |
128 | #
129 | # Start Sample GenStage GenEvent-Replacement Pipeline
130 | #
131 | App.start
132 | Broadcaster.sync_notify(1)
133 | Broadcaster.sync_notify(2)
134 | Broadcaster.sync_notify(3)
135 | Broadcaster.sync_notify(4)
136 | Broadcaster.sync_notify(5)
137 | Process.sleep(2000)
138 |
--------------------------------------------------------------------------------
/bench/support/stages.exs:
--------------------------------------------------------------------------------
1 | defmodule UntracedProducer do
2 |   # Benchmark producer. Emitted items wait in a queue, tagged with the
3 |   # calling process' pid, until demand is available.
4 |   # State is {queue, pending_demand}.
5 |   use GenStage
6 |
7 |   def start_link, do: GenStage.start_link(__MODULE__, [], name: __MODULE__)
8 |
9 |   def init(_), do: {:producer, {:queue.new, 0}}
10 |
11 |   # The caller gets {:ok, item.id} back once the item is dispatched.
12 |   def emit(item), do: GenStage.call(__MODULE__, {:emit, item})
13 |
14 |   def handle_call({:emit, item}, {pid, _ref} = from, {queue, demand}) do
15 |     event = Map.put(item, :pid, pid)
16 |     {from, event} |> :queue.in(queue) |> dispatch_events(demand, [])
17 |   end
18 |
19 |   def handle_demand(incoming_demand, {queue, demand}) do
20 |     dispatch_events(queue, incoming_demand + demand, [])
21 |   end
22 |
23 |   defp dispatch_events(queue, demand, events) do
24 |     case demand > 0 && :queue.out(queue) do
25 |       {{:value, {from, event}}, rest} ->
26 |         GenStage.reply(from, {:ok, event.id})
27 |         dispatch_events(rest, demand - 1, [event | events])
28 |       _ ->
29 |         {:noreply, Enum.reverse(events), {queue, demand}}
30 |     end
31 |   end
32 | end
33 |
34 | defmodule UntracedConsumer do
35 |   # Benchmark consumer subscribed to UntracedProducer. It sends an
36 |   # event's `id` to the `pid` stored in that event.
37 |   use GenStage
38 |
39 |   def start_link, do: GenStage.start_link(__MODULE__, [], name: __MODULE__)
40 |
41 |   def init(_), do: {:consumer, nil, subscribe_to: [{UntracedProducer, max_demand: 1}]}
42 |
43 |   # Only the first event of a batch is inspected.
44 |   def handle_events([%{id: id, pid: pid} | _], _from, state) do
45 |     send(pid, id)
46 |     {:noreply, [], state}
47 |   end
48 | end
49 |
50 | defmodule TracedProducer do
51 |   # Benchmark producer. Emitted items wait in a queue, tagged with the
52 |   # calling process' pid, until demand is available.
53 |   # State is {queue, pending_demand}.
54 |   use GenStage
55 |
56 |   def start_link, do: GenStage.start_link(__MODULE__, [], name: __MODULE__)
57 |
58 |   def init(_), do: {:producer, {:queue.new, 0}}
59 |
60 |   # The caller gets {:ok, item.id} back once the item is dispatched.
61 |   def emit(item), do: GenStage.call(__MODULE__, {:emit, item})
62 |
63 |   def handle_call({:emit, item}, {pid, _ref} = from, {queue, demand}) do
64 |     event = Map.put(item, :pid, pid)
65 |     {from, event} |> :queue.in(queue) |> dispatch_events(demand, [])
66 |   end
67 |
68 |   def handle_demand(incoming_demand, {queue, demand}) do
69 |     dispatch_events(queue, incoming_demand + demand, [])
70 |   end
71 |
72 |   defp dispatch_events(queue, demand, events) do
73 |     case demand > 0 && :queue.out(queue) do
74 |       {{:value, {from, event}}, rest} ->
75 |         GenStage.reply(from, {:ok, event.id})
76 |         dispatch_events(rest, demand - 1, [event | events])
77 |       _ ->
78 |         {:noreply, Enum.reverse(events), {queue, demand}}
79 |     end
80 |   end
81 | end
82 |
83 | defmodule TracedConsumer do
84 |   # Benchmark consumer subscribed to TracedProducer. It sends an
85 |   # event's `id` to the `pid` stored in that event.
86 |   use GenStage
87 |
88 |   def start_link, do: GenStage.start_link(__MODULE__, [], name: __MODULE__)
89 |
90 |   def init(_), do: {:consumer, nil, subscribe_to: [{TracedProducer, max_demand: 1}]}
91 |
92 |   # Only the first event of a batch is inspected.
93 |   def handle_events([%{id: id, pid: pid} | _], _from, state) do
94 |     send(pid, id)
95 |     {:noreply, [], state}
96 |   end
97 | end
98 |
99 | defmodule SampledProducer do
100 |   # Benchmark producer. Emitted items wait in a queue, tagged with the
101 |   # calling process' pid, until demand is available.
102 |   # State is {queue, pending_demand}.
103 |   use GenStage
104 |
105 |   def start_link, do: GenStage.start_link(__MODULE__, [], name: __MODULE__)
106 |
107 |   def init(_), do: {:producer, {:queue.new, 0}}
108 |
109 |   # The caller gets {:ok, item.id} back once the item is dispatched.
110 |   def emit(item), do: GenStage.call(__MODULE__, {:emit, item})
111 |
112 |   def handle_call({:emit, item}, {pid, _ref} = from, {queue, demand}) do
113 |     event = Map.put(item, :pid, pid)
114 |     {from, event} |> :queue.in(queue) |> dispatch_events(demand, [])
115 |   end
116 |
117 |   def handle_demand(incoming_demand, {queue, demand}) do
118 |     dispatch_events(queue, incoming_demand + demand, [])
119 |   end
120 |
121 |   defp dispatch_events(queue, demand, events) do
122 |     case demand > 0 && :queue.out(queue) do
123 |       {{:value, {from, event}}, rest} ->
124 |         GenStage.reply(from, {:ok, event.id})
125 |         dispatch_events(rest, demand - 1, [event | events])
126 |       _ ->
127 |         {:noreply, Enum.reverse(events), {queue, demand}}
128 |     end
129 |   end
130 | end
131 |
132 | defmodule SampledConsumer do
133 |   # Benchmark consumer subscribed to SampledProducer. It sends an
134 |   # event's `id` to the `pid` stored in that event.
135 |   use GenStage
136 |
137 |   def start_link, do: GenStage.start_link(__MODULE__, [], name: __MODULE__)
138 |
139 |   def init(_), do: {:consumer, nil, subscribe_to: [{SampledProducer, max_demand: 1}]}
140 |
141 |   # Only the first event of a batch is inspected.
142 |   def handle_events([%{id: id, pid: pid} | _], _from, state) do
143 |     send(pid, id)
144 |     {:noreply, [], state}
145 |   end
146 | end
147 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [GitPitch](https://gitpitch.com/onetapbeyond/gen_metrics)
2 | [Hex.pm](https://hex.pm/packages/gen_metrics)
3 |
4 | # GenMetrics
5 |
6 | Runtime metrics for GenServer and GenStage applications.
7 |
8 | > Important! The GenMetrics library is not suitable for use within long-running production environments. For further details, see the [benchmarks performance guide](bench/README.md).
9 |
10 | This library supports the collection and publication of GenServer and GenStage runtime metrics. Metrics data are generated by an introspection agent. No instrumentation is required within the GenServer or GenStage library or within your application source code.
11 |
12 | By default, metrics are published by a dedicated GenMetrics reporting process. Any application can subscribe to this process in order to handle metrics data at runtime. Metrics data can also be pushed directly to a `statsd` agent, which makes it possible to analyze and visualize the metrics within existing tools and services like `Grafana` and `Datadog`.
13 |
14 | ## Quick Look: GenServer Metrics
15 |
16 | Given an application with the following GenServers: `Session.Server`, `Logging.Server`, activate metrics collection for the server cluster as follows:
17 |
18 | ```elixir
19 | alias GenMetrics.GenServer.Cluster
20 | cluster = %Cluster{name: "demo",
21 | servers: [Session.Server, Logging.Server],
22 | opts: [window_interval: 5000]}
23 | GenMetrics.monitor_cluster(cluster)
24 | ```
25 |
26 | Metrics are published by a dedicated GenMetrics reporting process. Any application can subscribe to this process in order to receive metrics data. Sample summary metrics data for a GenServer process looks as follows:
27 |
28 | ```
29 | # Server Name: Demo.Server, PID<0.176.0>
30 |
31 | %GenMetrics.GenServer.Summary{name: Demo.Server,
32 | pid: #PID<0.176.0>,
33 | calls: 8000,
34 | casts: 34500,
35 | infos: 3333,
36 | time_on_calls: 28,
37 | time_on_casts: 161,
38 | time_on_infos: 15}
39 |
40 | # Summary timings measured in milliseconds (ms).
41 | ```
42 |
43 | Detailed statistical metrics data per process are also available. See the [documentation](https://hexdocs.pm/gen_metrics) for details.
44 |
45 | ## Quick Look: GenStage Metrics
46 |
47 | Given a GenStage application with the following stages: `Data.Producer`, `Data.Scrubber`, `Data.Analyzer` and a `Data.Consumer`, activate metrics collection for the entire pipeline as follows:
48 |
49 | ```elixir
50 | alias GenMetrics.GenStage.Pipeline
51 | pipeline = %Pipeline{name: "demo",
52 | producer: [Data.Producer],
53 | producer_consumer: [Data.Scrubber, Data.Analyzer],
54 | consumer: [Data.Consumer]}
55 | GenMetrics.monitor_pipeline(pipeline)
56 | ```
57 |
58 | Metrics are published by a dedicated GenMetrics reporting process. Any application can subscribe to this process in order to receive metrics data. Sample summary metrics data for a GenStage process looks as follows:
59 |
60 | ```
61 | # Stage Name: Data.Producer, PID<0.195.0>
62 |
63 | %GenMetrics.GenStage.Summary{name: Data.Producer,
64 | pid: #PID<0.195.0>,
65 | callbacks: 9536,
66 | time_on_callbacks: 407,
67 | demand: 4768000,
68 | events: 4768000}
69 |
70 | # Summary timings measured in milliseconds (ms).
71 | ```
72 |
73 | Detailed statistical metrics data per process are also available. See the [documentation](https://hexdocs.pm/gen_metrics) for details.
74 |
75 | ## Quick Look: GenMetrics Sampling
76 |
77 | Given an application with the following GenServers: `Session.Server`, `Logging.Server`, activate metrics-sampling for the server cluster as follows:
78 |
79 | ```elixir
80 | alias GenMetrics.GenServer.Cluster
81 | cluster = %Cluster{name: "demo",
82 | servers: [Session.Server, Logging.Server],
83 | opts: [sample_rate: 0.3]}
84 | GenMetrics.monitor_cluster(cluster)
85 | ```
86 |
87 | Given a GenStage application with the following stages: `Data.Producer`, `Data.Scrubber`, `Data.Analyzer` and a `Data.Consumer`, activate metrics-sampling for the entire pipeline as follows:
88 |
89 | ```elixir
90 | alias GenMetrics.GenStage.Pipeline
91 | pipeline = %Pipeline{name: "demo",
92 | producer: [Data.Producer],
93 | producer_consumer: [Data.Scrubber, Data.Analyzer],
94 | consumer: [Data.Consumer],
95 | opts: [sample_rate: 0.1]}
96 | GenMetrics.monitor_pipeline(pipeline)
97 | ```
98 |
99 | ## Quick Look: Metrics Reporting
100 |
101 | Redirect your GenServer cluster metrics data to the Datadog service as follows:
102 |
103 | ```elixir
104 | alias GenMetrics.GenServer.Cluster
105 | cluster = %Cluster{name: "demo",
106 | servers: [Session.Server, Logging.Server],
107 | opts: [statistics: :datadog]}
108 | GenMetrics.monitor_cluster(cluster)
109 | ```
110 |
111 | Redirect your GenStage pipeline metrics data to a `statsd` agent as follows:
112 |
113 | ```elixir
114 | alias GenMetrics.GenStage.Pipeline
115 | pipeline = %Pipeline{name: "demo",
116 | producer: [Data.Producer],
117 | producer_consumer: [Data.Scrubber, Data.Analyzer],
118 | consumer: [Data.Consumer],
119 | opts: [statistics: :statsd]}
120 | GenMetrics.monitor_pipeline(pipeline)
121 | ```
122 |
123 | ## Documentation
124 |
125 | Find detailed documentation for the GenMetrics library on [HexDocs](https://hexdocs.pm/gen_metrics).
126 |
127 | ## Installation
128 |
129 | GenMetrics requires Elixir v1.4. Just add `:gen_metrics` to your list of dependencies in `mix.exs`:
130 |
131 | ```elixir
132 | def deps do
133 | [{:gen_metrics, "~> 0.3.0"}]
134 | end
135 | ```
136 |
137 | ## Benchmarks
138 |
139 | For those of you curious about the performance impact `gen_metrics` has on the servers and pipelines it is monitoring, we've put together a number of benchmarks along with a detailed performance analysis which you can [find here](bench/README.md).
140 |
141 | ## Examples
142 |
143 | Examples using GenMetrics to collect and report runtime metrics for GenServer applications can be found in the [examples](examples) directory:
144 |
145 | * [genserver_events](examples/genserver_events.exs)
146 |
147 | Examples using GenMetrics to collect and report runtime metrics for GenStage applications can also be found in the [examples](examples) directory:
148 |
149 | * [genstage_producer_consumer](examples/genstage_producer_consumer.exs)
150 |
151 | * [genstage_gen_event](examples/genstage_gen_event.exs)
152 |
153 | * [genstage_rate_limiter](examples/genstage_rate_limiter.exs)
154 |
155 | All of these GenStage example applications are clones of the example applications provided in the [GenStage](http://github.com/elixir-lang/gen_stage) project repository.
156 |
157 | ## License
158 |
159 | See the [LICENSE](LICENSE) file for license rights and limitations (Apache License 2.0).
160 |
--------------------------------------------------------------------------------
/lib/gen_stage/manager.ex:
--------------------------------------------------------------------------------
defmodule GenMetrics.GenStage.Manager do
  alias GenMetrics.GenStage.Manager
  alias GenMetrics.GenStage.Stage
  alias GenMetrics.GenStage.Summary
  alias GenMetrics.GenStage.Stats
  alias GenMetrics.GenStage.Window
  alias GenMetrics.GenStage.Metric
  alias GenMetrics.Utils.Math
  alias GenMetrics.Utils.Runtime
  alias GenMetrics.Utils.StatsPush

  @moduledoc false

  # Bookkeeping for GenStage metrics collected during one reporting window:
  #
  #   stages           - stage module => MapSet of pids seen for that module
  #   summary_partials - pid => open summary metric awaiting its return trace
  #   summary_paired   - pid => {calls, demand, events, time_on_callbacks}
  #                      tuple as consumed by generate_stage_summary/2
  #   stats_partials   - pid => open stats metric awaiting its return trace
  #   stats_paired     - pid => completed metric events, most recent first
  defstruct stages: %{}, summary_partials: %{}, summary_paired: %{},
    stats_partials: %{}, stats_paired: %{}

  # Returns an empty metrics manager.
  def initialize do
    %Manager{}
  end

  # Starts a new window: known stages and still-open (partial) metrics are
  # carried over, completed (paired) metrics are dropped.
  def reinitialize(metrics) do
    %Manager{stages: metrics.stages,
             summary_partials: metrics.summary_partials,
             stats_partials: metrics.stats_partials}
  end

  # Registers `pid` under stage `mod` and opens a summary metric for it.
  def open_summary_metric(metrics, mod, pid, demand, ts) do
    metrics = register_pid_on_stage(metrics, mod, pid)
    do_open_summary_metric(metrics, mod, pid, demand, ts)
  end

  # Completes the summary metric previously opened for `pid`, if any.
  def close_summary_metric(metrics, mod, pid, events, ts) do
    do_close_summary_metric(metrics, mod, pid, events, ts)
  end

  # Registers `pid` under stage `mod` and opens a stats metric for it.
  def open_stats_metric(metrics, {mod, pid, demand, ts}) do
    metrics = register_pid_on_stage(metrics, mod, pid)
    do_open_stats_metric(metrics, {mod, pid, demand, ts})
  end

  # Completes the stats metric previously opened for `pid`, if any. Depending
  # on `pipeline.opts[:statistics]` the completed metric is pushed to statsd,
  # to datadog, or retained in memory.
  def close_stats_metric(pipeline, metrics, {mod, pid, events, ts}) do
    do_close_stats_metric(pipeline, metrics, {mod, pid, events, ts})
  end

  # Builds the %Window{} for the current metrics. The summary is always
  # included; per-stage statistics only when `gen_stats` is truthy.
  def as_window(metrics, gen_stats, sample_rate) do
    window = %Window{summary: build_stage_summary(metrics, sample_rate)}
    if gen_stats do
      stage_stats =
        metrics
        |> build_stage_metrics()
        |> build_stage_stats(sample_rate)
      %Window{window | stats: stage_stats}
    else
      window
    end
  end

  #
  # Metrics manager private utility functions follow.
  #

  # Adds `pid` to the set of pids known for `stage`.
  defp register_pid_on_stage(metrics, stage, pid) do
    stages = Map.update(metrics.stages, stage,
      MapSet.new |> MapSet.put(pid), & MapSet.put(&1, pid))
    %Manager{metrics | stages: stages}
  end

  # Stores the open summary metric under `pid`, replacing any previous one.
  defp do_open_summary_metric(metrics, _mod, pid, demand, ts) do
    mdemand = Metric.demand(demand, ts)
    summary_partials = Map.put(metrics.summary_partials, pid, mdemand)
    %Manager{metrics | summary_partials: summary_partials}
  end

  # Pairs the open summary metric for `pid` with its return trace. A return
  # trace without a matching open metric leaves the metrics untouched.
  defp do_close_summary_metric(metrics, _mod, pid, events, ts) do
    if Map.has_key?(metrics.summary_partials, pid) do
      {partial, summary_partials} = Map.pop(metrics.summary_partials, pid)
      summary_paired =
        Metric.pair(metrics.summary_paired, pid, events, ts, partial)
      %Manager{metrics | summary_partials: summary_partials,
               summary_paired: summary_paired}
    else
      metrics
    end
  end

  # Stores the open stats metric under `pid`, replacing any previous one.
  defp do_open_stats_metric(metrics, {_mod, pid, demand, ts}) do
    mdemand = Metric.demand(demand, ts)
    stats_partials = Map.put(metrics.stats_partials, pid, mdemand)
    %Manager{metrics | stats_partials: stats_partials}
  end

  # Completes the open stats metric for `pid` and routes it to the
  # destination selected by the pipeline's :statistics option.
  defp do_close_stats_metric(pipeline, metrics, {mod, pid, events, ts}) do
    if Map.has_key?(metrics.stats_partials, pid) do
      {partial, partials} = Map.pop(metrics.stats_partials, pid)
      mevent = Metric.event(partial, events, ts)
      statsd_args = {mod, pid, mevent, partials}
      case pipeline.opts[:statistics] do
        :statsd ->
          push_metric_to_statsd(pipeline, metrics, statsd_args)
        :datadog ->
          push_metric_to_datadog(pipeline, metrics, statsd_args)
        _ ->
          push_metric_in_memory(pipeline, metrics, pid, mevent, partials)
      end
    else
      metrics
    end
  end

  # One %Summary{} per known {stage, pid}; pids without completed metrics
  # fall back to Metric.no_pair.
  defp build_stage_summary(metrics, sample_rate) do
    for {stage, pids} <- metrics.stages, pid <- pids, into: [] do
      summary = generate_stage_summary(Map.get(metrics.summary_paired,
            pid, Metric.no_pair), sample_rate)
      %Summary{summary | name: stage, pid: pid}
    end
  end

  # One {stage, pid, completed_metric_events} tuple per known {stage, pid}.
  defp build_stage_metrics(metrics) do
    for {stage, pids} <- metrics.stages, pid <- pids, into: [] do
      {stage, pid, Map.get(metrics.stats_paired, pid, [])}
    end
  end

  # One %Stage{} with demand, events and timings statistics per entry.
  defp build_stage_stats([], _), do: []
  defp build_stage_stats(stage_metrics, sample_rate) do
    for {module, pid, metrics} <- stage_metrics do
      len = length(metrics)
      %Stage{name: module, pid: pid,
             demand: generate_demand_stats(metrics, len, sample_rate),
             events: generate_events_stats(metrics, len, sample_rate),
             timings: generate_timings_stats(metrics, len, sample_rate)}
    end
  end

  # Summary from paired summary data.
  defp generate_stage_summary({calls, demand, events, time_on_callbacks},
    sample_rate) do
    do_generate_stage_summary(calls, demand, events,
      Runtime.nano_to_milli(time_on_callbacks), sample_rate)
  end

  # Summary derived from per-stage statistics. The callback count lives in
  # the `callbacks` field of %Stats{} (see generate_stats/3).
  # NOTE(review): the totals in %Stats{} are already scaled by the sample
  # rate in generate_stats/3, so they are scaled again here - confirm intent
  # before relying on this clause.
  defp generate_stage_summary(stage = %Stage{}, sample_rate) do
    do_generate_stage_summary(stage.demand.callbacks,
      stage.demand.total, stage.events.total,
      Runtime.micro_to_milli(stage.timings.total), sample_rate)
  end

  # Scales the sampled values up by 1 / sample_rate to estimate the totals
  # for the whole window.
  defp do_generate_stage_summary(calls, demand, events,
    time_on_callbacks, sample_rate) do
    srate_multiplier = 1 / sample_rate
    %Summary{callbacks: round(calls * srate_multiplier),
             demand: round(demand * srate_multiplier),
             events: round(events * srate_multiplier),
             time_on_callbacks: round(time_on_callbacks * srate_multiplier)}
  end

  defp generate_demand_stats(metrics, len, sample_rate) do
    demand = metrics |> Enum.map(& &1.demand) |> Enum.sort
    generate_stats(demand, len, sample_rate)
  end

  defp generate_events_stats(metrics, len, sample_rate) do
    events = metrics |> Enum.map(& &1.events) |> Enum.sort
    generate_stats(events, len, sample_rate)
  end

  defp generate_timings_stats(metrics, len, sample_rate) do
    durations = metrics |> Enum.map(& &1.duration) |> Enum.sort
    generate_stats(durations, len, sample_rate)
  end

  # Statistics over sorted `data` of length `len`. Only the callback count
  # and the total are scaled by the sample rate.
  defp generate_stats(data, len, sample_rate) do
    srate_multiplier = 1 / sample_rate
    %Stats{callbacks: round(len * srate_multiplier),
           min: Math.min(data), max: Math.max(data),
           total: round(Math.sum(data) * srate_multiplier),
           mean: Math.mean(data, len),
           stdev: Math.stdev(data, len), range: Math.range(data)}
  end

  # Retains the completed metric for `pid` until the window is reported.
  defp push_metric_in_memory(_cluster, metrics, pid, mevent, stats_partials) do
    stats_paired =
      Map.update(metrics.stats_paired, pid, [mevent], & [mevent | &1])
    %Manager{metrics | stats_partials: stats_partials,
             stats_paired: stats_paired}
  end

  # Pushes the completed metric to statsd; nothing is retained in memory.
  defp push_metric_to_statsd(pipeline, metrics, {mod, pid, mevent, partials}) do
    StatsPush.statsd(pipeline.name, mod, pid, nil, mevent)
    %Manager{metrics | stats_partials: partials}
  end

  # Pushes the completed metric to datadog; nothing is retained in memory.
  defp push_metric_to_datadog(pipeline, metrics, {mod, pid, mevent, partials}) do
    StatsPush.datadog(pipeline.name, mod, pid, nil, mevent)
    %Manager{metrics | stats_partials: partials}
  end

end
196 |
--------------------------------------------------------------------------------
/lib/gen_server/manager.ex:
--------------------------------------------------------------------------------
defmodule GenMetrics.GenServer.Manager do
  alias GenMetrics.GenServer.Manager
  alias GenMetrics.GenServer.Server
  alias GenMetrics.GenServer.Summary
  alias GenMetrics.GenServer.Stats
  alias GenMetrics.GenServer.Window
  alias GenMetrics.GenServer.Metric
  alias GenMetrics.Utils.Math
  alias GenMetrics.Utils.Runtime
  alias GenMetrics.Utils.StatsPush

  @moduledoc false

  # Callbacks tracked per server pid, in the order summaries and statistics
  # are assembled: calls, casts, infos.
  @call_cast_info [:handle_call, :handle_cast, :handle_info]

  # Bookkeeping for GenServer metrics collected during one reporting window.
  # Partial and paired maps are keyed by as_metric_key(pid, fun):
  #
  #   servers          - server module => MapSet of pids seen for that module
  #   summary_partials - key => open summary metric awaiting its return trace
  #   summary_paired   - key => {count, total_time} tuple as consumed by
  #                      do_generate_server_summary/4
  #   stats_partials   - key => open stats metric awaiting its return trace
  #   stats_paired     - key => completed metric events, most recent first
  defstruct servers: %{}, summary_partials: %{}, summary_paired: %{},
    stats_partials: %{}, stats_paired: %{}

  # Returns an empty metrics manager.
  def initialize do
    %Manager{}
  end

  # Starts a new window: known servers and still-open (partial) metrics are
  # carried over, completed (paired) metrics are dropped.
  def reinitialize(metrics) do
    %Manager{servers: metrics.servers,
             summary_partials: metrics.summary_partials,
             stats_partials: metrics.stats_partials}
  end

  # Registers `pid` under server `mod` and opens a summary metric for the
  # callback `fun` running on that pid.
  def open_summary_metric(metrics, mod, pid, fun, ts) do
    metrics = register_pid_on_server(metrics, mod, pid)
    do_open_summary_metric(metrics, mod, pid, fun, ts)
  end

  # Completes the summary metric opened for callback `fun` on `pid`, if any.
  def close_summary_metric(metrics, pid, fun, ts) do
    do_close_summary_metric(metrics, pid, fun, ts)
  end

  # Registers `pid` under server `mod` and opens a stats metric for the
  # callback `fun` running on that pid.
  def open_stats_metric(metrics, {mod, pid, fun, ts}) do
    metrics = register_pid_on_server(metrics, mod, pid)
    do_open_stats_metric(metrics, {pid, fun, ts})
  end

  # Completes the stats metric opened for callback `fun` on `pid`, if any.
  # Depending on `cluster.opts[:statistics]` the completed metric is pushed
  # to statsd, to datadog, or retained in memory.
  def close_stats_metric(cluster, metrics, {mod, pid, fun, ts}) do
    do_close_stats_metric(cluster, metrics, {mod, pid, fun, ts})
  end

  # Builds the %Window{} for the current metrics. The summary is always
  # included; per-server statistics only when `gen_stats` is truthy.
  def as_window(metrics, gen_stats, sample_rate) do
    window = %Window{summary: build_server_summary(metrics, sample_rate)}
    if gen_stats do
      server_stats =
        metrics
        |> build_server_metrics()
        |> build_server_stats(sample_rate)
      %Window{window | stats: server_stats}
    else
      window
    end
  end

  #
  # Metrics manager private utility functions follow.
  #

  # Adds `pid` to the set of pids known for `server`.
  defp register_pid_on_server(metrics, server, pid) do
    servers = Map.update(metrics.servers, server,
      MapSet.new |> MapSet.put(pid), & MapSet.put(&1, pid))
    %Manager{metrics | servers: servers}
  end

  # Stores the open summary metric, replacing any previous one for the key.
  defp do_open_summary_metric(metrics, _mod, pid, fun, ts) do
    mkey = as_metric_key(pid, fun)
    mevent = Metric.partial(ts)
    summary_partials = Map.put(metrics.summary_partials, mkey, mevent)
    %Manager{metrics | summary_partials: summary_partials}
  end

  # Pairs the open summary metric with its return trace. A return trace
  # without a matching open metric leaves the metrics untouched.
  defp do_close_summary_metric(metrics, pid, fun, ts) do
    mkey = as_metric_key(pid, fun)
    if Map.has_key?(metrics.summary_partials, mkey) do
      {partial, summary_partials} = Map.pop(metrics.summary_partials, mkey)
      summary_paired = Metric.pair(metrics.summary_paired, mkey, ts, partial)
      %Manager{metrics | summary_partials: summary_partials,
               summary_paired: summary_paired}
    else
      metrics
    end
  end

  # Stores the open stats metric, replacing any previous one for the key.
  defp do_open_stats_metric(metrics, {pid, fun, ts}) do
    mkey = as_metric_key(pid, fun)
    mevent = Metric.start(ts)
    stats_partials = Map.put(metrics.stats_partials, mkey, mevent)
    %Manager{metrics | stats_partials: stats_partials}
  end

  # Completes the open stats metric and routes it to the destination
  # selected by the cluster's :statistics option.
  defp do_close_stats_metric(cluster, metrics, {mod, pid, fun, ts}) do
    mkey = as_metric_key(pid, fun)
    if Map.has_key?(metrics.stats_partials, mkey) do
      {partial, partials} = Map.pop(metrics.stats_partials, mkey)
      mevent = Metric.stop(partial, ts)
      statsd_args = {mod, pid, fun, mevent, partials}
      case cluster.opts[:statistics] do
        :statsd ->
          push_metric_to_statsd(cluster, metrics, statsd_args)
        :datadog ->
          push_metric_to_datadog(cluster, metrics, statsd_args)
        _ ->
          push_metric_in_memory(cluster, metrics, mkey, mevent, partials)
      end
    else
      metrics
    end
  end

  # One %Summary{} per known {server, pid}; callbacks without completed
  # metrics fall back to Metric.no_pair.
  defp build_server_summary(metrics, sample_rate) do
    for {server, pids} <- metrics.servers, pid <- pids, into: [] do
      mkeys = for key <- @call_cast_info, do: as_metric_key(pid, key)
      metrics_on_pid = for mkey <- mkeys do
        Map.get(metrics.summary_paired, mkey, Metric.no_pair)
      end
      summary = generate_server_summary(metrics_on_pid, sample_rate)
      %Summary{summary | name: server, pid: pid}
    end
  end

  # One {server, pid, [calls, casts, infos]} tuple per known {server, pid},
  # each list holding the completed metric events for that callback.
  defp build_server_metrics(metrics) do
    for {server, pids} <- metrics.servers, pid <- pids, into: [] do
      mkeys = for key <- @call_cast_info, do: as_metric_key(pid, key)
      {server, pid,
       (for mkey <- mkeys, do: Map.get(metrics.stats_paired, mkey, []))}
    end
  end

  # One %Server{} with call, cast and info statistics per entry.
  defp build_server_stats([], _), do: []
  defp build_server_stats(server_metrics, sample_rate) do
    for {module, pid, [calls, casts, infos]} <- server_metrics do
      %Server{name: module, pid: pid,
              calls: generate_metric_stats(calls, length(calls), sample_rate),
              casts: generate_metric_stats(casts, length(casts), sample_rate),
              infos: generate_metric_stats(infos, length(infos), sample_rate)}
    end
  end

  # Summary from paired summary data, ordered as in @call_cast_info.
  defp generate_server_summary([calls, casts, infos], sample_rate) do
    do_generate_server_summary(calls, casts, infos, sample_rate)
  end

  # Summary derived from per-server statistics. Each callback contributes a
  # {count, total} pair, the only shape do_generate_server_summary/4
  # accepts. The count lives in the `callbacks` field of %Stats{} (see
  # generate_stats/3) and every callback uses its own total.
  # NOTE(review): %Stats{} counts and totals are already scaled by the sample
  # rate in generate_stats/3, so they are scaled again here - confirm intent
  # before relying on this clause.
  defp generate_server_summary(server = %Server{}, sample_rate) do
    calls = {server.calls.callbacks, server.calls.total}
    casts = {server.casts.callbacks, server.casts.total}
    infos = {server.infos.callbacks, server.infos.total}
    do_generate_server_summary(calls, casts, infos, sample_rate)
  end

  # Scales the sampled counts and times up by 1 / sample_rate to estimate
  # the totals for the whole window; times go through Runtime.nano_to_milli.
  defp do_generate_server_summary({calls, tcalls}, {casts, tcasts},
    {infos, tinfos}, sample_rate) do
    srate_multiplier = 1 / sample_rate
    %Summary{calls: round(calls * srate_multiplier),
             casts: round(casts * srate_multiplier),
             infos: round(infos * srate_multiplier),
             time_on_calls: Runtime.nano_to_milli(round(tcalls * srate_multiplier)),
             time_on_casts: Runtime.nano_to_milli(round(tcasts * srate_multiplier)),
             time_on_infos: Runtime.nano_to_milli(round(tinfos * srate_multiplier))}
  end

  # Statistics over the durations of the given completed metric events.
  defp generate_metric_stats([], _, sample_rate), do: generate_stats([], 0, sample_rate)
  defp generate_metric_stats(metrics, len, sample_rate) do
    metric_durations =
      metrics |> Enum.map(fn metric -> metric.duration end) |> Enum.sort
    generate_stats(metric_durations, len, sample_rate)
  end

  # Statistics over sorted `data` of length `len`. Only the callback count
  # and the total are scaled by the sample rate.
  defp generate_stats(data, len, sample_rate) do
    srate_multiplier = 1 / sample_rate
    %Stats{callbacks: round(len * srate_multiplier),
           min: Math.min(data), max: Math.max(data),
           total: round(Math.sum(data) * srate_multiplier),
           mean: Math.mean(data, len),
           stdev: Math.stdev(data, len), range: Math.range(data)}
  end

  # Retains the completed metric under its key until the window is reported.
  defp push_metric_in_memory(_cluster, metrics, mkey, mevent, stats_partials) do
    stats_paired =
      Map.update(metrics.stats_paired, mkey, [mevent], & [mevent | &1])
    %Manager{metrics | stats_partials: stats_partials,
             stats_paired: stats_paired}
  end

  # Pushes the completed metric to statsd; nothing is retained in memory.
  defp push_metric_to_statsd(cluster, metrics, {mod, pid, fun, mevent, partials}) do
    StatsPush.statsd(cluster.name, mod, pid, fun, mevent)
    %Manager{metrics | stats_partials: partials}
  end

  # Pushes the completed metric to datadog; nothing is retained in memory.
  defp push_metric_to_datadog(cluster, metrics, {mod, pid, fun, mevent, partials}) do
    StatsPush.datadog(cluster.name, mod, pid, fun, mevent)
    %Manager{metrics | stats_partials: partials}
  end

  # Key identifying one callback (`fun`) on one process (`pid`).
  defp as_metric_key(pid, fun) do
    "#{inspect pid}-#{inspect fun}"
  end

end
202 |
--------------------------------------------------------------------------------
/lib/gen_server/monitor.ex:
--------------------------------------------------------------------------------
defmodule GenMetrics.GenServer.Monitor do
  use GenServer
  alias GenMetrics.GenServer.Manager
  alias GenMetrics.GenServer.Monitor
  alias GenMetrics.GenServer.Cluster
  alias GenMetrics.GenServer.Window
  alias GenMetrics.Reporter
  alias GenMetrics.Utils.Runtime

  @moduledoc false
  @call_cast_info [:handle_call, :handle_cast, :handle_info]

  # Match spec requesting a return trace for every traced call.
  @return_trace [{:_, [], [{:return_trace}]}]

  defstruct cluster: %Cluster{}, metrics: nil, start: 0, duration: 0

  def start_link(cluster), do: GenServer.start_link(__MODULE__, cluster)

  # Validates the cluster, switches tracing on and schedules the first
  # window rollover. Any failed step returns its {:stop, reason} as-is.
  def init(cluster) do
    with {:ok, _} <- validate_modules(cluster),
         {:ok, _} <- validate_behaviours(cluster),
         {:ok, _} <- activate_tracing(cluster) do
      cluster
      |> initialize_monitor()
      |> start_monitor()
    end
  end

  #
  # Handlers for intercepting :erlang.trace/3 and :erlang.trace_pattern/2
  # callbacks for modules registered on the cluster.
  #

  # A traced callback was entered.
  def handle_info({:trace_ts, pid, :call, {mod, fun, _args}, ts}, state) do
    {:noreply, do_intercept_call_request(state, mod, pid, fun, ts)}
  end

  # A traced callback returned {:reply, reply, new_state}.
  def handle_info({:trace_ts, pid, :return_from, {mod, fun, _arity},
                   {:reply, _, _}, ts}, state) do
    {:noreply, do_close_metric(state, mod, pid, fun, ts)}
  end

  # A traced callback returned
  # {:reply, reply, new_state, timeout | :hibernate}.
  def handle_info({:trace_ts, pid, :return_from, {mod, fun, _arity},
                   {:reply, _, _, _}, ts}, state) do
    {:noreply, do_close_metric(state, mod, pid, fun, ts)}
  end

  # A traced callback returned {:noreply, new_state}.
  def handle_info({:trace_ts, pid, :return_from, {mod, fun, _arity},
                   {:noreply, _}, ts}, state) do
    {:noreply, do_close_metric(state, mod, pid, fun, ts)}
  end

  # A traced callback returned {:noreply, new_state, timeout | :hibernate}.
  def handle_info({:trace_ts, pid, :return_from, {mod, fun, _arity},
                   {:noreply, _, _}, ts}, state) do
    {:noreply, do_close_metric(state, mod, pid, fun, ts)}
  end

  # A traced callback returned {:stop, reason, reply, new_state}.
  def handle_info({:trace_ts, pid, :return_from, {mod, fun, _arity},
                   {:stop, _, _, _}, ts}, state) do
    {:noreply, do_close_metric(state, mod, pid, fun, ts)}
  end

  # A traced callback returned {:stop, reason, new_state}.
  def handle_info({:trace_ts, pid, :return_from, {mod, fun, _arity},
                   {:stop, _, _}, ts}, state) do
    {:noreply, do_close_metric(state, mod, pid, fun, ts)}
  end

  # Publishes the window collected so far, schedules the next rollover and
  # begins a new window. When sampling, tracing is switched back on and the
  # next silence is scheduled.
  def handle_info(:rollover_metrics_window, state) do
    finished_at = :erlang.system_time()
    %Monitor{cluster: cluster, metrics: metrics, start: started_at} = state
    elapsed = Runtime.nano_to_milli(finished_at - started_at)

    collected = Manager.as_window(metrics,
      Runtime.statistics?(cluster), Runtime.sample_rate(cluster))
    window = %Window{collected | cluster: cluster,
                     start: started_at, duration: elapsed}
    Reporter.push(GenMetrics.GenServer.Reporter, window)

    Process.send_after(self(),
      :rollover_metrics_window, Runtime.window_interval(cluster))

    if Runtime.sampling?(cluster) do
      activate_tracing(cluster)
      Process.send_after(self(),
        :silence_metrics_window, Runtime.sample_interval(cluster))
    end

    {:noreply, initialize_monitor(cluster, metrics)}
  end

  # The sampling interval of the current window has elapsed: switch tracing
  # off until the next rollover.
  def handle_info(:silence_metrics_window, state) do
    activate_tracing(state.cluster, true)
    {:noreply, state}
  end

  # Anything else is ignored.
  def handle_info(_msg, state), do: {:noreply, state}

  #
  # Private utility functions follow.
  #

  # Monitor state for a new window. Metrics from a previous window, when
  # given, are reinitialized rather than recreated from scratch.
  defp initialize_monitor(cluster, metrics \\ nil) do
    manager =
      if metrics, do: Manager.reinitialize(metrics), else: Manager.initialize()

    %Monitor{cluster: cluster, metrics: manager, start: :erlang.system_time()}
  end

  # Schedules the periodic rollover and, when sampling, the first silence.
  defp start_monitor(state) do
    cluster = state.cluster

    Process.send_after(self(),
      :rollover_metrics_window, Runtime.window_interval(cluster))

    if Runtime.sampling?(cluster) do
      Process.send_after(self(),
        :silence_metrics_window, Runtime.sample_interval(cluster))
    end

    {:ok, state}
  end

  # Switches call tracing on for the cluster's servers, or off altogether
  # when `silent` is truthy.
  defp activate_tracing(cluster, silent \\ false) do
    if silent do
      :erlang.trace(:processes, false, [:call, :monotonic_timestamp])
    else
      enable_call_tracing(cluster)
    end

    {:ok, cluster}
  end

  # Enables call tracing and installs return-trace patterns on the callbacks
  # of every server module. handle_call/3 is traced only for synchronous
  # clusters.
  defp enable_call_tracing(cluster) do
    :erlang.trace(:processes, true, [:call, :monotonic_timestamp])

    Enum.each(cluster.servers, fn server ->
      if Runtime.synchronous?(cluster) do
        :erlang.trace_pattern({server, :handle_call, 3}, @return_trace)
      end

      :erlang.trace_pattern({server, :handle_cast, 2}, @return_trace)
      :erlang.trace_pattern({server, :handle_info, 2}, @return_trace)
    end)
  end

  # {:ok, cluster} when every server module can be loaded, otherwise
  # {:stop, {:bad_cluster, errors}}.
  defp validate_modules(cluster) do
    failures =
      cluster.servers
      |> Enum.uniq
      |> Runtime.require_modules

    case failures do
      [] -> {:ok, cluster}
      _ -> {:stop, {:bad_cluster, failures}}
    end
  end

  # {:ok, cluster} when every server module implements GenServer, otherwise
  # {:stop, {:bad_cluster, errors}}.
  defp validate_behaviours(cluster) do
    failures =
      cluster.servers
      |> Enum.uniq
      |> Runtime.require_behaviour(GenServer)

    case failures do
      [] -> {:ok, cluster}
      _ -> {:stop, {:bad_cluster, failures}}
    end
  end

  # Only handle_call, handle_cast and handle_info open a metric; any other
  # traced function is ignored.
  defp do_intercept_call_request(state, mod, pid, fun, ts)
      when fun in @call_cast_info do
    do_open_metric(state, mod, pid, fun, ts)
  end

  defp do_intercept_call_request(state, _mod, _pid, _fun, _ts), do: state

  # Opens the summary metric and, when statistics are enabled for the
  # cluster, the stats metric as well.
  defp do_open_metric(state, mod, pid, fun, ts) do
    opened = Manager.open_summary_metric(state.metrics, mod, pid, fun, ts)
    state = %Monitor{state | metrics: opened}

    if Runtime.statistics?(state.cluster) do
      with_stats = Manager.open_stats_metric(state.metrics, {mod, pid, fun, ts})
      %Monitor{state | metrics: with_stats}
    else
      state
    end
  end

  # Closes the summary metric and, when statistics are enabled for the
  # cluster, the stats metric as well.
  defp do_close_metric(state, mod, pid, fun, ts) do
    closed = Manager.close_summary_metric(state.metrics, pid, fun, ts)
    state = %Monitor{state | metrics: closed}

    if Runtime.statistics?(state.cluster) do
      with_stats = Manager.close_stats_metric(state.cluster,
        state.metrics, {mod, pid, fun, ts})
      %Monitor{state | metrics: with_stats}
    else
      state
    end
  end

end
231 |
--------------------------------------------------------------------------------
/lib/gen_stage/monitor.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics.GenStage.Monitor do
2 | use GenServer
3 | alias GenMetrics.GenStage.Manager
4 | alias GenMetrics.GenStage.Monitor
5 | alias GenMetrics.GenStage.Pipeline
6 | alias GenMetrics.GenStage.Window
7 | alias GenMetrics.Reporter
8 | alias GenMetrics.Utils.Runtime
9 |
10 | @moduledoc false
11 | @handle_demand :handle_demand
12 | @handle_events :handle_events
13 | @handle_call :handle_call
14 | @handle_cast :handle_cast
15 |
16 | defstruct pipeline: %Pipeline{}, metrics: nil, start: 0, duration: 0
17 |
# Starts an unnamed monitor process for the given pipeline.
def start_link(pipeline), do: GenServer.start_link(__MODULE__, pipeline)
21 |
# Validates the pipeline, activates tracing and starts the monitor. The
# result of the first step that does not return {:ok, _} is returned as-is.
def init(pipeline) do
  with {:ok, _} <- validate_modules(pipeline),
       {:ok, _} <- validate_behaviours(pipeline),
       {:ok, _} <- activate_tracing(pipeline) do
    pipeline
    |> initialize_monitor()
    |> start_monitor()
  end
end
29 |
30 | #
31 | # Handlers for intercepting :erlang.trace/3 and :erlang.trace_pattern/2
32 | # callbacks for modules registered on the pipeline.
33 | #
34 |
# A traced callback was entered. The first argument of the callback is
# forwarded as the demand of the request.
def handle_info({:trace_ts, pid, :call, {mod, fun, [demand | _]}, ts}, state) do
  {:noreply,
   do_intercept_call_request(state, pid, {mod, fun}, demand, ts)}
end

# Intercept {:noreply, [event], new_state} response.
def handle_info({:trace_ts, pid, :return_from, {mod, _, _},
                 {:noreply, events, _}, ts}, state) do
  {:noreply,
   do_intercept_call_response(state, mod, pid, length(events), ts)}
end

# Intercept {:noreply, [event], new_state, :hibernate} response.
def handle_info({:trace_ts, pid, :return_from, {mod, _, _},
                 {:noreply, events, _, _}, ts}, state) do
  {:noreply,
   do_intercept_call_response(state, mod, pid, length(events), ts)}
end

# Intercept {:reply, _reply, [event], new_state} response.
def handle_info({:trace_ts, pid, :return_from, {mod, _, _},
                 {:reply, _, events, _}, ts}, state) do
  {:noreply,
   do_intercept_call_response(state, mod, pid, length(events), ts)}
end

# Intercept {:reply, _reply, [event], new_state, :hibernate} response.
# The tag must be :reply: a five element tuple tagged :noreply is not a
# valid callback result, so matching on it would never close the metric
# of a hibernating reply.
def handle_info({:trace_ts, pid, :return_from, {mod, _, _},
                 {:reply, _, events, _, _}, ts}, state) do
  {:noreply,
   do_intercept_call_response(state, mod, pid, length(events), ts)}
end

# Intercept {:stop, reason, new_state} response. No events are emitted.
def handle_info({:trace_ts, pid, :return_from, {mod, _, _},
                 {:stop, _, _}, ts}, state) do
  {:noreply,
   do_intercept_call_response(state, mod, pid, 0, ts)}
end

# Intercept {:stop, reason, reply, new_state} response, which handle_call
# may return. No events are emitted.
def handle_info({:trace_ts, pid, :return_from, {mod, _, _},
                 {:stop, _, _, _}, ts}, state) do
  {:noreply,
   do_intercept_call_response(state, mod, pid, 0, ts)}
end

# Report and rollover metrics window: the window collected so far is pushed
# to the reporter, the next rollover is scheduled and, when sampling,
# tracing is reactivated and the next silence scheduled.
def handle_info(:rollover_metrics_window, state) do
  now = :erlang.system_time
  state = %Monitor{state | duration: Runtime.nano_to_milli(now - state.start)}
  window = Manager.as_window(state.metrics,
    Runtime.statistics?(state.pipeline), Runtime.sample_rate(state.pipeline))
  window = %Window{window | pipeline: state.pipeline,
                   start: state.start, duration: state.duration}
  Reporter.push(GenMetrics.GenStage.Reporter, window)
  Process.send_after(self(),
    :rollover_metrics_window, Runtime.window_interval(state.pipeline))
  if Runtime.sampling?(state.pipeline) do
    activate_tracing(state.pipeline)
    Process.send_after(self(),
      :silence_metrics_window, Runtime.sample_interval(state.pipeline))
  end
  {:noreply, initialize_monitor(state.pipeline, state.metrics)}
end

# Sampling window is closed for current metrics windows
# so temporarily silence tracing.
def handle_info(:silence_metrics_window, state) do
  activate_tracing(state.pipeline, true)
  {:noreply, state}
end

# Catch-all for calls not intercepted by monitor.
def handle_info(_msg, state), do: {:noreply, state}
103 |
104 | #
105 | # Private utility functions follow.
106 | #
107 |
108 | # Initialize GenServer state for monitor.
109 | defp initialize_monitor(pipeline, metrics \\ nil) do
110 | if metrics do
111 | %Monitor{pipeline: pipeline,
112 | metrics: Manager.reinitialize(metrics),
113 | start: :erlang.system_time}
114 | else
115 | %Monitor{pipeline: pipeline,
116 | metrics: Manager.initialize(),
117 | start: :erlang.system_time}
118 | end
119 | end
120 |
121 | # Initialize periodic callback for metrics reporting and window rollover.
122 | defp start_monitor(state) do
123 | Process.send_after(self(),
124 | :rollover_metrics_window, Runtime.window_interval(state.pipeline))
125 | if Runtime.sampling?(state.pipeline) do
126 | Process.send_after(self(),
127 | :silence_metrics_window, Runtime.sample_interval(state.pipeline))
128 | end
129 | {:ok, state}
130 | end
131 |
132 | # Activate tracing for stages within pipeline.
133 | defp activate_tracing(pipeline, silent \\ false) do
134 |
135 | if silent do
136 | :erlang.trace(:processes, false, [:call, :monotonic_timestamp])
137 | else
138 | :erlang.trace(:processes, true, [:call, :monotonic_timestamp])
139 |
140 | for pmod <- pipeline.producer do
141 | :erlang.trace_pattern({pmod, :handle_demand, 2},
142 | [{:_, [], [{:return_trace}]}])
143 | :erlang.trace_pattern({pmod, :handle_cast, 2},
144 | [{:_, [], [{:return_trace}]}])
145 | if Runtime.synchronous?(pipeline) do
146 | :erlang.trace_pattern({pmod, :handle_call, 3},
147 | [{:_, [], [{:return_trace}]}])
148 | end
149 | end
150 |
151 | for pcmod <- pipeline.producer_consumer do
152 | :erlang.trace_pattern({pcmod, :handle_events, 3},
153 | [{:_, [], [{:return_trace}]}])
154 | :erlang.trace_pattern({pcmod, :handle_cast, 2},
155 | [{:_, [], [{:return_trace}]}])
156 | if Runtime.synchronous?(pipeline) do
157 | :erlang.trace_pattern({pcmod, :handle_call, 3},
158 | [{:_, [], [{:return_trace}]}])
159 | end
160 | end
161 |
162 | for cmod <- pipeline.consumer do
163 | :erlang.trace_pattern({cmod, :handle_events, 3},
164 | [{:_, [], [{:return_trace}]}])
165 | end
166 | end
167 |
168 | {:ok, pipeline}
169 | end
170 |
171 | # Validate pipeline modules can be loaded or report failures.
172 | defp validate_modules(pipeline) do
173 | case require_modules(pipeline) do
174 | [] -> {:ok, pipeline}
175 | errs -> {:stop, {:bad_pipeline, errs}}
176 | end
177 | end
178 |
179 | # Ensure pipeline modules are available and can be loaded.
180 | defp require_modules(pipeline) do
181 | [pipeline.producer, pipeline.producer_consumer, pipeline.consumer]
182 | |> Enum.flat_map(fn(modules) -> modules end)
183 | |> Enum.uniq
184 | |> Runtime.require_modules
185 | end
186 |
187 | # Validate pipeline modules implement GenStage or report failures.
188 | defp validate_behaviours(pipeline) do
189 | case require_behaviour(pipeline, GenStage) do
190 | [] -> {:ok, pipeline}
191 | errs -> {:stop, {:bad_pipeline, errs}}
192 | end
193 | end
194 |
195 | # Ensure pipeline modules implement GenStage behaviour.
196 | defp require_behaviour(pipeline, behaviour) do
197 | [pipeline.producer, pipeline.producer_consumer, pipeline.consumer]
198 | |> Enum.flat_map(fn(modules) -> modules end)
199 | |> Enum.uniq
200 | |> Runtime.require_behaviour(behaviour)
201 | end
202 |
203 | defp do_intercept_call_request(state, pid, {mod, fun}, demand, ts) do
204 | case fun do
205 | @handle_demand -> do_open_metric(state, mod, pid, demand, ts)
206 | @handle_events -> do_open_metric(state, mod, pid, length(demand), ts)
207 | @handle_call -> do_open_metric(state, mod, pid, 0, ts)
208 | @handle_cast -> do_open_metric(state, mod, pid, 0, ts)
209 | _ -> state
210 | end
211 | end
212 |
213 | defp do_intercept_call_response(state, mod, pid, events, ts) do
214 | do_close_metric(state, mod, pid, events, ts)
215 | end
216 |
217 | # Open partial metric on handle_ function call trace.
218 | defp do_open_metric(state, mod, pid, demand, ts) do
219 | metrics =
220 | Manager.open_summary_metric(state.metrics, mod, pid, demand, ts)
221 | state = %Monitor{state | metrics: metrics}
222 |
223 | if Runtime.statistics?(state.pipeline) do
224 | metrics =
225 | Manager.open_stats_metric(state.metrics, {mod, pid, demand, ts})
226 | %Monitor{state | metrics: metrics}
227 | else
228 | state
229 | end
230 | end
231 |
232 | # Close complete metric on handle_ function return trace.
233 | defp do_close_metric(state, mod, pid, events, ts) do
234 | metrics = Manager.close_summary_metric(state.metrics, mod, pid, events, ts)
235 | state = %Monitor{state | metrics: metrics}
236 |
237 | if Runtime.statistics?(state.pipeline) do
238 | metrics = Manager.close_stats_metric(state.pipeline,
239 | state.metrics, {mod, pid, events, ts})
240 | %Monitor{state | metrics: metrics}
241 | else
242 | state
243 | end
244 | end
245 |
246 | end
247 |
--------------------------------------------------------------------------------
/bench/README.md:
--------------------------------------------------------------------------------
1 | ## GenMetrics Runtime Performance Benchmarks
2 |
3 | For those of you curious about the performance impact `gen_metrics` has on the servers and pipelines it is monitoring, we've put together a number of benchmarks to compare the overhead of *untraced* vs *traced* vs *sampled* servers and pipelines. You can tweak and run the benchmarks yourself from the project root directory.
4 |
5 | The following sections introduce each of the available benchmark tests. We examine the results and explain the implications of those results in each case. The benchmark reports that follow were generated by running the benchmarks on a 2011 Macbook Air (1.8ghz i7 [4 Core], 4GB RAM, SSD). All benchmarks are implemented and run using the [benchee benchmark](https://github.com/PragTob/benchee) library.
6 |
7 | ## GenMetrics Runtime Performance Summary
8 |
9 | When GenMetrics is activated, varying degrees of runtime overhead *may* be incurred by the application being monitored depending on the rate of GenServer or GenStage callbacks within the application. In order to prevent GenMetrics from negatively impacting your application it is strongly recommended that you activate *metrics-sampling* for high-callback applications.
10 |
11 | To activate metrics-sampling for your server or pipeline simply specify the `sample_rate` option when declaring your monitoring preferences. For example, to reduce the runtime overhead of GenMetrics by sampling just 10% of all callbacks within your server or pipeline simply specify `opts: [sample_rate: 0.1]`.
12 |
13 | It is important to understand that when sampling is disabled, metrics data reflect the exact behaviour of the processes being monitored. When sampling is enabled, metrics data reflect an approximation of the behaviour of the processes being monitored.
14 |
15 | **IMPORTANT!**
16 |
17 | GenMetrics depends on Erlang tracing to collect runtime metrics for your application. One consequence of this dependency is that tail-call optimization is automatically disabled by the tracing agent. Given this, eventual resource exhaustion due to unbounded stack growth for long-running applications is inevitable. Resource exhaustion may be significantly delayed by activating metrics-sampling. But such resource exhaustion cannot be avoided indefinitely.
18 |
19 | **DO NOT ACTIVATE GenMetrics IN LONG-RUNNING PRODUCTION APPLICATIONS.**
20 |
21 | To understand and observe resource use when GenMetrics is activated use the `mix infinite_server` or `mix infinite_pipeline` tasks which automatically launch the `:observer` tool which allows you to profile BEAM metrics.
22 |
23 | ## GenMetrics + Synchronous / Asynchronous Callbacks
24 |
25 | By default, GenMetrics monitors all synchronous and asynchronous callbacks within a server or pipeline. However, the monitoring of synchronous callbacks is optional. To disable monitoring of synchronous callbacks simply specify the `opts: [synchronous: false]` option when declaring the monitoring preferences for your server or pipeline.
26 |
27 | ## GenServer Benchmarks
28 |
29 | The following set of benchmarks are designed to test and measure the runtime impact of GenMetrics on a simple GenServer application. Benchmark specific context is provided in each case along with an analysis of the results.
30 |
31 | ### GenServer Benchmark 1. bench/trace_server.exs
32 |
33 | ```
34 | mix trace_server
35 | ```
36 |
37 | This benchmark runs the following tests:
38 |
39 | 1. untraced-server [ repeat 500k callbacks N times within ~30s ]
40 | 2. traced----server [ repeat 500k callbacks N times within ~30s ]
41 |
42 | Both tests attempt to push as many messages as possible to a GenServer process using the `GenServer.call/3` function. These tests each run for approximately 30 seconds. The server process within the `untraced-server` test is not being monitored by GenMetrics. The server process within the `traced-server` test is being monitored by GenMetrics. As metrics-sampling has not been enabled for this benchmark *all* callbacks on the `traced-server` are monitored.
43 |
44 |
45 | ```
46 | Elixir 1.4.1
47 | Erlang 19.2
48 | Benchmark suite executing with the following configuration:
49 | warmup: 5.0s
50 | time: 30.0s
51 | parallel: 1
52 | inputs: none specified
53 | Estimated total run time: 70.0s
54 |
55 | Benchmarking 1-untraced-server [ repeat 500k calls N times within ~30s ]...
56 | Benchmarking 2-traced---server [ repeat 500k calls N times within ~30s ]...
57 |
58 | Name ips average deviation median
59 | 1-untraced-server [ repeat 500k calls N times within ~30s ] 0.21 4.75 s ±0.73% 4.73 s
60 | 2-traced---server [ repeat 500k calls N times within ~30s ] 0.0878 11.39 s ±2.61% 11.38 s
61 |
62 | Comparison:
63 | 1-untraced-server [ repeat 500k calls N times within ~30s ] 0.21
64 | 2-traced---server [ repeat 500k calls N times within ~30s ] 0.0878 - 2.40x slower
65 | ```
66 |
67 | On our test hardware, the `untraced-server` managed to push approximately 4.5 million messages to its GenServer process within the 30 second test window. That's approximately 150k messages-per-second. The `traced-server` only managed to push approximately 2 million messages to its GenServer process. That's approximately 67k messages-per-second.
68 |
69 | The results indicate a significant runtime overhead has been introduced by the GenMetrics library. As indicated by the results the `traced-server` test performed `2.40x slower`. We can directly attribute this slowdown to the runtime overhead introduced by the GenMetrics library.
70 |
71 | While not all applications require metrics-sampling to reduce the runtime overhead associated with GenMetrics, this result strongly suggests this test application is a good candidate for sampling. See the following benchmark to see the immediate and significant positive effects when sampling is activated.
72 |
73 |
74 | ### GenServer Benchmark 2. bench/sample_server.exs
75 |
76 | ```
77 | mix sample_server
78 | ```
79 |
80 | This benchmark runs the following tests:
81 |
82 | 1. untraced-server [ repeat 500k callbacks N times within ~30s ]
83 | 2. sampled-server [ repeat 500k callbacks N times within ~30s ]
84 |
85 | Both tests attempt to push as many messages as possible to a GenServer process using the `GenServer.call/3` function. These tests each run for approximately 30 seconds. The server process within the `untraced-server` test is not being monitored by GenMetrics. The server process within the `sampled-server` test is being monitored by GenMetrics. Metrics-sampling has been activated for this server using the following monitoring preferences, `opts: [sample_rate: 0.1]`.
86 |
87 |
88 | ```
89 | Elixir 1.4.1
90 | Erlang 19.2
91 | Benchmark suite executing with the following configuration:
92 | warmup: 5.0s
93 | time: 30.0s
94 | parallel: 1
95 | inputs: none specified
96 | Estimated total run time: 70.0s
97 |
98 | Benchmarking 1-untraced-server [ repeat 500k callbacks N times within ~30s ]...
99 | Benchmarking 2-sampled--server [ repeat 500k callbacks N times within ~30s ]...
100 |
101 | Name ips average deviation median
102 | 1-untraced-server [ repeat 500k callbacks N times within ~30s ] 0.22 4.51 s ±1.42% 4.49 s
103 | 2-sampled--server [ repeat 500k callbacks N times within ~30s ] 0.21 4.84 s ±1.83% 4.85 s
104 |
105 | Comparison:
106 | 1-untraced-server [ repeat 500k callbacks N times within ~30s ] 0.22
107 | 2-sampled--server [ repeat 500k callbacks N times within ~30s ] 0.21 - 1.07x slower
108 | ```
109 |
110 | On our test hardware, both tests managed to push approximately 4.5 million messages to their respective GenServer processes within the 30 second test window. That's approximately 150k messages-per-second.
111 |
112 | In this benchmark, the `sampled-server` test performed just `1.07x slower` than the `untraced-server` test. Compared to the `traced-server` test in the previous benchmark that performed `2.40x slower` we can see the significant, positive impact activating metrics-sampling has on reducing the runtime overhead associated with GenMetrics.
113 |
114 | ## GenStage Benchmarks
115 |
116 | The following set of benchmarks are designed to test and measure the runtime impact of GenMetrics on a simple GenStage pipeline application. Benchmark specific context is provided in each case along with an analysis of the results.
117 |
118 | ### GenStage Benchmark 1. bench/trace_pipeline.exs
119 |
120 | ```
121 | mix trace_pipeline
122 | ```
123 |
124 | This benchmark runs the following tests:
125 |
126 | 1. untraced-pipeline [ repeat 500k msgs N times within ~30s ]
127 | 2. traced----pipeline [ repeat 500k msgs N times within ~30s ]
128 |
129 | Each test attempts to push as many messages as possible through a GenStage pipeline. These tests each run for approximately 30 seconds. The GenStage processes within the `untraced-pipeline` test are not being monitored by GenMetrics. The GenStage processes within the `traced-pipeline` test are being monitored by GenMetrics. As metrics-sampling has not been enabled for this benchmark *all* callbacks within the `traced-pipeline` are monitored.
130 |
131 | ```
132 | Elixir 1.4.1
133 | Erlang 19.2
134 | Benchmark suite executing with the following configuration:
135 | warmup: 5.0s
136 | time: 30.0s
137 | parallel: 1
138 | inputs: none specified
139 | Estimated total run time: 70.0s
140 |
141 | Benchmarking 1-untraced-pipeline [ repeat 500k msgs N times within ~30s ]...
142 | Benchmarking 2-traced---pipeline [ repeat 500k msgs N times within ~30s ]...
143 |
144 | Name ips average deviation median
145 | 1-untraced-pipeline [ repeat 500k msgs N times within ~30s ] 0.0643 15.55 s ±1.17% 15.55 s
146 | 2-traced---pipeline [ repeat 500k msgs N times within ~30s ] 0.0281 35.53 s ±0.00% 35.53 s
147 |
148 | Comparison:
149 | 1-untraced-pipeline [ repeat 500k msgs N times within ~30s ] 0.0643
150 | 2-traced---pipeline [ repeat 500k msgs N times within ~30s ] 0.0281 - 2.29x slower
151 | ```
152 |
153 | On our test hardware, the `untraced-pipeline` managed to push approximately 1.5 million messages through its GenStage processes within the 30 second test window. That's approximately 50k messages-per-second. The `traced-pipeline` only managed to push approximately 1 million messages through its GenStage processes. That's approximately 33k messages-per-second.
154 |
155 | The results indicate a significant runtime overhead has been introduced by the GenMetrics library. As indicated by the results the `traced-pipeline` test performed `2.29x slower`. We can directly attribute this slowdown to the runtime overhead introduced by the GenMetrics library.
156 |
157 | While not all applications require metrics-sampling to reduce the runtime overhead associated with GenMetrics, this result strongly suggests this test application is a good candidate for sampling. See the following benchmark to see the immediate and significant positive effects when sampling is activated.
158 |
159 | ### GenStage Benchmark 2. bench/sample_pipeline.exs
160 |
161 | ```
162 | mix sample_pipeline
163 | ```
164 |
165 | This benchmark runs the following tests:
166 |
167 | 1. untraced-pipeline [ repeat 500k msgs N times within ~30s ]
168 | 2. sampled-pipeline [ repeat 500k msgs N times within ~30s ]
169 |
170 | Each test attempts to push as many messages as possible through a GenStage pipeline. These tests each run for approximately 30 seconds. The GenStage processes within the `untraced-pipeline` test are not being monitored by GenMetrics. The GenStage processes within the `sampled-pipeline` test are being monitored by GenMetrics. Metrics-sampling has been activated for this pipeline using the following monitoring preferences, `opts: [sample_rate: 0.1]`.
171 |
172 | ```
173 | Elixir 1.4.1
174 | Erlang 19.2
175 | Benchmark suite executing with the following configuration:
176 | warmup: 5.0s
177 | time: 30.0s
178 | parallel: 1
179 | inputs: none specified
180 | Estimated total run time: 70.0s
181 |
182 | Benchmarking 1-untraced-pipeline [ repeat 500k msgs N times within ~30s ]...
183 | Benchmarking 2-sampled--pipeline [ repeat 500k msgs N times within ~30s ]...
184 |
185 | Name ips average deviation median
186 | 1-untraced-pipeline [ repeat 500k msgs N times within ~30s ] 0.0728 13.74 s ±2.06% 13.87 s
187 | 2-sampled--pipeline [ repeat 500k msgs N times within ~30s ] 0.0672 14.88 s ±0.99% 14.81 s
188 |
189 | Comparison:
190 | 1-untraced-pipeline [ repeat 500k msgs N times within ~30s ] 0.0728
191 | 2-sampled--pipeline [ repeat 500k msgs N times within ~30s ] 0.0672 - 1.08x slower
192 | ```
193 |
194 | On our test hardware, both tests managed to push approximately 2 million messages through their respective GenStage processes within the 30 second test window. That's approximately 67k messages-per-second.
195 |
196 | In this benchmark, the `sampled-pipeline` test performed just `1.08x slower` than the `untraced-pipeline` test. Compared to the `traced-pipeline` test in the previous benchmark that performed `2.29x slower` we can see the significant, positive impact activating metrics-sampling has on reducing the runtime overhead associated with GenMetrics.
197 |
198 | ## GenMetrics + BEAM Garbage Collection
199 |
200 | Some final remarks about GenMetrics and its memory usage profile within the BEAM.
201 |
202 | By default, when GenMetrics is enabled it collects and reports only summary metrics data. This type of metrics data collection has very little runtime overhead in terms of memory usage and should never trigger spikes in memory usage or GC.
203 |
204 | If detailed statistical metrics are activated using the `statistics: true` option, significant amounts of metrics data are collected. Activating this feature is a lot like activating a `statsd agent` directly within the BEAM. The exact amount of data collected is directly proportional to the *rate-of-callbacks* within the server or pipeline. It is therefore strongly recommended that this feature only be enabled in environments where the *rate-of-callbacks* is known to be low. Otherwise, spiked memory usage and frequent GC will occur.
205 |
206 | If the type of insights provided by statistical metrics are needed then we strongly recommend using the existing support for redirecting metrics data to an external `statsd` agent. This can be achieved using the `opts: [statistics: :statsd]` and `opts: [statistics: :datadog]` options. Just remember that activating metrics-sampling will push only the metrics that were actually monitored by GenMetrics to these agents. Any dashboard rendering these metrics data will have to account for the sampling rate in order to display total-values-over-time for metrics data. When using these external `statsd` agents GenMetrics incurs very little runtime overhead in terms of memory usage and should never trigger spikes in memory usage or GC.
207 |
--------------------------------------------------------------------------------
/PITCHME.md:
--------------------------------------------------------------------------------
1 | ## GenMetrics
2 |
3 | Elixir GenServer and GenStage Runtime Metrics
4 |
5 | Note:
6 | Provide brief background, then state agenda: GenServer + GenStage
7 | behaviours and realtime metrics collection and reporting by GenMetrics.
8 |
9 | ---
10 |
11 | ### Application Runtime Metrics
12 |
13 | - Summary Metrics
14 | - Plus optional Statistical Metrics
15 | - Delivered In-Memory, Or To STATSD Agent
16 | - For any GenServer or GenStage Application
17 | - Without requiring changes to existing code
18 |
19 | Note:
20 | Introduce GenServer, GenStage behaviours on OTP. Emphasize metrics
21 | by introspection.
22 |
23 | ---
24 |
25 | ### Hex Package Dependency
26 |
27 | ```elixir
28 | def deps do
29 | [{:gen_metrics, "~> 0.3.0"}]
30 | end
31 | ```
32 |
33 | Note:
34 | Mention detailed HexDocs documentation available on hexdocs.pm.
35 |
36 | ---
37 |
38 | ### GenServer Metrics
39 |
40 | +++
41 |
42 | #### GenServer Metrics Per Server Process
43 |
44 | - Number of `call`, `cast`, and `info` callbacks
45 | - Time taken on these callbacks
46 | - Plus optional detailed statistical metrics
47 |
48 | Note:
49 | Explain that *callbacks* are the *unit-of-work* in a GenServer.
50 | Also elaborate on differences between summary and statistical metrics.
51 |
52 | +++
53 |
54 | #### GenMetrics Activation
55 |
56 | ```elixir
57 | alias GenMetrics.GenServer.Cluster
58 |
59 | cluster = %Cluster{name: "demo",
60 | servers: [Session.Server, Logging.Server]}
61 |
62 | GenMetrics.monitor_cluster(cluster)
63 |
64 | # Here Session.Server and Logging.Server are example GenServers.
65 | ```
66 |
67 | Note:
68 | Point out that GenMetrics provides its own supervision tree.
69 |
70 | +++
71 |
72 | #### GenMetrics Sampling
73 |
74 | ```elixir
75 | alias GenMetrics.GenServer.Cluster
76 |
77 | cluster = %Cluster{name: "demo",
78 | servers: [Session.Server, Logging.Server],
79 | opts: [sample_rate: 0.2]}
80 |
81 | GenMetrics.monitor_cluster(cluster)
82 |
83 | # Here Session.Server and Logging.Server are example GenServers.
84 | ```
85 |
86 | Note:
87 | Sampling reduces runtime overhead of GenMetrics monitoring agent.
88 |
89 | +++
90 |
91 | #### GenServer Summary Metrics
92 |
93 | #### Sample Metrics Data
94 |
95 | ```elixir
96 | # Server Name: Demo.Server, PID<0.176.0>
97 |
98 | %GenMetrics.GenServer.Summary{name: Demo.Server,
99 | pid: #PID<0.176.0>,
100 | calls: 8000,
101 | casts: 34500,
102 | infos: 3333,
103 | time_on_calls: 28,
104 | time_on_casts: 161,
105 | time_on_infos: 15}
106 |
107 | # Summary timings measured in milliseconds (ms).
108 | ```
109 |
110 | Note:
111 | Provide example by explaining how *calls* and *time_on_calls* relate.
112 | +++
113 |
114 | #### GenServer Statistical Metrics
115 |
116 | #### Optional Statsd Activation
117 |
118 | ```elixir
119 | alias GenMetrics.GenServer.Cluster
120 |
121 | cluster = %Cluster{name: "demo",
122 | servers: [Session.Server, Logging.Server],
123 | opts: [statistics: :statsd]}
124 |
125 | GenMetrics.monitor_cluster(cluster)
126 |
127 | # Here Session.Server and Logging.Server are example GenServers.
128 | ```
129 |
130 | Note:
131 | Explain `:statsd` integration with analysis and visualization
132 | tools such as Grafana and Datadog.
133 |
134 | +++
135 |
136 | #### GenServer Statistical Metrics
137 |
138 | #### Optional Datadog Activation
139 |
140 | ```elixir
141 | alias GenMetrics.GenServer.Cluster
142 |
143 | cluster = %Cluster{name: "demo",
144 | servers: [Session.Server, Logging.Server],
145 | opts: [statistics: :datadog]}
146 |
147 | GenMetrics.monitor_cluster(cluster)
148 |
149 | # Here Session.Server and Logging.Server are example GenServers.
150 | ```
151 |
152 | Note:
153 | Mention `:datadog` tagging feature is automatically activated
154 | to support filtering on individual GenServer clusters.
155 |
156 | +++
157 |
158 | #### GenServer Statistical Metrics
159 |
160 | #### Optional In-Memory Activation
161 |
162 | ```elixir
163 | alias GenMetrics.GenServer.Cluster
164 |
165 | cluster = %Cluster{name: "demo",
166 | servers: [Session.Server, Logging.Server],
167 | opts: [statistics: true]}
168 |
169 | GenMetrics.monitor_cluster(cluster)
170 |
171 | # Here Session.Server and Logging.Server are example GenServers.
172 | ```
173 |
174 | Note:
175 | Mention additional *opts* such as *window_interval* and how it works.
176 |
177 | +++
178 |
179 | #### GenServer Statistical Metrics
180 |
181 | #### Sample In-Memory Metrics Data
182 |
183 | ```elixir
184 | # Server Name: Demo.Server, PID<0.176.0>
185 |
186 | # handle_call/3
187 | %GenMetrics.GenServer.Stats{callbacks: 8000,
188 | max: 149,
189 | mean: 3,
190 | min: 2,
191 | range: 147,
192 | stdev: 2,
193 | total: 25753}
194 |
195 | # Statistical timings measured in microseconds (µs).
196 | ```
197 |
198 | Note:
199 | Briefly explain how `in-memory` statistical metrics are captured
200 | and calculated. Recommend judicious use.
201 |
202 | +++
203 |
204 | #### GenServer Statistical Metrics
205 |
206 | #### Sample In-Memory Metrics Data
207 |
208 | ```elixir
209 | # Server Name: Demo.Server, PID<0.176.0>
210 |
211 | # handle_cast/2
212 | %GenMetrics.GenServer.Stats{callbacks: 34500,
213 | max: 3368,
214 | mean: 4,
215 | min: 2,
216 | range: 3366,
217 | stdev: 31,
218 | total: 141383}
219 |
220 | # Statistical timings measured in microseconds (µs).
221 | ```
222 |
223 | +++
224 |
225 | #### GenServer Statistical Metrics
226 |
227 | #### Sample In-Memory Metrics Data
228 |
229 | ```elixir
230 | # Server Name: Demo.Server, PID<0.176.0>
231 |
232 | # handle_info/2
233 | %GenMetrics.GenServer.Stats{callbacks: 3333,
234 | max: 37,
235 | mean: 4,
236 | min: 2,
237 | range: 35,
238 | stdev: 2,
239 | total: 13510}
240 |
241 | # Statistical timings measured in microseconds (µs).
242 | ```
243 |
244 | ---
245 |
246 | ### GenStage Metrics
247 |
248 | +++
249 |
250 | #### GenStage Metrics Per Stage Process
251 |
252 | - Number of `demand` and `events` callbacks
253 | - Time taken on these callbacks
254 | - Size of upstream demand
255 | - Size of events emitted to meet demand
256 | - Plus optional detailed statistical metrics
257 |
258 | Note:
259 | Briefly discuss GenStage demand, events and back-pressure.
260 |
261 | +++
262 |
263 | #### GenStage Activation
264 |
265 | ```elixir
266 | alias GenMetrics.GenStage.Pipeline
267 |
268 | pipeline = %Pipeline{name: "demo",
269 | producer: [Data.Producer],
270 | producer_consumer:
271 | [Data.Scrubber, Data.Analyzer],
272 | consumer: [Data.Consumer]}
273 |
274 | GenMetrics.monitor_pipeline(pipeline)
275 |
276 | # Here Data.* are simply example GenStages.
277 | ```
278 |
279 | Note:
280 | Mention GenMetrics monitoring supports both complete and
281 | partial pipelines.
282 |
283 | +++
284 |
285 | #### GenStage Sampling
286 |
287 | ```elixir
288 | alias GenMetrics.GenStage.Pipeline
289 |
290 | pipeline = %Pipeline{name: "demo",
291 | producer: [Data.Producer],
292 | producer_consumer:
293 | [Data.Scrubber, Data.Analyzer],
294 | consumer: [Data.Consumer],
295 | opts: [sample_rate: 0.1]}
296 |
297 | GenMetrics.monitor_pipeline(pipeline)
298 |
299 | # Here Data.* are simply example GenStages.
300 | ```
301 |
302 | Note:
303 | Sampling reduces runtime overhead of the GenMetrics monitoring agent.
304 |
305 | +++
306 |
307 | #### GenStage Summary Metrics
308 |
309 | #### Sample Metrics Data
310 |
311 | ```elixir
312 | # Stage Name: Data.Producer, PID<0.195.0>
313 |
314 | %GenMetrics.GenStage.Summary{stage: Data.Producer,
315 | pid: #PID<0.195.0>,
316 | callbacks: 9536,
317 | time_on_callbacks: 407,
318 | demand: 4768000,
319 | events: 4768000}
320 |
321 | # Summary timings measured in milliseconds (ms).
322 | ```
323 |
324 | Note:
325 | Explain *callbacks*, *demand*, and *events* concepts and
326 | how they are reflected in the metrics data shown.
327 |
328 | +++
329 |
330 | #### GenStage Statistical Metrics
331 |
332 | #### Optional Statsd Activation
333 |
334 | ```elixir
335 | alias GenMetrics.GenStage.Pipeline
336 |
337 | pipeline = %Pipeline{name: "demo",
338 | producer_consumer:
339 | [Data.Scrubber, Data.Analyzer],
340 | opts: [statistics: :statsd]}
341 |
342 | GenMetrics.monitor_pipeline(pipeline)
343 |
344 | # Here Data.Scrubber and Data.Analyzer are example GenStages.
345 | ```
346 |
347 | Note:
348 | Explain `:statsd` integration with analysis and visualization
349 | tools such as Grafana and Datadog.
350 |
351 | +++
352 |
353 | #### GenStage Statistical Metrics
354 |
355 | #### Optional Datadog Activation
356 |
357 | ```elixir
358 | alias GenMetrics.GenStage.Pipeline
359 |
360 | pipeline = %Pipeline{name: "demo",
361 | producer_consumer:
362 | [Data.Scrubber, Data.Analyzer],
363 | opts: [statistics: :datadog]}
364 |
365 | GenMetrics.monitor_pipeline(pipeline)
366 |
367 | # Here Data.Scrubber and Data.Analyzer are example GenStages.
368 | ```
369 |
370 | Note:
371 | Mention `:datadog` tagging feature is automatically activated
372 | to support filtering on individual GenStage pipelines.
373 |
374 | +++
375 |
376 | #### GenStage Statistical Metrics
377 |
378 | #### Optional In-Memory Activation
379 |
380 | ```elixir
381 | alias GenMetrics.GenStage.Pipeline
382 |
383 | pipeline = %Pipeline{name: "demo",
384 | producer_consumer:
385 | [Data.Scrubber, Data.Analyzer],
386 | opts: [statistics: true]}
387 |
388 | GenMetrics.monitor_pipeline(pipeline)
389 |
390 | # Here Data.Scrubber and Data.Analyzer are example GenStages.
391 | ```
392 |
393 | Note:
394 | Again mention availability of *window_interval* option.
395 |
396 | +++
397 |
398 | #### GenStage Statistical Metrics
399 |
400 | #### Sample In-Memory Metrics Data
401 |
402 | ```elixir
403 | # Stage Name: Data.Producer, PID<0.195.0>
404 |
405 | # callback demand
406 | %GenMetrics.GenStage.Stats{callbacks: 9536,
407 | max: 500,
408 | mean: 500,
409 | min: 500,
410 | range: 0,
411 | stdev: 0,
412 | total: 4768000}
413 |
414 | # Statistical timings measured in microseconds (µs).
415 | ```
416 |
417 | Note:
418 | Note GenStage summary metrics split across *demand*, *events*
419 | and *timings* as we will see on the following slides.
420 |
421 | +++
422 |
423 | #### GenStage Statistical Metrics
424 |
425 | #### Sample In-Memory Metrics Data
426 |
427 | ```elixir
428 | # callback events
429 | %GenMetrics.GenStage.Stats{callbacks: 9536,
430 | max: 500,
431 | mean: 500,
432 | min: 500,
433 | range: 0,
434 | stdev: 0,
435 | total: 4768000}
436 |
437 | # Statistical timings measured in microseconds (µs).
438 | ```
439 |
440 | +++
441 |
442 | #### GenStage Statistical Metrics
443 |
444 | #### Sample In-Memory Metrics Data
445 |
446 | ```elixir
447 | # callback timings
448 | %GenMetrics.GenStage.Stats{callbacks: 9536,
449 | max: 2979,
450 | mean: 42,
451 | min: 24,
452 | range: 2955,
453 | stdev: 38,
454 | total: 403170}
455 |
456 | # Statistical timings measured in microseconds (µs).
457 | ```
458 |
459 | ---
460 |
461 | ### GenMetrics Reporting
462 |
463 | - Metrics are published periodically
464 | - By a dedicated reporting process
465 | - Or by a statsd agent
466 | - Any application can subscribe for metrics events
467 | - Then aggregate, render, persist, etc metrics data
468 |
469 | Note:
470 | Emphasize separation of metrics collection, reporting, and consumption.
471 |
472 | ---
473 |
474 | ### GenServer Metrics Reporting
475 |
476 | +++
477 |
478 | #### GenMetrics.GenServer.Reporter
479 |
480 | A GenStage Broadcasting Producer
481 |
482 | For In-Memory Metrics Data
483 |
484 | Note:
485 | Clarify that the producer name is registered by GenMetrics.
486 |
487 | +++
488 |
489 | #### Subscribing For GenMetrics Events
490 |
491 | ```elixir
492 | def init(:ok) do
493 |
494 | {:consumer, :state_does_not_matter,
495 | subscribe_to:
496 | [{GenMetrics.GenServer.Reporter, max_demand: 1}]}
497 |
498 | end
499 | ```
500 |
501 | Note:
502 | Mention the reporting process is a *BroadcastDispatcher*
503 | producer so there is opportunity for filtering using *selector*.
504 |
505 | +++
506 |
507 | #### Handling GenMetrics Events
508 |
509 | ```elixir
510 | def handle_events([metrics | _], _from, state) do
511 |
512 | for summary <- metrics.summary do
513 | Logger.info "GenMetrics.Consumer: #{inspect summary}"
514 | end
515 |
516 | {:noreply, [], state}
517 |
518 | end
519 | ```
520 |
521 | Note:
522 | Explain metrics can be analyzed or processed in any number
523 | of ways including logging, persistence, Statsd, Grafana,
524 | Datadog, etc.
525 |
526 | ---
527 |
528 | ### GenStage Metrics Reporting
529 |
530 | +++
531 |
532 | #### GenMetrics.GenStage.Reporter
533 |
534 | A GenStage Broadcasting Producer
535 |
536 | For In-Memory Metrics Data
537 |
538 | +++
539 |
540 | #### Subscribing For GenMetrics Events
541 |
542 | ```elixir
543 | def init(:ok) do
544 |
545 | {:consumer, :state_does_not_matter,
546 | subscribe_to:
547 | [{GenMetrics.GenStage.Reporter, max_demand: 1}]}
548 |
549 | end
550 | ```
551 |
552 | Note:
553 | Again clarify that the producer name is registered by GenMetrics.
554 |
555 | +++
556 |
557 | #### Handling GenMetrics Events
558 |
559 | ```elixir
560 | def handle_events([metrics | _], _from, state) do
561 |
562 | for summary <- metrics.summary do
563 | Logger.info "GenMetrics.Consumer: #{inspect summary}"
564 | end
565 |
566 | {:noreply, [], state}
567 |
568 | end
569 | ```
570 |
571 | ---
572 |
573 | ### GenMetrics is open source
574 |
575 | - The Hex Docs
576 | - The GitHub Repo
577 | - Welcome feedback, PRs, issues, etc.
578 |
579 | Note:
580 | Encourage the audience to get involved, test, report, contribute.
581 |
582 |
--------------------------------------------------------------------------------
/lib/gen_metrics.ex:
--------------------------------------------------------------------------------
1 | defmodule GenMetrics do
2 |
3 | @moduledoc """
4 | Runtime metrics for `GenServer` and `GenStage` applications.
5 |
6 | **Important:**
7 | The GenMetrics library is not suitable for use within long-running
8 | production environments. For further details, see the [benchmarks
9 | performance guide](https://github.com/onetapbeyond/gen_metrics#benchmarks).
10 |
11 | This library supports the collection and publication of GenServer and GenStage
12 | runtime metrics. Metrics data are generated by an introspection agent. No
13 | instrumentation is required within the GenServer or GenStage library
14 | or within your application source code.
15 |
16 | GenMetrics data can be used to reveal insights into live application
17 | performance and identify patterns of behaviour within an application over
18 | time. Metrics data can be used to drive any number of operational systems,
19 | including realtime dashboards, monitoring and alerting systems.
20 |
21 | By default, metrics are published by a dedicated GenMetrics reporting process.
22 | Any application can subscribe to this process in order to aggregate, render,
23 | persist, or generally handle metrics data. Metrics data can also be pushed
24 | directly to a `statsd` agent which makes it possible to analyze, and visualize
25 | the metrics within existing tools and services like `Grafana` and `Datadog`.
26 |
27 | The metrics data collected by this library includes both summary metrics and
28 | optional detailed statistical metrics. Summary metrics and statistical
29 | metrics for GenServer and GenStage applications are described in detail below.
30 |
31 | ## GenMetrics Installation
32 |
33 | Simply add `gen_metrics` as a `deps` dependency in your Mixfile.
34 |
35 | ## GenMetrics for GenServer Applications
36 |
37 | Any application using the `GenServer` behaviour can immediately benefit from
38 | the insights afforded by GenMetrics. The following sections explain how.
39 | For `GenStage` applications, see the docs
40 | [here](#module-genmetrics-for-genstage-applications).
41 |
42 | ### GenServer Metrics Activation
43 |
44 | A `GenMetrics.GenServer.Cluster` struct is used to identify one or more
45 | GenServer modules that become candidates for metrics collection. For example,
46 | assuming your application has a `Session.Server` and a `Logging.Server` you
47 | can activate metrics collection on both GenServers as follows:
48 |
49 | ```
50 | alias GenMetrics.GenServer.Cluster
51 | cluster = %Cluster{name: "demo", servers: [Session.Server, Logging.Server]}
52 | GenMetrics.monitor_cluster(cluster)
53 | ```
54 |
55 | The *cluster* in this context is simply a named set of one or more GenServer
56 | modules about which you would like to collect metrics data. Metrics data
57 | are collected on server processes executing on the local node.
58 |
59 | GenMetrics will instantly attach to running GenServer processes associated
60 | with your cluster. If there are no running server processes associated with
61 | your cluster when `GenMetrics.monitor_cluster/1` is called, GenMetrics will
62 | monitor for process activation and automatically begin metrics collection
63 | for each new process.
64 |
65 | ### GenServer Metrics Sampling
66 |
67 | Sampling metrics is an effective way to collect and report metrics for any
68 | server while minimizing the runtime overhead introduced by the GenMetrics
69 | monitoring agent.
70 |
71 | When sampling is disabled, metrics data reflect the exact behaviour of the
72 | processes being monitored. When sampling is enabled, metrics data reflect
73 | an approximation of the behaviour of the processes being monitored.
74 |
75 | Given an application with the following GenServers: `Session.Server`,
76 | `Logging.Server`, activate metrics-sampling for the server cluster as follows:
77 |
78 | ```elixir
79 | alias GenMetrics.GenServer.Cluster
80 | cluster = %Cluster{name: "demo",
81 | servers: [Session.Server, Logging.Server],
82 | opts: [sample_rate: 0.3]}
83 | GenMetrics.monitor_cluster(cluster)
84 | ```
85 |
86 | ### GenServer Summary Metrics
87 |
88 | Summary metrics are collected for activity within the following GenServer
89 | callbacks:
90 |
91 | - `GenServer.handle_call/3`
92 | - `GenServer.handle_cast/2`
93 | - `GenServer.handle_info/2`
94 |
95 | GenMetrics collects both the number of callbacks and the time taken on
96 | those callbacks for each of the server processes within your cluster.
97 |
98 | Summary metrics are aggregated across a periodic time interval, known as a
99 | *window*. By default, the window interval is `1000 ms`. This interval may be
100 | customized using the `window_interval` option on `GenMetrics.monitor_cluster/1`
101 | as shown here:
102 |
103 | ```
104 | alias GenMetrics.GenServer.Cluster
105 | cluster = %Cluster{name: "demo",
106 | servers: [Session.Server, Logging.Server],
107 | opts: [window_interval: 5000]}
108 | GenMetrics.monitor_cluster(cluster)
109 | ```
110 |
111 | The following are sample summary metrics reported for a single window interval
112 | on a GenServer process:
113 |
114 | ```
115 | # Server Name: Demo.Server, PID<0.176.0>
116 |
117 | %GenMetrics.GenServer.Summary{name: Demo.Server,
118 | pid: #PID<0.176.0>,
119 | calls: 8000,
120 | casts: 34500,
121 | infos: 3333,
122 | time_on_calls: 28,
123 | time_on_casts: 161,
124 | time_on_infos: 15}
125 | ```
126 |
127 | All timings reported on summary metrics are reported in `milliseconds (ms)`.
128 | For example, during this sample window interval, the `handle_cast/2` callback
129 | was executed `34500` times. The total time spent processing those callbacks
130 | was just `161 ms`.
131 |
132 | ### GenServer Statistical Metrics
133 |
134 | Summary metrics provide near-realtime insights into the runtime behaviour
135 | of any GenServer application. However, sometimes more fine grained metrics
136 | data may be required to truly understand the subtleties of your application's
137 | runtime behaviour. To cater for those cases, GenMetrics supports optional
138 | statistical metrics.
139 |
140 | Statistical metrics may be activated using the `statistics` option on
141 | `GenMetrics.monitor_cluster/1`. GenMetrics `in-memory` metrics are activated
142 | as shown here:
143 |
144 | ```
145 | alias GenMetrics.GenServer.Cluster
146 | cluster = %Cluster{name: "demo",
147 | servers: [Session.Server, Logging.Server],
148 | opts: [statistics: true]}
149 | GenMetrics.monitor_cluster(cluster)
150 | ```
151 |
152 | Activating in-memory statistical metrics is a lot like activating a
153 | `statsd agent` directly within the BEAM. This can impact the runtime
154 | performance of some applications so redirecting metrics to an external
155 | agent is typically recommended.
156 |
157 | Redirecting statistical metrics to a `statsd` agent simply requires the
158 | following `opts` configuration:
159 |
160 | ```
161 | opts: [statistics: :statsd]}
162 | ```
163 |
164 | Redirecting statistical metrics to the `Datadog` statsd-agent requires the
165 | following `opts` configuration:
166 |
167 | ```
168 | opts: [statistics: :datadog]}
169 | ```
170 |
171 | Metrics directed to Datadog include tagging data which makes it very easy
172 | to subset and query the metrics that you need to monitor.
173 |
174 | The following are sample `in-memory` statistical metrics reported for a
175 | single window interval on a GenServer process:
176 |
177 | ```
178 | # Server Name: Demo.Server, PID<0.176.0>
179 |
180 | # handle_call/3
181 | %GenMetrics.GenServer.Stats{callbacks: 8000,
182 | max: 149,
183 | mean: 3,
184 | min: 2,
185 | range: 147,
186 | stdev: 2,
187 | total: 25753}
188 |
189 | # handle_cast/2
190 | %GenMetrics.GenServer.Stats{callbacks: 34500,
191 | max: 3368,
192 | mean: 4,
193 | min: 2,
194 | range: 3366,
195 | stdev: 31,
196 | total: 141383}
197 |
198 | # handle_info/2
199 | %GenMetrics.GenServer.Stats{callbacks: 3333,
200 | max: 37,
201 | mean: 4,
202 | min: 2,
203 | range: 35,
204 | stdev: 2,
205 | total: 13510}
206 | ```
207 |
208 | All timings reported on `in-memory` statistical metrics are reported in
209 | `microseconds (µs)`. For example, during this sample window interval, the
210 | `handle_cast/2` callback was executed `34500` times. The total time spent
211 | processing those callbacks was `141383 µs`. The `mean` time taken per
212 | callback was `4 µs` while the `standard deviation` around the mean was `31 µs`.
213 |
214 | *Note:* Under heavy load the generation of `in-memory` statistical metrics can
215 | become computationally expensive. It is therefore recommended that
216 | `in-memory` metrics be activated in production environments *judiciously*.
217 | These concerns are negligible when redirecting statistical metrics to
218 | `:statsd` or `:datadog` as custom sampling-rates may be configured.
219 |
220 |
221 | ### GenServer Reporting Metrics
222 |
223 | Runtime `in-memory` metrics for servers in your cluster are published via
224 | a dedicated reporting process. The reporting process is registered locally
225 | by the GenMetrics library at startup. This process is registered under the
226 | name `GenMetrics.GenServer.Reporter`.
227 |
228 | The reporting process is a `GenStage` producer that broadcasts metrics data.
229 | Any number of consumers can subscribe to this process in order to handle
230 | metrics data.
231 |
232 | Note, if you are redirecting statistical metrics to `:statsd` or `:datadog`
233 | there is no need to subscribe to this reporting process.
234 |
235 | In order to subscribe, a simple GenStage `:consumer` can initialize itself
236 | to receive events from the reporting process as follows:
237 |
238 | ```
239 | def init(:ok) do
240 | # Subscribe as consumer to the GenMetrics.GenServer.Reporter producer.
241 | {:consumer, :state_does_not_matter,
242 | subscribe_to: [{GenMetrics.GenServer.Reporter, max_demand: 1}]}
243 | end
244 | ```
245 |
246 | On receipt of events from the reporting process, metrics data can be extracted
247 | for processing to suit any need. The following example demonstrates simple
248 | logging of summary metrics data:
249 |
250 | ```
251 | def handle_events([metrics | _], _from, state) do
252 | # Log summary metrics for each server within the GenServer cluster.
253 | for summary <- metrics.summary do
254 | Logger.info "GenMetrics.Consumer: cluster.server summary=\#{inspect summary}"
255 | end
256 | {:noreply, [], state}
257 | end
258 | ```
259 |
260 | ## GenMetrics for GenStage Applications
261 |
262 | Any application using the `GenStage` behaviour can immediately benefit from
263 | the insights afforded by GenMetrics. The following sections explain how. For
264 | `GenServer` applications, see the docs
265 | [here](#module-genmetrics-for-genserver-applications).
266 |
267 | ### GenStage Metrics Activation
268 |
269 | A `GenMetrics.GenStage.Pipeline` struct is used to identify one or more
270 | GenStages that become candidates for metrics collection. You can
271 | identify a complete pipeline including all `:producers`, `:producer_consumers`
272 | and `:consumers`, or any subset of stages within a pipeline.
273 |
274 | For example, assuming your GenStage application has a `Data.Producer`,
275 | a `Data.Scrubber`, a `Data.Analyzer` and a `Data.Consumer`, you can activate
276 | metrics collection for the entire pipeline as follows:
277 |
278 | ```
279 | alias GenMetrics.GenStage.Pipeline
280 | pipeline = %Pipeline{name: "demo",
281 | producer: [Data.Producer],
282 | producer_consumer: [Data.Scrubber, Data.Analyzer],
283 | consumer: [Data.Consumer]}
284 | GenMetrics.monitor_pipeline(pipeline)
285 | ```
286 |
287 | Alternatively, if you only wanted to activate metrics collection for the
288 | `:producer_consumer` stages within the pipeline you can do the following:
289 |
290 | ```
291 | alias GenMetrics.GenStage.Pipeline
292 | pipeline = %Pipeline{name: "demo",
293 | producer_consumer: [Data.Scrubber, Data.Analyzer]}
294 | GenMetrics.monitor_pipeline(pipeline)
295 | ```
296 |
297 | The *pipeline* in this context is simply a named set of one or more GenStage
298 | modules about which you would like to collect metrics data. Metrics data are
299 | collected on stage processes executing on the local node.
300 |
301 | GenMetrics will instantly attach to running GenStage processes associated
302 | with your pipeline. If there are no running GenStage processes associated with
303 | your pipeline when `GenMetrics.monitor_pipeline/1` is called, GenMetrics will
304 | monitor for process activation and automatically begin metrics collection
305 | for each new process.
306 |
307 |
308 | ### GenStage Metrics Sampling
309 |
310 | Sampling metrics is an effective way to collect and report metrics for
311 | any pipeline while minimizing the runtime overhead introduced by
312 | the GenMetrics monitoring agent.
313 |
314 | When sampling is disabled, metrics data reflect the exact behaviour of the
315 | processes being monitored. When sampling is enabled, metrics data reflect
316 | an approximation of the behaviour of the processes being monitored.
317 |
318 | Given a GenStage application with the following stages: `Data.Producer`,
319 | `Data.Scrubber`, `Data.Analyzer` and a `Data.Consumer`, activate
320 | metrics-sampling for the entire pipeline as follows:
321 |
322 | ```elixir
323 | alias GenMetrics.GenStage.Pipeline
324 | pipeline = %Pipeline{name: "demo",
325 | producer: [Data.Producer],
326 | producer_consumer: [Data.Scrubber, Data.Analyzer],
327 | consumer: [Data.Consumer],
328 | opts: [sample_rate: 0.1]}
329 | GenMetrics.monitor_pipeline(pipeline)
330 | ```
331 |
332 | ### GenStage Summary Metrics
333 |
334 | Summary metrics are collected for activity within the following GenStage
335 | callbacks:
336 |
337 | - `GenStage.handle_demand/2`
338 | - `GenStage.handle_events/3`
339 | - `GenStage.handle_call/3`
340 | - `GenStage.handle_cast/2`
341 |
342 | GenMetrics collects the number of callbacks, the time taken on those
343 | callbacks, the size of upstream demand, and the number of events generated
344 | in response to that demand, for each of the stages within your pipeline.
345 |
346 | Summary metrics are aggregated across a periodic time interval, known as a
347 | *window*. By default, the window interval is `1000 ms`. This interval may be
348 | customized using the `window_interval` option on
349 | `GenMetrics.monitor_pipeline/1` as shown here:
350 |
351 | ```
352 | alias GenMetrics.GenStage.Pipeline
353 | pipeline = %Pipeline{name: "demo",
354 | producer_consumer: [Data.Scrubber, Data.Analyzer],
355 | opts: [window_interval: 5000]}
356 | GenMetrics.monitor_pipeline(pipeline)
357 | ```
358 |
359 | The following are sample summary metrics reported for a single window interval
360 | on a GenStage process:
361 |
362 | ```
363 | # Stage Name: Data.Producer, PID<0.195.0>
364 |
365 | %GenMetrics.GenStage.Summary{stage: Data.Producer,
366 | pid: #PID<0.195.0>,
367 | callbacks: 9536,
368 | time_on_callbacks: 407,
369 | demand: 4768000,
370 | events: 4768000}
371 | ```
372 |
373 | All timings reported on summary metrics are reported in `milliseconds (ms)`.
374 | For example, during this sample window interval, `9536` callbacks were
375 | handled by the `Data.Producer` stage. The total time spent processing those
376 | callbacks was `407 ms`.
377 |
378 | During that time, total upstream demand on the stage was `4768000`. A total of
379 | `4768000` events were also generated and emitted by the stage. This tells us
380 | that the stage was able to fully meet upstream demand during this specific
381 | sample window interval.
382 |
383 | ### GenStage Statistical Metrics
384 |
385 | Summary metrics provide near-realtime insights into the runtime behaviour
386 | of any GenStage application. However, sometimes more fine grained metrics
387 | data may be required to truly understand the subtleties of your application's
388 | runtime behaviour. To cater for those cases, GenMetrics supports optional
389 | statistical metrics.
390 |
391 | Statistical metrics may be activated using the `statistics` option on
392 | `GenMetrics.monitor_pipeline/1`. GenMetrics `in-memory` metrics are activated
393 | as shown here:
394 |
395 | ```
396 | alias GenMetrics.GenStage.Pipeline
397 | pipeline = %Pipeline{name: "demo",
398 | producer_consumer: [Data.Scrubber, Data.Analyzer],
399 | opts: [statistics: true]}
400 | GenMetrics.monitor_pipeline(pipeline)
401 | ```
402 |
403 | Redirecting statistical metrics to a `statsd` agent simply requires the
404 | following `opts` configuration:
405 |
406 | ```
407 | opts: [statistics: :statsd]}
408 | ```
409 |
410 | Redirecting statistical metrics to the `Datadog` statsd-agent requires the
411 | following `opts` configuration:
412 |
413 | ```
414 | opts: [statistics: :datadog]}
415 | ```
416 |
417 | Metrics directed to Datadog include tagging data which makes it very easy
418 | to subset and query the metrics that you need to monitor.
419 |
420 | The following are sample `in-memory` statistical metrics reported for a
421 | single window interval on a GenStage process:
422 |
423 | ```
424 | # Stage Name: Data.Producer, PID<0.195.0>
425 |
426 | # callback demand
427 | %GenMetrics.GenStage.Stats{callbacks: 9536,
428 | max: 500,
429 | mean: 500,
430 | min: 500,
431 | range: 0,
432 | stdev: 0,
433 | total: 4768000}
434 | # callback events
435 | %GenMetrics.GenStage.Stats{callbacks: 9536,
436 | max: 500,
437 | mean: 500,
438 | min: 500,
439 | range: 0,
440 | stdev: 0,
441 | total: 4768000}
442 |
443 | # callback timings
444 | %GenMetrics.GenStage.Stats{callbacks: 9536,
445 | max: 2979,
446 | mean: 42,
447 | min: 24,
448 | range: 2955,
449 | stdev: 38,
450 | total: 403170}
451 | ```
452 |
453 | All timings reported on `in-memory` statistical metrics are reported in
454 | `microseconds (µs)`. For example, during this sample window interval, `9536`
455 | callbacks were handled by the `Data.Producer` stage. The total time spent
456 | processing those callbacks was `403170 µs`. The `mean` time taken per
457 | callback was `42 µs` while the `standard deviation` around the mean was
458 | `38 µs`.
459 |
460 | Here, the total upstream demand of `4768000` equalled the total events emitted
461 | by the stage. This tells us that the stage was able to fully meet upstream
462 | demand during this specific sample window interval.
463 |
464 | *Note:* Under heavy load the generation of `in-memory` statistical metrics can
465 | become computationally expensive. It is therefore recommended that
466 | `in-memory` metrics be activated in production environments *judiciously*.
467 | These concerns are negligible when redirecting statistical metrics to
468 | `:statsd` or `:datadog` as custom sampling-rates may be configured.
469 |
470 | ### GenStage Reporting Metrics
471 |
472 | Runtime `in-memory` metrics for stages in your pipeline are published
473 | via a dedicated reporting process. The reporting process is registered
474 | locally by the GenMetrics library at startup. This process is registered
475 | under the name `GenMetrics.GenStage.Reporter`.
476 |
477 | The reporting process itself is a `GenStage` producer that broadcasts metrics
478 | data. Any number of consumers can subscribe to this process in order to handle
479 | metrics data.
480 |
481 | Note, if you are redirecting statistical metrics to `:statsd` or `:datadog`
482 | there is no need to subscribe to this reporting process.
483 |
484 | In order to subscribe, a simple GenStage `:consumer` can initialize itself
485 | to receive events from the reporting process as follows:
486 |
487 | ```
488 | def init(:ok) do
489 | # Subscribe as consumer to the GenMetrics.GenStage.Reporter producer.
490 | {:consumer, :state_does_not_matter,
491 | subscribe_to: [{GenMetrics.GenStage.Reporter, max_demand: 1}]}
492 | end
493 | ```
494 |
495 | On receipt of events from the reporting process, metrics data can be extracted
496 | for processing to suit any need. The following example demonstrates simple
497 | logging of summary metrics data:
498 |
499 | ```
500 | def handle_events([metrics | _], _from, state) do
501 | # Log summary metrics for each stage within the GenStage pipeline.
502 | for summary <- metrics.summary do
503 | Logger.info "GenMetrics.Consumer: pipeline.stage summary=\#{inspect summary}"
504 | end
505 | {:noreply, [], state}
506 | end
507 | ```
508 |
509 | """
510 |
511 | alias GenMetrics.GenServer
512 | alias GenMetrics.GenStage
513 | alias GenMetrics.GenServer.Cluster
514 | alias GenMetrics.GenStage.Pipeline
515 |
516 | @doc """
517 | Activate metrics collection and publishing for one or more GenServers.
518 |
519 | ## Example Usage
520 |
521 | Assuming an application has a `Session.Server` and a `Logging.Server` you
522 | can activate metrics collection on both GenServers as follows:
523 |
524 | ```
525 | alias GenMetrics.GenServer.Cluster
526 | cluster = %Cluster{name: "demo",
527 | servers: [Session.Server, Logging.Server],
528 | opts: [window_interval: 5000]}
529 | GenMetrics.monitor_cluster(cluster)
530 | ```
531 |
532 | ## Cluster Validation
533 |
534 | When this function is called the GenMetrics library checks and verifies
535 | the following conditions are met:
536 |
537 | 1. All server modules specified on the cluster can be located and loaded
538 | 1. All server modules specified on the cluster implement the GenServer
539 | behaviour
540 |
541 | If any module in the cluster does not meet these conditions the
542 | function terminates with a `:bad_cluster` response and supporting error
543 | messages.
544 |
545 | ## Metrics Reporting
546 |
547 | By default, metrics data gathered on your cluster are maintained `in-memory`
548 | and reported by a dedicated reporting process. However, metrics data can
549 | be redirected to `:statsd` or `:datadog` using the `statistics` configuration
550 | option on this call.
551 |
552 | For example: redirect your cluster metrics data to the `Datadog` service as
553 | follows:
554 |
555 | ```
556 | alias GenMetrics.GenServer.Cluster
557 | cluster = %Cluster{name: "demo",
558 | servers: [Session.Server, Logging.Server],
559 | opts: [statistics: :datadog]}
560 | GenMetrics.monitor_cluster(cluster)
561 | ```
562 |
563 | """
564 | @spec monitor_cluster(%Cluster{}) ::
565 | {:ok, pid} | {:error, :bad_server, [String.t]} # NOTE(review): the @doc above names `:bad_cluster`, not `:bad_server` — confirm which tag is actually returned
566 | def monitor_cluster(%Cluster{} = cluster) do
567 | Supervisor.start_child(GenServer.Supervisor, [cluster]) # `GenServer` is the `GenMetrics.GenServer` alias declared above, so this targets GenMetrics.GenServer.Supervisor
568 | end
569 |
570 | @doc """
571 | Activate metrics collection and publishing for one or more stages
572 | within a GenStage pipeline.
573 |
574 | ## Example Usage
575 |
576 | Assuming a GenStage application has a `Data.Producer`, a `Data.Scrubber`,
577 | a `Data.Analyzer` and a `Data.Consumer`, you can activate metrics
578 | collection for the entire pipeline as follows:
579 |
580 | ```
581 | alias GenMetrics.GenStage.Pipeline
582 | pipeline = %Pipeline{name: "demo",
583 | producer: [Data.Producer],
584 | producer_consumer: [Data.Scrubber, Data.Analyzer],
585 | consumer: [Data.Consumer]}
586 | GenMetrics.monitor_pipeline(pipeline)
587 | ```
588 |
589 | ## Pipeline Validation
590 |
591 | When this function is called the GenMetrics library checks and verifies
592 | the following conditions are met:
593 |
594 | 1. All stage modules specified on the pipeline can be located and loaded
595 | 1. All stage modules specified on the pipeline implement the GenStage behaviour
596 |
597 | If any module in the pipeline does not meet these conditions the
598 | function terminates with a `:bad_pipeline` response and supporting error
599 | messages.
600 |
601 |
602 | ## Metrics Reporting
603 |
604 | By default, metrics data gathered on your pipeline are maintained `in-memory`
605 | and reported by a dedicated reporting process. However, metrics data can
606 | be redirected to `:statsd` or `:datadog` using the `statistics` configuration
607 | option on this call.
608 |
609 | For example: redirect your pipeline metrics data to a `statsd` agent as
610 | follows:
611 |
612 | ```
613 | alias GenMetrics.GenStage.Pipeline
614 | pipeline = %Pipeline{name: "demo",
615 | producer: [Data.Producer],
616 | producer_consumer: [Data.Scrubber, Data.Analyzer],
617 | consumer: [Data.Consumer],
618 | opts: [statistics: :statsd]}
619 | GenMetrics.monitor_pipeline(pipeline)
620 | ```
621 | """
622 | @spec monitor_pipeline(%Pipeline{}) ::
623 | {:ok, pid} | {:error, :bad_pipeline, [String.t]}
624 | def monitor_pipeline(%Pipeline{} = pipeline) do
625 | Supervisor.start_child(GenStage.Supervisor, [pipeline]) # `GenStage` is the `GenMetrics.GenStage` alias declared above, so this targets GenMetrics.GenStage.Supervisor
626 | end
627 |
628 | end
629 |
--------------------------------------------------------------------------------