defmodule DelayedSup.Default do
  @moduledoc false
  # Pass-through `:supervisor` callback module: the value handed to `init/1`
  # is returned untouched, so the caller supplies the ready-made init result.
  @behaviour :supervisor

  def init(sup_spec), do: sup_spec
end
defmodule DelayedServer do
  @moduledoc """
  Intermediate `GenServer` that starts and links a target process, relays
  calls, casts and plain messages to it, and postpones its own death so that it
  lives at least `options[:delay]` milliseconds.

  When it finally stops, its exit reason is `{:delayed_death, lifetime, reason}`,
  where `lifetime` is the number of milliseconds measured between the start
  attempt and the moment the failure was detected.
  """
  use GenServer
  require Logger

  @doc """
  starts process `apply(mod,options[:function] || :start_link,args)` but
  - proc death can only occur `options[:delay]` minimum after process creation
  - on sup termination: if proc exit takes longer than `options[:shutdown]`, then brutal kill it
    (options[:shutdown] is equivalent to the sup child_spec one: :brutal_kill | int_timeout | :infinity)

  Other options: `:name` (label used in log messages) and `:call_timeout`
  (timeout in ms of the calls relayed to the target process, default `5000`).
  """
  def start_link(mod, args, options \\ []),
    do: GenServer.start_link(__MODULE__, {mod, args, options})

  # Starts the target process. A failed start does NOT fail this server: the
  # failure is turned into a delayed death so the minimum lifetime still holds.
  def init({mod, args, options}) do
    Process.flag(:trap_exit, true)
    delay = options[:delay] || 100
    shutdown = options[:shutdown] || 100
    fun = options[:function] || :start_link
    name = options[:name] || inspect({mod, fun})
    call_timeout = options[:call_timeout] || 5000
    Logger.debug("starting #{name} with delay of #{delay}")

    state = %{
      name: name,
      delay: delay,
      started: now_ms(),
      pid: nil,
      shutdown: shutdown,
      call_timeout: call_timeout
    }

    case apply(mod, fun, args) do
      {:ok, pid} -> {:ok, %{state | pid: pid}}
      err -> {:ok, delayed_death(err, state)}
    end
  end

  @doc """
  Schedules the `{:die, reason, lifetime}` message that stops this server once
  the remaining part of `state.delay` has elapsed (immediately when the delay
  is already over), and returns `state` with `pid` reset to `nil`.
  """
  def delayed_death(reason, state) do
    lifetime = now_ms() - state.started
    # `self()` with parentheses: the bare `self` form is not a valid call in current Elixir.
    Process.send_after(self(), {:die, reason, lifetime}, max(state.delay - lifetime, 0))
    %{state | pid: nil}
  end

  def handle_call(:delayed_pid, _from, state), do: {:reply, state.pid, state}

  # The target is already dead and we are only waiting for the delay to elapse:
  # relaying would exit with :noproc and kill this server before its delay.
  def handle_call(_req, _from, %{pid: nil} = state), do: {:reply, {:error, :noproc}, state}

  def handle_call(req, _from, state),
    do: {:reply, GenServer.call(state.pid, req, state.call_timeout), state}

  # Nothing to relay to while waiting for the delayed death.
  def handle_cast(_req, %{pid: nil} = state), do: {:noreply, state}

  def handle_cast(req, state) do
    GenServer.cast(state.pid, req)
    {:noreply, state}
  end

  def handle_info({:die, reason, lifetime}, state),
    do: {:stop, {:delayed_death, lifetime, reason}, state}

  # Death is already scheduled: drop everything else. This covers the stale
  # `{:EXIT, ...}` left in the mailbox by a failed linked start (which would
  # otherwise arm a second timer) and any message that `send(nil, msg)` would
  # crash on.
  def handle_info(_msg, %{pid: nil} = state), do: {:noreply, state}

  def handle_info({:EXIT, _pid, reason}, state) do
    Logger.info("Delayed proc #{state.name} failed : #{inspect reason}")
    {:noreply, delayed_death(reason, state)}
  end

  def handle_info(msg, state) do
    send(state.pid, msg)
    {:noreply, state}
  end

  # Shutdown handling mirrors the supervisor child_spec `shutdown` semantics.
  def terminate(_reason, %{pid: nil}), do: :ok
  def terminate(_reason, %{pid: pid, shutdown: :brutal_kill}), do: Process.exit(pid, :kill)

  def terminate(reason, %{pid: pid, shutdown: shutdown, name: name}) do
    Process.exit(pid, reason)

    receive do
      {:EXIT, ^pid, _} -> :ok
    after
      shutdown ->
        Logger.warn("Delayed server #{name} failed to terminate within #{shutdown}, killing it brutally")
        Process.exit(pid, :kill)

        receive do
          {:EXIT, ^pid, _} -> :ok
        end
    end
  end

  # Monotonic clock: a wall-clock adjustment must not distort (or make
  # negative) the measured lifetime. The time unit atom is kept as in the
  # original code.
  defp now_ms, do: :erlang.monotonic_time(:milli_seconds)
end
7 | Agent.start_link(fn->[] end, name: :restart_queue) 8 | 9 | defmodule Elixir.FakeServer do 10 | use GenServer 11 | def init([]) do 12 | if Agent.get(:working?, & &1), 13 | do: {:ok,[]}, 14 | else: {:stop, :die_too_soon} 15 | end 16 | end 17 | 18 | defmodule Elixir.TestSup1 do 19 | use DelayedSup 20 | import Bitwise 21 | 22 | @reset_backoff_lifetime 5_000 23 | @init_backoff_delay 200 24 | def init(_) do 25 | supervise([ 26 | worker(GenServer,[FakeServer,[],[name: FakeServer]]) 27 | ], strategy: :one_for_one, max_restarts: 9999, max_seconds: 3600, 28 | delay_fun: fn _id,lifetime,count_or_nil-> 29 | count = count_or_nil || 0 30 | Agent.update(:restart_queue, &[:erlang.system_time(:milli_seconds)|&1]) 31 | if lifetime > @reset_backoff_lifetime, 32 | do: {0,0}, 33 | else: {@init_backoff_delay*(1 <<< count),count+1} 34 | end) 35 | end 36 | end 37 | :ok 38 | end 39 | 40 | @test_duration 5_000 41 | @test_precision 100 42 | test "exp backoff delayed restart" do 43 | Agent.update(:restart_queue,fn _->[] end) 44 | Agent.update(:working?,fn _->false end) 45 | start_ts = :erlang.system_time(:milli_seconds) 46 | {:ok,pid} = DelayedSup.start_link(TestSup1,[]) 47 | receive do after @test_duration-> :ok end 48 | queue = Agent.get(:restart_queue, & &1) |> Enum.map(& &1 - start_ts) 49 | expected = Stream.iterate({0,0},fn {i,dur}-> {i+1,dur + 200 * :math.pow(2,i)} end) |> 50 | Stream.map(& trunc(elem(&1,1))) |> Enum.take_while(& &1 < @test_duration) |> Enum.reverse 51 | 52 | assert Enum.map(expected,& div(&1,@test_precision)) == Enum.map(queue,& div(&1,@test_precision)) 53 | Process.exit(pid,:shutdown) 54 | end 55 | 56 | @server_recovery_after 2000 57 | @server_death_after 6100 58 | @test_duration 10_000 59 | @test_precision 100 60 | test "exp backoff with recovery" do 61 | Agent.update(:restart_queue,fn _->[] end) 62 | Agent.update(:working?,fn _->false end) 63 | start_ts = :erlang.system_time(:milli_seconds) 64 | {:ok,pid} = DelayedSup.start_link(TestSup1,[]) 65 | receive do 
after @server_recovery_after-> :ok end 66 | Agent.update(:working?,& !&1) 67 | receive do after @server_death_after-> :ok end 68 | Agent.update(:working?,& !&1) 69 | Process.exit(Process.whereis(FakeServer),:new_death) 70 | receive do after @test_duration-@server_death_after-@server_recovery_after-> :ok end 71 | queue = Agent.get(:restart_queue, & &1) |> Enum.map(& &1 - start_ts) 72 | 73 | assert [95, 87, 83, 81, 81, 30, 14, 6, 2, 0] == Enum.map(queue,& div(&1,@test_precision)) 74 | Process.exit(pid,:shutdown) 75 | end 76 | end 77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DelayedOTP 2 | 3 | `DelayedSup` and `DelayedServer` are respectively `Supervisor` and `GenServer` 4 | but with the capability to delay the death of the child or the server to have a 5 | better supervision restart time control. 6 | 7 | With exactly the same API as `Supervisor`, create a supervisor which allows you 8 | to control a minimum delay for the supervised children to die. 9 | 10 | You can for instance: 11 | 12 | - get an Exponential backoff restarting strategy for children 13 | - normalized death time for port managed external processes 14 | 15 | Useful to manage external services in Elixir supervisors. 16 | 17 | ## Usage 18 | 19 | All the same as `Supervisor`, but a new option is available: `:delay_fun` which is a 20 | function returning the minimum lifetime of a child in milliseconds (child death will be delayed 21 | if it occurs too soon). 22 | 23 | The signature of `:delay_fun` is: `(child_id :: term, ms_lifetime :: integer, acc :: term) -> {ms_delay_death :: integer, newacc:: term}` 24 | First start accumulator is `nil`. 25 | 26 | Below is an example usage with an exponential backoff strategy: (200*2^count) ms 27 | delay where the backoff count is reset when previous run lifetime was > 5 seconds. 
28 | 29 | ```Elixir 30 | import DelayedSup.Spec 31 | import Bitwise 32 | @reset_backoff_lifetime 5_000 33 | @init_backoff_delay 200 34 | DelayedSup.start_link([ 35 | worker(MyServer1,[]), 36 | worker(MyServer2,[]) 37 | ], strategy: :one_for_one, 38 | delay_fun: fn _id,lifetime,count_or_nil-> 39 | count = count_or_nil || 0 40 | if lifetime > @reset_backoff_lifetime, 41 | do: {0,0}, 42 | else: {@init_backoff_delay*(1 <<< count),count+1} 43 | end) 44 | ``` 45 | 46 | ## How it works 47 | 48 | The created "supervisor" actually creates the following supervision tree: 49 | 50 | `supervise([child1,child2], strategy: :one_for_one)` => 51 | 52 | ``` 53 | +--------------+ +----------+ +---------+ 54 | | Supervisor +------>+TempServer+----->+ Child1 | 55 | +--------------+ +----------+ +---------+ 56 | | +----------+ +---------+ 57 | +------>+TempServer+----->+ Child2 | 58 | +----------+ +---------+ 59 | ``` 60 | 61 | `TempServer` (actually `DelayedServer`) is an intermediate process 62 | which can delay its death relative to its linked server. 63 | 64 | The restart counter and the delay computation function are kept in the supervisor 65 | process dictionary. 66 | 67 | The shutdown strategy (brutal kill or max shutdown duration) is handled by the temp server. 68 | 69 | ## Installation 70 | 71 | If [available in Hex](https://hex.pm/docs/publish), the package can be installed as: 72 | 73 | 1. Add `delayed_otp` to your list of dependencies in `mix.exs`: 74 | 75 | def deps do 76 | [{:delayed_otp, "~> 0.0.1"}] 77 | end 78 | 79 | 2. Ensure `delayed_otp` is started before your application: 80 | 81 | def application do 82 | [applications: [:delayed_otp]] 83 | end 84 | 85 | 86 | # CONTRIBUTING 87 | 88 | Hi, and thank you for wanting to contribute. 
defmodule DelayedSup do
  @moduledoc """
  The API is the same as the Elixir stdlib `Supervisor`, except that the
  supervisor options also accept a `:delay_fun` option.

  The signature of `:delay_fun` is:
  `(child_id :: term, ms_lifetime :: integer, acc :: term) -> {ms_delay_death :: integer, newacc :: term}`

  The second argument `ms_lifetime` is the lifetime of the previously dead process.
  The accumulator is `nil` on the first call for a given child.

  The returned delay is the minimum lifetime, in milliseconds, of the next run
  of the child: its death is delayed if it occurs too soon.

  Below is an example with an exponential backoff strategy: a `200 * 2^count` ms
  delay, where the backoff count is reset when the previous run lived for more
  than 10 minutes.

      import DelayedSup.Spec
      import Bitwise

      DelayedSup.start_link([
        worker(MyServer1, []),
        worker(MyServer2, [])
      ], strategy: :one_for_one,
         delay_fun: fn _id, lifetime, count_or_nil ->
           count = count_or_nil || 0
           if lifetime > :timer.minutes(10),
             do: {0, 0},
             else: {200 * (1 <<< count), count + 1}
         end)
  """

  ## erlang supervisor callback delegates
  use GenServer

  # Runs the user callback module, stores the delay function in the process
  # dictionary of the supervisor process, then hands the supervisor
  # specification over to the Erlang `:supervisor` implementation.
  def init({supname, mod, arg}) do
    {sup_spec, options} = mod.init(arg)
    Process.put(:delay_fun, options[:delay_fun] || fn _, _, _ -> {0, 0} end)
    :supervisor.init({erl_supname(supname), DelayedSup.Default, sup_spec})
  end

  defp erl_supname(nil), do: :self
  defp erl_supname(sup) when is_atom(sup), do: {:local, sup}
  defp erl_supname(sup), do: sup

  # A child wrapper died with a delayed-death reason: compute the delay of its
  # next run with `:delay_fun`, remember it together with the new accumulator,
  # then let `:supervisor` handle the exit with the original reason.
  def handle_info({:EXIT, pid, {:delayed_death, lifetime, reason}}, state) do
    {:reply, children, _} = :supervisor.handle_call(:which_children, nil, state)

    # `List.keyfind/3` instead of a truthiness test on the id, so that a child
    # whose id is `nil` or `false` is not silently skipped.
    case List.keyfind(children, pid, 1) do
      {id, ^pid, _type, _modules} ->
        acc = Process.get({:delay_acc, id}, nil)
        {delay, acc} = Process.get(:delay_fun).(id, lifetime, acc)
        Process.put({:delay_acc, id}, acc)
        Process.put({:next_delay, id}, delay)

      nil ->
        :ok
    end

    :supervisor.handle_info({:EXIT, pid, reason}, state)
  end

  def handle_info(req, state), do: :supervisor.handle_info(req, state)

  defdelegate terminate(r, s), to: :supervisor

  defdelegate code_change(vsn, s, extra), to: :supervisor

  defdelegate handle_call(req, rep_to, s), to: :supervisor

  defdelegate handle_cast(req, s), to: :supervisor

  ## Elixir Supervisor API

  @doc """
  Starts a delayed supervisor from a list of `DelayedSup.Spec` child
  specifications; `options` holds the supervisor options plus `:delay_fun`.
  """
  def start_link(children, options) when is_list(children) do
    start_link(DelayedSup.Default, DelayedSup.Spec.supervise(children, options), options)
  end

  @doc """
  Starts a delayed supervisor from a callback `module` whose `init/1` returns
  the result of `DelayedSup.Spec.supervise/2`.
  """
  def start_link(module, arg, options \\ []) when is_list(options) do
    GenServer.start_link(__MODULE__, {options[:name], module, arg}, options)
  end

  @doc """
  Same as `Supervisor.which_children/1`, but each running entry reports the pid
  of the real child (asked to its `DelayedServer` wrapper) instead of the pid
  of the wrapper. Entries whose child is not a pid (`:undefined`,
  `:restarting`) are returned unchanged.
  """
  def which_children(supervisor) do
    for {id, pid, worker, modules} <- Supervisor.which_children(supervisor) do
      {id, delayed_pid(pid), worker, modules}
    end
  end

  # Only a live wrapper can be asked for its pid; calling `:undefined` or
  # `:restarting` would exit with :noproc.
  defp delayed_pid(pid) when is_pid(pid), do: GenServer.call(pid, :delayed_pid)
  defp delayed_pid(not_running), do: not_running

  @doc """
  Same as `Supervisor.start_child/2`, with the child wrapped in a `DelayedServer`.
  """
  def start_child(supervisor, child_spec) do
    Supervisor.start_child(supervisor, DelayedSup.Spec.map_childspec(child_spec))
  end

  defdelegate stop(sup), to: Supervisor

  defdelegate stop(sup, r), to: Supervisor

  defdelegate stop(sup, r, t), to: Supervisor

  defdelegate count_children(sup), to: Supervisor

  defdelegate terminate_child(sup, child), to: Supervisor

  defdelegate delete_child(sup, childid), to: Supervisor

  defdelegate restart_child(sup, childid), to: Supervisor

  defmacro __using__(_) do
    quote location: :keep do
      @behaviour :supervisor
      import DelayedSup.Spec
    end
  end

  defmodule Spec do
    @moduledoc """
    Counterpart of `Supervisor.Spec` producing specifications in which every
    child is started through a `DelayedServer` wrapper.
    """

    @doc """
    Returns `{supervisor_spec, options}`: the `Supervisor.Spec.supervise/2`
    result for the wrapped children, paired with the untouched options so that
    `DelayedSup` can read `:delay_fun` from them.
    """
    def supervise(children, options) do
      wrapped = Enum.map(children, &map_childspec/1)
      {Supervisor.Spec.supervise(wrapped, options), options}
    end

    @doc """
    Rewrites a 6-tuple child specification so that the child is started by
    `start_delayed/3`. The original `shutdown` value is handed to the wrapper
    and replaced by `:infinity` in the supervisor specification.
    """
    def map_childspec({id, mfa, restart, shutdown, worker, modules}) do
      {id, {__MODULE__, :start_delayed, [id, mfa, shutdown]}, restart, :infinity, worker, modules}
    end

    @doc """
    Start function of a wrapped child: starts a `DelayedServer` around
    `{m, f, a}` using the delay stored under `{:next_delay, id}` in the process
    dictionary of the calling process (`0` when none is stored).
    """
    def start_delayed(id, {m, f, a}, shutdown) do
      delay = Process.get({:next_delay, id}, 0)
      DelayedServer.start_link(m, a, function: f, delay: delay, shutdown: shutdown)
    end

    defdelegate worker(mod, args), to: Supervisor.Spec

    defdelegate worker(mod, args, opts), to: Supervisor.Spec

    defdelegate supervisor(mod, args), to: Supervisor.Spec

    defdelegate supervisor(mod, args, opts), to: Supervisor.Spec
  end
end