├── src
    ├── sups_app.erl
    ├── sups.app.src
    ├── sups.erl
    ├── sups_worker_sup.erl
    ├── sups_db_sup.erl
    ├── sups_supersup.erl
    ├── sups_worker.erl
    ├── sups_db_worker.erl
    └── sups_lib.erl
├── rebar.config
├── .gitignore
├── test
    ├── prop_sups.erl
    └── sups_statem.erl
├── README.md
└── LICENSE


/src/sups_app.erl:
--------------------------------------------------------------------------------
%% @doc Application callback module: boots the top-level supervisor.
-module(sups_app).
%% Declaring the behaviour lets the compiler check the callbacks below.
-behaviour(application).
-export([start/2, stop/1]).

%% @doc Starts the root supervisor. The start type and the start
%% arguments are ignored.
start(_Type, _Args) -> sups_supersup:start_link().

%% @doc Nothing to clean up on stop.
stop(_) -> ok.

--------------------------------------------------------------------------------
/rebar.config:
--------------------------------------------------------------------------------
{plugins, [rebar3_proper]}.

{profiles, [
    {test, [
        {erl_opts, [nowarn_export_all, {parse_transform, lager_transform}]},
        {deps, [proper, lager]}
    ]}
]}.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.rebar3
_*
.eunit
*.o
*.beam
*.plt
*.swp
*.swo
.erlang.cookie
ebin
log
erl_crash.dump
.rebar
logs
_build
.idea
*.iml
rebar3.crashdump

--------------------------------------------------------------------------------
/src/sups.app.src:
--------------------------------------------------------------------------------
{application, sups,
 [{description, "An OTP library"},
  {vsn, "0.1.0"},
  %% sups_db_worker registers itself locally under its module name.
  {registered, [sups_db_worker]},
  {mod, {sups_app, []}},
  {applications,
   [kernel,
    stdlib
   ]},
  {env,[]},
  {modules, []},

  {maintainers, []},
  {licenses, ["Apache 2.0"]},
  {links, []}
 ]}.

--------------------------------------------------------------------------------
/src/sups.erl:
--------------------------------------------------------------------------------
-module(sups).

%% API exports
-export([]).

%%====================================================================
%% API functions
%%====================================================================


%%====================================================================
%% Internal functions
%%====================================================================

--------------------------------------------------------------------------------
/src/sups_worker_sup.erl:
--------------------------------------------------------------------------------
-module(sups_worker_sup).
-behaviour(supervisor).
-export([start_link/0, init/1]).

%% @doc Starts the supervisor (no registered name) that owns the demo worker.
start_link() ->
    supervisor:start_link(?MODULE, []).

%% @doc Supervisor callback: one permanent `sups_worker' child under a
%% one_for_one strategy with intensity 5 and period 10.
init([]) ->
    Flags = #{strategy => one_for_one, intensity => 5, period => 10},
    Worker = #{id => worker1,
               start => {sups_worker, start_link, []},
               restart => permanent,
               modules => [sups_worker]},
    {ok, {Flags, [Worker]}}.


--------------------------------------------------------------------------------
/src/sups_db_sup.erl:
--------------------------------------------------------------------------------
-module(sups_db_sup).
-behaviour(supervisor).
-export([start_link/0, init/1]).

%% @doc Starts the supervisor (no registered name) in charge of the
%% database worker.
start_link() ->
    supervisor:start_link(?MODULE, []).

%% @doc Supervisor callback. Stores the `db' tag under `sups_tags' in the
%% supervisor's own process dictionary, then declares one permanent
%% `sups_db_worker' child (one_for_one, intensity 10, period 1).
init([]) ->
    put(sups_tags, [db]),
    Flags = #{strategy => one_for_one, intensity => 10, period => 1},
    DbWorker = #{id => worker1,
                 start => {sups_db_worker, start_link, []},
                 restart => permanent,
                 type => worker,
                 modules => [sups_db_worker]},
    {ok, {Flags, [DbWorker]}}.

--------------------------------------------------------------------------------
/src/sups_supersup.erl:
--------------------------------------------------------------------------------
-module(sups_supersup).
-behaviour(supervisor).
-export([start_link/0, init/1]).

%% @doc Starts the root supervisor of the application (no registered name).
start_link() ->
    supervisor:start_link(?MODULE, []).

%% @doc Supervisor callback: the root supervises the database subtree and
%% the workers subtree independently (one_for_one, intensity 10, period 1).
init([]) ->
    SupFlags = #{strategy => one_for_one,
                 intensity => 10,
                 period => 1},
    ChildSpecs = [
        #{id => db_sup,
          start => {sups_db_sup, start_link, []},
          restart => permanent,
          type => supervisor,
          modules => [sups_db_sup]},
        #{id => workers_sup,
          start => {sups_worker_sup, start_link, []},
          %% permanent is the default; spelled out to match db_sup.
          restart => permanent,
          type => supervisor,
          modules => [sups_worker_sup]}
    ],
    {ok, {SupFlags, ChildSpecs}}.

--------------------------------------------------------------------------------
/src/sups_worker.erl:
--------------------------------------------------------------------------------
-module(sups_worker).
-behaviour(gen_server).
-export([start_link/0]).
-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]).

%% @doc Starts a worker (no registered name) linked to the caller.
start_link() ->
    gen_server:start_link(?MODULE, [], []).

%% @private Queues the first `req' message so that the request loop in
%% handle_info/2 starts right after initialisation.
init([]) ->
    self() ! req,
    {ok, undefined}.

%% @private The worker offers no synchronous API. Answer explicitly so a
%% stray caller gets a reply instead of blocking until its call times out.
handle_call(_Request, _From, State) ->
    {reply, {error, unknown_call}, State}.

%% @private Casts are ignored.
handle_cast(_, State) ->
    {noreply, State}.

%% @private One iteration of the request loop: issue a request to the
%% database worker, then immediately schedule the next iteration. A
%% `disconnected' answer is tolerated; any other reply shape fails the
%% `case' and crashes the worker.
handle_info(req, State) ->
    case sups_db_worker:req(req, infinity) of
        {ok, _} -> ok;
        {error, disconnected} -> retry_later
    end,
    self() ! req,
    {noreply, State}.

terminate(_, _) ->
    ok.

% case sups_db_worker:req(req, infinity) of
%     {error, disconnected} -> ignore;
%     {ok,_} -> ok % good! request went through!
% end,
--------------------------------------------------------------------------------
/src/sups_db_worker.erl:
--------------------------------------------------------------------------------
-module(sups_db_worker).
-behaviour(gen_server).
-export([start_link/0, req/2]).
-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]).

%% @doc Starts the database stand-in, registered locally under the
%% module name.
start_link() ->
    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).

%% @doc Sends `Req' to the registered database worker and waits for the
%% answer for at most `Timeout' (passed as-is to gen_server:call/3).
req(Req, Timeout) ->
    gen_server:call(?MODULE, {req, Req}, Timeout).

%% @private Tags the process with `db' (under `sups_tags' in its process
%% dictionary) and starts in the `connected' state.
init([]) ->
    put(sups_tags, [db]),
    {ok, connected}.

%% @private The state is the atom `connected' or `disconnected'.
%% Requests are answered after a 500 ms pause while connected and
%% refused while disconnected; `disconnect' / `connect' flip the state.
handle_call({req, _Req}, _From, disconnected) ->
    {reply, {error, disconnected}, disconnected};
handle_call({req, Req}, _From, connected) ->
    timer:sleep(500),
    {reply, {ok, Req}, connected};
handle_call(disconnect, _From, _State) ->
    {reply, ok, disconnected};
handle_call(connect, _From, _State) ->
    {reply, ok, connected}.

handle_cast(_Msg, State) -> {noreply, State}.

handle_info(_Msg, State) -> {noreply, State}.

terminate(_Reason, _State) -> ok.

--------------------------------------------------------------------------------
/test/prop_sups.erl:
--------------------------------------------------------------------------------
-module(prop_sups).
-include_lib("proper/include/proper.hrl").

%% Property: any command sequence generated from sups_statem must run to
%% completion with result `ok' against a freshly started `sups' application.
prop_check_tree() ->
    ?FORALL(Cmds, commands(sups_statem),
        begin
            %% Pre
            silence_logs(),
            {ok, Started} = application:ensure_all_started(sups),
            %% Tests
            {History, State, Result} = run_commands(sups_statem, Cmds),
            %% Post
            lists:foreach(fun(App) -> application:stop(App) end, Started),
            %% Reporting
            Bucket = bucket(length(Cmds), 10),
            ?WHENFAIL(io:format("History: ~p~nState: ~p~nResult: ~p~n",
                                [History,State,Result]),
                      collect(Bucket, Result =:= ok))
        end).

%% Returns the bounds {Low, Low + M} of the M-wide bucket that contains N.
bucket(N, M) ->
    Low = (N div M) * M,
    {Low, Low + M}.

%% Starts lager with an empty handler list.
silence_logs() ->
    application:load(lager),
    application:set_env(lager, handlers, []),
    application:ensure_all_started(lager).
--------------------------------------------------------------------------------
/test/sups_statem.erl:
--------------------------------------------------------------------------------
-module(sups_statem).
-include_lib("proper/include/proper.hrl").
-compile(export_all).
-define(APPS, [sups]).

%% The model state is `undefined' until sups_lib:init_state/1 has run.
initial_state() ->
    undefined.

%% First command: take the baseline snapshot. Afterwards, pick between a
%% simulated database outage and killing a process not tagged `db'.
command(undefined) ->
    {call, sups_lib, init_state, [?APPS]};
command(State) ->
    Outage = {call, sups_lib, mock_success,
              [State, fun mock_db_call/0, fun unmock_db_call/0, ?APPS]},
    Kill = {call, sups_lib, mark_as_dead,
            [State, non_neg_integer(), [{not_tagged, db}], ?APPS]},
    oneof([Outage, Kill]).

%% init_state is only valid before the snapshot exists; the two fault
%% commands are only valid once it does. Everything else is rejected.
precondition(State, {call, _, Fun, _}) ->
    case {State, Fun} of
        {undefined, init_state} -> true;
        {undefined, _} -> false;
        {_, mock_success} -> true;
        {_, mark_as_dead} -> true;
        {_, _} -> false
    end;
precondition(_, _) ->
    false.

%% Delegates the verdict to the matching sups_lib validation helper.
postcondition(_State, {call, _, init_state, _}, _Result) ->
    true;
postcondition({Before, _}, {call, _, mark_as_dead, _}, {After, Dead}) ->
    sups_lib:validate_mark_as_dead(Before, After, Dead);
postcondition({Before, _}, {call, _, mock_success, _}, {After, Dead}) ->
    sups_lib:validate_mock_success(Before, After, Dead).

%% Every command returns the next model state, so the result is adopted
%% as-is.
next_state(undefined, Result, {call, _, init_state, _}) ->
    Result;
next_state(_State, Result, {call, _, Fun, _})
  when Fun =:= mock_success; Fun =:= mark_as_dead ->
    Result.


%% This is actually using a stub because the demo didn't quite like me flipping
%% the switch super hard on a central process through meck and lotsa code loading.
mock_db_call() ->
    _ = gen_server:call(sups_db_worker, disconnect, infinity),
    %% sleep time, in milliseconds, before unmock_db_call/0 is run
    100.

unmock_db_call() ->
    _ = gen_server:call(sups_db_worker, connect, infinity),
    ok.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
sups
=====

Experimental code library to be used with PropEr or Quickcheck to validate that an OTP application is properly implemented to match the structure of its supervision tree.
5 | 6 | Basically, in a scenario where the supervision structure encodes the expected failure semantics of a program, this library can be used to do fault-injection and failure simulation to see that the failures taking place within the program actually respect the model exposed by the supervision tree. If random process kills or simulated faults end up killing unexpected supervision subtrees, it is assumed that the caller processes have weaknesses in how they handle events. 7 | 8 | This documentation is still a work in progress. 9 | 10 | Calls 11 | ----- 12 | 13 | The API needs rework to be a bit more transparent. This doc needs rework to be a bit more helpful. 14 | 15 | - `sups_lib:init_state()` and `sups_lib:init_state([WhiteList])`: takes a snapshot of the supervision tree of currently running applications. Should be called before any other function of this library in a PropEr model to give it a baseline data set against which to run. 16 | - `mark_as_dead(State, N, Filters, WhiteList)` where `N` should be an integer picked randomly by the test framework. See the _Filters_ section for filters. This function kills a random process in the supervision tree, and makes guesses (based on the tree structure) as to what processes should be expected to die along with it. 17 | - `mock_success(State, MockFun, UnmockFun, WhiteList)` takes two functions: the first one of the form `fun() -> DoSomeMocking, IntegerValue end`, where you can set up any mocking you want, and `IntegerValue` tells the system how long to sleep (in milliseconds) before calling the unmocking function (`fun() -> Whatever end`), returning the system to normal. 18 | - `validate_mark_as_dead(OldTree, NewTree, DeadList) -> Bool` (TODO: rework to use `State`) can be used as a postcondition to validate that the right processes are living or dead in the application. 
19 | - `validate_mock_success(OldTree, NewTree, DeadList) -> Bool` (TODO: rework to use `State`) can be used as a postcondition to validate that no unexpected processes died during fault injection. 20 | 21 | Other functions are exported in `sups_lib` to let you implement custom validation. 22 | 23 | Filters 24 | ------- 25 | 26 | Zero or more filters can be used in a list: 27 | 28 | - `{named, Atom}`: only kill processes with that given name (in the native registry) 29 | - `{not_named, Atom}`: only kill processes aside from a known named one 30 | - `{tagged, Term}`: only kill processes that have an entry of the form `put(sups_tags, [Term])` in their process dictionary 31 | - `{not_tagged, Term}`: only kill processes that don't have an entry of the form `put(sups_tags, [Term])` in their process dictionary 32 | 33 | How It Works 34 | ------------ 35 | 36 | By building a supervision tree data structure with all annotations, we can create an integer `N` through a regular PropEr or QuickCheck generator that is applied to the tree to denote a specific node. The count starts depth-first, from the rightmost child to the leftmost child (meaning that under default shrinking rules, we start by killing newer processes rather than older and more critical ones). 37 | 38 | This numeric value is adapted according to filters and whatnot, and since it relies on the shape of the tree rather than the processes it contains, it should allow proper shrinking to work fine. 39 | 40 | On a process kill, we analyze the structure of the tree and supervision structure, maintain a list of processes we know should have died, and use it to resolve what the actual tree should be doing as a model.
41 | 42 | Example 43 | ------- 44 | TODO 45 | 46 | For now, the test code of this lib serves as the example (run it with `rebar3 proper`). 47 | 48 | Caveats 49 | ------- 50 | 51 | Currently, the system neither tracks nor models unexpected worker faults in remote subtrees (only local ones), so those may end up impacting tolerance rates of other supervisors and lower the accuracy of the model. Not too sure if this becomes a problem in practice or not. 52 | 53 | The system must be running under constant simulation load to be realistic. 54 | 55 | The sleeping / waiting timer for propagation is a bit ad hoc and requires tweaking. 56 | 57 | The library has not seen enough testing with real-world apps. 58 | 59 | Roadmap 60 | ------- 61 | 62 | - Fix arguments to functions 63 | - Fix app/demo structure 64 | - Rename mocking functions to be related to fault injection 65 | - Write tests instead of just running them 66 | - See if this holds up in real world projects -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity.
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | Copyright 2018, Fred Hebert . 179 | 180 | Licensed under the Apache License, Version 2.0 (the "License"); 181 | you may not use this file except in compliance with the License. 
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

--------------------------------------------------------------------------------
/src/sups_lib.erl:
--------------------------------------------------------------------------------
-module(sups_lib).
-export([find_supervisors/0, find_supervisors/1, init_state/0, init_state/1, extract_dead/1,
         mark_as_dead/4, mock_success/4,
         validate_mark_as_dead/3, validate_mock_success/3,
         dead_as_expected/2, sups_still_living/3]).

-type strategy() :: one_for_one | simple_one_for_one | rest_for_one | one_for_all.
-type intensity() :: non_neg_integer().
-type period() :: pos_integer().
-type restart() :: permanent | transient | temporary.
-type attr() :: {term(), term()}.
%% `non_otp_supervisor' is the tag dig_sup/1 produces; it reaches the tree
%% unconverted for application roots (see find_supervisors/1), while nested
%% children get it rewritten to `non_otp_sup' by handle_dig_result/1.
-type worker() :: {worker | non_otp_sup | non_otp_supervisor, pid(), [attr()]}.
%% The fourth element is the list of children, i.e. a suptree() itself
%% (it is the direct result of dig_children/2), not a list of suptrees.
-type sup() :: {strategy(), pid(), {intensity(), period()}, suptree(), [attr()]}.
%% An entry is `noproc' when the process was gone by the time it was inspected.
-type suptree() :: [{restart(), worker() | sup() | noproc}].
-type app() :: atom().
%% Each application maps to the one-element suptree() holding its root
%% supervisor, as built by find_supervisors/1.
-type apptree() :: [{app(), suptree()}].
-type death_event() :: {dead|child_dead, pid(), stamp()}.
-type stamp() :: integer().
-type mockfun() :: fun(() -> pos_integer()). % returns sleep time
-type unmockfun() :: fun(() -> ok).
-type filter() :: {named|not_named, atom()} | {tagged|not_tagged, term()}.

-export_type([apptree/0]).

%%%%%%%%%%%%%%
%%% PUBLIC %%%
%%%%%%%%%%%%%%

%% @doc find all the supervisors in a running system
-spec find_supervisors() -> apptree().
find_supervisors() -> find_supervisors([]).

%% @doc find all the supervisors in a running system within a
%% list of whitelisted applications. An empty list means all
%% apps are scanned.
-spec find_supervisors([atom()]) -> apptree().
find_supervisors(Whitelist) ->
    [{App, [{permanent, dig_sup(P)}]}
     || {App,P} <- root_sups(),
        Whitelist =:= [] orelse lists:member(App, Whitelist)].

%% @doc wrapper to initialize the state in a PropEr statem test so that other functions get the
%% right state.
-spec init_state() -> {apptree(), []}.
init_state() -> {find_supervisors(), []}.

%% @doc wrapper to initialize the state in a PropEr statem test so that other functions get the
%% right state. Takes a whitelist of applications to damage and test.
-spec init_state([atom()]) -> {apptree(), []}.
init_state(WhiteList) -> {find_supervisors(WhiteList), []}.

%% @doc from a list of death events, extract the pids that are definitely dead
%% under the form of a set for quick matching. Only `dead' events are kept;
%% `child_dead' events are left out.
-spec extract_dead([death_event()]) -> sets:set(pid()).
extract_dead(List) -> sets:from_list([Pid || {dead, Pid, _} <- List]).

%% @doc Takes in an app tree with all the related deaths seen so far,
%% along with a random number `N' that identifies what should die,
%% and a whitelist of applications to look into to kill stuff in.
%% Then, the call will go inside the tree and:
%%
%% 1. find how many processes are in the tree
%% 2. mark them with numbers 0..M based on position (implicit)
%% 3. mark the pid or supervisor at M-N rem M as dead (prioritize workers at first)
%% 4. propagate expected status to other supervisors based on tolerance
%% 5. kill the actual process
%% 6. wait a few milliseconds for propagation (arbitrary)
%% 7. take a snapshot of the program tree and compare with the old one.
%% @end
%% @TODO make the millisecond wait for propagation more solid
-spec mark_as_dead({apptree(), [death_event()]}, non_neg_integer(), [filter()], [atom()]) ->
    {apptree(), [death_event()]}.
mark_as_dead({Tree, Deaths}, N, Filters, Whitelist) when is_list(Tree) ->
    %% 1. find how many procs are in the tree,
    M = count_procs(Tree, Filters),
    mark_as_dead({Tree, Deaths}, N, M, Filters, Whitelist).

%% @doc runs a mocked bit of code that can simulate some sort of
%% failure or return value for an arbitrary period of time
%% and then reverts it.
%% `Mock' sets the fault up and returns how long to wait, in milliseconds;
%% `Unmock' reverts it. The tree is snapshotted again afterwards and the
%% list of deaths is returned untouched.
%% A healthy supervision tree should be coming back, with no supervisor
%% failures in it.
-spec mock_success({apptree(), [death_event()]}, mockfun(), unmockfun(), [atom()]) ->
    {apptree(), [death_event()]}.
mock_success({Tree, Deaths}, Mock, Unmock, Whitelist) when is_list(Tree) ->
    Sleep = Mock(),
    %% The fault is live from here on: always revert it, even when the wait
    %% itself fails (for instance because Mock() returned something that is
    %% not a valid sleep time), so a broken run cannot leave the system
    %% mocked for the commands and test runs that follow.
    try
        timer:sleep(Sleep)
    after
        Unmock()
    end,
    NewTree = find_supervisors(Whitelist),
    {NewTree, Deaths}.

%% @doc Recommended validation helper for `mark_as_dead' function;
%% checks that the processes that were expected to die are actually gone,
%% and that the supervisors in unrelated subtrees are unaffected. This should
%% capture unhandled expected faults in subtrees.
%% Outputs the old tree, the new tree, and the expected missing processes in
%% case of a counter-example.
-spec validate_mark_as_dead(apptree(), apptree(), [death_event()]) -> boolean().
validate_mark_as_dead(OldTree, NewTree, NewDeaths) ->
    MustBeMissing = extract_dead(NewDeaths),
    Res = dead_as_expected(NewTree, MustBeMissing)
          andalso sups_still_living(OldTree, NewTree, MustBeMissing),
    case Res of
        true ->
            true;
        false ->
            io:format("Old: ~p~nNew: ~p~nDead: ~p~n",
                      [OldTree, NewTree, sets:to_list(MustBeMissing)]),
            false
    end.
111 | 112 | %% @doc Recommended validation helper for `mock_success' function; 113 | %% Checks that no supervisor has unexpectedly died, which would capture 114 | %% a massive failure subsequent to a fault injection that would have been 115 | %% considered survivable. 116 | -spec validate_mock_success(apptree(), apptree(), [death_event()]) -> boolean(). 117 | validate_mock_success(OldTree, NewTree, NewDeaths) -> 118 | %% Should not see any deaths on a successful call. 119 | MustBeMissing = extract_dead(NewDeaths), 120 | sups_still_living(OldTree, NewTree, MustBeMissing). 121 | 122 | %% @doc Takes a supervision tree model and ensures that none of the 123 | %% processes in `Set' are to be found in it. 124 | -spec dead_as_expected(apptree(), sets:set(pid())) -> boolean(). 125 | dead_as_expected([], _) -> true; 126 | dead_as_expected([{_Restart, noproc} | T], Set) -> 127 | dead_as_expected(T, Set); 128 | dead_as_expected([{_Restart, {_Type, Pid, _Attrs}} | T], Set) -> % worker 129 | (not sets:is_element(Pid, Set)) andalso dead_as_expected(T, Set); 130 | dead_as_expected([{_Restart, {_, Pid, _, Children, _Attrs}} | T], Set) -> % sup 131 | (not sets:is_element(Pid, Set)) 132 | andalso dead_as_expected(Children, Set) 133 | andalso dead_as_expected(T, Set); 134 | dead_as_expected([{_App, Sup}|T], Set) when is_list(Sup) -> % app 135 | dead_as_expected(Sup, Set) andalso dead_as_expected(T, Set). 136 | 137 | %% @doc compares two supervision trees (an old one and a newer one) and a set of 138 | %% pids that are expected to be dead, and makes sure that the new supervision tree 139 | %% does contain all of the supervisors that were in the old tree and should 140 | %% not have died according to the model. 141 | -spec sups_still_living(apptree(), apptree(), sets:set(pid())) -> boolean(). 
sups_still_living(Old, New, ShouldBeDead) ->
    Before = supervisor_pids(Old),
    After = supervisor_pids(New),
    %% Supervisors of the old tree that the model does not expect to die
    Survivors = Before -- sets:to_list(ShouldBeDead),
    %% ... must all be present in the new tree
    Missing = [Pid || Pid <- Survivors, not lists:member(Pid, After)],
    Missing =:= [].

%%%%%%%%%%%%%%%
%%% PRIVATE %%%
%%%%%%%%%%%%%%%

%%% DIG WITHIN A SUPERVISOR

%% Builds the tree node for the supervisor `Pid' from its internal state.
%% Returns `noproc' when the process is gone by the time it is inspected.
dig_sup(Pid) ->
    try sys:get_state(Pid) of
        SupState -> describe_sup(Pid, SupState)
    catch
        exit:{noproc, _} -> noproc
    end.

%% Turns the state term of a process into a node. Only the 11-element
%% `state' tuple matched below is understood; any other term is reported
%% as a non-OTP supervisor carrying only its attributes.
describe_sup(Pid, {state, _Name, Strategy, Children, _Dynamics,
                   Intensity, Period, _Restarts, _DynamicRestarts,
                   _Mod, _Args}) ->
    Attrs = dig_attrs(Pid),
    Kids = dig_children(Children, Pid),
    {Strategy, Pid, {Intensity, Period}, Kids, Attrs};
describe_sup(Pid, _Other) ->
    {non_otp_supervisor, Pid, dig_attrs(Pid)}.

%% Expands the child records found in a supervisor's state into tree nodes.
dig_children([{child, undefined, _Name, _MFA, Restart, _Kill, Kind, _Mods}], Parent)
  when Kind =:= worker; Kind =:= supervisor ->
    %% A single child record without a pid (simple one for one): the actual
    %% children have to be asked from the supervisor itself.
    Describe = case Kind of
        worker -> fun(Pid) -> {worker, Pid, dig_attrs(Pid)} end;
        supervisor -> fun(Pid) -> handle_dig_result(dig_sup(Pid)) end
    end,
    [{Restart, Describe(Pid)} || {_, Pid, _, _} <- supervisor:which_children(Parent)];
dig_children(Children, _Parent) ->
    dig_children_(Children).

%% Maps each statically known child record to a `{Restart, Node}' entry,
%% recursing into children that are supervisors themselves.
dig_children_(Children) ->
    lists:map(
        fun({child, Pid, _Name, _MFA, Restart, _Kill, worker, _Mods}) ->
                {Restart, {worker, Pid, dig_attrs(Pid)}};
           ({child, Pid, _Name, _MFA, Restart, _Kill, supervisor, _Mods}) ->
                {Restart, handle_dig_result(dig_sup(Pid))}
        end,
        Children).

%% Renames the non-OTP supervisor tag used in the tree; anything else
%% (including `noproc') is passed through untouched.
handle_dig_result(Result) ->
    case Result of
        {non_otp_supervisor, Pid, Attrs} -> {non_otp_sup, Pid, Attrs};
        _ -> Result
    end.

%% @private find process attributes that can be used to filter processes in
%% or out of the kill process. The attributes are `{name, RegisteredName}'
%% when the process is registered, plus one `{tag, Tag}' per entry of the
%% `sups_tags' list stored in its process dictionary.
%% A process that is no longer alive has no attributes: `process_info/2'
%% returns `undefined' for it, which can happen when it died after its pid
%% was collected.
-spec dig_attrs(pid()) -> [attr()].
dig_attrs(Pid) ->
    case process_info(Pid, [registered_name, dictionary]) of
        [{registered_name, Name}, {dictionary, PDict}] ->
            %% an unregistered process reports `[]' as its name
            [{name, Name} || Name =/= []]
            ++ [{tag, T} || T <- proplists:get_value(sups_tags, PDict, [])];
        undefined ->
            []
    end.

%% @private returns `{App, Pid}' pairs for the running applications:
%% for each application master, its links are searched for the process whose
%% initial call is `application_master:start_it/4', and every process linked
%% to that one whose translated initial call is a supervisor is reported.
%% Processes that vanish while being inspected are skipped, since the
%% generator patterns do not match `undefined'.
root_sups() ->
    RunningApps = proplists:get_value(running, application:info()),
    Apps = [{App, Pid} || {App, Pid} <- RunningApps, is_pid(Pid)],
    [{App, P} ||
        {App, MasterOuter} <- Apps,
        {links, MasterInners} <- [process_info(MasterOuter, links)],
        M <- MasterInners,
        {_,{application_master,start_it,4}} <- [process_info(M, initial_call)],
        {links, Links} <- [process_info(M, links)],
        P <- Links,
        {supervisor,_,_} <- [proc_lib:translate_initial_call(P)]].


%%% MARK AS DEAD COMPLEX STUFF %%%

%% @private mark_as_dead continuation. `Count' is the number of candidate
%% processes in the tree; with no candidate the state is returned untouched.
%% Otherwise one position is derived from `N', the process found there is
%% killed, the deaths expected by the model are computed, and a fresh
%% snapshot of the tree is returned with the new deaths put in front of the
%% older ones.
%% @TODO: fix the logging for the last process left maybe
-spec mark_as_dead({apptree(), [death_event()]}, non_neg_integer(), non_neg_integer(), [filter()], [atom()]) ->
    {apptree(), [death_event()]}.
mark_as_dead(State, _, 0, _, _) ->
    io:format("Null case, supervisor tree is gone or only root left~n", []),
    State;
mark_as_dead({Tree, Deaths}, N, Count, Filters, Whitelist) ->
    M = Count-1,
    %% 2. mark them with numbers 0..M based on position (implicit)
    %% 3. mark the pid or supervisor at M-N rem M as dead (prioritize workers at first)
    ChosenN = case M of
        0 -> 0; % a single candidate is left; `N rem 0' would raise badarith
        _ -> M - (N rem M)
    end,
    %% 4. propagate expected status to other supervisors based on tolerance
    {Pid, NewDeaths} = propagate_death(Tree, Deaths, ChosenN, Filters),
    %% 5. kill the actual process
    kill_and_wait(Pid), % should this be conditional in case a proc choice failed?
    %% 6. wait a few milliseconds for propagation
    DeadSleep = lists:sum([case Dead of % TODO: tweak
                               dead -> 150;
                               child_dead -> 75
                           end || {Dead, _, _} <- NewDeaths]),
    timer:sleep(min(DeadSleep, 1000)), % very tolerant sups may be killed at random anyway
    %% 7. take a snapshot of the program tree and compare them
    NewTree = find_supervisors(Whitelist),
    {NewTree, NewDeaths ++ Deaths}.

%% @private returns how many processes of a supervision tree match all of
%% the filters. The root supervisor of an application entry is not counted,
%% only what is below it.
-spec count_procs(apptree(), [filter()]) -> non_neg_integer().
count_procs([], _) -> 0;
count_procs([{_Restart, noproc} | T], Filters) ->
    %% This happens somehow
    count_procs(T, Filters);
count_procs([{_Restart, {_, _Pid, _, Children, Attrs}}|T], Filters) ->
    %% supervisor: counts for itself if it matches, then its subtree
    Val = case filter_attrs(Filters, Attrs) of
              true -> 1;
              false -> 0
          end,
    Val + count_procs(Children, Filters) + count_procs(T, Filters);
count_procs([{_Restart, {_Type, _Pid, Attrs}} | T], Filters) ->
    %% worker (or any other 3-element node)
    Val = case filter_attrs(Filters, Attrs) of
              true -> 1;
              false -> 0
          end,
    Val + count_procs(T, Filters);
count_procs([{App, [{_, {_, _, _, Children, _Attrs}}]}|T], Filters) when is_atom(App) ->
    %% application entry: skip its root supervisor
    count_procs(Children, Filters) + count_procs(T, Filters).

%% @private returns `true' if all the filters match a given process' attributes
-spec filter_attrs([filter()], [attr()]) -> boolean().
filter_attrs(Filters, Attrs) ->
    lists:all(fun(Filter) -> filter(Filter, Attrs) end, Filters).

%% @private evaluates a single filter against a list of attributes; the
%% `not_' variants are the negation of their counterpart.
filter({tagged, Tag}, Attrs) ->
    lists:member({tag, Tag}, Attrs);
filter({not_tagged, Tag}, Attrs) ->
    not filter({tagged, Tag}, Attrs);
filter({named, Name}, Attrs) ->
    lists:member({name, Name}, Attrs);
filter({not_named, Name}, Attrs) ->
    not filter({named, Name}, Attrs).


%% @private send an exit signal and return once the process has died.
kill_and_wait(Pid) ->
    %% Monitor before killing so that the 'DOWN' message cannot be missed.
    Ref = erlang:monitor(process, Pid),
    exit(Pid, kill),
    receive
        {'DOWN', Ref, process, Pid, _} -> ok
    after 5000 ->
        %% Drop the monitor and any 'DOWN' message racing with the timeout
        %% so that nothing is left behind for a caller catching the error.
        erlang:demonitor(Ref, [flush]),
        error({timeout, {kill, Pid}})
    end.

%% @private
%% Take the app tree, the deaths seen so far, and then kill the process
%% that has been targeted by its integer position. Once it is killed, update
%% the death events and propagate all deaths up the supervision tree
%% according to the model.
%% Returns `{TargetPid, NewDeathEvents}' when a target was found in the
%% (sub)tree, or `{not_in_tree, Remaining}' with the count left to skip when
%% the list was exhausted first, so that the search can continue in the
%% siblings of the enclosing supervisor.
%% @end
-spec propagate_death(apptree(), [death_event()], non_neg_integer(), [filter()]) ->
    {pid(), [death_event()]} | {not_in_tree, non_neg_integer()}.
%% kill shots
propagate_death([{_Restart, {_Type, Pid, Attrs}}|T], Deaths, 0, Filters) ->
    %% worker at the targeted position: it is the target only if it passes
    %% the filters, otherwise the next eligible process is taken
    case filter_attrs(Filters, Attrs) of
        true -> {Pid, [{dead, Pid, stamp()}]};
        false -> propagate_death(T, Deaths, 0, Filters)
    end;
propagate_death([{_Restart, {Strategy, Pid, Tolerance, Children, Attrs}}|T], Deaths, 0, Filters) ->
    %% supervisor at the targeted position: killing it takes its whole
    %% subtree down; if filtered out, look for a target below it instead
    case filter_attrs(Filters, Attrs) of
        true ->
            {Pid, [{dead, Pid, stamp()} | recursive_all_dead(Children)]};
        false ->
            sup_propagation(Pid, Strategy, Tolerance, Children, T, Deaths, 0, Filters)
    end;
%% propagation steps
propagate_death([], _Deaths, N, _Filters) ->
    %% proc not found
    {not_in_tree, N};
propagate_death([{_Restart, noproc} | T], Deaths, N, Filters) ->
    %% skip process as non-existing
    propagate_death(T, Deaths, N, Filters);
propagate_death([{_Restart, {Strategy, Pid, Tolerance, Children, _Attrs}}|T], Deaths, N, Filters) ->
    %% supervisor (not the target). Propagate the kill signal, and if it comes
    %% back up to us and a child (direct or not) was the target, propagate
    %% the death to other siblings or even ourselves
    sup_propagation(Pid, Strategy, Tolerance, Children, T, Deaths, N-1, Filters);
propagate_death([{_Restart, {_Atom, _Pid, _Attrs}}|T], Deaths, N, Filters) ->
    %% non-target worker
    propagate_death(T, Deaths, N-1, Filters);
propagate_death([{App, [{_,{Strategy,Pid,Tolerance,Children,_Attrs}}]}|T], Deaths, N, Filters) when is_atom(App) ->
    %% Skip to the next app. Since we represent the root process of the app, we may
    %% need to do propagation of our own.
    sup_propagation(Pid, Strategy, Tolerance, Children, T, Deaths, N, Filters).

%% @private looks for the target below the supervisor `Pid'. When it is not
%% there, the search goes on in `Rest' with the count that is left; when it
%% is, the supervisor's reaction to the death is modeled by
%% `handle_child_death/7'.
sup_propagation(Pid, Strategy, Tolerance, Children, Rest, Deaths, Count, Filters) ->
    case propagate_death(Children, Deaths, Count, Filters) of
        {not_in_tree, NewN} ->
            propagate_death(Rest, Deaths, NewN, Filters);
        {KillPid, NewDeaths} when is_pid(KillPid) ->
            handle_child_death(Pid, KillPid, Strategy, Tolerance, NewDeaths, Deaths, Children)
    end.

%% @private Act as a supervisor and apply a modeled version of the various
%% restart strategies to children:
%% - if a one_for_one/sofo supervisor sibling dies, none of the other siblings should die
%% - if a rest_for_one supervisor sibling (ancestor) dies, the newer ones should die
%% - if a one_for_all supervisor sibling dies, they all die.
%% - if a worker dies, add the count to the parent ({child_dead, SupPid, Stamp})
-spec handle_child_death(pid(), pid(), strategy(), {intensity(), period()},
                         [death_event()], [death_event()], apptree()) -> {pid(), [death_event()]}.
handle_child_death(Pid, KillPid, Strategy, {Intensity, Period}, NewDeaths, Deaths, Children) ->
    Now = stamp(),
    WindowStart = Now - Period,
    Direct = get_child_pids(Children),
    %% Each death of a direct child counts as one failure against this
    %% supervisor, stamped with the time of the child's death.
    ChildFailures = [{child_dead, Pid, Stamp}
                     || {dead, Child, Stamp} <- NewDeaths,
                        lists:member(Child, Direct)],
    %% Should the supervisor die, or just some of its children?
    Recent = qualifying_deaths(Pid, WindowStart, ChildFailures ++ Deaths),
    Events = case length(Recent) > Intensity of
        true ->
            %% tolerance exceeded: the supervisor goes down with its children
            [{dead, Pid, Now} | all_dead(Children)];
        false ->
            Shutdowns = propagate(Strategy, ChildFailures, Children),
            dedupe_append([Shutdowns, ChildFailures, NewDeaths])
    end,
    {KillPid, Events}.

%% @private implement the propagation strategy on a list of children.
%% Returns the extra events caused by the restart strategy given what died.
%% NOTE(review): the caller visible in this file passes
%% `{child_dead, SupPid, Stamp}' events as second argument rather than bare
%% pids; with such a value the `rest_for_one' comparison below never matches
%% a child pid and the clause yields `[]' - confirm whether this is intended.
-spec propagate(strategy(), [pid()], [worker()|sup()]) -> [pid()].
propagate(_, [], _) ->
    % no dead children
    [];
propagate(one_for_all, _, Children) ->
    all_dead(Children);
propagate(rest_for_one, [DeadPid], Children) ->
    %% The children are in reverse order, so we dropwhile to the child
    NotTheDeadOne = fun({_, Pid, _}) -> Pid =/= DeadPid end,
    lists:dropwhile(NotTheDeadOne, all_dead(Children));
propagate(one_for_one, Dead, _) ->
    %% one_for_one remains as-is
    Dead;
propagate(simple_one_for_one, Dead, _) ->
    %% simple_one_for_one remains as-is
    Dead.

%% @private Flattens a list of event lists into a single list, skipping
%% every event for which another event with the same tag and pid shows up
%% later on: duplicates can interfere with the frequency counts, so only the
%% latest instance of each is kept.
-spec dedupe_append([[death_event()]]) -> [death_event()].
dedupe_append([]) -> [];
dedupe_append([[] | Lists]) -> dedupe_append(Lists);
dedupe_append([[{Tag, Pid, _} = Event | Events] | Lists]) ->
    Remaining = [Events | Lists],
    SameSubject = fun({Type, P, _}) -> {Type, P} == {Tag, Pid};
                     (_) -> false
                  end,
    SeenLater = lists:any(fun(List) -> lists:any(SameSubject, List) end,
                          Remaining),
    case SeenLater of
        true -> dedupe_append(Remaining);
        false -> [Event | dedupe_append(Remaining)]
    end.

%% @private mark all direct children as dead
-spec all_dead([worker()|sup()]) -> [death_event()].
all_dead(Children) ->
    dead_events(stamp(), get_child_pids(Children)).

%% @private mark all direct and indirect children as dead
-spec recursive_all_dead([worker()|sup()]) -> [death_event()].
recursive_all_dead(Children) ->
    dead_events(stamp(), get_subtree_pids(Children)).

%% @private one `dead' event per pid, all sharing the same timestamp
dead_events(Now, Pids) ->
    lists:map(fun(Pid) -> {dead, Pid, Now} end, Pids).

%% @private all `child_dead' events recorded against `Pid' that have
%% happened on or after a deadline
-spec qualifying_deaths(pid(), stamp(), [death_event()]) -> [death_event()].
qualifying_deaths(Pid, Deadline, Deaths) ->
    lists:filter(
        fun({child_dead, P, S}) -> P =:= Pid andalso S >= Deadline;
           (_) -> false
        end,
        Deaths).

%% @private monotonic timestamp. Must have the same granularity as
%% what supervisors use on their own to get filtering to work (seconds).
-spec stamp() -> stamp().
stamp() ->
    erlang:monotonic_time(second).

%% @private get the pids of all direct children of a process'
%% child list; entries without a process (`noproc') are left out
-spec get_child_pids([worker()|sup()]) -> [pid()].
get_child_pids(Children) ->
    lists:flatmap(
        fun({_, noproc}) -> [];
           ({_, {_, Pid, _, _, _}}) -> [Pid]; % supervisor
           ({_, {_, Pid, _}}) -> [Pid]        % worker
        end,
        Children).

%% @private get the pids of all direct or indirect children of a process'
%% child list, each supervisor being listed right before its own subtree
-spec get_subtree_pids([worker()|sup()]) -> [pid()].
get_subtree_pids(Children) ->
    lists:flatmap(
        fun({_, noproc}) -> [];
           ({_, {_, Pid, _}}) -> [Pid];
           ({_, {_, Pid, _, Kids, _}}) -> [Pid | get_subtree_pids(Kids)]
        end,
        Children).

%% @private get the pids of all supervisors that are direct or indirect
%% children of a process' child list. Workers and `noproc' entries are
%% ignored, and application entries are searched through as well.
-spec supervisor_pids([worker()|sup()]) -> [pid()].
supervisor_pids(Entries) ->
    lists:flatmap(
        fun({_, noproc}) -> [];
           ({_, {_, _, _}}) -> [];
           ({_, {_, Pid, _, Children, _}}) -> [Pid | supervisor_pids(Children)];
           ({_, Sup}) when is_list(Sup) -> supervisor_pids(Sup)
        end,
        Entries).