├── .circleci
│   └── config.yml
├── .gitignore
├── LICENSE
├── README.md
├── include
│   └── lashup.hrl
├── rebar.config
├── rebar.lock
├── rebar3
├── src
│   ├── lashup.app.src
│   ├── lashup.erl
│   ├── lashup_app.erl
│   ├── lashup_config.erl
│   ├── lashup_core_sup.erl
│   ├── lashup_gm.erl
│   ├── lashup_gm_events.erl
│   ├── lashup_gm_fanout.erl
│   ├── lashup_gm_mc.erl
│   ├── lashup_gm_mc_events.erl
│   ├── lashup_gm_mc_sup.erl
│   ├── lashup_gm_probe.erl
│   ├── lashup_gm_route.erl
│   ├── lashup_gm_route_events.erl
│   ├── lashup_gm_sup.erl
│   ├── lashup_gm_sync_worker.erl
│   ├── lashup_gm_worker_sup.erl
│   ├── lashup_hyparview_events.erl
│   ├── lashup_hyparview_membership.erl
│   ├── lashup_hyparview_ping_handler.erl
│   ├── lashup_kv.erl
│   ├── lashup_kv_aae_mgr.erl
│   ├── lashup_kv_aae_sup.erl
│   ├── lashup_kv_sup.erl
│   ├── lashup_kv_sync_rx_fsm.erl
│   ├── lashup_kv_sync_tx_fsm.erl
│   ├── lashup_platform_sup.erl
│   ├── lashup_save.erl
│   ├── lashup_sup.erl
│   └── lashup_utils.erl
└── test
    ├── lashup_gm_route_SUITE.erl
    ├── lashup_hyparview_SUITE.erl
    ├── lashup_kv_SUITE.erl
    └── lashup_kv_aae_SUITE.erl

--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | jobs:
3 |   build:
4 |     working_directory: '~/lashup'
5 |     docker:
6 |       - image: erlang:21.1
7 |     steps:
8 |       - checkout
9 |       - run:
10 |           name: Updating rebar3
11 |           command: ./rebar3 update
12 |       - run:
13 |           name: Fetching dependencies
14 |           command: ./rebar3 get-deps
15 |       - run:
16 |           name: Building
17 |           command: ./rebar3 compile
18 |       - run:
19 |           name: Checking eunit tests
20 |           command: ./rebar3 eunit -v
21 |       - run:
22 |           name: Checking common tests
23 |           command: ./rebar3 ct -v
24 |       - run:
25 |           name: Running cross reference analysis
26 |           command: ./rebar3 xref
27 |       - run:
28 |           name: Running static analyzer
29 |           command: ./rebar3 dialyzer
30 |       - run:
31 |           name: Checking code style
32 |           command: ./rebar3 as lint lint
33 |       - run:
34 |           name: Performing coverage analysis
35 |           command: ./rebar3 as test cover
36 |       - run:
37 |           name: Generating cover report
38 |           command: ./rebar3 as test covertool generate
39 |       - run:
40 |           name: Installing python
41 |           command: |
42 |             apt-get update
43 |             apt-get install -y --no-install-recommends python3-pip
44 |       - run:
45 |           name: Installing codecov
46 |           command: pip3 install codecov
47 |       - run:
48 |           name: Sending cover report
49 |           command: |
50 |             codecov -X gcov -f _build/test/covertool/lashup.covertool.xml
51 |       - store_artifacts:
52 |           path: ~/lashup/_build/test/logs
53 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | _build/
2 | doc/
3 | *.plt
4 | *.xml
5 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | 
2 |                                  Apache License
3 |                            Version 2.0, January 2004
4 |                         http://www.apache.org/licenses/
5 | 
6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 | 
8 |    1. Definitions.
9 | 
10 |       "License" shall mean the terms and conditions for use, reproduction,
11 |       and distribution as defined by Sections 1 through 9 of this document.
12 | 
13 |       "Licensor" shall mean the copyright owner or entity authorized by
14 |       the copyright owner that is granting the License.
15 | 
16 |       "Legal Entity" shall mean the union of the acting entity and all
17 |       other entities that control, are controlled by, or are under common
18 |       control with that entity.
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2013 Mesosphere 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Lashup
2 | 
3 | [![CircleCI][circleci badge]][circleci]
4 | [![Coverage][coverage badge]][covercov]
5 | [![Jira][jira badge]][jira]
6 | [![License][license badge]][license]
7 | [![Erlang Versions][erlang version badge]][erlang]
8 | 
9 | ## Summary
10 | 
11 | Lashup is a building block for a distributed control plane. It acts as a failure detector, a distributed fully-replicated CRDT store, and a multicast system. It isn't meant to be used by itself, but rather in conjunction with other components; to that end, it is made up of several components, described below. We currently use it in our [Minuteman Distributed Load Balancer](https://github.com/dcos/minuteman), and we've [publicly evaluated](https://github.com/dcos/minuteman#evaluation) its fault-tolerance capabilities together with Minuteman.
12 | 
13 | ### Overlay Builder
14 | 
15 | The core of Lashup is the overlay builder. This builds a strongly connected, sparse graph of TCP connections used for communication. The primary reason for this design is scalability: if every node had to check the health of every other node, or maintain connections to every other node, we would quickly see scalability problems. In the code, these are the files prefixed with `lashup_hyparview`, as the implementation is heavily based on [HyParView](http://asc.di.fct.unl.pt/~jleitao/pdf/dsn07-leitao.pdf), but with a few changes:
16 | 
17 | 1. Randomized timers are used instead of rounds.
18 | 2. A throttling mechanism is employed. We found that the overlay takes longer to become stable if joins are not throttled.
19 | 3. The overlay members are stored to disk and their health is occasionally checked.
20 | 
21 | ### Routing & Failure Detection
22 | 
23 | Atop the overlay, we have a routing and failure detection layer. Each node in the Lashup network distributes its adjacencies to every other node in the system, and these are stored in memory. We found that at 1000 nodes, the typical size of this database is under 50 MB. The *fan-out* is set to **6** based on some static tunables in the overlay layer. Whenever there are changes to a given node's adjacency table, it gossips them throughout the network.
24 | 
25 | The nodes also check the health of one another. The health-check algorithm is adaptive: we set the initial expected round-trip time to 1 second, and then proceed to ping every one of our adjacencies every second. The historical round-trip times are cached, and we use a logarithmic scale to determine how long future round trips should be allowed to take. If a node fails to respond, we immediately disconnect from it, and propagate that information through the gossip protocol. At that point, all of the nodes that are also connected to the suspect node proceed to check its health. This results in typical sub-second failure detection. If the overlay is unstable, the ping intervals are naturally elevated, which inhibits rejecting nodes from the overlay.
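
The deadline computation itself is small. The following is a minimal sketch, not the actual `lashup_hyparview_ping_handler` code: it assumes a cache of recent round-trip times in milliseconds (`next_ping_timeout/1` is a hypothetical helper), and derives the next ping deadline from the `min_ping_ms`, `max_ping_ms`, and `ping_log_base` tunables in `lashup_config` (defaults: 100 ms, 1000 ms, and 1.007):

```erlang
%% Sketch only: derive the next ping deadline from observed RTTs.
-spec next_ping_timeout([number()]) -> number().
next_ping_timeout([]) ->
    lashup_config:max_ping_ms();
next_ping_timeout(RttHistoryMs) ->
    MeanRtt = lists:sum(RttHistoryMs) / length(RttHistoryMs),
    %% Logarithmic scale: log_{ping_log_base}(MeanRtt). Small RTT swings barely
    %% move the deadline, while pathological RTTs saturate at max_ping_ms.
    Scaled = math:log(max(MeanRtt, 1.0)) / math:log(lashup_config:ping_log_base()),
    min(lashup_config:max_ping_ms(), max(lashup_config:min_ping_ms(), Scaled)).
```

With the defaults, a 5 ms mean RTT yields `math:log(5) / math:log(1.007)`, roughly 230 ms, clamped to the [100, 1000] ms window.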
26 | 
27 | Once we have this adjacency table, we run a depth-first search over it to build a spanning tree. This spanning tree then becomes the mechanism for building sender-rooted multicast messages.
28 | 
29 | ### Distributed Database
30 | 
31 | Lastly, and most interestingly, Lashup is a distributed database. Lashup acts as a CRDT store, built on top of Basho's [riak_dt](https://github.com/basho/riak_dt). Every key is replicated to every node. Rather than performing read-modify-write cycles, the CRDT store offers operations. Specifically, you can operate on counters, maps, sets, flags, or registers, and you can compose these data structures to build a complex model. Given that Lashup isn't meant to work with a high key count, due to the naiveté of its anti-entropy algorithm, it is preferable to build a map.
32 | 
33 | The data is also persisted to disk via Mnesia. Mnesia was chosen for its reliability and its testing in production, as well as the fact that it requires no dependencies outside of Erlang itself.
34 | 
35 | ## Background
36 | 
37 | Lashup is an underlying infrastructure project of Minuteman. It enables Minuteman to determine the list of available backends to fulfill a connection request. In the context of project Peacekeeper, Lashup could disseminate the list of available IP and port endpoints. The system has to have strong performance and high-availability properties. It is also known as the "Event Fabric".
38 | 
39 | Lashup runs on every agent in a cluster. Most of the information needed can be derived from a Mesos agent's `TaskInfo` and `TaskStatus` records. Therefore, the initial goal of Lashup is to advertise up-to-date task records of an agent. In addition to this, it must replicate this state to all other nodes running Lashup. This can be modeled as a key-value system, where every agent has its own namespace that only it can write to, and other agents can read from. The information used from a `TaskInfo` record is estimated to be under a kilobyte, and the `TaskStatus` size is likely to be under 512 bytes. Efficiencies can be gained by advertising only mutations, such as the work from the [Delta-CRDT paper](http://arxiv.org/abs/1410.2803).
40 | 
41 | Since this information is used to make routing decisions, it must be disseminated in a timely manner. Lashup should provide strong guarantees about timely convergence not only under partial failure, but under typical operating conditions as well. Although it is not a strong requirement, Lashup should also provide a mechanism to determine whether a replica (an agent) has entirely failed, to avoid sending requests to it. It is likely that a different mechanism will have to be used for state dissemination versus liveness checking.
42 | 
43 | ### Failure Detector
44 | 
45 | Prior to having a gossip protocol, Lashup needs to provide a failure detector for Minuteman. The purpose of the failure detector is to allow faster dissemination of data — whether it be through a cache, a broker, or something else. The reason for this is that agents can advertise their state independently of their health. Doing this allows us to handle stale data better, because we can retire the advertised VIPs of agents more quickly than the period an epidemic-gossip system would take to disseminate that data.
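
As a sketch of that idea (with `retire_vips/1` as a hypothetical application callback), a consumer could subscribe to the `lashup_gm_events` `gen_event` (see `src/lashup_gm_events.erl` below) and retire a node's advertised state as soon as a membership change is observed:

```erlang
-module(vip_watcher).
-export([watch_members/0]).

%% Sketch: react to membership changes faster than data dissemination converges.
watch_members() ->
    {ok, Ref} = lashup_gm_events:subscribe(),
    watch_loop(Ref).

watch_loop(Ref) ->
    receive
        {lashup_gm_events, #{ref := Ref, type := member_change, member := Member}} ->
            retire_vips(Member),
            watch_loop(Ref);
        {lashup_gm_events, #{ref := Ref}} ->
            %% e.g. new_member events; nothing to retire
            watch_loop(Ref)
    end.

%% Hypothetical application-specific cleanup.
retire_vips(_Member) ->
    ok.
```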
46 | 
47 | We need a failure detector that's quick to converge (less than 1.5 seconds to disseminate a member failure to 90% of the cluster), and one that can detect asymmetric network partitions. For example, assume we have masters M1, M2, and M3, and agents A1, A2, A3, A4, and A5. We want to be able to detect the case where A[1-5] can communicate with M[1-3], but A[1-2] cannot communicate with A[3-5]. Ideally, we also want the failure detector to be able to instigate health checks based on Minuteman connections. We do not want to mark any member of the set A[1-5] as failed if A[1-5] cannot communicate with M[1-3]. Such [network partitions happen often](https://queue.acm.org/detail.cfm?id=2655736), and many of them are caused by misconfiguration.
48 | 
49 | ## Use-cases
50 | 
51 | ### Detecting failures
52 | 
53 | The first obvious use-case for Lashup is to detect agent failures in the system. If we're sending traffic to a node via a load balancing mechanism, we can react to node failures faster than Mesos (10 minutes) or other mechanisms can detect failure, by using techniques like gossip and indirect health checks. Otherwise, DCOS relies on Mesos, which can be slow to detect faults.
54 | 
55 | ### Publishing VIPs
56 | 
57 | The second use-case for Lashup is to publish VIPs for Minuteman. These VIPs are derived by reading the local agent's `state.json` and extracting the VIPs from it. The result is a map of sets. These maps can simply be merged across agents, with the merge operation for each set being a set union.
58 | 
59 | ### Powering Novel Load Balancing Algorithms
60 | 
61 | Once there is a reliable failure detector and a mechanism to disseminate data, it can be extended. Although we could trivially implement local least-connections or global random balancing algorithms, such algorithms are quite often not the ideal mechanism for interactive web services. There is one algorithm that serves the purpose better: Microsoft's [Join-Idle-Queue](http://research.microsoft.com/apps/pubs/default.aspx?id=153348). It effectively requires a mechanism to expose queue depth to routers. Other algorithms, such as performance-based routing, require very small amounts of metadata to be exposed, with very short update periods.
62 | 
63 | ### Security
64 | 
65 | Security is being implemented by project Peacekeeper. Peacekeeper intercepts the first packet of every connection. The complexity lies in determining the destination: the task list must be queried to find the task with a matching `NetworkInfo` record and allocated ports. This information is difficult to cache, as it has high churn, and overly aggressive caching can potentially lead to a security compromise.
66 | 
67 | In such a distributed filtering system, there is also a requirement to distribute the filtering rules to all of the nodes in a timely fashion, otherwise users can be left without connectivity, or with too much connectivity.
68 | 
69 | ## Usage
70 | 
71 | Integrating Lashup into your own project is easy! You must set the `{mnesia, dir}` configuration for the KV store to persist its data, and `{lashup, work_dir}` to persist the Lamport clock used in the gossip algorithm. You must also configure contact nodes via `{lashup, contact_nodes}`. These nodes are used to bootstrap the overlay; at least one of them must be up for a new node to join. A configuration sketch follows.
72 | 
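A minimal `sys.config` sketch for these settings (the paths and node names are illustrative placeholders, not defaults):

```erlang
[
    {mnesia, [{dir, "/var/lib/myapp/mnesia"}]},
    {lashup, [
        {work_dir, "/var/lib/myapp/lashup"},
        {contact_nodes, ['lashup@10.0.0.1', 'lashup@10.0.0.2']}
    ]}
].
```
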
73 | ### Membership
74 | 
75 | You can get a snapshot of the global membership by calling `lashup_gm:gm()`. If you're interested in finding out about changes in the global membership, you can subscribe to the `lashup_gm_events` `gen_event`. If you're interested in reachability information, you can call `lashup_gm_route:get_tree(Origin)` to get the DFS tree rooted at `Origin`. It returns a map, where each key is a destination node name, and the value is either the distance from the given origin, or `infinity` if the node is unreachable.
76 | 
77 | ### Key-Value Store
78 | 
79 | The key-value store exposes a traditional API and a subscription API. You can request operations against the key-value store by simply executing `lashup_kv:request_op(Key, Op)`, where `Op` is a `riak_dt` operation. A few examples can be found in the [test suite](https://github.com/dcos/lashup/blob/master/test/lashup_kv_SUITE.erl).
80 | 
81 | #### Subscription API
82 | 
83 | The subscription API allows you to subscribe to key-value changes that match a specific pattern. We use standard `ets` match specs for this purpose. The match spec must return `true`, and its head must be a 1-tuple whose only member is a key pattern. For example:
84 | `ets:fun2ms(fun({[node_metadata|_]}) -> true end)`.
85 | 
86 | Then you can simply call `lashup_kv:subscribe(ets:fun2ms(fun({[node_metadata|_]}) -> true end))`. This will dump all existing keys matching the given pattern, and send you future updates to keys matching the same pattern. In addition, Lashup deduplicates updates and only sends you new ones.
87 | 
88 | ### Multicast
89 | 
90 | The simplest way to use the multicast API is to call `lashup_gm_mc_events:subscribe([Topic])`. All updates to `Topic` are streamed to the caller. You can send updates to a topic via `lashup_gm_mc:multicast(Topic, Data)`, where `Topic` is the aforementioned topic, and `Data` is whatever you'd like.
91 | 
92 | 
93 | [circleci badge]: https://img.shields.io/circleci/project/github/dcos/lashup/master.svg?style=flat-square
94 | [coverage badge]: https://img.shields.io/codecov/c/github/dcos/lashup/master.svg?style=flat-square
95 | [jira badge]: https://img.shields.io/badge/issues-jira-yellow.svg?style=flat-square
96 | [license badge]: https://img.shields.io/github/license/dcos/lashup.svg?style=flat-square
97 | [erlang version badge]: https://img.shields.io/badge/erlang-21.x-blue.svg?style=flat-square
98 | 
99 | 
100 | [circleci]: https://circleci.com/gh/dcos/lashup
101 | [covercov]: https://codecov.io/gh/dcos/lashup
102 | [jira]: https://jira.dcos.io/issues/?jql=component+%3D+networking+AND+project+%3D+DCOS_OSS
103 | [license]: ./LICENSE
104 | [erlang]: http://erlang.org/
105 | 
--------------------------------------------------------------------------------
/include/lashup.hrl:
--------------------------------------------------------------------------------
1 | -record(member, {
2 |     node :: Node :: node(),
3 |     locally_updated_at = [] :: [integer()],
4 |     clock_deltas = [] :: [integer()],
5 |     active_view = erlang:error() :: [node()],
6 |     value = erlang:error() :: map()
7 | }).
8 | 
9 | -record(member2, {
10 |     node :: Node :: node(),
11 |     last_heard :: integer(),
12 |     active_view = erlang:error() :: [node()],
13 |     value = erlang:error() :: map()
14 | }).
15 | 
16 | -type member() :: #member{}.
17 | -type member2() :: #member2{}.
18 | 
--------------------------------------------------------------------------------
/rebar.config:
--------------------------------------------------------------------------------
1 | {minimum_otp_vsn, "21"}.
2 | 
3 | {erl_opts, [
4 |     debug_info,
5 |     warnings_as_errors
6 | ]}.
7 | 8 | {deps, [ 9 | prometheus, 10 | {riak_dt, {git, "https://github.com/dcos/riak_dt.git", {branch, "make-faster"}}} 11 | ]}. 12 | 13 | {eunit_opts, [ 14 | {cover_enabled, true}, 15 | verbose, 16 | {report, {eunit_surefire, [{dir, "."}]}} 17 | ]}. 18 | 19 | {cover_enabled, true}. 20 | {cover_print_enabled, true}. 21 | {cover_export_enabled, true}. 22 | 23 | {xref_checks, []}. 24 | {xref_queries, [{"(XC - UC) || (XU - X - B - \"(dtrace)\" : Mod)", []}]}. 25 | 26 | {profiles, [ 27 | {test, [ 28 | {plugins, [ 29 | {covertool, "2.0.0"} 30 | ]}, 31 | {deps, [ 32 | proper 33 | ]} 34 | ]}, 35 | {dev, [ 36 | {deps, [ 37 | sync 38 | ]} 39 | ]}, 40 | {lint, [ 41 | {plugins, [ 42 | {rebar3_lint, "0.1.10"} 43 | ]} 44 | ]}, 45 | {prod, [ 46 | {relx, [ 47 | {dev_mode, false} 48 | ]} 49 | ]} 50 | ]}. 51 | 52 | {elvis, [ 53 | #{ 54 | dirs => [ 55 | "src" 56 | ], 57 | filter => "lashup*.erl", 58 | ignore => [zbase32], 59 | rules => [ 60 | {elvis_style, max_function_length, #{ 61 | max_length => 30, 62 | ignore_functions => [ 63 | {lashup_gm_route, handle_event}, 64 | {lashup_hyparview_membership, handle_info}, 65 | %% There is a bug in elvis that makes these problematic (https://github.com/inaka/elvis/issues/407) 66 | {lashup_config, passive_view_size}, 67 | {lashup_config, active_view_size}, 68 | {lashup_kv, get_lclock}, 69 | {lashup_kv, upgrade_table} 70 | ] 71 | }}, 72 | {elvis_style, no_spec_with_records}, 73 | {elvis_style, dont_repeat_yourself, #{min_complexity => 20}}, 74 | {elvis_style, no_behavior_info}, 75 | {elvis_style, used_ignored_variable}, 76 | {elvis_style, nesting_level, #{level => 3}}, 77 | {elvis_style, god_modules, #{limit => 25}}, 78 | {elvis_style, no_if_expression}, 79 | {elvis_style, line_length, #{limit => 120, count_comments => false}}, 80 | {elvis_style, no_tabs}, 81 | {elvis_style, no_trailing_whitespace}, 82 | {elvis_style, macro_names}, 83 | {elvis_style, macro_module_names}, 84 | {elvis_style, operator_spaces, #{rules => [{right, ","}, {right, "++"}, {left, "++"}]}} 85 | ] 86 | } 87 | ]}. 88 | -------------------------------------------------------------------------------- /rebar.lock: -------------------------------------------------------------------------------- 1 | {"1.1.0", 2 | [{<<"prometheus">>,{pkg,<<"prometheus">>,<<"4.2.0">>},0}, 3 | {<<"riak_dt">>, 4 | {git,"https://github.com/dcos/riak_dt.git", 5 | {ref,"b5c3fa058f9e56ec14f33b68d5d07ea6fbb97538"}}, 6 | 0}]}. 7 | [ 8 | {pkg_hash,[ 9 | {<<"prometheus">>, <<"06C58BFDFE28D3168B926DA614CB9A6D39593DEEBDE648A5480E32DFA3C370E9">>}]} 10 | ]. 11 | -------------------------------------------------------------------------------- /rebar3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dcos/lashup/a0661006c38237147252ebe0b98cc2c29352c976/rebar3 -------------------------------------------------------------------------------- /src/lashup.app.src: -------------------------------------------------------------------------------- 1 | {application, lashup, [ 2 | {description, "Distributed control plane"}, 3 | {vsn, "0.1"}, 4 | {registered, []}, 5 | {licenses, ["Apache"]}, 6 | {applications, [ 7 | kernel, 8 | stdlib, 9 | riak_dt, 10 | prometheus 11 | ]}, 12 | {mod, {lashup_app, []}}, 13 | {env, []} 14 | ]}. 15 | -------------------------------------------------------------------------------- /src/lashup.erl: -------------------------------------------------------------------------------- 1 | -module(lashup). 2 | -author("sdhillon"). 
3 | 
4 | -export([
5 |     start/0,
6 |     stop/0
7 | ]).
8 | 
9 | start() ->
10 |     application:ensure_all_started(lashup).
11 | 
12 | stop() ->
13 |     application:stop(lashup).
14 | 
--------------------------------------------------------------------------------
/src/lashup_app.erl:
--------------------------------------------------------------------------------
1 | -module(lashup_app).
2 | -behaviour(application).
3 | -export([start/2, stop/1]).
4 | 
5 | start(_StartType, _StartArgs) ->
6 |     lashup_sup:start_link().
7 | 
8 | stop(_State) ->
9 |     ok.
10 | 
--------------------------------------------------------------------------------
/src/lashup_config.erl:
--------------------------------------------------------------------------------
1 | -module(lashup_config).
2 | -author("sdhillon").
3 | 
4 | %% These are the constants for the sizes of views from the HyParView paper
5 | -define(K, 6).
6 | 
7 | %% The original C was 1
8 | %% I think for our use-case, we can bump it to 3?
9 | -define(C, 3).
10 | % This number is actually log10(10000)
11 | -define(LOG_TOTAL_MEMBERS, 4).
12 | 
13 | -define(DEFAULT_ACTIVE_VIEW_SIZE, ?LOG_TOTAL_MEMBERS + ?C).
14 | -define(DEFAULT_PASSIVE_VIEW_SIZE, ?K * (?LOG_TOTAL_MEMBERS + ?C)).
15 | %% The interval at which we try to join the contact nodes, in milliseconds
16 | -define(DEFAULT_JOIN_INTERVAL, 1000).
17 | -define(DEFAULT_NEIGHBOR_INTERVAL, 10000).
18 | 
19 | -define(DEFAULT_SHUFFLE_INTERVAL, 60000).
20 | 
21 | -define(DEFAULT_MAX_PING_MS, 1000).
22 | %% This is here as a "noise floor"
23 | -define(DEFAULT_MIN_PING_MS, 100).
24 | 
25 | %% LOG_BASE chosen such that
26 | %% log(?MAX_PING_MS) / log(?LOG_BASE) ~= ?MAX_PING_MS
27 | -define(DEFAULT_LOG_BASE, 1.007).
28 | 
29 | %% API
30 | -export([
31 |     arwl/0,
32 |     prwl/0,
33 |     contact_nodes/0,
34 |     protocol_period/0,
35 |     full_probe_period/0,
36 |     min_departition_probe_interval/0,
37 |     max_mc_replication/0,
38 |     aae_interval/0,
39 |     work_dir/0,
40 |     bloom_interval/0,
41 |     aae_after/0,
42 |     join_timeout/0,
43 |     aae_neighbor_check_interval/0,
44 |     shuffle_interval/0,
45 |     active_view_size/0,
46 |     passive_view_size/0,
47 |     join_interval/0,
48 |     neighbor_interval/0,
49 |     min_ping_ms/0,
50 |     max_ping_ms/0,
51 |     ping_log_base/0,
52 |     gc_timeout/0,
53 |     aae_route_event_wait/0,
54 |     update_contact_nodes/1
55 | ]).
56 | 
57 | %% @doc
58 | %% The following three config values are hyparview internals.
59 | %% Associated with the join procedure, there are two configuration parameters,
60 | %% named Active Random Walk Length (ARWL), which specifies the maximum number of hops a
61 | %% ForwardJoin request is propagated, and Passive Random Walk Length (PRWL), which specifies
62 | %% at which point in the walk the node is inserted into a passive view. To use these parameters, the
63 | %% ForwardJoin request carries a “time to live” field that is initially set to ARWL and decreased
64 | %% at every hop.
65 | 
66 | %% Effectively, they're used during the join process to disseminate the joining node to other nodes.
67 | %% Contact nodes are the first members of the overlay that lashup knows about.
68 | %% @end
69 | 
70 | 
71 | %% Active Random Walk Length
72 | -spec(arwl() -> non_neg_integer()).
73 | arwl() ->
74 |     get_env(arwl, 8).
75 | 
76 | %% Passive Random Walk Length
77 | -spec(prwl() -> non_neg_integer()).
78 | prwl() ->
79 |     get_env(prwl, 5).
80 | 
81 | %% How long to wait for net_adm:ping when doing the initial join
82 | %% After this it becomes async
83 | -spec(join_timeout() -> non_neg_integer()).
84 | join_timeout() ->
85 |     get_env(join_timeout, 250).
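%% Worked example (illustrative): with the defaults arwl() = 8 and prwl() = 5, a
%% ForwardJoin for a joining node starts with ttl = 8; each hop decrements it, and
%% once it drops to 5 (i.e. after 3 hops) the node handling the request also adds
%% the joiner to its passive view.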
86 | 
87 | %%
88 | -spec(contact_nodes() -> ordsets:ordset(node())).
89 | contact_nodes() ->
90 |     Nodes = ordsets:from_list(get_env(contact_nodes, [])),
91 |     ordsets:del_element(node(), Nodes).
92 | 
93 | %% We handle reactive changes a little bit differently than the paper.
94 | %% In empirical testing, making everything reactive resulted in a thundering herd.
95 | %% The protocol period is effectively how often (in ms) we check the protocol for activating reactive changes.
96 | 
97 | -spec(protocol_period() -> non_neg_integer()).
98 | protocol_period() ->
99 |     get_env(protocol_period, 300).
100 | 
101 | 
102 | %% The next two variables relate to the de-partitioning behaviour.
103 | %% full_probe_period shouldn't be lower than 10 minutes, and it's used to probe the global membership table's
104 | %% down nodes.
105 | 
106 | %% Using a probabilistic algorithm, we try to scan all the unreachable nodes every full_probe_period, without
107 | %% sending more than one probe every min_departition_probe_interval ms.
108 | 
109 | -spec(full_probe_period() -> non_neg_integer()).
110 | full_probe_period() ->
111 |     get_env(full_probe_period, 600000).
112 | 
113 | -spec(min_departition_probe_interval() -> non_neg_integer()).
114 | min_departition_probe_interval() ->
115 |     get_env(min_departition_probe_interval, 12000).
116 | 
117 | %% @doc
118 | %% How many extra copies of a message to send through multicast
119 | %% @end
120 | -spec(max_mc_replication() -> pos_integer()).
121 | max_mc_replication() ->
122 |     get_env(max_mc_replication, 2).
123 | 
124 | %% @doc
125 | %% How often we send our vector clocks for AAE, in milliseconds
126 | aae_interval() ->
127 |     get_env(aae_interval, 60000).
128 | 
129 | %% @doc
130 | %% Lashup working directory
131 | work_dir() ->
132 |     %% This is /var/lib/dcos/lashup on DCOS
133 |     WorkDir = get_env(work_dir, "."),
134 |     NodeNameStr = atom_to_list(node()),
135 |     filename:join(WorkDir, NodeNameStr).
136 | 
137 | 
138 | %% @doc
139 | %% How often we send our bloom filter for AAE, in milliseconds
140 | bloom_interval() ->
141 |     get_env(bloom_interval, 30000).
142 | 
143 | 
144 | %% @doc
145 | %% How long we wait until we begin to do AAE
146 | aae_after() ->
147 |     get_env(aae_after, 30000).
148 | 
149 | %% @doc
150 | %% How often we check whether there are any neighbors connected
151 | aae_neighbor_check_interval() ->
152 |     get_env(aae_neighbor_check_interval, 5000).
153 | 
154 | -spec(shuffle_interval() -> non_neg_integer()).
155 | shuffle_interval() ->
156 |     get_env(default_shuffle_interval, ?DEFAULT_SHUFFLE_INTERVAL). % NB: key name differs from the function name
157 | 
158 | -spec(join_interval() -> non_neg_integer()).
159 | join_interval() ->
160 |     get_env(join_interval, ?DEFAULT_JOIN_INTERVAL).
161 | 
162 | -spec(active_view_size() -> non_neg_integer()).
163 | active_view_size() ->
164 |     get_env(active_view_size, ?DEFAULT_ACTIVE_VIEW_SIZE).
165 | 
166 | -spec(passive_view_size() -> non_neg_integer()).
167 | passive_view_size() ->
168 |     get_env(passive_view_size, ?DEFAULT_PASSIVE_VIEW_SIZE).
169 | 
170 | -spec(neighbor_interval() -> non_neg_integer()).
171 | neighbor_interval() ->
172 |     get_env(neighbor_interval, ?DEFAULT_NEIGHBOR_INTERVAL).
173 | 
174 | -spec(min_ping_ms() -> non_neg_integer()).
175 | min_ping_ms() ->
176 |     get_env(min_ping_ms, ?DEFAULT_MIN_PING_MS).
177 | 
178 | -spec(max_ping_ms() -> non_neg_integer()).
179 | max_ping_ms() ->
180 |     get_env(max_ping_ms, ?DEFAULT_MAX_PING_MS).
181 | 
182 | -spec(ping_log_base() -> float()).
183 | ping_log_base() ->
184 |     get_env(ping_log_base, ?DEFAULT_LOG_BASE).
185 | 186 | -spec(aae_route_event_wait() -> non_neg_integer()). 187 | aae_route_event_wait() -> 188 | get_env(aae_route_event_wait, 120000). % 2 min 189 | 190 | -spec(gc_timeout() -> timeout()). 191 | gc_timeout() -> 192 | get_env(gc_timeout, 15000). 193 | 194 | get_env(Var, Default) -> 195 | application:get_env(lashup, Var, Default). 196 | 197 | -spec(update_contact_nodes([node()]) -> ok). 198 | update_contact_nodes(Nodes) -> 199 | application:set_env(lashup, contact_nodes, Nodes). 200 | -------------------------------------------------------------------------------- /src/lashup_core_sup.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_core_sup). 2 | -behaviour(supervisor). 3 | 4 | -export([start_link/0]). 5 | -export([init/1]). 6 | 7 | -define(CHILD(I, Type), {I, {I, start_link, []}, permanent, 5000, Type, [I]}). 8 | 9 | start_link() -> 10 | supervisor:start_link({local, ?MODULE}, ?MODULE, []). 11 | 12 | init([]) -> 13 | {ok, {#{}, [ 14 | ?CHILD(lashup_hyparview_events, worker), 15 | ?CHILD(lashup_hyparview_ping_handler, worker), 16 | ?CHILD(lashup_hyparview_membership, worker), 17 | ?CHILD(lashup_gm_sup, supervisor) 18 | ]}}. 19 | -------------------------------------------------------------------------------- /src/lashup_gm.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_gm). 2 | -author("sdhillon"). 3 | -behaviour(gen_server). 4 | 5 | -include_lib("kernel/include/logger.hrl"). 6 | -include_lib("kernel/include/inet.hrl"). 7 | -include_lib("stdlib/include/ms_transform.hrl"). 8 | -include("lashup.hrl"). 9 | 10 | %% API 11 | -export([ 12 | start_link/0, 13 | init_metrics/0, 14 | get_neighbor_recommendations/1, 15 | gm/0 16 | ]). 17 | 18 | %% gen_server callbacks 19 | -export([init/1, handle_call/3, 20 | handle_cast/2, handle_info/2]). 21 | 22 | -record(subscriber, { 23 | monitor_ref, 24 | node, 25 | pid 26 | }). 27 | 28 | -record(subscription, { 29 | node, 30 | pid, 31 | monitor_ref 32 | }). 33 | 34 | -record(state, { 35 | subscriptions = [], 36 | epoch = erlang:error() :: non_neg_integer(), 37 | active_view = [], 38 | subscribers = [], 39 | hyparview_event_ref :: reference() 40 | }). 41 | -type state() :: #state{}. 42 | 43 | %% @doc 44 | %% TODO: Get rid of DVVSet, and move to a pruneable datastructure 45 | %% Timeout here is limited to 500 ms, and not less 46 | %% empirically, dumping 1000 nodes pauses lashup_gm for ~300 ms. 47 | %% So we bumped this up to sit above that. We should decrease it when we get a chance 48 | %% because lashup_hyparview_membership depends on it not pausing for a long time 49 | 50 | get_neighbor_recommendations(ActiveViewSize) -> 51 | gen_server:call(?MODULE, {get_neighbor_recommendations, ActiveViewSize}, 500). 52 | 53 | gm() -> 54 | get_membership(). 55 | 56 | -spec(start_link() -> 57 | {ok, Pid :: pid()} | ignore | {error, Reason :: term()}). 58 | start_link() -> 59 | gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). 
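%% Illustrative API usage (values are hypothetical):
%%   Members = lashup_gm:gm(),
%%   %% -> [#{node => ..., time_since_last_heard => ..., active_view => ...}, ...]
%%   case lashup_gm:get_neighbor_recommendations(6) of
%%       {ok, Node} -> ...;  %% a node whose active view still has room
%%       false -> ...
%%   end.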
60 | 61 | %%%=================================================================== 62 | %%% gen_server callbacks 63 | %%%=================================================================== 64 | 65 | init([]) -> 66 | rand:seed(exsplus), 67 | %% TODO: Add jitter 68 | MyPid = self(), 69 | spawn_link(fun() -> update_node_backoff_loop(5000, MyPid) end), 70 | ets:new(members, [ordered_set, named_table, {keypos, #member2.node}]), 71 | {ok, HyparviewEventsRef} = lashup_hyparview_events:subscribe(), 72 | State = #state{epoch = new_epoch(), hyparview_event_ref = HyparviewEventsRef}, 73 | init_node(State), 74 | timer:send_interval(3600 * 1000, trim_nodes), 75 | {ok, State}. 76 | 77 | handle_call(gm, _From, State) -> 78 | {reply, get_membership(), State, lashup_utils:hibernate()}; 79 | handle_call({subscribe, Pid}, _From, State) -> 80 | {Reply, State1} = handle_subscribe(Pid, State), 81 | {reply, Reply, State1}; 82 | handle_call(update_node, _From, State) -> 83 | State1 = update_node(timed_refresh, State), 84 | {reply, 300000, State1, lashup_utils:hibernate()}; 85 | handle_call({get_neighbor_recommendations, ActiveViewSize}, _From, State) -> 86 | Reply = handle_get_neighbor_recommendations(ActiveViewSize), 87 | {reply, Reply, State}; 88 | handle_call(Request, _From, State) -> 89 | ?LOG_DEBUG("Received unknown request: ~p", [Request]), 90 | {reply, ok, State}. 91 | 92 | handle_cast({compressed, Data}, State) when is_binary(Data) -> 93 | Data1 = binary_to_term(Data), 94 | handle_cast(Data1, State); 95 | handle_cast({sync, Pid}, State) -> 96 | handle_sync(Pid, State), 97 | {noreply, State}; 98 | handle_cast(#{message := remote_event, from := From, event := #{message := updated_node} = UpdatedNode}, State) -> 99 | % ?LOG_DEBUG("Received Updated Node: ~p", [UpdatedNode]), 100 | State1 = handle_updated_node(From, UpdatedNode, State), 101 | {noreply, State1, lashup_utils:hibernate()}; 102 | handle_cast(update_node, State) -> 103 | State1 = update_node(internal_cast, State), 104 | {noreply, State1, lashup_utils:hibernate()}; 105 | handle_cast(_Request, State) -> 106 | {noreply, State}. 107 | 108 | handle_info(_Down = {'DOWN', MonitorRef, _Type, _Object, _Info}, State) when is_reference(MonitorRef) -> 109 | State1 = prune_subscribers(MonitorRef, State), 110 | State2 = prune_subscriptions(MonitorRef, State1), 111 | {noreply, State2}; 112 | handle_info({lashup_hyparview_events, #{type := current_views, ref := EventRef, active_view := ActiveView}}, 113 | State0 = #state{hyparview_event_ref = EventRef}) -> 114 | State1 = handle_current_views(ActiveView, State0), 115 | {noreply, State1, lashup_utils:hibernate()}; 116 | handle_info({nodedown, Node}, State) -> 117 | State1 = handle_nodedown(Node, State), 118 | {noreply, State1, lashup_utils:hibernate()}; 119 | handle_info(trim_nodes, State) -> 120 | trim_nodes(State), 121 | {noreply, State, lashup_utils:hibernate()}; 122 | handle_info(_Info, State) -> 123 | {noreply, State}. 124 | 125 | %%%=================================================================== 126 | %%% Internal functions 127 | %%%=================================================================== 128 | 129 | handle_sync(Pid, _State) -> 130 | lashup_gm_sync_worker:handle(Pid). 131 | 132 | %% @private Generates new epoch 133 | -spec(new_epoch() -> non_neg_integer()). 
134 | new_epoch() ->
135 |     WorkDir = lashup_config:work_dir(),
136 |     EpochFilename = filename:join(WorkDir, "lashup_gm_epoch"),
137 |     case lashup_save:read(EpochFilename) of
138 |         not_found ->
139 |             Epoch = erlang:system_time(seconds),
140 |             Data = #{epoch => Epoch},
141 |             ok = lashup_save:write(EpochFilename, term_to_binary(Data)),
142 |             Epoch;
143 |         {ok, BinaryData} ->
144 |             OldData = #{epoch := OldEpoch} = binary_to_term(BinaryData),
145 |             %% Should we check if our time is (too far) behind the last epoch?
146 |             NewEpoch = max(erlang:system_time(seconds), OldEpoch) + 1,
147 |             NewData = OldData#{epoch := NewEpoch},
148 |             ok = lashup_save:write(EpochFilename, term_to_binary(NewData)),
149 |             NewEpoch
150 |     end.
151 | 
152 | -spec(handle_subscribe(Pid :: pid(), State :: state()) -> {{ok, Self :: pid()}, State1 :: state()}).
153 | handle_subscribe(Pid, State = #state{subscribers = Subscribers}) ->
154 |     MonitorRef = monitor(process, Pid),
155 |     Subscriber = #subscriber{node = node(Pid), monitor_ref = MonitorRef, pid = Pid},
156 |     Subscribers1 = [Subscriber | Subscribers],
157 |     prometheus_gauge:set(lashup, gm_subscribers, [], length(Subscribers1)),
158 |     {{ok, self()}, State#state{subscribers = Subscribers1}}.
159 | 
160 | handle_current_views(ActiveView, State = #state{subscriptions = Subscriptions}) ->
161 |     Subscriptions1 = lists:foldl(fun check_member/2, Subscriptions, ActiveView),
162 |     OldActiveView = State#state.active_view,
163 |     case {ActiveView, Subscriptions} of
164 |         {OldActiveView, Subscriptions1} ->
165 |             ok;
166 |         _ ->
167 |             gen_server:cast(self(), update_node)
168 |     end,
169 |     prometheus_gauge:set(lashup, gm_subscriptions, [], length(Subscriptions1)),
170 |     State#state{subscriptions = Subscriptions1, active_view = ActiveView}.
171 | 
172 | 
173 | check_member(Node, Subscriptions) ->
174 |     %% Make sure that the node doesn't exist in subscriptions and it's in our connected nodes list
175 |     case {lists:keyfind(Node, #subscription.node, Subscriptions), lists:member(Node, nodes())} of
176 |         {false, true} ->
177 |             %% We should also ensure that the node is up
178 |             case catch lashup_gm_fanout:start_monitor(Node) of
179 |                 {ok, {Pid, Monitor}} ->
180 |                     Subscription = #subscription{node = Node, pid = Pid, monitor_ref = Monitor},
181 |                     ?LOG_DEBUG("Added handler for node: ~p", [Node]),
182 |                     [Subscription | Subscriptions];
183 |                 Else ->
184 |                     ?LOG_DEBUG("Unable to add handler for node: ~p, error: ~p", [Node, Else]),
185 |                     Subscriptions
186 |             end;
187 |         _ ->
188 |             Subscriptions
189 |     end.
190 | 
191 | %% @private Creates a new value for the local node
192 | new_value(_State = #state{active_view = ActiveView, epoch = Epoch}) ->
193 |     #{
194 |         active_view => ActiveView,
195 |         server_id => node(),
196 |         epoch => Epoch,
197 |         %% Positive looks nicer...
198 |         clock => erlang:unique_integer([positive, monotonic])
199 |     }.
200 | 
201 | 
202 | %% @private Creates the first Member record representing the local node
203 | init_node(State) ->
204 |     LocalUpdate = erlang:system_time(nano_seconds),
205 |     Value = new_value(State),
206 |     Member = #member2{
207 |         node = node(),
208 |         last_heard = LocalUpdate,
209 |         value = Value,
210 |         active_view = maps:get(active_view, Value)
211 |     },
212 |     persist(Member, State).
213 | 
214 | 
215 | %% @private Update the local node's value and propagate it
216 | update_node(Reason, State) ->
217 |     NewValue = new_value(State),
218 |     update_node(NewValue, Reason, State).
219 | 
220 | %% @private Take an updated Value from the local node, turn it into a message, and propagate it
221 | update_node(NewValue, Reason, State) ->
222 |     %% TODO:
223 |     %% Adjust TTL based on maximum path length from link-state database
224 |     Message = #{
225 |         message => updated_node,
226 |         node => node(),
227 |         ttl => 10,
228 |         value => NewValue,
229 |         reason => Reason
230 |     },
231 |     handle_updated_node(node(), Message, State).
232 | 
233 | handle_updated_node(_From, UpdatedNode = #{ttl := TTL}, State) when TTL < 0 ->
234 |     ?LOG_WARNING("TTL Exceeded on Updated Node: ~p", [UpdatedNode]),
235 |     State;
236 | 
237 | handle_updated_node(From, UpdatedNode = #{node := Node}, State) ->
238 |     prometheus_counter:inc(lashup, gm_updates_total, [], 1),
239 |     {message_queue_len, MsgQueueLen} =
240 |         erlang:process_info(self(), message_queue_len),
241 |     prometheus_gauge:set(lashup, gm_message_queue_length, [], MsgQueueLen),
242 |     case ets:lookup(members, Node) of
243 |         [] ->
244 |             %% Brand new, store it
245 |             store_and_forward_new_updated_node(From, UpdatedNode, State);
246 |         [Member] ->
247 |             maybe_store_store_and_forward_updated_node(Member, From, UpdatedNode, State)
248 |     end.
249 | 
250 | %% @private Take a new node we've never seen before, and store it in the membership database
251 | store_and_forward_new_updated_node(From,
252 |     UpdatedNode = #{
253 |         node := Node,
254 |         ttl := TTL,
255 |         value := Value
256 |     }, State) ->
257 |     LocalUpdate = erlang:monotonic_time(nano_seconds),
258 |     Member = #member2{
259 |         node = Node,
260 |         last_heard = LocalUpdate,
261 |         value = Value,
262 |         active_view = maps:get(active_view, Value)
263 |     },
264 |     persist(Member, State),
265 |     NewUpdatedNode = UpdatedNode#{exempt_nodes => [From], ttl => TTL - 1},
266 |     forward(NewUpdatedNode, State),
267 |     State.
268 | 
269 | 
270 | maybe_store_store_and_forward_updated_node(Member, From, UpdatedNode = #{value := RemoteValue}, State) ->
271 |     %% Should be true, if the remote one is newer
272 |     #{epoch := RemoteEpoch, clock := RemoteClock} = RemoteValue,
273 |     #{epoch := LocalEpoch, clock := LocalClock} = Member#member2.value,
274 |     case {RemoteEpoch, RemoteClock} > {LocalEpoch, LocalClock} of
275 |         true ->
276 |             store_and_forward_updated_node(Member, From, UpdatedNode, State);
277 |         %% We've seen an old clock
278 |         false ->
279 |             ok
280 |     end,
281 |     State.
282 | 
283 | store_and_forward_updated_node(Member, From, _UpdatedNode, _State)
284 |     when Member#member2.node == node() andalso From =/= node() ->
285 |     ok;
286 | store_and_forward_updated_node(Member, From,
287 |     UpdatedNode = #{
288 |         value := Value
289 |     }, State) ->
290 |     update_local_member(Value, Member, State),
291 |     NewUpdatedNode = UpdatedNode#{exempt_nodes => [From]},
292 |     forward(NewUpdatedNode, State).
293 | 
294 | 
295 | %% @doc Update a local member from a Value, and persist it to ets.
296 | %% The Value is guaranteed to be newer than the one we have now.
297 | update_local_member(Value, Member, State) ->
298 |     Now = erlang:monotonic_time(nano_seconds),
299 |     NewMember = Member#member2{
300 |         last_heard = Now,
301 |         value = Value,
302 |         active_view = maps:get(active_view, Value)
303 |     },
304 |     process_new_member(Member, NewMember, State),
305 |     persist(NewMember, State).
306 | 
307 | 
308 | forward(_NewUpdatedNode = #{ttl := TTL}, _State) when TTL =< 0 ->
309 |     ok;
310 | forward(NewUpdatedNode = #{ttl := TTL}, _State = #state{subscribers = Subscribers}) ->
311 |     NewUpdatedNode1 = NewUpdatedNode#{ttl := TTL - 1},
312 |     %% This should be small enough, no need to compress
313 |     CompressedTerm = term_to_binary(NewUpdatedNode1),
314 |     Fun =
315 |         fun(_Subscriber = #subscriber{pid = Pid}) ->
316 |             erlang:send(Pid, {event, CompressedTerm}, [noconnect])
317 |         end,
318 |     lists:foreach(Fun, Subscribers).
319 | 
320 | 
321 | handle_nodedown(Node, State = #state{subscriptions = Subscriptions, subscribers = Subscribers}) ->
322 |     ?LOG_DEBUG("Removing subscription (nodedown) from node: ~p", [Node]),
323 |     Subscriptions1 = lists:keydelete(Node, #subscription.node, Subscriptions),
324 |     Subscribers1 = lists:keydelete(Node, #subscriber.node, Subscribers),
325 |     prometheus_gauge:set(lashup, gm_subscriptions, [], length(Subscriptions1)),
326 |     prometheus_gauge:set(lashup, gm_subscribers, [], length(Subscribers1)),
327 |     State#state{subscriptions = Subscriptions1, subscribers = Subscribers1}.
328 | 
329 | get_membership() ->
330 |     ets:foldl(fun accumulate_membership/2, [], members).
331 | 
332 | 
333 | accumulate_membership(Member, Acc) ->
334 |     Now = erlang:monotonic_time(),
335 |     LastHeard = Member#member2.last_heard,
336 |     TimeSinceLastHeard = erlang:convert_time_unit(Now - LastHeard, native, milli_seconds),
337 |     Node = #{
338 |         node => Member#member2.node,
339 |         time_since_last_heard => TimeSinceLastHeard,
340 |         active_view => Member#member2.active_view
341 |     },
342 |     [Node | Acc].
343 | 
344 | trim_nodes(State) ->
345 |     Now = erlang:monotonic_time(),
346 |     Delta = erlang:convert_time_unit(86400, seconds, native),
347 |     MatchSpec = ets:fun2ms(
348 |         fun(Member = #member2{last_heard = LastHeard})
349 |             when Now - LastHeard > Delta andalso Member#member2.node =/= node()
350 |             -> Member
351 |         end
352 |     ),
353 |     Members = ets:select(members, MatchSpec),
354 |     lists:foreach(fun(X) -> delete(X, State) end, Members).
355 | 
356 | update_node_backoff_loop(Delay, Pid) ->
357 |     timer:sleep(Delay),
358 |     Backoff = gen_server:call(?MODULE, update_node, infinity),
359 |     update_node_backoff_loop(Backoff, Pid).
360 | 
361 | prune_subscribers(MonitorRef, State = #state{subscribers = Subscribers}) ->
362 |     Subscribers1 = lists:keydelete(MonitorRef, #subscriber.monitor_ref, Subscribers),
363 |     prometheus_gauge:set(lashup, gm_subscribers, [], length(Subscribers1)),
364 |     State#state{subscribers = Subscribers1}.
365 | 
366 | prune_subscriptions(MonitorRef, State = #state{subscriptions = Subscription}) ->
367 |     Subscriptions1 = lists:keydelete(MonitorRef, #subscription.monitor_ref, Subscription),
368 |     prometheus_gauge:set(lashup, gm_subscriptions, [], length(Subscriptions1)),
369 |     State#state{subscriptions = Subscriptions1}.
370 | 
371 | 
372 | %% @doc
373 | %% This function (at the moment) exists only to hint back to the hyparview membership
374 | %% for aggressive probes.
375 | %% Effectively, it means that we have observed another node evict one of our active neighbors from its active set.
376 | %% Therefore, we are going to check whether it's a dirty liar or not.
377 | %% It's less about a new member, and more about a change in another member.
378 | 
379 | %% @end
380 | -spec(process_new_member(MemberOld :: member2(), MemberNew :: member2(), State :: state()) -> ok).
381 | process_new_member(Member, NewMember, _State = #state{active_view = HyparViewActiveView}) -> 382 | ActiveView1 = Member#member2.active_view, 383 | ActiveView2 = NewMember#member2.active_view, 384 | ActiveView1Set = ordsets:from_list(ActiveView1), 385 | ActiveView2Set = ordsets:from_list(ActiveView2), 386 | RetiredMembersSet = ordsets:subtract(ActiveView1Set, ActiveView2Set), 387 | HyparViewActiveViewSet = ordsets:from_list(HyparViewActiveView), 388 | ProbeNodes = ordsets:intersection(RetiredMembersSet, HyparViewActiveViewSet), 389 | [lashup_hyparview_ping_handler:ping(ProbeNode) || ProbeNode <- ProbeNodes], 390 | ok. 391 | 392 | handle_get_neighbor_recommendations(ActiveViewSize) -> 393 | MatchSpec = ets:fun2ms( 394 | fun(Member = #member2{active_view = ActiveView}) 395 | when length(ActiveView) < ActiveViewSize andalso Member#member2.node =/= node() 396 | -> Member#member2.node 397 | end 398 | ), 399 | case ets:select(members, MatchSpec, 100) of 400 | {Members, _Continuation} -> 401 | [Member|_] = lashup_utils:shuffle_list(Members), 402 | {ok, Member}; 403 | '$end_of_table' -> 404 | false 405 | end. 406 | 407 | %% ETS write functions 408 | delete(Member = #member2{}, _State) -> 409 | lashup_gm_route:delete_node(Member#member2.node), 410 | ets:delete(members, Member#member2.node), 411 | prometheus_gauge:set( 412 | lashup, gm_members, [], 413 | ets:info(members, size)), 414 | true. 415 | 416 | 417 | %% TODO: 418 | %% Rewrite both 419 | -spec(persist(Member :: member2(), State :: state()) -> ok). 420 | persist(Member, _State) -> 421 | lashup_gm_route:update_node(Member#member2.node, Member#member2.active_view), 422 | case ets:lookup(members, Member#member2.node) of 423 | [OldMember] -> 424 | ets:insert(members, Member), 425 | lashup_gm_events:ingest(OldMember, Member); 426 | [] -> 427 | ets:insert(members, Member), 428 | prometheus_gauge:set( 429 | lashup, gm_members, [], 430 | ets:info(members, size)), 431 | lashup_gm_events:ingest(Member) 432 | end, 433 | %% Find the component I'm part of 434 | ok. 435 | 436 | %%%=================================================================== 437 | %%% Metrics functions 438 | %%%=================================================================== 439 | 440 | -spec(init_metrics() -> ok). 441 | init_metrics() -> 442 | prometheus_gauge:new([ 443 | {registry, lashup}, 444 | {name, gm_members}, 445 | {help, "The size of global membership table."} 446 | ]), 447 | prometheus_gauge:new([ 448 | {registry, lashup}, 449 | {name, gm_subscriptions}, 450 | {help, "The number of global membership subscriptions."} 451 | ]), 452 | prometheus_gauge:new([ 453 | {registry, lashup}, 454 | {name, gm_subscribers}, 455 | {help, "The number of global membership subscribers."} 456 | ]), 457 | prometheus_counter:new([ 458 | {registry, lashup}, 459 | {name, gm_updates_total}, 460 | {help, "Total number of global membership table updates."} 461 | ]), 462 | prometheus_gauge:new([ 463 | {registry, lashup}, 464 | {name, gm_message_queue_length}, 465 | {help, "The length of global membership process message box."} 466 | ]). 467 | -------------------------------------------------------------------------------- /src/lashup_gm_events.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_gm_events). 2 | -author("sdhillon"). 3 | -behaviour(gen_event). 4 | 5 | -include("lashup.hrl"). 6 | 7 | %% API 8 | -export([ 9 | start_link/0, 10 | subscribe/0, 11 | remote_subscribe/1, 12 | ingest/1, 13 | ingest/2 14 | ]). 
15 | 
16 | %% gen_event callbacks
17 | -export([init/1, handle_event/2, handle_call/2,
18 |   handle_info/2, terminate/2, code_change/3]).
19 | 
20 | -record(state, {
21 |   reference = erlang:error() :: reference(),
22 |   pid = erlang:error() :: pid()
23 | }).
24 | -type state() :: #state{}.
25 | 
26 | 
27 | -spec(ingest(member2(), member2()) -> ok).
28 | ingest(OldMember, NewMember) ->
29 |   gen_event:notify(?MODULE, {ingest, OldMember, NewMember}),
30 |   ok.
31 | 
32 | -spec(ingest(member2()) -> ok).
33 | ingest(Member) ->
34 |   gen_event:notify(?MODULE, {ingest, Member}),
35 |   ok.
36 | 
37 | %% @doc
38 | %% Equivalent to {@link {@module}:remote_subscribe/1} with `Node' set to `node()'
39 | %% @end
40 | -spec(subscribe() -> {ok, reference()} | {'EXIT', term()} | {error, term()}).
41 | subscribe() ->
42 |   remote_subscribe(node()).
43 | 
44 | %% @doc
45 | %% Subscribes the calling process to membership events produced by Node.
46 | %%
47 | %% Processes then get messages like:
48 | %% `{{@module}, #{type => Type, ref => Reference, ...}}'
49 | %% @end
50 | -spec(remote_subscribe(Node :: node()) ->
51 |   {ok, reference()} | {'EXIT', term()} | {error, term()}).
52 | remote_subscribe(Node) ->
53 |   Reference = make_ref(),
54 |   State = #state{pid = self(), reference = Reference},
55 |   EventMgrRef = event_mgr_ref(Node),
56 |   case gen_event:add_sup_handler(EventMgrRef, ?MODULE, State) of
57 |     ok ->
58 |       {ok, Reference};
59 |     {'EXIT', Term} ->
60 |       {'EXIT', Term};
61 |     Error ->
62 |       {error, Error}
63 |   end.
64 | 
65 | event_mgr_ref(Node) when Node == node() ->
66 |   ?MODULE;
67 | event_mgr_ref(Node) ->
68 |   {?MODULE, Node}.
69 | 
70 | -spec(start_link() -> {ok, pid()} | {error, {already_started, pid()}}).
71 | start_link() ->
72 |   gen_event:start_link({local, ?MODULE}).
73 | 
74 | %%%===================================================================
75 | %%% gen_event callbacks
76 | %%%===================================================================
77 | 
78 | init(State) ->
79 |   {ok, State}.
80 | 
81 | handle_event({ingest, Member}, State) ->
82 |   handle_ingest(Member, State),
83 |   {ok, State};
84 | handle_event({ingest, OldMember, NewMember}, State) ->
85 |   handle_ingest(OldMember, NewMember, State),
86 |   {ok, State};
87 | handle_event(_Message, State) ->
88 |   {ok, State}.
89 | 
90 | handle_call(_Request, State) ->
91 |   Reply = ok,
92 |   {ok, Reply, State}.
93 | 
94 | handle_info(_Info, State) ->
95 |   {ok, State}.
96 | 
97 | terminate(_Arg, _State) ->
98 |   ok.
99 | 
100 | code_change(_OldVsn, State, _Extra) ->
101 |   {ok, State}.
102 | 
103 | %%%===================================================================
104 | %%% Internal functions
105 | %%%===================================================================
106 | 
107 | -spec(handle_ingest(member2(), state()) -> ok).
108 | handle_ingest(Member = #member2{}, _State = #state{reference = Reference, pid = Pid}) ->
109 |   Event = #{type => new_member, member => Member, ref => Reference},
110 |   Pid ! {?MODULE, Event},
111 |   ok.
112 | 
113 | -spec(handle_ingest(member2(), member2(), state()) -> ok).
114 | handle_ingest(OldMember = #member2{}, NewMember = #member2{},
115 |     _State = #state{reference = Reference, pid = Pid}) ->
116 |   Event = #{type => member_change, old_member => OldMember, member => NewMember, ref => Reference},
117 |   Pid ! {?MODULE, Event},
118 |   ok.
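%% A minimal subscriber sketch, assuming a running lashup node; the message
%% shapes come from handle_ingest/2,3 above, and log_member/1 is a
%% hypothetical callback standing in for application code:
%%
%%   {ok, Ref} = lashup_gm_events:subscribe(),
%%   receive
%%     {lashup_gm_events, #{type := new_member, ref := Ref, member := Member}} ->
%%       log_member(Member);
%%     {lashup_gm_events, #{type := member_change, ref := Ref, member := Member}} ->
%%       log_member(Member)
%%   end.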
119 | -------------------------------------------------------------------------------- /src/lashup_gm_fanout.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_gm_fanout). 2 | -author("sdhillon"). 3 | 4 | -include_lib("kernel/include/logger.hrl"). 5 | -include_lib("stdlib/include/ms_transform.hrl"). 6 | -include("lashup.hrl"). 7 | 8 | %% Lashup GM fanout is the global membership fanout system 9 | %% It is composed of three components 10 | %% 1. The lashup_gm process that's generating the events 11 | %% 2. Itself 12 | %% 3. The lashup_gm process that's receiving the events 13 | 14 | %% The actual fan-out is handled by lashup_gm 15 | 16 | %% API 17 | -export([ 18 | start_monitor/1, 19 | init/1 20 | ]). 21 | 22 | -record(state, { 23 | parent, 24 | receiver, 25 | receiver_mon, 26 | parent_mon, 27 | node 28 | }). 29 | 30 | 31 | start_monitor(Node) -> 32 | State = #state{receiver = self(), node = Node}, 33 | {Pid, Monitor} = spawn_monitor(?MODULE, init, [State]), 34 | {ok, {Pid, Monitor}}. 35 | 36 | init(State = #state{receiver = Receiver, node = Node}) when node() == node(Receiver) -> 37 | %% TODO: 38 | %% This might result in a reconnect, 39 | %% But alas, until we start changing cookies to prevent connections 40 | %% We can't have this work well 41 | ReceiverMon = monitor(process, Receiver), 42 | 43 | %% Investigate turning on dist_auto_connect -> once 44 | %% And then replacing this with explicit calls 45 | case gen_server:call({lashup_gm, Node}, {subscribe, self()}) of 46 | {ok, Parent} -> 47 | ParentMon = monitor(process, Parent), 48 | %% TODO: Implement dump_events in here 49 | %% Just access the ets table directly 50 | gen_server:cast({lashup_gm, Node}, {sync, self()}), 51 | State1 = State#state{receiver_mon = ReceiverMon, parent_mon = ParentMon, parent = Parent}, 52 | event_loop(State1); 53 | Else -> 54 | ?LOG_DEBUG("Lashup GM Fanout unable to subscribe: ~p", [Else]), 55 | exit({unable_to_subscribe, Else}) 56 | end. 57 | 58 | event_loop(State) -> 59 | State1 = 60 | receive 61 | #{type := aae_keys} = AAEKeys -> 62 | aae_keys(AAEKeys, State); 63 | {event, Event} -> 64 | forward_event(Event, State); 65 | {'DOWN', MonitorRef, _Type, _Object, Info} when MonitorRef == State#state.parent_mon -> 66 | exit({parent_down, Info}); 67 | {'DOWN', MonitorRef, _Type, _Object, Info} when MonitorRef == State#state.receiver_mon -> 68 | exit({receiver_down, Info}); 69 | Else -> 70 | exit({unknown_event, Else}) 71 | end, 72 | event_loop(State1). 73 | 74 | forward_event(Event, State) when is_binary(Event) -> 75 | forward_event(binary_to_term(Event), State); 76 | forward_event(Event, State = #state{receiver = Receiver}) -> 77 | % I'm on the same node as the receiver 78 | gen_server:cast(Receiver, #{message => remote_event, from => State#state.node, event => Event}), 79 | State. 80 | 81 | %% AAE Keys is the initial sync process 82 | %% We have to send out full list of [{Node, VClock}] list to this pid 83 | %% In return it filters the one that it has newer vclocks for 84 | %% And sends them back 85 | %% TODO: Monitor / link against the sync worker 86 | aae_keys(#{pid := Pid}, State) -> 87 | NodeClocks = node_clocks(), 88 | [Pid ! #{type => node_clock, node_clock => NodeClock} || NodeClock <- NodeClocks], 89 | Pid ! #{type => node_clock_complete}, 90 | State. 91 | 92 | -spec(node_clocks() -> [{node(), riak_dt_vclock:vclock()}]). 
93 | node_clocks() ->
94 |   MatchSpec = ets:fun2ms(
95 |     fun(Member = #member2{value = Value}) ->
96 |       {Member#member2.node, Value}
97 |     end
98 |   ),
99 |   Result = ets:select(members, MatchSpec),
100 |   NodeClocks = [{NodeName, {Epoch, Clock}} || {NodeName, #{epoch := Epoch, clock := Clock}} <- Result],
101 |   orddict:from_list(NodeClocks).
102 | 
--------------------------------------------------------------------------------
/src/lashup_gm_mc.erl:
--------------------------------------------------------------------------------
1 | -module(lashup_gm_mc).
2 | -author("sdhillon").
3 | -behaviour(gen_server).
4 | 
5 | -include_lib("kernel/include/logger.hrl").
6 | 
7 | %% API
8 | -export([
9 |   start_link/0,
10 |   multicast/2,
11 |   init_metrics/0
12 | ]).
13 | 
14 | %% Packet API
15 | -export([
16 |   topic/1,
17 |   payload/1,
18 |   origin/1
19 | ]).
20 | 
21 | %% gen_server callbacks
22 | -export([init/1, handle_call/3,
23 |   handle_cast/2, handle_info/2]).
24 | 
25 | -export_type([topic/0, payload/0, multicast_packet/0]).
26 | 
27 | -record(state, {
28 | }).
29 | 
30 | -type topic() :: atom().
31 | -type payload() :: term().
32 | -type multicast_packet() :: map().
33 | 
34 | -define(DEFAULT_TTL, 20).
35 | 
36 | -spec(multicast(Topic :: topic(), Payload :: payload()) -> ok).
37 | multicast(Topic, Payload) ->
38 |   gen_server:cast(?MODULE, {do_multicast, Topic, Payload}).
39 | 
40 | -spec(start_link() ->
41 |   {ok, Pid :: pid()} | ignore | {error, Reason :: term()}).
42 | start_link() ->
43 |   gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
44 | 
45 | %%%===================================================================
46 | %%% Packet functions
47 | %%%===================================================================
48 | 
49 | -spec(topic(Packet :: multicast_packet()) -> topic()).
50 | topic(_Packet = #{topic := Topic}) ->
51 |   Topic.
52 | 
53 | -spec(payload(Packet :: multicast_packet()) -> payload()).
54 | payload(#{payload := Payload}) ->
55 |   Payload;
56 | payload(#{compressed_payload := CompressedPayload}) ->
57 |   binary_to_term(CompressedPayload).
58 | 
59 | -spec(origin(Packet :: multicast_packet()) -> node()).
60 | origin(Packet) ->
61 |   maps:get(origin, Packet).
62 | 
63 | -spec(new_multicast_packet(topic(), payload()) -> multicast_packet()).
64 | new_multicast_packet(Topic, Payload) ->
65 |   #{
66 |     type => multicast_packet, subtype => multicast,
67 |     origin => node(), topic => Topic, ttl => ?DEFAULT_TTL,
68 |     compressed_payload => term_to_binary(Payload, [compressed]),
69 |     options => []
70 |   }.
71 | 
72 | %%%===================================================================
73 | %%% gen_server callbacks
74 | %%%===================================================================
75 | 
76 | init([]) ->
77 |   {ok, #state{}}.
78 | 
79 | handle_call(_Request, _From, State) ->
80 |   {reply, ok, State}.
81 | 
82 | handle_cast({do_multicast, Topic, Payload}, State) ->
83 |   handle_do_original_multicast(Topic, Payload),
84 |   {noreply, State, lashup_utils:hibernate()};
85 | handle_cast(_Request, State) ->
86 |   {noreply, State}.
87 | 
88 | handle_info(#{type := multicast_packet} = MulticastPacket, State) ->
89 |   handle_multicast_packet(MulticastPacket),
90 |   {noreply, State, lashup_utils:hibernate()};
91 | handle_info(_Info, State) ->
92 |   {noreply, State}.
93 | 
94 | %%%===================================================================
95 | %%% Internal functions
96 | %%%===================================================================
97 | 
98 | -spec(handle_do_original_multicast(topic(), payload()) -> ok).
99 | handle_do_original_multicast(Topic, Payload) ->
100 |   Begin = erlang:monotonic_time(),
101 |   try
102 |     original_multicast(Topic, Payload)
103 |   after
104 |     prometheus_summary:observe(
105 |       lashup, mc_multicast_seconds, [],
106 |       erlang:monotonic_time() - Begin)
107 |   end.
108 | 
109 | %% TODO:
110 | %% -Buffer messages if my neighbors / active view are flapping too much
111 | %% -Experiment with adding the tree that I build to the messages.
112 | %%  Although this will make messages larger, it might be more efficient (I also don't know how much larger)
113 | %%  The tree representation could also be significantly reduced in size rather than the map
114 | %%  Which has a bunch of extraneous metadata
115 | 
116 | -spec(original_multicast(topic(), payload()) -> ok).
117 | original_multicast(Topic, Payload) ->
118 |   Packet = new_multicast_packet(Topic, Payload),
119 |   ActiveView = lashup_hyparview_membership:get_active_view(),
120 |   Fanout = lashup_config:max_mc_replication(),
121 |   ActiveViewTruncated = determine_fakeroots(ActiveView, Fanout - 1),
122 |   Nodes = [node() | ActiveViewTruncated],
123 |   %% TODO: Try to make the trees intersect as little as possible
124 |   lists:foreach(fun (Node) -> do_original_cast(Node, Packet) end, Nodes).
125 | 
126 | %% @doc
127 | %% determine the fakeroots
128 | %% If the ActiveView is bigger than the fanout, we're going to dump the first node in the active view
129 | %% (lexically sorted).
130 | %% The reason is that with the BSP algorithm + Hyparview, a multicast treating self as the fakeroot
131 | %% is likely to reach the entire graph already.
132 | %% Therefore, we want to spread traffic to the upper nodes.
133 | %% @end
134 | -spec(determine_fakeroots(ActiveView :: [node()], Fanout :: pos_integer()) -> [node()]).
135 | determine_fakeroots([], _Fanout) ->
136 |   [];
137 | determine_fakeroots(_ActiveView, 0) ->
138 |   [];
139 | determine_fakeroots(ActiveView, Fanout) when length(ActiveView) > Fanout ->
140 |   % We can do this safely, without losing fidelity, because we know the active view
141 |   % has at least one more member than the fanout
142 |   [_|ActiveView1] = ActiveView,
143 |   Seed = lashup_utils:seed(),
144 |   ActiveView1Shuffled = lashup_utils:shuffle_list(ActiveView1, Seed),
145 |   lists:sublist(ActiveView1Shuffled, Fanout);
146 | determine_fakeroots(ActiveView, _Fanout) ->
147 |   ActiveView.
148 | 
149 | -spec(do_original_cast(node(), multicast_packet()) -> ok).
150 | do_original_cast(Node, Packet) ->
151 |   % At the original cast, we replicate the packet the size of the active view
152 |   % This is done by reassigning the root for the purposes of the LSA calculation
153 |   % to a surrogate root, (fakeroot)
154 |   % We may want
155 |   Packet0 = Packet#{fakeroot => Node},
156 |   Packet1 =
157 |     case lashup_gm_route:get_tree(Node) of
158 |       {tree, Tree} ->
159 |         Packet0#{tree => Tree};
160 |       _ ->
161 |         Packet0
162 |     end,
163 |   bsend(Packet1, [Node]).
164 | 
165 | -spec(handle_multicast_packet(multicast_packet()) -> ok).
166 | handle_multicast_packet(MulticastPacket) ->
167 |   Size = erlang:external_size(MulticastPacket),
168 |   prometheus_counter:inc(lashup, mc_incoming_packets_total, [], 1),
169 |   prometheus_counter:inc(lashup, mc_incoming_bytes_total, [], Size),
170 |   % 1. Process the packet, and forward it on
171 |   maybe_forward_packet(MulticastPacket),
172 |   % 2. Fan it out to lashup_gm_mc_events
173 |   maybe_ingest(MulticastPacket).
174 | 
175 | -spec(maybe_ingest(multicast_packet()) -> ok).
176 | maybe_ingest(#{origin := Origin}) when Origin == node() ->
177 |   ok;
178 | maybe_ingest(MulticastPacket) ->
179 |   lashup_gm_mc_events:ingest(MulticastPacket).
180 | 
181 | -spec(maybe_forward_packet(multicast_packet()) -> ok).
182 | maybe_forward_packet(_MulticastPacket = #{ttl := 0}) ->
183 |   ?LOG_WARNING("TTL Exceeded on Multicast Packet"),
184 |   ok;
185 | maybe_forward_packet(MulticastPacket0 = #{tree := Tree, ttl := TTL}) ->
186 |   MulticastPacket1 = MulticastPacket0#{ttl := TTL - 1},
187 |   forward_packet(MulticastPacket1, Tree);
188 | maybe_forward_packet(MulticastPacket0 = #{fakeroot := FakeRoot, ttl := TTL, origin := Origin}) ->
189 |   case lashup_gm_route:get_tree(FakeRoot) of
190 |     {tree, Tree} ->
191 |       MulticastPacket1 = MulticastPacket0#{ttl := TTL - 1},
192 |       forward_packet(MulticastPacket1, Tree);
193 |     false ->
194 |       ?LOG_WARNING("Dropping multicast packet due to unknown root: ~p", [Origin]),
195 |       ok
196 |   end.
197 | 
198 | -spec(forward_packet(multicast_packet(), lashup_gm_route:tree()) -> ok).
199 | forward_packet(MulticastPacket, Tree) ->
200 |   %% TODO: Only abcast to connected nodes
201 |   Children = lashup_gm_route:children(node(), Tree),
202 |   bsend(MulticastPacket, Children).
203 | 
204 | bsend(MulticastPacket, Children) ->
205 |   lists:foreach(fun (Child) ->
206 |     case erlang:send({?MODULE, Child}, MulticastPacket, [noconnect]) of
207 |       noconnect ->
208 |         ?LOG_WARNING("Dropping packet due to stale tree");
209 |       _Result ->
210 |         prometheus_counter:inc(
211 |           lashup, mc_outgoing_bytes_total, [],
212 |           erlang:external_size(MulticastPacket))
213 |     end
214 |   end, Children).
215 | 
216 | %%%===================================================================
217 | %%% Metrics functions
218 | %%%===================================================================
219 | 
220 | -spec(init_metrics() -> ok).
221 | init_metrics() ->
222 |   prometheus_summary:new([
223 |     {registry, lashup},
224 |     {name, mc_multicast_seconds},
225 |     {duration_unit, seconds},
226 |     {help, "The time spent sending out multicast packets."}
227 |   ]),
228 |   prometheus_counter:new([
229 |     {registry, lashup},
230 |     {name, mc_outgoing_bytes_total},
231 |     {help, "Total number of multicast bytes sent."}
232 |   ]),
233 |   prometheus_counter:new([
234 |     {registry, lashup},
235 |     {name, mc_incoming_bytes_total},
236 |     {help, "Total number of multicast bytes received."}
237 |   ]),
238 |   prometheus_counter:new([
239 |     {registry, lashup},
240 |     {name, mc_incoming_packets_total},
241 |     {help, "Total number of multicast packets received."}
242 |   ]).
--------------------------------------------------------------------------------
/src/lashup_gm_mc_events.erl:
--------------------------------------------------------------------------------
1 | -module(lashup_gm_mc_events).
2 | -author("sdhillon").
3 | -behaviour(gen_event).
4 | 
5 | %% API
6 | -export([
7 |   start_link/0,
8 |   subscribe/1,
9 |   remote_subscribe/2,
10 |   ingest/1
11 | ]).
12 | 
13 | %% gen_event callbacks
14 | -export([init/1, handle_event/2, handle_call/2,
15 |   handle_info/2, terminate/2, code_change/3]).
16 | 
17 | -record(state, {
18 |   reference = erlang:error() :: reference(),
19 |   topics_set = erlang:error() :: ordsets:ordset(lashup_gm_mc:topic()),
20 |   pid = erlang:error() :: pid()
21 | }).
22 | -type state() :: #state{}.
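%% A usage sketch for the multicast path end to end, assuming two connected
%% lashup nodes and a hypothetical topic `my_topic'. Note that maybe_ingest/1
%% in lashup_gm_mc drops packets on their origin node, so the subscriber must
%% live on a different node than the sender:
%%
%%   %% on the receiving node:
%%   {ok, Ref} = lashup_gm_mc_events:subscribe([my_topic]),
%%   %% on the sending node:
%%   ok = lashup_gm_mc:multicast(my_topic, {hello, node()}),
%%   %% back on the receiving node (shape from handle_ingest2/2 below):
%%   receive
%%     {lashup_gm_mc_event, #{ref := Ref, origin := Origin, payload := {hello, Origin}}} ->
%%       ok
%%   end.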
23 | 24 | %%%=================================================================== 25 | %%% gen_event callbacks 26 | %%%=================================================================== 27 | 28 | -spec(ingest(lashup_gm_mc:multicast_packet()) -> ok). 29 | ingest(MulticastPacket) -> 30 | gen_event:notify(?MODULE, {ingest, MulticastPacket}). 31 | 32 | %% @doc 33 | %% Equivalent to {@link {@module}:remote_subscribe/2} with `Node' set to `node()' 34 | %% @end 35 | -spec(subscribe([lashup_gm_mc:topic()]) -> {ok, reference()} | {'EXIT', term()} | {error, term()}). 36 | subscribe(Topics) -> 37 | remote_subscribe(node(), Topics). 38 | 39 | %% @doc 40 | %% Subscribes calling process to zero or more topics produced by Node. 41 | %% 42 | %% Processes then get messages like: 43 | %% `{{@module}, #{ref => Reference, payload => Payload}}' 44 | %% @end 45 | -spec(remote_subscribe(Node :: node(), [lashup_gm_mc:topic()]) -> 46 | {ok, reference()} | {'EXIT', term()} | {error, term()}). 47 | remote_subscribe(Node, Topics) -> 48 | TopicsSet = ordsets:from_list(Topics), 49 | Reference = make_ref(), 50 | State = #state{pid = self(), reference = Reference, topics_set = TopicsSet}, 51 | EventMgrRef = event_mgr_ref(Node), 52 | case gen_event:add_sup_handler(EventMgrRef, ?MODULE, State) of 53 | ok -> 54 | {ok, Reference}; 55 | {'EXIT', Term} -> 56 | {'EXIT', Term}; 57 | Error -> 58 | {error, Error} 59 | end. 60 | 61 | event_mgr_ref(Node) when Node == node() -> 62 | ?MODULE; 63 | event_mgr_ref(Node) -> 64 | {?MODULE, Node}. 65 | 66 | -spec(start_link() -> {ok, pid()} | {error, {already_started, pid()}}). 67 | start_link() -> 68 | gen_event:start_link({local, ?MODULE}). 69 | 70 | %%%=================================================================== 71 | %%% gen_event callbacks 72 | %%%=================================================================== 73 | 74 | init(State) -> 75 | {ok, State}. 76 | 77 | handle_event({ingest, Message}, State) -> 78 | handle_ingest(Message, State), 79 | {ok, State}; 80 | handle_event(_Message, State) -> 81 | {ok, State}. 82 | 83 | handle_call(_Request, State) -> 84 | Reply = ok, 85 | {ok, Reply, State}. 86 | 87 | handle_info(_Info, State) -> 88 | {ok, State}. 89 | 90 | terminate(_Arg, _State) -> 91 | ok. 92 | 93 | code_change(_OldVsn, State, _Extra) -> 94 | {ok, State}. 95 | 96 | %%%=================================================================== 97 | %%% Internal functions 98 | %%%=================================================================== 99 | 100 | -spec(handle_ingest(lashup_gm_mc:multicast_packet(), state()) -> ok). 101 | handle_ingest(Message, State = #state{topics_set = TopicSet}) -> 102 | Topic = lashup_gm_mc:topic(Message), 103 | case ordsets:is_element(Topic, TopicSet) of 104 | true -> 105 | handle_ingest2(Message, State); 106 | false -> 107 | ok 108 | end. 109 | 110 | -spec(handle_ingest2(lashup_gm_mc:multicast_packet(), state()) -> ok). 111 | handle_ingest2(Message, _State = #state{reference = Reference, pid = Pid}) -> 112 | Payload = lashup_gm_mc:payload(Message), 113 | Origin = lashup_gm_mc:origin(Message), 114 | Event = #{payload => Payload, ref => Reference, origin => Origin}, 115 | Pid ! {lashup_gm_mc_event, Event}, 116 | ok. 117 | -------------------------------------------------------------------------------- /src/lashup_gm_mc_sup.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_gm_mc_sup). 2 | -behaviour(supervisor). 3 | 4 | -export([start_link/0]). 5 | -export([init/1]). 
6 | 
7 | -define(CHILD(I, Type), {I, {I, start_link, []}, permanent, 5000, Type, [I]}).
8 | 
9 | start_link() ->
10 |   supervisor:start_link({local, ?MODULE}, ?MODULE, []).
11 | 
12 | init([]) ->
13 |   {ok, {#{}, [
14 |     ?CHILD(lashup_gm_mc, worker),
15 |     ?CHILD(lashup_gm_mc_events, worker)
16 |   ]}}.
17 | 
18 | 
--------------------------------------------------------------------------------
/src/lashup_gm_probe.erl:
--------------------------------------------------------------------------------
1 | -module(lashup_gm_probe).
2 | -author("sdhillon").
3 | 
4 | %%--------------------------------------------------------------------
5 | %% @doc
6 | %% Probe loop.
7 | %% It goes node by node through the global membership table
8 | %% and checks whether we have a path to each of them.
9 | %% If it doesn't find a path, it checks whether we have a path to the next one,
10 | %% up until it hits a node greater than the last node it probed.
11 | 
12 | %% This is really only useful for extended partitions,
13 | %% where either side has been partitioned from the other for an extended period of time.
14 | %%
15 | %% API
16 | %% @end
17 | %%--------------------------------------------------------------------
18 | 
19 | -behaviour(gen_server).
20 | 
21 | -include_lib("kernel/include/logger.hrl").
22 | 
23 | -export([start_link/0]).
24 | 
25 | %% gen_server callbacks
26 | -export([init/1, handle_call/3, handle_cast/2,
27 |   handle_info/2, terminate/2, code_change/3]).
28 | 
29 | -record(state, {}).
30 | 
31 | -type state() :: #state{}.
32 | 
33 | -spec(start_link() ->
34 |   {ok, Pid :: pid()} | ignore | {error, Reason :: term()}).
35 | start_link() ->
36 |   gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
37 | 
38 | %%%===================================================================
39 | %%% gen_server callbacks
40 | %%%===================================================================
41 | 
42 | init([]) ->
43 |   rand:seed(exsplus),
44 |   State = #state{},
45 |   schedule_next_probe(),
46 |   {ok, State}.
47 | 
48 | handle_call(_Request, _From, State) ->
49 |   {reply, ok, State}.
50 | 
51 | handle_cast(_Request, State) ->
52 |   {noreply, State}.
53 | 
54 | handle_info(do_probe, State) ->
55 |   maybe_do_probe(State),
56 |   {noreply, State, hibernate};
57 | handle_info(_Info, State) ->
58 |   {noreply, State}.
59 | 
60 | terminate(_Reason, _State) ->
61 |   ok.
62 | 
63 | code_change(_OldVsn, State, _Extra) ->
64 |   {ok, State}.
65 | 
66 | %%%===================================================================
67 | %%% Internal functions
68 | %%%===================================================================
69 | 
70 | -spec(schedule_next_probe() -> ok).
71 | schedule_next_probe() ->
72 |   ProbeInterval = lashup_config:min_departition_probe_interval(),
73 |   schedule_next_probe(ProbeInterval).
74 | 
75 | %% We should make this configurable. It determines when to make the first ping.
76 | -spec(schedule_next_probe(Time :: non_neg_integer()) -> ok).
77 | schedule_next_probe(Time) when is_integer(Time) ->
78 |   RandFloat = rand:uniform(),
79 |   Multiplier = 1 + round(RandFloat),
80 |   Delay = Multiplier * Time,
81 |   timer:send_after(Delay, do_probe),
82 |   ok.
83 | 
84 | -spec(determine_next_probe(ReachableNodes :: [node()], UnreachableNode :: [node()]) -> non_neg_integer()).
85 | determine_next_probe(ReachableNodes, UnreachableNode) ->
86 |   %% We want to ensure that one component pings the entire other component every 10 minutes?
87 | %% But, we don't want to do more than 5 pings / sec as an individual node 88 | %% That number is somewhat arbitrary, but let's start there 89 | Ratio = length(ReachableNodes) / (length(UnreachableNode) + 1), 90 | %% Ratio is how many nodes I must ping over a probe period to fulfill the requirement set forth 91 | %% We divide by two, because schedule_next_probe calculates from 1x the time up to 2x the time 92 | FullProbePeriod = lashup_config:full_probe_period() / 2, 93 | ProbeInterval = FullProbePeriod / Ratio, 94 | MinProbeInterval = lashup_config:min_departition_probe_interval(), 95 | Interval = max(ProbeInterval, MinProbeInterval), 96 | Interval1 = min(Interval, FullProbePeriod), 97 | trunc(Interval1). 98 | 99 | -spec(maybe_do_probe(state()) -> ok). 100 | maybe_do_probe(_State) -> 101 | case lashup_gm_route:get_tree(node(), infinity) of 102 | {tree, Tree} -> 103 | do_probe(Tree); 104 | false -> 105 | ?LOG_WARNING("Lashup GM Probe unable to get LSA Tree"), 106 | schedule_next_probe(), 107 | ok 108 | end. 109 | 110 | -spec(do_probe(lashup_gm_route:tree()) -> ok). 111 | do_probe(Tree) -> 112 | case unreachable_nodes(Tree) of 113 | [] -> 114 | schedule_next_probe(), 115 | ok; 116 | UnreachableNodes -> 117 | probe_oneof(UnreachableNodes), 118 | ReachableNodes = lashup_gm_route:reachable_nodes(Tree), 119 | ProbeTime = determine_next_probe(ReachableNodes, UnreachableNodes), 120 | schedule_next_probe(ProbeTime), 121 | ok 122 | end. 123 | 124 | -spec(unreachable_nodes(lashup_gm_route:tree()) -> [node()]). 125 | unreachable_nodes(Tree) -> 126 | UnreachableNodes = lashup_gm_route:unreachable_nodes(Tree), 127 | UnreachableContactNodes = 128 | lists:filter(fun (Node) -> 129 | lashup_gm_route:distance(Node, Tree) =:= infinity 130 | end, lashup_config:contact_nodes()), 131 | lists:usort(UnreachableNodes ++ UnreachableContactNodes). 132 | 133 | -spec(probe_oneof(UnreachableNodes :: [node()]) -> ok). 134 | probe_oneof(UnreachableNodes) -> 135 | Idx = rand:uniform(length(UnreachableNodes)), 136 | OtherNode = lists:nth(Idx, UnreachableNodes), 137 | lashup_hyparview_membership:recommend_neighbor(OtherNode), 138 | ok. 139 | -------------------------------------------------------------------------------- /src/lashup_gm_route.erl: -------------------------------------------------------------------------------- 1 | %%% @doc 2 | %%% An LSA routing database (RIB) routing information base 3 | %%% It has a cache that makes it suitable for a FIB 4 | %%% @end 5 | %%% 6 | %%% TODO: 7 | %%% -Determine whether it makes more sense to stash the graph in a sofs 8 | %%% -Determine whether it makes sense to use sofs rather than maps to store the BFS trees 9 | 10 | -module(lashup_gm_route). 11 | -author("sdhillon"). 12 | 13 | -compile(inline). 14 | 15 | -include_lib("kernel/include/logger.hrl"). 16 | -include_lib("stdlib/include/ms_transform.hrl"). 17 | 18 | -behaviour(gen_statem). 19 | 20 | %% API 21 | -export([ 22 | start_link/0, 23 | update_node/2, 24 | delete_node/1, 25 | reachable/1, 26 | get_tree/1, 27 | get_tree/2, 28 | distance/2, 29 | reachable_nodes/1, 30 | unreachable_nodes/1, 31 | path_to/1, 32 | path_to/2, 33 | reverse_children/2, 34 | children/2, 35 | prune_tree/2, 36 | flush_events_helper/0, 37 | init_metrics/0 38 | ]). 39 | 40 | 41 | -ifdef(TEST). 42 | 43 | -include_lib("proper/include/proper.hrl"). 44 | -include_lib("eunit/include/eunit.hrl"). 45 | 46 | -behaviour(proper_statem). 47 | 48 | -export([proper/0]). 
49 | -export([
50 |   initial_state/0,
51 |   stop/0,
52 |   command/1,
53 |   precondition/2,
54 |   postcondition/3,
55 |   next_state/3,
56 |   verify_routes/2
57 | ]).
58 | 
59 | -endif.
60 | 
61 | %% gen_statem callbacks
62 | -export([init/1, terminate/3, code_change/4,
63 |   callback_mode/0, handle_event/4]).
64 | 
65 | -record(state, {
66 |   events = 0 :: non_neg_integer(),
67 |   cache = #{} :: #{node() => tree()}
68 | }).
69 | -type state() :: #state{}.
70 | 
71 | -type distance() :: non_neg_integer() | infinity.
72 | -type tree() :: map().
73 | -export_type([tree/0]).
74 | 
75 | %% Egress paths = active view
76 | -record(vertex, {
77 |   node = erlang:error() :: node(),
78 |   dsts = ordsets:new() :: ordsets:ordset(node())
79 | }).
80 | 
81 | -record(tree_entry, {
82 |   parent = undefined :: node() | undefined,
83 |   distance = infinity :: non_neg_integer() | infinity,
84 |   children = ordsets:new() :: ordsets:ordset(node())
85 | }).
86 | 
87 | %%%===================================================================
88 | %%% API
89 | %%%===================================================================
90 | 
91 | %% TODO:
92 | %% -Add reachable by node
93 | %% -Add reachable by IP
94 | 
95 | 
96 | %% @doc
97 | %% Persist or update a node
98 | %% @end
99 | -spec(update_node(Node :: node(), Dsts :: [node()]) -> ok).
100 | update_node(Node, Dsts) ->
101 |   gen_statem:cast(?MODULE, {update_node, Node, Dsts}).
102 | 
103 | -spec(delete_node(Node :: node()) -> ok).
104 | delete_node(Node) ->
105 |   gen_statem:cast(?MODULE, {delete_node, Node}).
106 | 
107 | %% @doc
108 | %% Checks the reachability from this node to node Node
109 | %% @end
110 | -spec(reachable(Node :: node()) -> true | false).
111 | reachable(Node) when Node == node() -> true;
112 | reachable(Node) ->
113 |   case get_tree(node()) of
114 |     {tree, Tree} ->
115 |       distance(Node, Tree) =/= infinity;
116 |     _ ->
117 |       false
118 |   end.
119 | 
120 | 
121 | -spec(path_to(Node :: node()) -> [node()] | false).
122 | path_to(Node) ->
123 |   case get_tree(node()) of
124 |     {tree, Tree} ->
125 |       path_to(Node, Tree);
126 |     _ ->
127 |       false
128 |   end.
129 | 
130 | 
131 | -spec(path_to(Node :: node(), Tree :: tree()) -> false | [node()]).
132 | path_to(Node, Tree) ->
133 |   case distance(Node, Tree) of
134 |     infinity ->
135 |       false;
136 |     _ ->
137 |       path_to(Node, Tree, [Node])
138 |   end.
139 | 
140 | -spec(path_to(Node :: node(), Tree :: tree(), Path :: [node()]) -> [node()]).
141 | path_to(Node, Tree, Acc) ->
142 |   Entry = maps:get(Node, Tree),
143 |   case Entry of
144 |     #tree_entry{distance = 0} ->
145 |       Acc;
146 |     #tree_entry{parent = Parent} ->
147 |       path_to(Parent, Tree, [Parent|Acc])
148 |   end.
149 | 
150 | -spec(get_tree(Node :: node()) -> {tree, tree()} | false).
151 | get_tree(Node) ->
152 |   get_tree(Node, 5000).
153 | 
154 | -spec(get_tree(Node :: node(), Timeout :: non_neg_integer() | infinity) -> {tree, tree()} | false).
155 | get_tree(Node, Timeout) ->
156 |   gen_statem:call(?MODULE, {get_tree, Node}, Timeout).
157 | 
158 | -spec(distance(Node :: node(), Tree :: tree()) -> non_neg_integer() | infinity).
159 | distance(Node, Tree) ->
160 |   case Tree of
161 |     #{Node := Entry} ->
162 |       Entry#tree_entry.distance;
163 |     _ ->
164 |       infinity
165 |   end.
166 | 
167 | -spec(reachable_nodes(Tree :: tree()) -> [node()]).
168 | reachable_nodes(Tree) ->
169 |   TreeEntries =
170 |     maps:filter(
171 |       fun(_Key, _TreeEntry = #tree_entry{distance = Distance}) ->
172 |         Distance =/= infinity
173 |       end,
174 |       Tree),
175 |   maps:keys(TreeEntries).
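%% A query sketch for the API above, assuming the server is running and a
%% hypothetical topology a@host -> b@host -> c@host fed in via update_node/2
%% (casts from one caller are processed in order, so the call below sees them):
%%
%%   ok = lashup_gm_route:update_node(a@host, [b@host]),
%%   ok = lashup_gm_route:update_node(b@host, [c@host]),
%%   ok = lashup_gm_route:update_node(c@host, []),
%%   {tree, Tree} = lashup_gm_route:get_tree(a@host),
%%   1 = lashup_gm_route:distance(b@host, Tree),
%%   [a@host, b@host, c@host] = lashup_gm_route:path_to(c@host, Tree).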
176 | 177 | -spec(unreachable_nodes(Tree :: tree()) -> [node()]). 178 | unreachable_nodes(Tree) -> 179 | TreeEntries = 180 | maps:filter( 181 | fun(_Key, _TreeEntry = #tree_entry{distance = Distance}) -> 182 | Distance == infinity 183 | end, 184 | Tree), 185 | maps:keys(TreeEntries). 186 | 187 | -spec(reverse_children(Parent :: node(), Tree :: tree()) -> [node()]). 188 | reverse_children(Parent, Tree) -> 189 | TreeEntries = 190 | maps:filter( 191 | fun(_Node, _TreeEntry = #tree_entry{parent = P, distance = Distance}) -> 192 | P == Parent andalso Distance =/= 0 193 | end, 194 | Tree), 195 | maps:keys(TreeEntries). 196 | 197 | -spec(children(Parent :: node(), Tree :: tree()) -> [node()]). 198 | children(Parent, Tree) -> 199 | #{Parent := #tree_entry{children = Children}} = Tree, 200 | Children. 201 | 202 | %% @doc 203 | %% Ensures there is a path between the root and the node 204 | %% @end 205 | -spec(prune_tree(Node :: node(), Tree :: tree()) -> tree()). 206 | prune_tree(Node, Tree) -> 207 | prune_tree(Node, Tree, #{}). 208 | -spec(prune_tree(Node :: node(), Tree :: tree(), PrunedTree :: tree()) -> tree()). 209 | prune_tree(Node, Tree, PrunedTree) -> 210 | case maps:get(Node, Tree, unknown) of 211 | unknown -> 212 | %% We did the best we could 213 | PrunedTree; 214 | TreeEntry = #tree_entry{distance = 0} -> 215 | PrunedTree#{Node => TreeEntry}; 216 | TreeEntry = #tree_entry{parent = Parent} -> 217 | PrunedTree1 = PrunedTree#{Node => TreeEntry}, 218 | prune_tree(Parent, Tree, PrunedTree1) 219 | end. 220 | 221 | flush_events_helper() -> 222 | gen_statem:cast(?MODULE, flush_events_helper). 223 | 224 | -spec(start_link() -> 225 | {ok, Pid :: pid()} | ignore | {error, Reason :: term()}). 226 | start_link() -> 227 | gen_statem:start_link({local, ?MODULE}, ?MODULE, [], []). 228 | 229 | %%%=================================================================== 230 | %%% gen_statem callbacks 231 | %%%=================================================================== 232 | 233 | init([]) -> 234 | Interval = trunc(200 + rand:uniform(50)), 235 | timer:send_interval(Interval, decrement_busy), 236 | vertices = ets:new(vertices, [set, named_table, {keypos, #vertex.node}]), 237 | {ok, idle, #state{}}. 238 | 239 | callback_mode() -> 240 | handle_event_function. 
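%% A worked sketch of prune_tree/2 above, under the same hypothetical
%% a@host -> b@host -> c@host topology: pruning to c@host keeps only the
%% chain from that node back to the root and drops every other entry.
%%
%%   {tree, Tree} = lashup_gm_route:get_tree(a@host),
%%   Pruned = lashup_gm_route:prune_tree(c@host, Tree),
%%   [a@host, b@host, c@host] = lists:sort(maps:keys(Pruned)).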
241 | 242 | %% Get tree logic 243 | handle_event({call, From}, {get_tree, Node}, cached, StateData0) when Node == node() -> 244 | {StateData1, Reply} = handle_get_tree(Node, StateData0), 245 | {keep_state, StateData1, {reply, From, Reply}}; 246 | handle_event({call, From}, {get_tree, Node}, _StateName, StateData0 = #state{events = EC}) when Node == node() -> 247 | {StateData1, Reply} = handle_get_tree(Node, StateData0), 248 | {next_state, cached, StateData1#state{events = EC + 1}, {reply, From, Reply}}; 249 | handle_event({call, From}, {get_tree, Node}, _StateName, StateData0 = #state{events = EC}) -> 250 | {StateData1, Reply} = handle_get_tree(Node, StateData0), 251 | {keep_state, StateData1#state{events = EC + 1}, {reply, From, Reply}}; 252 | 253 | %% Rewrite all other calls into synchronous casts (for testing) 254 | handle_event({call, From}, EventContent, StateName, StateData0) -> 255 | Ret = handle_event(cast, EventContent, StateName, StateData0), 256 | gen_statem:reply(From, ok), 257 | Ret; 258 | 259 | %% Rewrite flush_events message into an advertise message 260 | handle_event(cast, flush_events_helper, _StateName, _StateData0) -> 261 | {keep_state_and_data, [{next_event, internal, maybe_advertise_state}]}; 262 | 263 | %% Advertisements for cached state are free 264 | handle_event(internal, maybe_advertise_state, cached, StateData0) -> 265 | {StateData1, {tree, Tree}} = handle_get_tree(node(), StateData0), 266 | lashup_gm_route_events:ingest(Tree), 267 | {keep_state, StateData1}; 268 | handle_event(internal, maybe_advertise_state, _StateName, StateData = #state{events = EC}) when EC > 5 -> 269 | %% Ignore that the tree is dirty 270 | {next_state, busy, StateData}; 271 | handle_event(internal, maybe_advertise_state, _StateName, StateData0 = #state{events = EC}) -> 272 | {StateData1, {tree, Tree}} = handle_get_tree(node(), StateData0), 273 | prometheus_gauge:set( 274 | lashup, gm_unreachable_nodes, [], 275 | length(unreachable_nodes(Tree))), 276 | lashup_gm_route_events:ingest(Tree), 277 | {next_state, cached, StateData1#state{events = EC + 1}}; 278 | 279 | handle_event(info, decrement_busy, busy, StateData0 = #state{events = EC}) -> 280 | {keep_state, StateData0#state{events = max(0, EC - 1)}, [{next_event, internal, maybe_advertise_state}]}; 281 | handle_event(info, decrement_busy, _, StateData0 = #state{events = EC}) -> 282 | {keep_state, StateData0#state{events = max(0, EC - 1)}}; 283 | 284 | handle_event(cast, {update_node, Node, Dsts}, _StateName, StateData0) -> 285 | StateData1 = handle_update_node(Node, Dsts, StateData0), 286 | {next_state, dirty_tree, StateData1, [{next_event, internal, maybe_advertise_state}]}; 287 | handle_event(cast, {delete_node, Node}, _StateName, StateData0) -> 288 | StateData1 = handle_delete_node(Node, StateData0), 289 | {next_state, dirty_tree, StateData1, [{next_event, internal, maybe_advertise_state}]}. 290 | 291 | terminate(Reason, State, Data = #state{}) -> 292 | ?LOG_WARNING("Terminating in State: ~p, due to reason: ~p, with data: ~p", [State, Reason, Data]), 293 | ok. 294 | 295 | code_change(_OldVsn, OldState, OldData, _Extra) -> 296 | {ok, OldState, OldData}. 
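%% A rough trace of the clauses above (a reading of the code, not documented
%% behavior), showing how tree rebuilds are coalesced under load:
%%
%%   update_node/delete_node -> `dirty_tree', then internal maybe_advertise_state
%%   maybe_advertise_state   -> events counter EC > 5: park in `busy' and skip
%%                              rebuilding/advertising; otherwise rebuild the
%%                              tree, advertise it, and move to `cached'
%%   decrement_busy (every ~200-250 ms) -> EC is decremented; while `busy' it
%%                              also retries maybe_advertise_state, so the
%%                              backlog drains once the event rate drops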
297 | 
298 | %%%===================================================================
299 | %%% Internal functions
300 | %%%===================================================================
301 | 
302 | %% TODO:
303 | %% -Handle short-circuiting updating the tree if we're seeing too many events per second
304 | %% -Add metrics around tree updates / sec
305 | 
306 | -spec(handle_get_tree(Root :: node(), State :: state()) -> {state(), {tree, tree()}}).
307 | handle_get_tree(Root, State0 = #state{cache = Cache0}) ->
308 |   case Cache0 of
309 |     #{Root := Tree} ->
310 |       {State0, {tree, Tree}};
311 |     _ ->
312 |       Tree = build_tree(Root),
313 |       State1 = State0#state{cache = Cache0#{Root => Tree}},
314 |       {State1, {tree, Tree}}
315 |   end.
316 | 
317 | -spec(handle_update_node(Node :: node(), Edges :: [node()], State :: state()) -> state()).
318 | handle_update_node(Node, Dsts, State0) ->
319 |   Dsts1 = lists:usort(Dsts),
320 |   InitialNeighbors = neighbors(Node),
321 |   persist_node(Node),
322 |   % Ensure that the dsts exist
323 |   [persist_node(Dst) || Dst <- Dsts1],
324 |   update_edges(Node, Dsts1),
325 |   %% We only bust the caches if the adjacency list has changed.
326 |   %% Once we have properties on adjacencies and vertices,
327 |   %% we'll have to augment this
328 |   case Dsts1 of
329 |     InitialNeighbors ->
330 |       State0;
331 |     _ ->
332 |       bust_cache(State0)
333 |   end.
334 | 
335 | -spec(handle_delete_node(Node :: node(), State :: state()) -> state()).
336 | handle_delete_node(Node, State0) ->
337 |   ets:delete(vertices, Node),
338 |   State1 = handle_delete_node(Node, ets:first(vertices), State0),
339 |   bust_cache(State1).
340 | 
341 | -spec(handle_delete_node(DstNode :: node(), CurNode :: node() | '$end_of_table', State :: state()) -> state()).
342 | handle_delete_node(_DstNode, '$end_of_table', State) -> State;
343 | handle_delete_node(DstNode, CurNode, State) ->
344 |   Dsts0 = ets:lookup_element(vertices, CurNode, #vertex.dsts),
345 |   case ordsets:del_element(DstNode, Dsts0) of
346 |     Dsts0 ->
347 |       State;
348 |     Dsts1 ->
349 |       ets:update_element(vertices, CurNode, {#vertex.dsts, Dsts1})
350 |   end,
351 |   handle_delete_node(DstNode, ets:next(vertices, CurNode), State).
352 | 
353 | bust_cache(State0 = #state{}) ->
354 |   State0#state{cache = #{}}.
355 | 
356 | update_edges(Node, Dsts) ->
357 |   true = ets:update_element(vertices, Node, {#vertex.dsts, Dsts}).
358 | 
359 | neighbors(Node) ->
360 |   %% Ets lookup should _always_ return in order.
361 |   %% This means we do not need an ordset, since ets already does:
362 |   %% (1) Dedupe (which shouldn't happen)
363 |   %% (2) Sort the list, because it ensures we always take the same path ("smallest") when building the BFS
364 |   %%     since we add the nodes in order
365 |   case ets:lookup(vertices, Node) of
366 |     [] -> [];
367 |     [Vertex] ->
368 |       Vertex#vertex.dsts
369 |   end.
370 | 
371 | -spec(persist_node(Node :: node()) -> boolean()).
372 | persist_node(Node) ->
373 |   Vertex = #vertex{node = Node},
374 |   ets:insert_new(vertices, Vertex).
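%% A worked sketch of the BFS in build_tree/1 below, for a hypothetical vertex
%% set a@host -> [b@host, c@host], b@host -> [d@host], rooted at a@host:
%%
%%   pop a@host: b@host and c@host get {parent = a@host, distance = 1}, enqueued
%%   pop b@host: d@host gets {parent = b@host, distance = 2}, enqueued
%%   pop c@host, d@host: no new neighbors; the queue empties
%%   any vertex never visited keeps the #tree_entry{} default distance infinity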
375 | 376 | %% @doc 377 | %% Build the tree representation for Node = root 378 | %% @end 379 | 380 | %% We could put the tree in ets, but right now it's on the process heap 381 | %% We do store it in the tree_cache table 382 | %% The reason that it's not stored in an ets table is that it's a pain 383 | %% to incrementally (safely) update the tree 384 | 385 | %% We can't edit the tree while other people are observing it 386 | %% And this process can stall out for a little bit 387 | 388 | %% Therefore, we have this workaround of stashing the tree in a serialized object 389 | %% Also, we assume the routing table never grows to more than ~10k nodes 390 | 391 | -spec(build_tree(node()) -> tree()). 392 | build_tree(Node) -> 393 | Tree = ets:foldl(fun initialize_tree/2, #{}, vertices), 394 | %% Special case - it's its own parent 395 | Tree1 = update_node(Node, Node, 0, Tree), 396 | Queue = queue:new(), 397 | Queue1 = queue:in(Node, Queue), 398 | build_tree(Queue1, Tree1). 399 | 400 | initialize_tree(_Node = #vertex{node = Key}, Tree) -> 401 | Tree#{Key => #tree_entry{}}. 402 | 403 | -spec(update_node(Node :: node(), Parent :: node(), Distance :: distance(), Tree :: tree()) -> tree()). 404 | %% The exceptional case for the root node 405 | update_node(Node, Parent, Distance, Tree) when Node == Parent -> 406 | Entry0 = maps:get(Node, Tree, #tree_entry{}), 407 | Entry1 = Entry0#tree_entry{distance = Distance, parent = Parent}, 408 | Tree#{Node => Entry1}; 409 | update_node(Node, Parent, Distance, Tree) -> 410 | Entry0 = maps:get(Node, Tree), 411 | Entry1 = Entry0#tree_entry{distance = Distance, parent = Parent}, 412 | ParentEntry0 = #tree_entry{children = Children0} = maps:get(Parent, Tree, #tree_entry{}), 413 | Children1 = ordsets:add_element(Node, Children0), 414 | ParentEntry1 = ParentEntry0#tree_entry{children = Children1}, 415 | Tree#{Node => Entry1, Parent => ParentEntry1}. 416 | %Tree#{Node => NewEntry}. 417 | 418 | 419 | build_tree(Queue, Tree) -> 420 | case queue:out(Queue) of 421 | {{value, Current}, Queue2} -> 422 | Neighbors = neighbors(Current), 423 | FoldFun = fun(Neighbor, Acc) -> update_adjacency(Current, Neighbor, Acc) end, 424 | {Queue3, Tree1} = lists:foldl(FoldFun, {Queue2, Tree}, Neighbors), 425 | build_tree(Queue3, Tree1); 426 | {empty, Queue} -> 427 | Tree 428 | end. 429 | 430 | %% BFS - https://en.wikipedia.org/wiki/Breadth-first_search 431 | update_adjacency(Current, Neighbor, {Queue, Tree}) -> 432 | case distance(Neighbor, Tree) of 433 | infinity -> 434 | NewDistance = distance(Current, Tree) + 1, 435 | Tree1 = update_node(Neighbor, Current, NewDistance, Tree), 436 | Queue1 = queue:in(Neighbor, Queue), 437 | {Queue1, Tree1}; 438 | _ -> 439 | {Queue, Tree} 440 | end. 441 | 442 | %%%=================================================================== 443 | %%% Metrics functions 444 | %%%=================================================================== 445 | 446 | -spec(init_metrics() -> ok). 447 | init_metrics() -> 448 | prometheus_gauge:new([ 449 | {registry, lashup}, 450 | {name, gm_unreachable_nodes}, 451 | {help, "The number of unreachable nodes in the global membership table."} 452 | ]). 453 | 454 | %%%=================================================================== 455 | %%% Test functions 456 | %%%=================================================================== 457 | 458 | -ifdef(TEST). 459 | 460 | proper_test_() -> 461 | {setup, fun setup/0, fun cleanup/1, 462 | {timeout, 3600, [fun proper/0]} 463 | }. 
464 | 465 | setup() -> 466 | ok = logger:remove_handler(default), 467 | ok = application:start(prometheus), 468 | init_metrics(). 469 | 470 | cleanup(_) -> 471 | ok = application:stop(prometheus). 472 | 473 | proper() -> 474 | [] = proper:module(?MODULE, [{numtests, 10000}]). 475 | 476 | initial_state() -> 477 | Initial = #{}, 478 | Digraph = digraph:new(), 479 | update_state(Digraph, Initial). 480 | 481 | state_to_digraph(_State = #{family := Family}) -> 482 | sofs:family_to_digraph(Family). 483 | 484 | update_state(Digraph, State) -> 485 | Family = sofs:digraph_to_family(Digraph), 486 | digraph:delete(Digraph), 487 | State#{family => Family}. 488 | 489 | get_path(_State = #{family := Family}, A, B) -> 490 | Digraph = sofs:family_to_digraph(Family), 491 | Result = digraph:get_path(Digraph, A, B), 492 | digraph:delete(Digraph), 493 | Result. 494 | 495 | get_short_path(_State = #{family := Family}, A, B) -> 496 | Digraph = sofs:family_to_digraph(Family), 497 | Result = digraph:get_short_path(Digraph, A, B), 498 | digraph:delete(Digraph), 499 | Result. 500 | 501 | 502 | -define(NODES, [ 503 | node(), 504 | node1@localhost, node2@localhost, node3@localhost, 505 | node4@localhost, node5@localhost, node6@localhost, 506 | node7@localhost, node8@localhost, node9@localhost, 507 | node10@localhost, node11@localhost, node12@localhost, 508 | node13@localhost, node14@localhost, node15@localhost, 509 | node16@localhost, node17@localhost, node18@localhost, 510 | node19@localhost, node20@localhost, node21@localhost, 511 | node22@localhost, node23@localhost, node24@localhost, 512 | node25@localhost 513 | ]). 514 | 515 | precondition(_State, _Call) -> true. 516 | 517 | postcondition(_State, {call, ?MODULE, reachable, [Node]}, Result) when Node == node() -> 518 | true == Result; 519 | postcondition(State, {call, ?MODULE, reachable, [Node]}, Result) -> 520 | GetPath = get_path(State, node(), Node), 521 | is_list(GetPath) == Result; 522 | 523 | %% This is a divergence from the digraph module 524 | %% If the node is in our routing table 525 | %% We will say the route back to the node is itself. 526 | postcondition(_State, {call, ?MODULE, verify_routes, [FromNode, ToNode]}, Result) when FromNode == ToNode -> 527 | %% A node always has a route to itself. 528 | Result == [FromNode]; 529 | postcondition(State, {call, ?MODULE, verify_routes, [FromNode, ToNode]}, Result) -> 530 | DigraphPath = get_short_path(State, FromNode, ToNode), 531 | case {Result, DigraphPath} of 532 | {false, false} -> 533 | true; 534 | {false, P} when is_list(P) -> 535 | false; 536 | {P, false} when is_list(P) -> 537 | false; 538 | {Path1, Path2} when Path1 == Path2 -> 539 | true; 540 | {Path1, Path2} when length(Path1) =/= length(Path2) -> 541 | false; 542 | %% This is to take care of the case when there are multiple shortest paths 543 | {Path1, _Path2} -> 544 | Digraph = state_to_digraph(State), 545 | Subgraph = digraph_utils:subgraph(Digraph, Path1), 546 | digraph:delete(Digraph), 547 | SubgraphPath = digraph:get_short_path(Subgraph, FromNode, ToNode), 548 | digraph:delete(Subgraph), 549 | Path1 == SubgraphPath 550 | end; 551 | postcondition(_State, _Call, _Result) -> true. 
552 | 553 | next_state(State, _V, 554 | {call, gen_statem, call, [lashup_gm_route, {update_node, Node, NewNodes}]}) -> 555 | Digraph = state_to_digraph(State), 556 | digraph:add_vertex(Digraph, Node), 557 | [digraph:add_vertex(Digraph, NewNode) || NewNode <- NewNodes], 558 | OldEdges = digraph:out_edges(Digraph, Node), 559 | [digraph:del_edge(Digraph, OldEdge) || OldEdge <- OldEdges], 560 | [digraph:add_edge(Digraph, Node, NewNode) || NewNode <- NewNodes], 561 | update_state(Digraph, State); 562 | 563 | next_state(State, _V, 564 | {call, gen_statem, call, [lashup_gm_route, {delete_node, Node}]}) -> 565 | Digraph = state_to_digraph(State), 566 | digraph:del_vertex(Digraph, Node), 567 | update_state(Digraph, State); 568 | 569 | next_state(State, _V, _Call) -> 570 | State. 571 | 572 | node_gen() -> 573 | oneof(?NODES). 574 | 575 | node_gen_list(Except) -> 576 | list(elements(?NODES -- [Except])). 577 | 578 | update_node_gen() -> 579 | ?LET(Node, oneof(?NODES), {update_node, Node, node_gen_list(Node)}). 580 | 581 | verify_routes(FromNode, ToNode) -> 582 | case get_tree(FromNode) of 583 | false -> 584 | false; 585 | {tree, Tree} -> 586 | path_to(ToNode, Tree) 587 | end. 588 | 589 | command(_S) -> 590 | oneof([ 591 | {call, gen_statem, call, [?MODULE, update_node_gen()]}, 592 | {call, gen_statem, call, [?MODULE, {delete_node, node_gen()}]}, 593 | {call, ?MODULE, reachable, [node_gen()]}, 594 | {call, ?MODULE, verify_routes, [node_gen(), node_gen()]} 595 | ]). 596 | 597 | 598 | prop_server_works_fine() -> 599 | ?FORALL(Cmds, commands(?MODULE), 600 | ?TRAPEXIT( 601 | begin 602 | ?MODULE:start_link(), 603 | {History, State, Result} = run_commands(?MODULE, Cmds), 604 | ?MODULE:stop(), 605 | ?WHENFAIL(io:format("History: ~p\nState: ~p\nResult: ~p\n", 606 | [History, State, Result]), 607 | Result =:= ok) 608 | end)). 609 | 610 | stop() -> 611 | gen_statem:stop(?MODULE). 612 | 613 | -endif. 614 | -------------------------------------------------------------------------------- /src/lashup_gm_route_events.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_gm_route_events). 2 | -author("sdhillon"). 3 | -behaviour(gen_event). 4 | 5 | %% API 6 | -export([ 7 | start_link/0, 8 | subscribe/0, 9 | remote_subscribe/1, 10 | ingest/1 11 | ]). 12 | 13 | %% gen_event callbacks 14 | -export([init/1, handle_event/2, handle_call/2, 15 | handle_info/2, terminate/2, code_change/3]). 16 | 17 | -record(state, { 18 | reference = erlang:error() :: reference() , 19 | pid = erlang:error() :: pid(), 20 | tree = undefined :: lashup_gm_route:tree() | undefined 21 | }). 22 | -type state() :: #state{}. 23 | 24 | -spec(ingest(lashup_gm_route:tree()) -> ok). 25 | ingest(Tree) -> 26 | %% Because the lashup_gm_route, and lashup_gm_route_events fate-share in the supervisor, we should have the sup 27 | %% deal with restarting, and during tests, if lashup_gm_route_events isn't running, it should be okay 28 | catch gen_event:notify(?MODULE, {ingest, Tree}), 29 | ok. 30 | 31 | %% @doc 32 | %% Equivalent to {@link {@module}:remote_subscribe/1} with `Node' set to `node()' 33 | %% @end 34 | -spec(subscribe() -> {ok, reference()} | {'EXIT', term()} | {error, term()}). 35 | subscribe() -> 36 | remote_subscribe(node()). 37 | 38 | %% @doc 39 | %% Subscribes calling process to zero or more topics produced by Node. 
40 | %% 41 | %% Processes then get messages like: 42 | %% `{{@module}, #{ref => Reference, payload => Payload}}' 43 | %% @end 44 | -spec(remote_subscribe(Node :: node()) -> 45 | {ok, reference()} | {'EXIT', term()} | {error, term()}). 46 | remote_subscribe(Node) -> 47 | Reference = make_ref(), 48 | State = #state{pid = self(), reference = Reference}, 49 | EventMgrRef = event_mgr_ref(Node), 50 | case gen_event:add_sup_handler(EventMgrRef, ?MODULE, State) of 51 | ok -> 52 | {ok, Reference}; 53 | {'EXIT', Term} -> 54 | {'EXIT', Term}; 55 | Error -> 56 | {error, Error} 57 | end. 58 | 59 | event_mgr_ref(Node) when Node == node() -> 60 | ?MODULE; 61 | event_mgr_ref(Node) -> 62 | {?MODULE, Node}. 63 | 64 | -spec(start_link() -> {ok, pid()} | {error, {already_started, pid()}}). 65 | start_link() -> 66 | gen_event:start_link({local, ?MODULE}). 67 | 68 | %%%=================================================================== 69 | %%% gen_event callbacks 70 | %%%=================================================================== 71 | 72 | init(State) -> 73 | self() ! flush, 74 | {ok, State}. 75 | 76 | handle_event({ingest, Tree}, State) -> 77 | handle_ingest(Tree, State), 78 | {ok, State}; 79 | handle_event(_Event, State) -> 80 | {ok, State}. 81 | 82 | handle_call(_Request, State) -> 83 | Reply = ok, 84 | {ok, Reply, State}. 85 | 86 | handle_info(flush, State) -> 87 | lashup_gm_route:flush_events_helper(), 88 | {ok, State}; 89 | handle_info(_Info, State) -> 90 | {ok, State}. 91 | 92 | terminate(_Arg, _State) -> 93 | ok. 94 | 95 | code_change(_OldVsn, State, _Extra) -> 96 | {ok, State}. 97 | 98 | %%%=================================================================== 99 | %%% Internal functions 100 | %%%=================================================================== 101 | 102 | -spec(handle_ingest(lashup_gm_route:tree(), state()) -> state()). 103 | handle_ingest(Tree, State0 = #state{tree = Tree}) -> 104 | State0; 105 | handle_ingest(Tree, State0 = #state{reference = Reference, pid = Pid}) -> 106 | Event = #{type => tree, tree => Tree, ref => Reference}, 107 | Pid ! {?MODULE, Event}, 108 | State0#state{tree = Tree}. 109 | -------------------------------------------------------------------------------- /src/lashup_gm_sup.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_gm_sup). 2 | -behaviour(supervisor). 3 | 4 | -export([start_link/0]). 5 | -export([init/1]). 6 | 7 | -define(CHILD(I, Type), {I, {I, start_link, []}, permanent, 5000, Type, [I]}). 8 | 9 | start_link() -> 10 | supervisor:start_link({local, ?MODULE}, ?MODULE, []). 11 | 12 | init([]) -> 13 | {ok, {#{}, [ 14 | ?CHILD(lashup_gm_worker_sup, supervisor), 15 | ?CHILD(lashup_gm_events, worker), 16 | ?CHILD(lashup_gm_route, worker), 17 | ?CHILD(lashup_gm, worker), 18 | ?CHILD(lashup_gm_route_events, worker), 19 | ?CHILD(lashup_gm_probe, worker), 20 | ?CHILD(lashup_gm_mc_sup, supervisor) 21 | ]}}. 22 | 23 | -------------------------------------------------------------------------------- /src/lashup_gm_sync_worker.erl: -------------------------------------------------------------------------------- 1 | %%% @doc 2 | %%% This module is launched when dump_events is called to start the process of syncing the gm data between the two nodes 3 | %%% It's a temporary worker, unlike lashup_gm_fanout 4 | %%% @end 5 | 6 | -module(lashup_gm_sync_worker). 7 | -author("sdhillon"). 8 | 9 | -include_lib("stdlib/include/ms_transform.hrl"). 10 | -include("lashup.hrl"). 
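%% A sketch of the exchange this worker drives (see start_exchange/1 and
%% do_exchange/1 below; the counterpart messages come from
%% lashup_gm_fanout:aae_keys/2):
%%
%%   sync worker -> fanout: #{type => aae_keys, pid => self()}
%%   fanout -> sync worker: #{type => node_clock,
%%                            node_clock => {Node, {Epoch, Clock}}}   (per node)
%%   fanout -> sync worker: #{type => node_clock_complete}
%%   sync worker -> fanout: {event, BinaryTerm} for every member whose local
%%   clock is strictly newer, plus every member the remote never mentioned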
11 | 12 | %% API 13 | -export([ 14 | handle/1, 15 | start_link/1, 16 | do_handle/1 17 | ]). 18 | 19 | -record(state, { 20 | fanout_pid, 21 | nodes_checked = [] 22 | }). 23 | 24 | 25 | handle(Pid) -> 26 | Args = #{lashup_gm_fanout_pid => Pid}, 27 | ChildSpec = #{ 28 | id => make_ref(), 29 | start => {?MODULE, start_link, [Args]}, 30 | restart => temporary 31 | }, 32 | supervisor:start_child(lashup_gm_worker_sup, ChildSpec). 33 | 34 | 35 | start_link(Args) -> 36 | Opts = [link, {priority, low}], 37 | %% Basically never full sweep, because the process dies pretty quickly 38 | Pid = proc_lib:spawn_opt(?MODULE, do_handle, [Args], Opts), 39 | {ok, Pid}. 40 | 41 | 42 | do_handle(#{lashup_gm_fanout_pid := Pid}) -> 43 | link(Pid), 44 | State = #state{fanout_pid = Pid}, 45 | start_exchange(State). 46 | 47 | 48 | start_exchange(State) -> 49 | Message = #{type => aae_keys, pid => self()}, 50 | State#state.fanout_pid ! Message, 51 | do_exchange(State). 52 | 53 | 54 | do_exchange(State) -> 55 | receive 56 | #{type := node_clock} = NodeClock -> 57 | State1 = handle_node_clock(NodeClock, State), 58 | do_exchange(State1); 59 | #{type := node_clock_complete} -> 60 | finish_exchange(State) 61 | end. 62 | finish_exchange(State = #state{nodes_checked = NodesChecked}) -> 63 | NodesCheckedSet = ordsets:from_list(NodesChecked), 64 | send_unchecked_nodes(NodesCheckedSet, State). 65 | 66 | 67 | send_unchecked_nodes(NodesCheckedSet, State) -> 68 | MatchSpec = ets:fun2ms( 69 | fun(_Member = #member2{node = Node}) -> 70 | Node 71 | end 72 | ), 73 | Members = ets:select(members, MatchSpec), 74 | MembersList = ordsets:from_list(Members), 75 | NodesToSend = ordsets:subtract(MembersList, NodesCheckedSet), 76 | [send_member(Node, State) || Node <- NodesToSend], 77 | unlink(State#state.fanout_pid). 78 | 79 | 80 | handle_node_clock(_NodeClock = #{node_clock := {Node, RemoteClock = {_RemoteEpoch, _RemoteClock}}}, 81 | State = #state{nodes_checked = NodeChecked, fanout_pid = Pid}) -> 82 | case ets:lookup(members, Node) of 83 | [] -> 84 | State; 85 | [Member = #member2{value = #{epoch := LocalEpoch, clock := LocalClock}}] -> 86 | State1 = State#state{nodes_checked = [Node|NodeChecked]}, 87 | case RemoteClock < {LocalEpoch, LocalClock} of 88 | %% Only send my local version if I have a strictly "newer" clock 89 | true -> 90 | send_event(Pid, Member); 91 | false -> 92 | ok 93 | end, 94 | State1 95 | end. 96 | 97 | send_event(Pid, Member) -> 98 | UpdatedNode = to_event(Member), 99 | BinaryTerm = term_to_binary(UpdatedNode), 100 | erlang:send(Pid, {event, BinaryTerm}, [noconnect]). 101 | 102 | 103 | send_member(Node, _State = #state{fanout_pid = Pid}) -> 104 | case ets:lookup(members, Node) of 105 | [] -> 106 | ok; 107 | [Member] -> 108 | UpdatedNode = to_event(Member), 109 | BinaryTerm = term_to_binary(UpdatedNode), 110 | erlang:send(Pid, {event, BinaryTerm}, [noconnect]) 111 | end. 112 | 113 | 114 | to_event(Member = #member2{}) -> 115 | #{ 116 | message => updated_node, 117 | node => Member#member2.node, 118 | value => Member#member2.value, 119 | ttl => 1 120 | }. 121 | -------------------------------------------------------------------------------- /src/lashup_gm_worker_sup.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_gm_worker_sup). 2 | -behaviour(supervisor). 3 | 4 | -export([init/1]). 5 | -export([start_link/0]). 6 | 7 | start_link() -> 8 | supervisor:start_link({local, ?MODULE}, ?MODULE, []). 9 | 10 | init([]) -> 11 | {ok, {#{}, []}}. 
12 | -------------------------------------------------------------------------------- /src/lashup_hyparview_events.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_hyparview_events). 2 | -author("sdhillon"). 3 | -behaviour(gen_event). 4 | 5 | -include_lib("kernel/include/logger.hrl"). 6 | 7 | %% API 8 | -export([ 9 | start_link/0, 10 | subscribe/0, 11 | remote_subscribe/1, 12 | ingest/2 13 | ]). 14 | 15 | %% gen_event callbacks 16 | -export([init/1, handle_event/2, handle_call/2, 17 | handle_info/2, terminate/2, code_change/3]). 18 | 19 | -record(state, { 20 | pid, 21 | reference, 22 | active_view = ordsets:new(), 23 | passive_view = ordsets:new() 24 | }). 25 | 26 | 27 | -spec(start_link() -> {ok, pid()} | {error, {already_started, pid()}}). 28 | start_link() -> 29 | gen_event:start_link({local, ?MODULE}). 30 | 31 | -spec(subscribe() -> {ok, reference()} | {'EXIT', term()} | {error, term()}). 32 | subscribe() -> 33 | remote_subscribe(node()). 34 | 35 | ingest(ActiveView, PassiveView) -> 36 | gen_event:notify(?MODULE, {ingest, ActiveView, PassiveView}), 37 | ok. 38 | 39 | -spec(remote_subscribe(Node :: node()) -> 40 | {ok, reference()} | {'EXIT', term()} | {error, term()}). 41 | remote_subscribe(Node) -> 42 | Reference = make_ref(), 43 | State = #state{pid = self(), reference = Reference}, 44 | EventMgrRef = event_mgr_ref(Node), 45 | case gen_event:add_sup_handler(EventMgrRef, ?MODULE, State) of 46 | ok -> 47 | {ok, Reference}; 48 | {'EXIT', Term} -> 49 | {'EXIT', Term}; 50 | Error -> 51 | {error, Error} 52 | end. 53 | 54 | event_mgr_ref(Node) when Node == node() -> 55 | ?MODULE; 56 | event_mgr_ref(Node) -> 57 | {?MODULE, Node}. 58 | 59 | 60 | %%%=================================================================== 61 | %%% gen_event callbacks 62 | %%%=================================================================== 63 | 64 | init(State0) -> 65 | ActiveView = lashup_hyparview_membership:get_active_view(), 66 | PassiveView = lashup_hyparview_membership:get_passive_view(), 67 | State1 = State0#state{passive_view = PassiveView, active_view = ActiveView}, 68 | advertise(State1, [], [], ActiveView, PassiveView), 69 | {ok, State1}. 70 | 71 | handle_event({ingest, ActiveView1, PassiveView1}, 72 | State0 = #state{active_view = ActiveView0, passive_view = PassiveView0}) -> 73 | State1 = State0#state{active_view = ActiveView1, passive_view = PassiveView1}, 74 | advertise(State1, ActiveView0, PassiveView0, ActiveView1, PassiveView1), 75 | {ok, State1}. 76 | 77 | handle_call(_Request, State) -> 78 | Reply = ok, 79 | {ok, Reply, State}. 80 | 81 | handle_info(Info, State) -> 82 | ?LOG_WARNING("Received unknown info: ~p, in state: ~p", [Info, State]), 83 | {ok, State}. 84 | 85 | terminate(_Arg, _State) -> 86 | ok. 87 | 88 | code_change(_OldVsn, State, _Extra) -> 89 | {ok, State}. 90 | 91 | %%%=================================================================== 92 | %%% Internal functions 93 | %%%=================================================================== 94 | 95 | advertise(#state{reference = Reference, pid = Pid}, ActiveView0, PassiveView0, ActiveView1, PassiveView1) -> 96 | Event = #{type => current_views, ref => Reference, old_passive_view => PassiveView0, old_active_view => ActiveView0, 97 | passive_view => PassiveView1, active_view => ActiveView1}, 98 | Pid ! {?MODULE, Event}, 99 | ok. 
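%% A minimal subscriber sketch; the event shape mirrors advertise/5 above, and
%% because init/1 advertises the current views on subscription, the first
%% event arrives immediately:
%%
%%   {ok, Ref} = lashup_hyparview_events:subscribe(),
%%   receive
%%     {lashup_hyparview_events, #{type := current_views, ref := Ref,
%%         active_view := Active, passive_view := Passive}} ->
%%       {Active, Passive}
%%   end.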
100 | -------------------------------------------------------------------------------- /src/lashup_hyparview_ping_handler.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_hyparview_ping_handler). 2 | -author("sdhillon"). 3 | -behaviour(gen_server). 4 | 5 | -include_lib("kernel/include/logger.hrl"). 6 | 7 | %% API 8 | -export([ 9 | start_link/0, 10 | ping/1, 11 | check_max_ping_ms/0 12 | ]). 13 | 14 | %% gen_server callbacks 15 | -export([init/1, handle_call/3, 16 | handle_cast/2, handle_info/2]). 17 | 18 | -record(state, { 19 | pings_in_flight = orddict:new() :: orddict:orddict(Reference :: reference(), Node :: node()), 20 | ping_times = #{} :: map() 21 | }). 22 | -type state() :: #state{}. 23 | 24 | -type pong_message() :: map(). 25 | -type ping_message() :: map(). 26 | 27 | 28 | -spec(ping(node()) -> ok). 29 | ping(Node) -> 30 | gen_server:call(?MODULE, {ping, Node}), 31 | ok. 32 | 33 | check_max_ping_ms() -> 34 | %% Check if the user has manually set max ping ms, or if it's one of the settings we could have set for them 35 | case application:get_env(lashup, max_ping_ms) of 36 | Val when Val == undefined orelse Val == {ok, 10000} orelse Val == {ok, 30000} -> 37 | check_max_ping_ms2(); 38 | _ -> 39 | ok 40 | end. 41 | 42 | check_max_ping_ms2() -> 43 | case lashup_gm:gm() of 44 | Members when length(Members) > 1000 -> 45 | application:set_env(lashup, ping_log_base, 1.0009), 46 | application:set_env(lashup, max_ping_ms, 30000); 47 | Members when length(Members) > 500 -> 48 | application:set_env(lashup, ping_log_base, 1.00034), 49 | application:set_env(lashup, max_ping_ms, 10000); 50 | _ -> 51 | ok 52 | end. 53 | 54 | -spec(start_link() -> 55 | {ok, Pid :: pid()} | ignore | {error, Reason :: term()}). 56 | start_link() -> 57 | gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). 58 | 59 | %%%=================================================================== 60 | %%% gen_server callbacks 61 | %%%=================================================================== 62 | 63 | init([]) -> 64 | process_flag(priority, high), 65 | ok = net_kernel:monitor_nodes(true), 66 | %% The reason not to randomize this is that we'd prefer all nodes pause around the same time 67 | %% It creates an easier to debug situation if this call actually does kill performance 68 | timer:apply_interval(10000, ?MODULE, check_max_ping_ms, []), 69 | {ok, #state{}}. 70 | 71 | handle_call({ping, Node}, _From, State) -> 72 | State1 = do_ping(Node, State), 73 | {reply, ok, State1}; 74 | handle_call(_Request, _From, State) -> 75 | {reply, ok, State}. 76 | 77 | handle_cast(_Request, State) -> 78 | {noreply, State}. 79 | 80 | handle_info(PingMessage = #{message := ping}, State) -> 81 | handle_ping(PingMessage, State), 82 | {noreply, State, lashup_utils:hibernate()}; 83 | handle_info(PongMessage = #{message := pong}, State) -> 84 | State1 = handle_pong(PongMessage, State), 85 | {noreply, State1, lashup_utils:hibernate()}; 86 | handle_info({nodedown, NodeName}, State0 = #state{ping_times = PingTimes0}) -> 87 | PingTimes1 = maps:remove(NodeName, PingTimes0), 88 | State1 = State0#state{ping_times = PingTimes1}, 89 | {noreply, State1}; 90 | handle_info({ping_failed, NRef}, State) -> 91 | State1 = handle_ping_failed(NRef, State), 92 | {noreply, State1}; 93 | handle_info(_Info, State) -> 94 | {noreply, State}.
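%% Worked example of the adaptive tuning above: defaults apply up to 500
%% known members; past 500, check_max_ping_ms2/0 installs max_ping_ms = 10000
%% with ping_log_base = 1.00034, and past 1000 members max_ping_ms = 30000
%% with ping_log_base = 1.0009. Because the guard in check_max_ping_ms/0 only
%% matches values this module could itself have set, an operator-configured
%% max_ping_ms is never overwritten.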
95 | 96 | %%%=================================================================== 97 | %%% Internal functions 98 | %%%=================================================================== 99 | 100 | -spec(do_ping(node(), state()) -> state()). 101 | do_ping(Node, State0 = #state{pings_in_flight = PIF, ping_times = PingTimes}) -> 102 | Now = erlang:monotonic_time(milli_seconds), 103 | Ref = make_ref(), 104 | MaxEstRTT = determine_ping_time(Node, State0), 105 | {ok, TimerRef} = timer:send_after(MaxEstRTT, {ping_failed, Ref}), 106 | Message = #{message => ping, from => self(), now => Now, ref => Ref, timer_ref => TimerRef}, 107 | case erlang:send({?MODULE, Node}, Message, [noconnect, nosuspend]) of 108 | ok -> 109 | PIF2 = orddict:store(Ref, {MaxEstRTT, Node}, PIF), 110 | State0#state{pings_in_flight = PIF2}; 111 | %% Treat ping as failed 112 | _ -> 113 | ?LOG_INFO("Ping to node ~p failed, because erlang:send failed", [Node]), 114 | timer:cancel(TimerRef), 115 | lashup_hyparview_membership:ping_failed(Node), 116 | PingTimes2 = maps:remove(Node, PingTimes), 117 | State0#state{ping_times = PingTimes2} 118 | end. 119 | 120 | -spec(handle_ping_failed(reference(), state()) -> state()). 121 | handle_ping_failed(Ref, State = #state{ping_times = PingTimes, pings_in_flight = PIF}) -> 122 | case orddict:find(Ref, PIF) of 123 | {ok, {RTT, Node}} -> 124 | ?LOG_INFO("Didn't receive Pong from Node: ~p in time: ~p", [Node, RTT]), 125 | lashup_hyparview_membership:ping_failed(Node), 126 | PIF2 = orddict:erase(Ref, PIF), 127 | PingTimes2 = maps:remove(Node, PingTimes), 128 | State#state{pings_in_flight = PIF2, ping_times = PingTimes2}; 129 | error -> 130 | State 131 | end. 132 | 133 | -spec(handle_ping(ping_message(), state()) -> ok). 134 | handle_ping(PingMessage = #{from := From}, _State) -> 135 | PongMessage = PingMessage#{message => pong, receiving_node => node()}, 136 | erlang:send(From, PongMessage, [noconnect, nosuspend]), 137 | ok. 138 | 139 | -spec(handle_pong(pong_message(), state()) -> state()). 140 | handle_pong(PongMessage = #{ref := Ref, timer_ref := TimerRef}, State0 = #state{pings_in_flight = PIF0}) -> 141 | lashup_hyparview_membership:recognize_pong(PongMessage), 142 | timer:cancel(TimerRef), 143 | PIF1 = orddict:erase(Ref, PIF0), 144 | State1 = record_pong(PongMessage, State0), 145 | State1#state{pings_in_flight = PIF1}. 146 | 147 | %% This stores the pongs and pong timings 148 | -spec(record_pong(pong_message(), state()) -> state()). 149 | record_pong(_PongMessage = #{receiving_node := ReceivingNode, now := SendTime}, 150 | State0 = #state{ping_times = PingTimes}) -> 151 | %% {RecordedTime :: integer(), RTT :: integer()} 152 | Now = erlang:monotonic_time(milli_seconds), 153 | LastRTT = Now - SendTime, 154 | PingTimes1 = PingTimes#{ReceivingNode => LastRTT}, 155 | State0#state{ping_times = PingTimes1}. 156 | 157 | %% RTT is in milliseconds 158 | -spec(determine_ping_time(node(), state()) -> RTT :: non_neg_integer()). 159 | determine_ping_time(Node, #state{ping_times = PingTimes}) -> 160 | %% If unknown then might as well return the MAX PING 161 | case maps:find(Node, PingTimes) of 162 | error -> 163 | lashup_config:max_ping_ms(); 164 | {ok, LastRTT} -> 165 | %% 2 MS is the noise floor 166 | MinPingMs = lashup_config:min_ping_ms(), 167 | RTT = lists:max([MinPingMs, LastRTT]), 168 | trunc(math:log(RTT) / math:log(lashup_config:ping_log_base())) 169 | end. 
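%% Shell sketch of the curve in determine_ping_time/2 above, using the
%% ping_log_base of 1.0009 that check_max_ping_ms2/0 installs for clusters
%% past 1000 members: the allowed round-trip timeout grows logarithmically,
%% not linearly, in the last measured RTT (all values in milliseconds).
%%
%%     1> [trunc(math:log(RTT) / math:log(1.0009)) || RTT <- [2, 10, 100, 1000]].
%%     [770,2559,5119,7678]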
170 | -------------------------------------------------------------------------------- /src/lashup_kv.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_kv). 2 | -author("sdhillon"). 3 | -behaviour(gen_server). 4 | 5 | -include_lib("kernel/include/logger.hrl"). 6 | -include_lib("stdlib/include/ms_transform.hrl"). 7 | 8 | %% API 9 | -export([ 10 | start_link/0, 11 | request_op/2, 12 | request_op/3, 13 | keys/1, 14 | value/1, 15 | value2/1, 16 | raw_value/1, 17 | descends/2, 18 | subscribe/1, 19 | unsubscribe/1, 20 | flush/2, 21 | first_key/0, 22 | next_key/1, 23 | read_lclock/1, 24 | write_lclock/2, 25 | init_metrics/0 26 | ]). 27 | 28 | %% gen_server callbacks 29 | -export([init/1, handle_call/3, 30 | handle_cast/2, handle_info/2]). 31 | 32 | -export_type([key/0, lclock/0, kv2map/0, kv2raw/0]). 33 | 34 | -define(KV_TABLE, kv2). 35 | -define(INIT_LCLOCK, -1). 36 | -define(WARN_OBJECT_SIZE_MB, 60). 37 | -define(REJECT_OBJECT_SIZE_MB, 100). 38 | -define(MAX_MESSAGE_QUEUE_LEN, 32). 39 | -define(KV_TOPIC, lashup_kv_20161114). 40 | 41 | -record(kv2, { 42 | key = erlang:error() :: key() | '_', 43 | map = riak_dt_map:new() :: riak_dt_map:dt_map() | '_', 44 | vclock = riak_dt_vclock:fresh() :: riak_dt_vclock:vclock() | '_', 45 | lclock = 0 :: lclock() | '_' 46 | }). 47 | -type kv() :: #kv2{}. 48 | 49 | -record(nclock, { 50 | key :: node(), 51 | lclock :: lclock() 52 | }). 53 | -type nclock() :: #nclock{}. 54 | 55 | -type key() :: term(). 56 | -type lclock() :: non_neg_integer(). % logical clock 57 | -type kv2map() :: #{key => key(), 58 | value => riak_dt_map:value(), 59 | old_value => riak_dt_map:value()}. 60 | -type kv2raw() :: #{key => key(), 61 | value => term(), 62 | vclock => riak_dt_vclock:vclock(), 63 | lclock => lclock()}. 64 | 65 | -record(state, { 66 | mc_ref = erlang:error() :: reference(), 67 | subscribers = #{} :: #{pid() => {ets:comp_match_spec(), reference()}} 68 | }). 69 | -type state() :: #state{}. 70 | 71 | 72 | -spec(request_op(Key :: key(), Op :: riak_dt_map:map_op()) -> 73 | {ok, riak_dt_map:value()} | {error, Reason :: term()}). 74 | request_op(Key, Op) -> 75 | request_op(Key, undefined, Op). 76 | 77 | -spec(request_op(Key :: key(), Context :: riak_dt_vclock:vclock() | undefined, Op :: riak_dt_map:map_op()) -> 78 | {ok, riak_dt_map:value()} | {error, Reason :: term()}). 79 | request_op(Key, VClock, Op) -> 80 | Pid = whereis(?MODULE), 81 | Args = {op, Key, VClock, Op}, 82 | try erlang:process_info(Pid, message_queue_len) of 83 | {message_queue_len, MsgQueueLen} -> 84 | maybe_request_op_call(Pid, MsgQueueLen, Args) 85 | catch error:badarg -> 86 | exit({noproc, {gen_server, call, [?MODULE, Args]}}) 87 | end. 88 | 89 | -spec(maybe_request_op_call(Pid :: pid(), MsgQueueLen :: integer(), 90 | {op, Key :: key(), Context :: riak_dt_vclock:vclock() | undefined, Op :: riak_dt_map:map_op()}) -> 91 | {ok, riak_dt_map:value()} | {error, Reason :: term()}). 92 | maybe_request_op_call(Pid, MsgQueueLen, Args) -> 93 | prometheus_gauge:set(lashup, kv_message_queue_length, [], MsgQueueLen), 94 | MaxMsgQueueLen = max_message_queue_len(), 95 | case MsgQueueLen > MaxMsgQueueLen of 96 | false -> 97 | prometheus_summary:observe_duration( 98 | lashup, kv_op_with_latency_seconds, [], 99 | fun () -> gen_server:call(Pid, Args, infinity) end); 100 | true -> 101 | prometheus_counter:inc(lashup, kv_message_queue_overflows_total, [], 1), 102 | {error, overflow} 103 | end. 104 | 105 | -spec(keys(ets:match_spec()) -> [key()]). 
106 | keys(MatchSpec) -> 107 | op_getkeys(MatchSpec). 108 | 109 | -spec(value(Key :: key()) -> riak_dt_map:value()). 110 | value(Key) -> 111 | {_, KV} = op_getkv(Key), 112 | riak_dt_map:value(KV#kv2.map). 113 | 114 | -spec(value2(Key :: key()) -> {riak_dt_map:value(), riak_dt_vclock:vclock()}). 115 | value2(Key) -> 116 | {_, KV} = op_getkv(Key), 117 | {riak_dt_map:value(KV#kv2.map), KV#kv2.vclock}. 118 | 119 | -spec(raw_value(key()) -> kv2raw() | false). 120 | raw_value(Key) -> 121 | case op_getkv(Key) of 122 | {existing, #kv2{map=Map, vclock=VClock, lclock=LClock}} -> 123 | #{key => Key, value => Map, vclock => VClock, lclock => LClock}; 124 | {new, _Value} -> 125 | false 126 | end. 127 | 128 | -spec(descends(key(), riak_dt_vclock:vclock()) -> boolean()). 129 | descends(Key, VClock) -> 130 | %% Check if LocalVClock is a direct descendant of the VClock 131 | case op_getkv(Key) of 132 | {existing, #kv2{vclock = LocalVClock}} -> 133 | riak_dt_vclock:descends(LocalVClock, VClock); 134 | {new, _Value} -> 135 | false 136 | end. 137 | 138 | -spec(subscribe(ets:match_spec()) -> {ok, reference()}). 139 | subscribe(MatchSpec) -> 140 | CompMatchSpec = ets:match_spec_compile(MatchSpec), 141 | {ok, Ref} = gen_server:call(?MODULE, {subscribe, CompMatchSpec}), 142 | lists:foreach(fun (Key) -> 143 | self() ! {lashup_kv_event, Ref, Key} 144 | end, keys(MatchSpec)), 145 | {ok, Ref}. 146 | 147 | -spec(unsubscribe(Ref :: reference()) -> ok). 148 | unsubscribe(Ref) -> 149 | ok = gen_server:call(?MODULE, {unsubscribe, Ref}), 150 | flush_all(Ref). 151 | 152 | -spec(flush_all(reference()) -> ok). 153 | flush_all(Ref) -> 154 | receive 155 | {lashup_kv_event, Ref, _Key} -> 156 | flush_all(Ref) 157 | after 0 -> 158 | ok 159 | end. 160 | 161 | -spec(flush(Ref :: reference(), Key :: term()) -> ok). 162 | flush(Ref, Key) -> 163 | receive 164 | {lashup_kv_event, Ref, Key} -> 165 | flush(Ref, Key) 166 | after 0 -> 167 | ok 168 | end. 169 | 170 | -spec(first_key() -> key() | '$end_of_table'). 171 | first_key() -> 172 | prometheus_summary:observe_duration( 173 | lashup, kv_backend_read_seconds, [kv2], 174 | fun () -> mnesia:dirty_first(?KV_TABLE) end). 175 | 176 | -spec(next_key(key()) -> key() | '$end_of_table'). 177 | next_key(Key) -> 178 | prometheus_summary:observe_duration( 179 | lashup, kv_backend_read_seconds, [kv2], 180 | fun () -> mnesia:dirty_next(?KV_TABLE, Key) end). 181 | 182 | -spec(read_lclock(node()) -> lclock()). 183 | read_lclock(Node) -> 184 | get_lclock(fun mnesia:dirty_read/2, Node). 185 | 186 | -spec(write_lclock(node(), lclock()) -> ok | {error, term()}). 187 | write_lclock(Node, LClock) -> 188 | Begin = erlang:monotonic_time(), 189 | Fun = fun () -> mnesia:write(#nclock{key = Node, lclock = LClock}) end, 190 | try mnesia:sync_transaction(Fun) of 191 | {atomic, _} -> 192 | ok; 193 | {aborted, Reason} -> 194 | ?LOG_ERROR("Couldn't write to nclock table because ~p", [Reason]), 195 | {error, Reason} 196 | after 197 | prometheus_summary:observe( 198 | lashup, kv_backend_write_seconds, [nclock], 199 | erlang:monotonic_time() - Begin) 200 | end. 201 | 202 | -spec(start_link() -> 203 | {ok, Pid :: pid()} | ignore | {error, Reason :: term()}). 204 | start_link() -> 205 | gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). 
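%% Usage sketch (the key and field below are illustrative): operations take
%% riak_dt_map's op format, and the timestamped assign is riak_dt_lwwreg's.
%%
%%     Key = [example, config],
%%     Field = {value, riak_dt_lwwreg},
%%     Op = {update, [{update, Field,
%%                     {assign, <<"hello">>, erlang:system_time(nano_seconds)}}]},
%%     {ok, _NewValue} = lashup_kv:request_op(Key, Op),
%%     Value = lashup_kv:value(Key).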
206 | 207 | %%%=================================================================== 208 | %%% gen_server callbacks 209 | %%%=================================================================== 210 | 211 | init([]) -> 212 | set_off_heap(), 213 | init_db(), 214 | %% Maybe read_concurrency? 215 | {ok, Reference} = lashup_gm_mc_events:subscribe([?KV_TOPIC]), 216 | {ok, #state{mc_ref = Reference}}. 217 | 218 | handle_call({op, Key, VClock, Op}, _From, State) -> 219 | {Reply, State1} = handle_op(Key, Op, VClock, State), 220 | {reply, Reply, State1, lashup_utils:hibernate()}; 221 | handle_call({start_kv_sync_fsm, RemoteInitiatorNode, RemoteInitiatorPid}, _From, State) -> 222 | Result = lashup_kv_aae_sup:receive_aae(RemoteInitiatorNode, RemoteInitiatorPid), 223 | {reply, Result, State}; 224 | handle_call({subscribe, CompMatchSpec}, {Pid, _Tag}, State) -> 225 | {Ref, State0} = handle_subscribe(Pid, CompMatchSpec, State), 226 | {reply, {ok, Ref}, State0}; 227 | handle_call({unsubscribe, _Ref}, {Pid, _Tag}, State) -> 228 | State0 = handle_unsubscribe(Pid, State), 229 | {reply, ok, State0}; 230 | handle_call(_Request, _From, State) -> 231 | {reply, {error, unknown_request}, State}. 232 | 233 | %% A maybe update from the sync FSM 234 | handle_cast({maybe_update, Key, VClock, Map}, State0) -> 235 | State1 = handle_full_update(#{key => Key, vclock => VClock, map => Map}, State0), 236 | {noreply, State1, lashup_utils:hibernate()}; 237 | handle_cast(_Request, State) -> 238 | {noreply, State}. 239 | 240 | handle_info({lashup_gm_mc_event, Event = #{ref := Ref}}, State = #state{mc_ref = Ref}) -> 241 | MaxMsgQueueLen = max_message_queue_len(), 242 | {message_queue_len, MsgQueueLen} = 243 | erlang:process_info(self(), message_queue_len), 244 | prometheus_gauge:set(lashup, kv_message_queue_length, [], MsgQueueLen), 245 | case MsgQueueLen > MaxMsgQueueLen of 246 | true -> 247 | ?LOG_ERROR("lashup_kv: message queue overflowed, ~p", [MsgQueueLen]), 248 | prometheus_counter:inc(lashup, kv_message_queue_overflows_total, [], 1), 249 | {noreply, State, lashup_utils:hibernate()}; 250 | false -> 251 | State1 = handle_lashup_gm_mc_event(Event, State), 252 | {noreply, State1, lashup_utils:hibernate()} 253 | end; 254 | handle_info({'DOWN', _MonRef, process, Pid, _Info}, State) -> 255 | State0 = handle_unsubscribe(Pid, State), 256 | {noreply, State0}; 257 | handle_info(_Info, State) -> 258 | {noreply, State}. 259 | 260 | %%%=================================================================== 261 | %%% Internal functions 262 | %%%=================================================================== 263 | 264 | -spec(max_message_queue_len() -> pos_integer()). 265 | max_message_queue_len() -> 266 | application:get_env(lashup, max_message_queue_len, ?MAX_MESSAGE_QUEUE_LEN). 267 | 268 | -spec(set_off_heap() -> on_heap | off_heap). 269 | set_off_heap() -> 270 | try 271 | % Garbage collection with many messages placed on the heap can become 272 | % extremely expensive and the process can consume large amounts of memory. 273 | erlang:process_flag(message_queue_data, off_heap) 274 | catch error:badarg -> 275 | % The off_heap option is available in OTP 20.0-rc2 and later 276 | off_heap 277 | end. 278 | 279 | %% Mostly borrowed from: https://github.com/ChicagoBoss/ChicagoBoss/wiki/Automatic-schema-initialization-for-mnesia 280 | -spec(init_db() -> ok). 281 | init_db() -> 282 | init_db([node()]). 283 | 284 | -spec(init_db([node()]) -> ok).
285 | init_db(Nodes) -> 286 | mnesia:create_schema(Nodes), 287 | mnesia:change_table_copy_type (schema, node(), disc_copies), % If the node was already running 288 | {ok, _} = application:ensure_all_started(mnesia), 289 | ExistingTables = mnesia:system_info(tables), 290 | Tables = [?KV_TABLE, nclock], 291 | TablesToCreate = Tables -- ExistingTables, 292 | Alltables = TablesToCreate ++ ExistingTables, 293 | lists:foreach(fun create_table/1, TablesToCreate), 294 | case mnesia:wait_for_tables(Alltables, 60000) of 295 | ok -> 296 | ok; 297 | {timeout, BadTables} -> 298 | ?LOG_ALERT("Couldn't initialize mnesia tables: ~p", [BadTables]), 299 | init:stop(1); 300 | {error, Error} -> 301 | ?LOG_ALERT("Couldn't initialize mnesia tables: ~p", [Error]), 302 | init:stop(1) 303 | end. 304 | 305 | create_table(Table) -> 306 | {atomic, ok} = mnesia:create_table(Table, [ 307 | {attributes, get_record_info(Table)}, 308 | {disc_copies, [node()]}, 309 | {type, set} 310 | ]). 311 | 312 | get_record_info(kv2) -> 313 | record_info(fields, kv2); 314 | get_record_info(nclock) -> 315 | record_info(fields, nclock). 316 | 317 | -spec(mk_write_fun(Key :: key(), OldVClock :: riak_dt_vclock:vclock() | undefined, 318 | Op :: riak_dt_map:map_op()) -> (fun())). 319 | mk_write_fun(Key, OldVClock, Op) -> 320 | fun() -> 321 | {NewKV, NClock} = 322 | case safe_read(?KV_TABLE, Key) of 323 | [] -> 324 | prepare_kv(Key, riak_dt_map:new(), riak_dt_vclock:fresh(), Op); 325 | [#kv2{vclock = VClock}] when OldVClock =/= undefined andalso VClock =/= OldVClock -> 326 | mnesia:abort(concurrency_violation); 327 | [#kv2{vclock = VClock, map = Map}] -> 328 | prepare_kv(Key, Map, VClock, Op) 329 | end, 330 | case check_map(NewKV) of 331 | {error, Error} -> 332 | mnesia:abort(Error); 333 | ok -> 334 | op_write(NewKV), 335 | op_write(NClock) 336 | end, 337 | NewKV 338 | end. 339 | 340 | -spec(safe_read(Table :: atom(), Key :: key()) -> [kv()]). 341 | safe_read(Table, Key) -> 342 | prometheus_summary:observe_duration( 343 | lashup, kv_backend_read_seconds, [Table], 344 | fun () -> mnesia:read(Table, Key, write) end). 345 | 346 | -spec(prepare_kv(Key :: key(), Map0 :: riak_dt_map:dt_map(), VClock0 :: riak_dt_vclock:vclock() | undefined, 347 | Op :: riak_dt_map:map_op()) -> {kv(), nclock()}). 348 | prepare_kv(Key, Map0, VClock0, Op) -> 349 | Node = node(), 350 | VClock1 = riak_dt_vclock:increment(Node, VClock0), 351 | Counter = riak_dt_vclock:get_counter(Node, VClock1), 352 | Dot = {Node, Counter}, 353 | Map2 = 354 | case 355 | prometheus_summary:observe_duration( 356 | lashup, kv_crdt_op_seconds, [], 357 | fun () -> riak_dt_map:update(Op, Dot, Map0) end) 358 | of 359 | {ok, Map1} -> Map1; 360 | {error, {precondition, {not_present, _Field}}} -> Map0 361 | end, 362 | LClock0 = get_lclock(Node), 363 | LClock1 = increment_lclock(LClock0), 364 | {#kv2{key = Key, vclock = VClock1, map = Map2, lclock = LClock1}, 365 | #nclock{key = Node, lclock = LClock1}}. 366 | 367 | -spec handle_op(Key :: term(), Op :: riak_dt_map:map_op(), OldVClock :: riak_dt_vclock:vclock() | undefined, 368 | State :: state()) -> {Reply :: term(), State1 :: state()}. 
369 | handle_op(Key, Op, OldVClock, State) -> 370 | Begin = erlang:monotonic_time(), 371 | %% We really want to make sure this persists and we don't have backwards traveling clocks 372 | Fun = mk_write_fun(Key, OldVClock, Op), 373 | try mnesia:sync_transaction(Fun) of 374 | {atomic, NewKV} -> 375 | ok = mnesia:sync_log(), 376 | dumped = mnesia:dump_log(), 377 | propagate(NewKV), 378 | NewValue = riak_dt_map:value(NewKV#kv2.map), 379 | State0 = notify_subscribers(Key, State), 380 | {{ok, NewValue}, State0}; 381 | {aborted, Reason} -> 382 | {{error, Reason}, State} 383 | after 384 | prometheus_summary:observe( 385 | lashup, kv_op_seconds, [], 386 | erlang:monotonic_time() - Begin) 387 | end. 388 | 389 | %% TODO: Add metrics 390 | -spec(check_map(kv()) -> {error, Reason :: term()} | ok). 391 | check_map(NewKV = #kv2{key = Key}) -> 392 | case erlang:external_size(NewKV) of 393 | Size when Size > ?REJECT_OBJECT_SIZE_MB * 1000000 -> 394 | {error, value_too_large}; 395 | Size when Size > (?WARN_OBJECT_SIZE_MB + ?REJECT_OBJECT_SIZE_MB) / 2 * 1000000 -> 396 | ?LOG_WARNING("WARNING: Object '~p' is growing too large at ~p bytes (REJECTION IMMINENT)", [Key, Size]), 397 | ok; 398 | Size when Size > ?WARN_OBJECT_SIZE_MB * 1000000 -> 399 | ?LOG_WARNING("WARNING: Object '~p' is growing too large at ~p bytes", [Key, Size]), 400 | ok; 401 | _ -> 402 | ok 403 | end. 404 | 405 | -spec (propagate(kv()) -> ok). 406 | propagate(_KV = #kv2{key = Key, map = Map, vclock = VClock}) -> 407 | Payload = #{type => full_update, reason => op, key => Key, map => Map, vclock => VClock}, 408 | lashup_gm_mc:multicast(?KV_TOPIC, Payload), 409 | ok. 410 | 411 | % @private either gets the KV object for a given key, or returns an empty one 412 | -spec(op_getkv(key()) -> {new, kv()} | {existing, kv()}). 413 | op_getkv(Key) -> 414 | Begin = erlang:monotonic_time(), 415 | try mnesia:dirty_read(?KV_TABLE, Key) of 416 | [] -> 417 | {new, #kv2{key = Key}}; 418 | [KV] -> 419 | {existing, KV} 420 | after 421 | prometheus_summary:observe( 422 | lashup, kv_backend_read_seconds, [kv2], 423 | erlang:monotonic_time() - Begin) 424 | end. 425 | 426 | -spec(op_getkeys(ets:match_spec()) -> [key()]). 427 | op_getkeys(MatchSpec) -> 428 | Keys = op_dirty_all_keys(?KV_TABLE), 429 | MatchSpecCompiled = ets:match_spec_compile(MatchSpec), 430 | [Key || Key <- Keys, [true] == ets:match_spec_run([{Key}], MatchSpecCompiled)]. 431 | 432 | -spec(op_dirty_all_keys(Table :: atom()) -> [term()]). 433 | op_dirty_all_keys(Table) -> 434 | prometheus_summary:observe_duration( 435 | lashup, kv_backend_read_seconds, [Table], 436 | fun () -> mnesia:dirty_all_keys(Table) end). 437 | 438 | -spec(get_lclock(node()) -> lclock()). 439 | get_lclock(Key) -> 440 | get_lclock(fun mnesia:read/2, Key). 441 | 442 | -spec(get_lclock(fun(), node()) -> lclock()). 443 | get_lclock(ReadFun, Key) -> 444 | Begin = erlang:monotonic_time(), 445 | try ReadFun(nclock, Key) of 446 | [] -> 447 | ?INIT_LCLOCK; 448 | [#nclock{lclock = LClock}] -> 449 | LClock 450 | after 451 | prometheus_summary:observe( 452 | lashup, kv_backend_read_seconds, [nclock], 453 | erlang:monotonic_time() - Begin) 454 | end. 455 | 456 | -spec(handle_lashup_gm_mc_event(map(), state()) -> state()). 457 | handle_lashup_gm_mc_event(#{payload := #{type := full_update} = Payload}, State) -> 458 | handle_full_update(Payload, State); 459 | handle_lashup_gm_mc_event(Payload, State) -> 460 | ?LOG_DEBUG("Unknown GM MC event: ~p", [Payload]), 461 | State. 
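%% Sizing note: check_map/1 above accepts objects silently up to
%% ?WARN_OBJECT_SIZE_MB (60MB) of external size, warns beyond that, escalates
%% to "REJECTION IMMINENT" past (60 + 100) / 2 = 80MB, and aborts the
%% transaction with value_too_large past ?REJECT_OBJECT_SIZE_MB (100MB).
%%
%% Decision table for the full-update path below: should_full_update/3
%% compares the remote vclock against the local one with
%% riak_dt_vclock:descends/2.
%%   remote descends local, not vice versa -> merge and store (remote is newer)
%%   neither descends the other            -> merge and store (concurrent writes)
%%   equal, or local descends remote       -> keep the local copy unchanged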
462 | 463 | -spec(mk_full_update_fun(Key :: key(), RemoteMap :: riak_dt_map:dt_map(), 464 | RemoteVClock :: riak_dt_vclock:vclock()) 465 | -> fun(() -> kv())). 466 | mk_full_update_fun(Key, RemoteMap, RemoteVClock) -> 467 | fun() -> 468 | case mnesia:read(?KV_TABLE, Key, write) of 469 | [] -> 470 | LClock0 = get_lclock(node()), 471 | LClock1 = increment_lclock(LClock0), 472 | KV = #kv2{key = Key, vclock = RemoteVClock, map = RemoteMap, lclock = LClock1}, 473 | NClock = #nclock{key = node(), lclock = LClock1}, 474 | ok = op_write(KV), 475 | ok = op_write(NClock), 476 | KV; 477 | [KV] -> 478 | maybe_full_update(should_full_update(KV, RemoteMap, RemoteVClock)) 479 | end 480 | end. 481 | 482 | -spec(maybe_full_update({true | false, kv(), nclock()}) -> kv()). 483 | maybe_full_update({false, KV, _}) -> 484 | KV; 485 | maybe_full_update({true, KV, NClock}) -> 486 | ok = op_write(KV), 487 | ok = op_write(NClock), 488 | KV. 489 | 490 | -spec(should_full_update(LocalKV :: kv(), RemoteMap :: riak_dt_map:dt_map(), 491 | RemoteVClock :: riak_dt_vclock:vclock()) 492 | -> {true | false, kv(), nclock()}). 493 | should_full_update(LocalKV = #kv2{vclock = LocalVClock}, RemoteMap, RemoteVClock) -> 494 | case {riak_dt_vclock:descends(RemoteVClock, LocalVClock), riak_dt_vclock:descends(LocalVClock, RemoteVClock)} of 495 | {true, false} -> 496 | create_full_update(LocalKV, RemoteMap, RemoteVClock); 497 | {false, false} -> 498 | create_full_update(LocalKV, RemoteMap, RemoteVClock); 499 | %% Either they are equal, or the local one is newer - perhaps trigger AAE? 500 | _ -> 501 | LClock0 = get_lclock(node()), 502 | LClock1 = increment_lclock(LClock0), 503 | NClock = #nclock{key = node(), lclock = LClock1}, 504 | {false, LocalKV, NClock} 505 | end. 506 | 507 | -spec(create_full_update(LocalKV :: kv(), RemoteMap :: riak_dt_map:dt_map(), 508 | RemoteVClock :: riak_dt_vclock:vclock()) -> 509 | {true, kv(), nclock()}). 510 | create_full_update(KV = #kv2{vclock = LocalVClock}, RemoteMap, RemoteVClock) -> 511 | Map1 = 512 | prometheus_summary:observe_duration( 513 | lashup, kv_crdt_op_seconds, [], 514 | fun () -> riak_dt_map:merge(RemoteMap, KV#kv2.map) end), 515 | VClock1 = riak_dt_vclock:merge([LocalVClock, RemoteVClock]), 516 | LClock0 = get_lclock(node()), 517 | LClock1 = increment_lclock(LClock0), 518 | KV1 = KV#kv2{map = Map1, vclock = VClock1, lclock = LClock1}, 519 | NClock = #nclock{key = node(), lclock = LClock1}, 520 | {true, KV1, NClock}. 521 | 522 | -spec(handle_full_update(map(), state()) -> state()). 523 | handle_full_update(_Payload = #{key := Key, vclock := RemoteVClock, map := RemoteMap}, State) -> 524 | Begin = erlang:monotonic_time(), 525 | Fun = mk_full_update_fun(Key, RemoteMap, RemoteVClock), 526 | {atomic, _} = mnesia:sync_transaction(Fun), 527 | prometheus_summary:observe( 528 | lashup, kv_full_update_seconds, [], 529 | erlang:monotonic_time() - Begin), 530 | notify_subscribers(Key, State). 531 | 532 | increment_lclock(N) -> 533 | N + 1. 534 | 535 | -spec(op_write(tuple()) -> ok). 536 | op_write(Record) -> 537 | Table = element(1, Record), 538 | prometheus_summary:observe_duration( 539 | lashup, kv_backend_write_seconds, [Table], 540 | fun () -> mnesia:write(Record) end). 541 | 542 | %%%=================================================================== 543 | %%% Pub/Sub functions 544 | %%%=================================================================== 545 | 546 | -spec(handle_subscribe(pid(), ets:comp_match_spec(), state()) -> 547 | {reference(), state()}). 
548 | handle_subscribe(Pid, CompMatchSpec, #state{subscribers=Subs}=State) -> 549 | MonRef = erlang:monitor(process, Pid), 550 | Subs0 = Subs#{Pid => {CompMatchSpec, MonRef}}, 551 | {MonRef, State#state{subscribers=Subs0}}. 552 | 553 | -spec(handle_unsubscribe(pid(), state()) -> state()). 554 | handle_unsubscribe(Pid, #state{subscribers=Subs}=State) -> 555 | case maps:find(Pid, Subs) of 556 | {ok, {_CompMatchSpec, MonRef}} -> 557 | _ = erlang:demonitor(MonRef, [flush]), 558 | State#state{subscribers=maps:remove(Pid, Subs)}; 559 | error -> 560 | State 561 | end. 562 | 563 | -spec(notify_subscribers(key(), state()) -> state()). 564 | notify_subscribers(Key, #state{subscribers=Subs}=State) -> 565 | mforeach(fun (Pid, {CompMatchSpec, Ref}) -> 566 | case ets:match_spec_run([{Key}], CompMatchSpec) of 567 | [true] -> Pid ! {lashup_kv_event, Ref, Key}; 568 | _Other -> ok 569 | end 570 | end, Subs), 571 | State. 572 | 573 | -spec(mforeach(Fun :: fun((Key, Value) -> term()), Map) -> ok 574 | when Map :: #{Key => Value} | maps:iterator(Key, Value), 575 | Key :: term(), Value :: term()). 576 | mforeach(Fun, Map) when is_map(Map) -> 577 | Iter = maps:iterator(Map), 578 | mforeach(Fun, Iter); 579 | mforeach(Fun, Iter) -> 580 | case maps:next(Iter) of 581 | {Key, Value, Iter0} -> 582 | _Result = Fun(Key, Value), 583 | mforeach(Fun, Iter0); 584 | none -> 585 | ok 586 | end. 587 | 588 | %%%=================================================================== 589 | %%% Metrics functions 590 | %%%=================================================================== 591 | 592 | -spec(init_metrics() -> ok). 593 | init_metrics() -> 594 | init_op_metrics(), 595 | init_kv_metrics(), 596 | init_backend_metrics(). 597 | 598 | -spec(init_op_metrics() -> ok). 599 | init_op_metrics() -> 600 | prometheus_summary:new([ 601 | {registry, lashup}, 602 | {name, kv_op_seconds}, 603 | {duration_unit, seconds}, 604 | {help, "The time spent processing KV operations."} 605 | ]), 606 | prometheus_summary:new([ 607 | {registry, lashup}, 608 | {name, kv_op_with_latency_seconds}, 609 | {duration_unit, seconds}, 610 | {help, "The time spent waiting for KV operation execution and " 611 | "actually executing it."} 612 | ]), 613 | prometheus_counter:new([ 614 | {registry, lashup}, 615 | {name, kv_message_queue_overflows_total}, 616 | {help, "Total number of messages dropped due to queue overflows."} 617 | ]). 618 | 619 | -spec(init_kv_metrics() -> ok). 620 | init_kv_metrics() -> 621 | prometheus_summary:new([ 622 | {registry, lashup}, 623 | {name, kv_full_update_seconds}, 624 | {duration_unit, seconds}, 625 | {help, "The time spent processing KV full updates."} 626 | ]), 627 | prometheus_gauge:new([ 628 | {registry, lashup}, 629 | {name, kv_message_queue_length}, 630 | {help, "The length of KV process message box."} 631 | ]), 632 | prometheus_summary:new([ 633 | {registry, lashup}, 634 | {name, kv_crdt_op_seconds}, 635 | {duration_unit, seconds}, 636 | {help, "The time spent merging/updating CRDT in KV process."} 637 | ]). 638 | 639 | -spec(init_backend_metrics() -> ok). 640 | init_backend_metrics() -> 641 | prometheus_summary:new([ 642 | {registry, lashup}, 643 | {name, kv_backend_read_seconds}, 644 | {labels, [table]}, 645 | {duration_unit, seconds}, 646 | {help, "The time spent reading data from KV backend."} 647 | ]), 648 | prometheus_summary:new([ 649 | {registry, lashup}, 650 | {name, kv_backend_write_seconds}, 651 | {labels, [table]}, 652 | {duration_unit, seconds}, 653 | {help, "The time spent writing data to KV backend."} 654 | ]). 
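%% Subscription sketch (function names are illustrative): match specs run
%% against {Key} tuples and must yield true to match. subscribe/1 above
%% replays one event per existing matching key, and flush/2 collapses queued
%% duplicates for a key.
%%
%%     -include_lib("stdlib/include/ms_transform.hrl").
%%
%%     watch_all() ->
%%         MatchSpec = ets:fun2ms(fun({_Key}) -> true end),
%%         {ok, Ref} = lashup_kv:subscribe(MatchSpec),
%%         watch_loop(Ref).
%%
%%     watch_loop(Ref) ->
%%         receive
%%             {lashup_kv_event, Ref, Key} ->
%%                 ok = lashup_kv:flush(Ref, Key),
%%                 io:format("~p -> ~p~n", [Key, lashup_kv:value(Key)]),
%%                 watch_loop(Ref)
%%         end.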
655 | -------------------------------------------------------------------------------- /src/lashup_kv_aae_mgr.erl: -------------------------------------------------------------------------------- 1 | %%% @doc 2 | %%% Waits for remote nodes to come up on hyparview, and then starts an 3 | %%% AAE sync FSM with them to attempt to synchronize the data. 4 | 5 | -module(lashup_kv_aae_mgr). 6 | -author("sdhillon"). 7 | -behaviour(gen_server). 8 | 9 | -include_lib("kernel/include/logger.hrl"). 10 | 11 | %% API 12 | -export([start_link/0]). 13 | 14 | %% gen_server callbacks 15 | -export([init/1, handle_call/3, 16 | handle_cast/2, handle_info/2]). 17 | 18 | %% A node must be connected for 30 seconds before we attempt AAE 19 | -define(AAE_AFTER, 30000). 20 | 21 | -record(state, { 22 | hyparview_event_ref, 23 | route_event_ref, 24 | route_event_timer_ref = make_ref(), 25 | active_view = [] 26 | }). 27 | 28 | 29 | -spec(start_link() -> 30 | {ok, Pid :: pid()} | ignore | {error, Reason :: term()}). 31 | start_link() -> 32 | gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). 33 | 34 | %%%=================================================================== 35 | %%% gen_server callbacks 36 | %%%=================================================================== 37 | 38 | init([]) -> 39 | {ok, HyparviewEventsRef} = lashup_hyparview_events:subscribe(), 40 | {ok, RouteEventsRef} = lashup_gm_route_events:subscribe(), 41 | timer:send_after(0, refresh), 42 | {ok, #state{hyparview_event_ref = HyparviewEventsRef, route_event_ref = RouteEventsRef}}. 43 | 44 | handle_call(_Request, _From, State) -> 45 | {reply, ok, State}. 46 | 47 | handle_cast(_Request, State) -> 48 | {noreply, State}. 49 | 50 | handle_info({lashup_hyparview_events, #{type := current_views, ref := EventRef, active_view := ActiveView}}, 51 | State0 = #state{hyparview_event_ref = EventRef}) -> 52 | State1 = State0#state{active_view = ActiveView}, 53 | refresh(ActiveView), 54 | {noreply, State1, lashup_utils:hibernate()}; 55 | handle_info({lashup_gm_route_events, #{ref := Ref}}, 56 | State = #state{route_event_ref = Ref, route_event_timer_ref = TimerRef}) -> 57 | erlang:cancel_timer(TimerRef), 58 | TimerRef0 = start_route_event_timer(), 59 | State0 = State#state{route_event_timer_ref = TimerRef0}, 60 | {noreply, State0, lashup_utils:hibernate()}; 61 | handle_info({timeout, Ref, route_event}, State = #state{route_event_timer_ref = Ref}) -> 62 | State0 = handle_route_event(State), 63 | {noreply, State0, lashup_utils:hibernate()}; 64 | handle_info(refresh, State = #state{active_view = ActiveView}) -> 65 | refresh(ActiveView), 66 | timer:send_after(lashup_config:aae_neighbor_check_interval(), refresh), 67 | {noreply, State, lashup_utils:hibernate()}; 68 | handle_info({start_child, Child}, State = #state{active_view = ActiveView}) -> 69 | maybe_start_child(Child, ActiveView), 70 | {noreply, State, lashup_utils:hibernate()}; 71 | handle_info(_Info, State) -> 72 | {noreply, State}. 
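%% Timing note for refresh/1 below: every active-view node without a tx child
%% is scheduled after trunc((1 + rand:uniform()) * lashup_config:aae_after()),
%% i.e. uniformly within [AaeAfter, 2 * AaeAfter), so with the 30s default a
%% newly connected neighbour first syncs 30-60 seconds after it appears, and
%% maybe_start_child/2 double-checks it is still in the active view by then.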
73 | 74 | %%%=================================================================== 75 | %%% Internal functions 76 | %%%=================================================================== 77 | 78 | refresh(ActiveView) -> 79 | AllChildren = supervisor:which_children(lashup_kv_aae_sup), 80 | TxChildren = [Id || {{tx, Id}, _Child, _Type, _Modules} <- AllChildren], 81 | ChildrenToStart = ActiveView -- TxChildren, 82 | lists:foreach( 83 | fun(Child) -> 84 | SleepTime = trunc((1 + rand:uniform()) * lashup_config:aae_after()), 85 | timer:send_after(SleepTime, {start_child, Child}) 86 | end, 87 | ChildrenToStart). 88 | 89 | maybe_start_child(Child, ActiveView) -> 90 | case lists:member(Child, ActiveView) of 91 | true -> 92 | lashup_kv_aae_sup:start_aae(Child); 93 | false -> 94 | ok 95 | end. 96 | 97 | handle_route_event(State) -> 98 | case lashup_gm_route:get_tree(node()) of 99 | {tree, Tree} -> 100 | UnreachableNodes = lashup_gm_route:unreachable_nodes(Tree), 101 | ?LOG_INFO("Purging nclock for nodes: ~p", [UnreachableNodes]), 102 | lists:foreach(fun(Node) -> 103 | mnesia:dirty_delete(nclock, Node) 104 | end, 105 | UnreachableNodes), 106 | State; 107 | Error -> 108 | ?LOG_WARNING("get_tree() call failed ~p", [Error]), 109 | TimerRef = start_route_event_timer(), 110 | State#state{route_event_timer_ref = TimerRef} 111 | end. 112 | 113 | start_route_event_timer() -> 114 | erlang:start_timer(lashup_config:aae_route_event_wait(), self(), route_event). 115 | -------------------------------------------------------------------------------- /src/lashup_kv_aae_sup.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_kv_aae_sup). 2 | -behaviour(supervisor). 3 | 4 | -export([ 5 | start_link/0, 6 | start_aae/1, 7 | receive_aae/2 8 | ]). 9 | -export([init/1]). 10 | 11 | -define(CHILD(I, Type), {I, {I, start_link, []}, permanent, 5000, Type, [I]}). 12 | 13 | start_link() -> 14 | supervisor:start_link({local, ?MODULE}, ?MODULE, []). 15 | 16 | start_aae(Node) -> 17 | ChildSpec = #{ 18 | id => {tx, Node}, 19 | start => {lashup_kv_sync_tx_fsm, start_link, [Node]}, 20 | restart => temporary, 21 | shutdown => 5000, 22 | type => worker, 23 | modules => [lashup_kv_sync_tx_fsm] 24 | }, 25 | supervisor:start_child(?MODULE, ChildSpec). 26 | 27 | receive_aae(Node, RemotePid) -> 28 | ChildSpec = #{ 29 | id => {rx, Node, erlang:unique_integer([positive, monotonic])}, 30 | start => {lashup_kv_sync_rx_fsm, start_link, [Node, RemotePid]}, 31 | restart => temporary, 32 | shutdown => 5000, 33 | type => worker, 34 | modules => [lashup_kv_sync_fsm] 35 | }, 36 | supervisor:start_child(?MODULE, ChildSpec). 37 | 38 | init([]) -> 39 | {ok, {#{}, []}}. 40 | -------------------------------------------------------------------------------- /src/lashup_kv_sup.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_kv_sup). 2 | -behaviour(supervisor). 3 | 4 | -export([start_link/0]). 5 | -export([init/1]). 6 | 7 | -define(CHILD(I, Type), {I, {I, start_link, []}, permanent, 5000, Type, [I]}). 8 | 9 | start_link() -> 10 | supervisor:start_link({local, ?MODULE}, ?MODULE, []). 11 | 12 | init([]) -> 13 | {ok, {#{}, [ 14 | ?CHILD(lashup_kv, worker), 15 | ?CHILD(lashup_kv_aae_sup, supervisor), 16 | ?CHILD(lashup_kv_aae_mgr, worker) 17 | ]}}. 
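%% Inspection sketch (illustrative, not part of lashup): the child ids used
%% by lashup_kv_aae_sup above make it easy to list the peers this node is
%% currently running tx syncs against, the same way refresh/1 in
%% lashup_kv_aae_mgr does.
%%
%%     tx_peers() ->
%%         [Node || {{tx, Node}, _Pid, worker, _Mods}
%%                  <- supervisor:which_children(lashup_kv_aae_sup)].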
18 | -------------------------------------------------------------------------------- /src/lashup_kv_sync_rx_fsm.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_kv_sync_rx_fsm). 2 | -author("sdhillon"). 3 | 4 | -behaviour(gen_statem). 5 | 6 | -include_lib("kernel/include/logger.hrl"). 7 | 8 | %% API 9 | -export([ 10 | start_link/2, 11 | init_metrics/0 12 | ]). 13 | 14 | 15 | %% Internal APIs 16 | -export([init/1, code_change/4, terminate/3, callback_mode/0]). 17 | 18 | -export([handle/3]). 19 | 20 | -record(state, {node, monitor_ref, remote_pid}). 21 | 22 | 23 | start_link(Node, RemotePID) -> 24 | gen_statem:start_link(?MODULE, [Node, RemotePID], []). 25 | 26 | %% Start in the initiator role 27 | 28 | init([Node, RemotePid]) -> 29 | MonitorRef = monitor(process, RemotePid), 30 | StateData = #state{node = Node, monitor_ref = MonitorRef, remote_pid = RemotePid}, 31 | {ok, handle, StateData, []}. 32 | 33 | callback_mode() -> 34 | state_functions. 35 | 36 | code_change(_OldVsn, OldState, OldData, _Extra) -> 37 | {ok, OldState, OldData}. 38 | 39 | terminate(Reason, State, _Data) -> 40 | ?LOG_WARNING("KV AAE RX FSM terminated (~p): ~p", [State, Reason]). 41 | 42 | handle(info, Message = #{from := RemotePID}, StateData = #state{remote_pid = RemotePID}) -> 43 | Size = erlang:external_size(Message), 44 | prometheus_counter:inc(lashup, aae_rx_messages_total, [], 1), 45 | prometheus_counter:inc(lashup, aae_rx_bytes_total, [], Size), 46 | rx_sync(info, Message, StateData); 47 | handle(Type, Message, StateData) -> 48 | rx_sync(Type, Message, StateData). 49 | 50 | rx_sync(info, Disconnect = {'DOWN', MonitorRef, _Type, _Object, _Info}, #state{monitor_ref = MonitorRef}) -> 51 | handle_disconnect(Disconnect); 52 | rx_sync(info, #{key := Key, from := RemotePID, message := keydata, vclock := VClock}, 53 | StateData = #state{remote_pid = RemotePID}) -> 54 | case lashup_kv:descends(Key, VClock) of 55 | false -> 56 | ?LOG_DEBUG("Synchronizing key ~p from ~p", [Key, node(RemotePID)]), 57 | request_key(Key, StateData); 58 | true -> 59 | ok 60 | end, 61 | keep_state_and_data; 62 | rx_sync(info, #{from := RemotePID, message := done}, #state{remote_pid = RemotePID}) -> 63 | Message = #{from => self(), message => rx_sync_complete}, 64 | erlang:send(RemotePID, Message, [noconnect]), 65 | erlang:garbage_collect(self()), 66 | keep_state_and_data; 67 | rx_sync(info, #{from := RemotePID}, #state{remote_pid = RemotePID}) -> 68 | Message = #{from => self(), message => unknown}, 69 | erlang:send(RemotePID, Message, [noconnect]), 70 | keep_state_and_data. 71 | 72 | request_key(Key, #state{remote_pid = RemotePID}) -> 73 | #{vclock := VClock, value := Map} = gen_statem:call(RemotePID, {request_key, Key}), 74 | sync_kv(Key, VClock, Map). 75 | 76 | sync_kv(Key, VClock, Map) -> 77 | gen_server:cast(lashup_kv, {maybe_update, Key, VClock, Map}). 78 | 79 | 80 | handle_disconnect({'DOWN', _MonitorRef, _Type, _Object, noconnection}) -> 81 | {stop, normal}; 82 | handle_disconnect({'DOWN', _MonitorRef, _Type, _Object, Reason}) -> 83 | ?LOG_WARNING("Lashup AAE TX Process disconnected: ~p", [Reason]), 84 | {stop, normal}. 85 | 86 | %%%=================================================================== 87 | %%% Metrics functions 88 | %%%=================================================================== 89 | 90 | -spec(init_metrics() -> ok). 
91 | init_metrics() -> 92 | prometheus_counter:new([ 93 | {registry, lashup}, 94 | {name, aae_rx_messages_total}, 95 | {help, "Total number of Active Anti-Entropy messages received " 96 | "by this node."} 97 | ]), 98 | prometheus_counter:new([ 99 | {registry, lashup}, 100 | {name, aae_rx_bytes_total}, 101 | {help, "Total size of Active Anti-Entropy messages in bytes " 102 | "received by this node."} 103 | ]). 104 | -------------------------------------------------------------------------------- /src/lashup_kv_sync_tx_fsm.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_kv_sync_tx_fsm). 2 | -author("sdhillon"). 3 | 4 | -behaviour(gen_statem). 5 | 6 | %% API 7 | -export([ 8 | start_link/1, 9 | init_metrics/0 10 | ]). 11 | 12 | -export([init/3, tx_sync/3, idle/3]). 13 | 14 | %% Internal APIs 15 | -export([init/1, code_change/4, terminate/3, callback_mode/0]). 16 | 17 | -include_lib("kernel/include/logger.hrl"). 18 | -include_lib("stdlib/include/ms_transform.hrl"). 19 | 20 | -record(state, {node, monitor_ref, remote_pid, lclock, maxclock}). 21 | 22 | 23 | start_link(Node) -> 24 | gen_statem:start_link(?MODULE, [Node], []). 25 | 26 | %% Start in the initiator role 27 | init([Node]) -> 28 | case lists:member(Node, nodes()) of 29 | true -> 30 | {ok, init, [Node], {timeout, 0, init}}; 31 | false -> 32 | {stop, node_disconnected} 33 | end. 34 | 35 | callback_mode() -> 36 | state_functions. 37 | 38 | init(timeout, init, [Node]) -> 39 | case gen_server:call({lashup_kv, Node}, {start_kv_sync_fsm, node(), self()}) of 40 | {error, unknown_request} -> 41 | {stop, remote_node_no_aae}; 42 | {error, Reason} -> 43 | {stop, {other_error, Reason}}; 44 | {ok, RemoteChildPid} -> 45 | LClock = lashup_kv:read_lclock(Node), 46 | MonitorRef = monitor(process, RemoteChildPid), 47 | StateData = #state{node = Node, monitor_ref = MonitorRef, 48 | remote_pid = RemoteChildPid, lclock = LClock, maxclock = LClock}, 49 | {next_state, tx_sync, StateData, [{next_event, internal, start_sync}]} 50 | end. 
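%% Protocol note: the start_kv_sync_fsm call above is served by
%% lashup_kv:handle_call/3 on the remote node, which spawns a
%% lashup_kv_sync_rx_fsm via lashup_kv_aae_sup:receive_aae/2 and returns its
%% pid. From then on the two FSMs talk directly: tx streams one keydata
%% message (key plus vclock) per candidate key, rx calls back with
%% {request_key, Key} for any vclock it does not already descend, and once tx
%% signals done, rx answers rx_sync_complete so tx can persist the highest
%% lclock it saw and go idle until the next scheduled sync.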
51 | 52 | tx_sync(info, Disconnect = {'DOWN', MonitorRef, _Type, _Object, _Info}, #state{monitor_ref = MonitorRef}) -> 53 | handle_disconnect(Disconnect); 54 | 55 | tx_sync({call, From}, {request_key, Key}, _) -> 56 | #{key := Key, vclock := VClock, value := Value} = lashup_kv:raw_value(Key), 57 | gen_statem:reply(From, #{vclock => VClock, value => Value}), 58 | keep_state_and_data; 59 | 60 | tx_sync(info, #{from := RemotePID, message := rx_sync_complete}, 61 | StateData = #state{node = Node, remote_pid = RemotePID, maxclock = MaxClock}) -> 62 | case lashup_kv:write_lclock(Node, MaxClock) of 63 | ok -> 64 | {next_state, idle, StateData, [{next_event, internal, reschedule_sync}]}; 65 | {error, Reason} -> 66 | {stop, Reason} 67 | end; 68 | 69 | tx_sync(internal, start_sync, StateData = #state{maxclock = MaxClock}) -> 70 | LClock = MaxClock, 71 | NextKey = maybe_fetch_next_key(lashup_kv:first_key(), LClock), 72 | defer_sync_key(NextKey), 73 | {keep_state, StateData#state{lclock = LClock}}; 74 | 75 | tx_sync(cast, {sync, '$end_of_table'}, #state{remote_pid = RemotePID}) -> 76 | finish_sync(RemotePID), 77 | keep_state_and_data; 78 | 79 | tx_sync(cast, {sync, Key}, StateData = #state{remote_pid = RemotePID, lclock = LClock, maxclock = MaxClock0}) -> 80 | KeyClock = send_key_vclock(Key, RemotePID), 81 | NextKey = maybe_fetch_next_key(lashup_kv:next_key(Key), LClock), 82 | defer_sync_key(NextKey), 83 | MaxClock1 = erlang:max(KeyClock, MaxClock0), 84 | {keep_state, StateData#state{maxclock = MaxClock1}}. 85 | 86 | idle(info, do_sync, StateData = #state{node = RemoteNode}) -> 87 | ?LOG_INFO("Starting tx sync with ~p", [RemoteNode]), 88 | {next_state, tx_sync, StateData, [{next_event, internal, start_sync}]}; 89 | 90 | idle(internal, reschedule_sync, #state{node = RemoteNode}) -> 91 | BaseAAEInterval = lashup_config:aae_interval(), 92 | NextSync = trunc(BaseAAEInterval * (1 + rand:uniform())), 93 | ?LOG_INFO("Scheduling sync with ~p in ~p milliseconds", [RemoteNode, NextSync]), 94 | timer:send_after(NextSync, do_sync), 95 | keep_state_and_data; 96 | 97 | idle(info, Disconnect = {'DOWN', MonitorRef, _Type, _Object, _Info}, #state{monitor_ref = MonitorRef}) -> 98 | handle_disconnect(Disconnect). 99 | 100 | code_change(_OldVsn, OldState, OldData, _Extra) -> 101 | {ok, OldState, OldData}. 102 | 103 | terminate(Reason, State, _Data) -> 104 | ?LOG_WARNING("KV AAE TX FSMs terminated (~p): ~p", [State, Reason]). 105 | 106 | finish_sync(RemotePID) -> 107 | erlang:garbage_collect(self()), 108 | %% This is to ensure that all messages have flushed 109 | Message = #{from => self(), message => done}, 110 | send(RemotePID, Message). 111 | 112 | send_key_vclock(Key, RemotePID) -> 113 | #{vclock := VClock, lclock := KeyClock} = lashup_kv:raw_value(Key), 114 | Message = #{from => self(), key => Key, vclock => VClock, message => keydata}, 115 | send(RemotePID, Message), 116 | KeyClock. 117 | 118 | defer_sync_key(Key) -> 119 | Sleep = trunc((rand:uniform() + 0.5) * 10), 120 | timer:apply_after(Sleep, gen_statem, cast, [self(), {sync, Key}]). 121 | 122 | maybe_fetch_next_key(Key, _) when Key == '$end_of_table' -> 123 | Key; 124 | maybe_fetch_next_key(Key, LClock) -> 125 | #{lclock := KeyClock} = lashup_kv:raw_value(Key), 126 | maybe_fetch_next_key(Key, KeyClock, LClock). 127 | 128 | maybe_fetch_next_key(Key, KeyClock, LClock) when KeyClock >= LClock -> 129 | Key; 130 | maybe_fetch_next_key(Key, _, LClock) -> 131 | NextKey = lashup_kv:next_key(Key), 132 | maybe_fetch_next_key(NextKey, LClock). 
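%% Note on the lclock filter above: read_lclock/1 yields the highest logical
%% clock persisted for this peer after its last completed sync, and
%% maybe_fetch_next_key/2 skips every key whose stored lclock is older, so a
%% steady-state resync only walks keys written since the previous exchange
%% instead of the whole table.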
133 | 134 | handle_disconnect({'DOWN', _MonitorRef, _Type, _Object, noconnection}) -> 135 | {stop, normal}; 136 | handle_disconnect({'DOWN', _MonitorRef, _Type, _Object, Reason}) -> 137 | ?LOG_WARNING("Lashup AAE RX Process disconnected: ~p", [Reason]), 138 | {stop, normal}. 139 | 140 | send(RemotePID, Message) -> 141 | try 142 | erlang:send(RemotePID, Message, [noconnect]) 143 | after 144 | Size = erlang:external_size(Message), 145 | prometheus_counter:inc(lashup, aae_tx_messages_total, [], 1), 146 | prometheus_counter:inc(lashup, aae_tx_bytes_total, [], Size) 147 | end. 148 | 149 | %%%=================================================================== 150 | %%% Metrics functions 151 | %%%=================================================================== 152 | 153 | -spec(init_metrics() -> ok). 154 | init_metrics() -> 155 | prometheus_counter:new([ 156 | {registry, lashup}, 157 | {name, aae_tx_messages_total}, 158 | {help, "Total number of Active Anti-Entropy messages sent by " 159 | "this node."} 160 | ]), 161 | prometheus_counter:new([ 162 | {registry, lashup}, 163 | {name, aae_tx_bytes_total}, 164 | {help, "Total size of Active Anti-Entropy messages in bytes " 165 | "sent by this node."} 166 | ]). 167 | -------------------------------------------------------------------------------- /src/lashup_platform_sup.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_platform_sup). 2 | -behaviour(supervisor). 3 | 4 | -export([start_link/0]). 5 | -export([init/1]). 6 | 7 | -define(CHILD(I, Type), {I, {I, start_link, []}, permanent, 5000, Type, [I]}). 8 | 9 | start_link() -> 10 | supervisor:start_link({local, ?MODULE}, ?MODULE, []). 11 | 12 | init([]) -> 13 | {ok, {#{}, [ 14 | ?CHILD(lashup_kv_sup, supervisor) 15 | ]}}. 16 | 17 | -------------------------------------------------------------------------------- /src/lashup_save.erl: -------------------------------------------------------------------------------- 1 | %% Borrowed from: https://github.com/basho/riak_ensemble/blob/develop/src/riak_ensemble_save.erl 2 | %% ------------------------------------------------------------------- 3 | %% 4 | %% Copyright (c) 2013 Basho Technologies, Inc. All Rights Reserved. 5 | %% 6 | %% This file is provided to you under the Apache License, 7 | %% Version 2.0 (the "License"); you may not use this file 8 | %% except in compliance with the License. You may obtain 9 | %% a copy of the License at 10 | %% 11 | %% http://www.apache.org/licenses/LICENSE-2.0 12 | %% 13 | %% Unless required by applicable law or agreed to in writing, 14 | %% software distributed under the License is distributed on an 15 | %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | %% KIND, either express or implied. See the License for the 17 | %% specific language governing permissions and limitations 18 | %% under the License. 19 | %% 20 | %% ------------------------------------------------------------------- 21 | 22 | %% @doc 23 | %% Provide a safe method of saving data to disk along with a checksum 24 | %% that is verified on read. Additionally, four replicas of the data 25 | %% are stored across two files for greater redundancy/durability. 26 | 27 | -module(lashup_save). 28 | -export([write/2, read/1]). 29 | 30 | %%=================================================================== 31 | 32 | -spec write(file:filename(), binary()) -> ok | {error, term()}. 
33 | write(File, Data) -> 34 | CRC = erlang:crc32(Data), 35 | Size = byte_size(Data), 36 | Meta = <<CRC:32/integer, Size:32/integer>>, 37 | Out = [Meta, Data, %% copy 1 38 | Data, Meta], %% copy 2 39 | ok = filelib:ensure_dir(File), 40 | try 41 | _ = Out, 42 | ok = lashup_utils:replace_file(File, Out), 43 | ok = lashup_utils:replace_file(File ++ ".backup", Out), 44 | ok 45 | catch 46 | _:Err -> 47 | {error, Err} 48 | end. 49 | 50 | -spec read(file:filename()) -> {ok, binary()} | not_found. 51 | read(File) -> 52 | case do_read(File) of 53 | not_found -> 54 | do_read(File ++ ".backup"); 55 | Result -> 56 | Result 57 | end. 58 | 59 | %%=================================================================== 60 | 61 | -spec do_read(file:filename()) -> {ok, binary()} | not_found. 62 | do_read(File) -> 63 | case lashup_utils:read_file(File) of 64 | {ok, Binary} -> 65 | safe_read(Binary); 66 | {error, _} -> 67 | not_found 68 | end. 69 | 70 | -spec safe_read(binary()) -> {ok, binary()} | not_found. 71 | safe_read(<<CRC:32/integer, Size:32/integer, Data:Size/binary, Rest/binary>>) -> 72 | case erlang:crc32(Data) of 73 | CRC -> 74 | {ok, Data}; 75 | _ -> 76 | safe_read_backup(Rest) 77 | end; 78 | safe_read(Binary) -> 79 | safe_read_backup(Binary). 80 | 81 | -spec safe_read_backup(binary()) -> {ok, binary()} | not_found. 82 | safe_read_backup(Binary) when byte_size(Binary) =< 8 -> 83 | not_found; 84 | safe_read_backup(Binary) -> 85 | BinSize = byte_size(Binary), 86 | Skip = BinSize - 8, 87 | <<_:Skip/binary, CRC:32/integer, Size:32/integer>> = Binary, 88 | Skip2 = Skip - Size, 89 | case Binary of 90 | <<_:Skip2/binary, Data:Size/binary, _:8/binary>> -> 91 | case erlang:crc32(Data) of 92 | CRC -> 93 | {ok, Data}; 94 | _ -> 95 | not_found 96 | end; 97 | _ -> 98 | not_found 99 | end. 100 | -------------------------------------------------------------------------------- /src/lashup_sup.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_sup). 2 | -behaviour(supervisor). 3 | 4 | -export([start_link/0]). 5 | -export([init/1]). 6 | 7 | -define(CHILD(I, Type), {I, {I, start_link, []}, permanent, 5000, Type, [I]}). 8 | 9 | start_link() -> 10 | supervisor:start_link({local, ?MODULE}, ?MODULE, []). 11 | 12 | init([]) -> 13 | lashup_kv:init_metrics(), 14 | lashup_kv_sync_rx_fsm:init_metrics(), 15 | lashup_kv_sync_tx_fsm:init_metrics(), 16 | lashup_gm_mc:init_metrics(), 17 | lashup_gm:init_metrics(), 18 | lashup_gm_route:init_metrics(), 19 | lashup_hyparview_membership:init_metrics(), 20 | 21 | {ok, {#{strategy => rest_for_one}, [ 22 | ?CHILD(lashup_core_sup, supervisor), 23 | ?CHILD(lashup_platform_sup, supervisor) 24 | ]}}. 25 | -------------------------------------------------------------------------------- /src/lashup_utils.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_utils). 2 | -author("sdhillon"). 3 | 4 | -include_lib("kernel/include/file.hrl"). 5 | 6 | %% API 7 | -export([ 8 | seed/0, 9 | shuffle_list/2, 10 | new_window/1, 11 | add_tick/1, 12 | count_ticks/1, 13 | compare_vclocks/2, 14 | subtract/2, 15 | shuffle_list/1, 16 | replace_file/2, 17 | read_file/1, 18 | hibernate/0 19 | ]). 20 | 21 | -export_type([window/0]). 22 | 23 | -record(window, { 24 | samples = [] :: list(integer()), 25 | window_time = 0 :: non_neg_integer()}). 26 | -type window() :: #window{}. 27 | 28 | 29 | -spec(seed() -> rand:state()). 30 | seed() -> 31 | rand:seed(exsplus). 32 | 33 | -spec shuffle_list(List, Seed :: rand:state()) -> List1 when 34 | List :: [T, ...], 35 | List1 :: [T, ...], 36 | T :: term().
37 | shuffle_list(List, FixedSeed) -> 38 | {_, PrefixedList} = 39 | lists:foldl(fun(X, {SeedState, Acc}) -> 40 | {N, SeedState1} = rand:uniform_s(1000000, SeedState), 41 | {SeedState1, [{N, X} | Acc]} 42 | end, 43 | {FixedSeed, []}, 44 | List), 45 | PrefixedListSorted = lists:sort(PrefixedList), 46 | [Value || {_N, Value} <- PrefixedListSorted]. 47 | 48 | -spec shuffle_list(List) -> List1 when 49 | List :: [T, ...], 50 | List1 :: [T, ...], 51 | T :: term(). 52 | shuffle_list(List) -> 53 | PrefixedList = [{rand:uniform(1000000), Item} || Item <- List], 54 | PrefixedListSorted = lists:sort(PrefixedList), 55 | [Value || {_N, Value} <- PrefixedListSorted]. 56 | 57 | -spec(new_window(WindowTime :: non_neg_integer()) -> window()). 58 | new_window(WindowTime) -> 59 | #window{window_time = WindowTime}. 60 | 61 | -spec(add_tick(window()) -> window()). 62 | add_tick(Window = #window{window_time = WindowTime, samples = Samples}) -> 63 | Sample = erlang:monotonic_time(milli_seconds), 64 | Now = erlang:monotonic_time(milli_seconds), 65 | Samples1 = [Sample|Samples], 66 | {Samples2, _} = lists:splitwith(fun(X) -> X > Now - WindowTime end, Samples1), 67 | Window#window{samples = Samples2}. 68 | 69 | -spec(count_ticks(window()) -> non_neg_integer()). 70 | count_ticks(_Window = #window{window_time = WindowTime, samples = Samples}) -> 71 | Now = erlang:monotonic_time(milli_seconds), 72 | {Samples1, _} = lists:splitwith(fun(X) -> X > Now - WindowTime end, Samples), 73 | length(Samples1). 74 | 75 | -spec(compare_vclocks(V1 :: riak_dt_vclock:vclock(), V2 :: riak_dt_vclock:vclock()) -> gt | lt | equal | concurrent). 76 | compare_vclocks(V1, V2) -> 77 | %% V1 dominates V2 78 | DominatesGT = riak_dt_vclock:dominates(V1, V2), 79 | DominatesLT = riak_dt_vclock:dominates(V2, V1), 80 | Equal = riak_dt_vclock:equal(V1, V2), 81 | case {DominatesGT, DominatesLT, Equal} of 82 | {true, _, _} -> 83 | gt; 84 | {_, true, _} -> 85 | lt; 86 | {_, _, true} -> 87 | equal; 88 | {_, _, _} -> 89 | concurrent 90 | end. 91 | 92 | -spec(subtract(List1, List2) -> Set when 93 | List1 :: [Type, ...], 94 | List2 :: [Type, ...], 95 | Set :: ordsets:ordset(Type), 96 | Type :: term()). 97 | 98 | %% @doc 99 | %% This is equivalent to lists:subtract 100 | %% This comes from a bunch of empirical benchmarking 101 | %% That for small sets, it's cheaper to do ordsets:from_list 102 | %% and use that subtraction method 103 | %% It returns a sorted set back 104 | %% @end 105 | subtract(List1, List2) -> 106 | List1Set = ordsets:from_list(List1), 107 | List2Set = ordsets:from_list(List2), 108 | ordsets:subtract(List1Set, List2Set). 109 | 110 | %% Borrowed from: https://github.com/basho/riak_ensemble/blob/develop/src/riak_ensemble_util.erl 111 | -spec replace_file(file:filename(), iodata()) -> ok | {error, term()}. 112 | replace_file(FN, Data) -> 113 | TmpFN = FN ++ ".tmp", 114 | {ok, FH} = file:open(TmpFN, [write, raw]), 115 | try 116 | ok = file:write(FH, Data), 117 | ok = file:sync(FH), 118 | ok = file:close(FH), 119 | ok = file:rename(TmpFN, FN), 120 | {ok, Contents} = read_file(FN), 121 | true = (Contents == iolist_to_binary(Data)), 122 | ok 123 | catch _:Err -> 124 | {error, Err} 125 | end. 126 | 127 | %%=================================================================== 128 | 129 | %% @doc Similar to {@link file:read_file/1} but uses raw file I/O 130 | -spec read_file(file:filename()) -> {ok, binary()} | {error, _}. 
131 | read_file(FName) -> 132 | case file:open(FName, [read, raw, binary]) of 133 | {ok, FD} -> 134 | Result = read_file(FD, []), 135 | ok = file:close(FD), 136 | case Result of 137 | {ok, IOList} -> 138 | {ok, iolist_to_binary(IOList)}; 139 | {error, _} = Err -> 140 | Err 141 | end; 142 | {error, _} = Err -> 143 | Err 144 | end. 145 | 146 | -spec read_file(file:fd(), [binary()]) -> {ok, [binary()]} | {error, _}. 147 | read_file(FD, Acc) -> 148 | case file:read(FD, 4096) of 149 | {ok, Data} -> 150 | read_file(FD, [Data | Acc]); 151 | eof -> 152 | {ok, lists:reverse(Acc)}; 153 | {error, _} = Err -> 154 | Err 155 | end. 156 | 157 | %%=================================================================== 158 | 159 | % NOTE: The function stores its state in the process dictionary. 160 | -spec(hibernate() -> hibernate | infinity). 161 | hibernate() -> 162 | Now = erlang:monotonic_time(millisecond), 163 | Timeout = gc_timeout(), 164 | case erlang:get(gc_at) of 165 | undefined -> 166 | erlang:put(gc_at, Now), 167 | infinity; 168 | Time when Now - Time >= Timeout -> 169 | erlang:put(gc_at, Now), 170 | hibernate; 171 | _Time -> 172 | infinity 173 | end. 174 | 175 | -spec(gc_timeout() -> timeout()). 176 | gc_timeout() -> 177 | case erlang:get(gc_timeout) of 178 | undefined -> 179 | Timeout = lashup_config:gc_timeout(), 180 | erlang:put(gc_timeout, Timeout), 181 | Timeout; 182 | Timeout -> 183 | Timeout 184 | end. 185 | -------------------------------------------------------------------------------- /test/lashup_gm_route_SUITE.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_gm_route_SUITE). 2 | -author("sdhillon"). 3 | 4 | -include_lib("common_test/include/ct.hrl"). 5 | 6 | %% API 7 | -export([all/0]). 8 | -export([init_per_testcase/2, end_per_testcase/2]). 9 | 10 | %% Testcases 11 | -export([benchmark/1, basic_events/1, busy_wait_events/1]). 12 | 13 | all() -> [benchmark, basic_events, busy_wait_events]. 14 | 15 | init_per_testcase(_TestCase, Config) -> 16 | ok = application:start(prometheus), 17 | ok = lashup_gm_route:init_metrics(), 18 | {ok, _} = lashup_gm_route:start_link(), 19 | {ok, _} = lashup_gm_route_events:start_link(), 20 | Config. 21 | 22 | end_per_testcase(_TestCase, _Config) -> 23 | gen_event:stop(lashup_gm_route_events), 24 | lashup_gm_route:stop(), 25 | ok = application:stop(prometheus). 26 | 27 | %% Create a list of X nodes 28 | %% Until every node has N adjacencies, choose a node another node from the list and make then adjacent 29 | generate_graph(Nodes) -> 30 | Tuple = erlang:make_tuple(Nodes, []), 31 | populate_graph(1, Tuple). 32 | 33 | find_candidates(N, Tuple, _Exempt, Acc) when N > size(Tuple) -> 34 | Acc; 35 | find_candidates(N, Tuple, Exempt, Acc) when N == Exempt -> 36 | find_candidates(N + 1, Tuple, Exempt, Acc); 37 | find_candidates(N, Tuple, Exempt, Acc) when length(element(N, Tuple)) < 5 -> 38 | case ordsets:is_element(Exempt, element(N, Tuple)) of 39 | true -> 40 | find_candidates(N + 1, Tuple, Exempt, Acc); 41 | false -> 42 | find_candidates(N + 1, Tuple, Exempt, [N|Acc]) 43 | end; 44 | find_candidates(N, Tuple, Exempt, Acc) -> 45 | find_candidates(N + 1, Tuple, Exempt, Acc). 
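%% find_candidates/4 returns every node other than Exempt whose degree is
%% still below 5 and which is not yet adjacent to Exempt; populate_graph/2
%% below keeps pairing such nodes until no candidates remain, yielding a
%% random test topology with maximum degree 5.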
46 | 47 | populate_graph(N, Tuple) when N > size(Tuple) -> 48 | Tuple; 49 | populate_graph(N, Tuple) when length(element(N, Tuple)) < 5 -> 50 | Candidates = find_candidates(1, Tuple, N, []), 51 | case Candidates of 52 | [] -> 53 | Tuple; 54 | _ -> 55 | Candidate = lists:nth(rand:uniform(length(Candidates)), Candidates), 56 | OldLocalElements = element(N, Tuple), 57 | OldCandidateElements = element(Candidate, Tuple), 58 | NewLocalElements = [Candidate|OldLocalElements], 59 | NewCandidateElements = [N|OldCandidateElements], 60 | Tuple1 = setelement(N, Tuple, NewLocalElements), 61 | Tuple2 = setelement(Candidate, Tuple1, NewCandidateElements), 62 | populate_graph(1, Tuple2) 63 | end; 64 | populate_graph(N, Tuple) -> 65 | populate_graph(N + 1, Tuple). 66 | 67 | insert_graph(N, Tuple) when N > size(Tuple) -> 68 | ok; 69 | insert_graph(N, Tuple) -> 70 | AdjNodes = [{node, X} || X <- element(N, Tuple)], 71 | lashup_gm_route:update_node({node, N}, AdjNodes), 72 | insert_graph(N + 1, Tuple). 73 | 74 | get_tree() -> 75 | {Val, _} = timer:tc(lashup_gm_route, get_tree, [{node, 10}, 50000]), 76 | timer:sleep(500), 77 | ct:pal("Time: ~p", [Val]). 78 | 79 | benchmark(_Config) -> 80 | Graph = generate_graph(1000), 81 | %eprof:start(), 82 | %eprof:start_profiling([whereis(lashup_gm_route)]), 83 | %fprof:start(), 84 | insert_graph(1, Graph), 85 | %fprof:trace([start, {file, "fprof.trace"}, verbose, {procs, [whereis(lashup_gm_route)]}]), 86 | get_tree(), 87 | lashup_gm_route:update_node(foo, [1]), 88 | get_tree(), 89 | lashup_gm_route:update_node(foo, [2]), 90 | get_tree(), 91 | lashup_gm_route:update_node(foo, [1]), 92 | get_tree(), 93 | lashup_gm_route:update_node(foo, [2]), 94 | get_tree(), 95 | lashup_gm_route:update_node(foo, [1]), 96 | get_tree(). 97 | %fprof:trace([stop]), 98 | %fprof:profile({file, "fprof.trace"}), 99 | %ct:pal("Time: ~p", [Val]). 100 | %fprof:analyse([totals, {dest, "fprof.analysis"}]). 101 | 102 | % eprof:stop_profiling(), 103 | % eprof:log("eprof.log"), 104 | % eprof:analyze(). 105 | 106 | basic_events(_Config) -> 107 | lashup_gm_route:update_node(node(), [1]), 108 | lashup_gm_route:update_node(500, [150]), 109 | lashup_gm_route:update_node(500, [200]), 110 | {ok, Ref} = lashup_gm_route_events:subscribe(), 111 | true = (count_events(Ref) > 0), 112 | lashup_gm_route:update_node(500, [150]), 113 | 1 = count_events(Ref). 114 | 115 | count_events(Ref) -> count_events(Ref, 0). 116 | 117 | count_events(Ref, Acc) -> 118 | receive 119 | {lashup_gm_route_events, _Event = #{ref := Ref}} -> 120 | count_events(Ref, Acc + 1) 121 | after 500 -> 122 | Acc 123 | end. 124 | 125 | busy_wait_events(_Config) -> 126 | lashup_gm_route:update_node(node(), [1]), 127 | {ok, Ref} = lashup_gm_route_events:subscribe(), 128 | lists:foreach(fun(X) -> lashup_gm_route:update_node(1, [X]) end, lists:seq(1, 50)), 129 | true = (count_events(Ref) < 20), 130 | timer:sleep(2000), 131 | lashup_gm_route:update_node(node(), [1, 2]), 132 | Num = count_events(Ref), 133 | true = (Num < 5) andalso (Num > 0). 134 | -------------------------------------------------------------------------------- /test/lashup_hyparview_SUITE.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_hyparview_SUITE). 2 | -author("sdhillon"). 3 | 4 | -include_lib("common_test/include/ct.hrl"). 5 | -include_lib("eunit/include/eunit.hrl"). 6 | 7 | -export([ 8 | all/0, 9 | init_per_testcase/2, end_per_testcase/2, 10 | init_per_suite/1, end_per_suite/1 11 | ]). 
12 | 13 | -export([ 14 | hyparview_test/1, 15 | hyparview_random_kill_test/1, 16 | ping_test/1, 17 | failure_test/1, 18 | mc_test/1, 19 | kv_test/1 20 | ]). 21 | 22 | -define(MAX_MC_REPLICATION, 3). 23 | 24 | all() -> [ 25 | hyparview_test, 26 | hyparview_random_kill_test, 27 | ping_test, 28 | failure_test, 29 | mc_test, 30 | kv_test 31 | ]. 32 | 33 | -define(MASTERS, [master1, master2, master3]). 34 | -define(AGENTS, [agent1, agent2, agent3, agent4, agent5, agent6, agent7]). 35 | 36 | init_per_suite(Config) -> 37 | os:cmd(os:find_executable("epmd") ++ " -daemon"), 38 | {ok, Hostname} = inet:gethostname(), 39 | case net_kernel:start([list_to_atom("runner@" ++ Hostname), shortnames]) of 40 | {ok, _} -> ok; 41 | {error, {already_started, _}} -> ok 42 | end, 43 | [{hostname, Hostname}| Config]. 44 | 45 | end_per_suite(Config) -> 46 | net_kernel:stop(), 47 | Config. 48 | 49 | init_per_testcase(TestCase, Config) -> 50 | ct:pal("Starting Testcase: ~p", [TestCase]), 51 | Nodes = start_nodes(?MASTERS ++ ?AGENTS), 52 | configure_nodes(Nodes, masters(Nodes)), 53 | [{nodes, Nodes} | Config]. 54 | 55 | end_per_testcase(_, Config) -> 56 | stop_nodes(?config(nodes, Config)), 57 | %% remove lashup and mnesia directory 58 | os:cmd("rm -rf *@" ++ ?config(hostname, Config)), 59 | Config. 60 | 61 | masters(Nodes) -> 62 | element(1, lists:split(length(?MASTERS), Nodes)). 63 | 64 | agents(Nodes) -> 65 | element(2, lists:split(length(?MASTERS), Nodes)). 66 | 67 | start_nodes(Nodes) -> 68 | Opts = [{monitor_master, true}, {erl_flags, "-connect_all false"}], 69 | Result = [ct_slave:start(Node, Opts) || Node <- Nodes], 70 | NodeNames = [NodeName || {ok, NodeName} <- Result], 71 | lists:foreach(fun(Node) -> pong = net_adm:ping(Node) end, NodeNames), 72 | NodeNames. 73 | 74 | configure_nodes(Nodes, Masters) -> 75 | Env = [lashup, contact_nodes, Masters], 76 | {_, []} = rpc:multicall(Nodes, code, add_pathsa, [code:get_path()]), 77 | {_, []} = rpc:multicall(Nodes, application, set_env, Env). 78 | 79 | stop_nodes(Nodes) -> 80 | StoppedResult = [ct_slave:stop(Node) || Node <- Nodes], 81 | lists:foreach(fun(Node) -> pang = net_adm:ping(Node) end, Nodes), 82 | ct:pal("Stopped result: ~p", [StoppedResult]). 83 | 84 | hyparview_test(Config) -> 85 | Nodes = ?config(nodes, Config), 86 | {_, []} = rpc:multicall(Nodes, application, ensure_all_started, [lashup]), 87 | LeftOverTime = wait_for_convergence(60000, 5000, Nodes), 88 | ct:pal("Converged in ~p milliseconds", [60000 - LeftOverTime]), 89 | ok. 90 | 91 | hyparview_random_kill_test(Config) -> 92 | Nodes = ?config(nodes, Config), 93 | hyparview_test(Config), 94 | kill_nodes(Nodes, length(Nodes) * 2), 95 | LeftOverTime = wait_for_convergence(60000, 5000, Nodes), 96 | ct:pal("ReConverged in ~p milliseconds", [60000 - LeftOverTime]), 97 | ok. 98 | 99 | kill_nodes(_, 0) -> 100 | ok; 101 | kill_nodes(Nodes, Remaining) -> 102 | Idx = rand:uniform(length(Nodes)), 103 | Node = lists:nth(Idx, Nodes), 104 | ct:pal("Killing node: ~p", [Node]), 105 | RemotePid = rpc:call(Node, erlang, whereis, [lashup_hyparview_membership]), 106 | exit(RemotePid, kill), 107 | timer:sleep(5000), 108 | kill_nodes(Nodes, Remaining - 1). 109 | 110 | ping_test(Config) -> 111 | hyparview_test(Config), 112 | ok = stop_start_nodes(?config(nodes, Config), 10), 113 | ok. 
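%% stop_start_nodes/2 (below) repeatedly stops one random node, measures how
%% long the surviving nodes take to lose their route to it, restarts it, and
%% waits for the cluster to reconverge before the next iteration.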
114 | 115 | stop_start_nodes(_, 0) -> 116 | ok; 117 | stop_start_nodes(Nodes, Remaining) -> 118 | Node = random_node(Nodes), 119 | ct:pal("Stopping: ~s", [Node]), 120 | stop_nodes([Node]), 121 | RestNodes = lists:delete(Node, Nodes), 122 | Now = erlang:monotonic_time(), 123 | wait_for_unreachability(Node, RestNodes, Now), 124 | Now2 = erlang:monotonic_time(), 125 | DetectTime = erlang:convert_time_unit(Now2 - Now, native, milli_seconds), 126 | ct:pal("Failure detection in ~p ms", [DetectTime]), 127 | start_nodes([Node]), 128 | configure_nodes([Node], masters(Nodes)), 129 | rpc:call(Node, application, ensure_all_started, [lashup]), 130 | ct:pal("Starting: ~s", [Node]), 131 | wait_for_convergence(60000, 5000, Nodes), 132 | stop_start_nodes(Nodes, Remaining - 1). 133 | 134 | random_node(Nodes) -> 135 | Idx = rand:uniform(length(Nodes)), 136 | lists:nth(Idx, Nodes). 137 | 138 | wait_for_unreachability(DstNode, RestNodes, Now) -> 139 | SrcNode = random_node(RestNodes), 140 | Now2 = erlang:monotonic_time(), 141 | case erlang:convert_time_unit(Now2 - Now, native, seconds) of 142 | Time when Time > 10 -> 143 | exit(too_much_time); 144 | _ -> 145 | case rpc:call(SrcNode, lashup_gm_route, path_to, [DstNode]) of 146 | false -> 147 | ok; 148 | Else -> 149 | ct:pal("Node still reachable: ~p", [Else]), 150 | timer:sleep(100), 151 | wait_for_unreachability(DstNode, RestNodes, Now) 152 | end 153 | end. 154 | 155 | 156 | failure_test(Config) -> 157 | hyparview_test(Config), 158 | ct:pal("Testing failure conditions"), 159 | Nodes = ?config(nodes, Config), 160 | {Nodes1, Nodes2} = split(masters(Nodes), agents(Nodes)), 161 | ct:pal("Splitting networks ~p ~p", [Nodes1, Nodes2]), 162 | {_, []} = rpc:multicall(Nodes1, net_kernel, allow, [[node() | Nodes1]]), 163 | {_, []} = rpc:multicall(Nodes2, net_kernel, allow, [[node() | Nodes2]]), 164 | lists:foreach(fun(Node) -> 165 | {_, []} = rpc:multicall(Nodes1, erlang, disconnect_node, [Node]) 166 | end, Nodes2), 167 | lists:foreach(fun(Node) -> 168 | {_, []} = rpc:multicall(Nodes2, erlang, disconnect_node, [Node]) 169 | end, Nodes1), 170 | ct:pal("Allowing either side to converge independently"), 171 | wait_for_convergence(60000, 5000, Nodes, 2), 172 | Healing = rpc:multicall(Nodes, net_kernel, allow, [[node() | Nodes]]), 173 | ct:pal("Healing networks: ~p", [Healing]), 174 | LeftOverTime = wait_for_convergence(60000, 5000, Nodes), 175 | ct:pal("Converged in ~p milliseconds", [60000 - LeftOverTime]), 176 | ok. 177 | 178 | split(Masters, Agents) -> 179 | A = round(length(Agents) / 2), 180 | {Agents1, Agents2} = lists:split(A, Agents), 181 | M = round(length(Masters) / 2), 182 | {Masters1, Masters2} = lists:split(M, Masters), 183 | {Masters1 ++ Agents1, Masters2 ++ Agents2}. 184 | 185 | wait_for_convergence(TotalTime, Interval, Nodes) -> 186 | wait_for_convergence(TotalTime, Interval, Nodes, 1). 
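%% wait_for_convergence/4 (below) polls check_graph/2 every Interval
%% milliseconds until the union of the nodes' active views forms exactly
%% Size strongly connected components; if TotalTime runs out first, it
%% reports the nodes that never appeared in any active view and fails.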
187 | 188 | wait_for_convergence(TotalTime, Interval, Nodes, Size) when TotalTime > 0 -> 189 | timer:sleep(Interval), 190 | case check_graph(Nodes, Size) of 191 | true -> 192 | TotalTime; 193 | false -> 194 | ct:pal("Unconverged at: ~p remaining~n", [TotalTime]), 195 | wait_for_convergence(TotalTime - Interval, Interval, Nodes, Size) 196 | end; 197 | wait_for_convergence(_TotalTime, _Interval, Nodes, _Size) -> 198 | {Replies, _} = gen_server:multi_call(Nodes, lashup_hyparview_membership, get_active_view, 60000), 199 | ActiveViews = lists:flatten([ActiveView || {_Node, ActiveView} <- Replies]), 200 | InitDict = lists:foldl(fun(Node, Acc) -> 201 | orddict:update_counter(Node, 0, Acc) 202 | end, [], Nodes), 203 | DictCounted = lists:foldl(fun(Node, Acc) -> 204 | orddict:update_counter(Node, 1, Acc) 205 | end, InitDict, ActiveViews), 206 | Unconverged = orddict:filter(fun(_Key, Value) -> 207 | Value == 0 208 | end, DictCounted), 209 | ct:pal("Unconverged: ~p", [Unconverged]), 210 | ct:fail(never_converged). 211 | 212 | check_graph(Nodes, Size) -> 213 | Digraph = digraph:new(), 214 | lists:foreach(fun(Node) -> digraph:add_vertex(Digraph, Node) end, Nodes), 215 | lists:foreach(fun(Node) -> 216 | ActiveView = gen_server:call({lashup_hyparview_membership, Node}, get_active_view, 60000), 217 | lists:foreach(fun(V2) -> digraph:add_edge(Digraph, Node, V2) end, ActiveView) 218 | end, Nodes), 219 | Components = digraph_utils:strong_components(Digraph), 220 | ct:pal("Components: ~p~n", [Components]), 221 | digraph:delete(Digraph), 222 | length(Components) == Size. 223 | 224 | mc_test(Config) -> 225 | hyparview_test(Config), 226 | Nodes = ?config(nodes, Config), 227 | Env = [lashup, max_mc_replication, ?MAX_MC_REPLICATION], 228 | {_, []} = rpc:multicall(Nodes, application, set_env, Env), 229 | timer:sleep(60000), %% Let things settle out 230 | [Node1, Node2, Node3] = choose_nodes(Nodes, 3), 231 | %% Test general messaging 232 | {ok, Topic1RefNode1} = lashup_gm_mc_events:remote_subscribe(Node1, [topic1]), 233 | R1 = make_ref(), 234 | rpc:call(Node2, lashup_gm_mc, multicast, [topic1, R1]), 235 | ?assertEqual(?MAX_MC_REPLICATION, expect_replies(Topic1RefNode1, R1)), 236 | timer:sleep(5000), 237 | %% Make sure that we don't see "old" events 238 | {ok, Topic1RefNode3} = lashup_gm_mc_events:remote_subscribe(Node3, [topic1]), 239 | ?assertEqual(0, expect_replies(Topic1RefNode3, R1)), 240 | ok. 241 | 242 | 243 | expect_replies(Reference, Payload) -> 244 | expect_replies(Reference, Payload, 0). 245 | 246 | expect_replies(Reference, Payload, Count) -> 247 | receive 248 | {lashup_gm_mc_event, Event = #{ref := Reference, payload := Payload}} -> 249 | ct:pal("Received event (~p): ~p", [Count + 1, Event]), 250 | expect_replies(Reference, Payload, Count + 1) 251 | after 5000 -> 252 | Count 253 | end. 254 | 255 | choose_nodes(Nodes, Count) -> 256 | choose_nodes(Nodes, Count, []). 257 | 258 | choose_nodes(_, 0, Acc) -> 259 | Acc; 260 | choose_nodes(Nodes, Count, Acc) -> 261 | Idx = rand:uniform(length(Nodes)), 262 | Node = lists:nth(Idx, Nodes), 263 | Nodes1 = lists:delete(Node, Nodes), 264 | choose_nodes(Nodes1, Count - 1, [Node | Acc]). 
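%% Note on mc_test above: with max_mc_replication set to 3, each multicast
%% is expected to arrive at a subscriber exactly ?MAX_MC_REPLICATION times
%% (presumably once per replication path); expect_replies/3 simply counts
%% the matching events until 5 seconds pass without one.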
265 | 
266 | %% TODO:
267 | %% -Add Kill
268 | %% -Add concurrency
269 | kv_test(Config) ->
270 |   Nodes = ?config(nodes, Config),
271 |   rpc:multicall(Nodes, application, ensure_all_started, [lashup]),
272 |   %% Normal value is 5 minutes, let's not wait that long
273 |   {_, []} = rpc:multicall(Nodes, application, set_env, [lashup, aae_interval, 30000]),
274 |   {_, []} = rpc:multicall(Nodes, application, set_env, [lashup, key_aae_interval, 30000]),
275 |   Update1 = {update, [{update, {test_counter, riak_dt_pncounter}, {increment, 5}}]},
276 |   [rpc:call(Node, lashup_kv, request_op, [Node, Update1]) || Node <- Nodes],
277 |   [rpc:call(Node, lashup_kv, request_op, [god_counter, Update1]) || Node <- Nodes],
278 |   LeftOverTime1 = wait_for_convergence(60000, 5000, Nodes),
279 |   ct:pal("Converged in ~p milliseconds", [60000 - LeftOverTime1]),
280 |   LeftOverTime2 = wait_for_consistency(90000, 5000, Nodes),
281 |   ct:pal("Consistency achieved in ~p milliseconds", [90000 - LeftOverTime2]),
282 |   ok.
283 | 
284 | wait_for_consistency(TotalTime, Interval, Nodes) when TotalTime > 0 ->
285 |   timer:sleep(Interval),
286 |   case check_nodes_for_consistency(Nodes, Nodes, 0) of
287 |     true ->
288 |       TotalTime;
289 |     false ->
290 |       ct:pal("Inconsistent at: ~p remaining~n", [TotalTime]),
291 |       wait_for_consistency(TotalTime - Interval, Interval, Nodes)
292 |   end;
293 | wait_for_consistency(_TotalTime, _Interval, _Nodes) ->
294 |   ct:fail(never_consistent).
295 | 
296 | check_nodes_for_consistency([], _, 0) ->
297 |   true;
298 | check_nodes_for_consistency([], _, _) ->
299 |   false;
300 | check_nodes_for_consistency([Node | Rest], Nodes, InconsistentNodeCount) ->
301 |   {ConsistentKeys, InconsistentKeys} =
302 |     lists:partition(fun(OtherNode) ->
303 |       Value = rpc:call(Node, lashup_kv, value, [OtherNode]),
304 |       Value == [{{test_counter, riak_dt_pncounter}, 5}]
305 |     end, Nodes),
306 |   ExpectedGodCounterValue = length(Nodes) * 5,
307 |   {ConsistentKeys1, InconsistentKeys1} =
308 |     case rpc:call(Node, lashup_kv, value, [god_counter]) of
309 |       [{{test_counter, riak_dt_pncounter}, ExpectedGodCounterValue}] ->
310 |         {[god_counter|ConsistentKeys], InconsistentKeys};
311 |       GodCounter ->
312 |         ct:pal("God counter (~p): ~p", [Node, GodCounter]),
313 |         {ConsistentKeys, [god_counter|InconsistentKeys]}
314 |     end,
315 |   ct:pal("Consistent keys (~p): ~p", [Node, ConsistentKeys1]),
316 |   ct:pal("Inconsistent keys (~p): ~p", [Node, InconsistentKeys1]),
317 | 
318 |   case InconsistentKeys1 of
319 |     [] ->
320 |       check_nodes_for_consistency(Rest, Nodes, InconsistentNodeCount);
321 |     _ ->
322 |       check_nodes_for_consistency(Rest, Nodes, InconsistentNodeCount + 1)
323 |   end.
--------------------------------------------------------------------------------
/test/lashup_kv_SUITE.erl:
--------------------------------------------------------------------------------
1 | -module(lashup_kv_SUITE).
2 | 
3 | -include_lib("common_test/include/ct.hrl").
4 | -include_lib("stdlib/include/ms_transform.hrl").
5 | 
6 | -export([
7 |   all/0,
8 |   init_per_suite/1, end_per_suite/1,
9 |   init_per_testcase/2, end_per_testcase/2
10 | ]).
11 | 
12 | -export([
13 |   fetch_keys/1,
14 |   kv_subscribe/1,
15 |   remove_forgiving/1
16 | ]).
17 | 
18 | all() -> [
19 |   fetch_keys,
20 |   kv_subscribe,
21 |   remove_forgiving
22 | ].
23 | 24 | init_per_suite(Config) -> 25 | os:cmd(os:find_executable("epmd") ++ " -daemon"), 26 | {ok, Hostname} = inet:gethostname(), 27 | case net_kernel:start([list_to_atom("runner@" ++ Hostname), shortnames]) of 28 | {ok, _} -> ok; 29 | {error, {already_started, _}} -> ok 30 | end, 31 | Config. 32 | 33 | end_per_suite(Config) -> 34 | net_kernel:stop(), 35 | Config. 36 | 37 | init_per_testcase(_, Config) -> 38 | application:ensure_all_started(lashup), 39 | Config. 40 | 41 | end_per_testcase(_, Config) -> 42 | application:stop(lashup), 43 | application:stop(prometheus), 44 | Config. 45 | 46 | fetch_keys(_Config) -> 47 | Key1 = [a,b,c], 48 | {ok, _} = lashup_kv:request_op(Key1, {update, 49 | [{update, 50 | {flag, riak_dt_lwwreg}, 51 | {assign, true, erlang:system_time(nano_seconds)} 52 | }] 53 | }), 54 | Key2 = [a,b,d], 55 | {ok, _} = lashup_kv:request_op(Key2, {update, 56 | [{update, 57 | {flag, riak_dt_lwwreg}, 58 | {assign, true, erlang:system_time(nano_seconds)} 59 | }] 60 | }), 61 | Key3 = [x,y,z], 62 | {ok, _} = lashup_kv:request_op(Key3, {update, 63 | [{update, 64 | {flag, riak_dt_lwwreg}, 65 | {assign, true, erlang:system_time(nano_seconds)} 66 | }] 67 | }), 68 | Keys = lashup_kv:keys(ets:fun2ms(fun({[a, b, '_']}) -> true end)), 69 | true = lists:member(Key1, Keys) and 70 | lists:member(Key2, Keys) and 71 | not lists:member(Key3, Keys), 72 | ok. 73 | 74 | kv_subscribe(_Config) -> 75 | {ok, _} = lashup_kv:request_op(flag, {update, 76 | [{update, 77 | {color, riak_dt_lwwreg}, 78 | {assign, red, erlang:system_time(nano_seconds)} 79 | }] 80 | }), 81 | {ok, Ref} = lashup_kv:subscribe(ets:fun2ms(fun({flag}) -> true end)), 82 | receive 83 | {lashup_kv_event, Ref, flag} -> 84 | ok 85 | after 5000 -> 86 | ct:fail("Nothing received") 87 | end, 88 | {ok, _} = lashup_kv:request_op(flag, {update, 89 | [{update, 90 | {color, riak_dt_lwwreg}, 91 | {assign, blue, erlang:system_time(nano_seconds)} 92 | }] 93 | }), 94 | receive 95 | {lashup_kv_event, Ref, Key} -> 96 | [{{color, riak_dt_lwwreg}, blue}] = lashup_kv:value(Key) 97 | after 5000 -> 98 | ct:fail("Nothing received") 99 | end, 100 | lashup_kv:unsubscribe(Ref). 101 | 102 | remove_forgiving(_Config) -> 103 | Key = [x, y, z], 104 | Field = {tratataField, riak_dt_lwwreg}, 105 | {ok, _} = lashup_kv:request_op(Key, 106 | {update, [{update, Field, {assign, true, erlang:system_time(nano_seconds)}}]}), 107 | {ok, Map} = lashup_kv:request_op(Key, {update, [{remove, Field}]}), 108 | {ok, Map} = lashup_kv:request_op(Key, {update, [{remove, Field}]}), 109 | {ok, Map} = lashup_kv:request_op(Key, {update, [{remove, Field}]}). 110 | -------------------------------------------------------------------------------- /test/lashup_kv_aae_SUITE.erl: -------------------------------------------------------------------------------- 1 | -module(lashup_kv_aae_SUITE). 2 | 3 | -include_lib("common_test/include/ct.hrl"). 4 | 5 | -export([ 6 | all/0, 7 | init_per_suite/1, end_per_suite/1, 8 | init_per_testcase/2, end_per_testcase/2, 9 | lashup_kv_aae_test/1 10 | ]). 11 | 12 | -define(MASTERS, [master1]). 13 | -define(AGENTS, [agent1, agent2]). 14 | 15 | -define(WAIT, 60000). 16 | 17 | init_per_suite(Config) -> 18 | os:cmd(os:find_executable("epmd") ++ " -daemon"), 19 | {ok, Hostname} = inet:gethostname(), 20 | case net_kernel:start([list_to_atom("runner@" ++ Hostname), shortnames]) of 21 | {ok, _} -> ok; 22 | {error, {already_started, _}} -> ok 23 | end, 24 | [{hostname, Hostname} | Config]. 
25 | 
26 | end_per_suite(Config) ->
27 |   application:stop(lashup),
28 |   application:stop(prometheus),
29 |   net_kernel:stop(),
30 |   Config.
31 | 
32 | all() ->
33 |   [lashup_kv_aae_test].
34 | 
35 | init_per_testcase(TestCaseName, Config) ->
36 |   ct:pal("Starting Testcase: ~p", [TestCaseName]),
37 |   Nodes = start_nodes(?MASTERS ++ ?AGENTS),
38 |   configure_nodes(Nodes, masters(Nodes)),
39 |   [{nodes, Nodes} | Config].
40 | 
41 | end_per_testcase(_, Config) ->
42 |   stop_nodes(?config(nodes, Config)),
43 |   %% remove lashup and mnesia directory
44 |   %%os:cmd("rm -rf *@" ++ ?config(hostname, Config)),
45 |   Config.
46 | 
47 | masters(Nodes) ->
48 |   element(1, lists:split(length(?MASTERS), Nodes)).
49 | 
50 | agents(Nodes) ->
51 |   element(2, lists:split(length(?MASTERS), Nodes)).
52 | 
53 | start_nodes(Nodes) ->
54 |   Opts = [{monitor_master, true}, {erl_flags, "-connect_all false"}],
55 |   Result = [ct_slave:start(Node, Opts) || Node <- Nodes],
56 |   NodeNames = [NodeName || {ok, NodeName} <- Result],
57 |   lists:foreach(fun(Node) -> pong = net_adm:ping(Node) end, NodeNames),
58 |   NodeNames.
59 | 
60 | configure_nodes(Nodes, Masters) ->
61 |   Env = [lashup, contact_nodes, Masters],
62 |   {_, []} = rpc:multicall(Nodes, code, add_pathsa, [code:get_path()]),
63 |   {_, []} = rpc:multicall(Nodes, application, set_env, Env).
64 | 
65 | stop_nodes(Nodes) ->
66 |   StoppedResult = [ct_slave:stop(Node) || Node <- Nodes],
67 |   lists:foreach(fun(Node) -> pang = net_adm:ping(Node) end, Nodes),
68 |   ct:pal("Stopped result: ~p", [StoppedResult]).
69 | 
70 | lashup_kv_aae_test(Config) ->
71 |   %% Insert a record in lashup
72 |   Nodes = ?config(nodes, Config),
73 |   {_, []} = rpc:multicall(Nodes, application, ensure_all_started, [lashup]),
74 |   {_, []} = rpc:multicall(Nodes, application, set_env, [lashup, aae_interval, 20000]),
75 |   {[Master|_], [Agent|_]} = {masters(Nodes), agents(Nodes)},
76 |   SystemTime = erlang:system_time(nano_seconds),
77 |   Val = {update, [{update, {flag, riak_dt_lwwreg}, {assign, true, SystemTime}}]},
78 |   {ok, _} = rpc:call(Master, lashup_kv, request_op, [[test], Val]),
79 |   {ok, _} = rpc:call(Master, lashup_kv, request_op, [[test1], Val]),
80 |   timer:sleep(?WAIT), % sleep for AAE to kick in
81 |   1 = rpc:call(Master, lashup_kv, read_lclock, [Agent]),
82 |   %% stop Agent
83 |   ct:pal("Stopping agent ~p", [Agent]),
84 |   stop_nodes([Agent]),
85 |   timer:sleep(?WAIT),
86 |   %% Master should reset the clock only after 2 min
87 |   1 = rpc:call(Master, lashup_kv, read_lclock, [Agent]),
88 |   timer:sleep(?WAIT), % wait for 1 more min
89 |   %% Verify that Master reset the clock for the agent
90 |   -1 = rpc:call(Master, lashup_kv, read_lclock, [Agent]).
91 | 
--------------------------------------------------------------------------------