├── .gitignore ├── .travis.yml ├── Documentation ├── design.md ├── etcd.md ├── faq.md ├── images │ ├── metafora_logical_integration_diagram.png │ ├── metafora_node_recovery.png │ └── metafora_nodefailure.png └── introduction.md ├── LICENSE ├── README.md ├── balancer.go ├── balancer_res.go ├── balancer_res_test.go ├── balancer_sleep.go ├── balancer_test.go ├── client.go ├── cmd └── metaforactl │ └── main.go ├── command.go ├── command_test.go ├── coordinator.go ├── doc.go ├── embedded ├── README.md ├── client.go ├── commander.go ├── commander_test.go ├── coordinator.go ├── embedded_test.go ├── statestore.go └── util.go ├── go.mod ├── go.sum ├── handler.go ├── httputil ├── httputil.go └── httputil_test.go ├── ignore.go ├── ignore_test.go ├── logger.go ├── metafora.go ├── metafora_test.go ├── metcdv3 ├── README.md ├── balancer.go ├── balancer_test.go ├── client.go ├── client_test.go ├── commander.go ├── commander_test.go ├── conf.go ├── const.go ├── coordinator.go ├── coordinator_test.go ├── doc.go ├── helpers_test.go ├── integration_test.go ├── statestore.go ├── task.go ├── task_test.go └── testutil │ └── testutil.go ├── resreporter ├── mem_linux.go └── mem_linux_test.go ├── scripts └── docker_run_etcd.sh ├── slowtask_test.go ├── statemachine ├── README.md ├── commander.go ├── doc.go ├── errors.go ├── errors_test.go ├── run_test.go ├── statemachine.go ├── statemachine_test.go └── statestore.go ├── task.go └── util_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | # Build files 23 | *.exe 24 | *.test 25 | *.prof 26 | cover.out 27 | 28 | *.orig 29 | *.swp 30 | 31 | # Executables 32 | cmd/metaforactl/metaforactl 33 | examples/koalemosd/koalemosd 34 | examples/koalemosctl/koalemosctl 35 | 36 | # bazel local only 37 | bazel-* 38 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | script: go test -race -cpu 1,2,4 -v -timeout 5m ./... 3 | sudo: false 4 | go: 5 | - 1.16.x 6 | notifications: 7 | webhooks: 8 | urls: 9 | - https://webhooks.gitter.im/e/737918445727692fe8d1 10 | on_success: "change" # options: [always|never|change] default: always 11 | on_failure: "always" # options: [always|never|change] default: always 12 | on_start: false # default: false 13 | before_script: 14 | - curl -sL https://github.com/etcd-io/etcd/releases/download/v3.3.7/etcd-v3.3.7-linux-amd64.tar.gz | tar xz 15 | - etcd-v3.3.7-linux-amd64/etcd 2> /dev/null & 16 | -------------------------------------------------------------------------------- /Documentation/design.md: -------------------------------------------------------------------------------- 1 | # Design 2 | 3 | ## Exactly Once 4 | 5 | Metafora makes a *best effort* to ensure that exactly one instance of a 6 | submitted task is executing in a cluster. In other words, for task `T`, only 7 | one node may be executing 8 | [`HandlerFunc(T).Run`](https://godoc.org/github.com/lytics/metafora#Handler). 
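To make the cooperative nature of this guarantee concrete, here is a minimal sketch -- not from the codebase -- of a handler whose `Run` exits promptly when `Stop` is called. It assumes the two-method `Run`/`Stop` shape of `metafora.Handler` (see handler.go and the godoc for the authoritative signatures); `stoppableHandler` and its work loop are hypothetical.

```go
package example

import "github.com/lytics/metafora"

// stoppableHandler is a hypothetical handler illustrating a Stop-aware Run.
type stoppableHandler struct {
	stop chan struct{} // closed by Stop to signal Run to exit
}

// Run performs short, resumable units of work and checks for Stop between
// each one so a lost or released claim is honored quickly.
func (h *stoppableHandler) Run(task metafora.Task) (done bool) {
	for {
		select {
		case <-h.stop:
			return false // not done; the task may be claimed by another node
		default:
			// Do one short unit of work here; return true once finished.
		}
	}
}

// Stop signals Run to exit as quickly as possible.
func (h *stoppableHandler) Stop() { close(h.stop) }
```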
9 | 10 | ### Implementation 11 | 12 | *Implementations are Coordinator specific, so this covers the builtin etcd 13 | coordinator.* 14 | 15 | Task claims are represented as keys with a TTL in etcd. A claim key is 16 | refreshed before the TTL expires in order to ensure the node running the task 17 | maintains the claim as long as the node is still executing normally. 18 | 19 | If the node ceases to execute normally due to a crash, high CPU utilization, 20 | network partition between the node and etcd, a bug, etc., the claim in etcd will 21 | expire and the task will be available for claiming by another node. When the 22 | problematic coordinator detects it has failed to maintain its claim, it informs 23 | the consumer it has `Lost` the task, the consumer calls `Handler.Stop` on the 24 | task, and ideally the task exits before it starts executing on a new node (see 25 | Limitations below). 26 | 27 | If a node is unable to reliably communicate with etcd it will stop all of its 28 | tasks and release all of its claims, effectively leaving the cluster. It will 29 | begin claiming tasks once reliable communication with etcd is restored 30 | (although it will probably have to wait on other nodes to `Rebalance` tasks 31 | first). 32 | 33 | All communication with etcd is done with strong consistency. 34 | 35 | ### Limitations 36 | 37 | Metafora cannot stop `Handler.Run` from continuing to execute the moment its 38 | claim expires. Goroutines are cooperative, and threads of execution are subject 39 | to arbitrary pauses and scheduling. 40 | 41 | Using the etcd coordinator, if `Handler.Run` does not exit within 30 seconds, 42 | the task is eligible for simultaneous execution on multiple nodes (see issue #139). 43 | 44 | In other words: the "exactly once guarantee" relies on well-behaved user code 45 | and accurate timers - both of which are out of Metafora's control. 46 | 47 | Handlers should be designed to exit as quickly as possible when `Stop` is 48 | called if they rely on Metafora's exactly-once behavior. Tasks which shut down 49 | slowly should be written to tolerate at-least-once semantics. 50 | -------------------------------------------------------------------------------- /Documentation/etcd.md: -------------------------------------------------------------------------------- 1 | # etcd integration 2 | 3 | Requires etcd v2. See [travis.yml](../.travis.yml) to see which version of etcd 4 | automated tests are run against. 5 | 6 | Metafora contains an [etcd](https://go.etcd.io/etcd) implementation of 7 | the core 8 | [`Coordinator`](https://godoc.org/github.com/lytics/metafora#Coordinator) and 9 | [`Client`](http://godoc.org/github.com/lytics/metafora#Client) interfaces, so 10 | that implementing Metafora with etcd in your own work system is quick and easy. 11 | 12 | ## etcd layout 13 | 14 | ``` 15 | / 16 | └── <namespace> 17 | ├── nodes 18 | │   └── <node> Ephemeral 19 | │   └── commands 20 | │   └── <name> JSON value 21 | │ 22 | ├── tasks 23 | │ └── <task-id> 24 | │ ├── props JSON value (optional) 25 | │ └── owner Ephemeral, JSON value 26 | │ 27 | ├── state Optional, only if using state store 28 | │ └── <task-id> Permanent, JSON value 29 | │ 30 | └── commands Optional, only if using command listener 31 | └── <task-id> Ephemeral, JSON value 32 | 33 | ``` 34 | 35 | ### Tasks 36 | 37 | Metafora clients submit tasks by making an empty directory in 38 | `/<namespace>/tasks/<task-id>` without a TTL. 
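For example, with the v2 `etcdctl` CLI (the namespace `metafora` and task ID `task-001` below are illustrative placeholders, not fixed names):

```sh
etcdctl mkdir /metafora/tasks/task-001
```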
39 | 40 | Metafora nodes claim tasks by watching the `tasks` directory and -- if 41 | `Balancer.CanClaim` returns `true` -- trying to create the 42 | `/<namespace>/tasks/<task-id>/owner` file with the contents set to the node's 43 | name and a short TTL. The node must touch the file before the TTL expires, 44 | otherwise another node will claim the task and begin working on it. 45 | 46 | The JSON format is: 47 | 48 | ```json 49 | {"node": "<node-name>"} 50 | ``` 51 | 52 | Note that Metafora does not handle task parameters or configuration. 53 | 54 | #### Task Properties 55 | 56 | Optionally tasks may have a properties key with a JSON value. The value must be 57 | immutable for the life of the task. 58 | 59 | Users may set a custom `NewTask` function on their `EtcdCoordinator` in order 60 | to unmarshal properties into a custom struct. The struct must implement the 61 | `metafora.Task` interface, and code that wishes to use implementation-specific 62 | methods or fields will have to type assert. 63 | 64 | ### Node Commands 65 | 66 | Metafora clients can send node commands by making a file inside 67 | `/<namespace>/nodes/<node>/commands/` with any name (preferably using a time-ordered 68 | UUID). 69 | 70 | Metafora nodes watch their own node's `commands` directory for new files. The 71 | contents of the files are a command to be executed. Only one command will be 72 | executed at a time, and pending commands are lost on node shutdown. 73 | 74 | ```json 75 | {"command": "<command-name>", "parameters": {}} 76 | ``` 77 | 78 | Where `parameters` is an arbitrary JSON object. 79 | 80 | ### Task State 81 | 82 | If you're using the etcd state store, it will persist a task's state as JSON in 83 | `/<namespace>/state/<task-id>`. The format of the JSON is defined by the 84 | `statemachine` package. 85 | 86 | Task state keys are permanent so they exist even after a task reaches a 87 | terminal state and is unscheduled, for two reasons: 88 | 89 | 1. Provide a task history for users to inspect or prune at their discretion. 90 | 2. Allow the state store to default non-existent task states to Runnable, since if 91 | they were running already or had run to completion before, the task key 92 | would exist. 93 | 94 | See [`statemachine`'s Documentation](../statemachine/README.md) for details. 95 | 96 | ### Task Commands 97 | 98 | If you're using the etcd commander and command listener, task commands are sent 99 | as JSON in `/<namespace>/commands/<task-id>`. Commands are deleted after 100 | they're handled. If more than one command is sent before either can be 101 | processed, only the last command sent will be processed. 102 | 103 | Commands have a TTL of 1 week so they're eventually cleaned up if a task 104 | terminates before it handles a command. 105 | 106 | See [`statemachine`'s Documentation](../statemachine/README.md) for details. 107 | 108 | ## Useful links for managing etcd 109 | 110 | [The etcd API](https://coreos.com/docs/distributed-configuration/etcd-api/) 111 | 112 | [etcd cli tool](https://go.etcd.io/etcdctl) 113 | 114 | -------------------------------------------------------------------------------- /Documentation/faq.md: -------------------------------------------------------------------------------- 1 | Frequently Asked Questions 2 | ========================== 3 | 4 | #### Q. Why not use [Ordasity](https://github.com/boundary/ordasity)? 5 | 6 | [We](http://lytics.io) have an existing work-running system written in Go and 7 | needed a new distribution library for it. There are over 25k lines of Go we'd 8 | like to reuse and couldn't with Ordasity as it runs on the JVM. 9 | 10 | #### Q. 
Why not use [donut](https://github.com/dforsyth/donut)? 11 | 12 | [We](http://lytics.io) evaluated donut and found it far from production ready. 13 | While we've been inspired by many of its basic interfaces, there really wasn't 14 | much code we were interested in reusing. At ~600 lines of code in donut, 15 | starting from scratch didn't seem like it would lose us much. 16 | 17 | That being said, we're very appreciative of donut! It heavily influenced our 18 | initial design. 19 | 20 | #### Q. Why not use [goworker](http://www.goworker.org/) (or similar)? 21 | 22 | goworker does not support rebalancing and appears to be more focused on a high 23 | rate (>1/s) of short-lived work items. Metafora is designed for a low rate 24 | (<1/s) of long-lived work items. This means rebalancing running work is 25 | critical. 26 | 27 | There are a lot of projects in the short-lived offline task processing space, 28 | but few if any handle task state, rebalancing, consistent operation during 29 | partitions, and other features critical for long-running tasks. 30 | 31 | #### Q. Why not use a cluster management framework like [Mesos](http://mesos.apache.org/) or [Kubernetes](http://kubernetes.io/)? 32 | 33 | You can use a cluster management framework to run Metafora, but you *shouldn't* 34 | use Metafora as a cluster management framework. 35 | 36 | While Metafora tasks are long lived, they're often not individually large or 37 | necessarily resource intensive. For example, tasks in the Sleeping state stay 38 | resident in memory to handle any wakeup events (either from a timer or external 39 | command). Cluster management frameworks' smallest unit of work tends to be an 40 | operating system process. 41 | 42 | Lytics often runs over 500 tasks per server in a Metafora cluster. 500 OS 43 | processes would incur nontrivial overhead compared to 500 Metafora tasks, not 44 | to mention be much harder to manage. 45 | 46 | The second reason for preferring Metafora tasks to OS processes is a much 47 | richer command structure. Signals are the only command mechanism OS processes 48 | have built in. Metafora's [state machine](../statemachine/README.md) provides a 49 | much easier to use and more featureful interface for tasks. 50 | 51 | Cluster management frameworks are quite large in terms of code and operational 52 | complexity -- for good reason! They're a much more powerful and general-purpose 53 | tool than Metafora. Metafora is being written, deployed, and maintained by a 54 | very small team, so minimizing operational complexity and overhead is a key 55 | feature. 56 | 57 | #### Q. What are Metafora's limits? 58 | 59 | While Lytics has not run into any firm limits, our current estimates are that 60 | Metafora with the etcd coordinator can scale to: 61 | 62 | * Tens of thousands of concurrently running tasks (the number of servers depends on 63 | the resource utilization of each task). 64 | * Hundreds of state transitions (task created, sleeping, etc.) per second. 65 | 66 | Since etcd is designed for consistency before raw throughput, it is the 67 | limiting factor for cluster growth. 68 | 69 | If you need more concurrent tasks or transitions, it's recommended you run 70 | multiple etcd clusters and multiple Metafora consumers. A single OS process can 71 | run multiple Metafora consumers, so you only have to manage a single logical 72 | Metafora cluster of servers despite there being multiple etcd clusters and 73 | namespaces. 74 | 75 | #### Q. What does metafora mean? 
76 | 77 | It's Greek for "transfer" and also refers to a winch on boats. 78 | [We](http://lytics.io) borrowed the Greek naval naming theme from 79 | [Kubernetes](http://kubernetes.io/). 80 | -------------------------------------------------------------------------------- /Documentation/images/metafora_logical_integration_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lytics/metafora/3c171a91c2055a449aa58d0ec9cedbe848bd386b/Documentation/images/metafora_logical_integration_diagram.png -------------------------------------------------------------------------------- /Documentation/images/metafora_node_recovery.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lytics/metafora/3c171a91c2055a449aa58d0ec9cedbe848bd386b/Documentation/images/metafora_node_recovery.png -------------------------------------------------------------------------------- /Documentation/images/metafora_nodefailure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lytics/metafora/3c171a91c2055a449aa58d0ec9cedbe848bd386b/Documentation/images/metafora_nodefailure.png -------------------------------------------------------------------------------- /Documentation/introduction.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | Metafora is a framework for creating highly available and distributed services written in Go. Metafora is embedded, meaning your code controls how and when Metafora is started. It uses etcd to coordinate across the nodes in your cluster. Metafora is a leaderless task distribution system where the nodes coordinate with each other to ensure that work is evenly distributed over the cluster. 4 | 5 | Metafora is an embedded work-stealing framework built on top of etcd. 6 | 7 | ![logical1](/Documentation/images/metafora_logical_integration_diagram.png) 8 | 9 | ## Overview 10 | 11 | Metafora gives you the ability to build an elastic distributed application. It makes it easy to build applications that scale in or out, and that can recover from node failures. The following diagrams are examples of how this works. 12 | 13 | #### Node failure or scaling in 14 | 15 | When a node fails (or you scale in your nodes), Metafora will release the tasks from the missing node back into the task pool. Other Metafora nodes will detect the unclaimed tasks and attempt to claim them. It's important to note that Metafora simply manages the reassignment of tasks; it's up to your code (possibly in your Metafora handler) to clean up any bad state caused by a task crashing during processing. 16 | 17 | ![logical1](/Documentation/images/metafora_nodefailure.png) 18 | 19 | 20 | #### Node recovery or scaling out 21 | 22 | When a new node joins the cluster, it begins picking up new tasks immediately. Initially the other nodes may have more tasks because they've been in the cluster longer. To address this, occasionally the members compare task load and rebalance the tasks between them. 
23 | 24 | ![logical1](/Documentation/images/metafora_node_recovery.png) 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2014 Lytics 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | metafora 2 | ======== 3 | 4 | [![Join the chat at https://gitter.im/lytics/metafora](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/lytics/metafora?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 5 | [![Build Status](https://travis-ci.org/lytics/metafora.svg?branch=master)](https://travis-ci.org/lytics/metafora) 6 | [![GoDoc](https://godoc.org/github.com/lytics/metafora?status.svg)](https://godoc.org/github.com/lytics/metafora) 7 | 8 | Metafora is a [Go](https://golang.org) library designed to run long-running 9 | (minutes to permanent) tasks in a cluster. 10 | 11 | IRC: `#lytics/metafora` on [irc.gitter.im](https://irc.gitter.im) 12 | 13 | Features 14 | -------- 15 | 16 | * **Distributed** - horizontally scalable 17 | * **Elastic** - online cluster resizing with automated rebalancing 18 | * **Masterless** - work stealing, not assigning, pluggable balancing 19 | * **Fault tolerant** - tasks are reassigned if nodes disappear 20 | * **Simple** - few states, no checkpointing, no configuration management 21 | * **Extensible** - well defined interfaces for implementing balancing and 22 | coordinating 23 | * **Exactly-once** - designed to enforce one-and-only-one instance of each 24 | submitted task is running[ref](Documentation/design.md#exactly-once) 25 | 26 | Metafora is a library for building distributed task work systems. You're 27 | responsible for creating a `main()` entrypoint for your application, writing a 28 | `metafora.Handler` and `HandlerFunc` to actually process tasks, and then 29 | starting Metafora's `Consumer`. 30 | 31 | Metafora's task state machine is implemented as a `Handler` adapter. Simply 32 | implement your task processor as a 33 | [`StatefulHandler`](https://godoc.org/github.com/lytics/metafora/statemachine#StatefulHandler) 34 | function, and create a `metafora.Handler` with 35 | [`statemachine.New`](https://godoc.org/github.com/lytics/metafora/statemachine#New). 36 | 37 | Example 38 | ------- 39 | 40 | [koalemosd](https://github.com/lytics/metafora/blob/master/examples/koalemosd/main.go) 41 | is a sample consumer implementation that can be run as a daemon 42 | (it requires etcd). 
43 | [koalemosctl](https://github.com/lytics/metafora/blob/master/examples/koalemosctl/main.go) 44 | is a sample command line client for submitting tasks to `koalemosd`. 45 | 46 | ```sh 47 | # Install etcd as per https://go.etcd.io/etcd#getting-etcd 48 | # Run the following in one terminal: 49 | go get -v -u github.com/lytics/metafora/examples/koalemosd 50 | koalemosd 51 | 52 | # Run the client in another 53 | go get -v -u github.com/lytics/metafora/examples/koalemosctl 54 | koalemosctl sleep 3 # where "sleep 3" is any command on your $PATH 55 | ``` 56 | 57 | Since koalemosd is a simple wrapper around OS processes, it does not use the 58 | state machine (`statemachine.StatefulHandler`). 59 | 60 | Terms 61 | ----- 62 | 63 | 64 | 65 | 68 | 69 | 70 | 73 | 74 | 76 | 77 | 78 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 |
<table>
<tr><td>Balancer</td><td>Go interface consulted by Consumer for determining which tasks can be claimed and which should be released. See balancer.go.</td></tr>
<tr><td>Broker</td><td>external task and command store like etcd for the Coordinator to use.</td></tr>
<tr><td>Consumer</td><td>core work runner. Integrates Balancer, Coordinator, and Handlers to get work done.</td></tr>
<tr><td>Coordinator</td><td>client Go interface to Broker. See coordinator.go.</td></tr>
<tr><td>Handler</td><td>Go interface for executing tasks.</td></tr>
<tr><td>Task</td><td>unit of work. Executed by Handlers.</td></tr>
</table>
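To show how these terms fit together, the following is a minimal, hypothetical wiring of a Consumer. Signatures are approximate and `newHandler` is a placeholder; consult the godoc above for the authoritative API.

```go
package main

import "github.com/lytics/metafora"

// newHandler is a hypothetical HandlerFunc returning a Handler per task.
func newHandler(task metafora.Task) metafora.Handler { return nil }

func main() {
	// coord would be a real Coordinator, such as the etcd implementation.
	var coord metafora.Coordinator
	consumer, err := metafora.NewConsumer(coord, newHandler, metafora.DumbBalancer)
	if err != nil {
		panic(err)
	}
	consumer.Run() // blocks; call consumer.Shutdown() to stop
}
```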
88 | 89 | FAQ 90 | --- 91 | 92 | **Q. Is it ready for production use?** 93 | 94 | *Yes.* Metafora with the etcd coordinator has been the production work system at 95 | [Lytics](http://lytics.io) since January 2014 and runs thousands of tasks 96 | concurrently across a cluster of VMs. 97 | 98 | Since Metafora is still under heavy development, you probably want to pin the 99 | dependencies to a commit hash or 100 | [tag](https://github.com/lytics/metafora/releases) to keep the API stable. The 101 | `master` branch is automatically tested and is safe for use if you can tolerate 102 | API changes. 103 | 104 | **Q. Where is the metaforad daemon?** 105 | 106 | It doesn't exist. Metafora is a library for you to import and use in a service 107 | you write. Metafora handles task management but leaves implementation details 108 | such as task implementation and daemonization up to the user. 109 | 110 | [FAQ continued in Documentation...](Documentation/faq.md) 111 | -------------------------------------------------------------------------------- /balancer.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "math" 5 | "math/rand" 6 | "time" 7 | ) 8 | 9 | const ( 10 | // Default threshold is 120% of cluster average 11 | defaultThreshold float64 = 1.2 12 | ) 13 | 14 | // NoDelay is simply the zero value for time and meant to be a more meaningful 15 | // value for CanClaim methods to return instead of initializing a new empty 16 | // time struct. 17 | var NoDelay = time.Time{} 18 | 19 | // BalancerContext is a limited interface exposed to Balancers from the 20 | // Consumer for access to limited Consumer state. 21 | type BalancerContext interface { 22 | // Tasks returns a sorted list of task IDs owned by this Consumer. The 23 | // Consumer stops task manipulations during claiming and balancing, so the 24 | // list will be accurate unless a task naturally completes. 25 | Tasks() []RunningTask 26 | } 27 | 28 | // Balancer is the core task balancing interface. Without a master, Metafora 29 | // clusters are cooperatively balanced -- meaning each node needs to know how 30 | // to balance itself. 31 | type Balancer interface { 32 | // Init is called once and only once before any other Balancer methods are 33 | // called. The context argument is meant to expose functionality that might 34 | // be useful for CanClaim and Balance implementations. 35 | Init(BalancerContext) 36 | 37 | // CanClaim should return true if the consumer should accept a task. 38 | // 39 | // When denying a claim by returning false, CanClaim should return the time 40 | // at which to reconsider the task for claiming. 41 | CanClaim(task Task) (ignoreUntil time.Time, claim bool) 42 | 43 | // Balance should return the list of Task IDs that should be released. The 44 | // criteria used to determine which tasks should be released are left up to 45 | // the implementation. 46 | Balance() (release []string) 47 | } 48 | 49 | // DumbBalancer is the simplest possible balancer implementation which simply 50 | // accepts all tasks. Since it has no state, a single global instance exists. 51 | var DumbBalancer = dumbBalancer{} 52 | 53 | type dumbBalancer struct{} 54 | 55 | // Init does nothing. 56 | func (dumbBalancer) Init(BalancerContext) {} 57 | 58 | // CanClaim always returns true. 59 | func (dumbBalancer) CanClaim(Task) (time.Time, bool) { return NoDelay, true } 60 | 61 | // Balance never returns any tasks to balance. 
62 | func (dumbBalancer) Balance() []string { return nil } 63 | 64 | // Provides information about the cluster to be used by FairBalancer 65 | type ClusterState interface { 66 | // Provide the current number of jobs 67 | NodeTaskCount() (map[string]int, error) 68 | } 69 | 70 | // NewDefaultFairBalancer creates a new FairBalancer but requires a 71 | // ClusterState implementation to gain more information about the cluster than 72 | // BalancerContext provides. 73 | func NewDefaultFairBalancer(nodeid string, cs ClusterState) Balancer { 74 | return NewDefaultFairBalancerWithThreshold(nodeid, cs, defaultThreshold) 75 | } 76 | 77 | // NewDefaultFairBalancerWithThreshold allows callers to override 78 | // FairBalancer's default 120% task load release threshold. 79 | func NewDefaultFairBalancerWithThreshold(nodeid string, cs ClusterState, threshold float64) Balancer { 80 | return &FairBalancer{ 81 | nodeid: nodeid, 82 | clusterstate: cs, 83 | releaseThreshold: threshold, 84 | } 85 | } 86 | 87 | // An implementation of Balancer which attempts to randomly release tasks in 88 | // the case when the count of those currently running on this node is greater 89 | // than some percentage of the cluster average (default 120%). 90 | // 91 | // This balancer will claim all tasks which were not released on the last call 92 | // to Balance. 93 | type FairBalancer struct { 94 | nodeid string 95 | 96 | bc BalancerContext 97 | clusterstate ClusterState 98 | 99 | releaseThreshold float64 100 | delay time.Time 101 | } 102 | 103 | func (e *FairBalancer) Init(s BalancerContext) { 104 | e.bc = s 105 | } 106 | 107 | // CanClaim rejects tasks for a period of time if the last balance released 108 | // tasks. Otherwise all tasks are accepted. 109 | func (e *FairBalancer) CanClaim(task Task) (time.Time, bool) { 110 | if e.delay.After(time.Now()) { 111 | // Return delay set by Balance() 112 | return e.delay, false 113 | } 114 | 115 | // Sleep proportional to number of tasks 116 | n := len(e.bc.Tasks()) 117 | time.Sleep(time.Duration(n>>2) * time.Millisecond) 118 | return NoDelay, true 119 | } 120 | 121 | // Balance releases tasks if this node has 120% more tasks than the average 122 | // node in the cluster. 
123 | func (e *FairBalancer) Balance() []string { 124 | nodetasks := e.bc.Tasks() 125 | 126 | // Reset delay 127 | e.delay = time.Time{} 128 | 129 | // If local tasks <= 1 this node should never rebalance 130 | if len(nodetasks) < 2 { 131 | Infof("balancing skipped: nodetasks:%v ", nodetasks) 132 | return nil 133 | } 134 | 135 | current, err := e.clusterstate.NodeTaskCount() 136 | if err != nil { 137 | Warnf("balancing skipped: retrieving cluster state: %v", err) 138 | return nil 139 | } 140 | 141 | desired := e.desiredCount(current) 142 | shouldrelease := current[e.nodeid] - desired 143 | if shouldrelease < 1 { 144 | Infof("balancing skipped: shouldrelease <1 nodetasks:%v desired:%v shouldrelease:%v", len(nodetasks), desired, shouldrelease) 145 | return nil 146 | } 147 | 148 | releasetasks := make([]string, 0, shouldrelease) 149 | releaseset := make(map[string]struct{}, shouldrelease) 150 | 151 | random := rand.New(rand.NewSource(time.Now().UnixNano())) 152 | for len(releasetasks) < shouldrelease { 153 | tid := nodetasks[random.Intn(len(nodetasks))].Task().ID() 154 | if _, ok := releaseset[tid]; !ok { 155 | releasetasks = append(releasetasks, tid) 156 | releaseset[tid] = struct{}{} 157 | } 158 | } 159 | 160 | e.delay = time.Now().Add(time.Duration(len(releasetasks)) * time.Second) 161 | return releasetasks 162 | } 163 | 164 | // Retrieve the desired maximum count, based on current cluster state 165 | func (e *FairBalancer) desiredCount(current map[string]int) int { 166 | total := 0 167 | for _, c := range current { 168 | total += c 169 | } 170 | 171 | avg := 0 172 | if len(current) > 0 { 173 | avg = total / len(current) 174 | } 175 | 176 | return int(math.Ceil(float64(avg) * e.releaseThreshold)) 177 | } 178 | -------------------------------------------------------------------------------- /balancer_res.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | ) 7 | 8 | // ResourceReporter is required by the ResourceBalancer to read the resource 9 | // being used for balancing. 10 | type ResourceReporter interface { 11 | // Used returns the amount of a resource used and the total amount of that 12 | // resource. 13 | Used() (used uint64, total uint64) 14 | 15 | // String returns the unit resources are reported in. 16 | String() string 17 | } 18 | 19 | // ResourceBalancer is a balancer implementation which uses two thresholds to 20 | // limit claiming and rebalance work based upon a resource reported by a 21 | // ResourceReporter. When the claim threshold is exceeded, no new work will be 22 | // claimed. When the release threshold is exceeded, work will be released until 23 | // below that threshold. The claim threshold must be less than the release 24 | // threshold (otherwise claims would continue just to have the work 25 | // rebalanced.) 26 | // 27 | // Even below the claim limit, claims are delayed by the percent of resources 28 | // used (in milliseconds) to give less loaded nodes a claim advantage. 29 | // 30 | // The balancer releases the oldest tasks first (skipping those that are already 31 | // stopping) to try to prevent rebalancing the same tasks repeatedly within a 32 | // cluster. 33 | type ResourceBalancer struct { 34 | ctx BalancerContext 35 | reporter ResourceReporter 36 | 37 | claimLimit int 38 | releaseLimit int 39 | } 40 | 41 | // NewResourceBalancer creates a new ResourceBalancer or returns an error if 42 | // the limits are invalid. 
43 | // 44 | // Limits should be a percentage expressed as an integer between 1 and 100 45 | // inclusive. 46 | func NewResourceBalancer(src ResourceReporter, claimLimit, releaseLimit int) (*ResourceBalancer, error) { 47 | if claimLimit < 1 || claimLimit > 100 || releaseLimit < 1 || releaseLimit > 100 { 48 | return nil, fmt.Errorf("Limits must be between 1 and 100. claim=%d release=%d", claimLimit, releaseLimit) 49 | } 50 | if claimLimit >= releaseLimit { 51 | return nil, fmt.Errorf("Claim threshold must be < release threshold. claim=%d >= release=%d", claimLimit, releaseLimit) 52 | } 53 | 54 | return &ResourceBalancer{ 55 | reporter: src, 56 | claimLimit: claimLimit, 57 | releaseLimit: releaseLimit, 58 | }, nil 59 | } 60 | 61 | func (b *ResourceBalancer) Init(ctx BalancerContext) { 62 | b.ctx = ctx 63 | } 64 | 65 | func (b *ResourceBalancer) CanClaim(string) bool { 66 | used, total := b.reporter.Used() 67 | threshold := int(float32(used) / float32(total) * 100) 68 | if threshold >= b.claimLimit { 69 | //FIXME Until #93 is fixed returning false is very dangerous as it could 70 | // cause a tight loop with the coordinator. Sleep longer than more 71 | // lightly loaded nodes. 72 | dur := time.Duration(100+(threshold-b.claimLimit)) * time.Millisecond 73 | Infof("%d is over the claim limit of %d. Used %d of %d %s. Sleeping %s before claiming.", 74 | threshold, b.claimLimit, used, total, b.reporter, dur) 75 | time.Sleep(dur) 76 | return true 77 | } 78 | 79 | // Always sleep based on resource usage to give less loaded nodes an advantage 80 | dur := time.Duration(threshold) * time.Millisecond 81 | time.Sleep(dur) 82 | return true 83 | } 84 | 85 | func (b *ResourceBalancer) Balance() []string { 86 | used, total := b.reporter.Used() 87 | threshold := int(float32(used) / float32(total) * 100) 88 | if threshold < b.releaseLimit { 89 | // We're below the limit! Don't release anything. 
90 | return nil 91 | } 92 | 93 | // Release the oldest task that isn't already stopping 94 | var oldest RunningTask 95 | for _, t := range b.ctx.Tasks() { 96 | if t.Stopped().IsZero() && (oldest == nil || oldest.Started().After(t.Started())) { 97 | oldest = t 98 | } 99 | } 100 | 101 | // No tasks or all tasks are stopping, don't bother rebalancing 102 | if oldest == nil { 103 | return nil 104 | } 105 | 106 | Infof("Releasing task %s (started %s) because %d > %d (%d of %d %s used)", 107 | oldest.Task().ID(), oldest.Started(), threshold, b.releaseLimit, used, total, b.reporter) 108 | return []string{oldest.Task().ID()} 109 | } 110 | -------------------------------------------------------------------------------- /balancer_res_test.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import "testing" 4 | 5 | type fakeReporter struct { 6 | used uint64 7 | total uint64 8 | } 9 | 10 | func (r *fakeReporter) Used() (uint64, uint64) { return r.used, r.total } 11 | func (r *fakeReporter) String() string { return "fakes" } 12 | 13 | func TestResourceBalancer(t *testing.T) { 14 | t.Parallel() 15 | 16 | fr := &fakeReporter{used: 750, total: 1000} 17 | _, err := NewResourceBalancer(fr, 80, 75) 18 | if err == nil { 19 | t.Fatal("Expected an error: release threshold was lower than claim.") 20 | } 21 | 22 | bal, err := NewResourceBalancer(fr, 80, 90) 23 | if err != nil { 24 | t.Fatalf("Unexpected error creating resource balancer: %v", err) 25 | } 26 | 27 | ctx := &TestConsumerState{ 28 | Current: []string{"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"}, 29 | } 30 | bal.Init(ctx) 31 | 32 | release := bal.Balance() 33 | if len(release) > 0 { 34 | t.Errorf("Released tasks when we were well below limits! %v", release) 35 | } 36 | 37 | // Bump resource usage and rebalance 38 | fr.used = 901 39 | release = bal.Balance() 40 | if len(release) != 1 { 41 | t.Errorf("Expected 1 released task but found: %v", release) 42 | } 43 | 44 | // Make sure we scale up the number we release proportionally 45 | fr.used = 999 46 | release = bal.Balance() 47 | if len(release) != 1 { 48 | t.Errorf("Expected 1 released task but found: %v", release) 49 | } 50 | 51 | //FIXME When #93 is fixed this test should break as CanClaim should actually 52 | // return false 53 | if !bal.CanClaim("claimmepls") { 54 | t.Errorf("Until #93 is fixed, CanClaim should always return true") 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /balancer_sleep.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import "time" 4 | 5 | /* 6 | Q. Why 30ms? 7 | 8 | A. It's sufficiently long that unless a node is under heavy load (either 9 | computational, GC-induced, or network latency) it should win the claim-race 10 | against nodes with more tasks. If it's under so much load that it loses against 11 | nodes with more tasks, it's probably best to let those other nodes win! 12 | 13 | 30ms should scale fairly well up to hundreds of tasks per node as Metafora 14 | isn't really intended for high-throughput/low-latency task churn. 15 | */ 16 | const sleepBalLen = 30 * time.Millisecond 17 | 18 | // SleepBalancer is a simplistic Balancer implementation which sleeps 30ms per 19 | // claimed task in its CanClaim() method. 
This means the node with the fewest 20 | // claimed tasks in a cluster should sleep the shortest length of time and win 21 | // the claim race. 22 | // 23 | // It never releases tasks during Balance() calls. 24 | type SleepBalancer struct { 25 | ctx BalancerContext 26 | } 27 | 28 | // Init is called by the Consumer. 29 | func (b *SleepBalancer) Init(ctx BalancerContext) { b.ctx = ctx } 30 | 31 | // Balance never returns any tasks for the sleepy balancer. 32 | func (*SleepBalancer) Balance() []string { return nil } 33 | 34 | // CanClaim sleeps 30ms per claimed task. 35 | func (b *SleepBalancer) CanClaim(string) bool { 36 | num := len(b.ctx.Tasks()) 37 | time.Sleep(time.Duration(num) * sleepBalLen) 38 | return true 39 | } 40 | -------------------------------------------------------------------------------- /balancer_test.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | var ( 9 | _ BalancerContext = (*TestConsumerState)(nil) 10 | _ ClusterState = (*TestClusterState)(nil) 11 | ) 12 | 13 | func TestFairBalancerOneNode(t *testing.T) { 14 | t.Parallel() 15 | // Single node should never release tasks 16 | clusterstate := &TestClusterState{ 17 | Current: map[string]int{"node1": 5}, 18 | } 19 | 20 | consumerstate := &TestConsumerState{ 21 | []string{"1", "2", "3", "4", "5"}, 22 | } 23 | 24 | fb := NewDefaultFairBalancer("node1", clusterstate) 25 | fb.Init(consumerstate) 26 | 27 | if _, ok := fb.CanClaim(testTask{"23"}); !ok { 28 | t.Fatal("Expected claim to be true") 29 | } 30 | 31 | rebalance := fb.Balance() 32 | if len(rebalance) != 0 { 33 | t.Fatalf("Expected 0 rebalance tasks: %v", rebalance) 34 | } 35 | } 36 | 37 | func TestFairBalanceOver(t *testing.T) { 38 | t.Parallel() 39 | clusterstate := &TestClusterState{ 40 | Current: map[string]int{ 41 | "node1": 10, 42 | "node2": 2, 43 | }, 44 | } 45 | 46 | consumerstate := &TestConsumerState{ 47 | []string{"1", "2", "3", "4", "5"}, 48 | } 49 | 50 | fb := NewDefaultFairBalancer("node1", clusterstate) 51 | fb.Init(consumerstate) 52 | 53 | if _, ok := fb.CanClaim(testTask{"23"}); !ok { 54 | t.Fatal("Expected claim to be true") 55 | } 56 | 57 | expect := 2 58 | rebalance := fb.Balance() 59 | if len(rebalance) != expect { 60 | t.Fatalf("Expected %d rebalanced tasks, received %d", expect, len(rebalance)) 61 | } 62 | } 63 | 64 | func TestFairBalanceNothing(t *testing.T) { 65 | t.Parallel() 66 | clusterstate := &TestClusterState{ 67 | Current: map[string]int{ 68 | "node1": 2, 69 | "node2": 10, 70 | }, 71 | } 72 | 73 | consumerstate := &TestConsumerState{ 74 | []string{"1", "2", "3", "4", "5"}, 75 | } 76 | 77 | fb := NewDefaultFairBalancer("node1", clusterstate) 78 | fb.Init(consumerstate) 79 | 80 | if _, ok := fb.CanClaim(testTask{"23"}); !ok { 81 | t.Fatal("Expected claim to be true") 82 | } 83 | 84 | expect := 0 85 | rebalance := fb.Balance() 86 | if len(rebalance) != expect { 87 | t.Fatalf("Expected %d rebalanced tasks, received %d", expect, len(rebalance)) 88 | } 89 | 90 | } 91 | 92 | type testTask struct { 93 | id string 94 | } 95 | 96 | func (t testTask) ID() string { return t.id } 97 | 98 | type TestClusterState struct { 99 | Current map[string]int 100 | Err error 101 | } 102 | 103 | func (ts *TestClusterState) NodeTaskCount() (map[string]int, error) { 104 | if ts.Err != nil { 105 | return nil, ts.Err 106 | } 107 | 108 | return ts.Current, nil 109 | } 110 | 111 | type TestConsumerState struct { 112 | Current []string 113 | } 114 | 115 | 
func (tc *TestConsumerState) Tasks() []RunningTask { 116 | tasks := []RunningTask{} 117 | for _, id := range tc.Current { 118 | tasks = append(tasks, newTask(testTask{id}, nil)) 119 | } 120 | return tasks 121 | } 122 | 123 | // Sleepy Balancer Tests 124 | 125 | type sbCtx struct { 126 | t *testing.T 127 | tasks []string 128 | } 129 | 130 | func (ctx *sbCtx) Tasks() []RunningTask { 131 | tasks := []RunningTask{} 132 | for _, id := range ctx.tasks { 133 | tasks = append(tasks, newTask(testTask{id}, nil)) 134 | } 135 | return tasks 136 | } 137 | func (ctx *sbCtx) Log(l int, v string, args ...interface{}) { 138 | Infof(v, args...) 139 | } 140 | 141 | func TestSleepBalancer(t *testing.T) { 142 | t.Parallel() 143 | c := &sbCtx{t: t, tasks: make([]string, 0, 10)} 144 | 145 | b := &SleepBalancer{} 146 | b.Init(c) 147 | 148 | task := "test-task" 149 | pre := time.Now() 150 | total := 0 151 | for i := 0; i < 10; i++ { 152 | total += i 153 | b.CanClaim(task) 154 | c.tasks = append(c.tasks, task) 155 | } 156 | post := time.Now() 157 | minimum := pre.Add(time.Duration(total) * sleepBalLen) 158 | 159 | // Sleep balancer should never finish before the minimum timeout threshold 160 | if post.Before(minimum) { 161 | t.Fatalf("SleepBalancer finished too early: %s < %s", post, minimum) 162 | } 163 | 164 | // Sleep balancer shouldn't experience much overhead 165 | if post.After(minimum.Add(50 * time.Millisecond)) { 166 | t.Fatalf("SleepBalancer went a worrying amount over the expected time: %s > %s", post, minimum) 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /client.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | type Client interface { 4 | // SubmitTask submits a task to the system; the task ID must be unique. 5 | SubmitTask(Task) error 6 | 7 | // DeleteTask deletes a task. 8 | DeleteTask(taskId string) error 9 | 10 | // SubmitCommand submits a command to a particular node. 11 | SubmitCommand(node string, command Command) error 12 | 13 | // Nodes retrieves the current set of registered nodes. 14 | Nodes() ([]string, error) 15 | } 16 | -------------------------------------------------------------------------------- /cmd/metaforactl/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | func main() { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /command.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import "encoding/json" 4 | 5 | const ( 6 | cmdFreeze = "freeze" 7 | cmdUnfreeze = "unfreeze" 8 | cmdBalance = "balance" 9 | cmdStopTask = "stop_task" 10 | ) 11 | 12 | // Commands are a way clients can communicate directly with nodes for cluster 13 | // maintenance. 14 | // 15 | // Use the Command functions to generate implementations of this interface. 16 | // Metafora's consumer will discard unknown commands. 17 | type Command interface { 18 | // Name returns the name of the command. 19 | Name() string 20 | 21 | // Parameters returns the parameters, if any, the command will be executed 22 | // with. 23 | Parameters() map[string]interface{} 24 | 25 | // Marshal turns a command into its wire representation. 26 | Marshal() ([]byte, error) 27 | } 28 | 29 | // command is the internal representation of commands used for serialization. 
30 | type command struct { 31 | C string `json:"command"` 32 | P map[string]interface{} `json:"parameters,omitempty"` 33 | } 34 | 35 | // Name returns the name of the command. 36 | func (c *command) Name() string { 37 | return c.C 38 | } 39 | 40 | // Parameters returns the parameters, if any, the command will be executed 41 | // with. 42 | func (c *command) Parameters() map[string]interface{} { 43 | return c.P 44 | } 45 | 46 | // Marshal turns a command into its wire representation. 47 | func (c *command) Marshal() ([]byte, error) { 48 | return json.Marshal(c) 49 | } 50 | 51 | // Unmarshal parses a command from its wire representation. 52 | func UnmarshalCommand(p []byte) (Command, error) { 53 | c := &command{} 54 | err := json.Unmarshal(p, c) 55 | return c, err 56 | } 57 | 58 | // CommandFreeze stops all task watching and balancing. 59 | func CommandFreeze() Command { 60 | return &command{C: cmdFreeze} 61 | } 62 | 63 | // CommandUnfreeze resumes task watching and balancing. 64 | func CommandUnfreeze() Command { 65 | return &command{C: cmdUnfreeze} 66 | } 67 | 68 | // CommandBalance forces the node's balancer.Balance method to be called even 69 | // if frozen. 70 | func CommandBalance() Command { 71 | return &command{C: cmdBalance} 72 | } 73 | 74 | // CommandStopTask forces a node to stop a task even if frozen. 75 | func CommandStopTask(task string) Command { 76 | return &command{C: cmdStopTask, P: map[string]interface{}{"task": task}} 77 | } 78 | -------------------------------------------------------------------------------- /command_test.go: -------------------------------------------------------------------------------- 1 | package metafora_test 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | 7 | . "github.com/lytics/metafora" 8 | ) 9 | 10 | func testCmd(t *testing.T, cmd Command, name string, params map[string]interface{}) { 11 | if cmd.Name() != name { 12 | t.Errorf("%s command's name is wrong: %s", name, cmd.Name()) 13 | } 14 | if !reflect.DeepEqual(cmd.Parameters(), params) { 15 | t.Errorf("%s command's params are wrong. expected %#v != %#v", name, params, cmd.Parameters()) 16 | } 17 | b, err := cmd.Marshal() 18 | if err != nil { 19 | t.Errorf("%s command's Marshal() returned an error: %v", name, err) 20 | return 21 | } 22 | cmd2, err := UnmarshalCommand(b) 23 | if err != nil { 24 | t.Errorf("%s command's Marshal() output could not be Unmarshalled: %v", name, err) 25 | return 26 | } 27 | if cmd2.Name() != name { 28 | t.Errorf("%s command's name didn't Unmarshal properly: %s", name, cmd2.Name()) 29 | } 30 | if !reflect.DeepEqual(cmd2.Parameters(), params) { 31 | t.Errorf("%s command's params didn't Unmarshal properly. expected %#v != %#v", 32 | name, params, cmd2.Parameters()) 33 | } 34 | } 35 | 36 | func TestCommands(t *testing.T) { 37 | t.Parallel() 38 | testCmd(t, CommandFreeze(), "freeze", nil) 39 | testCmd(t, CommandUnfreeze(), "unfreeze", nil) 40 | testCmd(t, CommandBalance(), "balance", nil) 41 | testCmd(t, CommandStopTask("test"), "stop_task", map[string]interface{}{"task": "test"}) 42 | } 43 | -------------------------------------------------------------------------------- /coordinator.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | // CoordinatorContext is the context passed to coordinators by the core 4 | // consumer. 5 | type CoordinatorContext interface { 6 | // Lost is called by the Coordinator when a claimed task is lost to another 7 | // node. The Consumer will stop the task locally. 
8 | // 9 | // Since this implies there is a window of time where the task is executing 10 | // more than once, this is a sign of an unhealthy cluster. 11 | Lost(Task) 12 | } 13 | 14 | // Coordinator is the core interface Metafora uses to discover, claim, and 15 | // release tasks as well as receive commands. 16 | type Coordinator interface { 17 | // Init is called once by the consumer to provide a CoordinatorContext to 18 | // Coordinator implementations. NewConsumer will return Init's return value. 19 | Init(CoordinatorContext) error 20 | 21 | // Watch the broker for claimable tasks. Watch blocks until Close is called 22 | // or it encounters an error. Tasks are sent to the consumer via the tasks chan. 23 | Watch(tasks chan<- Task) (err error) 24 | 25 | // Claim is called by the Consumer when a Balancer has determined that a task 26 | // ID can be claimed. Claim returns false if another consumer has already 27 | // claimed the ID. 28 | Claim(Task) bool 29 | 30 | // Release a task for other consumers to claim. May be called after Close. 31 | Release(Task) 32 | 33 | // Done is called by Metafora when a task has been completed and should never 34 | // be scheduled to run again (in other words: deleted from the broker). 35 | // 36 | // May be called after Close. 37 | Done(Task) 38 | 39 | // Command blocks until a command for this node is received from the broker 40 | // by the coordinator. Command must return (nil, nil) when Close is called. 41 | Command() (Command, error) 42 | 43 | // Close the coordinator. Stop waiting for tasks and commands. Remove node from broker. 44 | // 45 | // Do not release tasks. The consumer will handle task releasing. 46 | Close() 47 | 48 | // Name of the coordinator for use in logs and other tooling. 49 | Name() string 50 | } 51 | 52 | type coordinatorContext struct { 53 | *Consumer 54 | } 55 | 56 | // Lost is a light wrapper around Consumer.stopTask to make it suitable for 57 | // calling by Coordinator implementations via the CoordinatorContext interface. 58 | func (ctx *coordinatorContext) Lost(t Task) { 59 | tid := t.ID() 60 | Errorf("Lost task %s", tid) 61 | ctx.stopTask(tid) 62 | } 63 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Metafora is a library for building distributed work systems. It's masterless 2 | // and extensible via core Balancer and Coordinator interfaces. 3 | // 4 | // If you use the builtin FairBalancer and EtcdCoordinator, all you have to do 5 | // is implement a Handler and HandlerFunc, and then run the Consumer. 6 | // 7 | // See https://github.com/lytics/metafora 8 | package metafora 9 | -------------------------------------------------------------------------------- /embedded/README.md: -------------------------------------------------------------------------------- 1 | Creates client/coordinator pairs which use channels to communicate. 2 | 3 | Meant to be used embedded in applications which do not need/want external 4 | coordination, especially tests.
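A minimal usage sketch (illustrative only; the node and task names are
placeholders and error handling is elided):

```go
package main

import (
	"github.com/lytics/metafora"
	"github.com/lytics/metafora/embedded"
)

func main() {
	// Wire up a channel-backed coordinator/client pair.
	coord, client := embedded.NewEmbeddedPair("testnode")

	// A handler that runs until the consumer asks it to stop.
	h := metafora.SimpleHandler(func(_ metafora.Task, stop <-chan bool) bool {
		<-stop
		return false // not done; the task may be claimed again
	})

	consumer, err := metafora.NewConsumer(coord, h, metafora.DumbBalancer)
	if err != nil {
		panic(err)
	}
	go consumer.Run()

	// Submit work through the paired client.
	_ = client.SubmitTask(metafora.NewTask("task-1"))

	consumer.Shutdown()
}
```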
5 | -------------------------------------------------------------------------------- /embedded/client.go: -------------------------------------------------------------------------------- 1 | package embedded 2 | 3 | import "github.com/lytics/metafora" 4 | 5 | func NewEmbeddedClient(taskchan chan metafora.Task, cmdchan chan *NodeCommand, nodechan chan []string) metafora.Client { 6 | return &EmbeddedClient{taskchan, cmdchan, nodechan} 7 | } 8 | 9 | type EmbeddedClient struct { 10 | taskchan chan<- metafora.Task 11 | cmdchan chan<- *NodeCommand 12 | nodechan <-chan []string 13 | } 14 | 15 | func (ec *EmbeddedClient) SubmitTask(t metafora.Task) error { 16 | ec.taskchan <- t 17 | return nil 18 | } 19 | 20 | func (ec *EmbeddedClient) DeleteTask(taskid string) error { 21 | nodes, _ := ec.Nodes() 22 | // Simply submit stop for all nodes 23 | for _, nid := range nodes { 24 | _ = ec.SubmitCommand(nid, metafora.CommandStopTask(taskid)) 25 | } 26 | return nil 27 | } 28 | 29 | func (ec *EmbeddedClient) SubmitCommand(nodeid string, command metafora.Command) error { 30 | ec.cmdchan <- &NodeCommand{command, nodeid} 31 | return nil 32 | } 33 | 34 | func (ec *EmbeddedClient) Nodes() ([]string, error) { 35 | nodes := <-ec.nodechan 36 | return nodes, nil 37 | } 38 | -------------------------------------------------------------------------------- /embedded/commander.go: -------------------------------------------------------------------------------- 1 | package embedded 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/lytics/metafora/statemachine" 7 | ) 8 | 9 | var _ statemachine.Commander = (*Commander)(nil) 10 | 11 | // Commander is an embeddable statemachine.Commander implementation. 12 | // Task-specific command listeners are created by calling NewListener. 13 | type Commander struct { 14 | listeners map[string]chan *statemachine.Message 15 | } 16 | 17 | // NewCommander creates a new statemachine.Commander implementation. 18 | func NewCommander() *Commander { 19 | return &Commander{listeners: make(map[string]chan *statemachine.Message)} 20 | } 21 | 22 | // NewListener creates a task-specific command listener linked to an embedded 23 | // Commander.
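//
// For example (a sketch using only this package's exports):
//
//	cmdr := embedded.NewCommander()
//	cl := cmdr.NewListener("task1")
//	_ = cmdr.Send("task1", statemachine.RunMessage())
//	msg := <-cl.Receive()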
24 | func (c *Commander) NewListener(taskID string) statemachine.CommandListener { 25 | // Buffer chan to make sending/receiving asynchronous 26 | c.listeners[taskID] = make(chan *statemachine.Message, 1) 27 | return &commandListener{c: c.listeners[taskID]} 28 | } 29 | 30 | func (c *Commander) Send(taskID string, m *statemachine.Message) error { 31 | cl, ok := c.listeners[taskID] 32 | if !ok { 33 | return fmt.Errorf("task=%q not running", taskID) 34 | } 35 | cl <- m 36 | return nil 37 | } 38 | 39 | type commandListener struct { 40 | c <-chan *statemachine.Message 41 | } 42 | 43 | func (cl *commandListener) Receive() <-chan *statemachine.Message { return cl.c } 44 | func (*commandListener) Stop() {} 45 | -------------------------------------------------------------------------------- /embedded/commander_test.go: -------------------------------------------------------------------------------- 1 | package embedded_test 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/lytics/metafora/embedded" 8 | "github.com/lytics/metafora/statemachine" 9 | ) 10 | 11 | func TestEmbeddedCommander(t *testing.T) { 12 | t.Parallel() 13 | cmdr := embedded.NewCommander() 14 | cl1 := cmdr.NewListener("task1") 15 | cl2 := cmdr.NewListener("task2") 16 | 17 | if err := cmdr.Send("task1", statemachine.RunMessage()); err != nil { 18 | t.Fatalf("Error sending message to task1: %v", err) 19 | } 20 | if err := cmdr.Send("task2", statemachine.ReleaseMessage()); err != nil { 21 | t.Fatalf("Error sending message to task2: %v", err) 22 | } 23 | if err := cmdr.Send("invalid-task", statemachine.PauseMessage()); err == nil { 24 | t.Fatal("Expected an error when sending to an invalid task, but didn't receive one.") 25 | } 26 | 27 | msg2 := <-cl2.Receive() 28 | if msg2.Code != statemachine.Release { 29 | t.Fatalf("listener2 expected a Release message but received: %#v", msg2) 30 | } 31 | msg1 := <-cl1.Receive() 32 | if msg1.Code != statemachine.Run { 33 | t.Fatalf("listener1 expected a Run message but received: %#v", msg1) 34 | } 35 | 36 | // Stop listeners and make sure nothing works (but doesn't panic) 37 | cl1.Stop() 38 | cl2.Stop() 39 | 40 | select { 41 | case <-cl1.Receive(): 42 | t.Fatal("expected listener1 to be closed but it still received a message!") 43 | case <-cl2.Receive(): 44 | t.Fatal("expected listener2 to be closed but it still received a message!") 45 | case <-time.After(50 * time.Millisecond): 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /embedded/coordinator.go: -------------------------------------------------------------------------------- 1 | package embedded 2 | 3 | import ( 4 | "errors" 5 | 6 | "github.com/lytics/metafora" 7 | ) 8 | 9 | func NewEmbeddedCoordinator(nodeid string, taskchan chan metafora.Task, cmdchan chan *NodeCommand, nodechan chan []string) metafora.Coordinator { 10 | e := &EmbeddedCoordinator{nodeid: nodeid, inchan: taskchan, cmdchan: cmdchan, stopchan: make(chan struct{}), nodechan: nodechan} 11 | // HACK - need to respond to node requests, assuming a single coordinator/client pair 12 | go func() { 13 | for { 14 | select { 15 | case e.nodechan <- []string{e.nodeid}: 16 | case <-e.stopchan: 17 | return 18 | } 19 | } 20 | }() 21 | 22 | return e 23 | } 24 | 25 | // EmbeddedCoordinator is a Coordinator which listens for tasks on a channel 26 | type EmbeddedCoordinator struct { 27 | nodeid string 28 | ctx metafora.CoordinatorContext 29 | inchan chan metafora.Task 30 | cmdchan chan *NodeCommand 31 | nodechan chan<- []string 32 | stopchan chan struct{} 33 | } 34 | 35 | func (e *EmbeddedCoordinator) Init(c metafora.CoordinatorContext) error { 36 | e.ctx = c 37 | return nil 38 | } 39 | 40 | func (e *EmbeddedCoordinator) Watch(out chan<- metafora.Task) error { 41 | for { 42 | // wait for incoming tasks 43 | select { 44 | case id, ok := <-e.inchan: 45 | if !ok { 46 | return errors.New("Input closed") 47 | } 48 | select { 49 | case out <- id: 50 | case <-e.stopchan: 51 | return nil 52 | } 53 | case <-e.stopchan: 54 | return nil 55 | } 56 | } 57 | } 58 | 59 | func (e *EmbeddedCoordinator) Claim(task metafora.Task) bool { 60 | // We received on a channel; we are the only ones to pull that value 61 | return true 62 | } 63 | 64 | func (e *EmbeddedCoordinator) Release(task metafora.Task) { 65 | // Releasing should be async to avoid deadlocks (and better reflect the 66 | // behavior of "real" coordinators) 67 | go func() { 68 | select { 69 | case e.inchan <- task: 70 | case <-e.stopchan: 71 | } 72 | }() 73 | } 74 | 75 | func (e *EmbeddedCoordinator) Done(task metafora.Task) {} 76 | 77 | func (e *EmbeddedCoordinator) Command() (metafora.Command, error) { 78 | select { 79 | case cmd, ok := <-e.cmdchan: 80 | if !ok { 81 | return nil, errors.New("Cmd channel closed") 82 | } 83 | return cmd.Cmd, nil 84 | case <-e.stopchan: 85 | return nil, nil 86 | } 87 | } 88 | 89 | func (e *EmbeddedCoordinator) Close() { 90 | close(e.stopchan) 91 | } 92 | 93 | func (e *EmbeddedCoordinator) Name() string { 94 | return "embedded" 95 | } 96 | -------------------------------------------------------------------------------- /embedded/embedded_test.go: -------------------------------------------------------------------------------- 1 | package embedded 2 | 3 | import ( 4 | "log" 5 | "os" 6 | "sync" 7 | "testing" 8 | "time" 9 | 10 | "github.com/lytics/metafora" 11 | ) 12 | 13 | func init() { 14 | metafora.SetLogger(log.New(os.Stderr, "", log.Lmicroseconds|log.Lshortfile)) 15 | } 16 | 17 | func TestEmbedded(t *testing.T) { 18 | 19 | tc := newTestCounter() 20 | adds := make(chan string, 4) 21 | 22 | thfunc := metafora.SimpleHandler(func(task metafora.Task, _ <-chan bool) bool { 23 | tc.Add(task.ID()) 24 | adds <- task.ID() 25 | return true 26 | }) 27 | 28 | coord, client := NewEmbeddedPair("testnode") 29 | runner, _ := metafora.NewConsumer(coord, thfunc, metafora.DumbBalancer) 30 | 31 | go runner.Run() 32 | 33 | for _, taskid := range []string{"one", "two", "three", "four"} { 34 | err := client.SubmitTask(metafora.NewTask(taskid)) 35 | if err != nil { 36 | t.Fatalf("Expected no error, got %v", err) 37 | } 38 | } 39 | 40 | deadline := time.Now().Add(500 * time.Millisecond) 41 | for time.Now().Before(deadline) { 42 | if len(adds) == 4 { 43 | break 44 | } 45 | time.Sleep(10 * time.Millisecond) 46 | } 47 | if len(adds) != 4 { 48 | t.Errorf("Handlers didn't run in expected amount of time") 49 | } 50 | runner.Shutdown() 51 | 52 | runs := tc.Runs() 53 | if len(runs) != 4 { 54 | t.Fatalf("Expected 4 runs, got %d", len(runs)) 55 | } 56 | 57 | } 58 | 59 | func TestEmbeddedShutdown(t *testing.T) { 60 | const n = 4 61 | runs := make(chan int, n) 62 | stops := make(chan int, n) 63 | thfunc := metafora.SimpleHandler(func(_ metafora.Task, s <-chan bool) bool { 64 | runs <- 1 65 | select { 66 | case <-s: 67 | stops <- 1 68 | return false 69 | case <-time.After(time.Second * 3): 70 | return true 71 | } 72 | }) 73 | 74 | coord, client := NewEmbeddedPair("testnode") 75 | runner, _ := metafora.NewConsumer(coord, thfunc, metafora.DumbBalancer) 76 | 77 | go runner.Run() 78 | 79 | // len(tasks) must == n 80 | tasks :=
[]string{"one", "two", "three", "four"} 81 | 82 | // submit tasks 83 | for _, taskid := range tasks { 84 | err := client.SubmitTask(metafora.NewTask(taskid)) 85 | if err != nil { 86 | t.Fatalf("Expected no error, got %v", err) 87 | } 88 | } 89 | 90 | // make sure all 4 start 91 | for i := 0; i < n; i++ { 92 | <-runs 93 | } 94 | 95 | // tell them to stop 96 | runner.Shutdown() 97 | 98 | // make sure all 4 stop 99 | for i := 0; i < n; i++ { 100 | <-stops 101 | } 102 | } 103 | 104 | func newTestCounter() *testcounter { 105 | return &testcounter{runs: []string{}} 106 | } 107 | 108 | type testcounter struct { 109 | runs []string 110 | cmut sync.Mutex 111 | } 112 | 113 | func (t *testcounter) Add(r string) { 114 | t.cmut.Lock() 115 | defer t.cmut.Unlock() 116 | t.runs = append(t.runs, r) 117 | } 118 | 119 | func (t *testcounter) Runs() []string { 120 | t.cmut.Lock() 121 | defer t.cmut.Unlock() 122 | return t.runs 123 | } 124 | -------------------------------------------------------------------------------- /embedded/statestore.go: -------------------------------------------------------------------------------- 1 | package embedded 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/lytics/metafora" 7 | "github.com/lytics/metafora/statemachine" 8 | ) 9 | 10 | type StateChanged struct { 11 | TaskID string 12 | State *statemachine.State 13 | } 14 | 15 | // StateStore is an in-memory implementation of statemachine.StateStore 16 | // intended for use in tests. 17 | type StateStore struct { 18 | mu *sync.RWMutex 19 | store map[string]*statemachine.State 20 | 21 | // Stored is intended for tests to block until a Store() is called as an 22 | // alternative to time.Sleep()s. 23 | // 24 | // Will deliver asynchronously and drop states if there's no receivers. 25 | Stored chan StateChanged 26 | } 27 | 28 | func NewStateStore() statemachine.StateStore { 29 | return &StateStore{ 30 | mu: &sync.RWMutex{}, 31 | store: map[string]*statemachine.State{}, 32 | Stored: make(chan StateChanged, 1), 33 | } 34 | } 35 | 36 | func (s *StateStore) Load(task metafora.Task) (*statemachine.State, error) { 37 | s.mu.RLock() 38 | defer s.mu.RUnlock() 39 | state, ok := s.store[task.ID()] 40 | if !ok { 41 | return &statemachine.State{Code: statemachine.Runnable}, nil 42 | } 43 | return state, nil 44 | } 45 | 46 | func (s *StateStore) Store(task metafora.Task, state *statemachine.State) error { 47 | s.mu.Lock() 48 | s.store[task.ID()] = state 49 | s.mu.Unlock() 50 | stored := StateChanged{TaskID: task.ID(), State: state} 51 | select { 52 | case s.Stored <- stored: 53 | default: 54 | } 55 | return nil 56 | } 57 | -------------------------------------------------------------------------------- /embedded/util.go: -------------------------------------------------------------------------------- 1 | package embedded 2 | 3 | import "github.com/lytics/metafora" 4 | 5 | type NodeCommand struct { 6 | Cmd metafora.Command 7 | NodeId string 8 | } 9 | 10 | // Returns a connected client/coordinator pair for embedded/testing use 11 | func NewEmbeddedPair(nodeid string) (metafora.Coordinator, metafora.Client) { 12 | taskchan := make(chan metafora.Task) 13 | cmdchan := make(chan *NodeCommand) 14 | nodechan := make(chan []string, 1) 15 | 16 | coord := NewEmbeddedCoordinator(nodeid, taskchan, cmdchan, nodechan) 17 | client := NewEmbeddedClient(taskchan, cmdchan, nodechan) 18 | 19 | return coord, client 20 | } 21 | -------------------------------------------------------------------------------- /go.mod: 
-------------------------------------------------------------------------------- 1 | module github.com/lytics/metafora 2 | 3 | go 1.13 4 | 5 | require ( 6 | github.com/araddon/gou v0.0.0-20190110011759-c797efecbb61 7 | github.com/kr/pretty v0.2.1 // indirect 8 | github.com/kr/text v0.2.0 // indirect 9 | github.com/stretchr/testify v1.7.0 10 | go.etcd.io/etcd/client/v3 v3.5.7 11 | ) 12 | -------------------------------------------------------------------------------- /handler.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | // Handler is the core task handling interface. The Consumer will create a new 4 | // Handler for each claimed task, call Run once and only once, and call Stop 5 | // when the task should persist its progress and exit. 6 | type Handler interface { 7 | // Run handles a task and blocks until completion or Stop is called. 8 | // 9 | // If Run returns true, Metafora will mark the task as Done via the 10 | // Coordinator. The task will not be rescheduled. 11 | // 12 | // If Run returns false, Metafora will Release the task via the Coordinator. 13 | // The task will be scheduled to run again. 14 | // 15 | // Panics are treated the same as returning true. 16 | Run() (done bool) 17 | 18 | // Stop signals the handler to shut down gracefully. Stop implementations 19 | // should not block waiting for Run to exit. 20 | // 21 | // Stop may be called more than once but calls are serialized. Implementations 22 | // may perform different operations on subsequent calls to Stop to implement 23 | // graceful vs. forced shutdown conditions. 24 | // 25 | // Run probably wants to return false when Stop is called, but this is left 26 | // up to the implementation as races between Run finishing and Stop being 27 | // called can happen. 28 | Stop() 29 | } 30 | 31 | // HandlerFunc is called by the Consumer to create a new Handler for each task. 32 | // 33 | // HandlerFunc is meant to be the New function for handlers. Since Run and Stop 34 | // are called concurrently, any state used by both should be initialized in the 35 | // HandlerFunc. Since HandlerFunc is uninterruptible, only the minimum amount 36 | // of work necessary to initialize a handler should be done. 37 | type HandlerFunc func(Task) Handler 38 | 39 | // SimpleHandler creates a HandlerFunc for a simple function that accepts a stop 40 | // channel. The channel will be closed when Stop is called. 41 | func SimpleHandler(f func(t Task, stop <-chan bool) bool) HandlerFunc { 42 | return func(t Task) Handler { 43 | return &simpleHandler{ 44 | task: t, 45 | stop: make(chan bool), 46 | f: f, 47 | } 48 | } 49 | } 50 | 51 | type simpleHandler struct { 52 | task Task 53 | stop chan bool 54 | f func(Task, <-chan bool) bool 55 | } 56 | 57 | func (h *simpleHandler) Run() bool { 58 | return h.f(h.task, h.stop) 59 | } 60 | 61 | func (h *simpleHandler) Stop() { 62 | select { 63 | case <-h.stop: 64 | default: 65 | close(h.stop) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /httputil/httputil.go: -------------------------------------------------------------------------------- 1 | package httputil 2 | 3 | import ( 4 | "encoding/json" 5 | "net/http" 6 | "time" 7 | 8 | "github.com/lytics/metafora" 9 | "github.com/lytics/metafora/statemachine" 10 | ) 11 | 12 | // Consumer contains just the Metafora methods exposed by the HTTP 13 | // introspection endpoints.
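//
// The core metafora.Consumer satisfies this interface, so a node can expose
// its state with something like (an illustrative sketch; the path is
// arbitrary):
//
//	http.Handle("/info", httputil.MakeInfoHandler(consumer, time.Now()))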
14 | type Consumer interface { 15 | Frozen() bool 16 | Tasks() []metafora.RunningTask 17 | String() string 18 | } 19 | 20 | type stateMachine interface { 21 | State() (*statemachine.State, time.Time) 22 | } 23 | 24 | type Task struct { 25 | ID string `json:"id"` 26 | Started time.Time `json:"started"` 27 | Stopped *time.Time `json:"stopped,omitempty"` 28 | State string `json:"state,omitempty"` 29 | Modified *time.Time `json:"modified,omitempty"` 30 | Task metafora.Task `json:"task"` 31 | } 32 | 33 | // InfoResponse is the JSON response marshalled by the MakeInfoHandler. 34 | type InfoResponse struct { 35 | Frozen bool `json:"frozen"` 36 | Name string `json:"name"` 37 | Started time.Time `json:"started"` 38 | Tasks []Task `json:"tasks"` 39 | } 40 | 41 | // MakeInfoHandler returns an HTTP handler which can be added to an exposed 42 | // HTTP server mux by Metafora applications to provide operators with basic 43 | // node introspection. 44 | func MakeInfoHandler(c Consumer, started time.Time) http.HandlerFunc { 45 | return func(w http.ResponseWriter, _ *http.Request) { 46 | tasks := c.Tasks() 47 | resp := InfoResponse{ 48 | Frozen: c.Frozen(), 49 | Name: c.String(), 50 | Started: started, 51 | Tasks: make([]Task, len(tasks)), 52 | } 53 | for i, task := range tasks { 54 | resp.Tasks[i] = Task{ 55 | ID: task.Task().ID(), 56 | Started: task.Started(), 57 | Task: task.Task(), 58 | } 59 | 60 | // Set stopped if it's non-zero 61 | stopped := task.Stopped() 62 | if !stopped.IsZero() { 63 | resp.Tasks[i].Stopped = &stopped 64 | } 65 | 66 | // Expose state if it exists 67 | if sh, ok := task.Handler().(stateMachine); ok { 68 | s, ts := sh.State() 69 | resp.Tasks[i].State = s.String() 70 | resp.Tasks[i].Modified = &ts 71 | } 72 | } 73 | w.Header().Set("Content-Type", "application/json") 74 | _ = json.NewEncoder(w).Encode(&resp) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /httputil/httputil_test.go: -------------------------------------------------------------------------------- 1 | package httputil_test 2 | 3 | import ( 4 | "encoding/json" 5 | "net/http/httptest" 6 | "testing" 7 | "time" 8 | 9 | "github.com/lytics/metafora" 10 | . 
"github.com/lytics/metafora/httputil" 11 | ) 12 | 13 | type tc struct { 14 | stop chan bool 15 | } 16 | 17 | func (*tc) Init(metafora.CoordinatorContext) error { return nil } 18 | func (c *tc) Watch(chan<- metafora.Task) error { 19 | <-c.stop 20 | return nil 21 | } 22 | func (c *tc) Claim(metafora.Task) bool { return false } 23 | func (c *tc) Release(metafora.Task) {} 24 | func (c *tc) Done(metafora.Task) {} 25 | func (c *tc) Command() (metafora.Command, error) { 26 | <-c.stop 27 | return nil, nil 28 | } 29 | func (c *tc) Close() { close(c.stop) } 30 | func (c *tc) Name() string { return "tc" } 31 | 32 | func TestMakeInfoHandler(t *testing.T) { 33 | t.Parallel() 34 | 35 | c, _ := metafora.NewConsumer(&tc{stop: make(chan bool)}, nil, metafora.DumbBalancer) 36 | defer c.Shutdown() 37 | now := time.Now().Truncate(time.Second) 38 | 39 | resp := httptest.NewRecorder() 40 | MakeInfoHandler(c, now)(resp, nil) 41 | 42 | info := InfoResponse{} 43 | if err := json.Unmarshal(resp.Body.Bytes(), &info); err != nil { 44 | t.Fatalf("Error unmarshalling response body: %v", err) 45 | } 46 | if info.Frozen { 47 | t.Errorf("Consumer should not start frozen.") 48 | } 49 | if !info.Started.Equal(now) { 50 | t.Errorf("Started time %s != %s", info.Started, now) 51 | } 52 | if info.Name != "tc" { 53 | t.Errorf("Node name %q != tc", info.Name) 54 | } 55 | if len(info.Tasks) != 0 { 56 | t.Errorf("Unexpected tasks: %v", info.Tasks) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /ignore.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "container/heap" 5 | "sync" 6 | "time" 7 | ) 8 | 9 | // ignoremgr handles ignoring tasks and sending them back to the consumer once 10 | // their ignore deadline is reached. 11 | type ignoremgr struct { 12 | incoming chan *timetask 13 | stop <-chan struct{} 14 | 15 | mu *sync.RWMutex 16 | ignores map[string]struct{} 17 | } 18 | 19 | func ignorer(tasks chan<- Task, stop <-chan struct{}) *ignoremgr { 20 | im := &ignoremgr{ 21 | incoming: make(chan *timetask), 22 | stop: stop, 23 | mu: &sync.RWMutex{}, 24 | ignores: make(map[string]struct{}), 25 | } 26 | go im.monitor(tasks, stop) 27 | return im 28 | } 29 | 30 | func (im *ignoremgr) add(task Task, until time.Time) { 31 | // short circuit zero times; queue everything else 32 | if until.IsZero() { 33 | return 34 | } 35 | 36 | // Add to ignore map 37 | im.mu.Lock() 38 | im.ignores[task.ID()] = struct{}{} 39 | im.mu.Unlock() 40 | 41 | // Send to monitor for pushing onto time heap 42 | select { 43 | case im.incoming <- &timetask{time: until, task: task}: 44 | case <-im.stop: 45 | // Don't bother adding ignore if we're just exiting 46 | } 47 | } 48 | 49 | func (im *ignoremgr) ignored(taskID string) (ignored bool) { 50 | im.mu.RLock() 51 | _, ok := im.ignores[taskID] 52 | im.mu.RUnlock() 53 | 54 | return ok 55 | } 56 | 57 | func (im *ignoremgr) monitor(tasks chan<- Task, stop <-chan struct{}) { 58 | times := timeheap{} 59 | heap.Init(×) 60 | var next *timetask 61 | for { 62 | if times.Len() > 0 { 63 | // Get next ignore from the ignore heap 64 | next = heap.Pop(×).(*timetask) 65 | } else { 66 | // No ignores! 
Wait for one to come in or an exit signal 67 | select { 68 | case <-stop: 69 | return 70 | case newtask := <-im.incoming: 71 | next = newtask 72 | } 73 | } 74 | 75 | // this duration *may* be negative, in which case the 76 | // task will be pushed immediately 77 | timer := time.NewTimer(time.Until(next.time)) 78 | 79 | select { 80 | case newtask := <-im.incoming: 81 | // Push onto next task and new task onto time heap 82 | heap.Push(×, newtask) 83 | heap.Push(×, next) 84 | 85 | // Stop the existing timer for this loop iteration 86 | timer.Stop() 87 | case <-timer.C: 88 | // Ignore expired, remove the entry 89 | im.mu.Lock() 90 | delete(im.ignores, next.task.ID()) 91 | im.mu.Unlock() 92 | 93 | // Notify the consumer 94 | select { 95 | case tasks <- next.task: 96 | case <-stop: 97 | return 98 | } 99 | case <-stop: 100 | return 101 | } 102 | } 103 | } 104 | 105 | func (im *ignoremgr) all() []string { 106 | im.mu.RLock() 107 | defer im.mu.RUnlock() 108 | ignores := make([]string, len(im.ignores)) 109 | i := 0 110 | for k := range im.ignores { 111 | ignores[i] = k 112 | i++ 113 | } 114 | return ignores 115 | } 116 | 117 | type timetask struct { 118 | time time.Time 119 | task Task 120 | } 121 | 122 | // timeheap is a min-heap of time/task tuples sorted by time. 123 | type timeheap []*timetask 124 | 125 | func (h timeheap) Len() int { return len(h) } 126 | func (h timeheap) Less(i, j int) bool { return h[i].time.Before(h[j].time) } 127 | func (h timeheap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } 128 | 129 | func (h *timeheap) Push(x interface{}) { 130 | // Push and Pop use pointer receivers because they modify the slice's length, 131 | // not just its contents. 132 | *h = append(*h, x.(*timetask)) 133 | } 134 | 135 | func (h *timeheap) Pop() interface{} { 136 | old := *h 137 | n := len(old) 138 | x := old[n-1] 139 | *h = old[0 : n-1] 140 | return x 141 | } 142 | -------------------------------------------------------------------------------- /ignore_test.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestIgnore(t *testing.T) { 9 | t.Parallel() 10 | out := make(chan Task) 11 | stop := make(chan struct{}) 12 | defer close(stop) 13 | 14 | // Create ignorer 15 | im := ignorer(out, stop) 16 | 17 | // Ignore task for 200ms. Yes this is racy. Might need to bump deadline. 
18 | deadline1 := time.Now().Add(200 * time.Millisecond) 19 | im.add(testTask{"1"}, deadline1) 20 | 21 | // Ensure it's ignored 22 | if !im.ignored("1") { 23 | t.Fatal("test task should have been ignored but wasn't") 24 | } 25 | 26 | // Ignore task for 10ms to make sure tasks are returned in order (they aren't 27 | // *guaranteed* to be in order since adds and evictions are concurrent) 28 | deadline2 := time.Now().Add(10 * time.Millisecond) 29 | im.add(testTask{"2"}, deadline2) 30 | 31 | // Wait for the first eviction 32 | eviction := <-out 33 | if eviction.ID() != "2" { 34 | t.Fatal("Expected 2 to be evicted before 1") 35 | } 36 | now := time.Now() 37 | if now.Before(deadline2) { 38 | t.Fatalf("First eviction happened too soon: %s < %s", now, deadline2) 39 | } 40 | 41 | eviction = <-out 42 | if eviction.ID() != "1" { 43 | t.Fatal("Expected 1 to be evicted second, found ", eviction) 44 | } 45 | now = time.Now() 46 | if now.Before(deadline1) { 47 | t.Fatalf("Second eviction happened too soon: %s < %s", now, deadline1) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /logger.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "github.com/araddon/gou" 5 | ) 6 | 7 | var LogLevel int = gou.LogLevel 8 | 9 | type LogOutputter interface { 10 | Output(calldepth int, s string) error 11 | } 12 | 13 | // SetLogger is retained for backwards compatibility; logging is currently delegated to the gou package, so this is a no-op. 14 | func SetLogger(l LogOutputter) { 15 | } 16 | 17 | var Debug func(v ...interface{}) = gou.Debug 18 | var Debugf func(format string, v ...interface{}) = gou.Debugf 19 | var Info func(v ...interface{}) = gou.Info 20 | var Infof func(format string, v ...interface{}) = gou.Infof 21 | var Warn func(v ...interface{}) = gou.Warn 22 | var Warnf func(format string, v ...interface{}) = gou.Warnf 23 | var Error func(v ...interface{}) = gou.Error 24 | var Errorf func(format string, v ...interface{}) = gou.Errorf 25 | -------------------------------------------------------------------------------- /metafora.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "fmt" 5 | "math/rand" 6 | "runtime" 7 | "sort" 8 | "sync" 9 | "time" 10 | ) 11 | 12 | var ( 13 | // balance calls are randomized and this is the upper bound of the random 14 | // amount 15 | balanceJitterMax = 10 * int64(time.Second) 16 | ) 17 | 18 | // Consumer is the core Metafora task runner. 19 | type Consumer struct { 20 | // Func to create new handlers 21 | handler HandlerFunc 22 | 23 | // Map of task:Handler 24 | running map[string]*runtask 25 | 26 | // Mutex to protect access to running 27 | runL sync.Mutex 28 | 29 | // WaitGroup for running handlers and consumer goroutines 30 | hwg sync.WaitGroup 31 | 32 | // WaitGroup so Shutdown() can block on Run() exiting fully 33 | runwg sync.WaitGroup 34 | runwgL sync.Mutex 35 | 36 | bal Balancer 37 | balEvery time.Duration 38 | coord Coordinator 39 | im *ignoremgr 40 | stop chan struct{} // closed by Shutdown to cause Run to exit 41 | tasks chan Task // channel for watcher to send tasks to main loop 42 | 43 | // Set by command handler, read anywhere via Consumer.Frozen() 44 | freezeL sync.Mutex 45 | freeze bool 46 | } 47 | 48 | var BalanceEvery = 15 * time.Minute //TODO make balance wait configurable 49 | 50 | // NewConsumer returns a new consumer and calls Init on the Balancer and Coordinator.
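//
// A minimal usage sketch from a caller's perspective (illustrative only;
// coord may be any Coordinator implementation, such as the embedded one):
//
//	hf := metafora.SimpleHandler(func(_ metafora.Task, stop <-chan bool) bool {
//		<-stop
//		return false // released, not done
//	})
//	c, err := metafora.NewConsumer(coord, hf, metafora.DumbBalancer)
//	if err != nil {
//		// Init on the balancer or coordinator failed.
//	}
//	go c.Run()
//	defer c.Shutdown()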
51 | func NewConsumer(coord Coordinator, h HandlerFunc, b Balancer) (*Consumer, error) { 52 | c := &Consumer{ 53 | running: make(map[string]*runtask), 54 | handler: h, 55 | bal: b, 56 | balEvery: BalanceEvery, 57 | coord: coord, 58 | stop: make(chan struct{}), 59 | tasks: make(chan Task), 60 | } 61 | c.im = ignorer(c.tasks, c.stop) 62 | 63 | // initialize balancer with the consumer and a prefixed logger 64 | b.Init(c) 65 | 66 | if err := coord.Init(&coordinatorContext{c}); err != nil { 67 | return nil, err 68 | } 69 | return c, nil 70 | } 71 | 72 | // Run is the core run loop of Metafora. It is responsible for calling into the 73 | // Coordinator to claim work and Balancer to rebalance work. 74 | // 75 | // Run blocks until Shutdown is called or an internal error occurs. 76 | func (c *Consumer) Run() { 77 | Debug(c, " Starting consumer") 78 | 79 | // Increment run wait group so Shutdown() can block on Run() exiting fully. 80 | c.runwgL.Lock() 81 | c.runwg.Add(1) 82 | c.runwgL.Unlock() 83 | defer c.runwg.Done() 84 | 85 | // chans for core goroutines to communicate with main loop 86 | balance := make(chan bool) 87 | cmdChan := make(chan Command) 88 | 89 | // Balance is called by the main loop when the balance channel is ticked 90 | go func() { 91 | randInt := rand.New(rand.NewSource(time.Now().UnixNano())).Int63n 92 | for { 93 | select { 94 | case <-c.stop: 95 | // Shutdown has been called. 96 | return 97 | case <-time.After(c.balEvery + time.Duration(randInt(balanceJitterMax))): 98 | select { 99 | case balance <- true: 100 | // Ticked balance 101 | case <-c.stop: 102 | // Shutdown has been called. 103 | return 104 | } 105 | } 106 | } 107 | }() 108 | 109 | // Watch for new tasks in a goroutine 110 | go c.watcher() 111 | 112 | // Watch for new commands in a goroutine 113 | go func() { 114 | defer close(cmdChan) 115 | for { 116 | cmd, err := c.coord.Command() 117 | if err != nil { 118 | panic(fmt.Errorf("coordinator returned an error during command: %v", err)) 119 | } 120 | if cmd == nil { 121 | Debug(c, " Command coordinator exited") 122 | return 123 | } 124 | // Send command to watcher (or shutdown) 125 | select { 126 | case <-c.stop: 127 | return 128 | case cmdChan <- cmd: 129 | } 130 | } 131 | }() 132 | 133 | // Make sure Run() cleans up on exit (stops coordinator, releases tasks, etc) 134 | defer c.shutdown() 135 | 136 | // Main Loop ensures events are processed synchronously 137 | for { 138 | if c.Frozen() { 139 | // Only recv commands while frozen 140 | select { 141 | case <-c.stop: 142 | // Shutdown has been called. 143 | return 144 | case cmd, ok := <-cmdChan: 145 | if !ok { 146 | Debug(c, " Command channel closed. Exiting main loop.") 147 | return 148 | } 149 | Debugf("%s Received command: %s", c, cmd) 150 | c.handleCommand(cmd) 151 | } 152 | continue 153 | } 154 | 155 | select { 156 | case <-c.stop: 157 | // Shutdown has been called. 158 | return 159 | case <-balance: 160 | c.balance() 161 | case task := <-c.tasks: 162 | tid := task.ID() 163 | if c.ignored(tid) { 164 | Debugf("%s task=%q ignored", c, tid) 165 | continue 166 | } 167 | if until, ok := c.bal.CanClaim(task); !ok { 168 | Infof("%s Balancer rejected task=%q until %s", c, tid, until) 169 | c.ignore(task, until) 170 | break 171 | } 172 | if !c.coord.Claim(task) { 173 | Debugf("%s Coordinator unable to claim task=%q", c, tid) 174 | break 175 | } 176 | c.claimed(task) 177 | case cmd, ok := <-cmdChan: 178 | if !ok { 179 | Debug(c, " Command channel closed. 
Exiting main loop.") 180 | return 181 | } 182 | c.handleCommand(cmd) 183 | } 184 | } 185 | } 186 | 187 | func (c *Consumer) watcher() { 188 | // The watcher dying unexpectedly should close the consumer to cause a 189 | // shutdown. 190 | defer c.close() 191 | 192 | err := c.coord.Watch(c.tasks) 193 | if err != nil { 194 | panic(fmt.Errorf("coordinator returned an error during watch: %v", err)) 195 | } 196 | } 197 | 198 | func (c *Consumer) balance() { 199 | tasks := c.bal.Balance() 200 | if len(tasks) > 0 { 201 | Infof("%s balancer releasing %d tasks: %v", c, len(tasks), tasks) 202 | } 203 | for _, task := range tasks { 204 | // Actually release the rebalanced task. 205 | c.stopTask(task) 206 | } 207 | } 208 | 209 | // close the c.stop channel, which signals the consumer to shut down. 210 | func (c *Consumer) close() { 211 | // acquire the runL lock to make sure we don't race with claimed()'s <-c.stop 212 | // check 213 | c.runL.Lock() 214 | defer c.runL.Unlock() 215 | select { 216 | case <-c.stop: 217 | // already stopped 218 | default: 219 | Debug("Stopping Run loop") 220 | close(c.stop) 221 | } 222 | } 223 | 224 | // shutdown is the actual shutdown logic called when Run() exits. 225 | func (c *Consumer) shutdown() { 226 | c.close() 227 | 228 | // Build list of currently running tasks 229 | runningtasks := c.Tasks() 230 | Infof("Sending stop signal to %d handler(s)", len(runningtasks)) 231 | 232 | for _, rt := range runningtasks { 233 | c.stopTask(rt.Task().ID()) 234 | } 235 | 236 | Info(c, " Waiting for handlers to exit") 237 | c.hwg.Wait() 238 | 239 | Debug("Closing Coordinator ", c) 240 | c.coord.Close() 241 | } 242 | 243 | // Shutdown stops the main Run loop, calls Stop on all handlers, and calls 244 | // Close on the Coordinator. Running tasks will be released for other nodes to 245 | // claim. 246 | func (c *Consumer) Shutdown() { 247 | c.close() 248 | 249 | // Wait for task handlers to exit. 250 | c.hwg.Wait() 251 | 252 | // Make sure Run() exits, otherwise Shutdown() might exit before 253 | // coord.Close() is called. 254 | c.runwgL.Lock() 255 | c.runwg.Wait() 256 | c.runwgL.Unlock() 257 | } 258 | 259 | // Tasks returns a list of running tasks sorted lexicographically by task ID. 260 | func (c *Consumer) Tasks() []RunningTask { 261 | c.runL.Lock() 262 | defer c.runL.Unlock() 263 | 264 | // Create a sorted list of task IDs 265 | ids := make([]string, len(c.running)) 266 | i := 0 267 | for id := range c.running { 268 | ids[i] = id 269 | i++ 270 | } 271 | sort.Strings(ids) 272 | 273 | // Add tasks in lexicographic order 274 | t := make([]RunningTask, len(ids)) 275 | for i, id := range ids { 276 | t[i] = c.running[id] 277 | } 278 | return t 279 | } 280 | 281 | // claimed starts a handler for a claimed task. It is the only method to 282 | // manipulate c.running and closes the task channel when a handler's Run 283 | // method exits. 284 | func (c *Consumer) claimed(task Task) { 285 | h := c.handler(task) 286 | 287 | tid := task.ID() 288 | Debugf("%s is attempting to start task=%q", c, tid) 289 | // Associate handler with taskID 290 | // **This is the only place tasks should be added to c.running** 291 | c.runL.Lock() 292 | defer c.runL.Unlock() 293 | select { 294 | case <-c.stop: 295 | // We're closing, don't bother starting this task 296 | c.coord.Release(task) 297 | return 298 | default: 299 | } 300 | if _, ok := c.running[tid]; ok { 301 | // If a coordinator returns an already claimed task from Watch(), then it's 302 | // a coordinator (or broker) bug.
303 | Warnf("%s Attempted to claim already running task %s", c, tid) 304 | return 305 | } 306 | rt := newTask(task, h) 307 | c.running[tid] = rt 308 | 309 | // This must be done in the runL lock after the stop chan check so Shutdown 310 | // doesn't close(stop) and start Wait()ing concurrently. 311 | // See "Note" http://golang.org/pkg/sync/#WaitGroup.Add 312 | c.hwg.Add(1) 313 | 314 | // Start handler in its own goroutine 315 | go func() { 316 | defer c.hwg.Done() // Must be run after task exit and Done/Release called 317 | 318 | // Run the task 319 | Infof("%s Task %q started", c, tid) 320 | done := c.runTask(h.Run, tid) 321 | var status string 322 | if done { 323 | status = "done" 324 | c.coord.Done(task) 325 | } else { 326 | status = "released" 327 | c.coord.Release(task) 328 | } 329 | 330 | stopped := rt.Stopped() 331 | if stopped.IsZero() { 332 | // Task exited on its own 333 | Infof("%s Task %q exited (%s)", c, tid, status) 334 | } else { 335 | // Task exited due to Stop() being called 336 | Infof("%s Task %q exited (%s) after %s", c, tid, status, time.Since(stopped)) 337 | } 338 | 339 | // **This is the only place tasks should be removed from c.running** 340 | c.runL.Lock() 341 | delete(c.running, tid) 342 | c.runL.Unlock() 343 | }() 344 | 345 | // Pause slightly after a successful claim to give starting tasks some 346 | // breathing room and to bias the next claim toward a node that lost this 347 | // one. 348 | time.Sleep(10 * time.Millisecond) 349 | } 350 | 351 | // runTask executes a handler's Run method and recovers from panic()s. 352 | func (c *Consumer) runTask(run func() bool, task string) bool { 353 | done := false 354 | func() { 355 | defer func() { 356 | if err := recover(); err != nil { 357 | stack := make([]byte, 50*1024) 358 | sz := runtime.Stack(stack, false) 359 | Errorf("%s Handler %s panic()'d: %v\n%s", c, task, err, stack[:sz]) 360 | // panics are considered fatal errors. Make sure the task isn't 361 | // rescheduled. 362 | done = true 363 | } 364 | }() 365 | done = run() 366 | }() 367 | return done 368 | } 369 | 370 | // stopTask asynchronously calls the task handlers' Stop method. While stopTask 371 | // calls don't block, calls to task handler's Stop method are serialized with a 372 | // lock. 373 | func (c *Consumer) stopTask(taskID string) { 374 | c.runL.Lock() 375 | task, ok := c.running[taskID] 376 | c.runL.Unlock() 377 | 378 | if !ok { 379 | // This can happen if a task completes during Balance() and is not an error. 380 | Warnf("%s tried to release a non-running task=%q", c, taskID) 381 | return 382 | } 383 | 384 | // all handler methods must be wrapped in a recover to prevent a misbehaving 385 | // handler from crashing the entire consumer 386 | go func() { 387 | defer func() { 388 | if err := recover(); err != nil { 389 | stack := make([]byte, 50*1024) 390 | sz := runtime.Stack(stack, false) 391 | Errorf("%s Handler %s panic()'d on Stop: %v\n%s", c, taskID, err, stack[:sz]) 392 | } 393 | }() 394 | 395 | // Serialize calls to Stop as a convenience to handler implementors. 396 | task.stop() 397 | }() 398 | } 399 | 400 | // Frozen returns true if Metafora is no longer watching for new tasks or 401 | // rebalancing. 402 | // 403 | // Metafora will remain frozen until receiving an Unfreeze command or it is 404 | // restarted (frozen state is not persisted). 
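//
// For example, an operator can toggle freezing remotely via a Client (a
// sketch; "node1" is a placeholder node name):
//
//	_ = client.SubmitCommand("node1", metafora.CommandFreeze())
//	_ = client.SubmitCommand("node1", metafora.CommandUnfreeze())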
405 | func (c *Consumer) Frozen() bool { 406 | c.freezeL.Lock() 407 | r := c.freeze 408 | c.freezeL.Unlock() 409 | return r 410 | } 411 | 412 | func (c *Consumer) handleCommand(cmd Command) { 413 | switch cmd.Name() { 414 | case cmdFreeze: 415 | if c.Frozen() { 416 | Info(c, " Ignoring freeze command: already frozen") 417 | return 418 | } 419 | Info(c, " Freezing") 420 | c.freezeL.Lock() 421 | c.freeze = true 422 | c.freezeL.Unlock() 423 | case cmdUnfreeze: 424 | if !c.Frozen() { 425 | Info(c, " Ignoring unfreeze command: not frozen") 426 | return 427 | } 428 | Info(c, " Unfreezing") 429 | c.freezeL.Lock() 430 | c.freeze = false 431 | c.freezeL.Unlock() 432 | case cmdBalance: 433 | Info(c, " Balancing due to command") 434 | c.balance() 435 | Debug(c, " Finished balancing due to command") 436 | case cmdStopTask: 437 | taskI, ok := cmd.Parameters()["task"] 438 | task, ok2 := taskI.(string) 439 | if !ok || !ok2 { 440 | Error(c, " Stop task command didn't contain a valid task") 441 | return 442 | } 443 | Infof("%s Stopping task %s due to command", c, task) 444 | c.stopTask(task) 445 | default: 446 | Warnf("%s Discarding unknown command: %s", c, cmd.Name()) 447 | } 448 | } 449 | 450 | func (c *Consumer) ignored(taskID string) bool { return c.im.ignored(taskID) } 451 | func (c *Consumer) ignore(t Task, until time.Time) { c.im.add(t, until) } 452 | 453 | // Ignores returns a list of all ignored tasks. 454 | func (c *Consumer) Ignores() []string { return c.im.all() } 455 | 456 | func (c *Consumer) String() string { 457 | return c.coord.Name() 458 | } 459 | -------------------------------------------------------------------------------- /metafora_test.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | func init() { 10 | if os.Getenv("VERBOSE_TESTS") != "" { 11 | SetLogger(testlogger{}) 12 | } 13 | } 14 | 15 | type testlogger struct{} 16 | 17 | func (testlogger) Output(int, string) error { return nil } 18 | 19 | // Handler/Consumer test 20 | 21 | type testHandler struct { 22 | stop chan int 23 | t *testing.T 24 | task Task 25 | tasksRun chan string 26 | } 27 | 28 | func (h *testHandler) Run() bool { 29 | h.tasksRun <- h.task.ID() 30 | h.t.Logf("Run(%s)", h.task.ID()) 31 | <-h.stop 32 | h.t.Logf("Stop received for %s", h.task.ID()) 33 | return true 34 | } 35 | 36 | func (h *testHandler) Stop() { 37 | h.t.Logf("Stopping %s", h.task.ID()) 38 | close(h.stop) 39 | } 40 | 41 | func newTestHandlerFunc(t *testing.T) (HandlerFunc, chan string) { 42 | tasksRun := make(chan string, 10) 43 | return func(task Task) Handler { 44 | return &testHandler{ 45 | task: task, 46 | stop: make(chan int), 47 | t: t, 48 | tasksRun: tasksRun, 49 | } 50 | }, tasksRun 51 | } 52 | 53 | // TestConsumer ensures the consumer's main loop properly handles tasks as well 54 | // as errors and Shutdown.
55 | func TestConsumer(t *testing.T) { 56 | t.Parallel() 57 | 58 | // Setup some tasks to run in a fake coordinator 59 | tc := NewTestCoord() 60 | tc.Tasks <- testTask{"test1"} 61 | tc.Tasks <- testTask{"test2"} 62 | 63 | // Setup a handler func that lets us know what tasks are running 64 | hf, tasksRun := newTestHandlerFunc(t) 65 | 66 | // Create the consumer and run it 67 | c, _ := NewConsumer(tc, hf, DumbBalancer) 68 | s := make(chan int) 69 | go func() { 70 | c.Run() 71 | s <- 1 72 | }() 73 | 74 | for i := 0; i < 2; i++ { 75 | select { 76 | case <-s: 77 | t.Fatalf("Run exited early") 78 | case tr := <-tasksRun: 79 | if tr != "test1" && tr != "test2" { 80 | t.Errorf("Expected `test1` or `test2` but received: %s", tr) 81 | } 82 | t.Logf("Received task=%q", tr) 83 | case <-time.After(100 * time.Millisecond): 84 | t.Errorf("First task didn't execute in a timely fashion") 85 | } 86 | } 87 | 88 | // Ensure Tasks() is accurate 89 | tasks := c.Tasks() 90 | if len(tasks) != 2 { 91 | t.Errorf("Expected 2 tasks to be running but found: %v", tasks) 92 | } 93 | 94 | go func() { 95 | c.Shutdown() 96 | s <- 1 97 | }() 98 | for i := 0; i < 2; i++ { 99 | select { 100 | case <-s: 101 | case <-time.After(100 * time.Millisecond): 102 | t.Errorf("Run and Shutdown didn't finish in a timely fashion") 103 | } 104 | } 105 | } 106 | 107 | // Balancer/Consumer test 108 | 109 | type testBalancer struct { 110 | c BalancerContext 111 | t *testing.T 112 | secondRun bool 113 | done chan struct{} 114 | } 115 | 116 | func (b *testBalancer) Init(c BalancerContext) { b.c = c } 117 | func (b *testBalancer) CanClaim(task Task) (time.Time, bool) { 118 | b.t.Logf("CanClaim(%s) -> %t", task.ID(), task.ID() == "ok-task") 119 | return time.Now().Add(100 * time.Hour), task.ID() == "ok-task" 120 | } 121 | 122 | func (b *testBalancer) Balance() []string { 123 | if b.secondRun { 124 | return nil 125 | } 126 | b.secondRun = true 127 | tsks := b.c.Tasks() 128 | if len(tsks) != 1 { 129 | b.t.Errorf("len(ConsumerState.Tasks()) != 1 ==> %v", tsks) 130 | return nil 131 | } 132 | if tsks[0].Task().ID() != "ok-task" { 133 | b.t.Errorf("Wrong task in ConsumerState.Tasks(): %v", tsks) 134 | } 135 | close(b.done) 136 | return nil 137 | } 138 | 139 | func TestBalancer(t *testing.T) { 140 | t.Parallel() 141 | if testing.Short() { 142 | t.Skip("skipping due to -short") 143 | } 144 | 145 | hf, tasksRun := newTestHandlerFunc(t) 146 | tc := NewTestCoord() 147 | balDone := make(chan struct{}) 148 | c, _ := NewConsumer(tc, hf, &testBalancer{t: t, done: balDone}) 149 | c.balEvery = 0 150 | go c.Run() 151 | tc.Tasks <- testTask{"test1"} 152 | tc.Tasks <- testTask{"ok-task"} 153 | tc.Tasks <- testTask{"test2"} 154 | 155 | // Wait for balance 156 | select { 157 | case <-balDone: 158 | case <-time.After(time.Duration(balanceJitterMax) + 10*time.Millisecond): 159 | t.Error("Didn't balance in a timely fashion") 160 | } 161 | 162 | select { 163 | case run := <-tasksRun: 164 | if run != "ok-task" { 165 | t.Errorf("Balancer didn't reject tasks properly. 
Ran task %s", run) 166 | } 167 | case <-time.After(100 * time.Millisecond): 168 | t.Error("Task didn't run in a timely fashion") 169 | } 170 | 171 | /* 172 | if r := c.bal.Balance(); len(r) > 0 { 173 | t.Errorf("Balance() should return 0, not: %v", r) 174 | } 175 | */ 176 | 177 | s := make(chan int) 178 | go func() { 179 | c.Shutdown() 180 | close(s) 181 | }() 182 | select { 183 | case <-s: 184 | case <-time.After(100 * time.Millisecond): 185 | t.Errorf("Shutdown didn't finish in a timely fashion") 186 | } 187 | if len(c.Tasks()) != 0 { 188 | t.Errorf("Shutdown didn't stop all tasks") 189 | } 190 | } 191 | 192 | type noopHandler struct{} 193 | 194 | func (noopHandler) Run() bool { return true } 195 | func (noopHandler) Stop() {} 196 | 197 | // TestHandleTask ensures that tasks are marked as done once handled. 198 | func TestHandleTask(t *testing.T) { 199 | hf := func(Task) Handler { return noopHandler{} } 200 | coord := NewTestCoord() 201 | c, _ := NewConsumer(coord, hf, DumbBalancer) 202 | go c.Run() 203 | coord.Tasks <- testTask{"task1"} 204 | select { 205 | case <-coord.Releases: 206 | t.Errorf("Release called, expected Done!") 207 | case <-coord.Dones: 208 | case <-time.After(100 * time.Millisecond): 209 | t.Fatalf("Took too long to mark task as done") 210 | } 211 | c.Shutdown() 212 | } 213 | 214 | // TestTaskPanic ensures panics from Run methods are turned into Done calls. 215 | func TestTaskPanic(t *testing.T) { 216 | t.Parallel() 217 | hf := SimpleHandler(func(Task, <-chan bool) bool { 218 | panic("TestTaskPanic") 219 | }) 220 | coord := NewTestCoord() 221 | c, _ := NewConsumer(coord, hf, DumbBalancer) 222 | go c.Run() 223 | coord.Tasks <- testTask{"1"} 224 | coord.Tasks <- testTask{"2"} 225 | coord.Tasks <- testTask{"3"} 226 | for i := 3; i > 0; i-- { 227 | select { 228 | case task := <-coord.Dones: 229 | t.Logf("%s done", task) 230 | case task := <-coord.Releases: 231 | t.Errorf("%s released when it should have been marked Done!", task) 232 | case <-time.After(200 * time.Millisecond): 233 | t.Fatalf("Took too long to mark task(s) as done.") 234 | } 235 | } 236 | c.Shutdown() 237 | } 238 | 239 | // TestShutdown ensures Shutdown causes Run() to exit cleanly. 240 | func TestShutdown(t *testing.T) { 241 | t.Parallel() 242 | hf := SimpleHandler(func(_ Task, c <-chan bool) bool { 243 | <-c 244 | return false 245 | }) 246 | coord := NewTestCoord() 247 | c, _ := NewConsumer(coord, hf, DumbBalancer) 248 | go c.Run() 249 | coord.Tasks <- testTask{"1"} 250 | coord.Tasks <- testTask{"2"} 251 | coord.Tasks <- testTask{"3"} 252 | time.Sleep(100 * time.Millisecond) 253 | if len(coord.Dones)+len(coord.Releases) > 0 { 254 | t.Fatalf("Didn't expect any tasks to exit before Shutdown was called.") 255 | } 256 | c.Shutdown() 257 | for i := 3; i > 0; i-- { 258 | select { 259 | case task := <-coord.Dones: 260 | t.Errorf("%s makred done when it should have been Released!", task) 261 | case task := <-coord.Releases: 262 | t.Logf("%s relased", task) 263 | case <-time.After(200 * time.Millisecond): 264 | t.Fatalf("Took too long to mark task(s) as released.") 265 | } 266 | } 267 | } 268 | -------------------------------------------------------------------------------- /metcdv3/README.md: -------------------------------------------------------------------------------- 1 | metafora etcdv3 client 2 | ==================== 3 | 4 | See [Documentation/etcdv3.md](../Documentation/etcdv3.md) for details. 5 | 6 | Testing 7 | ------- 8 | 9 | Testing the metafora etcd client requires that a new etcd instance be running. 
10 | The etcd instances should be reachable at the connection string 11 | `localhost:5001,localhost:5002,localhost:5003`, or a similar connection string 12 | should be exported in the `ETCDCTL_PEERS` environment variable. 13 | 14 | An example of running the integration tests is given in the command line below: 15 | 16 | ```sh 17 | IP="127.0.0.1" ETCDCTL_PEERS="$IP:5001,$IP:5002,$IP:5003" go test -v 18 | ``` 19 | -------------------------------------------------------------------------------- /metcdv3/balancer.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "path" 7 | 8 | "github.com/lytics/metafora" 9 | 10 | etcdv3 "go.etcd.io/etcd/client/v3" 11 | ) 12 | 13 | // NewFairBalancer creates a new metafora.DefaultFairBalancer that uses etcd 14 | // for counting tasks per node. 15 | func NewFairBalancer(conf *Config, etcdv3c *etcdv3.Client, filter func(*FilterableValue) bool) metafora.Balancer { 16 | e := etcdClusterState{ 17 | etcdv3c: etcdv3c, 18 | kvc: etcdv3.NewKV(etcdv3c), 19 | taskPath: path.Join(conf.Namespace, TasksPath), 20 | nodePath: path.Join(conf.Namespace, NodesPath), 21 | filter: filter, 22 | } 23 | return metafora.NewDefaultFairBalancer(conf.Name, &e) 24 | } 25 | 26 | // etcdClusterState checks the current state of an etcd cluster 27 | type etcdClusterState struct { 28 | etcdv3c *etcdv3.Client 29 | kvc etcdv3.KV 30 | taskPath string 31 | nodePath string 32 | filter func(*FilterableValue) bool 33 | } 34 | 35 | type FilterableValue struct { 36 | Name string 37 | } 38 | 39 | func (e *etcdClusterState) NodeTaskCount() (map[string]int, error) { 40 | state := map[string]int{} 41 | 42 | // First initialize state with nodes as keys 43 | resp, err := e.kvc.Get(context.Background(), e.nodePath, etcdv3.WithPrefix()) 44 | if err != nil { 45 | return nil, err 46 | } 47 | 48 | if resp == nil || len(resp.Kvs) == 0 { 49 | metafora.Warnf("balancer received empty response from GET %s", e.nodePath) 50 | return state, nil 51 | } 52 | 53 | for _, kv := range resp.Kvs { 54 | // We're guaranteed to find nodes under the _metadata path (created on Coordinator startup) 55 | dir, _ := path.Split(string(kv.Key)) 56 | dir, node := path.Split(path.Clean(dir)) 57 | if path.Clean(dir) == e.nodePath && e.filter(&FilterableValue{Name: node}) { 58 | state[node] = 0 59 | } 60 | } 61 | 62 | resp, err = e.kvc.Get(context.Background(), e.taskPath, etcdv3.WithPrefix()) 63 | if err != nil { 64 | return nil, err 65 | } 66 | 67 | // No current tasks 68 | if resp == nil || len(resp.Kvs) == 0 { 69 | return state, nil 70 | } 71 | 72 | // Get the list of all claimed work and create a map of the counts per 73 | // node. 74 | // Tasks which have no claims are ignored. 75 | for _, kv := range resp.Kvs { 76 | ownerPath := path.Base(string(kv.Key)) 77 | if ownerPath == OwnerPath { 78 | ov := &ownerValue{} 79 | err := json.Unmarshal(kv.Value, ov) 80 | if err != nil { 81 | return nil, err 82 | } 83 | // We want to only include those nodes which were initially included, 84 | // as some nodes may be shutting down, etc, and should not be counted 85 | if _, ok := state[ov.Node]; ok { 86 | state[ov.Node]++ 87 | } 88 | } 89 | } 90 | return state, nil 91 | } 92 | -------------------------------------------------------------------------------- /metcdv3/balancer_test.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 |
"github.com/lytics/metafora" 8 | ) 9 | 10 | func TestFairBalancer(t *testing.T) { 11 | t.Parallel() 12 | etcdv3c, coord1, conf1 := setupEtcd(t) 13 | defer etcdv3c.Close() 14 | conf2 := conf1.Copy() 15 | conf2.Name = "coord2" 16 | coord2 := NewEtcdV3Coordinator(conf2, etcdv3c) 17 | 18 | cli := NewClient(conf1.Namespace, etcdv3c) 19 | 20 | h := metafora.SimpleHandler(func(task metafora.Task, stop <-chan bool) bool { 21 | metafora.Debugf("Starting %s", task.ID()) 22 | <-stop 23 | metafora.Debugf("Stopping %s", task.ID()) 24 | return false // never done 25 | }) 26 | 27 | filter := func(_ *FilterableValue) bool { return true } 28 | // Create two consumers 29 | b1 := NewFairBalancer(conf1, etcdv3c, filter) 30 | con1, err := metafora.NewConsumer(coord1, h, b1) 31 | if err != nil { 32 | t.Fatal(err) 33 | } 34 | 35 | b2 := NewFairBalancer(conf2, etcdv3c, filter) 36 | con2, err := metafora.NewConsumer(coord2, h, b2) 37 | if err != nil { 38 | t.Fatal(err) 39 | } 40 | 41 | // Start the first and let it claim a bunch of tasks 42 | go con1.Run() 43 | defer con1.Shutdown() 44 | _ = cli.SubmitTask(DefaultTaskFunc("t1", "")) 45 | _ = cli.SubmitTask(DefaultTaskFunc("t2", "")) 46 | _ = cli.SubmitTask(DefaultTaskFunc("t3", "")) 47 | _ = cli.SubmitTask(DefaultTaskFunc("t4", "")) 48 | _ = cli.SubmitTask(DefaultTaskFunc("t5", "")) 49 | _ = cli.SubmitTask(DefaultTaskFunc("t6", "")) 50 | 51 | time.Sleep(5 * time.Second) 52 | 53 | if len(con1.Tasks()) != 6 { 54 | t.Fatalf("con1 should have claimed 6 tasks: %d", len(con1.Tasks())) 55 | } 56 | 57 | // Start the second consumer and force the 1st to rebalance 58 | go con2.Run() 59 | defer con2.Shutdown() 60 | 61 | // Wait for node to startup and register 62 | time.Sleep(1 * time.Second) 63 | 64 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance()) 65 | 66 | time.Sleep(2 * time.Second) 67 | 68 | c1Tasks := con1.Tasks() 69 | c2Tasks := con2.Tasks() 70 | if len(c1Tasks) != 4 || len(c2Tasks) != 2 { 71 | t.Fatalf("expected consumers to have 4|2 tasks: %d|%d", len(c1Tasks), len(c2Tasks)) 72 | } 73 | 74 | // Finally make sure that balancing the other node does nothing 75 | _ = cli.SubmitCommand("node2", metafora.CommandBalance()) 76 | 77 | time.Sleep(2 * time.Second) 78 | 79 | c1Tasks2 := con1.Tasks() 80 | c2Tasks2 := con2.Tasks() 81 | if len(c1Tasks2) != 4 || len(c2Tasks2) != 2 { 82 | t.Fatalf("expected consumers to have 4|2 tasks: %d|%d", len(c1Tasks2), len(c2Tasks2)) 83 | } 84 | for i := 0; i < 4; i++ { 85 | if c1Tasks[i] != c1Tasks2[i] { 86 | t.Errorf("task mismatch: %s != %s", c1Tasks[i], c1Tasks2[i]) 87 | } 88 | } 89 | for i := 0; i < 2; i++ { 90 | if c2Tasks[i] != c2Tasks2[i] { 91 | t.Errorf("task mismatch: %s != %s", c2Tasks[i], c2Tasks2[i]) 92 | } 93 | } 94 | } 95 | 96 | func TestFairBalancerFilter(t *testing.T) { 97 | t.Parallel() 98 | etcdv3c, coord1, conf1 := setupEtcd(t) 99 | defer etcdv3c.Close() 100 | conf2 := conf1.Copy() 101 | conf2.Name = "coord2" 102 | coord2 := NewEtcdV3Coordinator(conf2, etcdv3c) 103 | 104 | cli := NewClient(conf1.Namespace, etcdv3c) 105 | 106 | h := metafora.SimpleHandler(func(task metafora.Task, stop <-chan bool) bool { 107 | metafora.Debugf("Starting %s", task.ID()) 108 | <-stop 109 | metafora.Debugf("Stopping %s", task.ID()) 110 | return false // never done 111 | }) 112 | 113 | filter := func(fv *FilterableValue) bool { return fv.Name == conf1.Name } 114 | // Create two consumers 115 | b1 := NewFairBalancer(conf1, etcdv3c, filter) 116 | con1, err := metafora.NewConsumer(coord1, h, b1) 117 | if err != nil { 118 | t.Fatal(err) 
119 | } 120 | 121 | filter2 := func(fv *FilterableValue) bool { return fv.Name == conf2.Name } 122 | b2 := NewFairBalancer(conf2, etcdv3c, filter2) 123 | con2, err := metafora.NewConsumer(coord2, h, b2) 124 | if err != nil { 125 | t.Fatal(err) 126 | } 127 | 128 | // Start the first and let it claim a bunch of tasks 129 | go con1.Run() 130 | defer con1.Shutdown() 131 | _ = cli.SubmitTask(DefaultTaskFunc("t1", "")) 132 | _ = cli.SubmitTask(DefaultTaskFunc("t2", "")) 133 | _ = cli.SubmitTask(DefaultTaskFunc("t3", "")) 134 | _ = cli.SubmitTask(DefaultTaskFunc("t4", "")) 135 | _ = cli.SubmitTask(DefaultTaskFunc("t5", "")) 136 | _ = cli.SubmitTask(DefaultTaskFunc("t6", "")) 137 | _ = cli.SubmitTask(DefaultTaskFunc("t7", "")) 138 | _ = cli.SubmitTask(DefaultTaskFunc("t8", "")) 139 | _ = cli.SubmitTask(DefaultTaskFunc("t9", "")) 140 | 141 | time.Sleep(5 * time.Second) 142 | 143 | if len(con1.Tasks()) != 9 { 144 | t.Fatalf("con1 should have claimed 9 tasks: %d", len(con1.Tasks())) 145 | } 146 | 147 | // Start the second consumer and force the 1st to rebalance 148 | go con2.Run() 149 | defer con2.Shutdown() 150 | 151 | // Wait for node to start up and register 152 | time.Sleep(1 * time.Second) 153 | 154 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance()) 155 | 156 | time.Sleep(2 * time.Second) 157 | 158 | // Make sure that balancing never happened 159 | c2Tasks := con2.Tasks() 160 | if len(c2Tasks) != 0 { 161 | t.Fatalf("expected no tasks to be rebalanced but got: %d", len(c2Tasks)) 162 | } 163 | 164 | } 165 | 166 | // Fair balancer shouldn't consider a shutting-down node 167 | // See https://github.com/lytics/metafora/issues/92 168 | func TestFairBalancerShutdown(t *testing.T) { 169 | etcdv3c, coord1, conf1 := setupEtcd(t) 170 | defer etcdv3c.Close() 171 | conf2 := conf1.Copy() 172 | conf2.Name = "node2" 173 | coord2 := NewEtcdV3Coordinator(conf2, etcdv3c) 174 | 175 | cli := NewClient(conf1.Namespace, etcdv3c) 176 | 177 | // This handler exits as soon as it is stopped 178 | h1 := metafora.SimpleHandler(func(task metafora.Task, stop <-chan bool) bool { 179 | metafora.Debugf("H1 Starting %s", task.ID()) 180 | <-stop 181 | metafora.Debugf("H1 Stopping %s", task.ID()) 182 | return false // never done 183 | }) 184 | 185 | // Block forever on a single task 186 | stop2 := make(chan struct{}) 187 | stopr := make(chan chan struct{}, 1) 188 | stopr <- stop2 189 | h2 := metafora.SimpleHandler(func(task metafora.Task, stop <-chan bool) bool { 190 | metafora.Debugf("H2 Starting %s", task.ID()) 191 | blockchan, ok := <-stopr 192 | if ok { 193 | <-blockchan 194 | } 195 | <-stop 196 | metafora.Debugf("H2 Stopping %s", task.ID()) 197 | return false // never done 198 | }) 199 | 200 | filter := func(_ *FilterableValue) bool { return true } 201 | // Create two consumers 202 | b1 := NewFairBalancer(conf1, etcdv3c, filter) 203 | con1, err := metafora.NewConsumer(coord1, h1, b1) 204 | if err != nil { 205 | t.Fatal(err) 206 | } 207 | 208 | b2 := NewFairBalancer(conf2, etcdv3c, filter) 209 | con2, err := metafora.NewConsumer(coord2, h2, b2) 210 | if err != nil { 211 | t.Fatal(err) 212 | } 213 | 214 | // Start the first and let it claim a bunch of tasks 215 | go con1.Run() 216 | defer con1.Shutdown() 217 | _ = cli.SubmitTask(DefaultTaskFunc("t1", "")) 218 | _ = cli.SubmitTask(DefaultTaskFunc("t2", "")) 219 | _ = cli.SubmitTask(DefaultTaskFunc("t3", "")) 220 | _ = cli.SubmitTask(DefaultTaskFunc("t4", "")) 221 | _ = cli.SubmitTask(DefaultTaskFunc("t5", "")) 222 | _ = cli.SubmitTask(DefaultTaskFunc("t6", "")) 223 | 224 |
time.Sleep(1000 * time.Millisecond) 225 | 226 | if len(con1.Tasks()) != 6 { 227 | t.Fatalf("con1 should have claimed 6 tasks: %d", len(con1.Tasks())) 228 | } 229 | 230 | // Start the second consumer and force the 1st to rebalance 231 | go con2.Run() 232 | 233 | close(stopr) 234 | 235 | // Wait for node to start up and register 236 | time.Sleep(500 * time.Millisecond) 237 | 238 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance()) 239 | 240 | time.Sleep(2 * time.Second) 241 | 242 | c1Tasks := con1.Tasks() 243 | c2Tasks := con2.Tasks() 244 | if len(c1Tasks) != 4 || len(c2Tasks) != 2 { 245 | t.Fatalf("expected consumers to have 4|2 tasks: %d|%d", len(c1Tasks), len(c2Tasks)) 246 | } 247 | 248 | // Make sure that balancing the other node does nothing 249 | _ = cli.SubmitCommand("node2", metafora.CommandBalance()) 250 | 251 | time.Sleep(2 * time.Second) 252 | 253 | c1Tasks2 := con1.Tasks() 254 | c2Tasks2 := con2.Tasks() 255 | if len(c1Tasks2) != 4 || len(c2Tasks2) != 2 { 256 | t.Fatalf("expected consumers to have 4|2 tasks: %d|%d", len(c1Tasks2), len(c2Tasks2)) 257 | } 258 | for i := 0; i < 4; i++ { 259 | if c1Tasks[i] != c1Tasks2[i] { 260 | t.Errorf("task mismatch: %s != %s", c1Tasks[i], c1Tasks2[i]) 261 | } 262 | } 263 | for i := 0; i < 2; i++ { 264 | if c2Tasks[i] != c2Tasks2[i] { 265 | t.Errorf("task mismatch: %s != %s", c2Tasks[i], c2Tasks2[i]) 266 | } 267 | } 268 | 269 | // Second consumer should block on a single task forever 270 | // Rebalancing the first node should then cause it to pick up all but 271 | // one task 272 | c2stop := make(chan struct{}) 273 | go func() { 274 | con2.Shutdown() 275 | close(c2stop) 276 | }() 277 | 278 | time.Sleep(500 * time.Millisecond) 279 | 280 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance()) 281 | 282 | time.Sleep(2 * time.Second) 283 | 284 | c1Tasks3 := con1.Tasks() 285 | c2Tasks3 := con2.Tasks() 286 | if len(c1Tasks3) != 5 || len(c2Tasks3) != 1 { 287 | t.Fatalf("Expected consumers to have 5|1 tasks: %d|%d", len(c1Tasks3), len(c2Tasks3)) 288 | } 289 | 290 | // Now stop the blocking task, rebalance, and make sure the first node picked up the remaining task 291 | close(stop2) 292 | 293 | time.Sleep(500 * time.Millisecond) 294 | // Consumer 2 should stop now 295 | <-c2stop 296 | 297 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance()) 298 | 299 | time.Sleep(2 * time.Second) 300 | 301 | // con2 is out of the picture. con1 has all the tasks.
302 | c1Tasks4 := con1.Tasks() 303 | c2Tasks4 := con2.Tasks() 304 | if len(c1Tasks4) != 6 || len(c2Tasks4) != 0 { 305 | t.Fatalf("Expected consumers to have 6|0 tasks: %d|%d", len(c1Tasks4), len(c2Tasks4)) 306 | } 307 | } 308 | -------------------------------------------------------------------------------- /metcdv3/client.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "errors" 7 | "math/rand" 8 | "path" 9 | "strconv" 10 | "time" 11 | 12 | "github.com/lytics/metafora" 13 | etcdv3 "go.etcd.io/etcd/client/v3" 14 | ) 15 | 16 | var ( 17 | // ErrFailedSubmitTask is returned when a task could not be submitted, most likely because it already exists. 18 | ErrFailedSubmitTask = errors.New("metafora etcdv3 client: failed submit task") 19 | ErrLeaseDurationTooShort = errors.New("metafora etcd clientv3: lease duration too short") 20 | ErrKeepAliveClosedUnexpectedly = errors.New("metafora etcd clientv3: keep alive closed unexpectedly") 21 | ) 22 | 23 | var ( 24 | minLeaseDuration = 10 * time.Second 25 | ) 26 | 27 | // NewClient creates a new client using an etcd backend. 28 | func NewClient(namespace string, etcdv3c *etcdv3.Client) metafora.Client { 29 | return &mclient{ 30 | etcdv3c: etcdv3c, 31 | kvc: etcdv3.NewKV(etcdv3c), 32 | namespace: namespace, 33 | } 34 | } 35 | 36 | type keepAliveStats struct { 37 | success int 38 | failure int 39 | } 40 | 41 | // Type 'mclient' is an internal implementation of metafora.Client with an etcd backend. 42 | type mclient struct { 43 | etcdv3c *etcdv3.Client 44 | kvc etcdv3.KV 45 | namespace string 46 | } 47 | 48 | // nodesPath is the base path of nodes, represented as a directory in etcd. 49 | func (mc *mclient) nodesPath() string { 50 | return path.Join("/", mc.namespace, NodesPath) 51 | } 52 | 53 | // taskPath is the path to a particular taskId, represented as a file in etcd. 54 | func (mc *mclient) taskPath(taskID string) string { 55 | return path.Join("/", mc.namespace, TasksPath, taskID) 56 | } 57 | 58 | // cmdPath is the path to a particular node's commands, represented as a directory in etcd. 59 | func (mc *mclient) cmdPath(node string) string { 60 | return path.Join("/", mc.namespace, NodesPath, node, "commands") 61 | } 62 | 63 | // SubmitTask creates a new task in etcd 64 | func (mc *mclient) SubmitTask(task metafora.Task) error { 65 | c := context.Background() 66 | fullpath := path.Join(mc.taskPath(task.ID()), PropsPath) 67 | buf, err := json.Marshal(task) 68 | if err != nil { 69 | return err 70 | } 71 | txnRes, err := mc.kvc.Txn(c). 72 | If(etcdv3.Compare(etcdv3.Version(fullpath), "=", 0)). 73 | // Should we create both of these? 74 | Then(etcdv3.OpPut(fullpath, string(buf)), etcdv3.OpPut(mc.taskPath(task.ID()), "")). 75 | Commit() 76 | if err != nil { 77 | return err 78 | } 79 | if !txnRes.Succeeded { 80 | return ErrFailedSubmitTask 81 | } 82 | metafora.Debugf("task %s submitted: %s", task.ID(), fullpath) 83 | return nil 84 | } 85 | 86 | // DeleteTask deletes a task and all of its subkeys. 87 | func (mc *mclient) DeleteTask(taskID string) error { 88 | c := context.Background() 89 | fullpath := mc.taskPath(taskID) 90 | _, err := mc.kvc.Delete(c, fullpath, etcdv3.WithPrefix()) 91 | metafora.Debugf("task %s deleted: %s", taskID, fullpath) 92 | return err 93 | } 94 | 95 | // SubmitCommand creates a new command for a particular nodeId, the 96 | // command has a random name and is added to the particular nodeId 97 | // directory in etcd.
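//
// A minimal usage sketch (illustrative only; the namespace and node name are
// placeholders and error handling is elided):
//
//	cli := NewClient("/mycluster", etcdv3c)
//	err := cli.SubmitCommand("node1", metafora.CommandBalance())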
98 | func (mc *mclient) SubmitCommand(node string, command metafora.Command) error { 99 | cmdPath := mc.cmdPath(node) 100 | body, err := command.Marshal() 101 | if err != nil { 102 | // This is either a bug in metafora or someone implemented their own 103 | // command incorrectly. 104 | return err 105 | } 106 | key := path.Join(cmdPath, strconv.FormatUint(rand.Uint64(), 10)) 107 | if _, err := mc.kvc.Put(context.Background(), key, string(body)); err != nil { 108 | metafora.Errorf("Error submitting command: %s to node: %s", command, node) 109 | return err 110 | } 111 | metafora.Debugf("Submitted command: %s to node: %s", string(body), node) 112 | return nil 113 | } 114 | 115 | // Nodes fetches the currently registered nodes. A non-nil error means that some 116 | // error occurred trying to get the node list. The node list may be nil if no 117 | // nodes are registered. 118 | func (mc *mclient) Nodes() ([]string, error) { 119 | res, err := mc.kvc.Get(context.Background(), mc.nodesPath(), etcdv3.WithPrefix()) 120 | if err != nil { 121 | return nil, err 122 | } 123 | if res != nil && len(res.Kvs) > 0 { 124 | nodes := make([]string, len(res.Kvs)) 125 | for i, kv := range res.Kvs { 126 | // Keys are plain paths (not JSON), so derive the node 127 | // name from the final path element of the key. 128 | nodes[i] = path.Base(string(kv.Key)) 129 | } 130 | return nodes, nil 131 | } 132 | 133 | return nil, nil 134 | } 135 | 136 | func (mc *mclient) Tasks() ([]string, error) { 137 | res, err := mc.kvc.Get( 138 | context.Background(), 139 | path.Join("/", mc.namespace, TasksPath), 140 | etcdv3.WithPrefix()) 141 | if err != nil { 142 | return nil, err 143 | } 144 | 145 | var tasks []string 146 | for _, kv := range res.Kvs { 147 | key := string(kv.Key) 148 | if base := path.Base(key); base == OwnerPath || base == MetadataPath || base == PropsPath { 149 | continue 150 | } else { 151 | tasks = append(tasks, base) 152 | } 153 | } 154 | return tasks, nil 155 | } 156 | -------------------------------------------------------------------------------- /metcdv3/client_test.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | // NOTES 4 | // 5 | // These tests are in reality integration tests which require that 6 | // etcd is running on the test system and its peers are found 7 | // in the ENV variable ETCD_PEERS. The tests do not clean 8 | // out data and require a fresh set of etcd instances for 9 | // each run. You can consider this a known bug which 10 | // will be fixed in a future release. 11 | // 12 | // See: https://github.com/lytics/metafora/issues/31 13 | 14 | import ( 15 | "context" 16 | "testing" 17 | 18 | "github.com/lytics/metafora" 19 | "github.com/lytics/metafora/metcdv3/testutil" 20 | etcdv3 "go.etcd.io/etcd/client/v3" 21 | ) 22 | 23 | const ( 24 | Namespace = "test" 25 | NodesDir = "/test/nodes" 26 | Node1 = "node1" 27 | Node1Path = "/test/nodes/node1" 28 | ) 29 | 30 | // TestNodes tests that client.Nodes() returns the metafora nodes 31 | // registered in etcd.
32 | func TestNodes(t *testing.T) { 33 | c := context.Background() 34 | eclient := testutil.NewEtcdV3Client(t) 35 | kvc := etcdv3.NewKV(eclient) 36 | _, _ = eclient.Delete(c, Node1Path, etcdv3.WithPrefix()) 37 | 38 | mclient := NewClient(Namespace, eclient) 39 | 40 | if _, err := kvc.Put(c, Node1Path, "0"); err != nil { 41 | t.Fatalf("AddChild %v returned error: %v", NodesDir, err) 42 | } 43 | 44 | if nodes, err := mclient.Nodes(); err != nil { 45 | t.Fatalf("Nodes returned error: %v", err) 46 | } else { 47 | for i, n := range nodes { 48 | t.Logf("%v -> %v", i, n) 49 | } 50 | } 51 | } 52 | 53 | // TestSubmitTask tests that client.SubmitTask(...) adds a task to 54 | // the proper path in etcd, and that the same task id cannot be 55 | // submitted more than once. 56 | func TestSubmitTask(t *testing.T) { 57 | client := testutil.NewEtcdV3Client(t) 58 | mclient := NewClient(Namespace, client) 59 | 60 | task := DefaultTaskFunc("testid1", "") 61 | 62 | if err := mclient.DeleteTask(task.ID()); err != nil { 63 | t.Logf("DeleteTask returned an error, which may be ok. Error:%v", err) 64 | } 65 | 66 | if err := mclient.SubmitTask(task); err != nil { 67 | t.Fatalf("Submit task failed on initial submission, error: %v", err) 68 | } 69 | 70 | if err := mclient.SubmitTask(task); err == nil { 71 | t.Fatalf("Submit task did not fail, but should have, when using an existing task id") 72 | } 73 | } 74 | 75 | // TestSubmitCommand tests that client.SubmitCommand(...) adds a command 76 | // to the proper node path in etcd, and that it can be read back. 77 | func TestSubmitCommand(t *testing.T) { 78 | eclient := testutil.NewEtcdV3Client(t) 79 | kvc := etcdv3.NewKV(eclient) 80 | mclient := NewClient(Namespace, eclient) 81 | 82 | if err := mclient.SubmitCommand(Node1, metafora.CommandFreeze()); err != nil { 83 | t.Fatalf("Unable to submit command. error:%v", err) 84 | } 85 | 86 | if res, err := kvc.Get(context.Background(), NodesDir, etcdv3.WithPrefix()); err != nil { 87 | t.Fatalf("Get on path %v returned error: %v", NodesDir, err) 88 | } else if res.Count == 0 { 89 | t.Fatalf("Get on path %v returned nil for child nodes", NodesDir) 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /metcdv3/commander.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "errors" 7 | "fmt" 8 | "path" 9 | "sync" 10 | "time" 11 | 12 | "github.com/lytics/metafora" 13 | "github.com/lytics/metafora/statemachine" 14 | etcdv3 "go.etcd.io/etcd/client/v3" 15 | ) 16 | 17 | var ( 18 | ErrWatchClosedUnexpectedly = errors.New("metafora: watch closed unexpectedly") 19 | ) 20 | 21 | type cmdr struct { 22 | etcdv3c *etcdv3.Client 23 | kvc etcdv3.KV 24 | taskspath string 25 | } 26 | 27 | func NewCommander(namespace string, c *etcdv3.Client) statemachine.Commander { 28 | return &cmdr{ 29 | taskspath: path.Join("/", namespace, TasksPath), 30 | etcdv3c: c, 31 | kvc: etcdv3.NewKV(c), 32 | } 33 | } 34 | 35 | // Send command to a task. Overwrites existing commands.
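//
// An illustrative sketch (the task ID is a placeholder and the error is
// elided, as in this package's tests):
//
//	cmdr := NewCommander("/mycluster", etcdv3c)
//	_ = cmdr.Send("task-1", statemachine.PauseMessage())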
36 | func (c *cmdr) Send(taskID string, m *statemachine.Message) error { 37 | buf, err := json.Marshal(m) 38 | if err != nil { 39 | return err 40 | } 41 | 42 | cmdPath := path.Join(c.taskspath, taskID, CommandsPath) 43 | _, err = c.kvc.Put(context.Background(), cmdPath, string(buf)) 44 | return err 45 | } 46 | 47 | type cmdListener struct { 48 | etcdv3c *etcdv3.Client 49 | kvc etcdv3.KV 50 | name string 51 | taskcmdpath string 52 | 53 | commands chan *statemachine.Message 54 | 55 | mu *sync.Mutex 56 | stop chan bool 57 | } 58 | 59 | // NewCommandListener makes a statemachine.CommandListener implementation 60 | // backed by etcd. The namespace should be the same as the coordinator's, as 61 | // commands use a separate path within a namespace from tasks and nodes. 62 | func NewCommandListener(conf *Config, task metafora.Task, c *etcdv3.Client) statemachine.CommandListener { 63 | taskcmdpath := path.Join("/", conf.Namespace, TasksPath, task.ID(), CommandsPath) 64 | cl := &cmdListener{ 65 | etcdv3c: c, 66 | name: conf.Name, 67 | taskcmdpath: taskcmdpath, 68 | kvc: etcdv3.NewKV(c), 69 | commands: make(chan *statemachine.Message), 70 | mu: &sync.Mutex{}, 71 | stop: make(chan bool), 72 | } 73 | ctxt := context.Background() 74 | go cl.watch(ctxt, taskcmdpath) 75 | return cl 76 | } 77 | 78 | func (c *cmdListener) Receive() <-chan *statemachine.Message { 79 | return c.commands 80 | } 81 | 82 | func (c *cmdListener) ownerValueString() string { 83 | p, err := json.Marshal(&ownerValue{Node: c.name}) 84 | if err != nil { 85 | panic(fmt.Sprintf("command listener: error marshalling node body: %v", err)) 86 | } 87 | return string(p) 88 | } 89 | 90 | func (c *cmdListener) Stop() { 91 | c.mu.Lock() 92 | defer c.mu.Unlock() 93 | select { 94 | case <-c.stop: 95 | default: 96 | close(c.stop) 97 | } 98 | } 99 | 100 | func (cl *cmdListener) watch(c context.Context, prefix string) { 101 | getRes, err := cl.kvc.Get(c, prefix, etcdv3.WithPrefix()) 102 | if err != nil { 103 | metafora.Errorf("Error GETting %s - sending error to stateful handler: %v", prefix, err) 104 | select { 105 | case <-c.Done(): 106 | // TODO Do I need the stop channel? 107 | case <-cl.stop: 108 | case cl.commands <- statemachine.ErrorMessage(err): 109 | } 110 | return 111 | } 112 | 113 | // Create a message from an event. 114 | createMessage := func(key string, value []byte) (*statemachine.Message, error) { 115 | msg := &statemachine.Message{} 116 | if err := json.Unmarshal(value, msg); err != nil { 117 | metafora.Errorf("Error unmarshalling command from %s - sending error to stateful handler: %v", key, err) 118 | return nil, err 119 | } 120 | 121 | txnRes, err := cl.kvc.Txn(c). 122 | If(etcdv3.Compare(etcdv3.Value(path.Join(path.Dir(key), OwnerPath)), "=", cl.ownerValueString())). 123 | Then(etcdv3.OpDelete(key, etcdv3.WithPrefix())). 124 | Commit() 125 | if err != nil { 126 | metafora.Errorf("Error deleting command %s: %s - sending error to stateful handler: %v", key, msg, err) 127 | return nil, err 128 | } 129 | if !txnRes.Succeeded { 130 | metafora.Infof("Received successive commands; attempting to retrieve the latest") 131 | return nil, nil 132 | } 133 | return msg, nil 134 | } 135 | // Write a change or exit the watcher.
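// (put blocks until either the consumer reads the message or the watch's
// context is canceled. The GET above also pins a revision: the loop below
// first drains commands written before this listener started, then the
// etcd watch resumes from that revision so no command is missed.)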
136 | put := func(msg *statemachine.Message) { 137 | select { 138 | case <-c.Done(): 139 | case cl.commands <- msg: 140 | } 141 | } 142 | for _, kv := range getRes.Kvs { 143 | key := string(kv.Key) 144 | if path.Base(key) == MetadataPath { 145 | continue 146 | } 147 | value := kv.Value 148 | msg, err := createMessage(key, value) 149 | if err != nil { 150 | msg = statemachine.ErrorMessage(err) 151 | } 152 | if msg != nil { 153 | put(msg) 154 | } 155 | } 156 | 157 | putTerminalError := func(msg *statemachine.Message) { 158 | go func() { 159 | select { 160 | case <-c.Done(): 161 | // TODO Do I need the stop channel? 162 | case <-cl.stop: 163 | case <-time.After(10 * time.Minute): 164 | metafora.Warnf("metafora command listener timed out putting message on channel: %v", msg) 165 | case cl.commands <- msg: 166 | } 167 | }() 168 | } 169 | 170 | // Watch deltas in etcd, with the given prefix, starting 171 | // at the revision of the get call above. 172 | deltas := cl.etcdv3c.Watch(c, prefix, etcdv3.WithPrefix(), etcdv3.WithRev(getRes.Header.Revision+1), etcdv3.WithFilterDelete()) 173 | for { 174 | select { 175 | case <-c.Done(): 176 | return 177 | case <-cl.stop: 178 | return 179 | case delta, open := <-deltas: 180 | if !open { 181 | putTerminalError(statemachine.ErrorMessage(ErrWatchClosedUnexpectedly)) 182 | return 183 | } 184 | if delta.Err() != nil { 185 | putTerminalError(statemachine.ErrorMessage(delta.Err())) 186 | return 187 | } 188 | for _, event := range delta.Events { 189 | msg, err := createMessage(string(event.Kv.Key), event.Kv.Value) 190 | if err != nil { 191 | msg = statemachine.ErrorMessage(err) 192 | } 193 | if msg != nil { 194 | put(msg) 195 | } 196 | } 197 | } 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /metcdv3/commander_test.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "path" 7 | "testing" 8 | "time" 9 | 10 | "github.com/lytics/metafora" 11 | "github.com/lytics/metafora/statemachine" 12 | etcdv3 "go.etcd.io/etcd/client/v3" 13 | ) 14 | 15 | func TestCommandListener(t *testing.T) { 16 | t.Parallel() 17 | 18 | etcdv3c, _, conf := setupEtcd(t) 19 | kvc := etcdv3.NewKV(etcdv3c) 20 | 21 | namespace := "/cltest" 22 | conf.Namespace = namespace 23 | _, _ = kvc.Delete(context.Background(), namespace, etcdv3.WithPrefix()) 24 | 25 | task := metafora.NewTask("testtask") 26 | _, err := kvc.Put(context.Background(), path.Join(conf.Namespace, TasksPath, task.ID(), OwnerPath), fmt.Sprintf(`{"node":"%s"}`, conf.Name)) 27 | if err != nil { 28 | t.Fatalf("Error creating fake claim: %v", err) 29 | } 30 | 31 | cmdr := NewCommander(namespace, etcdv3c) 32 | 33 | // Only the last command should be received once the listener is started 34 | _ = cmdr.Send(task.ID(), statemachine.PauseMessage()) 35 | _ = cmdr.Send(task.ID(), statemachine.KillMessage()) 36 | 37 | cl := NewCommandListener(conf, task, etcdv3c) 38 | defer cl.Stop() 39 | 40 | // Ensure last command was received 41 | select { 42 | case cmd := <-cl.Receive(): 43 | if cmd.Code != statemachine.Kill { 44 | t.Fatalf("Expected Kill message, received %v", cmd) 45 | } 46 | case <-time.After(10 * time.Second): 47 | t.Fatal("CommandListener took too long to receive message") 48 | } 49 | 50 | // Ensure only one command was received 51 | select { 52 | case cmd := <-cl.Receive(): 53 | t.Fatalf("Unexpected command received: %v", cmd) 54 | case <-time.After(300 * time.Millisecond): 55 | // Ok!
56 | } 57 | 58 | cl.Stop() 59 | 60 | // Stop doesn't block until watching loop exits, so wait briefly 61 | time.Sleep(10 * time.Millisecond) 62 | 63 | // Ensure receiving after Stopping never succeeds 64 | _ = cmdr.Send(task.ID(), statemachine.RunMessage()) 65 | select { 66 | case cmd := <-cl.Receive(): 67 | t.Fatalf("Unexpected command received: %v", cmd) 68 | case <-time.After(300 * time.Millisecond): 69 | // Ok 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /metcdv3/conf.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "fmt" 5 | "path" 6 | "strings" 7 | ) 8 | 9 | type Config struct { 10 | // Namespace is the key prefix to allow for multitenant use of etcd. 11 | // 12 | // Namespaces must start with a / (added by NewConfig if needed). 13 | Namespace string 14 | 15 | // Name of this Metafora consumer. Only one instance of a Name is allowed to 16 | // run in a Namespace at a time, so if you set the Name to hostname you can 17 | // effectively limit Metafora to one process per server. 18 | Name string 19 | 20 | // NewTaskFunc is the function called to unmarshal tasks from etcd into a 21 | // custom struct. The struct must implement the metafora.Task interface. 22 | // 23 | // If nil it is set to DefaultTaskFunc 24 | NewTaskFunc TaskFunc 25 | } 26 | 27 | // NewConfig creates a Config with the required fields and uses defaults for 28 | // the others. 29 | // 30 | // Panics on empty values. 31 | func NewConfig(name, namespace string) *Config { 32 | if namespace == "" || name == "" { 33 | panic("invalid etcd config") 34 | } 35 | 36 | namespace = path.Join("/", strings.Trim(namespace, "/ ")) 37 | return &Config{ 38 | Name: name, 39 | Namespace: namespace, 40 | NewTaskFunc: DefaultTaskFunc, 41 | } 42 | } 43 | 44 | // Copy returns a shallow copy of this config. 45 | func (c *Config) Copy() *Config { 46 | return &Config{ 47 | Name: c.Name, 48 | Namespace: c.Namespace, 49 | NewTaskFunc: c.NewTaskFunc, 50 | } 51 | } 52 | 53 | func (c *Config) String() string { 54 | return fmt.Sprintf("etcd:%s/%s", c.Namespace, c.Name) 55 | } 56 | -------------------------------------------------------------------------------- /metcdv3/const.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | const ( 4 | TasksPath = "tasks" 5 | NodesPath = "nodes" 6 | CommandsPath = "commands" 7 | // Is this true for etcdv3? 8 | MetadataPath = "_metafora" // _{KEYs} are hidden files, so this will not trigger our watches 9 | OwnerPath = "owner" 10 | PropsPath = "props" 11 | 12 | //Etcd Error codes are passed directly through go-etcd from the http response, 13 | //So to find the error codes use this ref: 14 | // https://go.etcd.io/etcd/blob/master/error/error.go#L67 15 | EcodeKeyNotFound = 100 16 | EcodeCompareFailed = 101 17 | EcodeNodeExist = 105 18 | EcodeExpiredIndex = 401 // The event in requested index is outdated and cleared 19 | ) 20 | -------------------------------------------------------------------------------- /metcdv3/coordinator_test.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "context" 5 | "path" 6 | "strings" 7 | "testing" 8 | "time" 9 | 10 | "github.com/lytics/metafora" 11 | 12 | etcdv3 "go.etcd.io/etcd/client/v3" 13 | ) 14 | 15 | /* 16 | Running the Integration Test: 17 | 18 | go test -v ./... 
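These tests skip themselves unless etcd is reachable; per testutil in this
package, export ETCDTESTS (any non-empty value) and, optionally, ETCD_PEERS:

ETCDTESTS=1 ETCD_PEERS=127.0.0.1:2379 go test -v ./...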
19 | */ 20 | 21 | func TestCoordinatorFirstNodeJoiner(t *testing.T) { 22 | t.Parallel() 23 | etcdv3c, coordinator1, conf := setupEtcd(t) 24 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil { 25 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 26 | } 27 | defer coordinator1.Close() 28 | kvc := etcdv3.NewKV(etcdv3c) 29 | 30 | tpath := path.Join(conf.Namespace, TasksPath) 31 | _, err := kvc.Get(context.Background(), tpath) 32 | if err != nil && strings.Contains(err.Error(), "Key not found") { 33 | t.Fatalf("The tasks path wasn't created when the first node joined: %s", tpath) 34 | } else if err != nil { 35 | t.Fatalf("Unknown error trying to test: err: %s", err.Error()) 36 | } 37 | 38 | //TODO test for node path too... 39 | } 40 | 41 | // Ensure that Watch() picks up new tasks and returns them. 42 | func TestCoordinatorTC1(t *testing.T) { 43 | t.Parallel() 44 | etcdv3c, coordinator1, conf := setupEtcd(t) 45 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil { 46 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 47 | } 48 | defer coordinator1.Close() 49 | kvc := etcdv3.NewKV(etcdv3c) 50 | 51 | tasks := make(chan metafora.Task) 52 | task001 := &task{id: "test-task"} 53 | taskPath := path.Join(conf.Namespace, TasksPath, task001.ID()) 54 | errc := make(chan error) 55 | 56 | go func() { 57 | //Watch blocks, so we need to test it in its own go routine. 58 | errc <- coordinator1.Watch(tasks) 59 | }() 60 | 61 | _, _ = kvc.Put(context.Background(), taskPath, "5") 62 | 63 | select { 64 | case task := <-tasks: 65 | if task.ID() != task001.ID() { 66 | t.Fatalf("coordinator1.Watch() test failed: We received the incorrect taskId. Got [%s] Expected[%s]", task, task001) 67 | } 68 | case <-time.After(time.Second * 5): 69 | t.Fatalf("coordinator1.Watch() test failed: The testcase timed out after 5 seconds.") 70 | } 71 | 72 | coordinator1.Close() 73 | err := <-errc 74 | if err != nil { 75 | t.Fatalf("coordinator1.Watch() returned an err: %v", err) 76 | } 77 | } 78 | 79 | // Submit a task while a coordinator is actively watching for tasks. 80 | func TestCoordinatorTC2(t *testing.T) { 81 | t.Parallel() 82 | etcdv3c, coordinator1, conf := setupEtcd(t) 83 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil { 84 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 85 | } 86 | defer coordinator1.Close() 87 | 88 | testTasks := []string{"test1", "test2", "test3"} 89 | 90 | mclient := NewClient(conf.Namespace, etcdv3c) 91 | 92 | tasks := make(chan metafora.Task) 93 | errc := make(chan error) 94 | go func() { 95 | //Watch blocks, so we need to test it in its own go routine. 96 | errc <- coordinator1.Watch(tasks) 97 | }() 98 | 99 | for _, taskid := range testTasks { 100 | err := mclient.SubmitTask(DefaultTaskFunc(taskid, "")) 101 | if err != nil { 102 | t.Fatalf("Error submitting a task to metafora via the client. Error:\n%v", err) 103 | } 104 | recvd := <-tasks 105 | if recvd.ID() != taskid { 106 | t.Fatalf("%s != %s - received an unexpected task", recvd.ID(), taskid) 107 | } 108 | if ok := coordinator1.Claim(recvd); !ok { 109 | t.Fatal("coordinator1.Claim() unable to claim the task") 110 | } 111 | } 112 | 113 | coordinator1.Close() 114 | err := <-errc 115 | if err != nil { 116 | t.Fatalf("coordinator1.Watch() returned an err: %v", err) 117 | } 118 | } 119 | 120 | // Start two coordinators to ensure that fighting over claims results in only 121 | // one coordinator winning (and the other not crashing).
122 | func TestCoordinatorTC3(t *testing.T) { 123 | t.Parallel() 124 | etcdv3c, coordinator1, conf1 := setupEtcd(t) 125 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil { 126 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 127 | } 128 | defer coordinator1.Close() 129 | conf2 := conf1.Copy() 130 | conf2.Name = "node2" 131 | coordinator2 := NewEtcdV3Coordinator(conf2, etcdv3c) 132 | if err := coordinator2.Init(newCtx(t, "coordinator2")); err != nil { 133 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 134 | } 135 | defer coordinator2.Close() 136 | 137 | testTasks := []string{"test-claiming-task0001", "test-claiming-task0002", "test-claiming-task0003"} 138 | 139 | mclient := NewClient(conf1.Namespace, etcdv3c) 140 | 141 | // Start the watchers 142 | errc := make(chan error, 2) 143 | c1tasks := make(chan metafora.Task) 144 | c2tasks := make(chan metafora.Task) 145 | go func() { 146 | errc <- coordinator1.Watch(c1tasks) 147 | }() 148 | go func() { 149 | errc <- coordinator2.Watch(c2tasks) 150 | }() 151 | 152 | // Submit the tasks 153 | for _, tid := range testTasks { 154 | err := mclient.SubmitTask(DefaultTaskFunc(tid, "")) 155 | if err != nil { 156 | t.Fatalf("Error submitting task=%q to metafora via the client. Error:\n%v", tid, err) 157 | } 158 | } 159 | 160 | //XXX This assumes tasks are sent by watchers in the order they were 161 | // submitted to etcd which, while /possible/ to guarantee, isn't a guarantee 162 | // we're interested in making. 163 | // We only want to guarantee that exactly one coordinator can claim a task. 164 | c1t := <-c1tasks 165 | c2t := <-c2tasks 166 | if c1t.ID() != c2t.ID() { 167 | t.Logf("Watchers didn't receive the same task %s != %s. It's fine; watch order isn't guaranteed", c1t, c2t) 168 | } 169 | 170 | // Make sure c1 can claim and c2 cannot 171 | if ok := coordinator1.Claim(c1t); !ok { 172 | t.Fatalf("coordinator1.Claim() unable to claim the task=%q", c1t) 173 | } 174 | if ok := coordinator2.Claim(c1t); ok { 175 | t.Fatalf("coordinator2.Claim() succeeded for task=%q when it shouldn't have!", c2t) 176 | } 177 | 178 | // Make sure coordinators close down properly and quickly 179 | coordinator1.Close() 180 | if err := <-errc; err != nil { 181 | t.Errorf("Error shutting down coordinator1: %v", err) 182 | } 183 | coordinator2.Close() 184 | if err := <-errc; err != nil { 185 | t.Errorf("Error shutting down coordinator2: %v", err) 186 | } 187 | } 188 | 189 | // Submit a task before any coordinators are active. Then start a coordinator to 190 | // ensure the tasks are picked up by the new coordinator 191 | // 192 | // Then call coordinator.Release() on the task to make sure a coordinator picks it 193 | // up again. 194 | func TestCoordinatorTC4(t *testing.T) { 195 | t.Parallel() 196 | etcdv3c, coordinator1, conf1 := setupEtcd(t) 197 | 198 | task := "testtask4" 199 | 200 | mclient := NewClient(conf1.Namespace, etcdv3c) 201 | 202 | if err := mclient.SubmitTask(DefaultTaskFunc(task, "")); err != nil { 203 | t.Fatalf("Error submitting a task to metafora via the client. Error:\n%v", err) 204 | } 205 | 206 | // Don't start up the coordinator until after the metafora client has submitted work.
207 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil { 208 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 209 | } 210 | defer coordinator1.Close() 211 | 212 | errc := make(chan error) 213 | c1tasks := make(chan metafora.Task) 214 | go func() { 215 | errc <- coordinator1.Watch(c1tasks) 216 | }() 217 | 218 | tid := <-c1tasks 219 | 220 | if ok := coordinator1.Claim(tid); !ok { 221 | t.Fatal("coordinator1.Claim() unable to claim the task") 222 | } 223 | 224 | // Startup a second 225 | conf2 := conf1.Copy() 226 | conf2.Name = "node2" 227 | coordinator2 := NewEtcdV3Coordinator(conf2, etcdv3c) 228 | if err := coordinator2.Init(newCtx(t, "coordinator2")); err != nil { 229 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 230 | } 231 | defer coordinator2.Close() 232 | 233 | c2tasks := make(chan metafora.Task) 234 | go func() { 235 | errc <- coordinator2.Watch(c2tasks) 236 | }() 237 | 238 | // coordinator 2 shouldn't see anything yet 239 | select { 240 | case <-c2tasks: 241 | t.Fatal("coordinator2.Watch() returned a task when there are none to claim!") 242 | case <-time.After(100 * time.Millisecond): 243 | } 244 | 245 | // Now release the task from coordinator1 and claim it with coordinator2 246 | coordinator1.Release(tid) 247 | tid = <-c2tasks 248 | if ok := coordinator2.Claim(tid); !ok { 249 | t.Fatalf("coordinator2.Claim() should have succeeded on released task=%q", tid) 250 | } 251 | 252 | coordinator1.Close() 253 | coordinator2.Close() 254 | for i := 0; i < 2; i++ { 255 | if err := <-errc; err != nil { 256 | t.Errorf("coordinator returned an error after closing: %v", err) 257 | } 258 | } 259 | } 260 | 261 | // TestNodeCleanup ensures the coordinator properly cleans up its node entry 262 | // upon exit. 263 | func TestNodeCleanup(t *testing.T) { 264 | t.Parallel() 265 | etcdv3c, c1, conf1 := setupEtcd(t) 266 | if err := c1.Init(newCtx(t, "coordinator1")); err != nil { 267 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 268 | } 269 | conf2 := conf1.Copy() 270 | conf2.Name = "node2" 271 | c2 := NewEtcdV3Coordinator(conf2, etcdv3c) 272 | kvc := etcdv3.NewKV(etcdv3c) 273 | if err := c2.Init(newCtx(t, "coordinator2")); err != nil { 274 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 275 | } 276 | defer c1.Close() 277 | defer c2.Close() 278 | 279 | // Make sure node directories were created 280 | c1nodep := path.Join(conf1.Namespace, NodesPath, conf1.Name, MetadataPath) 281 | c := context.Background() 282 | resp, err := kvc.Get(c, c1nodep) 283 | if err != nil { 284 | t.Fatalf("Error retrieving node key from etcd: %v", err) 285 | } 286 | if resp.Count == 0 { 287 | t.Error(c1nodep + " isn't a directory!") 288 | } 289 | 290 | c2nodep := path.Join(conf2.Namespace, NodesPath, conf2.Name, MetadataPath) 291 | resp, err = kvc.Get(c, c2nodep) 292 | if err != nil { 293 | t.Fatalf("Error retrieving node key from etcd: %v", err) 294 | } 295 | if resp.Count == 0 { 296 | t.Error(c2nodep + " isn't a directory!") 297 | } 298 | 299 | // Shutdown one and make sure its node directory is gone 300 | c1.Close() 301 | 302 | resp, err = kvc.Get(c, c1nodep) 303 | if err != nil { 304 | t.Errorf("Unexpected error %T retrieving node key from etcd: %v", err, err) 305 | } 306 | if resp.Count != 0 { 307 | t.Errorf("Expected Not Found error, but directory still exists!") 308 | } 309 | 310 | // Make sure c2 is untouched 311 | resp, err = kvc.Get(c, c2nodep) 312 | if err != nil { 313 | t.Fatalf("Error retrieving node key from etcd: %v", err) 314 | } 315
| if resp.Count == 0 { 316 | t.Error(c2nodep + " isn't a directory!") 317 | } 318 | } 319 | 320 | // TestExpiration ensures that expired claims get reclaimed properly. 321 | func TestExpiration(t *testing.T) { 322 | t.Parallel() 323 | etcdv3c, coord, conf := setupEtcd(t) 324 | kvc := etcdv3.NewKV(etcdv3c) 325 | claims := make(chan int, 10) 326 | hf := metafora.HandlerFunc(metafora.SimpleHandler(func(_ metafora.Task, stop <-chan bool) bool { 327 | claims <- 1 328 | <-stop 329 | return true 330 | })) 331 | consumer, err := metafora.NewConsumer(coord, hf, metafora.DumbBalancer) 332 | if err != nil { 333 | t.Fatalf("Error creating consumer: %+v", err) 334 | } 335 | 336 | _, err = kvc.Put(context.Background(), path.Join(conf.Namespace, TasksPath, "abc", OwnerPath), `{"node":"--"}`) 337 | if err != nil { 338 | t.Fatalf("Error creating fake claim: %v", err) 339 | } 340 | _, err = kvc.Put(context.Background(), path.Join(conf.Namespace, TasksPath, "abc"), "") 341 | if err != nil { 342 | t.Fatalf("Error creating fake task: %v", err) 343 | } 344 | _, err = kvc.Delete(context.Background(), path.Join(conf.Namespace, TasksPath, "abc", OwnerPath)) 345 | if err != nil { 346 | t.Fatalf("Error deleting fake claim: %v", err) 347 | } 348 | 349 | defer consumer.Shutdown() 350 | go consumer.Run() 351 | 352 | // Wait for claim to expire and coordinator to pick up task 353 | select { 354 | case <-claims: 355 | // Task claimed! 356 | case <-time.After(5 * time.Second): 357 | t.Fatal("Task not claimed long after it should have been.") 358 | } 359 | 360 | tasks := consumer.Tasks() 361 | if len(tasks) != 1 { 362 | t.Fatalf("Expected 1 task to be claimed but found: %v", tasks) 363 | } 364 | } 365 | -------------------------------------------------------------------------------- /metcdv3/doc.go: -------------------------------------------------------------------------------- 1 | // Package metcdv3 contains implementations of all Metafora interfaces using 2 | // etcd as the broker/backing store. 3 | // 4 | // See https://github.com/lytics/metafora/Documentation/etcdv3.md for details. 5 | package metcdv3 6 | -------------------------------------------------------------------------------- /metcdv3/helpers_test.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | "os" 8 | "sync/atomic" 9 | "testing" 10 | 11 | "github.com/lytics/metafora" 12 | "github.com/lytics/metafora/metcdv3/testutil" 13 | 14 | etcdv3 "go.etcd.io/etcd/client/v3" 15 | ) 16 | 17 | func init() { 18 | metafora.SetLogger(log.New(os.Stderr, "", log.Lmicroseconds|log.Lshortfile)) 19 | //metafora.SetLogLevel(metafora.LogLevelDebug) 20 | } 21 | 22 | var testcounter uint64 23 | 24 | // setupEtcd should be used for all etcd integration tests. 
It handles the following tasks: 25 | // - Create and return an etcd client 26 | // - Create and return an initial etcd coordinator 27 | // - Clear the test namespace in etcd 28 | func setupEtcd(t *testing.T) (*etcdv3.Client, *EtcdV3Coordinator, *Config) { 29 | c := context.Background() 30 | client := testutil.NewEtcdV3Client(t) 31 | kvc := etcdv3.NewKV(client) 32 | n := atomic.AddUint64(&testcounter, 1) 33 | ns := fmt.Sprintf("/metaforatests-%d", n) 34 | _, err := kvc.Delete(c, ns, etcdv3.WithPrefix()) 35 | if err != nil { 36 | t.Errorf("failed to clean up namespace in etcd") 37 | } 38 | conf := NewConfig("testclient", ns) 39 | coord := NewEtcdV3Coordinator(conf, client) 40 | return client, coord, conf 41 | } 42 | 43 | type testLogger struct { 44 | prefix string 45 | *testing.T 46 | } 47 | 48 | func (l testLogger) Log(lvl int, m string, v ...interface{}) { 49 | l.T.Logf("%s:[%d] %s", l.prefix, lvl, fmt.Sprintf(m, v...)) 50 | } 51 | 52 | type testCoordCtx struct { 53 | testLogger 54 | lost chan string 55 | } 56 | 57 | func newCtx(t *testing.T, prefix string) *testCoordCtx { 58 | return &testCoordCtx{ 59 | testLogger: testLogger{prefix: prefix, T: t}, 60 | lost: make(chan string, 10), 61 | } 62 | } 63 | 64 | func (t *testCoordCtx) Lost(task metafora.Task) { 65 | t.Log(4, "Lost(%s)", task.ID()) 66 | t.lost <- task.ID() 67 | } 68 | -------------------------------------------------------------------------------- /metcdv3/integration_test.go: -------------------------------------------------------------------------------- 1 | package metcdv3_test 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "testing" 7 | "time" 8 | 9 | "github.com/lytics/metafora" 10 | "github.com/lytics/metafora/metcdv3" 11 | "github.com/lytics/metafora/metcdv3/testutil" 12 | "github.com/lytics/metafora/statemachine" 13 | etcdv3 "go.etcd.io/etcd/client/v3" 14 | ) 15 | 16 | // TestSleepTest is an integration test of the statemachine's sleep state running on m_etcd's components.
17 | func TestSleepTest(t *testing.T) { 18 | etcdv3c := testutil.NewEtcdV3Client(t) 19 | kvc := etcdv3.NewKV(etcdv3c) 20 | t.Parallel() 21 | const namespace = "/sleeptest-metafora" 22 | const sleepingtasks = "sleeping-task1" 23 | 24 | _, _ = kvc.Delete(context.Background(), namespace, etcdv3.WithPrefix()) 25 | 26 | holdtask := make(chan bool) 27 | h := func(task metafora.Task, cmds <-chan *statemachine.Message) *statemachine.Message { 28 | 29 | if task.ID() == sleepingtasks { 30 | sleeptil := 5 * time.Second 31 | nextstarttime := time.Now().Add(sleeptil) 32 | t.Logf("sleeping task:%v sleepfor:%v", task, nextstarttime) 33 | <-holdtask 34 | return statemachine.SleepMessage(nextstarttime) 35 | } 36 | 37 | cmd := <-cmds 38 | t.Logf("non-sleeping task:%v", task) 39 | 40 | return cmd 41 | } 42 | 43 | newC := func(name, ns string) *metafora.Consumer { 44 | conf := metcdv3.NewConfig(name, ns) 45 | coord, hf, bal := metcdv3.New(conf, etcdv3c, h) 46 | cons, err := metafora.NewConsumer(coord, hf, bal) 47 | if err != nil { 48 | t.Fatalf("Error creating consumer %s:%s: %v", ns, name, err) 49 | } 50 | go func() { 51 | cons.Run() 52 | t.Logf("Consumer:%s exited.", name) 53 | }() 54 | return cons 55 | } 56 | 57 | assertRunning := func(tid string, cons ...*metafora.Consumer) { 58 | found := false 59 | for _, c := range cons { 60 | tasks := c.Tasks() 61 | if len(tasks) > 0 && found { 62 | t.Fatal("Task already found running but another task is running on a different consumer") 63 | } 64 | if len(tasks) > 1 { 65 | t.Fatalf("Expected at most 1 task, but found: %d", len(tasks)) 66 | } 67 | if len(tasks) == 1 && tasks[0].Task().ID() == tid { 68 | found = true 69 | } 70 | } 71 | if !found { 72 | t.Fatalf("Could not find task=%q", tid) 73 | } 74 | } 75 | 76 | // Start 2 consumers 77 | cons1 := newC("node1", namespace) 78 | cons2 := newC("node2", namespace) 79 | 80 | // Create clients and start some tests 81 | cliA := metcdv3.NewClient(namespace, etcdv3c) 82 | 83 | if err := cliA.SubmitTask(metcdv3.DefaultTaskFunc(sleepingtasks, "")); err != nil { 84 | t.Fatalf("Error submitting sleeping task: %v", err) 85 | } 86 | 87 | // Give consumers a bit to pick up tasks 88 | time.Sleep(500 * time.Millisecond) 89 | 90 | assertRunning(sleepingtasks, cons1, cons2) 91 | 92 | holdtask <- true 93 | // Give consumers a bit to pick up tasks 94 | time.Sleep(500 * time.Millisecond) 95 | 96 | assertRunning(sleepingtasks, cons1, cons2) // not sure if this should be true or false. 97 | 98 | wait1 := make(chan bool) 99 | go func() { 100 | defer close(wait1) 101 | // Shutdown 102 | cons1.Shutdown() 103 | cons2.Shutdown() 104 | }() 105 | 106 | timeout := time.NewTimer(5 * time.Second) 107 | select { 108 | case <-wait1: 109 | case <-timeout.C: 110 | t.Fatalf("failed waiting for shutdown") 111 | } 112 | 113 | // make sure all tasks are released 114 | for _, c := range []*metafora.Consumer{cons1, cons2} { 115 | tasks := c.Tasks() 116 | for _, work := range tasks { 117 | t.Fatalf("work id %v is still running", work) 118 | } 119 | } 120 | } 121 | 122 | // TestAll is an integration test for all of m_etcd's components. 123 | // 124 | // While huge integration tests like this are rarely desirable as they can be 125 | // overly fragile and complex, I found myself manually repeating the tests I've 126 | // automated here over and over. This is far more reliable than expecting 127 | // developers to do ad hoc testing of all of the m_etcd package's features.
128 | func TestAll(t *testing.T) { 129 | etcdv3c := testutil.NewEtcdV3Client(t) 130 | kvc := etcdv3.NewKV(etcdv3c) 131 | t.Parallel() 132 | 133 | c := context.Background() 134 | _, _ = kvc.Delete(c, "/test-a", etcdv3.WithPrefix()) 135 | _, _ = kvc.Delete(c, "/test-b", etcdv3.WithPrefix()) 136 | 137 | h := func(task metafora.Task, cmds <-chan *statemachine.Message) *statemachine.Message { 138 | cmd := <-cmds 139 | if task.ID() == "error-test" { 140 | return statemachine.ErrorMessage(errors.New("error-test")) 141 | } 142 | return cmd 143 | } 144 | 145 | newC := func(name, ns string) *metafora.Consumer { 146 | conf := metcdv3.NewConfig(name, ns) 147 | conf.Name = name 148 | coord, hf, bal := metcdv3.New(conf, etcdv3c, h) 149 | cons, err := metafora.NewConsumer(coord, hf, bal) 150 | if err != nil { 151 | t.Fatalf("Error creating consumer %s:%s: %v", ns, name, err) 152 | } 153 | go cons.Run() 154 | return cons 155 | } 156 | // Start 4 consumers, 2 per namespace 157 | cons1a := newC("node1", "/test-a") 158 | cons2a := newC("node2", "/test-a") 159 | cons1b := newC("node1", "/test-b") 160 | cons2b := newC("node2", "/test-b") 161 | 162 | // Create clients and start some tests 163 | cliA := metcdv3.NewClient("/test-a", etcdv3c) 164 | cliB := metcdv3.NewClient("/test-b", etcdv3c) 165 | 166 | if err := cliA.SubmitTask(metcdv3.DefaultTaskFunc("task1", "")); err != nil { 167 | t.Fatalf("Error submitting task1 to a: %v", err) 168 | } 169 | if err := cliB.SubmitTask(metcdv3.DefaultTaskFunc("task1", "")); err != nil { 170 | t.Fatalf("Error submitting task1 to b: %v", err) 171 | } 172 | 173 | // Give consumers a bit to pick up tasks 174 | time.Sleep(500 * time.Millisecond) 175 | 176 | assertRunning := func(tid string, cons ...*metafora.Consumer) { 177 | found := false 178 | for _, c := range cons { 179 | tasks := c.Tasks() 180 | if len(tasks) > 0 && found { 181 | t.Fatal("Task already found running but another task is running on a different consumer") 182 | } 183 | if len(tasks) > 1 { 184 | t.Fatalf("Expected at most 1 task, but found: %d", len(tasks)) 185 | } 186 | if len(tasks) == 1 && tasks[0].Task().ID() == tid { 187 | found = true 188 | } 189 | } 190 | if !found { 191 | t.Fatalf("Could not find task=%q", tid) 192 | } 193 | } 194 | 195 | assertRunning("task1", cons1a, cons2a) 196 | assertRunning("task1", cons1b, cons2b) 197 | 198 | // Kill task1 in A 199 | { 200 | cmdr := metcdv3.NewCommander("/test-a", etcdv3c) 201 | if err := cmdr.Send("task1", statemachine.KillMessage()); err != nil { 202 | t.Fatalf("Error sending kill to task1: %v", err) 203 | } 204 | time.Sleep(1000 * time.Millisecond) 205 | 206 | for _, c := range []*metafora.Consumer{cons1a, cons2a} { 207 | tasks := c.Tasks() 208 | if len(tasks) != 0 { 209 | t.Fatalf("Expected no tasks but found: %d", len(tasks)) 210 | } 211 | } 212 | } 213 | 214 | // Submit a bunch of tasks to A 215 | { 216 | tasks := []string{"task2", "task3", "task4", "task5", "task6", "task7"} 217 | for _, tid := range tasks { 218 | if err := cliA.SubmitTask(metcdv3.DefaultTaskFunc(tid, "")); err != nil { 219 | t.Fatalf("Error submitting task=%q to A: %v", tid, err) 220 | } 221 | } 222 | 223 | // Give them time to start 224 | time.Sleep(800 * time.Millisecond) 225 | 226 | // Ensure they're balanced 227 | if err := cliA.SubmitCommand("node1", metafora.CommandBalance()); err != nil { 228 | t.Fatalf("Error submitting balance command to cons1a: %v", err) 229 | } 230 | time.Sleep(800 * time.Millisecond) 231 | if err := cliA.SubmitCommand("node2", metafora.CommandBalance()); err != 
nil { 232 | t.Fatalf("Error submitting balance command to cons2a: %v", err) 233 | } 234 | 235 | a1tasks := cons1a.Tasks() 236 | a2tasks := cons2a.Tasks() 237 | for _, task := range a1tasks { 238 | metafora.Debug("A1: ", task.Task(), " - ", task.Stopped().IsZero()) 239 | } 240 | for _, task := range a2tasks { 241 | metafora.Debug("A2: ", task.Task(), " - ", task.Stopped().IsZero()) 242 | } 243 | time.Sleep(800 * time.Millisecond) 244 | 245 | a1tasks = cons1a.Tasks() 246 | a2tasks = cons2a.Tasks() 247 | if len(a1tasks) < 2 || len(a1tasks) > 4 || len(a2tasks) < 2 || len(a2tasks) > 4 { 248 | t.Fatalf("Namespace A isn't fairly balanced: node1: %d; node2: %d", len(a1tasks), len(a2tasks)) 249 | } 250 | 251 | // Shutting down a consumer should migrate all tasks to the other 252 | cons1a.Shutdown() 253 | time.Sleep(800 * time.Millisecond) 254 | 255 | a2tasks = cons2a.Tasks() 256 | if len(a2tasks) != len(tasks) { 257 | t.Fatalf("Consumer 2a should have received all %d tasks but only has %d.", len(tasks), len(a2tasks)) 258 | } 259 | } 260 | 261 | // Use Namespace B to check Error state handling 262 | { 263 | tasks := []string{"task8", "error-test"} 264 | for _, tid := range tasks { 265 | if err := cliB.SubmitTask(metcdv3.DefaultTaskFunc(tid, "")); err != nil { 266 | t.Fatalf("Error submitting task=%q to B: %v", tid, err) 267 | } 268 | } 269 | 270 | // Give them time to start 271 | time.Sleep(time.Second) 272 | 273 | n := len(cons1b.Tasks()) + len(cons2b.Tasks()) 274 | if n != 3 { 275 | t.Fatalf("Expected B to be running 3 tasks but found %d", n) 276 | } 277 | 278 | // Resuming error-test 8*2 times should cause it to be failed 279 | cmdr := metcdv3.NewCommander("/test-b", etcdv3c) 280 | for i := 0; i < statemachine.DefaultErrMax*2; i++ { 281 | if err := cmdr.Send("error-test", statemachine.RunMessage()); err != nil { 282 | t.Fatalf("Unexpected error resuming error-test in B: %v", err) 283 | } 284 | time.Sleep(500 * time.Millisecond) 285 | } 286 | 287 | n = len(cons1b.Tasks()) + len(cons2b.Tasks()) 288 | if n != 2 { 289 | t.Fatalf("Expected B to be running 2 tasks but found %d", n) 290 | } 291 | 292 | // Resubmitting a failed task shouldn't error but also shouldn't run. 293 | if err := cliB.SubmitTask(metcdv3.DefaultTaskFunc("error-test", "")); err != nil { 294 | t.Fatalf("Error resubmitting error-test task to B: %v", err) 295 | } 296 | 297 | // Give the statemachine a moment to load the initial state and exit 298 | time.Sleep(time.Second) 299 | 300 | n = len(cons1b.Tasks()) + len(cons2b.Tasks()) 301 | if n != 2 { 302 | t.Fatalf("Expected B to be running 2 tasks but found %d", n) 303 | } 304 | } 305 | 306 | // Shutdown 307 | cons2a.Shutdown() 308 | cons1b.Shutdown() 309 | cons2b.Shutdown() 310 | } 311 | 312 | // TestTaskResurrectionInt ensures that a Claim won't recreate a task that had 313 | // been deleted (marked as done). taskmgr has a non-integration version of this 314 | // test.
315 | func TestTaskResurrectionInt(t *testing.T) { 316 | etcdv3c := testutil.NewEtcdV3Client(t) 317 | kvc := etcdv3.NewKV(etcdv3c) 318 | c := context.Background() 319 | t.Parallel() 320 | 321 | _, _ = kvc.Delete(c, "/test-resurrect", etcdv3.WithPrefix()) 322 | 323 | task := metcdv3.DefaultTaskFunc("xyz", "") 324 | 325 | conf := metcdv3.NewConfig("testclient", "/test-resurrect") 326 | coord := metcdv3.NewEtcdV3Coordinator(conf, etcdv3c) 327 | if err := coord.Init(nil); err != nil { 328 | t.Fatalf("Error initializing coordinator: %v", err) 329 | } 330 | defer coord.Close() 331 | 332 | // Try to claim a nonexistent task 333 | if claimed := coord.Claim(task); claimed { 334 | t.Fatal("Claiming a nonexistent task should not work but did!") 335 | } 336 | 337 | // Create a task, mark it as done, and try to claim it again 338 | client := metcdv3.NewClient("/test-resurrect", etcdv3c) 339 | if err := client.SubmitTask(metcdv3.DefaultTaskFunc("xyz", "")); err != nil { 340 | t.Fatalf("Error submitting task xyz: %v", err) 341 | } 342 | 343 | if claimed := coord.Claim(task); !claimed { 344 | t.Fatal("Failed to claim task xyz") 345 | } 346 | 347 | coord.Done(task) 348 | 349 | if claimed := coord.Claim(task); claimed { 350 | t.Fatal("Reclaimed task that was marked as done.") 351 | } 352 | } 353 | -------------------------------------------------------------------------------- /metcdv3/statestore.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "path" 7 | 8 | "github.com/lytics/metafora" 9 | "github.com/lytics/metafora/statemachine" 10 | etcdv3 "go.etcd.io/etcd/client/v3" 11 | ) 12 | 13 | const statePath = "state" 14 | 15 | // stateStore is an etcd implementation of statemachine.StateStore. 16 | type stateStore struct { 17 | etcdv3c *etcdv3.Client 18 | kvc etcdv3.KV 19 | path string 20 | } 21 | 22 | // NewStateStore returns a StateStore implementation that persists task states 23 | // in etcd. 24 | func NewStateStore(namespace string, etcdv3c *etcdv3.Client) statemachine.StateStore { 25 | return &stateStore{ 26 | etcdv3c: etcdv3c, 27 | kvc: etcdv3.NewKV(etcdv3c), 28 | path: path.Join("/", namespace, statePath), 29 | } 30 | } 31 | 32 | // Load retrieves the given task's state from etcd or stores and returns 33 | // Runnable if no state exists. 34 | func (s *stateStore) Load(task metafora.Task) (*statemachine.State, error) { 35 | resp, err := s.kvc.Get(context.Background(), path.Join(s.path, task.ID()), etcdv3.WithLimit(1)) 36 | if err != nil { 37 | return nil, err 38 | 39 | } 40 | 41 | if resp.Count == 0 { 42 | metafora.Infof("task=%q has no existing state, default to Runnable", task.ID()) 43 | state := &statemachine.State{Code: statemachine.Runnable} 44 | if err := s.Store(task, state); err != nil { 45 | return nil, err 46 | } 47 | return state, nil 48 | } 49 | 50 | // Unmarshal state from key 51 | state := &statemachine.State{} 52 | if err := json.Unmarshal([]byte(resp.Kvs[0].Value), state); err != nil { 53 | return nil, err 54 | } 55 | return state, nil 56 | } 57 | 58 | // Store taskID's state in etcd overwriting any prior state.
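//
// A short pairing sketch with Load (illustrative; the namespace and task ID
// are placeholders):
//
//	ss := NewStateStore("/mycluster", etcdv3c)
//	state, _ := ss.Load(DefaultTaskFunc("task-1", ""))
//	_ = ss.Store(DefaultTaskFunc("task-1", ""), state)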
59 | func (s *stateStore) Store(task metafora.Task, state *statemachine.State) error { 60 | buf, err := json.Marshal(state) 61 | if err != nil { 62 | return err 63 | } 64 | 65 | _, err = s.kvc.Put(context.Background(), path.Join(s.path, task.ID()), string(buf)) 66 | return err 67 | } 68 | -------------------------------------------------------------------------------- /metcdv3/task.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import "github.com/lytics/metafora" 4 | 5 | type task struct { 6 | id string 7 | } 8 | 9 | func (t *task) ID() string { return t.id } 10 | 11 | // TaskFunc creates a Task interface from a task ID and the raw value of the 12 | // task's props key in etcd. 13 | // 14 | // Implementations must support value being an empty string. 15 | // 16 | // If nil is returned the task is ignored. 17 | type TaskFunc func(id, value string) metafora.Task 18 | 19 | // DefaultTaskFunc is the default new task function used by the EtcdCoordinator 20 | // and does not attempt to process the properties value. 21 | func DefaultTaskFunc(id, _ string) metafora.Task { return &task{id: id} } 22 | -------------------------------------------------------------------------------- /metcdv3/task_test.go: -------------------------------------------------------------------------------- 1 | package metcdv3_test 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "testing" 8 | "time" 9 | 10 | "github.com/lytics/metafora" 11 | "github.com/lytics/metafora/metcdv3" 12 | "github.com/lytics/metafora/metcdv3/testutil" 13 | "github.com/lytics/metafora/statemachine" 14 | etcdv3 "go.etcd.io/etcd/client/v3" 15 | ) 16 | 17 | // exTask is an extended Task type to demonstrate using an alternative NewTask 18 | // TaskFunc.
19 | type exTask struct { 20 | id string 21 | SubmittedT *time.Time `json:"_submitted"` 22 | UserID string `json:"UserID"` 23 | } 24 | 25 | func (t *exTask) ID() string { return t.id } 26 | func (t *exTask) Submitted() *time.Time { return t.SubmittedT } 27 | func (t *exTask) String() string { 28 | if t.SubmittedT == nil { 29 | return t.id 30 | } 31 | return fmt.Sprintf("%s submitted %s", t.id, t.SubmittedT) 32 | } 33 | 34 | func TestAltTask(t *testing.T) { 35 | etcdv3c := testutil.NewEtcdV3Client(t) 36 | kvc := etcdv3.NewKV(etcdv3c) 37 | c := context.Background() 38 | t.Parallel() 39 | const namespace = "/alttask-metafora" 40 | _, _ = kvc.Delete(c, namespace, etcdv3.WithPrefix()) 41 | 42 | conf := metcdv3.NewConfig("testclient", namespace) 43 | 44 | // Sample overridden NewTask func 45 | conf.NewTaskFunc = func(id, props string) metafora.Task { 46 | task := exTask{id: id} 47 | if err := json.Unmarshal([]byte(props), &task); err != nil { 48 | metafora.Warnf("%s properties could not be unmarshalled: %v", id, err) 49 | } 50 | return &task 51 | } 52 | 53 | // Create a handler that returns results through a chan for synchronization 54 | results := make(chan string, 1) 55 | 56 | h := func(task metafora.Task, _ <-chan *statemachine.Message) *statemachine.Message { 57 | alttask, ok := task.(*exTask) 58 | if !ok { 59 | results <- fmt.Sprintf("%q is of type %T", task.ID(), task) 60 | return statemachine.PauseMessage() 61 | } 62 | if alttask.UserID == "" { 63 | results <- "missing UserID" 64 | return statemachine.PauseMessage() 65 | } 66 | results <- "ok" 67 | return statemachine.PauseMessage() 68 | } 69 | 70 | coord, hf, bal := metcdv3.New(conf, etcdv3c, h) 71 | consumer, err := metafora.NewConsumer(coord, hf, bal) 72 | if err != nil { 73 | t.Fatal(err) 74 | } 75 | go consumer.Run() 76 | defer consumer.Shutdown() 77 | 78 | cli := metcdv3.NewClient(namespace, etcdv3c) 79 | if err := cli.SubmitTask(&exTask{id: "test1", UserID: "test2"}); err != nil { 80 | t.Fatal(err) 81 | } 82 | 83 | result := <-results 84 | if result != "ok" { 85 | t.Fatal(result) 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /metcdv3/testutil/testutil.go: -------------------------------------------------------------------------------- 1 | // Package testutil is a collection of utilities for use by Metafora's etcd 2 | // tests. Since tests are spread across the m_etcd and m_etcd_test packages 3 | // utilities must be in a shared location. 4 | // 5 | // Unless you're making changes to the m_etcd package you don't need to use 6 | // this. 7 | package testutil 8 | 9 | import ( 10 | "os" 11 | "strings" 12 | "time" 13 | 14 | etcdv3 "go.etcd.io/etcd/client/v3" 15 | ) 16 | 17 | // TestCase just defines the subset of *testing.T methods needed to avoid 18 | // pulling in the testing package. 19 | type TestCase interface { 20 | Skip(args ...interface{}) 21 | Fatalf(format string, args ...interface{}) 22 | } 23 | 24 | // NewEtcdV3Client creates a new etcd client for use by the metafora client during testing. 25 | func NewEtcdV3Client(t TestCase) *etcdv3.Client { 26 | if os.Getenv("ETCDTESTS") == "" { 27 | t.Skip("ETCDTESTS unset. Skipping etcd tests.") 28 | } 29 | 30 | // ETCD_PEERS mirrors the ETCDCTL_PEERS variable that etcdctl uses for peers.
31 | peerAddrs := os.Getenv("ETCD_PEERS") 32 | if peerAddrs == "" { 33 | peerAddrs = "127.0.0.1:2379" 34 | } 35 | 36 | peers := strings.Split(peerAddrs, ",") 37 | cli, err := etcdv3.New(etcdv3.Config{ 38 | Endpoints: peers, 39 | DialTimeout: 5 * time.Second, 40 | }) 41 | if err != nil { 42 | t.Fatalf("failed to create etcdv3 client: %v", err) 43 | } 44 | //defer cli.Close() 45 | return cli 46 | } 47 | -------------------------------------------------------------------------------- /resreporter/mem_linux.go: -------------------------------------------------------------------------------- 1 | package resreporter 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "os" 7 | 8 | "github.com/lytics/metafora" 9 | ) 10 | 11 | const meminfo = "/proc/meminfo" 12 | 13 | var Memory = memory{} 14 | 15 | type memory struct{} 16 | 17 | func (memory) Used() (used uint64, total uint64) { 18 | fd, err := os.Open(meminfo) 19 | if err != nil { 20 | metafora.Errorf("Error reading free memory via "+meminfo+": %v", err) 21 | 22 | // Effectively disable the balancer since an error happened 23 | return 0, 100 24 | } 25 | defer fd.Close() 26 | 27 | s := bufio.NewScanner(fd) 28 | foundFree, foundCache, foundBuf := false, false, false 29 | var cache uint64 30 | var buffered uint64 31 | var free uint64 32 | for s.Scan() { 33 | if total > 0 && foundFree && foundCache && foundBuf { 34 | break 35 | } 36 | if total == 0 { 37 | if n, _ := fmt.Sscanf(s.Text(), "MemTotal:%d", &total); n == 1 { 38 | continue 39 | } 40 | } 41 | if !foundFree { 42 | if n, _ := fmt.Sscanf(s.Text(), "MemFree:%d", &free); n == 1 { 43 | foundFree = true 44 | continue 45 | } 46 | } 47 | if !foundCache { 48 | if n, _ := fmt.Sscanf(s.Text(), "Cached:%d", &cache); n == 1 { 49 | foundCache = true 50 | continue 51 | } 52 | } 53 | if !foundBuf { 54 | if n, _ := fmt.Sscanf(s.Text(), "Buffers:%d", &buffered); n == 1 { 55 | foundBuf = true 56 | continue 57 | } 58 | } 59 | } 60 | if err := s.Err(); err != nil { 61 | metafora.Errorf("Error reading free memory via "+meminfo+": %v", err) 62 | 63 | // Effectively disable the balancer since an error happened 64 | return 0, 100 65 | } 66 | 67 | return total - (free + buffered + cache), total 68 | } 69 | 70 | func (memory) String() string { return "kB" } 71 | -------------------------------------------------------------------------------- /resreporter/mem_linux_test.go: -------------------------------------------------------------------------------- 1 | package resreporter_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/lytics/metafora/resreporter" 7 | ) 8 | 9 | func TestMemReporter(t *testing.T) { 10 | used, total := resreporter.Memory.Used() 11 | t.Logf("Used: %d %s (%d MB)", used, resreporter.Memory, used/1024) 12 | t.Logf("Total: %d %s (%d MB)", total, resreporter.Memory, total/1024) 13 | if used == 0 && total == 100 { 14 | t.Fatal("Memory reporter failed!") 15 | } 16 | if used > total { 17 | t.Fatal("More memory used than available?!") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /scripts/docker_run_etcd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export RunningEtcdDockers=$(sudo docker ps -a | grep metafora-etcd- | awk '{print $1}') 3 | if [[ -n $RunningEtcdDockers ]]; then 4 | echo stopping existing etcd metafora docker containers 5 | echo -------------------------------------------------------------------------------- 6 | echo sudo docker stop ${RunningEtcdDockers} 7 | sudo docker stop ${RunningEtcdDockers} 8 | echo 9 |
10 | 11 | echo removing existing etcd docker containers 12 | echo -------------------------------------------------------------------------------- 13 | sudo docker rm ${RunningEtcdDockers} 14 | echo 15 | fi 16 | 17 | if [[ $1 = "-stop" ]]; then 18 | echo "-stop specified; not starting new containers" 19 | exit 0 20 | fi 21 | 22 | echo starting new etcd metafora docker containers 23 | echo -------------------------------------------------------------------------------- 24 | sudo docker run -d --name="metafora-etcd-a" --net=host coreos/etcd \ 25 | -peer-addr 127.0.0.1:8001 -peer-bind-addr 127.0.0.1:8001 -addr 127.0.0.1:5001 -bind-addr 127.0.0.1:5001 -name metafora-a 26 | sudo docker run -d --name="metafora-etcd-b" --net=host coreos/etcd \ 27 | -peer-addr 127.0.0.1:8002 -peer-bind-addr 127.0.0.1:8002 -addr 127.0.0.1:5002 -bind-addr 127.0.0.1:5002 -name metafora-b -peers 127.0.0.1:8001,127.0.0.1:8002,127.0.0.1:8003 28 | sudo docker run -d --name="metafora-etcd-c" --net=host coreos/etcd \ 29 | -peer-addr 127.0.0.1:8003 -peer-bind-addr 127.0.0.1:8003 -addr 127.0.0.1:5003 -bind-addr 127.0.0.1:5003 -name metafora-c -peers 127.0.0.1:8001,127.0.0.1:8002,127.0.0.1:8003 30 | echo 31 | 32 | echo list of running metafora docker containers 33 | echo -------------------------------------------------------------------------------- 34 | sudo docker ps | head -n 1 35 | sudo docker ps | grep metafora-etcd- 36 | -------------------------------------------------------------------------------- /slowtask_test.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | type releaseAllBalancer struct { 9 | balances chan int 10 | ctx BalancerContext 11 | } 12 | 13 | func (b *releaseAllBalancer) Init(c BalancerContext) { 14 | b.ctx = c 15 | b.balances = make(chan int) 16 | } 17 | func (b *releaseAllBalancer) CanClaim(Task) (time.Time, bool) { return NoDelay, true } 18 | func (b *releaseAllBalancer) Balance() []string { 19 | b.balances <- 1 20 | ids := []string{} 21 | for _, task := range b.ctx.Tasks() { 22 | ids = append(ids, task.Task().ID()) 23 | } 24 | return ids 25 | } 26 | 27 | func TestDoubleRelease(t *testing.T) { 28 | t.Parallel() 29 | 30 | started := make(chan int) 31 | reallyStop := make(chan bool) 32 | h := SimpleHandler(func(task Task, stop <-chan bool) bool { 33 | started <- 1 34 | t.Logf("TestDoubleRelease handler received %s - blocking until reallyStop closed.", task) 35 | <-reallyStop 36 | return true 37 | }) 38 | 39 | tc := NewTestCoord() 40 | 41 | b := &releaseAllBalancer{} 42 | c, err := NewConsumer(tc, h, b) 43 | if err != nil { 44 | t.Fatalf("Error creating consumer: %v", err) 45 | } 46 | go c.Run() 47 | 48 | // This won't exit when told to 49 | tc.Tasks <- testTask{"1"} 50 | <-started 51 | 52 | // Make sure balancing/mainloop isn't blocked 53 | tc.Commands <- CommandBalance() 54 | <-b.balances 55 | tc.Commands <- CommandBalance() 56 | <-b.balances 57 | tc.Commands <- CommandBalance() 58 | <-b.balances 59 | 60 | shutdownComplete := make(chan bool) 61 | go func() { 62 | c.Shutdown() 63 | close(shutdownComplete) 64 | }() 65 | 66 | // Make sure the release insidiously blocks until we close reallyStop 67 | select { 68 | case <-shutdownComplete: 69 | t.Fatal("Shutdown completed when it should have blocked indefinitely") 70 | case <-time.After(100 * time.Millisecond): 71 | } 72 | 73 | // Close reallyStop and make sure Shutdown actually exits 74 | close(reallyStop) 75 | // Wait for Shutdown to return
76 | <-shutdownComplete 77 | } 78 | -------------------------------------------------------------------------------- /statemachine/README.md: -------------------------------------------------------------------------------- 1 | # Metafora Finite State Machine 2 | 3 | The `statemachine` package provides a featureful state machine for use by 4 | Metafora task handlers. 5 | 6 | ## Features 7 | 8 | * Static state machine; no custom states or messages (transitions) 9 | * Per task state machine; task may intercept commands 10 | * Flexible state store (see `StateStore` interface) 11 | * Flexible command sending/receiving (see `Commander`, `CommandListener`, or 12 | [the etcd implementation](../metcdv3/commander.go)). 13 | * Flexible error handling with builtin retry logic (see 14 | [`errors.go`](errors.go)). 15 | * States: Runnable, Paused, Sleeping, Fault, Completed, Failed, Killed 16 | * Commands/Messages: Run, Pause, Sleep, Release, Error, Kill, Complete, Checkpoint 17 | * Tasks in a terminal state are unscheduled and will take no cluster resources. 18 | 19 | ## Control Flow 20 | 21 | 1. Coordinator receives a claimable task from a Watch 22 | 2. Consumer calls `Balancer.CanClaim(task)` 23 | 3. If claimable, Consumer calls `Coordinator.Claim(task)` to claim it. 24 | 4. If claim was successful, Consumer starts the task handler which is created 25 | by `statemachine.New(...)`. 26 | 5. State machine loads initial state via `StateStore.Load(task)`. 27 | 6. If the task is `Runnable`, control is handed to the `StatefulHandler` 28 | implementation provided by the user. 29 | 7. Run until task returns a `Message` either due to completion, an error, or a 30 | received command. 31 | 32 | There are quite a few moving parts that are hooked together: 33 | 34 | * The Consumer needs a `Coordinator`, `Balancer`, and `HandlerFunc` like 35 | normal, but you should use `statemachine.New(...)` to create the `Handler` 36 | returned by your `HandlerFunc`. 37 | * The state machine requires a `StateStore` and `CommandListener`. The `metcdv3` 38 | package includes an etcd implementation of `CommandListener` (as well as 39 | `Commander` for sending commands), but no default `StateStore` is provided. 40 | * Your task handling code must be implemented in a function (or method) that 41 | fulfills the `StatefulHandler` signature. When your handler receives a 42 | command it should return it (or override it with a new `Message`) to the 43 | state machine to handle state transitions. 44 | 45 | ## States 46 | 47 | State | Description 48 | ------|------------ 49 | Runnable | Task is runnable and control is passed to the task handler. 50 | Paused | Task is paused until a command is received. 51 | Sleeping | Task is paused until a specified time (or a command is received). 52 | Fault | An error occurred and a custom error handler is invoked. 53 | Completed | **Terminal** Task returned the `Complete` message because it finished successfully. 54 | Failed | **Terminal** The error handler executed during the Fault state determined the task has failed permanently. 55 | Killed | **Terminal** Task received a `Kill` message. 56 | 57 | **Terminal** states are final. The task is removed from the broker and will never be scheduled to run again. 58 | 59 | ## Messages 60 | 61 | AKA Events or Commands 62 | 63 | Messages cause transitions between states.
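For example, here is a minimal sketch of constructing and sending a message using the in-memory `embedded` implementations (the task ID `task-1` is a placeholder; production deployments would typically use the `metcdv3` `Commander`):

```go
package main

import (
	"fmt"
	"time"

	"github.com/lytics/metafora/embedded"
	"github.com/lytics/metafora/statemachine"
)

func main() {
	cmdr := embedded.NewCommander()
	cl := cmdr.NewListener("task-1") // normally passed to statemachine.New

	// Drain one command the way the state machine would.
	go func() { fmt.Println(<-cl.Receive()) }()

	// Sleep messages must carry an Until time or Valid() returns false and
	// the state machine discards them.
	msg := statemachine.SleepMessage(time.Now().Add(10 * time.Minute))
	if msg.Valid() {
		_ = cmdr.Send("task-1", msg)
	}
}
```

The full message table follows.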
64 | 65 | Message | Description 66 | --------|------------ 67 | Run | Causes a `Paused` or `Sleeping` task to transition to `Runnable` and begin executing. 68 | Pause | Causes a `Runnable` or `Sleeping` task to transition to `Paused`. 69 | Sleep | Requires an `Until time.Time` to be set. Causes non-terminal states to pause until the time is reached. 70 | Error | Requires an `Err error` to be set. Usually returned by tasks to transition to the `Fault` state. 71 | Release | *See below* 72 | Checkpoint | *See below* 73 | Kill | Causes a non-terminal state to transition to `Killed`. 74 | Complete | Should only be returned by tasks. Causes a `Runnable` state to transition to `Completed`. 75 | 76 | 77 | ### Release 78 | 79 | Release is a special message that does *not* transition between states. Instead 80 | the task handler exits and the Coordinator's claim on the task is released. 81 | 82 | Metafora's `Handler.Stop()` method sends the `Release` command to a running 83 | task to request it exit. It's most often used when cleanly restarting Metafora 84 | nodes. 85 | 86 | ### Checkpoint 87 | 88 | Checkpoint is a special message that - like `Release` - does *not* transition 89 | between states. It is meant to be a signal to tasks to persist any internal 90 | state and optionally exit so the state machine can store the task's state. 91 | 92 | Since a `Checkpoint` is a noop in the state machine a task may decide to 93 | intercept the message and *not* return. 94 | -------------------------------------------------------------------------------- /statemachine/commander.go: -------------------------------------------------------------------------------- 1 | package statemachine 2 | 3 | type CommandListener interface { 4 | Receive() <-chan *Message 5 | Stop() 6 | } 7 | 8 | type Commander interface { 9 | Send(taskID string, m *Message) error 10 | } 11 | -------------------------------------------------------------------------------- /statemachine/doc.go: -------------------------------------------------------------------------------- 1 | // Statemachine is a featureful statemachine implementation for Metafora 2 | // handlers to use. It is implemented as a Handler wrapper which provides a 3 | // channel of incoming commands to wrapped handlers. Internal handlers are 4 | // expected to shut down cleanly and exit upon receiving a command from the 5 | // state machine. The state machine will handle the state transition and 6 | // restart the internal handler if necessary. 7 | // 8 | // Users must provide a StateStore implementation for persisting task state and 9 | // a CommandListener implementation for receiving commands. See the metcdv3 or 10 | // embedded packages for example CommandListener implementations. 11 | // 12 | // See the README in this package for details. 13 | package statemachine 14 | -------------------------------------------------------------------------------- /statemachine/errors.go: -------------------------------------------------------------------------------- 1 | package statemachine 2 | 3 | import ( 4 | "errors" 5 | "time" 6 | 7 | "github.com/lytics/metafora" 8 | ) 9 | 10 | // ExceededErrorRate is returned by error handlers in an Error Message when 11 | // retry logic has been exhausted for a handler and it should transition to 12 | // Failed. 13 | var ExceededErrorRate = errors.New("exceeded error rate") 14 | 15 | // Err represents an error that occurred while a stateful handler was running. 16 | // 17 | // NewErr was added to allow callers to construct an instance from an underlying error.
18 | // The underlying error is preserved so that Err can be unwrapped with 19 | // errors.As/errors.Is. This is useful for custom error handlers that wish to 20 | // inspect underlying error types and decide accordingly. 21 | type Err struct { 22 | Time time.Time `json:"timestamp"` 23 | Err string `json:"error"` 24 | baseErr error 25 | } 26 | 27 | // NewErr constructs an Err from an underlying error e. 28 | func NewErr(e error, t time.Time) Err { 29 | return Err{Err: e.Error(), Time: t, baseErr: e} 30 | } 31 | 32 | // Error implements the error interface. 33 | func (e Err) Error() string { 34 | return e.Err 35 | } 36 | 37 | // Unwrap returns baseErr. 38 | func (e Err) Unwrap() error { 39 | return e.baseErr 40 | } 41 | 42 | // ErrHandler functions should return Run, Sleep, or Error messages depending on 43 | // the rate of errors. 44 | // 45 | // The ErrHandler and/or StateStore should trim the error slice to keep it 46 | // from growing without bound. 47 | type ErrHandler func(task metafora.Task, errs []Err) (*Message, []Err) 48 | 49 | const ( 50 | DefaultErrLifetime = -4 * time.Hour 51 | DefaultErrMax = 8 52 | ) 53 | 54 | // DefaultErrHandler fails the task if 8 errors have occurred in the past 4 55 | // hours. Otherwise it enters the Sleep state for 10 minutes before trying 56 | // again. 57 | func DefaultErrHandler(_ metafora.Task, errs []Err) (*Message, []Err) { 58 | recent := time.Now().Add(DefaultErrLifetime) 59 | strikes := 0 60 | for _, err := range errs { 61 | if err.Time.After(recent) { 62 | strikes++ 63 | } 64 | } 65 | 66 | if len(errs) > DefaultErrMax { 67 | errs = errs[len(errs)-DefaultErrMax:] 68 | } 69 | 70 | if strikes >= DefaultErrMax { 71 | // Return a new error to transition to Failed as well as the original 72 | // errors to store what caused this failure. 73 | return ErrorMessage(ExceededErrorRate), errs 74 | } 75 | return SleepMessage(time.Now().Add(10 * time.Minute)), errs 76 | } 77 | -------------------------------------------------------------------------------- /statemachine/errors_test.go: -------------------------------------------------------------------------------- 1 | package statemachine_test 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | "time" 7 | 8 | . "github.com/lytics/metafora/statemachine"
"github.com/lytics/metafora/statemachine" 9 | "github.com/stretchr/testify/assert" 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | type task string 14 | 15 | func (t task) ID() string { return string(t) } 16 | 17 | func TestDefaultErrHandler(t *testing.T) { 18 | t.Parallel() 19 | tid := "" 20 | 21 | errs := []Err{{Time: time.Now()}} 22 | 23 | { 24 | msg, errs := DefaultErrHandler(task(tid), errs) 25 | if len(errs) != 1 { 26 | t.Fatalf("Expected 1 err, found: %d", len(errs)) 27 | } 28 | if msg.Code != Sleep || msg.Until == nil || msg.Until.Before(time.Now().Add(9*time.Minute)) { 29 | t.Fatalf("Expected sleep until +10m state but found: %s", msg) 30 | } 31 | } 32 | 33 | // Push error list over limit 34 | for i := 0; i < DefaultErrMax+1; i++ { 35 | errs = append(errs, Err{Time: time.Now()}) 36 | } 37 | 38 | { 39 | msg, errs := DefaultErrHandler(task(tid), errs) 40 | if len(errs) > DefaultErrMax { 41 | t.Fatalf("Expected %d errors but received: %d", DefaultErrMax, len(errs)) 42 | } 43 | if msg.Code != Error || msg.Err != ExceededErrorRate { 44 | t.Fatalf("Expected error handler to permanently fail but receied: %s", msg) 45 | } 46 | } 47 | } 48 | 49 | type errType1 struct{ error } 50 | type errType2 struct{ error } 51 | 52 | func TestErr(t *testing.T) { 53 | err := errType1{errors.New("some underlying error")} 54 | se := NewErr(err, time.Now()) 55 | 56 | // confirm se implements the error interface 57 | require.Implements(t, (*error)(nil), se) 58 | 59 | // confirm we can only convert se to an error of the same underlying type 60 | assert.True(t, errors.As(se, new(errType1))) 61 | assert.False(t, errors.As(se, new(errType2))) 62 | 63 | // make sure we don't panic if someone uses it the old way and baseErr is nil 64 | se = Err{Time: time.Now(), Err: "something bad"} 65 | assert.Equal(t, "something bad", se.Error()) 66 | assert.False(t, errors.As(se, new(errType1))) 67 | 68 | // confirm we can check for a specific instance of baseErr too 69 | e1 := errType1{errors.New("target instance")} 70 | e2 := errType1{errors.New("different instance")} 71 | se = NewErr(e1, time.Now()) 72 | assert.True(t, errors.Is(se, e1)) 73 | assert.False(t, errors.Is(se, e2)) 74 | } 75 | -------------------------------------------------------------------------------- /statemachine/run_test.go: -------------------------------------------------------------------------------- 1 | package statemachine 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/lytics/metafora" 8 | ) 9 | 10 | type task string 11 | 12 | func (t task) ID() string { return string(t) } 13 | 14 | // TestCommandBlackhole is meant to demonstrate what happens if a 15 | // StatefulHandler implementation receives commands in a goroutine that lives 16 | // past the SatefulHandler func exiting. This is a very easy bug to write, so 17 | // defensive code was added to prevent the leaked goroutine from "stealing" 18 | // commands meant for other states (Paused or Sleeping being the two states 19 | // that absolutely need to accept commands). 20 | // 21 | // This test breaking isn't necessarily the sign of a bug. It may just mean 22 | // we've decided to remove the defensive code protecting against such errors in 23 | // which case this test should be removed as well. 
24 | func TestCommandBlackhole(t *testing.T) { 25 | t.Parallel() 26 | stop := make(chan bool) 27 | rdy := make(chan int, 1) 28 | defer close(stop) 29 | 30 | f := func(_ metafora.Task, c <-chan *Message) *Message { 31 | go func() { 32 | rdy <- 1 33 | select { 34 | case <-c: 35 | t.Log("Intercepted!") 36 | case <-stop: 37 | return 38 | } 39 | }() 40 | return nil 41 | } 42 | cmds := make(chan *Message) 43 | 44 | // Ignore the return message, the point is to make sure it doesn't intercept 45 | // further commands. 46 | run(f, task("test-task"), cmds) 47 | <-rdy 48 | 49 | go func() { cmds <- RunMessage() }() 50 | 51 | select { 52 | case <-cmds: 53 | // Yay! command wasn't intercepted by leaked goroutine! 54 | case <-time.After(time.Second): 55 | t.Fatalf("Command was intercepted by leaked goroutine.") 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /statemachine/statemachine.go: -------------------------------------------------------------------------------- 1 | package statemachine 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "runtime" 7 | "strings" 8 | "sync" 9 | "time" 10 | 11 | "github.com/lytics/metafora" 12 | ) 13 | 14 | var ( 15 | MissingUntilError = errors.New("sleeping state missing deadline") 16 | MissingErrorsError = errors.New("fault state has no errors") 17 | ReleasableError = errors.New("network error, release and retry") 18 | ) 19 | 20 | // StateCode is the actual state key. The State struct adds additional metadata 21 | // related to certain StateCodes. 22 | type StateCode string 23 | 24 | const ( 25 | Runnable StateCode = "runnable" // Scheduled 26 | Sleeping StateCode = "sleeping" // Scheduled, not running until time has elapsed 27 | Completed StateCode = "completed" // Terminal, not scheduled 28 | Killed StateCode = "killed" // Terminal, not scheduled 29 | Failed StateCode = "failed" // Terminal, not scheduled 30 | Fault StateCode = "fault" // Scheduled, in error handling / retry logic 31 | Paused StateCode = "paused" // Scheduled, not running 32 | ) 33 | 34 | // Terminal states will never run and cannot transition to a non-terminal 35 | // state. 36 | func (s StateCode) Terminal() bool { 37 | switch s { 38 | case Runnable, Sleeping, Paused, Fault: 39 | return false 40 | case Completed, Killed, Failed: 41 | return true 42 | default: 43 | metafora.Error("unknown state: ", s) 44 | return false 45 | } 46 | } 47 | 48 | func (s StateCode) String() string { return string(s) } 49 | 50 | // State represents the current state of a stateful handler. See StateCode for 51 | // details. Until and Errors are extra state used by the Sleeping and Fault 52 | // states respectively. 53 | type State struct { 54 | Code StateCode `json:"state"` 55 | Until *time.Time `json:"until,omitempty"` 56 | Errors []Err `json:"errors,omitempty"` 57 | } 58 | 59 | // copy state so mutations to Until and Errors aren't shared. 60 | func (s *State) copy() *State { 61 | ns := &State{Code: s.Code} 62 | if s.Until != nil { 63 | until := *s.Until 64 | ns.Until = &until 65 | } 66 | ns.Errors = append(ns.Errors, s.Errors...) 
67 | return ns 68 | } 69 | 70 | func (s *State) String() string { 71 | switch s.Code { 72 | case Sleeping: 73 | return fmt.Sprintf("%s until %s", s.Code, s.Until) 74 | case Fault: 75 | return fmt.Sprintf("%s (%d errors)", s.Code, len(s.Errors)) 76 | default: 77 | return string(s.Code) 78 | } 79 | } 80 | 81 | func (s *State) Valid() error { 82 | switch s.Code { 83 | case Completed, Failed, Killed, Paused, Runnable: 84 | case Sleeping: 85 | if s.Until == nil { 86 | return MissingUntilError 87 | } 88 | case Fault: 89 | if len(s.Errors) == 0 { 90 | return MissingErrorsError 91 | } 92 | default: 93 | return fmt.Errorf("unknown state: %q", s.Code) 94 | } 95 | return nil 96 | } 97 | 98 | // Messages are events that cause state transitions. Until and Err are used by 99 | // the Sleep and Error messages respectively. 100 | type Message struct { 101 | Code MessageCode `json:"message"` 102 | 103 | // Until is when the statemachine should transition from sleeping to runnable 104 | Until *time.Time `json:"until,omitempty"` 105 | 106 | // Err is the error that caused this Error message 107 | Err error `json:"error,omitempty"` 108 | } 109 | 110 | // ErrorMessage is a simpler helper for creating error messages from an error. 111 | func ErrorMessage(err error) *Message { 112 | return &Message{Code: Error, Err: err} 113 | } 114 | 115 | // SleepMessage is a simpler helper for creating sleep messages from a time. 116 | func SleepMessage(t time.Time) *Message { 117 | return &Message{Code: Sleep, Until: &t} 118 | } 119 | 120 | func RunMessage() *Message { return &Message{Code: Run} } 121 | func PauseMessage() *Message { return &Message{Code: Pause} } 122 | func KillMessage() *Message { return &Message{Code: Kill} } 123 | func CheckpointMessage() *Message { return &Message{Code: Checkpoint} } 124 | func ReleaseMessage() *Message { return &Message{Code: Release} } 125 | func CompleteMessage() *Message { return &Message{Code: Complete} } 126 | 127 | // Valid returns true if the Message is valid. Invalid messages sent as 128 | // commands are discarded by the state machine. 129 | func (m *Message) Valid() bool { 130 | switch m.Code { 131 | case Run, Pause, Release, Checkpoint, Complete, Kill: 132 | return true 133 | case Sleep: 134 | return m.Until != nil 135 | case Error: 136 | return m.Err != nil 137 | default: 138 | return false 139 | } 140 | } 141 | 142 | func (m *Message) String() string { 143 | switch m.Code { 144 | case Sleep: 145 | if m.Until != nil { 146 | return fmt.Sprintf("%s until %s", m.Code, m.Until) 147 | } 148 | case Error: 149 | if m.Err != nil { 150 | return fmt.Sprintf("%s: %s", m.Code, m.Err.Error()) 151 | } 152 | } 153 | return string(m.Code) 154 | } 155 | 156 | // MessageCode is the symbolic name of a state transition. 157 | type MessageCode string 158 | 159 | func (m MessageCode) String() string { return string(m) } 160 | 161 | const ( 162 | Run MessageCode = "run" 163 | Sleep MessageCode = "sleep" 164 | Pause MessageCode = "pause" 165 | Kill MessageCode = "kill" 166 | Error MessageCode = "error" 167 | Complete MessageCode = "complete" 168 | Checkpoint MessageCode = "checkpoint" 169 | 170 | // Special event which triggers state machine to exit without transitioning 171 | // between states. 172 | Release MessageCode = "release" 173 | ) 174 | 175 | // Transitions represent a state machine transition from one state to another 176 | // given an event message. 
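// For example, applying a Pause message to a Runnable task produces the transition "runnable---pause--->paused" (see String below).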
177 | type Transition struct { 178 | Event MessageCode 179 | From StateCode 180 | To StateCode 181 | } 182 | 183 | func (t Transition) String() string { 184 | return fmt.Sprintf("%v---%v--->%v", t.From, t.Event, t.To) 185 | } 186 | 187 | var ( 188 | // Rules is the state transition table. 189 | Rules = [...]Transition{ 190 | // Runnable can transition to anything 191 | {Event: Checkpoint, From: Runnable, To: Runnable}, 192 | {Event: Release, From: Runnable, To: Runnable}, 193 | {Event: Sleep, From: Runnable, To: Sleeping}, 194 | {Event: Complete, From: Runnable, To: Completed}, 195 | {Event: Kill, From: Runnable, To: Killed}, 196 | {Event: Error, From: Runnable, To: Fault}, 197 | {Event: Pause, From: Runnable, To: Paused}, 198 | {Event: Run, From: Runnable, To: Runnable}, 199 | 200 | // Sleeping can return to Runnable or be Killed/Paused 201 | {Event: Checkpoint, From: Sleeping, To: Sleeping}, 202 | {Event: Release, From: Sleeping, To: Sleeping}, 203 | {Event: Sleep, From: Sleeping, To: Sleeping}, 204 | {Event: Run, From: Sleeping, To: Runnable}, 205 | {Event: Kill, From: Sleeping, To: Killed}, 206 | {Event: Pause, From: Sleeping, To: Paused}, 207 | {Event: Error, From: Sleeping, To: Fault}, 208 | 209 | // The Fault state transitions to either Sleeping, Failed, or released (to 210 | // allow custom error handlers to work around locality-related errors). 211 | {Event: Sleep, From: Fault, To: Sleeping}, 212 | {Event: Error, From: Fault, To: Failed}, 213 | 214 | // Paused can return to Runnable, be put to Sleep, or Killed 215 | {Event: Checkpoint, From: Paused, To: Paused}, 216 | {Event: Release, From: Paused, To: Paused}, 217 | {Event: Run, From: Paused, To: Runnable}, 218 | {Event: Sleep, From: Paused, To: Sleeping}, 219 | {Event: Kill, From: Paused, To: Killed}, 220 | {Event: Pause, From: Paused, To: Paused}, 221 | 222 | // Completed, Failed, and Killed are terminal states that cannot transition 223 | // to anything. 224 | } 225 | ) 226 | 227 | // StatefulHandler is the function signature that the state machine is able to 228 | // run. Instead of metafora.Handler's Stop method, StatefulHandlers receive 229 | // Messages via the commands chan and return their exit status via a Message. 230 | // 231 | // Normally StatefulHandlers simply return a Message as soon as it's received 232 | // on the commands chan. However, it's also acceptable for a handler to return 233 | // a different Message. For example if it encounters an error during shutdown, 234 | // it may choose to return that error as an Error Message as opposed to the 235 | // original command. 236 | type StatefulHandler func(task metafora.Task, commands <-chan *Message) *Message 237 | 238 | type stateMachine struct { 239 | task metafora.Task 240 | h StatefulHandler 241 | ss StateStore 242 | cl CommandListener 243 | cmds chan *Message 244 | errHandler ErrHandler 245 | 246 | mu *sync.RWMutex 247 | state *State 248 | ts time.Time 249 | 250 | stopL *sync.Mutex 251 | stopped chan bool 252 | } 253 | 254 | // New creates a metafora.Handler that drives the given StatefulHandler with a 255 | // state machine, loading state from ss and receiving commands from cl. It 256 | // should be called in the HandlerFunc you use with metafora's Consumer. 257 | // 258 | // If ErrHandler is nil DefaultErrHandler will be used.
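//
// A minimal wiring sketch, using the embedded package's in-memory StateStore
// and Commander (myHandler is a placeholder for your StatefulHandler):
//
//	ss := embedded.NewStateStore()
//	cmdr := embedded.NewCommander()
//	hf := func(task metafora.Task) metafora.Handler {
//		return statemachine.New(task, myHandler, ss, cmdr.NewListener(task.ID()), nil)
//	}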
259 | func New(task metafora.Task, h StatefulHandler, ss StateStore, cl CommandListener, e ErrHandler) metafora.Handler { 260 | if e == nil { 261 | e = DefaultErrHandler 262 | } 263 | return &stateMachine{ 264 | task: task, 265 | h: h, 266 | ss: ss, 267 | cl: cl, 268 | errHandler: e, 269 | mu: &sync.RWMutex{}, 270 | ts: time.Now(), 271 | stopL: &sync.Mutex{}, 272 | stopped: make(chan bool), 273 | } 274 | } 275 | 276 | // State returns the current state the state machine is in and what time it 277 | // entered that state. The State may be nil if Run() has yet to be called. 278 | func (s *stateMachine) State() (*State, time.Time) { 279 | s.mu.RLock() 280 | defer s.mu.RUnlock() 281 | return s.state, s.ts 282 | } 283 | 284 | func (s *stateMachine) setState(state *State) { 285 | s.mu.Lock() 286 | s.state = state.copy() 287 | s.ts = time.Now() 288 | s.mu.Unlock() 289 | } 290 | 291 | // Run the state machine enabled handler. Loads the initial state and passes 292 | // control to the internal stateful handler passing commands from the command 293 | // listener into the handler's commands chan. 294 | func (s *stateMachine) Run() (done bool) { 295 | // Multiplex external (Stop) messages and internal ones 296 | s.cmds = make(chan *Message) 297 | go func() { 298 | for { 299 | select { 300 | case m := <-s.cl.Receive(): 301 | if !m.Valid() { 302 | metafora.Warnf("Ignoring invalid command: %q", m) 303 | continue 304 | } 305 | select { 306 | case s.cmds <- m: 307 | case <-s.stopped: 308 | return 309 | } 310 | case <-s.stopped: 311 | return 312 | } 313 | } 314 | }() 315 | 316 | // Stop the command listener and internal message multiplexer when Run exits 317 | defer func() { 318 | s.cl.Stop() 319 | s.stop() 320 | }() 321 | 322 | tid := s.task.ID() 323 | 324 | // Load the initial state 325 | state, err := s.ss.Load(s.task) 326 | if err == ReleasableError { 327 | // A failure to load was reported by our provided loader, but the loader 328 | // believed the failure was retriable. In most cases this will be some type 329 | // of network partition or communication error, too many file handles, etc. 330 | metafora.Errorf("task=%q could not load initial state but the task is retriable!", tid) 331 | time.Sleep(time.Second) // delay releasing the task so other nodes don't stampede to retry it 332 | return false 333 | } else if err != nil { 334 | // A failure to load the state for a task is *fatal* - the task will be 335 | // unscheduled and requires operator intervention to reschedule. 336 | metafora.Errorf("task=%q could not load initial state. Marking done! Error: %v", tid, err) 337 | return true 338 | } 339 | if state == nil { 340 | // Note to StateStore implementors: This should not happen! Either state or 341 | // err must be non-nil. This code is simply to prevent a nil pointer panic. 342 | metafora.Errorf("statestore %T returned nil state and err for task=%q - unscheduling", s.ss, tid) 343 | return true 344 | } 345 | if state.Code.Terminal() { 346 | metafora.Warnf("task=%q in terminal state %s - exiting.", tid, state.Code) 347 | return true 348 | } 349 | 350 | s.setState(state) // for introspection/debugging 351 | 352 | // Main Statemachine Loop 353 | done = false 354 | for { 355 | // Enter State 356 | metafora.Debugf("task=%q in state %s", tid, state.Code) 357 | msg := s.exec(state) 358 | 359 | // Apply Message 360 | newstate, ok := apply(state, msg) 361 | if !ok {
362 | metafora.Warnf("task=%q Invalid state transition=%q returned by task. Old state=%q msg.Err=%s", tid, msg.Code, state.Code, msg.Err) 363 | msg = ErrorMessage(msg.Err) 364 | if newstate, ok = apply(state, msg); !ok { 365 | metafora.Errorf("task=%q Unable to transition to error state! Exiting with state=%q", tid, state.Code) 366 | return state.Code.Terminal() 367 | } 368 | } 369 | 370 | metafora.Infof("task=%q transitioning %s --> %s --> %s", tid, state, msg, newstate) 371 | 372 | // Save state - the second half of this condition should rarely, if ever, apply 373 | if msg.Code != Release || (msg.Code == Release && (state.Code != newstate.Code || len(state.Errors) != len(newstate.Errors))) { 374 | if err := s.ss.Store(s.task, newstate); err != nil { 375 | // After upgrading to 1.25.5-gke.2000 we started experiencing the metadata server throwing POD_FINDER_IP_MISMATCH 376 | // errors resulting in failures authenticating to spanner. This panic will cause the pod to cycle. 377 | // See https://github.com/lytics/lio/issues/30414 378 | if strings.Contains(err.Error(), "spanner: code = \"Unauthenticated\"") { 379 | metafora.Errorf("task=%q Unable to persist state=%q due to failure to authenticate to spanner.", tid, newstate.Code) 380 | panic(err) 381 | } 382 | 383 | metafora.Errorf("task=%q Unable to persist state=%q. Marking done.", tid, newstate.Code) 384 | return true 385 | } 386 | } 387 | 388 | // Set next state and loop if non-terminal 389 | state = newstate 390 | 391 | // Expose the state for introspection 392 | s.setState(state) 393 | 394 | // Exit and unschedule task on terminal state. 395 | if state.Code.Terminal() { 396 | return true 397 | } 398 | 399 | // Release messages indicate the task should exit but not unschedule. 400 | if msg.Code == Release { 401 | return false 402 | } 403 | 404 | // Alternatively Stop() may have been called but the handler may not have 405 | // returned the Release message. Always exit if we've been told to Stop() 406 | // even if the handler has returned a different Message. 407 | select { 408 | case <-s.stopped: 409 | return false 410 | default: 411 | } 412 | } 413 | } 414 | 415 | // execute non-terminal states 416 | func (s *stateMachine) exec(state *State) *Message { 417 | switch state.Code { 418 | case Runnable: 419 | // Runnable passes control to the stateful handler 420 | return run(s.h, s.task, s.cmds) 421 | case Paused: 422 | // Paused until a message arrives 423 | return <-s.cmds 424 | case Sleeping: 425 | // Sleeping until the specified time (or a message) 426 | if state.Until == nil { 427 | metafora.Warnf("task=%q told to sleep without a time. Resuming.", s.task.ID()) 428 | return RunMessage() 429 | } 430 | dur := time.Until(*state.Until) 431 | metafora.Infof("task=%q sleeping for %s", s.task.ID(), dur) 432 | timer := time.NewTimer(dur) 433 | select { 434 | case <-timer.C: 435 | return RunMessage() 436 | case msg := <-s.cmds: 437 | timer.Stop() 438 | // Checkpoint & Release are special cases that shouldn't affect sleep 439 | // time, so maintain it across the state transition 440 | if msg.Code == Checkpoint || msg.Code == Release { 441 | msg.Until = state.Until 442 | } 443 | return msg 444 | } 445 | case Fault: 446 | // Special case where we potentially trim the current state to keep 447 | // errors from growing without bound.
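// DefaultErrHandler, for example, keeps only the DefaultErrMax most recent errors (see errors.go).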
448 | var msg *Message 449 | msg, state.Errors = s.errHandler(s.task, state.Errors) 450 | return msg 451 | default: 452 | panic("invalid state: " + state.String()) 453 | } 454 | } 455 | 456 | func run(f StatefulHandler, task metafora.Task, cmd <-chan *Message) (m *Message) { 457 | defer func() { 458 | if r := recover(); r != nil { 459 | stackBuf := make([]byte, 6000) 460 | stackBufLen := runtime.Stack(stackBuf, false) 461 | stackTraceStr := string(stackBuf[0:stackBufLen]) 462 | metafora.Errorf("task=%q Run method panic()d! Applying Error message. Panic: %v\nStack: %s", task.ID(), r, stackTraceStr) 463 | m = &Message{Code: Error, Err: fmt.Errorf("panic: %v\nstack: %s", r, stackTraceStr)} 464 | } 465 | }() 466 | 467 | // Defensive code to give handlers a *copy* of the command chan. That way if 468 | // a handler keeps receiving on the command chan in a goroutine past the 469 | // handler's lifetime it doesn't intercept commands intended for the 470 | // statemachine. 471 | internalcmd := make(chan *Message) 472 | stopped := make(chan struct{}) 473 | go func() { 474 | for { 475 | select { 476 | case c := <-cmd: 477 | internalcmd <- c 478 | case <-stopped: 479 | return 480 | } 481 | } 482 | }() 483 | defer close(stopped) 484 | 485 | return f(task, internalcmd) 486 | } 487 | 488 | // Stop sends a Release message to the state machine through the command chan. 489 | func (s *stateMachine) Stop() { 490 | select { 491 | case s.cmds <- ReleaseMessage(): 492 | // Also inform the state machine it should exit since the internal handler 493 | // may override the release message causing the task to be unreleaseable. 494 | s.stop() 495 | case <-s.stopped: 496 | // Already stopped! 497 | } 498 | } 499 | 500 | func (s *stateMachine) stop() { 501 | s.stopL.Lock() 502 | defer s.stopL.Unlock() 503 | select { 504 | case <-s.stopped: 505 | return 506 | default: 507 | close(s.stopped) 508 | } 509 | } 510 | 511 | // apply a message to cause a state transition. Returns false if the state 512 | // transition is invalid. 513 | func apply(cur *State, m *Message) (*State, bool) { 514 | //XXX Is a linear scan of all rules really the best option here? 515 | for _, trans := range Rules { 516 | if trans.Event == m.Code && trans.From == cur.Code { 517 | metafora.Debugf("Transitioned %s", trans) 518 | if m.Err != nil { 519 | // Append errors from message 520 | cur.Errors = append(cur.Errors, NewErr(m.Err, time.Now())) 521 | } 522 | 523 | // New State + Message's Until + Combined Errors 524 | return &State{Code: trans.To, Until: m.Until, Errors: cur.Errors}, true 525 | } 526 | } 527 | return cur, false 528 | } 529 | -------------------------------------------------------------------------------- /statemachine/statemachine_test.go: -------------------------------------------------------------------------------- 1 | package statemachine_test 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | "time" 7 | 8 | "github.com/lytics/metafora" 9 | "github.com/lytics/metafora/embedded" 10 | . 
"github.com/lytics/metafora/statemachine" 11 | ) 12 | 13 | func testhandler(task metafora.Task, cmds <-chan *Message) *Message { 14 | metafora.Debugf("Starting %s", task.ID()) 15 | m := <-cmds 16 | metafora.Debugf("%s recvd %s", task.ID(), m.Code) 17 | return m 18 | } 19 | 20 | type testStore struct { 21 | initial *State 22 | out chan<- *State 23 | } 24 | 25 | func (s testStore) Load(metafora.Task) (*State, error) { 26 | s.out <- s.initial 27 | return s.initial, nil 28 | } 29 | func (s testStore) Store(task metafora.Task, newstate *State) error { 30 | metafora.Debugf("%s storing %s", task.ID(), newstate.Code) 31 | s.out <- newstate 32 | return nil 33 | } 34 | 35 | // setup a task with the specified task ID in a stateful handler and run it. 36 | func setup(t *testing.T, tid string) (*embedded.StateStore, Commander, metafora.Handler, chan bool) { 37 | t.Parallel() 38 | ss := embedded.NewStateStore().(*embedded.StateStore) 39 | _ = ss.Store(task(tid), &State{Code: Runnable}) 40 | <-ss.Stored // pop initial state out 41 | cmdr := embedded.NewCommander() 42 | cmdlistener := cmdr.NewListener(tid) 43 | sm := New(task(tid), testhandler, ss, cmdlistener, nil) 44 | done := make(chan bool) 45 | go func() { done <- sm.Run() }() 46 | return ss, cmdr, sm, done 47 | } 48 | 49 | // FIXME leaks goroutines 50 | func TestRules(t *testing.T) { 51 | t.Parallel() 52 | for i, trans := range Rules { 53 | metafora.Debugf("Trying %s", trans) 54 | cmdr := embedded.NewCommander() 55 | cmdlistener := cmdr.NewListener("test") 56 | store := make(chan *State) 57 | 58 | state := &State{Code: trans.From} 59 | 60 | // Sleeping state needs extra Until state 61 | if trans.From == Sleeping { 62 | until := time.Now().Add(100 * time.Millisecond) 63 | state.Until = &until 64 | } 65 | 66 | ts := testStore{initial: state, out: store} 67 | 68 | // Create a new statemachine that starts from the From state 69 | sm := New(task("test"), testhandler, ts, cmdlistener, nil) 70 | go sm.Run() 71 | initial := <-store 72 | if initial.Code != trans.From { 73 | t.Fatalf("%d Initial state %q not set. 
Found: %q", i, trans.From, initial.Code) 74 | } 75 | 76 | // The Fault state transitions itself to either sleeping or failed 77 | if trans.From != Fault { 78 | // Apply the Event to transition to the To state 79 | msg := &Message{Code: trans.Event} 80 | 81 | // Sleep messages need extra state 82 | if trans.Event == Sleep { 83 | until := time.Now().Add(10 * time.Millisecond) 84 | msg.Until = &until 85 | } 86 | if trans.Event == Error { 87 | msg.Err = errors.New("test") 88 | } 89 | if err := cmdr.Send("test", msg); err != nil { 90 | t.Fatalf("Error sending message %s: %v", trans.Event, err) 91 | } 92 | } 93 | newstate := <-store 94 | if trans.From == Fault && trans.To == Failed { 95 | // continue on as this transition relies on state this test doesn't exercise 96 | continue 97 | } 98 | if newstate.Code != trans.To { 99 | t.Fatalf("%d Expected %q but found %q for transition %s", i, trans.To, newstate.Code, trans) 100 | } 101 | } 102 | } 103 | 104 | func TestCheckpointRelease(t *testing.T) { 105 | ss, cmdr, _, done := setup(t, "test1") 106 | 107 | // Should just cause statemachine to loop 108 | if err := cmdr.Send("test1", CheckpointMessage()); err != nil { 109 | t.Fatalf("Error sending checkpoint: %v", err) 110 | } 111 | select { 112 | case <-done: 113 | t.Fatalf("Checkpoint command should not have caused statemachine to exit.") 114 | case <-time.After(100 * time.Millisecond): 115 | } 116 | 117 | // Should cause the statemachine to exit 118 | if err := cmdr.Send("test1", ReleaseMessage()); err != nil { 119 | t.Fatalf("Error sending release: %v", err) 120 | } 121 | select { 122 | case d := <-done: 123 | if d { 124 | t.Fatalf("Release command should not have caused the task to be marked as done.") 125 | } 126 | case <-time.After(100 * time.Millisecond): 127 | t.Fatalf("Expected statemachine to exit but it did not.") 128 | } 129 | state, err := ss.Load(task("test1")) 130 | if err != nil { 131 | t.Fatal(err) 132 | } 133 | if state.Code != Runnable { 134 | t.Fatalf("Expected released task to be runnable but found state %q", state.Code) 135 | } 136 | } 137 | 138 | func TestSleep(t *testing.T) { 139 | ss, cmdr, _, _ := setup(t, "sleep-test") 140 | 141 | { 142 | // Put to sleep forever 143 | until := time.Now().Add(9001 * time.Hour) 144 | if err := cmdr.Send("sleep-test", SleepMessage(until)); err != nil { 145 | t.Fatalf("Error sending sleep: %v", err) 146 | } 147 | 148 | newstate := <-ss.Stored 149 | if newstate.State.Code != Sleeping || !newstate.State.Until.Equal(until) { 150 | t.Fatalf("Expected task to store state Sleeping, but stored: %s", newstate) 151 | } 152 | } 153 | 154 | // Make sure it stays sleeping for at least a bit 155 | select { 156 | case newstate := <-ss.Stored: 157 | t.Fatalf("Expected task to stay asleep forever but transitioned to: %s", newstate) 158 | case <-time.After(100 * time.Millisecond): 159 | } 160 | 161 | // Override current sleep with a shorter one 162 | dur := 1 * time.Second 163 | start := time.Now() 164 | until := start.Add(dur) 165 | if err := cmdr.Send("sleep-test", SleepMessage(until)); err != nil { 166 | t.Fatalf("Error sending sleep: %v", err) 167 | } 168 | 169 | newstate := <-ss.Stored 170 | if newstate.State.Code != Sleeping || !newstate.State.Until.Equal(until) { 171 | t.Fatalf("Expected task to store state Sleeping, but stored: %s", newstate) 172 | } 173 | 174 | // Make sure it transitions to Runnable after sleep has elapsed 175 | newstate = <-ss.Stored 176 | transitioned := time.Now() 177 | if newstate.State.Code != Runnable || newstate.State.Until != nil 
{ 178 | t.Fatalf("Expected task to be runnable without an Until time but found: %s", newstate.State) 179 | } 180 | elapsed := transitioned.Sub(start) 181 | if transitioned.Sub(start) < dur { 182 | t.Fatalf("Expected task to sleep for %s but slept for %s", dur, elapsed) 183 | } 184 | t.Logf("Statemachine latency: %s", elapsed-dur) 185 | } 186 | 187 | func TestSleepRelease(t *testing.T) { 188 | ss, cmdr, _, returned := setup(t, "sleep-test") 189 | 190 | until := time.Now().Add(9001 * time.Hour) 191 | { 192 | // Put to sleep forever 193 | if err := cmdr.Send("sleep-test", SleepMessage(until)); err != nil { 194 | t.Fatalf("Error sending sleep: %v", err) 195 | } 196 | 197 | newstate := <-ss.Stored 198 | if newstate.State.Code != Sleeping || !newstate.State.Until.Equal(until) { 199 | t.Fatalf("Expected task to store state Sleeping, but stored: %s", newstate) 200 | } 201 | } 202 | 203 | { 204 | // Releasing should maintain sleep state but exit 205 | if err := cmdr.Send("sleep-test", ReleaseMessage()); err != nil { 206 | t.Fatalf("Error sending release: %v", err) 207 | } 208 | newstate := <-ss.Stored 209 | if newstate.State.Code != Sleeping || newstate.State.Until == nil || !newstate.State.Until.Equal(until) { 210 | t.Fatalf("Releasing unexpectedly changed state: %s != Sleeping || %v != %s", newstate.State.Code, newstate.State.Until, until) 211 | } 212 | if done := <-returned; done { 213 | t.Fatal("Releasing should not have returned done.") 214 | } 215 | } 216 | } 217 | 218 | func TestTerminal(t *testing.T) { 219 | ss, cmdr, sm, done := setup(t, "terminal-test") 220 | 221 | // Kill the task 222 | if err := cmdr.Send("terminal-test", &Message{Code: Kill}); err != nil { 223 | t.Fatalf("Error sending kill command: %v", err) 224 | } 225 | 226 | // Task should be killed and done (unscheduled) 227 | newstate := <-ss.Stored 228 | if newstate.State.Code != Killed { 229 | t.Fatalf("Expected task to be killed but found: %s", newstate.State) 230 | } 231 | if !(<-done) { 232 | t.Fatal("Expected task to be done.") 233 | } 234 | if state, err := ss.Load(task("terminal-test")); err != nil || state.Code != Killed { 235 | t.Fatalf("Failed to load expected killed state for task: state=%s err=%v", state, err) 236 | } 237 | 238 | // Task should just die again if we try to reschedule it 239 | go func() { done <- sm.Run() }() 240 | select { 241 | case newstate := <-ss.Stored: 242 | t.Fatalf("Re-running a terminated task should *not* store state, but it stored: %v", newstate.State) 243 | case <-time.After(100 * time.Millisecond): 244 | // State shouldn't even be stored since it's not being changed and terminal 245 | // states should be immutable 246 | } 247 | 248 | if !(<-done) { 249 | t.Fatal("Expected task to be done.") 250 | } 251 | } 252 | 253 | func TestPause(t *testing.T) { 254 | ss, cmdr, sm, done := setup(t, "test-pause") 255 | 256 | pause := func() { 257 | if err := cmdr.Send("test-pause", PauseMessage()); err != nil { 258 | t.Fatalf("Error sending pause command to test-pause: %v", err) 259 | } 260 | newstate := <-ss.Stored 261 | if newstate.State.Code != Paused { 262 | t.Fatalf("Expected paused state but found: %s", newstate.State) 263 | } 264 | if state, err := ss.Load(task("test-pause")); err != nil || state.Code != Paused { 265 | t.Fatalf("Failed to load expected pause state for task: state=%s err=%v", state, err) 266 | } 267 | 268 | // Task should not be Done; pausing doesn't exit the statemachine 269 | select { 270 | case <-done: 271 | t.Fatal("Task exited unexpectedly.") 272 | case <-time.After(100 * 
time.Millisecond): 273 | } 274 | } 275 | 276 | // Pause the work 277 | pause() 278 | 279 | // Should be able to resume paused work 280 | if err := cmdr.Send("test-pause", RunMessage()); err != nil { 281 | t.Fatalf("Error sending run command to test-pause: %v", err) 282 | } 283 | newstate := <-ss.Stored 284 | if newstate.State.Code != Runnable { 285 | t.Fatalf("Expected runnable state but found: %s", newstate.State) 286 | } 287 | if state, err := ss.Load(task("test-pause")); err != nil || state.Code != Runnable { 288 | t.Fatalf("Failed to load expected runnable state for task: state=%s err=%v", state, err) 289 | } 290 | 291 | // Re-pause the work 292 | pause() 293 | 294 | // Pausing paused work is silly but fine 295 | pause() 296 | 297 | // Releasing paused work should make it exit but leave it in the paused state 298 | sm.Stop() 299 | newstate = <-ss.Stored 300 | if newstate.State.Code != Paused { 301 | t.Fatalf("Releasing should not have changed paused state but stored: %s", newstate.State) 302 | } 303 | select { 304 | case d := <-done: 305 | if d { 306 | t.Fatal("Releasing task should not have marked it as done.") 307 | } 308 | case <-time.After(100 * time.Millisecond): 309 | t.Fatal("Releasing paused task should have exited the statemachine, but didn't.") 310 | } 311 | 312 | // Ensure task is stored with the paused state 313 | if state, err := ss.Load(task("test-pause")); err != nil || state.Code != Paused { 314 | t.Fatalf("Failed to load expected paused state for task: state=%s err=%v", state, err) 315 | } 316 | } 317 | 318 | func TestMessageValid(t *testing.T) { 319 | t.Parallel() 320 | until := time.Now() 321 | validmsgs := []Message{ 322 | {Code: Run}, 323 | {Code: Sleep, Until: &until}, 324 | {Code: Pause}, 325 | {Code: Kill}, 326 | {Code: Error, Err: errors.New("test")}, 327 | {Code: Complete}, 328 | {Code: Checkpoint}, 329 | {Code: Release}, 330 | } 331 | for _, m := range validmsgs { 332 | if !m.Valid() { 333 | t.Errorf("Expected %s to be valid.", m) 334 | } 335 | } 336 | 337 | invalidmsgs := []Message{ 338 | {}, 339 | {Code: Sleep}, 340 | {Code: Error}, 341 | } 342 | for _, m := range invalidmsgs { 343 | if m.Valid() { 344 | t.Errorf("Expected %s to be invalid.", m) 345 | } 346 | } 347 | } 348 | -------------------------------------------------------------------------------- /statemachine/statestore.go: -------------------------------------------------------------------------------- 1 | package statemachine 2 | 3 | import "github.com/lytics/metafora" 4 | 5 | // StateStore is an interface implementations must provide for persisting task 6 | // state. Since the task ID is provided on each method call a single global 7 | // StateStore can be used and implementations should be safe for concurrent 8 | // access. 9 | type StateStore interface { 10 | // Load the persisted or initial state for a task. Errors will cause tasks to 11 | // be marked as done. 12 | // 13 | // The one exception is the special error StateNotFound which will cause the 14 | // state machine to start from the initial (Runnable) state. 15 | Load(metafora.Task) (*State, error) 16 | 17 | // Store the current task state. Errors will prevent current state from being 18 | // persisted and prevent state transitions. 
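// Store is called after each state transition except for Release messages that leave the state unchanged.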
19 | Store(metafora.Task, *State) error 20 | } 21 | -------------------------------------------------------------------------------- /task.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "encoding/json" 5 | "sync" 6 | "time" 7 | ) 8 | 9 | // Task is the minimum interface for Tasks to implement. 10 | type Task interface { 11 | // ID is the immutable globally unique ID for this task. 12 | ID() string 13 | } 14 | 15 | type basictask string 16 | 17 | // NewTask creates the most basic Task implementation: just a string ID. 18 | func NewTask(id string) Task { return basictask(id) } 19 | func (t basictask) ID() string { return string(t) } 20 | 21 | // RunningTask represents tasks running within a consumer. 22 | type RunningTask interface { 23 | Task() Task 24 | 25 | // Started is the time the task was started by this consumer. 26 | Started() time.Time 27 | 28 | // Stopped is the first time Stop() was called on this task or zero if it has 29 | // yet to be called. Tasks may take an indeterminate amount of time to 30 | // shut down after Stop() is called. 31 | Stopped() time.Time 32 | 33 | // Handler implementation called for this task. 34 | Handler() Handler 35 | } 36 | 37 | // runtask is the per-task state Metafora tracks internally. 38 | type runtask struct { 39 | // task is the original Task from the coordinator 40 | task Task 41 | 42 | // handler on which Run and Stop are called 43 | h Handler 44 | 45 | // stopL serializes calls to task.h.Stop() to make handler implementations 46 | // easier/safer as well as guard stopped 47 | stopL sync.Mutex 48 | 49 | // when task was started and when Stop was first called 50 | started time.Time 51 | stopped time.Time 52 | } 53 | 54 | func newTask(task Task, h Handler) *runtask { 55 | return &runtask{task: task, h: h, started: time.Now()} 56 | } 57 | 58 | func (t *runtask) stop() { 59 | t.stopL.Lock() 60 | defer t.stopL.Unlock() 61 | if t.stopped.IsZero() { 62 | t.stopped = time.Now() 63 | } 64 | t.h.Stop() 65 | } 66 | 67 | func (t *runtask) Task() Task { return t.task } 68 | func (t *runtask) Handler() Handler { return t.h } 69 | func (t *runtask) Started() time.Time { return t.started } 70 | func (t *runtask) Stopped() time.Time { 71 | t.stopL.Lock() 72 | defer t.stopL.Unlock() 73 | return t.stopped 74 | } 75 | 76 | func (t *runtask) MarshalJSON() ([]byte, error) { 77 | js := struct { 78 | ID string `json:"id"` 79 | Started time.Time `json:"started"` 80 | Stopped *time.Time `json:"stopped,omitempty"` 81 | }{ID: t.task.ID(), Started: t.started} 82 | 83 | // Only set stopped if it's non-zero 84 | if s := t.Stopped(); !s.IsZero() { 85 | js.Stopped = &s 86 | } 87 | 88 | return json.Marshal(&js) 89 | } 90 | -------------------------------------------------------------------------------- /util_test.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "errors" 5 | "log" 6 | "os" 7 | ) 8 | 9 | func init() { 10 | SetLogger(log.New(os.Stderr, "", log.Lmicroseconds|log.Lshortfile)) 11 | } 12 | 13 | //TODO Move out into a testutil package for other packages to use. The problem 14 | //is that existing metafora tests would have to be moved to the metafora_test 15 | //package which means no manipulating unexported globals like balance jitter.
16 | 17 | type TestCoord struct { 18 | name string 19 | Tasks chan Task // will be returned in order, "" indicates return an error 20 | Commands chan Command 21 | Releases chan Task 22 | Dones chan Task 23 | closed chan bool 24 | } 25 | 26 | func NewTestCoord() *TestCoord { 27 | return &TestCoord{ 28 | name: "testcoord", 29 | Tasks: make(chan Task, 10), 30 | Commands: make(chan Command, 10), 31 | Releases: make(chan Task, 10), 32 | Dones: make(chan Task, 10), 33 | closed: make(chan bool), 34 | } 35 | } 36 | 37 | func (*TestCoord) Init(CoordinatorContext) error { return nil } 38 | func (*TestCoord) Claim(Task) bool { return true } 39 | func (c *TestCoord) Close() { close(c.closed) } 40 | func (c *TestCoord) Release(task Task) { c.Releases <- task } 41 | func (c *TestCoord) Done(task Task) { c.Dones <- task } 42 | func (c *TestCoord) Name() string { return c.name } 43 | 44 | // Watch sends tasks from the Tasks channel unless an empty string is sent. 45 | // Then an error is returned. 46 | func (c *TestCoord) Watch(out chan<- Task) error { 47 | var task Task 48 | for { 49 | select { 50 | case task = <-c.Tasks: 51 | Debugf("TestCoord recvd: %s", task) 52 | if task == nil || task.ID() == "" { 53 | return errors.New("test error") 54 | } 55 | case <-c.closed: 56 | return nil 57 | } 58 | select { 59 | case out <- task: 60 | Debugf("TestCoord sent: %s", task) 61 | case <-c.closed: 62 | return nil 63 | } 64 | } 65 | } 66 | 67 | // Command returns commands from the Commands channel unless a nil is sent. 68 | // Then an error is returned. 69 | func (c *TestCoord) Command() (Command, error) { 70 | cmd := <-c.Commands 71 | if cmd == nil { 72 | return cmd, errors.New("test error") 73 | } 74 | return cmd, nil 75 | } 76 | --------------------------------------------------------------------------------
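For reference, a minimal sketch of driving a Consumer with TestCoord from a test in this package (the blocking handler mirrors slowtask_test.go above, and the balancer is the releaseAllBalancer defined there; this is illustrative, not part of the repository):

```go
func TestExampleUsage(t *testing.T) {
	tc := NewTestCoord()
	h := SimpleHandler(func(task Task, stop <-chan bool) bool {
		<-stop // run until released by Shutdown
		return true
	})
	c, err := NewConsumer(tc, h, &releaseAllBalancer{})
	if err != nil {
		t.Fatalf("Error creating consumer: %v", err)
	}
	go c.Run()

	tc.Tasks <- NewTask("task-1") // feed the consumer a task to claim
	c.Shutdown()                  // stops the handler and exits Run
}
```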