├── .gitignore ├── .travis.yml ├── Documentation ├── design.md ├── etcd.md ├── faq.md ├── images │ ├── metafora_logical_integration_diagram.png │ ├── metafora_node_recovery.png │ └── metafora_nodefailure.png └── introduction.md ├── LICENSE ├── README.md ├── balancer.go ├── balancer_res.go ├── balancer_res_test.go ├── balancer_sleep.go ├── balancer_test.go ├── client.go ├── cmd └── metaforactl │ └── main.go ├── command.go ├── command_test.go ├── coordinator.go ├── doc.go ├── embedded ├── README.md ├── client.go ├── commander.go ├── commander_test.go ├── coordinator.go ├── embedded_test.go ├── statestore.go └── util.go ├── go.mod ├── go.sum ├── handler.go ├── httputil ├── httputil.go └── httputil_test.go ├── ignore.go ├── ignore_test.go ├── logger.go ├── metafora.go ├── metafora_test.go ├── metcdv3 ├── README.md ├── balancer.go ├── balancer_test.go ├── client.go ├── client_test.go ├── commander.go ├── commander_test.go ├── conf.go ├── const.go ├── coordinator.go ├── coordinator_test.go ├── doc.go ├── helpers_test.go ├── integration_test.go ├── statestore.go ├── task.go ├── task_test.go └── testutil │ └── testutil.go ├── resreporter ├── mem_linux.go └── mem_linux_test.go ├── scripts └── docker_run_etcd.sh ├── slowtask_test.go ├── statemachine ├── README.md ├── commander.go ├── doc.go ├── errors.go ├── errors_test.go ├── run_test.go ├── statemachine.go ├── statemachine_test.go └── statestore.go ├── task.go └── util_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | # Build files 23 | *.exe 24 | *.test 25 | *.prof 26 | cover.out 27 | 28 | *.orig 29 | *.swp 30 | 31 | # Executables 32 | cmd/metaforactl/metaforactl 33 | examples/koalemosd/koalemosd 34 | examples/koalemosctl/koalemosctl 35 | 36 | # bazel local only 37 | bazel-* 38 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | script: go test -race -cpu 1,2,4 -v -timeout 5m ./... 3 | sudo: false 4 | go: 5 | - 1.16.x 6 | notifications: 7 | webhooks: 8 | urls: 9 | - https://webhooks.gitter.im/e/737918445727692fe8d1 10 | on_success: "change" # options: [always|never|change] default: always 11 | on_failure: "always" # options: [always|never|change] default: always 12 | on_start: false # default: false 13 | before_script: 14 | - curl -sL https://github.com/etcd-io/etcd/releases/download/v3.3.7/etcd-v3.3.7-linux-amd64.tar.gz | tar xz 15 | - etcd-v3.3.7-linux-amd64/etcd 2> /dev/null & 16 | -------------------------------------------------------------------------------- /Documentation/design.md: -------------------------------------------------------------------------------- 1 | # Design 2 | 3 | ## Exactly Once 4 | 5 | Metafora makes a *best effort* to ensure that exactly one instance of a 6 | submitted task is executing in a cluster. In other words, for task `T`, only 7 | one node may be executing 8 | [`HandlerFunc(T).Run`](https://godoc.org/github.com/lytics/metafora#Handler). 
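To make the cooperative nature of this guarantee concrete, here is a minimal sketch -- not from the codebase -- of a handler whose `Run` exits promptly when `Stop` is called. It assumes the two-method `Run`/`Stop` shape of `metafora.Handler` (see handler.go and the godoc for the authoritative signatures); `stoppableHandler` and its work loop are hypothetical.

```go
package example

import "github.com/lytics/metafora"

// stoppableHandler is a hypothetical handler illustrating a Stop-aware Run.
type stoppableHandler struct {
	stop chan struct{} // closed by Stop to signal Run to exit
}

// Run performs short, resumable units of work and checks for Stop between
// each one so a lost or released claim is honored quickly.
func (h *stoppableHandler) Run(task metafora.Task) (done bool) {
	for {
		select {
		case <-h.stop:
			return false // not done; the task may be claimed by another node
		default:
			// Do one short unit of work here; return true once finished.
		}
	}
}

// Stop signals Run to exit as quickly as possible.
func (h *stoppableHandler) Stop() { close(h.stop) }
```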
9 | 10 | ### Implementation 11 | 12 | *Implementations are Coordinator specific, so this covers the builtin etcd 13 | coordinator.* 14 | 15 | Task claims are represented as keys with a TTL in etcd. A claim key is 16 | refreshed before the TTL expires in order to ensure the node running the task 17 | maintains the claim as long as the node is still executing normally. 18 | 19 | If the node ceases to execute normally due to a crash, high CPU utilization, 20 | network partition between the node and etcd, a bug, etc., the claim in etcd will 21 | expire and the task will be available for claiming by another node. When the 22 | problematic coordinator detects it has failed to maintain its claim, it informs 23 | the consumer it has `Lost` the task, the consumer calls `Handler.Stop` on the 24 | task, and ideally the task exits before it starts executing on a new node (see 25 | Limitations below). 26 | 27 | If a node is unable to reliably communicate with etcd it will stop all of its 28 | tasks and release all of its claims, effectively leaving the cluster. It will 29 | begin claiming tasks once reliable communication with etcd is restored 30 | (although it will probably have to wait on other nodes to `Rebalance` tasks 31 | first). 32 | 33 | All communication with etcd is done with strong consistency. 34 | 35 | ### Limitations 36 | 37 | Metafora cannot stop `Handler.Run` from continuing to execute the moment its 38 | claim expires. Goroutines are cooperative, and threads of execution are subject 39 | to arbitrary pauses and scheduling. 40 | 41 | Using the etcd coordinator, if `Handler.Run` does not exit within 30 seconds, 42 | the task is eligible for simultaneous execution on multiple nodes (see issue #139). 43 | 44 | In other words: the "exactly once guarantee" relies on well-behaved user code 45 | and accurate timers - both of which are out of Metafora's control. 46 | 47 | Handlers should be designed to exit as quickly as possible when `Stop` is 48 | called if they rely on Metafora's exactly-once behavior. Tasks which shut down 49 | slowly should be written to tolerate at-least-once semantics. 50 | -------------------------------------------------------------------------------- /Documentation/etcd.md: -------------------------------------------------------------------------------- 1 | # etcd integration 2 | 3 | Requires etcd v2. See [travis.yml](../.travis.yml) to see which version of etcd 4 | automated tests are run against. 5 | 6 | Metafora contains an [etcd](https://go.etcd.io/etcd) implementation of 7 | the core 8 | [`Coordinator`](https://godoc.org/github.com/lytics/metafora#Coordinator) and 9 | [`Client`](http://godoc.org/github.com/lytics/metafora#Client) interfaces, so 10 | that implementing Metafora with etcd in your own work system is quick and easy. 11 | 12 | ## etcd layout 13 | 14 | ``` 15 | / 16 | └── <namespace> 17 | ├── nodes 18 | │   └── <node> Ephemeral 19 | │   └── commands 20 | │   └── <name> JSON value 21 | │ 22 | ├── tasks 23 | │ └── <task-id> 24 | │ ├── props JSON value (optional) 25 | │ └── owner Ephemeral, JSON value 26 | │ 27 | ├── state Optional, only if using state store 28 | │ └── <task-id> Permanent, JSON value 29 | │ 30 | └── commands Optional, only if using command listener 31 | └── <task-id> Ephemeral, JSON value 32 | 33 | ``` 34 | 35 | ### Tasks 36 | 37 | Metafora clients submit tasks by making an empty directory in 38 | `/<namespace>/tasks/<task-id>` without a TTL. 
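For example, with the v2 `etcdctl` CLI (the namespace `metafora` and task ID `task-001` below are illustrative placeholders, not fixed names):

```sh
etcdctl mkdir /metafora/tasks/task-001
```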
39 | 40 | Metafora nodes claim tasks by watching the `tasks` directory and -- if 41 | `Balancer.CanClaim` returns `true` -- trying to create the 42 | `/<namespace>/tasks/<task-id>/owner` file with the contents set to the node's 43 | name and a short TTL. The node must touch the file before the TTL expires, 44 | otherwise another node will claim the task and begin working on it. 45 | 46 | The JSON format is: 47 | 48 | ```json 49 | {"node": "<node-name>"} 50 | ``` 51 | 52 | Note that Metafora does not handle task parameters or configuration. 53 | 54 | #### Task Properties 55 | 56 | Optionally tasks may have a properties key with a JSON value. The value must be 57 | immutable for the life of the task. 58 | 59 | Users may set a custom `NewTask` function on their `EtcdCoordinator` in order 60 | to unmarshal properties into a custom struct. The struct must implement the 61 | `metafora.Task` interface, and code that wishes to use implementation-specific 62 | methods or fields will have to type assert. 63 | 64 | ### Node Commands 65 | 66 | Metafora clients can send node commands by making a file inside 67 | `/<namespace>/nodes/<node>/commands/` with any name (preferably using a time-ordered 68 | UUID). 69 | 70 | Metafora nodes watch their own node's `commands` directory for new files. The 71 | contents of the files are a command to be executed. Only one command will be 72 | executed at a time, and pending commands are lost on node shutdown. 73 | 74 | ```json 75 | {"command": "<command-name>", "parameters": {}} 76 | ``` 77 | 78 | Where `parameters` is an arbitrary JSON object. 79 | 80 | ### Task State 81 | 82 | If you're using the etcd state store, it will persist a task's state as JSON in 83 | `/<namespace>/state/<task-id>`. The format of the JSON is defined by the 84 | `statemachine` package. 85 | 86 | Task state keys are permanent so they exist even after a task reaches a 87 | terminal state and is unscheduled, for two reasons: 88 | 89 | 1. Provide a task history for users to inspect or prune at their discretion. 90 | 2. Allow the state store to default non-existent task states to Runnable, since if 91 | they were running already or had run to completion before, the task key 92 | would exist. 93 | 94 | See [`statemachine`'s Documentation](../statemachine/README.md) for details. 95 | 96 | ### Task Commands 97 | 98 | If you're using the etcd commander and command listener, task commands are sent 99 | as JSON in `/<namespace>/commands/<task-id>`. Commands are deleted after 100 | they're handled. If more than one command is sent before either can be 101 | processed, only the last command sent will be processed. 102 | 103 | Commands have a TTL of 1 week so they're eventually cleaned up if a task 104 | terminates before it handles a command. 105 | 106 | See [`statemachine`'s Documentation](../statemachine/README.md) for details. 107 | 108 | ## Useful links for managing etcd 109 | 110 | [The etcd API](https://coreos.com/docs/distributed-configuration/etcd-api/) 111 | 112 | [etcd cli tool](https://go.etcd.io/etcdctl) 113 | 114 | -------------------------------------------------------------------------------- /Documentation/faq.md: -------------------------------------------------------------------------------- 1 | Frequently Asked Questions 2 | ========================== 3 | 4 | #### Q. Why not use [Ordasity](https://github.com/boundary/ordasity)? 5 | 6 | [We](http://lytics.io) have an existing work-running system written in Go and 7 | needed a new distribution library for it. There are over 25k lines of Go we'd 8 | like to reuse and couldn't with Ordasity as it runs on the JVM. 9 | 10 | #### Q. 
Why not use [donut](https://github.com/dforsyth/donut)? 11 | 12 | [We](http://lytics.io) evaluated donut and found it far from production ready. 13 | While we've been inspired by many of its basic interfaces, there really wasn't 14 | much code we were interested in reusing. At ~600 lines of code in donut, 15 | starting from scratch didn't seem like it would lose us much. 16 | 17 | That being said, we're very appreciative of donut! It heavily influenced our 18 | initial design. 19 | 20 | #### Q. Why not use [goworker](http://www.goworker.org/) (or similar)? 21 | 22 | goworker does not support rebalancing and appears to be more focused on a high 23 | rate (>1/s) of short-lived work items. Metafora is designed for a low rate 24 | (<1/s) of long-lived work items. This means rebalancing running work is 25 | critical. 26 | 27 | There are a lot of projects in the short-lived offline task processing space, 28 | but few if any handle task state, rebalancing, consistent operation during 29 | partitions, and other features critical for long-running tasks. 30 | 31 | #### Q. Why not use a cluster management framework like [Mesos](http://mesos.apache.org/) or [Kubernetes](http://kubernetes.io/)? 32 | 33 | You can use a cluster management framework to run Metafora, but you *shouldn't* 34 | use Metafora as a cluster management framework. 35 | 36 | While Metafora tasks are long lived, they're often not individually large or 37 | necessarily resource intensive. For example, tasks in the Sleeping state stay 38 | resident in memory to handle any wakeup events (either from a timer or external 39 | command). Cluster management frameworks' smallest unit of work tends to be an 40 | operating system process. 41 | 42 | Lytics often runs over 500 tasks per server in a Metafora cluster. 500 OS 43 | processes would incur nontrivial overhead compared to 500 Metafora tasks, not 44 | to mention be much harder to manage. 45 | 46 | The second reason for preferring Metafora tasks to OS processes is a much 47 | richer command structure. Signals are the only command mechanism OS processes 48 | have built in. Metafora's [state machine](../statemachine/README.md) provides a 49 | much easier to use and more featureful interface for tasks. 50 | 51 | Cluster management frameworks are quite large in terms of code and operational 52 | complexity -- for good reason! They're a much more powerful and general-purpose 53 | tool than Metafora. Metafora is being written, deployed, and maintained by a 54 | very small team, so minimizing operational complexity and overhead is a key 55 | feature. 56 | 57 | #### Q. What are Metafora's limits? 58 | 59 | While Lytics has not run into any firm limits, our current estimates are that 60 | Metafora with the etcd coordinator can scale to: 61 | 62 | * Tens of thousands of concurrently running tasks (the number of servers depends on 63 | the resource utilization of each task). 64 | * Hundreds of state transitions (task created, sleeping, etc.) per second. 65 | 66 | Since etcd is designed for consistency before raw throughput, it is the 67 | limiting factor for cluster growth. 68 | 69 | If you need more concurrent tasks or transitions, it's recommended you run 70 | multiple etcd clusters and multiple Metafora consumers. A single OS process can 71 | run multiple Metafora consumers, so you only have to manage a single logical 72 | Metafora cluster of servers despite there being multiple etcd clusters and 73 | namespaces. 74 | 75 | #### Q. What does metafora mean? 
76 | 77 | It's Greek for "transfer" and also refers to a winch on boats. 78 | [We](http://lytics.io) borrowed the Greek naval naming theme from 79 | [Kubernetes](http://kubernetes.io/). 80 | -------------------------------------------------------------------------------- /Documentation/images/metafora_logical_integration_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lytics/metafora/3c171a91c2055a449aa58d0ec9cedbe848bd386b/Documentation/images/metafora_logical_integration_diagram.png -------------------------------------------------------------------------------- /Documentation/images/metafora_node_recovery.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lytics/metafora/3c171a91c2055a449aa58d0ec9cedbe848bd386b/Documentation/images/metafora_node_recovery.png -------------------------------------------------------------------------------- /Documentation/images/metafora_nodefailure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lytics/metafora/3c171a91c2055a449aa58d0ec9cedbe848bd386b/Documentation/images/metafora_nodefailure.png -------------------------------------------------------------------------------- /Documentation/introduction.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | Metafora is a framework for creating highly available and distributed services written in Go. Metafora is embedded, meaning your code controls how and when Metafora is started. It uses etcd to coordinate across the nodes in your cluster. Metafora is a leaderless task distribution system where the nodes coordinate with each other to ensure that work is evenly distributed over the cluster. 4 | 5 | Metafora is an embedded work-stealing framework built on top of etcd. 6 | 7 | ![logical1](/Documentation/images/metafora_logical_integration_diagram.png) 8 | 9 | ## Overview 10 | 11 | Metafora gives you the ability to build an elastic distributed application. It makes it easy to build applications that scale in or out, and that can recover from node failures. The following diagrams are examples of how this works. 12 | 13 | #### Node failure or scaling in 14 | 15 | When a node fails (or you scale in your nodes), Metafora will release the tasks from the missing node back into the task pool. Other Metafora nodes will detect the unclaimed tasks and attempt to claim them. It's important to note that Metafora simply manages the reassignment of tasks; it's up to your code (possibly in your Metafora handler) to clean up any bad state caused by a task crashing during processing. 16 | 17 | ![logical1](/Documentation/images/metafora_nodefailure.png) 18 | 19 | 20 | #### Node recovery or scaling out 21 | 22 | When a new node joins the cluster, it begins picking up new tasks immediately. Initially the other nodes may have more tasks because they've been in the cluster longer. To address this, occasionally the members compare task load and rebalance the tasks between them. 
23 | 24 | ![logical1](/Documentation/images/metafora_node_recovery.png) 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2014 Lytics 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | metafora 2 | ======== 3 | 4 | [![Join the chat at https://gitter.im/lytics/metafora](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/lytics/metafora?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 5 | [![Build Status](https://travis-ci.org/lytics/metafora.svg?branch=master)](https://travis-ci.org/lytics/metafora) 6 | [![GoDoc](https://godoc.org/github.com/lytics/metafora?status.svg)](https://godoc.org/github.com/lytics/metafora) 7 | 8 | Metafora is a [Go](https://golang.org) library designed to run long-running 9 | (minutes to permanent) tasks in a cluster. 10 | 11 | IRC: `#lytics/metafora` on [irc.gitter.im](https://irc.gitter.im) 12 | 13 | Features 14 | -------- 15 | 16 | * **Distributed** - horizontally scalable 17 | * **Elastic** - online cluster resizing with automated rebalancing 18 | * **Masterless** - work stealing, not assigning, pluggable balancing 19 | * **Fault tolerant** - tasks are reassigned if nodes disappear 20 | * **Simple** - few states, no checkpointing, no configuration management 21 | * **Extensible** - well defined interfaces for implementing balancing and 22 | coordinating 23 | * **Exactly-once** - designed to enforce one-and-only-one instance of each 24 | submitted task is running[ref](Documentation/design.md#exactly-once) 25 | 26 | Metafora is a library for building distributed task work systems. You're 27 | responsible for creating a `main()` entrypoint for your application, writing a 28 | `metafora.Handler` and `HandlerFunc` to actually process tasks, and then 29 | starting Metafora's `Consumer`. 30 | 31 | Metafora's task state machine is implemented as a `Handler` adapter. Simply 32 | implement your task processor as a 33 | [`StatefulHandler`](https://godoc.org/github.com/lytics/metafora/statemachine#StatefulHandler) 34 | function, and create a `metafora.Handler` with 35 | [`statemachine.New`](https://godoc.org/github.com/lytics/metafora/statemachine#New). 36 | 37 | Example 38 | ------- 39 | 40 | [koalemosd](https://github.com/lytics/metafora/blob/master/examples/koalemosd/main.go) 41 | is a sample consumer implementation that can be run as a daemon 42 | (it requires etcd). 
43 | [koalemosctl](https://github.com/lytics/metafora/blob/master/examples/koalemosctl/main.go) 44 | is a sample command line client for submitting tasks to `koalemosd`. 45 | 46 | ```sh 47 | # Install etcd as per https://go.etcd.io/etcd#getting-etcd 48 | # Run the following in one terminal: 49 | go get -v -u github.com/lytics/metafora/examples/koalemosd 50 | koalemosd 51 | 52 | # Run the client in another 53 | go get -v -u github.com/lytics/metafora/examples/koalemosctl 54 | koalemosctl sleep 3 # where "sleep 3" is any command on your $PATH 55 | ``` 56 | 57 | Since koalemosd is a simple wrapper around OS processes, it does not use the 58 | state machine (`statemachine.StatefulHandler`). 59 | 60 | Terms 61 | ----- 62 | 63 | 64 | 65 | 68 | 69 | 70 | 73 | 74 | 76 | 77 | 78 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 |
<table>
<tr><td>Balancer</td><td>Go interface consulted by Consumer for determining which tasks can be claimed and which should be released. See balancer.go.</td></tr>
<tr><td>Broker</td><td>external task and command store like etcd for the Coordinator to use.</td></tr>
<tr><td>Consumer</td><td>core work runner. Integrates Balancer, Coordinator, and Handlers to get work done.</td></tr>
<tr><td>Coordinator</td><td>client Go interface to Broker. See coordinator.go.</td></tr>
<tr><td>Handler</td><td>Go interface for executing tasks.</td></tr>
<tr><td>Task</td><td>unit of work. Executed by Handlers.</td></tr>
</table>
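To show how these terms fit together, the following is a minimal, hypothetical wiring of a Consumer. Signatures are approximate and `newHandler` is a placeholder; consult the godoc above for the authoritative API.

```go
package main

import "github.com/lytics/metafora"

// newHandler is a hypothetical HandlerFunc returning a Handler per task.
func newHandler(task metafora.Task) metafora.Handler { return nil }

func main() {
	// coord would be a real Coordinator, such as the etcd implementation.
	var coord metafora.Coordinator
	consumer, err := metafora.NewConsumer(coord, newHandler, metafora.DumbBalancer)
	if err != nil {
		panic(err)
	}
	consumer.Run() // blocks; call consumer.Shutdown() to stop
}
```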
88 | 89 | FAQ 90 | --- 91 | 92 | **Q. Is it ready for production use?** 93 | 94 | *Yes.* Metafora with the etcd coordinator has been the production work system at 95 | [Lytics](http://lytics.io) since January 2014 and runs thousands of tasks 96 | concurrently across a cluster of VMs. 97 | 98 | Since Metafora is still under heavy development, you probably want to pin the 99 | dependencies to a commit hash or 100 | [tag](https://github.com/lytics/metafora/releases) to keep the API stable. The 101 | `master` branch is automatically tested and is safe for use if you can tolerate 102 | API changes. 103 | 104 | **Q. Where is the metaforad daemon?** 105 | 106 | It doesn't exist. Metafora is a library for you to import and use in a service 107 | you write. Metafora handles task management but leaves implementation details 108 | such as task implementation and daemonization up to the user. 109 | 110 | [FAQ continued in Documentation...](Documentation/faq.md) 111 | -------------------------------------------------------------------------------- /balancer.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "math" 5 | "math/rand" 6 | "time" 7 | ) 8 | 9 | const ( 10 | // Default threshold is 120% of cluster average 11 | defaultThreshold float64 = 1.2 12 | ) 13 | 14 | // NoDelay is simply the zero value for time and meant to be a more meaningful 15 | // value for CanClaim methods to return instead of initializing a new empty 16 | // time struct. 17 | var NoDelay = time.Time{} 18 | 19 | // BalancerContext is a limited interface exposed to Balancers from the 20 | // Consumer for access to limited Consumer state. 21 | type BalancerContext interface { 22 | // Tasks returns a sorted list of task IDs owned by this Consumer. The 23 | // Consumer stops task manipulations during claiming and balancing, so the 24 | // list will be accurate unless a task naturally completes. 25 | Tasks() []RunningTask 26 | } 27 | 28 | // Balancer is the core task balancing interface. Without a master, Metafora 29 | // clusters are cooperatively balanced -- meaning each node needs to know how 30 | // to balance itself. 31 | type Balancer interface { 32 | // Init is called once and only once before any other Balancer methods are 33 | // called. The context argument is meant to expose functionality that might 34 | // be useful for CanClaim and Balance implementations. 35 | Init(BalancerContext) 36 | 37 | // CanClaim should return true if the consumer should accept a task. 38 | // 39 | // When denying a claim by returning false, CanClaim should return the time 40 | // at which to reconsider the task for claiming. 41 | CanClaim(task Task) (ignoreUntil time.Time, claim bool) 42 | 43 | // Balance should return the list of Task IDs that should be released. The 44 | // criteria used to determine which tasks should be released are left up to 45 | // the implementation. 46 | Balance() (release []string) 47 | } 48 | 49 | // DumbBalancer is the simplest possible balancer implementation which simply 50 | // accepts all tasks. Since it has no state, a single global instance exists. 51 | var DumbBalancer = dumbBalancer{} 52 | 53 | type dumbBalancer struct{} 54 | 55 | // Init does nothing. 56 | func (dumbBalancer) Init(BalancerContext) {} 57 | 58 | // CanClaim always returns true. 59 | func (dumbBalancer) CanClaim(Task) (time.Time, bool) { return NoDelay, true } 60 | 61 | // Balance never returns any tasks to balance. 
62 | func (dumbBalancer) Balance() []string { return nil } 63 | 64 | // Provides information about the cluster to be used by FairBalancer 65 | type ClusterState interface { 66 | // Provide the current number of jobs 67 | NodeTaskCount() (map[string]int, error) 68 | } 69 | 70 | // NewDefaultFairBalancer creates a new FairBalancer but requires a 71 | // ClusterState implementation to gain more information about the cluster than 72 | // BalancerContext provides. 73 | func NewDefaultFairBalancer(nodeid string, cs ClusterState) Balancer { 74 | return NewDefaultFairBalancerWithThreshold(nodeid, cs, defaultThreshold) 75 | } 76 | 77 | // NewDefaultFairBalancerWithThreshold allows callers to override 78 | // FairBalancer's default 120% task load release threshold. 79 | func NewDefaultFairBalancerWithThreshold(nodeid string, cs ClusterState, threshold float64) Balancer { 80 | return &FairBalancer{ 81 | nodeid: nodeid, 82 | clusterstate: cs, 83 | releaseThreshold: threshold, 84 | } 85 | } 86 | 87 | // An implementation of Balancer which attempts to randomly release tasks in 88 | // the case when the count of those currently running on this node is greater 89 | // than some percentage of the cluster average (default 120%). 90 | // 91 | // This balancer will claim all tasks which were not released on the last call 92 | // to Balance. 93 | type FairBalancer struct { 94 | nodeid string 95 | 96 | bc BalancerContext 97 | clusterstate ClusterState 98 | 99 | releaseThreshold float64 100 | delay time.Time 101 | } 102 | 103 | func (e *FairBalancer) Init(s BalancerContext) { 104 | e.bc = s 105 | } 106 | 107 | // CanClaim rejects tasks for a period of time if the last balance released 108 | // tasks. Otherwise all tasks are accepted. 109 | func (e *FairBalancer) CanClaim(task Task) (time.Time, bool) { 110 | if e.delay.After(time.Now()) { 111 | // Return delay set by Balance() 112 | return e.delay, false 113 | } 114 | 115 | // Sleep proportional to number of tasks 116 | n := len(e.bc.Tasks()) 117 | time.Sleep(time.Duration(n>>2) * time.Millisecond) 118 | return NoDelay, true 119 | } 120 | 121 | // Balance releases tasks if this node has 120% more tasks than the average 122 | // node in the cluster. 
123 | func (e *FairBalancer) Balance() []string { 124 | nodetasks := e.bc.Tasks() 125 | 126 | // Reset delay 127 | e.delay = time.Time{} 128 | 129 | // If local tasks <= 1 this node should never rebalance 130 | if len(nodetasks) < 2 { 131 | Infof("balancing skipped: nodetasks:%v ", nodetasks) 132 | return nil 133 | } 134 | 135 | current, err := e.clusterstate.NodeTaskCount() 136 | if err != nil { 137 | Warnf("balancing skipped: retrieving cluster state: %v", err) 138 | return nil 139 | } 140 | 141 | desired := e.desiredCount(current) 142 | shouldrelease := current[e.nodeid] - desired 143 | if shouldrelease < 1 { 144 | Infof("balancing skipped: shouldrelease <1 nodetasks:%v desired:%v shouldrelease:%v", len(nodetasks), desired, shouldrelease) 145 | return nil 146 | } 147 | 148 | releasetasks := make([]string, 0, shouldrelease) 149 | releaseset := make(map[string]struct{}, shouldrelease) 150 | 151 | random := rand.New(rand.NewSource(time.Now().UnixNano())) 152 | for len(releasetasks) < shouldrelease { 153 | tid := nodetasks[random.Intn(len(nodetasks))].Task().ID() 154 | if _, ok := releaseset[tid]; !ok { 155 | releasetasks = append(releasetasks, tid) 156 | releaseset[tid] = struct{}{} 157 | } 158 | } 159 | 160 | e.delay = time.Now().Add(time.Duration(len(releasetasks)) * time.Second) 161 | return releasetasks 162 | } 163 | 164 | // Retrieve the desired maximum count, based on current cluster state 165 | func (e *FairBalancer) desiredCount(current map[string]int) int { 166 | total := 0 167 | for _, c := range current { 168 | total += c 169 | } 170 | 171 | avg := 0 172 | if len(current) > 0 { 173 | avg = total / len(current) 174 | } 175 | 176 | return int(math.Ceil(float64(avg) * e.releaseThreshold)) 177 | } 178 | -------------------------------------------------------------------------------- /balancer_res.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | ) 7 | 8 | // ResourceReporter is required by the ResourceBalancer to read the resource 9 | // being used for balancing. 10 | type ResourceReporter interface { 11 | // Used returns the amount of a resource used and the total amount of that 12 | // resource. 13 | Used() (used uint64, total uint64) 14 | 15 | // String returns the unit resources are reported in. 16 | String() string 17 | } 18 | 19 | // ResourceBalancer is a balancer implementation which uses two thresholds to 20 | // limit claiming and rebalance work based upon a resource reported by a 21 | // ResourceReporter. When the claim threshold is exceeded, no new work will be 22 | // claimed. When the release threshold is exceeded, work will be released until 23 | // below that threshold. The claim threshold must be less than the release 24 | // threshold (otherwise claims would continue just to have the work 25 | // rebalanced.) 26 | // 27 | // Even below the claim limit, claims are delayed by the percent of resources 28 | // used (in milliseconds) to give less loaded nodes a claim advantage. 29 | // 30 | // The balancer releases the oldest tasks first (skipping those that are already 31 | // stopping) to try to prevent rebalancing the same tasks repeatedly within a 32 | // cluster. 33 | type ResourceBalancer struct { 34 | ctx BalancerContext 35 | reporter ResourceReporter 36 | 37 | claimLimit int 38 | releaseLimit int 39 | } 40 | 41 | // NewResourceBalancer creates a new ResourceBalancer or returns an error if 42 | // the limits are invalid. 
43 | // 44 | // Limits should be a percentage expressed as an integer between 1 and 100 45 | // inclusive. 46 | func NewResourceBalancer(src ResourceReporter, claimLimit, releaseLimit int) (*ResourceBalancer, error) { 47 | if claimLimit < 1 || claimLimit > 100 || releaseLimit < 1 || releaseLimit > 100 { 48 | return nil, fmt.Errorf("Limits must be between 1 and 100. claim=%d release=%d", claimLimit, releaseLimit) 49 | } 50 | if claimLimit >= releaseLimit { 51 | return nil, fmt.Errorf("Claim threshold must be < release threshold. claim=%d >= release=%d", claimLimit, releaseLimit) 52 | } 53 | 54 | return &ResourceBalancer{ 55 | reporter: src, 56 | claimLimit: claimLimit, 57 | releaseLimit: releaseLimit, 58 | }, nil 59 | } 60 | 61 | func (b *ResourceBalancer) Init(ctx BalancerContext) { 62 | b.ctx = ctx 63 | } 64 | 65 | func (b *ResourceBalancer) CanClaim(string) bool { 66 | used, total := b.reporter.Used() 67 | threshold := int(float32(used) / float32(total) * 100) 68 | if threshold >= b.claimLimit { 69 | //FIXME Until #93 is fixed returning false is very dangerous as it could 70 | // cause a tight loop with the coordinator. Sleep longer than more 71 | // lightly loaded nodes. 72 | dur := time.Duration(100+(threshold-b.claimLimit)) * time.Millisecond 73 | Infof("%d is over the claim limit of %d. Used %d of %d %s. Sleeping %s before claiming.", 74 | threshold, b.claimLimit, used, total, b.reporter, dur) 75 | time.Sleep(dur) 76 | return true 77 | } 78 | 79 | // Always sleep based on resource usage to give less loaded nodes an advantage 80 | dur := time.Duration(threshold) * time.Millisecond 81 | time.Sleep(dur) 82 | return true 83 | } 84 | 85 | func (b *ResourceBalancer) Balance() []string { 86 | used, total := b.reporter.Used() 87 | threshold := int(float32(used) / float32(total) * 100) 88 | if threshold < b.releaseLimit { 89 | // We're below the limit! Don't release anything. 
90 | return nil 91 | } 92 | 93 | // Release the oldest task that isn't already stopping 94 | var oldest RunningTask 95 | for _, t := range b.ctx.Tasks() { 96 | if t.Stopped().IsZero() && (oldest == nil || oldest.Started().After(t.Started())) { 97 | oldest = t 98 | } 99 | } 100 | 101 | // No tasks or all tasks are stopping, don't bother rebalancing 102 | if oldest == nil { 103 | return nil 104 | } 105 | 106 | Infof("Releasing task %s (started %s) because %d > %d (%d of %d %s used)", 107 | oldest.Task().ID(), oldest.Started(), threshold, b.releaseLimit, used, total, b.reporter) 108 | return []string{oldest.Task().ID()} 109 | } 110 | -------------------------------------------------------------------------------- /balancer_res_test.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import "testing" 4 | 5 | type fakeReporter struct { 6 | used uint64 7 | total uint64 8 | } 9 | 10 | func (r *fakeReporter) Used() (uint64, uint64) { return r.used, r.total } 11 | func (r *fakeReporter) String() string { return "fakes" } 12 | 13 | func TestResourceBalancer(t *testing.T) { 14 | t.Parallel() 15 | 16 | fr := &fakeReporter{used: 750, total: 1000} 17 | _, err := NewResourceBalancer(fr, 80, 75) 18 | if err == nil { 19 | t.Fatal("Expected an error: release threshold was lower than claim.") 20 | } 21 | 22 | bal, err := NewResourceBalancer(fr, 80, 90) 23 | if err != nil { 24 | t.Fatalf("Unexpected error creating resource balancer: %v", err) 25 | } 26 | 27 | ctx := &TestConsumerState{ 28 | Current: []string{"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"}, 29 | } 30 | bal.Init(ctx) 31 | 32 | release := bal.Balance() 33 | if len(release) > 0 { 34 | t.Errorf("Released tasks when we were well below limits! %v", release) 35 | } 36 | 37 | // Bump resource usage and rebalance 38 | fr.used = 901 39 | release = bal.Balance() 40 | if len(release) != 1 { 41 | t.Errorf("Expected 1 released task but found: %v", release) 42 | } 43 | 44 | // Make sure we scale up the number we release proportionally 45 | fr.used = 999 46 | release = bal.Balance() 47 | if len(release) != 1 { 48 | t.Errorf("Expected 1 released task but found: %v", release) 49 | } 50 | 51 | //FIXME When #93 is fixed this test should break as CanClaim should actually 52 | // return false 53 | if !bal.CanClaim("claimmepls") { 54 | t.Errorf("Until #93 is fixed, CanClaim should always return true") 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /balancer_sleep.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import "time" 4 | 5 | /* 6 | Q. Why 30ms? 7 | 8 | A. It's sufficiently long that unless a node is under heavy load (either 9 | computational, GC-induced, or network latency) it should win the claim-race 10 | against nodes with more tasks. If it's under so much load that it loses against 11 | nodes with more tasks, it's probably best to let those other nodes win! 12 | 13 | 30ms should scale fairly well up to hundreds of tasks per node as Metafora 14 | isn't really intended for high-throughput/low-latency task churn. 15 | */ 16 | const sleepBalLen = 30 * time.Millisecond 17 | 18 | // SleepBalancer is a simplistic Balancer implementation which sleeps 30ms per 19 | // claimed task in its CanClaim() method. 
This means the node with the fewest 20 | // claimed tasks in a cluster should sleep the shortest length of time and win 21 | // the claim race. 22 | // 23 | // It never releases tasks during Balance() calls. 24 | type SleepBalancer struct { 25 | ctx BalancerContext 26 | } 27 | 28 | // Init is called by the Consumer. 29 | func (b *SleepBalancer) Init(ctx BalancerContext) { b.ctx = ctx } 30 | 31 | // Balance never returns any tasks for the sleepy balancer. 32 | func (*SleepBalancer) Balance() []string { return nil } 33 | 34 | // CanClaim sleeps 30ms per claimed task. 35 | func (b *SleepBalancer) CanClaim(string) bool { 36 | num := len(b.ctx.Tasks()) 37 | time.Sleep(time.Duration(num) * sleepBalLen) 38 | return true 39 | } 40 | -------------------------------------------------------------------------------- /balancer_test.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | var ( 9 | _ BalancerContext = (*TestConsumerState)(nil) 10 | _ ClusterState = (*TestClusterState)(nil) 11 | ) 12 | 13 | func TestFairBalancerOneNode(t *testing.T) { 14 | t.Parallel() 15 | // Single node should never release tasks 16 | clusterstate := &TestClusterState{ 17 | Current: map[string]int{"node1": 5}, 18 | } 19 | 20 | consumerstate := &TestConsumerState{ 21 | []string{"1", "2", "3", "4", "5"}, 22 | } 23 | 24 | fb := NewDefaultFairBalancer("node1", clusterstate) 25 | fb.Init(consumerstate) 26 | 27 | if _, ok := fb.CanClaim(testTask{"23"}); !ok { 28 | t.Fatal("Expected claim to be true") 29 | } 30 | 31 | rebalance := fb.Balance() 32 | if len(rebalance) != 0 { 33 | t.Fatalf("Expected 0 rebalance tasks: %v", rebalance) 34 | } 35 | } 36 | 37 | func TestFairBalanceOver(t *testing.T) { 38 | t.Parallel() 39 | clusterstate := &TestClusterState{ 40 | Current: map[string]int{ 41 | "node1": 10, 42 | "node2": 2, 43 | }, 44 | } 45 | 46 | consumerstate := &TestConsumerState{ 47 | []string{"1", "2", "3", "4", "5"}, 48 | } 49 | 50 | fb := NewDefaultFairBalancer("node1", clusterstate) 51 | fb.Init(consumerstate) 52 | 53 | if _, ok := fb.CanClaim(testTask{"23"}); !ok { 54 | t.Fatal("Expected claim to be true") 55 | } 56 | 57 | expect := 2 58 | rebalance := fb.Balance() 59 | if len(rebalance) != expect { 60 | t.Fatalf("Expected %d rebalanced tasks, received %d", expect, len(rebalance)) 61 | } 62 | } 63 | 64 | func TestFairBalanceNothing(t *testing.T) { 65 | t.Parallel() 66 | clusterstate := &TestClusterState{ 67 | Current: map[string]int{ 68 | "node1": 2, 69 | "node2": 10, 70 | }, 71 | } 72 | 73 | consumerstate := &TestConsumerState{ 74 | []string{"1", "2", "3", "4", "5"}, 75 | } 76 | 77 | fb := NewDefaultFairBalancer("node1", clusterstate) 78 | fb.Init(consumerstate) 79 | 80 | if _, ok := fb.CanClaim(testTask{"23"}); !ok { 81 | t.Fatal("Expected claim to be true") 82 | } 83 | 84 | expect := 0 85 | rebalance := fb.Balance() 86 | if len(rebalance) != expect { 87 | t.Fatalf("Expected %d rebalanced tasks, received %d", expect, len(rebalance)) 88 | } 89 | 90 | } 91 | 92 | type testTask struct { 93 | id string 94 | } 95 | 96 | func (t testTask) ID() string { return t.id } 97 | 98 | type TestClusterState struct { 99 | Current map[string]int 100 | Err error 101 | } 102 | 103 | func (ts *TestClusterState) NodeTaskCount() (map[string]int, error) { 104 | if ts.Err != nil { 105 | return nil, ts.Err 106 | } 107 | 108 | return ts.Current, nil 109 | } 110 | 111 | type TestConsumerState struct { 112 | Current []string 113 | } 114 | 115 | 
func (tc *TestConsumerState) Tasks() []RunningTask { 116 | tasks := []RunningTask{} 117 | for _, id := range tc.Current { 118 | tasks = append(tasks, newTask(testTask{id}, nil)) 119 | } 120 | return tasks 121 | } 122 | 123 | // Sleepy Balancer Tests 124 | 125 | type sbCtx struct { 126 | t *testing.T 127 | tasks []string 128 | } 129 | 130 | func (ctx *sbCtx) Tasks() []RunningTask { 131 | tasks := []RunningTask{} 132 | for _, id := range ctx.tasks { 133 | tasks = append(tasks, newTask(testTask{id}, nil)) 134 | } 135 | return tasks 136 | } 137 | func (ctx *sbCtx) Log(l int, v string, args ...interface{}) { 138 | Infof(v, args...) 139 | } 140 | 141 | func TestSleepBalancer(t *testing.T) { 142 | t.Parallel() 143 | c := &sbCtx{t: t, tasks: make([]string, 0, 10)} 144 | 145 | b := &SleepBalancer{} 146 | b.Init(c) 147 | 148 | task := "test-task" 149 | pre := time.Now() 150 | total := 0 151 | for i := 0; i < 10; i++ { 152 | total += i 153 | b.CanClaim(task) 154 | c.tasks = append(c.tasks, task) 155 | } 156 | post := time.Now() 157 | minimum := pre.Add(time.Duration(total) * sleepBalLen) 158 | 159 | // Sleep balancer should never finish before the minimum timeout threshold 160 | if post.Before(minimum) { 161 | t.Fatalf("SleepBalancer finished too early: %s < %s", post, minimum) 162 | } 163 | 164 | // Sleep balancer shouldn't experience much overhead 165 | if post.After(minimum.Add(50 * time.Millisecond)) { 166 | t.Fatalf("SleepBalancer went a worrying amount over the expected time: %s > %s", post, minimum) 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /client.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | type Client interface { 4 | // SubmitTask submits a task to the system; the task ID must be unique. 5 | SubmitTask(Task) error 6 | 7 | // DeleteTask deletes a task. 8 | DeleteTask(taskId string) error 9 | 10 | // SubmitCommand submits a command to a particular node. 11 | SubmitCommand(node string, command Command) error 12 | 13 | // Nodes retrieves the current set of registered nodes. 14 | Nodes() ([]string, error) 15 | } 16 | -------------------------------------------------------------------------------- /cmd/metaforactl/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | func main() { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /command.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import "encoding/json" 4 | 5 | const ( 6 | cmdFreeze = "freeze" 7 | cmdUnfreeze = "unfreeze" 8 | cmdBalance = "balance" 9 | cmdStopTask = "stop_task" 10 | ) 11 | 12 | // Commands are a way clients can communicate directly with nodes for cluster 13 | // maintenance. 14 | // 15 | // Use the Command functions to generate implementations of this interface. 16 | // Metafora's consumer will discard unknown commands. 17 | type Command interface { 18 | // Name returns the name of the command. 19 | Name() string 20 | 21 | // Parameters returns the parameters, if any, the command will be executed 22 | // with. 23 | Parameters() map[string]interface{} 24 | 25 | // Marshal turns a command into its wire representation. 26 | Marshal() ([]byte, error) 27 | } 28 | 29 | // command is the internal representation of commands used for serialization. 
30 | type command struct { 31 | C string `json:"command"` 32 | P map[string]interface{} `json:"parameters,omitempty"` 33 | } 34 | 35 | // Name returns the name of the command. 36 | func (c *command) Name() string { 37 | return c.C 38 | } 39 | 40 | // Parameters returns the parameters, if any, the command will be executed 41 | // with. 42 | func (c *command) Parameters() map[string]interface{} { 43 | return c.P 44 | } 45 | 46 | // Marshal turns a command into its wire representation. 47 | func (c *command) Marshal() ([]byte, error) { 48 | return json.Marshal(c) 49 | } 50 | 51 | // Unmarshal parses a command from its wire representation. 52 | func UnmarshalCommand(p []byte) (Command, error) { 53 | c := &command{} 54 | err := json.Unmarshal(p, c) 55 | return c, err 56 | } 57 | 58 | // CommandFreeze stops all task watching and balancing. 59 | func CommandFreeze() Command { 60 | return &command{C: cmdFreeze} 61 | } 62 | 63 | // CommandUnfreeze resumes task watching and balancing. 64 | func CommandUnfreeze() Command { 65 | return &command{C: cmdUnfreeze} 66 | } 67 | 68 | // CommandBalance forces the node's balancer.Balance method to be called even 69 | // if frozen. 70 | func CommandBalance() Command { 71 | return &command{C: cmdBalance} 72 | } 73 | 74 | // CommandStopTask forces a node to stop a task even if frozen. 75 | func CommandStopTask(task string) Command { 76 | return &command{C: cmdStopTask, P: map[string]interface{}{"task": task}} 77 | } 78 | -------------------------------------------------------------------------------- /command_test.go: -------------------------------------------------------------------------------- 1 | package metafora_test 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | 7 | . "github.com/lytics/metafora" 8 | ) 9 | 10 | func testCmd(t *testing.T, cmd Command, name string, params map[string]interface{}) { 11 | if cmd.Name() != name { 12 | t.Errorf("%s command's name is wrong: %s", name, cmd.Name()) 13 | } 14 | if !reflect.DeepEqual(cmd.Parameters(), params) { 15 | t.Errorf("%s command's params are wrong. expected %#v != %#v", name, params, cmd.Parameters()) 16 | } 17 | b, err := cmd.Marshal() 18 | if err != nil { 19 | t.Errorf("%s command's Marshal() returned an error: %v", name, err) 20 | return 21 | } 22 | cmd2, err := UnmarshalCommand(b) 23 | if err != nil { 24 | t.Errorf("%s command's Marshal() output could not be Unmarshalled: %v", name, err) 25 | return 26 | } 27 | if cmd2.Name() != name { 28 | t.Errorf("%s command's name didn't Unmarshal properly: %s", name, cmd2.Name()) 29 | } 30 | if !reflect.DeepEqual(cmd2.Parameters(), params) { 31 | t.Errorf("%s command's params didn't Unmarshal properly. expected %#v != %#v", 32 | name, params, cmd2.Parameters()) 33 | } 34 | } 35 | 36 | func TestCommands(t *testing.T) { 37 | t.Parallel() 38 | testCmd(t, CommandFreeze(), "freeze", nil) 39 | testCmd(t, CommandUnfreeze(), "unfreeze", nil) 40 | testCmd(t, CommandBalance(), "balance", nil) 41 | testCmd(t, CommandStopTask("test"), "stop_task", map[string]interface{}{"task": "test"}) 42 | } 43 | -------------------------------------------------------------------------------- /coordinator.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | // CoordinatorContext is the context passed to coordinators by the core 4 | // consumer. 5 | type CoordinatorContext interface { 6 | // Lost is called by the Coordinator when a claimed task is lost to another 7 | // node. The Consumer will stop the task locally. 
8 | // 9 | // Since this implies there is a window of time where the task is executing 10 | // more than once, this is a sign of an unhealthy cluster. 11 | Lost(Task) 12 | } 13 | 14 | // Coordinator is the core interface Metafora uses to discover, claim, and 15 | // release tasks as well as receive commands. 16 | type Coordinator interface { 17 | // Init is called once by the consumer to provide a CoordinatorContext to 18 | // Coordinator implementations. NewConsumer will return Init's return value. 19 | Init(CoordinatorContext) error 20 | 21 | // Watch the broker for claimable tasks. Watch blocks until Close is called 22 | // or it encounters an error. Tasks are sent to the consumer via the tasks chan. 23 | Watch(tasks chan<- Task) (err error) 24 | 25 | // Claim is called by the Consumer when a Balancer has determined that a task 26 | // ID can be claimed. Claim returns false if another consumer has already 27 | // claimed the ID. 28 | Claim(Task) bool 29 | 30 | // Release a task for other consumers to claim. May be called after Close. 31 | Release(Task) 32 | 33 | // Done is called by Metafora when a task has been completed and should never 34 | // be scheduled to run again (in other words: deleted from the broker). 35 | // 36 | // May be called after Close. 37 | Done(Task) 38 | 39 | // Command blocks until a command for this node is received from the broker 40 | // by the coordinator. Command must return (nil, nil) when Close is called. 41 | Command() (Command, error) 42 | 43 | // Close the coordinator. Stop waiting for tasks and commands. Remove node from broker. 44 | // 45 | // Do not release tasks. The consumer will handle task releasing. 46 | Close() 47 | 48 | // Name of the coordinator for use in logs and other tooling. 49 | Name() string 50 | } 51 | 52 | type coordinatorContext struct { 53 | *Consumer 54 | } 55 | 56 | // Lost is a light wrapper around Consumer.stopTask to make it suitable for 57 | // calling by Coordinator implementations via the CoordinatorContext interface. 58 | func (ctx *coordinatorContext) Lost(t Task) { 59 | tid := t.ID() 60 | Errorf("Lost task %s", tid) 61 | ctx.stopTask(tid) 62 | } 63 | -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Metafora is a library for building distributed work systems. It's masterless 2 | // and extensible via core Balancer and Coordinator interfaces. 3 | // 4 | // If you use the builtin FairBalancer and EtcdCoordinator, all you have to do 5 | // is implement a Handler and HandlerFunc, and then run the Consumer. 6 | // 7 | // See https://github.com/lytics/metafora 8 | package metafora 9 | -------------------------------------------------------------------------------- /embedded/README.md: -------------------------------------------------------------------------------- 1 | Creates client/coordinator pairs which use channels to communicate. 2 | 3 | Meant to be used embedded in applications which do not need/want external 4 | coordination, especially tests.
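A minimal usage sketch (illustrative only; the node and task names are
placeholders and error handling is elided):

```go
package main

import (
	"github.com/lytics/metafora"
	"github.com/lytics/metafora/embedded"
)

func main() {
	// Wire up a channel-backed coordinator/client pair.
	coord, client := embedded.NewEmbeddedPair("testnode")

	// A handler that runs until the consumer asks it to stop.
	h := metafora.SimpleHandler(func(_ metafora.Task, stop <-chan bool) bool {
		<-stop
		return false // not done; the task may be claimed again
	})

	consumer, err := metafora.NewConsumer(coord, h, metafora.DumbBalancer)
	if err != nil {
		panic(err)
	}
	go consumer.Run()

	// Submit work through the paired client.
	_ = client.SubmitTask(metafora.NewTask("task-1"))

	consumer.Shutdown()
}
```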
5 | -------------------------------------------------------------------------------- /embedded/client.go: -------------------------------------------------------------------------------- 1 | package embedded 2 | 3 | import "github.com/lytics/metafora" 4 | 5 | func NewEmbeddedClient(taskchan chan metafora.Task, cmdchan chan *NodeCommand, nodechan chan []string) metafora.Client { 6 | return &EmbeddedClient{taskchan, cmdchan, nodechan} 7 | } 8 | 9 | type EmbeddedClient struct { 10 | taskchan chan<- metafora.Task 11 | cmdchan chan<- *NodeCommand 12 | nodechan <-chan []string 13 | } 14 | 15 | func (ec *EmbeddedClient) SubmitTask(t metafora.Task) error { 16 | ec.taskchan <- t 17 | return nil 18 | } 19 | 20 | func (ec *EmbeddedClient) DeleteTask(taskid string) error { 21 | nodes, _ := ec.Nodes() 22 | // Simply submit stop for all nodes 23 | for _, nid := range nodes { 24 | _ = ec.SubmitCommand(nid, metafora.CommandStopTask(taskid)) 25 | } 26 | return nil 27 | } 28 | 29 | func (ec *EmbeddedClient) SubmitCommand(nodeid string, command metafora.Command) error { 30 | ec.cmdchan <- &NodeCommand{command, nodeid} 31 | return nil 32 | } 33 | 34 | func (ec *EmbeddedClient) Nodes() ([]string, error) { 35 | nodes := <-ec.nodechan 36 | return nodes, nil 37 | } 38 | -------------------------------------------------------------------------------- /embedded/commander.go: -------------------------------------------------------------------------------- 1 | package embedded 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/lytics/metafora/statemachine" 7 | ) 8 | 9 | var _ statemachine.Commander = (*Commander)(nil) 10 | 11 | // Commander is an embeddable statemachine.Commander implementation. 12 | // Task-specific command listeners are created by calling NewListener. 13 | type Commander struct { 14 | listeners map[string]chan *statemachine.Message 15 | } 16 | 17 | // NewCommander creates a new statemachine.Commander implementation. 18 | func NewCommander() *Commander { 19 | return &Commander{listeners: make(map[string]chan *statemachine.Message)} 20 | } 21 | 22 | // NewListener creates a task-specific command listener linked to an embedded 23 | // Commander.
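//
// For example (a sketch using only this package's exports):
//
//	cmdr := embedded.NewCommander()
//	cl := cmdr.NewListener("task1")
//	_ = cmdr.Send("task1", statemachine.RunMessage())
//	msg := <-cl.Receive()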
24 | func (c *Commander) NewListener(taskID string) statemachine.CommandListener { 25 | // Buffer chan to make sending/receiving asynchronous 26 | c.listeners[taskID] = make(chan *statemachine.Message, 1) 27 | return &commandListener{c: c.listeners[taskID]} 28 | } 29 | 30 | func (c *Commander) Send(taskID string, m *statemachine.Message) error { 31 | cl, ok := c.listeners[taskID] 32 | if !ok { 33 | return fmt.Errorf("task=%q not running", taskID) 34 | } 35 | cl <- m 36 | return nil 37 | } 38 | 39 | type commandListener struct { 40 | c <-chan *statemachine.Message 41 | } 42 | 43 | func (cl *commandListener) Receive() <-chan *statemachine.Message { return cl.c } 44 | func (*commandListener) Stop() {} 45 | -------------------------------------------------------------------------------- /embedded/commander_test.go: -------------------------------------------------------------------------------- 1 | package embedded_test 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/lytics/metafora/embedded" 8 | "github.com/lytics/metafora/statemachine" 9 | ) 10 | 11 | func TestEmbeddedCommander(t *testing.T) { 12 | t.Parallel() 13 | cmdr := embedded.NewCommander() 14 | cl1 := cmdr.NewListener("task1") 15 | cl2 := cmdr.NewListener("task2") 16 | 17 | if err := cmdr.Send("task1", statemachine.RunMessage()); err != nil { 18 | t.Fatalf("Error sending message to task1: %v", err) 19 | } 20 | if err := cmdr.Send("task2", statemachine.ReleaseMessage()); err != nil { 21 | t.Fatalf("Error sending message to task2: %v", err) 22 | } 23 | if err := cmdr.Send("invalid-task", statemachine.PauseMessage()); err == nil { 24 | t.Fatal("Expected an error when sending to an invalid task, but didn't receive one.") 25 | } 26 | 27 | msg2 := <-cl2.Receive() 28 | if msg2.Code != statemachine.Release { 29 | t.Fatalf("listener2 expected a Release message but received: %#v", msg2) 30 | } 31 | msg1 := <-cl1.Receive() 32 | if msg1.Code != statemachine.Run { 33 | t.Fatalf("listener1 expected a Run message but received: %#v", msg1) 34 | } 35 | 36 | // Stop listeners and make sure nothing works (but doesn't panic) 37 | cl1.Stop() 38 | cl2.Stop() 39 | 40 | select { 41 | case <-cl1.Receive(): 42 | t.Fatal("expected listener1 to be closed but it still received a message!") 43 | case <-cl2.Receive(): 44 | t.Fatal("expected listener2 to be closed but it still received a message!") 45 | case <-time.After(50 * time.Millisecond): 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /embedded/coordinator.go: -------------------------------------------------------------------------------- 1 | package embedded 2 | 3 | import ( 4 | "errors" 5 | 6 | "github.com/lytics/metafora" 7 | ) 8 | 9 | func NewEmbeddedCoordinator(nodeid string, taskchan chan metafora.Task, cmdchan chan *NodeCommand, nodechan chan []string) metafora.Coordinator { 10 | e := &EmbeddedCoordinator{nodeid: nodeid, inchan: taskchan, cmdchan: cmdchan, stopchan: make(chan struct{}), nodechan: nodechan} 11 | // HACK - need to respond to node requests, assuming a single coordinator/client pair 12 | go func() { 13 | for { 14 | select { 15 | case e.nodechan <- []string{e.nodeid}: 16 | case <-e.stopchan: 17 | return 18 | } 19 | } 20 | }() 21 | 22 | return e 23 | } 24 | 25 | // EmbeddedCoordinator is a Coordinator which listens for tasks on a channel 26 | type EmbeddedCoordinator struct { 27 | nodeid string 28 | ctx metafora.CoordinatorContext 29 | inchan chan metafora.Task 30 | cmdchan chan *NodeCommand 31 | nodechan chan<- []string 32 | stopchan chan struct{} 33 | } 34 | 35 | func (e *EmbeddedCoordinator) Init(c metafora.CoordinatorContext) error { 36 | e.ctx = c 37 | return nil 38 | } 39 | 40 | func (e *EmbeddedCoordinator) Watch(out chan<- metafora.Task) error { 41 | for { 42 | // wait for incoming tasks 43 | select { 44 | case id, ok := <-e.inchan: 45 | if !ok { 46 | return errors.New("Input closed") 47 | } 48 | select { 49 | case out <- id: 50 | case <-e.stopchan: 51 | return nil 52 | } 53 | case <-e.stopchan: 54 | return nil 55 | } 56 | } 57 | } 58 | 59 | func (e *EmbeddedCoordinator) Claim(task metafora.Task) bool { 60 | // We received on a channel; we are the only ones to pull that value 61 | return true 62 | } 63 | 64 | func (e *EmbeddedCoordinator) Release(task metafora.Task) { 65 | // Releasing should be async to avoid deadlocks (and better reflect the 66 | // behavior of "real" coordinators) 67 | go func() { 68 | select { 69 | case e.inchan <- task: 70 | case <-e.stopchan: 71 | } 72 | }() 73 | } 74 | 75 | func (e *EmbeddedCoordinator) Done(task metafora.Task) {} 76 | 77 | func (e *EmbeddedCoordinator) Command() (metafora.Command, error) { 78 | select { 79 | case cmd, ok := <-e.cmdchan: 80 | if !ok { 81 | return nil, errors.New("Cmd channel closed") 82 | } 83 | return cmd.Cmd, nil 84 | case <-e.stopchan: 85 | return nil, nil 86 | } 87 | } 88 | 89 | func (e *EmbeddedCoordinator) Close() { 90 | close(e.stopchan) 91 | } 92 | 93 | func (e *EmbeddedCoordinator) Name() string { 94 | return "embedded" 95 | } 96 | -------------------------------------------------------------------------------- /embedded/embedded_test.go: -------------------------------------------------------------------------------- 1 | package embedded 2 | 3 | import ( 4 | "log" 5 | "os" 6 | "sync" 7 | "testing" 8 | "time" 9 | 10 | "github.com/lytics/metafora" 11 | ) 12 | 13 | func init() { 14 | metafora.SetLogger(log.New(os.Stderr, "", log.Lmicroseconds|log.Lshortfile)) 15 | } 16 | 17 | func TestEmbedded(t *testing.T) { 18 | 19 | tc := newTestCounter() 20 | adds := make(chan string, 4) 21 | 22 | thfunc := metafora.SimpleHandler(func(task metafora.Task, _ <-chan bool) bool { 23 | tc.Add(task.ID()) 24 | adds <- task.ID() 25 | return true 26 | }) 27 | 28 | coord, client := NewEmbeddedPair("testnode") 29 | runner, _ := metafora.NewConsumer(coord, thfunc, metafora.DumbBalancer) 30 | 31 | go runner.Run() 32 | 33 | for _, taskid := range []string{"one", "two", "three", "four"} { 34 | err := client.SubmitTask(metafora.NewTask(taskid)) 35 | if err != nil { 36 | t.Fatalf("Expected no error, got %v", err) 37 | } 38 | } 39 | 40 | deadline := time.Now().Add(500 * time.Millisecond) 41 | for time.Now().Before(deadline) { 42 | if len(adds) == 4 { 43 | break 44 | } 45 | time.Sleep(10 * time.Millisecond) 46 | } 47 | if len(adds) != 4 { 48 | t.Errorf("Handlers didn't run in expected amount of time") 49 | } 50 | runner.Shutdown() 51 | 52 | runs := tc.Runs() 53 | if len(runs) != 4 { 54 | t.Fatalf("Expected 4 runs, got %d", len(runs)) 55 | } 56 | 57 | } 58 | 59 | func TestEmbeddedShutdown(t *testing.T) { 60 | const n = 4 61 | runs := make(chan int, n) 62 | stops := make(chan int, n) 63 | thfunc := metafora.SimpleHandler(func(_ metafora.Task, s <-chan bool) bool { 64 | runs <- 1 65 | select { 66 | case <-s: 67 | stops <- 1 68 | return false 69 | case <-time.After(time.Second * 3): 70 | return true 71 | } 72 | }) 73 | 74 | coord, client := NewEmbeddedPair("testnode") 75 | runner, _ := metafora.NewConsumer(coord, thfunc, metafora.DumbBalancer) 76 | 77 | go runner.Run() 78 | 79 | // len(tasks) must == n 80 | tasks :=
[]string{"one", "two", "three", "four"} 81 | 82 | // submit tasks 83 | for _, taskid := range tasks { 84 | err := client.SubmitTask(metafora.NewTask(taskid)) 85 | if err != nil { 86 | t.Fatalf("Expected no error, got %v", err) 87 | } 88 | } 89 | 90 | // make sure all 4 start 91 | for i := 0; i < n; i++ { 92 | <-runs 93 | } 94 | 95 | // tell them to stop 96 | runner.Shutdown() 97 | 98 | // make sure all 4 stop 99 | for i := 0; i < n; i++ { 100 | <-stops 101 | } 102 | } 103 | 104 | func newTestCounter() *testcounter { 105 | return &testcounter{runs: []string{}} 106 | } 107 | 108 | type testcounter struct { 109 | runs []string 110 | cmut sync.Mutex 111 | } 112 | 113 | func (t *testcounter) Add(r string) { 114 | t.cmut.Lock() 115 | defer t.cmut.Unlock() 116 | t.runs = append(t.runs, r) 117 | } 118 | 119 | func (t *testcounter) Runs() []string { 120 | t.cmut.Lock() 121 | defer t.cmut.Unlock() 122 | return t.runs 123 | } 124 | -------------------------------------------------------------------------------- /embedded/statestore.go: -------------------------------------------------------------------------------- 1 | package embedded 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/lytics/metafora" 7 | "github.com/lytics/metafora/statemachine" 8 | ) 9 | 10 | type StateChanged struct { 11 | TaskID string 12 | State *statemachine.State 13 | } 14 | 15 | // StateStore is an in-memory implementation of statemachine.StateStore 16 | // intended for use in tests. 17 | type StateStore struct { 18 | mu *sync.RWMutex 19 | store map[string]*statemachine.State 20 | 21 | // Stored is intended for tests to block until a Store() is called as an 22 | // alternative to time.Sleep()s. 23 | // 24 | // Will deliver asynchronously and drop states if there's no receivers. 25 | Stored chan StateChanged 26 | } 27 | 28 | func NewStateStore() statemachine.StateStore { 29 | return &StateStore{ 30 | mu: &sync.RWMutex{}, 31 | store: map[string]*statemachine.State{}, 32 | Stored: make(chan StateChanged, 1), 33 | } 34 | } 35 | 36 | func (s *StateStore) Load(task metafora.Task) (*statemachine.State, error) { 37 | s.mu.RLock() 38 | defer s.mu.RUnlock() 39 | state, ok := s.store[task.ID()] 40 | if !ok { 41 | return &statemachine.State{Code: statemachine.Runnable}, nil 42 | } 43 | return state, nil 44 | } 45 | 46 | func (s *StateStore) Store(task metafora.Task, state *statemachine.State) error { 47 | s.mu.Lock() 48 | s.store[task.ID()] = state 49 | s.mu.Unlock() 50 | stored := StateChanged{TaskID: task.ID(), State: state} 51 | select { 52 | case s.Stored <- stored: 53 | default: 54 | } 55 | return nil 56 | } 57 | -------------------------------------------------------------------------------- /embedded/util.go: -------------------------------------------------------------------------------- 1 | package embedded 2 | 3 | import "github.com/lytics/metafora" 4 | 5 | type NodeCommand struct { 6 | Cmd metafora.Command 7 | NodeId string 8 | } 9 | 10 | // Returns a connected client/coordinator pair for embedded/testing use 11 | func NewEmbeddedPair(nodeid string) (metafora.Coordinator, metafora.Client) { 12 | taskchan := make(chan metafora.Task) 13 | cmdchan := make(chan *NodeCommand) 14 | nodechan := make(chan []string, 1) 15 | 16 | coord := NewEmbeddedCoordinator(nodeid, taskchan, cmdchan, nodechan) 17 | client := NewEmbeddedClient(taskchan, cmdchan, nodechan) 18 | 19 | return coord, client 20 | } 21 | -------------------------------------------------------------------------------- /go.mod: 
-------------------------------------------------------------------------------- 1 | module github.com/lytics/metafora 2 | 3 | go 1.13 4 | 5 | require ( 6 | github.com/araddon/gou v0.0.0-20190110011759-c797efecbb61 7 | github.com/kr/pretty v0.2.1 // indirect 8 | github.com/kr/text v0.2.0 // indirect 9 | github.com/stretchr/testify v1.7.0 10 | go.etcd.io/etcd/client/v3 v3.5.7 11 | ) 12 | -------------------------------------------------------------------------------- /handler.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | // Handler is the core task handling interface. The Consumer will create a new 4 | // Handler for each claimed task, call Run once and only once, and call Stop 5 | // when the task should persist its progress and exit. 6 | type Handler interface { 7 | // Run handles a task and blocks until completion or Stop is called. 8 | // 9 | // If Run returns true, Metafora will mark the task as Done via the 10 | // Coordinator. The task will not be rescheduled. 11 | // 12 | // If Run returns false, Metafora will Release the task via the Coordinator. 13 | // The task will be scheduled to run again. 14 | // 15 | // Panics are treated the same as returning true. 16 | Run() (done bool) 17 | 18 | // Stop signals the handler to shut down gracefully. Stop implementations 19 | // should not block waiting for Run to exit. 20 | // 21 | // Stop may be called more than once but calls are serialized. Implementations 22 | // may perform different operations on subsequent calls to Stop to implement 23 | // graceful vs. forced shutdown conditions. 24 | // 25 | // Run probably wants to return false when Stop is called, but this is left 26 | // up to the implementation as races between Run finishing and Stop being 27 | // called can happen. 28 | Stop() 29 | } 30 | 31 | // HandlerFunc is called by the Consumer to create a new Handler for each task. 32 | // 33 | // HandlerFunc is meant to be the New function for handlers. Since Run and Stop 34 | // are called concurrently, any state used by both should be initialized in the 35 | // HandlerFunc. Since HandlerFunc is uninterruptible, only the minimum amount 36 | // of work necessary to initialize a handler should be done. 37 | type HandlerFunc func(Task) Handler 38 | 39 | // SimpleHandler creates a HandlerFunc for a simple function that accepts a stop 40 | // channel. The channel will be closed when Stop is called. 41 | func SimpleHandler(f func(t Task, stop <-chan bool) bool) HandlerFunc { 42 | return func(t Task) Handler { 43 | return &simpleHandler{ 44 | task: t, 45 | stop: make(chan bool), 46 | f: f, 47 | } 48 | } 49 | } 50 | 51 | type simpleHandler struct { 52 | task Task 53 | stop chan bool 54 | f func(Task, <-chan bool) bool 55 | } 56 | 57 | func (h *simpleHandler) Run() bool { 58 | return h.f(h.task, h.stop) 59 | } 60 | 61 | func (h *simpleHandler) Stop() { 62 | select { 63 | case <-h.stop: 64 | default: 65 | close(h.stop) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /httputil/httputil.go: -------------------------------------------------------------------------------- 1 | package httputil 2 | 3 | import ( 4 | "encoding/json" 5 | "net/http" 6 | "time" 7 | 8 | "github.com/lytics/metafora" 9 | "github.com/lytics/metafora/statemachine" 10 | ) 11 | 12 | // Consumer contains just the Metafora methods exposed by the HTTP 13 | // introspection endpoints.
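//
// The core metafora.Consumer satisfies this interface, so a node can expose
// its state with something like (an illustrative sketch; the path is
// arbitrary):
//
//	http.Handle("/info", httputil.MakeInfoHandler(consumer, time.Now()))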
14 | type Consumer interface { 15 | Frozen() bool 16 | Tasks() []metafora.RunningTask 17 | String() string 18 | } 19 | 20 | type stateMachine interface { 21 | State() (*statemachine.State, time.Time) 22 | } 23 | 24 | type Task struct { 25 | ID string `json:"id"` 26 | Started time.Time `json:"started"` 27 | Stopped *time.Time `json:"stopped,omitempty"` 28 | State string `json:"state,omitempty"` 29 | Modified *time.Time `json:"modified,omitempty"` 30 | Task metafora.Task `json:"task"` 31 | } 32 | 33 | // InfoResponse is the JSON response marshalled by the MakeInfoHandler. 34 | type InfoResponse struct { 35 | Frozen bool `json:"frozen"` 36 | Name string `json:"name"` 37 | Started time.Time `json:"started"` 38 | Tasks []Task `json:"tasks"` 39 | } 40 | 41 | // MakeInfoHandler returns an HTTP handler which can be added to an exposed 42 | // HTTP server mux by Metafora applications to provide operators with basic 43 | // node introspection. 44 | func MakeInfoHandler(c Consumer, started time.Time) http.HandlerFunc { 45 | return func(w http.ResponseWriter, _ *http.Request) { 46 | tasks := c.Tasks() 47 | resp := InfoResponse{ 48 | Frozen: c.Frozen(), 49 | Name: c.String(), 50 | Started: started, 51 | Tasks: make([]Task, len(tasks)), 52 | } 53 | for i, task := range tasks { 54 | resp.Tasks[i] = Task{ 55 | ID: task.Task().ID(), 56 | Started: task.Started(), 57 | Task: task.Task(), 58 | } 59 | 60 | // Set stopped if it's non-zero 61 | stopped := task.Stopped() 62 | if !stopped.IsZero() { 63 | resp.Tasks[i].Stopped = &stopped 64 | } 65 | 66 | // Expose state if it exists 67 | if sh, ok := task.Handler().(stateMachine); ok { 68 | s, ts := sh.State() 69 | resp.Tasks[i].State = s.String() 70 | resp.Tasks[i].Modified = &ts 71 | } 72 | } 73 | w.Header().Set("Content-Type", "application/json") 74 | _ = json.NewEncoder(w).Encode(&resp) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /httputil/httputil_test.go: -------------------------------------------------------------------------------- 1 | package httputil_test 2 | 3 | import ( 4 | "encoding/json" 5 | "net/http/httptest" 6 | "testing" 7 | "time" 8 | 9 | "github.com/lytics/metafora" 10 | . 
"github.com/lytics/metafora/httputil" 11 | ) 12 | 13 | type tc struct { 14 | stop chan bool 15 | } 16 | 17 | func (*tc) Init(metafora.CoordinatorContext) error { return nil } 18 | func (c *tc) Watch(chan<- metafora.Task) error { 19 | <-c.stop 20 | return nil 21 | } 22 | func (c *tc) Claim(metafora.Task) bool { return false } 23 | func (c *tc) Release(metafora.Task) {} 24 | func (c *tc) Done(metafora.Task) {} 25 | func (c *tc) Command() (metafora.Command, error) { 26 | <-c.stop 27 | return nil, nil 28 | } 29 | func (c *tc) Close() { close(c.stop) } 30 | func (c *tc) Name() string { return "tc" } 31 | 32 | func TestMakeInfoHandler(t *testing.T) { 33 | t.Parallel() 34 | 35 | c, _ := metafora.NewConsumer(&tc{stop: make(chan bool)}, nil, metafora.DumbBalancer) 36 | defer c.Shutdown() 37 | now := time.Now().Truncate(time.Second) 38 | 39 | resp := httptest.NewRecorder() 40 | MakeInfoHandler(c, now)(resp, nil) 41 | 42 | info := InfoResponse{} 43 | if err := json.Unmarshal(resp.Body.Bytes(), &info); err != nil { 44 | t.Fatalf("Error unmarshalling response body: %v", err) 45 | } 46 | if info.Frozen { 47 | t.Errorf("Consumer should not start frozen.") 48 | } 49 | if !info.Started.Equal(now) { 50 | t.Errorf("Started time %s != %s", info.Started, now) 51 | } 52 | if info.Name != "tc" { 53 | t.Errorf("Node name %q != tc", info.Name) 54 | } 55 | if len(info.Tasks) != 0 { 56 | t.Errorf("Unexpected tasks: %v", info.Tasks) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /ignore.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "container/heap" 5 | "sync" 6 | "time" 7 | ) 8 | 9 | // ignoremgr handles ignoring tasks and sending them back to the consumer once 10 | // their ignore deadline is reached. 11 | type ignoremgr struct { 12 | incoming chan *timetask 13 | stop <-chan struct{} 14 | 15 | mu *sync.RWMutex 16 | ignores map[string]struct{} 17 | } 18 | 19 | func ignorer(tasks chan<- Task, stop <-chan struct{}) *ignoremgr { 20 | im := &ignoremgr{ 21 | incoming: make(chan *timetask), 22 | stop: stop, 23 | mu: &sync.RWMutex{}, 24 | ignores: make(map[string]struct{}), 25 | } 26 | go im.monitor(tasks, stop) 27 | return im 28 | } 29 | 30 | func (im *ignoremgr) add(task Task, until time.Time) { 31 | // short circuit zero times; queue everything else 32 | if until.IsZero() { 33 | return 34 | } 35 | 36 | // Add to ignore map 37 | im.mu.Lock() 38 | im.ignores[task.ID()] = struct{}{} 39 | im.mu.Unlock() 40 | 41 | // Send to monitor for pushing onto time heap 42 | select { 43 | case im.incoming <- &timetask{time: until, task: task}: 44 | case <-im.stop: 45 | // Don't bother adding ignore if we're just exiting 46 | } 47 | } 48 | 49 | func (im *ignoremgr) ignored(taskID string) (ignored bool) { 50 | im.mu.RLock() 51 | _, ok := im.ignores[taskID] 52 | im.mu.RUnlock() 53 | 54 | return ok 55 | } 56 | 57 | func (im *ignoremgr) monitor(tasks chan<- Task, stop <-chan struct{}) { 58 | times := timeheap{} 59 | heap.Init(×) 60 | var next *timetask 61 | for { 62 | if times.Len() > 0 { 63 | // Get next ignore from the ignore heap 64 | next = heap.Pop(×).(*timetask) 65 | } else { 66 | // No ignores! 
Wait for one to come in or an exit signal 67 | select { 68 | case <-stop: 69 | return 70 | case newtask := <-im.incoming: 71 | next = newtask 72 | } 73 | } 74 | 75 | // this duration *may* be negative, in which case the 76 | // task will be pushed immediately 77 | timer := time.NewTimer(time.Until(next.time)) 78 | 79 | select { 80 | case newtask := <-im.incoming: 81 | // Push onto next task and new task onto time heap 82 | heap.Push(×, newtask) 83 | heap.Push(×, next) 84 | 85 | // Stop the existing timer for this loop iteration 86 | timer.Stop() 87 | case <-timer.C: 88 | // Ignore expired, remove the entry 89 | im.mu.Lock() 90 | delete(im.ignores, next.task.ID()) 91 | im.mu.Unlock() 92 | 93 | // Notify the consumer 94 | select { 95 | case tasks <- next.task: 96 | case <-stop: 97 | return 98 | } 99 | case <-stop: 100 | return 101 | } 102 | } 103 | } 104 | 105 | func (im *ignoremgr) all() []string { 106 | im.mu.RLock() 107 | defer im.mu.RUnlock() 108 | ignores := make([]string, len(im.ignores)) 109 | i := 0 110 | for k := range im.ignores { 111 | ignores[i] = k 112 | i++ 113 | } 114 | return ignores 115 | } 116 | 117 | type timetask struct { 118 | time time.Time 119 | task Task 120 | } 121 | 122 | // timeheap is a min-heap of time/task tuples sorted by time. 123 | type timeheap []*timetask 124 | 125 | func (h timeheap) Len() int { return len(h) } 126 | func (h timeheap) Less(i, j int) bool { return h[i].time.Before(h[j].time) } 127 | func (h timeheap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } 128 | 129 | func (h *timeheap) Push(x interface{}) { 130 | // Push and Pop use pointer receivers because they modify the slice's length, 131 | // not just its contents. 132 | *h = append(*h, x.(*timetask)) 133 | } 134 | 135 | func (h *timeheap) Pop() interface{} { 136 | old := *h 137 | n := len(old) 138 | x := old[n-1] 139 | *h = old[0 : n-1] 140 | return x 141 | } 142 | -------------------------------------------------------------------------------- /ignore_test.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestIgnore(t *testing.T) { 9 | t.Parallel() 10 | out := make(chan Task) 11 | stop := make(chan struct{}) 12 | defer close(stop) 13 | 14 | // Create ignorer 15 | im := ignorer(out, stop) 16 | 17 | // Ignore task for 200ms. Yes this is racy. Might need to bump deadline. 
18 | deadline1 := time.Now().Add(200 * time.Millisecond) 19 | im.add(testTask{"1"}, deadline1) 20 | 21 | // Ensure it's ignored 22 | if !im.ignored("1") { 23 | t.Fatal("test task should have been ignored but wasn't") 24 | } 25 | 26 | // Ignore task for 10ms to make sure tasks are returned in order (they aren't 27 | // *guaranteed* to be in order since adds and evictions are concurrent) 28 | deadline2 := time.Now().Add(10 * time.Millisecond) 29 | im.add(testTask{"2"}, deadline2) 30 | 31 | // Wait for the first eviction 32 | eviction := <-out 33 | if eviction.ID() != "2" { 34 | t.Fatal("Expected 2 to be evicted before 1") 35 | } 36 | now := time.Now() 37 | if now.Before(deadline2) { 38 | t.Fatalf("First eviction happened too soon: %s < %s", now, deadline2) 39 | } 40 | 41 | eviction = <-out 42 | if eviction.ID() != "1" { 43 | t.Fatal("Expected 1 to be evicted second, found ", eviction) 44 | } 45 | now = time.Now() 46 | if now.Before(deadline1) { 47 | t.Fatalf("Second eviction happened too soon: %s < %s", now, deadline1) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /logger.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "github.com/araddon/gou" 5 | ) 6 | 7 | var LogLevel int = gou.LogLevel 8 | 9 | type LogOutputter interface { 10 | Output(calldepth int, s string) error 11 | } 12 | 13 | // SetLogger is retained for backwards compatibility; logging is currently delegated to the gou package, so this is a no-op. 14 | func SetLogger(l LogOutputter) { 15 | } 16 | 17 | var Debug func(v ...interface{}) = gou.Debug 18 | var Debugf func(format string, v ...interface{}) = gou.Debugf 19 | var Info func(v ...interface{}) = gou.Info 20 | var Infof func(format string, v ...interface{}) = gou.Infof 21 | var Warn func(v ...interface{}) = gou.Warn 22 | var Warnf func(format string, v ...interface{}) = gou.Warnf 23 | var Error func(v ...interface{}) = gou.Error 24 | var Errorf func(format string, v ...interface{}) = gou.Errorf 25 | -------------------------------------------------------------------------------- /metafora.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "fmt" 5 | "math/rand" 6 | "runtime" 7 | "sort" 8 | "sync" 9 | "time" 10 | ) 11 | 12 | var ( 13 | // balance calls are randomized and this is the upper bound of the random 14 | // amount 15 | balanceJitterMax = 10 * int64(time.Second) 16 | ) 17 | 18 | // Consumer is the core Metafora task runner. 19 | type Consumer struct { 20 | // Func to create new handlers 21 | handler HandlerFunc 22 | 23 | // Map of task:Handler 24 | running map[string]*runtask 25 | 26 | // Mutex to protect access to running 27 | runL sync.Mutex 28 | 29 | // WaitGroup for running handlers and consumer goroutines 30 | hwg sync.WaitGroup 31 | 32 | // WaitGroup so Shutdown() can block on Run() exiting fully 33 | runwg sync.WaitGroup 34 | runwgL sync.Mutex 35 | 36 | bal Balancer 37 | balEvery time.Duration 38 | coord Coordinator 39 | im *ignoremgr 40 | stop chan struct{} // closed by Shutdown to cause Run to exit 41 | tasks chan Task // channel for watcher to send tasks to main loop 42 | 43 | // Set by command handler, read anywhere via Consumer.Frozen() 44 | freezeL sync.Mutex 45 | freeze bool 46 | } 47 | 48 | var BalanceEvery = 15 * time.Minute //TODO make balance wait configurable 49 | 50 | // NewConsumer returns a new consumer and calls Init on the Balancer and Coordinator.
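//
// A minimal usage sketch from a caller's perspective (illustrative only;
// coord may be any Coordinator implementation, such as the embedded one):
//
//	hf := metafora.SimpleHandler(func(_ metafora.Task, stop <-chan bool) bool {
//		<-stop
//		return false // released, not done
//	})
//	c, err := metafora.NewConsumer(coord, hf, metafora.DumbBalancer)
//	if err != nil {
//		// Init on the balancer or coordinator failed.
//	}
//	go c.Run()
//	defer c.Shutdown()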
51 | func NewConsumer(coord Coordinator, h HandlerFunc, b Balancer) (*Consumer, error) { 52 | c := &Consumer{ 53 | running: make(map[string]*runtask), 54 | handler: h, 55 | bal: b, 56 | balEvery: BalanceEvery, 57 | coord: coord, 58 | stop: make(chan struct{}), 59 | tasks: make(chan Task), 60 | } 61 | c.im = ignorer(c.tasks, c.stop) 62 | 63 | // initialize balancer with the consumer and a prefixed logger 64 | b.Init(c) 65 | 66 | if err := coord.Init(&coordinatorContext{c}); err != nil { 67 | return nil, err 68 | } 69 | return c, nil 70 | } 71 | 72 | // Run is the core run loop of Metafora. It is responsible for calling into the 73 | // Coordinator to claim work and Balancer to rebalance work. 74 | // 75 | // Run blocks until Shutdown is called or an internal error occurs. 76 | func (c *Consumer) Run() { 77 | Debug(c, " Starting consumer") 78 | 79 | // Increment run wait group so Shutdown() can block on Run() exiting fully. 80 | c.runwgL.Lock() 81 | c.runwg.Add(1) 82 | c.runwgL.Unlock() 83 | defer c.runwg.Done() 84 | 85 | // chans for core goroutines to communicate with main loop 86 | balance := make(chan bool) 87 | cmdChan := make(chan Command) 88 | 89 | // Balance is called by the main loop when the balance channel is ticked 90 | go func() { 91 | randInt := rand.New(rand.NewSource(time.Now().UnixNano())).Int63n 92 | for { 93 | select { 94 | case <-c.stop: 95 | // Shutdown has been called. 96 | return 97 | case <-time.After(c.balEvery + time.Duration(randInt(balanceJitterMax))): 98 | select { 99 | case balance <- true: 100 | // Ticked balance 101 | case <-c.stop: 102 | // Shutdown has been called. 103 | return 104 | } 105 | } 106 | } 107 | }() 108 | 109 | // Watch for new tasks in a goroutine 110 | go c.watcher() 111 | 112 | // Watch for new commands in a goroutine 113 | go func() { 114 | defer close(cmdChan) 115 | for { 116 | cmd, err := c.coord.Command() 117 | if err != nil { 118 | panic(fmt.Errorf("coordinator returned an error during command: %v", err)) 119 | } 120 | if cmd == nil { 121 | Debug(c, " Command coordinator exited") 122 | return 123 | } 124 | // Send command to watcher (or shutdown) 125 | select { 126 | case <-c.stop: 127 | return 128 | case cmdChan <- cmd: 129 | } 130 | } 131 | }() 132 | 133 | // Make sure Run() cleans up on exit (stops coordinator, releases tasks, etc) 134 | defer c.shutdown() 135 | 136 | // Main Loop ensures events are processed synchronously 137 | for { 138 | if c.Frozen() { 139 | // Only recv commands while frozen 140 | select { 141 | case <-c.stop: 142 | // Shutdown has been called. 143 | return 144 | case cmd, ok := <-cmdChan: 145 | if !ok { 146 | Debug(c, " Command channel closed. Exiting main loop.") 147 | return 148 | } 149 | Debugf("%s Received command: %s", c, cmd) 150 | c.handleCommand(cmd) 151 | } 152 | continue 153 | } 154 | 155 | select { 156 | case <-c.stop: 157 | // Shutdown has been called. 158 | return 159 | case <-balance: 160 | c.balance() 161 | case task := <-c.tasks: 162 | tid := task.ID() 163 | if c.ignored(tid) { 164 | Debugf("%s task=%q ignored", c, tid) 165 | continue 166 | } 167 | if until, ok := c.bal.CanClaim(task); !ok { 168 | Infof("%s Balancer rejected task=%q until %s", c, tid, until) 169 | c.ignore(task, until) 170 | break 171 | } 172 | if !c.coord.Claim(task) { 173 | Debugf("%s Coordinator unable to claim task=%q", c, tid) 174 | break 175 | } 176 | c.claimed(task) 177 | case cmd, ok := <-cmdChan: 178 | if !ok { 179 | Debug(c, " Command channel closed. 
Exiting main loop.") 180 | return 181 | } 182 | c.handleCommand(cmd) 183 | } 184 | } 185 | } 186 | 187 | func (c *Consumer) watcher() { 188 | // The watcher dying unexpectedly should close the consumer to cause a 189 | // shutdown. 190 | defer c.close() 191 | 192 | err := c.coord.Watch(c.tasks) 193 | if err != nil { 194 | panic(fmt.Errorf("coordinator returned an error during watch: %v", err)) 195 | } 196 | } 197 | 198 | func (c *Consumer) balance() { 199 | tasks := c.bal.Balance() 200 | if len(tasks) > 0 { 201 | Infof("%s balancer releasing %d tasks: %v", c, len(tasks), tasks) 202 | } 203 | for _, task := range tasks { 204 | // Actually release the rebalanced task. 205 | c.stopTask(task) 206 | } 207 | } 208 | 209 | // close the c.stop channel, which signals the consumer to shut down. 210 | func (c *Consumer) close() { 211 | // acquire the runL lock to make sure we don't race with claimed()'s <-c.stop 212 | // check 213 | c.runL.Lock() 214 | defer c.runL.Unlock() 215 | select { 216 | case <-c.stop: 217 | // already stopped 218 | default: 219 | Debug("Stopping Run loop") 220 | close(c.stop) 221 | } 222 | } 223 | 224 | // shutdown is the actual shutdown logic called when Run() exits. 225 | func (c *Consumer) shutdown() { 226 | c.close() 227 | 228 | // Build list of currently running tasks 229 | runningtasks := c.Tasks() 230 | Infof("Sending stop signal to %d handler(s)", len(runningtasks)) 231 | 232 | for _, rt := range runningtasks { 233 | c.stopTask(rt.Task().ID()) 234 | } 235 | 236 | Info(c, " Waiting for handlers to exit") 237 | c.hwg.Wait() 238 | 239 | Debug("Closing Coordinator ", c) 240 | c.coord.Close() 241 | } 242 | 243 | // Shutdown stops the main Run loop, calls Stop on all handlers, and calls 244 | // Close on the Coordinator. Running tasks will be released for other nodes to 245 | // claim. 246 | func (c *Consumer) Shutdown() { 247 | c.close() 248 | 249 | // Wait for task handlers to exit. 250 | c.hwg.Wait() 251 | 252 | // Make sure Run() exits, otherwise Shutdown() might exit before 253 | // coord.Close() is called. 254 | c.runwgL.Lock() 255 | c.runwg.Wait() 256 | c.runwgL.Unlock() 257 | } 258 | 259 | // Tasks returns a list of running tasks sorted lexicographically by task ID. 260 | func (c *Consumer) Tasks() []RunningTask { 261 | c.runL.Lock() 262 | defer c.runL.Unlock() 263 | 264 | // Create a sorted list of task IDs 265 | ids := make([]string, len(c.running)) 266 | i := 0 267 | for id := range c.running { 268 | ids[i] = id 269 | i++ 270 | } 271 | sort.Strings(ids) 272 | 273 | // Add tasks in lexicographic order 274 | t := make([]RunningTask, len(ids)) 275 | for i, id := range ids { 276 | t[i] = c.running[id] 277 | } 278 | return t 279 | } 280 | 281 | // claimed starts a handler for a claimed task. It is the only method to 282 | // manipulate c.running and closes the task channel when a handler's Run 283 | // method exits. 284 | func (c *Consumer) claimed(task Task) { 285 | h := c.handler(task) 286 | 287 | tid := task.ID() 288 | Debugf("%s is attempting to start task=%q", c, tid) 289 | // Associate handler with taskID 290 | // **This is the only place tasks should be added to c.running** 291 | c.runL.Lock() 292 | defer c.runL.Unlock() 293 | select { 294 | case <-c.stop: 295 | // We're closing, don't bother starting this task 296 | c.coord.Release(task) 297 | return 298 | default: 299 | } 300 | if _, ok := c.running[tid]; ok { 301 | // If a coordinator returns an already claimed task from Watch(), then it's 302 | // a coordinator (or broker) bug.
303 | Warnf("%s Attempted to claim already running task %s", c, tid) 304 | return 305 | } 306 | rt := newTask(task, h) 307 | c.running[tid] = rt 308 | 309 | // This must be done in the runL lock after the stop chan check so Shutdown 310 | // doesn't close(stop) and start Wait()ing concurrently. 311 | // See "Note" http://golang.org/pkg/sync/#WaitGroup.Add 312 | c.hwg.Add(1) 313 | 314 | // Start handler in its own goroutine 315 | go func() { 316 | defer c.hwg.Done() // Must be run after task exit and Done/Release called 317 | 318 | // Run the task 319 | Infof("%s Task %q started", c, tid) 320 | done := c.runTask(h.Run, tid) 321 | var status string 322 | if done { 323 | status = "done" 324 | c.coord.Done(task) 325 | } else { 326 | status = "released" 327 | c.coord.Release(task) 328 | } 329 | 330 | stopped := rt.Stopped() 331 | if stopped.IsZero() { 332 | // Task exited on its own 333 | Infof("%s Task %q exited (%s)", c, tid, status) 334 | } else { 335 | // Task exited due to Stop() being called 336 | Infof("%s Task %q exited (%s) after %s", c, tid, status, time.Since(stopped)) 337 | } 338 | 339 | // **This is the only place tasks should be removed from c.running** 340 | c.runL.Lock() 341 | delete(c.running, tid) 342 | c.runL.Unlock() 343 | }() 344 | 345 | // Pause slightly after a successful claim to give starting tasks some 346 | // breathing room and to bias the next claim toward a node that lost this 347 | // one. 348 | time.Sleep(10 * time.Millisecond) 349 | } 350 | 351 | // runTask executes a handler's Run method and recovers from panic()s. 352 | func (c *Consumer) runTask(run func() bool, task string) bool { 353 | done := false 354 | func() { 355 | defer func() { 356 | if err := recover(); err != nil { 357 | stack := make([]byte, 50*1024) 358 | sz := runtime.Stack(stack, false) 359 | Errorf("%s Handler %s panic()'d: %v\n%s", c, task, err, stack[:sz]) 360 | // panics are considered fatal errors. Make sure the task isn't 361 | // rescheduled. 362 | done = true 363 | } 364 | }() 365 | done = run() 366 | }() 367 | return done 368 | } 369 | 370 | // stopTask asynchronously calls the task handlers' Stop method. While stopTask 371 | // calls don't block, calls to task handler's Stop method are serialized with a 372 | // lock. 373 | func (c *Consumer) stopTask(taskID string) { 374 | c.runL.Lock() 375 | task, ok := c.running[taskID] 376 | c.runL.Unlock() 377 | 378 | if !ok { 379 | // This can happen if a task completes during Balance() and is not an error. 380 | Warnf("%s tried to release a non-running task=%q", c, taskID) 381 | return 382 | } 383 | 384 | // all handler methods must be wrapped in a recover to prevent a misbehaving 385 | // handler from crashing the entire consumer 386 | go func() { 387 | defer func() { 388 | if err := recover(); err != nil { 389 | stack := make([]byte, 50*1024) 390 | sz := runtime.Stack(stack, false) 391 | Errorf("%s Handler %s panic()'d on Stop: %v\n%s", c, taskID, err, stack[:sz]) 392 | } 393 | }() 394 | 395 | // Serialize calls to Stop as a convenience to handler implementors. 396 | task.stop() 397 | }() 398 | } 399 | 400 | // Frozen returns true if Metafora is no longer watching for new tasks or 401 | // rebalancing. 402 | // 403 | // Metafora will remain frozen until receiving an Unfreeze command or it is 404 | // restarted (frozen state is not persisted). 
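//
// For example, an operator can toggle freezing remotely via a Client (a
// sketch; "node1" is a placeholder node name):
//
//	_ = client.SubmitCommand("node1", metafora.CommandFreeze())
//	_ = client.SubmitCommand("node1", metafora.CommandUnfreeze())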
405 | func (c *Consumer) Frozen() bool { 406 | c.freezeL.Lock() 407 | r := c.freeze 408 | c.freezeL.Unlock() 409 | return r 410 | } 411 | 412 | func (c *Consumer) handleCommand(cmd Command) { 413 | switch cmd.Name() { 414 | case cmdFreeze: 415 | if c.Frozen() { 416 | Info(c, " Ignoring freeze command: already frozen") 417 | return 418 | } 419 | Info(c, " Freezing") 420 | c.freezeL.Lock() 421 | c.freeze = true 422 | c.freezeL.Unlock() 423 | case cmdUnfreeze: 424 | if !c.Frozen() { 425 | Info(c, " Ignoring unfreeze command: not frozen") 426 | return 427 | } 428 | Info(c, " Unfreezing") 429 | c.freezeL.Lock() 430 | c.freeze = false 431 | c.freezeL.Unlock() 432 | case cmdBalance: 433 | Info(c, " Balancing due to command") 434 | c.balance() 435 | Debug(c, " Finished balancing due to command") 436 | case cmdStopTask: 437 | taskI, ok := cmd.Parameters()["task"] 438 | task, ok2 := taskI.(string) 439 | if !ok || !ok2 { 440 | Error(c, " Stop task command didn't contain a valid task") 441 | return 442 | } 443 | Infof("%s Stopping task %s due to command", c, task) 444 | c.stopTask(task) 445 | default: 446 | Warnf("%s Discarding unknown command: %s", c, cmd.Name()) 447 | } 448 | } 449 | 450 | func (c *Consumer) ignored(taskID string) bool { return c.im.ignored(taskID) } 451 | func (c *Consumer) ignore(t Task, until time.Time) { c.im.add(t, until) } 452 | 453 | // Ignores returns a list of all ignored tasks. 454 | func (c *Consumer) Ignores() []string { return c.im.all() } 455 | 456 | func (c *Consumer) String() string { 457 | return c.coord.Name() 458 | } 459 | -------------------------------------------------------------------------------- /metafora_test.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | func init() { 10 | if os.Getenv("VERBOSE_TESTS") != "" { 11 | SetLogger(testlogger{}) 12 | } 13 | } 14 | 15 | type testlogger struct{} 16 | 17 | func (testlogger) Output(int, string) error { return nil } 18 | 19 | // Handler/Consumer test 20 | 21 | type testHandler struct { 22 | stop chan int 23 | t *testing.T 24 | task Task 25 | tasksRun chan string 26 | } 27 | 28 | func (h *testHandler) Run() bool { 29 | h.tasksRun <- h.task.ID() 30 | h.t.Logf("Run(%s)", h.task.ID()) 31 | <-h.stop 32 | h.t.Logf("Stop received for %s", h.task.ID()) 33 | return true 34 | } 35 | 36 | func (h *testHandler) Stop() { 37 | h.t.Logf("Stopping %s", h.task.ID()) 38 | close(h.stop) 39 | } 40 | 41 | func newTestHandlerFunc(t *testing.T) (HandlerFunc, chan string) { 42 | tasksRun := make(chan string, 10) 43 | return func(task Task) Handler { 44 | return &testHandler{ 45 | task: task, 46 | stop: make(chan int), 47 | t: t, 48 | tasksRun: tasksRun, 49 | } 50 | }, tasksRun 51 | } 52 | 53 | // TestConsumer ensures the consumer's main loop properly handles tasks as well 54 | // as errors and Shutdown.
55 | func TestConsumer(t *testing.T) { 56 | t.Parallel() 57 | 58 | // Setup some tasks to run in a fake coordinator 59 | tc := NewTestCoord() 60 | tc.Tasks <- testTask{"test1"} 61 | tc.Tasks <- testTask{"test2"} 62 | 63 | // Setup a handler func that lets us know what tasks are running 64 | hf, tasksRun := newTestHandlerFunc(t) 65 | 66 | // Create the consumer and run it 67 | c, _ := NewConsumer(tc, hf, DumbBalancer) 68 | s := make(chan int) 69 | go func() { 70 | c.Run() 71 | s <- 1 72 | }() 73 | 74 | for i := 0; i < 2; i++ { 75 | select { 76 | case <-s: 77 | t.Fatalf("Run exited early") 78 | case tr := <-tasksRun: 79 | if tr != "test1" && tr != "test2" { 80 | t.Errorf("Expected `test1` or `test2` but received: %s", tr) 81 | } 82 | t.Logf("Received task=%q", tr) 83 | case <-time.After(100 * time.Millisecond): 84 | t.Errorf("First task didn't execute in a timely fashion") 85 | } 86 | } 87 | 88 | // Ensure Tasks() is accurate 89 | tasks := c.Tasks() 90 | if len(tasks) != 2 { 91 | t.Errorf("Expected 2 tasks to be running but found: %v", tasks) 92 | } 93 | 94 | go func() { 95 | c.Shutdown() 96 | s <- 1 97 | }() 98 | for i := 0; i < 2; i++ { 99 | select { 100 | case <-s: 101 | case <-time.After(100 * time.Millisecond): 102 | t.Errorf("Run and Shutdown didn't finish in a timely fashion") 103 | } 104 | } 105 | } 106 | 107 | // Balancer/Consumer test 108 | 109 | type testBalancer struct { 110 | c BalancerContext 111 | t *testing.T 112 | secondRun bool 113 | done chan struct{} 114 | } 115 | 116 | func (b *testBalancer) Init(c BalancerContext) { b.c = c } 117 | func (b *testBalancer) CanClaim(task Task) (time.Time, bool) { 118 | b.t.Logf("CanClaim(%s) -> %t", task.ID(), task.ID() == "ok-task") 119 | return time.Now().Add(100 * time.Hour), task.ID() == "ok-task" 120 | } 121 | 122 | func (b *testBalancer) Balance() []string { 123 | if b.secondRun { 124 | return nil 125 | } 126 | b.secondRun = true 127 | tsks := b.c.Tasks() 128 | if len(tsks) != 1 { 129 | b.t.Errorf("len(ConsumerState.Tasks()) != 1 ==> %v", tsks) 130 | return nil 131 | } 132 | if tsks[0].Task().ID() != "ok-task" { 133 | b.t.Errorf("Wrong task in ConsumerState.Tasks(): %v", tsks) 134 | } 135 | close(b.done) 136 | return nil 137 | } 138 | 139 | func TestBalancer(t *testing.T) { 140 | t.Parallel() 141 | if testing.Short() { 142 | t.Skip("skipping due to -short") 143 | } 144 | 145 | hf, tasksRun := newTestHandlerFunc(t) 146 | tc := NewTestCoord() 147 | balDone := make(chan struct{}) 148 | c, _ := NewConsumer(tc, hf, &testBalancer{t: t, done: balDone}) 149 | c.balEvery = 0 150 | go c.Run() 151 | tc.Tasks <- testTask{"test1"} 152 | tc.Tasks <- testTask{"ok-task"} 153 | tc.Tasks <- testTask{"test2"} 154 | 155 | // Wait for balance 156 | select { 157 | case <-balDone: 158 | case <-time.After(time.Duration(balanceJitterMax) + 10*time.Millisecond): 159 | t.Error("Didn't balance in a timely fashion") 160 | } 161 | 162 | select { 163 | case run := <-tasksRun: 164 | if run != "ok-task" { 165 | t.Errorf("Balancer didn't reject tasks properly. 
Ran task %s", run) 166 | } 167 | case <-time.After(100 * time.Millisecond): 168 | t.Error("Task didn't run in a timely fashion") 169 | } 170 | 171 | /* 172 | if r := c.bal.Balance(); len(r) > 0 { 173 | t.Errorf("Balance() should return 0, not: %v", r) 174 | } 175 | */ 176 | 177 | s := make(chan int) 178 | go func() { 179 | c.Shutdown() 180 | close(s) 181 | }() 182 | select { 183 | case <-s: 184 | case <-time.After(100 * time.Millisecond): 185 | t.Errorf("Shutdown didn't finish in a timely fashion") 186 | } 187 | if len(c.Tasks()) != 0 { 188 | t.Errorf("Shutdown didn't stop all tasks") 189 | } 190 | } 191 | 192 | type noopHandler struct{} 193 | 194 | func (noopHandler) Run() bool { return true } 195 | func (noopHandler) Stop() {} 196 | 197 | // TestHandleTask ensures that tasks are marked as done once handled. 198 | func TestHandleTask(t *testing.T) { 199 | hf := func(Task) Handler { return noopHandler{} } 200 | coord := NewTestCoord() 201 | c, _ := NewConsumer(coord, hf, DumbBalancer) 202 | go c.Run() 203 | coord.Tasks <- testTask{"task1"} 204 | select { 205 | case <-coord.Releases: 206 | t.Errorf("Release called, expected Done!") 207 | case <-coord.Dones: 208 | case <-time.After(100 * time.Millisecond): 209 | t.Fatalf("Took too long to mark task as done") 210 | } 211 | c.Shutdown() 212 | } 213 | 214 | // TestTaskPanic ensures panics from Run methods are turned into Done calls. 215 | func TestTaskPanic(t *testing.T) { 216 | t.Parallel() 217 | hf := SimpleHandler(func(Task, <-chan bool) bool { 218 | panic("TestTaskPanic") 219 | }) 220 | coord := NewTestCoord() 221 | c, _ := NewConsumer(coord, hf, DumbBalancer) 222 | go c.Run() 223 | coord.Tasks <- testTask{"1"} 224 | coord.Tasks <- testTask{"2"} 225 | coord.Tasks <- testTask{"3"} 226 | for i := 3; i > 0; i-- { 227 | select { 228 | case task := <-coord.Dones: 229 | t.Logf("%s done", task) 230 | case task := <-coord.Releases: 231 | t.Errorf("%s released when it should have been marked Done!", task) 232 | case <-time.After(200 * time.Millisecond): 233 | t.Fatalf("Took too long to mark task(s) as done.") 234 | } 235 | } 236 | c.Shutdown() 237 | } 238 | 239 | // TestShutdown ensures Shutdown causes Run() to exit cleanly. 240 | func TestShutdown(t *testing.T) { 241 | t.Parallel() 242 | hf := SimpleHandler(func(_ Task, c <-chan bool) bool { 243 | <-c 244 | return false 245 | }) 246 | coord := NewTestCoord() 247 | c, _ := NewConsumer(coord, hf, DumbBalancer) 248 | go c.Run() 249 | coord.Tasks <- testTask{"1"} 250 | coord.Tasks <- testTask{"2"} 251 | coord.Tasks <- testTask{"3"} 252 | time.Sleep(100 * time.Millisecond) 253 | if len(coord.Dones)+len(coord.Releases) > 0 { 254 | t.Fatalf("Didn't expect any tasks to exit before Shutdown was called.") 255 | } 256 | c.Shutdown() 257 | for i := 3; i > 0; i-- { 258 | select { 259 | case task := <-coord.Dones: 260 | t.Errorf("%s makred done when it should have been Released!", task) 261 | case task := <-coord.Releases: 262 | t.Logf("%s relased", task) 263 | case <-time.After(200 * time.Millisecond): 264 | t.Fatalf("Took too long to mark task(s) as released.") 265 | } 266 | } 267 | } 268 | -------------------------------------------------------------------------------- /metcdv3/README.md: -------------------------------------------------------------------------------- 1 | metafora etcdv3 client 2 | ==================== 3 | 4 | See [Documentation/etcdv3.md](../Documentation/etcdv3.md) for details. 5 | 6 | Testing 7 | ------- 8 | 9 | Testing the metafora etcd client requires that a new etcd instance be running. 
10 | The etcd instances should be reachable at the connection string 11 | `localhost:5001,localhost:5002,localhost:5003`, or a similar connection string 12 | should be exported in the `ETCDCTL_PEERS` environment variable. 13 | 14 | An example of running the integration tests is given in the command line below: 15 | 16 | ```sh 17 | IP="127.0.0.1" ETCDCTL_PEERS="$IP:5001,$IP:5002,$IP:5003" go test -v 18 | ``` 19 | -------------------------------------------------------------------------------- /metcdv3/balancer.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "path" 7 | 8 | "github.com/lytics/metafora" 9 | 10 | etcdv3 "go.etcd.io/etcd/client/v3" 11 | ) 12 | 13 | // NewFairBalancer creates a new metafora.DefaultFairBalancer that uses etcd 14 | // for counting tasks per node. 15 | func NewFairBalancer(conf *Config, etcdv3c *etcdv3.Client, filter func(*FilterableValue) bool) metafora.Balancer { 16 | e := etcdClusterState{ 17 | etcdv3c: etcdv3c, 18 | kvc: etcdv3.NewKV(etcdv3c), 19 | taskPath: path.Join(conf.Namespace, TasksPath), 20 | nodePath: path.Join(conf.Namespace, NodesPath), 21 | filter: filter, 22 | } 23 | return metafora.NewDefaultFairBalancer(conf.Name, &e) 24 | } 25 | 26 | // etcdClusterState checks the current state of an etcd cluster 27 | type etcdClusterState struct { 28 | etcdv3c *etcdv3.Client 29 | kvc etcdv3.KV 30 | taskPath string 31 | nodePath string 32 | filter func(*FilterableValue) bool 33 | } 34 | 35 | type FilterableValue struct { 36 | Name string 37 | } 38 | 39 | func (e *etcdClusterState) NodeTaskCount() (map[string]int, error) { 40 | state := map[string]int{} 41 | 42 | // First initialize state with nodes as keys 43 | resp, err := e.kvc.Get(context.Background(), e.nodePath, etcdv3.WithPrefix()) 44 | if err != nil { 45 | return nil, err 46 | } 47 | 48 | if resp == nil || len(resp.Kvs) == 0 { 49 | metafora.Warnf("balancer received empty response from GET %s", e.nodePath) 50 | return state, nil 51 | } 52 | 53 | for _, kv := range resp.Kvs { 54 | // We're guaranteed to find nodes under the _metadata path (created on Coordinator startup) 55 | dir, _ := path.Split(string(kv.Key)) 56 | dir, node := path.Split(path.Clean(dir)) 57 | if path.Clean(dir) == e.nodePath && e.filter(&FilterableValue{Name: node}) { 58 | state[node] = 0 59 | } 60 | } 61 | 62 | resp, err = e.kvc.Get(context.Background(), e.taskPath, etcdv3.WithPrefix()) 63 | if err != nil { 64 | return nil, err 65 | } 66 | 67 | // No current tasks 68 | if resp == nil || len(resp.Kvs) == 0 { 69 | return state, nil 70 | } 71 | 72 | // Get the list of all claimed work and create a map of the counts per 73 | // node. 74 | // Tasks which have no claims are ignored. 75 | for _, kv := range resp.Kvs { 76 | ownerPath := path.Base(string(kv.Key)) 77 | if ownerPath == OwnerPath { 78 | ov := &ownerValue{} 79 | err := json.Unmarshal(kv.Value, ov) 80 | if err != nil { 81 | return nil, err 82 | } 83 | // We want to only include those nodes which were initially included, 84 | // as some nodes may be shutting down, etc, and should not be counted 85 | if _, ok := state[ov.Node]; ok { 86 | state[ov.Node]++ 87 | } 88 | } 89 | } 90 | return state, nil 91 | } 92 | -------------------------------------------------------------------------------- /metcdv3/balancer_test.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 |
"github.com/lytics/metafora" 8 | ) 9 | 10 | func TestFairBalancer(t *testing.T) { 11 | t.Parallel() 12 | etcdv3c, coord1, conf1 := setupEtcd(t) 13 | defer etcdv3c.Close() 14 | conf2 := conf1.Copy() 15 | conf2.Name = "coord2" 16 | coord2 := NewEtcdV3Coordinator(conf2, etcdv3c) 17 | 18 | cli := NewClient(conf1.Namespace, etcdv3c) 19 | 20 | h := metafora.SimpleHandler(func(task metafora.Task, stop <-chan bool) bool { 21 | metafora.Debugf("Starting %s", task.ID()) 22 | <-stop 23 | metafora.Debugf("Stopping %s", task.ID()) 24 | return false // never done 25 | }) 26 | 27 | filter := func(_ *FilterableValue) bool { return true } 28 | // Create two consumers 29 | b1 := NewFairBalancer(conf1, etcdv3c, filter) 30 | con1, err := metafora.NewConsumer(coord1, h, b1) 31 | if err != nil { 32 | t.Fatal(err) 33 | } 34 | 35 | b2 := NewFairBalancer(conf2, etcdv3c, filter) 36 | con2, err := metafora.NewConsumer(coord2, h, b2) 37 | if err != nil { 38 | t.Fatal(err) 39 | } 40 | 41 | // Start the first and let it claim a bunch of tasks 42 | go con1.Run() 43 | defer con1.Shutdown() 44 | _ = cli.SubmitTask(DefaultTaskFunc("t1", "")) 45 | _ = cli.SubmitTask(DefaultTaskFunc("t2", "")) 46 | _ = cli.SubmitTask(DefaultTaskFunc("t3", "")) 47 | _ = cli.SubmitTask(DefaultTaskFunc("t4", "")) 48 | _ = cli.SubmitTask(DefaultTaskFunc("t5", "")) 49 | _ = cli.SubmitTask(DefaultTaskFunc("t6", "")) 50 | 51 | time.Sleep(5 * time.Second) 52 | 53 | if len(con1.Tasks()) != 6 { 54 | t.Fatalf("con1 should have claimed 6 tasks: %d", len(con1.Tasks())) 55 | } 56 | 57 | // Start the second consumer and force the 1st to rebalance 58 | go con2.Run() 59 | defer con2.Shutdown() 60 | 61 | // Wait for node to startup and register 62 | time.Sleep(1 * time.Second) 63 | 64 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance()) 65 | 66 | time.Sleep(2 * time.Second) 67 | 68 | c1Tasks := con1.Tasks() 69 | c2Tasks := con2.Tasks() 70 | if len(c1Tasks) != 4 || len(c2Tasks) != 2 { 71 | t.Fatalf("expected consumers to have 4|2 tasks: %d|%d", len(c1Tasks), len(c2Tasks)) 72 | } 73 | 74 | // Finally make sure that balancing the other node does nothing 75 | _ = cli.SubmitCommand("node2", metafora.CommandBalance()) 76 | 77 | time.Sleep(2 * time.Second) 78 | 79 | c1Tasks2 := con1.Tasks() 80 | c2Tasks2 := con2.Tasks() 81 | if len(c1Tasks2) != 4 || len(c2Tasks2) != 2 { 82 | t.Fatalf("expected consumers to have 4|2 tasks: %d|%d", len(c1Tasks2), len(c2Tasks2)) 83 | } 84 | for i := 0; i < 4; i++ { 85 | if c1Tasks[i] != c1Tasks2[i] { 86 | t.Errorf("task mismatch: %s != %s", c1Tasks[i], c1Tasks2[i]) 87 | } 88 | } 89 | for i := 0; i < 2; i++ { 90 | if c2Tasks[i] != c2Tasks2[i] { 91 | t.Errorf("task mismatch: %s != %s", c2Tasks[i], c2Tasks2[i]) 92 | } 93 | } 94 | } 95 | 96 | func TestFairBalancerFilter(t *testing.T) { 97 | t.Parallel() 98 | etcdv3c, coord1, conf1 := setupEtcd(t) 99 | defer etcdv3c.Close() 100 | conf2 := conf1.Copy() 101 | conf2.Name = "coord2" 102 | coord2 := NewEtcdV3Coordinator(conf2, etcdv3c) 103 | 104 | cli := NewClient(conf1.Namespace, etcdv3c) 105 | 106 | h := metafora.SimpleHandler(func(task metafora.Task, stop <-chan bool) bool { 107 | metafora.Debugf("Starting %s", task.ID()) 108 | <-stop 109 | metafora.Debugf("Stopping %s", task.ID()) 110 | return false // never done 111 | }) 112 | 113 | filter := func(fv *FilterableValue) bool { return fv.Name == conf1.Name } 114 | // Create two consumers 115 | b1 := NewFairBalancer(conf1, etcdv3c, filter) 116 | con1, err := metafora.NewConsumer(coord1, h, b1) 117 | if err != nil { 118 | t.Fatal(err) 
119 | } 120 | 121 | filter2 := func(fv *FilterableValue) bool { return fv.Name == conf2.Name } 122 | b2 := NewFairBalancer(conf2, etcdv3c, filter2) 123 | con2, err := metafora.NewConsumer(coord2, h, b2) 124 | if err != nil { 125 | t.Fatal(err) 126 | } 127 | 128 | // Start the first and let it claim a bunch of tasks 129 | go con1.Run() 130 | defer con1.Shutdown() 131 | _ = cli.SubmitTask(DefaultTaskFunc("t1", "")) 132 | _ = cli.SubmitTask(DefaultTaskFunc("t2", "")) 133 | _ = cli.SubmitTask(DefaultTaskFunc("t3", "")) 134 | _ = cli.SubmitTask(DefaultTaskFunc("t4", "")) 135 | _ = cli.SubmitTask(DefaultTaskFunc("t5", "")) 136 | _ = cli.SubmitTask(DefaultTaskFunc("t6", "")) 137 | _ = cli.SubmitTask(DefaultTaskFunc("t7", "")) 138 | _ = cli.SubmitTask(DefaultTaskFunc("t8", "")) 139 | _ = cli.SubmitTask(DefaultTaskFunc("t9", "")) 140 | 141 | time.Sleep(5 * time.Second) 142 | 143 | if len(con1.Tasks()) != 9 { 144 | t.Fatalf("con1 should have claimed 9 tasks: %d", len(con1.Tasks())) 145 | } 146 | 147 | // Start the second consumer and force the 1st to rebalance 148 | go con2.Run() 149 | defer con2.Shutdown() 150 | 151 | // Wait for node to start up and register 152 | time.Sleep(1 * time.Second) 153 | 154 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance()) 155 | 156 | time.Sleep(2 * time.Second) 157 | 158 | // Make sure that balancing never happened 159 | c2Tasks := con2.Tasks() 160 | if len(c2Tasks) != 0 { 161 | t.Fatalf("expected no tasks to be rebalanced but got: %d", len(c2Tasks)) 162 | } 163 | 164 | } 165 | 166 | // Fair balancer shouldn't consider a shutting-down node 167 | // See https://github.com/lytics/metafora/issues/92 168 | func TestFairBalancerShutdown(t *testing.T) { 169 | etcdv3c, coord1, conf1 := setupEtcd(t) 170 | defer etcdv3c.Close() 171 | conf2 := conf1.Copy() 172 | conf2.Name = "node2" 173 | coord2 := NewEtcdV3Coordinator(conf2, etcdv3c) 174 | 175 | cli := NewClient(conf1.Namespace, etcdv3c) 176 | 177 | // This handler exits as soon as it is stopped 178 | h1 := metafora.SimpleHandler(func(task metafora.Task, stop <-chan bool) bool { 179 | metafora.Debugf("H1 Starting %s", task.ID()) 180 | <-stop 181 | metafora.Debugf("H1 Stopping %s", task.ID()) 182 | return false // never done 183 | }) 184 | 185 | // Block forever on a single task 186 | stop2 := make(chan struct{}) 187 | stopr := make(chan chan struct{}, 1) 188 | stopr <- stop2 189 | h2 := metafora.SimpleHandler(func(task metafora.Task, stop <-chan bool) bool { 190 | metafora.Debugf("H2 Starting %s", task.ID()) 191 | blockchan, ok := <-stopr 192 | if ok { 193 | <-blockchan 194 | } 195 | <-stop 196 | metafora.Debugf("H2 Stopping %s", task.ID()) 197 | return false // never done 198 | }) 199 | 200 | filter := func(_ *FilterableValue) bool { return true } 201 | // Create two consumers 202 | b1 := NewFairBalancer(conf1, etcdv3c, filter) 203 | con1, err := metafora.NewConsumer(coord1, h1, b1) 204 | if err != nil { 205 | t.Fatal(err) 206 | } 207 | 208 | b2 := NewFairBalancer(conf2, etcdv3c, filter) 209 | con2, err := metafora.NewConsumer(coord2, h2, b2) 210 | if err != nil { 211 | t.Fatal(err) 212 | } 213 | 214 | // Start the first and let it claim a bunch of tasks 215 | go con1.Run() 216 | defer con1.Shutdown() 217 | _ = cli.SubmitTask(DefaultTaskFunc("t1", "")) 218 | _ = cli.SubmitTask(DefaultTaskFunc("t2", "")) 219 | _ = cli.SubmitTask(DefaultTaskFunc("t3", "")) 220 | _ = cli.SubmitTask(DefaultTaskFunc("t4", "")) 221 | _ = cli.SubmitTask(DefaultTaskFunc("t5", "")) 222 | _ = cli.SubmitTask(DefaultTaskFunc("t6", "")) 223 | 224 |
time.Sleep(1000 * time.Millisecond) 225 | 226 | if len(con1.Tasks()) != 6 { 227 | t.Fatalf("con1 should have claimed 6 tasks: %d", len(con1.Tasks())) 228 | } 229 | 230 | // Start the second consumer and force the 1st to rebalance 231 | go con2.Run() 232 | 233 | close(stopr) 234 | 235 | // Wait for node to start up and register 236 | time.Sleep(500 * time.Millisecond) 237 | 238 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance()) 239 | 240 | time.Sleep(2 * time.Second) 241 | 242 | c1Tasks := con1.Tasks() 243 | c2Tasks := con2.Tasks() 244 | if len(c1Tasks) != 4 || len(c2Tasks) != 2 { 245 | t.Fatalf("expected consumers to have 4|2 tasks: %d|%d", len(c1Tasks), len(c2Tasks)) 246 | } 247 | 248 | // Make sure that balancing the other node does nothing 249 | _ = cli.SubmitCommand("node2", metafora.CommandBalance()) 250 | 251 | time.Sleep(2 * time.Second) 252 | 253 | c1Tasks2 := con1.Tasks() 254 | c2Tasks2 := con2.Tasks() 255 | if len(c1Tasks2) != 4 || len(c2Tasks2) != 2 { 256 | t.Fatalf("expected consumers to have 4|2 tasks: %d|%d", len(c1Tasks2), len(c2Tasks2)) 257 | } 258 | for i := 0; i < 4; i++ { 259 | if c1Tasks[i] != c1Tasks2[i] { 260 | t.Errorf("task mismatch: %s != %s", c1Tasks[i], c1Tasks2[i]) 261 | } 262 | } 263 | for i := 0; i < 2; i++ { 264 | if c2Tasks[i] != c2Tasks2[i] { 265 | t.Errorf("task mismatch: %s != %s", c2Tasks[i], c2Tasks2[i]) 266 | } 267 | } 268 | 269 | // Second consumer should block on a single task forever 270 | // Rebalancing the first node should then cause it to pick up all but 271 | // one task 272 | c2stop := make(chan struct{}) 273 | go func() { 274 | con2.Shutdown() 275 | close(c2stop) 276 | }() 277 | 278 | time.Sleep(500 * time.Millisecond) 279 | 280 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance()) 281 | 282 | time.Sleep(2 * time.Second) 283 | 284 | c1Tasks3 := con1.Tasks() 285 | c2Tasks3 := con2.Tasks() 286 | if len(c1Tasks3) != 5 || len(c2Tasks3) != 1 { 287 | t.Fatalf("Expected consumers to have 5|1 tasks: %d|%d", len(c1Tasks3), len(c2Tasks3)) 288 | } 289 | 290 | // Now stop the blocking task, rebalance, and make sure the first node picked up the remaining task 291 | close(stop2) 292 | 293 | time.Sleep(500 * time.Millisecond) 294 | // Consumer 2 should stop now 295 | <-c2stop 296 | 297 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance()) 298 | 299 | time.Sleep(2 * time.Second) 300 | 301 | // con2 is out of the picture. con1 has all the tasks.
302 | c1Tasks4 := con1.Tasks() 303 | c2Tasks4 := con2.Tasks() 304 | if len(c1Tasks4) != 6 || len(c2Tasks4) != 0 { 305 | t.Fatalf("Expected consumers to have 6|0 tasks: %d|%d", len(c1Tasks4), len(c2Tasks4)) 306 | } 307 | } 308 | -------------------------------------------------------------------------------- /metcdv3/client.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "errors" 7 | "math/rand" 8 | "path" 9 | "strconv" 10 | "time" 11 | 12 | "github.com/lytics/metafora" 13 | etcdv3 "go.etcd.io/etcd/client/v3" 14 | ) 15 | 16 | var ( 17 | // ErrFailedSubmitTask is returned when a task could not be submitted, most likely because it already exists. 18 | ErrFailedSubmitTask = errors.New("metafora etcdv3 client: failed submit task") 19 | ErrLeaseDurationTooShort = errors.New("metafora etcd clientv3: lease duration too short") 20 | ErrKeepAliveClosedUnexpectedly = errors.New("metafora etcd clientv3: keep alive closed unexpectedly") 21 | ) 22 | 23 | var ( 24 | minLeaseDuration = 10 * time.Second 25 | ) 26 | 27 | // NewClient creates a new client using an etcd backend. 28 | func NewClient(namespace string, etcdv3c *etcdv3.Client) metafora.Client { 29 | return &mclient{ 30 | etcdv3c: etcdv3c, 31 | kvc: etcdv3.NewKV(etcdv3c), 32 | namespace: namespace, 33 | } 34 | } 35 | 36 | type keepAliveStats struct { 37 | success int 38 | failure int 39 | } 40 | 41 | // Type 'mclient' is an internal implementation of metafora.Client with an etcd backend. 42 | type mclient struct { 43 | etcdv3c *etcdv3.Client 44 | kvc etcdv3.KV 45 | namespace string 46 | } 47 | 48 | // nodesPath is the base path of nodes, represented as a directory in etcd. 49 | func (mc *mclient) nodesPath() string { 50 | return path.Join("/", mc.namespace, NodesPath) 51 | } 52 | 53 | // taskPath is the path to a particular taskId, represented as a file in etcd. 54 | func (mc *mclient) taskPath(taskID string) string { 55 | return path.Join("/", mc.namespace, TasksPath, taskID) 56 | } 57 | 58 | // cmdPath is the path to a particular node's commands, represented as a directory in etcd. 59 | func (mc *mclient) cmdPath(node string) string { 60 | return path.Join("/", mc.namespace, NodesPath, node, "commands") 61 | } 62 | 63 | // SubmitTask creates a new task in etcd 64 | func (mc *mclient) SubmitTask(task metafora.Task) error { 65 | c := context.Background() 66 | fullpath := path.Join(mc.taskPath(task.ID()), PropsPath) 67 | buf, err := json.Marshal(task) 68 | if err != nil { 69 | return err 70 | } 71 | txnRes, err := mc.kvc.Txn(c). 72 | If(etcdv3.Compare(etcdv3.Version(fullpath), "=", 0)). 73 | // Should we create both of these? 74 | Then(etcdv3.OpPut(fullpath, string(buf)), etcdv3.OpPut(mc.taskPath(task.ID()), "")). 75 | Commit() 76 | if err != nil { 77 | return err 78 | } 79 | if !txnRes.Succeeded { 80 | return ErrFailedSubmitTask 81 | } 82 | metafora.Debugf("task %s submitted: %s", task.ID(), fullpath) 83 | return nil 84 | } 85 | 86 | // DeleteTask deletes a task and all of its subkeys. 87 | func (mc *mclient) DeleteTask(taskID string) error { 88 | c := context.Background() 89 | fullpath := mc.taskPath(taskID) 90 | _, err := mc.kvc.Delete(c, fullpath, etcdv3.WithPrefix()) 91 | metafora.Debugf("task %s deleted: %s", taskID, fullpath) 92 | return err 93 | } 94 | 95 | // SubmitCommand creates a new command for a particular nodeId, the 96 | // command has a random name and is added to the particular nodeId 97 | // directory in etcd.
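//
// A minimal usage sketch (illustrative only; the namespace and node name are
// placeholders and error handling is elided):
//
//	cli := NewClient("/mycluster", etcdv3c)
//	err := cli.SubmitCommand("node1", metafora.CommandBalance())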
98 | func (mc *mclient) SubmitCommand(node string, command metafora.Command) error { 99 | cmdPath := mc.cmdPath(node) 100 | body, err := command.Marshal() 101 | if err != nil { 102 | // This is either a bug in metafora or someone implemented their own 103 | // command incorrectly. 104 | return err 105 | } 106 | key := path.Join(cmdPath, strconv.FormatUint(rand.Uint64(), 10)) 107 | if _, err := mc.kvc.Put(context.Background(), key, string(body)); err != nil { 108 | metafora.Errorf("Error submitting command: %s to node: %s", command, node) 109 | return err 110 | } 111 | metafora.Debugf("Submitted command: %s to node: %s", string(body), node) 112 | return nil 113 | } 114 | 115 | // Nodes fetches the currently registered nodes. A non-nil error means that some 116 | // error occurred trying to get the node list. The node list may be nil if no 117 | // nodes are registered. 118 | func (mc *mclient) Nodes() ([]string, error) { 119 | res, err := mc.kvc.Get(context.Background(), mc.nodesPath(), etcdv3.WithPrefix()) 120 | if err != nil { 121 | return nil, err 122 | } 123 | if res != nil && len(res.Kvs) > 0 { 124 | nodes := make([]string, len(res.Kvs)) 125 | for i, kv := range res.Kvs { 126 | // Keys are plain paths (not JSON), so derive the node 127 | // name from the final path element of the key. 128 | nodes[i] = path.Base(string(kv.Key)) 129 | } 130 | return nodes, nil 131 | } 132 | 133 | return nil, nil 134 | } 135 | 136 | func (mc *mclient) Tasks() ([]string, error) { 137 | res, err := mc.kvc.Get( 138 | context.Background(), 139 | path.Join("/", mc.namespace, TasksPath), 140 | etcdv3.WithPrefix()) 141 | if err != nil { 142 | return nil, err 143 | } 144 | 145 | var tasks []string 146 | for _, kv := range res.Kvs { 147 | key := string(kv.Key) 148 | if base := path.Base(key); base == OwnerPath || base == MetadataPath || base == PropsPath { 149 | continue 150 | } else { 151 | tasks = append(tasks, base) 152 | } 153 | } 154 | return tasks, nil 155 | } 156 | -------------------------------------------------------------------------------- /metcdv3/client_test.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | // NOTES 4 | // 5 | // These tests are in reality integration tests which require that 6 | // etcd is running on the test system and its peers are found 7 | // in the ENV variable ETCD_PEERS. The tests do not clean 8 | // out data and require a fresh set of etcd instances for 9 | // each run. You can consider this a known bug which 10 | // will be fixed in a future release. 11 | // 12 | // See: https://github.com/lytics/metafora/issues/31 13 | 14 | import ( 15 | "context" 16 | "testing" 17 | 18 | "github.com/lytics/metafora" 19 | "github.com/lytics/metafora/metcdv3/testutil" 20 | etcdv3 "go.etcd.io/etcd/client/v3" 21 | ) 22 | 23 | const ( 24 | Namespace = "test" 25 | NodesDir = "/test/nodes" 26 | Node1 = "node1" 27 | Node1Path = "/test/nodes/node1" 28 | ) 29 | 30 | // TestNodes tests that client.Nodes() returns the metafora nodes 31 | // registered in etcd.
32 | func TestNodes(t *testing.T) { 33 | c := context.Background() 34 | eclient := testutil.NewEtcdV3Client(t) 35 | kvc := etcdv3.NewKV(eclient) 36 | _, _ = eclient.Delete(c, Node1Path, etcdv3.WithPrefix()) 37 | 38 | mclient := NewClient(Namespace, eclient) 39 | 40 | if _, err := kvc.Put(c, Node1Path, "0"); err != nil { 41 | t.Fatalf("AddChild %v returned error: %v", NodesDir, err) 42 | } 43 | 44 | if nodes, err := mclient.Nodes(); err != nil { 45 | t.Fatalf("Nodes returned error: %v", err) 46 | } else { 47 | for i, n := range nodes { 48 | t.Logf("%v -> %v", i, n) 49 | } 50 | } 51 | } 52 | 53 | // TestSubmitTask tests that client.SubmitTask(...) adds a task to 54 | // the proper path in etcd, and that the same task id cannot be 55 | // submitted more than once. 56 | func TestSubmitTask(t *testing.T) { 57 | client := testutil.NewEtcdV3Client(t) 58 | mclient := NewClient(Namespace, client) 59 | 60 | task := DefaultTaskFunc("testid1", "") 61 | 62 | if err := mclient.DeleteTask(task.ID()); err != nil { 63 | t.Logf("DeleteTask returned an error, which may be ok. Error:%v", err) 64 | } 65 | 66 | if err := mclient.SubmitTask(task); err != nil { 67 | t.Fatalf("Submit task failed on initial submission, error: %v", err) 68 | } 69 | 70 | if err := mclient.SubmitTask(task); err == nil { 71 | t.Fatalf("Submit task did not fail, but should have, when using an existing task id") 72 | } 73 | } 74 | 75 | // TestSubmitCommand tests that client.SubmitCommand(...) adds a command 76 | // to the proper node path in etcd, and that it can be read back. 77 | func TestSubmitCommand(t *testing.T) { 78 | eclient := testutil.NewEtcdV3Client(t) 79 | kvc := etcdv3.NewKV(eclient) 80 | mclient := NewClient(Namespace, eclient) 81 | 82 | if err := mclient.SubmitCommand(Node1, metafora.CommandFreeze()); err != nil { 83 | t.Fatalf("Unable to submit command. error:%v", err) 84 | } 85 | 86 | if res, err := kvc.Get(context.Background(), NodesDir, etcdv3.WithPrefix()); err != nil { 87 | t.Fatalf("Get on path %v returned error: %v", NodesDir, err) 88 | } else if res.Count == 0 { 89 | t.Fatalf("Get on path %v returned nil for child nodes", NodesDir) 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /metcdv3/commander.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "errors" 7 | "fmt" 8 | "path" 9 | "sync" 10 | "time" 11 | 12 | "github.com/lytics/metafora" 13 | "github.com/lytics/metafora/statemachine" 14 | etcdv3 "go.etcd.io/etcd/client/v3" 15 | ) 16 | 17 | var ( 18 | ErrWatchClosedUnexpectedly = errors.New("metafora: watch closed unexpectedly") 19 | ) 20 | 21 | type cmdr struct { 22 | etcdv3c *etcdv3.Client 23 | kvc etcdv3.KV 24 | taskspath string 25 | } 26 | 27 | func NewCommander(namespace string, c *etcdv3.Client) statemachine.Commander { 28 | return &cmdr{ 29 | taskspath: path.Join("/", namespace, TasksPath), 30 | etcdv3c: c, 31 | kvc: etcdv3.NewKV(c), 32 | } 33 | } 34 | 35 | // Send command to a task. Overwrites existing commands.
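//
// An illustrative sketch (the task ID is a placeholder and the error is
// elided, as in this package's tests):
//
//	cmdr := NewCommander("/mycluster", etcdv3c)
//	_ = cmdr.Send("task-1", statemachine.PauseMessage())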
36 | func (c *cmdr) Send(taskID string, m *statemachine.Message) error { 37 | buf, err := json.Marshal(m) 38 | if err != nil { 39 | return err 40 | } 41 | 42 | cmdPath := path.Join(c.taskspath, taskID, CommandsPath) 43 | _, err = c.kvc.Put(context.Background(), cmdPath, string(buf)) 44 | return err 45 | } 46 | 47 | type cmdListener struct { 48 | etcdv3c *etcdv3.Client 49 | kvc etcdv3.KV 50 | name string 51 | taskcmdpath string 52 | 53 | commands chan *statemachine.Message 54 | 55 | mu *sync.Mutex 56 | stop chan bool 57 | } 58 | 59 | // NewCommandListener makes a statemachine.CommandListener implementation 60 | // backed by etcd. The namespace should be the same as the coordinator's, as 61 | // commands use a separate path within a namespace from tasks and nodes. 62 | func NewCommandListener(conf *Config, task metafora.Task, c *etcdv3.Client) statemachine.CommandListener { 63 | taskcmdpath := path.Join("/", conf.Namespace, TasksPath, task.ID(), CommandsPath) 64 | cl := &cmdListener{ 65 | etcdv3c: c, 66 | name: conf.Name, 67 | taskcmdpath: taskcmdpath, 68 | kvc: etcdv3.NewKV(c), 69 | commands: make(chan *statemachine.Message), 70 | mu: &sync.Mutex{}, 71 | stop: make(chan bool), 72 | } 73 | ctxt := context.Background() 74 | go cl.watch(ctxt, taskcmdpath) 75 | return cl 76 | } 77 | 78 | func (c *cmdListener) Receive() <-chan *statemachine.Message { 79 | return c.commands 80 | } 81 | 82 | func (c *cmdListener) ownerValueString() string { 83 | p, err := json.Marshal(&ownerValue{Node: c.name}) 84 | if err != nil { 85 | panic(fmt.Sprintf("command listener: error marshalling node body: %v", err)) 86 | } 87 | return string(p) 88 | } 89 | 90 | func (c *cmdListener) Stop() { 91 | c.mu.Lock() 92 | defer c.mu.Unlock() 93 | select { 94 | case <-c.stop: 95 | default: 96 | close(c.stop) 97 | } 98 | } 99 | 100 | func (cl *cmdListener) watch(c context.Context, prefix string) { 101 | getRes, err := cl.kvc.Get(c, prefix, etcdv3.WithPrefix()) 102 | if err != nil { 103 | metafora.Errorf("Error GETting %s - sending error to stateful handler: %v", prefix, err) 104 | select { 105 | case <-c.Done(): 106 | // TODO Do I need the stop channel? 107 | case <-cl.stop: 108 | case cl.commands <- statemachine.ErrorMessage(err): 109 | } 110 | return 111 | } 112 | 113 | // Create a message from an event. 114 | createMessage := func(key string, value []byte) (*statemachine.Message, error) { 115 | msg := &statemachine.Message{} 116 | if err := json.Unmarshal(value, msg); err != nil { 117 | metafora.Errorf("Error unmarshalling command from %s - sending error to stateful handler: %v", key, err) 118 | return nil, err 119 | } 120 | 121 | txnRes, err := cl.kvc.Txn(c). 122 | If(etcdv3.Compare(etcdv3.Value(path.Join(path.Dir(key), OwnerPath)), "=", cl.ownerValueString())). 123 | Then(etcdv3.OpDelete(key, etcdv3.WithPrefix())). 124 | Commit() 125 | if err != nil { 126 | metafora.Errorf("Error deleting command %s: %s - sending error to stateful handler: %v", key, msg, err) 127 | return nil, err 128 | } 129 | if !txnRes.Succeeded { 130 | metafora.Infof("Received successive commands; attempting to retrieve the latest") 131 | return nil, nil 132 | } 133 | return msg, nil 134 | } 135 | // Write a change or exit the watcher.
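// (put blocks until either the consumer reads the message or the watch's
// context is canceled. The GET above also pins a revision: the loop below
// first drains commands written before this listener started, then the
// etcd watch resumes from that revision so no command is missed.)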
136 | put := func(msg *statemachine.Message) { 137 | select { 138 | case <-c.Done(): 139 | case cl.commands <- msg: 140 | } 141 | } 142 | for _, kv := range getRes.Kvs { 143 | key := string(kv.Key) 144 | if path.Base(key) == MetadataPath { 145 | continue 146 | } 147 | value := kv.Value 148 | msg, err := createMessage(key, value) 149 | if err != nil { 150 | msg = statemachine.ErrorMessage(err) 151 | } 152 | if msg != nil { 153 | put(msg) 154 | } 155 | } 156 | 157 | putTerminalError := func(msg *statemachine.Message) { 158 | go func() { 159 | select { 160 | case <-c.Done(): 161 | // TODO Do I need the stop channel? 162 | case <-cl.stop: 163 | case <-time.After(10 * time.Minute): 164 | metafora.Warnf("metafora command listener timed out putting message on channel: %v", msg) 165 | case cl.commands <- msg: 166 | } 167 | }() 168 | } 169 | 170 | // Watch deltas in etcd, with the given prefix, starting 171 | // at the revision of the get call above. 172 | deltas := cl.etcdv3c.Watch(c, prefix, etcdv3.WithPrefix(), etcdv3.WithRev(getRes.Header.Revision+1), etcdv3.WithFilterDelete()) 173 | for { 174 | select { 175 | case <-c.Done(): 176 | return 177 | case <-cl.stop: 178 | return 179 | case delta, open := <-deltas: 180 | if !open { 181 | putTerminalError(statemachine.ErrorMessage(ErrWatchClosedUnexpectedly)) 182 | return 183 | } 184 | if delta.Err() != nil { 185 | putTerminalError(statemachine.ErrorMessage(delta.Err())) 186 | return 187 | } 188 | for _, event := range delta.Events { 189 | msg, err := createMessage(string(event.Kv.Key), event.Kv.Value) 190 | if err != nil { 191 | msg = statemachine.ErrorMessage(err) 192 | } 193 | if msg != nil { 194 | put(msg) 195 | } 196 | } 197 | } 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /metcdv3/commander_test.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "path" 7 | "testing" 8 | "time" 9 | 10 | "github.com/lytics/metafora" 11 | "github.com/lytics/metafora/statemachine" 12 | etcdv3 "go.etcd.io/etcd/client/v3" 13 | ) 14 | 15 | func TestCommandListener(t *testing.T) { 16 | t.Parallel() 17 | 18 | etcdv3c, _, conf := setupEtcd(t) 19 | kvc := etcdv3.NewKV(etcdv3c) 20 | 21 | namespace := "/cltest" 22 | conf.Namespace = namespace 23 | _, _ = kvc.Delete(context.Background(), namespace, etcdv3.WithPrefix()) 24 | 25 | task := metafora.NewTask("testtask") 26 | _, err := kvc.Put(context.Background(), path.Join(conf.Namespace, TasksPath, task.ID(), OwnerPath), fmt.Sprintf(`{"node":"%s"}`, conf.Name)) 27 | if err != nil { 28 | t.Fatalf("Error creating fake claim: %v", err) 29 | } 30 | 31 | cmdr := NewCommander(namespace, etcdv3c) 32 | 33 | // Only the last command should be received once the listener is started 34 | _ = cmdr.Send(task.ID(), statemachine.PauseMessage()) 35 | _ = cmdr.Send(task.ID(), statemachine.KillMessage()) 36 | 37 | cl := NewCommandListener(conf, task, etcdv3c) 38 | defer cl.Stop() 39 | 40 | // Ensure last command was received 41 | select { 42 | case cmd := <-cl.Receive(): 43 | if cmd.Code != statemachine.Kill { 44 | t.Fatalf("Expected Kill message, received %v", cmd) 45 | } 46 | case <-time.After(10 * time.Second): 47 | t.Fatal("CommandListener took too long to receive message") 48 | } 49 | 50 | // Ensure only one command was received 51 | select { 52 | case cmd := <-cl.Receive(): 53 | t.Fatalf("Unexpected command received: %v", cmd) 54 | case <-time.After(300 * time.Millisecond): 55 | // Ok!
56 | } 57 | 58 | cl.Stop() 59 | 60 | // Stop doesn't block until watching loop exits, so wait briefly 61 | time.Sleep(10 * time.Millisecond) 62 | 63 | // Ensure receiving after Stopping never succeeds 64 | _ = cmdr.Send(task.ID(), statemachine.RunMessage()) 65 | select { 66 | case cmd := <-cl.Receive(): 67 | t.Fatalf("Unexpected command received: %v", cmd) 68 | case <-time.After(300 * time.Millisecond): 69 | // Ok 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /metcdv3/conf.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "fmt" 5 | "path" 6 | "strings" 7 | ) 8 | 9 | type Config struct { 10 | // Namespace is the key prefix to allow for multitenant use of etcd. 11 | // 12 | // Namespaces must start with a / (added by NewConfig if needed). 13 | Namespace string 14 | 15 | // Name of this Metafora consumer. Only one instance of a Name is allowed to 16 | // run in a Namespace at a time, so if you set the Name to hostname you can 17 | // effectively limit Metafora to one process per server. 18 | Name string 19 | 20 | // NewTaskFunc is the function called to unmarshal tasks from etcd into a 21 | // custom struct. The struct must implement the metafora.Task interface. 22 | // 23 | // If nil it is set to DefaultTaskFunc 24 | NewTaskFunc TaskFunc 25 | } 26 | 27 | // NewConfig creates a Config with the required fields and uses defaults for 28 | // the others. 29 | // 30 | // Panics on empty values. 31 | func NewConfig(name, namespace string) *Config { 32 | if namespace == "" || name == "" { 33 | panic("invalid etcd config") 34 | } 35 | 36 | namespace = path.Join("/", strings.Trim(namespace, "/ ")) 37 | return &Config{ 38 | Name: name, 39 | Namespace: namespace, 40 | NewTaskFunc: DefaultTaskFunc, 41 | } 42 | } 43 | 44 | // Copy returns a shallow copy of this config. 45 | func (c *Config) Copy() *Config { 46 | return &Config{ 47 | Name: c.Name, 48 | Namespace: c.Namespace, 49 | NewTaskFunc: c.NewTaskFunc, 50 | } 51 | } 52 | 53 | func (c *Config) String() string { 54 | return fmt.Sprintf("etcd:%s/%s", c.Namespace, c.Name) 55 | } 56 | -------------------------------------------------------------------------------- /metcdv3/const.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | const ( 4 | TasksPath = "tasks" 5 | NodesPath = "nodes" 6 | CommandsPath = "commands" 7 | // Is this true for etcdv3? 8 | MetadataPath = "_metafora" // _{KEYs} are hidden files, so this will not trigger our watches 9 | OwnerPath = "owner" 10 | PropsPath = "props" 11 | 12 | //Etcd Error codes are passed directly through go-etcd from the http response, 13 | //So to find the error codes use this ref: 14 | // https://go.etcd.io/etcd/blob/master/error/error.go#L67 15 | EcodeKeyNotFound = 100 16 | EcodeCompareFailed = 101 17 | EcodeNodeExist = 105 18 | EcodeExpiredIndex = 401 // The event in requested index is outdated and cleared 19 | ) 20 | -------------------------------------------------------------------------------- /metcdv3/coordinator_test.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "context" 5 | "path" 6 | "strings" 7 | "testing" 8 | "time" 9 | 10 | "github.com/lytics/metafora" 11 | 12 | etcdv3 "go.etcd.io/etcd/client/v3" 13 | ) 14 | 15 | /* 16 | Running the Integration Test: 17 | 18 | go test -v ./... 
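These tests skip themselves unless etcd is reachable; per testutil in this
package, export ETCDTESTS (any non-empty value) and, optionally, ETCD_PEERS:

ETCDTESTS=1 ETCD_PEERS=127.0.0.1:2379 go test -v ./...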
19 | */ 20 | 21 | func TestCoordinatorFirstNodeJoiner(t *testing.T) { 22 | t.Parallel() 23 | etcdv3c, coordinator1, conf := setupEtcd(t) 24 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil { 25 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 26 | } 27 | defer coordinator1.Close() 28 | kvc := etcdv3.NewKV(etcdv3c) 29 | 30 | tpath := path.Join(conf.Namespace, TasksPath) 31 | _, err := kvc.Get(context.Background(), tpath) 32 | if err != nil && strings.Contains(err.Error(), "Key not found") { 33 | t.Fatalf("The tasks path wasn't created when the first node joined: %s", tpath) 34 | } else if err != nil { 35 | t.Fatalf("Unknown error trying to test: err: %s", err.Error()) 36 | } 37 | 38 | //TODO test for node path too... 39 | } 40 | 41 | // Ensure that Watch() picks up new tasks and returns them. 42 | func TestCoordinatorTC1(t *testing.T) { 43 | t.Parallel() 44 | etcdv3c, coordinator1, conf := setupEtcd(t) 45 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil { 46 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 47 | } 48 | defer coordinator1.Close() 49 | kvc := etcdv3.NewKV(etcdv3c) 50 | 51 | tasks := make(chan metafora.Task) 52 | task001 := &task{id: "test-task"} 53 | taskPath := path.Join(conf.Namespace, TasksPath, task001.ID()) 54 | errc := make(chan error) 55 | 56 | go func() { 57 | //Watch blocks, so we need to test it in its own go routine. 58 | errc <- coordinator1.Watch(tasks) 59 | }() 60 | 61 | _, _ = kvc.Put(context.Background(), taskPath, "5") 62 | 63 | select { 64 | case task := <-tasks: 65 | if task.ID() != task001.ID() { 66 | t.Fatalf("coordinator1.Watch() test failed: We received the incorrect taskId. Got [%s] Expected[%s]", task, task001) 67 | } 68 | case <-time.After(time.Second * 5): 69 | t.Fatalf("coordinator1.Watch() test failed: The testcase timed out after 5 seconds.") 70 | } 71 | 72 | coordinator1.Close() 73 | err := <-errc 74 | if err != nil { 75 | t.Fatalf("coordinator1.Watch() returned an err: %v", err) 76 | } 77 | } 78 | 79 | // Submit a task while a coordinator is actively watching for tasks. 80 | func TestCoordinatorTC2(t *testing.T) { 81 | t.Parallel() 82 | etcdv3c, coordinator1, conf := setupEtcd(t) 83 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil { 84 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 85 | } 86 | defer coordinator1.Close() 87 | 88 | testTasks := []string{"test1", "test2", "test3"} 89 | 90 | mclient := NewClient(conf.Namespace, etcdv3c) 91 | 92 | tasks := make(chan metafora.Task) 93 | errc := make(chan error) 94 | go func() { 95 | //Watch blocks, so we need to test it in its own go routine. 96 | errc <- coordinator1.Watch(tasks) 97 | }() 98 | 99 | for _, taskid := range testTasks { 100 | err := mclient.SubmitTask(DefaultTaskFunc(taskid, "")) 101 | if err != nil { 102 | t.Fatalf("Error submitting a task to metafora via the client. Error:\n%v", err) 103 | } 104 | recvd := <-tasks 105 | if recvd.ID() != taskid { 106 | t.Fatalf("%s != %s - received an unexpected task", recvd.ID(), taskid) 107 | } 108 | if ok := coordinator1.Claim(recvd); !ok { 109 | t.Fatal("coordinator1.Claim() unable to claim the task") 110 | } 111 | } 112 | 113 | coordinator1.Close() 114 | err := <-errc 115 | if err != nil { 116 | t.Fatalf("coordinator1.Watch() returned an err: %v", err) 117 | } 118 | } 119 | 120 | // Start two coordinators to ensure that fighting over claims results in only 121 | // one coordinator winning (and the other not crashing).
122 | func TestCoordinatorTC3(t *testing.T) { 123 | t.Parallel() 124 | etcdv3c, coordinator1, conf1 := setupEtcd(t) 125 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil { 126 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 127 | } 128 | defer coordinator1.Close() 129 | conf2 := conf1.Copy() 130 | conf2.Name = "node2" 131 | coordinator2 := NewEtcdV3Coordinator(conf2, etcdv3c) 132 | if err := coordinator2.Init(newCtx(t, "coordinator2")); err != nil { 133 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 134 | } 135 | defer coordinator2.Close() 136 | 137 | testTasks := []string{"test-claiming-task0001", "test-claiming-task0002", "test-claiming-task0003"} 138 | 139 | mclient := NewClient(conf1.Namespace, etcdv3c) 140 | 141 | // Start the watchers 142 | errc := make(chan error, 2) 143 | c1tasks := make(chan metafora.Task) 144 | c2tasks := make(chan metafora.Task) 145 | go func() { 146 | errc <- coordinator1.Watch(c1tasks) 147 | }() 148 | go func() { 149 | errc <- coordinator2.Watch(c2tasks) 150 | }() 151 | 152 | // Submit the tasks 153 | for _, tid := range testTasks { 154 | err := mclient.SubmitTask(DefaultTaskFunc(tid, "")) 155 | if err != nil { 156 | t.Fatalf("Error submitting task=%q to metafora via the client. Error:\n%v", tid, err) 157 | } 158 | } 159 | 160 | //XXX This assumes tasks are sent by watchers in the order they were 161 | // submitted to etcd which, while /possible/ to guarantee, isn't a guarantee 162 | // we're interested in making. 163 | // We only want to guarantee that exactly one coordinator can claim a task. 164 | c1t := <-c1tasks 165 | c2t := <-c2tasks 166 | if c1t.ID() != c2t.ID() { 167 | t.Logf("Watchers didn't receive the same task %s != %s. It's fine; watch order isn't guaranteed", c1t, c2t) 168 | } 169 | 170 | // Make sure c1 can claim and c2 cannot 171 | if ok := coordinator1.Claim(c1t); !ok { 172 | t.Fatalf("coordinator1.Claim() unable to claim the task=%q", c1t) 173 | } 174 | if ok := coordinator2.Claim(c1t); ok { 175 | t.Fatalf("coordinator2.Claim() succeeded for task=%q when it shouldn't have!", c2t) 176 | } 177 | 178 | // Make sure coordinators close down properly and quickly 179 | coordinator1.Close() 180 | if err := <-errc; err != nil { 181 | t.Errorf("Error shutting down coordinator1: %v", err) 182 | } 183 | coordinator2.Close() 184 | if err := <-errc; err != nil { 185 | t.Errorf("Error shutting down coordinator2: %v", err) 186 | } 187 | } 188 | 189 | // Submit a task before any coordinators are active. Then start a coordinator to 190 | // ensure the tasks are picked up by the new coordinator 191 | // 192 | // Then call coordinator.Release() on the task to make sure a coordinator picks it 193 | // up again. 194 | func TestCoordinatorTC4(t *testing.T) { 195 | t.Parallel() 196 | etcdv3c, coordinator1, conf1 := setupEtcd(t) 197 | 198 | task := "testtask4" 199 | 200 | mclient := NewClient(conf1.Namespace, etcdv3c) 201 | 202 | if err := mclient.SubmitTask(DefaultTaskFunc(task, "")); err != nil { 203 | t.Fatalf("Error submitting a task to metafora via the client. Error:\n%v", err) 204 | } 205 | 206 | // Don't start up the coordinator until after the metafora client has submitted work.
207 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil { 208 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 209 | } 210 | defer coordinator1.Close() 211 | 212 | errc := make(chan error) 213 | c1tasks := make(chan metafora.Task) 214 | go func() { 215 | errc <- coordinator1.Watch(c1tasks) 216 | }() 217 | 218 | tid := <-c1tasks 219 | 220 | if ok := coordinator1.Claim(tid); !ok { 221 | t.Fatal("coordinator1.Claim() unable to claim the task") 222 | } 223 | 224 | // Startup a second 225 | conf2 := conf1.Copy() 226 | conf2.Name = "node2" 227 | coordinator2 := NewEtcdV3Coordinator(conf2, etcdv3c) 228 | if err := coordinator2.Init(newCtx(t, "coordinator2")); err != nil { 229 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 230 | } 231 | defer coordinator2.Close() 232 | 233 | c2tasks := make(chan metafora.Task) 234 | go func() { 235 | errc <- coordinator2.Watch(c2tasks) 236 | }() 237 | 238 | // coordinator 2 shouldn't see anything yet 239 | select { 240 | case <-c2tasks: 241 | t.Fatal("coordinator2.Watch() returned a task when there are none to claim!") 242 | case <-time.After(100 * time.Millisecond): 243 | } 244 | 245 | // Now release the task from coordinator1 and claim it with coordinator2 246 | coordinator1.Release(tid) 247 | tid = <-c2tasks 248 | if ok := coordinator2.Claim(tid); !ok { 249 | t.Fatalf("coordinator2.Claim() should have succeeded on released task=%q", tid) 250 | } 251 | 252 | coordinator1.Close() 253 | coordinator2.Close() 254 | for i := 0; i < 2; i++ { 255 | if err := <-errc; err != nil { 256 | t.Errorf("coordinator returned an error after closing: %v", err) 257 | } 258 | } 259 | } 260 | 261 | // TestNodeCleanup ensures the coordinator properly cleans up its node entry 262 | // upon exit. 263 | func TestNodeCleanup(t *testing.T) { 264 | t.Parallel() 265 | etcdv3c, c1, conf1 := setupEtcd(t) 266 | if err := c1.Init(newCtx(t, "coordinator1")); err != nil { 267 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 268 | } 269 | conf2 := conf1.Copy() 270 | conf2.Name = "node2" 271 | c2 := NewEtcdV3Coordinator(conf2, etcdv3c) 272 | kvc := etcdv3.NewKV(etcdv3c) 273 | if err := c2.Init(newCtx(t, "coordinator2")); err != nil { 274 | t.Fatalf("Unexpected error initializing coordinator: %v", err) 275 | } 276 | defer c1.Close() 277 | defer c2.Close() 278 | 279 | // Make sure node directories were created 280 | c1nodep := path.Join(conf1.Namespace, NodesPath, conf1.Name, MetadataPath) 281 | c := context.Background() 282 | resp, err := kvc.Get(c, c1nodep) 283 | if err != nil { 284 | t.Fatalf("Error retrieving node key from etcd: %v", err) 285 | } 286 | if resp.Count == 0 { 287 | t.Error(c1nodep + " isn't a directory!") 288 | } 289 | 290 | c2nodep := path.Join(conf2.Namespace, NodesPath, conf2.Name, MetadataPath) 291 | resp, err = kvc.Get(c, c2nodep) 292 | if err != nil { 293 | t.Fatalf("Error retrieving node key from etcd: %v", err) 294 | } 295 | if resp.Count == 0 { 296 | t.Error(c2nodep + " isn't a directory!") 297 | } 298 | 299 | // Shutdown one and make sure its node directory is gone 300 | c1.Close() 301 | 302 | resp, err = kvc.Get(c, c1nodep) 303 | if err != nil { 304 | t.Errorf("Unexpected error %T retrieving node key from etcd: %v", err, err) 305 | } 306 | if resp.Count != 0 { 307 | t.Errorf("Expected Not Found error, but directory still exists!") 308 | } 309 | 310 | // Make sure c2 is untouched 311 | resp, err = kvc.Get(c, c2nodep) 312 | if err != nil { 313 | t.Fatalf("Error retrieving node key from etcd: %v", err) 314 | } 315
| if resp.Count == 0 { 316 | t.Error(c2nodep + " isn't a directory!") 317 | } 318 | } 319 | 320 | // TestExpiration ensures that expired claims get reclaimed properly. 321 | func TestExpiration(t *testing.T) { 322 | t.Parallel() 323 | etcdv3c, coord, conf := setupEtcd(t) 324 | kvc := etcdv3.NewKV(etcdv3c) 325 | claims := make(chan int, 10) 326 | hf := metafora.HandlerFunc(metafora.SimpleHandler(func(_ metafora.Task, stop <-chan bool) bool { 327 | claims <- 1 328 | <-stop 329 | return true 330 | })) 331 | consumer, err := metafora.NewConsumer(coord, hf, metafora.DumbBalancer) 332 | if err != nil { 333 | t.Fatalf("Error creating consumer: %+v", err) 334 | } 335 | 336 | _, err = kvc.Put(context.Background(), path.Join(conf.Namespace, TasksPath, "abc", OwnerPath), `{"node":"--"}`) 337 | if err != nil { 338 | t.Fatalf("Error creating fake claim: %v", err) 339 | } 340 | _, err = kvc.Put(context.Background(), path.Join(conf.Namespace, TasksPath, "abc"), "") 341 | if err != nil { 342 | t.Fatalf("Error creating fake task: %v", err) 343 | } 344 | _, err = kvc.Delete(context.Background(), path.Join(conf.Namespace, TasksPath, "abc", OwnerPath)) 345 | if err != nil { 346 | t.Fatalf("Error deleting fake claim: %v", err) 347 | } 348 | 349 | defer consumer.Shutdown() 350 | go consumer.Run() 351 | 352 | // Wait for claim to expire and coordinator to pick up task 353 | select { 354 | case <-claims: 355 | // Task claimed! 356 | case <-time.After(5 * time.Second): 357 | t.Fatal("Task not claimed long after it should have been.") 358 | } 359 | 360 | tasks := consumer.Tasks() 361 | if len(tasks) != 1 { 362 | t.Fatalf("Expected 1 task to be claimed but found: %v", tasks) 363 | } 364 | } 365 | -------------------------------------------------------------------------------- /metcdv3/doc.go: -------------------------------------------------------------------------------- 1 | // Package metcdv3 contains implementations of all Metafora interfaces using 2 | // etcd as the broker/backing store. 3 | // 4 | // See https://github.com/lytics/metafora/Documentation/etcdv3.md for details. 5 | package metcdv3 6 | -------------------------------------------------------------------------------- /metcdv3/helpers_test.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | "os" 8 | "sync/atomic" 9 | "testing" 10 | 11 | "github.com/lytics/metafora" 12 | "github.com/lytics/metafora/metcdv3/testutil" 13 | 14 | etcdv3 "go.etcd.io/etcd/client/v3" 15 | ) 16 | 17 | func init() { 18 | metafora.SetLogger(log.New(os.Stderr, "", log.Lmicroseconds|log.Lshortfile)) 19 | //metafora.SetLogLevel(metafora.LogLevelDebug) 20 | } 21 | 22 | var testcounter uint64 23 | 24 | // setupEtcd should be used for all etcd integration tests. 
It handles the following tasks: 25 | // - Create and return an etcd client 26 | // - Create and return an initial etcd coordinator 27 | // - Clear the test namespace in etcd 28 | func setupEtcd(t *testing.T) (*etcdv3.Client, *EtcdV3Coordinator, *Config) { 29 | c := context.Background() 30 | client := testutil.NewEtcdV3Client(t) 31 | kvc := etcdv3.NewKV(client) 32 | n := atomic.AddUint64(&testcounter, 1) 33 | ns := fmt.Sprintf("/metaforatests-%d", n) 34 | _, err := kvc.Delete(c, ns, etcdv3.WithPrefix()) 35 | if err != nil { 36 | t.Errorf("failed to clean up namespace in etcd") 37 | } 38 | conf := NewConfig("testclient", ns) 39 | coord := NewEtcdV3Coordinator(conf, client) 40 | return client, coord, conf 41 | } 42 | 43 | type testLogger struct { 44 | prefix string 45 | *testing.T 46 | } 47 | 48 | func (l testLogger) Log(lvl int, m string, v ...interface{}) { 49 | l.T.Logf("%s:[%d] %s", l.prefix, lvl, fmt.Sprintf(m, v...)) 50 | } 51 | 52 | type testCoordCtx struct { 53 | testLogger 54 | lost chan string 55 | } 56 | 57 | func newCtx(t *testing.T, prefix string) *testCoordCtx { 58 | return &testCoordCtx{ 59 | testLogger: testLogger{prefix: prefix, T: t}, 60 | lost: make(chan string, 10), 61 | } 62 | } 63 | 64 | func (t *testCoordCtx) Lost(task metafora.Task) { 65 | t.Log(4, "Lost(%s)", task.ID()) 66 | t.lost <- task.ID() 67 | } 68 | -------------------------------------------------------------------------------- /metcdv3/integration_test.go: -------------------------------------------------------------------------------- 1 | package metcdv3_test 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "testing" 7 | "time" 8 | 9 | "github.com/lytics/metafora" 10 | "github.com/lytics/metafora/metcdv3" 11 | "github.com/lytics/metafora/metcdv3/testutil" 12 | "github.com/lytics/metafora/statemachine" 13 | etcdv3 "go.etcd.io/etcd/client/v3" 14 | ) 15 | 16 | // TestSleepTest is an integration test of the statemachine's sleep state running on m_etcd's components.
17 | func TestSleepTest(t *testing.T) { 18 | etcdv3c := testutil.NewEtcdV3Client(t) 19 | kvc := etcdv3.NewKV(etcdv3c) 20 | t.Parallel() 21 | const namespace = "/sleeptest-metafora" 22 | const sleepingtasks = "sleeping-task1" 23 | 24 | _, _ = kvc.Delete(context.Background(), namespace, etcdv3.WithPrefix()) 25 | 26 | holdtask := make(chan bool) 27 | h := func(task metafora.Task, cmds <-chan *statemachine.Message) *statemachine.Message { 28 | 29 | if task.ID() == sleepingtasks { 30 | sleeptil := 5 * time.Second 31 | nextstarttime := time.Now().Add(sleeptil) 32 | t.Logf("sleeping task:%v sleepfor:%v", task, nextstarttime) 33 | <-holdtask 34 | return statemachine.SleepMessage(nextstarttime) 35 | } 36 | 37 | cmd := <-cmds 38 | t.Logf("non-sleeping task:%v", task) 39 | 40 | return cmd 41 | } 42 | 43 | newC := func(name, ns string) *metafora.Consumer { 44 | conf := metcdv3.NewConfig(name, ns) 45 | coord, hf, bal := metcdv3.New(conf, etcdv3c, h) 46 | cons, err := metafora.NewConsumer(coord, hf, bal) 47 | if err != nil { 48 | t.Fatalf("Error creating consumer %s:%s: %v", ns, name, err) 49 | } 50 | go func() { 51 | cons.Run() 52 | t.Logf("Consumer:%s exited.", name) 53 | }() 54 | return cons 55 | } 56 | 57 | assertRunning := func(tid string, cons ...*metafora.Consumer) { 58 | found := false 59 | for _, c := range cons { 60 | tasks := c.Tasks() 61 | if len(tasks) > 0 && found { 62 | t.Fatal("Task already found running but another task is running on a different consumer") 63 | } 64 | if len(tasks) > 1 { 65 | t.Fatalf("Expected at most 1 task, but found: %d", len(tasks)) 66 | } 67 | if len(tasks) == 1 && tasks[0].Task().ID() == tid { 68 | found = true 69 | } 70 | } 71 | if !found { 72 | t.Fatalf("Could not find task=%q", tid) 73 | } 74 | } 75 | 76 | // Start 2 consumers 77 | cons1 := newC("node1", namespace) 78 | cons2 := newC("node2", namespace) 79 | 80 | // Create clients and start some tests 81 | cliA := metcdv3.NewClient(namespace, etcdv3c) 82 | 83 | if err := cliA.SubmitTask(metcdv3.DefaultTaskFunc(sleepingtasks, "")); err != nil { 84 | t.Fatalf("Error submitting sleeping task: %v", err) 85 | } 86 | 87 | // Give consumers a bit to pick up tasks 88 | time.Sleep(500 * time.Millisecond) 89 | 90 | assertRunning(sleepingtasks, cons1, cons2) 91 | 92 | holdtask <- true 93 | // Give consumers a bit to pick up tasks 94 | time.Sleep(500 * time.Millisecond) 95 | 96 | assertRunning(sleepingtasks, cons1, cons2) // not sure if this should be true or false. 97 | 98 | wait1 := make(chan bool) 99 | go func() { 100 | defer close(wait1) 101 | // Shutdown 102 | cons1.Shutdown() 103 | cons2.Shutdown() 104 | }() 105 | 106 | timeout := time.NewTimer(5 * time.Second) 107 | select { 108 | case <-wait1: 109 | case <-timeout.C: 110 | t.Fatalf("failed waiting for shutdown") 111 | } 112 | 113 | // make sure all tasks are released 114 | for _, c := range []*metafora.Consumer{cons1, cons2} { 115 | tasks := c.Tasks() 116 | for _, work := range tasks { 117 | t.Fatalf("work id %v is still running", work) 118 | } 119 | } 120 | } 121 | 122 | // TestAll is an integration test for all of m_etcd's components. 123 | // 124 | // While huge integration tests like this are rarely desirable as they can be 125 | // overly fragile and complex, I found myself manually repeating the tests I've 126 | // automated here over and over. This is far more reliable than expecting 127 | // developers to do ad hoc testing of all of the m_etcd package's features.
128 | func TestAll(t *testing.T) { 129 | etcdv3c := testutil.NewEtcdV3Client(t) 130 | kvc := etcdv3.NewKV(etcdv3c) 131 | t.Parallel() 132 | 133 | c := context.Background() 134 | _, _ = kvc.Delete(c, "/test-a", etcdv3.WithPrefix()) 135 | _, _ = kvc.Delete(c, "/test-b", etcdv3.WithPrefix()) 136 | 137 | h := func(task metafora.Task, cmds <-chan *statemachine.Message) *statemachine.Message { 138 | cmd := <-cmds 139 | if task.ID() == "error-test" { 140 | return statemachine.ErrorMessage(errors.New("error-test")) 141 | } 142 | return cmd 143 | } 144 | 145 | newC := func(name, ns string) *metafora.Consumer { 146 | conf := metcdv3.NewConfig(name, ns) 147 | conf.Name = name 148 | coord, hf, bal := metcdv3.New(conf, etcdv3c, h) 149 | cons, err := metafora.NewConsumer(coord, hf, bal) 150 | if err != nil { 151 | t.Fatalf("Error creating consumer %s:%s: %v", ns, name, err) 152 | } 153 | go cons.Run() 154 | return cons 155 | } 156 | // Start 4 consumers, 2 per namespace 157 | cons1a := newC("node1", "/test-a") 158 | cons2a := newC("node2", "/test-a") 159 | cons1b := newC("node1", "/test-b") 160 | cons2b := newC("node2", "/test-b") 161 | 162 | // Create clients and start some tests 163 | cliA := metcdv3.NewClient("/test-a", etcdv3c) 164 | cliB := metcdv3.NewClient("/test-b", etcdv3c) 165 | 166 | if err := cliA.SubmitTask(metcdv3.DefaultTaskFunc("task1", "")); err != nil { 167 | t.Fatalf("Error submitting task1 to a: %v", err) 168 | } 169 | if err := cliB.SubmitTask(metcdv3.DefaultTaskFunc("task1", "")); err != nil { 170 | t.Fatalf("Error submitting task1 to b: %v", err) 171 | } 172 | 173 | // Give consumers a bit to pick up tasks 174 | time.Sleep(500 * time.Millisecond) 175 | 176 | assertRunning := func(tid string, cons ...*metafora.Consumer) { 177 | found := false 178 | for _, c := range cons { 179 | tasks := c.Tasks() 180 | if len(tasks) > 0 && found { 181 | t.Fatal("Task already found running but another task is running on a different consumer") 182 | } 183 | if len(tasks) > 1 { 184 | t.Fatalf("Expected at most 1 task, but found: %d", len(tasks)) 185 | } 186 | if len(tasks) == 1 && tasks[0].Task().ID() == tid { 187 | found = true 188 | } 189 | } 190 | if !found { 191 | t.Fatalf("Could not find task=%q", tid) 192 | } 193 | } 194 | 195 | assertRunning("task1", cons1a, cons2a) 196 | assertRunning("task1", cons1b, cons2b) 197 | 198 | // Kill task1 in A 199 | { 200 | cmdr := metcdv3.NewCommander("/test-a", etcdv3c) 201 | if err := cmdr.Send("task1", statemachine.KillMessage()); err != nil { 202 | t.Fatalf("Error sending kill to task1: %v", err) 203 | } 204 | time.Sleep(1000 * time.Millisecond) 205 | 206 | for _, c := range []*metafora.Consumer{cons1a, cons2a} { 207 | tasks := c.Tasks() 208 | if len(tasks) != 0 { 209 | t.Fatalf("Expected no tasks but found: %d", len(tasks)) 210 | } 211 | } 212 | } 213 | 214 | // Submit a bunch of tasks to A 215 | { 216 | tasks := []string{"task2", "task3", "task4", "task5", "task6", "task7"} 217 | for _, tid := range tasks { 218 | if err := cliA.SubmitTask(metcdv3.DefaultTaskFunc(tid, "")); err != nil { 219 | t.Fatalf("Error submitting task=%q to A: %v", tid, err) 220 | } 221 | } 222 | 223 | // Give them time to start 224 | time.Sleep(800 * time.Millisecond) 225 | 226 | // Ensure they're balanced 227 | if err := cliA.SubmitCommand("node1", metafora.CommandBalance()); err != nil { 228 | t.Fatalf("Error submitting balance command to cons1a: %v", err) 229 | } 230 | time.Sleep(800 * time.Millisecond) 231 | if err := cliA.SubmitCommand("node2", metafora.CommandBalance()); err != 
nil { 232 | t.Fatalf("Error submitting balance command to cons2a: %v", err) 233 | } 234 | 235 | a1tasks := cons1a.Tasks() 236 | a2tasks := cons2a.Tasks() 237 | for _, task := range a1tasks { 238 | metafora.Debug("A1: ", task.Task(), " - ", task.Stopped().IsZero()) 239 | } 240 | for _, task := range a2tasks { 241 | metafora.Debug("A2: ", task.Task(), " - ", task.Stopped().IsZero()) 242 | } 243 | time.Sleep(800 * time.Millisecond) 244 | 245 | a1tasks = cons1a.Tasks() 246 | a2tasks = cons2a.Tasks() 247 | if len(a1tasks) < 2 || len(a1tasks) > 4 || len(a2tasks) < 2 || len(a2tasks) > 4 { 248 | t.Fatalf("Namespace A isn't fairly balanced: node1: %d; node2: %d", len(a1tasks), len(a2tasks)) 249 | } 250 | 251 | // Shutting down a consumer should migrate all tasks to the other 252 | cons1a.Shutdown() 253 | time.Sleep(800 * time.Millisecond) 254 | 255 | a2tasks = cons2a.Tasks() 256 | if len(a2tasks) != len(tasks) { 257 | t.Fatalf("Consumer 2a should have received all %d tasks but only has %d.", len(tasks), len(a2tasks)) 258 | } 259 | } 260 | 261 | // Use Namespace B to check Error state handling 262 | { 263 | tasks := []string{"task8", "error-test"} 264 | for _, tid := range tasks { 265 | if err := cliB.SubmitTask(metcdv3.DefaultTaskFunc(tid, "")); err != nil { 266 | t.Fatalf("Error submitting task=%q to B: %v", tid, err) 267 | } 268 | } 269 | 270 | // Give them time to start 271 | time.Sleep(time.Second) 272 | 273 | n := len(cons1b.Tasks()) + len(cons2b.Tasks()) 274 | if n != 3 { 275 | t.Fatalf("Expected B to be running 3 tasks but found %d", n) 276 | } 277 | 278 | // Resuming error-test 8*2 times should cause it to be failed 279 | cmdr := metcdv3.NewCommander("/test-b", etcdv3c) 280 | for i := 0; i < statemachine.DefaultErrMax*2; i++ { 281 | if err := cmdr.Send("error-test", statemachine.RunMessage()); err != nil { 282 | t.Fatalf("Unexpected error resuming error-test in B: %v", err) 283 | } 284 | time.Sleep(500 * time.Millisecond) 285 | } 286 | 287 | n = len(cons1b.Tasks()) + len(cons2b.Tasks()) 288 | if n != 2 { 289 | t.Fatalf("Expected B to be running 2 tasks but found %d", n) 290 | } 291 | 292 | // Resubmitting a failed task shouldn't error but also shouldn't run. 293 | if err := cliB.SubmitTask(metcdv3.DefaultTaskFunc("error-test", "")); err != nil { 294 | t.Fatalf("Error resubmitting error-test task to B: %v", err) 295 | } 296 | 297 | // Give the statemachine a moment to load the initial state and exit 298 | time.Sleep(time.Second) 299 | 300 | n = len(cons1b.Tasks()) + len(cons2b.Tasks()) 301 | if n != 2 { 302 | t.Fatalf("Expected B to be running 2 tasks but found %d", n) 303 | } 304 | } 305 | 306 | // Shutdown 307 | cons2a.Shutdown() 308 | cons1b.Shutdown() 309 | cons2b.Shutdown() 310 | } 311 | 312 | // TestTaskResurrectionInt ensures that a Claim won't recreate a task that had 313 | // been deleted (marked as done). taskmgr has a non-integration version of this 314 | // test.
315 | func TestTaskResurrectionInt(t *testing.T) { 316 | etcdv3c := testutil.NewEtcdV3Client(t) 317 | kvc := etcdv3.NewKV(etcdv3c) 318 | c := context.Background() 319 | t.Parallel() 320 | 321 | _, _ = kvc.Delete(c, "/test-resurrect", etcdv3.WithPrefix()) 322 | 323 | task := metcdv3.DefaultTaskFunc("xyz", "") 324 | 325 | conf := metcdv3.NewConfig("testclient", "/test-resurrect") 326 | coord := metcdv3.NewEtcdV3Coordinator(conf, etcdv3c) 327 | if err := coord.Init(nil); err != nil { 328 | t.Fatalf("Error initializing coordinator: %v", err) 329 | } 330 | defer coord.Close() 331 | 332 | // Try to claim a nonexistent task 333 | if claimed := coord.Claim(task); claimed { 334 | t.Fatal("Claiming a nonexistent task should not work but did!") 335 | } 336 | 337 | // Create a task, mark it as done, and try to claim it again 338 | client := metcdv3.NewClient("/test-resurrect", etcdv3c) 339 | if err := client.SubmitTask(metcdv3.DefaultTaskFunc("xyz", "")); err != nil { 340 | t.Fatalf("Error submitting task xyz: %v", err) 341 | } 342 | 343 | if claimed := coord.Claim(task); !claimed { 344 | t.Fatal("Failed to claim task xyz") 345 | } 346 | 347 | coord.Done(task) 348 | 349 | if claimed := coord.Claim(task); claimed { 350 | t.Fatal("Reclaimed task that was marked as done.") 351 | } 352 | } 353 | -------------------------------------------------------------------------------- /metcdv3/statestore.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "path" 7 | 8 | "github.com/lytics/metafora" 9 | "github.com/lytics/metafora/statemachine" 10 | etcdv3 "go.etcd.io/etcd/client/v3" 11 | ) 12 | 13 | const statePath = "state" 14 | 15 | // stateStore is an etcd implementation of statemachine.StateStore. 16 | type stateStore struct { 17 | etcdv3c *etcdv3.Client 18 | kvc etcdv3.KV 19 | path string 20 | } 21 | 22 | // NewStateStore returns a StateStore implementation that persists task states 23 | // in etcd. 24 | func NewStateStore(namespace string, etcdv3c *etcdv3.Client) statemachine.StateStore { 25 | return &stateStore{ 26 | etcdv3c: etcdv3c, 27 | kvc: etcdv3.NewKV(etcdv3c), 28 | path: path.Join("/", namespace, statePath), 29 | } 30 | } 31 | 32 | // Load retrieves the given task's state from etcd or stores and returns 33 | // Runnable if no state exists. 34 | func (s *stateStore) Load(task metafora.Task) (*statemachine.State, error) { 35 | resp, err := s.kvc.Get(context.Background(), path.Join(s.path, task.ID()), etcdv3.WithLimit(1)) 36 | if err != nil { 37 | return nil, err 38 | 39 | } 40 | 41 | if resp.Count == 0 { 42 | metafora.Infof("task=%q has no existing state, default to Runnable", task.ID()) 43 | state := &statemachine.State{Code: statemachine.Runnable} 44 | if err := s.Store(task, state); err != nil { 45 | return nil, err 46 | } 47 | return state, nil 48 | } 49 | 50 | // Unmarshal state from key 51 | state := &statemachine.State{} 52 | if err := json.Unmarshal([]byte(resp.Kvs[0].Value), state); err != nil { 53 | return nil, err 54 | } 55 | return state, nil 56 | } 57 | 58 | // Store taskID's state in etcd overwriting any prior state.
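//
// A short pairing sketch with Load (illustrative; the namespace and task ID
// are placeholders):
//
//	ss := NewStateStore("/mycluster", etcdv3c)
//	state, _ := ss.Load(DefaultTaskFunc("task-1", ""))
//	_ = ss.Store(DefaultTaskFunc("task-1", ""), state)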
59 | func (s *stateStore) Store(task metafora.Task, state *statemachine.State) error { 60 | buf, err := json.Marshal(state) 61 | if err != nil { 62 | return err 63 | } 64 | 65 | _, err = s.kvc.Put(context.Background(), path.Join(s.path, task.ID()), string(buf)) 66 | return err 67 | } 68 | -------------------------------------------------------------------------------- /metcdv3/task.go: -------------------------------------------------------------------------------- 1 | package metcdv3 2 | 3 | import "github.com/lytics/metafora" 4 | 5 | type task struct { 6 | id string 7 | } 8 | 9 | func (t *task) ID() string { return t.id } 10 | 11 | // TaskFunc creates a Task interface from a task ID and the raw value of the 12 | // task's props key in etcd. 13 | // 14 | // Implementations must support value being an empty string. 15 | // 16 | // If nil is returned the task is ignored. 17 | type TaskFunc func(id, value string) metafora.Task 18 | 19 | // DefaultTaskFunc is the default new task function used by the EtcdCoordinator 20 | // and does not attempt to process the properties value. 21 | func DefaultTaskFunc(id, _ string) metafora.Task { return &task{id: id} } 22 | -------------------------------------------------------------------------------- /metcdv3/task_test.go: -------------------------------------------------------------------------------- 1 | package metcdv3_test 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "testing" 8 | "time" 9 | 10 | "github.com/lytics/metafora" 11 | "github.com/lytics/metafora/metcdv3" 12 | "github.com/lytics/metafora/metcdv3/testutil" 13 | "github.com/lytics/metafora/statemachine" 14 | etcdv3 "go.etcd.io/etcd/client/v3" 15 | ) 16 | 17 | // exTask is an extended Task type to demonstrate using an alternative NewTask 18 | // TaskFunc.
19 | type exTask struct { 20 | id string 21 | SubmittedT *time.Time `json:"_submitted"` 22 | UserID string `json:"UserID"` 23 | } 24 | 25 | func (t *exTask) ID() string { return t.id } 26 | func (t *exTask) Submitted() *time.Time { return t.SubmittedT } 27 | func (t *exTask) String() string { 28 | if t.SubmittedT == nil { 29 | return t.id 30 | } 31 | return fmt.Sprintf("%s submitted %s", t.id, t.SubmittedT) 32 | } 33 | 34 | func TestAltTask(t *testing.T) { 35 | etcdv3c := testutil.NewEtcdV3Client(t) 36 | kvc := etcdv3.NewKV(etcdv3c) 37 | c := context.Background() 38 | t.Parallel() 39 | const namespace = "/alttask-metafora" 40 | _, _ = kvc.Delete(c, namespace, etcdv3.WithPrefix()) 41 | 42 | conf := metcdv3.NewConfig("testclient", namespace) 43 | 44 | // Sample overridden NewTask func 45 | conf.NewTaskFunc = func(id, props string) metafora.Task { 46 | task := exTask{id: id} 47 | if err := json.Unmarshal([]byte(props), &task); err != nil { 48 | metafora.Warnf("%s properties could not be unmarshalled: %v", id, err) 49 | } 50 | return &task 51 | } 52 | 53 | // Create a handler that returns results through a chan for synchronization 54 | results := make(chan string, 1) 55 | 56 | h := func(task metafora.Task, _ <-chan *statemachine.Message) *statemachine.Message { 57 | alttask, ok := task.(*exTask) 58 | if !ok { 59 | results <- fmt.Sprintf("%q is of type %T", task.ID(), task) 60 | return statemachine.PauseMessage() 61 | } 62 | if alttask.UserID == "" { 63 | results <- "missing UserID" 64 | return statemachine.PauseMessage() 65 | } 66 | results <- "ok" 67 | return statemachine.PauseMessage() 68 | } 69 | 70 | coord, hf, bal := metcdv3.New(conf, etcdv3c, h) 71 | consumer, err := metafora.NewConsumer(coord, hf, bal) 72 | if err != nil { 73 | t.Fatal(err) 74 | } 75 | go consumer.Run() 76 | defer consumer.Shutdown() 77 | 78 | cli := metcdv3.NewClient(namespace, etcdv3c) 79 | if err := cli.SubmitTask(&exTask{id: "test1", UserID: "test2"}); err != nil { 80 | t.Fatal(err) 81 | } 82 | 83 | result := <-results 84 | if result != "ok" { 85 | t.Fatal(result) 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /metcdv3/testutil/testutil.go: -------------------------------------------------------------------------------- 1 | // Package testutil is a collection of utilities for use by Metafora's etcd 2 | // tests. Since tests are spread across the m_etcd and m_etcd_test packages 3 | // utilities must be in a shared location. 4 | // 5 | // Unless you're making changes to the m_etcd package you don't need to use 6 | // this. 7 | package testutil 8 | 9 | import ( 10 | "os" 11 | "strings" 12 | "time" 13 | 14 | etcdv3 "go.etcd.io/etcd/client/v3" 15 | ) 16 | 17 | // TestCase just defines the subset of *testing.T methods needed to avoid 18 | // pulling in the testing package. 19 | type TestCase interface { 20 | Skip(args ...interface{}) 21 | Fatalf(format string, args ...interface{}) 22 | } 23 | 24 | // NewEtcdV3Client creates a new etcd client for use by the metafora client during testing. 25 | func NewEtcdV3Client(t TestCase) *etcdv3.Client { 26 | if os.Getenv("ETCDTESTS") == "" { 27 | t.Skip("ETCDTESTS unset. Skipping etcd tests.") 28 | } 29 | 30 | // ETCD_PEERS mirrors the ETCDCTL_PEERS variable that etcdctl uses for peers.
31 | peerAddrs := os.Getenv("ETCD_PEERS") 32 | if peerAddrs == "" { 33 | peerAddrs = "127.0.0.1:2379" 34 | } 35 | 36 | peers := strings.Split(peerAddrs, ",") 37 | cli, err := etcdv3.New(etcdv3.Config{ 38 | Endpoints: peers, 39 | DialTimeout: 5 * time.Second, 40 | }) 41 | if err != nil { 42 | t.Fatalf("failed to create etcdv3 client: %v", err) 43 | } 44 | //defer cli.Close() 45 | return cli 46 | } 47 | -------------------------------------------------------------------------------- /resreporter/mem_linux.go: -------------------------------------------------------------------------------- 1 | package resreporter 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "os" 7 | 8 | "github.com/lytics/metafora" 9 | ) 10 | 11 | const meminfo = "/proc/meminfo" 12 | 13 | var Memory = memory{} 14 | 15 | type memory struct{} 16 | 17 | func (memory) Used() (used uint64, total uint64) { 18 | fd, err := os.Open(meminfo) 19 | if err != nil { 20 | metafora.Errorf("Error reading free memory via "+meminfo+": %v", err) 21 | 22 | // Effectively disable the balancer since an error happened 23 | return 0, 100 24 | } 25 | defer fd.Close() 26 | 27 | s := bufio.NewScanner(fd) 28 | foundFree, foundCache, foundBuf := false, false, false 29 | var cache uint64 30 | var buffered uint64 31 | var free uint64 32 | for s.Scan() { 33 | if total > 0 && foundFree && foundCache && foundBuf { 34 | break 35 | } 36 | if total == 0 { 37 | if n, _ := fmt.Sscanf(s.Text(), "MemTotal:%d", &total); n == 1 { 38 | continue 39 | } 40 | } 41 | if !foundFree { 42 | if n, _ := fmt.Sscanf(s.Text(), "MemFree:%d", &free); n == 1 { 43 | foundFree = true 44 | continue 45 | } 46 | } 47 | if !foundCache { 48 | if n, _ := fmt.Sscanf(s.Text(), "Cached:%d", &cache); n == 1 { 49 | foundCache = true 50 | continue 51 | } 52 | } 53 | if !foundBuf { 54 | if n, _ := fmt.Sscanf(s.Text(), "Buffers:%d", &buffered); n == 1 { 55 | foundBuf = true 56 | continue 57 | } 58 | } 59 | } 60 | if err := s.Err(); err != nil { 61 | metafora.Errorf("Error reading free memory via "+meminfo+": %v", err) 62 | 63 | // Effectively disable the balancer since an error happened 64 | return 0, 100 65 | } 66 | 67 | return total - (free + buffered + cache), total 68 | } 69 | 70 | func (memory) String() string { return "kB" } 71 | -------------------------------------------------------------------------------- /resreporter/mem_linux_test.go: -------------------------------------------------------------------------------- 1 | package resreporter_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/lytics/metafora/resreporter" 7 | ) 8 | 9 | func TestMemReporter(t *testing.T) { 10 | used, total := resreporter.Memory.Used() 11 | t.Logf("Used: %d %s (%d MB)", used, resreporter.Memory, used/1024) 12 | t.Logf("Total: %d %s (%d MB)", total, resreporter.Memory, total/1024) 13 | if used == 0 && total == 100 { 14 | t.Fatal("Memory reporter failed!") 15 | } 16 | if used > total { 17 | t.Fatal("More memory used than available?!") 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /scripts/docker_run_etcd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export RunningEtcdDockers=$(sudo docker ps -a | grep metafora-etcd- | awk '{print $1}') 3 | if [[ -n $RunningEtcdDockers ]]; then 4 | echo stopping existing etcd metafora docker containers 5 | echo -------------------------------------------------------------------------------- 6 | echo sudo docker stop ${RunningEtcdDockers} 7 | sudo docker stop ${RunningEtcdDockers} 8 | echo 9 |
10 | 11 | echo removing existing etcd docker containers 12 | echo -------------------------------------------------------------------------------- 13 | sudo docker rm ${RunningEtcdDockers} 14 | echo 15 | fi 16 | 17 | if [[ $1 = "-stop" ]]; then 18 | echo "-stop specified; not starting new containers" 19 | exit 0 20 | fi 21 | 22 | echo starting new etcd metafora docker containers 23 | echo -------------------------------------------------------------------------------- 24 | sudo docker run -d --name="metafora-etcd-a" --net=host coreos/etcd \ 25 | -peer-addr 127.0.0.1:8001 -peer-bind-addr 127.0.0.1:8001 -addr 127.0.0.1:5001 -bind-addr 127.0.0.1:5001 -name metafora-a 26 | sudo docker run -d --name="metafora-etcd-b" --net=host coreos/etcd \ 27 | -peer-addr 127.0.0.1:8002 -peer-bind-addr 127.0.0.1:8002 -addr 127.0.0.1:5002 -bind-addr 127.0.0.1:5002 -name metafora-b -peers 127.0.0.1:8001,127.0.0.1:8002,127.0.0.1:8003 28 | sudo docker run -d --name="metafora-etcd-c" --net=host coreos/etcd \ 29 | -peer-addr 127.0.0.1:8003 -peer-bind-addr 127.0.0.1:8003 -addr 127.0.0.1:5003 -bind-addr 127.0.0.1:5003 -name metafora-c -peers 127.0.0.1:8001,127.0.0.1:8002,127.0.0.1:8003 30 | echo 31 | 32 | echo list of running metafora docker containers 33 | echo -------------------------------------------------------------------------------- 34 | sudo docker ps | head -n 1 35 | sudo docker ps | grep metafora-etcd- 36 | -------------------------------------------------------------------------------- /slowtask_test.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | type releaseAllBalancer struct { 9 | balances chan int 10 | ctx BalancerContext 11 | } 12 | 13 | func (b *releaseAllBalancer) Init(c BalancerContext) { 14 | b.ctx = c 15 | b.balances = make(chan int) 16 | } 17 | func (b *releaseAllBalancer) CanClaim(Task) (time.Time, bool) { return NoDelay, true } 18 | func (b *releaseAllBalancer) Balance() []string { 19 | b.balances <- 1 20 | ids := []string{} 21 | for _, task := range b.ctx.Tasks() { 22 | ids = append(ids, task.Task().ID()) 23 | } 24 | return ids 25 | } 26 | 27 | func TestDoubleRelease(t *testing.T) { 28 | t.Parallel() 29 | 30 | started := make(chan int) 31 | reallyStop := make(chan bool) 32 | h := SimpleHandler(func(task Task, stop <-chan bool) bool { 33 | started <- 1 34 | t.Logf("TestDoubleRelease handler received %s - blocking until reallyStop closed.", task) 35 | <-reallyStop 36 | return true 37 | }) 38 | 39 | tc := NewTestCoord() 40 | 41 | b := &releaseAllBalancer{} 42 | c, err := NewConsumer(tc, h, b) 43 | if err != nil { 44 | t.Fatalf("Error creating consumer: %v", err) 45 | } 46 | go c.Run() 47 | 48 | // This won't exit when told to 49 | tc.Tasks <- testTask{"1"} 50 | <-started 51 | 52 | // Make sure balancing/mainloop isn't blocked 53 | tc.Commands <- CommandBalance() 54 | <-b.balances 55 | tc.Commands <- CommandBalance() 56 | <-b.balances 57 | tc.Commands <- CommandBalance() 58 | <-b.balances 59 | 60 | shutdownComplete := make(chan bool) 61 | go func() { 62 | c.Shutdown() 63 | close(shutdownComplete) 64 | }() 65 | 66 | // Make sure the release insidiously blocks until we close reallyStop 67 | select { 68 | case <-shutdownComplete: 69 | t.Fatal("Shutdown completed when it should have blocked indefinitely") 70 | case <-time.After(100 * time.Millisecond): 71 | } 72 | 73 | // Close reallyStop and make sure Shutdown actually exits 74 | close(reallyStop) 75 | // Wait for Shutdown to return
76 | <-shutdownComplete 77 | } 78 | -------------------------------------------------------------------------------- /statemachine/README.md: -------------------------------------------------------------------------------- 1 | # Metafora Finite State Machine 2 | 3 | The `statemachine` package provides a featureful state machine for use by 4 | Metafora task handlers. 5 | 6 | ## Features 7 | 8 | * Static state machine; no custom states or messages (transitions) 9 | * Per task state machine; task may intercept commands 10 | * Flexible state store (see `StateStore` interface) 11 | * Flexible command sending/receiving (see `Commander`, `CommandListener`, or 12 | [the etcd implementation](../metcdv3/commander.go)). 13 | * Flexible error handling with builtin retry logic (see 14 | [`errors.go`](errors.go)). 15 | * States: Runnable, Paused, Sleeping, Fault, Completed, Failed, Killed 16 | * Commands/Messages: Run, Pause, Sleep, Release, Error, Kill, Complete, Checkpoint 17 | * Tasks in a terminal state are unscheduled and will take no cluster resources. 18 | 19 | ## Control Flow 20 | 21 | 1. Coordinator receives a claimable task from a Watch 22 | 2. Consumer calls `Balancer.CanClaim(task)` 23 | 3. If claimable, Consumer calls `Coordinator.Claim(task)` to claim it. 24 | 4. If claim was successful, Consumer starts the task handler which is created 25 | by `statemachine.New(...)`. 26 | 5. State machine loads initial state via `StateStore.Load(task)`. 27 | 6. If the task is `Runnable`, control is handed to the `StatefulHandler` 28 | implementation provided by the user. 29 | 7. Run until task returns a `Message` either due to completion, an error, or a 30 | received command. 31 | 32 | There are quite a few moving parts that are hooked together: 33 | 34 | * The Consumer needs a `Coordinator`, `Balancer`, and `HandlerFunc` like 35 | normal, but you should use `statemachine.New(...)` to create the `Handler` 36 | returned by your `HandlerFunc`. 37 | * The state machine requires a `StateStore` and `CommandListener`. The `metcdv3` 38 | package includes an etcd implementation of `CommandListener` (as well as 39 | `Commander` for sending commands), but no default `StateStore` is provided. 40 | * Your task handling code must be implemented in a function (or method) that 41 | fulfills the `StatefulHandler` signature. When your handler receives a 42 | command it should return it (or override it with a new `Message`) to the 43 | state machine to handle state transitions. 44 | 45 | ## States 46 | 47 | State | Description 48 | ------|------------ 49 | Runnable | Task is runnable and control is passed to the task handler. 50 | Paused | Task is paused until a command is received. 51 | Sleeping | Task is paused until a specified time (or a command is received). 52 | Fault | An error occurred and a custom error handler is invoked. 53 | Completed | **Terminal** Task returned the `Complete` message because it finished successfully. 54 | Failed | **Terminal** The error handler executed during the Fault state determined the task has failed permanently. 55 | Killed | **Terminal** Task received a `Kill` message. 56 | 57 | **Terminal** states are final. The task is removed from the broker and will never be scheduled to run again. 58 | 59 | ## Messages 60 | 61 | AKA Events or Commands 62 | 63 | Messages cause transitions between states.
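For example, here is a minimal sketch of constructing and sending a message using the in-memory `embedded` implementations (the task ID `task-1` is a placeholder; production deployments would typically use the `metcdv3` `Commander`):

```go
package main

import (
	"fmt"
	"time"

	"github.com/lytics/metafora/embedded"
	"github.com/lytics/metafora/statemachine"
)

func main() {
	cmdr := embedded.NewCommander()
	cl := cmdr.NewListener("task-1") // normally passed to statemachine.New

	// Drain one command the way the state machine would.
	go func() { fmt.Println(<-cl.Receive()) }()

	// Sleep messages must carry an Until time or Valid() returns false and
	// the state machine discards them.
	msg := statemachine.SleepMessage(time.Now().Add(10 * time.Minute))
	if msg.Valid() {
		_ = cmdr.Send("task-1", msg)
	}
}
```

The full message table follows.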
64 | 65 | Message | Description 66 | --------|------------ 67 | Run | Causes a `Paused` or `Sleeping` task to transition to `Runnable` and begin executing. 68 | Pause | Causes a `Runnable` or `Sleeping` task to transition to `Paused`. 69 | Sleep | Requires an `Until time.Time` to be set. Causes non-terminal states to pause until the time is reached. 70 | Error | Requires an `Err error` to be set. Usually returned by tasks to transition to the `Fault` state. 71 | Release | *See below* 72 | Checkpoint | *See below* 73 | Kill | Causes a non-terminal state to transition to `Killed`. 74 | Complete | Should only be returned by tasks. Causes a `Runnable` state to transition to `Completed`. 75 | 76 | 77 | ### Release 78 | 79 | Release is a special message that does *not* transition between states. Instead 80 | the task handler exits and the Coordinator's claim on the task is released. 81 | 82 | Metafora's `Handler.Stop()` method sends the `Release` command to a running 83 | task to request it exit. It's most often used when cleanly restarting Metafora 84 | nodes. 85 | 86 | ### Checkpoint 87 | 88 | Checkpoint is a special message that - like `Release` - does *not* transition 89 | between states. It is meant to be a signal to tasks to persist any internal 90 | state and optionally exit so the state machine can store the task's state. 91 | 92 | Since a `Checkpoint` is a noop in the state machine a task may decide to 93 | intercept the message and *not* return. 94 | -------------------------------------------------------------------------------- /statemachine/commander.go: -------------------------------------------------------------------------------- 1 | package statemachine 2 | 3 | type CommandListener interface { 4 | Receive() <-chan *Message 5 | Stop() 6 | } 7 | 8 | type Commander interface { 9 | Send(taskID string, m *Message) error 10 | } 11 | -------------------------------------------------------------------------------- /statemachine/doc.go: -------------------------------------------------------------------------------- 1 | // Statemachine is a featureful statemachine implementation for Metafora 2 | // handlers to use. It is implemented as a Handler wrapper which provides a 3 | // channel of incoming commands to wrapped handlers. Internal handlers are 4 | // expected to shut down cleanly and exit upon receiving a command from the 5 | // state machine. The state machine will handle the state transition and 6 | // restart the internal handler if necessary. 7 | // 8 | // Users must provide a StateStore implementation for persisting task state and 9 | // a CommandListener implementation for receiving commands. See the metcdv3 or 10 | // embedded packages for example CommandListener implementations. 11 | // 12 | // See the README in this package for details. 13 | package statemachine 14 | -------------------------------------------------------------------------------- /statemachine/errors.go: -------------------------------------------------------------------------------- 1 | package statemachine 2 | 3 | import ( 4 | "errors" 5 | "time" 6 | 7 | "github.com/lytics/metafora" 8 | ) 9 | 10 | // ExceededErrorRate is returned by error handlers in an Error Message when 11 | // retry logic has been exhausted for a handler and it should transition to 12 | // Failed. 13 | var ExceededErrorRate = errors.New("exceeded error rate") 14 | 15 | // Err represents an error that occurred while a stateful handler was running. 16 | // 17 | // NewErr was added to allow callers to construct an instance from an underlying error.
18 | // The underlying error is preserved so that Err can be unwrapped with 19 | // errors.As/errors.Is. This is useful for custom error handlers that wish to 20 | // inspect underlying error types and decide accordingly. 21 | type Err struct { 22 | Time time.Time `json:"timestamp"` 23 | Err string `json:"error"` 24 | baseErr error 25 | } 26 | 27 | // NewErr constructs an Err from an underlying error e. 28 | func NewErr(e error, t time.Time) Err { 29 | return Err{Err: e.Error(), Time: t, baseErr: e} 30 | } 31 | 32 | // Error implements the error interface. 33 | func (e Err) Error() string { 34 | return e.Err 35 | } 36 | 37 | // Unwrap returns baseErr. 38 | func (e Err) Unwrap() error { 39 | return e.baseErr 40 | } 41 | 42 | // ErrHandler functions should return Run, Sleep, or Error messages depending on 43 | // the rate of errors. 44 | // 45 | // The ErrHandler and/or StateStore should trim the error slice to keep it 46 | // from growing without bound. 47 | type ErrHandler func(task metafora.Task, errs []Err) (*Message, []Err) 48 | 49 | const ( 50 | DefaultErrLifetime = -4 * time.Hour 51 | DefaultErrMax = 8 52 | ) 53 | 54 | // DefaultErrHandler fails the task if 8 errors have occurred in the past 4 55 | // hours. Otherwise it enters the Sleep state for 10 minutes before trying 56 | // again. 57 | func DefaultErrHandler(_ metafora.Task, errs []Err) (*Message, []Err) { 58 | recent := time.Now().Add(DefaultErrLifetime) 59 | strikes := 0 60 | for _, err := range errs { 61 | if err.Time.After(recent) { 62 | strikes++ 63 | } 64 | } 65 | 66 | if len(errs) > DefaultErrMax { 67 | errs = errs[len(errs)-DefaultErrMax:] 68 | } 69 | 70 | if strikes >= DefaultErrMax { 71 | // Return a new error to transition to Failed as well as the original 72 | // errors to store what caused this failure. 73 | return ErrorMessage(ExceededErrorRate), errs 74 | } 75 | return SleepMessage(time.Now().Add(10 * time.Minute)), errs 76 | } 77 | -------------------------------------------------------------------------------- /statemachine/errors_test.go: -------------------------------------------------------------------------------- 1 | package statemachine_test 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | "time" 7 | 8 | . "github.com/lytics/metafora/statemachine"
"github.com/lytics/metafora/statemachine" 9 | "github.com/stretchr/testify/assert" 10 | "github.com/stretchr/testify/require" 11 | ) 12 | 13 | type task string 14 | 15 | func (t task) ID() string { return string(t) } 16 | 17 | func TestDefaultErrHandler(t *testing.T) { 18 | t.Parallel() 19 | tid := "" 20 | 21 | errs := []Err{{Time: time.Now()}} 22 | 23 | { 24 | msg, errs := DefaultErrHandler(task(tid), errs) 25 | if len(errs) != 1 { 26 | t.Fatalf("Expected 1 err, found: %d", len(errs)) 27 | } 28 | if msg.Code != Sleep || msg.Until == nil || msg.Until.Before(time.Now().Add(9*time.Minute)) { 29 | t.Fatalf("Expected sleep until +10m state but found: %s", msg) 30 | } 31 | } 32 | 33 | // Push error list over limit 34 | for i := 0; i < DefaultErrMax+1; i++ { 35 | errs = append(errs, Err{Time: time.Now()}) 36 | } 37 | 38 | { 39 | msg, errs := DefaultErrHandler(task(tid), errs) 40 | if len(errs) > DefaultErrMax { 41 | t.Fatalf("Expected %d errors but received: %d", DefaultErrMax, len(errs)) 42 | } 43 | if msg.Code != Error || msg.Err != ExceededErrorRate { 44 | t.Fatalf("Expected error handler to permanently fail but receied: %s", msg) 45 | } 46 | } 47 | } 48 | 49 | type errType1 struct{ error } 50 | type errType2 struct{ error } 51 | 52 | func TestErr(t *testing.T) { 53 | err := errType1{errors.New("some underlying error")} 54 | se := NewErr(err, time.Now()) 55 | 56 | // confirm se implements the error interface 57 | require.Implements(t, (*error)(nil), se) 58 | 59 | // confirm we can only convert se to an error of the same underlying type 60 | assert.True(t, errors.As(se, new(errType1))) 61 | assert.False(t, errors.As(se, new(errType2))) 62 | 63 | // make sure we don't panic if someone uses it the old way and baseErr is nil 64 | se = Err{Time: time.Now(), Err: "something bad"} 65 | assert.Equal(t, "something bad", se.Error()) 66 | assert.False(t, errors.As(se, new(errType1))) 67 | 68 | // confirm we can check for a specific instance of baseErr too 69 | e1 := errType1{errors.New("target instance")} 70 | e2 := errType1{errors.New("different instance")} 71 | se = NewErr(e1, time.Now()) 72 | assert.True(t, errors.Is(se, e1)) 73 | assert.False(t, errors.Is(se, e2)) 74 | } 75 | -------------------------------------------------------------------------------- /statemachine/run_test.go: -------------------------------------------------------------------------------- 1 | package statemachine 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/lytics/metafora" 8 | ) 9 | 10 | type task string 11 | 12 | func (t task) ID() string { return string(t) } 13 | 14 | // TestCommandBlackhole is meant to demonstrate what happens if a 15 | // StatefulHandler implementation receives commands in a goroutine that lives 16 | // past the SatefulHandler func exiting. This is a very easy bug to write, so 17 | // defensive code was added to prevent the leaked goroutine from "stealing" 18 | // commands meant for other states (Paused or Sleeping being the two states 19 | // that absolutely need to accept commands). 20 | // 21 | // This test breaking isn't necessarily the sign of a bug. It may just mean 22 | // we've decided to remove the defensive code protecting against such errors in 23 | // which case this test should be removed as well. 
24 | func TestCommandBlackhole(t *testing.T) { 25 | t.Parallel() 26 | stop := make(chan bool) 27 | rdy := make(chan int, 1) 28 | defer close(stop) 29 | 30 | f := func(_ metafora.Task, c <-chan *Message) *Message { 31 | go func() { 32 | rdy <- 1 33 | select { 34 | case <-c: 35 | t.Log("Intercepted!") 36 | case <-stop: 37 | return 38 | } 39 | }() 40 | return nil 41 | } 42 | cmds := make(chan *Message) 43 | 44 | // Ignore the return message, the point is to make sure it doesn't intercept 45 | // further commands. 46 | run(f, task("test-task"), cmds) 47 | <-rdy 48 | 49 | go func() { cmds <- RunMessage() }() 50 | 51 | select { 52 | case <-cmds: 53 | // Yay! command wasn't intercepted by leaked goroutine! 54 | case <-time.After(time.Second): 55 | t.Fatalf("Command was intercepted by leaked goroutine.") 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /statemachine/statemachine.go: -------------------------------------------------------------------------------- 1 | package statemachine 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "runtime" 7 | "strings" 8 | "sync" 9 | "time" 10 | 11 | "github.com/lytics/metafora" 12 | ) 13 | 14 | var ( 15 | MissingUntilError = errors.New("sleeping state missing deadline") 16 | MissingErrorsError = errors.New("fault state has no errors") 17 | ReleasableError = errors.New("network error, release and retry") 18 | ) 19 | 20 | // StateCode is the actual state key. The State struct adds additional metadata 21 | // related to certain StateCodes. 22 | type StateCode string 23 | 24 | const ( 25 | Runnable StateCode = "runnable" // Scheduled 26 | Sleeping StateCode = "sleeping" // Scheduled, not running until time has elapsed 27 | Completed StateCode = "completed" // Terminal, not scheduled 28 | Killed StateCode = "killed" // Terminal, not scheduled 29 | Failed StateCode = "failed" // Terminal, not scheduled 30 | Fault StateCode = "fault" // Scheduled, in error handling / retry logic 31 | Paused StateCode = "paused" // Scheduled, not running 32 | ) 33 | 34 | // Terminal states will never run and cannot transition to a non-terminal 35 | // state. 36 | func (s StateCode) Terminal() bool { 37 | switch s { 38 | case Runnable, Sleeping, Paused, Fault: 39 | return false 40 | case Completed, Killed, Failed: 41 | return true 42 | default: 43 | metafora.Error("unknown state: ", s) 44 | return false 45 | } 46 | } 47 | 48 | func (s StateCode) String() string { return string(s) } 49 | 50 | // State represents the current state of a stateful handler. See StateCode for 51 | // details. Until and Errors are extra state used by the Sleeping and Fault 52 | // states respectively. 53 | type State struct { 54 | Code StateCode `json:"state"` 55 | Until *time.Time `json:"until,omitempty"` 56 | Errors []Err `json:"errors,omitempty"` 57 | } 58 | 59 | // copy state so mutations to Until and Errors aren't shared. 60 | func (s *State) copy() *State { 61 | ns := &State{Code: s.Code} 62 | if s.Until != nil { 63 | until := *s.Until 64 | ns.Until = &until 65 | } 66 | ns.Errors = append(ns.Errors, s.Errors...) 
67 | return ns 68 | } 69 | 70 | func (s *State) String() string { 71 | switch s.Code { 72 | case Sleeping: 73 | return fmt.Sprintf("%s until %s", s.Code, s.Until) 74 | case Fault: 75 | return fmt.Sprintf("%s (%d errors)", s.Code, len(s.Errors)) 76 | default: 77 | return string(s.Code) 78 | } 79 | } 80 | 81 | func (s *State) Valid() error { 82 | switch s.Code { 83 | case Completed, Failed, Killed, Paused, Runnable: 84 | case Sleeping: 85 | if s.Until == nil { 86 | return MissingUntilError 87 | } 88 | case Fault: 89 | if len(s.Errors) == 0 { 90 | return MissingErrorsError 91 | } 92 | default: 93 | return fmt.Errorf("unknown state: %q", s.Code) 94 | } 95 | return nil 96 | } 97 | 98 | // Messages are events that cause state transitions. Until and Err are used by 99 | // the Sleep and Error messages respectively. 100 | type Message struct { 101 | Code MessageCode `json:"message"` 102 | 103 | // Until is when the statemachine should transition from sleeping to runnable 104 | Until *time.Time `json:"until,omitempty"` 105 | 106 | // Err is the error that caused this Error message 107 | Err error `json:"error,omitempty"` 108 | } 109 | 110 | // ErrorMessage is a simpler helper for creating error messages from an error. 111 | func ErrorMessage(err error) *Message { 112 | return &Message{Code: Error, Err: err} 113 | } 114 | 115 | // SleepMessage is a simpler helper for creating sleep messages from a time. 116 | func SleepMessage(t time.Time) *Message { 117 | return &Message{Code: Sleep, Until: &t} 118 | } 119 | 120 | func RunMessage() *Message { return &Message{Code: Run} } 121 | func PauseMessage() *Message { return &Message{Code: Pause} } 122 | func KillMessage() *Message { return &Message{Code: Kill} } 123 | func CheckpointMessage() *Message { return &Message{Code: Checkpoint} } 124 | func ReleaseMessage() *Message { return &Message{Code: Release} } 125 | func CompleteMessage() *Message { return &Message{Code: Complete} } 126 | 127 | // Valid returns true if the Message is valid. Invalid messages sent as 128 | // commands are discarded by the state machine. 129 | func (m *Message) Valid() bool { 130 | switch m.Code { 131 | case Run, Pause, Release, Checkpoint, Complete, Kill: 132 | return true 133 | case Sleep: 134 | return m.Until != nil 135 | case Error: 136 | return m.Err != nil 137 | default: 138 | return false 139 | } 140 | } 141 | 142 | func (m *Message) String() string { 143 | switch m.Code { 144 | case Sleep: 145 | if m.Until != nil { 146 | return fmt.Sprintf("%s until %s", m.Code, m.Until) 147 | } 148 | case Error: 149 | if m.Err != nil { 150 | return fmt.Sprintf("%s: %s", m.Code, m.Err.Error()) 151 | } 152 | } 153 | return string(m.Code) 154 | } 155 | 156 | // MessageCode is the symbolic name of a state transition. 157 | type MessageCode string 158 | 159 | func (m MessageCode) String() string { return string(m) } 160 | 161 | const ( 162 | Run MessageCode = "run" 163 | Sleep MessageCode = "sleep" 164 | Pause MessageCode = "pause" 165 | Kill MessageCode = "kill" 166 | Error MessageCode = "error" 167 | Complete MessageCode = "complete" 168 | Checkpoint MessageCode = "checkpoint" 169 | 170 | // Special event which triggers state machine to exit without transitioning 171 | // between states. 172 | Release MessageCode = "release" 173 | ) 174 | 175 | // Transitions represent a state machine transition from one state to another 176 | // given an event message. 
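// For example, applying a Pause message to a Runnable task produces the transition "runnable---pause--->paused" (see String below).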
177 | type Transition struct { 178 | Event MessageCode 179 | From StateCode 180 | To StateCode 181 | } 182 | 183 | func (t Transition) String() string { 184 | return fmt.Sprintf("%v---%v--->%v", t.From, t.Event, t.To) 185 | } 186 | 187 | var ( 188 | // Rules is the state transition table. 189 | Rules = [...]Transition{ 190 | // Runnable can transition to anything 191 | {Event: Checkpoint, From: Runnable, To: Runnable}, 192 | {Event: Release, From: Runnable, To: Runnable}, 193 | {Event: Sleep, From: Runnable, To: Sleeping}, 194 | {Event: Complete, From: Runnable, To: Completed}, 195 | {Event: Kill, From: Runnable, To: Killed}, 196 | {Event: Error, From: Runnable, To: Fault}, 197 | {Event: Pause, From: Runnable, To: Paused}, 198 | {Event: Run, From: Runnable, To: Runnable}, 199 | 200 | // Sleeping can return to Runnable or be Killed/Paused 201 | {Event: Checkpoint, From: Sleeping, To: Sleeping}, 202 | {Event: Release, From: Sleeping, To: Sleeping}, 203 | {Event: Sleep, From: Sleeping, To: Sleeping}, 204 | {Event: Run, From: Sleeping, To: Runnable}, 205 | {Event: Kill, From: Sleeping, To: Killed}, 206 | {Event: Pause, From: Sleeping, To: Paused}, 207 | {Event: Error, From: Sleeping, To: Fault}, 208 | 209 | // The Fault state transitions to either Sleeping, Failed, or released (to 210 | // allow custom error handlers to work around locality-related errors). 211 | {Event: Sleep, From: Fault, To: Sleeping}, 212 | {Event: Error, From: Fault, To: Failed}, 213 | 214 | // Paused can return to Runnable, be put to Sleep, or Killed 215 | {Event: Checkpoint, From: Paused, To: Paused}, 216 | {Event: Release, From: Paused, To: Paused}, 217 | {Event: Run, From: Paused, To: Runnable}, 218 | {Event: Sleep, From: Paused, To: Sleeping}, 219 | {Event: Kill, From: Paused, To: Killed}, 220 | {Event: Pause, From: Paused, To: Paused}, 221 | 222 | // Completed, Failed, and Killed are terminal states that cannot transition 223 | // to anything. 224 | } 225 | ) 226 | 227 | // StatefulHandler is the function signature that the state machine is able to 228 | // run. Instead of metafora.Handler's Stop method, StatefulHandlers receive 229 | // Messages via the commands chan and return their exit status via a Message. 230 | // 231 | // Normally StatefulHandlers simply return a Message as soon as it's received 232 | // on the commands chan. However, it's also acceptable for a handler to return 233 | // a different Message. For example if it encounters an error during shutdown, 234 | // it may choose to return that error as an Error Message as opposed to the 235 | // original command. 236 | type StatefulHandler func(task metafora.Task, commands <-chan *Message) *Message 237 | 238 | type stateMachine struct { 239 | task metafora.Task 240 | h StatefulHandler 241 | ss StateStore 242 | cl CommandListener 243 | cmds chan *Message 244 | errHandler ErrHandler 245 | 246 | mu *sync.RWMutex 247 | state *State 248 | ts time.Time 249 | 250 | stopL *sync.Mutex 251 | stopped chan bool 252 | } 253 | 254 | // New creates a metafora.Handler that drives the given StatefulHandler with a 255 | // state machine, loading state from ss and receiving commands from cl. It 256 | // should be called in the HandlerFunc you use with metafora's Consumer. 257 | // 258 | // If ErrHandler is nil DefaultErrHandler will be used.
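//
// A minimal wiring sketch, using the embedded package's in-memory StateStore
// and Commander (myHandler is a placeholder for your StatefulHandler):
//
//	ss := embedded.NewStateStore()
//	cmdr := embedded.NewCommander()
//	hf := func(task metafora.Task) metafora.Handler {
//		return statemachine.New(task, myHandler, ss, cmdr.NewListener(task.ID()), nil)
//	}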
259 | func New(task metafora.Task, h StatefulHandler, ss StateStore, cl CommandListener, e ErrHandler) metafora.Handler { 260 | if e == nil { 261 | e = DefaultErrHandler 262 | } 263 | return &stateMachine{ 264 | task: task, 265 | h: h, 266 | ss: ss, 267 | cl: cl, 268 | errHandler: e, 269 | mu: &sync.RWMutex{}, 270 | ts: time.Now(), 271 | stopL: &sync.Mutex{}, 272 | stopped: make(chan bool), 273 | } 274 | } 275 | 276 | // State returns the current state the state machine is in and what time it 277 | // entered that state. The State may be nil if Run() has yet to be called. 278 | func (s *stateMachine) State() (*State, time.Time) { 279 | s.mu.RLock() 280 | defer s.mu.RUnlock() 281 | return s.state, s.ts 282 | } 283 | 284 | func (s *stateMachine) setState(state *State) { 285 | s.mu.Lock() 286 | s.state = state.copy() 287 | s.ts = time.Now() 288 | s.mu.Unlock() 289 | } 290 | 291 | // Run the state machine enabled handler. Loads the initial state and passes 292 | // control to the internal stateful handler passing commands from the command 293 | // listener into the handler's commands chan. 294 | func (s *stateMachine) Run() (done bool) { 295 | // Multiplex external (Stop) messages and internal ones 296 | s.cmds = make(chan *Message) 297 | go func() { 298 | for { 299 | select { 300 | case m := <-s.cl.Receive(): 301 | if !m.Valid() { 302 | metafora.Warnf("Ignoring invalid command: %q", m) 303 | continue 304 | } 305 | select { 306 | case s.cmds <- m: 307 | case <-s.stopped: 308 | return 309 | } 310 | case <-s.stopped: 311 | return 312 | } 313 | } 314 | }() 315 | 316 | // Stop the command listener and internal message multiplexer when Run exits 317 | defer func() { 318 | s.cl.Stop() 319 | s.stop() 320 | }() 321 | 322 | tid := s.task.ID() 323 | 324 | // Load the initial state 325 | state, err := s.ss.Load(s.task) 326 | if err == ReleasableError { 327 | // A failure to load was reported by our provided loader, but the loader 328 | // believed the failure was retriable. In most cases this will be some type 329 | // of network partition or communication error, too many file handles, etc. 330 | metafora.Errorf("task=%q could not load initial state but the task is retriable!", tid) 331 | time.Sleep(time.Second) // delay releasing the task so other nodes don't stampede to retry it 332 | return false 333 | } else if err != nil { 334 | // A failure to load the state for a task is *fatal* - the task will be 335 | // unscheduled and requires operator intervention to reschedule. 336 | metafora.Errorf("task=%q could not load initial state. Marking done! Error: %v", tid, err) 337 | return true 338 | } 339 | if state == nil { 340 | // Note to StateStore implementors: This should not happen! Either state or 341 | // err must be non-nil. This code is simply to prevent a nil pointer panic. 342 | metafora.Errorf("statestore %T returned nil state and err for task=%q - unscheduling", s.ss, tid) 343 | return true 344 | } 345 | if state.Code.Terminal() { 346 | metafora.Warnf("task=%q in terminal state %s - exiting.", tid, state.Code) 347 | return true 348 | } 349 | 350 | s.setState(state) // for introspection/debugging 351 | 352 | // Main Statemachine Loop 353 | done = false 354 | for { 355 | // Enter State 356 | metafora.Debugf("task=%q in state %s", tid, state.Code) 357 | msg := s.exec(state) 358 | 359 | // Apply Message 360 | newstate, ok := apply(state, msg) 361 | if !ok {
362 | metafora.Warnf("task=%q Invalid state transition=%q returned by task. Old state=%q msg.Err=%s", tid, msg.Code, state.Code, msg.Err) 363 | msg = ErrorMessage(msg.Err) 364 | if newstate, ok = apply(state, msg); !ok { 365 | metafora.Errorf("task=%q Unable to transition to error state! Exiting with state=%q", tid, state.Code) 366 | return state.Code.Terminal() 367 | } 368 | } 369 | 370 | metafora.Infof("task=%q transitioning %s --> %s --> %s", tid, state, msg, newstate) 371 | 372 | // Save state - the second half of this condition should rarely, if ever, apply 373 | if msg.Code != Release || (msg.Code == Release && (state.Code != newstate.Code || len(state.Errors) != len(newstate.Errors))) { 374 | if err := s.ss.Store(s.task, newstate); err != nil { 375 | // After upgrading to 1.25.5-gke.2000 we started experiencing the metadata server throwing POD_FINDER_IP_MISMATCH 376 | // errors resulting in failures authenticating to spanner. This panic will cause the pod to cycle. 377 | // See https://github.com/lytics/lio/issues/30414 378 | if strings.Contains(err.Error(), "spanner: code = \"Unauthenticated\"") { 379 | metafora.Errorf("task=%q Unable to persist state=%q due to failure to authenticate to spanner.", tid, newstate.Code) 380 | panic(err) 381 | } 382 | 383 | metafora.Errorf("task=%q Unable to persist state=%q. Marking done.", tid, newstate.Code) 384 | return true 385 | } 386 | } 387 | 388 | // Set next state and loop if non-terminal 389 | state = newstate 390 | 391 | // Expose the state for introspection 392 | s.setState(state) 393 | 394 | // Exit and unschedule task on terminal state. 395 | if state.Code.Terminal() { 396 | return true 397 | } 398 | 399 | // Release messages indicate the task should exit but not unschedule. 400 | if msg.Code == Release { 401 | return false 402 | } 403 | 404 | // Alternatively Stop() may have been called but the handler may not have 405 | // returned the Release message. Always exit if we've been told to Stop() 406 | // even if the handler has returned a different Message. 407 | select { 408 | case <-s.stopped: 409 | return false 410 | default: 411 | } 412 | } 413 | } 414 | 415 | // execute non-terminal states 416 | func (s *stateMachine) exec(state *State) *Message { 417 | switch state.Code { 418 | case Runnable: 419 | // Runnable passes control to the stateful handler 420 | return run(s.h, s.task, s.cmds) 421 | case Paused: 422 | // Paused until a message arrives 423 | return <-s.cmds 424 | case Sleeping: 425 | // Sleeping until the specified time (or a message) 426 | if state.Until == nil { 427 | metafora.Warnf("task=%q told to sleep without a time. Resuming.", s.task.ID()) 428 | return RunMessage() 429 | } 430 | dur := time.Until(*state.Until) 431 | metafora.Infof("task=%q sleeping for %s", s.task.ID(), dur) 432 | timer := time.NewTimer(dur) 433 | select { 434 | case <-timer.C: 435 | return RunMessage() 436 | case msg := <-s.cmds: 437 | timer.Stop() 438 | // Checkpoint & Release are special cases that shouldn't affect sleep 439 | // time, so maintain it across the state transition 440 | if msg.Code == Checkpoint || msg.Code == Release { 441 | msg.Until = state.Until 442 | } 443 | return msg 444 | } 445 | case Fault: 446 | // Special case where we potentially trim the current state to keep 447 | // errors from growing without bound.
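// DefaultErrHandler, for example, keeps only the DefaultErrMax most recent errors (see errors.go).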
448 | var msg *Message 449 | msg, state.Errors = s.errHandler(s.task, state.Errors) 450 | return msg 451 | default: 452 | panic("invalid state: " + state.String()) 453 | } 454 | } 455 | 456 | func run(f StatefulHandler, task metafora.Task, cmd <-chan *Message) (m *Message) { 457 | defer func() { 458 | if r := recover(); r != nil { 459 | stackBuf := make([]byte, 6000) 460 | stackBufLen := runtime.Stack(stackBuf, false) 461 | stackTraceStr := string(stackBuf[0:stackBufLen]) 462 | metafora.Errorf("task=%q Run method panic()d! Applying Error message. Panic: %v\nStack: %s", task.ID(), r, stackTraceStr) 463 | m = &Message{Code: Error, Err: fmt.Errorf("panic: %v\nstack: %s", r, stackTraceStr)} 464 | } 465 | }() 466 | 467 | // Defensive code to give handlers a *copy* of the command chan. That way if 468 | // a handler keeps receiving on the command chan in a goroutine past the 469 | // handler's lifetime it doesn't intercept commands intended for the 470 | // statemachine. 471 | internalcmd := make(chan *Message) 472 | stopped := make(chan struct{}) 473 | go func() { 474 | for { 475 | select { 476 | case c := <-cmd: 477 | internalcmd <- c 478 | case <-stopped: 479 | return 480 | } 481 | } 482 | }() 483 | defer close(stopped) 484 | 485 | return f(task, internalcmd) 486 | } 487 | 488 | // Stop sends a Release message to the state machine through the command chan. 489 | func (s *stateMachine) Stop() { 490 | select { 491 | case s.cmds <- ReleaseMessage(): 492 | // Also inform the state machine it should exit since the internal handler 493 | // may override the release message causing the task to be unreleaseable. 494 | s.stop() 495 | case <-s.stopped: 496 | // Already stopped! 497 | } 498 | } 499 | 500 | func (s *stateMachine) stop() { 501 | s.stopL.Lock() 502 | defer s.stopL.Unlock() 503 | select { 504 | case <-s.stopped: 505 | return 506 | default: 507 | close(s.stopped) 508 | } 509 | } 510 | 511 | // apply a message to cause a state transition. Returns false if the state 512 | // transition is invalid. 513 | func apply(cur *State, m *Message) (*State, bool) { 514 | //XXX Is a linear scan of all rules really the best option here? 515 | for _, trans := range Rules { 516 | if trans.Event == m.Code && trans.From == cur.Code { 517 | metafora.Debugf("Transitioned %s", trans) 518 | if m.Err != nil { 519 | // Append errors from message 520 | cur.Errors = append(cur.Errors, NewErr(m.Err, time.Now())) 521 | } 522 | 523 | // New State + Message's Until + Combined Errors 524 | return &State{Code: trans.To, Until: m.Until, Errors: cur.Errors}, true 525 | } 526 | } 527 | return cur, false 528 | } 529 | -------------------------------------------------------------------------------- /statemachine/statemachine_test.go: -------------------------------------------------------------------------------- 1 | package statemachine_test 2 | 3 | import ( 4 | "errors" 5 | "testing" 6 | "time" 7 | 8 | "github.com/lytics/metafora" 9 | "github.com/lytics/metafora/embedded" 10 | . 
"github.com/lytics/metafora/statemachine" 11 | ) 12 | 13 | func testhandler(task metafora.Task, cmds <-chan *Message) *Message { 14 | metafora.Debugf("Starting %s", task.ID()) 15 | m := <-cmds 16 | metafora.Debugf("%s recvd %s", task.ID(), m.Code) 17 | return m 18 | } 19 | 20 | type testStore struct { 21 | initial *State 22 | out chan<- *State 23 | } 24 | 25 | func (s testStore) Load(metafora.Task) (*State, error) { 26 | s.out <- s.initial 27 | return s.initial, nil 28 | } 29 | func (s testStore) Store(task metafora.Task, newstate *State) error { 30 | metafora.Debugf("%s storing %s", task.ID(), newstate.Code) 31 | s.out <- newstate 32 | return nil 33 | } 34 | 35 | // setup a task with the specified task ID in a stateful handler and run it. 36 | func setup(t *testing.T, tid string) (*embedded.StateStore, Commander, metafora.Handler, chan bool) { 37 | t.Parallel() 38 | ss := embedded.NewStateStore().(*embedded.StateStore) 39 | _ = ss.Store(task(tid), &State{Code: Runnable}) 40 | <-ss.Stored // pop initial state out 41 | cmdr := embedded.NewCommander() 42 | cmdlistener := cmdr.NewListener(tid) 43 | sm := New(task(tid), testhandler, ss, cmdlistener, nil) 44 | done := make(chan bool) 45 | go func() { done <- sm.Run() }() 46 | return ss, cmdr, sm, done 47 | } 48 | 49 | // FIXME leaks goroutines 50 | func TestRules(t *testing.T) { 51 | t.Parallel() 52 | for i, trans := range Rules { 53 | metafora.Debugf("Trying %s", trans) 54 | cmdr := embedded.NewCommander() 55 | cmdlistener := cmdr.NewListener("test") 56 | store := make(chan *State) 57 | 58 | state := &State{Code: trans.From} 59 | 60 | // Sleeping state needs extra Until state 61 | if trans.From == Sleeping { 62 | until := time.Now().Add(100 * time.Millisecond) 63 | state.Until = &until 64 | } 65 | 66 | ts := testStore{initial: state, out: store} 67 | 68 | // Create a new statemachine that starts from the From state 69 | sm := New(task("test"), testhandler, ts, cmdlistener, nil) 70 | go sm.Run() 71 | initial := <-store 72 | if initial.Code != trans.From { 73 | t.Fatalf("%d Initial state %q not set. 
Found: %q", i, trans.From, initial.Code) 74 | } 75 | 76 | // The Fault state transitions itself to either sleeping or failed 77 | if trans.From != Fault { 78 | // Apply the Event to transition to the To state 79 | msg := &Message{Code: trans.Event} 80 | 81 | // Sleep messages need extra state 82 | if trans.Event == Sleep { 83 | until := time.Now().Add(10 * time.Millisecond) 84 | msg.Until = &until 85 | } 86 | if trans.Event == Error { 87 | msg.Err = errors.New("test") 88 | } 89 | if err := cmdr.Send("test", msg); err != nil { 90 | t.Fatalf("Error sending message %s: %v", trans.Event, err) 91 | } 92 | } 93 | newstate := <-store 94 | if trans.From == Fault && trans.To == Failed { 95 | // continue on as this transition relies on state this test doesn't exercise 96 | continue 97 | } 98 | if newstate.Code != trans.To { 99 | t.Fatalf("%d Expected %q but found %q for transition %s", i, trans.To, newstate.Code, trans) 100 | } 101 | } 102 | } 103 | 104 | func TestCheckpointRelease(t *testing.T) { 105 | ss, cmdr, _, done := setup(t, "test1") 106 | 107 | // Should just cause statemachine to loop 108 | if err := cmdr.Send("test1", CheckpointMessage()); err != nil { 109 | t.Fatalf("Error sending checkpoint: %v", err) 110 | } 111 | select { 112 | case <-done: 113 | t.Fatalf("Checkpoint command should not have caused statemachine to exit.") 114 | case <-time.After(100 * time.Millisecond): 115 | } 116 | 117 | // Should cause the statemachine to exit 118 | if err := cmdr.Send("test1", ReleaseMessage()); err != nil { 119 | t.Fatalf("Error sending release: %v", err) 120 | } 121 | select { 122 | case d := <-done: 123 | if d { 124 | t.Fatalf("Release command should not have caused the task to be marked as done.") 125 | } 126 | case <-time.After(100 * time.Millisecond): 127 | t.Fatalf("Expected statemachine to exit but it did not.") 128 | } 129 | state, err := ss.Load(task("test1")) 130 | if err != nil { 131 | t.Fatal(err) 132 | } 133 | if state.Code != Runnable { 134 | t.Fatalf("Expected released task to be runnable but found state %q", state.Code) 135 | } 136 | } 137 | 138 | func TestSleep(t *testing.T) { 139 | ss, cmdr, _, _ := setup(t, "sleep-test") 140 | 141 | { 142 | // Put to sleep forever 143 | until := time.Now().Add(9001 * time.Hour) 144 | if err := cmdr.Send("sleep-test", SleepMessage(until)); err != nil { 145 | t.Fatalf("Error sending sleep: %v", err) 146 | } 147 | 148 | newstate := <-ss.Stored 149 | if newstate.State.Code != Sleeping || !newstate.State.Until.Equal(until) { 150 | t.Fatalf("Expected task to store state Sleeping, but stored: %s", newstate) 151 | } 152 | } 153 | 154 | // Make sure it stays sleeping for at least a bit 155 | select { 156 | case newstate := <-ss.Stored: 157 | t.Fatalf("Expected task to stay asleep forever but transitioned to: %s", newstate) 158 | case <-time.After(100 * time.Millisecond): 159 | } 160 | 161 | // Override current sleep with a shorter one 162 | dur := 1 * time.Second 163 | start := time.Now() 164 | until := start.Add(dur) 165 | if err := cmdr.Send("sleep-test", SleepMessage(until)); err != nil { 166 | t.Fatalf("Error sending sleep: %v", err) 167 | } 168 | 169 | newstate := <-ss.Stored 170 | if newstate.State.Code != Sleeping || !newstate.State.Until.Equal(until) { 171 | t.Fatalf("Expected task to store state Sleeping, but stored: %s", newstate) 172 | } 173 | 174 | // Make sure it transitions to Runnable after sleep has elapsed 175 | newstate = <-ss.Stored 176 | transitioned := time.Now() 177 | if newstate.State.Code != Runnable || newstate.State.Until != nil 
{ 178 | t.Fatalf("Expected task to be runnable without an Until time but found: %s", newstate.State) 179 | } 180 | elapsed := transitioned.Sub(start) 181 | if transitioned.Sub(start) < dur { 182 | t.Fatalf("Expected task to sleep for %s but slept for %s", dur, elapsed) 183 | } 184 | t.Logf("Statemachine latency: %s", elapsed-dur) 185 | } 186 | 187 | func TestSleepRelease(t *testing.T) { 188 | ss, cmdr, _, returned := setup(t, "sleep-test") 189 | 190 | until := time.Now().Add(9001 * time.Hour) 191 | { 192 | // Put to sleep forever 193 | if err := cmdr.Send("sleep-test", SleepMessage(until)); err != nil { 194 | t.Fatalf("Error sending sleep: %v", err) 195 | } 196 | 197 | newstate := <-ss.Stored 198 | if newstate.State.Code != Sleeping || !newstate.State.Until.Equal(until) { 199 | t.Fatalf("Expected task to store state Sleeping, but stored: %s", newstate) 200 | } 201 | } 202 | 203 | { 204 | // Releasing should maintain sleep state but exit 205 | if err := cmdr.Send("sleep-test", ReleaseMessage()); err != nil { 206 | t.Fatalf("Error sending release: %v", err) 207 | } 208 | newstate := <-ss.Stored 209 | if newstate.State.Code != Sleeping || newstate.State.Until == nil || !newstate.State.Until.Equal(until) { 210 | t.Fatalf("Releasing unexpectedly changed state: %s != Sleeping || %v != %s", newstate.State.Code, newstate.State.Until, until) 211 | } 212 | if done := <-returned; done { 213 | t.Fatal("Releasing should not have returned done.") 214 | } 215 | } 216 | } 217 | 218 | func TestTerminal(t *testing.T) { 219 | ss, cmdr, sm, done := setup(t, "terminal-test") 220 | 221 | // Kill the task 222 | if err := cmdr.Send("terminal-test", &Message{Code: Kill}); err != nil { 223 | t.Fatalf("Error sending kill command: %v", err) 224 | } 225 | 226 | // Task should be killed and done (unscheduled) 227 | newstate := <-ss.Stored 228 | if newstate.State.Code != Killed { 229 | t.Fatalf("Expected task to be killed but found: %s", newstate.State) 230 | } 231 | if !(<-done) { 232 | t.Fatal("Expected task to be done.") 233 | } 234 | if state, err := ss.Load(task("terminal-test")); err != nil || state.Code != Killed { 235 | t.Fatalf("Failed to load expected killed state for task: state=%s err=%v", state, err) 236 | } 237 | 238 | // Task should just die again if we try to reschedule it 239 | go func() { done <- sm.Run() }() 240 | select { 241 | case newstate := <-ss.Stored: 242 | t.Fatalf("Re-running a terminated task should *not* store state, but it stored: %v", newstate.State) 243 | case <-time.After(100 * time.Millisecond): 244 | // State shouldn't even be stored since it's not being changed and terminal 245 | // states should be immutable 246 | } 247 | 248 | if !(<-done) { 249 | t.Fatal("Expected task to be done.") 250 | } 251 | } 252 | 253 | func TestPause(t *testing.T) { 254 | ss, cmdr, sm, done := setup(t, "test-pause") 255 | 256 | pause := func() { 257 | if err := cmdr.Send("test-pause", PauseMessage()); err != nil { 258 | t.Fatalf("Error sending pause command to test-pause: %v", err) 259 | } 260 | newstate := <-ss.Stored 261 | if newstate.State.Code != Paused { 262 | t.Fatalf("Expected paused state but found: %s", newstate.State) 263 | } 264 | if state, err := ss.Load(task("test-pause")); err != nil || state.Code != Paused { 265 | t.Fatalf("Failed to load expected pause state for task: state=%s err=%v", state, err) 266 | } 267 | 268 | // Task should not be Done; pausing doesn't exit the statemachine 269 | select { 270 | case <-done: 271 | t.Fatal("Task exited unexpectedly.") 272 | case <-time.After(100 * 
time.Millisecond): 273 | } 274 | } 275 | 276 | // Pause the work 277 | pause() 278 | 279 | // Should be able to resume paused work 280 | if err := cmdr.Send("test-pause", RunMessage()); err != nil { 281 | t.Fatalf("Error sending run command to test-pause: %v", err) 282 | } 283 | newstate := <-ss.Stored 284 | if newstate.State.Code != Runnable { 285 | t.Fatalf("Expected runnable state but found: %s", newstate.State) 286 | } 287 | if state, err := ss.Load(task("test-pause")); err != nil || state.Code != Runnable { 288 | t.Fatalf("Failed to load expected runnable state for task: state=%s err=%v", state, err) 289 | } 290 | 291 | // Re-pause the work 292 | pause() 293 | 294 | // Pausing paused work is silly but fine 295 | pause() 296 | 297 | // Releasing paused work should make it exit but leave it in the paused state 298 | sm.Stop() 299 | newstate = <-ss.Stored 300 | if newstate.State.Code != Paused { 301 | t.Fatalf("Releasing should not have changed paused state but stored: %s", newstate.State) 302 | } 303 | select { 304 | case d := <-done: 305 | if d { 306 | t.Fatal("Releasing task should not have marked it as done.") 307 | } 308 | case <-time.After(100 * time.Millisecond): 309 | t.Fatal("Releasing paused task should have exited the statemachine, but didn't.") 310 | } 311 | 312 | // Ensure task is stored with the paused state 313 | if state, err := ss.Load(task("test-pause")); err != nil || state.Code != Paused { 314 | t.Fatalf("Failed to load expected paused state for task: state=%s err=%v", state, err) 315 | } 316 | } 317 | 318 | func TestMessageValid(t *testing.T) { 319 | t.Parallel() 320 | until := time.Now() 321 | validmsgs := []Message{ 322 | {Code: Run}, 323 | {Code: Sleep, Until: &until}, 324 | {Code: Pause}, 325 | {Code: Kill}, 326 | {Code: Error, Err: errors.New("test")}, 327 | {Code: Complete}, 328 | {Code: Checkpoint}, 329 | {Code: Release}, 330 | } 331 | for _, m := range validmsgs { 332 | if !m.Valid() { 333 | t.Errorf("Expected %s to be valid.", m) 334 | } 335 | } 336 | 337 | invalidmsgs := []Message{ 338 | {}, 339 | {Code: Sleep}, 340 | {Code: Error}, 341 | } 342 | for _, m := range invalidmsgs { 343 | if m.Valid() { 344 | t.Errorf("Expected %s to be invalid.", m) 345 | } 346 | } 347 | } 348 | -------------------------------------------------------------------------------- /statemachine/statestore.go: -------------------------------------------------------------------------------- 1 | package statemachine 2 | 3 | import "github.com/lytics/metafora" 4 | 5 | // StateStore is an interface implementations must provide for persisting task 6 | // state. Since the task ID is provided on each method call a single global 7 | // StateStore can be used and implementations should be safe for concurrent 8 | // access. 9 | type StateStore interface { 10 | // Load the persisted or initial state for a task. Errors will cause tasks to 11 | // be marked as done. 12 | // 13 | // The one exception is the special error StateNotFound which will cause the 14 | // state machine to start from the initial (Runnable) state. 15 | Load(metafora.Task) (*State, error) 16 | 17 | // Store the current task state. Errors will prevent current state from being 18 | // persisted and prevent state transitions. 
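// Store is called after each state transition except for Release messages that leave the state unchanged.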
19 | Store(metafora.Task, *State) error 20 | } 21 | -------------------------------------------------------------------------------- /task.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "encoding/json" 5 | "sync" 6 | "time" 7 | ) 8 | 9 | // Task is the minimum interface for Tasks to implement. 10 | type Task interface { 11 | // ID is the immutable globally unique ID for this task. 12 | ID() string 13 | } 14 | 15 | type basictask string 16 | 17 | // NewTask creates the most basic Task implementation: just a string ID. 18 | func NewTask(id string) Task { return basictask(id) } 19 | func (t basictask) ID() string { return string(t) } 20 | 21 | // RunningTask represents tasks running within a consumer. 22 | type RunningTask interface { 23 | Task() Task 24 | 25 | // Started is the time the task was started by this consumer. 26 | Started() time.Time 27 | 28 | // Stopped is the first time Stop() was called on this task or zero if it has 29 | // yet to be called. Tasks may take an indeterminate amount of time to 30 | // shut down after Stop() is called. 31 | Stopped() time.Time 32 | 33 | // Handler implementation called for this task. 34 | Handler() Handler 35 | } 36 | 37 | // runtask is the per-task state Metafora tracks internally. 38 | type runtask struct { 39 | // task is the original Task from the coordinator 40 | task Task 41 | 42 | // handler on which Run and Stop are called 43 | h Handler 44 | 45 | // stopL serializes calls to task.h.Stop() to make handler implementations 46 | // easier/safer as well as guard stopped 47 | stopL sync.Mutex 48 | 49 | // when task was started and when Stop was first called 50 | started time.Time 51 | stopped time.Time 52 | } 53 | 54 | func newTask(task Task, h Handler) *runtask { 55 | return &runtask{task: task, h: h, started: time.Now()} 56 | } 57 | 58 | func (t *runtask) stop() { 59 | t.stopL.Lock() 60 | defer t.stopL.Unlock() 61 | if t.stopped.IsZero() { 62 | t.stopped = time.Now() 63 | } 64 | t.h.Stop() 65 | } 66 | 67 | func (t *runtask) Task() Task { return t.task } 68 | func (t *runtask) Handler() Handler { return t.h } 69 | func (t *runtask) Started() time.Time { return t.started } 70 | func (t *runtask) Stopped() time.Time { 71 | t.stopL.Lock() 72 | defer t.stopL.Unlock() 73 | return t.stopped 74 | } 75 | 76 | func (t *runtask) MarshalJSON() ([]byte, error) { 77 | js := struct { 78 | ID string `json:"id"` 79 | Started time.Time `json:"started"` 80 | Stopped *time.Time `json:"stopped,omitempty"` 81 | }{ID: t.task.ID(), Started: t.started} 82 | 83 | // Only set stopped if it's non-zero 84 | if s := t.Stopped(); !s.IsZero() { 85 | js.Stopped = &s 86 | } 87 | 88 | return json.Marshal(&js) 89 | } 90 | -------------------------------------------------------------------------------- /util_test.go: -------------------------------------------------------------------------------- 1 | package metafora 2 | 3 | import ( 4 | "errors" 5 | "log" 6 | "os" 7 | ) 8 | 9 | func init() { 10 | SetLogger(log.New(os.Stderr, "", log.Lmicroseconds|log.Lshortfile)) 11 | } 12 | 13 | //TODO Move out into a testutil package for other packages to use. The problem 14 | //is that existing metafora tests would have to be moved to the metafora_test 15 | //package which means no manipulating unexported globals like balance jitter.
16 | 17 | type TestCoord struct { 18 | name string 19 | Tasks chan Task // will be returned in order, "" indicates return an error 20 | Commands chan Command 21 | Releases chan Task 22 | Dones chan Task 23 | closed chan bool 24 | } 25 | 26 | func NewTestCoord() *TestCoord { 27 | return &TestCoord{ 28 | name: "testcoord", 29 | Tasks: make(chan Task, 10), 30 | Commands: make(chan Command, 10), 31 | Releases: make(chan Task, 10), 32 | Dones: make(chan Task, 10), 33 | closed: make(chan bool), 34 | } 35 | } 36 | 37 | func (*TestCoord) Init(CoordinatorContext) error { return nil } 38 | func (*TestCoord) Claim(Task) bool { return true } 39 | func (c *TestCoord) Close() { close(c.closed) } 40 | func (c *TestCoord) Release(task Task) { c.Releases <- task } 41 | func (c *TestCoord) Done(task Task) { c.Dones <- task } 42 | func (c *TestCoord) Name() string { return c.name } 43 | 44 | // Watch sends tasks from the Tasks channel unless an empty string is sent. 45 | // Then an error is returned. 46 | func (c *TestCoord) Watch(out chan<- Task) error { 47 | var task Task 48 | for { 49 | select { 50 | case task = <-c.Tasks: 51 | Debugf("TestCoord recvd: %s", task) 52 | if task == nil || task.ID() == "" { 53 | return errors.New("test error") 54 | } 55 | case <-c.closed: 56 | return nil 57 | } 58 | select { 59 | case out <- task: 60 | Debugf("TestCoord sent: %s", task) 61 | case <-c.closed: 62 | return nil 63 | } 64 | } 65 | } 66 | 67 | // Command returns commands from the Commands channel unless a nil is sent. 68 | // Then an error is returned. 69 | func (c *TestCoord) Command() (Command, error) { 70 | cmd := <-c.Commands 71 | if cmd == nil { 72 | return cmd, errors.New("test error") 73 | } 74 | return cmd, nil 75 | } 76 | --------------------------------------------------------------------------------
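For reference, a minimal sketch of driving a Consumer with TestCoord from a test in this package (the blocking handler mirrors slowtask_test.go above, and the balancer is the releaseAllBalancer defined there; this is illustrative, not part of the repository):

```go
func TestExampleUsage(t *testing.T) {
	tc := NewTestCoord()
	h := SimpleHandler(func(task Task, stop <-chan bool) bool {
		<-stop // run until released by Shutdown
		return true
	})
	c, err := NewConsumer(tc, h, &releaseAllBalancer{})
	if err != nil {
		t.Fatalf("Error creating consumer: %v", err)
	}
	go c.Run()

	tc.Tasks <- NewTask("task-1") // feed the consumer a task to claim
	c.Shutdown()                  // stops the handler and exits Run
}
```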