64 |
65 | Balancer | Go interface consulted by the Consumer for determining which tasks can be claimed and which should be released. See balancer.go.
66 | Broker | External task and command store, like etcd, for the Coordinator to use.
67 | Consumer | Core work runner. Integrates Balancer, Coordinator, and Handlers to get work done.
68 | Coordinator | Client Go interface to the Broker. See coordinator.go.
69 | Handler | Go interface for executing tasks.
70 | Task | Unit of work. Executed by Handlers.
71 |
89 | FAQ
90 | ---
91 |
92 | **Q. Is it ready for production use?**
93 |
94 | *Yes.* Metafora with the etcd coordinator has been the production work system at
95 | [Lytics](http://lytics.io) since January 2014 and runs thousands of tasks
96 | concurrently across a cluster of VMs.
97 |
98 | Since Metafora is still under heavy development, you probably want to pin the
99 | dependencies to a commit hash or
100 | [tag](https://github.com/lytics/metafora/releases) to keep the API stable. The
101 | `master` branch is automatically tested and is safe for use if you can tolerate
102 | API changes.
103 |
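With Go modules, for example, you can pin to a release tag or a commit instead of
tracking `master` (the version placeholder below is illustrative):

```sh
go get github.com/lytics/metafora@<tag-or-commit>
```
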
104 | **Q. Where is the metaforad daemon?**
105 |
106 | It doesn't exist. Metafora is a library for you to import and use in a service
107 | you write. Metafora handles task management but leaves details such as task
108 | implementation and daemonization up to the user.
109 |
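For a sense of what that looks like, here is a minimal sketch of a service that
embeds Metafora. It uses the in-memory `embedded` coordinator from this
repository for brevity; a real deployment would use a Broker-backed coordinator
(e.g. etcd) instead, and the node and task names are illustrative:

```go
package main

import (
	"github.com/lytics/metafora"
	"github.com/lytics/metafora/embedded"
)

func main() {
	// Handler: process a task until it's done or the stop channel closes.
	h := metafora.SimpleHandler(func(task metafora.Task, stop <-chan bool) bool {
		// ... your task logic here ...
		return true // true = done; false = release for rescheduling
	})

	// In-process coordinator/client pair (see the embedded package below).
	coord, client := embedded.NewEmbeddedPair("node1")

	consumer, err := metafora.NewConsumer(coord, h, metafora.DumbBalancer)
	if err != nil {
		panic(err)
	}

	// Submit a task, then run the consumer. Run blocks until Shutdown is
	// called (from a signal handler, for example).
	_ = client.SubmitTask(metafora.NewTask("task-1"))
	consumer.Run()
}
```
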
110 | [FAQ continued in Documentation...](Documentation/faq.md)
111 |
--------------------------------------------------------------------------------
/balancer.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | import (
4 | "math"
5 | "math/rand"
6 | "time"
7 | )
8 |
9 | const (
10 | // Default threshold is 120% of cluster average
11 | defaultThreshold float64 = 1.2
12 | )
13 |
14 | // NoDelay is simply the zero value for time and meant to be a more meaningful
15 | // value for CanClaim methods to return instead of initializing a new empty
16 | // time struct.
17 | var NoDelay = time.Time{}
18 |
19 | // BalancerContext is a limited interface the Consumer exposes to Balancers
20 | // for access to Consumer state.
21 | type BalancerContext interface {
22 | // Tasks returns a sorted list of task IDs owned by this Consumer. The
23 | // Consumer stops task manipulations during claiming and balancing, so the
24 | // list will be accurate unless a task naturally completes.
25 | Tasks() []RunningTask
26 | }
27 |
28 | // Balancer is the core task balancing interface. Without a master, Metafora
29 | // clusters are cooperatively balanced -- meaning each node needs to know how
30 | // to balance itself.
31 | type Balancer interface {
32 | // Init is called once and only once before any other Balancer methods are
33 | // called. The context argument is meant to expose functionality that might
34 | // be useful for CanClaim and Balance implementations.
35 | Init(BalancerContext)
36 |
37 | // CanClaim should return true if the consumer should accept a task.
38 | //
39 | // When denying a claim by returning false, CanClaim should return the time
40 | // at which to reconsider the task for claiming.
41 | CanClaim(task Task) (ignoreUntil time.Time, claim bool)
42 |
43 | // Balance should return the list of Task IDs that should be released. The
44 | // criteria used to determine which tasks should be released is left up to
45 | // the implementation.
46 | Balance() (release []string)
47 | }
48 |
49 | // DumbBalancer is the simplest possible balancer implementation which simply
50 | // accepts all tasks. Since it has no state a single global instance exists.
51 | var DumbBalancer = dumbBalancer{}
52 |
53 | type dumbBalancer struct{}
54 |
55 | // Init does nothing.
56 | func (dumbBalancer) Init(BalancerContext) {}
57 |
58 | // CanClaim always returns true.
59 | func (dumbBalancer) CanClaim(Task) (time.Time, bool) { return NoDelay, true }
60 |
61 | // Balance never returns any tasks to balance.
62 | func (dumbBalancer) Balance() []string { return nil }
63 |
64 | // ClusterState provides information about the cluster for use by the FairBalancer.
65 | type ClusterState interface {
66 | // NodeTaskCount returns the number of tasks currently claimed by each node.
67 | NodeTaskCount() (map[string]int, error)
68 | }
69 |
70 | // NewDefaultFairBalancer creates a new FairBalancer but requires a
71 | // ClusterState implementation to gain more information about the cluster than
72 | // BalancerContext provides.
73 | func NewDefaultFairBalancer(nodeid string, cs ClusterState) Balancer {
74 | return NewDefaultFairBalancerWithThreshold(nodeid, cs, defaultThreshold)
75 | }
76 |
77 | // NewDefaultFairBalancerWithThreshold allows callers to override
78 | // FairBalancer's default 120% task load release threshold.
79 | func NewDefaultFairBalancerWithThreshold(nodeid string, cs ClusterState, threshold float64) Balancer {
80 | return &FairBalancer{
81 | nodeid: nodeid,
82 | clusterstate: cs,
83 | releaseThreshold: threshold,
84 | }
85 | }
86 |
87 | // FairBalancer is a Balancer implementation that randomly releases tasks when
88 | // the number currently running on this node exceeds some percentage of the
89 | // cluster average (default 120%).
90 | //
91 | // This balancer will claim all tasks which were not released on the last call
92 | // to Balance.
93 | type FairBalancer struct {
94 | nodeid string
95 |
96 | bc BalancerContext
97 | clusterstate ClusterState
98 |
99 | releaseThreshold float64
100 | delay time.Time
101 | }
102 |
103 | func (e *FairBalancer) Init(s BalancerContext) {
104 | e.bc = s
105 | }
106 |
107 | // CanClaim rejects tasks for a period of time if the last balance released
108 | // tasks. Otherwise all tasks are accepted.
109 | func (e *FairBalancer) CanClaim(task Task) (time.Time, bool) {
110 | if e.delay.After(time.Now()) {
111 | // Return delay set by Balance()
112 | return e.delay, false
113 | }
114 |
115 | // Sleep proportionally to the number of claimed tasks (n/4 ms) to give less loaded nodes a claim advantage
116 | n := len(e.bc.Tasks())
117 | time.Sleep(time.Duration(n>>2) * time.Millisecond)
118 | return NoDelay, true
119 | }
120 |
121 | // Balance releases tasks if this node is running more than the release
122 | // threshold (default 120%) of the cluster's average tasks per node.
123 | func (e *FairBalancer) Balance() []string {
124 | nodetasks := e.bc.Tasks()
125 |
126 | // Reset delay
127 | e.delay = time.Time{}
128 |
129 | // If local tasks <= 1 this node should never rebalance
130 | if len(nodetasks) < 2 {
131 | Infof("balancing skipped: nodetasks:%v ", nodetasks)
132 | return nil
133 | }
134 |
135 | current, err := e.clusterstate.NodeTaskCount()
136 | if err != nil {
137 | Warnf("balancing skipped: retrieving cluster state: %v", err)
138 | return nil
139 | }
140 |
141 | desired := e.desiredCount(current)
142 | shouldrelease := current[e.nodeid] - desired
143 | if shouldrelease < 1 {
144 | Infof("balancing skipped: shouldrelease <1 nodetasks:%v desired:%v shouldrelease:%v", len(nodetasks), desired, shouldrelease)
145 | return nil
146 | }
147 |
148 | releasetasks := make([]string, 0, shouldrelease)
149 | releaseset := make(map[string]struct{}, shouldrelease)
150 |
151 | random := rand.New(rand.NewSource(time.Now().UnixNano()))
152 | for len(releasetasks) < shouldrelease {
153 | tid := nodetasks[random.Intn(len(nodetasks))].Task().ID()
154 | if _, ok := releaseset[tid]; !ok {
155 | releasetasks = append(releasetasks, tid)
156 | releaseset[tid] = struct{}{}
157 | }
158 | }
159 |
160 | e.delay = time.Now().Add(time.Duration(len(releasetasks)) * time.Second)
161 | return releasetasks
162 | }
163 |
164 | // desiredCount returns the desired maximum task count based on the current cluster state.
165 | func (e *FairBalancer) desiredCount(current map[string]int) int {
166 | total := 0
167 | for _, c := range current {
168 | total += c
169 | }
170 |
171 | avg := 0
172 | if len(current) > 0 {
173 | avg = total / len(current)
174 | }
175 |
176 | return int(math.Ceil(float64(avg) * e.releaseThreshold))
177 | }
178 |
--------------------------------------------------------------------------------
/balancer_res.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | import (
4 | "fmt"
5 | "time"
6 | )
7 |
8 | // ResourceReporter is required by the ResourceBalancer to read the resource
9 | // being used for balancing.
10 | type ResourceReporter interface {
11 | // Used returns the amount of a resource used and the total amount of that
12 | // resource.
13 | Used() (used uint64, total uint64)
14 |
15 | // String returns the unit resources are reported in.
16 | String() string
17 | }
18 |
19 | // ResourceBalancer is a Balancer implementation which uses two thresholds to
20 | // limit claiming and rebalance work based upon a resource reported by a
21 | // ResourceReporter. When the claim threshold is exceeded, no new work will be
22 | // claimed. When the release threshold is exceeded work will be released until
23 | // below that threshold. The claim threshold must be less than the release
24 | // threshold (otherwise claims would continue just to have the work
25 | // rebalanced.)
26 | //
27 | // Even below the claim limit, claims are delayed by the percent of resources
28 | // used (in milliseconds) to give less loaded nodes a claim advantage.
29 | //
30 | // The balancer releases the oldest tasks first (skipping those that are already
31 | // stopping) to try to prevent rebalancing the same tasks repeatedly within a
32 | // cluster.
33 | type ResourceBalancer struct {
34 | ctx BalancerContext
35 | reporter ResourceReporter
36 |
37 | claimLimit int
38 | releaseLimit int
39 | }
40 |
41 | // NewResourceBalancer creates a new ResourceBalancer or returns an error if
42 | // the limits are invalid.
43 | //
44 | // Limits should be a percentage expressed as an integer between 1 and 100
45 | // inclusive.
46 | func NewResourceBalancer(src ResourceReporter, claimLimit, releaseLimit int) (*ResourceBalancer, error) {
47 | if claimLimit < 1 || claimLimit > 100 || releaseLimit < 1 || releaseLimit > 100 {
48 | return nil, fmt.Errorf("Limits must be between 1 and 100. claim=%d release=%d", claimLimit, releaseLimit)
49 | }
50 | if claimLimit >= releaseLimit {
51 | return nil, fmt.Errorf("Claim threshold must be < release threshold. claim=%d >= release=%d", claimLimit, releaseLimit)
52 | }
53 |
54 | return &ResourceBalancer{
55 | reporter: src,
56 | claimLimit: claimLimit,
57 | releaseLimit: releaseLimit,
58 | }, nil
59 | }
60 |
61 | func (b *ResourceBalancer) Init(ctx BalancerContext) {
62 | b.ctx = ctx
63 | }
64 |
65 | func (b *ResourceBalancer) CanClaim(Task) (time.Time, bool) {
66 | used, total := b.reporter.Used()
67 | threshold := int(float32(used) / float32(total) * 100)
68 | if threshold >= b.claimLimit {
69 | //FIXME Until #93 is fixed returning false is very dangerous as it could
70 | // cause a tight loop with the coordinator. Sleep longer than more
71 | // lightly loaded nodes.
72 | dur := time.Duration(100+(threshold-b.claimLimit)) * time.Millisecond
73 | Infof("%d is over the claim limit of %d. Used %d of %d %s. Sleeping %s before claiming.",
74 | threshold, b.claimLimit, used, total, b.reporter, dur)
75 | time.Sleep(dur)
76 | return NoDelay, true
77 | }
78 |
79 | // Always sleep based on resource usage to give less loaded nodes an advantage
80 | dur := time.Duration(threshold) * time.Millisecond
81 | time.Sleep(dur)
82 | return NoDelay, true
83 | }
84 |
85 | func (b *ResourceBalancer) Balance() []string {
86 | used, total := b.reporter.Used()
87 | threshold := int(float32(used) / float32(total) * 100)
88 | if threshold < b.releaseLimit {
89 | // We're below the limit! Don't release anything.
90 | return nil
91 | }
92 |
93 | // Release the oldest task that isn't already stopping
94 | var oldest RunningTask
95 | for _, t := range b.ctx.Tasks() {
96 | if t.Stopped().IsZero() && (oldest == nil || oldest.Started().After(t.Started())) {
97 | oldest = t
98 | }
99 | }
100 |
101 | // No tasks or all tasks are stopping, don't bother rebalancing
102 | if oldest == nil {
103 | return nil
104 | }
105 |
106 | Infof("Releasing task %s (started %s) because %d > %d (%d of %d %s used)",
107 | oldest.Task().ID(), oldest.Started(), threshold, b.releaseLimit, used, total, b.reporter)
108 | return []string{oldest.Task().ID()}
109 | }
110 |
--------------------------------------------------------------------------------
/balancer_res_test.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | import "testing"
4 |
5 | type fakeReporter struct {
6 | used uint64
7 | total uint64
8 | }
9 |
10 | func (r *fakeReporter) Used() (uint64, uint64) { return r.used, r.total }
11 | func (r *fakeReporter) String() string { return "fakes" }
12 |
13 | func TestResourceBalancer(t *testing.T) {
14 | t.Parallel()
15 |
16 | fr := &fakeReporter{used: 750, total: 1000}
17 | _, err := NewResourceBalancer(fr, 80, 75)
18 | if err == nil {
19 | t.Fatal("Expected an error: release threshold was lower than claim.")
20 | }
21 |
22 | bal, err := NewResourceBalancer(fr, 80, 90)
23 | if err != nil {
24 | t.Fatalf("Unexpected error creating resource balancer: %v", err)
25 | }
26 |
27 | ctx := &TestConsumerState{
28 | Current: []string{"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"},
29 | }
30 | bal.Init(ctx)
31 |
32 | release := bal.Balance()
33 | if len(release) > 0 {
34 | t.Errorf("Released tasks when we were well below limits! %v", release)
35 | }
36 |
37 | // Bump resource usage and rebalance
38 | fr.used = 901
39 | release = bal.Balance()
40 | if len(release) != 1 || release[0] != "1" {
41 | t.Errorf("Expected 1 released task but found: %v", release)
42 | }
43 |
44 | // Bump resource usage even higher; Balance still releases the single oldest task
45 | fr.used = 999
46 | release = bal.Balance()
47 | if len(release) != 1 || release[0] != "1" {
48 | t.Errorf("Expected 1 released task but found: %v", release)
49 | }
50 |
51 | //FIXME When #93 is fixed this test should break as CanClaim should actually
52 | // return false
53 | if _, ok := bal.CanClaim(testTask{"claimmepls"}); !ok {
54 | t.Errorf("Until #93 is fixed, CanClaim should always return true")
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/balancer_sleep.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | import "time"
4 |
5 | /*
6 | Q. Why 30ms?
7 |
8 | A. It's sufficiently long that unless a node is under heavy load (either
9 | computational, GC-induced, or network latency) it should win the claim-race
10 | against nodes with more tasks. If it's under so much load that it loses against
11 | nodes with more tasks, it's probably best to let those other nodes win!
12 |
13 | 30ms should scale fairly well up to hundreds of tasks per node as Metafora
14 | isn't really intended for high-throughput/low-latency task churn.
15 | */
16 | const sleepBalLen = 30 * time.Millisecond
17 |
18 | // SleepBalancer is a simplistic Balancer implementation which sleeps 30ms per
19 | // claimed task in its CanClaim() method. This means the node with the fewest
20 | // claimed tasks in a cluster should sleep the shortest length of time and win
21 | // the claim race.
22 | //
23 | // It never releases tasks during Balance() calls.
24 | type SleepBalancer struct {
25 | ctx BalancerContext
26 | }
27 |
28 | // Init is called by the Consumer.
29 | func (b *SleepBalancer) Init(ctx BalancerContext) { b.ctx = ctx }
30 |
31 | // Balance never returns any tasks for the sleepy balancer.
32 | func (*SleepBalancer) Balance() []string { return nil }
33 |
34 | // CanClaim sleeps 30ms per claimed task.
35 | func (b *SleepBalancer) CanClaim(Task) (time.Time, bool) {
36 | num := len(b.ctx.Tasks())
37 | time.Sleep(time.Duration(num) * sleepBalLen)
38 | return NoDelay, true
39 | }
40 |
--------------------------------------------------------------------------------
/balancer_test.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | import (
4 | "testing"
5 | "time"
6 | )
7 |
8 | var (
9 | _ BalancerContext = (*TestConsumerState)(nil)
10 | _ ClusterState = (*TestClusterState)(nil)
11 | )
12 |
13 | func TestFairBalancerOneNode(t *testing.T) {
14 | t.Parallel()
15 | // Single node should never release tasks
16 | clusterstate := &TestClusterState{
17 | Current: map[string]int{"node1": 5},
18 | }
19 |
20 | consumerstate := &TestConsumerState{
21 | []string{"1", "2", "3", "4", "5"},
22 | }
23 |
24 | fb := NewDefaultFairBalancer("node1", clusterstate)
25 | fb.Init(consumerstate)
26 |
27 | if _, ok := fb.CanClaim(testTask{"23"}); !ok {
28 | t.Fatal("Expected claim to be true")
29 | }
30 |
31 | rebalance := fb.Balance()
32 | if len(rebalance) != 0 {
33 | t.Fatalf("Expected 0 rebalance tasks: %v", rebalance)
34 | }
35 | }
36 |
37 | func TestFairBalanceOver(t *testing.T) {
38 | t.Parallel()
39 | clusterstate := &TestClusterState{
40 | Current: map[string]int{
41 | "node1": 10,
42 | "node2": 2,
43 | },
44 | }
45 |
46 | consumerstate := &TestConsumerState{
47 | []string{"1", "2", "3", "4", "5"},
48 | }
49 |
50 | fb := NewDefaultFairBalancer("node1", clusterstate)
51 | fb.Init(consumerstate)
52 |
53 | if _, ok := fb.CanClaim(testTask{"23"}); !ok {
54 | t.Fatal("Expected claim to be true")
55 | }
56 |
57 | expect := 2
58 | rebalance := fb.Balance()
59 | if len(rebalance) != expect {
60 | t.Fatalf("Expected %d rebalanced tasks, received %d", expect, len(rebalance))
61 | }
62 | }
63 |
64 | func TestFairBalanceNothing(t *testing.T) {
65 | t.Parallel()
66 | clusterstate := &TestClusterState{
67 | Current: map[string]int{
68 | "node1": 2,
69 | "node2": 10,
70 | },
71 | }
72 |
73 | consumerstate := &TestConsumerState{
74 | []string{"1", "2", "3", "4", "5"},
75 | }
76 |
77 | fb := NewDefaultFairBalancer("node1", clusterstate)
78 | fb.Init(consumerstate)
79 |
80 | if _, ok := fb.CanClaim(testTask{"23"}); !ok {
81 | t.Fatal("Expected claim to be true")
82 | }
83 |
84 | expect := 0
85 | rebalance := fb.Balance()
86 | if len(rebalance) != expect {
87 | t.Fatalf("Expected %d rebalanced tasks, received %d", expect, len(rebalance))
88 | }
89 |
90 | }
91 |
92 | type testTask struct {
93 | id string
94 | }
95 |
96 | func (t testTask) ID() string { return t.id }
97 |
98 | type TestClusterState struct {
99 | Current map[string]int
100 | Err error
101 | }
102 |
103 | func (ts *TestClusterState) NodeTaskCount() (map[string]int, error) {
104 | if ts.Err != nil {
105 | return nil, ts.Err
106 | }
107 |
108 | return ts.Current, nil
109 | }
110 |
111 | type TestConsumerState struct {
112 | Current []string
113 | }
114 |
115 | func (tc *TestConsumerState) Tasks() []RunningTask {
116 | tasks := []RunningTask{}
117 | for _, id := range tc.Current {
118 | tasks = append(tasks, newTask(testTask{id}, nil))
119 | }
120 | return tasks
121 | }
122 |
123 | // Sleepy Balancer Tests
124 |
125 | type sbCtx struct {
126 | t *testing.T
127 | tasks []string
128 | }
129 |
130 | func (ctx *sbCtx) Tasks() []RunningTask {
131 | tasks := []RunningTask{}
132 | for _, id := range ctx.tasks {
133 | tasks = append(tasks, newTask(testTask{id}, nil))
134 | }
135 | return tasks
136 | }
137 | func (ctx *sbCtx) Log(l int, v string, args ...interface{}) {
138 | Infof(v, args...)
139 | }
140 |
141 | func TestSleepBalancer(t *testing.T) {
142 | t.Parallel()
143 | c := &sbCtx{t: t, tasks: make([]string, 0, 10)}
144 |
145 | b := &SleepBalancer{}
146 | b.Init(c)
147 |
148 | task := "test-task"
149 | pre := time.Now()
150 | total := 0
151 | for i := 0; i < 10; i++ {
152 | total += i
153 | b.CanClaim(testTask{task})
154 | c.tasks = append(c.tasks, task)
155 | }
156 | post := time.Now()
157 | minimum := pre.Add(time.Duration(total) * sleepBalLen)
158 |
159 | // Sleep balancer should never finish before the minimum timeout threshold
160 | if post.Before(minimum) {
161 | t.Fatalf("SleepBalancer finished too early: %s < %s", post, minimum)
162 | }
163 |
164 | // Sleep balancer shouldn't experience much overhead
165 | if post.After(minimum.Add(50 * time.Millisecond)) {
166 | t.Fatalf("SleepBalancer went a worrying amount over the expected time: %s > %s", post, minimum)
167 | }
168 | }
169 |
--------------------------------------------------------------------------------
/client.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | type Client interface {
4 | // SubmitTask submits a task to the system, the task id must be unique.
5 | SubmitTask(Task) error
6 |
7 | // DeleteTask deletes a task from the system.
8 | DeleteTask(taskId string) error
9 |
10 | // SubmitCommand submits a command to a particular node.
11 | SubmitCommand(node string, command Command) error
12 |
13 | // Nodes retrieves the current set of registered nodes.
14 | Nodes() ([]string, error)
15 | }
16 |
--------------------------------------------------------------------------------
/cmd/metaforactl/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | func main() {
4 |
5 | }
6 |
--------------------------------------------------------------------------------
/command.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | import "encoding/json"
4 |
5 | const (
6 | cmdFreeze = "freeze"
7 | cmdUnfreeze = "unfreeze"
8 | cmdBalance = "balance"
9 | cmdStopTask = "stop_task"
10 | )
11 |
12 | // Commands are a way clients can communicate directly with nodes for cluster
13 | // maintenance.
14 | //
15 | // Use the Command functions to generate implementations of this interface.
16 | // Metafora's consumer will discard unknown commands.
17 | type Command interface {
18 | // Name returns the name of the command.
19 | Name() string
20 |
21 | // Parameters returns the parameters, if any, the command will be executed
22 | // with.
23 | Parameters() map[string]interface{}
24 |
25 | // Marshal turns a command into its wire representation.
26 | Marshal() ([]byte, error)
27 | }
28 |
29 | // command is the internal representation of commands used for serialization.
30 | type command struct {
31 | C string `json:"command"`
32 | P map[string]interface{} `json:"parameters,omitempty"`
33 | }
34 |
35 | // Name returns the name of the command.
36 | func (c *command) Name() string {
37 | return c.C
38 | }
39 |
40 | // Parameters returns the parameters, if any, the command will be executed
41 | // with.
42 | func (c *command) Parameters() map[string]interface{} {
43 | return c.P
44 | }
45 |
46 | // Marshal turns a command into its wire representation.
47 | func (c *command) Marshal() ([]byte, error) {
48 | return json.Marshal(c)
49 | }
50 |
51 | // UnmarshalCommand parses a command from its wire representation.
52 | func UnmarshalCommand(p []byte) (Command, error) {
53 | c := &command{}
54 | err := json.Unmarshal(p, c)
55 | return c, err
56 | }
57 |
58 | // CommandFreeze stops all task watching and balancing.
59 | func CommandFreeze() Command {
60 | return &command{C: cmdFreeze}
61 | }
62 |
63 | // CommandUnfreeze resumes task watching and balancing.
64 | func CommandUnfreeze() Command {
65 | return &command{C: cmdUnfreeze}
66 | }
67 |
68 | // CommandBalance forces the node's balancer.Balance method to be called even
69 | // if frozen.
70 | func CommandBalance() Command {
71 | return &command{C: cmdBalance}
72 | }
73 |
74 | // CommandStopTask forces a node to stop a task even if frozen.
75 | func CommandStopTask(task string) Command {
76 | return &command{C: cmdStopTask, P: map[string]interface{}{"task": task}}
77 | }
78 |
--------------------------------------------------------------------------------
/command_test.go:
--------------------------------------------------------------------------------
1 | package metafora_test
2 |
3 | import (
4 | "reflect"
5 | "testing"
6 |
7 | . "github.com/lytics/metafora"
8 | )
9 |
10 | func testCmd(t *testing.T, cmd Command, name string, params map[string]interface{}) {
11 | if cmd.Name() != name {
12 | t.Errorf("%s command's name is wrong: %s", name, cmd.Name())
13 | }
14 | if !reflect.DeepEqual(cmd.Parameters(), params) {
15 | t.Errorf("%s command's params are wrong. expected %#v != %#v", name, params, cmd.Parameters())
16 | }
17 | b, err := cmd.Marshal()
18 | if err != nil {
19 | t.Errorf("%s command's Marshal() returned an error: %v", name, err)
20 | return
21 | }
22 | cmd2, err := UnmarshalCommand(b)
23 | if err != nil {
24 | t.Errorf("%s command's Marshal() output could not be Unmarshalled: %v", name, err)
25 | return
26 | }
27 | if cmd2.Name() != name {
28 | t.Errorf("%s command's name didn't Unmarshal properly: %s", name, cmd2.Name())
29 | }
30 | if !reflect.DeepEqual(cmd2.Parameters(), params) {
31 | t.Errorf("%s command's params didn't Unmarshal properly. expected %#v != %#v",
32 | name, params, cmd2.Parameters())
33 | }
34 | }
35 |
36 | func TestCommands(t *testing.T) {
37 | t.Parallel()
38 | testCmd(t, CommandFreeze(), "freeze", nil)
39 | testCmd(t, CommandUnfreeze(), "unfreeze", nil)
40 | testCmd(t, CommandBalance(), "balance", nil)
41 | testCmd(t, CommandStopTask("test"), "stop_task", map[string]interface{}{"task": "test"})
42 | }
43 |
--------------------------------------------------------------------------------
/coordinator.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | // CoordinatorContext is the context passed to coordinators by the core
4 | // consumer.
5 | type CoordinatorContext interface {
6 | // Lost is called by the Coordinator when a claimed task is lost to another
7 | // node. The Consumer will stop the task locally.
8 | //
9 | // Since this implies there is a window of time where the task is executing
10 | // more than once, this is a sign of an unhealthy cluster.
11 | Lost(Task)
12 | }
13 |
14 | // Coordinator is the core interface Metafora uses to discover, claim, and
15 | // release tasks as well as receive commands.
16 | type Coordinator interface {
17 | // Init is called once by the consumer to provide a CoordinatorContext to
18 | // Coordinator implementations. NewConsumer returns any error returned by Init.
19 | Init(CoordinatorContext) error
20 |
21 | // Watch the broker for claimable tasks. Watch blocks until Close is called
22 | // or it encounters an error. Tasks are sent to consumer via the tasks chan.
23 | Watch(tasks chan<- Task) (err error)
24 |
25 | // Claim is called by the Consumer when a Balancer has determined that a task
26 | // ID can be claimed. Claim returns false if another consumer has already
27 | // claimed the ID.
28 | Claim(Task) bool
29 |
30 | // Release a task for other consumers to claim. May be called after Close.
31 | Release(Task)
32 |
33 | // Done is called by Metafora when a task has been completed and should never
34 | // be scheduled to run again (in other words: deleted from the broker).
35 | //
36 | // May be called after Close.
37 | Done(Task)
38 |
39 | // Command blocks until a command for this node is received from the broker
40 | // by the coordinator. Command must return (nil, nil) when Close is called.
41 | Command() (Command, error)
42 |
43 | // Close the coordinator. Stop waiting for tasks and commands. Remove node from broker.
44 | //
45 | // Do not release tasks. The consumer will handle task releasing.
46 | Close()
47 |
48 | // Name of the coordinator for use in logs and other tooling.
49 | Name() string
50 | }
51 |
52 | type coordinatorContext struct {
53 | *Consumer
54 | }
55 |
56 | // Lost is a light wrapper around Consumer.stopTask to make it suitable for
57 | // calling by Coordinator implementations via the CoordinatorContext interface.
58 | func (ctx *coordinatorContext) Lost(t Task) {
59 | tid := t.ID()
60 | Errorf("Lost task %s", tid)
61 | ctx.stopTask(tid)
62 | }
63 |
--------------------------------------------------------------------------------
/doc.go:
--------------------------------------------------------------------------------
1 | // Metafora is a library for building distributed work systems. It's masterless
2 | // and extensible via core Balancer and Coordinator interfaces.
3 | //
4 | // If you use the builtin FairBalancer and EtcdCoordinator, all you have to do
5 | // is implement a Handler and HandlerFunc, and then run the Consumer.
6 | //
7 | // See https://github.com/lytics/metafora
8 | package metafora
9 |
--------------------------------------------------------------------------------
/embedded/README.md:
--------------------------------------------------------------------------------
1 | Creates client/coordinator pairs which use channels to communicate.
2 |
3 | Meant to be used embedded in applications which do not need/want external
4 | coordination, especially tests.
5 |
--------------------------------------------------------------------------------
/embedded/client.go:
--------------------------------------------------------------------------------
1 | package embedded
2 |
3 | import "github.com/lytics/metafora"
4 |
5 | func NewEmbeddedClient(taskchan chan metafora.Task, cmdchan chan *NodeCommand, nodechan chan []string) metafora.Client {
6 | return &EmbeddedClient{taskchan, cmdchan, nodechan}
7 | }
8 |
9 | type EmbeddedClient struct {
10 | taskchan chan<- metafora.Task
11 | cmdchan chan<- *NodeCommand
12 | nodechan <-chan []string
13 | }
14 |
15 | func (ec *EmbeddedClient) SubmitTask(t metafora.Task) error {
16 | ec.taskchan <- t
17 | return nil
18 | }
19 |
20 | func (ec *EmbeddedClient) DeleteTask(taskid string) error {
21 | nodes, _ := ec.Nodes()
22 | // Simply submit stop for all nodes
23 | for _, nid := range nodes {
24 | _ = ec.SubmitCommand(nid, metafora.CommandStopTask(taskid))
25 | }
26 | return nil
27 | }
28 |
29 | func (ec *EmbeddedClient) SubmitCommand(nodeid string, command metafora.Command) error {
30 | ec.cmdchan <- &NodeCommand{command, nodeid}
31 | return nil
32 | }
33 |
34 | func (ec *EmbeddedClient) Nodes() ([]string, error) {
35 | nodes := <-ec.nodechan
36 | return nodes, nil
37 | }
38 |
--------------------------------------------------------------------------------
/embedded/commander.go:
--------------------------------------------------------------------------------
1 | package embedded
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/lytics/metafora/statemachine"
7 | )
8 |
9 | var _ statemachine.Commander = (*Commander)(nil)
10 |
11 | // Commander is an embeddable statemachine.Commander implementation.
12 | // Task-specific command listeners are created by calling NewListener.
13 | type Commander struct {
14 | listeners map[string]chan *statemachine.Message
15 | }
16 |
17 | // NewCommander creates a new statemachine.Commander implementation.
18 | func NewCommander() *Commander {
19 | return &Commander{listeners: make(map[string]chan *statemachine.Message)}
20 | }
21 |
22 | // NewListener creates a task specific command listener linked to an embedded
23 | // Commander.
24 | func (c *Commander) NewListener(taskID string) statemachine.CommandListener {
25 | // Buffer chan to make sending/recving asynchronous
26 | c.listeners[taskID] = make(chan *statemachine.Message, 1)
27 | return &commandListener{c: c.listeners[taskID]}
28 | }
29 |
30 | func (c *Commander) Send(taskID string, m *statemachine.Message) error {
31 | cl, ok := c.listeners[taskID]
32 | if !ok {
33 | return fmt.Errorf("task=%q not running", taskID)
34 | }
35 | cl <- m
36 | return nil
37 | }
38 |
39 | type commandListener struct {
40 | c <-chan *statemachine.Message
41 | }
42 |
43 | func (cl *commandListener) Receive() <-chan *statemachine.Message { return cl.c }
44 | func (*commandListener) Stop() {}
45 |
--------------------------------------------------------------------------------
/embedded/commander_test.go:
--------------------------------------------------------------------------------
1 | package embedded_test
2 |
3 | import (
4 | "testing"
5 | "time"
6 |
7 | "github.com/lytics/metafora/embedded"
8 | "github.com/lytics/metafora/statemachine"
9 | )
10 |
11 | func TestEmbeddedCommander(t *testing.T) {
12 | t.Parallel()
13 | cmdr := embedded.NewCommander()
14 | cl1 := cmdr.NewListener("task1")
15 | cl2 := cmdr.NewListener("task2")
16 |
17 | if err := cmdr.Send("task1", statemachine.RunMessage()); err != nil {
18 | t.Fatalf("Error sending message to task1: %v", err)
19 | }
20 | if err := cmdr.Send("task2", statemachine.ReleaseMessage()); err != nil {
21 | t.Fatalf("Error sending message to task2: %v", err)
22 | }
23 | if err := cmdr.Send("invalid-task", statemachine.PauseMessage()); err == nil {
24 | t.Fatal("Expected an error when sending to an invalid task, but didn't receive one.")
25 | }
26 |
27 | msg2 := <-cl2.Receive()
28 | if msg2.Code != statemachine.Release {
29 | t.Fatalf("listener2 expected a Run message but received: %#v", msg2)
30 | }
31 | msg1 := <-cl1.Receive()
32 | if msg1.Code != statemachine.Run {
33 | t.Fatalf("listener1 expected a Run message but received: %#v", msg1)
34 | }
35 |
36 | // Stop listeners and make sure nothing works (but doesn't panic)
37 | cl1.Stop()
38 | cl2.Stop()
39 |
40 | select {
41 | case <-cl1.Receive():
42 | t.Fatal("expected listener1 to be close but it still received a message!")
43 | case <-cl2.Receive():
44 | t.Fatal("expected listener2 to be close but it still received a message!")
45 | case <-time.After(50 * time.Millisecond):
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/embedded/coordinator.go:
--------------------------------------------------------------------------------
1 | package embedded
2 |
3 | import (
4 | "errors"
5 |
6 | "github.com/lytics/metafora"
7 | )
8 |
9 | func NewEmbeddedCoordinator(nodeid string, taskchan chan metafora.Task, cmdchan chan *NodeCommand, nodechan chan []string) metafora.Coordinator {
10 | e := &EmbeddedCoordinator{inchan: taskchan, cmdchan: cmdchan, stopchan: make(chan struct{}), nodechan: nodechan}
11 | // HACK - need to respond to node requests, assuming a single coordinator/client pair
12 | go func() {
13 | for {
14 | select {
15 | case e.nodechan <- []string{e.nodeid}:
16 | case <-e.stopchan:
17 | return
18 | }
19 | }
20 | }()
21 |
22 | return e
23 | }
24 |
25 | // Coordinator which listens for tasks on a channel
26 | type EmbeddedCoordinator struct {
27 | nodeid string
28 | ctx metafora.CoordinatorContext
29 | inchan chan metafora.Task
30 | cmdchan chan *NodeCommand
31 | nodechan chan<- []string
32 | stopchan chan struct{}
33 | }
34 |
35 | func (e *EmbeddedCoordinator) Init(c metafora.CoordinatorContext) error {
36 | e.ctx = c
37 | return nil
38 | }
39 |
40 | func (e *EmbeddedCoordinator) Watch(out chan<- metafora.Task) error {
41 | for {
42 | // wait for incoming tasks
43 | select {
44 | case id, ok := <-e.inchan:
45 | if !ok {
46 | return errors.New("Input closed")
47 | }
48 | select {
49 | case out <- id:
50 | case <-e.stopchan:
51 | return nil
52 | }
53 | case <-e.stopchan:
54 | return nil
55 | }
56 | }
57 | }
58 |
59 | func (e *EmbeddedCoordinator) Claim(task metafora.Task) bool {
60 | // We received on a channel, so we are the only ones to pull that value
61 | return true
62 | }
63 |
64 | func (e *EmbeddedCoordinator) Release(task metafora.Task) {
65 | // Releasing should be async to avoid deadlocks (and better reflect the
66 | // behavior of "real" coordinators)
67 | go func() {
68 | select {
69 | case e.inchan <- task:
70 | case <-e.stopchan:
71 | }
72 | }()
73 | }
74 |
75 | func (e *EmbeddedCoordinator) Done(task metafora.Task) {}
76 |
77 | func (e *EmbeddedCoordinator) Command() (metafora.Command, error) {
78 | select {
79 | case cmd, ok := <-e.cmdchan:
80 | if !ok {
81 | return nil, errors.New("Cmd channel closed")
82 | }
83 | return cmd.Cmd, nil
84 | case <-e.stopchan:
85 | return nil, nil
86 | }
87 | }
88 |
89 | func (e *EmbeddedCoordinator) Close() {
90 | close(e.stopchan)
91 | }
92 |
93 | func (e *EmbeddedCoordinator) Name() string {
94 | return "embedded"
95 | }
96 |
--------------------------------------------------------------------------------
/embedded/embedded_test.go:
--------------------------------------------------------------------------------
1 | package embedded
2 |
3 | import (
4 | "log"
5 | "os"
6 | "sync"
7 | "testing"
8 | "time"
9 |
10 | "github.com/lytics/metafora"
11 | )
12 |
13 | func init() {
14 | metafora.SetLogger(log.New(os.Stderr, "", log.Lmicroseconds|log.Lshortfile))
15 | }
16 |
17 | func TestEmbedded(t *testing.T) {
18 |
19 | tc := newTestCounter()
20 | adds := make(chan string, 4)
21 |
22 | thfunc := metafora.SimpleHandler(func(task metafora.Task, _ <-chan bool) bool {
23 | tc.Add(task.ID())
24 | adds <- task.ID()
25 | return true
26 | })
27 |
28 | coord, client := NewEmbeddedPair("testnode")
29 | runner, _ := metafora.NewConsumer(coord, thfunc, metafora.DumbBalancer)
30 |
31 | go runner.Run()
32 |
33 | for _, taskid := range []string{"one", "two", "three", "four"} {
34 | err := client.SubmitTask(metafora.NewTask(taskid))
35 | if err != nil {
36 | t.Fatalf("Expected no error, got %v", err)
37 | }
38 | }
39 |
40 | deadline := time.Now().Add(500 * time.Millisecond)
41 | for time.Now().Before(deadline) {
42 | if len(adds) == 4 {
43 | break
44 | }
45 | time.Sleep(10 * time.Millisecond)
46 | }
47 | if len(adds) != 4 {
48 | t.Errorf("Handlers didn't run in expected amount of time")
49 | }
50 | runner.Shutdown()
51 |
52 | runs := tc.Runs()
53 | if len(runs) != 4 {
54 | t.Fatalf("Expected 4 runs, got %d", len(runs))
55 | }
56 |
57 | }
58 |
59 | func TestEmbeddedShutdown(t *testing.T) {
60 | const n = 4
61 | runs := make(chan int, n)
62 | stops := make(chan int, n)
63 | thfunc := metafora.SimpleHandler(func(_ metafora.Task, s <-chan bool) bool {
64 | runs <- 1
65 | select {
66 | case <-s:
67 | stops <- 1
68 | return false
69 | case <-time.After(time.Second * 3):
70 | return true
71 | }
72 | })
73 |
74 | coord, client := NewEmbeddedPair("testnode")
75 | runner, _ := metafora.NewConsumer(coord, thfunc, metafora.DumbBalancer)
76 |
77 | go runner.Run()
78 |
79 | // len(tasks) must == n
80 | tasks := []string{"one", "two", "three", "four"}
81 |
82 | // submit tasks
83 | for _, taskid := range tasks {
84 | err := client.SubmitTask(metafora.NewTask(taskid))
85 | if err != nil {
86 | t.Fatalf("Expected no error, got %v", err)
87 | }
88 | }
89 |
90 | // make sure all 4 start
91 | for i := 0; i < n; i++ {
92 | <-runs
93 | }
94 |
95 | // tell them to stop
96 | runner.Shutdown()
97 |
98 | // make sure all 4 stop
99 | for i := 0; i < n; i++ {
100 | <-stops
101 | }
102 | }
103 |
104 | func newTestCounter() *testcounter {
105 | return &testcounter{runs: []string{}}
106 | }
107 |
108 | type testcounter struct {
109 | runs []string
110 | cmut sync.Mutex
111 | }
112 |
113 | func (t *testcounter) Add(r string) {
114 | t.cmut.Lock()
115 | defer t.cmut.Unlock()
116 | t.runs = append(t.runs, r)
117 | }
118 |
119 | func (t *testcounter) Runs() []string {
120 | t.cmut.Lock()
121 | defer t.cmut.Unlock()
122 | return t.runs
123 | }
124 |
--------------------------------------------------------------------------------
/embedded/statestore.go:
--------------------------------------------------------------------------------
1 | package embedded
2 |
3 | import (
4 | "sync"
5 |
6 | "github.com/lytics/metafora"
7 | "github.com/lytics/metafora/statemachine"
8 | )
9 |
10 | type StateChanged struct {
11 | TaskID string
12 | State *statemachine.State
13 | }
14 |
15 | // StateStore is an in-memory implementation of statemachine.StateStore
16 | // intended for use in tests.
17 | type StateStore struct {
18 | mu *sync.RWMutex
19 | store map[string]*statemachine.State
20 |
21 | // Stored is intended for tests to block until a Store() is called as an
22 | // alternative to time.Sleep()s.
23 | //
24 | // Will deliver asynchronously and drop states if there are no receivers.
25 | Stored chan StateChanged
26 | }
27 |
28 | func NewStateStore() statemachine.StateStore {
29 | return &StateStore{
30 | mu: &sync.RWMutex{},
31 | store: map[string]*statemachine.State{},
32 | Stored: make(chan StateChanged, 1),
33 | }
34 | }
35 |
36 | func (s *StateStore) Load(task metafora.Task) (*statemachine.State, error) {
37 | s.mu.RLock()
38 | defer s.mu.RUnlock()
39 | state, ok := s.store[task.ID()]
40 | if !ok {
41 | return &statemachine.State{Code: statemachine.Runnable}, nil
42 | }
43 | return state, nil
44 | }
45 |
46 | func (s *StateStore) Store(task metafora.Task, state *statemachine.State) error {
47 | s.mu.Lock()
48 | s.store[task.ID()] = state
49 | s.mu.Unlock()
50 | stored := StateChanged{TaskID: task.ID(), State: state}
51 | select {
52 | case s.Stored <- stored:
53 | default:
54 | }
55 | return nil
56 | }
57 |
--------------------------------------------------------------------------------
/embedded/util.go:
--------------------------------------------------------------------------------
1 | package embedded
2 |
3 | import "github.com/lytics/metafora"
4 |
5 | type NodeCommand struct {
6 | Cmd metafora.Command
7 | NodeId string
8 | }
9 |
10 | // NewEmbeddedPair returns a connected coordinator/client pair for embedded/testing use
11 | func NewEmbeddedPair(nodeid string) (metafora.Coordinator, metafora.Client) {
12 | taskchan := make(chan metafora.Task)
13 | cmdchan := make(chan *NodeCommand)
14 | nodechan := make(chan []string, 1)
15 |
16 | coord := NewEmbeddedCoordinator(nodeid, taskchan, cmdchan, nodechan)
17 | client := NewEmbeddedClient(taskchan, cmdchan, nodechan)
18 |
19 | return coord, client
20 | }
21 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/lytics/metafora
2 |
3 | go 1.13
4 |
5 | require (
6 | github.com/araddon/gou v0.0.0-20190110011759-c797efecbb61
7 | github.com/kr/pretty v0.2.1 // indirect
8 | github.com/kr/text v0.2.0 // indirect
9 | github.com/stretchr/testify v1.7.0
10 | go.etcd.io/etcd/client/v3 v3.5.7
11 | )
12 |
--------------------------------------------------------------------------------
/handler.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | // Handler is the core task handling interface. The Consumer will create a new
4 | // Handler for each claimed task, call Run once and only once, and call Stop
5 | // when the task should persist its progress and exit.
6 | type Handler interface {
7 | // Run handles a task and blocks until completion or Stop is called.
8 | //
9 | // If Run returns true, Metafora will mark the task as Done via the
10 | // Coordinator. The task will not be rescheduled.
11 | //
12 | // If Run returns false, Metafora will Release the task via the Coordinator.
13 | // The task will be scheduled to run again.
14 | //
15 | // Panics are treated the same as returning true.
16 | Run() (done bool)
17 |
18 | // Stop signals the handler to shut down gracefully. Stop implementations
19 | // should not block until Run exits.
20 | //
21 | // Stop may be called more than once, but calls are serialized. Implementations
22 | // may perform different operations on subsequent calls to Stop to implement
23 | // graceful vs. forced shutdown conditions.
24 | //
25 | // Run probably wants to return false when Stop is called, but this is left
26 | // up to the implementation as races between Run finishing and Stop being
27 | // called can happen.
28 | Stop()
29 | }
30 |
31 | // HandlerFunc is called by the Consumer to create a new Handler for each task.
32 | //
33 | // HandlerFunc is meant to be the New function for handlers. Since Run and Stop
34 | // are called concurrently, any state used by both should be initialized in the
35 | // HandlerFunc. Since HandlerFunc is uninterruptible, only the minimum amount
36 | // of work necessary to initialize a handler should be done.
37 | type HandlerFunc func(Task) Handler
38 |
39 | // SimpleHandler creates a HandlerFunc for a simple function that accepts a stop
40 | // channel. The channel will be closed when Stop is called.
41 | func SimpleHandler(f func(t Task, stop <-chan bool) bool) HandlerFunc {
42 | return func(t Task) Handler {
43 | return &simpleHandler{
44 | task: t,
45 | stop: make(chan bool),
46 | f: f,
47 | }
48 | }
49 | }
50 |
51 | type simpleHandler struct {
52 | task Task
53 | stop chan bool
54 | f func(Task, <-chan bool) bool
55 | }
56 |
57 | func (h *simpleHandler) Run() bool {
58 | return h.f(h.task, h.stop)
59 | }
60 |
61 | func (h *simpleHandler) Stop() {
62 | select {
63 | case <-h.stop:
64 | default:
65 | close(h.stop)
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/httputil/httputil.go:
--------------------------------------------------------------------------------
1 | package httputil
2 |
3 | import (
4 | "encoding/json"
5 | "net/http"
6 | "time"
7 |
8 | "github.com/lytics/metafora"
9 | "github.com/lytics/metafora/statemachine"
10 | )
11 |
12 | // Consumer is the subset of metafora.Consumer methods used by the HTTP
13 | // introspection endpoints.
14 | type Consumer interface {
15 | Frozen() bool
16 | Tasks() []metafora.RunningTask
17 | String() string
18 | }
19 |
20 | type stateMachine interface {
21 | State() (*statemachine.State, time.Time)
22 | }
23 |
24 | type Task struct {
25 | ID string `json:"id"`
26 | Started time.Time `json:"started"`
27 | Stopped *time.Time `json:"stopped,omitempty"`
28 | State string `json:"state,omitempty"`
29 | Modified *time.Time `json:"modified,omitempty"`
30 | Task metafora.Task `json:"task"`
31 | }
32 |
33 | // InfoResponse is the JSON response marshalled by the MakeInfoHandler.
34 | type InfoResponse struct {
35 | Frozen bool `json:"frozen"`
36 | Name string `json:"name"`
37 | Started time.Time `json:"started"`
38 | Tasks []Task `json:"tasks"`
39 | }
40 |
41 | // MakeInfoHandler returns an HTTP handler which can be added to an exposed
42 | // HTTP server mux by Metafora applications to provide operators with basic
43 | // node introspection.
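//
// A minimal wiring sketch (the route and the consumer value are illustrative,
// not provided by this package):
//
//	mux := http.NewServeMux()
//	mux.Handle("/metafora", MakeInfoHandler(consumer, time.Now()))
//	// ...serve mux with http.ListenAndServe or an http.Server as usual...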
44 | func MakeInfoHandler(c Consumer, started time.Time) http.HandlerFunc {
45 | return func(w http.ResponseWriter, _ *http.Request) {
46 | tasks := c.Tasks()
47 | resp := InfoResponse{
48 | Frozen: c.Frozen(),
49 | Name: c.String(),
50 | Started: started,
51 | Tasks: make([]Task, len(tasks)),
52 | }
53 | for i, task := range tasks {
54 | resp.Tasks[i] = Task{
55 | ID: task.Task().ID(),
56 | Started: task.Started(),
57 | Task: task.Task(),
58 | }
59 |
60 | // Set stopped if it's non-zero
61 | stopped := task.Stopped()
62 | if !stopped.IsZero() {
63 | resp.Tasks[i].Stopped = &stopped
64 | }
65 |
66 | // Expose state if it exists
67 | if sh, ok := task.Handler().(stateMachine); ok {
68 | s, ts := sh.State()
69 | resp.Tasks[i].State = s.String()
70 | resp.Tasks[i].Modified = &ts
71 | }
72 | }
73 | w.Header().Set("Content-Type", "application/json")
74 | _ = json.NewEncoder(w).Encode(&resp)
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/httputil/httputil_test.go:
--------------------------------------------------------------------------------
1 | package httputil_test
2 |
3 | import (
4 | "encoding/json"
5 | "net/http/httptest"
6 | "testing"
7 | "time"
8 |
9 | "github.com/lytics/metafora"
10 | . "github.com/lytics/metafora/httputil"
11 | )
12 |
13 | type tc struct {
14 | stop chan bool
15 | }
16 |
17 | func (*tc) Init(metafora.CoordinatorContext) error { return nil }
18 | func (c *tc) Watch(chan<- metafora.Task) error {
19 | <-c.stop
20 | return nil
21 | }
22 | func (c *tc) Claim(metafora.Task) bool { return false }
23 | func (c *tc) Release(metafora.Task) {}
24 | func (c *tc) Done(metafora.Task) {}
25 | func (c *tc) Command() (metafora.Command, error) {
26 | <-c.stop
27 | return nil, nil
28 | }
29 | func (c *tc) Close() { close(c.stop) }
30 | func (c *tc) Name() string { return "tc" }
31 |
32 | func TestMakeInfoHandler(t *testing.T) {
33 | t.Parallel()
34 |
35 | c, _ := metafora.NewConsumer(&tc{stop: make(chan bool)}, nil, metafora.DumbBalancer)
36 | defer c.Shutdown()
37 | now := time.Now().Truncate(time.Second)
38 |
39 | resp := httptest.NewRecorder()
40 | MakeInfoHandler(c, now)(resp, nil)
41 |
42 | info := InfoResponse{}
43 | if err := json.Unmarshal(resp.Body.Bytes(), &info); err != nil {
44 | t.Fatalf("Error unmarshalling response body: %v", err)
45 | }
46 | if info.Frozen {
47 | t.Errorf("Consumer should not start frozen.")
48 | }
49 | if !info.Started.Equal(now) {
50 | t.Errorf("Started time %s != %s", info.Started, now)
51 | }
52 | if info.Name != "tc" {
53 | t.Errorf("Node name %q != tc", info.Name)
54 | }
55 | if len(info.Tasks) != 0 {
56 | t.Errorf("Unexpected tasks: %v", info.Tasks)
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/ignore.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | import (
4 | "container/heap"
5 | "sync"
6 | "time"
7 | )
8 |
9 | // ignoremgr handles ignoring tasks and sending them back to the consumer once
10 | // their ignore deadline is reached.
11 | type ignoremgr struct {
12 | incoming chan *timetask
13 | stop <-chan struct{}
14 |
15 | mu *sync.RWMutex
16 | ignores map[string]struct{}
17 | }
18 |
19 | func ignorer(tasks chan<- Task, stop <-chan struct{}) *ignoremgr {
20 | im := &ignoremgr{
21 | incoming: make(chan *timetask),
22 | stop: stop,
23 | mu: &sync.RWMutex{},
24 | ignores: make(map[string]struct{}),
25 | }
26 | go im.monitor(tasks, stop)
27 | return im
28 | }
29 |
30 | func (im *ignoremgr) add(task Task, until time.Time) {
31 | // short circuit zero times; queue everything else
32 | if until.IsZero() {
33 | return
34 | }
35 |
36 | // Add to ignore map
37 | im.mu.Lock()
38 | im.ignores[task.ID()] = struct{}{}
39 | im.mu.Unlock()
40 |
41 | // Send to monitor for pushing onto time heap
42 | select {
43 | case im.incoming <- &timetask{time: until, task: task}:
44 | case <-im.stop:
45 | // Don't bother adding ignore if we're just exiting
46 | }
47 | }
48 |
49 | func (im *ignoremgr) ignored(taskID string) (ignored bool) {
50 | im.mu.RLock()
51 | _, ok := im.ignores[taskID]
52 | im.mu.RUnlock()
53 |
54 | return ok
55 | }
56 |
57 | func (im *ignoremgr) monitor(tasks chan<- Task, stop <-chan struct{}) {
58 | times := timeheap{}
59 | heap.Init(&times)
60 | var next *timetask
61 | for {
62 | if times.Len() > 0 {
63 | // Get next ignore from the ignore heap
64 | next = heap.Pop(&times).(*timetask)
65 | } else {
66 | // No ignores! Wait for one to come in or an exit signal
67 | select {
68 | case <-stop:
69 | return
70 | case newtask := <-im.incoming:
71 | next = newtask
72 | }
73 | }
74 |
75 | // this duration *may* be negative, in which case the
76 | // task will be pushed immediately
77 | timer := time.NewTimer(time.Until(next.time))
78 |
79 | select {
80 | case newtask := <-im.incoming:
81 | // Push onto next task and new task onto time heap
82 | heap.Push(&times, newtask)
83 | heap.Push(&times, next)
84 |
85 | // Stop the existing timer for this loop iteration
86 | timer.Stop()
87 | case <-timer.C:
88 | // Ignore expired, remove the entry
89 | im.mu.Lock()
90 | delete(im.ignores, next.task.ID())
91 | im.mu.Unlock()
92 |
93 | // Notify the consumer
94 | select {
95 | case tasks <- next.task:
96 | case <-stop:
97 | return
98 | }
99 | case <-stop:
100 | return
101 | }
102 | }
103 | }
104 |
105 | func (im *ignoremgr) all() []string {
106 | im.mu.RLock()
107 | defer im.mu.RUnlock()
108 | ignores := make([]string, len(im.ignores))
109 | i := 0
110 | for k := range im.ignores {
111 | ignores[i] = k
112 | i++
113 | }
114 | return ignores
115 | }
116 |
117 | type timetask struct {
118 | time time.Time
119 | task Task
120 | }
121 |
122 | // timeheap is a min-heap of time/task tuples sorted by time.
123 | type timeheap []*timetask
124 |
125 | func (h timeheap) Len() int { return len(h) }
126 | func (h timeheap) Less(i, j int) bool { return h[i].time.Before(h[j].time) }
127 | func (h timeheap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
128 |
129 | func (h *timeheap) Push(x interface{}) {
130 | // Push and Pop use pointer receivers because they modify the slice's length,
131 | // not just its contents.
132 | *h = append(*h, x.(*timetask))
133 | }
134 |
135 | func (h *timeheap) Pop() interface{} {
136 | old := *h
137 | n := len(old)
138 | x := old[n-1]
139 | *h = old[0 : n-1]
140 | return x
141 | }
142 |
--------------------------------------------------------------------------------
/ignore_test.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | import (
4 | "testing"
5 | "time"
6 | )
7 |
8 | func TestIgnore(t *testing.T) {
9 | t.Parallel()
10 | out := make(chan Task)
11 | stop := make(chan struct{})
12 | defer close(stop)
13 |
14 | // Create ignorer
15 | im := ignorer(out, stop)
16 |
17 | // Ignore task for 200ms. Yes this is racy. Might need to bump deadline.
18 | deadline1 := time.Now().Add(200 * time.Millisecond)
19 | im.add(testTask{"1"}, deadline1)
20 |
21 | // Ensure it's ignored
22 | if !im.ignored("1") {
23 | t.Fatal("test task should have been ignored but wasn't")
24 | }
25 |
26 | // Ignore task for 10ms to make sure tasks are returned in order (they aren't
27 | // *guaranteed* to be in order since adds and evictions are concurrent)
28 | deadline2 := time.Now().Add(10 * time.Millisecond)
29 | im.add(testTask{"2"}, deadline2)
30 |
31 | // Wait for the first eviction
32 | eviction := <-out
33 | if eviction.ID() != "2" {
34 | t.Fatal("Expected 2 to be evicted before 1")
35 | }
36 | now := time.Now()
37 | if now.Before(deadline2) {
38 | t.Fatalf("First eviction happened too soon: %s < %s", now, deadline2)
39 | }
40 |
41 | eviction = <-out
42 | if eviction.ID() != "1" {
43 | t.Fatal("Expected 1 to be evicted second, found ", eviction)
44 | }
45 | now = time.Now()
46 | if now.Before(deadline1) {
47 | t.Fatalf("First eviction happened too soon: %s < %s", now, deadline1)
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/logger.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | import (
4 | "github.com/araddon/gou"
5 | )
6 |
7 | var LogLevel int = gou.LogLevel
8 |
9 | type LogOutputter interface {
10 | Output(calldepth int, s string) error
11 | }
12 |
13 | // SetLogger is a no-op; Metafora logs via the gou package (see the function variables below).
14 | func SetLogger(l LogOutputter) {
15 | }
16 |
17 | var Debug func(v ...interface{}) = gou.Debug
18 | var Debugf func(format string, v ...interface{}) = gou.Debugf
19 | var Info func(v ...interface{}) = gou.Info
20 | var Infof func(format string, v ...interface{}) = gou.Infof
21 | var Warn func(v ...interface{}) = gou.Warn
22 | var Warnf func(format string, v ...interface{}) = gou.Warnf
23 | var Error func(v ...interface{}) = gou.Error
24 | var Errorf func(format string, v ...interface{}) = gou.Errorf
25 |
--------------------------------------------------------------------------------
/metafora.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | import (
4 | "fmt"
5 | "math/rand"
6 | "runtime"
7 | "sort"
8 | "sync"
9 | "time"
10 | )
11 |
12 | var (
13 | // balance calls are randomized and this is the upper bound of the random
14 | // amount
15 | balanceJitterMax = 10 * int64(time.Second)
16 | )
17 |
18 | // Consumer is the core Metafora task runner.
19 | type Consumer struct {
20 | // Func to create new handlers
21 | handler HandlerFunc
22 |
23 | // Map of task:Handler
24 | running map[string]*runtask
25 |
26 | // Mutex to protect access to running
27 | runL sync.Mutex
28 |
29 | // WaitGroup for running handlers and consumer goroutines
30 | hwg sync.WaitGroup
31 |
32 | // WaitGroup so Shutdown() can block on Run() exiting fully
33 | runwg sync.WaitGroup
34 | runwgL sync.Mutex
35 |
36 | bal Balancer
37 | balEvery time.Duration
38 | coord Coordinator
39 | im *ignoremgr
40 | stop chan struct{} // closed by Shutdown to cause Run to exit
41 | tasks chan Task // channel for watcher to send tasks to main loop
42 |
43 | // Set by command handler, read anywhere via Consumer.frozen()
44 | freezeL sync.Mutex
45 | freeze bool
46 | }
47 |
48 | var BalanceEvery = 15 * time.Minute //TODO make balance wait configurable
49 |
50 | // NewConsumer returns a new consumer and calls Init on the Balancer and Coordinator.
51 | func NewConsumer(coord Coordinator, h HandlerFunc, b Balancer) (*Consumer, error) {
52 | c := &Consumer{
53 | running: make(map[string]*runtask),
54 | handler: h,
55 | bal: b,
56 | balEvery: BalanceEvery,
57 | coord: coord,
58 | stop: make(chan struct{}),
59 | tasks: make(chan Task),
60 | }
61 | c.im = ignorer(c.tasks, c.stop)
62 |
63 | // initialize balancer with the consumer and a prefixed logger
64 | b.Init(c)
65 |
66 | if err := coord.Init(&coordinatorContext{c}); err != nil {
67 | return nil, err
68 | }
69 | return c, nil
70 | }
71 |
72 | // Run is the core run loop of Metafora. It is responsible for calling into the
73 | // Coordinator to claim work and Balancer to rebalance work.
74 | //
75 | // Run blocks until Shutdown is called or an internal error occurs.
76 | func (c *Consumer) Run() {
77 | Debug(c, " Starting consumer")
78 |
79 | // Increment run wait group so Shutdown() can block on Run() exiting fully.
80 | c.runwgL.Lock()
81 | c.runwg.Add(1)
82 | c.runwgL.Unlock()
83 | defer c.runwg.Done()
84 |
85 | // chans for core goroutines to communicate with main loop
86 | balance := make(chan bool)
87 | cmdChan := make(chan Command)
88 |
89 | // Balance is called by the main loop when the balance channel is ticked
90 | go func() {
91 | randInt := rand.New(rand.NewSource(time.Now().UnixNano())).Int63n
92 | for {
93 | select {
94 | case <-c.stop:
95 | // Shutdown has been called.
96 | return
97 | case <-time.After(c.balEvery + time.Duration(randInt(balanceJitterMax))):
98 | select {
99 | case balance <- true:
100 | // Ticked balance
101 | case <-c.stop:
102 | // Shutdown has been called.
103 | return
104 | }
105 | }
106 | }
107 | }()
108 |
109 | // Watch for new tasks in a goroutine
110 | go c.watcher()
111 |
112 | // Watch for new commands in a goroutine
113 | go func() {
114 | defer close(cmdChan)
115 | for {
116 | cmd, err := c.coord.Command()
117 | if err != nil {
118 | panic(fmt.Errorf("coordinator returned an error during command: %v", err))
119 | }
120 | if cmd == nil {
121 | Debug(c, " Command coordinator exited")
122 | return
123 | }
124 | // Send command to watcher (or shutdown)
125 | select {
126 | case <-c.stop:
127 | return
128 | case cmdChan <- cmd:
129 | }
130 | }
131 | }()
132 |
133 | // Make sure Run() cleans up on exit (stops coordinator, releases tasks, etc)
134 | defer c.shutdown()
135 |
136 | // Main Loop ensures events are processed synchronously
137 | for {
138 | if c.Frozen() {
139 | // Only recv commands while frozen
140 | select {
141 | case <-c.stop:
142 | // Shutdown has been called.
143 | return
144 | case cmd, ok := <-cmdChan:
145 | if !ok {
146 | Debug(c, " Command channel closed. Exiting main loop.")
147 | return
148 | }
149 | Debugf("%s Received command: %s", c, cmd)
150 | c.handleCommand(cmd)
151 | }
152 | continue
153 | }
154 |
155 | select {
156 | case <-c.stop:
157 | // Shutdown has been called.
158 | return
159 | case <-balance:
160 | c.balance()
161 | case task := <-c.tasks:
162 | tid := task.ID()
163 | if c.ignored(tid) {
164 | Debugf("%s task=%q ignored", c, tid)
165 | continue
166 | }
167 | if until, ok := c.bal.CanClaim(task); !ok {
168 | Infof("%s Balancer rejected task=%q until %s", c, tid, until)
169 | c.ignore(task, until)
170 | break
171 | }
172 | if !c.coord.Claim(task) {
173 | Debugf("%s Coordinator unable to claim task=%q", c, tid)
174 | break
175 | }
176 | c.claimed(task)
177 | case cmd, ok := <-cmdChan:
178 | if !ok {
179 | Debug(c, " Command channel closed. Exiting main loop.")
180 | return
181 | }
182 | c.handleCommand(cmd)
183 | }
184 | }
185 | }
186 |
187 | func (c *Consumer) watcher() {
188 | // The watcher dying unexpectedly should close the consumer to cause a
189 | // shutdown.
190 | defer c.close()
191 |
192 | err := c.coord.Watch(c.tasks)
193 | if err != nil {
194 | panic(fmt.Errorf("coordinator returned an error during watch: %v", err))
195 | }
196 | }
197 |
198 | func (c *Consumer) balance() {
199 | tasks := c.bal.Balance()
200 | if len(tasks) > 0 {
201 | Infof("%s balancer releasing %d tasks: %v", c, len(tasks), tasks)
202 | }
203 | for _, task := range tasks {
204 | // Actually release the rebalanced task.
205 | c.stopTask(task)
206 | }
207 | }
208 |
209 | // close the c.stop channel which signals for the consumer to shutdown.
210 | func (c *Consumer) close() {
211 | // acquire the runL lock to make sure we don't race with claimed()'s <-c.stop
212 | // check
213 | c.runL.Lock()
214 | defer c.runL.Unlock()
215 | select {
216 | case <-c.stop:
217 | // already stopped
218 | default:
219 | Debug("Stopping Run loop")
220 | close(c.stop)
221 | }
222 | }
223 |
224 | // shutdown is the actual shutdown logic called when Run() exits.
225 | func (c *Consumer) shutdown() {
226 | c.close()
227 |
228 | // Build a list of currently running tasks
229 | runningtasks := c.Tasks()
230 | Infof("Sending stop signal to %d handler(s)", len(runningtasks))
231 |
232 | for _, rt := range runningtasks {
233 | c.stopTask(rt.Task().ID())
234 | }
235 |
236 | Info(c, " Waiting for handlers to exit")
237 | c.hwg.Wait()
238 |
239 | Debug("Closing Coordinator ", c)
240 | c.coord.Close()
241 | }
242 |
243 | // Shutdown stops the main Run loop, calls Stop on all handlers, and calls
244 | // Close on the Coordinator. Running tasks will be released for other nodes to
245 | // claim.
246 | func (c *Consumer) Shutdown() {
247 | c.close()
248 |
249 | // Wait for task handlers to exit.
250 | c.hwg.Wait()
251 |
252 | // Make sure Run() exits, otherwise Shutdown() might exit before
253 | // coord.Close() is called.
254 | c.runwgL.Lock()
255 | c.runwg.Wait()
256 | c.runwgL.Unlock()
257 | }
258 |
259 | // Tasks returns the running tasks sorted lexicographically by task ID.
260 | func (c *Consumer) Tasks() []RunningTask {
261 | c.runL.Lock()
262 | defer c.runL.Unlock()
263 |
264 | // Create a sorted list of task IDs
265 | ids := make([]string, len(c.running))
266 | i := 0
267 | for id := range c.running {
268 | ids[i] = id
269 | i++
270 | }
271 | sort.Strings(ids)
272 |
273 | // Add tasks in lexicographic order
274 | t := make([]RunningTask, len(ids))
275 | for i, id := range ids {
276 | t[i] = c.running[id]
277 | }
278 | return t
279 | }
280 |
281 | // claimed starts a handler for a claimed task. It is the only method that
282 | // adds tasks to c.running, and the goroutine it starts removes the task from
283 | // c.running when the handler's Run method exits.
284 | func (c *Consumer) claimed(task Task) {
285 | h := c.handler(task)
286 |
287 | tid := task.ID()
288 | Debugf("%s is attempting to start task=%q", c, tid)
289 | // Associate handler with taskID
290 | // **This is the only place tasks should be added to c.running**
291 | c.runL.Lock()
292 | defer c.runL.Unlock()
293 | select {
294 | case <-c.stop:
295 | // We're closing, don't bother starting this task
296 | c.coord.Release(task)
297 | return
298 | default:
299 | }
300 | if _, ok := c.running[tid]; ok {
301 | // If a coordinator returns an already claimed task from Watch(), then it's
302 | // a coordinator (or broker) bug.
303 | Warnf("%s Attempted to claim already running task %s", c, tid)
304 | return
305 | }
306 | rt := newTask(task, h)
307 | c.running[tid] = rt
308 |
309 | // This must be done in the runL lock after the stop chan check so Shutdown
310 | // doesn't close(stop) and start Wait()ing concurrently.
311 | // See "Note" http://golang.org/pkg/sync/#WaitGroup.Add
312 | c.hwg.Add(1)
313 |
314 | // Start handler in its own goroutine
315 | go func() {
316 | defer c.hwg.Done() // Must be run after task exit and Done/Release called
317 |
318 | // Run the task
319 | Infof("%s Task %q started", c, tid)
320 | done := c.runTask(h.Run, tid)
321 | var status string
322 | if done {
323 | status = "done"
324 | c.coord.Done(task)
325 | } else {
326 | status = "released"
327 | c.coord.Release(task)
328 | }
329 |
330 | stopped := rt.Stopped()
331 | if stopped.IsZero() {
332 | // Task exited on its own
333 | Infof("%s Task %q exited (%s)", c, tid, status)
334 | } else {
335 | // Task exited due to Stop() being called
336 | Infof("%s Task %q exited (%s) after %s", c, tid, status, time.Since(stopped))
337 | }
338 |
339 | // **This is the only place tasks should be removed from c.running**
340 | c.runL.Lock()
341 | delete(c.running, tid)
342 | c.runL.Unlock()
343 | }()
344 |
345 | // Pause slightly after a successful claim to give starting tasks some
346 | // breathing room and to bias the next claim toward a node that lost this
347 | // one.
348 | time.Sleep(10 * time.Millisecond)
349 | }
350 |
351 | // runTask executes a handler's Run method and recovers from panic()s.
352 | func (c *Consumer) runTask(run func() bool, task string) bool {
353 | done := false
354 | func() {
355 | defer func() {
356 | if err := recover(); err != nil {
357 | stack := make([]byte, 50*1024)
358 | sz := runtime.Stack(stack, false)
359 | Errorf("%s Handler %s panic()'d: %v\n%s", c, task, err, stack[:sz])
360 | // panics are considered fatal errors. Make sure the task isn't
361 | // rescheduled.
362 | done = true
363 | }
364 | }()
365 | done = run()
366 | }()
367 | return done
368 | }
369 |
370 | // stopTask asynchronously calls a task handler's Stop method. stopTask itself
371 | // doesn't block, but calls to a handler's Stop method are serialized with a
372 | // lock.
373 | func (c *Consumer) stopTask(taskID string) {
374 | c.runL.Lock()
375 | task, ok := c.running[taskID]
376 | c.runL.Unlock()
377 |
378 | if !ok {
379 | // This can happen if a task completes during Balance() and is not an error.
380 | Warnf("%s tried to release a non-running task=%q", c, taskID)
381 | return
382 | }
383 |
384 | // all handler methods must be wrapped in a recover to prevent a misbehaving
385 | // handler from crashing the entire consumer
386 | go func() {
387 | defer func() {
388 | if err := recover(); err != nil {
389 | stack := make([]byte, 50*1024)
390 | sz := runtime.Stack(stack, false)
391 | Errorf("%s Handler %s panic()'d on Stop: %v\n%s", c, taskID, err, stack[:sz])
392 | }
393 | }()
394 |
395 | // Serialize calls to Stop as a convenience to handler implementors.
396 | task.stop()
397 | }()
398 | }
399 |
400 | // Frozen returns true if Metafora is no longer watching for new tasks or
401 | // rebalancing.
402 | //
403 | // Metafora will remain frozen until receiving an Unfreeze command or it is
404 | // restarted (frozen state is not persisted).
405 | func (c *Consumer) Frozen() bool {
406 | c.freezeL.Lock()
407 | r := c.freeze
408 | c.freezeL.Unlock()
409 | return r
410 | }
411 |
412 | func (c *Consumer) handleCommand(cmd Command) {
413 | switch cmd.Name() {
414 | case cmdFreeze:
415 | if c.Frozen() {
416 | Info(c, " Ignoring freeze command: already frozen")
417 | return
418 | }
419 | Info(c, " Freezing")
420 | c.freezeL.Lock()
421 | c.freeze = true
422 | c.freezeL.Unlock()
423 | case cmdUnfreeze:
424 | if !c.Frozen() {
425 | Info(c, " Ignoring unfreeze command: not frozen")
426 | return
427 | }
428 | Info(c, " Unfreezing")
429 | c.freezeL.Lock()
430 | c.freeze = false
431 | c.freezeL.Unlock()
432 | case cmdBalance:
433 | Info(c, " Balancing due to command")
434 | c.balance()
435 | Debug(c, " Finished balancing due to command")
436 | case cmdStopTask:
437 | taskI, ok := cmd.Parameters()["task"]
438 | task, ok2 := taskI.(string)
439 | if !ok || !ok2 {
440 | Error(c, " Stop task command didn't contain a valid task")
441 | return
442 | }
443 | Infof("%s Stopping task %s due to command", c, task)
444 | c.stopTask(task)
445 | default:
446 | Warnf("%s Discarding unknown command: %s", c, cmd.Name())
447 | }
448 | }
449 |
450 | func (c *Consumer) ignored(taskID string) bool { return c.im.ignored(taskID) }
451 | func (c *Consumer) ignore(t Task, until time.Time) { c.im.add(t, until) }
452 |
453 | // Ignores is a list of all ignored tasks.
454 | func (c *Consumer) Ignores() []string { return c.im.all() }
455 |
456 | func (c *Consumer) String() string {
457 | return c.coord.Name()
458 | }
459 |
--------------------------------------------------------------------------------
/metafora_test.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | import (
4 | "os"
5 | "testing"
6 | "time"
7 | )
8 |
9 | func init() {
10 | if os.Getenv("VERBOSE_TESTS") != "" {
11 | SetLogger(testlogger{})
12 | }
13 | }
14 |
15 | type testlogger struct{}
16 |
17 | func (testlogger) Output(int, string) error { return nil }
18 |
19 | // Handler/Consumer test
20 |
21 | type testHandler struct {
22 | stop chan int
23 | t *testing.T
24 | task Task
25 | tasksRun chan string
26 | }
27 |
28 | func (h *testHandler) Run() bool {
29 | h.tasksRun <- h.task.ID()
30 | h.t.Logf("Run(%s)", h.task.ID())
31 | <-h.stop
32 | h.t.Logf("Stop received for %s", h.task.ID())
33 | return true
34 | }
35 |
36 | func (h *testHandler) Stop() {
37 | h.t.Logf("Stopping %s", h.task.ID())
38 | close(h.stop)
39 | }
40 |
41 | func newTestHandlerFunc(t *testing.T) (HandlerFunc, chan string) {
42 | tasksRun := make(chan string, 10)
43 | return func(task Task) Handler {
44 | return &testHandler{
45 | task: task,
46 | stop: make(chan int),
47 | t: t,
48 | tasksRun: tasksRun,
49 | }
50 | }, tasksRun
51 | }
52 |
53 | // TestConsumer ensures the consumer's main loop properly handles tasks as
54 | // well as errors and Shutdown.
55 | func TestConsumer(t *testing.T) {
56 | t.Parallel()
57 |
58 | // Setup some tasks to run in a fake coordinator
59 | tc := NewTestCoord()
60 | tc.Tasks <- testTask{"test1"}
61 | tc.Tasks <- testTask{"test2"}
62 |
63 | // Setup a handler func that lets us know what tasks are running
64 | hf, tasksRun := newTestHandlerFunc(t)
65 |
66 | // Create the consumer and run it
67 | c, _ := NewConsumer(tc, hf, DumbBalancer)
68 | s := make(chan int)
69 | go func() {
70 | c.Run()
71 | s <- 1
72 | }()
73 |
74 | for i := 0; i < 2; i++ {
75 | select {
76 | case <-s:
77 | t.Fatalf("Run exited early")
78 | case tr := <-tasksRun:
79 | if tr != "test1" && tr != "test2" {
80 | t.Errorf("Expected `test1` or `test2` but received: %s", tr)
81 | }
82 | t.Logf("Received task=%q", tr)
83 | case <-time.After(100 * time.Millisecond):
84 | t.Errorf("First task didn't execute in a timely fashion")
85 | }
86 | }
87 |
88 | // Ensure Tasks() is accurate
89 | tasks := c.Tasks()
90 | if len(tasks) != 2 {
91 | t.Errorf("Expected 2 tasks to be running but found: %v", tasks)
92 | }
93 |
94 | go func() {
95 | c.Shutdown()
96 | s <- 1
97 | }()
98 | for i := 0; i < 2; i++ {
99 | select {
100 | case <-s:
101 | case <-time.After(100 * time.Millisecond):
102 | t.Errorf("Run and Shutdown didn't finish in a timely fashion")
103 | }
104 | }
105 | }
106 |
107 | // Balancer/Consumer test
108 |
109 | type testBalancer struct {
110 | c BalancerContext
111 | t *testing.T
112 | secondRun bool
113 | done chan struct{}
114 | }
115 |
116 | func (b *testBalancer) Init(c BalancerContext) { b.c = c }
117 | func (b *testBalancer) CanClaim(task Task) (time.Time, bool) {
118 | b.t.Logf("CanClaim(%s) -> %t", task.ID(), task.ID() == "ok-task")
119 | return time.Now().Add(100 * time.Hour), task.ID() == "ok-task"
120 | }
121 |
122 | func (b *testBalancer) Balance() []string {
123 | if b.secondRun {
124 | return nil
125 | }
126 | b.secondRun = true
127 | tsks := b.c.Tasks()
128 | if len(tsks) != 1 {
129 | b.t.Errorf("len(ConsumerState.Tasks()) != 1 ==> %v", tsks)
130 | return nil
131 | }
132 | if tsks[0].Task().ID() != "ok-task" {
133 | b.t.Errorf("Wrong task in ConsumerState.Tasks(): %v", tsks)
134 | }
135 | close(b.done)
136 | return nil
137 | }
138 |
139 | func TestBalancer(t *testing.T) {
140 | t.Parallel()
141 | if testing.Short() {
142 | t.Skip("skipping due to -short")
143 | }
144 |
145 | hf, tasksRun := newTestHandlerFunc(t)
146 | tc := NewTestCoord()
147 | balDone := make(chan struct{})
148 | c, _ := NewConsumer(tc, hf, &testBalancer{t: t, done: balDone})
149 | c.balEvery = 0
150 | go c.Run()
151 | tc.Tasks <- testTask{"test1"}
152 | tc.Tasks <- testTask{"ok-task"}
153 | tc.Tasks <- testTask{"test2"}
154 |
155 | // Wait for balance
156 | select {
157 | case <-balDone:
158 | case <-time.After(time.Duration(balanceJitterMax) + 10*time.Millisecond):
159 | t.Error("Didn't balance in a timely fashion")
160 | }
161 |
162 | select {
163 | case run := <-tasksRun:
164 | if run != "ok-task" {
165 | t.Errorf("Balancer didn't reject tasks properly. Ran task %s", run)
166 | }
167 | case <-time.After(100 * time.Millisecond):
168 | t.Error("Task didn't run in a timely fashion")
169 | }
170 |
171 | /*
172 | if r := c.bal.Balance(); len(r) > 0 {
173 | t.Errorf("Balance() should return 0, not: %v", r)
174 | }
175 | */
176 |
177 | s := make(chan int)
178 | go func() {
179 | c.Shutdown()
180 | close(s)
181 | }()
182 | select {
183 | case <-s:
184 | case <-time.After(100 * time.Millisecond):
185 | t.Errorf("Shutdown didn't finish in a timely fashion")
186 | }
187 | if len(c.Tasks()) != 0 {
188 | t.Errorf("Shutdown didn't stop all tasks")
189 | }
190 | }
191 |
192 | type noopHandler struct{}
193 |
194 | func (noopHandler) Run() bool { return true }
195 | func (noopHandler) Stop() {}
196 |
197 | // TestHandleTask ensures that tasks are marked as done once handled.
198 | func TestHandleTask(t *testing.T) {
199 | hf := func(Task) Handler { return noopHandler{} }
200 | coord := NewTestCoord()
201 | c, _ := NewConsumer(coord, hf, DumbBalancer)
202 | go c.Run()
203 | coord.Tasks <- testTask{"task1"}
204 | select {
205 | case <-coord.Releases:
206 | t.Errorf("Release called, expected Done!")
207 | case <-coord.Dones:
208 | case <-time.After(100 * time.Millisecond):
209 | t.Fatalf("Took too long to mark task as done")
210 | }
211 | c.Shutdown()
212 | }
213 |
214 | // TestTaskPanic ensures panics from Run methods are turned into Done calls.
215 | func TestTaskPanic(t *testing.T) {
216 | t.Parallel()
217 | hf := SimpleHandler(func(Task, <-chan bool) bool {
218 | panic("TestTaskPanic")
219 | })
220 | coord := NewTestCoord()
221 | c, _ := NewConsumer(coord, hf, DumbBalancer)
222 | go c.Run()
223 | coord.Tasks <- testTask{"1"}
224 | coord.Tasks <- testTask{"2"}
225 | coord.Tasks <- testTask{"3"}
226 | for i := 3; i > 0; i-- {
227 | select {
228 | case task := <-coord.Dones:
229 | t.Logf("%s done", task)
230 | case task := <-coord.Releases:
231 | t.Errorf("%s released when it should have been marked Done!", task)
232 | case <-time.After(200 * time.Millisecond):
233 | t.Fatalf("Took too long to mark task(s) as done.")
234 | }
235 | }
236 | c.Shutdown()
237 | }
238 |
239 | // TestShutdown ensures Shutdown causes Run() to exit cleanly.
240 | func TestShutdown(t *testing.T) {
241 | t.Parallel()
242 | hf := SimpleHandler(func(_ Task, c <-chan bool) bool {
243 | <-c
244 | return false
245 | })
246 | coord := NewTestCoord()
247 | c, _ := NewConsumer(coord, hf, DumbBalancer)
248 | go c.Run()
249 | coord.Tasks <- testTask{"1"}
250 | coord.Tasks <- testTask{"2"}
251 | coord.Tasks <- testTask{"3"}
252 | time.Sleep(100 * time.Millisecond)
253 | if len(coord.Dones)+len(coord.Releases) > 0 {
254 | t.Fatalf("Didn't expect any tasks to exit before Shutdown was called.")
255 | }
256 | c.Shutdown()
257 | for i := 3; i > 0; i-- {
258 | select {
259 | case task := <-coord.Dones:
260 | t.Errorf("%s marked done when it should have been released!", task)
261 | case task := <-coord.Releases:
262 | t.Logf("%s released", task)
263 | case <-time.After(200 * time.Millisecond):
264 | t.Fatalf("Took too long to mark task(s) as released.")
265 | }
266 | }
267 | }
268 |
--------------------------------------------------------------------------------
/metcdv3/README.md:
--------------------------------------------------------------------------------
1 | metafora etcdv3 client
2 | ======================
3 |
4 | See [Documentation/etcdv3.md](../Documentation/etcdv3.md) for details.
5 |
6 | Testing
7 | -------
8 |
9 | Testing the metafora etcdv3 client requires a running etcd cluster. The etcd
10 | instances should be reachable at `localhost:5001,localhost:5002,localhost:5003`,
11 | or a similar connection string should be exported in the `ETCDCTL_PEERS`
12 | environment variable.
13 |
14 | An example of running the integration tests is given in the command line below:
15 |
16 | ```sh
17 | IP="127.0.0.1" ETCDCTL_PEERS="$IP:5001,$IP:5002,$IP:5003" go test -v
18 | ```
19 |
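20 | Usage sketch
21 | ------------
22 | 
23 | The following is a minimal, untested sketch of how the pieces in this package
24 | fit together; the etcd endpoints, namespace, node name, and handler body are
25 | placeholders chosen for illustration, not values defined by this package:
26 | 
27 | ```go
28 | package main
29 | 
30 | import (
31 | "github.com/lytics/metafora"
32 | "github.com/lytics/metafora/metcdv3"
33 | etcdv3 "go.etcd.io/etcd/client/v3"
34 | )
35 | 
36 | func main() {
37 | // Placeholder endpoint; point this at your etcd cluster.
38 | etcdc, err := etcdv3.New(etcdv3.Config{Endpoints: []string{"localhost:5001"}})
39 | if err != nil {
40 | panic(err)
41 | }
42 | defer etcdc.Close()
43 | 
44 | // "node1" and "/myapp" are example values for the consumer name and namespace.
45 | conf := metcdv3.NewConfig("node1", "/myapp")
46 | coord := metcdv3.NewEtcdV3Coordinator(conf, etcdc)
47 | bal := metcdv3.NewFairBalancer(conf, etcdc, func(*metcdv3.FilterableValue) bool { return true })
48 | 
49 | // Placeholder handler: block until asked to stop, then release the task.
50 | h := metafora.SimpleHandler(func(task metafora.Task, stop <-chan bool) bool {
51 | <-stop
52 | return false
53 | })
54 | 
55 | consumer, err := metafora.NewConsumer(coord, h, bal)
56 | if err != nil {
57 | panic(err)
58 | }
59 | consumer.Run() // blocks until consumer.Shutdown() is called
60 | }
61 | ```
62 | 
63 | Tasks can then be submitted from any process sharing the namespace, for example
64 | with `metcdv3.NewClient("/myapp", etcdc).SubmitTask(metcdv3.DefaultTaskFunc("task-1", ""))`.
65 | 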
--------------------------------------------------------------------------------
/metcdv3/balancer.go:
--------------------------------------------------------------------------------
1 | package metcdv3
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "path"
7 |
8 | "github.com/lytics/metafora"
9 |
10 | etcdv3 "go.etcd.io/etcd/client/v3"
11 | )
12 |
13 | // NewFairBalancer creates a new metafora.DefaultFairBalancer that uses etcd
14 | // for counting tasks per node.
15 | func NewFairBalancer(conf *Config, etcdv3c *etcdv3.Client, filter func(*FilterableValue) bool) metafora.Balancer {
16 | e := etcdClusterState{
17 | etcdv3c: etcdv3c,
18 | kvc: etcdv3.NewKV(etcdv3c),
19 | taskPath: path.Join(conf.Namespace, TasksPath),
20 | nodePath: path.Join(conf.Namespace, NodesPath),
21 | filter: filter,
22 | }
23 | return metafora.NewDefaultFairBalancer(conf.Name, &e)
24 | }
25 |
26 | // etcdClusterState checks the current state of an etcd cluster.
27 | type etcdClusterState struct {
28 | etcdv3c *etcdv3.Client
29 | kvc etcdv3.KV
30 | taskPath string
31 | nodePath string
32 | filter func(*FilterableValue) bool
33 | }
34 |
35 | type FilterableValue struct {
36 | Name string
37 | }
38 |
39 | func (e *etcdClusterState) NodeTaskCount() (map[string]int, error) {
40 | state := map[string]int{}
41 |
42 | // First initialize state with nodes as keys
43 | resp, err := e.kvc.Get(context.Background(), e.nodePath, etcdv3.WithPrefix())
44 | if err != nil {
45 | return nil, err
46 | }
47 |
48 | if resp == nil || len(resp.Kvs) == 0 {
49 | metafora.Warnf("balancer received empty response from GET %s", e.nodePath)
50 | return state, nil
51 | }
52 |
53 | for _, kv := range resp.Kvs {
54 | // We're guaranteed to find nodes under the _metadata path (created on Coordinator startup)
55 | dir, _ := path.Split(string(kv.Key))
56 | dir, node := path.Split(path.Clean(dir))
57 | if path.Clean(dir) == e.nodePath && e.filter(&FilterableValue{Name: node}) {
58 | state[node] = 0
59 | }
60 | }
61 |
62 | resp, err = e.kvc.Get(context.Background(), e.taskPath, etcdv3.WithPrefix())
63 | if err != nil {
64 | return nil, err
65 | }
66 |
67 | // No current tasks
68 | if resp == nil || len(resp.Kvs) == 0 {
69 | return state, nil
70 | }
71 |
72 | // Get the list of all claimed work, create a map of the counts and
73 | // node values
74 | // We ignore tasks which have no claims
75 | for _, kv := range resp.Kvs {
76 | ownerPath := path.Base(string(kv.Key))
77 | if ownerPath == OwnerPath {
78 | ov := &ownerValue{}
79 | err := json.Unmarshal(kv.Value, ov)
80 | if err != nil {
81 | return nil, err
82 | }
83 | // We want to only include those nodes which were initially included,
84 | // as some nodes may be shutting down, etc, and should not be counted
85 | if _, ok := state[ov.Node]; ok {
86 | state[ov.Node]++
87 | }
88 | }
89 | }
90 | return state, nil
91 | }
92 |
--------------------------------------------------------------------------------
/metcdv3/balancer_test.go:
--------------------------------------------------------------------------------
1 | package metcdv3
2 |
3 | import (
4 | "testing"
5 | "time"
6 |
7 | "github.com/lytics/metafora"
8 | )
9 |
10 | func TestFairBalancer(t *testing.T) {
11 | t.Parallel()
12 | etcdv3c, coord1, conf1 := setupEtcd(t)
13 | defer etcdv3c.Close()
14 | conf2 := conf1.Copy()
15 | conf2.Name = "coord2"
16 | coord2 := NewEtcdV3Coordinator(conf2, etcdv3c)
17 |
18 | cli := NewClient(conf1.Namespace, etcdv3c)
19 |
20 | h := metafora.SimpleHandler(func(task metafora.Task, stop <-chan bool) bool {
21 | metafora.Debugf("Starting %s", task.ID())
22 | <-stop
23 | metafora.Debugf("Stopping %s", task.ID())
24 | return false // never done
25 | })
26 |
27 | filter := func(_ *FilterableValue) bool { return true }
28 | // Create two consumers
29 | b1 := NewFairBalancer(conf1, etcdv3c, filter)
30 | con1, err := metafora.NewConsumer(coord1, h, b1)
31 | if err != nil {
32 | t.Fatal(err)
33 | }
34 |
35 | b2 := NewFairBalancer(conf2, etcdv3c, filter)
36 | con2, err := metafora.NewConsumer(coord2, h, b2)
37 | if err != nil {
38 | t.Fatal(err)
39 | }
40 |
41 | // Start the first and let it claim a bunch of tasks
42 | go con1.Run()
43 | defer con1.Shutdown()
44 | _ = cli.SubmitTask(DefaultTaskFunc("t1", ""))
45 | _ = cli.SubmitTask(DefaultTaskFunc("t2", ""))
46 | _ = cli.SubmitTask(DefaultTaskFunc("t3", ""))
47 | _ = cli.SubmitTask(DefaultTaskFunc("t4", ""))
48 | _ = cli.SubmitTask(DefaultTaskFunc("t5", ""))
49 | _ = cli.SubmitTask(DefaultTaskFunc("t6", ""))
50 |
51 | time.Sleep(5 * time.Second)
52 |
53 | if len(con1.Tasks()) != 6 {
54 | t.Fatalf("con1 should have claimed 6 tasks: %d", len(con1.Tasks()))
55 | }
56 |
57 | // Start the second consumer and force the 1st to rebalance
58 | go con2.Run()
59 | defer con2.Shutdown()
60 |
61 | // Wait for node to startup and register
62 | time.Sleep(1 * time.Second)
63 |
64 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance())
65 |
66 | time.Sleep(2 * time.Second)
67 |
68 | c1Tasks := con1.Tasks()
69 | c2Tasks := con2.Tasks()
70 | if len(c1Tasks) != 4 || len(c2Tasks) != 2 {
71 | t.Fatalf("expected consumers to have 4|2 tasks: %d|%d", len(c1Tasks), len(c2Tasks))
72 | }
73 |
74 | // Finally make sure that balancing the other node does nothing
75 | _ = cli.SubmitCommand("node2", metafora.CommandBalance())
76 |
77 | time.Sleep(2 * time.Second)
78 |
79 | c1Tasks2 := con1.Tasks()
80 | c2Tasks2 := con2.Tasks()
81 | if len(c1Tasks2) != 4 || len(c2Tasks2) != 2 {
82 | t.Fatalf("expected consumers to have 4|2 tasks: %d|%d", len(c1Tasks2), len(c2Tasks2))
83 | }
84 | for i := 0; i < 4; i++ {
85 | if c1Tasks[i] != c1Tasks2[i] {
86 | t.Errorf("task mismatch: %s != %s", c1Tasks[i], c1Tasks2[i])
87 | }
88 | }
89 | for i := 0; i < 2; i++ {
90 | if c2Tasks[i] != c2Tasks2[i] {
91 | t.Errorf("task mismatch: %s != %s", c2Tasks[i], c2Tasks2[i])
92 | }
93 | }
94 | }
95 |
96 | func TestFairBalancerFilter(t *testing.T) {
97 | t.Parallel()
98 | etcdv3c, coord1, conf1 := setupEtcd(t)
99 | defer etcdv3c.Close()
100 | conf2 := conf1.Copy()
101 | conf2.Name = "coord2"
102 | coord2 := NewEtcdV3Coordinator(conf2, etcdv3c)
103 |
104 | cli := NewClient(conf1.Namespace, etcdv3c)
105 |
106 | h := metafora.SimpleHandler(func(task metafora.Task, stop <-chan bool) bool {
107 | metafora.Debugf("Starting %s", task.ID())
108 | <-stop
109 | metafora.Debugf("Stopping %s", task.ID())
110 | return false // never done
111 | })
112 |
113 | filter := func(fv *FilterableValue) bool { return fv.Name == conf1.Name }
114 | // Create two consumers
115 | b1 := NewFairBalancer(conf1, etcdv3c, filter)
116 | con1, err := metafora.NewConsumer(coord1, h, b1)
117 | if err != nil {
118 | t.Fatal(err)
119 | }
120 |
121 | filter2 := func(fv *FilterableValue) bool { return fv.Name == conf2.Name }
122 | b2 := NewFairBalancer(conf2, etcdv3c, filter2)
123 | con2, err := metafora.NewConsumer(coord2, h, b2)
124 | if err != nil {
125 | t.Fatal(err)
126 | }
127 |
128 | // Start the first and let it claim a bunch of tasks
129 | go con1.Run()
130 | defer con1.Shutdown()
131 | _ = cli.SubmitTask(DefaultTaskFunc("t1", ""))
132 | _ = cli.SubmitTask(DefaultTaskFunc("t2", ""))
133 | _ = cli.SubmitTask(DefaultTaskFunc("t3", ""))
134 | _ = cli.SubmitTask(DefaultTaskFunc("t4", ""))
135 | _ = cli.SubmitTask(DefaultTaskFunc("t5", ""))
136 | _ = cli.SubmitTask(DefaultTaskFunc("t6", ""))
137 | _ = cli.SubmitTask(DefaultTaskFunc("t7", ""))
138 | _ = cli.SubmitTask(DefaultTaskFunc("t8", ""))
139 | _ = cli.SubmitTask(DefaultTaskFunc("t9", ""))
140 |
141 | time.Sleep(5 * time.Second)
142 |
143 | if len(con1.Tasks()) != 9 {
144 | t.Fatalf("con1 should have claimed 9 tasks: %d", len(con1.Tasks()))
145 | }
146 |
147 | // Start the second consumer and force the 1st to rebalance
148 | go con2.Run()
149 | defer con2.Shutdown()
150 |
151 | // Wait for node to startup and register
152 | time.Sleep(1 * time.Second)
153 |
154 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance())
155 |
156 | time.Sleep(2 * time.Second)
157 |
158 | // Make sure that balancing never happened
159 | c2Tasks := con2.Tasks()
160 | if len(c2Tasks) != 0 {
161 | t.Fatalf("expected no tasks to be rebalanced but got: %d", len(c2Tasks))
162 | }
163 |
164 | }
165 |
166 | // Fair balancer shouldn't consider a shutting-down node
167 | // See https://github.com/lytics/metafora/issues/92
168 | func TestFairBalancerShutdown(t *testing.T) {
169 | etcdv3c, coord1, conf1 := setupEtcd(t)
170 | defer etcdv3c.Close()
171 | conf2 := conf1.Copy()
172 | conf2.Name = "node2"
173 | coord2 := NewEtcdV3Coordinator(conf2, etcdv3c)
174 |
175 | cli := NewClient(conf1.Namespace, etcdv3c)
176 |
177 | // This handler always returns immediately
178 | h1 := metafora.SimpleHandler(func(task metafora.Task, stop <-chan bool) bool {
179 | metafora.Debugf("H1 Starting %s", task.ID())
180 | <-stop
181 | metafora.Debugf("H1 Stopping %s", task.ID())
182 | return false // never done
183 | })
184 |
185 | // Block forever on a single task
186 | stop2 := make(chan struct{})
187 | stopr := make(chan chan struct{}, 1)
188 | stopr <- stop2
189 | h2 := metafora.SimpleHandler(func(task metafora.Task, stop <-chan bool) bool {
190 | metafora.Debugf("H2 Starting %s", task.ID())
191 | blockchan, ok := <-stopr
192 | if ok {
193 | <-blockchan
194 | }
195 | <-stop
196 | metafora.Debugf("H2 Stopping %s", task.ID())
197 | return false // never done
198 | })
199 |
200 | filter := func(_ *FilterableValue) bool { return true }
201 | // Create two consumers
202 | b1 := NewFairBalancer(conf1, etcdv3c, filter)
203 | con1, err := metafora.NewConsumer(coord1, h1, b1)
204 | if err != nil {
205 | t.Fatal(err)
206 | }
207 |
208 | b2 := NewFairBalancer(conf2, etcdv3c, filter)
209 | con2, err := metafora.NewConsumer(coord2, h2, b2)
210 | if err != nil {
211 | t.Fatal(err)
212 | }
213 |
214 | // Start the first and let it claim a bunch of tasks
215 | go con1.Run()
216 | defer con1.Shutdown()
217 | _ = cli.SubmitTask(DefaultTaskFunc("t1", ""))
218 | _ = cli.SubmitTask(DefaultTaskFunc("t2", ""))
219 | _ = cli.SubmitTask(DefaultTaskFunc("t3", ""))
220 | _ = cli.SubmitTask(DefaultTaskFunc("t4", ""))
221 | _ = cli.SubmitTask(DefaultTaskFunc("t5", ""))
222 | _ = cli.SubmitTask(DefaultTaskFunc("t6", ""))
223 |
224 | time.Sleep(1000 * time.Millisecond)
225 |
226 | if len(con1.Tasks()) != 6 {
227 | t.Fatalf("con1 should have claimed 6 tasks: %d", len(con1.Tasks()))
228 | }
229 |
230 | // Start the second consumer and force the 1st to rebalance
231 | go con2.Run()
232 |
233 | close(stopr)
234 |
235 | // Wait for node to startup and register
236 | time.Sleep(500 * time.Millisecond)
237 |
238 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance())
239 |
240 | time.Sleep(2 * time.Second)
241 |
242 | c1Tasks := con1.Tasks()
243 | c2Tasks := con2.Tasks()
244 | if len(c1Tasks) != 4 || len(c2Tasks) != 2 {
245 | t.Fatalf("expected consumers to have 4|2 tasks: %d|%d", len(c1Tasks), len(c2Tasks))
246 | }
247 |
248 | // Make sure that balancing the other node does nothing
249 | _ = cli.SubmitCommand("node2", metafora.CommandBalance())
250 |
251 | time.Sleep(2 * time.Second)
252 |
253 | c1Tasks2 := con1.Tasks()
254 | c2Tasks2 := con2.Tasks()
255 | if len(c1Tasks2) != 4 || len(c2Tasks2) != 2 {
256 | t.Fatalf("expected consumers to have 4|2 tasks: %d|%d", len(c1Tasks2), len(c2Tasks2))
257 | }
258 | for i := 0; i < 4; i++ {
259 | if c1Tasks[i] != c1Tasks2[i] {
260 | t.Errorf("task mismatch: %s != %s", c1Tasks[i], c1Tasks2[i])
261 | }
262 | }
263 | for i := 0; i < 2; i++ {
264 | if c2Tasks[i] != c2Tasks2[i] {
265 | t.Errorf("task mismatch: %s != %s", c2Tasks[i], c2Tasks2[i])
266 | }
267 | }
268 |
269 | // Second consumer should block on a single task forever
270 | // Rebalancing the first node should then cause it to pick up all but
271 | // one task
272 | c2stop := make(chan struct{})
273 | go func() {
274 | con2.Shutdown()
275 | close(c2stop)
276 | }()
277 |
278 | time.Sleep(500 * time.Millisecond)
279 |
280 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance())
281 |
282 | time.Sleep(2 * time.Second)
283 |
284 | c1Tasks3 := con1.Tasks()
285 | c2Tasks3 := con2.Tasks()
286 | if len(c1Tasks3) != 5 || len(c2Tasks3) != 1 {
287 | t.Fatalf("Expected consumers to have 5|1 tasks: %d|%d", len(c1Tasks3), len(c2Tasks3))
288 | }
289 |
290 | // Now stop blocking task, rebalance and make sure the first node picked up the remaining
291 | close(stop2)
292 |
293 | time.Sleep(500 * time.Millisecond)
294 | // Consumer 2 should stop now
295 | <-c2stop
296 |
297 | _ = cli.SubmitCommand(conf1.Name, metafora.CommandBalance())
298 |
299 | time.Sleep(2 * time.Second)
300 |
301 | // con2 is out of the picture. con1 has all the tasks.
302 | c1Tasks4 := con1.Tasks()
303 | c2Tasks4 := con2.Tasks()
304 | if len(c1Tasks4) != 6 || len(c2Tasks4) != 0 {
305 | t.Fatalf("Expected consumers to have 6|0 tasks: %d|%d", len(c1Tasks4), len(c2Tasks4))
306 | }
307 | }
308 |
--------------------------------------------------------------------------------
/metcdv3/client.go:
--------------------------------------------------------------------------------
1 | package metcdv3
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "errors"
7 | "math/rand"
8 | "path"
9 | "strconv"
10 | "time"
11 |
12 | "github.com/lytics/metafora"
13 | etcdv3 "go.etcd.io/etcd/client/v3"
14 | )
15 |
16 | var (
17 | // ErrFailedSubmitTask is returned when submitting a task fails, most likely because it already exists
18 | ErrFailedSubmitTask = errors.New("metafora etcdv3 client: failed submit task")
19 | ErrLeaseDurationTooShort = errors.New("metafora etcd clientv3: lease duration too short")
20 | ErrKeepAliveClosedUnexpectedly = errors.New("metafora etcd clientv3: keep alive closed unexpectedly")
21 | )
22 |
23 | var (
24 | minLeaseDuration = 10 * time.Second
25 | )
26 |
27 | // NewClient creates a new client using an etcd backend.
28 | func NewClient(namespace string, etcdv3c *etcdv3.Client) metafora.Client {
29 | return &mclient{
30 | etcdv3c: etcdv3c,
31 | kvc: etcdv3.NewKV(etcdv3c),
32 | namespace: namespace,
33 | }
34 | }
35 |
36 | type keepAliveStats struct {
37 | success int
38 | failure int
39 | }
40 |
41 | // mclient is an internal implementation of metafora.Client with an etcd backend.
42 | type mclient struct {
43 | etcdv3c *etcdv3.Client
44 | kvc etcdv3.KV
45 | namespace string
46 | }
47 |
48 | // nodesPath is the base path of nodes, represented as a directory in etcd.
49 | func (mc *mclient) nodesPath() string {
50 | return path.Join("/", mc.namespace, NodesPath)
51 | }
52 |
53 | // taskPath is the path to a particular taskId, represented as a file in etcd.
54 | func (mc *mclient) taskPath(taskID string) string {
55 | return path.Join("/", mc.namespace, TasksPath, taskID)
56 | }
57 |
58 | // cmdPath is the path to a particular node's commands directory in etcd.
59 | func (mc *mclient) cmdPath(node string) string {
60 | return path.Join("/", mc.namespace, NodesPath, node, "commands")
61 | }
62 |
63 | // SubmitTask creates a new task in etcd
64 | func (mc *mclient) SubmitTask(task metafora.Task) error {
65 | c := context.Background()
66 | fullpath := path.Join(mc.taskPath(task.ID()), PropsPath)
67 | buf, err := json.Marshal(task)
68 | if err != nil {
69 | return err
70 | }
71 | txnRes, err := mc.kvc.Txn(c).
72 | If(etcdv3.Compare(etcdv3.Version(fullpath), "=", 0)).
73 | // Should we create both of these?
74 | Then(etcdv3.OpPut(fullpath, string(buf)), etcdv3.OpPut(mc.taskPath(task.ID()), "")).
75 | Commit()
76 | if err != nil {
77 | return err
78 | }
79 | if !txnRes.Succeeded {
80 | return ErrFailedSubmitTask
81 | }
82 | metafora.Debugf("task %s submitted: %s", task.ID(), fullpath)
83 | return nil
84 | }
85 |
86 | // Delete a task
87 | func (mc *mclient) DeleteTask(taskID string) error {
88 | c := context.Background()
89 | fullpath := mc.taskPath(taskID)
90 | _, err := mc.kvc.Delete(c, fullpath, etcdv3.WithPrefix())
91 | metafora.Debugf("task %s deleted: %s", taskID, fullpath)
92 | return err
93 | }
94 |
95 | // SubmitCommand creates a new command for a particular node. The
96 | // command is given a random name and added to that node's commands
97 | // directory in etcd.
98 | func (mc *mclient) SubmitCommand(node string, command metafora.Command) error {
99 | cmdPath := mc.cmdPath(node)
100 | body, err := command.Marshal()
101 | if err != nil {
102 | // This is either a bug in metafora or someone implemented their own
103 | // command incorrectly.
104 | return err
105 | }
106 | key := path.Join(cmdPath, strconv.FormatUint(rand.Uint64(), 10))
107 | if _, err := mc.kvc.Put(context.Background(), key, string(body)); err != nil {
108 | metafora.Errorf("Error submitting command: %s to node: %s", command, node)
109 | return err
110 | }
111 | metafora.Debugf("Submitted command: %s to node: %s", string(body), node)
112 | return nil
113 | }
114 |
115 | // Nodes fetches the currently registered nodes. A non-nil error means that some
116 | // error occurred trying to get the node list. The node list may be nil if no
117 | // nodes are registered.
118 | func (mc *mclient) Nodes() ([]string, error) {
119 | res, err := mc.kvc.Get(context.Background(), mc.nodesPath(), etcdv3.WithPrefix())
120 | if err != nil || res == nil || len(res.Kvs) == 0 {
121 | return nil, err
122 | }
123 | nodes := make([]string, len(res.Kvs))
124 | for i, kv := range res.Kvs {
125 | var node string
126 | err = json.Unmarshal(kv.Key, &node)
127 | if err != nil {
128 | return nil, err
129 | }
130 | nodes[i] = path.Base(node)
131 | }
132 | 
133 | return nodes, nil
134 | }
135 |
136 | func (mc *mclient) Tasks() ([]string, error) {
137 | res, err := mc.kvc.Get(
138 | context.Background(),
139 | path.Join("/", mc.namespace, TasksPath),
140 | etcdv3.WithPrefix())
141 | if err != nil {
142 | return nil, err
143 | }
144 |
145 | var tasks []string
146 | for _, kv := range res.Kvs {
147 | key := string(kv.Key)
148 | if base := path.Base(key); base == OwnerPath || base == MetadataPath || base == PropsPath {
149 | continue
150 | } else {
151 | tasks = append(tasks, base)
152 | }
153 | }
154 | return tasks, nil
155 | }
156 |
--------------------------------------------------------------------------------
/metcdv3/client_test.go:
--------------------------------------------------------------------------------
1 | package metcdv3
2 |
3 | // NOTES
4 | //
5 | // These tests are in reality integration tests which require that
6 | // etcd is running on the test system and its peers are found
7 | // in the ENV variable ETCDCTL_PEERS. The tests do not clean
8 | // out data and require a fresh set of etcd instances for
9 | // each run. You can consider this a known bug which
10 | // will be fixed in a future release.
11 | //
12 | // See: https://github.com/lytics/metafora/issues/31
13 |
14 | import (
15 | "context"
16 | "testing"
17 |
18 | "github.com/lytics/metafora"
19 | "github.com/lytics/metafora/metcdv3/testutil"
20 | etcdv3 "go.etcd.io/etcd/client/v3"
21 | )
22 |
23 | const (
24 | Namespace = "test"
25 | NodesDir = "/test/nodes"
26 | Node1 = "node1"
27 | Node1Path = "/test/nodes/node1"
28 | )
29 |
30 | // TestNodes tests that client.Nodes() returns the metafora nodes
31 | // registered in etcd.
32 | func TestNodes(t *testing.T) {
33 | c := context.Background()
34 | eclient := testutil.NewEtcdV3Client(t)
35 | kvc := etcdv3.NewKV(eclient)
36 | _, _ = eclient.Delete(c, Node1Path, etcdv3.WithPrefix())
37 |
38 | mclient := NewClient(Namespace, eclient)
39 |
40 | if _, err := kvc.Put(c, Node1Path, "0"); err != nil {
41 | t.Fatalf("Put %v returned error: %v", Node1Path, err)
42 | }
43 |
44 | if nodes, err := mclient.Nodes(); err != nil {
45 | t.Fatalf("Nodes returned error: %v", err)
46 | } else {
47 | for i, n := range nodes {
48 | t.Logf("%v -> %v", i, n)
49 | }
50 | }
51 | }
52 |
53 | // TestSubmitTask tests that client.SubmitTask(...) adds a task to
54 | // the proper path in etcd, and that the same task id cannot be
55 | // submitted more than once.
56 | func TestSubmitTask(t *testing.T) {
57 | client := testutil.NewEtcdV3Client(t)
58 | mclient := NewClient(Namespace, client)
59 |
60 | task := DefaultTaskFunc("testid1", "")
61 |
62 | if err := mclient.DeleteTask(task.ID()); err != nil {
63 | t.Logf("DeleteTask returned an error, which may be ok. Error: %v", err)
64 | }
65 |
66 | if err := mclient.SubmitTask(task); err != nil {
67 | t.Fatalf("Submit task failed on initial submission, error: %v", err)
68 | }
69 |
70 | if err := mclient.SubmitTask(task); err == nil {
71 | t.Fatalf("Submit task did not fail, but should have, when using an existing task id")
72 | }
73 | }
74 |
75 | // TestSubmitCommand tests that client.SubmitCommand(...) adds a command
76 | // to the proper node path in etcd, and that it can be read back.
77 | func TestSubmitCommand(t *testing.T) {
78 | eclient := testutil.NewEtcdV3Client(t)
79 | kvc := etcdv3.NewKV(eclient)
80 | mclient := NewClient(Namespace, eclient)
81 |
82 | if err := mclient.SubmitCommand(Node1, metafora.CommandFreeze()); err != nil {
83 | t.Fatalf("Unable to submit command. error:%v", err)
84 | }
85 |
86 | if res, err := kvc.Get(context.Background(), NodesDir, etcdv3.WithPrefix()); err != nil {
87 | t.Fatalf("Get on path %v returned error: %v", NodesDir, err)
88 | } else if res.Count == 0 {
89 | t.Fatalf("Get on path %v returned nil for child nodes", NodesDir)
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/metcdv3/commander.go:
--------------------------------------------------------------------------------
1 | package metcdv3
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "errors"
7 | "fmt"
8 | "path"
9 | "sync"
10 | "time"
11 |
12 | "github.com/lytics/metafora"
13 | "github.com/lytics/metafora/statemachine"
14 | etcdv3 "go.etcd.io/etcd/client/v3"
15 | )
16 |
17 | var (
18 | ErrWatchClosedUnexpectedly = errors.New("metafora: watch closed unexpectedly")
19 | )
20 |
21 | type cmdr struct {
22 | etcdv3c *etcdv3.Client
23 | kvc etcdv3.KV
24 | taskspath string
25 | }
26 |
27 | func NewCommander(namespace string, c *etcdv3.Client) statemachine.Commander {
28 | return &cmdr{
29 | taskspath: path.Join("/", namespace, TasksPath),
30 | etcdv3c: c,
31 | kvc: etcdv3.NewKV(c),
32 | }
33 | }
34 |
35 | // Send sends a command to a task, overwriting any existing command.
36 | func (c *cmdr) Send(taskID string, m *statemachine.Message) error {
37 | buf, err := json.Marshal(m)
38 | if err != nil {
39 | return err
40 | }
41 |
42 | cmdPath := path.Join(c.taskspath, taskID, CommandsPath)
43 | _, err = c.kvc.Put(context.Background(), cmdPath, string(buf))
44 | return err
45 | }
46 |
47 | type cmdListener struct {
48 | etcdv3c *etcdv3.Client
49 | kvc etcdv3.KV
50 | name string
51 | taskcmdpath string
52 |
53 | commands chan *statemachine.Message
54 |
55 | mu *sync.Mutex
56 | stop chan bool
57 | }
58 |
59 | // NewCommandListener makes a statemachine.CommandListener implementation
60 | // backed by etcd. The namespace should be the same as the coordinator's, as
61 | // commands use a separate path within a namespace from tasks and nodes.
62 | func NewCommandListener(conf *Config, task metafora.Task, c *etcdv3.Client) statemachine.CommandListener {
63 | taskcmdpath := path.Join("/", conf.Namespace, TasksPath, task.ID(), CommandsPath)
64 | cl := &cmdListener{
65 | etcdv3c: c,
66 | name: conf.Name,
67 | taskcmdpath: taskcmdpath,
68 | kvc: etcdv3.NewKV(c),
69 | commands: make(chan *statemachine.Message),
70 | mu: &sync.Mutex{},
71 | stop: make(chan bool),
72 | }
73 | ctxt := context.Background()
74 | go cl.watch(ctxt, taskcmdpath)
75 | return cl
76 | }
77 |
78 | func (c *cmdListener) Receive() <-chan *statemachine.Message {
79 | return c.commands
80 | }
81 |
82 | func (c *cmdListener) ownerValueString() string {
83 | p, err := json.Marshal(&ownerValue{Node: c.name})
84 | if err != nil {
85 | panic(fmt.Sprintf("command listener: error marshalling node body: %v", err))
86 | }
87 | return string(p)
88 | }
89 |
90 | func (c *cmdListener) Stop() {
91 | c.mu.Lock()
92 | defer c.mu.Unlock()
93 | select {
94 | case <-c.stop:
95 | default:
96 | close(c.stop)
97 | }
98 | }
99 |
100 | func (cl *cmdListener) watch(c context.Context, prefix string) {
101 | getRes, err := cl.kvc.Get(c, prefix, etcdv3.WithPrefix())
102 | if err != nil {
103 | metafora.Errorf("Error GETting %s - sending error to stateful handler: %v", prefix, err)
104 | select {
105 | case <-c.Done():
106 | // TODO Do I need the stop channel?
107 | case <-cl.stop:
108 | case cl.commands <- statemachine.ErrorMessage(err):
109 | }
110 | return
111 | }
112 |
113 | // Create a message from an event.
114 | createMessage := func(key string, value []byte) (*statemachine.Message, error) {
115 | msg := &statemachine.Message{}
116 | if err := json.Unmarshal(value, msg); err != nil {
117 | metafora.Errorf("Error unmarshalling command from %s - sending error to stateful handler: %v", key, err)
118 | return nil, err
119 | }
120 |
121 | txnRes, err := cl.kvc.Txn(c).
122 | If(etcdv3.Compare(etcdv3.Value(path.Join(path.Dir(key), OwnerPath)), "=", cl.ownerValueString())).
123 | Then(etcdv3.OpDelete(key, etcdv3.WithPrefix())).
124 | Commit()
125 | if err != nil {
126 | metafora.Errorf("Error deleting command %s: %s - sending error to stateful handler: %v", key, msg, err)
127 | return nil, err
128 | }
129 | if !txnRes.Succeeded {
130 | metafora.Infof("Received successive commands; attempting to retrieve the latest")
131 | return nil, nil
132 | }
133 | return msg, nil
134 | }
135 | // Write a change or exit the watcher.
136 | put := func(msg *statemachine.Message) {
137 | select {
138 | case <-c.Done():
139 | case cl.commands <- msg:
140 | }
141 | }
142 | for _, kv := range getRes.Kvs {
143 | key := string(kv.Key)
144 | if path.Base(key) == MetadataPath {
145 | continue
146 | }
147 | value := kv.Value
148 | msg, err := createMessage(key, value)
149 | if err != nil {
150 | msg = statemachine.ErrorMessage(err)
151 | }
152 | if msg != nil {
153 | put(msg)
154 | }
155 | }
156 |
157 | putTerminalError := func(msg *statemachine.Message) {
158 | go func() {
159 | select {
160 | case <-c.Done():
161 | // TODO Do I need the stop channel?
162 | case <-cl.stop:
163 | case <-time.After(10 * time.Minute):
164 | metafora.Warnf("metafora command listener timed out putting message on channel: %v", msg)
165 | case cl.commands <- msg:
166 | }
167 | }()
168 | }
169 |
170 | // Watch deltas in etcd with the given prefix, starting
171 | // at the revision after the Get call above.
172 | deltas := cl.etcdv3c.Watch(c, prefix, etcdv3.WithPrefix(), etcdv3.WithRev(getRes.Header.Revision+1), etcdv3.WithFilterDelete())
173 | for {
174 | select {
175 | case <-c.Done():
176 | return
177 | case <-cl.stop:
178 | return
179 | case delta, open := <-deltas:
180 | if !open {
181 | putTerminalError(statemachine.ErrorMessage(ErrWatchClosedUnexpectedly))
182 | return
183 | }
184 | if delta.Err() != nil {
185 | putTerminalError(statemachine.ErrorMessage(delta.Err()))
186 | return
187 | }
188 | for _, event := range delta.Events {
189 | msg, err := createMessage(string(event.Kv.Key), event.Kv.Value)
190 | if err != nil {
191 | msg = statemachine.ErrorMessage(err)
192 | }
193 | if msg != nil {
194 | put(msg)
195 | }
196 | }
197 | }
198 | }
199 | }
200 |
--------------------------------------------------------------------------------
/metcdv3/commander_test.go:
--------------------------------------------------------------------------------
1 | package metcdv3
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "path"
7 | "testing"
8 | "time"
9 |
10 | "github.com/lytics/metafora"
11 | "github.com/lytics/metafora/statemachine"
12 | etcdv3 "go.etcd.io/etcd/client/v3"
13 | )
14 |
15 | func TestCommandListener(t *testing.T) {
16 | t.Parallel()
17 |
18 | etcdv3c, _, conf := setupEtcd(t)
19 | kvc := etcdv3.NewKV(etcdv3c)
20 |
21 | namespace := "/cltest"
22 | conf.Namespace = namespace
23 | _, _ = kvc.Delete(context.Background(), namespace, etcdv3.WithPrefix())
24 |
25 | task := metafora.NewTask("testtask")
26 | _, err := kvc.Put(context.Background(), path.Join(conf.Namespace, TasksPath, task.ID(), OwnerPath), fmt.Sprintf(`{"node":"%s"}`, conf.Name))
27 | if err != nil {
28 | t.Fatalf("Error creating fake claim: %v", err)
29 | }
30 |
31 | cmdr := NewCommander(namespace, etcdv3c)
32 |
33 | // Only the last command should be received once the listener is started
34 | _ = cmdr.Send(task.ID(), statemachine.PauseMessage())
35 | _ = cmdr.Send(task.ID(), statemachine.KillMessage())
36 |
37 | cl := NewCommandListener(conf, task, etcdv3c)
38 | defer cl.Stop()
39 |
40 | // Ensure last command was received
41 | select {
42 | case cmd := <-cl.Receive():
43 | if cmd.Code != statemachine.Kill {
44 | t.Fatalf("Expected Kill message, received %v", cmd)
45 | }
46 | case <-time.After(10 * time.Second):
47 | t.Fatal("CommandListener took too long to receive message")
48 | }
49 |
50 | // Ensure only one command was received
51 | select {
52 | case cmd := <-cl.Receive():
53 | t.Fatalf("Unexpected command received: %v", cmd)
54 | case <-time.After(300 * time.Millisecond):
55 | // Ok!
56 | }
57 |
58 | cl.Stop()
59 |
60 | // Stop doesn't block until watching loop exits, so wait briefly
61 | time.Sleep(10 * time.Millisecond)
62 |
63 | // Ensure receiving after Stopping never succeeds
64 | _ = cmdr.Send(task.ID(), statemachine.RunMessage())
65 | select {
66 | case cmd := <-cl.Receive():
67 | t.Fatalf("Unexpected command received: %v", cmd)
68 | case <-time.After(300 * time.Millisecond):
69 | // Ok
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/metcdv3/conf.go:
--------------------------------------------------------------------------------
1 | package metcdv3
2 |
3 | import (
4 | "fmt"
5 | "path"
6 | "strings"
7 | )
8 |
9 | type Config struct {
10 | // Namespace is the key prefix to allow for multitenant use of etcd.
11 | //
12 | // Namespaces must start with a / (added by NewConfig if needed).
13 | Namespace string
14 |
15 | // Name of this Metafora consumer. Only one instance of a Name is allowed to
16 | // run in a Namespace at a time, so if you set the Name to hostname you can
17 | // effectively limit Metafora to one process per server.
18 | Name string
19 |
20 | // NewTaskFunc is the function called to unmarshal tasks from etcd into a
21 | // custom struct. The struct must implement the metafora.Task interface.
22 | //
23 | // If nil it is set to DefaultTaskFunc
24 | NewTaskFunc TaskFunc
25 | }
26 |
27 | // NewConfig creates a Config with the required fields and uses defaults for
28 | // the others.
29 | //
30 | // Panics on empty values.
31 | func NewConfig(name, namespace string) *Config {
32 | if namespace == "" || name == "" {
33 | panic("invalid etcd config")
34 | }
35 |
36 | namespace = path.Join("/", strings.Trim(namespace, "/ "))
37 | return &Config{
38 | Name: name,
39 | Namespace: namespace,
40 | NewTaskFunc: DefaultTaskFunc,
41 | }
42 | }
43 |
44 | // Copy returns a shallow copy of this config.
45 | func (c *Config) Copy() *Config {
46 | return &Config{
47 | Name: c.Name,
48 | Namespace: c.Namespace,
49 | NewTaskFunc: c.NewTaskFunc,
50 | }
51 | }
52 |
53 | func (c *Config) String() string {
54 | return fmt.Sprintf("etcd:%s/%s", c.Namespace, c.Name)
55 | }
56 |
--------------------------------------------------------------------------------
/metcdv3/const.go:
--------------------------------------------------------------------------------
1 | package metcdv3
2 |
3 | const (
4 | TasksPath = "tasks"
5 | NodesPath = "nodes"
6 | CommandsPath = "commands"
7 | // Is this true for etcdv3?
8 | MetadataPath = "_metafora" // _{KEYs} are hidden files, so this will not trigger our watches
9 | OwnerPath = "owner"
10 | PropsPath = "props"
11 |
12 | // Etcd error codes are passed directly through go-etcd from the HTTP response,
13 | // so to find the error codes use this ref:
14 | // https://go.etcd.io/etcd/blob/master/error/error.go#L67
15 | EcodeKeyNotFound = 100
16 | EcodeCompareFailed = 101
17 | EcodeNodeExist = 105
18 | EcodeExpiredIndex = 401 // The event in requested index is outdated and cleared
19 | )
20 |
--------------------------------------------------------------------------------
/metcdv3/coordinator_test.go:
--------------------------------------------------------------------------------
1 | package metcdv3
2 |
3 | import (
4 | "context"
5 | "path"
6 | "strings"
7 | "testing"
8 | "time"
9 |
10 | "github.com/lytics/metafora"
11 |
12 | etcdv3 "go.etcd.io/etcd/client/v3"
13 | )
14 |
15 | /*
16 | Running the Integration Test:
17 |
18 | go test -v ./...
19 | */
20 |
21 | func TestCoordinatorFirstNodeJoiner(t *testing.T) {
22 | t.Parallel()
23 | etcdv3c, coordinator1, conf := setupEtcd(t)
24 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil {
25 | t.Fatalf("Unexpected error initializing coordinator: %v", err)
26 | }
27 | defer coordinator1.Close()
28 | kvc := etcdv3.NewKV(etcdv3c)
29 |
30 | tpath := path.Join(conf.Namespace, TasksPath)
31 | _, err := kvc.Get(context.Background(), tpath)
32 | if err != nil && strings.Contains(err.Error(), "Key not found") {
33 | t.Fatalf("The tasks path wasn't created when the first node joined: %s", tpath)
34 | } else if err != nil {
35 | t.Fatalf("Unknown error trying to test: err: %s", err.Error())
36 | }
37 |
38 | //TODO test for node path too...
39 | }
40 |
41 | // Ensure that Watch() picks up new tasks and returns them.
42 | func TestCoordinatorTC1(t *testing.T) {
43 | t.Parallel()
44 | etcdv3c, coordinator1, conf := setupEtcd(t)
45 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil {
46 | t.Fatalf("Unexpected error initializing coordinator: %v", err)
47 | }
48 | defer coordinator1.Close()
49 | kvc := etcdv3.NewKV(etcdv3c)
50 |
51 | tasks := make(chan metafora.Task)
52 | task001 := &task{id: "test-task"}
53 | taskPath := path.Join(conf.Namespace, TasksPath, task001.ID())
54 | errc := make(chan error)
55 |
56 | go func() {
57 | //Watch blocks, so we need to test it in its own go routine.
58 | errc <- coordinator1.Watch(tasks)
59 | }()
60 |
61 | _, _ = kvc.Put(context.Background(), taskPath, "5")
62 |
63 | select {
64 | case task := <-tasks:
65 | if task.ID() != task001.ID() {
66 | t.Fatalf("coordinator1.Watch() test failed: We received the incorrect taskId. Got [%s] Expected[%s]", task, task001)
67 | }
68 | case <-time.After(time.Second * 5):
69 | t.Fatalf("coordinator1.Watch() test failed: The testcase timed out after 5 seconds.")
70 | }
71 |
72 | coordinator1.Close()
73 | err := <-errc
74 | if err != nil {
75 | t.Fatalf("coordinator1.Watch() returned an err: %v", err)
76 | }
77 | }
78 |
79 | // Submit a task while a coordinator is actively watching for tasks.
80 | func TestCoordinatorTC2(t *testing.T) {
81 | t.Parallel()
82 | etcdv3c, coordinator1, conf := setupEtcd(t)
83 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil {
84 | t.Fatalf("Unexpected error initializing coordinator: %v", err)
85 | }
86 | defer coordinator1.Close()
87 |
88 | testTasks := []string{"test1", "test2", "test3"}
89 |
90 | mclient := NewClient(conf.Namespace, etcdv3c)
91 |
92 | tasks := make(chan metafora.Task)
93 | errc := make(chan error)
94 | go func() {
95 | //Watch blocks, so we need to test it in its own go routine.
96 | errc <- coordinator1.Watch(tasks)
97 | }()
98 |
99 | for _, taskid := range testTasks {
100 | err := mclient.SubmitTask(DefaultTaskFunc(taskid, ""))
101 | if err != nil {
102 | t.Fatalf("Error submitting a task to metafora via the client. Error:\n%v", err)
103 | }
104 | recvd := <-tasks
105 | if recvd.ID() != taskid {
106 | t.Fatalf("%s != %s - received an unexpected task", recvd.ID(), taskid)
107 | }
108 | if ok := coordinator1.Claim(recvd); !ok {
109 | t.Fatal("coordinator1.Claim() unable to claim the task")
110 | }
111 | }
112 |
113 | coordinator1.Close()
114 | err := <-errc
115 | if err != nil {
116 | t.Fatalf("coordinator1.Watch() returned an err: %v", err)
117 | }
118 | }
119 |
120 | // Start two coordinators to ensure that fighting over claims results in only
121 | // one coordinator winning (and the other not crashing).
122 | func TestCoordinatorTC3(t *testing.T) {
123 | t.Parallel()
124 | etcdv3c, coordinator1, conf1 := setupEtcd(t)
125 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil {
126 | t.Fatalf("Unexpected error initializing coordinator: %v", err)
127 | }
128 | defer coordinator1.Close()
129 | conf2 := conf1.Copy()
130 | conf2.Name = "node2"
131 | coordinator2 := NewEtcdV3Coordinator(conf2, etcdv3c)
132 | if err := coordinator2.Init(newCtx(t, "coordinator2")); err != nil {
133 | t.Fatalf("Unexpected error initializing coordinator: %v", err)
134 | }
135 | defer coordinator2.Close()
136 |
137 | testTasks := []string{"test-claiming-task0001", "test-claiming-task0002", "test-claiming-task0003"}
138 |
139 | mclient := NewClient(conf1.Namespace, etcdv3c)
140 |
141 | // Start the watchers
142 | errc := make(chan error, 2)
143 | c1tasks := make(chan metafora.Task)
144 | c2tasks := make(chan metafora.Task)
145 | go func() {
146 | errc <- coordinator1.Watch(c1tasks)
147 | }()
148 | go func() {
149 | errc <- coordinator2.Watch(c2tasks)
150 | }()
151 |
152 | // Submit the tasks
153 | for _, tid := range testTasks {
154 | err := mclient.SubmitTask(DefaultTaskFunc(tid, ""))
155 | if err != nil {
156 | t.Fatalf("Error submitting task=%q to metafora via the client. Error:\n%v", tid, err)
157 | }
158 | }
159 |
160 | //XXX This assumes tasks are sent by watchers in the order they were
161 | 	// submitted to etcd which, while /possible/ to guarantee, isn't a guarantee
162 | // we're interested in making.
163 | // We only want to guarantee that exactly one coordinator can claim a task.
164 | c1t := <-c1tasks
165 | c2t := <-c2tasks
166 | if c1t.ID() != c2t.ID() {
167 | t.Logf("Watchers didn't receive the same task %s != %s. It's fine; watch order isn't guaranteed", c1t, c2t)
168 | }
169 |
170 | // Make sure c1 can claim and c2 cannot
171 | if ok := coordinator1.Claim(c1t); !ok {
172 | t.Fatalf("coordinator1.Claim() unable to claim the task=%q", c1t)
173 | }
174 | if ok := coordinator2.Claim(c1t); ok {
175 | t.Fatalf("coordinator2.Claim() succeeded for task=%q when it shouldn't have!", c2t)
176 | }
177 |
178 | // Make sure coordinators close down properly and quickly
179 | coordinator1.Close()
180 | if err := <-errc; err != nil {
181 | t.Errorf("Error shutting down coordinator1: %v", err)
182 | }
183 | coordinator2.Close()
184 | if err := <-errc; err != nil {
185 | t.Errorf("Error shutting down coordinator2: %v", err)
186 | }
187 | }
188 |
189 | // Submit a task before any coordinators are active. Then start a coordinator to
190 | // ensure the task is picked up by the new coordinator.
191 | //
192 | // Then call coordinator.Release() on the task to make sure a coordinator picks it
193 | // up again.
194 | func TestCoordinatorTC4(t *testing.T) {
195 | t.Parallel()
196 | etcdv3c, coordinator1, conf1 := setupEtcd(t)
197 |
198 | task := "testtask4"
199 |
200 | mclient := NewClient(conf1.Namespace, etcdv3c)
201 |
202 | if err := mclient.SubmitTask(DefaultTaskFunc(task, "")); err != nil {
203 | t.Fatalf("Error submitting a task to metafora via the client. Error:\n%v", err)
204 | }
205 |
206 | // Don't start up the coordinator until after the metafora client has submitted work.
207 | if err := coordinator1.Init(newCtx(t, "coordinator1")); err != nil {
208 | 		t.Fatalf("Unexpected error initializing coordinator: %v", err)
209 | }
210 | defer coordinator1.Close()
211 |
212 | errc := make(chan error)
213 | c1tasks := make(chan metafora.Task)
214 | go func() {
215 | errc <- coordinator1.Watch(c1tasks)
216 | }()
217 |
218 | tid := <-c1tasks
219 |
220 | if ok := coordinator1.Claim(tid); !ok {
221 | t.Fatal("coordinator1.Claim() unable to claim the task")
222 | }
223 |
224 | 	// Start up a second coordinator
225 | conf2 := conf1.Copy()
226 | conf2.Name = "node2"
227 | coordinator2 := NewEtcdV3Coordinator(conf2, etcdv3c)
228 | if err := coordinator2.Init(newCtx(t, "coordinator2")); err != nil {
229 | 		t.Fatalf("Unexpected error initializing coordinator: %v", err)
230 | }
231 | defer coordinator2.Close()
232 |
233 | c2tasks := make(chan metafora.Task)
234 | go func() {
235 | errc <- coordinator2.Watch(c2tasks)
236 | }()
237 |
238 | // coordinator 2 shouldn't see anything yet
239 | select {
240 | case <-c2tasks:
241 | t.Fatal("coordinator2.Watch() returned a task when there are none to claim!")
242 | case <-time.After(100 * time.Millisecond):
243 | }
244 |
245 | // Now release the task from coordinator1 and claim it with coordinator2
246 | coordinator1.Release(tid)
247 | tid = <-c2tasks
248 | if ok := coordinator2.Claim(tid); !ok {
249 | 		t.Fatalf("coordinator2.Claim() should have succeeded on released task=%q", tid)
250 | }
251 |
252 | coordinator1.Close()
253 | coordinator2.Close()
254 | for i := 0; i < 2; i++ {
255 | if err := <-errc; err != nil {
256 | t.Errorf("coordinator returned an error after closing: %v", err)
257 | }
258 | }
259 | }
260 |
261 | // TestNodeCleanup ensures the coordinator properly cleans up its node entry
262 | // upon exit.
263 | func TestNodeCleanup(t *testing.T) {
264 | t.Parallel()
265 | etcdv3c, c1, conf1 := setupEtcd(t)
266 | if err := c1.Init(newCtx(t, "coordinator1")); err != nil {
267 | 		t.Fatalf("Unexpected error initializing coordinator: %v", err)
268 | }
269 | conf2 := conf1.Copy()
270 | conf2.Name = "node2"
271 | c2 := NewEtcdV3Coordinator(conf2, etcdv3c)
272 | kvc := etcdv3.NewKV(etcdv3c)
273 | if err := c2.Init(newCtx(t, "coordinator2")); err != nil {
274 | 		t.Fatalf("Unexpected error initializing coordinator: %v", err)
275 | }
276 | defer c1.Close()
277 | defer c2.Close()
278 |
279 | // Make sure node directories were created
280 | c1nodep := path.Join(conf1.Namespace, NodesPath, conf1.Name, MetadataPath)
281 | c := context.Background()
282 | resp, err := kvc.Get(c, c1nodep)
283 | if err != nil {
284 | t.Fatalf("Error retrieving node key from etcd: %v", err)
285 | }
286 | if resp.Count == 0 {
287 | 		t.Error(c1nodep + " node key was not created!")
288 | }
289 |
290 | c2nodep := path.Join(conf2.Namespace, NodesPath, conf2.Name, MetadataPath)
291 | resp, err = kvc.Get(c, c2nodep)
292 | if err != nil {
293 | t.Fatalf("Error retrieving node key from etcd: %v", err)
294 | }
295 | if resp.Count == 0 {
296 | 		t.Error(c2nodep + " node key was not created!")
297 | }
298 |
299 | // Shutdown one and make sure its node directory is gone
300 | c1.Close()
301 |
302 | resp, err = kvc.Get(c, c1nodep)
303 | if err != nil {
304 | t.Errorf("Unexpected error %T retrieving node key from etcd: %v", err, err)
305 | }
306 | if resp.Count != 0 {
307 | 		t.Errorf("Expected node key to be removed, but it still exists!")
308 | }
309 |
310 | // Make sure c2 is untouched
311 | resp, err = kvc.Get(c, c2nodep)
312 | if err != nil {
313 | t.Fatalf("Error retrieving node key from etcd: %v", err)
314 | }
315 | if resp.Count == 0 {
316 | 		t.Error(c2nodep + " node key was not created!")
317 | }
318 | }
319 |
320 | // TestExpiration ensures that expired claims get reclaimed properly.
321 | func TestExpiration(t *testing.T) {
322 | t.Parallel()
323 | etcdv3c, coord, conf := setupEtcd(t)
324 | kvc := etcdv3.NewKV(etcdv3c)
325 | claims := make(chan int, 10)
326 | hf := metafora.HandlerFunc(metafora.SimpleHandler(func(_ metafora.Task, stop <-chan bool) bool {
327 | claims <- 1
328 | <-stop
329 | return true
330 | }))
331 | consumer, err := metafora.NewConsumer(coord, hf, metafora.DumbBalancer)
332 | if err != nil {
333 | t.Fatalf("Error creating consumer: %+v", err)
334 | }
335 |
336 | _, err = kvc.Put(context.Background(), path.Join(conf.Namespace, TasksPath, "abc", OwnerPath), `{"node":"--"}`)
337 | if err != nil {
338 | t.Fatalf("Error creating fake claim: %v", err)
339 | }
340 | _, err = kvc.Put(context.Background(), path.Join(conf.Namespace, TasksPath, "abc"), "")
341 | if err != nil {
342 | t.Fatalf("Error creating fake task: %v", err)
343 | }
344 | _, err = kvc.Delete(context.Background(), path.Join(conf.Namespace, TasksPath, "abc", OwnerPath))
345 | if err != nil {
346 | t.Fatalf("Error deleting fake claim: %v", err)
347 | }
348 |
349 | defer consumer.Shutdown()
350 | go consumer.Run()
351 |
352 | // Wait for claim to expire and coordinator to pick up task
353 | select {
354 | case <-claims:
355 | // Task claimed!
356 | case <-time.After(5 * time.Second):
357 | t.Fatal("Task not claimed long after it should have been.")
358 | }
359 |
360 | tasks := consumer.Tasks()
361 | if len(tasks) != 1 {
362 | t.Fatalf("Expected 1 task to be claimed but found: %v", tasks)
363 | }
364 | }
365 |
--------------------------------------------------------------------------------
/metcdv3/doc.go:
--------------------------------------------------------------------------------
1 | // Package metcdv3 contains implementations of all Metafora interfaces using
2 | // etcd as the broker/backing store.
3 | //
4 | // See https://github.com/lytics/metafora/Documentation/etcdv3.md for details.
5 | package metcdv3
6 |
--------------------------------------------------------------------------------
/metcdv3/helpers_test.go:
--------------------------------------------------------------------------------
1 | package metcdv3
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "log"
7 | "os"
8 | "sync/atomic"
9 | "testing"
10 |
11 | "github.com/lytics/metafora"
12 | "github.com/lytics/metafora/metcdv3/testutil"
13 |
14 | etcdv3 "go.etcd.io/etcd/client/v3"
15 | )
16 |
17 | func init() {
18 | metafora.SetLogger(log.New(os.Stderr, "", log.Lmicroseconds|log.Lshortfile))
19 | //metafora.SetLogLevel(metafora.LogLevelDebug)
20 | }
21 |
22 | var testcounter uint64
23 |
24 | // setupEtcd should be used for all etcd integration tests. It handles the following tasks:
25 | // - Create and return an etcd client
26 | // - Create and return an initial etcd coordinator
27 | // - Clearing the test namespace in etcd
28 | func setupEtcd(t *testing.T) (*etcdv3.Client, *EtcdV3Coordinator, *Config) {
29 | c := context.Background()
30 | client := testutil.NewEtcdV3Client(t)
31 | kvc := etcdv3.NewKV(client)
32 | n := atomic.AddUint64(&testcounter, 1)
33 | ns := fmt.Sprintf("/metaforatests-%d", n)
34 | _, err := kvc.Delete(c, ns, etcdv3.WithPrefix())
35 | if err != nil {
36 | t.Errorf("failed to clean up namespace in etcd")
37 | }
38 | conf := NewConfig("testclient", ns)
39 | coord := NewEtcdV3Coordinator(conf, client)
40 | return client, coord, conf
41 | }
42 |
43 | type testLogger struct {
44 | prefix string
45 | *testing.T
46 | }
47 |
48 | func (l testLogger) Log(lvl int, m string, v ...interface{}) {
49 | l.T.Logf("%s:[%d] %s", l.prefix, lvl, fmt.Sprintf(m, v...))
50 | }
51 |
52 | type testCoordCtx struct {
53 | testLogger
54 | lost chan string
55 | }
56 |
57 | func newCtx(t *testing.T, prefix string) *testCoordCtx {
58 | return &testCoordCtx{
59 | testLogger: testLogger{prefix: prefix, T: t},
60 | lost: make(chan string, 10),
61 | }
62 | }
63 |
64 | func (t *testCoordCtx) Lost(task metafora.Task) {
65 | t.Log(4, "Lost(%s)", task.ID())
66 | t.lost <- task.ID()
67 | }
68 |
--------------------------------------------------------------------------------
/metcdv3/integration_test.go:
--------------------------------------------------------------------------------
1 | package metcdv3_test
2 |
3 | import (
4 | "context"
5 | "errors"
6 | "testing"
7 | "time"
8 |
9 | "github.com/lytics/metafora"
10 | "github.com/lytics/metafora/metcdv3"
11 | "github.com/lytics/metafora/metcdv3/testutil"
12 | "github.com/lytics/metafora/statemachine"
13 | etcdv3 "go.etcd.io/etcd/client/v3"
14 | )
15 |
16 | // TestSleepTest is an integration test of sleeping tasks using the metcdv3 components.
17 | func TestSleepTest(t *testing.T) {
18 | etcdv3c := testutil.NewEtcdV3Client(t)
19 | kvc := etcdv3.NewKV(etcdv3c)
20 | t.Parallel()
21 | const namespace = "/sleeptest-metafora"
22 | const sleepingtasks = "sleeping-task1"
23 |
24 | _, _ = kvc.Delete(context.Background(), namespace, etcdv3.WithPrefix())
25 |
26 | holdtask := make(chan bool)
27 | h := func(task metafora.Task, cmds <-chan *statemachine.Message) *statemachine.Message {
28 |
29 | if task.ID() == sleepingtasks {
30 | sleeptil := 5 * time.Second
31 | 			nextstarttime := time.Now().Add(sleeptil)
32 | 			t.Logf("sleeping task:%v until:%v", task, nextstarttime)
33 | <-holdtask
34 | return statemachine.SleepMessage(nextstarttime)
35 | }
36 |
37 | cmd := <-cmds
38 | t.Logf("non sleeping task:%v", task)
39 |
40 | return cmd
41 | }
42 |
43 | newC := func(name, ns string) *metafora.Consumer {
44 | conf := metcdv3.NewConfig(name, ns)
45 | coord, hf, bal := metcdv3.New(conf, etcdv3c, h)
46 | cons, err := metafora.NewConsumer(coord, hf, bal)
47 | if err != nil {
48 | t.Fatalf("Error creating consumer %s:%s: %v", ns, name, err)
49 | }
50 | go func() {
51 | cons.Run()
52 | t.Logf("Consumer:%s exited.", name)
53 | }()
54 | return cons
55 | }
56 |
57 | assertRunning := func(tid string, cons ...*metafora.Consumer) {
58 | found := false
59 | for _, c := range cons {
60 | tasks := c.Tasks()
61 | if len(tasks) > 0 && found {
62 | 				t.Fatal("Task found running on more than one consumer")
63 | }
64 | if len(tasks) > 1 {
65 | t.Fatalf("Expected at most 1 task, but found: %d", len(tasks))
66 | }
67 | if len(tasks) == 1 && tasks[0].Task().ID() == tid {
68 | found = true
69 | }
70 | }
71 | if !found {
72 | t.Fatalf("Could not find task=%q", tid)
73 | }
74 | }
75 |
76 | // Start 2 consumers
77 | cons1 := newC("node1", namespace)
78 | cons2 := newC("node2", namespace)
79 |
80 | // Create clients and start some tests
81 | cliA := metcdv3.NewClient(namespace, etcdv3c)
82 |
83 | if err := cliA.SubmitTask(metcdv3.DefaultTaskFunc(sleepingtasks, "")); err != nil {
84 | t.Fatalf("Error submitting task1 to a: %v", err)
85 | }
86 |
87 | // Give consumers a bit to pick up tasks
88 | time.Sleep(500 * time.Millisecond)
89 |
90 | assertRunning(sleepingtasks, cons1, cons2)
91 |
92 | holdtask <- true
93 | // Give consumers a bit to pick up tasks
94 | time.Sleep(500 * time.Millisecond)
95 |
96 | assertRunning(sleepingtasks, cons1, cons2) // not sure if this should be true or false.
97 |
98 | wait1 := make(chan bool)
99 | go func() {
100 | defer close(wait1)
101 | // Shutdown
102 | cons1.Shutdown()
103 | cons2.Shutdown()
104 | }()
105 |
106 | timeout := time.NewTimer(5 * time.Second)
107 | select {
108 | case <-wait1:
109 | case <-timeout.C:
110 | t.Fatalf("failed waiting for shutdown")
111 | }
112 |
113 | // make sure all tasks are released
114 | for _, c := range []*metafora.Consumer{cons1, cons2} {
115 | tasks := c.Tasks()
116 | for _, work := range tasks {
117 | t.Fatalf("work id %v is still running", work)
118 | }
119 | }
120 | }
121 |
122 | // TestAll is an integration test for all of the metcdv3 package's components.
123 | //
124 | // While huge integration tests like this are rarely desirable as they can be
125 | // overly fragile and complex, I found myself manually repeating the tests I've
126 | // automated here over and over. This is far more reliable than expecting
127 | // developers to do ad hoc testing of all of the metcdv3 package's features.
128 | func TestAll(t *testing.T) {
129 | etcdv3c := testutil.NewEtcdV3Client(t)
130 | kvc := etcdv3.NewKV(etcdv3c)
131 | t.Parallel()
132 |
133 | c := context.Background()
134 | _, _ = kvc.Delete(c, "/test-a", etcdv3.WithPrefix())
135 | _, _ = kvc.Delete(c, "/test-b", etcdv3.WithPrefix())
136 |
137 | h := func(task metafora.Task, cmds <-chan *statemachine.Message) *statemachine.Message {
138 | cmd := <-cmds
139 | if task.ID() == "error-test" {
140 | return statemachine.ErrorMessage(errors.New("error-test"))
141 | }
142 | return cmd
143 | }
144 |
145 | newC := func(name, ns string) *metafora.Consumer {
146 | conf := metcdv3.NewConfig(name, ns)
147 | conf.Name = name
148 | coord, hf, bal := metcdv3.New(conf, etcdv3c, h)
149 | cons, err := metafora.NewConsumer(coord, hf, bal)
150 | if err != nil {
151 | t.Fatalf("Error creating consumer %s:%s: %v", ns, name, err)
152 | }
153 | go cons.Run()
154 | return cons
155 | }
156 | // Start 4 consumers, 2 per namespace
157 | cons1a := newC("node1", "/test-a")
158 | cons2a := newC("node2", "/test-a")
159 | cons1b := newC("node1", "/test-b")
160 | cons2b := newC("node2", "/test-b")
161 |
162 | // Create clients and start some tests
163 | cliA := metcdv3.NewClient("/test-a", etcdv3c)
164 | cliB := metcdv3.NewClient("/test-b", etcdv3c)
165 |
166 | if err := cliA.SubmitTask(metcdv3.DefaultTaskFunc("task1", "")); err != nil {
167 | t.Fatalf("Error submitting task1 to a: %v", err)
168 | }
169 | if err := cliB.SubmitTask(metcdv3.DefaultTaskFunc("task1", "")); err != nil {
170 | t.Fatalf("Error submitting task1 to b: %v", err)
171 | }
172 |
173 | // Give consumers a bit to pick up tasks
174 | time.Sleep(500 * time.Millisecond)
175 |
176 | assertRunning := func(tid string, cons ...*metafora.Consumer) {
177 | found := false
178 | for _, c := range cons {
179 | tasks := c.Tasks()
180 | if len(tasks) > 0 && found {
181 | 				t.Fatal("Task found running on more than one consumer")
182 | }
183 | if len(tasks) > 1 {
184 | t.Fatalf("Expected at most 1 task, but found: %d", len(tasks))
185 | }
186 | if len(tasks) == 1 && tasks[0].Task().ID() == tid {
187 | found = true
188 | }
189 | }
190 | if !found {
191 | t.Fatalf("Could not find task=%q", tid)
192 | }
193 | }
194 |
195 | assertRunning("task1", cons1a, cons2a)
196 | assertRunning("task1", cons1b, cons2b)
197 |
198 | // Kill task1 in A
199 | {
200 | cmdr := metcdv3.NewCommander("/test-a", etcdv3c)
201 | if err := cmdr.Send("task1", statemachine.KillMessage()); err != nil {
202 | t.Fatalf("Error sending kill to task1: %v", err)
203 | }
204 | time.Sleep(1000 * time.Millisecond)
205 |
206 | for _, c := range []*metafora.Consumer{cons1a, cons2a} {
207 | tasks := c.Tasks()
208 | if len(tasks) != 0 {
209 | t.Fatalf("Expected no tasks but found: %d", len(tasks))
210 | }
211 | }
212 | }
213 |
214 | // Submit a bunch of tasks to A
215 | {
216 | tasks := []string{"task2", "task3", "task4", "task5", "task6", "task7"}
217 | for _, tid := range tasks {
218 | if err := cliA.SubmitTask(metcdv3.DefaultTaskFunc(tid, "")); err != nil {
219 | t.Fatalf("Error submitting task=%q to A: %v", tid, err)
220 | }
221 | }
222 |
223 | // Give them time to start
224 | time.Sleep(800 * time.Millisecond)
225 |
226 | // Ensure they're balanced
227 | if err := cliA.SubmitCommand("node1", metafora.CommandBalance()); err != nil {
228 | t.Fatalf("Error submitting balance command to cons1a: %v", err)
229 | }
230 | time.Sleep(800 * time.Millisecond)
231 | if err := cliA.SubmitCommand("node2", metafora.CommandBalance()); err != nil {
232 | t.Fatalf("Error submitting balance command to cons1a: %v", err)
233 | }
234 |
235 | a1tasks := cons1a.Tasks()
236 | a2tasks := cons2a.Tasks()
237 | for _, task := range a1tasks {
238 | metafora.Debug("A1: ", task.Task(), " - ", task.Stopped().IsZero())
239 | }
240 | for _, task := range a2tasks {
241 | metafora.Debug("A2: ", task.Task(), " - ", task.Stopped().IsZero())
242 | }
243 | time.Sleep(800 * time.Millisecond)
244 |
245 | a1tasks = cons1a.Tasks()
246 | a2tasks = cons2a.Tasks()
247 | if len(a1tasks) < 2 || len(a1tasks) > 4 || len(a2tasks) < 2 || len(a2tasks) > 4 {
248 | t.Fatalf("Namespace A isn't fairly balanced: node1: %d; node2: %d", len(a1tasks), len(a2tasks))
249 | }
250 |
251 | // Shutting down a consumer should migrate all tasks to the other
252 | cons1a.Shutdown()
253 | time.Sleep(800 * time.Millisecond)
254 |
255 | a2tasks = cons2a.Tasks()
256 | if len(a2tasks) != len(tasks) {
257 | t.Fatalf("Consumer 2a should have received all %d tasks but only has %d.", len(tasks), len(a2tasks))
258 | }
259 | }
260 |
261 | // Use Namespace B to check Error state handling
262 | {
263 | tasks := []string{"task8", "error-test"}
264 | for _, tid := range tasks {
265 | if err := cliB.SubmitTask(metcdv3.DefaultTaskFunc(tid, "")); err != nil {
266 | t.Fatalf("Error submitting task=%q to B: %v", tid, err)
267 | }
268 | }
269 |
270 | // Give them time to start
271 | time.Sleep(time.Second)
272 |
273 | n := len(cons1b.Tasks()) + len(cons2b.Tasks())
274 | if n != 3 {
275 | t.Fatalf("Expected B to be running 3 tasks but found %d", n)
276 | }
277 |
278 | // Resuming error-test 8*2 times should cause it to be failed
279 | cmdr := metcdv3.NewCommander("/test-b", etcdv3c)
280 | for i := 0; i < statemachine.DefaultErrMax*2; i++ {
281 | if err := cmdr.Send("error-test", statemachine.RunMessage()); err != nil {
282 | t.Fatalf("Unexpected error resuming error-test in B: %v", err)
283 | }
284 | time.Sleep(500 * time.Millisecond)
285 | }
286 |
287 | n = len(cons1b.Tasks()) + len(cons2b.Tasks())
288 | if n != 2 {
289 | t.Fatalf("Expected B to be running 2 tasks but found %d", n)
290 | }
291 |
292 | // Resubmitting a failed task shouldn't error but also shouldn't run.
293 | if err := cliB.SubmitTask(metcdv3.DefaultTaskFunc("error-test", "")); err != nil {
294 | t.Fatalf("Error resubmitting error-test task to B: %v", err)
295 | }
296 |
297 | // Give the statemachine a moment to load the initial state and exit
298 | time.Sleep(time.Second)
299 |
300 | n = len(cons1b.Tasks()) + len(cons2b.Tasks())
301 | if n != 2 {
302 | t.Fatalf("Expected B to be running 2 tasks but found %d", n)
303 | }
304 | }
305 |
306 | // Shutdown
307 | cons2a.Shutdown()
308 | cons1b.Shutdown()
309 | cons2b.Shutdown()
310 | }
311 |
312 | // TestTaskResurrectionInt ensures that a Claim won't recreate a task that had
313 | // been deleted (marked as done). taskmgr has a non-integration version of this
314 | // test.
315 | func TestTaskResurrectionInt(t *testing.T) {
316 | etcdv3c := testutil.NewEtcdV3Client(t)
317 | kvc := etcdv3.NewKV(etcdv3c)
318 | c := context.Background()
319 | t.Parallel()
320 |
321 | _, _ = kvc.Delete(c, "/test-resurrect", etcdv3.WithPrefix())
322 |
323 | task := metcdv3.DefaultTaskFunc("xyz", "")
324 |
325 | conf := metcdv3.NewConfig("testclient", "/test-resurrect")
326 | coord := metcdv3.NewEtcdV3Coordinator(conf, etcdv3c)
327 | if err := coord.Init(nil); err != nil {
328 | t.Fatalf("Error initializing coordinator: %v", err)
329 | }
330 | defer coord.Close()
331 |
332 | // Try to claim a nonexistent
333 | if claimed := coord.Claim(task); claimed {
334 | t.Fatal("Claiming a nonexistent task should not work but did!")
335 | }
336 |
337 | // Create a task, mark it as done, and try to claim it again
338 | client := metcdv3.NewClient("/test-resurrect", etcdv3c)
339 | if err := client.SubmitTask(metcdv3.DefaultTaskFunc("xyz", "")); err != nil {
340 | t.Fatalf("Error submitting task xyz: %v", err)
341 | }
342 |
343 | if claimed := coord.Claim(task); !claimed {
344 | t.Fatal("Failed to claim task xyz")
345 | }
346 |
347 | coord.Done(task)
348 |
349 | if claimed := coord.Claim(task); claimed {
350 | t.Fatal("Reclaimed task that was marked as done.")
351 | }
352 | }
353 |
--------------------------------------------------------------------------------
/metcdv3/statestore.go:
--------------------------------------------------------------------------------
1 | package metcdv3
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "path"
7 |
8 | "github.com/lytics/metafora"
9 | "github.com/lytics/metafora/statemachine"
10 | etcdv3 "go.etcd.io/etcd/client/v3"
11 | )
12 |
13 | const statePath = "state"
14 |
15 | // stateStore is an etcd implementation of statemachine.StateStore.
16 | type stateStore struct {
17 | etcdv3c *etcdv3.Client
18 | kvc etcdv3.KV
19 | path string
20 | }
21 |
22 | // NewStateStore returns a StateStore implementation that persists task states
23 | // in etcd.
24 | func NewStateStore(namespace string, etcdv3c *etcdv3.Client) statemachine.StateStore {
25 | return &stateStore{
26 | etcdv3c: etcdv3c,
27 | kvc: etcdv3.NewKV(etcdv3c),
28 | path: path.Join("/", namespace, statePath),
29 | }
30 | }
31 |
32 | // Load retrieves the given task's state from etcd or stores and returns
33 | // Runnable if no state exists.
34 | func (s *stateStore) Load(task metafora.Task) (*statemachine.State, error) {
35 | resp, err := s.kvc.Get(context.Background(), path.Join(s.path, task.ID()), etcdv3.WithLimit(1))
36 | if err != nil {
37 | return nil, err
38 |
39 | }
40 |
41 | if resp.Count == 0 {
42 | metafora.Infof("task=%q has no existing state, default to Runnable", task.ID())
43 | state := &statemachine.State{Code: statemachine.Runnable}
44 | if err := s.Store(task, state); err != nil {
45 | return nil, err
46 | }
47 | return state, nil
48 | }
49 |
50 | // Unmarshal state from key
51 | state := &statemachine.State{}
52 | if err := json.Unmarshal([]byte(resp.Kvs[0].Value), state); err != nil {
53 | return nil, err
54 | }
55 | return state, nil
56 | }
57 |
58 | // Store taskID's state in etcd overwriting any prior state.
59 | func (s *stateStore) Store(task metafora.Task, state *statemachine.State) error {
60 | buf, err := json.Marshal(state)
61 | if err != nil {
62 | return err
63 | }
64 |
65 | _, err = s.kvc.Put(context.Background(), path.Join(s.path, task.ID()), string(buf))
66 | return err
67 | }
68 |
--------------------------------------------------------------------------------
/metcdv3/task.go:
--------------------------------------------------------------------------------
1 | package metcdv3
2 |
3 | import "github.com/lytics/metafora"
4 |
5 | type task struct {
6 | id string
7 | }
8 |
9 | func (t *task) ID() string { return t.id }
10 |
11 | // TaskFunc creates a Task interface from a task ID and the properties value
12 | // stored at the task's key in etcd.
13 | //
14 | // Implementations must support value being an empty string.
15 | //
16 | // If nil is returned the task is ignored.
17 | type TaskFunc func(id, value string) metafora.Task
18 |
19 | // DefaultTaskFunc is the default new task function used by the EtcdV3Coordinator
20 | // and does not attempt to process the properties value.
21 | func DefaultTaskFunc(id, _ string) metafora.Task { return &task{id: id} }
22 |
--------------------------------------------------------------------------------
/metcdv3/task_test.go:
--------------------------------------------------------------------------------
1 | package metcdv3_test
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "fmt"
7 | "testing"
8 | "time"
9 |
10 | "github.com/lytics/metafora"
11 | "github.com/lytics/metafora/metcdv3"
12 | "github.com/lytics/metafora/metcdv3/testutil"
13 | "github.com/lytics/metafora/statemachine"
14 | etcdv3 "go.etcd.io/etcd/client/v3"
15 | )
16 |
17 | // exTask is an extended Task type to demonstrate using an alternative NewTask
18 | // TaskFunc.
19 | type exTask struct {
20 | id string
21 | SubmittedT *time.Time `json:"_submitted"`
22 | UserID string `json:"UserID"`
23 | }
24 |
25 | func (t *exTask) ID() string { return t.id }
26 | func (t *exTask) Submitted() *time.Time { return t.SubmittedT }
27 | func (t *exTask) String() string {
28 | if t.SubmittedT == nil {
29 | return t.id
30 | }
31 | return fmt.Sprintf("%s submitted %s", t.id, t.SubmittedT)
32 | }
33 |
34 | func TestAltTask(t *testing.T) {
35 | etcdv3c := testutil.NewEtcdV3Client(t)
36 | kvc := etcdv3.NewKV(etcdv3c)
37 | c := context.Background()
38 | t.Parallel()
39 | const namespace = "/alttask-metafora"
40 | _, _ = kvc.Delete(c, namespace, etcdv3.WithPrefix())
41 |
42 | conf := metcdv3.NewConfig("testclient", namespace)
43 |
44 | // Sample overridden NewTask func
45 | conf.NewTaskFunc = func(id, props string) metafora.Task {
46 | task := exTask{id: id}
47 | if err := json.Unmarshal([]byte(props), &task); err != nil {
48 | metafora.Warnf("%s properties could not be unmarshalled: %v", id, err)
49 | }
50 | return &task
51 | }
52 |
53 | // Create a handler that returns results through a chan for synchronization
54 | results := make(chan string, 1)
55 |
56 | h := func(task metafora.Task, _ <-chan *statemachine.Message) *statemachine.Message {
57 | alttask, ok := task.(*exTask)
58 | if !ok {
59 | results <- fmt.Sprintf("%q is of type %T", task.ID(), task)
60 | return statemachine.PauseMessage()
61 | }
62 | if alttask.UserID == "" {
63 | results <- "missing UserID"
64 | return statemachine.PauseMessage()
65 | }
66 | results <- "ok"
67 | return statemachine.PauseMessage()
68 | }
69 |
70 | coord, hf, bal := metcdv3.New(conf, etcdv3c, h)
71 | consumer, err := metafora.NewConsumer(coord, hf, bal)
72 | if err != nil {
73 | t.Fatal(err)
74 | }
75 | go consumer.Run()
76 | defer consumer.Shutdown()
77 |
78 | cli := metcdv3.NewClient(namespace, etcdv3c)
79 | if err := cli.SubmitTask(&exTask{id: "test1", UserID: "test2"}); err != nil {
80 | t.Fatal(err)
81 | }
82 |
83 | result := <-results
84 | if result != "ok" {
85 | t.Fatal(result)
86 | }
87 | }
88 |
--------------------------------------------------------------------------------
/metcdv3/testutil/testutil.go:
--------------------------------------------------------------------------------
1 | // Package testutil is a collection of utilities for use by Metafora's etcd
2 | // tests. Since tests are spread across the metcdv3 and metcdv3_test packages
3 | // utilities must be in a shared location.
4 | //
5 | // Unless you're making changes to the metcdv3 package you don't need to use
6 | // this.
7 | package testutil
8 |
9 | import (
10 | "os"
11 | "strings"
12 | "time"
13 |
14 | etcdv3 "go.etcd.io/etcd/client/v3"
15 | )
16 |
17 | // TestCase just defines the subset of *testing.T methods needed to avoid
18 | // pulling in the testing package.
19 | type TestCase interface {
20 | Skip(args ...interface{})
21 | Fatalf(format string, args ...interface{})
22 | }
23 |
24 | // NewEtcdV3Client creates a new etcd v3 client for use by the metafora tests.
25 | func NewEtcdV3Client(t TestCase) *etcdv3.Client {
26 | if os.Getenv("ETCDTESTS") == "" {
27 | t.Skip("ETCDTESTS unset. Skipping etcd tests.")
28 | }
29 |
30 | // This is the same ENV variable that etcdctl uses for peers.
31 | peerAddrs := os.Getenv("ETCD_PEERS")
32 | if peerAddrs == "" {
33 | peerAddrs = "127.0.0.1:2379"
34 | }
35 |
36 | peers := strings.Split(peerAddrs, ",")
37 | cli, err := etcdv3.New(etcdv3.Config{
38 | Endpoints: peers,
39 | DialTimeout: 5 * time.Second,
40 | })
41 | if err != nil {
42 | t.Fatalf("failed to create etcdv3 client: %v", err)
43 | }
44 | //defer cli.Close()
45 | return cli
46 | }
47 |
--------------------------------------------------------------------------------
/resreporter/mem_linux.go:
--------------------------------------------------------------------------------
1 | package resreporter
2 |
3 | import (
4 | "bufio"
5 | "fmt"
6 | "os"
7 |
8 | "github.com/lytics/metafora"
9 | )
10 |
11 | const meminfo = "/proc/meminfo"
12 |
13 | var Memory = memory{}
14 |
15 | type memory struct{}
16 |
17 | func (memory) Used() (used uint64, total uint64) {
18 | fd, err := os.Open(meminfo)
19 | if err != nil {
20 | metafora.Errorf("Error reading free memory via "+meminfo+": %v", err)
21 |
22 | // Effectively disable the balancer since an error happened
23 | return 0, 100
24 | }
25 | defer fd.Close()
26 |
27 | s := bufio.NewScanner(fd)
28 | foundFree, foundCache, foundBuf := false, false, false
29 | var cache uint64
30 | var buffered uint64
31 | var free uint64
32 | for s.Scan() {
33 | if total > 0 && foundFree && foundCache && foundBuf {
34 | break
35 | }
36 | if total == 0 {
37 | if n, _ := fmt.Sscanf(s.Text(), "MemTotal:%d", &total); n == 1 {
38 | continue
39 | }
40 | }
41 | 		if !foundFree {
42 | 			if n, _ := fmt.Sscanf(s.Text(), "MemFree:%d", &free); n == 1 {
43 | 				foundFree = true
44 | 			}
45 | 		}
46 | if !foundCache {
47 | if n, _ := fmt.Sscanf(s.Text(), "Cached:%d", &cache); n == 1 {
48 | foundCache = true
49 | continue
50 | }
51 | }
52 | if !foundBuf {
53 | if n, _ := fmt.Sscanf(s.Text(), "Buffers:%d", &buffered); n == 1 {
54 | foundBuf = true
55 | continue
56 | }
57 | }
58 | }
59 | if err := s.Err(); err != nil {
60 | metafora.Errorf("Error reading free memory via "+meminfo+": %v", err)
61 |
62 | // Effectively disable the balancer since an error happened
63 | return 0, 100
64 | }
65 |
66 | return total - (free + buffered + cache), total
67 | }
68 |
69 | func (memory) String() string { return "kB" }
70 |
--------------------------------------------------------------------------------
/resreporter/mem_linux_test.go:
--------------------------------------------------------------------------------
1 | package resreporter_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/lytics/metafora/resreporter"
7 | )
8 |
9 | func TestMemReporter(t *testing.T) {
10 | used, total := resreporter.Memory.Used()
11 | 	t.Logf("Used: %d %s (%d MB)", used, resreporter.Memory, used/1024)
12 | 	t.Logf("Total: %d %s (%d MB)", total, resreporter.Memory, total/1024)
13 | if used == 0 && total == 100 {
14 | t.Fatal("Memory reporter failed!")
15 | }
16 | if used > total {
17 | t.Fatal("More memory used than available?!")
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/scripts/docker_run_etcd.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export RunningEtcdDockers=$(sudo docker ps -a | grep metafora-etcd- | awk '{print $1}')
3 | if [[ -n $RunningEtcdDockers ]]; then
4 | echo stopping existing etcd metafora docker containers
5 | echo --------------------------------------------------------------------------------
6 | echo sudo docker stop ${RunningEtcdDockers}
7 | sudo docker stop ${RunningEtcdDockers}
8 | echo
9 |
10 |
11 | echo removing existing etcd docker containers
12 | echo --------------------------------------------------------------------------------
13 | sudo docker rm ${RunningEtcdDockers}
14 | echo
15 | fi
16 |
17 | if [[ $1 = "-stop" ]]; then
18 | echo "-stop specified; not starting new containers"
19 | exit 0
20 | fi
21 |
22 | echo starting new etcd metafora docker containers
23 | echo --------------------------------------------------------------------------------
24 | sudo docker run -d --name="metafora-etcd-a" --net=host coreos/etcd \
25 | -peer-addr 127.0.0.1:8001 -peer-bind-addr 127.0.0.1:8001 -addr 127.0.0.1:5001 -bind-addr 127.0.0.1:5001 -name metafora-a
26 | sudo docker run -d --name="metafora-etcd-b" --net=host coreos/etcd \
27 | -peer-addr 127.0.0.1:8002 -peer-bind-addr 127.0.0.1:8002 -addr 127.0.0.1:5002 -bind-addr 127.0.0.1:5002 -name metafora-b -peers 127.0.0.1:8001,127.0.0.1:8002,127.0.0.1:8003
28 | sudo docker run -d --name="metafora-etcd-c" --net=host coreos/etcd \
29 | -peer-addr 127.0.0.1:8003 -peer-bind-addr 127.0.0.1:8003 -addr 127.0.0.1:5003 -bind-addr 127.0.0.1:5003 -name metafora-c -peers 127.0.0.1:8001,127.0.0.1:8002,127.0.0.1:8003
30 | echo
31 |
32 | echo list of running metafora docker containers
33 | echo --------------------------------------------------------------------------------
34 | sudo docker ps | head -n 1
35 | sudo docker ps | grep metafora-etcd-
36 |
--------------------------------------------------------------------------------
/slowtask_test.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | import (
4 | "testing"
5 | "time"
6 | )
7 |
8 | type releaseAllBalancer struct {
9 | balances chan int
10 | ctx BalancerContext
11 | }
12 |
13 | func (b *releaseAllBalancer) Init(c BalancerContext) {
14 | b.ctx = c
15 | b.balances = make(chan int)
16 | }
17 | func (b *releaseAllBalancer) CanClaim(Task) (time.Time, bool) { return NoDelay, true }
18 | func (b *releaseAllBalancer) Balance() []string {
19 | b.balances <- 1
20 | ids := []string{}
21 | for _, task := range b.ctx.Tasks() {
22 | ids = append(ids, task.Task().ID())
23 | }
24 | return ids
25 | }
26 |
27 | func TestDoubleRelease(t *testing.T) {
28 | t.Parallel()
29 |
30 | started := make(chan int)
31 | reallyStop := make(chan bool)
32 | h := SimpleHandler(func(task Task, stop <-chan bool) bool {
33 | started <- 1
34 | 		t.Logf("TestDoubleRelease handler received %s - blocking until reallyStop is closed.", task)
35 | <-reallyStop
36 | return true
37 | })
38 |
39 | tc := NewTestCoord()
40 |
41 | b := &releaseAllBalancer{}
42 | c, err := NewConsumer(tc, h, b)
43 | if err != nil {
44 | t.Fatalf("Error creating consumer: %v", err)
45 | }
46 | go c.Run()
47 |
48 | // This won't exit when told to
49 | tc.Tasks <- testTask{"1"}
50 | <-started
51 |
52 | // Make sure balancing/mainloop isn't blocked
53 | tc.Commands <- CommandBalance()
54 | <-b.balances
55 | tc.Commands <- CommandBalance()
56 | <-b.balances
57 | tc.Commands <- CommandBalance()
58 | <-b.balances
59 |
60 | shutdownComplete := make(chan bool)
61 | go func() {
62 | c.Shutdown()
63 | close(shutdownComplete)
64 | }()
65 |
66 | // Make sure the release insidiously blocks until we close reallyStop
67 | select {
68 | case <-shutdownComplete:
69 | t.Fatal("Shutdown completed when it should have blocked indefinitely")
70 | case <-time.After(100 * time.Millisecond):
71 | }
72 |
73 | // Close reallyStop and make sure Shutdown actually exits
74 | close(reallyStop)
75 | // Make sure the release insidiously blocks until we close reallyStop
76 | <-shutdownComplete
77 | }
78 |
--------------------------------------------------------------------------------
/statemachine/README.md:
--------------------------------------------------------------------------------
1 | # Metafora Finite State Machine
2 |
3 | The `statemachine` package provides a featureful state machine for use by
4 | Metafora task handlers.
5 |
6 | ## Features
7 |
8 | * Static state machine; no custom states or messages (transitions)
9 | * Per task state machine; task may intercept commands
10 | * Flexible state store (see `StateStore` interface)
11 | * Flexible command sending/receiving (see `Commander`, `CommandListener`, or
12 | [the etcd implementation](../m_etcd/commander.go)).
13 | * Flexible error handling with builtin retry logic (see
14 | [`errors.go`](errors.go)).
15 | * States: Runnable, Paused, Sleeping, Fault, Completed, Failed, Killed
16 | * Commands/Messages: Run, Pause, Sleep, Release, Error, Kill, Complete, Checkpoint
17 | * Tasks in a terminal state are unscheduled and will take no cluster resources.
18 |
19 | ## Control Flow
20 |
21 | 1. Coordinator receives a claimable task from a Watch
22 | 2. Consumer calls `Balancer.CanClaim(task)`
23 | 3. If claimable, Consumer calls `Coordinator.Claim(task)` to claim it.
24 | 4. If claim was successful, Consumer starts the task handler which is created
25 | by `statemachine.New(...)`.
26 | 5. State machine loads initial state via `StateStore.Load(task)`.
27 | 6. If the task is `Runnable` hand over control to the `StatefulHandler`
28 | implementation provided by the user.
29 | 7. Run until task returns a `Message` either due to completion, an error, or a
30 | received command.
31 |
32 | There are quite a few moving parts that are hooked together (a minimal wiring sketch follows this list):
33 |
34 | * The Consumer needs a `Coordinator`, `Balancer`, and `HandlerFunc` like
35 | normal, but you should use `statemachine.New(...)` to create the `Handler`
36 | returned by your `HandlerFunc`.
37 | * The state machine requires a `StateStore` and `CommandListener`. The `m_etcd`
38 |   package includes an etcd implementation of `CommandListener` (as well as
39 | `Commander` for sending commands), but no default `StateStore` is provided.
40 | * Your task handling code must be implemented in a function (or method) that
41 | fulfills the `StatefulHandler` signature. When your handler receives a
42 |   command it should return it (or override it with a new `Message`) so the
43 |   state machine can handle the state transition.
44 |
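As a rough sketch of how these pieces fit together (the `StateStore` and
`CommandListener` values are assumed to come from your broker package or your
own implementations; passing `nil` for the `ErrHandler` selects
`DefaultErrHandler`):

```go
package example

import (
	"github.com/lytics/metafora"
	"github.com/lytics/metafora/statemachine"
)

// newHandlerFunc wires a StatefulHandler into the state machine. stateStore
// and newListener are assumed to be supplied by the caller (for example an
// etcd-backed StateStore and a per-task CommandListener factory).
func newHandlerFunc(
	body statemachine.StatefulHandler,
	stateStore statemachine.StateStore,
	newListener func(metafora.Task) statemachine.CommandListener,
) metafora.HandlerFunc {
	return func(task metafora.Task) metafora.Handler {
		// A nil ErrHandler falls back to statemachine.DefaultErrHandler.
		return statemachine.New(task, body, stateStore, newListener(task), nil)
	}
}
```

The returned `HandlerFunc` is passed to `metafora.NewConsumer` along with your
`Coordinator` and `Balancer`, exactly as with non-stateful handlers.
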
45 | ## States
46 |
47 | State | Description
48 | ------|------------
49 | Runnable | Task is runnable and control is passed to the task handler.
50 | Paused | Task is paused until a command is received.
51 | Sleeping | Task is paused until a specified time (or a command is received).
52 | Fault | An error occurred and a custom error handler is invoked.
53 | Completed | **Terminal** Task returned the `Complete` message because it finished successfully.
54 | Failed | **Terminal** The error handler executed during the Fault state determined the task has failed permanently.
55 | Killed | **Terminal** Task received a `Kill` message.
56 |
57 | **Terminal** states are final. The task is removed from the broker and will never be scheduled to run again.
58 |
59 | ## Messages
60 |
61 | AKA Events or Commands
62 |
63 | Messages cause transitions between states.
64 |
65 | Message | Description
66 | --------|------------
67 | Run | Causes a `Paused` or `Sleeping` task to transition to `Runnable` and begin executing.
68 | Pause | Causes a `Runnable` or `Sleeping` task to transition to `Paused`.
69 | Sleep | Requires an `Until time.Time` to be set. Causes non-terminal states to pause until the time is reached.
70 | Error | Requires an `Err error` to be set. Usually returned by tasks to transition to `Fault` state.
71 | Release | *See below*
72 | Checkpoint | *See below*
73 | Kill | Causes a non-terminal state to transition to `Killed`.
74 | Complete | Should only be returned by tasks. Causes a `Runnable` state to transition to `Completed`.
75 |
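Commands are delivered to running tasks through a `Commander` implementation.
As a short sketch using the etcd v3 commander exercised in the integration
tests (the namespace and task ID are placeholder values):

```go
package example

import (
	"log"

	"github.com/lytics/metafora/metcdv3"
	"github.com/lytics/metafora/statemachine"
	etcdv3 "go.etcd.io/etcd/client/v3"
)

// pauseTask sends a Pause command to a task in the given namespace.
func pauseTask(etcdc *etcdv3.Client, namespace, taskID string) {
	cmdr := metcdv3.NewCommander(namespace, etcdc)
	if err := cmdr.Send(taskID, statemachine.PauseMessage()); err != nil {
		log.Printf("error sending pause to task=%q: %v", taskID, err)
	}
}
```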
76 |
77 | ### Release
78 |
79 | Release is a special message that does *not* transition between states. Instead
80 | the task handler exits and the Coordinator's claim on the task is released.
81 |
82 | Metafora's `Handler.Stop()` method sends the `Release` command to a running
83 | task to request it exit. It's most often used when cleanly restarting Metafora
84 | nodes.
85 |
86 | ### Checkpoint
87 |
88 | Checkpoint is a special message that - like `Release` - does *not* transition
89 | between states. It is meant to be a signal to tasks to persist any internal
90 | state and optionally exit so the state machine can persist the task's state.
91 |
92 | Since a `Checkpoint` is a noop in the state machine a task may decide to
93 | intercept the message and *not* return.
94 |
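A minimal sketch of such a handler (`saveProgress` is an assumed helper
standing in for whatever persistence your task performs):

```go
package example

import (
	"github.com/lytics/metafora"
	"github.com/lytics/metafora/statemachine"
)

// saveProgress stands in for task-specific persistence.
func saveProgress(task metafora.Task) { /* ... */ }

// handle intercepts Checkpoint and keeps running; any other command is
// returned so the state machine can perform the transition.
func handle(task metafora.Task, cmds <-chan *statemachine.Message) *statemachine.Message {
	for {
		cmd := <-cmds
		if cmd.Code == statemachine.Checkpoint {
			saveProgress(task)
			continue
		}
		return cmd
	}
}
```
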
--------------------------------------------------------------------------------
/statemachine/commander.go:
--------------------------------------------------------------------------------
1 | package statemachine
2 |
3 | type CommandListener interface {
4 | Receive() <-chan *Message
5 | Stop()
6 | }
7 |
8 | type Commander interface {
9 | Send(taskID string, m *Message) error
10 | }
11 |
--------------------------------------------------------------------------------
/statemachine/doc.go:
--------------------------------------------------------------------------------
1 | // Package statemachine provides a featureful state machine implementation for Metafora
2 | // handlers to use. It is implemented as a Handler wrapper which provides a
3 | // channel of incoming commands to wrapped handlers. Internal handlers are
4 | // expected to shutdown cleanly and exit upon receiving a command from the
5 | // state machine. The state machine will handle the state transition and
6 | // restart the internal handler if necessary.
7 | //
8 | // Users must provide a StateStore implementation for persisting task state and
9 | // a CommandListener implementation for receiving commands. See the m_etcd or
10 | // embedded packages for example Command Listener implementations.
11 | //
12 | // See the README in this package for details.
13 | package statemachine
14 |
--------------------------------------------------------------------------------
/statemachine/errors.go:
--------------------------------------------------------------------------------
1 | package statemachine
2 |
3 | import (
4 | "errors"
5 | "time"
6 |
7 | "github.com/lytics/metafora"
8 | )
9 |
10 | // ExceededErrorRate is returned by error handlers in an Error Message when
11 | // retry logic has been exhausted for a handler and it should transition to
12 | // Failed.
13 | var ExceededErrorRate = errors.New("exceeded error rate")
14 |
15 | // Err represents an error that occurred while a stateful handler was running.
16 | //
17 | // NewErr was added to allow callers to construct an instance from an underlying error.
18 | // The underlying error is preserved so that it can be recovered using errors.As.
19 | // This is useful for custom error handlers that wish to inspect underlying error types
20 | // and make decisions accordingly.
21 | type Err struct {
22 | Time time.Time `json:"timestamp"`
23 | Err string `json:"error"`
24 | baseErr error
25 | }
26 |
27 | // NewErr constructs an Err from an underlying error e.
28 | func NewErr(e error, t time.Time) Err {
29 | return Err{Err: e.Error(), Time: t, baseErr: e}
30 | }
31 |
32 | // Error implements the Error interface.
33 | func (e Err) Error() string {
34 | return e.Err
35 | }
36 |
37 | // Unwrap returns baseErr.
38 | func (e Err) Unwrap() error {
39 | return e.baseErr
40 | }
41 |
42 | // ErrHandler functions should return Run, Sleep, or Fail messages depending on
43 | // the rate of errors.
44 | //
45 | // The ErrHandler and/or StateStore should trim the error slice to keep it
46 | // from growing without bound.
47 | type ErrHandler func(task metafora.Task, errs []Err) (*Message, []Err)
48 |
49 | const (
50 | DefaultErrLifetime = -4 * time.Hour
51 | DefaultErrMax = 8
52 | )
53 |
54 | // DefaultErrHandler returns an Error message (which transitions the task to
55 | // Failed) if 8 errors have occurred within the last 4 hours. Otherwise it
56 | // returns a Sleep message to pause the task for 10 minutes before trying again.
57 | func DefaultErrHandler(_ metafora.Task, errs []Err) (*Message, []Err) {
58 | recent := time.Now().Add(DefaultErrLifetime)
59 | strikes := 0
60 | for _, err := range errs {
61 | if err.Time.After(recent) {
62 | strikes++
63 | }
64 | }
65 |
66 | if len(errs) > DefaultErrMax {
67 | errs = errs[len(errs)-DefaultErrMax:]
68 | }
69 |
70 | if strikes >= DefaultErrMax {
71 | // Return a new error to transition to Failed as well as the original
72 | // errors to store what caused this failure.
73 | return ErrorMessage(ExceededErrorRate), errs
74 | }
75 | return SleepMessage(time.Now().Add(10 * time.Minute)), errs
76 | }
77 |
--------------------------------------------------------------------------------
/statemachine/errors_test.go:
--------------------------------------------------------------------------------
1 | package statemachine_test
2 |
3 | import (
4 | "errors"
5 | "testing"
6 | "time"
7 |
8 | . "github.com/lytics/metafora/statemachine"
9 | "github.com/stretchr/testify/assert"
10 | "github.com/stretchr/testify/require"
11 | )
12 |
13 | type task string
14 |
15 | func (t task) ID() string { return string(t) }
16 |
17 | func TestDefaultErrHandler(t *testing.T) {
18 | t.Parallel()
19 | tid := ""
20 |
21 | errs := []Err{{Time: time.Now()}}
22 |
23 | {
24 | msg, errs := DefaultErrHandler(task(tid), errs)
25 | if len(errs) != 1 {
26 | t.Fatalf("Expected 1 err, found: %d", len(errs))
27 | }
28 | if msg.Code != Sleep || msg.Until == nil || msg.Until.Before(time.Now().Add(9*time.Minute)) {
29 | t.Fatalf("Expected sleep until +10m state but found: %s", msg)
30 | }
31 | }
32 |
33 | // Push error list over limit
34 | for i := 0; i < DefaultErrMax+1; i++ {
35 | errs = append(errs, Err{Time: time.Now()})
36 | }
37 |
38 | {
39 | msg, errs := DefaultErrHandler(task(tid), errs)
40 | if len(errs) > DefaultErrMax {
41 | t.Fatalf("Expected %d errors but received: %d", DefaultErrMax, len(errs))
42 | }
43 | if msg.Code != Error || msg.Err != ExceededErrorRate {
44 | 			t.Fatalf("Expected error handler to permanently fail but received: %s", msg)
45 | }
46 | }
47 | }
48 |
49 | type errType1 struct{ error }
50 | type errType2 struct{ error }
51 |
52 | func TestErr(t *testing.T) {
53 | err := errType1{errors.New("some underlying error")}
54 | se := NewErr(err, time.Now())
55 |
56 | // confirm se implements the error interface
57 | require.Implements(t, (*error)(nil), se)
58 |
59 | // confirm we can only convert se to an error of the same underlying type
60 | assert.True(t, errors.As(se, new(errType1)))
61 | assert.False(t, errors.As(se, new(errType2)))
62 |
63 | // make sure we don't panic if someone uses it the old way and baseErr is nil
64 | se = Err{Time: time.Now(), Err: "something bad"}
65 | assert.Equal(t, "something bad", se.Error())
66 | assert.False(t, errors.As(se, new(errType1)))
67 |
68 | // confirm we can check for a specific instance of baseErr too
69 | e1 := errType1{errors.New("target instance")}
70 | e2 := errType1{errors.New("different instance")}
71 | se = NewErr(e1, time.Now())
72 | assert.True(t, errors.Is(se, e1))
73 | assert.False(t, errors.Is(se, e2))
74 | }
75 |
--------------------------------------------------------------------------------
/statemachine/run_test.go:
--------------------------------------------------------------------------------
1 | package statemachine
2 |
3 | import (
4 | "testing"
5 | "time"
6 |
7 | "github.com/lytics/metafora"
8 | )
9 |
10 | type task string
11 |
12 | func (t task) ID() string { return string(t) }
13 |
14 | // TestCommandBlackhole is meant to demonstrate what happens if a
15 | // StatefulHandler implementation receives commands in a goroutine that lives
16 | // past the StatefulHandler func exiting. This is a very easy bug to write, so
17 | // defensive code was added to prevent the leaked goroutine from "stealing"
18 | // commands meant for other states (Paused or Sleeping being the two states
19 | // that absolutely need to accept commands).
20 | //
21 | // This test breaking isn't necessarily the sign of a bug. It may just mean
22 | // we've decided to remove the defensive code protecting against such errors in
23 | // which case this test should be removed as well.
24 | func TestCommandBlackhole(t *testing.T) {
25 | t.Parallel()
26 | stop := make(chan bool)
27 | rdy := make(chan int, 1)
28 | defer close(stop)
29 |
30 | f := func(_ metafora.Task, c <-chan *Message) *Message {
31 | go func() {
32 | rdy <- 1
33 | select {
34 | case <-c:
35 | t.Log("Intercepted!")
36 | case <-stop:
37 | return
38 | }
39 | }()
40 | return nil
41 | }
42 | cmds := make(chan *Message)
43 |
44 | // Ignore the return message, the point is to make sure it doesn't intercept
45 | // further commands.
46 | run(f, task("test-task"), cmds)
47 | <-rdy
48 |
49 | go func() { cmds <- RunMessage() }()
50 |
51 | select {
52 | case <-cmds:
53 | // Yay! command wasn't intercepted by leaked goroutine!
54 | case <-time.After(time.Second):
55 | t.Fatalf("Command was intercepted by leaked goroutine.")
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/statemachine/statemachine.go:
--------------------------------------------------------------------------------
1 | package statemachine
2 |
3 | import (
4 | "errors"
5 | "fmt"
6 | "runtime"
7 | "strings"
8 | "sync"
9 | "time"
10 |
11 | "github.com/lytics/metafora"
12 | )
13 |
14 | var (
15 | MissingUntilError = errors.New("sleeping state missing deadline")
16 | MissingErrorsError = errors.New("fault state has no errors")
17 | ReleasableError = errors.New("network error, release and retry")
18 | )
19 |
20 | // StateCode is the actual state key. The State struct adds additional metadata
21 | // related to certain StateCodes.
22 | type StateCode string
23 |
24 | const (
25 | Runnable StateCode = "runnable" // Scheduled
26 | Sleeping StateCode = "sleeping" // Scheduled, not running until time has elapsed
27 | Completed StateCode = "completed" // Terminal, not scheduled
28 | Killed StateCode = "killed" // Terminal, not scheduled
29 | Failed StateCode = "failed" // Terminal, not scheduled
30 | Fault StateCode = "fault" // Scheduled, in error handling / retry logic
31 | Paused StateCode = "paused" // Scheduled, not running
32 | )
33 |
34 | // Terminal states will never run and cannot transition to a non-terminal
35 | // state.
36 | func (s StateCode) Terminal() bool {
37 | switch s {
38 | case Runnable, Sleeping, Paused, Fault:
39 | return false
40 | case Completed, Killed, Failed:
41 | return true
42 | default:
43 | metafora.Error("unknown state: ", s)
44 | return false
45 | }
46 | }
47 |
48 | func (s StateCode) String() string { return string(s) }
49 |
50 | // State represents the current state of a stateful handler. See StateCode for
51 | // details. Until and Errors are extra state used by the Sleeping and Fault
52 | // states respectively.
53 | type State struct {
54 | Code StateCode `json:"state"`
55 | Until *time.Time `json:"until,omitempty"`
56 | Errors []Err `json:"errors,omitempty"`
57 | }
58 |
59 | // copy state so mutations to Until and Errors aren't shared.
60 | func (s *State) copy() *State {
61 | ns := &State{Code: s.Code}
62 | if s.Until != nil {
63 | until := *s.Until
64 | ns.Until = &until
65 | }
66 | ns.Errors = append(ns.Errors, s.Errors...)
67 | return ns
68 | }
69 |
70 | func (s *State) String() string {
71 | switch s.Code {
72 | case Sleeping:
73 | return fmt.Sprintf("%s until %s", s.Code, s.Until)
74 | case Fault:
75 | return fmt.Sprintf("%s (%d errors)", s.Code, len(s.Errors))
76 | default:
77 | return string(s.Code)
78 | }
79 | }
80 |
81 | func (s *State) Valid() error {
82 | switch s.Code {
83 | case Completed, Failed, Killed, Paused, Runnable:
84 | case Sleeping:
85 | if s.Until == nil {
86 | return MissingUntilError
87 | }
88 | case Fault:
89 | if len(s.Errors) == 0 {
90 | return MissingErrorsError
91 | }
92 | default:
93 | return fmt.Errorf("unknown state: %q", s.Code)
94 | }
95 | return nil
96 | }
97 |
98 | // Messages are events that cause state transitions. Until and Err are used by
99 | // the Sleep and Error messages respectively.
100 | type Message struct {
101 | Code MessageCode `json:"message"`
102 |
103 | // Until is when the statemachine should transition from sleeping to runnable
104 | Until *time.Time `json:"until,omitempty"`
105 |
106 | // Err is the error that caused this Error message
107 | Err error `json:"error,omitempty"`
108 | }
109 |
110 | // ErrorMessage is a simpler helper for creating error messages from an error.
111 | func ErrorMessage(err error) *Message {
112 | return &Message{Code: Error, Err: err}
113 | }
114 |
115 | // SleepMessage is a simpler helper for creating sleep messages from a time.
116 | func SleepMessage(t time.Time) *Message {
117 | return &Message{Code: Sleep, Until: &t}
118 | }
119 |
120 | func RunMessage() *Message { return &Message{Code: Run} }
121 | func PauseMessage() *Message { return &Message{Code: Pause} }
122 | func KillMessage() *Message { return &Message{Code: Kill} }
123 | func CheckpointMessage() *Message { return &Message{Code: Checkpoint} }
124 | func ReleaseMessage() *Message { return &Message{Code: Release} }
125 | func CompleteMessage() *Message { return &Message{Code: Complete} }
126 |
127 | // Valid returns true if the Message is valid. Invalid messages sent as
128 | // commands are discarded by the state machine.
129 | func (m *Message) Valid() bool {
130 | switch m.Code {
131 | case Run, Pause, Release, Checkpoint, Complete, Kill:
132 | return true
133 | case Sleep:
134 | return m.Until != nil
135 | case Error:
136 | return m.Err != nil
137 | default:
138 | return false
139 | }
140 | }
141 |
142 | func (m *Message) String() string {
143 | switch m.Code {
144 | case Sleep:
145 | if m.Until != nil {
146 | return fmt.Sprintf("%s until %s", m.Code, m.Until)
147 | }
148 | case Error:
149 | if m.Err != nil {
150 | return fmt.Sprintf("%s: %s", m.Code, m.Err.Error())
151 | }
152 | }
153 | return string(m.Code)
154 | }
155 |
156 | // MessageCode is the symbolic name of a state transition.
157 | type MessageCode string
158 |
159 | func (m MessageCode) String() string { return string(m) }
160 |
161 | const (
162 | Run MessageCode = "run"
163 | Sleep MessageCode = "sleep"
164 | Pause MessageCode = "pause"
165 | Kill MessageCode = "kill"
166 | Error MessageCode = "error"
167 | Complete MessageCode = "complete"
168 | Checkpoint MessageCode = "checkpoint"
169 |
170 | // Special event which triggers state machine to exit without transitioning
171 | // between states.
172 | Release MessageCode = "release"
173 | )
174 |
175 | // Transition represents a state machine transition from one state to another
176 | // given an event message.
177 | type Transition struct {
178 | Event MessageCode
179 | From StateCode
180 | To StateCode
181 | }
182 |
183 | func (t Transition) String() string {
184 | return fmt.Sprintf("%v---%v--->%v", t.From, t.Event, t.To)
185 | }
186 |
187 | var (
188 | // Rules is the state transition table.
189 | Rules = [...]Transition{
190 | // Runnable can transition to anything
191 | {Event: Checkpoint, From: Runnable, To: Runnable},
192 | {Event: Release, From: Runnable, To: Runnable},
193 | {Event: Sleep, From: Runnable, To: Sleeping},
194 | {Event: Complete, From: Runnable, To: Completed},
195 | {Event: Kill, From: Runnable, To: Killed},
196 | {Event: Error, From: Runnable, To: Fault},
197 | {Event: Pause, From: Runnable, To: Paused},
198 | {Event: Run, From: Runnable, To: Runnable},
199 |
200 | // Sleeping can return to Runnable or be Killed/Paused
201 | {Event: Checkpoint, From: Sleeping, To: Sleeping},
202 | {Event: Release, From: Sleeping, To: Sleeping},
203 | {Event: Sleep, From: Sleeping, To: Sleeping},
204 | {Event: Run, From: Sleeping, To: Runnable},
205 | {Event: Kill, From: Sleeping, To: Killed},
206 | {Event: Pause, From: Sleeping, To: Paused},
207 | {Event: Error, From: Sleeping, To: Fault},
208 |
209 | // The error state transitions to either sleeping, failed, or released (to
210 | 		// allow custom error handlers to work around locality-related errors).
211 | {Event: Sleep, From: Fault, To: Sleeping},
212 | {Event: Error, From: Fault, To: Failed},
213 |
214 | // Paused can return to Runnable, be put to Sleep, or Killed
215 | {Event: Checkpoint, From: Paused, To: Paused},
216 | {Event: Release, From: Paused, To: Paused},
217 | {Event: Run, From: Paused, To: Runnable},
218 | {Event: Sleep, From: Paused, To: Sleeping},
219 | {Event: Kill, From: Paused, To: Killed},
220 | {Event: Pause, From: Paused, To: Paused},
221 |
222 | // Completed, Failed, and Killed are terminal states that cannot transition
223 | // to anything.
224 | }
225 | )
226 |
227 | // StatefulHandler is the function signature that the state machine is able to
228 | // run. Instead of metafora.Handler's Stop method, StatefulHandlers receive
229 | // Messages via the commands chan and return their exit status via a Message.
230 | //
231 | // Normally StatefulHandlers simply return a Message as soon as it's received
232 | // on the commands chan. However, it's also acceptable for a handler to return
233 | // a different Message. For example, if it encounters an error during shutdown,
234 | // it may choose to return that error as an Error Message as opposed to the
235 | // original command.
236 | type StatefulHandler func(task metafora.Task, commands <-chan *Message) *Message
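// Illustrative sketch (not part of the package): a minimal StatefulHandler
// that does a unit of work per tick and exits as soon as a command arrives,
// returning the command unchanged so the state machine applies it. doWork is
// a hypothetical application function.
//
//	func exampleHandler(task metafora.Task, cmds <-chan *Message) *Message {
//		ticker := time.NewTicker(time.Second)
//		defer ticker.Stop()
//		for {
//			select {
//			case cmd := <-cmds:
//				// Let the state machine transition (Pause, Release, Kill, etc.).
//				return cmd
//			case <-ticker.C:
//				if done := doWork(task); done {
//					return &Message{Code: Complete}
//				}
//			}
//		}
//	}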
237 |
238 | type stateMachine struct {
239 | task metafora.Task
240 | h StatefulHandler
241 | ss StateStore
242 | cl CommandListener
243 | cmds chan *Message
244 | errHandler ErrHandler
245 |
246 | mu *sync.RWMutex
247 | state *State
248 | ts time.Time
249 |
250 | stopL *sync.Mutex
251 | stopped chan bool
252 | }
253 |
254 | // New returns a Handler that wraps the given StatefulHandler in a state
255 | // machine, delivering state transition Messages via the commands chan. It
256 | // should be created in the HandlerFunc you use with metafora's Consumer.
257 | //
258 | // If ErrHandler is nil DefaultErrHandler will be used.
259 | func New(task metafora.Task, h StatefulHandler, ss StateStore, cl CommandListener, e ErrHandler) metafora.Handler {
260 | if e == nil {
261 | e = DefaultErrHandler
262 | }
263 | return &stateMachine{
264 | task: task,
265 | h: h,
266 | ss: ss,
267 | cl: cl,
268 | errHandler: e,
269 | mu: &sync.RWMutex{},
270 | ts: time.Now(),
271 | stopL: &sync.Mutex{},
272 | stopped: make(chan bool),
273 | }
274 | }
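// Usage sketch (hedged, not part of the package): New is typically called
// from the handler factory given to metafora's Consumer, assumed here to have
// the shape func(metafora.Task) metafora.Handler. myHandler, myStore, and
// myCommander are application-provided values; a per-task CommandListener is
// obtained from a Commander, as the embedded package does in the tests below.
//
//	hf := func(task metafora.Task) metafora.Handler {
//		return New(task, myHandler, myStore, myCommander.NewListener(task.ID()), nil)
//	}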
275 |
276 | // State returns the current state the state machine is in and what time it
277 | // entered that state. The State may be nil if Run() has yet to be called.
278 | func (s *stateMachine) State() (*State, time.Time) {
279 | s.mu.RLock()
280 | defer s.mu.RUnlock()
281 | return s.state, s.ts
282 | }
283 |
284 | func (s *stateMachine) setState(state *State) {
285 | s.mu.Lock()
286 | s.state = state.copy()
287 | s.ts = time.Now()
288 | s.mu.Unlock()
289 | }
290 |
291 | // Run the state machine enabled handler. Loads the initial state, passes
292 | // control to the internal stateful handler, and forwards commands from the
293 | // command listener into the handler's commands chan.
294 | func (s *stateMachine) Run() (done bool) {
295 | // Multiplex external (Stop) messages and internal ones
296 | s.cmds = make(chan *Message)
297 | go func() {
298 | for {
299 | select {
300 | case m := <-s.cl.Receive():
301 | if !m.Valid() {
302 | metafora.Warnf("Ignoring invalid command: %q", m)
303 | continue
304 | }
305 | select {
306 | case s.cmds <- m:
307 | case <-s.stopped:
308 | return
309 | }
310 | case <-s.stopped:
311 | return
312 | }
313 | }
314 | }()
315 |
316 | // Stop the command listener and internal message multiplexer when Run exits
317 | defer func() {
318 | s.cl.Stop()
319 | s.stop()
320 | }()
321 |
322 | tid := s.task.ID()
323 |
324 | // Load the initial state
325 | state, err := s.ss.Load(s.task)
326 | if err == ReleasableError {
327 | 		// A failure to load was reported by the StateStore, but it considers the
328 | 		// failure retriable. In most cases this is a transient problem such as a
329 | 		// network partition, a communication error, or too many open file handles.
330 | 		metafora.Errorf("task=%q could not load initial state but the task is retriable!", tid)
331 | 		time.Sleep(time.Second) // delay releasing the task so other nodes don't cause a thundering herd retrying it.
332 | return false
333 | } else if err != nil {
334 | // A failure to load the state for a task is *fatal* - the task will be
335 | // unscheduled and requires operator intervention to reschedule.
336 | metafora.Errorf("task=%q could not load initial state. Marking done! Error: %v", tid, err)
337 | return true
338 | }
339 | if state == nil {
340 | // Note to StateStore implementors: This should not happen! Either state or
341 | // err must be non-nil. This code is simply to prevent a nil pointer panic.
342 | metafora.Errorf("statestore %T returned nil state and err for task=%q - unscheduling", s.ss, tid)
343 | return true
344 | }
345 | if state.Code.Terminal() {
346 | metafora.Warnf("task=%q in terminal state %s - exiting.", tid, state.Code)
347 | return true
348 | }
349 |
350 | s.setState(state) // for introspection/debugging
351 |
352 | // Main Statemachine Loop
353 | done = false
354 | for {
355 | // Enter State
356 | metafora.Debugf("task=%q in state %s", tid, state.Code)
357 | msg := s.exec(state)
358 |
359 | // Apply Message
360 | newstate, ok := apply(state, msg)
361 | if !ok {
362 | metafora.Warnf("task=%q Invalid state transition=%q returned by task. Old state=%q msg.Err=%s", tid, msg.Code, state.Code, msg.Err)
363 | msg = ErrorMessage(msg.Err)
364 | if newstate, ok = apply(state, msg); !ok {
365 | metafora.Errorf("task=%q Unable to transition to error state! Exiting with state=%q", tid, state.Code)
366 | return state.Code.Terminal()
367 | }
368 | }
369 |
370 | metafora.Infof("task=%q transitioning %s --> %s --> %s", tid, state, msg, newstate)
371 |
372 | 		// Persist state unless this is a Release that changed neither the state code nor the errors
373 | if msg.Code != Release || (msg.Code == Release && (state.Code != newstate.Code || len(state.Errors) != len(newstate.Errors))) {
374 | if err := s.ss.Store(s.task, newstate); err != nil {
375 | 				// After upgrading to 1.25.5-gke.2000 we started seeing the metadata server throw POD_FINDER_IP_MISMATCH
376 | 				// errors, resulting in failures authenticating to spanner. This panic causes the pod to cycle.
377 | 				// See https://github.com/lytics/lio/issues/30414
378 | if strings.Contains(err.Error(), "spanner: code = \"Unauthenticated\"") {
379 | metafora.Errorf("task=%q Unable to persist state=%q due to failure to authenticate to spanner.", tid, newstate.Code)
380 | panic(err)
381 | }
382 |
383 | metafora.Errorf("task=%q Unable to persist state=%q. Continuing.", tid, newstate.Code)
384 | return true
385 | }
386 | }
387 |
388 | // Set next state and loop if non-terminal
389 | state = newstate
390 |
391 | // Expose the state for introspection
392 | s.setState(state)
393 |
394 | // Exit and unschedule task on terminal state.
395 | if state.Code.Terminal() {
396 | return true
397 | }
398 |
399 | // Release messages indicate the task should exit but not unschedule.
400 | if msg.Code == Release {
401 | return false
402 | }
403 |
404 | // Alternatively Stop() may have been called but the handler may not have
405 | // returned the Release message. Always exit if we've been told to Stop()
406 | // even if the handler has returned a different Message.
407 | select {
408 | case <-s.stopped:
409 | return false
410 | default:
411 | }
412 | }
413 | }
414 |
415 | // execute non-terminal states
416 | func (s *stateMachine) exec(state *State) *Message {
417 | switch state.Code {
418 | case Runnable:
419 | // Runnable passes control to the stateful handler
420 | return run(s.h, s.task, s.cmds)
421 | case Paused:
422 | // Paused until a message arrives
423 | return <-s.cmds
424 | case Sleeping:
425 | // Sleeping until the specified time (or a message)
426 | if state.Until == nil {
427 | metafora.Warnf("task=%q told to sleep without a time. Resuming.", s.task.ID())
428 | return RunMessage()
429 | }
430 | dur := time.Until(*state.Until)
431 | metafora.Infof("task=%q sleeping for %s", s.task.ID(), dur)
432 | timer := time.NewTimer(dur)
433 | select {
434 | case <-timer.C:
435 | return RunMessage()
436 | case msg := <-s.cmds:
437 | timer.Stop()
438 | // Checkpoint & Release are special cases that shouldn't affect sleep
439 | // time, so maintain it across the state transition
440 | if msg.Code == Checkpoint || msg.Code == Release {
441 | msg.Until = state.Until
442 | }
443 | return msg
444 | }
445 | case Fault:
446 | // Special case where we potentially trim the current state to keep
447 | // errors from growing without bound.
448 | var msg *Message
449 | msg, state.Errors = s.errHandler(s.task, state.Errors)
450 | return msg
451 | default:
452 | panic("invalid state: " + state.String())
453 | }
454 | }
455 |
456 | func run(f StatefulHandler, task metafora.Task, cmd <-chan *Message) (m *Message) {
457 | defer func() {
458 | if r := recover(); r != nil {
459 | stackBuf := make([]byte, 6000)
460 | stackBufLen := runtime.Stack(stackBuf, false)
461 | stackTraceStr := string(stackBuf[0:stackBufLen])
462 | metafora.Errorf("task=%q Run method panic()d! Applying Error message. Panic: %v\nStack: %s", task.ID(), r, stackTraceStr)
463 | m = &Message{Code: Error, Err: fmt.Errorf("panic: %v\nstack: %s", r, stackTraceStr)}
464 | }
465 | }()
466 |
467 | 	// Defensive code to give handlers their own command chan, forwarded from
468 | 	// the real one. That way if a handler keeps receiving on its command chan
469 | 	// in a goroutine past the handler's lifetime it doesn't intercept commands
470 | 	// intended for the statemachine.
471 | internalcmd := make(chan *Message)
472 | stopped := make(chan struct{})
473 | go func() {
474 | for {
475 | select {
476 | case c := <-cmd:
477 | internalcmd <- c
478 | case <-stopped:
479 | return
480 | }
481 | }
482 | }()
483 | defer close(stopped)
484 |
485 | return f(task, internalcmd)
486 | }
487 |
488 | // Stop sends a Release message to the state machine through the command chan.
489 | func (s *stateMachine) Stop() {
490 | select {
491 | case s.cmds <- ReleaseMessage():
492 | 		// Also inform the state machine it should exit since the internal handler
493 | 		// may override the release message, making the task unreleasable.
494 | s.stop()
495 | case <-s.stopped:
496 | // Already stopped!
497 | }
498 | }
499 |
500 | func (s *stateMachine) stop() {
501 | s.stopL.Lock()
502 | defer s.stopL.Unlock()
503 | select {
504 | case <-s.stopped:
505 | return
506 | default:
507 | close(s.stopped)
508 | }
509 | }
510 |
511 | // apply a message to cause a state transition. Returns false if the state
512 | // transition is invalid.
513 | func apply(cur *State, m *Message) (*State, bool) {
514 | //XXX Is a linear scan of all rules really the best option here?
515 | for _, trans := range Rules {
516 | if trans.Event == m.Code && trans.From == cur.Code {
517 | metafora.Debugf("Transitioned %s", trans)
518 | if m.Err != nil {
519 | // Append errors from message
520 | cur.Errors = append(cur.Errors, NewErr(m.Err, time.Now()))
521 | }
522 |
523 | // New State + Message's Until + Combined Errors
524 | return &State{Code: trans.To, Until: m.Until, Errors: cur.Errors}, true
525 | }
526 | }
527 | return cur, false
528 | }
529 |
--------------------------------------------------------------------------------
/statemachine/statemachine_test.go:
--------------------------------------------------------------------------------
1 | package statemachine_test
2 |
3 | import (
4 | "errors"
5 | "testing"
6 | "time"
7 |
8 | "github.com/lytics/metafora"
9 | "github.com/lytics/metafora/embedded"
10 | . "github.com/lytics/metafora/statemachine"
11 | )
12 |
13 | func testhandler(task metafora.Task, cmds <-chan *Message) *Message {
14 | metafora.Debugf("Starting %s", task.ID())
15 | m := <-cmds
16 | metafora.Debugf("%s recvd %s", task.ID(), m.Code)
17 | return m
18 | }
19 |
20 | type testStore struct {
21 | initial *State
22 | out chan<- *State
23 | }
24 |
25 | func (s testStore) Load(metafora.Task) (*State, error) {
26 | s.out <- s.initial
27 | return s.initial, nil
28 | }
29 | func (s testStore) Store(task metafora.Task, newstate *State) error {
30 | metafora.Debugf("%s storing %s", task.ID(), newstate.Code)
31 | s.out <- newstate
32 | return nil
33 | }
34 |
35 | // setup a task with the specified task ID in a stateful handler and run it.
36 | func setup(t *testing.T, tid string) (*embedded.StateStore, Commander, metafora.Handler, chan bool) {
37 | t.Parallel()
38 | ss := embedded.NewStateStore().(*embedded.StateStore)
39 | _ = ss.Store(task(tid), &State{Code: Runnable})
40 | <-ss.Stored // pop initial state out
41 | cmdr := embedded.NewCommander()
42 | cmdlistener := cmdr.NewListener(tid)
43 | sm := New(task(tid), testhandler, ss, cmdlistener, nil)
44 | done := make(chan bool)
45 | go func() { done <- sm.Run() }()
46 | return ss, cmdr, sm, done
47 | }
48 |
49 | // FIXME leaks goroutines
50 | func TestRules(t *testing.T) {
51 | t.Parallel()
52 | for i, trans := range Rules {
53 | metafora.Debugf("Trying %s", trans)
54 | cmdr := embedded.NewCommander()
55 | cmdlistener := cmdr.NewListener("test")
56 | store := make(chan *State)
57 |
58 | state := &State{Code: trans.From}
59 |
60 | // Sleeping state needs extra Until state
61 | if trans.From == Sleeping {
62 | until := time.Now().Add(100 * time.Millisecond)
63 | state.Until = &until
64 | }
65 |
66 | ts := testStore{initial: state, out: store}
67 |
68 | // Create a new statemachine that starts from the From state
69 | sm := New(task("test"), testhandler, ts, cmdlistener, nil)
70 | go sm.Run()
71 | initial := <-store
72 | if initial.Code != trans.From {
73 | t.Fatalf("%d Initial state %q not set. Found: %q", i, trans.From, initial.Code)
74 | }
75 |
76 | // The Fault state transitions itself to either sleeping or failed
77 | if trans.From != Fault {
78 | // Apply the Event to transition to the To state
79 | msg := &Message{Code: trans.Event}
80 |
81 | // Sleep messages need extra state
82 | if trans.Event == Sleep {
83 | until := time.Now().Add(10 * time.Millisecond)
84 | msg.Until = &until
85 | }
86 | if trans.Event == Error {
87 | msg.Err = errors.New("test")
88 | }
89 | if err := cmdr.Send("test", msg); err != nil {
90 | t.Fatalf("Error sending message %s: %v", trans.Event, err)
91 | }
92 | }
93 | newstate := <-store
94 | if trans.From == Fault && trans.To == Failed {
95 | // continue on as this transition relies on state this test doesn't exercise
96 | continue
97 | }
98 | if newstate.Code != trans.To {
99 | t.Fatalf("%d Expected %q but found %q for transition %s", i, trans.To, newstate.Code, trans)
100 | }
101 | }
102 | }
103 |
104 | func TestCheckpointRelease(t *testing.T) {
105 | ss, cmdr, _, done := setup(t, "test1")
106 |
107 | // Should just cause statemachine to loop
108 | if err := cmdr.Send("test1", CheckpointMessage()); err != nil {
109 | t.Fatalf("Error sending checkpoint: %v", err)
110 | }
111 | select {
112 | case <-done:
113 | t.Fatalf("Checkpoint command should not have caused statemachine to exit.")
114 | case <-time.After(100 * time.Millisecond):
115 | }
116 |
117 | // Should cause the statemachine to exit
118 | if err := cmdr.Send("test1", ReleaseMessage()); err != nil {
119 | t.Fatalf("Error sending release: %v", err)
120 | }
121 | select {
122 | case d := <-done:
123 | if d {
124 | t.Fatalf("Release command should not have caused the task to be marked as done.")
125 | }
126 | case <-time.After(100 * time.Millisecond):
127 | t.Fatalf("Expected statemachine to exit but it did not.")
128 | }
129 | state, err := ss.Load(task("test1"))
130 | if err != nil {
131 | t.Fatal(err)
132 | }
133 | if state.Code != Runnable {
134 | t.Fatalf("Expected released task to be runnable but found state %q", state.Code)
135 | }
136 | }
137 |
138 | func TestSleep(t *testing.T) {
139 | ss, cmdr, _, _ := setup(t, "sleep-test")
140 |
141 | {
142 | // Put to sleep forever
143 | until := time.Now().Add(9001 * time.Hour)
144 | if err := cmdr.Send("sleep-test", SleepMessage(until)); err != nil {
145 | t.Fatalf("Error sending sleep: %v", err)
146 | }
147 |
148 | newstate := <-ss.Stored
149 | if newstate.State.Code != Sleeping || !newstate.State.Until.Equal(until) {
150 | t.Fatalf("Expected task to store state Sleeping, but stored: %s", newstate)
151 | }
152 | }
153 |
154 | // Make sure it stays sleeping for at least a bit
155 | select {
156 | case newstate := <-ss.Stored:
157 | t.Fatalf("Expected task to stay asleep forever but transitioned to: %s", newstate)
158 | case <-time.After(100 * time.Millisecond):
159 | }
160 |
161 | // Override current sleep with a shorter one
162 | dur := 1 * time.Second
163 | start := time.Now()
164 | until := start.Add(dur)
165 | if err := cmdr.Send("sleep-test", SleepMessage(until)); err != nil {
166 | t.Fatalf("Error sending sleep: %v", err)
167 | }
168 |
169 | newstate := <-ss.Stored
170 | if newstate.State.Code != Sleeping || !newstate.State.Until.Equal(until) {
171 | t.Fatalf("Expected task to store state Sleeping, but stored: %s", newstate)
172 | }
173 |
174 | // Make sure it transitions to Runnable after sleep has elapsed
175 | newstate = <-ss.Stored
176 | transitioned := time.Now()
177 | if newstate.State.Code != Runnable || newstate.State.Until != nil {
178 | t.Fatalf("Expected task to be runnable without an Until time but found: %s", newstate.State)
179 | }
180 | elapsed := transitioned.Sub(start)
181 | 	if elapsed < dur {
182 | t.Fatalf("Expected task to sleep for %s but slept for %s", dur, elapsed)
183 | }
184 | t.Logf("Statemachine latency: %s", elapsed-dur)
185 | }
186 |
187 | func TestSleepRelease(t *testing.T) {
188 | ss, cmdr, _, returned := setup(t, "sleep-test")
189 |
190 | until := time.Now().Add(9001 * time.Hour)
191 | {
192 | // Put to sleep forever
193 | if err := cmdr.Send("sleep-test", SleepMessage(until)); err != nil {
194 | t.Fatalf("Error sending sleep: %v", err)
195 | }
196 |
197 | newstate := <-ss.Stored
198 | if newstate.State.Code != Sleeping || !newstate.State.Until.Equal(until) {
199 | t.Fatalf("Expected task to store state Sleeping, but stored: %s", newstate)
200 | }
201 | }
202 |
203 | {
204 | // Releasing should maintain sleep state but exit
205 | if err := cmdr.Send("sleep-test", ReleaseMessage()); err != nil {
206 | t.Fatalf("Error sending release: %v", err)
207 | }
208 | newstate := <-ss.Stored
209 | if newstate.State.Code != Sleeping || newstate.State.Until == nil || !newstate.State.Until.Equal(until) {
210 | t.Fatalf("Releasing unexpectedly changed state: %s != Sleeping || %v != %s", newstate.State.Code, newstate.State.Until, until)
211 | }
212 | if done := <-returned; done {
213 | t.Fatal("Releasing should not have returned done.")
214 | }
215 | }
216 | }
217 |
218 | func TestTerminal(t *testing.T) {
219 | ss, cmdr, sm, done := setup(t, "terminal-test")
220 |
221 | // Kill the task
222 | if err := cmdr.Send("terminal-test", &Message{Code: Kill}); err != nil {
223 | t.Fatalf("Error sending kill command: %v", err)
224 | }
225 |
226 | // Task should be killed and done (unscheduled)
227 | newstate := <-ss.Stored
228 | if newstate.State.Code != Killed {
229 | t.Fatalf("Expected task to be killed but found: %s", newstate.State)
230 | }
231 | if !(<-done) {
232 | t.Fatal("Expected task to be done.")
233 | }
234 | if state, err := ss.Load(task("terminal-test")); err != nil || state.Code != Killed {
235 | t.Fatalf("Failed to load expected killed state for task: state=%s err=%v", state, err)
236 | }
237 |
238 | // Task should just die again if we try to reschedule it
239 | go func() { done <- sm.Run() }()
240 | select {
241 | case newstate := <-ss.Stored:
242 | t.Fatalf("Re-running a terminated task should *not* store state, but it stored: %v", newstate.State)
243 | case <-time.After(100 * time.Millisecond):
244 | // State shouldn't even be stored since it's not being changed and terminal
245 | // states should be immutable
246 | }
247 |
248 | if !(<-done) {
249 | t.Fatal("Expected task to be done.")
250 | }
251 | }
252 |
253 | func TestPause(t *testing.T) {
254 | ss, cmdr, sm, done := setup(t, "test-pause")
255 |
256 | pause := func() {
257 | if err := cmdr.Send("test-pause", PauseMessage()); err != nil {
258 | t.Fatalf("Error sending pause command to test-pause: %v", err)
259 | }
260 | newstate := <-ss.Stored
261 | if newstate.State.Code != Paused {
262 | t.Fatalf("Expected paused state but found: %s", newstate.State)
263 | }
264 | if state, err := ss.Load(task("test-pause")); err != nil || state.Code != Paused {
265 | t.Fatalf("Failed to load expected pause state for task: state=%s err=%v", state, err)
266 | }
267 |
268 | // Task should not be Done; pausing doesn't exit the statemachine
269 | select {
270 | case <-done:
271 | t.Fatal("Task exited unexpectedly.")
272 | case <-time.After(100 * time.Millisecond):
273 | }
274 | }
275 |
276 | // Pause the work
277 | pause()
278 |
279 | // Should be able to resume paused work
280 | if err := cmdr.Send("test-pause", RunMessage()); err != nil {
281 | t.Fatalf("Error sending run command to test-pause: %v", err)
282 | }
283 | newstate := <-ss.Stored
284 | if newstate.State.Code != Runnable {
285 | t.Fatalf("Expected runnable state but found: %s", newstate.State)
286 | }
287 | if state, err := ss.Load(task("test-pause")); err != nil || state.Code != Runnable {
288 | t.Fatalf("Failed to load expected runnable state for task: state=%s err=%v", state, err)
289 | }
290 |
291 | // Re-pause the work
292 | pause()
293 |
294 | // Pausing paused work is silly but fine
295 | pause()
296 |
297 | // Releasing paused work should make it exit but leave it in the paused state
298 | sm.Stop()
299 | newstate = <-ss.Stored
300 | if newstate.State.Code != Paused {
301 | t.Fatalf("Releasing should not have changed paused state but stored: %s", newstate.State)
302 | }
303 | select {
304 | case d := <-done:
305 | if d {
306 | t.Fatal("Releasing task should not have marked it as done.")
307 | }
308 | case <-time.After(100 * time.Millisecond):
309 | t.Fatal("Releasing paused task should have exited the statemachine, but didn't.")
310 | }
311 |
312 | // Ensure task is stored with the paused state
313 | if state, err := ss.Load(task("test-pause")); err != nil || state.Code != Paused {
314 | t.Fatalf("Failed to load expected paused state for task: state=%s err=%v", state, err)
315 | }
316 | }
317 |
318 | func TestMessageValid(t *testing.T) {
319 | t.Parallel()
320 | until := time.Now()
321 | validmsgs := []Message{
322 | {Code: Run},
323 | {Code: Sleep, Until: &until},
324 | {Code: Pause},
325 | {Code: Kill},
326 | {Code: Error, Err: errors.New("test")},
327 | {Code: Complete},
328 | {Code: Checkpoint},
329 | {Code: Release},
330 | }
331 | for _, m := range validmsgs {
332 | if !m.Valid() {
333 | t.Errorf("Expected %s to be valid.", m)
334 | }
335 | }
336 |
337 | invalidmsgs := []Message{
338 | {},
339 | {Code: Sleep},
340 | {Code: Error},
341 | }
342 | for _, m := range invalidmsgs {
343 | if m.Valid() {
344 | t.Errorf("Expected %s to be invalid.", m)
345 | }
346 | }
347 | }
348 |
--------------------------------------------------------------------------------
/statemachine/statestore.go:
--------------------------------------------------------------------------------
1 | package statemachine
2 |
3 | import "github.com/lytics/metafora"
4 |
5 | // StateStore is an interface implementations must provide for persisting task
6 | // state. Since the task ID is provided on each method call a single global
7 | // StateStore can be used and implementations should be safe for concurrent
8 | // access.
9 | type StateStore interface {
10 | // Load the persisted or initial state for a task. Errors will cause tasks to
11 | // be marked as done.
12 | //
13 | // The one exception is the special error StateNotFound which will cause the
14 | // state machine to start from the initial (Runnable) state.
15 | Load(metafora.Task) (*State, error)
16 |
17 | // Store the current task state. Errors will prevent current state from being
18 | // persisted and prevent state transitions.
19 | Store(metafora.Task, *State) error
20 | }
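// Illustrative sketch (not part of the package): a minimal in-memory
// StateStore. The embedded package ships a ready-made implementation for
// tests; production implementations should persist state durably. Assumes
// "sync" is imported; memStore is a hypothetical name.
//
//	type memStore struct {
//		mu     sync.Mutex
//		states map[string]*State
//	}
//
//	func (m *memStore) Load(t metafora.Task) (*State, error) {
//		m.mu.Lock()
//		defer m.mu.Unlock()
//		if s, ok := m.states[t.ID()]; ok {
//			return s, nil
//		}
//		// No saved state yet: start the task from the initial Runnable state.
//		return &State{Code: Runnable}, nil
//	}
//
//	func (m *memStore) Store(t metafora.Task, s *State) error {
//		m.mu.Lock()
//		defer m.mu.Unlock()
//		if m.states == nil {
//			m.states = map[string]*State{}
//		}
//		m.states[t.ID()] = s
//		return nil
//	}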
21 |
--------------------------------------------------------------------------------
/task.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | import (
4 | "encoding/json"
5 | "sync"
6 | "time"
7 | )
8 |
9 | // Task is the minimum interface for Tasks to implement.
10 | type Task interface {
11 | // ID is the immutable globally unique ID for this task.
12 | ID() string
13 | }
14 |
15 | type basictask string
16 |
17 | // NewTask creates the most basic Task implementation: just a string ID.
18 | func NewTask(id string) Task { return basictask(id) }
19 | func (t basictask) ID() string { return string(t) }
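// Illustrative sketch (not part of the package): since Task is the minimum
// interface, Coordinators may return richer task types carrying a payload;
// Metafora itself only requires ID(). jobTask and Payload are hypothetical
// names.
//
//	type jobTask struct {
//		id      string
//		Payload map[string]string
//	}
//
//	func (t *jobTask) ID() string { return t.id }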
20 |
21 | // RunningTask represents tasks running within a consumer.
22 | type RunningTask interface {
23 | Task() Task
24 |
25 | // Started is the time the task was started by this consumer.
26 | Started() time.Time
27 |
28 | 	// Stopped is the first time Stop() was called on this task or zero if it has
29 | // yet to be called. Tasks may take an indeterminate amount of time to
30 | // shutdown after Stop() is called.
31 | Stopped() time.Time
32 |
33 | // Handler implementation called for this task.
34 | Handler() Handler
35 | }
36 |
37 | // runtask is the per-task state Metafora tracks internally.
38 | type runtask struct {
39 | // task is the original Task from the coordinator
40 | task Task
41 |
42 | // handler on which Run and Stop are called
43 | h Handler
44 |
45 | // stopL serializes calls to task.h.Stop() to make handler implementations
46 | // easier/safer as well as guard stopped
47 | stopL sync.Mutex
48 |
49 | // when task was started and when Stop was first called
50 | started time.Time
51 | stopped time.Time
52 | }
53 |
54 | func newTask(task Task, h Handler) *runtask {
55 | return &runtask{task: task, h: h, started: time.Now()}
56 | }
57 |
58 | func (t *runtask) stop() {
59 | t.stopL.Lock()
60 | defer t.stopL.Unlock()
61 | if t.stopped.IsZero() {
62 | t.stopped = time.Now()
63 | }
64 | t.h.Stop()
65 | }
66 |
67 | func (t *runtask) Task() Task { return t.task }
68 | func (t *runtask) Handler() Handler { return t.h }
69 | func (t *runtask) Started() time.Time { return t.started }
70 | func (t *runtask) Stopped() time.Time {
71 | t.stopL.Lock()
72 | defer t.stopL.Unlock()
73 | return t.stopped
74 | }
75 |
76 | func (t *runtask) MarshalJSON() ([]byte, error) {
77 | js := struct {
78 | ID string `json:"id"`
79 | Started time.Time `json:"started"`
80 | Stopped *time.Time `json:"stopped,omitempty"`
81 | }{ID: t.task.ID(), Started: t.started}
82 |
83 | // Only set stopped if it's non-zero
84 | if s := t.Stopped(); !s.IsZero() {
85 | js.Stopped = &s
86 | }
87 |
88 | return json.Marshal(&js)
89 | }
90 |
--------------------------------------------------------------------------------
/util_test.go:
--------------------------------------------------------------------------------
1 | package metafora
2 |
3 | import (
4 | "errors"
5 | "log"
6 | "os"
7 | )
8 |
9 | func init() {
10 | SetLogger(log.New(os.Stderr, "", log.Lmicroseconds|log.Lshortfile))
11 | }
12 |
13 | //TODO Move out into a testutil package for other packages to use. The problem
14 | //is that existing metafora tests would have to be moved to the metafora_test
15 | //package which means no manipulating unexported globals like balance jitter.
16 |
17 | type TestCoord struct {
18 | name string
19 | Tasks chan Task // will be returned in order, "" indicates return an error
20 | Commands chan Command
21 | Releases chan Task
22 | Dones chan Task
23 | closed chan bool
24 | }
25 |
26 | func NewTestCoord() *TestCoord {
27 | return &TestCoord{
28 | name: "testcoord",
29 | Tasks: make(chan Task, 10),
30 | Commands: make(chan Command, 10),
31 | Releases: make(chan Task, 10),
32 | Dones: make(chan Task, 10),
33 | closed: make(chan bool),
34 | }
35 | }
36 |
37 | func (*TestCoord) Init(CoordinatorContext) error { return nil }
38 | func (*TestCoord) Claim(Task) bool { return true }
39 | func (c *TestCoord) Close() { close(c.closed) }
40 | func (c *TestCoord) Release(task Task) { c.Releases <- task }
41 | func (c *TestCoord) Done(task Task) { c.Dones <- task }
42 | func (c *TestCoord) Name() string { return c.name }
43 |
44 | // Watch sends tasks from the Tasks channel unless an empty string is sent.
45 | // Then an error is returned.
46 | func (c *TestCoord) Watch(out chan<- Task) error {
47 | var task Task
48 | for {
49 | select {
50 | case task = <-c.Tasks:
51 | Debugf("TestCoord recvd: %s", task)
52 | if task == nil || task.ID() == "" {
53 | return errors.New("test error")
54 | }
55 | case <-c.closed:
56 | return nil
57 | }
58 | select {
59 | case out <- task:
60 | Debugf("TestCoord sent: %s", task)
61 | case <-c.closed:
62 | return nil
63 | }
64 | }
65 | }
66 |
67 | // Command returns commands from the Commands channel unless a nil is sent.
68 | // Then an error is returned.
69 | func (c *TestCoord) Command() (Command, error) {
70 | cmd := <-c.Commands
71 | if cmd == nil {
72 | return cmd, errors.New("test error")
73 | }
74 | return cmd, nil
75 | }
76 |
--------------------------------------------------------------------------------