├── .travis.yml
├── marshal
│   ├── log.go
│   ├── log_test.go
│   ├── cluster_test.go
│   ├── message_test.go
│   ├── topic.go
│   ├── marshal_test.go
│   ├── message.go
│   ├── rationalizer_test.go
│   ├── rationalizer.go
│   ├── marshal.go
│   ├── cluster.go
│   ├── claim.go
│   ├── claim_test.go
│   ├── consumer_test.go
│   └── consumer.go
├── LICENSE
├── example
│   └── main.go
├── debug
│   └── main.go
├── PROTOCOL.md
└── README.md
/.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | go: 3 | - 1.7 4 | 5 | env: 6 | - GOMAXPROCS=4 7 | 8 | sudo: false 9 | 10 | script: 11 | - cd marshal 12 | - test `gofmt -l . | wc -l` = 0 13 | - go test -p=1 -race -timeout=600s -check.v ./... 14 | 15 | -------------------------------------------------------------------------------- /marshal/log.go: -------------------------------------------------------------------------------- 1 | /* 2 | * portal - marshal 3 | * 4 | * this package exists for the simple reason that go vet complains bitterly 5 | * that the go-logging package we use named its error printer 'Error' without 6 | * the trailing 'f', and yet it accepts a format string. blah. 7 | * 8 | */ 9 | 10 | package marshal 11 | 12 | import ( 13 | "sync" 14 | 15 | "github.com/op/go-logging" 16 | ) 17 | 18 | var log *logging.Logger 19 | var logMu = &sync.Mutex{} 20 | 21 | func init() { 22 | logMu.Lock() 23 | defer logMu.Unlock() 24 | 25 | if log != nil { 26 | return 27 | } 28 | log = logging.MustGetLogger("KafkaMarshal") 29 | logging.SetLevel(logging.INFO, "KafkaMarshal") 30 | } 31 | 32 | // SetLogger can be called with a logging.Logger in order to overwrite our internal 33 | // logger. Useful if you need to control the logging (such as in tests). 34 | func SetLogger(l *logging.Logger) { 35 | logMu.Lock() 36 | defer logMu.Unlock() 37 | 38 | log = l 39 | } 40 | -------------------------------------------------------------------------------- /marshal/log_test.go: -------------------------------------------------------------------------------- 1 | package marshal 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/zorkian/kafka" 7 | "github.com/zorkian/kafka/kafkatest" 8 | "github.com/op/go-logging" 9 | 10 | . "gopkg.in/check.v1" 11 | ) 12 | 13 | type logTestBackend struct { 14 | c *C 15 | mu *sync.Mutex 16 | } 17 | 18 | var logTest = &logTestBackend{mu: &sync.Mutex{}} 19 | 20 | func init() { 21 | logMu.Lock() 22 | defer logMu.Unlock() 23 | 24 | leveledLogger := logging.AddModuleLevel(logTest) 25 | leveledLogger.SetLevel(logging.DEBUG, "KafkaMarshal") 26 | leveledLogger.SetLevel(logging.DEBUG, "KafkaClient") 27 | leveledLogger.SetLevel(logging.DEBUG, "KafkaTest") 28 | 29 | log = logging.MustGetLogger("KafkaMarshal") 30 | log.SetBackend(leveledLogger) 31 | 32 | kafkatest.SetLogger(log) 33 | kafka.SetLogger(log) 34 | } 35 | 36 | func (l *logTestBackend) SetC(c *C) { 37 | l.mu.Lock() 38 | defer l.mu.Unlock() 39 | 40 | l.c = c 41 | } 42 | 43 | func ResetTestLogger(c *C) { 44 | logTest.SetC(c) 45 | } 46 | 47 | func (l *logTestBackend) Log(lvl logging.Level, cd int, rec *logging.Record) error { 48 | l.mu.Lock() 49 | defer l.mu.Unlock() 50 | 51 | l.c.Log(rec.Formatted(cd)) 52 | return nil 53 | } 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 authors and contributors. 2 | 3 | All rights reserved.
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are 7 | met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 23 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 24 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 30 | THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /example/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | * simple Marshal example consumer 3 | */ 4 | 5 | package main 6 | 7 | import ( 8 | "github.com/zorkian/marshal/marshal" 9 | "github.com/op/go-logging" 10 | ) 11 | 12 | func main() { 13 | log := logging.MustGetLogger("MarshalExample") 14 | 15 | // Construct the marshaler. There will be one of these globally, and it's thread safe so can 16 | // be used from any goroutine. 17 | marshaler, err := marshal.NewMarshaler( 18 | "marshal_example_client_id", 19 | "marshal_example_consumer_group_id", 20 | []string{"127.0.0.1:9092"}) 21 | if err != nil { 22 | log.Fatalf("Failed to construct marshaler: %s", err) 23 | } 24 | 25 | // Make sure to terminate the Marshaler. This ensures that we release all of the partition 26 | // locks we're holding so other consumers can pick them up. 27 | defer marshaler.Terminate() 28 | 29 | // Now we set up a basic consumer; and we enable GreedyClaims which is useful in low QPS 30 | // environments as it will cause the consumer to claim as many partitions as it can 31 | // up front. Of course, if you have a very busy topic with many partitions, you will 32 | // not want to use this. 33 | options := marshal.NewConsumerOptions() 34 | options.GreedyClaims = true 35 | 36 | consumer, err := marshaler.NewConsumer([]string{"some-topic"}, options) 37 | if err != nil { 38 | log.Fatalf("Failed to construct consumer: %s", err) 39 | } 40 | defer consumer.Terminate(true) 41 | 42 | // Now we can get the consumption channel. Messages will be available in this channel 43 | // and you can consume from it in many different goroutines if your message processing 44 | // is such that it takes a while. 45 | msgChan := consumer.ConsumeChannel() 46 | 47 | // You can spin up many goroutines to process messages; how many depends entirely on the type 48 | // of workload you have. See the docs. 
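// Note for readers of this example: main() returns as soon as the workers below are
// spawned, which runs the deferred Terminate calls and exits the process. A real
// consumer would block after starting its workers (for example on a signal channel
// or a sync.WaitGroup) so that they keep processing messages.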
49 | for i := 0; i < 10; i++ { 50 | i := i 51 | go func() { 52 | for { 53 | msg := <-msgChan 54 | log.Info("[%d] got message: %s", i, msg.Value) 55 | 56 | // Now we have to commit the message now that we're done with it. If you don't 57 | // commit, then Marshal will never record forward progress and will eventually 58 | // terminate. 59 | consumer.Commit(msg) 60 | } 61 | }() 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /marshal/cluster_test.go: -------------------------------------------------------------------------------- 1 | package marshal 2 | 3 | import ( 4 | . "gopkg.in/check.v1" 5 | 6 | "github.com/zorkian/kafka/kafkatest" 7 | ) 8 | 9 | var _ = Suite(&ClusterSuite{}) 10 | 11 | type ClusterSuite struct { 12 | s *kafkatest.Server 13 | m *Marshaler 14 | m2 *Marshaler 15 | } 16 | 17 | func (s *ClusterSuite) SetUpTest(c *C) { 18 | ResetTestLogger(c) 19 | 20 | s.s = StartServer() 21 | 22 | var err error 23 | s.m, err = NewMarshaler("cl", "gr", []string{s.s.Addr()}) 24 | c.Assert(err, IsNil) 25 | c.Assert(s.m, NotNil) 26 | 27 | s.m2, err = NewMarshaler("cl", "gr2", []string{s.s.Addr()}) 28 | c.Assert(err, IsNil) 29 | c.Assert(s.m2, NotNil) 30 | } 31 | 32 | func (s *ClusterSuite) TearDownTest(c *C) { 33 | s.m.Terminate() 34 | s.s.Close() 35 | } 36 | 37 | func (s *ClusterSuite) TestGetTopicState(c *C) { 38 | // Always works 39 | c.Assert(s.m.cluster.getPartitionState("gr", "test2", 0), NotNil) 40 | 41 | // Should error (not claimed) 42 | topic, err := s.m.getClaimedPartitionState("test2", 0) 43 | c.Assert(topic, IsNil) 44 | c.Assert(err, NotNil) 45 | 46 | // Now claim this partition 47 | c.Assert(s.m.ClaimPartition("test2", 0), Equals, true) 48 | steps, err := s.m.cluster.waitForRsteps(1) 49 | c.Assert(err, IsNil) 50 | c.Assert(steps, Equals, 1) 51 | 52 | // getClaimed should now work for our group 53 | topic, err = s.m.getClaimedPartitionState("test2", 0) 54 | c.Assert(topic, NotNil) 55 | c.Assert(err, IsNil) 56 | 57 | // And fail here 58 | topic, err = s.m2.getClaimedPartitionState("test2", 0) 59 | c.Assert(topic, IsNil) 60 | c.Assert(err, NotNil) 61 | 62 | // And fail here (our group, diff partition) 63 | topic, err = s.m.getClaimedPartitionState("test2", 1) 64 | c.Assert(topic, IsNil) 65 | c.Assert(err, NotNil) 66 | 67 | // Release partition now 68 | c.Assert(s.m.ReleasePartition("test2", 0, 0), IsNil) 69 | steps, err = s.m.cluster.waitForRsteps(2) 70 | c.Assert(err, IsNil) 71 | c.Assert(steps, Equals, 2) 72 | 73 | // getClaimed should now fail again for our group 74 | topic, err = s.m.getClaimedPartitionState("test2", 0) 75 | c.Assert(topic, IsNil) 76 | c.Assert(err, NotNil) 77 | 78 | // And fail here 79 | topic, err = s.m2.getClaimedPartitionState("test2", 0) 80 | c.Assert(topic, IsNil) 81 | c.Assert(err, NotNil) 82 | 83 | // And fail here (our group, diff partition) 84 | topic, err = s.m.getClaimedPartitionState("test2", 1) 85 | c.Assert(topic, IsNil) 86 | c.Assert(err, NotNil) 87 | } 88 | -------------------------------------------------------------------------------- /debug/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | * simple Marshal debug/timer utility 3 | * 4 | * To use: point this binary at a Kafka server and give it a topic and some consumer options. 5 | * It will then spin up a Marshaler and start to claim the topic. We time every operation and 6 | * will report some statistics about the state of the world. 
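 *
 * An illustrative invocation (all flags are defined in main() below; the broker and
 * topic values are placeholders for your own cluster):
 *   go run ./debug -broker localhost:9092 -topic test64 -claim-topic -greedy-claim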
7 | */ 8 | 9 | package main 10 | 11 | import ( 12 | "flag" 13 | "time" 14 | 15 | "github.com/zorkian/marshal/marshal" 16 | "github.com/op/go-logging" 17 | ) 18 | 19 | var log = logging.MustGetLogger("MarshalDebug") 20 | 21 | type timeableFunc func() 22 | 23 | func timeIt(text string, tf timeableFunc) { 24 | start := time.Now() 25 | tf() 26 | elapsed := time.Now().Sub(start) 27 | log.Info("<%0.2f ms> %s", float64(elapsed.Nanoseconds())/1000000.0, text) 28 | } 29 | 30 | func main() { 31 | broker := flag.String("broker", "localhost:9092", "ip:port of a single broker") 32 | group := flag.String("group", "debug-group", "group ID to use") 33 | client := flag.String("client", "debug-client", "client ID to use") 34 | topic := flag.String("topic", "test64", "topic to test against") 35 | claimTopic := flag.Bool("claim-topic", false, "claim entire topic mode") 36 | greedyClaim := flag.Bool("greedy-claim", false, "turn on greedy claims") 37 | fastReclaim := flag.Bool("fast-reclaim", false, "enable fast reclaim mode") 38 | printOnly := flag.Bool("print-state-only", false, "only print state, do not claim") 39 | flag.Parse() 40 | 41 | // Raise marshal debugging level 42 | logging.SetLevel(logging.DEBUG, "KafkaMarshal") 43 | 44 | // Construction timing 45 | var m *marshal.Marshaler 46 | timeIt("construct Marshaler", func() { 47 | var err error 48 | m, err = marshal.NewMarshaler(*client, *group, []string{*broker}) 49 | if err != nil { 50 | log.Fatalf("Failed to construct Marshaler: %s", err) 51 | } 52 | }) 53 | defer timeIt("terminate Marshaler", func() { m.Terminate() }) 54 | 55 | // If we're in print mode just do that and exit 56 | if *printOnly { 57 | m.PrintState() 58 | return 59 | } 60 | 61 | // Ensure target topic exists 62 | partitions := m.Partitions(*topic) 63 | if partitions == 0 { 64 | log.Fatalf("Topic %s has no partitions/does not exist.", *topic) 65 | } 66 | log.Info("Topic %s has %d partitions.", *topic, partitions) 67 | 68 | // Set up consumption of the topic with the options they gave us 69 | options := marshal.NewConsumerOptions() 70 | options.GreedyClaims = *greedyClaim 71 | options.FastReclaim = *fastReclaim 72 | options.ClaimEntireTopic = *claimTopic 73 | 74 | timeIt("claim all partitions", func() { 75 | var c *marshal.Consumer 76 | timeIt("construct Consumer", func() { 77 | var err error 78 | c, err = m.NewConsumer([]string{*topic}, options) 79 | if err != nil { 80 | log.Fatalf("Failed to construct consumer: %s", err) 81 | } 82 | }) 83 | defer timeIt("terminate Consumer", func() { c.Terminate(false) }) 84 | 85 | // Wait for all partitions to be claimed 86 | for c.GetCurrentLoad() < partitions { 87 | time.Sleep(10 * time.Millisecond) 88 | } 89 | }) 90 | 91 | m.PrintState() 92 | } 93 | -------------------------------------------------------------------------------- /marshal/message_test.go: -------------------------------------------------------------------------------- 1 | package marshal 2 | 3 | import . 
"gopkg.in/check.v1" 4 | 5 | var _ = Suite(&MessageSuite{}) 6 | 7 | type MessageSuite struct{} 8 | 9 | func (s *MessageSuite) SetUpTest(c *C) { 10 | ResetTestLogger(c) 11 | } 12 | 13 | func (s *MessageSuite) TestMessageEncode(c *C) { 14 | base := msgBase{ 15 | Version: 4, 16 | Time: 2, 17 | InstanceID: "ii", 18 | ClientID: "cl", 19 | GroupID: "gr", 20 | Topic: "t", 21 | PartID: 3, 22 | } 23 | c.Assert(base.Encode(), Equals, "4/2/ii/cl/gr/t/3") 24 | 25 | hb := msgHeartbeat{ 26 | msgBase: base, 27 | CurrentOffset: 5, 28 | } 29 | c.Assert(hb.Encode(), Equals, "Heartbeat/4/2/ii/cl/gr/t/3/5") 30 | 31 | cp := msgClaimingPartition{ 32 | msgBase: base, 33 | } 34 | c.Assert(cp.Encode(), Equals, "ClaimingPartition/4/2/ii/cl/gr/t/3") 35 | 36 | rp := msgReleasingPartition{ 37 | msgBase: base, 38 | CurrentOffset: 7, 39 | } 40 | c.Assert(rp.Encode(), Equals, "ReleasingPartition/4/2/ii/cl/gr/t/3/7") 41 | 42 | cm := msgClaimingMessages{ 43 | msgBase: base, 44 | ProposedCurrentOffset: 9, 45 | } 46 | c.Assert(cm.Encode(), Equals, "ClaimingMessages/4/2/ii/cl/gr/t/3/9") 47 | } 48 | 49 | func (s *MessageSuite) TestMessageDecode(c *C) { 50 | msg, err := decode([]byte("banana")) 51 | c.Assert(msg, IsNil) 52 | c.Assert(err, NotNil) 53 | 54 | msg, err = decode([]byte("Heartbeat/4/2/ii/cl/gr/t/1/2")) 55 | c.Assert(msg, NotNil) 56 | c.Assert(err, IsNil) 57 | 58 | mhb, ok := msg.(*msgHeartbeat) 59 | if !ok || msg.Type() != msgTypeHeartbeat || mhb.ClientID != "cl" || mhb.GroupID != "gr" || 60 | mhb.Topic != "t" || mhb.PartID != 1 || mhb.CurrentOffset != 2 || mhb.Time != 2 || 61 | mhb.Version != 4 { 62 | c.Error("Heartbeat message contents invalid") 63 | } 64 | 65 | msg, err = decode([]byte("ClaimingPartition/4/2/ii/cl/gr/t/1")) 66 | if msg == nil || err != nil { 67 | c.Error("Expected msg, got error", err) 68 | } 69 | mcp, ok := msg.(*msgClaimingPartition) 70 | if !ok || msg.Type() != msgTypeClaimingPartition || mcp.ClientID != "cl" || 71 | mcp.GroupID != "gr" || mcp.Topic != "t" || mcp.PartID != 1 || mcp.Time != 2 || 72 | mcp.Version != 4 { 73 | c.Error("ClaimingPartition message contents invalid") 74 | } 75 | 76 | msg, err = decode([]byte("ReleasingPartition/4/2/ii/cl/gr/t/1/9")) 77 | if msg == nil || err != nil { 78 | c.Error("Expected msg, got error", err) 79 | } 80 | mrp, ok := msg.(*msgReleasingPartition) 81 | if !ok || msg.Type() != msgTypeReleasingPartition || mrp.ClientID != "cl" || 82 | mrp.GroupID != "gr" || mrp.Topic != "t" || mrp.PartID != 1 || mrp.Time != 2 || 83 | mrp.CurrentOffset != 9 || mhb.Version != 4 { 84 | c.Error("ReleasingPartition message contents invalid") 85 | } 86 | 87 | msg, err = decode([]byte("ClaimingMessages/4/2/ii/cl/gr/t/1/2")) 88 | if msg == nil || err != nil { 89 | c.Error("Expected msg, got error", err) 90 | } 91 | mcm, ok := msg.(*msgClaimingMessages) 92 | if !ok || msg.Type() != msgTypeClaimingMessages || mcm.ClientID != "cl" || mcm.GroupID != "gr" || 93 | mcm.Topic != "t" || mcm.PartID != 1 || mcm.ProposedCurrentOffset != 2 || mcm.Time != 2 || 94 | mhb.Version != 4 { 95 | c.Error("ClaimingMessages message contents invalid") 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /marshal/topic.go: -------------------------------------------------------------------------------- 1 | /* 2 | * portal - marshal 3 | * 4 | * a library that implements an algorithm for doing consumer coordination within Kafka, rather 5 | * than using Zookeeper or another external system. 
6 | * 7 | */ 8 | 9 | package marshal 10 | 11 | import ( 12 | "sync" 13 | "time" 14 | ) 15 | 16 | // topicState contains information about a given topic. 17 | type topicState struct { 18 | // claimPartition is which Marshal topic partition to use for coordination of this topic. 19 | // Read only, set at initialization time so not protected by the lock. 20 | claimPartition int 21 | 22 | // This lock also protects the contents of the partitions member. 23 | lock *sync.RWMutex 24 | partitions []PartitionClaim 25 | } 26 | 27 | // PrintState causes us to log the state of this topic's claims. 28 | func (ts *topicState) PrintState() { 29 | ts.lock.RLock() 30 | defer ts.lock.RUnlock() 31 | 32 | now := time.Now().Unix() 33 | for partID, claim := range ts.partitions { 34 | state := "CLMD" 35 | if !claim.claimed(now) { 36 | state = "----" 37 | } 38 | log.Infof(" * %2d [%s]: GPID %s | CLID %s | LHB %d (%d) | LOF %d | PCL %d", 39 | partID, state, claim.GroupID, claim.ClientID, claim.LastHeartbeat, 40 | now-claim.LastHeartbeat, claim.CurrentOffset, len(claim.pendingClaims)) 41 | } 42 | } 43 | 44 | // PartitionOffsets is a record of offsets for a given partition. Contains information 45 | // combined from Kafka and our current state. 46 | // 47 | // A Kafka partition consists of N messages with offsets. In the basic case, you 48 | // can think of an offset like an array index. With log compaction and other trickery 49 | // it acts more like a sparse array, but it's a close enough metaphor. 50 | // 51 | // We keep track of four values for offsets: 52 | // 53 | // offsets 1 2 3 7 9 10 11 54 | // partition [ msg1, msg2, msg3, msg4, msg5, msg6, msg7, ... ] 55 | // ^ ^ ^ 56 | // \- Earliest | | 57 | // \- Current Latest 58 | // 59 | // In this example, Earliest is 1 which is the "oldest" offset within the 60 | // partition. At any given time this offset might become invalid if a log rolls 61 | // so we might update it. 62 | // 63 | // Current is 7, which is the offset of the NEXT message i.e. this message 64 | // has not been consumed yet. 65 | // 66 | // Latest is 12, which is the offset that Kafka will assign to the message 67 | // that next gets committed to the partition. This offset does not yet exist, 68 | // and might never. 69 | // 70 | // Committed is the value recorded in Kafka's committed offsets system. 71 | type PartitionOffsets struct { 72 | Current int64 73 | Earliest int64 74 | Latest int64 75 | Committed int64 76 | } 77 | 78 | // PartitionClaim contains claim information about a given partition. 79 | type PartitionClaim struct { 80 | InstanceID string 81 | ClientID string 82 | GroupID string 83 | LastRelease int64 84 | LastHeartbeat int64 85 | CurrentOffset int64 86 | 87 | // Used internally when someone is waiting on this partition to be claimed. 88 | pendingClaims []chan struct{} 89 | } 90 | 91 | // checkOwnership compares the ClientID/GroupID (and optionally InstanceID) of a given 92 | // claim to a given message and returns whether or not they match. 93 | func (p *PartitionClaim) checkOwnership(msg message, checkInstanceID bool) bool { 94 | iid, cid, gid := msg.Ownership() 95 | if p.ClientID != cid || p.GroupID != gid { 96 | return false 97 | } 98 | return !checkInstanceID || p.InstanceID == iid 99 | } 100 | 101 | // claimed returns a boolean indicating whether or not this structure is indicating a 102 | // still valid claim. Validity is based on the delta between NOW and lastHeartbeat: 103 | // 104 | // delta = 0 .. HeartbeatInterval: claim good. 105 | // HeartbeatInterval .. 
2*HeartbeatInterval-1: claim good. 106 | // >2xHeartbeatInterval: claim invalid. 107 | // 108 | // This means that the worst case for a "dead consumer" that has failed to heartbeat 109 | // is that a partition will be idle for twice the heartbeat interval. 110 | func (p *PartitionClaim) claimed(ts int64) bool { 111 | // If lastHeartbeat is 0, then the partition is unclaimed 112 | if p.LastHeartbeat == 0 { 113 | return false 114 | } 115 | 116 | // We believe we have claim information, but let's analyze it to determine whether or 117 | // not the claim is valid. Of course this assumes that our time and the remote's time 118 | // are roughly in sync. 119 | now := ts 120 | if ts == 0 { 121 | now = time.Now().Unix() 122 | } 123 | 124 | delta := now - p.LastHeartbeat 125 | switch { 126 | case 0 <= delta && delta <= HeartbeatInterval: 127 | // Fresh claim - all good 128 | return true 129 | case HeartbeatInterval < delta && delta < 2*HeartbeatInterval: 130 | // Aging claim - missed/delayed heartbeat, but still in tolerance 131 | return true 132 | default: 133 | // Stale claim - no longer valid 134 | return false 135 | } 136 | } 137 | 138 | // Claimed returns whether or not the PartitionClaim indicates a valid (as of this 139 | // invocation) claim. 140 | func (p *PartitionClaim) Claimed() bool { 141 | return p.claimed(0) 142 | } 143 | -------------------------------------------------------------------------------- /marshal/marshal_test.go: -------------------------------------------------------------------------------- 1 | package marshal 2 | 3 | import ( 4 | "time" 5 | 6 | . "gopkg.in/check.v1" 7 | 8 | "github.com/zorkian/kafka/kafkatest" 9 | ) 10 | 11 | var _ = Suite(&MarshalSuite{}) 12 | 13 | type MarshalSuite struct { 14 | s *kafkatest.Server 15 | m *Marshaler 16 | } 17 | 18 | func (s *MarshalSuite) SetUpTest(c *C) { 19 | ResetTestLogger(c) 20 | 21 | s.s = StartServer() 22 | 23 | var err error 24 | s.m, err = NewMarshaler("cl", "gr", []string{s.s.Addr()}) 25 | if err != nil { 26 | c.Errorf("New Marshaler failed: %s", err) 27 | } 28 | } 29 | 30 | func (s *MarshalSuite) TearDownTest(c *C) { 31 | s.m.Terminate() 32 | s.s.Close() 33 | } 34 | 35 | func MakeTopic(srv *kafkatest.Server, topic string, numPartitions int) { 36 | for i := 0; i < numPartitions; i++ { 37 | srv.AddMessages(topic, int32(i)) 38 | } 39 | } 40 | 41 | func StartServer() *kafkatest.Server { 42 | srv := kafkatest.NewServer() 43 | srv.MustSpawn() 44 | MakeTopic(srv, MarshalTopic, 4) 45 | MakeTopic(srv, "test1", 1) 46 | MakeTopic(srv, "test2", 2) 47 | MakeTopic(srv, "test3", 3) 48 | return srv 49 | } 50 | 51 | func (s *MarshalSuite) TestNewMarshaler(c *C) { 52 | // Test that Marshaler starts up and learns about the topics. 53 | c.Assert(s.m.Partitions(MarshalTopic), Equals, 4) 54 | c.Assert(s.m.Partitions("test1"), Equals, 1) 55 | c.Assert(s.m.Partitions("test2"), Equals, 2) 56 | c.Assert(s.m.Partitions("test3"), Equals, 3) 57 | c.Assert(s.m.Partitions("unknown"), Equals, 0) 58 | 59 | // If our hash algorithm changes, these values will have to change. This tests the low 60 | // level hash function. 61 | c.Assert(s.m.cluster.getClaimPartition("test1"), Equals, 2) 62 | c.Assert(s.m.cluster.getClaimPartition("test2"), Equals, 1) 63 | c.Assert(s.m.cluster.getClaimPartition("test3"), Equals, 2) 64 | c.Assert(s.m.cluster.getClaimPartition("unknown"), Equals, 1) 65 | c.Assert(s.m.cluster.getClaimPartition("unknown"), Equals, 1) // Twice on purpose. 
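// The repeated assertion above is deliberate: getClaimPartition maps a topic to its
// coordination partition with a hash of the topic name (see the comment above about
// the hash algorithm), so repeated lookups must return the same value.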
66 | } 67 | 68 | // This is a full integration test of claiming including writing to Kafka via the marshaler 69 | // and waiting for responses 70 | func (s *MarshalSuite) TestClaimPartitionIntegration(c *C) { 71 | resp := make(chan bool) 72 | go func() { 73 | resp <- s.m.ClaimPartition("test1", 0) // true 74 | resp <- s.m.ClaimPartition("test1", 0) // true (no-op) 75 | s.m.lock.Lock() 76 | s.m.clientID = "cl-other" 77 | s.m.lock.Unlock() 78 | resp <- s.m.ClaimPartition("test1", 0) // false (collission) 79 | resp <- s.m.ClaimPartition("test1", 1) // true (new client) 80 | }() 81 | 82 | select { 83 | case out := <-resp: 84 | c.Assert(out, Equals, true) 85 | case <-time.After(5 * time.Second): 86 | c.Error("Timed out claiming partition") 87 | } 88 | 89 | select { 90 | case out := <-resp: 91 | c.Assert(out, Equals, true) 92 | case <-time.After(5 * time.Second): 93 | c.Error("Timed out claiming partition") 94 | } 95 | 96 | select { 97 | case out := <-resp: 98 | c.Assert(out, Equals, false) 99 | case <-time.After(5 * time.Second): 100 | c.Error("Timed out claiming partition") 101 | } 102 | 103 | select { 104 | case out := <-resp: 105 | c.Assert(out, Equals, true) 106 | case <-time.After(5 * time.Second): 107 | c.Error("Timed out claiming partition") 108 | } 109 | } 110 | 111 | // This is a full integration test of a claim, heartbeat, and release cycle 112 | func (s *MarshalSuite) TestPartitionLifecycleIntegration(c *C) { 113 | // Claim partition (this is synchronous, will only return when) 114 | // it has succeeded 115 | c.Assert(s.m.ClaimPartition("test1", 0), Equals, true) 116 | steps, err := s.m.cluster.waitForRsteps(1) 117 | c.Assert(err, IsNil) 118 | c.Assert(steps, Equals, 1) 119 | 120 | // Ensure we have claimed it 121 | cl := s.m.GetPartitionClaim("test1", 0) 122 | if cl.LastHeartbeat <= 0 || cl.ClientID != "cl" || cl.GroupID != "gr" { 123 | c.Errorf("PartitionClaim values unexpected %+v", cl) 124 | } 125 | if cl.CurrentOffset != 0 { 126 | c.Error("CurrentOffset is not 0") 127 | } 128 | 129 | // Now heartbeat on it to update the last offset 130 | c.Assert(s.m.Heartbeat("test1", 0, 10), IsNil) 131 | steps, err = s.m.cluster.waitForRsteps(2) 132 | c.Assert(err, IsNil) 133 | c.Assert(steps, Equals, 2) 134 | 135 | // Get the claim again, validate it's updated 136 | cl = s.m.GetPartitionClaim("test1", 0) 137 | if cl.LastHeartbeat <= 0 || cl.ClientID != "cl" || cl.GroupID != "gr" { 138 | c.Errorf("PartitionClaim values unexpected %+v", cl) 139 | } 140 | if cl.CurrentOffset != 10 { 141 | c.Error("CurrentOffset is not 10") 142 | } 143 | 144 | // Release 145 | c.Assert(s.m.ReleasePartition("test1", 0, 20), IsNil) 146 | steps, err = s.m.cluster.waitForRsteps(3) 147 | c.Assert(err, IsNil) 148 | c.Assert(steps, Equals, 3) 149 | 150 | // Get the claim again, validate it's empty 151 | cl = s.m.GetPartitionClaim("test1", 0) 152 | if cl.LastHeartbeat > 0 || cl.ClientID != "" || cl.GroupID != "" { 153 | c.Errorf("PartitionClaim values unexpected %+v", cl) 154 | } 155 | if cl.CurrentOffset != 0 { 156 | c.Error("CurrentOffset is not 20") 157 | } 158 | 159 | // Get the last known claim data 160 | cl = s.m.GetLastPartitionClaim("test1", 0) 161 | if cl.LastHeartbeat > 0 || cl.ClientID != "cl" || cl.GroupID != "gr" { 162 | c.Errorf("PartitionClaim values unexpected %+v", cl) 163 | } 164 | if cl.CurrentOffset != 20 { 165 | c.Error("CurrentOffset is not 20") 166 | } 167 | } 168 | 169 | func (s *MarshalSuite) TestTerminatedMarshalRemovesSelfFromCluster(c *C) { 170 | // Test that terminated Marshalers remove their 
cluster's reference to it. 171 | c.Assert(s.m.cluster.marshalers, DeepEquals, []*Marshaler{s.m}) 172 | s.m.Terminate() 173 | c.Assert(s.m.cluster.marshalers, DeepEquals, []*Marshaler{}) 174 | } 175 | -------------------------------------------------------------------------------- /marshal/message.go: -------------------------------------------------------------------------------- 1 | /* 2 | * portal - marshal 3 | * 4 | * a library that implements an algorithm for doing consumer coordination within Kafka, rather 5 | * than using Zookeeper or another external system. 6 | * 7 | */ 8 | 9 | package marshal 10 | 11 | import ( 12 | "fmt" 13 | "strconv" 14 | "strings" 15 | ) 16 | 17 | // TODO: This all uses a dumb string representation format which is very bytes-intensive. 18 | // A binary protocol would be nice. 19 | 20 | type msgType int 21 | 22 | const ( 23 | msgLengthBase int = 8 24 | idxType int = 0 25 | idxVersion int = 1 26 | idxTimestamp int = 2 27 | idxInstanceID int = 3 28 | idxClientID int = 4 29 | idxGroupID int = 5 30 | idxTopic int = 6 31 | idxPartID int = 7 32 | idxBaseEnd int = 7 // Index of last element in base message. 33 | 34 | msgTypeHeartbeat msgType = 0 35 | msgLengthHeartbeat int = msgLengthBase + 1 36 | idxHBCurrentOffset int = idxBaseEnd + 1 37 | 38 | msgTypeClaimingPartition msgType = 1 39 | msgLengthClaimingPartition int = msgLengthBase 40 | 41 | msgTypeReleasingPartition msgType = 2 42 | msgLengthReleasingPartition int = msgLengthBase + 1 43 | idxRPCurrentOffset int = idxBaseEnd + 1 44 | 45 | msgTypeClaimingMessages msgType = 3 46 | msgLengthClaimingMessages int = msgLengthBase + 1 47 | idxCMProposedCurrentOffset int = idxBaseEnd + 1 48 | ) 49 | 50 | type message interface { 51 | Encode() string 52 | Timestamp() int 53 | Type() msgType 54 | Ownership() (string, string, string) 55 | } 56 | 57 | // decode takes a slice of bytes that should constitute a single message and attempts to 58 | // decode it into one of our message structs. 59 | func decode(inp []byte) (message, error) { 60 | parts := strings.Split(string(inp), "/") 61 | if len(parts) < msgLengthBase { 62 | return nil, fmt.Errorf("Invalid message (length): [%s]", string(inp)) 63 | } 64 | 65 | version, err := strconv.Atoi(parts[idxVersion]) 66 | if err != nil { 67 | return nil, fmt.Errorf("Invalid message (version): [%s]", string(inp)) 68 | } 69 | 70 | // Get out the base message which is always present as it identifies the sender. 
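// For reference, every message on the wire is a "/"-delimited string: a type name
// followed by the base fields and then any type-specific fields. A heartbeat, for
// example, encodes as Heartbeat/<version>/<time>/<instanceID>/<clientID>/<groupID>/<topic>/<partID>/<currentOffset>
// (see message_test.go). The base fields are parsed here; the type-specific trailing
// fields are handled in the switch below.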
71 | partID, err := strconv.Atoi(parts[idxPartID]) 72 | if err != nil { 73 | return nil, fmt.Errorf("Invalid message (partID): [%s]", string(inp)) 74 | } 75 | ts, err := strconv.Atoi(parts[idxTimestamp]) 76 | if err != nil { 77 | return nil, fmt.Errorf("Invalid message (timestamp): [%s]", string(inp)) 78 | } 79 | base := msgBase{ 80 | Version: version, 81 | Time: ts, 82 | InstanceID: parts[idxInstanceID], 83 | ClientID: parts[idxClientID], 84 | GroupID: parts[idxGroupID], 85 | Topic: parts[idxTopic], 86 | PartID: partID, 87 | } 88 | 89 | switch parts[0] { 90 | case "Heartbeat": 91 | if len(parts) != msgLengthHeartbeat { 92 | return nil, fmt.Errorf("Invalid message (hb length): [%s]", string(inp)) 93 | } 94 | offset, err := strconv.ParseInt(parts[idxHBCurrentOffset], 10, 0) 95 | if err != nil { 96 | return nil, fmt.Errorf("Invalid message (hb offset): [%s]", string(inp)) 97 | } 98 | return &msgHeartbeat{msgBase: base, CurrentOffset: int64(offset)}, nil 99 | case "ClaimingPartition": 100 | if len(parts) != msgLengthClaimingPartition { 101 | return nil, fmt.Errorf("Invalid message (cp length): [%s]", string(inp)) 102 | } 103 | return &msgClaimingPartition{msgBase: base}, nil 104 | case "ReleasingPartition": 105 | if len(parts) != msgLengthReleasingPartition { 106 | return nil, fmt.Errorf("Invalid message (rp length): [%s]", string(inp)) 107 | } 108 | offset, err := strconv.ParseInt(parts[idxRPCurrentOffset], 10, 0) 109 | if err != nil { 110 | return nil, fmt.Errorf("Invalid message (rp offset): [%s]", string(inp)) 111 | } 112 | return &msgReleasingPartition{msgBase: base, CurrentOffset: offset}, nil 113 | case "ClaimingMessages": 114 | if len(parts) != msgLengthClaimingMessages { 115 | return nil, fmt.Errorf("Invalid message (cm length): [%s]", string(inp)) 116 | } 117 | offset, err := strconv.ParseInt(parts[idxCMProposedCurrentOffset], 10, 0) 118 | if err != nil { 119 | return nil, fmt.Errorf("Invalid message (cm offset): [%s]", string(inp)) 120 | } 121 | return &msgClaimingMessages{msgBase: base, ProposedCurrentOffset: offset}, nil 122 | } 123 | return nil, fmt.Errorf("Invalid message: [%s]", string(inp)) 124 | } 125 | 126 | type msgBase struct { 127 | Version int 128 | Time int 129 | InstanceID string 130 | ClientID string 131 | GroupID string 132 | Topic string 133 | PartID int 134 | } 135 | 136 | // Encode returns a string representation of the message. 137 | func (m *msgBase) Encode() string { 138 | return fmt.Sprintf("%d/%d/%s/%s/%s/%s/%d", 139 | m.Version, m.Time, m.InstanceID, m.ClientID, m.GroupID, m.Topic, m.PartID) 140 | } 141 | 142 | // Type returns the type of this message. 143 | func (m *msgBase) Type() msgType { 144 | panic("Attempted to type the base message. This should never happen.") 145 | } 146 | 147 | // Timestamp returns the timestamp of the message 148 | func (m *msgBase) Timestamp() int { 149 | return m.Time 150 | } 151 | 152 | // Ownership returns InstanceID, ClientID, GroupID for message 153 | func (m *msgBase) Ownership() (string, string, string) { 154 | return m.InstanceID, m.ClientID, m.GroupID 155 | } 156 | 157 | // msgHeartbeat is sent regularly by all consumers to re-up their claim to the partition that 158 | // they're consuming. 159 | type msgHeartbeat struct { 160 | msgBase 161 | CurrentOffset int64 162 | } 163 | 164 | // Encode returns a string representation of the message. 
165 | func (m *msgHeartbeat) Encode() string { 166 | return "Heartbeat/" + m.msgBase.Encode() + fmt.Sprintf("/%d", m.CurrentOffset) 167 | } 168 | 169 | // Type returns the type of this message. 170 | func (m *msgHeartbeat) Type() msgType { 171 | return msgTypeHeartbeat 172 | } 173 | 174 | // Timestamp returns the timestamp of the message 175 | func (m *msgHeartbeat) Timestamp() int { 176 | return m.Time 177 | } 178 | 179 | // Ownership returns InstanceID, ClientID, GroupID for message 180 | func (m *msgHeartbeat) Ownership() (string, string, string) { 181 | return m.InstanceID, m.ClientID, m.GroupID 182 | } 183 | 184 | // msgClaimingPartition is used in the claim flow. 185 | type msgClaimingPartition struct { 186 | msgBase 187 | } 188 | 189 | // Encode returns a string representation of the message. 190 | func (m *msgClaimingPartition) Encode() string { 191 | return "ClaimingPartition/" + m.msgBase.Encode() 192 | } 193 | 194 | // Type returns the type of this message. 195 | func (m *msgClaimingPartition) Type() msgType { 196 | return msgTypeClaimingPartition 197 | } 198 | 199 | // Timestamp returns the timestamp of the message 200 | func (m *msgClaimingPartition) Timestamp() int { 201 | return m.Time 202 | } 203 | 204 | // Ownership returns InstanceID, ClientID, GroupID for message 205 | func (m *msgClaimingPartition) Ownership() (string, string, string) { 206 | return m.InstanceID, m.ClientID, m.GroupID 207 | } 208 | 209 | // msgReleasingPartition is used in a controlled shutdown to indicate that you are done with 210 | // a partition. 211 | type msgReleasingPartition struct { 212 | msgBase 213 | CurrentOffset int64 214 | } 215 | 216 | // Encode returns a string representation of the message. 217 | func (m *msgReleasingPartition) Encode() string { 218 | return "ReleasingPartition/" + m.msgBase.Encode() + fmt.Sprintf("/%d", m.CurrentOffset) 219 | } 220 | 221 | // Type returns the type of this message. 222 | func (m *msgReleasingPartition) Type() msgType { 223 | return msgTypeReleasingPartition 224 | } 225 | 226 | // Timestamp returns the timestamp of the message 227 | func (m *msgReleasingPartition) Timestamp() int { 228 | return m.Time 229 | } 230 | 231 | // Ownership returns InstanceID, ClientID, GroupID for message 232 | func (m *msgReleasingPartition) Ownership() (string, string, string) { 233 | return m.InstanceID, m.ClientID, m.GroupID 234 | } 235 | 236 | // msgClaimingMessages is used for at-most-once consumption semantics, this is a pre-commit 237 | // advisory message. 238 | type msgClaimingMessages struct { 239 | msgBase 240 | ProposedCurrentOffset int64 241 | } 242 | 243 | // Encode returns a string representation of the message. 244 | func (m *msgClaimingMessages) Encode() string { 245 | return "ClaimingMessages/" + m.msgBase.Encode() + fmt.Sprintf("/%d", m.ProposedCurrentOffset) 246 | } 247 | 248 | // Type returns the type of this message. 
249 | func (m *msgClaimingMessages) Type() msgType { 250 | return msgTypeClaimingMessages 251 | } 252 | 253 | // Timestamp returns the timestamp of the message 254 | func (m *msgClaimingMessages) Timestamp() int { 255 | return m.Time 256 | } 257 | 258 | // Ownership returns InstanceID, ClientID, GroupID for message 259 | func (m *msgClaimingMessages) Ownership() (string, string, string) { 260 | return m.InstanceID, m.ClientID, m.GroupID 261 | } 262 | -------------------------------------------------------------------------------- /PROTOCOL.md: -------------------------------------------------------------------------------- 1 | # Kafka Only Consumer Coordination 2 | 3 | This is a description of the consumer coordination protocol implemented by the Marshal 4 | library. 5 | 6 | ## Synopsis 7 | 8 | It is possible to coordinate N consumers without any shared state other than what Kafka 9 | provides. Without using Zookeeper or any other such coordination system, and still provide 10 | similar guarantees/functionality. 11 | 12 | The essence of this approach is that we can use a new topic in Kafka such as `__marshal` 13 | as a write-ahead log/transaction log and use it for constructing a race-safe consumer 14 | coordination protocol. Since Kafka guarantees ordering within a partition, outside of an 15 | unclean leader election we can safely coordinate consumers. 16 | 17 | The goal is for this protocol to be robust to all failure cases. The goal is *not* for the 18 | protocol to be the absolute fastest thing out there. 19 | 20 | ## Protocol Messages 21 | 22 | This section defines the messages used in the protocol. 23 | 24 | In the following definitions, certain bolded words are used to define parameters to 25 | the message: 26 | 27 | - **client_id** is an arbitrary string. 28 | - This value should be unique within a **group_id** (see below). 29 | - This can be random, but, you might want to make it predictable for your set of consumers. 30 | If you do, you gain the property that your consumer can restart where it left off if 31 | you restart it (since it can resume its own heartbeats as long as the **client_id** is 32 | stable). 33 | - **group_id** is a namespaced opaque string. I.e., if you are the foo team, 34 | you should use a value such as `foo.bar_consumer`. 35 | - **topic**, **desired_topic** is a string of the topic name, this is also namespaced 36 | by your team. 37 | - **partition**, **desired_partition** is an integer as provided by Kafka. 38 | - **last_offset**, **proposed_last_offset** is an integer representing a message offset as 39 | provided by Kafka. This should never be generated on your own. 40 | 41 | Some constants defined in the protocol 42 | 43 | - *HeartbeatInterval* is the maximum allowed time between two heartbeats. Consumers are expected 44 | to send heartbeat messages once per interval. The smaller this number is, the busier the 45 | coordination topic will be, but the faster failure recovery will be. 46 | 47 | The protocol is defined with several simple messages: 48 | 49 | 1. `Heartbeat` which includes **client_id**, **group_id**, **topic**, **partition**, 50 | **last_offset**. These are sent at most every **HeartbeatInterval** seconds apart. 51 | 1. `ClaimingPartition` which includes **client_id**, **group_id**, **topic**, **partition** 52 | and is used as the initial request stating that you wish to claim a partition. 53 | 1. 
`ReleasingPartition` which includes **client_id**, **group_id**, **topic**, 54 | **partition**, **last_offset** and is used when a consumer wants to proactively release 55 | a partition. 56 | 1. `ClaimingMessages` which includes **client_id**, **group_id**, **topic**, 57 | **partition**, **proposed_last_offset** is used for the At Most Once consumption flow. 58 | 1. `ReleaseGroup` which includes **client_id**, **group_id**, **msg_expire_time**. This message 59 | is sent by a special Admin actor, which can pause an entire consumer group identified 60 | by the **group_id**, until **msg_expire_time**. This message is used to set a consumer 61 | group's position. See the section "Setting Consumer Group Position." 62 | 63 | ## Determining World State 64 | 65 | This is the primary engine of Marshal. The "rationalizer" will read the messages in the 66 | coordination topic and calculate a "world state" given the sequence of messages in the log. 67 | The algorithm works based on everybody coming to the same conclusion about the world state 68 | given the same log, i.e., every state transition is determined solely by the messages in 69 | the logs and their relative ordering. 70 | 71 | The state is calculated, for a given **topic**/**partition** you wish to know about, 72 | by fully consuming the data from the coordination topic (which should be relatively 73 | minimal and fast to process) and constructing a current state of the world (as of the 74 | last message you have). 75 | 76 | You can know what consumers exist (actively) based on the heartbeats and partition messages. 77 | 78 | ### Heartbeats 79 | 80 | Every consumer is required to heartbeat every **HeartbeatInterval** seconds. 81 | 82 | A client is considered fresh when less than **HeartbeatInterval** seconds have elapsed 83 | since the last heartbeat. 84 | 85 | A client is considered to be in an unknown state when **HeartbeatInterval** to *twice that 86 | value* seconds have elapsed. 87 | 88 | A client is considered stale when *more than twice* **HeartbeatInterval** seconds have 89 | elapsed and no further heartbeat has been received. 90 | 91 | ## Partition Assignment for Consumption 92 | 93 | This is the meat of the system and the reason for such an algorithm. Being able to safely 94 | assign partitions to consumers such that they can process messages with the desired 95 | properties is non-trivial and requires this coordination. 96 | 97 | ### Consuming a New Partition 98 | 99 | This assumes that you want to start consuming a new partition. 100 | 101 | 1. Pick a partition to try to claim 102 | 1. This is done by whatever method you choose. Random, round-robin, etc. 103 | 1. Pick coordinating partition based on 104 | `hash(desired_topic, desired_partition) % number of partitions in coordinating topic` 105 | 1. Determine state of world on chosen coordinating partition 106 | 1. If the partition you wish to claim is already claimed, and the heartbeat for 107 | that partition is not stale, return to step 1 (remember, "stale" is defined as 108 | twice the **HeartbeatInterval**) 109 | 1. Since the heartbeat is stale, this consumer may continue to step 4 and attempt to claim 110 | 1. Send a `ClaimingPartition` message 111 | 1. Re-determine the state of the world (read up to the end) 112 | 1. Look for the earliest `ClaimingPartition` message associated with the desired 113 | topic/partition, if it was ours (message from step 4) then continue 114 | 1. If somebody else won the race, return to step 1 115 | 1. 
If the current client wins (has the earliest claim), send an immediate `Heartbeat` 116 | message and consumption on desired topic/partition can begin 117 | 118 | This process, assuming no Kafka data loss (we'll have to carefully make sure to produce with the right options), should guarantee safe partition assignment. 119 | 120 | ### Consuming Recently Used Partitions on Restart 121 | 122 | This is an optimization to help prevent churn of consumption. If you define your consumer 123 | such that you have a predictable **client_id** and it is unique within your consumer 124 | group, you can use that to determine what partitions your client was previously consuming. 125 | 126 | 1. Determine the complete state of the world 127 | 1. This requires scanning "recent" events for the entire coordination topic (all partitions) 128 | 1. If heartbeats are found for the current **client_id**+**group_id**, and if those 129 | heartbeats are fresh (only), then send a new heartbeat and recover state 130 | 1. Note: The previous heartbeats should contain enough information to continue where you 131 | left off (modulo the guarantees of ALO/AMO consumption) 132 | 133 | ## Consumption 134 | 135 | There are two main algorithms for message processing. Both of these assume that your client 136 | *already has a valid claim to a partition* that you are going to be consuming from. 137 | 138 | ### At Most Once (AMO) 139 | 140 | The semantics of at-most-once consumption are that you would prefer to consume a message 141 | zero (0) times (never see it) than to consume it more than once. 142 | 143 | To do safely, we use the same linear nature of the Kafka partitions to make a 144 | transactional guarantee: 145 | 146 | 1. Determine that we still have the claim to this partition 147 | 1. Fetch a batch of messages 148 | 1. Batch size should be adjusted for the QPS of your category, the smaller your batches 149 | the more traffic against the coordination topic, but the fewer you lose in the failure 150 | case 151 | 1. Produce a `ClaimingMessages` message with the last offset from our batch of messages 152 | 1. Re-determine the state of the world 153 | 1. Validate we still hold the claim on this partition 154 | 1. Validate that our claim of the messages from step 2 is in the log 155 | 1. Send `Heartbeat` with the offset to "commit" the transaction 156 | 1. Process the messages in this batch 157 | 158 | Assuming again that Kafka is durable and we use the right settings, this should provide the 159 | guarantees we want for at-most-once. Any failure along the way will be handled by either a 160 | normal heartbeat-expire-retry loop (steps 1-4 fail) and if we fail during step 5 then that 161 | batch of messages will be dropped per AMO semantics. 162 | 163 | ### At Least Once (ALO) 164 | 165 | Much easier than at-most-once, still assuming we have the claim to a partition: 166 | 167 | 1. Determine that we still have the claim to this partition 168 | 1. Fetch a batch of messages 169 | 1. Process batch of messages 170 | 1. Every **HeartbeatInterval**, send a `Heartbeat` message with the last processed offset 171 | 172 | As long as you heartbeat every interval, failure is constrained to only re-process at most 173 | one single **HeartbeatInterval** of messages. 174 | 175 | ### Consumer Failure 176 | 177 | If a consumer stops reporting heartbeats, other consumers can pick up that partition. 
178 | In essence, if no `Heartbeat` messages have arrived on a partition for twice the 179 | **HeartbeatInterval**, then whichever consumers are looking for partitions will attempt 180 | to claim that partition, starting that whole process. 181 | 182 | In the ALO consumption case, this can lead to two consumers running on a single batch 183 | of messages at the same time, but it is constrained to one batch. The AMO consumer cannot 184 | have that failure case, at worst it will never process some messages. 185 | 186 | ## Setting Consumer Group Position 187 | 188 | Documentation being written. 189 | 190 | # TODO 191 | 192 | I believe a Kafka-service system would want to consume messages off of a partition but 193 | not necessarily take a whole lock on the partition. I.e. just saying "I claim message 194 | offsets X-Y". You can also then fix latency issues by pre-claiming ranges so that the 195 | instant they become used you've already negotiated the lock on that range and can 196 | start processing them/handing them out? 197 | -------------------------------------------------------------------------------- /marshal/rationalizer_test.go: -------------------------------------------------------------------------------- 1 | package marshal 2 | 3 | import ( 4 | "sync" 5 | "testing" 6 | "time" 7 | 8 | . "gopkg.in/check.v1" 9 | 10 | "github.com/op/go-logging" 11 | ) 12 | 13 | func init() { 14 | // TODO: This changes logging for the whole suite. Is that what we want? 15 | logging.SetLevel(logging.ERROR, "PortalMarshal") 16 | } 17 | 18 | func Test(t *testing.T) { TestingT(t) } 19 | 20 | var _ = Suite(&RationalizerSuite{}) 21 | 22 | type RationalizerSuite struct { 23 | m *Marshaler 24 | out chan message 25 | ret chan struct{} 26 | } 27 | 28 | func (s *RationalizerSuite) SetUpTest(c *C) { 29 | ResetTestLogger(c) 30 | 31 | s.m = NewWorld() 32 | s.out = make(chan message) 33 | go s.m.cluster.rationalize(0, s.out) 34 | 35 | // Build our return channel and insert it (simulating what the marshal does for 36 | // actually trying to claim) 37 | s.ret = make(chan struct{}, 1) 38 | topic := s.m.cluster.getPartitionState(s.m.groupID, "test1", 0) 39 | topic.lock.Lock() 40 | topic.partitions[0].pendingClaims = append(topic.partitions[0].pendingClaims, s.ret) 41 | topic.lock.Unlock() 42 | } 43 | 44 | func (s *RationalizerSuite) TearDownTest(c *C) { 45 | s.m.Terminate() 46 | close(s.out) 47 | 48 | // This one might have already been closed, so safely close it. 
49 | select { 50 | case <-s.ret: 51 | default: 52 | close(s.ret) 53 | } 54 | } 55 | 56 | func NewWorld() *Marshaler { 57 | return &Marshaler{ 58 | quit: new(int32), 59 | clientID: "cl", 60 | groupID: "gr", 61 | cluster: &KafkaCluster{ 62 | quit: new(int32), 63 | rsteps: new(int32), 64 | groups: make(map[string]map[string]*topicState), 65 | partitions: 1, 66 | lock: &sync.RWMutex{}, 67 | rationalizers: &sync.WaitGroup{}, 68 | }, 69 | lock: &sync.RWMutex{}, 70 | } 71 | } 72 | 73 | func heartbeat(ts int, ii, cl, gr, t string, id int, lo int64) *msgHeartbeat { 74 | return &msgHeartbeat{ 75 | msgBase: msgBase{ 76 | Time: ts, 77 | InstanceID: ii, 78 | ClientID: cl, 79 | GroupID: gr, 80 | Topic: t, 81 | PartID: id, 82 | }, 83 | CurrentOffset: lo, 84 | } 85 | } 86 | 87 | func claimingPartition(ts int, ii, cl, gr, t string, id int) *msgClaimingPartition { 88 | return &msgClaimingPartition{ 89 | msgBase: msgBase{ 90 | Time: ts, 91 | InstanceID: ii, 92 | ClientID: cl, 93 | GroupID: gr, 94 | Topic: t, 95 | PartID: id, 96 | }, 97 | } 98 | } 99 | 100 | func releasingPartition(ts int, ii, cl, gr, t string, id int, lo int64) *msgReleasingPartition { 101 | return &msgReleasingPartition{ 102 | msgBase: msgBase{ 103 | Time: ts, 104 | InstanceID: ii, 105 | ClientID: cl, 106 | GroupID: gr, 107 | Topic: t, 108 | PartID: id, 109 | }, 110 | CurrentOffset: lo, 111 | } 112 | } 113 | 114 | func (s *RationalizerSuite) WaitForRsteps(c *C, cluster *KafkaCluster, numSteps int) { 115 | steps, err := cluster.waitForRsteps(numSteps) 116 | c.Assert(err, IsNil) 117 | c.Assert(steps, Equals, numSteps) 118 | } 119 | 120 | func (s *RationalizerSuite) TestClaimed(c *C) { 121 | // This log, a single heartbeat at t=0, indicates that this topic/partition are claimed 122 | // by the client/group given. 123 | s.out <- heartbeat(1, "ii", "cl", "gr", "test1", 0, 0) 124 | s.WaitForRsteps(c, s.m.cluster, 1) 125 | 126 | // They heartbeated at 1, should be claimed as of 1. 127 | s.m.cluster.ts = 1 128 | c.Assert(s.m.Claimed("test1", 0), Equals, true) 129 | 130 | // Should still be claimed immediately after the interval 131 | s.m.cluster.ts = HeartbeatInterval + 2 132 | c.Assert(s.m.Claimed("test1", 0), Equals, true) 133 | 134 | // And still claimed right at the last second of the cutoff 135 | s.m.cluster.ts = HeartbeatInterval * 2 136 | c.Assert(s.m.Claimed("test1", 0), Equals, true) 137 | 138 | // Should NOT be claimed >2x the heartbeat interval 139 | s.m.cluster.ts = HeartbeatInterval*2 + 1 140 | c.Assert(s.m.Claimed("test1", 0), Equals, false) 141 | } 142 | 143 | func (s *RationalizerSuite) TestClaimNotMutable(c *C) { 144 | // This log, a single heartbeat at t=0, indicates that this topic/partition are claimed 145 | // by the client/group given. 146 | s.out <- heartbeat(1, "ii", "cl", "gr", "test1", 0, 0) 147 | s.WaitForRsteps(c, s.m.cluster, 1) 148 | 149 | // They heartbeated at 1, should be claimed as of 1. 150 | s.m.cluster.ts = 1 151 | cl := s.m.GetPartitionClaim("test1", 0) 152 | c.Assert(cl.LastHeartbeat, Not(Equals), int64(0)) 153 | 154 | // Modify structure, then refetch and make sure it hasn't been mutated 155 | cl.ClientID = "invalid" 156 | cl2 := s.m.GetPartitionClaim("test1", 0) 157 | c.Assert(cl2.LastHeartbeat, Not(Equals), int64(0)) 158 | c.Assert(cl2.ClientID, Equals, "cl") 159 | } 160 | 161 | func (s *RationalizerSuite) TestClaimNotOurs(c *C) { 162 | // This log, a single heartbeat at t=0, indicates that this topic/partition are claimed 163 | // by the client/group given. 
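// Note that this heartbeat is for group "grother", while the Marshaler under test was
// built with group "gr", so the claim should not be visible to it until the group is
// switched below.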
164 | s.out <- heartbeat(1, "ii", "cl", "grother", "test1", 0, 0) 165 | s.WaitForRsteps(c, s.m.cluster, 1) 166 | 167 | // They heartbeated at 1, but since we have a different groupID, this should say that 168 | // the partition is not claimed 169 | s.m.cluster.ts = 1 170 | cl := s.m.GetPartitionClaim("test1", 0) 171 | c.Assert(cl.LastHeartbeat, Equals, int64(0)) 172 | 173 | // Now change our marshal's group to match 174 | s.m.groupID = "grother" 175 | s.m.cluster.ts = 1 176 | cl = s.m.GetPartitionClaim("test1", 0) 177 | c.Assert(cl.LastHeartbeat, Not(Equals), int64(0)) 178 | } 179 | 180 | func (s *RationalizerSuite) TestClaimPartition(c *C) { 181 | // This log, a single heartbeat at t=0, indicates that this topic/partition are claimed 182 | // by the client/group given. 183 | s.m.cluster.ts = 30 184 | s.out <- claimingPartition(1, "ii", "cl", "gr", "test1", 0) 185 | 186 | select { 187 | case <-s.ret: 188 | cl, err := s.m.getClaimedPartitionState("test1", 0) 189 | c.Assert(err, IsNil) 190 | c.Assert(cl, NotNil) 191 | case <-time.After(1 * time.Second): 192 | c.Error("Timed out claiming partition") 193 | } 194 | } 195 | 196 | func (s *RationalizerSuite) TestReclaimPartition(c *C) { 197 | // This log is us having the partition (HB) + a CP from someone else + a CP from us, 198 | // this should result in us owning the partition + the other person not 199 | s.m.cluster.ts = 30 200 | s.out <- heartbeat(1, "ii", "cl", "gr", "test1", 0, 0) 201 | s.out <- claimingPartition(2, "ii", "clother", "gr", "test1", 0) 202 | s.out <- claimingPartition(3, "ii", "cl", "gr", "test1", 0) 203 | 204 | select { 205 | case <-s.ret: 206 | // We own it 207 | cl, err := s.m.getClaimedPartitionState("test1", 0) 208 | c.Assert(err, IsNil) 209 | c.Assert(cl, NotNil) 210 | case <-time.After(1 * time.Second): 211 | c.Error("Timed out claiming partition") 212 | } 213 | } 214 | 215 | func (s *RationalizerSuite) TestReleaseClaim(c *C) { 216 | // This log, a single heartbeat at t=0, indicates that this topic/partition are claimed 217 | // by the client/group given. 218 | s.out <- heartbeat(1, "ii", "cl", "gr", "test1", 0, 0) 219 | s.WaitForRsteps(c, s.m.cluster, 1) 220 | 221 | // They heartbeated at 1, should be claimed as of 1. 222 | s.m.cluster.ts = 1 223 | c.Assert(s.m.Claimed("test1", 0), Equals, true) 224 | 225 | // Someone else attempts to release the claim, this shouldn't work 226 | s.out <- releasingPartition(20, "ii", "cl-bad", "gr", "test1", 0, 5) 227 | s.WaitForRsteps(c, s.m.cluster, 2) 228 | 229 | // Must be unclaimed, invalid release 230 | s.m.cluster.ts = 25 231 | c.Assert(s.m.Claimed("test1", 0), Equals, true) 232 | 233 | // Now they release it at position 10 234 | s.out <- releasingPartition(30, "ii", "cl", "gr", "test1", 0, 10) 235 | s.WaitForRsteps(c, s.m.cluster, 3) 236 | c.Assert(s.m.GetLastPartitionClaim("test1", 0).LastHeartbeat, Equals, int64(0)) 237 | c.Assert(s.m.GetLastPartitionClaim("test1", 0).LastRelease, Equals, int64(30)) 238 | 239 | // They released at 30, should be free as of 31 240 | s.m.cluster.ts = 31 241 | c.Assert(s.m.Claimed("test1", 0), Equals, false) 242 | c.Assert(s.m.GetLastPartitionClaim("test1", 0).CurrentOffset, Equals, int64(10)) 243 | } 244 | 245 | func (s *RationalizerSuite) TestClaimHandoff(c *C) { 246 | // This log, a single heartbeat at t=0, indicates that this topic/partition are claimed 247 | // by the client/group given. 
248 | s.out <- heartbeat(1, "ii", "cl", "gr", "test1", 0, 0) 249 | s.WaitForRsteps(c, s.m.cluster, 1) 250 | 251 | // They heartbeated at 1, should be claimed as of 1. 252 | s.m.cluster.ts = 1 253 | c.Assert(s.m.Claimed("test1", 0), Equals, true) 254 | 255 | // Now they hand this off to someone else who picks up the heartbeat 256 | s.out <- heartbeat(10, "ii", "cl2", "gr", "test1", 0, 10) 257 | s.WaitForRsteps(c, s.m.cluster, 2) 258 | 259 | // Must be claimed, and claimed by cl2 260 | s.m.cluster.ts = 25 261 | c.Assert(s.m.Claimed("test1", 0), Equals, true) 262 | c.Assert(s.m.GetPartitionClaim("test1", 0).ClientID, Equals, "cl2") 263 | 264 | // Now we change the group ID of our world state (which client's can't do) and validate 265 | // that these partitions are NOT claimed 266 | s.m.cluster.ts = 25 267 | s.m.groupID = "gr2" 268 | c.Assert(s.m.Claimed("test1", 0), Equals, false) 269 | c.Assert(s.m.GetPartitionClaim("test1", 0).ClientID, Equals, "") 270 | } 271 | 272 | func (s *RationalizerSuite) TestPartitionExtend(c *C) { 273 | // This log, a single heartbeat at t=0, indicates that this topic/partition are claimed 274 | // by the client/group given. 275 | s.out <- heartbeat(1, "ii", "cl", "gr", "test1", 0, 0) 276 | s.WaitForRsteps(c, s.m.cluster, 1) 277 | 278 | // Ensure len is 1 279 | s.m.lock.RLock() 280 | s.m.cluster.groups["gr"]["test1"].lock.RLock() 281 | c.Assert(len(s.m.cluster.groups["gr"]["test1"].partitions), Equals, 1) 282 | s.m.cluster.groups["gr"]["test1"].lock.RUnlock() 283 | s.m.lock.RUnlock() 284 | 285 | // Extend by 4 286 | s.out <- heartbeat(2, "ii", "cl2", "gr", "test1", 4, 0) 287 | s.WaitForRsteps(c, s.m.cluster, 2) 288 | 289 | // Ensure len is 5 290 | s.m.lock.RLock() 291 | defer s.m.lock.RUnlock() 292 | s.m.cluster.groups["gr"]["test1"].lock.RLock() 293 | defer s.m.cluster.groups["gr"]["test1"].lock.RUnlock() 294 | c.Assert(len(s.m.cluster.groups["gr"]["test1"].partitions), Equals, 5) 295 | 296 | // Ensure 0 and 4 are claimed by us 297 | p1 := s.m.cluster.groups["gr"]["test1"].partitions[0] 298 | c.Assert(p1.ClientID, Equals, "cl") 299 | c.Assert(p1.GroupID, Equals, "gr") 300 | c.Assert(p1.LastHeartbeat, Equals, int64(1)) 301 | p2 := s.m.cluster.groups["gr"]["test1"].partitions[4] 302 | c.Assert(p2.ClientID, Equals, "cl2") 303 | c.Assert(p2.GroupID, Equals, "gr") 304 | c.Assert(p2.LastHeartbeat, Equals, int64(2)) 305 | } 306 | -------------------------------------------------------------------------------- /marshal/rationalizer.go: -------------------------------------------------------------------------------- 1 | /* 2 | * portal - marshal 3 | * 4 | * a library that implements an algorithm for doing consumer coordination within Kafka, rather 5 | * than using Zookeeper or another external system. 6 | * 7 | */ 8 | 9 | package marshal 10 | 11 | import ( 12 | "sync/atomic" 13 | "time" 14 | 15 | "github.com/zorkian/kafka" 16 | "github.com/jpillora/backoff" 17 | ) 18 | 19 | // kafkaConsumerChannel creates a consumer that continuously attempts to consume messages from 20 | // Kafka for the given partition. 21 | func (c *KafkaCluster) kafkaConsumerChannel(partID int) <-chan message { 22 | log.Debugf("[%s] rationalize[%d]: starting", c.name, partID) 23 | out := make(chan message, 1000) 24 | go c.consumeFromKafka(partID, out, false) 25 | return out 26 | } 27 | 28 | // consumeFromKafka will start consuming messages from Kafka and writing them to the given 29 | // channel forever. 
It is important that this method closes the "out" channel when it's done, 30 | // as that instructs the downstream goroutine to exit. 31 | func (c *KafkaCluster) consumeFromKafka(partID int, out chan message, startOldest bool) { 32 | var err error 33 | var alive bool 34 | var offsetFirst, offsetNext int64 35 | 36 | // Exit logic -- make sure downstream knows we exited. 37 | defer func() { 38 | log.Debugf("[%s] rationalize[%d]: terminating.", c.name, partID) 39 | close(out) 40 | }() 41 | 42 | // Try to connect to Kafka. This might sleep a bit and retry since the broker could 43 | // be down a bit. 44 | retry := &backoff.Backoff{Min: 500 * time.Millisecond, Jitter: true} 45 | for ; true; time.Sleep(retry.Duration()) { 46 | // Figure out how many messages are in this topic. This can fail if the broker handling 47 | // this partition is down, so we will loop. 48 | offsetFirst, err = c.broker.OffsetEarliest(MarshalTopic, int32(partID)) 49 | if err != nil { 50 | log.Errorf("[%s] rationalize[%d]: failed to get offset: %s", c.name, partID, err) 51 | continue 52 | } 53 | offsetNext, err = c.broker.OffsetLatest(MarshalTopic, int32(partID)) 54 | if err != nil { 55 | log.Errorf("[%s] rationalize[%d]: failed to get offset: %s", c.name, partID, err) 56 | continue 57 | } 58 | log.Debugf("[%s] rationalize[%d]: offsets %d to %d", 59 | c.name, partID, offsetFirst, offsetNext) 60 | 61 | // TODO: Is there a case where the latest offset is X>0 but there is no data in 62 | // the partition? does the offset reset to 0? 63 | if offsetNext == 0 || offsetFirst == offsetNext { 64 | alive = true 65 | c.rationalizers.Done() 66 | } 67 | break 68 | } 69 | retry.Reset() 70 | 71 | // Assume we're starting at the oldest offset for consumption 72 | consumerConf := kafka.NewConsumerConf(MarshalTopic, int32(partID)) 73 | consumerConf.RetryErrLimit = 1 // Do not retry 74 | consumerConf.StartOffset = kafka.StartOffsetOldest 75 | consumerConf.RequestTimeout = c.options.MarshalRequestTimeout 76 | consumerConf.RetryWait = c.options.MarshalRequestRetryWait 77 | 78 | // Get the offsets of this partition, we're going to arbitrarily pick something that 79 | // is ~100,000 from the end if there's more than that. This is only if startOldest is 80 | // false, i.e., we didn't run into a "message too new" situation. 81 | checkMessageTs := false 82 | if !startOldest && offsetNext-offsetFirst > 100000 { 83 | checkMessageTs = true 84 | consumerConf.StartOffset = offsetNext - 100000 85 | log.Infof("[%s] rationalize[%d]: fast forwarding to offset %d.", 86 | c.name, partID, consumerConf.StartOffset) 87 | } 88 | 89 | consumer, err := c.broker.Consumer(consumerConf) 90 | if err != nil { 91 | // Unfortunately this is a termination error, as without being able to consume this 92 | // partition we can't effectively rationalize. 93 | log.Errorf("[%s] rationalize[%d]: Failed to create consumer: %s", c.name, partID, err) 94 | c.Terminate() 95 | return 96 | } 97 | 98 | // Consume messages forever, or until told to quit. 99 | for !c.Terminated() { 100 | msgb, err := consumer.Consume() 101 | if err != nil { 102 | // The internal consumer will do a number of retries. If we get an error here, 103 | // we're probably in the middle of a partition handoff. We should pause so we 104 | // don't hammer the cluster, but otherwise continue. 
105 | log.Warningf("[%s] rationalize[%d]: failed to consume: %s", c.name, partID, err) 106 | time.Sleep(retry.Duration()) 107 | continue 108 | } 109 | retry.Reset() 110 | 111 | msg, err := decode(msgb.Value) 112 | if err != nil { 113 | // Invalid message in the streac. This should never happen, but if it does, just 114 | // continue on. 115 | // TODO: We should probably think about this. If we end up in a situation where 116 | // one version of this software has a bug that writes invalid messages, it could 117 | // be doing things we don't anticipate. Of course, crashing all consumers 118 | // reading that partition is also bad. 119 | log.Errorf("[%s] rationalize[%d]: %s", c.name, partID, err) 120 | 121 | // In the case where the first message is an invalid message, we need to 122 | // to notify that we're alive now 123 | if !alive { 124 | alive = true 125 | c.rationalizers.Done() 126 | } 127 | continue 128 | } 129 | 130 | // If we are on our first message, and we started at a non-zero offset, we need 131 | // to check to make sure that the timestamp is older than a given threshold. If it's 132 | // too new, that indicates our 100000 try didn't work, so let's go from the start. 133 | // TODO: This could be a binary search or something. 134 | if checkMessageTs { 135 | if int64(msg.Timestamp()) > time.Now().Unix()-HeartbeatInterval*2 { 136 | log.Warningf("[%s] rationalize[%d]: rewinding, fast-forwarded message was too new", 137 | c.name, partID) 138 | go c.consumeFromKafka(partID, out, true) 139 | return // terminate self. 140 | } 141 | checkMessageTs = false 142 | } 143 | 144 | log.Debugf("[%s] rationalize[%d]: @%d: [%s]", c.name, partID, msgb.Offset, msg.Encode()) 145 | out <- msg 146 | 147 | // This is a one-time thing that fires the first time the rationalizer comes up 148 | // and makes sure we actually process all of the messages. 149 | if !alive && msgb.Offset >= offsetNext-1 { 150 | for len(out) > 0 { 151 | time.Sleep(100 * time.Millisecond) 152 | } 153 | log.Infof("[%s] rationalize[%d]: reached offset %d, now alive", 154 | c.name, partID, msgb.Offset) 155 | alive = true 156 | c.rationalizers.Done() 157 | } 158 | } 159 | } 160 | 161 | // updateClaim is called whenever we need to adjust a claim structure. 162 | func (c *KafkaCluster) updateClaim(msg *msgHeartbeat) { 163 | topic := c.getPartitionState(msg.GroupID, msg.Topic, msg.PartID) 164 | 165 | topic.lock.Lock() 166 | defer topic.lock.Unlock() 167 | 168 | // Note that a heartbeat will just set the claim structure. It's not valid to heartbeat 169 | // for something you don't own (which is why we have ClaimPartition as a separate 170 | // message), so we can only assume it's valid. 171 | topic.partitions[msg.PartID].InstanceID = msg.InstanceID 172 | topic.partitions[msg.PartID].ClientID = msg.ClientID 173 | topic.partitions[msg.PartID].GroupID = msg.GroupID 174 | topic.partitions[msg.PartID].CurrentOffset = msg.CurrentOffset 175 | topic.partitions[msg.PartID].LastHeartbeat = int64(msg.Time) 176 | topic.partitions[msg.PartID].LastRelease = 0 177 | } 178 | 179 | // releaseClaim is called whenever someone has released their claim on a partition. 
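// A release is only honored when the releasing client currently owns the claim; otherwise the
// message is logged and dropped. On a valid release we record the client's final offset, zero
// LastHeartbeat, and set LastRelease, which is what frees the partition for other claimants.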
180 | func (c *KafkaCluster) releaseClaim(msg *msgReleasingPartition) { 181 | topic := c.getPartitionState(msg.GroupID, msg.Topic, msg.PartID) 182 | 183 | topic.lock.Lock() 184 | defer topic.lock.Unlock() 185 | 186 | // The partition must be claimed by the person releasing it 187 | if !topic.partitions[msg.PartID].checkOwnership(msg, true) { 188 | log.Warningf( 189 | "[%s] ReleasePartition %s:%d from client %s that doesn't own it. Dropping.", 190 | c.name, msg.Topic, msg.PartID, msg.ClientID) 191 | return 192 | } 193 | 194 | // Record the offset they told us they last processed, and then set the heartbeat to 0 195 | // which means this is no longer claimed 196 | topic.partitions[msg.PartID].CurrentOffset = msg.CurrentOffset 197 | topic.partitions[msg.PartID].LastHeartbeat = 0 198 | topic.partitions[msg.PartID].LastRelease = int64(msg.Time) 199 | } 200 | 201 | // handleClaim is called whenever we see a ClaimPartition message. 202 | func (c *KafkaCluster) handleClaim(msg *msgClaimingPartition) { 203 | topic := c.getPartitionState(msg.GroupID, msg.Topic, msg.PartID) 204 | 205 | topic.lock.Lock() 206 | defer topic.lock.Unlock() 207 | 208 | // Send message to all pending consumers then clear the list (it is a violation of the 209 | // protocol to send two responses). This fires at the end when we exit so that anybody 210 | // who is waiting on this partition will know the state has changed. 211 | defer func() { 212 | for _, out := range topic.partitions[msg.PartID].pendingClaims { 213 | close(out) 214 | } 215 | topic.partitions[msg.PartID].pendingClaims = nil 216 | }() 217 | 218 | // If the partition is already claimed, there's nothing we need to do. 219 | if topic.partitions[msg.PartID].claimed(c.ts) { 220 | return 221 | } 222 | 223 | // At this point, the partition is unclaimed, which means we know we have the first 224 | // ClaimPartition message. As soon as we get it, we fill in the structure which makes 225 | // us think it's claimed (it is). 226 | topic.partitions[msg.PartID].InstanceID = msg.InstanceID 227 | topic.partitions[msg.PartID].ClientID = msg.ClientID 228 | topic.partitions[msg.PartID].GroupID = msg.GroupID 229 | topic.partitions[msg.PartID].LastHeartbeat = int64(msg.Time) 230 | topic.partitions[msg.PartID].LastRelease = 0 231 | } 232 | 233 | // rationalize is a goroutine that constantly consumes from a given partition of the marshal 234 | // topic and makes changes to the world state whenever something happens. 235 | func (c *KafkaCluster) rationalize(partID int, in <-chan message) { // Might be in over my head. 236 | for !c.Terminated() { 237 | msg, ok := <-in 238 | if !ok { 239 | log.Infof("[%s] rationalize[%d]: exiting, channel closed", c.name, partID) 240 | return 241 | } 242 | 243 | switch msg.Type() { 244 | case msgTypeHeartbeat: 245 | c.updateClaim(msg.(*msgHeartbeat)) 246 | case msgTypeClaimingPartition: 247 | c.handleClaim(msg.(*msgClaimingPartition)) 248 | case msgTypeReleasingPartition: 249 | c.releaseClaim(msg.(*msgReleasingPartition)) 250 | case msgTypeClaimingMessages: 251 | // TODO: Implement. 
252 | 		}
253 | 
254 | 		// Update step counter so the test suite can wait for messages to be
255 | 		// processed in a predictable way (rather than waiting random times)
256 | 		atomic.AddInt32(c.rsteps, 1)
257 | 	}
258 | 	log.Infof("[%s] rationalize[%d]: exiting, Marshaler terminated", c.name, partID)
259 | }
260 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Marshal - a Kafka consumer coordination library
2 | 
3 | [![GoDoc](http://img.shields.io/badge/godoc-reference-blue.svg)](http://godoc.org/github.com/zorkian/marshal/marshal)
4 | [![Build Status](https://travis-ci.org/zorkian/marshal.svg)](https://travis-ci.org/zorkian/marshal)
5 | 
6 | Marshal is **in beta**. We have deployed it in a few places and are
7 | working to ensure it's stable and fast. It is not 100% battle tested
8 | yet, so feedback is very welcome.
9 | 
10 | ## Purpose
11 | 
12 | This project assumes you have some familiarity with Kafka. You should
13 | know what a topic is and what partitions are.
14 | 
15 | In Kafka, the unit of scalability is the partition. If you have a
16 | topic that is getting "too busy", you increase the partition count.
17 | Consumption of data from those busy topics requires consumers to be
18 | aware of these partitions and be able to coordinate their consumption
19 | across all of the consumers.
20 | 
21 | Traditional setups use Zookeeper or some other system for coordinating
22 | consumers. This works in many situations, but introduces a point of
23 | failure that isn't necessary. It is possible to completely perform
24 | consumer coordination using Kafka alone.
25 | 
26 | Additionally, getting consumer coordination correct is a rather taxing
27 | exercise in development and, frankly, shouldn't need to be done for
28 | every single project, company, etc. There should be an open source
29 | system that handles it for you.
30 | 
31 | Marshal is a library that you can drop into your Go programs and use
32 | to coordinate the consumption of partitions across multiple processes,
33 | servers, etc. It is implemented in terms of Kafka itself: zero extra
34 | dependencies.
35 | 
36 | Marshal is designed for use in production environments where there are
37 | many topics, each topic having hundreds of partitions, with potentially
38 | thousands of consumers working in concert across the infrastructure to
39 | consume them. Marshal is designed for big environments with critical
40 | needs.
41 | 
42 | ## Usage
43 | 
44 | This module is designed to be extremely simple to use. The basic logical
45 | flow is that you create a Marshaler and then you use that to create as
46 | many Consumers as you need topics to consume. Logically, you want one
47 | Marshaler in your program, and you want a single Consumer per topic that
48 | you need to consume from.
49 | 50 | Here's the simplest example (but see a more complicated example in the 51 | example directory): 52 | 53 | ```go 54 | package main 55 | 56 | import "fmt" 57 | import "github.com/zorkian/marshal/marshal" 58 | 59 | func main() { 60 | marshaler, _ := marshal.NewMarshaler( 61 | "clientid", "groupid", []string{"127.0.0.1:9092"}) 62 | defer marshaler.Terminate() 63 | 64 | consumer, _ := marshaler.NewConsumer( 65 | []string{"some-topic"}, marshal.NewConsumerOptions()) 66 | defer consumer.Terminate() 67 | 68 | msgChan := consumer.ConsumeChannel() 69 | 70 | for { 71 | msg := <-msgChan 72 | fmt.Printf("Consumed message: %s", msg.Value) 73 | consumer.Commit(msg) 74 | } 75 | } 76 | ``` 77 | 78 | If you were to hypothetically run this against a cluster that contained 79 | a topic named `some-topic` that had 8 partitions, it would begin 80 | claiming those partitions one by one until it had them all. If you 81 | started up a second copy of the program, it would only claim the 82 | partitions that are not already claimed. If the first one dies, the 83 | second one will pick up the dropped partitions within a few minutes. 84 | 85 | In essence, Marshal takes all of the effort of consumer coordination out 86 | of your software and puts it where it belongs: on Kafka. 87 | 88 | ## How Coordination Works 89 | 90 | Please read this section to get a handle on how Kafka performs 91 | coordination and the guarantees that it gives you. In particular, the 92 | failure scenarios might be interesting. 93 | 94 | If you want the gory details about the protocol 95 | used internally, please see the [PROTOCOL 96 | documentation](https://github.com/zorkian/marshal/blob/master/PROTOCOL.md). 97 | You don't need to read and understand it, though, but it might be 98 | useful. 99 | 100 | ### Basic Coordination 101 | 102 | In essence, Marshal uses a special topic within Kafka to coordinate the 103 | actions of many consumers anywhere in the infrastructure. As long as 104 | the consumers can connect to the Kafka cluster you want to coordinate, 105 | you can use Marshal. There is no language dependency either -- Marshal 106 | the algorithm could be implemented in any language and consumers could 107 | coordinate with each other. 108 | 109 | We assume that you're familiar with the basics of Kafka -- notably that 110 | each partition is effectively a write-ahead log that records an ordered 111 | set of events, and that it's not possible (barring unclean leader 112 | elections) for two consumers to see different event orderings. Marshal 113 | takes advantage of that property to perform distributed coordination. 114 | 115 | When a program using Marshal starts up, the first thing it does is read 116 | the logs in the coordinating topic. These logs contain certain events, 117 | such as: claim partition, heartbeat, and release partition to name a 118 | few. 119 | 120 | Using these events Marshal can know not only what consumers exist, but 121 | what partitions they are currently working on and how far along they 122 | are. Using that information the local program can decide such things as 123 | "which partitions are unclaimed" and then take action to claim and begin 124 | consuming those partitions. 125 | 126 | ### Groups and Clients 127 | 128 | Coordination happens within "groups". When you create a `Marshaler` you 129 | can specify the group that your consumer is part of. All claims are done 130 | on a per-group basis, which means you can consume the same topic N times 131 | -- as long as you have N groups. 
There is a one-to-one mapping between 132 | "consumers that can claim a given partition" and "number of groups". 133 | 134 | The "client ID" specified when you create a `Marshaler` is used to 135 | identify a particular instance of a program. These should be unique per 136 | instance of software, but they should be reasonably stable. At Dropbox 137 | we use the name of the machine the software is running on, plus possibly 138 | an instance ID if we run multiple copies on a single box. 139 | 140 | ### Consumption of Messages 141 | 142 | The main engine of Marshal happens when you create a consumer and call 143 | `consumer.Consume()`. This will possibly return a message from one 144 | of the partitions you have claimed. You then do something with the 145 | message... and consume the next one. You don't have to do anything else. 146 | 147 | Behind the scenes, the act of consuming updates internal cursors and 148 | timers and will possibly generate heartbeat messages into the Marshal 149 | event log. These messages contain information about the last offset 150 | consumed, allowing other consumers (and monitoring systems) to know 151 | where you are within the partition. In case of failure, they can resume 152 | at the last point you heartbeated. 153 | 154 | Presently, all consumption within Marshal is **at least once**. In 155 | case of most consumer failures, it is likely a block of messages (one 156 | heartbeat interval) will be reprocessed by the next consumer. 157 | 158 | ### Message Ordering 159 | 160 | Kafka guarantees the ordering of messages committed to a partition, 161 | but does not guarantee any ordering across partitions. Marshal will 162 | give you messages from any partition it has claimed, so in essence, 163 | Marshal *does not* guarantee ordering. If you need message ordering, 164 | this library is not presently appropriate for you. 165 | 166 | If you are having throughput problems you should increase the number of 167 | partitions you have available so that Marshal can have more in-flight 168 | messages. 169 | 170 | ## Failure Modes 171 | 172 | This documents some of the failure modes and how Marshal handles them. 173 | Please let us know about more questions and we can analyze and write 174 | about them. 175 | 176 | ### Consumer Too Slow 177 | 178 | In the case where a consumer is too slow -- i.e. it is consuming more 179 | slowly from a partition than data is coming in -- Marshal will detect 180 | this and internally it will start failing its health checks. When this 181 | happens it will, after enough time has passed, decide that it is not 182 | able to sustain the load and will voluntarily surrender partitions. 183 | 184 | This is useful as a load balancing mechanism if you happen to have one 185 | consumer that ends up with 8 claims while another has only a handful, 186 | the former can shed load and the latter will pick it up. 187 | 188 | However, it is worth noting that in the unbalanced scenario, as long 189 | as the consumers are keeping up with the traffic they won't release 190 | partitions. It is perfectly valid right now for Marshal consumers to end 191 | up unbalanced -- as long as they're all pulling their weight. 192 | 193 | ### Consumer Death: Expected 194 | 195 | If a consumer dies or shuts down in an expected (controlled) way, 196 | Marshal will attempt to commit release partition events into the log. 
If 197 | this happens successfully then other consumers will be able to pick up 198 | the partitions within seconds and begin consuming exactly where the last 199 | consumer left off. 200 | 201 | No data is skipped or double-consumed in this mode and the downtime is 202 | extremely minimal. 203 | 204 | ### Consumer Death: Unexpected 205 | 206 | If a consumer dies unexpectedly, things are slightly worse off. Assuming 207 | a hardware failure or other such issue (network split, etc), the 208 | partition's claim will start to become stale. From the perspective of 209 | the rest of the fleet, they will have to wait an appropriate interval 210 | (two heartbeats) until they can claim the partition. 211 | 212 | Data might be double-consumed, but the maximum amount is one heartbeat's 213 | worth. Depending on the last time you heartbeated, at worst you will see 214 | that many messages be double-consumed. The downtime of consumption is 215 | also up to two heartbeat intervals at worst. 216 | 217 | ### Network Partitions 218 | 219 | Since Kafka can only have a single leader for a partition, any consumers 220 | that are on the side of the leader will be able to continue working. 221 | Consumers that are on the other side will fail to heartbeat and will 222 | stop being able to work -- even if they could otherwise reach the leader 223 | for the topics they were consuming. 224 | 225 | The consumers on the side of the Marshal coordination partitions will be 226 | able to tell that the other consumers dropped off and will be able to 227 | start working. (Of course, this may cause them to overload themselves 228 | with too many claims, leading to consumer slowness.) 229 | 230 | If the partition is between the consumer and Kafka, the consumers 231 | will be unable to consume and will also fail their heartbeat. This is 232 | effectively treated as Consumer Death: Unexpected. When the partition 233 | heals, the consumers that lost their lock will know (assuming machine 234 | time is synchronized) and will abandon their claims. 235 | 236 | ## Important Notes 237 | 238 | This system assumes that timestamps are valid. If your machines are 239 | not using NTP to synchronize their clocks, you will not be able to get 240 | deterministic behavior. Sorry. 241 | 242 | Marshal also relies on all actors being good actors. Malicious users can 243 | cause the system to act unpredictably or at their choosing. 244 | 245 | ## Frequently Asked Questions 246 | 247 | Here are some questions we've seen. For more, see us on IRC. 248 | 249 | ### My consumers are unbalanced; one has more partitions than the others. 250 | 251 | This is a design property of Marshal's implementation. We start with the 252 | premise that we can capably health check ourself and determine whether 253 | or not we are keeping up with our current claims. If that's true, then 254 | it doesn't matter how many partitions we have -- we'll be healthy. 255 | 256 | This means that we can end up in a state where one consumer has several 257 | partitions and another consumer has fewer (or none), but Marshal 258 | guarantees that all of them will be healthy. 259 | 260 | ### My consumer isn't claiming any partitions. 261 | 262 | This usually happens when you are reusing Client IDs and your consumer 263 | has previously become unhealthy and released partitions. A sick consumer 264 | will not reclaim partitions it has previously released. 
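
The sketch below shows one way to generate a per-run Client ID. This is an
illustration, not part of the API; it assumes the `github.com/pborman/uuid`
package (which Marshal itself uses for instance IDs), and the group and broker
values are placeholders:

```go
clientID := "my-service-" + uuid.New()[0:8]
marshaler, err := marshal.NewMarshaler(
	clientID, "my-group", []string{"127.0.0.1:9092"})
if err != nil {
	log.Fatalf("Failed to construct marshaler: %s", err)
}
defer marshaler.Terminate()
```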
265 | 
266 | Make sure you have multiple consumers with different Client IDs, or
267 | make sure that in the single consumer use case you are using randomly
268 | generated Client IDs every time your program starts.
269 | 
270 | ## Bugs and Contact
271 | 
272 | There may be bugs. This is a new project. There are tests, however, and
273 | we very much welcome the submission of bug reports, pull requests, etc.
274 | 
275 | Github: https://github.com/zorkian/marshal
276 | 
277 | IRC: #kafka-marshal on Freenode
278 | 
--------------------------------------------------------------------------------
/marshal/marshal.go:
--------------------------------------------------------------------------------
1 | /*
2 |  * portal - marshal
3 |  *
4 |  * a library that implements an algorithm for doing consumer coordination within Kafka, rather
5 |  * than using Zookeeper or another external system.
6 |  *
7 |  */
8 | 
9 | package marshal
10 | 
11 | import (
12 | 	"fmt"
13 | 	"sync"
14 | 	"sync/atomic"
15 | 	"time"
16 | 
17 | 	"github.com/zorkian/kafka"
18 | 	"github.com/zorkian/kafka/proto"
19 | 	"github.com/pborman/uuid"
20 | )
21 | 
22 | const (
23 | 	// MarshalTopic is the main topic used for coordination. This must be constant across all
24 | 	// consumers that you want to coordinate.
25 | 	MarshalTopic = "__marshal"
26 | 
27 | 	// HeartbeatInterval is the main timing used to determine how "chatty" the system is and how
28 | 	// fast it responds to failures of consumers. THIS VALUE MUST BE THE SAME BETWEEN ALL CONSUMERS
29 | 	// as it is critical to coordination.
30 | 	HeartbeatInterval = 60 // Measured in seconds.
31 | )
32 | 
33 | // Marshaler is the coordinator type. It is designed to be used once per (client,
34 | // group) and is thread safe. Creating one of these will create connections to your
35 | // Kafka cluster and begin actively monitoring the coordination topic.
36 | type Marshaler struct {
37 | 	// These members are not protected by the lock and can be read at any
38 | 	// time as they're write-once or only ever atomically updated. They must
39 | 	// never be overwritten once a Marshaler is created.
40 | 	quit        *int32
41 | 	cluster     *KafkaCluster
42 | 	ownsCluster bool
43 | 	instanceID  string
44 | 	clientID    string
45 | 	groupID     string
46 | 	offsets     kafka.OffsetCoordinator
47 | 
48 | 	// Lock protects the following members; you must have this lock in order to
49 | 	// read from or write to these.
50 | 	lock      *sync.RWMutex
51 | 	consumers []*Consumer
52 | }
53 | 
54 | // NewMarshaler connects to a cluster (given broker addresses) and prepares to handle marshalling
55 | // requests. Given the way this system works, the marshaler has to process all messages in the
56 | // topic before it's safely able to begin operating. This might take a while. NOTE: If you are
57 | // creating multiple marshalers in your program, you should instead call Dial and then use
58 | // the NewMarshaler method on that object.
59 | func NewMarshaler(clientID, groupID string, brokers []string) (*Marshaler, error) {
60 | 	cluster, err := Dial("automatic", brokers, NewMarshalOptions())
61 | 	if err != nil {
62 | 		return nil, err
63 | 	}
64 | 	m, err := cluster.NewMarshaler(clientID, groupID)
65 | 	if err == nil { // m is nil when an error is returned
66 | 		m.ownsCluster = true
67 | 	}
68 | 	return m, err
69 | }
70 | 
71 | // newInstanceID creates a new random instance ID for use inside Marshal messages. This
72 | // is generated new every time we restart.
73 | func newInstanceID() string {
74 | 	// A UUID4 starts with 8 random characters, so let's use that as our instance ID.
75 | // This should be a good tradeoff between randomness and brevity. 76 | return uuid.New()[0:8] 77 | } 78 | 79 | // addNewConsumer is called when a new Consumer is created. This allows Marshal to keep 80 | // track of the consumers that exist so we can operate on them later if needed. 81 | func (m *Marshaler) addNewConsumer(c *Consumer) { 82 | m.lock.Lock() 83 | defer m.lock.Unlock() 84 | 85 | m.consumers = append(m.consumers, c) 86 | } 87 | 88 | // removeConsumer is called when a Consumer is terminating and should be removed from our list. 89 | func (m *Marshaler) removeConsumer(c *Consumer) { 90 | m.lock.Lock() 91 | defer m.lock.Unlock() 92 | 93 | for i, cn := range m.consumers { 94 | if cn == c { 95 | m.consumers = append(m.consumers[:i], m.consumers[i+1:]...) 96 | break 97 | } 98 | } 99 | } 100 | 101 | // getClaimedPartitionState returns a topicState iff it is claimed by the current Marshaler. 102 | // Else, an error is returned. This is on the Marshaler becomes it's a helper to only return 103 | // a claim that is presently valid and owned by us. 104 | func (m *Marshaler) getClaimedPartitionState(topicName string, partID int) ( 105 | *topicState, error) { 106 | 107 | // Get partition state of whatever happens to be here 108 | topic := m.cluster.getPartitionState(m.groupID, topicName, partID) 109 | 110 | topic.lock.RLock() 111 | defer topic.lock.RUnlock() 112 | 113 | if !topic.partitions[partID].claimed(m.cluster.ts) { 114 | return nil, fmt.Errorf("Partition %s:%d is not claimed!", topicName, partID) 115 | } 116 | 117 | // And if it's not claimed by us... 118 | if topic.partitions[partID].GroupID != m.groupID || 119 | topic.partitions[partID].ClientID != m.clientID { 120 | return nil, fmt.Errorf("Partition %s:%d is not claimed by us!", topicName, partID) 121 | } 122 | 123 | return topic, nil 124 | } 125 | 126 | // Topics returns the list of known topics. 127 | func (m *Marshaler) Topics() []string { 128 | return m.cluster.getTopics() 129 | } 130 | 131 | // Partitions returns the count of how many partitions are in a given topic. Returns 0 if a 132 | // topic is unknown. 133 | func (m *Marshaler) Partitions(topicName string) int { 134 | return m.cluster.getTopicPartitions(topicName) 135 | } 136 | 137 | // terminateAndCleanup terminates the marshal, with the option of removing 138 | // the marshaler's reference from its associated cluster. 139 | func (m *Marshaler) terminateAndCleanup(remove bool) { 140 | if !atomic.CompareAndSwapInt32(m.quit, 0, 1) { 141 | return 142 | } 143 | 144 | m.lock.Lock() 145 | defer m.lock.Unlock() 146 | 147 | // Now terminate all of the consumers. In this codepath we do a no-release termination 148 | // because that is usually correct in production. If someone actually wants to release 149 | // they need to terminate the consumers manually. 150 | for _, cn := range m.consumers { 151 | cn.terminateAndCleanup(false, false) 152 | } 153 | m.consumers = nil 154 | 155 | // If we own the cluster, terminate it. 156 | if m.ownsCluster { 157 | m.cluster.Terminate() 158 | } 159 | 160 | // Remove this marshal from its cluster. Doing so is recommended 161 | // if the cluster doesn't remove the terminated marshal itself (by setting its 162 | // list of marshals to nil or filtering them). 163 | if remove { 164 | m.cluster.removeMarshal(m) 165 | } 166 | } 167 | 168 | // Terminate is called when we're done with the marshaler and want to shut down. 
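// Terminate is safe to call more than once (it is guarded by an atomic flag). It tears down all
// Consumers created from this Marshaler without releasing their claims; if you want the claims
// released, terminate the consumers yourself first. If this Marshaler owns its cluster (it was
// created via the package-level NewMarshaler), the underlying cluster is terminated as well.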
169 | func (m *Marshaler) Terminate() { 170 | m.terminateAndCleanup(true) 171 | } 172 | 173 | // Terminated returns whether or not we have been terminated. 174 | func (m *Marshaler) Terminated() bool { 175 | return atomic.LoadInt32(m.quit) == 1 176 | } 177 | 178 | // Claimed returns the current status on whether or not a partition is claimed by any other 179 | // consumer in our group (including ourselves). A topic/partition that does not exist is 180 | // considered to be unclaimed. 181 | func (m *Marshaler) Claimed(topicName string, partID int) bool { 182 | // The contract of this method is that if it returns something and the heartbeat is 183 | // non-zero, the partition is claimed. 184 | claim := m.GetPartitionClaim(topicName, partID) 185 | return claim.LastHeartbeat > 0 186 | } 187 | 188 | // GetPartitionClaim returns a PartitionClaim structure for a given partition. The structure 189 | // describes the consumer that is currently claiming this partition. This is a copy of the 190 | // claim structure, so changing it cannot change the world state. 191 | func (m *Marshaler) GetPartitionClaim(topicName string, partID int) PartitionClaim { 192 | topic := m.cluster.getPartitionState(m.groupID, topicName, partID) 193 | 194 | topic.lock.RLock() 195 | defer topic.lock.RUnlock() 196 | 197 | if topic.partitions[partID].claimed(m.cluster.ts) { 198 | return topic.partitions[partID] // copy. 199 | } 200 | return PartitionClaim{} 201 | } 202 | 203 | // GetLastPartitionClaim returns a PartitionClaim structure for a given partition. The structure 204 | // describes the consumer that is currently or most recently claiming this partition. This is a 205 | // copy of the claim structure, so changing it cannot change the world state. 206 | func (m *Marshaler) GetLastPartitionClaim(topicName string, partID int) PartitionClaim { 207 | topic := m.cluster.getPartitionState(m.groupID, topicName, partID) 208 | 209 | topic.lock.RLock() 210 | defer topic.lock.RUnlock() 211 | 212 | return topic.partitions[partID] // copy. 213 | } 214 | 215 | // GetPartitionOffsets returns the current state of a topic/partition. This has to hit Kafka 216 | // thrice to ask about a partition, but it returns the full state of information that can be 217 | // used to calculate consumer lag. 218 | func (m *Marshaler) GetPartitionOffsets(topicName string, partID int) (PartitionOffsets, error) { 219 | var err error 220 | 221 | o := PartitionOffsets{} 222 | o.Earliest, err = m.cluster.broker.OffsetEarliest(topicName, int32(partID)) 223 | if err != nil { 224 | return PartitionOffsets{}, err 225 | } 226 | 227 | o.Latest, err = m.cluster.broker.OffsetLatest(topicName, int32(partID)) 228 | if err != nil { 229 | return PartitionOffsets{}, err 230 | } 231 | 232 | // Get committed offsets for our particular group using our offset coordinator. 233 | o.Committed, _, err = m.offsets.Offset(topicName, int32(partID)) 234 | if err != nil { 235 | // This error happens when Kafka does not know about the partition i.e. no 236 | // offset has been committed here. In that case we ignore it. 237 | if err != proto.ErrUnknownTopicOrPartition { 238 | return PartitionOffsets{}, fmt.Errorf("offset fetch fail: %s", err) 239 | } 240 | } 241 | 242 | // Use the last claim we know about, whatever it is 243 | claim := m.GetLastPartitionClaim(topicName, partID) 244 | o.Current = claim.CurrentOffset 245 | return o, nil 246 | } 247 | 248 | // msgBase constructs a base message object for a message. 
249 | func (m *Marshaler) msgBase(topicName string, partID int) *msgBase { 250 | return &msgBase{ 251 | Version: 1, 252 | Time: int(time.Now().Unix()), 253 | InstanceID: m.instanceID, 254 | ClientID: m.clientID, 255 | GroupID: m.groupID, 256 | Topic: topicName, 257 | PartID: partID, 258 | } 259 | } 260 | 261 | // ClaimPartition is how you can actually claim a partition. If you call this, Marshal will 262 | // attempt to claim the partition on your behalf. This is the low level function, you probably 263 | // want to use a MarshaledConsumer. Returns a bool on whether or not the claim succeeded and 264 | // whether you can continue. 265 | func (m *Marshaler) ClaimPartition(topicName string, partID int) bool { 266 | topic := m.cluster.getPartitionState(m.groupID, topicName, partID) 267 | 268 | // Unlock is later, since this function might take a while 269 | // TODO: Move this logic to a func and defer the lock (for sanity sake) 270 | topic.lock.Lock() 271 | 272 | // If the topic is already claimed, we can short circuit the decision process 273 | if topic.partitions[partID].claimed(m.cluster.ts) { 274 | defer topic.lock.Unlock() 275 | if topic.partitions[partID].GroupID == m.groupID && 276 | topic.partitions[partID].ClientID == m.clientID { 277 | return true 278 | } 279 | log.Warningf("Attempt to claim already claimed partition.") 280 | return false 281 | } 282 | 283 | // Make a channel for results, append it to the list so we hear about claims 284 | out := make(chan struct{}, 1) 285 | topic.partitions[partID].pendingClaims = append( 286 | topic.partitions[partID].pendingClaims, out) 287 | topic.lock.Unlock() 288 | 289 | // Produce message to kafka 290 | cl := &msgClaimingPartition{ 291 | msgBase: *m.msgBase(topicName, partID), 292 | } 293 | _, err := m.cluster.producer.Produce(MarshalTopic, int32(topic.claimPartition), 294 | &proto.Message{Value: []byte(cl.Encode())}) 295 | if err != nil { 296 | // If we failed to produce, this is probably serious so we should undo the work 297 | // we did and then return failure 298 | log.Errorf("Failed to produce to Kafka: %s", err) 299 | return false 300 | } 301 | 302 | // Wait for channel to close, which is the signal that the rationalizer has 303 | // updated the status. 304 | <-out 305 | 306 | // Now we have to check if we own the partition. If this returns anything, the partition 307 | // is ours. nil = not. 308 | topic, err = m.getClaimedPartitionState(topicName, partID) 309 | if topic == nil || err != nil { 310 | return false 311 | } 312 | return true 313 | } 314 | 315 | // Heartbeat will send an update for other people to know that we're still alive and 316 | // still owning this partition. Returns an error if anything has gone wrong (at which 317 | // point we can no longer assert we have the lock). 
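// As a best-effort side effect this also commits the offset to Kafka's offset coordinator
// (see CommitOffsets); the heartbeat written to the marshal topic remains the canonical record.
// Callers driving this manually should heartbeat roughly once per HeartbeatInterval, passing
// the last offset they have fully processed.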
318 | func (m *Marshaler) Heartbeat(topicName string, partID int, offset int64) error { 319 | topic, err := m.getClaimedPartitionState(topicName, partID) 320 | if err != nil { 321 | return err 322 | } 323 | 324 | // Attempt to commit offset, this is best-effort and we don't care if it fails 325 | // since the canonical storage is in the heartbeat 326 | if err := m.CommitOffsets(topicName, partID, offset); err != nil { 327 | log.Warningf("[%s:%d] failed to commit offset during heartbeat: %s", 328 | topicName, partID, err) 329 | } 330 | 331 | // All good, let's heartbeat 332 | cl := &msgHeartbeat{ 333 | msgBase: *m.msgBase(topicName, partID), 334 | CurrentOffset: offset, 335 | } 336 | _, err = m.cluster.producer.Produce(MarshalTopic, int32(topic.claimPartition), 337 | &proto.Message{Value: []byte(cl.Encode())}) 338 | if err != nil { 339 | log.Errorf("[%s:%d] failed to send heartbeat message to Kafka: %s", 340 | topicName, partID, err) 341 | return fmt.Errorf("Failed to produce heartbeat to Kafka: %s", err) 342 | } 343 | 344 | return nil 345 | } 346 | 347 | // ReleasePartition will send an update for other people to know that we're done with 348 | // a partition. Returns an error if anything has gone wrong (at which 349 | // point we can no longer assert we have the lock). 350 | func (m *Marshaler) ReleasePartition(topicName string, partID int, offset int64) error { 351 | topic, err := m.getClaimedPartitionState(topicName, partID) 352 | if err != nil { 353 | return err 354 | } 355 | 356 | // Commit our offset first; if this fails, we can still try to release, 357 | // but we should advise 358 | if err := m.CommitOffsets(topicName, partID, offset); err != nil { 359 | log.Warningf("[%s:%d] failed to commit offset during release: %s", 360 | topicName, partID, err) 361 | } 362 | 363 | // All good, let's release 364 | cl := &msgReleasingPartition{ 365 | msgBase: *m.msgBase(topicName, partID), 366 | CurrentOffset: offset, 367 | } 368 | _, err = m.cluster.producer.Produce(MarshalTopic, int32(topic.claimPartition), 369 | &proto.Message{Value: []byte(cl.Encode())}) 370 | if err != nil { 371 | log.Errorf("[%s:%d] failed to send release message to Kafka: %s", 372 | topicName, partID, err) 373 | return fmt.Errorf("Failed to produce release to Kafka: %s", err) 374 | } 375 | 376 | return nil 377 | } 378 | 379 | // CommitOffsets will commit the partition offsets to Kafka so it's available in the 380 | // long-term storage of the offset coordination system. Note: this method does not ensure 381 | // that this Marshal instance owns the topic/partition in question. 382 | func (m *Marshaler) CommitOffsets(topicName string, partID int, offset int64) error { 383 | return m.offsets.Commit(topicName, int32(partID), offset) 384 | } 385 | 386 | // ClientID returns the client ID we're using 387 | func (m *Marshaler) ClientID() string { 388 | return m.clientID 389 | } 390 | 391 | // GroupID returns the group ID we're using 392 | func (m *Marshaler) GroupID() string { 393 | return m.groupID 394 | } 395 | 396 | // PrintState will take the current state of the Marshal world and print it verbosely to the 397 | // logging output. This is used in the rare case where we're self-terminating or on request 398 | // from the user. 
399 | func (m *Marshaler) PrintState() { 400 | m.lock.RLock() 401 | defer m.lock.RUnlock() 402 | 403 | m.cluster.lock.RLock() 404 | defer m.cluster.lock.RUnlock() 405 | 406 | log.Infof("Marshal state dump beginning.") 407 | log.Infof("") 408 | log.Infof("Group ID: %s", m.groupID) 409 | log.Infof("Client ID: %s", m.clientID) 410 | log.Infof("Instance ID: %s", m.instanceID) 411 | log.Infof("") 412 | log.Infof("Marshal topic partitions: %d", m.cluster.partitions) 413 | log.Infof("Known Kafka topics: %d", len(m.cluster.topics)) 414 | log.Infof("Internal rsteps counter: %d", atomic.LoadInt32(m.cluster.rsteps)) 415 | log.Infof("") 416 | log.Infof("State of the world:") 417 | log.Infof("") 418 | for group, topicmap := range m.cluster.groups { 419 | log.Infof(" GROUP: %s", group) 420 | for topic, state := range topicmap { 421 | log.Infof(" TOPIC: %s [on %s:%d]", topic, MarshalTopic, state.claimPartition) 422 | state.PrintState() 423 | } 424 | } 425 | log.Infof("") 426 | log.Infof("Consumer states:") 427 | log.Infof("") 428 | for _, consumer := range m.consumers { 429 | consumer.PrintState() 430 | } 431 | log.Infof("") 432 | log.Infof("Marshal state dump complete.") 433 | } 434 | -------------------------------------------------------------------------------- /marshal/cluster.go: -------------------------------------------------------------------------------- 1 | /* 2 | * portal - marshal 3 | * 4 | * a library that implements an algorithm for doing consumer coordination within Kafka, rather 5 | * than using Zookeeper or another external system. 6 | * 7 | */ 8 | 9 | package marshal 10 | 11 | import ( 12 | "crypto/md5" 13 | "encoding/binary" 14 | "errors" 15 | "fmt" 16 | "math/rand" 17 | "sync" 18 | "sync/atomic" 19 | "time" 20 | 21 | "github.com/zorkian/kafka" 22 | ) 23 | 24 | // KafkaCluster is a user-agnostic view of the world. It connects to a Kafka cluster 25 | // and runs rationalizers to observe the complete world state. 26 | type KafkaCluster struct { 27 | // These members are not protected by the lock and can be read at any 28 | // time as they're write-once or only ever atomically updated. They must 29 | // never be overwritten once a KafkaCluster is created. 30 | quit *int32 31 | name string 32 | broker *kafka.Broker 33 | producer kafka.Producer 34 | partitions int 35 | jitters chan time.Duration 36 | options MarshalOptions 37 | 38 | // Lock protects the following members; you must have this lock in order to 39 | // read from or write to these. 40 | lock *sync.RWMutex 41 | marshalers []*Marshaler 42 | topics map[string]int 43 | groups map[string]map[string]*topicState 44 | // pausedGroups stores the expiry time for groups that are paused. 45 | pausedGroups map[string]time.Time 46 | 47 | // This WaitGroup is used for signalling when all of the rationalizers have 48 | // finished processing. 49 | rationalizers *sync.WaitGroup 50 | 51 | // rsteps is updated whenever a rationalizer processes a log entry, this is 52 | // used mainly by the test suite. 53 | rsteps *int32 54 | 55 | // This is for testing only. When this is non-zero, the rationalizer will answer 56 | // queries based on THIS time instead of the current, actual time. 57 | ts int64 58 | } 59 | 60 | // MarshalOptions contains various tunables that can be used to adjust the configuration 61 | // of the underlying system. 62 | type MarshalOptions struct { 63 | // BrokerConnectionLimit is used to set the maximum simultaneous number of connections 64 | // that can be made to each broker. 65 | // Default: 30. 
66 | 	BrokerConnectionLimit int
67 | 
68 | 	// ConsumeRequestTimeout sets the time that we ask Kafka to wait before returning any
69 | 	// data to us. Setting this high uses more connections and can lead to some latency
70 | 	// but keeps the load on Kafka minimal. Use this to balance QPS against latency.
71 | 	//
72 | 	// Default: 1 millisecond.
73 | 	ConsumeRequestTimeout time.Duration
74 | 
75 | 	// MarshalRequestTimeout is used for our coordination requests. This should be reasonable
76 | 	// at default, but is left as a tunable in case you have clients that are claiming an
77 | 	// extremely large number of partitions and are too slow. The overall Marshal latency
78 | 	// is impacted by this value as well as the MarshalRequestRetryWait below.
79 | 	//
80 | 	// Default: 1 millisecond.
81 | 	MarshalRequestTimeout time.Duration
82 | 
83 | 	// MarshalRequestRetryWait is the time between consume requests Marshal generates. This
84 | 	// should be set to balance the above timeouts to prevent hammering the server.
85 | 	//
86 | 	// Default: 500 milliseconds.
87 | 	MarshalRequestRetryWait time.Duration
88 | 
89 | 	// MaxMessageSize is the maximum size in bytes of messages that can be returned. This
90 | 	// must be set to the size of the largest messages your cluster is allowed to store,
91 | 	// else you will end up with stalled streams. That is, Kafka will never send you a message
92 | 	// larger than this value, but we can't detect that, so we will simply think there is
93 | 	// no data.
94 | 	//
95 | 	// Default: 2,000,000 bytes.
96 | 	MaxMessageSize int32
97 | 
98 | 	// MaxMessageQueue is the number of messages to retrieve from Kafka and store in-memory
99 | 	// waiting for consumption. This is per-Consumer and independent of message size, so you
100 | 	// should adjust this for your consumption patterns.
101 | 	//
102 | 	// Default: 1000 messages.
103 | 	MaxMessageQueue int
104 | }
105 | 
106 | // NewMarshalOptions returns a set of MarshalOptions populated with defaults.
107 | func NewMarshalOptions() MarshalOptions {
108 | 	return MarshalOptions{
109 | 		BrokerConnectionLimit:   30,
110 | 		ConsumeRequestTimeout:   1 * time.Millisecond,
111 | 		MarshalRequestTimeout:   1 * time.Millisecond,
112 | 		MarshalRequestRetryWait: 500 * time.Millisecond,
113 | 		MaxMessageSize:          2000000,
114 | 		MaxMessageQueue:         1000,
115 | 	}
116 | }
117 | 
118 | // Dial returns a new cluster object which can be used to instantiate a number of Marshalers
119 | // that all use the same cluster. Tunables are supplied via the MarshalOptions argument; use
// NewMarshalOptions to get sensible defaults.
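//
// A sketch of the shared-cluster path (names and broker addresses are placeholders):
//
//	cluster, err := marshal.Dial("my-cluster", []string{"127.0.0.1:9092"}, marshal.NewMarshalOptions())
//	if err != nil {
//		// handle error
//	}
//	defer cluster.Terminate()
//
//	m1, _ := cluster.NewMarshaler("client-a", "group-one")
//	m2, _ := cluster.NewMarshaler("client-a", "group-two")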
120 | func Dial(name string, brokers []string, options MarshalOptions) (*KafkaCluster, error) { 121 | // Connect to Kafka 122 | brokerConf := kafka.NewBrokerConf("PortalMarshal") 123 | brokerConf.ClusterConnectionConf.MetadataRefreshFrequency = time.Hour 124 | brokerConf.ClusterConnectionConf.ConnectionLimit = options.BrokerConnectionLimit 125 | brokerConf.LeaderRetryLimit = 1 // Do not retry 126 | broker, err := kafka.NewBroker(name, brokers, brokerConf) 127 | if err != nil { 128 | return nil, err 129 | } 130 | 131 | c := &KafkaCluster{ 132 | quit: new(int32), 133 | rsteps: new(int32), 134 | name: name, 135 | options: options, 136 | lock: &sync.RWMutex{}, 137 | rationalizers: &sync.WaitGroup{}, 138 | broker: broker, 139 | producer: broker.Producer(kafka.NewProducerConf()), 140 | topics: make(map[string]int), 141 | groups: make(map[string]map[string]*topicState), 142 | pausedGroups: make(map[string]time.Time), 143 | jitters: make(chan time.Duration, 100), 144 | // It's important that marshalers begins as an empty slice and not nil to avoid 145 | // a race between NewMarshaler and Terminate. See note in Terminate. 146 | marshalers: make([]*Marshaler, 0), 147 | } 148 | 149 | // Do an initial metadata fetch, this will block a bit 150 | err = c.refreshMetadata() 151 | if err != nil { 152 | return nil, fmt.Errorf("Failed to get metadata: %s", err) 153 | } 154 | 155 | // If there is no marshal topic, then we can't run. The admins must go create the topic 156 | // before they can use this library. Please see the README. 157 | c.partitions = c.getTopicPartitions(MarshalTopic) 158 | if c.partitions == 0 { 159 | return nil, errors.New("Marshalling topic not found. Please see the documentation.") 160 | } 161 | 162 | // Now we start a goroutine to start consuming each of the partitions in the marshal 163 | // topic. Note that this doesn't handle increasing the partition count on that topic 164 | // without stopping all consumers. 165 | c.rationalizers.Add(c.partitions) 166 | for id := 0; id < c.partitions; id++ { 167 | go c.rationalize(id, c.kafkaConsumerChannel(id)) 168 | } 169 | 170 | // A jitter calculator, just fills a channel with random numbers so that other 171 | // people don't have to build their own random generator. It is important that 172 | // these values be somewhat less than the HeartbeatInterval as we use this for 173 | // jittering our heartbeats. 174 | go func() { 175 | rnd := rand.New(rand.NewSource(time.Now().UnixNano())) 176 | for { 177 | jitter := rnd.Intn(HeartbeatInterval/2) + (HeartbeatInterval / 4) 178 | c.jitters <- time.Duration(jitter) * time.Second 179 | } 180 | }() 181 | 182 | // Now start the metadata refreshing goroutine 183 | go func() { 184 | for !c.Terminated() { 185 | time.Sleep(<-c.jitters) 186 | log.Infof("[%s] Refreshing topic metadata.", c.name) 187 | c.refreshMetadata() 188 | 189 | // See if the number of partitions in the marshal topic changed. This is bad if 190 | // it happens, since it means we can no longer coordinate correctly. 191 | if c.getTopicPartitions(MarshalTopic) != c.partitions { 192 | log.Errorf("[%s] Marshal topic partition count changed. 
Terminating!", c.name) 193 | c.Terminate() 194 | } 195 | } 196 | }() 197 | 198 | // Wait for all rationalizers to come alive 199 | log.Infof("[%s] Waiting for all rationalizers to come alive.", c.name) 200 | c.rationalizers.Wait() 201 | log.Infof("[%s] All rationalizers alive, KafkaCluster now alive.", c.name) 202 | 203 | return c, nil 204 | } 205 | 206 | // NewMarshaler creates a Marshaler off of an existing cluster. This is more efficient 207 | // if you're creating multiple instances, since they can share the same underlying cluster. 208 | func (c *KafkaCluster) NewMarshaler(clientID, groupID string) (*Marshaler, error) { 209 | if c.Terminated() { 210 | return nil, errors.New("Cluster is terminated.") 211 | } 212 | 213 | // Get offset coordinator so we can look up (and save) committed offsets later. 214 | coordinator, err := c.getOffsetCoordinator(groupID) 215 | if err != nil { 216 | return nil, err 217 | } 218 | 219 | m := &Marshaler{ 220 | quit: new(int32), 221 | cluster: c, 222 | instanceID: newInstanceID(), 223 | clientID: clientID, 224 | groupID: groupID, 225 | offsets: coordinator, 226 | lock: &sync.RWMutex{}, 227 | } 228 | 229 | c.lock.Lock() 230 | defer c.lock.Unlock() 231 | 232 | // This is a bit of hack, see note in KafkaCluster::Terminate. 233 | if c.marshalers == nil { 234 | return nil, errors.New("Cluster is terminated (marshalers is nil).") 235 | } 236 | 237 | // Remove any dead marshalers from our slice and add the new one 238 | filtered := make([]*Marshaler, 0) 239 | for _, marshaler := range c.marshalers { 240 | if !marshaler.Terminated() { 241 | filtered = append(filtered, marshaler) 242 | } 243 | } 244 | filtered = append(filtered, m) 245 | c.marshalers = filtered 246 | 247 | return m, nil 248 | } 249 | 250 | // refreshMetadata is periodically used to update our internal state with topic information 251 | // about the world. 252 | func (c *KafkaCluster) refreshMetadata() error { 253 | md, err := c.broker.Metadata() 254 | if err != nil { 255 | return err 256 | } 257 | 258 | newTopics := make(map[string]int) 259 | for _, topic := range md.Topics { 260 | newTopics[topic.Name] = len(topic.Partitions) 261 | } 262 | 263 | c.lock.Lock() 264 | defer c.lock.Unlock() 265 | c.topics = newTopics 266 | return nil 267 | } 268 | 269 | // getOffsetCoordinator returns a kafka.OffsetCoordinator for a specific group. 270 | func (c *KafkaCluster) getOffsetCoordinator(groupID string) (kafka.OffsetCoordinator, error) { 271 | return c.broker.OffsetCoordinator( 272 | kafka.NewOffsetCoordinatorConf(groupID)) 273 | } 274 | 275 | // getClaimPartition calculates which partition a topic should use for coordination. This uses 276 | // a hashing function (non-cryptographic) to predictably partition the topic space. 277 | func (c *KafkaCluster) getClaimPartition(topicName string) int { 278 | // We use MD5 because it's a fast and good hashing algorithm and we don't need cryptographic 279 | // properties. We then take the first 8 bytes and treat them as a uint64 and modulo that 280 | // across how many partitions we have. 281 | hash := md5.Sum([]byte(topicName)) 282 | uval := binary.LittleEndian.Uint64(hash[0:8]) 283 | return int(uval % uint64(c.partitions)) 284 | } 285 | 286 | // getGroupState returns the map of topics to topicState objects for a group. 
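// Lookups take the read lock first and only upgrade to the write lock (re-checking before
// creating) when the group is missing; getTopicState and getPartitionState below follow the
// same check-then-upgrade pattern.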
287 | func (c *KafkaCluster) getGroupState(groupID string) map[string]*topicState { 288 | // Read lock check 289 | c.lock.RLock() 290 | if group, ok := c.groups[groupID]; ok { 291 | c.lock.RUnlock() 292 | return group 293 | } 294 | c.lock.RUnlock() 295 | 296 | // Failed, write lock check and possible create 297 | c.lock.Lock() 298 | defer c.lock.Unlock() 299 | 300 | if group, ok := c.groups[groupID]; ok { 301 | return group 302 | } 303 | c.groups[groupID] = make(map[string]*topicState) 304 | return c.groups[groupID] 305 | } 306 | 307 | // getTopicState returns a topicState for a given topic. 308 | func (c *KafkaCluster) getTopicState(groupID, topicName string) *topicState { 309 | group := c.getGroupState(groupID) 310 | 311 | // Read lock check 312 | c.lock.RLock() 313 | if topic, ok := group[topicName]; ok { 314 | c.lock.RUnlock() 315 | return topic 316 | } 317 | c.lock.RUnlock() 318 | 319 | // Write lock check and possible create 320 | c.lock.Lock() 321 | defer c.lock.Unlock() 322 | 323 | if topic, ok := group[topicName]; ok { 324 | return topic 325 | } 326 | group[topicName] = &topicState{ 327 | claimPartition: c.getClaimPartition(topicName), 328 | partitions: nil, 329 | lock: &sync.RWMutex{}, 330 | } 331 | return group[topicName] 332 | } 333 | 334 | // getPartitionState returns a topicState and possibly creates it and the partition state within 335 | // the State. 336 | func (c *KafkaCluster) getPartitionState(groupID, topicName string, partID int) *topicState { 337 | // Get topic and lock it so we can update it if needed 338 | topic := c.getTopicState(groupID, topicName) 339 | 340 | // Read lock check 341 | topic.lock.RLock() 342 | if len(topic.partitions) > partID { 343 | topic.lock.RUnlock() 344 | return topic 345 | } 346 | topic.lock.RUnlock() 347 | 348 | // Must upgrade, looks like we need a new partition 349 | topic.lock.Lock() 350 | defer topic.lock.Unlock() 351 | 352 | if len(topic.partitions) < partID+1 { 353 | for i := len(topic.partitions); i <= partID; i++ { 354 | topic.partitions = append(topic.partitions, PartitionClaim{}) 355 | } 356 | } 357 | return topic 358 | } 359 | 360 | // getTopics returns the list of known topics. 361 | func (c *KafkaCluster) getTopics() []string { 362 | c.lock.RLock() 363 | defer c.lock.RUnlock() 364 | 365 | topics := make([]string, 0, len(c.topics)) 366 | for topic := range c.topics { 367 | topics = append(topics, topic) 368 | } 369 | return topics 370 | } 371 | 372 | // getTopicPartitions returns the count of how many partitions are in a given topic. Returns 0 if a 373 | // topic is unknown. 374 | func (c *KafkaCluster) getTopicPartitions(topicName string) int { 375 | c.lock.RLock() 376 | defer c.lock.RUnlock() 377 | 378 | count, _ := c.topics[topicName] 379 | return count 380 | } 381 | 382 | // removeMarshal removes a terminated Marshal from a cluster's list. 383 | func (c *KafkaCluster) removeMarshal(m *Marshaler) { 384 | c.lock.Lock() 385 | defer c.lock.Unlock() 386 | 387 | for i, ml := range c.marshalers { 388 | if ml == m { 389 | c.marshalers = append(c.marshalers[:i], c.marshalers[i+1:]...) 390 | break 391 | } 392 | } 393 | } 394 | 395 | // waitForRsteps is used by the test suite to ask the rationalizer to wait until some number 396 | // of events have been processed. This also returns the current rsteps when it returns. 
397 | func (c *KafkaCluster) waitForRsteps(steps int) (int, error) { 398 | cancel := make(chan struct{}) 399 | result := make(chan int) 400 | go func() { 401 | for { 402 | select { 403 | case <-cancel: 404 | break 405 | default: 406 | cval := atomic.LoadInt32(c.rsteps) 407 | if cval >= int32(steps) { 408 | result <- int(cval) 409 | } 410 | time.Sleep(5 * time.Millisecond) 411 | } 412 | } 413 | }() 414 | 415 | select { 416 | case res := <-result: 417 | return res, nil 418 | case <-time.After(3 * time.Second): 419 | close(cancel) 420 | return 0, errors.New("Timed out waiting for steps") 421 | } 422 | } 423 | 424 | // pauseConsumerGroup stores an expiry time for consumer groups that we'd like to pause. 425 | func (c *KafkaCluster) pauseConsumerGroup(groupID string, adminID string, expiry time.Time) { 426 | c.lock.Lock() 427 | defer c.lock.Unlock() 428 | 429 | log.Warningf("Cluster marking group %s paused with expiry: %s", groupID, expiry.Format(time.UnixDate)) 430 | c.pausedGroups[groupID] = expiry 431 | } 432 | 433 | // IsGroupPaused returns true if the given consumer group is paused. 434 | // TODO(pihu) This just checks the expiry time, and not the admin ID. 435 | func (c *KafkaCluster) IsGroupPaused(groupID string) bool { 436 | c.lock.RLock() 437 | defer c.lock.RUnlock() 438 | 439 | if res, ok := c.pausedGroups[groupID]; !ok { 440 | return false 441 | } else { 442 | return time.Now().Before(res) 443 | } 444 | } 445 | 446 | // Terminate is called when we're done with the marshaler and want to shut down. 447 | func (c *KafkaCluster) Terminate() { 448 | if !atomic.CompareAndSwapInt32(c.quit, 0, 1) { 449 | return 450 | } 451 | 452 | log.Infof("[%s] beginning termination", c.name) 453 | 454 | // This is a bit of a hack, but because marshaler.terminateAndCleanup requires the read lock 455 | // on c, we can't terminate the Marshalers in the list while holding the write lock. 456 | // Because KafkaCluster::NewMarshaler will return an error if the marshalers slice is nil, 457 | // we know there cannot be new Marshalers created which aren't included in the local slice 458 | // we create here. 459 | // 460 | // There is probably some alternative where the quit variable is protected by the mutex instead 461 | // of being atomic, but this seems somewhat cleaner in case some future refactoring eliminates 462 | // the marshalers slice entirely this hack will go away automatically. 463 | c.lock.Lock() 464 | marshalers := c.marshalers 465 | c.marshalers = nil 466 | c.lock.Unlock() 467 | 468 | // Terminate all Marshalers which will in turn terminate all Consumers and 469 | // let everybody know we're all done. 470 | for _, marshaler := range marshalers { 471 | marshaler.terminateAndCleanup(false) 472 | } 473 | } 474 | 475 | // Terminated returns whether or not we have been terminated. 476 | func (c *KafkaCluster) Terminated() bool { 477 | return atomic.LoadInt32(c.quit) == 1 478 | } 479 | -------------------------------------------------------------------------------- /marshal/claim.go: -------------------------------------------------------------------------------- 1 | /* 2 | * portal - marshal 3 | * 4 | * a library that implements an algorithm for doing consumer coordination within Kafka, rather 5 | * than using Zookeeper or another external system. 
6 | * 7 | */ 8 | 9 | package marshal 10 | 11 | import ( 12 | "fmt" 13 | "math/rand" 14 | "sort" 15 | "sync" 16 | "sync/atomic" 17 | "time" 18 | 19 | "github.com/zorkian/kafka" 20 | "github.com/zorkian/kafka/proto" 21 | "github.com/jpillora/backoff" 22 | ) 23 | 24 | // int64slice is for sorting. 25 | type int64slice []int64 26 | 27 | func (a int64slice) Len() int { return len(a) } 28 | func (a int64slice) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 29 | func (a int64slice) Less(i, j int) bool { return a[i] < a[j] } 30 | 31 | // claim is instantiated for each partition "claim" we have. This type is responsible for 32 | // pulling data from Kafka and managing its cursors, heartbeating as necessary, and health 33 | // checking itself. 34 | type claim struct { 35 | // These items are read-only. They are never changed after the object is created, 36 | // so access to these may be done without the lock. 37 | topic string 38 | partID int 39 | 40 | // lock protects all access to the member variables of this struct except for the 41 | // messages channel, which can be read from or written to without holding the lock. 42 | // Additionally the stopChan can be used. 43 | lock *sync.RWMutex 44 | messagesLock *sync.Mutex 45 | offsets PartitionOffsets 46 | marshal *Marshaler 47 | consumer *Consumer 48 | rand *rand.Rand 49 | terminated *int32 50 | beatCounter int32 51 | lastHeartbeat int64 52 | lastMessageTime time.Time 53 | options ConsumerOptions 54 | kafkaConsumer kafka.Consumer 55 | messages chan *Message 56 | stopChan chan struct{} 57 | doneChan chan struct{} 58 | 59 | // tracking is a dict that maintains information about offsets that have been 60 | // sent to and acknowledged by clients. An offset is inserted into this map when 61 | // we insert it into the message queue, and when it is committed we record an update 62 | // saying so. This map is pruned during the heartbeats. 63 | tracking map[int64]bool 64 | outstandingMessages int 65 | 66 | // Number of heartbeat cycles this claim has been lagging, i.e., consumption is going 67 | // too slowly (defined as being behind by more than 2 heartbeat cycles) 68 | cyclesBehind int 69 | 70 | // History arrays used for calculating average velocity for health checking. 71 | offsetCurrentHistory [10]int64 72 | offsetLatestHistory [10]int64 73 | } 74 | 75 | // newClaim returns an internal claim object, used by the consumer to manage the 76 | // claim of a single partition. It is up to the caller to ensure healthCheckLoop gets 77 | // called in a goroutine. If you do not, the claim will die from failing to heartbeat 78 | // after a short period. 79 | func newClaim(topic string, partID int, marshal *Marshaler, consumer *Consumer, 80 | messages chan *Message, options ConsumerOptions) *claim { 81 | 82 | // Get all available offset information 83 | offsets, err := marshal.GetPartitionOffsets(topic, partID) 84 | if err != nil { 85 | log.Errorf("[%s:%d] failed to get offsets: %s", topic, partID, err) 86 | return nil 87 | } 88 | log.Debugf("[%s:%d] consumer offsets: early = %d, cur/comm = %d/%d, late = %d", 89 | topic, partID, offsets.Earliest, offsets.Current, offsets.Committed, offsets.Latest) 90 | 91 | // For offsets, we strictly prefer the contents of the MarshalTopic and will use that 92 | // if present. If we don't have that data, then we'll fall back to the Kafka committed 93 | // offsets. Failing that we'll start at the beginning of the partition. 
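	// In short, the precedence is: Marshal's own offset (Current), then the Kafka
	// consumer-group committed offset, then Earliest. For example, with Current=0,
	// Committed=250 and Earliest=100, the claim below starts consuming at 250.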
94 | if offsets.Current > 0 { 95 | // Ideal case, we just use the Marshal offset that is already set 96 | } else if offsets.Committed > 0 { 97 | log.Infof("[%s:%d] no Marshal offset found, using committed offset %d", 98 | topic, partID, offsets.Committed) 99 | offsets.Current = offsets.Committed 100 | } else { 101 | log.Infof("[%s:%d] no Marshal or committed offset found, using earliest offset %d", 102 | topic, partID, offsets.Earliest) 103 | offsets.Current = offsets.Earliest 104 | } 105 | 106 | // Construct object and set it up 107 | obj := &claim{ 108 | lock: &sync.RWMutex{}, 109 | messagesLock: &sync.Mutex{}, 110 | stopChan: make(chan struct{}), 111 | doneChan: make(chan struct{}), 112 | marshal: marshal, 113 | consumer: consumer, 114 | topic: topic, 115 | partID: partID, 116 | terminated: new(int32), 117 | offsets: offsets, 118 | messages: messages, 119 | options: options, 120 | tracking: make(map[int64]bool), 121 | rand: rand.New(rand.NewSource(time.Now().UnixNano())), 122 | lastMessageTime: time.Now(), 123 | } 124 | 125 | // Now try to actually claim it, this can block a while 126 | log.Infof("[%s:%d] consumer attempting to claim", topic, partID) 127 | if !marshal.ClaimPartition(topic, partID) { 128 | log.Infof("[%s:%d] consumer failed to claim", topic, partID) 129 | return nil 130 | } 131 | 132 | // If that worked, kick off the main setup loop and return 133 | obj.setup() 134 | return obj 135 | } 136 | 137 | // setup is the initial worker that initializes the claim structure. Until this is done, 138 | // our internal state is inconsistent. 139 | func (c *claim) setup() { 140 | c.lock.Lock() 141 | defer c.lock.Unlock() 142 | 143 | // Of course, if the current offset is greater than the earliest, we must reset 144 | // to the earliest known 145 | if c.offsets.Current < c.offsets.Earliest { 146 | log.Warningf("[%s:%d] consumer fast-forwarding from %d to %d", 147 | c.topic, c.partID, c.offsets.Current, c.offsets.Earliest) 148 | c.offsets.Current = c.offsets.Earliest 149 | } 150 | 151 | // Since it's claimed, we now want to heartbeat with the last seen offset 152 | err := c.marshal.Heartbeat(c.topic, c.partID, c.offsets.Current) 153 | if err != nil { 154 | log.Errorf("[%s:%d] consumer failed to heartbeat: %s", c.topic, c.partID, err) 155 | go c.Release() 156 | return 157 | } 158 | c.lastHeartbeat = time.Now().Unix() 159 | 160 | // Set up Kafka consumer 161 | consumerConf := kafka.NewConsumerConf(c.topic, int32(c.partID)) 162 | consumerConf.StartOffset = c.offsets.Current 163 | consumerConf.MaxFetchSize = c.marshal.cluster.options.MaxMessageSize 164 | consumerConf.RequestTimeout = c.marshal.cluster.options.ConsumeRequestTimeout 165 | // Do not retry. If we get back no data, we'll do our own retries. 166 | consumerConf.RetryLimit = 0 167 | 168 | kafkaConsumer, err := c.marshal.cluster.broker.Consumer(consumerConf) 169 | if err != nil { 170 | log.Errorf("[%s:%d] consumer failed to create Kafka Consumer: %s", 171 | c.topic, c.partID, err) 172 | go c.Release() 173 | return 174 | } 175 | c.kafkaConsumer = kafkaConsumer 176 | 177 | // Start our maintenance goroutines that keep this system healthy 178 | go c.messagePump() 179 | 180 | // Totally done, let the world know and move on 181 | log.Infof("[%s:%d] consumer %s claimed at offset %d (is %d behind)", 182 | c.topic, c.partID, c.marshal.clientID, c.offsets.Current, c.offsets.Latest-c.offsets.Current) 183 | } 184 | 185 | // Commit is called by a Consumer class when the client has indicated that it has finished 186 | // processing a message. 
This updates our tracking structure so the heartbeat knows how 187 | // far ahead it can move our offset. 188 | func (c *claim) Commit(offset int64) error { 189 | if c.Terminated() { 190 | return fmt.Errorf("[%s:%d] is no longer claimed; can't commit offset %d", 191 | c.topic, c.partID, offset) 192 | } 193 | 194 | c.lock.Lock() 195 | defer c.lock.Unlock() 196 | 197 | _, ok := c.tracking[offset] 198 | if !ok { 199 | // This is bogus; committing an offset we've never seen? 200 | return fmt.Errorf("[%s:%d] committing offset %d but we've never seen it", 201 | c.topic, c.partID, offset) 202 | } 203 | c.tracking[offset] = true 204 | c.outstandingMessages-- 205 | return nil 206 | } 207 | 208 | // Terminated returns whether the consumer has terminated the Claim. The claim may or may NOT 209 | // remain claimed depending on whether it was released or not. 210 | func (c *claim) Terminated() bool { 211 | return atomic.LoadInt32(c.terminated) == 1 212 | } 213 | 214 | // GetCurrentLag returns this partition's cursor lag. 215 | func (c *claim) GetCurrentLag() int64 { 216 | c.lock.RLock() 217 | defer c.lock.RUnlock() 218 | 219 | if c.offsets.Current < c.offsets.Latest { 220 | return c.offsets.Latest - c.offsets.Current 221 | } 222 | return 0 223 | } 224 | 225 | // Flush will write updated offsets to Kafka immediately if we have any outstanding offset 226 | // updates to write. If not, this is a relatively quick no-op. 227 | func (c *claim) Flush() error { 228 | // By definition a terminated claim has already flushed anything it can flush 229 | // or we've lost the lock so there's nothing we can do. It's not an error. 230 | if c.Terminated() { 231 | return nil 232 | } 233 | 234 | // This is technically a racey design, but the worst case is that we 235 | // will write out two correct heartbeats which is fine. 236 | didAdvance, currentOffset := c.updateCurrentOffsets() 237 | if !didAdvance { 238 | // Current offset did not advance 239 | return nil 240 | } 241 | 242 | // Now heartbeat this value and update our heartbeat time 243 | if err := c.marshal.Heartbeat(c.topic, c.partID, currentOffset); err != nil { 244 | go c.Release() 245 | return fmt.Errorf("[%s:%d] failed to flush, releasing: %s", c.topic, c.partID, err) 246 | } 247 | return nil 248 | } 249 | 250 | // Release will invoke commit offsets and release the Kafka partition. After calling Release, 251 | // consumer cannot consume messages anymore. 252 | // Does not return until the message pump has exited and the release has finished. 253 | func (c *claim) Release() bool { 254 | return c.teardown(true) 255 | } 256 | 257 | // Terminate will invoke commit offsets, terminate the claim, but does NOT release the partition. 258 | // Does not return until the message pump has exited and termination has finished. 259 | func (c *claim) Terminate() bool { 260 | return c.teardown(false) 261 | } 262 | 263 | // teardown handles releasing the claim or just updating our offsets for a fast restart. 264 | func (c *claim) teardown(releasePartition bool) bool { 265 | if !atomic.CompareAndSwapInt32(c.terminated, 0, 1) { 266 | <-c.doneChan 267 | return false 268 | } 269 | 270 | // Kill the stopchan now which is a useful way of knowing we're quitting within selects 271 | close(c.stopChan) 272 | 273 | // need to serialize access to the messages channel. 
We should not release if the message pump 274 | // is about to write to the consumer channel 275 | c.messagesLock.Lock() 276 | defer c.messagesLock.Unlock() 277 | 278 | // Let's update current offset internally to the last processed 279 | _, currentOffset := c.updateCurrentOffsets() 280 | 281 | // Advise the consumer that this claim is terminating, this is so that the consumer 282 | // can release other claims if we've lost part of a topic 283 | if c.consumer != nil { 284 | go c.consumer.claimTerminated(c, releasePartition) 285 | } 286 | 287 | var err error 288 | if releasePartition { 289 | log.Infof("[%s:%d] releasing partition claim", c.topic, c.partID) 290 | err = c.marshal.ReleasePartition(c.topic, c.partID, currentOffset) 291 | } else { 292 | // We're not releasing but we do want to update our offsets to the latest value 293 | // we know about, so issue a gratuitous heartbeat 294 | err = c.marshal.Heartbeat(c.topic, c.partID, currentOffset) 295 | } 296 | 297 | // Wait for messagePump to exit 298 | <-c.doneChan 299 | 300 | if err != nil { 301 | log.Errorf("[%s:%d] failed to release: %s", c.topic, c.partID, err) 302 | return false 303 | } 304 | return true 305 | } 306 | 307 | // messagePump continuously pulls message from Kafka for this partition and makes them 308 | // available for consumption. 309 | func (c *claim) messagePump() { 310 | // When the pump exits we close the doneChan so people can know when it's not 311 | // possible for the pump to be running 312 | defer close(c.doneChan) 313 | 314 | // This method MUST NOT make changes to the claim structure. Since we might 315 | // be running while someone else has the lock, and we can't get it ourselves, we are 316 | // forbidden to touch anything other than the consumer and the message channel. 317 | retry := &backoff.Backoff{Min: 10 * time.Millisecond, Max: 1 * time.Second, Jitter: true} 318 | for !c.Terminated() { 319 | msg, err := c.kafkaConsumer.Consume() 320 | if err == proto.ErrOffsetOutOfRange { 321 | // Fell out of range, presumably because we're handling this too slow, so 322 | // let's abandon this claim 323 | log.Warningf("[%s:%d] error consuming: out of range, abandoning partition", 324 | c.topic, c.partID) 325 | go c.Release() 326 | return 327 | } else if err == kafka.ErrNoData { 328 | // No data, just loop; if we're stuck receiving no data for too long the healthcheck 329 | // will start failing 330 | time.Sleep(retry.Duration()) 331 | continue 332 | } else if err != nil { 333 | log.Errorf("[%s:%d] error consuming: %s", c.topic, c.partID, err) 334 | 335 | // Often a consumption error is caused by data going away, such as if we're consuming 336 | // from the head and Kafka has deleted the data. In that case we need to wait for 337 | // the next offset update, so let's not go crazy 338 | time.Sleep(1 * time.Second) 339 | continue 340 | } 341 | retry.Reset() 342 | 343 | // Briefly get the lock to update our tracking map... I wish there were 344 | // goroutine safe maps in Go. 345 | c.lock.Lock() 346 | c.lastMessageTime = time.Now() 347 | c.tracking[msg.Offset] = false 348 | c.outstandingMessages++ 349 | if msg.Offset < c.offsets.Current { 350 | log.Errorf("[%s:%d] just consumed offset %d earlier than current %d", 351 | c.topic, c.partID, msg.Offset, c.offsets.Current) 352 | } 353 | c.lock.Unlock() 354 | 355 | // Push the message down to the client (this bypasses the Consumer) 356 | // We should NOT write to the consumer channel if the claim is no longer claimed. 
This 357 | // needs to be serialized with Release, otherwise a race-condition can potentially 358 | // lead to a write to a closed-channel. That's why we're using this lock. We're not 359 | // using the main lock to avoid deadlocks since the write to the channel is blocking 360 | // until someone consumes the message blocking all Commit operations. 361 | // 362 | // This must not block -- if we hold the messagesLock for too long we will cause 363 | // possible deadlocks. 364 | c.messagesLock.Lock() 365 | if !c.Terminated() { 366 | // This allocates a new Message to put the proto.Message in. 367 | // TODO: This is really annoying and probably stupidly inefficient, is there any 368 | // way to do this better? 369 | tmp := Message(*msg) 370 | select { 371 | case c.messages <- &tmp: 372 | // Message successfully delivered to queue 373 | case <-c.stopChan: 374 | // Claim is terminated, the message will go nowhere 375 | } 376 | } 377 | c.messagesLock.Unlock() 378 | } 379 | log.Debugf("[%s:%d] no longer claimed, pump exiting", c.topic, c.partID) 380 | } 381 | 382 | // heartbeat is the internal "send a heartbeat" function. Calling this will immediately 383 | // send a heartbeat to Kafka. If we fail to send a heartbeat, we will release the 384 | // partition. 385 | func (c *claim) heartbeat() bool { 386 | // Unclaimed partitions don't heartbeat. 387 | if c.Terminated() { 388 | return false 389 | } 390 | 391 | // Lock held because we use c.offsets and update c.lastHeartbeat below 392 | c.lock.Lock() 393 | defer c.lock.Unlock() 394 | 395 | // Now heartbeat this value and update our heartbeat time 396 | err := c.marshal.Heartbeat(c.topic, c.partID, c.offsets.Current) 397 | if err != nil { 398 | log.Errorf("[%s:%d] failed to heartbeat, releasing: %s", c.topic, c.partID, err) 399 | go c.Release() 400 | } 401 | 402 | log.Infof("[%s:%d] heartbeat: Current offset is %d, partition offset range is %d..%d.", 403 | c.topic, c.partID, c.offsets.Current, c.offsets.Earliest, c.offsets.Latest) 404 | log.Infof("[%s:%d] heartbeat: There are %d messages in queue and %d messages outstanding.", 405 | c.topic, c.partID, len(c.messages), c.outstandingMessages) 406 | c.lastHeartbeat = time.Now().Unix() 407 | return true 408 | } 409 | 410 | // updateCurrentOffsets updates the current offsets so that a Commit/Heartbeat can pick up the 411 | // latest offsets. Returns true if we advanced our current offset, false if there was no 412 | // change. Also returns the latest current offset. 413 | func (c *claim) updateCurrentOffsets() (bool, int64) { 414 | c.lock.Lock() 415 | defer c.lock.Unlock() 416 | 417 | // Get the sorted set of offsets 418 | offsets := make(int64slice, 0, len(c.tracking)) 419 | for key := range c.tracking { 420 | offsets = append(offsets, key) 421 | } 422 | sort.Sort(offsets) 423 | 424 | // Now iterate the offsets bottom up and increment our current offset until we 425 | // see the first uncommitted offset (oldest message) 426 | didAdvance := false 427 | for _, offset := range offsets { 428 | if !c.tracking[offset] { 429 | break 430 | } 431 | // Remember current is always "last committed + 1", see the docs on 432 | // PartitionOffset for a reminder. 
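		// Worked example (it matches TestCommit): with Current=0 and tracking
		// {0: committed, 1: uncommitted, 2: committed}, this loop advances Current
		// to 1 and then stops at offset 1; offset 2 stays tracked until offset 1 is
		// committed, after which Current can jump to 3.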
433 | didAdvance = true 434 | if offset+1 <= c.offsets.Current { 435 | log.Errorf("[%s:%d] rewinding current offset from %d to %d", 436 | c.topic, c.partID, c.offsets.Current, offset+1) 437 | } 438 | c.offsets.Current = offset + 1 439 | delete(c.tracking, offset) 440 | } 441 | 442 | // If we end up with more than a queue of outstanding messages, then something is 443 | // probably broken in the implementation... since that will cause us to grow 444 | // forever in memory, let's alert the user 445 | if len(c.tracking) > c.marshal.cluster.options.MaxMessageQueue { 446 | log.Errorf("[%s:%d] has %d uncommitted offsets. You must call Commit.", 447 | c.topic, c.partID, len(c.tracking)) 448 | } 449 | return didAdvance, c.offsets.Current 450 | } 451 | 452 | // heartbeatExpired returns whether or not our last successful heartbeat is so 453 | // long ago that we know we're expired. 454 | func (c *claim) heartbeatExpired() bool { 455 | c.lock.RLock() 456 | defer c.lock.RUnlock() 457 | 458 | return c.lastHeartbeat < time.Now().Unix()-HeartbeatInterval 459 | } 460 | 461 | // healthCheck performs a single health check against the claim. If we have failed 462 | // too many times, this will also start a partition release. Returns true if the 463 | // partition is healthy, else false. 464 | func (c *claim) healthCheck() bool { 465 | // Unclaimed partitions aren't healthy. 466 | if c.Terminated() { 467 | return false 468 | } 469 | 470 | // Get velocities; these functions both use the locks so we have to do this before 471 | // we personally take the lock (to avoid deadlock) 472 | consumerVelocity := c.ConsumerVelocity() 473 | partitionVelocity := c.PartitionVelocity() 474 | 475 | // If our heartbeat is expired, we are definitely unhealthy... don't even bother 476 | // with checking velocity 477 | if c.heartbeatExpired() { 478 | log.Warningf("[%s:%d] consumer unhealthy by heartbeat test, releasing", 479 | c.topic, c.partID) 480 | go c.Release() 481 | return false 482 | } 483 | 484 | // If the consumer group owning this claim is paused, we must release this claim. 485 | if c.marshal.cluster.IsGroupPaused(c.marshal.GroupID()) { 486 | log.Infof("[%s:%d] consumer group %s is paused, claim releasing", 487 | c.topic, c.partID, c.marshal.GroupID()) 488 | go c.Release() 489 | return false 490 | } 491 | 492 | // Take the lock below here as we are reading protected values on c and we're 493 | // writing to c.cyclesBehind 494 | c.lock.Lock() 495 | defer c.lock.Unlock() 496 | 497 | // If we haven't seen any messages for more than a heartbeat interval, it's possible 498 | // we've gotten into a bad state. Make a check to see how far behind we are, if we 499 | // are behind and not seeing any messages then release. 500 | if time.Now().After(c.lastMessageTime.Add(HeartbeatInterval * time.Second)) { 501 | if c.options.ReleaseClaimsIfBehind && consumerVelocity == 0 && (partitionVelocity > 0 || c.offsets.Latest > c.offsets.Current) { 502 | // If that's true then it means velocity has been 0 for at least long enough 503 | // to drive the average to 0, which means about 10 heartbeat cycles. This is 504 | // long enough that releasing seems fine. 
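			// (TestReleaseIfWedged and TestReleaseIfWedged2 exercise this branch: CV
			// averages zero over the window while either PV is positive or Latest is
			// ahead of Current, i.e. data exists but we are stuck.)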
505 | 			log.Warningf("[%s:%d] no messages received for %d seconds with CV=%0.2f PV=%0.2f, releasing",
506 | 				c.topic, c.partID, HeartbeatInterval, consumerVelocity, partitionVelocity)
507 | 			go c.Release()
508 | 			return false
509 | 		} else {
510 | 			log.Infof("[%s:%d] no messages received for %d seconds with CV=%0.2f PV=%0.2f",
511 | 				c.topic, c.partID, HeartbeatInterval, consumerVelocity, partitionVelocity)
512 | 		}
513 | 	}
514 | 
515 | 	// In topic claim mode we don't do any velocity checking. It's up to the consumer
516 | 	// to ensure they're claiming. TODO: Unclear if this is correct or not.
517 | 	if c.options.ClaimEntireTopic || !c.options.ReleaseClaimsIfBehind {
518 | 		return true
519 | 	}
520 | 
521 | 	// We consider a consumer to be caught up if the predicted offset is past the end
522 | 	// of the partition. This takes into account the fact that we only get offset information
523 | 	// every heartbeat, so we could have some stale data.
524 | 	testOffset := c.offsets.Current + int64(consumerVelocity*2)
525 | 	if testOffset >= c.offsets.Latest {
526 | 		c.cyclesBehind = 0
527 | 		return true
528 | 	}
529 | 
530 | 	// At this point we know the consumer is NOT PRESENTLY caught up or predicted to catch
531 | 	// up in the next two heartbeats.
532 | 
533 | 	// If the consumer is moving strictly faster than the partition (which also means it is
534 | 	// actually making progress), consider it healthy. This is the standard
535 | 	// catching-up-from-behind case.
536 | 	if partitionVelocity < consumerVelocity {
537 | 		log.Infof("[%s:%d] consumer catching up: consume ∆ %0.2f >= produce ∆ %0.2f",
538 | 			c.topic, c.partID, consumerVelocity, partitionVelocity)
539 | 		c.cyclesBehind = 0
540 | 		return true
541 | 	}
542 | 
543 | 	// Unhealthy, so increase the cycle count so we know when it has been unhealthy for
544 | 	// too long and should be given up
545 | 	c.cyclesBehind++
546 | 
547 | 	// If we're behind by too many cycles, then we should try to release the
548 | 	// partition. If so, do this in a goroutine since it will involve calling out
549 | 	// to Kafka and releasing the partition.
550 | 	if c.cyclesBehind >= 3 {
551 | 		log.Warningf("[%s:%d] consumer unhealthy for too long, releasing",
552 | 			c.topic, c.partID)
553 | 		go c.Release()
554 | 		return false
555 | 	}
556 | 
557 | 	// Clearly we haven't been behind for long enough, so we're still "healthy"
558 | 	log.Warningf("[%s:%d] consumer too slow: consume ∆ %0.2f < produce ∆ %0.2f (warning #%d)",
559 | 		c.topic, c.partID, consumerVelocity, partitionVelocity, c.cyclesBehind)
560 | 	return true
561 | }
562 | 
563 | // healthCheckLoop runs regularly and will perform a health check. Exits when this claim
564 | // has been terminated.
565 | func (c *claim) healthCheckLoop() {
566 | 	time.Sleep(<-c.marshal.cluster.jitters)
567 | 	for !c.Terminated() {
568 | 		// Attempt to update offsets; if this fails we want to retry more quickly than
569 | 		// the jitter interval, so we can make a few attempts before the heartbeat
570 | 		// expires and we give up.
571 | for !c.heartbeatExpired() { 572 | if err := c.updateOffsets(); err != nil { 573 | log.Errorf("[%s:%d] health check loop failed to update offsets: %s", 574 | c.topic, c.partID, err) 575 | time.Sleep(1 * time.Second) 576 | continue 577 | } 578 | break 579 | } 580 | 581 | // Now healthcheck and, if it's good, heartbeat 582 | if c.healthCheck() { 583 | go c.heartbeat() 584 | } 585 | time.Sleep(<-c.marshal.cluster.jitters) 586 | } 587 | log.Infof("[%s:%d] health check loop exiting, claim terminated", 588 | c.topic, c.partID) 589 | } 590 | 591 | // average returns the average of a given slice of int64s. It ignores 0s as 592 | // those are "uninitialized" elements. 593 | func average(vals []int64) float64 { 594 | min, max, ct := int64(0), int64(0), int64(0) 595 | for _, val := range vals { 596 | if val <= 0 { 597 | continue 598 | } 599 | if min == 0 || val < min { 600 | min = val 601 | } 602 | if max == 0 || val > max { 603 | max = val 604 | } 605 | ct++ 606 | } 607 | 608 | if min == max || ct < 2 { 609 | return 0 610 | } 611 | 612 | return float64(max-min) / float64(ct-1) 613 | } 614 | 615 | // ConsumerVelocity returns the average of our consumers' velocity 616 | func (c *claim) ConsumerVelocity() float64 { 617 | c.lock.RLock() 618 | defer c.lock.RUnlock() 619 | 620 | return average(c.offsetCurrentHistory[0:]) 621 | } 622 | 623 | // PartitionVelocity returns the average of the partition's velocity 624 | func (c *claim) PartitionVelocity() float64 { 625 | c.lock.RLock() 626 | defer c.lock.RUnlock() 627 | 628 | return average(c.offsetLatestHistory[0:]) 629 | } 630 | 631 | // updateOffsets will update the offsets of our current partition. 632 | func (c *claim) updateOffsets() error { 633 | // Start by updating our current offsets so even if we fail to get the offsets 634 | // we need to calculate Kafka data, we still move our current offset forward. 635 | c.updateCurrentOffsets() 636 | 637 | // Slow, hits Kafka. Run in a goroutine. 638 | offsets, err := c.marshal.GetPartitionOffsets(c.topic, c.partID) 639 | if err != nil { 640 | log.Errorf("[%s:%d] failed to get offsets: %s", c.topic, c.partID, err) 641 | return err 642 | } 643 | 644 | c.lock.Lock() 645 | defer c.lock.Unlock() 646 | 647 | // Update the earliest/latest offsets that are presently available within the 648 | // partition 649 | c.offsets.Earliest = offsets.Earliest 650 | c.offsets.Latest = offsets.Latest 651 | 652 | // Do update our "history" values, this is used for calculating moving averages 653 | // in the health checking function 654 | c.offsetLatestHistory[c.beatCounter] = offsets.Latest 655 | c.offsetCurrentHistory[c.beatCounter] = c.offsets.Current 656 | 657 | c.beatCounter = (c.beatCounter + 1) % 10 658 | return nil 659 | } 660 | 661 | // numTrackingOffsets returns the size of the tracking dict. 662 | func (c *claim) numTrackingOffsets() int { 663 | c.lock.RLock() 664 | defer c.lock.RUnlock() 665 | 666 | return len(c.tracking) 667 | } 668 | 669 | // PrintState outputs the status of the consumer. 
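// A worked example of the velocity math above (it matches the cases asserted in
// TestVelocity): with an offset history of {1, 21, 21, 0, ...}, average() ignores
// the zero entries, sees min=1 and max=21 across ct=3 samples, and returns
// (21-1)/(3-1) = 10 offsets per heartbeat. Histories with fewer than two non-zero
// samples, or where the offset never moved (min == max), yield 0.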
670 | func (c *claim) PrintState() { 671 | c.lock.RLock() 672 | defer c.lock.RUnlock() 673 | 674 | // "Claimed" status is from Marshal rationalizer, and "Terminated" status is from 675 | // the local claim object (indicates we've exited somehow) 676 | state := "----" 677 | cl := c.marshal.GetPartitionClaim(c.topic, c.partID) 678 | if cl.Claimed() { 679 | if c.Terminated() { 680 | state = "CL+T" 681 | } else { 682 | state = "CLMD" 683 | } 684 | } else if c.Terminated() { 685 | state = "TERM" 686 | } 687 | 688 | ct := 0 689 | for _, st := range c.tracking { 690 | if st { 691 | ct++ 692 | } 693 | } 694 | 695 | now := time.Now().Unix() 696 | 697 | log.Infof(" * %2d [%s]: offsets %d <= %d <= %d | %d", 698 | c.partID, state, c.offsets.Earliest, c.offsets.Current, 699 | c.offsets.Latest, c.offsets.Committed) 700 | log.Infof(" BC %d | LHB %d (%d) | OM %d | CB %d", 701 | c.beatCounter, c.lastHeartbeat, now-c.lastHeartbeat, 702 | c.outstandingMessages, c.cyclesBehind) 703 | log.Infof(" TRACK COMMITTED %d | TRACK OUTSTANDING %d", 704 | ct, len(c.tracking)-ct) 705 | log.Infof(" PV %0.2f | CV %0.2f", 706 | c.PartitionVelocity(), c.ConsumerVelocity()) 707 | } 708 | -------------------------------------------------------------------------------- /marshal/claim_test.go: -------------------------------------------------------------------------------- 1 | package marshal 2 | 3 | import ( 4 | "sync" 5 | "sync/atomic" 6 | "time" 7 | 8 | . "gopkg.in/check.v1" 9 | 10 | "github.com/zorkian/kafka/kafkatest" 11 | "github.com/zorkian/kafka/proto" 12 | ) 13 | 14 | var _ = Suite(&ClaimSuite{}) 15 | 16 | type ClaimSuite struct { 17 | c *C 18 | s *kafkatest.Server 19 | kc *KafkaCluster 20 | m *Marshaler 21 | ch chan *Message 22 | cl *claim 23 | } 24 | 25 | func (s *ClaimSuite) SetUpSuite(c *C) { 26 | ResetTestLogger(c) 27 | 28 | s.s = StartServer() 29 | 30 | opts := NewMarshalOptions() 31 | opts.BrokerConnectionLimit = 10 32 | opts.ConsumeRequestTimeout = 20 * time.Millisecond 33 | opts.MarshalRequestTimeout = 20 * time.Millisecond 34 | opts.MarshalRequestRetryWait = 1 * time.Millisecond 35 | 36 | var err error 37 | s.kc, err = Dial("claimsuite", []string{s.s.Addr()}, opts) 38 | c.Assert(err, IsNil) 39 | } 40 | 41 | func (s *ClaimSuite) SetUpTest(c *C) { 42 | // Give a second for the last test to finish up, this prevents messages from 43 | // releases from going into this test's pool 44 | time.Sleep(1 * time.Second) 45 | 46 | ResetTestLogger(c) 47 | 48 | s.c = c 49 | s.ch = make(chan *Message, 10) 50 | s.s.ResetTopic("test3") 51 | atomic.StoreInt32(s.kc.rsteps, 0) 52 | 53 | var err error 54 | s.m, err = s.kc.NewMarshaler("cl", newInstanceID()) 55 | c.Assert(err, IsNil) 56 | s.cl = newClaim("test3", 0, s.m, nil, s.ch, NewConsumerOptions()) 57 | c.Assert(s.cl, NotNil) 58 | } 59 | 60 | func (s *ClaimSuite) TearDownTest(c *C) { 61 | if s.cl != nil { 62 | s.cl.Release() 63 | } 64 | if s.m != nil { 65 | s.m.Terminate() 66 | } 67 | } 68 | 69 | func (s *ClaimSuite) TearDownSuite(c *C) { 70 | if s.kc != nil { 71 | s.kc.Terminate() 72 | } 73 | if s.s != nil { 74 | s.s.Close() 75 | } 76 | } 77 | 78 | func (s *ClaimSuite) Produce(topicName string, partID int, msgs ...string) int64 { 79 | var protos []*proto.Message 80 | for _, msg := range msgs { 81 | protos = append(protos, &proto.Message{Value: []byte(msg)}) 82 | } 83 | offset, err := s.m.cluster.producer.Produce(topicName, int32(partID), protos...) 
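	// In these tests the returned offset is that of the last message in the batch;
	// producing "m1".."m3" into an empty partition yields offset 2.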
84 | s.c.Assert(err, IsNil) 85 | return offset 86 | } 87 | 88 | func (s *ClaimSuite) WaitForRsteps(c *C, cluster *KafkaCluster, numSteps int) { 89 | steps, err := cluster.waitForRsteps(numSteps) 90 | c.Assert(err, IsNil) 91 | c.Assert(steps, Equals, numSteps) 92 | } 93 | 94 | func (s *ClaimSuite) TestOffsetUpdates(c *C) { 95 | // Test that the updateOffsets function works and updates offsets from Kafka 96 | c.Assert(s.cl.updateOffsets(), IsNil) 97 | c.Assert(s.Produce("test3", 0, "m1", "m2", "m3"), Equals, int64(2)) 98 | c.Assert(s.cl.updateOffsets(), IsNil) 99 | c.Assert(s.cl.offsets.Latest, Equals, int64(3)) 100 | } 101 | 102 | func (s *ClaimSuite) consumeOne(c *C) *Message { 103 | select { 104 | case msg := <-s.ch: 105 | return msg 106 | case <-time.After(3 * time.Second): 107 | c.Error("Timed out consuming a message.") 108 | } 109 | return nil 110 | } 111 | 112 | func (s *ClaimSuite) TestCommit(c *C) { 113 | // Test the commit message flow, ensuring that our offset only gets updated when 114 | // we have properly committed messages 115 | c.Assert(s.Produce("test3", 0, "m1", "m2", "m3", "m4", "m5", "m6"), Equals, int64(5)) 116 | c.Assert(s.cl.updateOffsets(), IsNil) 117 | c.Assert(s.cl.heartbeat(), Equals, true) 118 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 119 | c.Assert(s.cl.offsets.Earliest, Equals, int64(0)) 120 | c.Assert(s.cl.offsets.Latest, Equals, int64(6)) 121 | 122 | // Consume 1, heartbeat... offsets still 0 123 | msg1 := s.consumeOne(c) 124 | c.Assert(msg1.Value, DeepEquals, []byte("m1")) 125 | c.Assert(s.cl.updateOffsets(), IsNil) 126 | c.Assert(s.cl.heartbeat(), Equals, true) 127 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 128 | s.cl.lock.RLock() 129 | c.Assert(s.cl.tracking[0], Equals, false) 130 | s.cl.lock.RUnlock() 131 | 132 | // Consume 2, still 0 133 | msg2 := s.consumeOne(c) 134 | c.Assert(msg2.Value, DeepEquals, []byte("m2")) 135 | c.Assert(s.cl.updateOffsets(), IsNil) 136 | c.Assert(s.cl.heartbeat(), Equals, true) 137 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 138 | 139 | // Commit 1, offset 1 but only after heartbeat phase 140 | c.Assert(s.cl.Commit(msg1.Offset), IsNil) 141 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 142 | c.Assert(s.cl.updateOffsets(), IsNil) 143 | c.Assert(s.cl.heartbeat(), Equals, true) 144 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 145 | c.Assert(s.cl.numTrackingOffsets(), Equals, 5) 146 | 147 | // Consume 3, heartbeat, offset 1 148 | msg3 := s.consumeOne(c) 149 | c.Assert(msg3.Value, DeepEquals, []byte("m3")) 150 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 151 | c.Assert(s.cl.updateOffsets(), IsNil) 152 | c.Assert(s.cl.heartbeat(), Equals, true) 153 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 154 | 155 | // Commit #3, offset will stay 1! 
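	// (msg2 at offset 1 is still uncommitted, so Current cannot advance past it even
	// though offset 2 is now done; committing msg2 below is what lets it jump to 3.)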
156 | c.Assert(s.cl.Commit(msg3.Offset), IsNil) 157 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 158 | c.Assert(s.cl.updateOffsets(), IsNil) 159 | c.Assert(s.cl.heartbeat(), Equals, true) 160 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 161 | c.Assert(s.cl.numTrackingOffsets(), Equals, 5) 162 | 163 | // Commit #2, offset now advances to 3 164 | c.Assert(s.cl.Commit(msg2.Offset), IsNil) 165 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 166 | c.Assert(s.cl.updateOffsets(), IsNil) 167 | c.Assert(s.cl.heartbeat(), Equals, true) 168 | c.Assert(s.cl.offsets.Current, Equals, int64(3)) 169 | c.Assert(s.cl.numTrackingOffsets(), Equals, 3) 170 | 171 | // Attempt to commit invalid offset (never seen), make sure it errors 172 | msg3.Offset = 95 173 | c.Assert(s.cl.Commit(msg3.Offset), NotNil) 174 | 175 | // Commit the rest 176 | c.Assert(s.cl.Commit(s.consumeOne(c).Offset), IsNil) 177 | c.Assert(s.cl.Commit(s.consumeOne(c).Offset), IsNil) 178 | c.Assert(s.cl.Commit(s.consumeOne(c).Offset), IsNil) 179 | c.Assert(s.cl.offsets.Current, Equals, int64(3)) 180 | c.Assert(s.cl.updateOffsets(), IsNil) 181 | c.Assert(s.cl.heartbeat(), Equals, true) 182 | c.Assert(s.cl.offsets.Current, Equals, int64(6)) 183 | c.Assert(s.cl.numTrackingOffsets(), Equals, 0) 184 | } 185 | 186 | func (s *ClaimSuite) waitForTrackingOffsets(c *C, ct int) { 187 | for i := 0; i < 100; i++ { 188 | if s.cl.numTrackingOffsets() != ct { 189 | time.Sleep(10 * time.Millisecond) 190 | continue 191 | } 192 | } 193 | c.Assert(s.cl.numTrackingOffsets(), Equals, ct) 194 | } 195 | 196 | func (s *ClaimSuite) TestFlush(c *C) { 197 | // Test the commit message flow, ensuring that our offset only gets updated when 198 | // we have properly committed messages 199 | // 200 | // Basically the same as the heartbeat test, since a flush triggers a heartbeat 201 | c.Assert(s.Produce("test3", 0, "m1", "m2", "m3", "m4", "m5", "m6"), Equals, int64(5)) 202 | c.Assert(s.cl.updateOffsets(), IsNil) 203 | c.Assert(s.cl.heartbeat(), Equals, true) 204 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 205 | c.Assert(s.cl.offsets.Earliest, Equals, int64(0)) 206 | c.Assert(s.cl.offsets.Latest, Equals, int64(6)) 207 | s.waitForTrackingOffsets(c, 6) 208 | 209 | // Consume 1, Flush... 
offsets still 0 210 | msg1 := s.consumeOne(c) 211 | c.Assert(msg1.Value, DeepEquals, []byte("m1")) 212 | c.Assert(msg1.Offset, Equals, int64(0)) 213 | c.Assert(s.cl.Flush(), IsNil) 214 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 215 | s.cl.lock.RLock() 216 | val, ok := s.cl.tracking[0] 217 | c.Assert(ok, Equals, true) 218 | c.Assert(val, Equals, false) 219 | s.cl.lock.RUnlock() 220 | 221 | // Consume 2, still 0 222 | msg2 := s.consumeOne(c) 223 | c.Assert(msg2.Value, DeepEquals, []byte("m2")) 224 | c.Assert(msg2.Offset, Equals, int64(1)) 225 | c.Assert(s.cl.Flush(), IsNil) 226 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 227 | c.Assert(s.cl.numTrackingOffsets(), Equals, 6) 228 | 229 | // Commit 1, offset 1 but only after Flush phase 230 | c.Assert(s.cl.Commit(msg1.Offset), IsNil) 231 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 232 | c.Assert(s.cl.Flush(), IsNil) 233 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 234 | c.Assert(s.cl.numTrackingOffsets(), Equals, 5) 235 | 236 | // Produce some more 237 | c.Assert(s.Produce("test3", 0, "m7"), Equals, int64(6)) 238 | s.waitForTrackingOffsets(c, 6) 239 | 240 | // Consume 3, Flush, offset 1 241 | msg3 := s.consumeOne(c) 242 | c.Assert(msg3.Value, DeepEquals, []byte("m3")) 243 | c.Assert(msg3.Offset, Equals, int64(2)) 244 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 245 | c.Assert(s.cl.Flush(), IsNil) 246 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 247 | 248 | // Assert that the above didn't update the Latest offset, the Flush 249 | // flow doesn't (unlike heartbeat which does) 250 | c.Assert(s.cl.offsets.Latest, Equals, int64(6)) 251 | 252 | // Commit #3, offset will stay 1! we're still tracking 6 because the 253 | // committed one in middle position must stay tracked until the 254 | // previous messages are committed 255 | c.Assert(s.cl.Commit(msg3.Offset), IsNil) 256 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 257 | c.Assert(s.cl.Flush(), IsNil) 258 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 259 | c.Assert(s.cl.numTrackingOffsets(), Equals, 6) 260 | 261 | // Now a heartbeat happens, it should change nothing except Latest 262 | c.Assert(s.cl.updateOffsets(), IsNil) 263 | c.Assert(s.cl.heartbeat(), Equals, true) 264 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 265 | c.Assert(s.cl.numTrackingOffsets(), Equals, 6) 266 | c.Assert(s.cl.offsets.Latest, Equals, int64(7)) 267 | 268 | // Commit #2, offset now advances to 3 and the outstanding is 4 269 | c.Assert(s.cl.Commit(msg2.Offset), IsNil) 270 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 271 | c.Assert(s.cl.Flush(), IsNil) 272 | c.Assert(s.cl.offsets.Current, Equals, int64(3)) 273 | c.Assert(s.cl.numTrackingOffsets(), Equals, 4) 274 | 275 | // Attempt to commit invalid offset (never seen), make sure it errors 276 | msg3.Offset = 95 277 | c.Assert(s.cl.Commit(msg3.Offset), NotNil) 278 | 279 | // Commit the rest 280 | c.Assert(s.cl.Commit(s.consumeOne(c).Offset), IsNil) 281 | c.Assert(s.cl.Commit(s.consumeOne(c).Offset), IsNil) 282 | c.Assert(s.cl.Commit(s.consumeOne(c).Offset), IsNil) 283 | c.Assert(s.cl.Commit(s.consumeOne(c).Offset), IsNil) 284 | c.Assert(s.cl.offsets.Current, Equals, int64(3)) 285 | c.Assert(s.cl.Flush(), IsNil) 286 | c.Assert(s.cl.offsets.Current, Equals, int64(7)) 287 | c.Assert(s.cl.numTrackingOffsets(), Equals, 0) 288 | 289 | // One last heartbeat, should be no change from the above Flush since 290 | // nothing has happened 291 | c.Assert(s.cl.updateOffsets(), IsNil) 292 | c.Assert(s.cl.heartbeat(), Equals, true) 
293 | c.Assert(s.cl.offsets.Current, Equals, int64(7)) 294 | c.Assert(s.cl.numTrackingOffsets(), Equals, 0) 295 | c.Assert(s.cl.offsets.Latest, Equals, int64(7)) 296 | } 297 | 298 | func (s *ClaimSuite) BenchmarkConsumeAndCommit(c *C) { 299 | // Produce N messages for consumption into the test partition and hopefully this 300 | // doesn't end up being the really slow part of the operation 301 | msgs := make([]string, 0, c.N) 302 | for i := 0; i < c.N; i++ { 303 | msgs = append(msgs, "message") 304 | } 305 | s.Produce("test3", 0, msgs...) 306 | 307 | // Now consume everything and immediately commit it 308 | for i := 0; i < c.N; i++ { 309 | if msg := s.consumeOne(c); msg != nil { 310 | s.cl.Commit(msg.Offset) 311 | } 312 | } 313 | } 314 | 315 | func (s *ClaimSuite) assertRelease(c *C) { 316 | for i := 0; i < 100; i++ { 317 | time.Sleep(30 * time.Millisecond) 318 | 319 | cl := s.cl.marshal.GetPartitionClaim(s.cl.topic, s.cl.partID) 320 | if !cl.Claimed() { 321 | break 322 | } 323 | } 324 | 325 | cl := s.cl.marshal.GetPartitionClaim(s.cl.topic, s.cl.partID) 326 | c.Assert(cl.Claimed(), Equals, false) 327 | } 328 | 329 | func (s *ClaimSuite) assertNoRelease(c *C) { 330 | for i := 0; i < 100; i++ { 331 | time.Sleep(30 * time.Millisecond) 332 | 333 | cl := s.cl.marshal.GetPartitionClaim(s.cl.topic, s.cl.partID) 334 | if cl.Claimed() { 335 | break 336 | } 337 | } 338 | 339 | cl := s.cl.marshal.GetPartitionClaim(s.cl.topic, s.cl.partID) 340 | c.Assert(cl.Claimed(), Equals, true) 341 | } 342 | 343 | func (s *ClaimSuite) TestRelease(c *C) { 344 | // Test that calling Release on a claim properly releases the partition 345 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Not(Equals), int64(0)) 346 | c.Assert(s.cl.Terminated(), Equals, false) 347 | c.Assert(s.cl.Release(), Equals, true) 348 | s.assertRelease(c) 349 | s.WaitForRsteps(c, s.m.cluster, 3) 350 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Equals, int64(0)) 351 | c.Assert(s.cl.Release(), Equals, false) 352 | } 353 | 354 | func (s *ClaimSuite) TestTerminate(c *C) { 355 | // Test that calling Terminate on a claim properly sets the flag and commits offsets 356 | // for the partition but does not release 357 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Not(Equals), int64(0)) 358 | c.Assert(s.cl.Terminated(), Equals, false) 359 | c.Assert(s.cl.Terminate(), Equals, true) 360 | c.Assert(s.cl.Terminated(), Equals, true) 361 | s.assertNoRelease(c) 362 | } 363 | 364 | func (s *ClaimSuite) TestTerminateDoesNotDeadlock(c *C) { 365 | // Test that termination is not blocked by a full messages channel 366 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Not(Equals), int64(0)) 367 | c.Assert(s.cl.Terminated(), Equals, false) 368 | 369 | // Replace message chan with length 1 chan for testing 370 | s.cl.lock.Lock() 371 | s.cl.messages = make(chan *Message, 1) 372 | s.cl.lock.Unlock() 373 | 374 | // Now produce 2 messages and wait for them to be consumed 375 | c.Assert(s.Produce("test3", 0, "m1", "m2"), Equals, int64(1)) 376 | s.waitForTrackingOffsets(c, 2) 377 | 378 | // Assert message channel has 1 message in it, we have 2 tracking but 1 message 379 | // in channel because second is blocked 380 | c.Assert(len(s.cl.messages), Equals, 1) 381 | 382 | // Assert that messageLock is being held, this works by sending a goroutine to 383 | // get the lock and then in our current function we sleep a bit. If the sleep expires, 384 | // that means the goroutine was blocked (or never scheduled). 
We get around that by 385 | // using the WaitGroup to make sure it actually scheduled. 386 | wasScheduled := &sync.WaitGroup{} 387 | wasScheduled.Add(1) 388 | probablyHeld := make(chan bool, 2) 389 | go func() { 390 | wasScheduled.Done() // Got scheduled! 391 | s.cl.messagesLock.Lock() 392 | defer s.cl.messagesLock.Unlock() 393 | probablyHeld <- false 394 | }() 395 | wasScheduled.Wait() 396 | select { 397 | case <-time.After(100 * time.Millisecond): 398 | probablyHeld <- true 399 | } 400 | c.Assert(<-probablyHeld, Equals, true) 401 | 402 | // Now terminate, this should return and work and not be claimed 403 | c.Assert(s.cl.Release(), Equals, true) 404 | c.Assert(s.cl.Terminated(), Equals, true) 405 | s.assertRelease(c) 406 | } 407 | 408 | func (s *ClaimSuite) TestCommitOutstanding(c *C) { 409 | // Test that calling CommitOffsets should commit offsets for outstanding messages and 410 | // updates claim tracking 411 | c.Assert(s.Produce("test3", 0, "m1", "m2", "m3", "m4", "m5", "m6"), Equals, int64(5)) 412 | c.Assert(s.cl.updateOffsets(), IsNil) 413 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 414 | c.Assert(s.cl.offsets.Earliest, Equals, int64(0)) 415 | c.Assert(s.cl.offsets.Latest, Equals, int64(6)) 416 | 417 | // This test requires all messages to have been consumed into the channel, else 418 | // we can get inconsistent results 419 | readyChan := make(chan struct{}) 420 | go func() { 421 | defer close(readyChan) 422 | for { 423 | s.cl.lock.RLock() 424 | if len(s.cl.messages) == 6 { 425 | s.cl.lock.RUnlock() 426 | break 427 | } 428 | s.cl.lock.RUnlock() 429 | 430 | time.Sleep(100 * time.Millisecond) 431 | } 432 | }() 433 | select { 434 | case <-readyChan: 435 | // all good, continue 436 | case <-time.After(3 * time.Second): 437 | // Timeout reached, we've failed 438 | c.FailNow() 439 | } 440 | 441 | // Consume 1, heartbeat... 
offsets still 0 442 | msg1 := s.consumeOne(c) 443 | c.Assert(msg1.Value, DeepEquals, []byte("m1")) 444 | c.Assert(s.cl.Commit(msg1.Offset), IsNil) 445 | c.Assert(s.cl.numTrackingOffsets(), Equals, 6) 446 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 447 | 448 | // Commit the offsets....should update current offset and tracking for the claim 449 | c.Assert(s.cl.Terminate(), Equals, true) 450 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 451 | c.Assert(s.cl.numTrackingOffsets(), Equals, 5) 452 | } 453 | 454 | func (s *ClaimSuite) TestCurrentLag(c *C) { 455 | // Test that GetCurrentLag returns the correct numbers in various cases 456 | s.cl.offsets.Current = 0 457 | s.cl.offsets.Latest = 0 458 | c.Assert(s.cl.GetCurrentLag(), Equals, int64(0)) 459 | 460 | s.cl.offsets.Current = 1 461 | s.cl.offsets.Latest = 0 462 | c.Assert(s.cl.GetCurrentLag(), Equals, int64(0)) 463 | 464 | s.cl.offsets.Current = 0 465 | s.cl.offsets.Latest = 1 466 | c.Assert(s.cl.GetCurrentLag(), Equals, int64(1)) 467 | 468 | s.cl.offsets.Current = 1 469 | s.cl.offsets.Latest = 2 470 | c.Assert(s.cl.GetCurrentLag(), Equals, int64(1)) 471 | } 472 | 473 | func (s *ClaimSuite) TestHeartbeat(c *C) { 474 | // Ensure that our heartbeats are updating the marshal structures appropriately 475 | // (makes sure clients are seeing the right values) 476 | c.Assert(s.m.GetPartitionClaim("test3", 0).CurrentOffset, Equals, int64(0)) 477 | s.cl.offsets.Current = 10 478 | c.Assert(s.cl.heartbeat(), Equals, true) 479 | s.WaitForRsteps(c, s.m.cluster, 3) 480 | c.Assert(s.m.GetPartitionClaim("test3", 0).CurrentOffset, Equals, int64(10)) 481 | 482 | // And test that releasing means we can't update heartbeat anymore 483 | c.Assert(s.cl.Release(), Equals, true) 484 | s.WaitForRsteps(c, s.m.cluster, 4) 485 | s.cl.offsets.Current = 20 486 | c.Assert(s.cl.heartbeat(), Equals, false) 487 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Equals, int64(0)) 488 | c.Assert(s.m.GetPartitionClaim("test3", 0).CurrentOffset, Equals, int64(0)) 489 | c.Assert(s.m.GetLastPartitionClaim("test3", 0).CurrentOffset, Equals, int64(10)) 490 | } 491 | 492 | func (s *ClaimSuite) TestReleaseIfWedged(c *C) { 493 | s.cl.offsets.Current = 10 494 | s.cl.offsetCurrentHistory = [10]int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 495 | s.cl.offsets.Latest = 20 496 | s.cl.offsetLatestHistory = [10]int64{11, 12, 13, 14, 15, 16, 17, 18, 19, 20} 497 | s.cl.lastMessageTime = time.Now() 498 | c.Assert(s.cl.healthCheck(), Equals, true) 499 | 500 | // Now say the last message was a while ago, but our velocities are non-zero 501 | // so we shouldn't release 502 | s.cl.lastMessageTime = time.Now().Add(-(HeartbeatInterval + 1) * time.Second) 503 | c.Assert(s.cl.healthCheck(), Equals, true) 504 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Not(Equals), int64(0)) 505 | 506 | // Set both PV and CV to be 0 and they're equal, should also succeed 507 | s.cl.offsets.Current = 12 508 | s.cl.offsetCurrentHistory = [10]int64{12, 12, 12, 12, 12, 12, 12, 12, 12, 12} 509 | s.cl.offsets.Latest = 12 510 | s.cl.offsetLatestHistory = [10]int64{12, 12, 12, 12, 12, 12, 12, 12, 12, 12} 511 | c.Assert(s.cl.healthCheck(), Equals, true) 512 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Not(Equals), int64(0)) 513 | 514 | // Set both PV and CV to be 0 and they're not-equal, should release 515 | s.cl.offsets.Current = 12 516 | s.cl.offsetCurrentHistory = [10]int64{12, 12, 12, 12, 12, 12, 12, 12, 12, 12} 517 | s.cl.offsets.Latest = 13 518 | s.cl.offsetLatestHistory = 
[10]int64{13, 13, 13, 13, 13, 13, 13, 13, 13, 13} 519 | c.Assert(s.cl.healthCheck(), Equals, false) 520 | s.WaitForRsteps(c, s.m.cluster, 3) 521 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Equals, int64(0)) 522 | } 523 | 524 | func (s *ClaimSuite) TestReleaseIfWedged2(c *C) { 525 | s.cl.offsets.Current = 10 526 | s.cl.offsetCurrentHistory = [10]int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 527 | s.cl.offsets.Latest = 20 528 | s.cl.offsetLatestHistory = [10]int64{11, 12, 13, 14, 15, 16, 17, 18, 19, 20} 529 | s.cl.lastMessageTime = time.Now() 530 | c.Assert(s.cl.healthCheck(), Equals, true) 531 | 532 | // Set CV=0 and PV>0, should release 533 | s.cl.offsets.Current = 12 534 | s.cl.offsetCurrentHistory = [10]int64{12, 12, 12, 12, 12, 12, 12, 12, 12, 12} 535 | s.cl.offsets.Latest = 14 536 | s.cl.offsetLatestHistory = [10]int64{13, 13, 13, 13, 13, 13, 13, 14, 14, 14} 537 | s.cl.lastMessageTime = time.Now().Add(-(HeartbeatInterval + 1) * time.Second) 538 | c.Assert(s.cl.healthCheck(), Equals, false) 539 | s.WaitForRsteps(c, s.m.cluster, 3) 540 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Equals, int64(0)) 541 | } 542 | 543 | func (s *ClaimSuite) TestVelocity(c *C) { 544 | // Test that the velocity functions perform as expected given the expected inputs 545 | s.cl.offsetCurrentHistory = [10]int64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0} 546 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(0)) 547 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 548 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 549 | 550 | s.cl.offsetCurrentHistory = [10]int64{1, 2, 0, 0, 0, 0, 0, 0, 0, 0} 551 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(1)) 552 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 553 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 554 | 555 | s.cl.offsetCurrentHistory = [10]int64{1, 2, 3, 0, 0, 0, 0, 0, 0, 0} 556 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(1)) 557 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 558 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 559 | 560 | s.cl.offsetCurrentHistory = [10]int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 561 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(1)) 562 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 563 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 564 | 565 | s.cl.offsetCurrentHistory = [10]int64{1, 21, 21, 0, 0, 0, 0, 0, 0, 0} 566 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(10)) 567 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 568 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 569 | 570 | s.cl.offsetCurrentHistory = [10]int64{1, 21, 21, 21, 21, 0, 0, 0, 0, 0} 571 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(5)) 572 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 573 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 574 | 575 | s.cl.offsetCurrentHistory = [10]int64{21, 21, 1, 21, 21, 0, 0, 0, 0, 0} 576 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(5)) 577 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 578 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 579 | 580 | s.cl.offsetCurrentHistory = [10]int64{21, 0, 0, 0, 0, 0, 0, 0, 0, 0} 581 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(0)) 582 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 583 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 584 | } 585 | 586 | func (s *ClaimSuite) TestHealthCheck(c *C) { 587 | // 
Ensure that the health check system returns expected values for given states 588 | s.cl.offsets.Current = 0 589 | s.cl.offsetCurrentHistory = [10]int64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0} 590 | s.cl.offsets.Latest = 0 591 | s.cl.offsetLatestHistory = [10]int64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0} 592 | c.Assert(s.cl.healthCheck(), Equals, true) 593 | c.Assert(s.cl.cyclesBehind, Equals, 0) 594 | 595 | // Put us in an "unhealthy" state, PV is high and we aren't caught up 596 | s.cl.offsets.Latest = 10 597 | s.cl.offsetLatestHistory = [10]int64{1, 10, 0, 0, 0, 0, 0, 0, 0, 0} 598 | c.Assert(s.cl.healthCheck(), Equals, true) 599 | c.Assert(s.cl.cyclesBehind, Equals, 1) 600 | 601 | // Now we're "caught up" even PV>CV we're healthy 602 | s.cl.offsets.Current = 21 603 | s.cl.offsetCurrentHistory = [10]int64{1, 6, 11, 16, 21, 0, 0, 0, 0, 0} 604 | s.cl.offsets.Latest = 21 605 | s.cl.offsetLatestHistory = [10]int64{1, 11, 21, 0, 0, 0, 0, 0, 0, 0} 606 | c.Assert(s.cl.ConsumerVelocity() < s.cl.PartitionVelocity(), Equals, true) 607 | c.Assert(s.cl.healthCheck(), Equals, true) 608 | c.Assert(s.cl.cyclesBehind, Equals, 0) 609 | 610 | // Test that "predictive speed" is working, i.e., that the consumer is 611 | // considered healthy when it's within a heartbeat of the end 612 | s.cl.offsets.Latest = 31 613 | s.cl.offsetLatestHistory = [10]int64{1, 11, 21, 31, 0, 0, 0, 0, 0, 0} 614 | c.Assert(s.cl.ConsumerVelocity() < s.cl.PartitionVelocity(), Equals, true) 615 | c.Assert(s.cl.healthCheck(), Equals, true) 616 | c.Assert(s.cl.cyclesBehind, Equals, 0) 617 | 618 | // Test that PV=0, CV=0 but behind is unhealthy 619 | s.cl.offsets.Current = 21 620 | s.cl.offsetCurrentHistory = [10]int64{21, 21, 21, 21, 21, 21, 21, 21, 21, 21} 621 | s.cl.offsets.Latest = 23 622 | s.cl.offsetLatestHistory = [10]int64{23, 23, 23, 23, 23, 23, 23, 23, 23, 23} 623 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(0)) 624 | c.Assert(s.cl.PartitionVelocity(), Equals, float64(0)) 625 | c.Assert(s.cl.ConsumerVelocity() == s.cl.PartitionVelocity(), Equals, true) 626 | c.Assert(s.cl.healthCheck(), Equals, true) 627 | c.Assert(s.cl.cyclesBehind, Equals, 1) 628 | c.Assert(s.cl.healthCheck(), Equals, true) 629 | c.Assert(s.cl.cyclesBehind, Equals, 2) 630 | 631 | // Now we advance one message, giving us SOME velocity -- even tho PV is still 0 632 | // this should make us healthy 633 | s.cl.offsets.Current = 22 634 | s.cl.offsetCurrentHistory = [10]int64{21, 21, 21, 21, 21, 21, 21, 21, 21, 22} 635 | c.Assert(s.cl.PartitionVelocity(), Equals, float64(0)) 636 | c.Assert(s.cl.ConsumerVelocity() > s.cl.PartitionVelocity(), Equals, true) 637 | c.Assert(s.cl.healthCheck(), Equals, true) 638 | c.Assert(s.cl.cyclesBehind, Equals, 0) 639 | 640 | // Now handle the "far behind but catching up" case, CV>PV but beyond the prediction 641 | s.cl.offsets.Current = 31 642 | s.cl.offsetCurrentHistory = [10]int64{21, 22, 23, 24, 26, 27, 28, 29, 30, 31} 643 | s.cl.offsets.Latest = 132 644 | s.cl.offsetLatestHistory = [10]int64{123, 124, 125, 126, 127, 128, 129, 130, 131, 132} 645 | c.Assert(s.cl.PartitionVelocity(), Equals, float64(1)) 646 | c.Assert(s.cl.ConsumerVelocity() > s.cl.PartitionVelocity(), Equals, true) 647 | c.Assert(s.cl.healthCheck(), Equals, true) 648 | c.Assert(s.cl.cyclesBehind, Equals, 0) 649 | 650 | // Now we're behind and fail health checks 3 times, this will release 651 | s.cl.offsets.Current = 22 652 | s.cl.offsetCurrentHistory = [10]int64{21, 21, 21, 21, 21, 21, 21, 21, 21, 22} 653 | s.cl.offsets.Latest = 32 654 | s.cl.offsetLatestHistory = 
[10]int64{1, 11, 21, 32, 0, 0, 0, 0, 0, 0} 655 | c.Assert(s.cl.ConsumerVelocity() < s.cl.PartitionVelocity(), Equals, true) 656 | c.Assert(s.cl.healthCheck(), Equals, true) 657 | c.Assert(s.cl.cyclesBehind, Equals, 1) 658 | c.Assert(s.cl.healthCheck(), Equals, true) 659 | c.Assert(s.cl.cyclesBehind, Equals, 2) 660 | c.Assert(s.cl.healthCheck(), Equals, false) 661 | c.Assert(s.cl.cyclesBehind, Equals, 3) 662 | s.WaitForRsteps(c, s.m.cluster, 3) 663 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Equals, int64(0)) 664 | c.Assert(s.m.GetPartitionClaim("test3", 0).CurrentOffset, Equals, int64(0)) 665 | c.Assert(s.m.GetLastPartitionClaim("test3", 0).CurrentOffset, Equals, int64(22)) 666 | 667 | // If we are okay with CV 1 && !options.ClaimEntireTopic { 121 | return nil, errors.New("ClaimEntireTopic must be set if provided more than one topic") 122 | } else if len(topicNames) == 0 { 123 | return nil, errors.New("must provide at least one topic") 124 | } 125 | 126 | partitions := make(map[string]int) 127 | 128 | for _, topic := range topicNames { 129 | partitions[topic] = m.Partitions(topic) 130 | } 131 | 132 | // Construct base structure 133 | c := &Consumer{ 134 | alive: new(int32), 135 | marshal: m, 136 | topics: topicNames, 137 | partitions: partitions, 138 | options: options, 139 | messages: make(chan *Message, m.cluster.options.MaxMessageQueue), 140 | lock: &sync.RWMutex{}, 141 | rand: rand.New(rand.NewSource(time.Now().UnixNano())), 142 | claims: make(map[string]map[int]*claim), 143 | topicClaimsChan: make(chan map[string]bool), 144 | topicClaimsUpdated: make(chan struct{}, 1), 145 | stopChan: make(chan struct{}), 146 | doneChan: make(chan struct{}), 147 | } 148 | atomic.StoreInt32(c.alive, 1) 149 | m.addNewConsumer(c) 150 | 151 | // Take the lock for now as we're updating various points internally 152 | c.lock.Lock() 153 | defer c.lock.Unlock() 154 | 155 | // Start notifier about topic claims now because people are going to start 156 | // listening immediately 157 | go c.sendTopicClaimsLoop() 158 | 159 | // Fast-reclaim: iterate over existing claims in the given topics and see if 160 | // any of them look to be from previous incarnations of this Marshal (client, group) 161 | // and are currently claimed. If so, claim them. Do this before the claim manager 162 | // is started. 163 | if c.options.FastReclaim { 164 | claimedTopics := make(map[string]bool) 165 | for topic, partitionCount := range c.partitions { 166 | for partID := 0; partID < partitionCount; partID++ { 167 | cl := c.marshal.GetPartitionClaim(topic, partID) 168 | 169 | // If not presently claimed, or not claimed by us, skip 170 | if !cl.Claimed() || 171 | cl.ClientID != c.marshal.ClientID() || 172 | cl.GroupID != c.marshal.GroupID() { 173 | continue 174 | } 175 | 176 | // This looks to be ours, let's do it. This is basically the fast path, 177 | // and our heartbeat will happen shortly from the automatic health 178 | // check which fires up immediately on newClaim. 
179 | log.Infof("[%s:%d] attempting to fast-reclaim", topic, partID) 180 | if _, ok := c.claims[topic]; !ok { 181 | c.claims[topic] = make(map[int]*claim) 182 | } 183 | 184 | // update topic claims 185 | if options.ClaimEntireTopic { 186 | if partID == 0 { 187 | claimedTopics[topic] = true 188 | } 189 | 190 | // don't fast re-claim partitions for a topic unless partition 0 is claimed 191 | if !claimedTopics[topic] { 192 | log.Infof("[%s:%d] blocked fast-reclaim because topic is not claimed", 193 | topic, partID) 194 | continue 195 | } 196 | } 197 | 198 | // Attempt to claim, this can fail 199 | claim := newClaim( 200 | topic, partID, c.marshal, c, c.messages, options) 201 | if claim == nil { 202 | log.Warningf("[%s:%d] failed to fast-reclaim", topic, partID) 203 | } else { 204 | c.claims[topic][partID] = claim 205 | go claim.healthCheckLoop() 206 | c.sendTopicClaimsUpdate() 207 | } 208 | } 209 | 210 | // this check needs to be after iterating all partitions in a topic 211 | if options.ClaimEntireTopic && len(claimedTopics) >= options.MaximumClaims { 212 | log.Infof("reached max-topics for fast-reclaim. Claimed topics: %v", 213 | claimedTopics) 214 | break 215 | } 216 | } 217 | } 218 | 219 | go c.manageClaims() 220 | return c, nil 221 | } 222 | 223 | // NewConsumerOptions returns a default set of options for the Consumer. 224 | func NewConsumerOptions() ConsumerOptions { 225 | return ConsumerOptions{ 226 | FastReclaim: true, 227 | ClaimEntireTopic: false, 228 | GreedyClaims: false, 229 | ReleaseClaimsIfBehind: true, 230 | } 231 | } 232 | 233 | func (c *Consumer) defaultTopic() string { 234 | c.lock.RLock() 235 | defer c.lock.RUnlock() 236 | 237 | if len(c.partitions) > 1 { 238 | log.Errorf("attempted to claim partitions for more than one topic") 239 | go c.Terminate(false) 240 | return "" 241 | } 242 | 243 | for topic := range c.partitions { 244 | return topic 245 | } 246 | 247 | log.Errorf("couldn't find default topic!") 248 | go c.Terminate(false) 249 | return "" 250 | } 251 | 252 | func (c *Consumer) defaultTopicPartitions() int { 253 | c.lock.RLock() 254 | defer c.lock.RUnlock() 255 | 256 | if len(c.partitions) > 1 { 257 | log.Errorf("attempted to claim partitions for more than one topic") 258 | go c.Terminate(false) 259 | return 0 260 | } 261 | 262 | for _, partitions := range c.partitions { 263 | return partitions 264 | } 265 | 266 | log.Errorf("couldn't find default topic!") 267 | go c.Terminate(false) 268 | return 0 269 | } 270 | 271 | // claimTerminated is called by a claim when they've terminated. This is used so we can 272 | // ensure topic claim semantics are adhered to. In topic claim mode this will be called 273 | // by every claim during a release. 274 | func (c *Consumer) claimTerminated(cl *claim, released bool) { 275 | // For now, we don't care except in the topic claim mode 276 | if !c.options.ClaimEntireTopic { 277 | return 278 | } 279 | 280 | // Send an update at the end 281 | defer c.sendTopicClaimsUpdate() 282 | 283 | // This is a topic claim, so we need to perform the same operation on the rest of 284 | // the claims in this topic 285 | c.lock.RLock() 286 | defer c.lock.RUnlock() 287 | for _, claim := range c.claims[cl.topic] { 288 | if cl != claim { 289 | if released { 290 | go claim.Release() 291 | } else { 292 | go claim.Terminate() 293 | } 294 | } 295 | } 296 | } 297 | 298 | // tryClaimPartition attempts to claim a partition and make it available in the consumption 299 | // flow. 
If this is called a second time on a partition we already own, it will return 300 | // false. Returns true only if the partition was never claimed and we succeeded in 301 | // claiming it. 302 | func (c *Consumer) tryClaimPartition(topic string, partID int) bool { 303 | if c.options.ClaimEntireTopic { 304 | if c.isTopicClaimLimitReached(topic) { 305 | return false 306 | } 307 | } else { 308 | if c.isClaimLimitReached() { 309 | return false 310 | } 311 | } 312 | 313 | // See if partition is presently claimed by anybody, if so, do nothing. This is an 314 | // optimization but overall the whole system is racy and that race is handled elsewhere. 315 | // This gives us no protection. 316 | currentClaim := c.marshal.GetPartitionClaim(topic, partID) 317 | if currentClaim.Claimed() { 318 | return false 319 | } 320 | 321 | // Attempt to claim. This handles asynchronously and might ultimately fail because 322 | // someone beat us to the claim or we failed to produce to Kafka or something. This can 323 | // block for a while. 324 | newClaim := newClaim(topic, partID, c.marshal, c, c.messages, c.options) 325 | if newClaim == nil { 326 | return false 327 | } 328 | go newClaim.healthCheckLoop() 329 | 330 | // Critical section. Engage the lock here, we hold it until we exit. This lock can take 331 | // some time to get, so the following code has to be resilient to state changes that might 332 | // happen due to lock dilation. 333 | c.lock.Lock() 334 | defer c.lock.Unlock() 335 | 336 | // If claim says it's terminated, do nothing and exit. This can happen if something 337 | // in the claim failed to produce to Kafka. 338 | if newClaim.Terminated() { 339 | return false 340 | } 341 | 342 | // Ugh, we managed to claim a partition in our termination state. Don't worry too hard 343 | // and just release it. 344 | if c.Terminated() { 345 | // This can be a long blocking operation so send it to the background. We ultimately 346 | // don't care if it finishes or not, because the heartbeat will save us if we don't 347 | // submit a release message. This is just an optimization. 348 | go newClaim.Release() 349 | return false 350 | } 351 | 352 | // If we have an old claim (i.e. this is a reclaim) then assert that the old claim has 353 | // been properly terminated. If not, then this could indicate a bug in the Marshal state 354 | // machine. 355 | topicClaims, ok := c.claims[topic] 356 | if ok { 357 | oldClaim, ok := topicClaims[partID] 358 | if ok && oldClaim != nil { 359 | if !oldClaim.Terminated() { 360 | log.Errorf("Internal double-claim for %s:%d.", topic, partID) 361 | log.Errorf("This is a catastrophic error. We're terminating Marshal.") 362 | log.Errorf("No further messages will be available. Please restart.") 363 | go newClaim.Release() 364 | go c.terminateAndCleanup(false, false) 365 | go func() { 366 | c.marshal.PrintState() 367 | c.marshal.Terminate() 368 | }() 369 | return false 370 | } 371 | } 372 | } 373 | 374 | if _, ok := c.claims[topic]; !ok { 375 | c.claims[topic] = make(map[int]*claim) 376 | } 377 | 378 | // Save the claim, this makes it available for message consumption and status. 379 | c.claims[topic][partID] = newClaim 380 | return true 381 | } 382 | 383 | // rndIntn gets a random number. 384 | func (c *Consumer) rndIntn(n int) int { 385 | c.lock.Lock() 386 | defer c.lock.Unlock() 387 | 388 | return c.rand.Intn(n) 389 | } 390 | 391 | // releaseClaims releases all claims this consumer has. This is called when a consumer is paused. 
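tryClaimPartition records successful claims in a nested map keyed first by topic and then by partition ID, creating the inner map on demand before storing into it. A small standalone sketch of that bookkeeping pattern (fakeClaim is a stand-in for the package's unexported claim type):

```go
package main

import "fmt"

// fakeClaim stands in for the package's *claim type in this sketch.
type fakeClaim struct {
	topic  string
	partID int
}

func main() {
	claims := make(map[string]map[int]*fakeClaim)

	store := func(topic string, partID int) {
		// The inner map must exist before we can index into it, mirroring
		// the check performed in tryClaimPartition.
		if _, ok := claims[topic]; !ok {
			claims[topic] = make(map[int]*fakeClaim)
		}
		claims[topic][partID] = &fakeClaim{topic: topic, partID: partID}
	}

	store("some-topic", 3)
	fmt.Println(len(claims["some-topic"])) // 1
}
```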
392 | func (c *Consumer) releaseClaims() { 393 | c.lock.Lock() 394 | defer c.lock.Unlock() 395 | 396 | // Release all claims that this consumer keeps track of and remove them from the claims map. 397 | for topic, partitions := range c.claims { 398 | for partID, claim := range partitions { 399 | if !claim.Terminated() { 400 | log.Warningf("[%s:%d] Consumer still paused, releasing claim", 401 | topic, partID) 402 | claim.Release() 403 | } 404 | } 405 | c.claims[topic] = make(map[int]*claim) 406 | } 407 | } 408 | 409 | // claimPartitions actually attempts to claim partitions. If the current consumer is 410 | // set on aggressive, this will try to claim ALL partitions that are free. Balanced mode 411 | // will claim a single partition. 412 | func (c *Consumer) claimPartitions() { 413 | func() { 414 | c.lock.RLock() 415 | defer c.lock.RUnlock() 416 | if len(c.partitions) > 1 { 417 | log.Errorf("attempted to claim partitions for more than a single topic") 418 | go c.Terminate(false) 419 | } 420 | }() 421 | 422 | topic := c.defaultTopic() 423 | partitions := c.defaultTopicPartitions() 424 | if partitions <= 0 { 425 | return 426 | } 427 | 428 | // Don't bother trying to make claims if we are at our claim limit. 429 | // This is just an optimization, because we aren't holding the lock here 430 | // this check is repeated inside tryClaimPartition. 431 | if c.isClaimLimitReached() { 432 | return 433 | } 434 | 435 | offset := c.rndIntn(partitions) 436 | for i := 0; i < partitions; i++ { 437 | partID := (i + offset) % partitions 438 | 439 | // Get the most recent claim for this partition 440 | lastClaim := c.marshal.GetLastPartitionClaim(topic, partID) 441 | if lastClaim.Claimed() { 442 | continue 443 | } 444 | 445 | // If the last claim was by this particular consumer, skip if we just released. 446 | // This is because we might have become unhealthy and dropped it or we might already be 447 | // claiming this partition. 448 | if lastClaim.GroupID == c.marshal.groupID && 449 | lastClaim.ClientID == c.marshal.clientID { 450 | // Check release time, if it's over a heartbeat interval allow us to reclaim it 451 | if time.Now().Unix()-lastClaim.LastRelease < HeartbeatInterval { 452 | log.Infof("[%s:%d] skipping unclaimed partition because we recently released it", 453 | topic, partID) 454 | continue 455 | } else { 456 | log.Infof("[%s:%d] reclaiming because we released it a while ago", 457 | topic, partID) 458 | } 459 | } 460 | 461 | // Unclaimed, so attempt to claim it 462 | if !c.tryClaimPartition(topic, partID) { 463 | continue 464 | } 465 | 466 | // If greedy claims is disabled, finish here 467 | if !c.options.GreedyClaims { 468 | break 469 | } 470 | } 471 | } 472 | 473 | // isTopicClaimLimitReached indicates whether we can claim any partition of this topic 474 | // or not given the topics we've already claimed and MaximumClaims 475 | func (c *Consumer) isTopicClaimLimitReached(topic string) bool { 476 | if c.options.MaximumClaims <= 0 { 477 | return false 478 | } 479 | 480 | c.lock.RLock() 481 | defer c.lock.RUnlock() 482 | 483 | claimed := make(map[string]bool) 484 | for topic, topicClaims := range c.claims { 485 | if claim, ok := topicClaims[0]; ok && !claim.Terminated() { 486 | claimed[topic] = true 487 | } 488 | } 489 | 490 | if !claimed[topic] && len(claimed) >= c.options.MaximumClaims { 491 | return true 492 | } 493 | return false 494 | } 495 | 496 | // claimTopic attempts to claim the entire topic if we're in that mode. 
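claimPartitions above visits every partition exactly once but starts the scan at a random index, so several consumers starting at the same time are unlikely to all fight over partition 0 first. A standalone sketch of that scan order (the partition count here is arbitrary):

```go
package main

import (
	"fmt"
	"math/rand"
)

func main() {
	partitions := 8
	offset := rand.Intn(partitions)

	// Visits each partition ID exactly once, starting at a random one,
	// mirroring the (i + offset) % partitions loop in claimPartitions.
	for i := 0; i < partitions; i++ {
		partID := (i + offset) % partitions
		fmt.Println("would try to claim partition", partID)
	}
}
```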
We use partition 0
497 | as the key, anybody who has that partition has claimed the entire topic. This requires all
498 | consumers to use this mode.
499 | func (c *Consumer) claimTopics() {
500 | // Whenever we're done here, try to send an update
501 | defer c.sendTopicClaimsUpdate()
502 |
503 | // Get a copy of c.partitions so we don't have to hold the lock throughout
504 | // this entire method
505 | topicPartitions := make(map[string]int)
506 | func() {
507 | c.lock.RLock()
508 | defer c.lock.RUnlock()
509 |
510 | for k, v := range c.partitions {
511 | topicPartitions[k] = v
512 | }
513 | }()
514 |
515 | // Now iterate each and try to claim
516 | for topic, partitions := range topicPartitions {
517 | if partitions <= 0 {
518 | continue
519 | }
520 |
521 | // We use partition 0 as our "key". Whoever claims partition 0 is considered the owner of
522 | // the topic. See if partition 0 is claimed or not.
523 | lastClaim := c.marshal.GetLastPartitionClaim(topic, 0)
524 | if lastClaim.Claimed() {
525 | // If it's not claimed by us, skip this topic.
526 | if lastClaim.GroupID != c.marshal.groupID ||
527 | lastClaim.ClientID != c.marshal.clientID {
528 | // in case we had this topic, but now somebody else has claimed it
529 | continue
530 | }
531 | } else {
532 | // Unclaimed, so attempt to claim partition 0. This is how we key topic claims.
533 | log.Infof("[%s] attempting to claim topic (key partition 0)", topic)
534 |
535 | // We need to check whether we're above the maximum number of topics to claim.
536 | // Only the first k topics should be claimed, along with all of their
537 | // partitions. This is enforced by controlling how many key partitions
538 | // (partition 0) we claim.
539 | if c.isTopicClaimLimitReached(topic) {
540 | log.Debugf("[%s] blocked claiming topic due to limit: %d",
541 | topic, c.options.MaximumClaims)
542 | continue
543 | }
544 |
545 | if !c.tryClaimPartition(topic, 0) {
546 | continue
547 | }
548 | log.Infof("[%s] claimed topic (key partition 0) successfully", topic)
549 |
550 | // Optimistically send update to try to reduce latency between us claiming a
551 | // topic and notifying a listener
552 | c.sendTopicClaimsUpdate()
553 | }
554 |
555 | // We either just claimed or we already own the 0th partition. Let's iterate
556 | // through all partitions and attempt to claim any that we don't own yet.
557 | for partID := 1; partID < partitions; partID++ {
558 | if !c.marshal.Claimed(topic, partID) {
559 | log.Infof("[%s:%d] claiming partition (topic claim mode)", topic, partID)
560 | c.tryClaimPartition(topic, partID)
561 | }
562 | }
563 | }
564 | }
565 |
566 | // sendTopicClaimsUpdate can be called by various codepaths that have learned that there is
567 | // an update to send down to the users.
568 | func (c *Consumer) sendTopicClaimsUpdate() {
569 | select {
570 | case c.topicClaimsUpdated <- struct{}{}:
571 | // Just sends a marker on the channel.
572 | default:
573 | }
574 | }
575 |
576 | // sendTopicClaimsLoop analyzes the current topic claims and sends an update if
577 | // and only if there is a change in claim state. I.e., a new topic is claimed
578 | // or a topic is released.
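sendTopicClaimsUpdate relies on a one-slot marker channel and a select with a default case, so any number of callers can request an update without blocking, and redundant requests collapse into a single pending marker. A standalone sketch of that coalescing-notification pattern:

```go
package main

import "fmt"

func main() {
	// A one-slot channel acts as a "dirty" flag: many notifications coalesce
	// into at most one pending marker, and notifiers never block.
	updated := make(chan struct{}, 1)

	notify := func() {
		select {
		case updated <- struct{}{}:
		default: // a marker is already pending; drop this one
		}
	}

	notify()
	notify() // coalesced with the first

	for i := 0; i < 2; i++ {
		select {
		case <-updated:
			fmt.Println("drained a pending update")
		default:
			fmt.Println("nothing pending: the second notify was coalesced")
		}
	}
}
```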
579 | func (c *Consumer) sendTopicClaimsLoop() { 580 | defer close(c.topicClaimsChan) 581 | 582 | lastClaims := make(map[string]bool) 583 | keepRunning := true 584 | 585 | for keepRunning { 586 | select { 587 | case <-c.topicClaimsUpdated: 588 | // Continue on and send an update 589 | case <-c.stopChan: 590 | // Send one more and exit 591 | keepRunning = false 592 | } 593 | 594 | // Get consistent claims and send them 595 | claims, err := c.GetCurrentTopicClaims() 596 | if err != nil { 597 | log.Errorf("Failed to send topic claims update: %s", err) 598 | continue 599 | } 600 | 601 | // See if anything has changed 602 | anyUpdates := false 603 | for topic, claimed := range lastClaims { 604 | stillClaimed, ok := claims[topic] 605 | if !ok { 606 | // Existed before but does not exist now 607 | anyUpdates = true 608 | } else if claimed != stillClaimed { 609 | // Status changed in some way 610 | anyUpdates = true 611 | } 612 | } 613 | for topic := range claims { 614 | if _, ok := lastClaims[topic]; !ok { 615 | // New topic claim 616 | anyUpdates = true 617 | } 618 | } 619 | lastClaims = claims 620 | 621 | // If no updates, continue 622 | if !anyUpdates { 623 | continue 624 | } 625 | 626 | // Drain out any unconsumed update 627 | select { 628 | case <-c.topicClaimsChan: 629 | default: 630 | } 631 | 632 | // This should never block since we're the only writer 633 | c.topicClaimsChan <- claims 634 | } 635 | } 636 | 637 | // updatePartitionCounts pulls the latest partition counts per topic from the Marshaler. 638 | func (c *Consumer) updatePartitionCounts() { 639 | // Write lock as we're updating c.partitions below, potentially 640 | c.lock.Lock() 641 | defer c.lock.Unlock() 642 | 643 | for _, topic := range c.marshal.Topics() { 644 | // Only update partitions for topics we already know about 645 | if _, ok := c.partitions[topic]; ok { 646 | c.partitions[topic] = c.marshal.Partitions(topic) 647 | } 648 | } 649 | } 650 | 651 | // manageClaims is our internal state machine that handles partitions and claiming new 652 | // ones (or releasing ones). 653 | func (c *Consumer) manageClaims() { 654 | for !c.Terminated() { 655 | c.updatePartitionCounts() 656 | 657 | // If we learn that our consumer group is paused, release all claims. 658 | if c.marshal.cluster.IsGroupPaused(c.marshal.GroupID()) { 659 | c.releaseClaims() 660 | } else { 661 | // Attempt to claim more partitions, this always runs and will keep running until all 662 | // partitions in the topic are claimed (by somebody). 663 | if c.options.ClaimEntireTopic { 664 | c.claimTopics() 665 | } else { 666 | c.claimPartitions() 667 | } 668 | } 669 | // Now sleep a bit so we don't pound things 670 | // TODO: Raise this later, we shouldn't attempt to claim this fast, this is just for 671 | // development. 672 | time.Sleep(time.Duration(c.rndIntn(3000)) * time.Millisecond) 673 | } 674 | } 675 | 676 | // Terminated returns whether or not this consumer has been terminated. 677 | func (c *Consumer) Terminated() bool { 678 | return atomic.LoadInt32(c.alive) == 0 679 | } 680 | 681 | // terminateAndCleanup instructs the consumer to commit its offsets, 682 | // possibly release its partitions, and possibly remove its reference from 683 | // the associated marshaler. This will allow other consumers to begin consuming. 
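The loop above only pushes a value to listeners when the set of claimed topics actually changes, which it detects with a two-pass map comparison: topics that vanished or flipped state, then topics that are newly present. A standalone sketch of that comparison on its own:

```go
package main

import "fmt"

// claimsChanged reports whether two topic-claim maps differ, using the same
// two-pass comparison as sendTopicClaimsLoop: first topics that disappeared
// or changed state, then topics that appeared.
func claimsChanged(last, current map[string]bool) bool {
	for topic, claimed := range last {
		now, ok := current[topic]
		if !ok || claimed != now {
			return true
		}
	}
	for topic := range current {
		if _, ok := last[topic]; !ok {
			return true
		}
	}
	return false
}

func main() {
	last := map[string]bool{"topic-a": true}
	current := map[string]bool{"topic-a": true, "topic-b": true}
	fmt.Println(claimsChanged(last, current)) // true
}
```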
684 | func (c *Consumer) terminateAndCleanup(release bool, remove bool) bool {
685 | if !atomic.CompareAndSwapInt32(c.alive, 1, 0) {
686 | return false
687 | }
688 |
689 | // Purposefully done outside of the lock below so we can send the message
690 | // that stopping is happening ASAP, and doneChan is deferred so it will close
691 | // at some point in the future
692 | close(c.stopChan)
693 | defer close(c.doneChan)
694 |
695 | latestTopicClaims := make(map[string]bool)
696 | releasedTopics := make(map[string]bool)
697 |
698 | c.lock.Lock()
699 | defer c.lock.Unlock()
700 |
701 | for topic, topicClaims := range c.claims {
702 | for partID, claim := range topicClaims {
703 | if claim != nil {
704 | if release {
705 | claim.Release()
706 | if partID == 0 {
707 | releasedTopics[topic] = true
708 | }
709 | } else {
710 | claim.Terminate()
711 | }
712 | }
713 | }
714 | }
715 |
716 | close(c.messages)
717 |
718 | for topic := range c.claims {
719 | if !releasedTopics[topic] {
720 | latestTopicClaims[topic] = true
721 | }
722 | }
723 |
724 | // Optionally remove consumer from its marshal. Doing so is recommended
725 | // if the marshal doesn't explicitly remove the consumer.
726 | if remove {
727 | c.marshal.removeConsumer(c)
728 | }
729 |
730 | // Update the claims one last time
731 | c.sendTopicClaimsUpdate()
732 | return true
733 | }
734 |
735 | // Terminate instructs the consumer to clean up and allow other consumers to begin consuming.
736 | // (If you do not call this method before exiting, things will still work, but more slowly.)
737 | func (c *Consumer) Terminate(release bool) bool {
738 | return c.terminateAndCleanup(release, true)
739 | }
740 |
741 | // GetCurrentTopicClaims returns the topics that are currently claimed by this
742 | // consumer. It is only relevant when ClaimEntireTopic is set.
743 | func (c *Consumer) GetCurrentTopicClaims() (map[string]bool, error) {
744 | c.lock.RLock()
745 | defer c.lock.RUnlock()
746 |
747 | if !c.options.ClaimEntireTopic {
748 | return nil, errors.New(
749 | "GetCurrentTopicClaims requires options.ClaimEntireTopic be set")
750 | }
751 |
752 | claimedTopics := make(map[string]bool)
753 | if c.Terminated() {
754 | return claimedTopics, nil
755 | }
756 |
757 | // Iterate each topic we know about and see if we have partition 0 claimed
758 | // for that topic; if so, consider it valid
759 | for topic := range c.partitions {
760 | cl := c.marshal.GetPartitionClaim(topic, 0)
761 | if cl.ClientID == c.marshal.ClientID() &&
762 | cl.GroupID == c.marshal.GroupID() {
763 | // We own this topic
764 | claimedTopics[topic] = true
765 | }
766 | }
767 | return claimedTopics, nil
768 | }
769 |
770 | // TopicClaims returns a read-only channel that receives updates for topic claims.
771 | // It's only relevant when ClaimEntireTopic is set.
772 | func (c *Consumer) TopicClaims() <-chan map[string]bool {
773 | if !c.options.ClaimEntireTopic {
774 | err := fmt.Errorf(
775 | "TopicClaims is only relevant when ClaimEntireTopic is set")
776 | log.Error(err.Error())
777 | }
778 |
779 | return c.topicClaimsChan
780 | }
781 |
782 | // GetCurrentLag returns the number of messages that this consumer is lagging by. Note that
783 | // this value can be unstable in the beginning of a run, as we might not have claimed all of
784 | // the partitions we will end up claiming, or we might have overclaimed and need to back off.
785 | // Ideally this will settle towards 0. If it continues to rise, that implies there isn't
786 | // enough consumer capacity.
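In ClaimEntireTopic mode an application would typically watch the TopicClaims channel above to learn when it gains or loses responsibility for a topic. A hedged usage sketch (the import path, Consumer type, and log destination are assumed from the rest of this repository; the handling is a placeholder):

```go
package example

import (
	"log"

	"github.com/zorkian/marshal/marshal"
)

// watchTopicClaims logs ownership updates delivered on the TopicClaims
// channel. The channel is closed when the consumer terminates, which ends
// the range loop.
func watchTopicClaims(consumer *marshal.Consumer) {
	for claims := range consumer.TopicClaims() {
		for topic, owned := range claims {
			if owned {
				log.Printf("now responsible for topic %s", topic)
			}
		}
	}
}
```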
787 | func (c *Consumer) GetCurrentLag() int64 {
788 | c.lock.RLock()
789 | defer c.lock.RUnlock()
790 |
791 | var lag int64
792 | for _, topicClaims := range c.claims {
793 | for _, cl := range topicClaims {
794 | if !cl.Terminated() {
795 | lag += cl.GetCurrentLag()
796 | }
797 | }
798 | }
799 | return lag
800 | }
801 |
802 | // GetCurrentLoad returns a number representing the "load" of this consumer. Think of this
803 | // like a load average in Unix systems: the number is related to how much work the
804 | // system is doing, but by itself it doesn't tell you much.
805 | func (c *Consumer) GetCurrentLoad() int {
806 | return c.getNumActiveClaims()
807 | }
808 |
809 | // getNumActiveClaims returns the number of claims actively owned by this Consumer.
810 | func (c *Consumer) getNumActiveClaims() (ct int) {
811 | c.lock.RLock()
812 | defer c.lock.RUnlock()
813 |
814 | for _, topicClaims := range c.claims {
815 | for _, cl := range topicClaims {
816 | if !cl.Terminated() {
817 | ct++
818 | }
819 | }
820 | }
821 | return
822 | }
823 |
824 | // isClaimLimitReached returns whether this Consumer has reached its maximum number of claims.
825 | func (c *Consumer) isClaimLimitReached() bool {
826 | // if we're claiming topics, then this is not applicable. It's handled inside claimTopics
827 | return !c.options.ClaimEntireTopic && c.options.MaximumClaims > 0 &&
828 | c.getNumActiveClaims() >= c.options.MaximumClaims
829 | }
830 |
831 | // ConsumeChannel returns a read-only channel. Messages that are retrieved from Kafka will be
832 | // made available in this channel.
833 | func (c *Consumer) ConsumeChannel() <-chan *Message {
834 | return c.messages
835 | }
836 |
837 | // consumeOne returns a single message. This is mostly used within the test suite to
838 | // make testing easier as it simulates the message handling behavior.
839 | func (c *Consumer) consumeOne() *Message {
840 | msg := <-c.messages
841 | c.Commit(msg)
842 | return msg
843 | }
844 |
845 | // Commit is called when you've finished processing a message. This operation marks
846 | // the offset as committed internally and is suitable for at-least-once processing
847 | // because we do not immediately write the offsets to storage. We will flush the
848 | // offsets periodically (based on the heartbeat interval).
849 | func (c *Consumer) Commit(msg *Message) error {
850 | cl, ok := func() (*claim, bool) {
851 | c.lock.RLock()
852 | defer c.lock.RUnlock()
853 |
854 | cl, ok := c.claims[msg.Topic][int(msg.Partition)]
855 | return cl, ok
856 | }()
857 | if !ok {
858 | return fmt.Errorf("Message not committed (claim for topic %s, partition %d expired).",
859 | msg.Topic, msg.Partition)
860 | }
861 | return cl.Commit(msg.Offset)
862 | }
863 |
864 | // Flush will cause us to update all of the committed offsets. This operation can be
865 | // performed to periodically sync offsets without waiting on the internal flushing mechanism.
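The Commit contract described above gives at-least-once behavior only if a message is committed after it has been fully processed. A hedged usage sketch of that consume-then-commit loop (process is a placeholder supplied by the application; the consumer is assumed to come from NewConsumer):

```go
package example

import (
	"log"

	"github.com/zorkian/marshal/marshal"
)

// consumeLoop drains the consumer's channel and commits each message only
// after it has been processed, which is what preserves at-least-once
// semantics. The channel closes when the consumer terminates.
func consumeLoop(consumer *marshal.Consumer, process func(*marshal.Message) error) {
	for msg := range consumer.ConsumeChannel() {
		if err := process(msg); err != nil {
			log.Printf("processing failed, not committing offset: %s", err)
			continue
		}
		if err := consumer.Commit(msg); err != nil {
			log.Printf("commit failed: %s", err)
		}
	}
}
```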
866 | func (c *Consumer) Flush() error { 867 | c.lock.RLock() 868 | defer c.lock.RUnlock() 869 | 870 | claims := make([]*claim, 0) 871 | for topic := range c.claims { 872 | for partID := range c.claims[topic] { 873 | claims = append(claims, c.claims[topic][partID]) 874 | } 875 | } 876 | 877 | // Do flushing concurrently because they involve sending messages to Kafka 878 | // which can be slow if done serially 879 | waiter := &sync.WaitGroup{} 880 | waiter.Add(len(claims)) 881 | errChan := make(chan error, len(claims)) 882 | 883 | for _, cl := range claims { 884 | cl := cl 885 | go func() { 886 | defer waiter.Done() 887 | if err := cl.Flush(); err != nil { 888 | errChan <- err 889 | } 890 | }() 891 | } 892 | 893 | // Wait for all flushes to finish 894 | waiter.Wait() 895 | close(errChan) 896 | 897 | // Channel will be empty unless there was an error 898 | anyErrors := false 899 | for err := range errChan { 900 | anyErrors = true 901 | log.Errorf("Flush error: %s", err) 902 | } 903 | if anyErrors { 904 | return errors.New("One or more errors encountered flushing offsets.") 905 | } 906 | return nil 907 | } 908 | 909 | // CommitByToken is called when you've finished processing a message. In the at-least-once 910 | // consumption case, this will allow the "last processed offset" to move forward so that 911 | // we can never see this message again. This particular method is used when you've only 912 | // got a CommitToken to commit from. 913 | func (c *Consumer) CommitByToken(token CommitToken) error { 914 | cl, ok := func() (*claim, bool) { 915 | c.lock.RLock() 916 | defer c.lock.RUnlock() 917 | 918 | cl, ok := c.claims[token.topic][token.partID] 919 | return cl, ok 920 | }() 921 | if !ok { 922 | return fmt.Errorf("Message not committed (claim for topic %s, partition %d expired).", 923 | token.topic, token.partID) 924 | } 925 | return cl.Commit(token.offset) 926 | } 927 | 928 | // PrintState outputs the status of the consumer. 929 | func (c *Consumer) PrintState() { 930 | c.lock.RLock() 931 | defer c.lock.RUnlock() 932 | 933 | log.Infof(" CONSUMER: %d messages in queue", len(c.messages)) 934 | for _, topic := range c.topics { 935 | log.Infof(" TOPIC: %s", topic) 936 | for _, claim := range c.claims[topic] { 937 | claim.PrintState() 938 | } 939 | } 940 | } 941 | --------------------------------------------------------------------------------
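Flush, just above, fans the per-claim flushes out to one goroutine each and gathers failures on an error channel sized to the number of workers, so no worker ever blocks while reporting. A standalone sketch of that fan-out-and-collect pattern with placeholder work:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	jobs := []int{1, 2, 3, 4}

	var wg sync.WaitGroup
	wg.Add(len(jobs))

	// Buffered to len(jobs) so a failing worker never blocks on reporting,
	// mirroring how Flush sizes its error channel to the number of claims.
	errChan := make(chan error, len(jobs))

	for _, j := range jobs {
		j := j // capture the loop variable for the goroutine
		go func() {
			defer wg.Done()
			if j%2 == 0 {
				errChan <- fmt.Errorf("job %d failed", j)
			}
		}()
	}

	wg.Wait()
	close(errChan)

	for err := range errChan {
		fmt.Println("collected:", err)
	}
}
```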