├── .travis.yml
├── marshal
│   ├── log.go
│   ├── log_test.go
│   ├── cluster_test.go
│   ├── message_test.go
│   ├── topic.go
│   ├── marshal_test.go
│   ├── message.go
│   ├── rationalizer_test.go
│   ├── rationalizer.go
│   ├── marshal.go
│   ├── cluster.go
│   ├── claim.go
│   ├── claim_test.go
│   ├── consumer_test.go
│   └── consumer.go
├── LICENSE
├── example
│   └── main.go
├── debug
│   └── main.go
├── PROTOCOL.md
└── README.md
/.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | go: 3 | - 1.7 4 | 5 | env: 6 | - GOMAXPROCS=4 7 | 8 | sudo: false 9 | 10 | script: 11 | - cd marshal 12 | - test `gofmt -l . | wc -l` = 0 13 | - go test -p=1 -race -timeout=600s -check.v ./... 14 | 15 | -------------------------------------------------------------------------------- /marshal/log.go: -------------------------------------------------------------------------------- 1 | /* 2 | * portal - marshal 3 | * 4 | * this package exists for the simple reason that go vet complains bitterly 5 | * that the go-logging package we use named its error printer 'Error' without 6 | * the trailing 'f', and yet it accepts a format string. blah. 7 | * 8 | */ 9 | 10 | package marshal 11 | 12 | import ( 13 | "sync" 14 | 15 | "github.com/op/go-logging" 16 | ) 17 | 18 | var log *logging.Logger 19 | var logMu = &sync.Mutex{} 20 | 21 | func init() { 22 | logMu.Lock() 23 | defer logMu.Unlock() 24 | 25 | if log != nil { 26 | return 27 | } 28 | log = logging.MustGetLogger("KafkaMarshal") 29 | logging.SetLevel(logging.INFO, "KafkaMarshal") 30 | } 31 | 32 | // SetLogger can be called with a logging.Logger in order to overwrite our internal 33 | // logger. Useful if you need to control the logging (such as in tests). 34 | func SetLogger(l *logging.Logger) { 35 | logMu.Lock() 36 | defer logMu.Unlock() 37 | 38 | log = l 39 | } 40 | -------------------------------------------------------------------------------- /marshal/log_test.go: -------------------------------------------------------------------------------- 1 | package marshal 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/zorkian/kafka" 7 | "github.com/zorkian/kafka/kafkatest" 8 | "github.com/op/go-logging" 9 | 10 | . "gopkg.in/check.v1" 11 | ) 12 | 13 | type logTestBackend struct { 14 | c *C 15 | mu *sync.Mutex 16 | } 17 | 18 | var logTest = &logTestBackend{mu: &sync.Mutex{}} 19 | 20 | func init() { 21 | logMu.Lock() 22 | defer logMu.Unlock() 23 | 24 | leveledLogger := logging.AddModuleLevel(logTest) 25 | leveledLogger.SetLevel(logging.DEBUG, "KafkaMarshal") 26 | leveledLogger.SetLevel(logging.DEBUG, "KafkaClient") 27 | leveledLogger.SetLevel(logging.DEBUG, "KafkaTest") 28 | 29 | log = logging.MustGetLogger("KafkaMarshal") 30 | log.SetBackend(leveledLogger) 31 | 32 | kafkatest.SetLogger(log) 33 | kafka.SetLogger(log) 34 | } 35 | 36 | func (l *logTestBackend) SetC(c *C) { 37 | l.mu.Lock() 38 | defer l.mu.Unlock() 39 | 40 | l.c = c 41 | } 42 | 43 | func ResetTestLogger(c *C) { 44 | logTest.SetC(c) 45 | } 46 | 47 | func (l *logTestBackend) Log(lvl logging.Level, cd int, rec *logging.Record) error { 48 | l.mu.Lock() 49 | defer l.mu.Unlock() 50 | 51 | l.c.Log(rec.Formatted(cd)) 52 | return nil 53 | } 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 authors and contributors. 2 | 3 | All rights reserved.
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are 7 | met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 23 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 24 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 30 | THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /example/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | * simple Marshal example consumer 3 | */ 4 | 5 | package main 6 | 7 | import ( 8 | "github.com/zorkian/marshal/marshal" 9 | "github.com/op/go-logging" 10 | ) 11 | 12 | func main() { 13 | log := logging.MustGetLogger("MarshalExample") 14 | 15 | // Construct the marshaler. There will be one of these globally, and it's thread safe so can 16 | // be used from any goroutine. 17 | marshaler, err := marshal.NewMarshaler( 18 | "marshal_example_client_id", 19 | "marshal_example_consumer_group_id", 20 | []string{"127.0.0.1:9092"}) 21 | if err != nil { 22 | log.Fatalf("Failed to construct marshaler: %s", err) 23 | } 24 | 25 | // Make sure to terminate the Marshaler. This ensures that we release all of the partition 26 | // locks we're holding so other consumers can pick them up. 27 | defer marshaler.Terminate() 28 | 29 | // Now we set up a basic consumer; and we enable GreedyClaims which is useful in low QPS 30 | // environments as it will cause the consumer to claim as many partitions as it can 31 | // up front. Of course, if you have a very busy topic with many partitions, you will 32 | // not want to use this. 33 | options := marshal.NewConsumerOptions() 34 | options.GreedyClaims = true 35 | 36 | consumer, err := marshaler.NewConsumer([]string{"some-topic"}, options) 37 | if err != nil { 38 | log.Fatalf("Failed to construct consumer: %s", err) 39 | } 40 | defer consumer.Terminate(true) 41 | 42 | // Now we can get the consumption channel. Messages will be available in this channel 43 | // and you can consume from it in many different goroutines if your message processing 44 | // is such that it takes a while. 45 | msgChan := consumer.ConsumeChannel() 46 | 47 | // You can spin up many goroutines to process messages; how many depends entirely on the type 48 | // of workload you have. See the docs. 
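// Note for readers of this example: main() returns as soon as the workers below are
// spawned, which runs the deferred Terminate calls and exits the process. A real
// consumer would block after starting its workers (for example on a signal channel
// or a sync.WaitGroup) so that they keep processing messages.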
49 | for i := 0; i < 10; i++ { 50 | i := i 51 | go func() { 52 | for { 53 | msg := <-msgChan 54 | log.Info("[%d] got message: %s", i, msg.Value) 55 | 56 | // Now we have to commit the message now that we're done with it. If you don't 57 | // commit, then Marshal will never record forward progress and will eventually 58 | // terminate. 59 | consumer.Commit(msg) 60 | } 61 | }() 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /marshal/cluster_test.go: -------------------------------------------------------------------------------- 1 | package marshal 2 | 3 | import ( 4 | . "gopkg.in/check.v1" 5 | 6 | "github.com/zorkian/kafka/kafkatest" 7 | ) 8 | 9 | var _ = Suite(&ClusterSuite{}) 10 | 11 | type ClusterSuite struct { 12 | s *kafkatest.Server 13 | m *Marshaler 14 | m2 *Marshaler 15 | } 16 | 17 | func (s *ClusterSuite) SetUpTest(c *C) { 18 | ResetTestLogger(c) 19 | 20 | s.s = StartServer() 21 | 22 | var err error 23 | s.m, err = NewMarshaler("cl", "gr", []string{s.s.Addr()}) 24 | c.Assert(err, IsNil) 25 | c.Assert(s.m, NotNil) 26 | 27 | s.m2, err = NewMarshaler("cl", "gr2", []string{s.s.Addr()}) 28 | c.Assert(err, IsNil) 29 | c.Assert(s.m2, NotNil) 30 | } 31 | 32 | func (s *ClusterSuite) TearDownTest(c *C) { 33 | s.m.Terminate() 34 | s.s.Close() 35 | } 36 | 37 | func (s *ClusterSuite) TestGetTopicState(c *C) { 38 | // Always works 39 | c.Assert(s.m.cluster.getPartitionState("gr", "test2", 0), NotNil) 40 | 41 | // Should error (not claimed) 42 | topic, err := s.m.getClaimedPartitionState("test2", 0) 43 | c.Assert(topic, IsNil) 44 | c.Assert(err, NotNil) 45 | 46 | // Now claim this partition 47 | c.Assert(s.m.ClaimPartition("test2", 0), Equals, true) 48 | steps, err := s.m.cluster.waitForRsteps(1) 49 | c.Assert(err, IsNil) 50 | c.Assert(steps, Equals, 1) 51 | 52 | // getClaimed should now work for our group 53 | topic, err = s.m.getClaimedPartitionState("test2", 0) 54 | c.Assert(topic, NotNil) 55 | c.Assert(err, IsNil) 56 | 57 | // And fail here 58 | topic, err = s.m2.getClaimedPartitionState("test2", 0) 59 | c.Assert(topic, IsNil) 60 | c.Assert(err, NotNil) 61 | 62 | // And fail here (our group, diff partition) 63 | topic, err = s.m.getClaimedPartitionState("test2", 1) 64 | c.Assert(topic, IsNil) 65 | c.Assert(err, NotNil) 66 | 67 | // Release partition now 68 | c.Assert(s.m.ReleasePartition("test2", 0, 0), IsNil) 69 | steps, err = s.m.cluster.waitForRsteps(2) 70 | c.Assert(err, IsNil) 71 | c.Assert(steps, Equals, 2) 72 | 73 | // getClaimed should now fail again for our group 74 | topic, err = s.m.getClaimedPartitionState("test2", 0) 75 | c.Assert(topic, IsNil) 76 | c.Assert(err, NotNil) 77 | 78 | // And fail here 79 | topic, err = s.m2.getClaimedPartitionState("test2", 0) 80 | c.Assert(topic, IsNil) 81 | c.Assert(err, NotNil) 82 | 83 | // And fail here (our group, diff partition) 84 | topic, err = s.m.getClaimedPartitionState("test2", 1) 85 | c.Assert(topic, IsNil) 86 | c.Assert(err, NotNil) 87 | } 88 | -------------------------------------------------------------------------------- /debug/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | * simple Marshal debug/timer utility 3 | * 4 | * To use: point this binary at a Kafka server and give it a topic and some consumer options. 5 | * It will then spin up a Marshaler and start to claim the topic. We time every operation and 6 | * will report some statistics about the state of the world. 
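 *
 * An illustrative invocation (all flags are defined in main() below; the broker and
 * topic values are placeholders for your own cluster):
 *   go run ./debug -broker localhost:9092 -topic test64 -claim-topic -greedy-claim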
7 | */ 8 | 9 | package main 10 | 11 | import ( 12 | "flag" 13 | "time" 14 | 15 | "github.com/zorkian/marshal/marshal" 16 | "github.com/op/go-logging" 17 | ) 18 | 19 | var log = logging.MustGetLogger("MarshalDebug") 20 | 21 | type timeableFunc func() 22 | 23 | func timeIt(text string, tf timeableFunc) { 24 | start := time.Now() 25 | tf() 26 | elapsed := time.Now().Sub(start) 27 | log.Info("<%0.2f ms> %s", float64(elapsed.Nanoseconds())/1000000.0, text) 28 | } 29 | 30 | func main() { 31 | broker := flag.String("broker", "localhost:9092", "ip:port of a single broker") 32 | group := flag.String("group", "debug-group", "group ID to use") 33 | client := flag.String("client", "debug-client", "client ID to use") 34 | topic := flag.String("topic", "test64", "topic to test against") 35 | claimTopic := flag.Bool("claim-topic", false, "claim entire topic mode") 36 | greedyClaim := flag.Bool("greedy-claim", false, "turn on greedy claims") 37 | fastReclaim := flag.Bool("fast-reclaim", false, "enable fast reclaim mode") 38 | printOnly := flag.Bool("print-state-only", false, "only print state, do not claim") 39 | flag.Parse() 40 | 41 | // Raise marshal debugging level 42 | logging.SetLevel(logging.DEBUG, "KafkaMarshal") 43 | 44 | // Construction timing 45 | var m *marshal.Marshaler 46 | timeIt("construct Marshaler", func() { 47 | var err error 48 | m, err = marshal.NewMarshaler(*client, *group, []string{*broker}) 49 | if err != nil { 50 | log.Fatalf("Failed to construct Marshaler: %s", err) 51 | } 52 | }) 53 | defer timeIt("terminate Marshaler", func() { m.Terminate() }) 54 | 55 | // If we're in print mode just do that and exit 56 | if *printOnly { 57 | m.PrintState() 58 | return 59 | } 60 | 61 | // Ensure target topic exists 62 | partitions := m.Partitions(*topic) 63 | if partitions == 0 { 64 | log.Fatalf("Topic %s has no partitions/does not exist.", *topic) 65 | } 66 | log.Info("Topic %s has %d partitions.", *topic, partitions) 67 | 68 | // Set up consumption of the topic with the options they gave us 69 | options := marshal.NewConsumerOptions() 70 | options.GreedyClaims = *greedyClaim 71 | options.FastReclaim = *fastReclaim 72 | options.ClaimEntireTopic = *claimTopic 73 | 74 | timeIt("claim all partitions", func() { 75 | var c *marshal.Consumer 76 | timeIt("construct Consumer", func() { 77 | var err error 78 | c, err = m.NewConsumer([]string{*topic}, options) 79 | if err != nil { 80 | log.Fatalf("Failed to construct consumer: %s", err) 81 | } 82 | }) 83 | defer timeIt("terminate Consumer", func() { c.Terminate(false) }) 84 | 85 | // Wait for all partitions to be claimed 86 | for c.GetCurrentLoad() < partitions { 87 | time.Sleep(10 * time.Millisecond) 88 | } 89 | }) 90 | 91 | m.PrintState() 92 | } 93 | -------------------------------------------------------------------------------- /marshal/message_test.go: -------------------------------------------------------------------------------- 1 | package marshal 2 | 3 | import . 
"gopkg.in/check.v1" 4 | 5 | var _ = Suite(&MessageSuite{}) 6 | 7 | type MessageSuite struct{} 8 | 9 | func (s *MessageSuite) SetUpTest(c *C) { 10 | ResetTestLogger(c) 11 | } 12 | 13 | func (s *MessageSuite) TestMessageEncode(c *C) { 14 | base := msgBase{ 15 | Version: 4, 16 | Time: 2, 17 | InstanceID: "ii", 18 | ClientID: "cl", 19 | GroupID: "gr", 20 | Topic: "t", 21 | PartID: 3, 22 | } 23 | c.Assert(base.Encode(), Equals, "4/2/ii/cl/gr/t/3") 24 | 25 | hb := msgHeartbeat{ 26 | msgBase: base, 27 | CurrentOffset: 5, 28 | } 29 | c.Assert(hb.Encode(), Equals, "Heartbeat/4/2/ii/cl/gr/t/3/5") 30 | 31 | cp := msgClaimingPartition{ 32 | msgBase: base, 33 | } 34 | c.Assert(cp.Encode(), Equals, "ClaimingPartition/4/2/ii/cl/gr/t/3") 35 | 36 | rp := msgReleasingPartition{ 37 | msgBase: base, 38 | CurrentOffset: 7, 39 | } 40 | c.Assert(rp.Encode(), Equals, "ReleasingPartition/4/2/ii/cl/gr/t/3/7") 41 | 42 | cm := msgClaimingMessages{ 43 | msgBase: base, 44 | ProposedCurrentOffset: 9, 45 | } 46 | c.Assert(cm.Encode(), Equals, "ClaimingMessages/4/2/ii/cl/gr/t/3/9") 47 | } 48 | 49 | func (s *MessageSuite) TestMessageDecode(c *C) { 50 | msg, err := decode([]byte("banana")) 51 | c.Assert(msg, IsNil) 52 | c.Assert(err, NotNil) 53 | 54 | msg, err = decode([]byte("Heartbeat/4/2/ii/cl/gr/t/1/2")) 55 | c.Assert(msg, NotNil) 56 | c.Assert(err, IsNil) 57 | 58 | mhb, ok := msg.(*msgHeartbeat) 59 | if !ok || msg.Type() != msgTypeHeartbeat || mhb.ClientID != "cl" || mhb.GroupID != "gr" || 60 | mhb.Topic != "t" || mhb.PartID != 1 || mhb.CurrentOffset != 2 || mhb.Time != 2 || 61 | mhb.Version != 4 { 62 | c.Error("Heartbeat message contents invalid") 63 | } 64 | 65 | msg, err = decode([]byte("ClaimingPartition/4/2/ii/cl/gr/t/1")) 66 | if msg == nil || err != nil { 67 | c.Error("Expected msg, got error", err) 68 | } 69 | mcp, ok := msg.(*msgClaimingPartition) 70 | if !ok || msg.Type() != msgTypeClaimingPartition || mcp.ClientID != "cl" || 71 | mcp.GroupID != "gr" || mcp.Topic != "t" || mcp.PartID != 1 || mcp.Time != 2 || 72 | mcp.Version != 4 { 73 | c.Error("ClaimingPartition message contents invalid") 74 | } 75 | 76 | msg, err = decode([]byte("ReleasingPartition/4/2/ii/cl/gr/t/1/9")) 77 | if msg == nil || err != nil { 78 | c.Error("Expected msg, got error", err) 79 | } 80 | mrp, ok := msg.(*msgReleasingPartition) 81 | if !ok || msg.Type() != msgTypeReleasingPartition || mrp.ClientID != "cl" || 82 | mrp.GroupID != "gr" || mrp.Topic != "t" || mrp.PartID != 1 || mrp.Time != 2 || 83 | mrp.CurrentOffset != 9 || mhb.Version != 4 { 84 | c.Error("ReleasingPartition message contents invalid") 85 | } 86 | 87 | msg, err = decode([]byte("ClaimingMessages/4/2/ii/cl/gr/t/1/2")) 88 | if msg == nil || err != nil { 89 | c.Error("Expected msg, got error", err) 90 | } 91 | mcm, ok := msg.(*msgClaimingMessages) 92 | if !ok || msg.Type() != msgTypeClaimingMessages || mcm.ClientID != "cl" || mcm.GroupID != "gr" || 93 | mcm.Topic != "t" || mcm.PartID != 1 || mcm.ProposedCurrentOffset != 2 || mcm.Time != 2 || 94 | mhb.Version != 4 { 95 | c.Error("ClaimingMessages message contents invalid") 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /marshal/topic.go: -------------------------------------------------------------------------------- 1 | /* 2 | * portal - marshal 3 | * 4 | * a library that implements an algorithm for doing consumer coordination within Kafka, rather 5 | * than using Zookeeper or another external system. 
6 | * 7 | */ 8 | 9 | package marshal 10 | 11 | import ( 12 | "sync" 13 | "time" 14 | ) 15 | 16 | // topicState contains information about a given topic. 17 | type topicState struct { 18 | // claimPartition is which Marshal topic partition to use for coordination of this topic. 19 | // Read only, set at initialization time so not protected by the lock. 20 | claimPartition int 21 | 22 | // This lock also protects the contents of the partitions member. 23 | lock *sync.RWMutex 24 | partitions []PartitionClaim 25 | } 26 | 27 | // PrintState causes us to log the state of this topic's claims. 28 | func (ts *topicState) PrintState() { 29 | ts.lock.RLock() 30 | defer ts.lock.RUnlock() 31 | 32 | now := time.Now().Unix() 33 | for partID, claim := range ts.partitions { 34 | state := "CLMD" 35 | if !claim.claimed(now) { 36 | state = "----" 37 | } 38 | log.Infof(" * %2d [%s]: GPID %s | CLID %s | LHB %d (%d) | LOF %d | PCL %d", 39 | partID, state, claim.GroupID, claim.ClientID, claim.LastHeartbeat, 40 | now-claim.LastHeartbeat, claim.CurrentOffset, len(claim.pendingClaims)) 41 | } 42 | } 43 | 44 | // PartitionOffsets is a record of offsets for a given partition. Contains information 45 | // combined from Kafka and our current state. 46 | // 47 | // A Kafka partition consists of N messages with offsets. In the basic case, you 48 | // can think of an offset like an array index. With log compaction and other trickery 49 | // it acts more like a sparse array, but it's a close enough metaphor. 50 | // 51 | // We keep track of four values for offsets: 52 | // 53 | // offsets 1 2 3 7 9 10 11 54 | // partition [ msg1, msg2, msg3, msg4, msg5, msg6, msg7, ... ] 55 | // ^ ^ ^ 56 | // \- Earliest | | 57 | // \- Current Latest 58 | // 59 | // In this example, Earliest is 1 which is the "oldest" offset within the 60 | // partition. At any given time this offset might become invalid if a log rolls 61 | // so we might update it. 62 | // 63 | // Current is 7, which is the offset of the NEXT message i.e. this message 64 | // has not been consumed yet. 65 | // 66 | // Latest is 12, which is the offset that Kafka will assign to the message 67 | // that next gets committed to the partition. This offset does not yet exist, 68 | // and might never. 69 | // 70 | // Committed is the value recorded in Kafka's committed offsets system. 71 | type PartitionOffsets struct { 72 | Current int64 73 | Earliest int64 74 | Latest int64 75 | Committed int64 76 | } 77 | 78 | // PartitionClaim contains claim information about a given partition. 79 | type PartitionClaim struct { 80 | InstanceID string 81 | ClientID string 82 | GroupID string 83 | LastRelease int64 84 | LastHeartbeat int64 85 | CurrentOffset int64 86 | 87 | // Used internally when someone is waiting on this partition to be claimed. 88 | pendingClaims []chan struct{} 89 | } 90 | 91 | // checkOwnership compares the ClientID/GroupID (and optionally InstanceID) of a given 92 | // claim to a given message and returns whether or not they match. 93 | func (p *PartitionClaim) checkOwnership(msg message, checkInstanceID bool) bool { 94 | iid, cid, gid := msg.Ownership() 95 | if p.ClientID != cid || p.GroupID != gid { 96 | return false 97 | } 98 | return !checkInstanceID || p.InstanceID == iid 99 | } 100 | 101 | // claimed returns a boolean indicating whether or not this structure is indicating a 102 | // still valid claim. Validity is based on the delta between NOW and lastHeartbeat: 103 | // 104 | // delta = 0 .. HeartbeatInterval: claim good. 105 | // HeartbeatInterval .. 
2*HeartbeatInterval-1: claim good. 106 | // >2xHeartbeatInterval: claim invalid. 107 | // 108 | // This means that the worst case for a "dead consumer" that has failed to heartbeat 109 | // is that a partition will be idle for twice the heartbeat interval. 110 | func (p *PartitionClaim) claimed(ts int64) bool { 111 | // If lastHeartbeat is 0, then the partition is unclaimed 112 | if p.LastHeartbeat == 0 { 113 | return false 114 | } 115 | 116 | // We believe we have claim information, but let's analyze it to determine whether or 117 | // not the claim is valid. Of course this assumes that our time and the remote's time 118 | // are roughly in sync. 119 | now := ts 120 | if ts == 0 { 121 | now = time.Now().Unix() 122 | } 123 | 124 | delta := now - p.LastHeartbeat 125 | switch { 126 | case 0 <= delta && delta <= HeartbeatInterval: 127 | // Fresh claim - all good 128 | return true 129 | case HeartbeatInterval < delta && delta < 2*HeartbeatInterval: 130 | // Aging claim - missed/delayed heartbeat, but still in tolerance 131 | return true 132 | default: 133 | // Stale claim - no longer valid 134 | return false 135 | } 136 | } 137 | 138 | // Claimed returns whether or not the PartitionClaim indicates a valid (as of this 139 | // invocation) claim. 140 | func (p *PartitionClaim) Claimed() bool { 141 | return p.claimed(0) 142 | } 143 | -------------------------------------------------------------------------------- /marshal/marshal_test.go: -------------------------------------------------------------------------------- 1 | package marshal 2 | 3 | import ( 4 | "time" 5 | 6 | . "gopkg.in/check.v1" 7 | 8 | "github.com/zorkian/kafka/kafkatest" 9 | ) 10 | 11 | var _ = Suite(&MarshalSuite{}) 12 | 13 | type MarshalSuite struct { 14 | s *kafkatest.Server 15 | m *Marshaler 16 | } 17 | 18 | func (s *MarshalSuite) SetUpTest(c *C) { 19 | ResetTestLogger(c) 20 | 21 | s.s = StartServer() 22 | 23 | var err error 24 | s.m, err = NewMarshaler("cl", "gr", []string{s.s.Addr()}) 25 | if err != nil { 26 | c.Errorf("New Marshaler failed: %s", err) 27 | } 28 | } 29 | 30 | func (s *MarshalSuite) TearDownTest(c *C) { 31 | s.m.Terminate() 32 | s.s.Close() 33 | } 34 | 35 | func MakeTopic(srv *kafkatest.Server, topic string, numPartitions int) { 36 | for i := 0; i < numPartitions; i++ { 37 | srv.AddMessages(topic, int32(i)) 38 | } 39 | } 40 | 41 | func StartServer() *kafkatest.Server { 42 | srv := kafkatest.NewServer() 43 | srv.MustSpawn() 44 | MakeTopic(srv, MarshalTopic, 4) 45 | MakeTopic(srv, "test1", 1) 46 | MakeTopic(srv, "test2", 2) 47 | MakeTopic(srv, "test3", 3) 48 | return srv 49 | } 50 | 51 | func (s *MarshalSuite) TestNewMarshaler(c *C) { 52 | // Test that Marshaler starts up and learns about the topics. 53 | c.Assert(s.m.Partitions(MarshalTopic), Equals, 4) 54 | c.Assert(s.m.Partitions("test1"), Equals, 1) 55 | c.Assert(s.m.Partitions("test2"), Equals, 2) 56 | c.Assert(s.m.Partitions("test3"), Equals, 3) 57 | c.Assert(s.m.Partitions("unknown"), Equals, 0) 58 | 59 | // If our hash algorithm changes, these values will have to change. This tests the low 60 | // level hash function. 61 | c.Assert(s.m.cluster.getClaimPartition("test1"), Equals, 2) 62 | c.Assert(s.m.cluster.getClaimPartition("test2"), Equals, 1) 63 | c.Assert(s.m.cluster.getClaimPartition("test3"), Equals, 2) 64 | c.Assert(s.m.cluster.getClaimPartition("unknown"), Equals, 1) 65 | c.Assert(s.m.cluster.getClaimPartition("unknown"), Equals, 1) // Twice on purpose. 
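// The repeated assertion above is deliberate: getClaimPartition maps a topic to its
// coordination partition with a hash of the topic name (see the comment above about
// the hash algorithm), so repeated lookups must return the same value.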
66 | } 67 | 68 | // This is a full integration test of claiming including writing to Kafka via the marshaler 69 | // and waiting for responses 70 | func (s *MarshalSuite) TestClaimPartitionIntegration(c *C) { 71 | resp := make(chan bool) 72 | go func() { 73 | resp <- s.m.ClaimPartition("test1", 0) // true 74 | resp <- s.m.ClaimPartition("test1", 0) // true (no-op) 75 | s.m.lock.Lock() 76 | s.m.clientID = "cl-other" 77 | s.m.lock.Unlock() 78 | resp <- s.m.ClaimPartition("test1", 0) // false (collission) 79 | resp <- s.m.ClaimPartition("test1", 1) // true (new client) 80 | }() 81 | 82 | select { 83 | case out := <-resp: 84 | c.Assert(out, Equals, true) 85 | case <-time.After(5 * time.Second): 86 | c.Error("Timed out claiming partition") 87 | } 88 | 89 | select { 90 | case out := <-resp: 91 | c.Assert(out, Equals, true) 92 | case <-time.After(5 * time.Second): 93 | c.Error("Timed out claiming partition") 94 | } 95 | 96 | select { 97 | case out := <-resp: 98 | c.Assert(out, Equals, false) 99 | case <-time.After(5 * time.Second): 100 | c.Error("Timed out claiming partition") 101 | } 102 | 103 | select { 104 | case out := <-resp: 105 | c.Assert(out, Equals, true) 106 | case <-time.After(5 * time.Second): 107 | c.Error("Timed out claiming partition") 108 | } 109 | } 110 | 111 | // This is a full integration test of a claim, heartbeat, and release cycle 112 | func (s *MarshalSuite) TestPartitionLifecycleIntegration(c *C) { 113 | // Claim partition (this is synchronous, will only return when) 114 | // it has succeeded 115 | c.Assert(s.m.ClaimPartition("test1", 0), Equals, true) 116 | steps, err := s.m.cluster.waitForRsteps(1) 117 | c.Assert(err, IsNil) 118 | c.Assert(steps, Equals, 1) 119 | 120 | // Ensure we have claimed it 121 | cl := s.m.GetPartitionClaim("test1", 0) 122 | if cl.LastHeartbeat <= 0 || cl.ClientID != "cl" || cl.GroupID != "gr" { 123 | c.Errorf("PartitionClaim values unexpected %+v", cl) 124 | } 125 | if cl.CurrentOffset != 0 { 126 | c.Error("CurrentOffset is not 0") 127 | } 128 | 129 | // Now heartbeat on it to update the last offset 130 | c.Assert(s.m.Heartbeat("test1", 0, 10), IsNil) 131 | steps, err = s.m.cluster.waitForRsteps(2) 132 | c.Assert(err, IsNil) 133 | c.Assert(steps, Equals, 2) 134 | 135 | // Get the claim again, validate it's updated 136 | cl = s.m.GetPartitionClaim("test1", 0) 137 | if cl.LastHeartbeat <= 0 || cl.ClientID != "cl" || cl.GroupID != "gr" { 138 | c.Errorf("PartitionClaim values unexpected %+v", cl) 139 | } 140 | if cl.CurrentOffset != 10 { 141 | c.Error("CurrentOffset is not 10") 142 | } 143 | 144 | // Release 145 | c.Assert(s.m.ReleasePartition("test1", 0, 20), IsNil) 146 | steps, err = s.m.cluster.waitForRsteps(3) 147 | c.Assert(err, IsNil) 148 | c.Assert(steps, Equals, 3) 149 | 150 | // Get the claim again, validate it's empty 151 | cl = s.m.GetPartitionClaim("test1", 0) 152 | if cl.LastHeartbeat > 0 || cl.ClientID != "" || cl.GroupID != "" { 153 | c.Errorf("PartitionClaim values unexpected %+v", cl) 154 | } 155 | if cl.CurrentOffset != 0 { 156 | c.Error("CurrentOffset is not 20") 157 | } 158 | 159 | // Get the last known claim data 160 | cl = s.m.GetLastPartitionClaim("test1", 0) 161 | if cl.LastHeartbeat > 0 || cl.ClientID != "cl" || cl.GroupID != "gr" { 162 | c.Errorf("PartitionClaim values unexpected %+v", cl) 163 | } 164 | if cl.CurrentOffset != 20 { 165 | c.Error("CurrentOffset is not 20") 166 | } 167 | } 168 | 169 | func (s *MarshalSuite) TestTerminatedMarshalRemovesSelfFromCluster(c *C) { 170 | // Test that terminated Marshalers remove their 
cluster's reference to it. 171 | c.Assert(s.m.cluster.marshalers, DeepEquals, []*Marshaler{s.m}) 172 | s.m.Terminate() 173 | c.Assert(s.m.cluster.marshalers, DeepEquals, []*Marshaler{}) 174 | } 175 | -------------------------------------------------------------------------------- /marshal/message.go: -------------------------------------------------------------------------------- 1 | /* 2 | * portal - marshal 3 | * 4 | * a library that implements an algorithm for doing consumer coordination within Kafka, rather 5 | * than using Zookeeper or another external system. 6 | * 7 | */ 8 | 9 | package marshal 10 | 11 | import ( 12 | "fmt" 13 | "strconv" 14 | "strings" 15 | ) 16 | 17 | // TODO: This all uses a dumb string representation format which is very bytes-intensive. 18 | // A binary protocol would be nice. 19 | 20 | type msgType int 21 | 22 | const ( 23 | msgLengthBase int = 8 24 | idxType int = 0 25 | idxVersion int = 1 26 | idxTimestamp int = 2 27 | idxInstanceID int = 3 28 | idxClientID int = 4 29 | idxGroupID int = 5 30 | idxTopic int = 6 31 | idxPartID int = 7 32 | idxBaseEnd int = 7 // Index of last element in base message. 33 | 34 | msgTypeHeartbeat msgType = 0 35 | msgLengthHeartbeat int = msgLengthBase + 1 36 | idxHBCurrentOffset int = idxBaseEnd + 1 37 | 38 | msgTypeClaimingPartition msgType = 1 39 | msgLengthClaimingPartition int = msgLengthBase 40 | 41 | msgTypeReleasingPartition msgType = 2 42 | msgLengthReleasingPartition int = msgLengthBase + 1 43 | idxRPCurrentOffset int = idxBaseEnd + 1 44 | 45 | msgTypeClaimingMessages msgType = 3 46 | msgLengthClaimingMessages int = msgLengthBase + 1 47 | idxCMProposedCurrentOffset int = idxBaseEnd + 1 48 | ) 49 | 50 | type message interface { 51 | Encode() string 52 | Timestamp() int 53 | Type() msgType 54 | Ownership() (string, string, string) 55 | } 56 | 57 | // decode takes a slice of bytes that should constitute a single message and attempts to 58 | // decode it into one of our message structs. 59 | func decode(inp []byte) (message, error) { 60 | parts := strings.Split(string(inp), "/") 61 | if len(parts) < msgLengthBase { 62 | return nil, fmt.Errorf("Invalid message (length): [%s]", string(inp)) 63 | } 64 | 65 | version, err := strconv.Atoi(parts[idxVersion]) 66 | if err != nil { 67 | return nil, fmt.Errorf("Invalid message (version): [%s]", string(inp)) 68 | } 69 | 70 | // Get out the base message which is always present as it identifies the sender. 
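// For reference, every message on the wire is a "/"-delimited string: a type name
// followed by the base fields and then any type-specific fields. A heartbeat, for
// example, encodes as Heartbeat/<version>/<time>/<instanceID>/<clientID>/<groupID>/<topic>/<partID>/<currentOffset>
// (see message_test.go). The base fields are parsed here; the type-specific trailing
// fields are handled in the switch below.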
71 | partID, err := strconv.Atoi(parts[idxPartID]) 72 | if err != nil { 73 | return nil, fmt.Errorf("Invalid message (partID): [%s]", string(inp)) 74 | } 75 | ts, err := strconv.Atoi(parts[idxTimestamp]) 76 | if err != nil { 77 | return nil, fmt.Errorf("Invalid message (timestamp): [%s]", string(inp)) 78 | } 79 | base := msgBase{ 80 | Version: version, 81 | Time: ts, 82 | InstanceID: parts[idxInstanceID], 83 | ClientID: parts[idxClientID], 84 | GroupID: parts[idxGroupID], 85 | Topic: parts[idxTopic], 86 | PartID: partID, 87 | } 88 | 89 | switch parts[0] { 90 | case "Heartbeat": 91 | if len(parts) != msgLengthHeartbeat { 92 | return nil, fmt.Errorf("Invalid message (hb length): [%s]", string(inp)) 93 | } 94 | offset, err := strconv.ParseInt(parts[idxHBCurrentOffset], 10, 0) 95 | if err != nil { 96 | return nil, fmt.Errorf("Invalid message (hb offset): [%s]", string(inp)) 97 | } 98 | return &msgHeartbeat{msgBase: base, CurrentOffset: int64(offset)}, nil 99 | case "ClaimingPartition": 100 | if len(parts) != msgLengthClaimingPartition { 101 | return nil, fmt.Errorf("Invalid message (cp length): [%s]", string(inp)) 102 | } 103 | return &msgClaimingPartition{msgBase: base}, nil 104 | case "ReleasingPartition": 105 | if len(parts) != msgLengthReleasingPartition { 106 | return nil, fmt.Errorf("Invalid message (rp length): [%s]", string(inp)) 107 | } 108 | offset, err := strconv.ParseInt(parts[idxRPCurrentOffset], 10, 0) 109 | if err != nil { 110 | return nil, fmt.Errorf("Invalid message (rp offset): [%s]", string(inp)) 111 | } 112 | return &msgReleasingPartition{msgBase: base, CurrentOffset: offset}, nil 113 | case "ClaimingMessages": 114 | if len(parts) != msgLengthClaimingMessages { 115 | return nil, fmt.Errorf("Invalid message (cm length): [%s]", string(inp)) 116 | } 117 | offset, err := strconv.ParseInt(parts[idxCMProposedCurrentOffset], 10, 0) 118 | if err != nil { 119 | return nil, fmt.Errorf("Invalid message (cm offset): [%s]", string(inp)) 120 | } 121 | return &msgClaimingMessages{msgBase: base, ProposedCurrentOffset: offset}, nil 122 | } 123 | return nil, fmt.Errorf("Invalid message: [%s]", string(inp)) 124 | } 125 | 126 | type msgBase struct { 127 | Version int 128 | Time int 129 | InstanceID string 130 | ClientID string 131 | GroupID string 132 | Topic string 133 | PartID int 134 | } 135 | 136 | // Encode returns a string representation of the message. 137 | func (m *msgBase) Encode() string { 138 | return fmt.Sprintf("%d/%d/%s/%s/%s/%s/%d", 139 | m.Version, m.Time, m.InstanceID, m.ClientID, m.GroupID, m.Topic, m.PartID) 140 | } 141 | 142 | // Type returns the type of this message. 143 | func (m *msgBase) Type() msgType { 144 | panic("Attempted to type the base message. This should never happen.") 145 | } 146 | 147 | // Timestamp returns the timestamp of the message 148 | func (m *msgBase) Timestamp() int { 149 | return m.Time 150 | } 151 | 152 | // Ownership returns InstanceID, ClientID, GroupID for message 153 | func (m *msgBase) Ownership() (string, string, string) { 154 | return m.InstanceID, m.ClientID, m.GroupID 155 | } 156 | 157 | // msgHeartbeat is sent regularly by all consumers to re-up their claim to the partition that 158 | // they're consuming. 159 | type msgHeartbeat struct { 160 | msgBase 161 | CurrentOffset int64 162 | } 163 | 164 | // Encode returns a string representation of the message. 
165 | func (m *msgHeartbeat) Encode() string { 166 | return "Heartbeat/" + m.msgBase.Encode() + fmt.Sprintf("/%d", m.CurrentOffset) 167 | } 168 | 169 | // Type returns the type of this message. 170 | func (m *msgHeartbeat) Type() msgType { 171 | return msgTypeHeartbeat 172 | } 173 | 174 | // Timestamp returns the timestamp of the message 175 | func (m *msgHeartbeat) Timestamp() int { 176 | return m.Time 177 | } 178 | 179 | // Ownership returns InstanceID, ClientID, GroupID for message 180 | func (m *msgHeartbeat) Ownership() (string, string, string) { 181 | return m.InstanceID, m.ClientID, m.GroupID 182 | } 183 | 184 | // msgClaimingPartition is used in the claim flow. 185 | type msgClaimingPartition struct { 186 | msgBase 187 | } 188 | 189 | // Encode returns a string representation of the message. 190 | func (m *msgClaimingPartition) Encode() string { 191 | return "ClaimingPartition/" + m.msgBase.Encode() 192 | } 193 | 194 | // Type returns the type of this message. 195 | func (m *msgClaimingPartition) Type() msgType { 196 | return msgTypeClaimingPartition 197 | } 198 | 199 | // Timestamp returns the timestamp of the message 200 | func (m *msgClaimingPartition) Timestamp() int { 201 | return m.Time 202 | } 203 | 204 | // Ownership returns InstanceID, ClientID, GroupID for message 205 | func (m *msgClaimingPartition) Ownership() (string, string, string) { 206 | return m.InstanceID, m.ClientID, m.GroupID 207 | } 208 | 209 | // msgReleasingPartition is used in a controlled shutdown to indicate that you are done with 210 | // a partition. 211 | type msgReleasingPartition struct { 212 | msgBase 213 | CurrentOffset int64 214 | } 215 | 216 | // Encode returns a string representation of the message. 217 | func (m *msgReleasingPartition) Encode() string { 218 | return "ReleasingPartition/" + m.msgBase.Encode() + fmt.Sprintf("/%d", m.CurrentOffset) 219 | } 220 | 221 | // Type returns the type of this message. 222 | func (m *msgReleasingPartition) Type() msgType { 223 | return msgTypeReleasingPartition 224 | } 225 | 226 | // Timestamp returns the timestamp of the message 227 | func (m *msgReleasingPartition) Timestamp() int { 228 | return m.Time 229 | } 230 | 231 | // Ownership returns InstanceID, ClientID, GroupID for message 232 | func (m *msgReleasingPartition) Ownership() (string, string, string) { 233 | return m.InstanceID, m.ClientID, m.GroupID 234 | } 235 | 236 | // msgClaimingMessages is used for at-most-once consumption semantics, this is a pre-commit 237 | // advisory message. 238 | type msgClaimingMessages struct { 239 | msgBase 240 | ProposedCurrentOffset int64 241 | } 242 | 243 | // Encode returns a string representation of the message. 244 | func (m *msgClaimingMessages) Encode() string { 245 | return "ClaimingMessages/" + m.msgBase.Encode() + fmt.Sprintf("/%d", m.ProposedCurrentOffset) 246 | } 247 | 248 | // Type returns the type of this message. 
249 | func (m *msgClaimingMessages) Type() msgType { 250 | return msgTypeClaimingMessages 251 | } 252 | 253 | // Timestamp returns the timestamp of the message 254 | func (m *msgClaimingMessages) Timestamp() int { 255 | return m.Time 256 | } 257 | 258 | // Ownership returns InstanceID, ClientID, GroupID for message 259 | func (m *msgClaimingMessages) Ownership() (string, string, string) { 260 | return m.InstanceID, m.ClientID, m.GroupID 261 | } 262 | -------------------------------------------------------------------------------- /PROTOCOL.md: -------------------------------------------------------------------------------- 1 | # Kafka Only Consumer Coordination 2 | 3 | This is a description of the consumer coordination protocol implemented by the Marshal 4 | library. 5 | 6 | ## Synopsis 7 | 8 | It is possible to coordinate N consumers without any shared state other than what Kafka 9 | provides. Without using Zookeeper or any other such coordination system, and still provide 10 | similar guarantees/functionality. 11 | 12 | The essence of this approach is that we can use a new topic in Kafka such as `__marshal` 13 | as a write-ahead log/transaction log and use it for constructing a race-safe consumer 14 | coordination protocol. Since Kafka guarantees ordering within a partition, outside of an 15 | unclean leader election we can safely coordinate consumers. 16 | 17 | The goal is for this protocol to be robust to all failure cases. The goal is *not* for the 18 | protocol to be the absolute fastest thing out there. 19 | 20 | ## Protocol Messages 21 | 22 | This section defines the messages used in the protocol. 23 | 24 | In the following definitions, certain bolded words are used to define parameters to 25 | the message: 26 | 27 | - **client_id** is an arbitrary string. 28 | - This value should be unique within a **group_id** (see below). 29 | - This can be random, but, you might want to make it predictable for your set of consumers. 30 | If you do, you gain the property that your consumer can restart where it left off if 31 | you restart it (since it can resume its own heartbeats as long as the **client_id** is 32 | stable). 33 | - **group_id** is a namespaced opaque string. I.e., if you are the foo team, 34 | you should use a value such as `foo.bar_consumer`. 35 | - **topic**, **desired_topic** is a string of the topic name, this is also namespaced 36 | by your team. 37 | - **partition**, **desired_partition** is an integer as provided by Kafka. 38 | - **last_offset**, **proposed_last_offset** is an integer representing a message offset as 39 | provided by Kafka. This should never be generated on your own. 40 | 41 | Some constants defined in the protocol 42 | 43 | - *HeartbeatInterval* is the maximum allowed time between two heartbeats. Consumers are expected 44 | to send heartbeat messages once per interval. The smaller this number is, the busier the 45 | coordination topic will be, but the faster failure recovery will be. 46 | 47 | The protocol is defined with several simple messages: 48 | 49 | 1. `Heartbeat` which includes **client_id**, **group_id**, **topic**, **partition**, 50 | **last_offset**. These are sent at most every **HeartbeatInterval** seconds apart. 51 | 1. `ClaimingPartition` which includes **client_id**, **group_id**, **topic**, **partition** 52 | and is used as the initial request stating that you wish to claim a partition. 53 | 1. 
`ReleasingPartition` which includes **client_id**, **group_id**, **topic**, 54 | **partition**, **last_offset** and is used when a consumer wants to proactively release 55 | a partition. 56 | 1. `ClaimingMessages` which includes **client_id**, **group_id**, **topic**, 57 | **partition**, **proposed_last_offset** is used for the At Most Once consumption flow. 58 | 1. `ReleaseGroup` which includes **client_id**, **group_id**, **msg_expire_time**. This message 59 | is sent by a special Admin actor, which can pause an entire consumer group identified 60 | by the **group_id**, until **msg_expire_time**. This message is used to set a consumer 61 | group's position. See the section "Setting Consumer Group Position." 62 | 63 | ## Determining World State 64 | 65 | This is the primary engine of Marshal. The "rationalizer" will read the messages in the 66 | coordination topic and calculate a "world state" given the sequence of messages in the log. 67 | The algorithm works based on everybody coming to the same conclusion about the world state 68 | given the same log, i.e., every state transition is determined solely by the messages in 69 | the logs and their relative ordering. 70 | 71 | The state is calculated, for a given **topic**/**partition** you wish to know about, 72 | by fully consuming the data from the coordination topic (which should be relatively 73 | minimal and fast to process) and constructing a current state of the world (as of the 74 | last message you have). 75 | 76 | You can know what consumers exist (actively) based on the heartbeats and partition messages. 77 | 78 | ### Heartbeats 79 | 80 | Every consumer is required to heartbeat every **HeartbeatInterval** seconds. 81 | 82 | A client is considered fresh when less than **HeartbeatInterval** seconds have elapsed 83 | since the last heartbeat. 84 | 85 | A client is considered to be in an unknown state when **HeartbeatInterval** to *twice that 86 | value* seconds have elapsed. 87 | 88 | A client is considered stale when *more than twice* **HeartbeatInterval** seconds have 89 | elapsed and no further heartbeat has been received. 90 | 91 | ## Partition Assignment for Consumption 92 | 93 | This is the meat of the system and the reason for such an algorithm. Being able to safely 94 | assign partitions to consumers such that they can process messages with the desired 95 | properties is non-trivial and requires this coordination. 96 | 97 | ### Consuming a New Partition 98 | 99 | This assumes that you want to start consuming a new partition. 100 | 101 | 1. Pick a partition to try to claim 102 | 1. This is done by whatever method you choose. Random, round-robin, etc. 103 | 1. Pick coordinating partition based on 104 | `hash(desired_topic, desired_partition) % number of partitions in coordinating topic` 105 | 1. Determine state of world on chosen coordinating partition 106 | 1. If the partition you wish to claim is already claimed, and the heartbeat for 107 | that partition is not stale, return to step 1 (remember, "stale" is defined as 108 | twice the **HeartbeatInterval**) 109 | 1. Since the heartbeat is stale, this consumer may continue to step 4 and attempt to claim 110 | 1. Send a `ClaimingPartition` message 111 | 1. Re-determine the state of the world (read up to the end) 112 | 1. Look for the earliest `ClaimingPartition` message associated with the desired 113 | topic/partition, if it was ours (message from step 4) then continue 114 | 1. If somebody else won the race, return to step 1 115 | 1. 
If the current client wins (has the earliest claim), send an immediate `Heartbeat` 116 | message and consumption on desired topic/partition can begin 117 | 118 | This process, assuming no Kafka data loss (we'll have to carefully make sure to produce with the right options), should guarantee safe partition assignment. 119 | 120 | ### Consuming Recently Used Partitions on Restart 121 | 122 | This is an optimization to help prevent churn of consumption. If you define your consumer 123 | such that you have a predictable **client_id** and it is unique within your consumer 124 | group, you can use that to determine what partitions your client was previously consuming. 125 | 126 | 1. Determine the complete state of the world 127 | 1. This requires scanning "recent" events for the entire coordination topic (all partitions) 128 | 1. If heartbeats are found for the current **client_id**+**group_id**, and if those 129 | heartbeats are fresh (only), then send a new heartbeat and recover state 130 | 1. Note: The previous heartbeats should contain enough information to continue where you 131 | left off (modulo the guarantees of ALO/AMO consumption) 132 | 133 | ## Consumption 134 | 135 | There are two main algorithms for message processing. Both of these assume that your client 136 | *already has a valid claim to a partition* that you are going to be consuming from. 137 | 138 | ### At Most Once (AMO) 139 | 140 | The semantics of at-most-once consumption are that you would prefer to consume a message 141 | zero (0) times (never see it) than to consume it more than once. 142 | 143 | To do safely, we use the same linear nature of the Kafka partitions to make a 144 | transactional guarantee: 145 | 146 | 1. Determine that we still have the claim to this partition 147 | 1. Fetch a batch of messages 148 | 1. Batch size should be adjusted for the QPS of your category, the smaller your batches 149 | the more traffic against the coordination topic, but the fewer you lose in the failure 150 | case 151 | 1. Produce a `ClaimingMessages` message with the last offset from our batch of messages 152 | 1. Re-determine the state of the world 153 | 1. Validate we still hold the claim on this partition 154 | 1. Validate that our claim of the messages from step 2 is in the log 155 | 1. Send `Heartbeat` with the offset to "commit" the transaction 156 | 1. Process the messages in this batch 157 | 158 | Assuming again that Kafka is durable and we use the right settings, this should provide the 159 | guarantees we want for at-most-once. Any failure along the way will be handled by either a 160 | normal heartbeat-expire-retry loop (steps 1-4 fail) and if we fail during step 5 then that 161 | batch of messages will be dropped per AMO semantics. 162 | 163 | ### At Least Once (ALO) 164 | 165 | Much easier than at-most-once, still assuming we have the claim to a partition: 166 | 167 | 1. Determine that we still have the claim to this partition 168 | 1. Fetch a batch of messages 169 | 1. Process batch of messages 170 | 1. Every **HeartbeatInterval**, send a `Heartbeat` message with the last processed offset 171 | 172 | As long as you heartbeat every interval, failure is constrained to only re-process at most 173 | one single **HeartbeatInterval** of messages. 174 | 175 | ### Consumer Failure 176 | 177 | If a consumer stops reporting heartbeats, other consumers can pick up that partition. 
178 | In essence, if no `Heartbeat` messages have arrived on a partition for twice the 179 | **HeartbeatInterval**, then whichever consumers are looking for partitions will attempt 180 | to claim that partition, starting that whole process. 181 | 182 | In the ALO consumption case, this can lead to two consumers running on a single batch 183 | of messages at the same time, but it is constrained to one batch. The AMO consumer cannot 184 | have that failure case, at worst it will never process some messages. 185 | 186 | ## Setting Consumer Group Position 187 | 188 | Documentation being written. 189 | 190 | # TODO 191 | 192 | I believe a Kafka-service system would want to consume messages off of a partition but 193 | not necessarily take a whole lock on the partition. I.e. just saying "I claim message 194 | offsets X-Y". You can also then fix latency issues by pre-claiming ranges so that the 195 | instant they become used you've already negotiated the lock on that range and can 196 | start processing them/handing them out? 197 | -------------------------------------------------------------------------------- /marshal/rationalizer_test.go: -------------------------------------------------------------------------------- 1 | package marshal 2 | 3 | import ( 4 | "sync" 5 | "testing" 6 | "time" 7 | 8 | . "gopkg.in/check.v1" 9 | 10 | "github.com/op/go-logging" 11 | ) 12 | 13 | func init() { 14 | // TODO: This changes logging for the whole suite. Is that what we want? 15 | logging.SetLevel(logging.ERROR, "PortalMarshal") 16 | } 17 | 18 | func Test(t *testing.T) { TestingT(t) } 19 | 20 | var _ = Suite(&RationalizerSuite{}) 21 | 22 | type RationalizerSuite struct { 23 | m *Marshaler 24 | out chan message 25 | ret chan struct{} 26 | } 27 | 28 | func (s *RationalizerSuite) SetUpTest(c *C) { 29 | ResetTestLogger(c) 30 | 31 | s.m = NewWorld() 32 | s.out = make(chan message) 33 | go s.m.cluster.rationalize(0, s.out) 34 | 35 | // Build our return channel and insert it (simulating what the marshal does for 36 | // actually trying to claim) 37 | s.ret = make(chan struct{}, 1) 38 | topic := s.m.cluster.getPartitionState(s.m.groupID, "test1", 0) 39 | topic.lock.Lock() 40 | topic.partitions[0].pendingClaims = append(topic.partitions[0].pendingClaims, s.ret) 41 | topic.lock.Unlock() 42 | } 43 | 44 | func (s *RationalizerSuite) TearDownTest(c *C) { 45 | s.m.Terminate() 46 | close(s.out) 47 | 48 | // This one might have already been closed, so safely close it. 
49 | select { 50 | case <-s.ret: 51 | default: 52 | close(s.ret) 53 | } 54 | } 55 | 56 | func NewWorld() *Marshaler { 57 | return &Marshaler{ 58 | quit: new(int32), 59 | clientID: "cl", 60 | groupID: "gr", 61 | cluster: &KafkaCluster{ 62 | quit: new(int32), 63 | rsteps: new(int32), 64 | groups: make(map[string]map[string]*topicState), 65 | partitions: 1, 66 | lock: &sync.RWMutex{}, 67 | rationalizers: &sync.WaitGroup{}, 68 | }, 69 | lock: &sync.RWMutex{}, 70 | } 71 | } 72 | 73 | func heartbeat(ts int, ii, cl, gr, t string, id int, lo int64) *msgHeartbeat { 74 | return &msgHeartbeat{ 75 | msgBase: msgBase{ 76 | Time: ts, 77 | InstanceID: ii, 78 | ClientID: cl, 79 | GroupID: gr, 80 | Topic: t, 81 | PartID: id, 82 | }, 83 | CurrentOffset: lo, 84 | } 85 | } 86 | 87 | func claimingPartition(ts int, ii, cl, gr, t string, id int) *msgClaimingPartition { 88 | return &msgClaimingPartition{ 89 | msgBase: msgBase{ 90 | Time: ts, 91 | InstanceID: ii, 92 | ClientID: cl, 93 | GroupID: gr, 94 | Topic: t, 95 | PartID: id, 96 | }, 97 | } 98 | } 99 | 100 | func releasingPartition(ts int, ii, cl, gr, t string, id int, lo int64) *msgReleasingPartition { 101 | return &msgReleasingPartition{ 102 | msgBase: msgBase{ 103 | Time: ts, 104 | InstanceID: ii, 105 | ClientID: cl, 106 | GroupID: gr, 107 | Topic: t, 108 | PartID: id, 109 | }, 110 | CurrentOffset: lo, 111 | } 112 | } 113 | 114 | func (s *RationalizerSuite) WaitForRsteps(c *C, cluster *KafkaCluster, numSteps int) { 115 | steps, err := cluster.waitForRsteps(numSteps) 116 | c.Assert(err, IsNil) 117 | c.Assert(steps, Equals, numSteps) 118 | } 119 | 120 | func (s *RationalizerSuite) TestClaimed(c *C) { 121 | // This log, a single heartbeat at t=0, indicates that this topic/partition are claimed 122 | // by the client/group given. 123 | s.out <- heartbeat(1, "ii", "cl", "gr", "test1", 0, 0) 124 | s.WaitForRsteps(c, s.m.cluster, 1) 125 | 126 | // They heartbeated at 1, should be claimed as of 1. 127 | s.m.cluster.ts = 1 128 | c.Assert(s.m.Claimed("test1", 0), Equals, true) 129 | 130 | // Should still be claimed immediately after the interval 131 | s.m.cluster.ts = HeartbeatInterval + 2 132 | c.Assert(s.m.Claimed("test1", 0), Equals, true) 133 | 134 | // And still claimed right at the last second of the cutoff 135 | s.m.cluster.ts = HeartbeatInterval * 2 136 | c.Assert(s.m.Claimed("test1", 0), Equals, true) 137 | 138 | // Should NOT be claimed >2x the heartbeat interval 139 | s.m.cluster.ts = HeartbeatInterval*2 + 1 140 | c.Assert(s.m.Claimed("test1", 0), Equals, false) 141 | } 142 | 143 | func (s *RationalizerSuite) TestClaimNotMutable(c *C) { 144 | // This log, a single heartbeat at t=0, indicates that this topic/partition are claimed 145 | // by the client/group given. 146 | s.out <- heartbeat(1, "ii", "cl", "gr", "test1", 0, 0) 147 | s.WaitForRsteps(c, s.m.cluster, 1) 148 | 149 | // They heartbeated at 1, should be claimed as of 1. 150 | s.m.cluster.ts = 1 151 | cl := s.m.GetPartitionClaim("test1", 0) 152 | c.Assert(cl.LastHeartbeat, Not(Equals), int64(0)) 153 | 154 | // Modify structure, then refetch and make sure it hasn't been mutated 155 | cl.ClientID = "invalid" 156 | cl2 := s.m.GetPartitionClaim("test1", 0) 157 | c.Assert(cl2.LastHeartbeat, Not(Equals), int64(0)) 158 | c.Assert(cl2.ClientID, Equals, "cl") 159 | } 160 | 161 | func (s *RationalizerSuite) TestClaimNotOurs(c *C) { 162 | // This log, a single heartbeat at t=0, indicates that this topic/partition are claimed 163 | // by the client/group given. 
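// Note that this heartbeat is for group "grother", while the Marshaler under test was
// built with group "gr", so the claim should not be visible to it until the group is
// switched below.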
164 | s.out <- heartbeat(1, "ii", "cl", "grother", "test1", 0, 0) 165 | s.WaitForRsteps(c, s.m.cluster, 1) 166 | 167 | // They heartbeated at 1, but since we have a different groupID, this should say that 168 | // the partition is not claimed 169 | s.m.cluster.ts = 1 170 | cl := s.m.GetPartitionClaim("test1", 0) 171 | c.Assert(cl.LastHeartbeat, Equals, int64(0)) 172 | 173 | // Now change our marshal's group to match 174 | s.m.groupID = "grother" 175 | s.m.cluster.ts = 1 176 | cl = s.m.GetPartitionClaim("test1", 0) 177 | c.Assert(cl.LastHeartbeat, Not(Equals), int64(0)) 178 | } 179 | 180 | func (s *RationalizerSuite) TestClaimPartition(c *C) { 181 | // This log, a single heartbeat at t=0, indicates that this topic/partition are claimed 182 | // by the client/group given. 183 | s.m.cluster.ts = 30 184 | s.out <- claimingPartition(1, "ii", "cl", "gr", "test1", 0) 185 | 186 | select { 187 | case <-s.ret: 188 | cl, err := s.m.getClaimedPartitionState("test1", 0) 189 | c.Assert(err, IsNil) 190 | c.Assert(cl, NotNil) 191 | case <-time.After(1 * time.Second): 192 | c.Error("Timed out claiming partition") 193 | } 194 | } 195 | 196 | func (s *RationalizerSuite) TestReclaimPartition(c *C) { 197 | // This log is us having the partition (HB) + a CP from someone else + a CP from us, 198 | // this should result in us owning the partition + the other person not 199 | s.m.cluster.ts = 30 200 | s.out <- heartbeat(1, "ii", "cl", "gr", "test1", 0, 0) 201 | s.out <- claimingPartition(2, "ii", "clother", "gr", "test1", 0) 202 | s.out <- claimingPartition(3, "ii", "cl", "gr", "test1", 0) 203 | 204 | select { 205 | case <-s.ret: 206 | // We own it 207 | cl, err := s.m.getClaimedPartitionState("test1", 0) 208 | c.Assert(err, IsNil) 209 | c.Assert(cl, NotNil) 210 | case <-time.After(1 * time.Second): 211 | c.Error("Timed out claiming partition") 212 | } 213 | } 214 | 215 | func (s *RationalizerSuite) TestReleaseClaim(c *C) { 216 | // This log, a single heartbeat at t=0, indicates that this topic/partition are claimed 217 | // by the client/group given. 218 | s.out <- heartbeat(1, "ii", "cl", "gr", "test1", 0, 0) 219 | s.WaitForRsteps(c, s.m.cluster, 1) 220 | 221 | // They heartbeated at 1, should be claimed as of 1. 222 | s.m.cluster.ts = 1 223 | c.Assert(s.m.Claimed("test1", 0), Equals, true) 224 | 225 | // Someone else attempts to release the claim, this shouldn't work 226 | s.out <- releasingPartition(20, "ii", "cl-bad", "gr", "test1", 0, 5) 227 | s.WaitForRsteps(c, s.m.cluster, 2) 228 | 229 | // Must be unclaimed, invalid release 230 | s.m.cluster.ts = 25 231 | c.Assert(s.m.Claimed("test1", 0), Equals, true) 232 | 233 | // Now they release it at position 10 234 | s.out <- releasingPartition(30, "ii", "cl", "gr", "test1", 0, 10) 235 | s.WaitForRsteps(c, s.m.cluster, 3) 236 | c.Assert(s.m.GetLastPartitionClaim("test1", 0).LastHeartbeat, Equals, int64(0)) 237 | c.Assert(s.m.GetLastPartitionClaim("test1", 0).LastRelease, Equals, int64(30)) 238 | 239 | // They released at 30, should be free as of 31 240 | s.m.cluster.ts = 31 241 | c.Assert(s.m.Claimed("test1", 0), Equals, false) 242 | c.Assert(s.m.GetLastPartitionClaim("test1", 0).CurrentOffset, Equals, int64(10)) 243 | } 244 | 245 | func (s *RationalizerSuite) TestClaimHandoff(c *C) { 246 | // This log, a single heartbeat at t=0, indicates that this topic/partition are claimed 247 | // by the client/group given. 
248 | s.out <- heartbeat(1, "ii", "cl", "gr", "test1", 0, 0) 249 | s.WaitForRsteps(c, s.m.cluster, 1) 250 | 251 | // They heartbeated at 1, should be claimed as of 1. 252 | s.m.cluster.ts = 1 253 | c.Assert(s.m.Claimed("test1", 0), Equals, true) 254 | 255 | // Now they hand this off to someone else who picks up the heartbeat 256 | s.out <- heartbeat(10, "ii", "cl2", "gr", "test1", 0, 10) 257 | s.WaitForRsteps(c, s.m.cluster, 2) 258 | 259 | // Must be claimed, and claimed by cl2 260 | s.m.cluster.ts = 25 261 | c.Assert(s.m.Claimed("test1", 0), Equals, true) 262 | c.Assert(s.m.GetPartitionClaim("test1", 0).ClientID, Equals, "cl2") 263 | 264 | // Now we change the group ID of our world state (which client's can't do) and validate 265 | // that these partitions are NOT claimed 266 | s.m.cluster.ts = 25 267 | s.m.groupID = "gr2" 268 | c.Assert(s.m.Claimed("test1", 0), Equals, false) 269 | c.Assert(s.m.GetPartitionClaim("test1", 0).ClientID, Equals, "") 270 | } 271 | 272 | func (s *RationalizerSuite) TestPartitionExtend(c *C) { 273 | // This log, a single heartbeat at t=0, indicates that this topic/partition are claimed 274 | // by the client/group given. 275 | s.out <- heartbeat(1, "ii", "cl", "gr", "test1", 0, 0) 276 | s.WaitForRsteps(c, s.m.cluster, 1) 277 | 278 | // Ensure len is 1 279 | s.m.lock.RLock() 280 | s.m.cluster.groups["gr"]["test1"].lock.RLock() 281 | c.Assert(len(s.m.cluster.groups["gr"]["test1"].partitions), Equals, 1) 282 | s.m.cluster.groups["gr"]["test1"].lock.RUnlock() 283 | s.m.lock.RUnlock() 284 | 285 | // Extend by 4 286 | s.out <- heartbeat(2, "ii", "cl2", "gr", "test1", 4, 0) 287 | s.WaitForRsteps(c, s.m.cluster, 2) 288 | 289 | // Ensure len is 5 290 | s.m.lock.RLock() 291 | defer s.m.lock.RUnlock() 292 | s.m.cluster.groups["gr"]["test1"].lock.RLock() 293 | defer s.m.cluster.groups["gr"]["test1"].lock.RUnlock() 294 | c.Assert(len(s.m.cluster.groups["gr"]["test1"].partitions), Equals, 5) 295 | 296 | // Ensure 0 and 4 are claimed by us 297 | p1 := s.m.cluster.groups["gr"]["test1"].partitions[0] 298 | c.Assert(p1.ClientID, Equals, "cl") 299 | c.Assert(p1.GroupID, Equals, "gr") 300 | c.Assert(p1.LastHeartbeat, Equals, int64(1)) 301 | p2 := s.m.cluster.groups["gr"]["test1"].partitions[4] 302 | c.Assert(p2.ClientID, Equals, "cl2") 303 | c.Assert(p2.GroupID, Equals, "gr") 304 | c.Assert(p2.LastHeartbeat, Equals, int64(2)) 305 | } 306 | -------------------------------------------------------------------------------- /marshal/rationalizer.go: -------------------------------------------------------------------------------- 1 | /* 2 | * portal - marshal 3 | * 4 | * a library that implements an algorithm for doing consumer coordination within Kafka, rather 5 | * than using Zookeeper or another external system. 6 | * 7 | */ 8 | 9 | package marshal 10 | 11 | import ( 12 | "sync/atomic" 13 | "time" 14 | 15 | "github.com/zorkian/kafka" 16 | "github.com/jpillora/backoff" 17 | ) 18 | 19 | // kafkaConsumerChannel creates a consumer that continuously attempts to consume messages from 20 | // Kafka for the given partition. 21 | func (c *KafkaCluster) kafkaConsumerChannel(partID int) <-chan message { 22 | log.Debugf("[%s] rationalize[%d]: starting", c.name, partID) 23 | out := make(chan message, 1000) 24 | go c.consumeFromKafka(partID, out, false) 25 | return out 26 | } 27 | 28 | // consumeFromKafka will start consuming messages from Kafka and writing them to the given 29 | // channel forever. 
It is important that this method closes the "out" channel when it's done, 30 | // as that instructs the downstream goroutine to exit. 31 | func (c *KafkaCluster) consumeFromKafka(partID int, out chan message, startOldest bool) { 32 | var err error 33 | var alive bool 34 | var offsetFirst, offsetNext int64 35 | 36 | // Exit logic -- make sure downstream knows we exited. 37 | defer func() { 38 | log.Debugf("[%s] rationalize[%d]: terminating.", c.name, partID) 39 | close(out) 40 | }() 41 | 42 | // Try to connect to Kafka. This might sleep a bit and retry since the broker could 43 | // be down a bit. 44 | retry := &backoff.Backoff{Min: 500 * time.Millisecond, Jitter: true} 45 | for ; true; time.Sleep(retry.Duration()) { 46 | // Figure out how many messages are in this topic. This can fail if the broker handling 47 | // this partition is down, so we will loop. 48 | offsetFirst, err = c.broker.OffsetEarliest(MarshalTopic, int32(partID)) 49 | if err != nil { 50 | log.Errorf("[%s] rationalize[%d]: failed to get offset: %s", c.name, partID, err) 51 | continue 52 | } 53 | offsetNext, err = c.broker.OffsetLatest(MarshalTopic, int32(partID)) 54 | if err != nil { 55 | log.Errorf("[%s] rationalize[%d]: failed to get offset: %s", c.name, partID, err) 56 | continue 57 | } 58 | log.Debugf("[%s] rationalize[%d]: offsets %d to %d", 59 | c.name, partID, offsetFirst, offsetNext) 60 | 61 | // TODO: Is there a case where the latest offset is X>0 but there is no data in 62 | // the partition? does the offset reset to 0? 63 | if offsetNext == 0 || offsetFirst == offsetNext { 64 | alive = true 65 | c.rationalizers.Done() 66 | } 67 | break 68 | } 69 | retry.Reset() 70 | 71 | // Assume we're starting at the oldest offset for consumption 72 | consumerConf := kafka.NewConsumerConf(MarshalTopic, int32(partID)) 73 | consumerConf.RetryErrLimit = 1 // Do not retry 74 | consumerConf.StartOffset = kafka.StartOffsetOldest 75 | consumerConf.RequestTimeout = c.options.MarshalRequestTimeout 76 | consumerConf.RetryWait = c.options.MarshalRequestRetryWait 77 | 78 | // Get the offsets of this partition, we're going to arbitrarily pick something that 79 | // is ~100,000 from the end if there's more than that. This is only if startOldest is 80 | // false, i.e., we didn't run into a "message too new" situation. 81 | checkMessageTs := false 82 | if !startOldest && offsetNext-offsetFirst > 100000 { 83 | checkMessageTs = true 84 | consumerConf.StartOffset = offsetNext - 100000 85 | log.Infof("[%s] rationalize[%d]: fast forwarding to offset %d.", 86 | c.name, partID, consumerConf.StartOffset) 87 | } 88 | 89 | consumer, err := c.broker.Consumer(consumerConf) 90 | if err != nil { 91 | // Unfortunately this is a termination error, as without being able to consume this 92 | // partition we can't effectively rationalize. 93 | log.Errorf("[%s] rationalize[%d]: Failed to create consumer: %s", c.name, partID, err) 94 | c.Terminate() 95 | return 96 | } 97 | 98 | // Consume messages forever, or until told to quit. 99 | for !c.Terminated() { 100 | msgb, err := consumer.Consume() 101 | if err != nil { 102 | // The internal consumer will do a number of retries. If we get an error here, 103 | // we're probably in the middle of a partition handoff. We should pause so we 104 | // don't hammer the cluster, but otherwise continue. 
105 | log.Warningf("[%s] rationalize[%d]: failed to consume: %s", c.name, partID, err) 106 | time.Sleep(retry.Duration()) 107 | continue 108 | } 109 | retry.Reset() 110 | 111 | msg, err := decode(msgb.Value) 112 | if err != nil { 113 | // Invalid message in the streac. This should never happen, but if it does, just 114 | // continue on. 115 | // TODO: We should probably think about this. If we end up in a situation where 116 | // one version of this software has a bug that writes invalid messages, it could 117 | // be doing things we don't anticipate. Of course, crashing all consumers 118 | // reading that partition is also bad. 119 | log.Errorf("[%s] rationalize[%d]: %s", c.name, partID, err) 120 | 121 | // In the case where the first message is an invalid message, we need to 122 | // to notify that we're alive now 123 | if !alive { 124 | alive = true 125 | c.rationalizers.Done() 126 | } 127 | continue 128 | } 129 | 130 | // If we are on our first message, and we started at a non-zero offset, we need 131 | // to check to make sure that the timestamp is older than a given threshold. If it's 132 | // too new, that indicates our 100000 try didn't work, so let's go from the start. 133 | // TODO: This could be a binary search or something. 134 | if checkMessageTs { 135 | if int64(msg.Timestamp()) > time.Now().Unix()-HeartbeatInterval*2 { 136 | log.Warningf("[%s] rationalize[%d]: rewinding, fast-forwarded message was too new", 137 | c.name, partID) 138 | go c.consumeFromKafka(partID, out, true) 139 | return // terminate self. 140 | } 141 | checkMessageTs = false 142 | } 143 | 144 | log.Debugf("[%s] rationalize[%d]: @%d: [%s]", c.name, partID, msgb.Offset, msg.Encode()) 145 | out <- msg 146 | 147 | // This is a one-time thing that fires the first time the rationalizer comes up 148 | // and makes sure we actually process all of the messages. 149 | if !alive && msgb.Offset >= offsetNext-1 { 150 | for len(out) > 0 { 151 | time.Sleep(100 * time.Millisecond) 152 | } 153 | log.Infof("[%s] rationalize[%d]: reached offset %d, now alive", 154 | c.name, partID, msgb.Offset) 155 | alive = true 156 | c.rationalizers.Done() 157 | } 158 | } 159 | } 160 | 161 | // updateClaim is called whenever we need to adjust a claim structure. 162 | func (c *KafkaCluster) updateClaim(msg *msgHeartbeat) { 163 | topic := c.getPartitionState(msg.GroupID, msg.Topic, msg.PartID) 164 | 165 | topic.lock.Lock() 166 | defer topic.lock.Unlock() 167 | 168 | // Note that a heartbeat will just set the claim structure. It's not valid to heartbeat 169 | // for something you don't own (which is why we have ClaimPartition as a separate 170 | // message), so we can only assume it's valid. 171 | topic.partitions[msg.PartID].InstanceID = msg.InstanceID 172 | topic.partitions[msg.PartID].ClientID = msg.ClientID 173 | topic.partitions[msg.PartID].GroupID = msg.GroupID 174 | topic.partitions[msg.PartID].CurrentOffset = msg.CurrentOffset 175 | topic.partitions[msg.PartID].LastHeartbeat = int64(msg.Time) 176 | topic.partitions[msg.PartID].LastRelease = 0 177 | } 178 | 179 | // releaseClaim is called whenever someone has released their claim on a partition. 
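// A release is only honored when the releasing client currently owns the claim; otherwise the
// message is logged and dropped. On a valid release we record the client's final offset, zero
// LastHeartbeat, and set LastRelease, which is what frees the partition for other claimants.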
180 | func (c *KafkaCluster) releaseClaim(msg *msgReleasingPartition) { 181 | topic := c.getPartitionState(msg.GroupID, msg.Topic, msg.PartID) 182 | 183 | topic.lock.Lock() 184 | defer topic.lock.Unlock() 185 | 186 | // The partition must be claimed by the person releasing it 187 | if !topic.partitions[msg.PartID].checkOwnership(msg, true) { 188 | log.Warningf( 189 | "[%s] ReleasePartition %s:%d from client %s that doesn't own it. Dropping.", 190 | c.name, msg.Topic, msg.PartID, msg.ClientID) 191 | return 192 | } 193 | 194 | // Record the offset they told us they last processed, and then set the heartbeat to 0 195 | // which means this is no longer claimed 196 | topic.partitions[msg.PartID].CurrentOffset = msg.CurrentOffset 197 | topic.partitions[msg.PartID].LastHeartbeat = 0 198 | topic.partitions[msg.PartID].LastRelease = int64(msg.Time) 199 | } 200 | 201 | // handleClaim is called whenever we see a ClaimPartition message. 202 | func (c *KafkaCluster) handleClaim(msg *msgClaimingPartition) { 203 | topic := c.getPartitionState(msg.GroupID, msg.Topic, msg.PartID) 204 | 205 | topic.lock.Lock() 206 | defer topic.lock.Unlock() 207 | 208 | // Send message to all pending consumers then clear the list (it is a violation of the 209 | // protocol to send two responses). This fires at the end when we exit so that anybody 210 | // who is waiting on this partition will know the state has changed. 211 | defer func() { 212 | for _, out := range topic.partitions[msg.PartID].pendingClaims { 213 | close(out) 214 | } 215 | topic.partitions[msg.PartID].pendingClaims = nil 216 | }() 217 | 218 | // If the partition is already claimed, there's nothing we need to do. 219 | if topic.partitions[msg.PartID].claimed(c.ts) { 220 | return 221 | } 222 | 223 | // At this point, the partition is unclaimed, which means we know we have the first 224 | // ClaimPartition message. As soon as we get it, we fill in the structure which makes 225 | // us think it's claimed (it is). 226 | topic.partitions[msg.PartID].InstanceID = msg.InstanceID 227 | topic.partitions[msg.PartID].ClientID = msg.ClientID 228 | topic.partitions[msg.PartID].GroupID = msg.GroupID 229 | topic.partitions[msg.PartID].LastHeartbeat = int64(msg.Time) 230 | topic.partitions[msg.PartID].LastRelease = 0 231 | } 232 | 233 | // rationalize is a goroutine that constantly consumes from a given partition of the marshal 234 | // topic and makes changes to the world state whenever something happens. 235 | func (c *KafkaCluster) rationalize(partID int, in <-chan message) { // Might be in over my head. 236 | for !c.Terminated() { 237 | msg, ok := <-in 238 | if !ok { 239 | log.Infof("[%s] rationalize[%d]: exiting, channel closed", c.name, partID) 240 | return 241 | } 242 | 243 | switch msg.Type() { 244 | case msgTypeHeartbeat: 245 | c.updateClaim(msg.(*msgHeartbeat)) 246 | case msgTypeClaimingPartition: 247 | c.handleClaim(msg.(*msgClaimingPartition)) 248 | case msgTypeReleasingPartition: 249 | c.releaseClaim(msg.(*msgReleasingPartition)) 250 | case msgTypeClaimingMessages: 251 | // TODO: Implement. 
252 | 		}
253 | 
254 | 		// Update step counter so the test suite can wait for messages to be
255 | 		// processed in a predictable way (rather than waiting random times)
256 | 		atomic.AddInt32(c.rsteps, 1)
257 | 	}
258 | 	log.Infof("[%s] rationalize[%d]: exiting, Marshaler terminated", c.name, partID)
259 | }
260 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Marshal - a Kafka consumer coordination library
2 | 
3 | [![GoDoc](http://img.shields.io/badge/godoc-reference-blue.svg)](http://godoc.org/github.com/zorkian/marshal/marshal)
4 | [![Build Status](https://travis-ci.org/zorkian/marshal.svg)](https://travis-ci.org/zorkian/marshal)
5 | 
6 | Marshal is **in beta**. We have deployed it in a few places and are
7 | working to ensure it's stable and fast. It is not 100% battle tested
8 | yet, so feedback is very welcome.
9 | 
10 | ## Purpose
11 | 
12 | This project assumes you have some familiarity with Kafka. You should
13 | know what a topic is and what partitions are.
14 | 
15 | In Kafka, the unit of scalability is the partition. If you have a
16 | topic that is getting "too busy", you increase the partition count.
17 | Consumption of data from those busy topics requires consumers to be
18 | aware of these partitions and be able to coordinate their consumption
19 | across all of the consumers.
20 | 
21 | Traditional setups use Zookeeper or some other system for coordinating
22 | consumers. This works in many situations, but introduces a point of
23 | failure that isn't necessary. It is possible to completely perform
24 | consumer coordination using Kafka alone.
25 | 
26 | Additionally, getting consumer coordination correct is a rather taxing
27 | exercise in development and, frankly, shouldn't need to be done for
28 | every single project, company, etc. There should be an open source
29 | system that handles it for you.
30 | 
31 | Marshal is a library that you can drop into your Go programs and use
32 | to coordinate the consumption of partitions across multiple processes,
33 | servers, etc. It is implemented in terms of Kafka itself: zero extra
34 | dependencies.
35 | 
36 | Marshal is designed for use in production environments where there are
37 | many topics, each topic having hundreds of partitions, with potentially
38 | thousands of consumers working in concert across the infrastructure to
39 | consume them. Marshal is designed for big environments with critical
40 | needs.
41 | 
42 | ## Usage
43 | 
44 | This module is designed to be extremely simple to use. The basic logical
45 | flow is that you create a Marshaler and then you use that to create as
46 | many Consumers as you need topics to consume. Logically, you want one
47 | Marshaler in your program, and you want a single Consumer per topic that
48 | you need to consume from.
49 | 50 | Here's the simplest example (but see a more complicated example in the 51 | example directory): 52 | 53 | ```go 54 | package main 55 | 56 | import "fmt" 57 | import "github.com/zorkian/marshal/marshal" 58 | 59 | func main() { 60 | marshaler, _ := marshal.NewMarshaler( 61 | "clientid", "groupid", []string{"127.0.0.1:9092"}) 62 | defer marshaler.Terminate() 63 | 64 | consumer, _ := marshaler.NewConsumer( 65 | []string{"some-topic"}, marshal.NewConsumerOptions()) 66 | defer consumer.Terminate() 67 | 68 | msgChan := consumer.ConsumeChannel() 69 | 70 | for { 71 | msg := <-msgChan 72 | fmt.Printf("Consumed message: %s", msg.Value) 73 | consumer.Commit(msg) 74 | } 75 | } 76 | ``` 77 | 78 | If you were to hypothetically run this against a cluster that contained 79 | a topic named `some-topic` that had 8 partitions, it would begin 80 | claiming those partitions one by one until it had them all. If you 81 | started up a second copy of the program, it would only claim the 82 | partitions that are not already claimed. If the first one dies, the 83 | second one will pick up the dropped partitions within a few minutes. 84 | 85 | In essence, Marshal takes all of the effort of consumer coordination out 86 | of your software and puts it where it belongs: on Kafka. 87 | 88 | ## How Coordination Works 89 | 90 | Please read this section to get a handle on how Kafka performs 91 | coordination and the guarantees that it gives you. In particular, the 92 | failure scenarios might be interesting. 93 | 94 | If you want the gory details about the protocol 95 | used internally, please see the [PROTOCOL 96 | documentation](https://github.com/zorkian/marshal/blob/master/PROTOCOL.md). 97 | You don't need to read and understand it, though, but it might be 98 | useful. 99 | 100 | ### Basic Coordination 101 | 102 | In essence, Marshal uses a special topic within Kafka to coordinate the 103 | actions of many consumers anywhere in the infrastructure. As long as 104 | the consumers can connect to the Kafka cluster you want to coordinate, 105 | you can use Marshal. There is no language dependency either -- Marshal 106 | the algorithm could be implemented in any language and consumers could 107 | coordinate with each other. 108 | 109 | We assume that you're familiar with the basics of Kafka -- notably that 110 | each partition is effectively a write-ahead log that records an ordered 111 | set of events, and that it's not possible (barring unclean leader 112 | elections) for two consumers to see different event orderings. Marshal 113 | takes advantage of that property to perform distributed coordination. 114 | 115 | When a program using Marshal starts up, the first thing it does is read 116 | the logs in the coordinating topic. These logs contain certain events, 117 | such as: claim partition, heartbeat, and release partition to name a 118 | few. 119 | 120 | Using these events Marshal can know not only what consumers exist, but 121 | what partitions they are currently working on and how far along they 122 | are. Using that information the local program can decide such things as 123 | "which partitions are unclaimed" and then take action to claim and begin 124 | consuming those partitions. 125 | 126 | ### Groups and Clients 127 | 128 | Coordination happens within "groups". When you create a `Marshaler` you 129 | can specify the group that your consumer is part of. All claims are done 130 | on a per-group basis, which means you can consume the same topic N times 131 | -- as long as you have N groups. 
There is a one-to-one mapping between 132 | "consumers that can claim a given partition" and "number of groups". 133 | 134 | The "client ID" specified when you create a `Marshaler` is used to 135 | identify a particular instance of a program. These should be unique per 136 | instance of software, but they should be reasonably stable. At Dropbox 137 | we use the name of the machine the software is running on, plus possibly 138 | an instance ID if we run multiple copies on a single box. 139 | 140 | ### Consumption of Messages 141 | 142 | The main engine of Marshal happens when you create a consumer and call 143 | `consumer.Consume()`. This will possibly return a message from one 144 | of the partitions you have claimed. You then do something with the 145 | message... and consume the next one. You don't have to do anything else. 146 | 147 | Behind the scenes, the act of consuming updates internal cursors and 148 | timers and will possibly generate heartbeat messages into the Marshal 149 | event log. These messages contain information about the last offset 150 | consumed, allowing other consumers (and monitoring systems) to know 151 | where you are within the partition. In case of failure, they can resume 152 | at the last point you heartbeated. 153 | 154 | Presently, all consumption within Marshal is **at least once**. In 155 | case of most consumer failures, it is likely a block of messages (one 156 | heartbeat interval) will be reprocessed by the next consumer. 157 | 158 | ### Message Ordering 159 | 160 | Kafka guarantees the ordering of messages committed to a partition, 161 | but does not guarantee any ordering across partitions. Marshal will 162 | give you messages from any partition it has claimed, so in essence, 163 | Marshal *does not* guarantee ordering. If you need message ordering, 164 | this library is not presently appropriate for you. 165 | 166 | If you are having throughput problems you should increase the number of 167 | partitions you have available so that Marshal can have more in-flight 168 | messages. 169 | 170 | ## Failure Modes 171 | 172 | This documents some of the failure modes and how Marshal handles them. 173 | Please let us know about more questions and we can analyze and write 174 | about them. 175 | 176 | ### Consumer Too Slow 177 | 178 | In the case where a consumer is too slow -- i.e. it is consuming more 179 | slowly from a partition than data is coming in -- Marshal will detect 180 | this and internally it will start failing its health checks. When this 181 | happens it will, after enough time has passed, decide that it is not 182 | able to sustain the load and will voluntarily surrender partitions. 183 | 184 | This is useful as a load balancing mechanism if you happen to have one 185 | consumer that ends up with 8 claims while another has only a handful, 186 | the former can shed load and the latter will pick it up. 187 | 188 | However, it is worth noting that in the unbalanced scenario, as long 189 | as the consumers are keeping up with the traffic they won't release 190 | partitions. It is perfectly valid right now for Marshal consumers to end 191 | up unbalanced -- as long as they're all pulling their weight. 192 | 193 | ### Consumer Death: Expected 194 | 195 | If a consumer dies or shuts down in an expected (controlled) way, 196 | Marshal will attempt to commit release partition events into the log. 
If 197 | this happens successfully then other consumers will be able to pick up 198 | the partitions within seconds and begin consuming exactly where the last 199 | consumer left off. 200 | 201 | No data is skipped or double-consumed in this mode and the downtime is 202 | extremely minimal. 203 | 204 | ### Consumer Death: Unexpected 205 | 206 | If a consumer dies unexpectedly, things are slightly worse off. Assuming 207 | a hardware failure or other such issue (network split, etc), the 208 | partition's claim will start to become stale. From the perspective of 209 | the rest of the fleet, they will have to wait an appropriate interval 210 | (two heartbeats) until they can claim the partition. 211 | 212 | Data might be double-consumed, but the maximum amount is one heartbeat's 213 | worth. Depending on the last time you heartbeated, at worst you will see 214 | that many messages be double-consumed. The downtime of consumption is 215 | also up to two heartbeat intervals at worst. 216 | 217 | ### Network Partitions 218 | 219 | Since Kafka can only have a single leader for a partition, any consumers 220 | that are on the side of the leader will be able to continue working. 221 | Consumers that are on the other side will fail to heartbeat and will 222 | stop being able to work -- even if they could otherwise reach the leader 223 | for the topics they were consuming. 224 | 225 | The consumers on the side of the Marshal coordination partitions will be 226 | able to tell that the other consumers dropped off and will be able to 227 | start working. (Of course, this may cause them to overload themselves 228 | with too many claims, leading to consumer slowness.) 229 | 230 | If the partition is between the consumer and Kafka, the consumers 231 | will be unable to consume and will also fail their heartbeat. This is 232 | effectively treated as Consumer Death: Unexpected. When the partition 233 | heals, the consumers that lost their lock will know (assuming machine 234 | time is synchronized) and will abandon their claims. 235 | 236 | ## Important Notes 237 | 238 | This system assumes that timestamps are valid. If your machines are 239 | not using NTP to synchronize their clocks, you will not be able to get 240 | deterministic behavior. Sorry. 241 | 242 | Marshal also relies on all actors being good actors. Malicious users can 243 | cause the system to act unpredictably or at their choosing. 244 | 245 | ## Frequently Asked Questions 246 | 247 | Here are some questions we've seen. For more, see us on IRC. 248 | 249 | ### My consumers are unbalanced; one has more partitions than the others. 250 | 251 | This is a design property of Marshal's implementation. We start with the 252 | premise that we can capably health check ourself and determine whether 253 | or not we are keeping up with our current claims. If that's true, then 254 | it doesn't matter how many partitions we have -- we'll be healthy. 255 | 256 | This means that we can end up in a state where one consumer has several 257 | partitions and another consumer has fewer (or none), but Marshal 258 | guarantees that all of them will be healthy. 259 | 260 | ### My consumer isn't claiming any partitions. 261 | 262 | This usually happens when you are reusing Client IDs and your consumer 263 | has previously become unhealthy and released partitions. A sick consumer 264 | will not reclaim partitions it has previously released. 
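
The sketch below shows one way to generate a per-run Client ID. This is an
illustration, not part of the API; it assumes the `github.com/pborman/uuid`
package (which Marshal itself uses for instance IDs), and the group and broker
values are placeholders:

```go
clientID := "my-service-" + uuid.New()[0:8]
marshaler, err := marshal.NewMarshaler(
	clientID, "my-group", []string{"127.0.0.1:9092"})
if err != nil {
	log.Fatalf("Failed to construct marshaler: %s", err)
}
defer marshaler.Terminate()
```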
265 | 
266 | Make sure you have multiple consumers with different Client IDs, or
267 | make sure that in the single consumer use case you are using randomly
268 | generated Client IDs every time your program starts.
269 | 
270 | ## Bugs and Contact
271 | 
272 | There may be bugs. This is a new project. There are tests, however, and
273 | we very much welcome the submission of bug reports, pull requests, etc.
274 | 
275 | Github: https://github.com/zorkian/marshal
276 | 
277 | IRC: #kafka-marshal on Freenode
278 | 
--------------------------------------------------------------------------------
/marshal/marshal.go:
--------------------------------------------------------------------------------
1 | /*
2 |  * portal - marshal
3 |  *
4 |  * a library that implements an algorithm for doing consumer coordination within Kafka, rather
5 |  * than using Zookeeper or another external system.
6 |  *
7 |  */
8 | 
9 | package marshal
10 | 
11 | import (
12 | 	"fmt"
13 | 	"sync"
14 | 	"sync/atomic"
15 | 	"time"
16 | 
17 | 	"github.com/zorkian/kafka"
18 | 	"github.com/zorkian/kafka/proto"
19 | 	"github.com/pborman/uuid"
20 | )
21 | 
22 | const (
23 | 	// MarshalTopic is the main topic used for coordination. This must be constant across all
24 | 	// consumers that you want to coordinate.
25 | 	MarshalTopic = "__marshal"
26 | 
27 | 	// HeartbeatInterval is the main timing used to determine how "chatty" the system is and how
28 | 	// fast it responds to failures of consumers. THIS VALUE MUST BE THE SAME BETWEEN ALL CONSUMERS
29 | 	// as it is critical to coordination.
30 | 	HeartbeatInterval = 60 // Measured in seconds.
31 | )
32 | 
33 | // Marshaler is the coordinator type. It is designed to be used once per (client,
34 | // group) and is thread safe. Creating one of these will create connections to your
35 | // Kafka cluster and begin actively monitoring the coordination topic.
36 | type Marshaler struct {
37 | 	// These members are not protected by the lock and can be read at any
38 | 	// time as they're write-once or only ever atomically updated. They must
39 | 	// never be overwritten once a Marshaler is created.
40 | 	quit        *int32
41 | 	cluster     *KafkaCluster
42 | 	ownsCluster bool
43 | 	instanceID  string
44 | 	clientID    string
45 | 	groupID     string
46 | 	offsets     kafka.OffsetCoordinator
47 | 
48 | 	// Lock protects the following members; you must have this lock in order to
49 | 	// read from or write to these.
50 | 	lock      *sync.RWMutex
51 | 	consumers []*Consumer
52 | }
53 | 
54 | // NewMarshaler connects to a cluster (given broker addresses) and prepares to handle marshalling
55 | // requests. Given the way this system works, the marshaler has to process all messages in the
56 | // topic before it's safely able to begin operating. This might take a while. NOTE: If you are
57 | // creating multiple marshalers in your program, you should instead call Dial and then use
58 | // the NewMarshaler method on that object.
59 | func NewMarshaler(clientID, groupID string, brokers []string) (*Marshaler, error) {
60 | 	cluster, err := Dial("automatic", brokers, NewMarshalOptions())
61 | 	if err != nil {
62 | 		return nil, err
63 | 	}
64 | 	m, err := cluster.NewMarshaler(clientID, groupID)
65 | 	if err == nil { // m is nil when an error is returned
66 | 		m.ownsCluster = true
67 | 	}
68 | 	return m, err
69 | }
70 | 
71 | // newInstanceID creates a new random instance ID for use inside Marshal messages. This
72 | // is generated new every time we restart.
73 | func newInstanceID() string {
74 | 	// A UUID4 starts with 8 random characters, so let's use that as our instance ID.
75 | // This should be a good tradeoff between randomness and brevity. 76 | return uuid.New()[0:8] 77 | } 78 | 79 | // addNewConsumer is called when a new Consumer is created. This allows Marshal to keep 80 | // track of the consumers that exist so we can operate on them later if needed. 81 | func (m *Marshaler) addNewConsumer(c *Consumer) { 82 | m.lock.Lock() 83 | defer m.lock.Unlock() 84 | 85 | m.consumers = append(m.consumers, c) 86 | } 87 | 88 | // removeConsumer is called when a Consumer is terminating and should be removed from our list. 89 | func (m *Marshaler) removeConsumer(c *Consumer) { 90 | m.lock.Lock() 91 | defer m.lock.Unlock() 92 | 93 | for i, cn := range m.consumers { 94 | if cn == c { 95 | m.consumers = append(m.consumers[:i], m.consumers[i+1:]...) 96 | break 97 | } 98 | } 99 | } 100 | 101 | // getClaimedPartitionState returns a topicState iff it is claimed by the current Marshaler. 102 | // Else, an error is returned. This is on the Marshaler becomes it's a helper to only return 103 | // a claim that is presently valid and owned by us. 104 | func (m *Marshaler) getClaimedPartitionState(topicName string, partID int) ( 105 | *topicState, error) { 106 | 107 | // Get partition state of whatever happens to be here 108 | topic := m.cluster.getPartitionState(m.groupID, topicName, partID) 109 | 110 | topic.lock.RLock() 111 | defer topic.lock.RUnlock() 112 | 113 | if !topic.partitions[partID].claimed(m.cluster.ts) { 114 | return nil, fmt.Errorf("Partition %s:%d is not claimed!", topicName, partID) 115 | } 116 | 117 | // And if it's not claimed by us... 118 | if topic.partitions[partID].GroupID != m.groupID || 119 | topic.partitions[partID].ClientID != m.clientID { 120 | return nil, fmt.Errorf("Partition %s:%d is not claimed by us!", topicName, partID) 121 | } 122 | 123 | return topic, nil 124 | } 125 | 126 | // Topics returns the list of known topics. 127 | func (m *Marshaler) Topics() []string { 128 | return m.cluster.getTopics() 129 | } 130 | 131 | // Partitions returns the count of how many partitions are in a given topic. Returns 0 if a 132 | // topic is unknown. 133 | func (m *Marshaler) Partitions(topicName string) int { 134 | return m.cluster.getTopicPartitions(topicName) 135 | } 136 | 137 | // terminateAndCleanup terminates the marshal, with the option of removing 138 | // the marshaler's reference from its associated cluster. 139 | func (m *Marshaler) terminateAndCleanup(remove bool) { 140 | if !atomic.CompareAndSwapInt32(m.quit, 0, 1) { 141 | return 142 | } 143 | 144 | m.lock.Lock() 145 | defer m.lock.Unlock() 146 | 147 | // Now terminate all of the consumers. In this codepath we do a no-release termination 148 | // because that is usually correct in production. If someone actually wants to release 149 | // they need to terminate the consumers manually. 150 | for _, cn := range m.consumers { 151 | cn.terminateAndCleanup(false, false) 152 | } 153 | m.consumers = nil 154 | 155 | // If we own the cluster, terminate it. 156 | if m.ownsCluster { 157 | m.cluster.Terminate() 158 | } 159 | 160 | // Remove this marshal from its cluster. Doing so is recommended 161 | // if the cluster doesn't remove the terminated marshal itself (by setting its 162 | // list of marshals to nil or filtering them). 163 | if remove { 164 | m.cluster.removeMarshal(m) 165 | } 166 | } 167 | 168 | // Terminate is called when we're done with the marshaler and want to shut down. 
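// Terminate is safe to call more than once (it is guarded by an atomic flag). It tears down all
// Consumers created from this Marshaler without releasing their claims; if you want the claims
// released, terminate the consumers yourself first. If this Marshaler owns its cluster (it was
// created via the package-level NewMarshaler), the underlying cluster is terminated as well.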
169 | func (m *Marshaler) Terminate() { 170 | m.terminateAndCleanup(true) 171 | } 172 | 173 | // Terminated returns whether or not we have been terminated. 174 | func (m *Marshaler) Terminated() bool { 175 | return atomic.LoadInt32(m.quit) == 1 176 | } 177 | 178 | // Claimed returns the current status on whether or not a partition is claimed by any other 179 | // consumer in our group (including ourselves). A topic/partition that does not exist is 180 | // considered to be unclaimed. 181 | func (m *Marshaler) Claimed(topicName string, partID int) bool { 182 | // The contract of this method is that if it returns something and the heartbeat is 183 | // non-zero, the partition is claimed. 184 | claim := m.GetPartitionClaim(topicName, partID) 185 | return claim.LastHeartbeat > 0 186 | } 187 | 188 | // GetPartitionClaim returns a PartitionClaim structure for a given partition. The structure 189 | // describes the consumer that is currently claiming this partition. This is a copy of the 190 | // claim structure, so changing it cannot change the world state. 191 | func (m *Marshaler) GetPartitionClaim(topicName string, partID int) PartitionClaim { 192 | topic := m.cluster.getPartitionState(m.groupID, topicName, partID) 193 | 194 | topic.lock.RLock() 195 | defer topic.lock.RUnlock() 196 | 197 | if topic.partitions[partID].claimed(m.cluster.ts) { 198 | return topic.partitions[partID] // copy. 199 | } 200 | return PartitionClaim{} 201 | } 202 | 203 | // GetLastPartitionClaim returns a PartitionClaim structure for a given partition. The structure 204 | // describes the consumer that is currently or most recently claiming this partition. This is a 205 | // copy of the claim structure, so changing it cannot change the world state. 206 | func (m *Marshaler) GetLastPartitionClaim(topicName string, partID int) PartitionClaim { 207 | topic := m.cluster.getPartitionState(m.groupID, topicName, partID) 208 | 209 | topic.lock.RLock() 210 | defer topic.lock.RUnlock() 211 | 212 | return topic.partitions[partID] // copy. 213 | } 214 | 215 | // GetPartitionOffsets returns the current state of a topic/partition. This has to hit Kafka 216 | // thrice to ask about a partition, but it returns the full state of information that can be 217 | // used to calculate consumer lag. 218 | func (m *Marshaler) GetPartitionOffsets(topicName string, partID int) (PartitionOffsets, error) { 219 | var err error 220 | 221 | o := PartitionOffsets{} 222 | o.Earliest, err = m.cluster.broker.OffsetEarliest(topicName, int32(partID)) 223 | if err != nil { 224 | return PartitionOffsets{}, err 225 | } 226 | 227 | o.Latest, err = m.cluster.broker.OffsetLatest(topicName, int32(partID)) 228 | if err != nil { 229 | return PartitionOffsets{}, err 230 | } 231 | 232 | // Get committed offsets for our particular group using our offset coordinator. 233 | o.Committed, _, err = m.offsets.Offset(topicName, int32(partID)) 234 | if err != nil { 235 | // This error happens when Kafka does not know about the partition i.e. no 236 | // offset has been committed here. In that case we ignore it. 237 | if err != proto.ErrUnknownTopicOrPartition { 238 | return PartitionOffsets{}, fmt.Errorf("offset fetch fail: %s", err) 239 | } 240 | } 241 | 242 | // Use the last claim we know about, whatever it is 243 | claim := m.GetLastPartitionClaim(topicName, partID) 244 | o.Current = claim.CurrentOffset 245 | return o, nil 246 | } 247 | 248 | // msgBase constructs a base message object for a message. 
249 | func (m *Marshaler) msgBase(topicName string, partID int) *msgBase { 250 | return &msgBase{ 251 | Version: 1, 252 | Time: int(time.Now().Unix()), 253 | InstanceID: m.instanceID, 254 | ClientID: m.clientID, 255 | GroupID: m.groupID, 256 | Topic: topicName, 257 | PartID: partID, 258 | } 259 | } 260 | 261 | // ClaimPartition is how you can actually claim a partition. If you call this, Marshal will 262 | // attempt to claim the partition on your behalf. This is the low level function, you probably 263 | // want to use a MarshaledConsumer. Returns a bool on whether or not the claim succeeded and 264 | // whether you can continue. 265 | func (m *Marshaler) ClaimPartition(topicName string, partID int) bool { 266 | topic := m.cluster.getPartitionState(m.groupID, topicName, partID) 267 | 268 | // Unlock is later, since this function might take a while 269 | // TODO: Move this logic to a func and defer the lock (for sanity sake) 270 | topic.lock.Lock() 271 | 272 | // If the topic is already claimed, we can short circuit the decision process 273 | if topic.partitions[partID].claimed(m.cluster.ts) { 274 | defer topic.lock.Unlock() 275 | if topic.partitions[partID].GroupID == m.groupID && 276 | topic.partitions[partID].ClientID == m.clientID { 277 | return true 278 | } 279 | log.Warningf("Attempt to claim already claimed partition.") 280 | return false 281 | } 282 | 283 | // Make a channel for results, append it to the list so we hear about claims 284 | out := make(chan struct{}, 1) 285 | topic.partitions[partID].pendingClaims = append( 286 | topic.partitions[partID].pendingClaims, out) 287 | topic.lock.Unlock() 288 | 289 | // Produce message to kafka 290 | cl := &msgClaimingPartition{ 291 | msgBase: *m.msgBase(topicName, partID), 292 | } 293 | _, err := m.cluster.producer.Produce(MarshalTopic, int32(topic.claimPartition), 294 | &proto.Message{Value: []byte(cl.Encode())}) 295 | if err != nil { 296 | // If we failed to produce, this is probably serious so we should undo the work 297 | // we did and then return failure 298 | log.Errorf("Failed to produce to Kafka: %s", err) 299 | return false 300 | } 301 | 302 | // Wait for channel to close, which is the signal that the rationalizer has 303 | // updated the status. 304 | <-out 305 | 306 | // Now we have to check if we own the partition. If this returns anything, the partition 307 | // is ours. nil = not. 308 | topic, err = m.getClaimedPartitionState(topicName, partID) 309 | if topic == nil || err != nil { 310 | return false 311 | } 312 | return true 313 | } 314 | 315 | // Heartbeat will send an update for other people to know that we're still alive and 316 | // still owning this partition. Returns an error if anything has gone wrong (at which 317 | // point we can no longer assert we have the lock). 
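// As a best-effort side effect this also commits the offset to Kafka's offset coordinator
// (see CommitOffsets); the heartbeat written to the marshal topic remains the canonical record.
// Callers driving this manually should heartbeat roughly once per HeartbeatInterval, passing
// the last offset they have fully processed.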
318 | func (m *Marshaler) Heartbeat(topicName string, partID int, offset int64) error { 319 | topic, err := m.getClaimedPartitionState(topicName, partID) 320 | if err != nil { 321 | return err 322 | } 323 | 324 | // Attempt to commit offset, this is best-effort and we don't care if it fails 325 | // since the canonical storage is in the heartbeat 326 | if err := m.CommitOffsets(topicName, partID, offset); err != nil { 327 | log.Warningf("[%s:%d] failed to commit offset during heartbeat: %s", 328 | topicName, partID, err) 329 | } 330 | 331 | // All good, let's heartbeat 332 | cl := &msgHeartbeat{ 333 | msgBase: *m.msgBase(topicName, partID), 334 | CurrentOffset: offset, 335 | } 336 | _, err = m.cluster.producer.Produce(MarshalTopic, int32(topic.claimPartition), 337 | &proto.Message{Value: []byte(cl.Encode())}) 338 | if err != nil { 339 | log.Errorf("[%s:%d] failed to send heartbeat message to Kafka: %s", 340 | topicName, partID, err) 341 | return fmt.Errorf("Failed to produce heartbeat to Kafka: %s", err) 342 | } 343 | 344 | return nil 345 | } 346 | 347 | // ReleasePartition will send an update for other people to know that we're done with 348 | // a partition. Returns an error if anything has gone wrong (at which 349 | // point we can no longer assert we have the lock). 350 | func (m *Marshaler) ReleasePartition(topicName string, partID int, offset int64) error { 351 | topic, err := m.getClaimedPartitionState(topicName, partID) 352 | if err != nil { 353 | return err 354 | } 355 | 356 | // Commit our offset first; if this fails, we can still try to release, 357 | // but we should advise 358 | if err := m.CommitOffsets(topicName, partID, offset); err != nil { 359 | log.Warningf("[%s:%d] failed to commit offset during release: %s", 360 | topicName, partID, err) 361 | } 362 | 363 | // All good, let's release 364 | cl := &msgReleasingPartition{ 365 | msgBase: *m.msgBase(topicName, partID), 366 | CurrentOffset: offset, 367 | } 368 | _, err = m.cluster.producer.Produce(MarshalTopic, int32(topic.claimPartition), 369 | &proto.Message{Value: []byte(cl.Encode())}) 370 | if err != nil { 371 | log.Errorf("[%s:%d] failed to send release message to Kafka: %s", 372 | topicName, partID, err) 373 | return fmt.Errorf("Failed to produce release to Kafka: %s", err) 374 | } 375 | 376 | return nil 377 | } 378 | 379 | // CommitOffsets will commit the partition offsets to Kafka so it's available in the 380 | // long-term storage of the offset coordination system. Note: this method does not ensure 381 | // that this Marshal instance owns the topic/partition in question. 382 | func (m *Marshaler) CommitOffsets(topicName string, partID int, offset int64) error { 383 | return m.offsets.Commit(topicName, int32(partID), offset) 384 | } 385 | 386 | // ClientID returns the client ID we're using 387 | func (m *Marshaler) ClientID() string { 388 | return m.clientID 389 | } 390 | 391 | // GroupID returns the group ID we're using 392 | func (m *Marshaler) GroupID() string { 393 | return m.groupID 394 | } 395 | 396 | // PrintState will take the current state of the Marshal world and print it verbosely to the 397 | // logging output. This is used in the rare case where we're self-terminating or on request 398 | // from the user. 
399 | func (m *Marshaler) PrintState() { 400 | m.lock.RLock() 401 | defer m.lock.RUnlock() 402 | 403 | m.cluster.lock.RLock() 404 | defer m.cluster.lock.RUnlock() 405 | 406 | log.Infof("Marshal state dump beginning.") 407 | log.Infof("") 408 | log.Infof("Group ID: %s", m.groupID) 409 | log.Infof("Client ID: %s", m.clientID) 410 | log.Infof("Instance ID: %s", m.instanceID) 411 | log.Infof("") 412 | log.Infof("Marshal topic partitions: %d", m.cluster.partitions) 413 | log.Infof("Known Kafka topics: %d", len(m.cluster.topics)) 414 | log.Infof("Internal rsteps counter: %d", atomic.LoadInt32(m.cluster.rsteps)) 415 | log.Infof("") 416 | log.Infof("State of the world:") 417 | log.Infof("") 418 | for group, topicmap := range m.cluster.groups { 419 | log.Infof(" GROUP: %s", group) 420 | for topic, state := range topicmap { 421 | log.Infof(" TOPIC: %s [on %s:%d]", topic, MarshalTopic, state.claimPartition) 422 | state.PrintState() 423 | } 424 | } 425 | log.Infof("") 426 | log.Infof("Consumer states:") 427 | log.Infof("") 428 | for _, consumer := range m.consumers { 429 | consumer.PrintState() 430 | } 431 | log.Infof("") 432 | log.Infof("Marshal state dump complete.") 433 | } 434 | -------------------------------------------------------------------------------- /marshal/cluster.go: -------------------------------------------------------------------------------- 1 | /* 2 | * portal - marshal 3 | * 4 | * a library that implements an algorithm for doing consumer coordination within Kafka, rather 5 | * than using Zookeeper or another external system. 6 | * 7 | */ 8 | 9 | package marshal 10 | 11 | import ( 12 | "crypto/md5" 13 | "encoding/binary" 14 | "errors" 15 | "fmt" 16 | "math/rand" 17 | "sync" 18 | "sync/atomic" 19 | "time" 20 | 21 | "github.com/zorkian/kafka" 22 | ) 23 | 24 | // KafkaCluster is a user-agnostic view of the world. It connects to a Kafka cluster 25 | // and runs rationalizers to observe the complete world state. 26 | type KafkaCluster struct { 27 | // These members are not protected by the lock and can be read at any 28 | // time as they're write-once or only ever atomically updated. They must 29 | // never be overwritten once a KafkaCluster is created. 30 | quit *int32 31 | name string 32 | broker *kafka.Broker 33 | producer kafka.Producer 34 | partitions int 35 | jitters chan time.Duration 36 | options MarshalOptions 37 | 38 | // Lock protects the following members; you must have this lock in order to 39 | // read from or write to these. 40 | lock *sync.RWMutex 41 | marshalers []*Marshaler 42 | topics map[string]int 43 | groups map[string]map[string]*topicState 44 | // pausedGroups stores the expiry time for groups that are paused. 45 | pausedGroups map[string]time.Time 46 | 47 | // This WaitGroup is used for signalling when all of the rationalizers have 48 | // finished processing. 49 | rationalizers *sync.WaitGroup 50 | 51 | // rsteps is updated whenever a rationalizer processes a log entry, this is 52 | // used mainly by the test suite. 53 | rsteps *int32 54 | 55 | // This is for testing only. When this is non-zero, the rationalizer will answer 56 | // queries based on THIS time instead of the current, actual time. 57 | ts int64 58 | } 59 | 60 | // MarshalOptions contains various tunables that can be used to adjust the configuration 61 | // of the underlying system. 62 | type MarshalOptions struct { 63 | // BrokerConnectionLimit is used to set the maximum simultaneous number of connections 64 | // that can be made to each broker. 65 | // Default: 30. 
66 | 	BrokerConnectionLimit int
67 | 
68 | 	// ConsumeRequestTimeout sets the time that we ask Kafka to wait before returning any
69 | 	// data to us. Setting this high uses more connections and can lead to some latency
70 | 	// but keeps the load on Kafka minimal. Use this to balance QPS against latency.
71 | 	//
72 | 	// Default: 1 millisecond.
73 | 	ConsumeRequestTimeout time.Duration
74 | 
75 | 	// MarshalRequestTimeout is used for our coordination requests. This should be reasonable
76 | 	// at default, but is left as a tunable in case you have clients that are claiming an
77 | 	// extremely large number of partitions and are too slow. The overall Marshal latency
78 | 	// is impacted by this value as well as the MarshalRequestRetryWait below.
79 | 	//
80 | 	// Default: 1 millisecond.
81 | 	MarshalRequestTimeout time.Duration
82 | 
83 | 	// MarshalRequestRetryWait is the time between consume requests Marshal generates. This
84 | 	// should be set to balance the above timeouts to prevent hammering the server.
85 | 	//
86 | 	// Default: 500 milliseconds.
87 | 	MarshalRequestRetryWait time.Duration
88 | 
89 | 	// MaxMessageSize is the maximum size in bytes of messages that can be returned. This
90 | 	// must be set to the size of the largest messages your cluster is allowed to store,
91 | 	// else you will end up with stalled streams. That is, Kafka will never send you a message
92 | 	// larger than this value, but we can't detect that, so we will simply think there is
93 | 	// no data.
94 | 	//
95 | 	// Default: 2,000,000 bytes.
96 | 	MaxMessageSize int32
97 | 
98 | 	// MaxMessageQueue is the number of messages to retrieve from Kafka and store in-memory
99 | 	// waiting for consumption. This is per-Consumer and independent of message size, so you
100 | 	// should adjust this for your consumption patterns.
101 | 	//
102 | 	// Default: 1000 messages.
103 | 	MaxMessageQueue int
104 | }
105 | 
106 | // NewMarshalOptions returns a set of MarshalOptions populated with defaults.
107 | func NewMarshalOptions() MarshalOptions {
108 | 	return MarshalOptions{
109 | 		BrokerConnectionLimit:   30,
110 | 		ConsumeRequestTimeout:   1 * time.Millisecond,
111 | 		MarshalRequestTimeout:   1 * time.Millisecond,
112 | 		MarshalRequestRetryWait: 500 * time.Millisecond,
113 | 		MaxMessageSize:          2000000,
114 | 		MaxMessageQueue:         1000,
115 | 	}
116 | }
117 | 
118 | // Dial returns a new cluster object which can be used to instantiate a number of Marshalers
119 | // that all use the same cluster. Tunables are supplied via the MarshalOptions argument; use
// NewMarshalOptions to get sensible defaults.
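//
// A sketch of the shared-cluster path (names and broker addresses are placeholders):
//
//	cluster, err := marshal.Dial("my-cluster", []string{"127.0.0.1:9092"}, marshal.NewMarshalOptions())
//	if err != nil {
//		// handle error
//	}
//	defer cluster.Terminate()
//
//	m1, _ := cluster.NewMarshaler("client-a", "group-one")
//	m2, _ := cluster.NewMarshaler("client-a", "group-two")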
120 | func Dial(name string, brokers []string, options MarshalOptions) (*KafkaCluster, error) { 121 | // Connect to Kafka 122 | brokerConf := kafka.NewBrokerConf("PortalMarshal") 123 | brokerConf.ClusterConnectionConf.MetadataRefreshFrequency = time.Hour 124 | brokerConf.ClusterConnectionConf.ConnectionLimit = options.BrokerConnectionLimit 125 | brokerConf.LeaderRetryLimit = 1 // Do not retry 126 | broker, err := kafka.NewBroker(name, brokers, brokerConf) 127 | if err != nil { 128 | return nil, err 129 | } 130 | 131 | c := &KafkaCluster{ 132 | quit: new(int32), 133 | rsteps: new(int32), 134 | name: name, 135 | options: options, 136 | lock: &sync.RWMutex{}, 137 | rationalizers: &sync.WaitGroup{}, 138 | broker: broker, 139 | producer: broker.Producer(kafka.NewProducerConf()), 140 | topics: make(map[string]int), 141 | groups: make(map[string]map[string]*topicState), 142 | pausedGroups: make(map[string]time.Time), 143 | jitters: make(chan time.Duration, 100), 144 | // It's important that marshalers begins as an empty slice and not nil to avoid 145 | // a race between NewMarshaler and Terminate. See note in Terminate. 146 | marshalers: make([]*Marshaler, 0), 147 | } 148 | 149 | // Do an initial metadata fetch, this will block a bit 150 | err = c.refreshMetadata() 151 | if err != nil { 152 | return nil, fmt.Errorf("Failed to get metadata: %s", err) 153 | } 154 | 155 | // If there is no marshal topic, then we can't run. The admins must go create the topic 156 | // before they can use this library. Please see the README. 157 | c.partitions = c.getTopicPartitions(MarshalTopic) 158 | if c.partitions == 0 { 159 | return nil, errors.New("Marshalling topic not found. Please see the documentation.") 160 | } 161 | 162 | // Now we start a goroutine to start consuming each of the partitions in the marshal 163 | // topic. Note that this doesn't handle increasing the partition count on that topic 164 | // without stopping all consumers. 165 | c.rationalizers.Add(c.partitions) 166 | for id := 0; id < c.partitions; id++ { 167 | go c.rationalize(id, c.kafkaConsumerChannel(id)) 168 | } 169 | 170 | // A jitter calculator, just fills a channel with random numbers so that other 171 | // people don't have to build their own random generator. It is important that 172 | // these values be somewhat less than the HeartbeatInterval as we use this for 173 | // jittering our heartbeats. 174 | go func() { 175 | rnd := rand.New(rand.NewSource(time.Now().UnixNano())) 176 | for { 177 | jitter := rnd.Intn(HeartbeatInterval/2) + (HeartbeatInterval / 4) 178 | c.jitters <- time.Duration(jitter) * time.Second 179 | } 180 | }() 181 | 182 | // Now start the metadata refreshing goroutine 183 | go func() { 184 | for !c.Terminated() { 185 | time.Sleep(<-c.jitters) 186 | log.Infof("[%s] Refreshing topic metadata.", c.name) 187 | c.refreshMetadata() 188 | 189 | // See if the number of partitions in the marshal topic changed. This is bad if 190 | // it happens, since it means we can no longer coordinate correctly. 191 | if c.getTopicPartitions(MarshalTopic) != c.partitions { 192 | log.Errorf("[%s] Marshal topic partition count changed. 
Terminating!", c.name) 193 | c.Terminate() 194 | } 195 | } 196 | }() 197 | 198 | // Wait for all rationalizers to come alive 199 | log.Infof("[%s] Waiting for all rationalizers to come alive.", c.name) 200 | c.rationalizers.Wait() 201 | log.Infof("[%s] All rationalizers alive, KafkaCluster now alive.", c.name) 202 | 203 | return c, nil 204 | } 205 | 206 | // NewMarshaler creates a Marshaler off of an existing cluster. This is more efficient 207 | // if you're creating multiple instances, since they can share the same underlying cluster. 208 | func (c *KafkaCluster) NewMarshaler(clientID, groupID string) (*Marshaler, error) { 209 | if c.Terminated() { 210 | return nil, errors.New("Cluster is terminated.") 211 | } 212 | 213 | // Get offset coordinator so we can look up (and save) committed offsets later. 214 | coordinator, err := c.getOffsetCoordinator(groupID) 215 | if err != nil { 216 | return nil, err 217 | } 218 | 219 | m := &Marshaler{ 220 | quit: new(int32), 221 | cluster: c, 222 | instanceID: newInstanceID(), 223 | clientID: clientID, 224 | groupID: groupID, 225 | offsets: coordinator, 226 | lock: &sync.RWMutex{}, 227 | } 228 | 229 | c.lock.Lock() 230 | defer c.lock.Unlock() 231 | 232 | // This is a bit of hack, see note in KafkaCluster::Terminate. 233 | if c.marshalers == nil { 234 | return nil, errors.New("Cluster is terminated (marshalers is nil).") 235 | } 236 | 237 | // Remove any dead marshalers from our slice and add the new one 238 | filtered := make([]*Marshaler, 0) 239 | for _, marshaler := range c.marshalers { 240 | if !marshaler.Terminated() { 241 | filtered = append(filtered, marshaler) 242 | } 243 | } 244 | filtered = append(filtered, m) 245 | c.marshalers = filtered 246 | 247 | return m, nil 248 | } 249 | 250 | // refreshMetadata is periodically used to update our internal state with topic information 251 | // about the world. 252 | func (c *KafkaCluster) refreshMetadata() error { 253 | md, err := c.broker.Metadata() 254 | if err != nil { 255 | return err 256 | } 257 | 258 | newTopics := make(map[string]int) 259 | for _, topic := range md.Topics { 260 | newTopics[topic.Name] = len(topic.Partitions) 261 | } 262 | 263 | c.lock.Lock() 264 | defer c.lock.Unlock() 265 | c.topics = newTopics 266 | return nil 267 | } 268 | 269 | // getOffsetCoordinator returns a kafka.OffsetCoordinator for a specific group. 270 | func (c *KafkaCluster) getOffsetCoordinator(groupID string) (kafka.OffsetCoordinator, error) { 271 | return c.broker.OffsetCoordinator( 272 | kafka.NewOffsetCoordinatorConf(groupID)) 273 | } 274 | 275 | // getClaimPartition calculates which partition a topic should use for coordination. This uses 276 | // a hashing function (non-cryptographic) to predictably partition the topic space. 277 | func (c *KafkaCluster) getClaimPartition(topicName string) int { 278 | // We use MD5 because it's a fast and good hashing algorithm and we don't need cryptographic 279 | // properties. We then take the first 8 bytes and treat them as a uint64 and modulo that 280 | // across how many partitions we have. 281 | hash := md5.Sum([]byte(topicName)) 282 | uval := binary.LittleEndian.Uint64(hash[0:8]) 283 | return int(uval % uint64(c.partitions)) 284 | } 285 | 286 | // getGroupState returns the map of topics to topicState objects for a group. 
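// Lookups take the read lock first and only upgrade to the write lock (re-checking before
// creating) when the group is missing; getTopicState and getPartitionState below follow the
// same check-then-upgrade pattern.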
287 | func (c *KafkaCluster) getGroupState(groupID string) map[string]*topicState { 288 | // Read lock check 289 | c.lock.RLock() 290 | if group, ok := c.groups[groupID]; ok { 291 | c.lock.RUnlock() 292 | return group 293 | } 294 | c.lock.RUnlock() 295 | 296 | // Failed, write lock check and possible create 297 | c.lock.Lock() 298 | defer c.lock.Unlock() 299 | 300 | if group, ok := c.groups[groupID]; ok { 301 | return group 302 | } 303 | c.groups[groupID] = make(map[string]*topicState) 304 | return c.groups[groupID] 305 | } 306 | 307 | // getTopicState returns a topicState for a given topic. 308 | func (c *KafkaCluster) getTopicState(groupID, topicName string) *topicState { 309 | group := c.getGroupState(groupID) 310 | 311 | // Read lock check 312 | c.lock.RLock() 313 | if topic, ok := group[topicName]; ok { 314 | c.lock.RUnlock() 315 | return topic 316 | } 317 | c.lock.RUnlock() 318 | 319 | // Write lock check and possible create 320 | c.lock.Lock() 321 | defer c.lock.Unlock() 322 | 323 | if topic, ok := group[topicName]; ok { 324 | return topic 325 | } 326 | group[topicName] = &topicState{ 327 | claimPartition: c.getClaimPartition(topicName), 328 | partitions: nil, 329 | lock: &sync.RWMutex{}, 330 | } 331 | return group[topicName] 332 | } 333 | 334 | // getPartitionState returns a topicState and possibly creates it and the partition state within 335 | // the State. 336 | func (c *KafkaCluster) getPartitionState(groupID, topicName string, partID int) *topicState { 337 | // Get topic and lock it so we can update it if needed 338 | topic := c.getTopicState(groupID, topicName) 339 | 340 | // Read lock check 341 | topic.lock.RLock() 342 | if len(topic.partitions) > partID { 343 | topic.lock.RUnlock() 344 | return topic 345 | } 346 | topic.lock.RUnlock() 347 | 348 | // Must upgrade, looks like we need a new partition 349 | topic.lock.Lock() 350 | defer topic.lock.Unlock() 351 | 352 | if len(topic.partitions) < partID+1 { 353 | for i := len(topic.partitions); i <= partID; i++ { 354 | topic.partitions = append(topic.partitions, PartitionClaim{}) 355 | } 356 | } 357 | return topic 358 | } 359 | 360 | // getTopics returns the list of known topics. 361 | func (c *KafkaCluster) getTopics() []string { 362 | c.lock.RLock() 363 | defer c.lock.RUnlock() 364 | 365 | topics := make([]string, 0, len(c.topics)) 366 | for topic := range c.topics { 367 | topics = append(topics, topic) 368 | } 369 | return topics 370 | } 371 | 372 | // getTopicPartitions returns the count of how many partitions are in a given topic. Returns 0 if a 373 | // topic is unknown. 374 | func (c *KafkaCluster) getTopicPartitions(topicName string) int { 375 | c.lock.RLock() 376 | defer c.lock.RUnlock() 377 | 378 | count, _ := c.topics[topicName] 379 | return count 380 | } 381 | 382 | // removeMarshal removes a terminated Marshal from a cluster's list. 383 | func (c *KafkaCluster) removeMarshal(m *Marshaler) { 384 | c.lock.Lock() 385 | defer c.lock.Unlock() 386 | 387 | for i, ml := range c.marshalers { 388 | if ml == m { 389 | c.marshalers = append(c.marshalers[:i], c.marshalers[i+1:]...) 390 | break 391 | } 392 | } 393 | } 394 | 395 | // waitForRsteps is used by the test suite to ask the rationalizer to wait until some number 396 | // of events have been processed. This also returns the current rsteps when it returns. 
397 | func (c *KafkaCluster) waitForRsteps(steps int) (int, error) { 398 | cancel := make(chan struct{}) 399 | result := make(chan int) 400 | go func() { 401 | for { 402 | select { 403 | case <-cancel: 404 | break 405 | default: 406 | cval := atomic.LoadInt32(c.rsteps) 407 | if cval >= int32(steps) { 408 | result <- int(cval) 409 | } 410 | time.Sleep(5 * time.Millisecond) 411 | } 412 | } 413 | }() 414 | 415 | select { 416 | case res := <-result: 417 | return res, nil 418 | case <-time.After(3 * time.Second): 419 | close(cancel) 420 | return 0, errors.New("Timed out waiting for steps") 421 | } 422 | } 423 | 424 | // pauseConsumerGroup stores an expiry time for consumer groups that we'd like to pause. 425 | func (c *KafkaCluster) pauseConsumerGroup(groupID string, adminID string, expiry time.Time) { 426 | c.lock.Lock() 427 | defer c.lock.Unlock() 428 | 429 | log.Warningf("Cluster marking group %s paused with expiry: %s", groupID, expiry.Format(time.UnixDate)) 430 | c.pausedGroups[groupID] = expiry 431 | } 432 | 433 | // IsGroupPaused returns true if the given consumer group is paused. 434 | // TODO(pihu) This just checks the expiry time, and not the admin ID. 435 | func (c *KafkaCluster) IsGroupPaused(groupID string) bool { 436 | c.lock.RLock() 437 | defer c.lock.RUnlock() 438 | 439 | if res, ok := c.pausedGroups[groupID]; !ok { 440 | return false 441 | } else { 442 | return time.Now().Before(res) 443 | } 444 | } 445 | 446 | // Terminate is called when we're done with the marshaler and want to shut down. 447 | func (c *KafkaCluster) Terminate() { 448 | if !atomic.CompareAndSwapInt32(c.quit, 0, 1) { 449 | return 450 | } 451 | 452 | log.Infof("[%s] beginning termination", c.name) 453 | 454 | // This is a bit of a hack, but because marshaler.terminateAndCleanup requires the read lock 455 | // on c, we can't terminate the Marshalers in the list while holding the write lock. 456 | // Because KafkaCluster::NewMarshaler will return an error if the marshalers slice is nil, 457 | // we know there cannot be new Marshalers created which aren't included in the local slice 458 | // we create here. 459 | // 460 | // There is probably some alternative where the quit variable is protected by the mutex instead 461 | // of being atomic, but this seems somewhat cleaner in case some future refactoring eliminates 462 | // the marshalers slice entirely this hack will go away automatically. 463 | c.lock.Lock() 464 | marshalers := c.marshalers 465 | c.marshalers = nil 466 | c.lock.Unlock() 467 | 468 | // Terminate all Marshalers which will in turn terminate all Consumers and 469 | // let everybody know we're all done. 470 | for _, marshaler := range marshalers { 471 | marshaler.terminateAndCleanup(false) 472 | } 473 | } 474 | 475 | // Terminated returns whether or not we have been terminated. 476 | func (c *KafkaCluster) Terminated() bool { 477 | return atomic.LoadInt32(c.quit) == 1 478 | } 479 | -------------------------------------------------------------------------------- /marshal/claim.go: -------------------------------------------------------------------------------- 1 | /* 2 | * portal - marshal 3 | * 4 | * a library that implements an algorithm for doing consumer coordination within Kafka, rather 5 | * than using Zookeeper or another external system. 
6 | * 7 | */ 8 | 9 | package marshal 10 | 11 | import ( 12 | "fmt" 13 | "math/rand" 14 | "sort" 15 | "sync" 16 | "sync/atomic" 17 | "time" 18 | 19 | "github.com/zorkian/kafka" 20 | "github.com/zorkian/kafka/proto" 21 | "github.com/jpillora/backoff" 22 | ) 23 | 24 | // int64slice is for sorting. 25 | type int64slice []int64 26 | 27 | func (a int64slice) Len() int { return len(a) } 28 | func (a int64slice) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 29 | func (a int64slice) Less(i, j int) bool { return a[i] < a[j] } 30 | 31 | // claim is instantiated for each partition "claim" we have. This type is responsible for 32 | // pulling data from Kafka and managing its cursors, heartbeating as necessary, and health 33 | // checking itself. 34 | type claim struct { 35 | // These items are read-only. They are never changed after the object is created, 36 | // so access to these may be done without the lock. 37 | topic string 38 | partID int 39 | 40 | // lock protects all access to the member variables of this struct except for the 41 | // messages channel, which can be read from or written to without holding the lock. 42 | // Additionally the stopChan can be used. 43 | lock *sync.RWMutex 44 | messagesLock *sync.Mutex 45 | offsets PartitionOffsets 46 | marshal *Marshaler 47 | consumer *Consumer 48 | rand *rand.Rand 49 | terminated *int32 50 | beatCounter int32 51 | lastHeartbeat int64 52 | lastMessageTime time.Time 53 | options ConsumerOptions 54 | kafkaConsumer kafka.Consumer 55 | messages chan *Message 56 | stopChan chan struct{} 57 | doneChan chan struct{} 58 | 59 | // tracking is a dict that maintains information about offsets that have been 60 | // sent to and acknowledged by clients. An offset is inserted into this map when 61 | // we insert it into the message queue, and when it is committed we record an update 62 | // saying so. This map is pruned during the heartbeats. 63 | tracking map[int64]bool 64 | outstandingMessages int 65 | 66 | // Number of heartbeat cycles this claim has been lagging, i.e., consumption is going 67 | // too slowly (defined as being behind by more than 2 heartbeat cycles) 68 | cyclesBehind int 69 | 70 | // History arrays used for calculating average velocity for health checking. 71 | offsetCurrentHistory [10]int64 72 | offsetLatestHistory [10]int64 73 | } 74 | 75 | // newClaim returns an internal claim object, used by the consumer to manage the 76 | // claim of a single partition. It is up to the caller to ensure healthCheckLoop gets 77 | // called in a goroutine. If you do not, the claim will die from failing to heartbeat 78 | // after a short period. 79 | func newClaim(topic string, partID int, marshal *Marshaler, consumer *Consumer, 80 | messages chan *Message, options ConsumerOptions) *claim { 81 | 82 | // Get all available offset information 83 | offsets, err := marshal.GetPartitionOffsets(topic, partID) 84 | if err != nil { 85 | log.Errorf("[%s:%d] failed to get offsets: %s", topic, partID, err) 86 | return nil 87 | } 88 | log.Debugf("[%s:%d] consumer offsets: early = %d, cur/comm = %d/%d, late = %d", 89 | topic, partID, offsets.Earliest, offsets.Current, offsets.Committed, offsets.Latest) 90 | 91 | // For offsets, we strictly prefer the contents of the MarshalTopic and will use that 92 | // if present. If we don't have that data, then we'll fall back to the Kafka committed 93 | // offsets. Failing that we'll start at the beginning of the partition. 
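	// In short, the precedence is: Marshal's own offset (Current), then the Kafka
	// consumer-group committed offset, then Earliest. For example, with Current=0,
	// Committed=250 and Earliest=100, the claim below starts consuming at 250.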
94 | if offsets.Current > 0 { 95 | // Ideal case, we just use the Marshal offset that is already set 96 | } else if offsets.Committed > 0 { 97 | log.Infof("[%s:%d] no Marshal offset found, using committed offset %d", 98 | topic, partID, offsets.Committed) 99 | offsets.Current = offsets.Committed 100 | } else { 101 | log.Infof("[%s:%d] no Marshal or committed offset found, using earliest offset %d", 102 | topic, partID, offsets.Earliest) 103 | offsets.Current = offsets.Earliest 104 | } 105 | 106 | // Construct object and set it up 107 | obj := &claim{ 108 | lock: &sync.RWMutex{}, 109 | messagesLock: &sync.Mutex{}, 110 | stopChan: make(chan struct{}), 111 | doneChan: make(chan struct{}), 112 | marshal: marshal, 113 | consumer: consumer, 114 | topic: topic, 115 | partID: partID, 116 | terminated: new(int32), 117 | offsets: offsets, 118 | messages: messages, 119 | options: options, 120 | tracking: make(map[int64]bool), 121 | rand: rand.New(rand.NewSource(time.Now().UnixNano())), 122 | lastMessageTime: time.Now(), 123 | } 124 | 125 | // Now try to actually claim it, this can block a while 126 | log.Infof("[%s:%d] consumer attempting to claim", topic, partID) 127 | if !marshal.ClaimPartition(topic, partID) { 128 | log.Infof("[%s:%d] consumer failed to claim", topic, partID) 129 | return nil 130 | } 131 | 132 | // If that worked, kick off the main setup loop and return 133 | obj.setup() 134 | return obj 135 | } 136 | 137 | // setup is the initial worker that initializes the claim structure. Until this is done, 138 | // our internal state is inconsistent. 139 | func (c *claim) setup() { 140 | c.lock.Lock() 141 | defer c.lock.Unlock() 142 | 143 | // Of course, if the current offset is greater than the earliest, we must reset 144 | // to the earliest known 145 | if c.offsets.Current < c.offsets.Earliest { 146 | log.Warningf("[%s:%d] consumer fast-forwarding from %d to %d", 147 | c.topic, c.partID, c.offsets.Current, c.offsets.Earliest) 148 | c.offsets.Current = c.offsets.Earliest 149 | } 150 | 151 | // Since it's claimed, we now want to heartbeat with the last seen offset 152 | err := c.marshal.Heartbeat(c.topic, c.partID, c.offsets.Current) 153 | if err != nil { 154 | log.Errorf("[%s:%d] consumer failed to heartbeat: %s", c.topic, c.partID, err) 155 | go c.Release() 156 | return 157 | } 158 | c.lastHeartbeat = time.Now().Unix() 159 | 160 | // Set up Kafka consumer 161 | consumerConf := kafka.NewConsumerConf(c.topic, int32(c.partID)) 162 | consumerConf.StartOffset = c.offsets.Current 163 | consumerConf.MaxFetchSize = c.marshal.cluster.options.MaxMessageSize 164 | consumerConf.RequestTimeout = c.marshal.cluster.options.ConsumeRequestTimeout 165 | // Do not retry. If we get back no data, we'll do our own retries. 166 | consumerConf.RetryLimit = 0 167 | 168 | kafkaConsumer, err := c.marshal.cluster.broker.Consumer(consumerConf) 169 | if err != nil { 170 | log.Errorf("[%s:%d] consumer failed to create Kafka Consumer: %s", 171 | c.topic, c.partID, err) 172 | go c.Release() 173 | return 174 | } 175 | c.kafkaConsumer = kafkaConsumer 176 | 177 | // Start our maintenance goroutines that keep this system healthy 178 | go c.messagePump() 179 | 180 | // Totally done, let the world know and move on 181 | log.Infof("[%s:%d] consumer %s claimed at offset %d (is %d behind)", 182 | c.topic, c.partID, c.marshal.clientID, c.offsets.Current, c.offsets.Latest-c.offsets.Current) 183 | } 184 | 185 | // Commit is called by a Consumer class when the client has indicated that it has finished 186 | // processing a message. 
This updates our tracking structure so the heartbeat knows how 187 | // far ahead it can move our offset. 188 | func (c *claim) Commit(offset int64) error { 189 | if c.Terminated() { 190 | return fmt.Errorf("[%s:%d] is no longer claimed; can't commit offset %d", 191 | c.topic, c.partID, offset) 192 | } 193 | 194 | c.lock.Lock() 195 | defer c.lock.Unlock() 196 | 197 | _, ok := c.tracking[offset] 198 | if !ok { 199 | // This is bogus; committing an offset we've never seen? 200 | return fmt.Errorf("[%s:%d] committing offset %d but we've never seen it", 201 | c.topic, c.partID, offset) 202 | } 203 | c.tracking[offset] = true 204 | c.outstandingMessages-- 205 | return nil 206 | } 207 | 208 | // Terminated returns whether the consumer has terminated the Claim. The claim may or may NOT 209 | // remain claimed depending on whether it was released or not. 210 | func (c *claim) Terminated() bool { 211 | return atomic.LoadInt32(c.terminated) == 1 212 | } 213 | 214 | // GetCurrentLag returns this partition's cursor lag. 215 | func (c *claim) GetCurrentLag() int64 { 216 | c.lock.RLock() 217 | defer c.lock.RUnlock() 218 | 219 | if c.offsets.Current < c.offsets.Latest { 220 | return c.offsets.Latest - c.offsets.Current 221 | } 222 | return 0 223 | } 224 | 225 | // Flush will write updated offsets to Kafka immediately if we have any outstanding offset 226 | // updates to write. If not, this is a relatively quick no-op. 227 | func (c *claim) Flush() error { 228 | // By definition a terminated claim has already flushed anything it can flush 229 | // or we've lost the lock so there's nothing we can do. It's not an error. 230 | if c.Terminated() { 231 | return nil 232 | } 233 | 234 | // This is technically a racey design, but the worst case is that we 235 | // will write out two correct heartbeats which is fine. 236 | didAdvance, currentOffset := c.updateCurrentOffsets() 237 | if !didAdvance { 238 | // Current offset did not advance 239 | return nil 240 | } 241 | 242 | // Now heartbeat this value and update our heartbeat time 243 | if err := c.marshal.Heartbeat(c.topic, c.partID, currentOffset); err != nil { 244 | go c.Release() 245 | return fmt.Errorf("[%s:%d] failed to flush, releasing: %s", c.topic, c.partID, err) 246 | } 247 | return nil 248 | } 249 | 250 | // Release will invoke commit offsets and release the Kafka partition. After calling Release, 251 | // consumer cannot consume messages anymore. 252 | // Does not return until the message pump has exited and the release has finished. 253 | func (c *claim) Release() bool { 254 | return c.teardown(true) 255 | } 256 | 257 | // Terminate will invoke commit offsets, terminate the claim, but does NOT release the partition. 258 | // Does not return until the message pump has exited and termination has finished. 259 | func (c *claim) Terminate() bool { 260 | return c.teardown(false) 261 | } 262 | 263 | // teardown handles releasing the claim or just updating our offsets for a fast restart. 264 | func (c *claim) teardown(releasePartition bool) bool { 265 | if !atomic.CompareAndSwapInt32(c.terminated, 0, 1) { 266 | <-c.doneChan 267 | return false 268 | } 269 | 270 | // Kill the stopchan now which is a useful way of knowing we're quitting within selects 271 | close(c.stopChan) 272 | 273 | // need to serialize access to the messages channel. 
We should not release if the message pump 274 | // is about to write to the consumer channel 275 | c.messagesLock.Lock() 276 | defer c.messagesLock.Unlock() 277 | 278 | // Let's update current offset internally to the last processed 279 | _, currentOffset := c.updateCurrentOffsets() 280 | 281 | // Advise the consumer that this claim is terminating, this is so that the consumer 282 | // can release other claims if we've lost part of a topic 283 | if c.consumer != nil { 284 | go c.consumer.claimTerminated(c, releasePartition) 285 | } 286 | 287 | var err error 288 | if releasePartition { 289 | log.Infof("[%s:%d] releasing partition claim", c.topic, c.partID) 290 | err = c.marshal.ReleasePartition(c.topic, c.partID, currentOffset) 291 | } else { 292 | // We're not releasing but we do want to update our offsets to the latest value 293 | // we know about, so issue a gratuitous heartbeat 294 | err = c.marshal.Heartbeat(c.topic, c.partID, currentOffset) 295 | } 296 | 297 | // Wait for messagePump to exit 298 | <-c.doneChan 299 | 300 | if err != nil { 301 | log.Errorf("[%s:%d] failed to release: %s", c.topic, c.partID, err) 302 | return false 303 | } 304 | return true 305 | } 306 | 307 | // messagePump continuously pulls message from Kafka for this partition and makes them 308 | // available for consumption. 309 | func (c *claim) messagePump() { 310 | // When the pump exits we close the doneChan so people can know when it's not 311 | // possible for the pump to be running 312 | defer close(c.doneChan) 313 | 314 | // This method MUST NOT make changes to the claim structure. Since we might 315 | // be running while someone else has the lock, and we can't get it ourselves, we are 316 | // forbidden to touch anything other than the consumer and the message channel. 317 | retry := &backoff.Backoff{Min: 10 * time.Millisecond, Max: 1 * time.Second, Jitter: true} 318 | for !c.Terminated() { 319 | msg, err := c.kafkaConsumer.Consume() 320 | if err == proto.ErrOffsetOutOfRange { 321 | // Fell out of range, presumably because we're handling this too slow, so 322 | // let's abandon this claim 323 | log.Warningf("[%s:%d] error consuming: out of range, abandoning partition", 324 | c.topic, c.partID) 325 | go c.Release() 326 | return 327 | } else if err == kafka.ErrNoData { 328 | // No data, just loop; if we're stuck receiving no data for too long the healthcheck 329 | // will start failing 330 | time.Sleep(retry.Duration()) 331 | continue 332 | } else if err != nil { 333 | log.Errorf("[%s:%d] error consuming: %s", c.topic, c.partID, err) 334 | 335 | // Often a consumption error is caused by data going away, such as if we're consuming 336 | // from the head and Kafka has deleted the data. In that case we need to wait for 337 | // the next offset update, so let's not go crazy 338 | time.Sleep(1 * time.Second) 339 | continue 340 | } 341 | retry.Reset() 342 | 343 | // Briefly get the lock to update our tracking map... I wish there were 344 | // goroutine safe maps in Go. 345 | c.lock.Lock() 346 | c.lastMessageTime = time.Now() 347 | c.tracking[msg.Offset] = false 348 | c.outstandingMessages++ 349 | if msg.Offset < c.offsets.Current { 350 | log.Errorf("[%s:%d] just consumed offset %d earlier than current %d", 351 | c.topic, c.partID, msg.Offset, c.offsets.Current) 352 | } 353 | c.lock.Unlock() 354 | 355 | // Push the message down to the client (this bypasses the Consumer) 356 | // We should NOT write to the consumer channel if the claim is no longer claimed. 
This 357 | // needs to be serialized with Release, otherwise a race-condition can potentially 358 | // lead to a write to a closed-channel. That's why we're using this lock. We're not 359 | // using the main lock to avoid deadlocks since the write to the channel is blocking 360 | // until someone consumes the message blocking all Commit operations. 361 | // 362 | // This must not block -- if we hold the messagesLock for too long we will cause 363 | // possible deadlocks. 364 | c.messagesLock.Lock() 365 | if !c.Terminated() { 366 | // This allocates a new Message to put the proto.Message in. 367 | // TODO: This is really annoying and probably stupidly inefficient, is there any 368 | // way to do this better? 369 | tmp := Message(*msg) 370 | select { 371 | case c.messages <- &tmp: 372 | // Message successfully delivered to queue 373 | case <-c.stopChan: 374 | // Claim is terminated, the message will go nowhere 375 | } 376 | } 377 | c.messagesLock.Unlock() 378 | } 379 | log.Debugf("[%s:%d] no longer claimed, pump exiting", c.topic, c.partID) 380 | } 381 | 382 | // heartbeat is the internal "send a heartbeat" function. Calling this will immediately 383 | // send a heartbeat to Kafka. If we fail to send a heartbeat, we will release the 384 | // partition. 385 | func (c *claim) heartbeat() bool { 386 | // Unclaimed partitions don't heartbeat. 387 | if c.Terminated() { 388 | return false 389 | } 390 | 391 | // Lock held because we use c.offsets and update c.lastHeartbeat below 392 | c.lock.Lock() 393 | defer c.lock.Unlock() 394 | 395 | // Now heartbeat this value and update our heartbeat time 396 | err := c.marshal.Heartbeat(c.topic, c.partID, c.offsets.Current) 397 | if err != nil { 398 | log.Errorf("[%s:%d] failed to heartbeat, releasing: %s", c.topic, c.partID, err) 399 | go c.Release() 400 | } 401 | 402 | log.Infof("[%s:%d] heartbeat: Current offset is %d, partition offset range is %d..%d.", 403 | c.topic, c.partID, c.offsets.Current, c.offsets.Earliest, c.offsets.Latest) 404 | log.Infof("[%s:%d] heartbeat: There are %d messages in queue and %d messages outstanding.", 405 | c.topic, c.partID, len(c.messages), c.outstandingMessages) 406 | c.lastHeartbeat = time.Now().Unix() 407 | return true 408 | } 409 | 410 | // updateCurrentOffsets updates the current offsets so that a Commit/Heartbeat can pick up the 411 | // latest offsets. Returns true if we advanced our current offset, false if there was no 412 | // change. Also returns the latest current offset. 413 | func (c *claim) updateCurrentOffsets() (bool, int64) { 414 | c.lock.Lock() 415 | defer c.lock.Unlock() 416 | 417 | // Get the sorted set of offsets 418 | offsets := make(int64slice, 0, len(c.tracking)) 419 | for key := range c.tracking { 420 | offsets = append(offsets, key) 421 | } 422 | sort.Sort(offsets) 423 | 424 | // Now iterate the offsets bottom up and increment our current offset until we 425 | // see the first uncommitted offset (oldest message) 426 | didAdvance := false 427 | for _, offset := range offsets { 428 | if !c.tracking[offset] { 429 | break 430 | } 431 | // Remember current is always "last committed + 1", see the docs on 432 | // PartitionOffset for a reminder. 
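		// Worked example (it matches TestCommit): with Current=0 and tracking
		// {0: committed, 1: uncommitted, 2: committed}, this loop advances Current
		// to 1 and then stops at offset 1; offset 2 stays tracked until offset 1 is
		// committed, after which Current can jump to 3.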
433 | didAdvance = true 434 | if offset+1 <= c.offsets.Current { 435 | log.Errorf("[%s:%d] rewinding current offset from %d to %d", 436 | c.topic, c.partID, c.offsets.Current, offset+1) 437 | } 438 | c.offsets.Current = offset + 1 439 | delete(c.tracking, offset) 440 | } 441 | 442 | // If we end up with more than a queue of outstanding messages, then something is 443 | // probably broken in the implementation... since that will cause us to grow 444 | // forever in memory, let's alert the user 445 | if len(c.tracking) > c.marshal.cluster.options.MaxMessageQueue { 446 | log.Errorf("[%s:%d] has %d uncommitted offsets. You must call Commit.", 447 | c.topic, c.partID, len(c.tracking)) 448 | } 449 | return didAdvance, c.offsets.Current 450 | } 451 | 452 | // heartbeatExpired returns whether or not our last successful heartbeat is so 453 | // long ago that we know we're expired. 454 | func (c *claim) heartbeatExpired() bool { 455 | c.lock.RLock() 456 | defer c.lock.RUnlock() 457 | 458 | return c.lastHeartbeat < time.Now().Unix()-HeartbeatInterval 459 | } 460 | 461 | // healthCheck performs a single health check against the claim. If we have failed 462 | // too many times, this will also start a partition release. Returns true if the 463 | // partition is healthy, else false. 464 | func (c *claim) healthCheck() bool { 465 | // Unclaimed partitions aren't healthy. 466 | if c.Terminated() { 467 | return false 468 | } 469 | 470 | // Get velocities; these functions both use the locks so we have to do this before 471 | // we personally take the lock (to avoid deadlock) 472 | consumerVelocity := c.ConsumerVelocity() 473 | partitionVelocity := c.PartitionVelocity() 474 | 475 | // If our heartbeat is expired, we are definitely unhealthy... don't even bother 476 | // with checking velocity 477 | if c.heartbeatExpired() { 478 | log.Warningf("[%s:%d] consumer unhealthy by heartbeat test, releasing", 479 | c.topic, c.partID) 480 | go c.Release() 481 | return false 482 | } 483 | 484 | // If the consumer group owning this claim is paused, we must release this claim. 485 | if c.marshal.cluster.IsGroupPaused(c.marshal.GroupID()) { 486 | log.Infof("[%s:%d] consumer group %s is paused, claim releasing", 487 | c.topic, c.partID, c.marshal.GroupID()) 488 | go c.Release() 489 | return false 490 | } 491 | 492 | // Take the lock below here as we are reading protected values on c and we're 493 | // writing to c.cyclesBehind 494 | c.lock.Lock() 495 | defer c.lock.Unlock() 496 | 497 | // If we haven't seen any messages for more than a heartbeat interval, it's possible 498 | // we've gotten into a bad state. Make a check to see how far behind we are, if we 499 | // are behind and not seeing any messages then release. 500 | if time.Now().After(c.lastMessageTime.Add(HeartbeatInterval * time.Second)) { 501 | if c.options.ReleaseClaimsIfBehind && consumerVelocity == 0 && (partitionVelocity > 0 || c.offsets.Latest > c.offsets.Current) { 502 | // If that's true then it means velocity has been 0 for at least long enough 503 | // to drive the average to 0, which means about 10 heartbeat cycles. This is 504 | // long enough that releasing seems fine. 
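			// (TestReleaseIfWedged and TestReleaseIfWedged2 exercise this branch: CV
			// averages zero over the window while either PV is positive or Latest is
			// ahead of Current, i.e. data exists but we are stuck.)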
505 | 			log.Warningf("[%s:%d] no messages received for %d seconds with CV=%0.2f PV=%0.2f, releasing",
506 | 				c.topic, c.partID, HeartbeatInterval, consumerVelocity, partitionVelocity)
507 | 			go c.Release()
508 | 			return false
509 | 		} else {
510 | 			log.Infof("[%s:%d] no messages received for %d seconds with CV=%0.2f PV=%0.2f",
511 | 				c.topic, c.partID, HeartbeatInterval, consumerVelocity, partitionVelocity)
512 | 		}
513 | 	}
514 | 
515 | 	// In topic claim mode we don't do any velocity checking. It's up to the consumer
516 | 	// to ensure they're claiming. TODO: Unclear if this is correct or not.
517 | 	if c.options.ClaimEntireTopic || !c.options.ReleaseClaimsIfBehind {
518 | 		return true
519 | 	}
520 | 
521 | 	// We consider a consumer to be caught up if the predicted offset is past the end
522 | 	// of the partition. This takes into account the fact that we only get offset information
523 | 	// every heartbeat, so we could have some stale data.
524 | 	testOffset := c.offsets.Current + int64(consumerVelocity*2)
525 | 	if testOffset >= c.offsets.Latest {
526 | 		c.cyclesBehind = 0
527 | 		return true
528 | 	}
529 | 
530 | 	// At this point we know the consumer is NOT PRESENTLY caught up or predicted to catch
531 | 	// up in the next two heartbeats.
532 | 
533 | 	// If the consumer is moving strictly faster than the partition (which also means it is
534 | 	// actually making progress), consider it healthy. This is the standard
535 | 	// catching-up-from-behind case.
536 | 	if partitionVelocity < consumerVelocity {
537 | 		log.Infof("[%s:%d] consumer catching up: consume ∆ %0.2f >= produce ∆ %0.2f",
538 | 			c.topic, c.partID, consumerVelocity, partitionVelocity)
539 | 		c.cyclesBehind = 0
540 | 		return true
541 | 	}
542 | 
543 | 	// Unhealthy, so increase the cycle count so we know when it has been unhealthy for
544 | 	// too long and should be given up
545 | 	c.cyclesBehind++
546 | 
547 | 	// If we're behind by too many cycles, then we should try to release the
548 | 	// partition. If so, do this in a goroutine since it will involve calling out
549 | 	// to Kafka and releasing the partition.
550 | 	if c.cyclesBehind >= 3 {
551 | 		log.Warningf("[%s:%d] consumer unhealthy for too long, releasing",
552 | 			c.topic, c.partID)
553 | 		go c.Release()
554 | 		return false
555 | 	}
556 | 
557 | 	// Clearly we haven't been behind for long enough, so we're still "healthy"
558 | 	log.Warningf("[%s:%d] consumer too slow: consume ∆ %0.2f < produce ∆ %0.2f (warning #%d)",
559 | 		c.topic, c.partID, consumerVelocity, partitionVelocity, c.cyclesBehind)
560 | 	return true
561 | }
562 | 
563 | // healthCheckLoop runs regularly and will perform a health check. Exits when this claim
564 | // has been terminated.
565 | func (c *claim) healthCheckLoop() {
566 | 	time.Sleep(<-c.marshal.cluster.jitters)
567 | 	for !c.Terminated() {
568 | 		// Attempt to update offsets; if this fails we want to retry more quickly than
569 | 		// the jitter interval, so we can make a few attempts before the heartbeat
570 | 		// expires and we give up.
571 | for !c.heartbeatExpired() { 572 | if err := c.updateOffsets(); err != nil { 573 | log.Errorf("[%s:%d] health check loop failed to update offsets: %s", 574 | c.topic, c.partID, err) 575 | time.Sleep(1 * time.Second) 576 | continue 577 | } 578 | break 579 | } 580 | 581 | // Now healthcheck and, if it's good, heartbeat 582 | if c.healthCheck() { 583 | go c.heartbeat() 584 | } 585 | time.Sleep(<-c.marshal.cluster.jitters) 586 | } 587 | log.Infof("[%s:%d] health check loop exiting, claim terminated", 588 | c.topic, c.partID) 589 | } 590 | 591 | // average returns the average of a given slice of int64s. It ignores 0s as 592 | // those are "uninitialized" elements. 593 | func average(vals []int64) float64 { 594 | min, max, ct := int64(0), int64(0), int64(0) 595 | for _, val := range vals { 596 | if val <= 0 { 597 | continue 598 | } 599 | if min == 0 || val < min { 600 | min = val 601 | } 602 | if max == 0 || val > max { 603 | max = val 604 | } 605 | ct++ 606 | } 607 | 608 | if min == max || ct < 2 { 609 | return 0 610 | } 611 | 612 | return float64(max-min) / float64(ct-1) 613 | } 614 | 615 | // ConsumerVelocity returns the average of our consumers' velocity 616 | func (c *claim) ConsumerVelocity() float64 { 617 | c.lock.RLock() 618 | defer c.lock.RUnlock() 619 | 620 | return average(c.offsetCurrentHistory[0:]) 621 | } 622 | 623 | // PartitionVelocity returns the average of the partition's velocity 624 | func (c *claim) PartitionVelocity() float64 { 625 | c.lock.RLock() 626 | defer c.lock.RUnlock() 627 | 628 | return average(c.offsetLatestHistory[0:]) 629 | } 630 | 631 | // updateOffsets will update the offsets of our current partition. 632 | func (c *claim) updateOffsets() error { 633 | // Start by updating our current offsets so even if we fail to get the offsets 634 | // we need to calculate Kafka data, we still move our current offset forward. 635 | c.updateCurrentOffsets() 636 | 637 | // Slow, hits Kafka. Run in a goroutine. 638 | offsets, err := c.marshal.GetPartitionOffsets(c.topic, c.partID) 639 | if err != nil { 640 | log.Errorf("[%s:%d] failed to get offsets: %s", c.topic, c.partID, err) 641 | return err 642 | } 643 | 644 | c.lock.Lock() 645 | defer c.lock.Unlock() 646 | 647 | // Update the earliest/latest offsets that are presently available within the 648 | // partition 649 | c.offsets.Earliest = offsets.Earliest 650 | c.offsets.Latest = offsets.Latest 651 | 652 | // Do update our "history" values, this is used for calculating moving averages 653 | // in the health checking function 654 | c.offsetLatestHistory[c.beatCounter] = offsets.Latest 655 | c.offsetCurrentHistory[c.beatCounter] = c.offsets.Current 656 | 657 | c.beatCounter = (c.beatCounter + 1) % 10 658 | return nil 659 | } 660 | 661 | // numTrackingOffsets returns the size of the tracking dict. 662 | func (c *claim) numTrackingOffsets() int { 663 | c.lock.RLock() 664 | defer c.lock.RUnlock() 665 | 666 | return len(c.tracking) 667 | } 668 | 669 | // PrintState outputs the status of the consumer. 
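// A worked example of the velocity math above (it matches the cases asserted in
// TestVelocity): with an offset history of {1, 21, 21, 0, ...}, average() ignores
// the zero entries, sees min=1 and max=21 across ct=3 samples, and returns
// (21-1)/(3-1) = 10 offsets per heartbeat. Histories with fewer than two non-zero
// samples, or where the offset never moved (min == max), yield 0.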
670 | func (c *claim) PrintState() { 671 | c.lock.RLock() 672 | defer c.lock.RUnlock() 673 | 674 | // "Claimed" status is from Marshal rationalizer, and "Terminated" status is from 675 | // the local claim object (indicates we've exited somehow) 676 | state := "----" 677 | cl := c.marshal.GetPartitionClaim(c.topic, c.partID) 678 | if cl.Claimed() { 679 | if c.Terminated() { 680 | state = "CL+T" 681 | } else { 682 | state = "CLMD" 683 | } 684 | } else if c.Terminated() { 685 | state = "TERM" 686 | } 687 | 688 | ct := 0 689 | for _, st := range c.tracking { 690 | if st { 691 | ct++ 692 | } 693 | } 694 | 695 | now := time.Now().Unix() 696 | 697 | log.Infof(" * %2d [%s]: offsets %d <= %d <= %d | %d", 698 | c.partID, state, c.offsets.Earliest, c.offsets.Current, 699 | c.offsets.Latest, c.offsets.Committed) 700 | log.Infof(" BC %d | LHB %d (%d) | OM %d | CB %d", 701 | c.beatCounter, c.lastHeartbeat, now-c.lastHeartbeat, 702 | c.outstandingMessages, c.cyclesBehind) 703 | log.Infof(" TRACK COMMITTED %d | TRACK OUTSTANDING %d", 704 | ct, len(c.tracking)-ct) 705 | log.Infof(" PV %0.2f | CV %0.2f", 706 | c.PartitionVelocity(), c.ConsumerVelocity()) 707 | } 708 | -------------------------------------------------------------------------------- /marshal/claim_test.go: -------------------------------------------------------------------------------- 1 | package marshal 2 | 3 | import ( 4 | "sync" 5 | "sync/atomic" 6 | "time" 7 | 8 | . "gopkg.in/check.v1" 9 | 10 | "github.com/zorkian/kafka/kafkatest" 11 | "github.com/zorkian/kafka/proto" 12 | ) 13 | 14 | var _ = Suite(&ClaimSuite{}) 15 | 16 | type ClaimSuite struct { 17 | c *C 18 | s *kafkatest.Server 19 | kc *KafkaCluster 20 | m *Marshaler 21 | ch chan *Message 22 | cl *claim 23 | } 24 | 25 | func (s *ClaimSuite) SetUpSuite(c *C) { 26 | ResetTestLogger(c) 27 | 28 | s.s = StartServer() 29 | 30 | opts := NewMarshalOptions() 31 | opts.BrokerConnectionLimit = 10 32 | opts.ConsumeRequestTimeout = 20 * time.Millisecond 33 | opts.MarshalRequestTimeout = 20 * time.Millisecond 34 | opts.MarshalRequestRetryWait = 1 * time.Millisecond 35 | 36 | var err error 37 | s.kc, err = Dial("claimsuite", []string{s.s.Addr()}, opts) 38 | c.Assert(err, IsNil) 39 | } 40 | 41 | func (s *ClaimSuite) SetUpTest(c *C) { 42 | // Give a second for the last test to finish up, this prevents messages from 43 | // releases from going into this test's pool 44 | time.Sleep(1 * time.Second) 45 | 46 | ResetTestLogger(c) 47 | 48 | s.c = c 49 | s.ch = make(chan *Message, 10) 50 | s.s.ResetTopic("test3") 51 | atomic.StoreInt32(s.kc.rsteps, 0) 52 | 53 | var err error 54 | s.m, err = s.kc.NewMarshaler("cl", newInstanceID()) 55 | c.Assert(err, IsNil) 56 | s.cl = newClaim("test3", 0, s.m, nil, s.ch, NewConsumerOptions()) 57 | c.Assert(s.cl, NotNil) 58 | } 59 | 60 | func (s *ClaimSuite) TearDownTest(c *C) { 61 | if s.cl != nil { 62 | s.cl.Release() 63 | } 64 | if s.m != nil { 65 | s.m.Terminate() 66 | } 67 | } 68 | 69 | func (s *ClaimSuite) TearDownSuite(c *C) { 70 | if s.kc != nil { 71 | s.kc.Terminate() 72 | } 73 | if s.s != nil { 74 | s.s.Close() 75 | } 76 | } 77 | 78 | func (s *ClaimSuite) Produce(topicName string, partID int, msgs ...string) int64 { 79 | var protos []*proto.Message 80 | for _, msg := range msgs { 81 | protos = append(protos, &proto.Message{Value: []byte(msg)}) 82 | } 83 | offset, err := s.m.cluster.producer.Produce(topicName, int32(partID), protos...) 
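	// In these tests the returned offset is that of the last message in the batch;
	// producing "m1".."m3" into an empty partition yields offset 2.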
84 | s.c.Assert(err, IsNil) 85 | return offset 86 | } 87 | 88 | func (s *ClaimSuite) WaitForRsteps(c *C, cluster *KafkaCluster, numSteps int) { 89 | steps, err := cluster.waitForRsteps(numSteps) 90 | c.Assert(err, IsNil) 91 | c.Assert(steps, Equals, numSteps) 92 | } 93 | 94 | func (s *ClaimSuite) TestOffsetUpdates(c *C) { 95 | // Test that the updateOffsets function works and updates offsets from Kafka 96 | c.Assert(s.cl.updateOffsets(), IsNil) 97 | c.Assert(s.Produce("test3", 0, "m1", "m2", "m3"), Equals, int64(2)) 98 | c.Assert(s.cl.updateOffsets(), IsNil) 99 | c.Assert(s.cl.offsets.Latest, Equals, int64(3)) 100 | } 101 | 102 | func (s *ClaimSuite) consumeOne(c *C) *Message { 103 | select { 104 | case msg := <-s.ch: 105 | return msg 106 | case <-time.After(3 * time.Second): 107 | c.Error("Timed out consuming a message.") 108 | } 109 | return nil 110 | } 111 | 112 | func (s *ClaimSuite) TestCommit(c *C) { 113 | // Test the commit message flow, ensuring that our offset only gets updated when 114 | // we have properly committed messages 115 | c.Assert(s.Produce("test3", 0, "m1", "m2", "m3", "m4", "m5", "m6"), Equals, int64(5)) 116 | c.Assert(s.cl.updateOffsets(), IsNil) 117 | c.Assert(s.cl.heartbeat(), Equals, true) 118 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 119 | c.Assert(s.cl.offsets.Earliest, Equals, int64(0)) 120 | c.Assert(s.cl.offsets.Latest, Equals, int64(6)) 121 | 122 | // Consume 1, heartbeat... offsets still 0 123 | msg1 := s.consumeOne(c) 124 | c.Assert(msg1.Value, DeepEquals, []byte("m1")) 125 | c.Assert(s.cl.updateOffsets(), IsNil) 126 | c.Assert(s.cl.heartbeat(), Equals, true) 127 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 128 | s.cl.lock.RLock() 129 | c.Assert(s.cl.tracking[0], Equals, false) 130 | s.cl.lock.RUnlock() 131 | 132 | // Consume 2, still 0 133 | msg2 := s.consumeOne(c) 134 | c.Assert(msg2.Value, DeepEquals, []byte("m2")) 135 | c.Assert(s.cl.updateOffsets(), IsNil) 136 | c.Assert(s.cl.heartbeat(), Equals, true) 137 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 138 | 139 | // Commit 1, offset 1 but only after heartbeat phase 140 | c.Assert(s.cl.Commit(msg1.Offset), IsNil) 141 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 142 | c.Assert(s.cl.updateOffsets(), IsNil) 143 | c.Assert(s.cl.heartbeat(), Equals, true) 144 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 145 | c.Assert(s.cl.numTrackingOffsets(), Equals, 5) 146 | 147 | // Consume 3, heartbeat, offset 1 148 | msg3 := s.consumeOne(c) 149 | c.Assert(msg3.Value, DeepEquals, []byte("m3")) 150 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 151 | c.Assert(s.cl.updateOffsets(), IsNil) 152 | c.Assert(s.cl.heartbeat(), Equals, true) 153 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 154 | 155 | // Commit #3, offset will stay 1! 
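	// (msg2 at offset 1 is still uncommitted, so Current cannot advance past it even
	// though offset 2 is now done; committing msg2 below is what lets it jump to 3.)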
156 | c.Assert(s.cl.Commit(msg3.Offset), IsNil) 157 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 158 | c.Assert(s.cl.updateOffsets(), IsNil) 159 | c.Assert(s.cl.heartbeat(), Equals, true) 160 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 161 | c.Assert(s.cl.numTrackingOffsets(), Equals, 5) 162 | 163 | // Commit #2, offset now advances to 3 164 | c.Assert(s.cl.Commit(msg2.Offset), IsNil) 165 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 166 | c.Assert(s.cl.updateOffsets(), IsNil) 167 | c.Assert(s.cl.heartbeat(), Equals, true) 168 | c.Assert(s.cl.offsets.Current, Equals, int64(3)) 169 | c.Assert(s.cl.numTrackingOffsets(), Equals, 3) 170 | 171 | // Attempt to commit invalid offset (never seen), make sure it errors 172 | msg3.Offset = 95 173 | c.Assert(s.cl.Commit(msg3.Offset), NotNil) 174 | 175 | // Commit the rest 176 | c.Assert(s.cl.Commit(s.consumeOne(c).Offset), IsNil) 177 | c.Assert(s.cl.Commit(s.consumeOne(c).Offset), IsNil) 178 | c.Assert(s.cl.Commit(s.consumeOne(c).Offset), IsNil) 179 | c.Assert(s.cl.offsets.Current, Equals, int64(3)) 180 | c.Assert(s.cl.updateOffsets(), IsNil) 181 | c.Assert(s.cl.heartbeat(), Equals, true) 182 | c.Assert(s.cl.offsets.Current, Equals, int64(6)) 183 | c.Assert(s.cl.numTrackingOffsets(), Equals, 0) 184 | } 185 | 186 | func (s *ClaimSuite) waitForTrackingOffsets(c *C, ct int) { 187 | for i := 0; i < 100; i++ { 188 | if s.cl.numTrackingOffsets() != ct { 189 | time.Sleep(10 * time.Millisecond) 190 | continue 191 | } 192 | } 193 | c.Assert(s.cl.numTrackingOffsets(), Equals, ct) 194 | } 195 | 196 | func (s *ClaimSuite) TestFlush(c *C) { 197 | // Test the commit message flow, ensuring that our offset only gets updated when 198 | // we have properly committed messages 199 | // 200 | // Basically the same as the heartbeat test, since a flush triggers a heartbeat 201 | c.Assert(s.Produce("test3", 0, "m1", "m2", "m3", "m4", "m5", "m6"), Equals, int64(5)) 202 | c.Assert(s.cl.updateOffsets(), IsNil) 203 | c.Assert(s.cl.heartbeat(), Equals, true) 204 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 205 | c.Assert(s.cl.offsets.Earliest, Equals, int64(0)) 206 | c.Assert(s.cl.offsets.Latest, Equals, int64(6)) 207 | s.waitForTrackingOffsets(c, 6) 208 | 209 | // Consume 1, Flush... 
offsets still 0 210 | msg1 := s.consumeOne(c) 211 | c.Assert(msg1.Value, DeepEquals, []byte("m1")) 212 | c.Assert(msg1.Offset, Equals, int64(0)) 213 | c.Assert(s.cl.Flush(), IsNil) 214 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 215 | s.cl.lock.RLock() 216 | val, ok := s.cl.tracking[0] 217 | c.Assert(ok, Equals, true) 218 | c.Assert(val, Equals, false) 219 | s.cl.lock.RUnlock() 220 | 221 | // Consume 2, still 0 222 | msg2 := s.consumeOne(c) 223 | c.Assert(msg2.Value, DeepEquals, []byte("m2")) 224 | c.Assert(msg2.Offset, Equals, int64(1)) 225 | c.Assert(s.cl.Flush(), IsNil) 226 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 227 | c.Assert(s.cl.numTrackingOffsets(), Equals, 6) 228 | 229 | // Commit 1, offset 1 but only after Flush phase 230 | c.Assert(s.cl.Commit(msg1.Offset), IsNil) 231 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 232 | c.Assert(s.cl.Flush(), IsNil) 233 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 234 | c.Assert(s.cl.numTrackingOffsets(), Equals, 5) 235 | 236 | // Produce some more 237 | c.Assert(s.Produce("test3", 0, "m7"), Equals, int64(6)) 238 | s.waitForTrackingOffsets(c, 6) 239 | 240 | // Consume 3, Flush, offset 1 241 | msg3 := s.consumeOne(c) 242 | c.Assert(msg3.Value, DeepEquals, []byte("m3")) 243 | c.Assert(msg3.Offset, Equals, int64(2)) 244 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 245 | c.Assert(s.cl.Flush(), IsNil) 246 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 247 | 248 | // Assert that the above didn't update the Latest offset, the Flush 249 | // flow doesn't (unlike heartbeat which does) 250 | c.Assert(s.cl.offsets.Latest, Equals, int64(6)) 251 | 252 | // Commit #3, offset will stay 1! we're still tracking 6 because the 253 | // committed one in middle position must stay tracked until the 254 | // previous messages are committed 255 | c.Assert(s.cl.Commit(msg3.Offset), IsNil) 256 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 257 | c.Assert(s.cl.Flush(), IsNil) 258 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 259 | c.Assert(s.cl.numTrackingOffsets(), Equals, 6) 260 | 261 | // Now a heartbeat happens, it should change nothing except Latest 262 | c.Assert(s.cl.updateOffsets(), IsNil) 263 | c.Assert(s.cl.heartbeat(), Equals, true) 264 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 265 | c.Assert(s.cl.numTrackingOffsets(), Equals, 6) 266 | c.Assert(s.cl.offsets.Latest, Equals, int64(7)) 267 | 268 | // Commit #2, offset now advances to 3 and the outstanding is 4 269 | c.Assert(s.cl.Commit(msg2.Offset), IsNil) 270 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 271 | c.Assert(s.cl.Flush(), IsNil) 272 | c.Assert(s.cl.offsets.Current, Equals, int64(3)) 273 | c.Assert(s.cl.numTrackingOffsets(), Equals, 4) 274 | 275 | // Attempt to commit invalid offset (never seen), make sure it errors 276 | msg3.Offset = 95 277 | c.Assert(s.cl.Commit(msg3.Offset), NotNil) 278 | 279 | // Commit the rest 280 | c.Assert(s.cl.Commit(s.consumeOne(c).Offset), IsNil) 281 | c.Assert(s.cl.Commit(s.consumeOne(c).Offset), IsNil) 282 | c.Assert(s.cl.Commit(s.consumeOne(c).Offset), IsNil) 283 | c.Assert(s.cl.Commit(s.consumeOne(c).Offset), IsNil) 284 | c.Assert(s.cl.offsets.Current, Equals, int64(3)) 285 | c.Assert(s.cl.Flush(), IsNil) 286 | c.Assert(s.cl.offsets.Current, Equals, int64(7)) 287 | c.Assert(s.cl.numTrackingOffsets(), Equals, 0) 288 | 289 | // One last heartbeat, should be no change from the above Flush since 290 | // nothing has happened 291 | c.Assert(s.cl.updateOffsets(), IsNil) 292 | c.Assert(s.cl.heartbeat(), Equals, true) 
293 | c.Assert(s.cl.offsets.Current, Equals, int64(7)) 294 | c.Assert(s.cl.numTrackingOffsets(), Equals, 0) 295 | c.Assert(s.cl.offsets.Latest, Equals, int64(7)) 296 | } 297 | 298 | func (s *ClaimSuite) BenchmarkConsumeAndCommit(c *C) { 299 | // Produce N messages for consumption into the test partition and hopefully this 300 | // doesn't end up being the really slow part of the operation 301 | msgs := make([]string, 0, c.N) 302 | for i := 0; i < c.N; i++ { 303 | msgs = append(msgs, "message") 304 | } 305 | s.Produce("test3", 0, msgs...) 306 | 307 | // Now consume everything and immediately commit it 308 | for i := 0; i < c.N; i++ { 309 | if msg := s.consumeOne(c); msg != nil { 310 | s.cl.Commit(msg.Offset) 311 | } 312 | } 313 | } 314 | 315 | func (s *ClaimSuite) assertRelease(c *C) { 316 | for i := 0; i < 100; i++ { 317 | time.Sleep(30 * time.Millisecond) 318 | 319 | cl := s.cl.marshal.GetPartitionClaim(s.cl.topic, s.cl.partID) 320 | if !cl.Claimed() { 321 | break 322 | } 323 | } 324 | 325 | cl := s.cl.marshal.GetPartitionClaim(s.cl.topic, s.cl.partID) 326 | c.Assert(cl.Claimed(), Equals, false) 327 | } 328 | 329 | func (s *ClaimSuite) assertNoRelease(c *C) { 330 | for i := 0; i < 100; i++ { 331 | time.Sleep(30 * time.Millisecond) 332 | 333 | cl := s.cl.marshal.GetPartitionClaim(s.cl.topic, s.cl.partID) 334 | if cl.Claimed() { 335 | break 336 | } 337 | } 338 | 339 | cl := s.cl.marshal.GetPartitionClaim(s.cl.topic, s.cl.partID) 340 | c.Assert(cl.Claimed(), Equals, true) 341 | } 342 | 343 | func (s *ClaimSuite) TestRelease(c *C) { 344 | // Test that calling Release on a claim properly releases the partition 345 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Not(Equals), int64(0)) 346 | c.Assert(s.cl.Terminated(), Equals, false) 347 | c.Assert(s.cl.Release(), Equals, true) 348 | s.assertRelease(c) 349 | s.WaitForRsteps(c, s.m.cluster, 3) 350 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Equals, int64(0)) 351 | c.Assert(s.cl.Release(), Equals, false) 352 | } 353 | 354 | func (s *ClaimSuite) TestTerminate(c *C) { 355 | // Test that calling Terminate on a claim properly sets the flag and commits offsets 356 | // for the partition but does not release 357 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Not(Equals), int64(0)) 358 | c.Assert(s.cl.Terminated(), Equals, false) 359 | c.Assert(s.cl.Terminate(), Equals, true) 360 | c.Assert(s.cl.Terminated(), Equals, true) 361 | s.assertNoRelease(c) 362 | } 363 | 364 | func (s *ClaimSuite) TestTerminateDoesNotDeadlock(c *C) { 365 | // Test that termination is not blocked by a full messages channel 366 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Not(Equals), int64(0)) 367 | c.Assert(s.cl.Terminated(), Equals, false) 368 | 369 | // Replace message chan with length 1 chan for testing 370 | s.cl.lock.Lock() 371 | s.cl.messages = make(chan *Message, 1) 372 | s.cl.lock.Unlock() 373 | 374 | // Now produce 2 messages and wait for them to be consumed 375 | c.Assert(s.Produce("test3", 0, "m1", "m2"), Equals, int64(1)) 376 | s.waitForTrackingOffsets(c, 2) 377 | 378 | // Assert message channel has 1 message in it, we have 2 tracking but 1 message 379 | // in channel because second is blocked 380 | c.Assert(len(s.cl.messages), Equals, 1) 381 | 382 | // Assert that messageLock is being held, this works by sending a goroutine to 383 | // get the lock and then in our current function we sleep a bit. If the sleep expires, 384 | // that means the goroutine was blocked (or never scheduled). 
We get around that by 385 | // using the WaitGroup to make sure it actually scheduled. 386 | wasScheduled := &sync.WaitGroup{} 387 | wasScheduled.Add(1) 388 | probablyHeld := make(chan bool, 2) 389 | go func() { 390 | wasScheduled.Done() // Got scheduled! 391 | s.cl.messagesLock.Lock() 392 | defer s.cl.messagesLock.Unlock() 393 | probablyHeld <- false 394 | }() 395 | wasScheduled.Wait() 396 | select { 397 | case <-time.After(100 * time.Millisecond): 398 | probablyHeld <- true 399 | } 400 | c.Assert(<-probablyHeld, Equals, true) 401 | 402 | // Now terminate, this should return and work and not be claimed 403 | c.Assert(s.cl.Release(), Equals, true) 404 | c.Assert(s.cl.Terminated(), Equals, true) 405 | s.assertRelease(c) 406 | } 407 | 408 | func (s *ClaimSuite) TestCommitOutstanding(c *C) { 409 | // Test that calling CommitOffsets should commit offsets for outstanding messages and 410 | // updates claim tracking 411 | c.Assert(s.Produce("test3", 0, "m1", "m2", "m3", "m4", "m5", "m6"), Equals, int64(5)) 412 | c.Assert(s.cl.updateOffsets(), IsNil) 413 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 414 | c.Assert(s.cl.offsets.Earliest, Equals, int64(0)) 415 | c.Assert(s.cl.offsets.Latest, Equals, int64(6)) 416 | 417 | // This test requires all messages to have been consumed into the channel, else 418 | // we can get inconsistent results 419 | readyChan := make(chan struct{}) 420 | go func() { 421 | defer close(readyChan) 422 | for { 423 | s.cl.lock.RLock() 424 | if len(s.cl.messages) == 6 { 425 | s.cl.lock.RUnlock() 426 | break 427 | } 428 | s.cl.lock.RUnlock() 429 | 430 | time.Sleep(100 * time.Millisecond) 431 | } 432 | }() 433 | select { 434 | case <-readyChan: 435 | // all good, continue 436 | case <-time.After(3 * time.Second): 437 | // Timeout reached, we've failed 438 | c.FailNow() 439 | } 440 | 441 | // Consume 1, heartbeat... 
offsets still 0 442 | msg1 := s.consumeOne(c) 443 | c.Assert(msg1.Value, DeepEquals, []byte("m1")) 444 | c.Assert(s.cl.Commit(msg1.Offset), IsNil) 445 | c.Assert(s.cl.numTrackingOffsets(), Equals, 6) 446 | c.Assert(s.cl.offsets.Current, Equals, int64(0)) 447 | 448 | // Commit the offsets....should update current offset and tracking for the claim 449 | c.Assert(s.cl.Terminate(), Equals, true) 450 | c.Assert(s.cl.offsets.Current, Equals, int64(1)) 451 | c.Assert(s.cl.numTrackingOffsets(), Equals, 5) 452 | } 453 | 454 | func (s *ClaimSuite) TestCurrentLag(c *C) { 455 | // Test that GetCurrentLag returns the correct numbers in various cases 456 | s.cl.offsets.Current = 0 457 | s.cl.offsets.Latest = 0 458 | c.Assert(s.cl.GetCurrentLag(), Equals, int64(0)) 459 | 460 | s.cl.offsets.Current = 1 461 | s.cl.offsets.Latest = 0 462 | c.Assert(s.cl.GetCurrentLag(), Equals, int64(0)) 463 | 464 | s.cl.offsets.Current = 0 465 | s.cl.offsets.Latest = 1 466 | c.Assert(s.cl.GetCurrentLag(), Equals, int64(1)) 467 | 468 | s.cl.offsets.Current = 1 469 | s.cl.offsets.Latest = 2 470 | c.Assert(s.cl.GetCurrentLag(), Equals, int64(1)) 471 | } 472 | 473 | func (s *ClaimSuite) TestHeartbeat(c *C) { 474 | // Ensure that our heartbeats are updating the marshal structures appropriately 475 | // (makes sure clients are seeing the right values) 476 | c.Assert(s.m.GetPartitionClaim("test3", 0).CurrentOffset, Equals, int64(0)) 477 | s.cl.offsets.Current = 10 478 | c.Assert(s.cl.heartbeat(), Equals, true) 479 | s.WaitForRsteps(c, s.m.cluster, 3) 480 | c.Assert(s.m.GetPartitionClaim("test3", 0).CurrentOffset, Equals, int64(10)) 481 | 482 | // And test that releasing means we can't update heartbeat anymore 483 | c.Assert(s.cl.Release(), Equals, true) 484 | s.WaitForRsteps(c, s.m.cluster, 4) 485 | s.cl.offsets.Current = 20 486 | c.Assert(s.cl.heartbeat(), Equals, false) 487 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Equals, int64(0)) 488 | c.Assert(s.m.GetPartitionClaim("test3", 0).CurrentOffset, Equals, int64(0)) 489 | c.Assert(s.m.GetLastPartitionClaim("test3", 0).CurrentOffset, Equals, int64(10)) 490 | } 491 | 492 | func (s *ClaimSuite) TestReleaseIfWedged(c *C) { 493 | s.cl.offsets.Current = 10 494 | s.cl.offsetCurrentHistory = [10]int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 495 | s.cl.offsets.Latest = 20 496 | s.cl.offsetLatestHistory = [10]int64{11, 12, 13, 14, 15, 16, 17, 18, 19, 20} 497 | s.cl.lastMessageTime = time.Now() 498 | c.Assert(s.cl.healthCheck(), Equals, true) 499 | 500 | // Now say the last message was a while ago, but our velocities are non-zero 501 | // so we shouldn't release 502 | s.cl.lastMessageTime = time.Now().Add(-(HeartbeatInterval + 1) * time.Second) 503 | c.Assert(s.cl.healthCheck(), Equals, true) 504 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Not(Equals), int64(0)) 505 | 506 | // Set both PV and CV to be 0 and they're equal, should also succeed 507 | s.cl.offsets.Current = 12 508 | s.cl.offsetCurrentHistory = [10]int64{12, 12, 12, 12, 12, 12, 12, 12, 12, 12} 509 | s.cl.offsets.Latest = 12 510 | s.cl.offsetLatestHistory = [10]int64{12, 12, 12, 12, 12, 12, 12, 12, 12, 12} 511 | c.Assert(s.cl.healthCheck(), Equals, true) 512 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Not(Equals), int64(0)) 513 | 514 | // Set both PV and CV to be 0 and they're not-equal, should release 515 | s.cl.offsets.Current = 12 516 | s.cl.offsetCurrentHistory = [10]int64{12, 12, 12, 12, 12, 12, 12, 12, 12, 12} 517 | s.cl.offsets.Latest = 13 518 | s.cl.offsetLatestHistory = 
[10]int64{13, 13, 13, 13, 13, 13, 13, 13, 13, 13} 519 | c.Assert(s.cl.healthCheck(), Equals, false) 520 | s.WaitForRsteps(c, s.m.cluster, 3) 521 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Equals, int64(0)) 522 | } 523 | 524 | func (s *ClaimSuite) TestReleaseIfWedged2(c *C) { 525 | s.cl.offsets.Current = 10 526 | s.cl.offsetCurrentHistory = [10]int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 527 | s.cl.offsets.Latest = 20 528 | s.cl.offsetLatestHistory = [10]int64{11, 12, 13, 14, 15, 16, 17, 18, 19, 20} 529 | s.cl.lastMessageTime = time.Now() 530 | c.Assert(s.cl.healthCheck(), Equals, true) 531 | 532 | // Set CV=0 and PV>0, should release 533 | s.cl.offsets.Current = 12 534 | s.cl.offsetCurrentHistory = [10]int64{12, 12, 12, 12, 12, 12, 12, 12, 12, 12} 535 | s.cl.offsets.Latest = 14 536 | s.cl.offsetLatestHistory = [10]int64{13, 13, 13, 13, 13, 13, 13, 14, 14, 14} 537 | s.cl.lastMessageTime = time.Now().Add(-(HeartbeatInterval + 1) * time.Second) 538 | c.Assert(s.cl.healthCheck(), Equals, false) 539 | s.WaitForRsteps(c, s.m.cluster, 3) 540 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Equals, int64(0)) 541 | } 542 | 543 | func (s *ClaimSuite) TestVelocity(c *C) { 544 | // Test that the velocity functions perform as expected given the expected inputs 545 | s.cl.offsetCurrentHistory = [10]int64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0} 546 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(0)) 547 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 548 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 549 | 550 | s.cl.offsetCurrentHistory = [10]int64{1, 2, 0, 0, 0, 0, 0, 0, 0, 0} 551 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(1)) 552 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 553 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 554 | 555 | s.cl.offsetCurrentHistory = [10]int64{1, 2, 3, 0, 0, 0, 0, 0, 0, 0} 556 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(1)) 557 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 558 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 559 | 560 | s.cl.offsetCurrentHistory = [10]int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 561 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(1)) 562 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 563 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 564 | 565 | s.cl.offsetCurrentHistory = [10]int64{1, 21, 21, 0, 0, 0, 0, 0, 0, 0} 566 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(10)) 567 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 568 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 569 | 570 | s.cl.offsetCurrentHistory = [10]int64{1, 21, 21, 21, 21, 0, 0, 0, 0, 0} 571 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(5)) 572 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 573 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 574 | 575 | s.cl.offsetCurrentHistory = [10]int64{21, 21, 1, 21, 21, 0, 0, 0, 0, 0} 576 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(5)) 577 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 578 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 579 | 580 | s.cl.offsetCurrentHistory = [10]int64{21, 0, 0, 0, 0, 0, 0, 0, 0, 0} 581 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(0)) 582 | s.cl.offsetLatestHistory = s.cl.offsetCurrentHistory 583 | c.Assert(s.cl.PartitionVelocity(), Equals, s.cl.ConsumerVelocity()) 584 | } 585 | 586 | func (s *ClaimSuite) TestHealthCheck(c *C) { 587 | // 
Ensure that the health check system returns expected values for given states 588 | s.cl.offsets.Current = 0 589 | s.cl.offsetCurrentHistory = [10]int64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0} 590 | s.cl.offsets.Latest = 0 591 | s.cl.offsetLatestHistory = [10]int64{0, 0, 0, 0, 0, 0, 0, 0, 0, 0} 592 | c.Assert(s.cl.healthCheck(), Equals, true) 593 | c.Assert(s.cl.cyclesBehind, Equals, 0) 594 | 595 | // Put us in an "unhealthy" state, PV is high and we aren't caught up 596 | s.cl.offsets.Latest = 10 597 | s.cl.offsetLatestHistory = [10]int64{1, 10, 0, 0, 0, 0, 0, 0, 0, 0} 598 | c.Assert(s.cl.healthCheck(), Equals, true) 599 | c.Assert(s.cl.cyclesBehind, Equals, 1) 600 | 601 | // Now we're "caught up" even PV>CV we're healthy 602 | s.cl.offsets.Current = 21 603 | s.cl.offsetCurrentHistory = [10]int64{1, 6, 11, 16, 21, 0, 0, 0, 0, 0} 604 | s.cl.offsets.Latest = 21 605 | s.cl.offsetLatestHistory = [10]int64{1, 11, 21, 0, 0, 0, 0, 0, 0, 0} 606 | c.Assert(s.cl.ConsumerVelocity() < s.cl.PartitionVelocity(), Equals, true) 607 | c.Assert(s.cl.healthCheck(), Equals, true) 608 | c.Assert(s.cl.cyclesBehind, Equals, 0) 609 | 610 | // Test that "predictive speed" is working, i.e., that the consumer is 611 | // considered healthy when it's within a heartbeat of the end 612 | s.cl.offsets.Latest = 31 613 | s.cl.offsetLatestHistory = [10]int64{1, 11, 21, 31, 0, 0, 0, 0, 0, 0} 614 | c.Assert(s.cl.ConsumerVelocity() < s.cl.PartitionVelocity(), Equals, true) 615 | c.Assert(s.cl.healthCheck(), Equals, true) 616 | c.Assert(s.cl.cyclesBehind, Equals, 0) 617 | 618 | // Test that PV=0, CV=0 but behind is unhealthy 619 | s.cl.offsets.Current = 21 620 | s.cl.offsetCurrentHistory = [10]int64{21, 21, 21, 21, 21, 21, 21, 21, 21, 21} 621 | s.cl.offsets.Latest = 23 622 | s.cl.offsetLatestHistory = [10]int64{23, 23, 23, 23, 23, 23, 23, 23, 23, 23} 623 | c.Assert(s.cl.ConsumerVelocity(), Equals, float64(0)) 624 | c.Assert(s.cl.PartitionVelocity(), Equals, float64(0)) 625 | c.Assert(s.cl.ConsumerVelocity() == s.cl.PartitionVelocity(), Equals, true) 626 | c.Assert(s.cl.healthCheck(), Equals, true) 627 | c.Assert(s.cl.cyclesBehind, Equals, 1) 628 | c.Assert(s.cl.healthCheck(), Equals, true) 629 | c.Assert(s.cl.cyclesBehind, Equals, 2) 630 | 631 | // Now we advance one message, giving us SOME velocity -- even tho PV is still 0 632 | // this should make us healthy 633 | s.cl.offsets.Current = 22 634 | s.cl.offsetCurrentHistory = [10]int64{21, 21, 21, 21, 21, 21, 21, 21, 21, 22} 635 | c.Assert(s.cl.PartitionVelocity(), Equals, float64(0)) 636 | c.Assert(s.cl.ConsumerVelocity() > s.cl.PartitionVelocity(), Equals, true) 637 | c.Assert(s.cl.healthCheck(), Equals, true) 638 | c.Assert(s.cl.cyclesBehind, Equals, 0) 639 | 640 | // Now handle the "far behind but catching up" case, CV>PV but beyond the prediction 641 | s.cl.offsets.Current = 31 642 | s.cl.offsetCurrentHistory = [10]int64{21, 22, 23, 24, 26, 27, 28, 29, 30, 31} 643 | s.cl.offsets.Latest = 132 644 | s.cl.offsetLatestHistory = [10]int64{123, 124, 125, 126, 127, 128, 129, 130, 131, 132} 645 | c.Assert(s.cl.PartitionVelocity(), Equals, float64(1)) 646 | c.Assert(s.cl.ConsumerVelocity() > s.cl.PartitionVelocity(), Equals, true) 647 | c.Assert(s.cl.healthCheck(), Equals, true) 648 | c.Assert(s.cl.cyclesBehind, Equals, 0) 649 | 650 | // Now we're behind and fail health checks 3 times, this will release 651 | s.cl.offsets.Current = 22 652 | s.cl.offsetCurrentHistory = [10]int64{21, 21, 21, 21, 21, 21, 21, 21, 21, 22} 653 | s.cl.offsets.Latest = 32 654 | s.cl.offsetLatestHistory = 
[10]int64{1, 11, 21, 32, 0, 0, 0, 0, 0, 0} 655 | c.Assert(s.cl.ConsumerVelocity() < s.cl.PartitionVelocity(), Equals, true) 656 | c.Assert(s.cl.healthCheck(), Equals, true) 657 | c.Assert(s.cl.cyclesBehind, Equals, 1) 658 | c.Assert(s.cl.healthCheck(), Equals, true) 659 | c.Assert(s.cl.cyclesBehind, Equals, 2) 660 | c.Assert(s.cl.healthCheck(), Equals, false) 661 | c.Assert(s.cl.cyclesBehind, Equals, 3) 662 | s.WaitForRsteps(c, s.m.cluster, 3) 663 | c.Assert(s.m.GetPartitionClaim("test3", 0).LastHeartbeat, Equals, int64(0)) 664 | c.Assert(s.m.GetPartitionClaim("test3", 0).CurrentOffset, Equals, int64(0)) 665 | c.Assert(s.m.GetLastPartitionClaim("test3", 0).CurrentOffset, Equals, int64(22)) 666 | 667 | // If we are okay with CV 1 && !options.ClaimEntireTopic { 121 | return nil, errors.New("ClaimEntireTopic must be set if provided more than one topic") 122 | } else if len(topicNames) == 0 { 123 | return nil, errors.New("must provide at least one topic") 124 | } 125 | 126 | partitions := make(map[string]int) 127 | 128 | for _, topic := range topicNames { 129 | partitions[topic] = m.Partitions(topic) 130 | } 131 | 132 | // Construct base structure 133 | c := &Consumer{ 134 | alive: new(int32), 135 | marshal: m, 136 | topics: topicNames, 137 | partitions: partitions, 138 | options: options, 139 | messages: make(chan *Message, m.cluster.options.MaxMessageQueue), 140 | lock: &sync.RWMutex{}, 141 | rand: rand.New(rand.NewSource(time.Now().UnixNano())), 142 | claims: make(map[string]map[int]*claim), 143 | topicClaimsChan: make(chan map[string]bool), 144 | topicClaimsUpdated: make(chan struct{}, 1), 145 | stopChan: make(chan struct{}), 146 | doneChan: make(chan struct{}), 147 | } 148 | atomic.StoreInt32(c.alive, 1) 149 | m.addNewConsumer(c) 150 | 151 | // Take the lock for now as we're updating various points internally 152 | c.lock.Lock() 153 | defer c.lock.Unlock() 154 | 155 | // Start notifier about topic claims now because people are going to start 156 | // listening immediately 157 | go c.sendTopicClaimsLoop() 158 | 159 | // Fast-reclaim: iterate over existing claims in the given topics and see if 160 | // any of them look to be from previous incarnations of this Marshal (client, group) 161 | // and are currently claimed. If so, claim them. Do this before the claim manager 162 | // is started. 163 | if c.options.FastReclaim { 164 | claimedTopics := make(map[string]bool) 165 | for topic, partitionCount := range c.partitions { 166 | for partID := 0; partID < partitionCount; partID++ { 167 | cl := c.marshal.GetPartitionClaim(topic, partID) 168 | 169 | // If not presently claimed, or not claimed by us, skip 170 | if !cl.Claimed() || 171 | cl.ClientID != c.marshal.ClientID() || 172 | cl.GroupID != c.marshal.GroupID() { 173 | continue 174 | } 175 | 176 | // This looks to be ours, let's do it. This is basically the fast path, 177 | // and our heartbeat will happen shortly from the automatic health 178 | // check which fires up immediately on newClaim. 
179 | log.Infof("[%s:%d] attempting to fast-reclaim", topic, partID) 180 | if _, ok := c.claims[topic]; !ok { 181 | c.claims[topic] = make(map[int]*claim) 182 | } 183 | 184 | // update topic claims 185 | if options.ClaimEntireTopic { 186 | if partID == 0 { 187 | claimedTopics[topic] = true 188 | } 189 | 190 | // don't fast re-claim partitions for a topic unless partition 0 is claimed 191 | if !claimedTopics[topic] { 192 | log.Infof("[%s:%d] blocked fast-reclaim because topic is not claimed", 193 | topic, partID) 194 | continue 195 | } 196 | } 197 | 198 | // Attempt to claim, this can fail 199 | claim := newClaim( 200 | topic, partID, c.marshal, c, c.messages, options) 201 | if claim == nil { 202 | log.Warningf("[%s:%d] failed to fast-reclaim", topic, partID) 203 | } else { 204 | c.claims[topic][partID] = claim 205 | go claim.healthCheckLoop() 206 | c.sendTopicClaimsUpdate() 207 | } 208 | } 209 | 210 | // this check needs to be after iterating all partitions in a topic 211 | if options.ClaimEntireTopic && len(claimedTopics) >= options.MaximumClaims { 212 | log.Infof("reached max-topics for fast-reclaim. Claimed topics: %v", 213 | claimedTopics) 214 | break 215 | } 216 | } 217 | } 218 | 219 | go c.manageClaims() 220 | return c, nil 221 | } 222 | 223 | // NewConsumerOptions returns a default set of options for the Consumer. 224 | func NewConsumerOptions() ConsumerOptions { 225 | return ConsumerOptions{ 226 | FastReclaim: true, 227 | ClaimEntireTopic: false, 228 | GreedyClaims: false, 229 | ReleaseClaimsIfBehind: true, 230 | } 231 | } 232 | 233 | func (c *Consumer) defaultTopic() string { 234 | c.lock.RLock() 235 | defer c.lock.RUnlock() 236 | 237 | if len(c.partitions) > 1 { 238 | log.Errorf("attempted to claim partitions for more than one topic") 239 | go c.Terminate(false) 240 | return "" 241 | } 242 | 243 | for topic := range c.partitions { 244 | return topic 245 | } 246 | 247 | log.Errorf("couldn't find default topic!") 248 | go c.Terminate(false) 249 | return "" 250 | } 251 | 252 | func (c *Consumer) defaultTopicPartitions() int { 253 | c.lock.RLock() 254 | defer c.lock.RUnlock() 255 | 256 | if len(c.partitions) > 1 { 257 | log.Errorf("attempted to claim partitions for more than one topic") 258 | go c.Terminate(false) 259 | return 0 260 | } 261 | 262 | for _, partitions := range c.partitions { 263 | return partitions 264 | } 265 | 266 | log.Errorf("couldn't find default topic!") 267 | go c.Terminate(false) 268 | return 0 269 | } 270 | 271 | // claimTerminated is called by a claim when they've terminated. This is used so we can 272 | // ensure topic claim semantics are adhered to. In topic claim mode this will be called 273 | // by every claim during a release. 274 | func (c *Consumer) claimTerminated(cl *claim, released bool) { 275 | // For now, we don't care except in the topic claim mode 276 | if !c.options.ClaimEntireTopic { 277 | return 278 | } 279 | 280 | // Send an update at the end 281 | defer c.sendTopicClaimsUpdate() 282 | 283 | // This is a topic claim, so we need to perform the same operation on the rest of 284 | // the claims in this topic 285 | c.lock.RLock() 286 | defer c.lock.RUnlock() 287 | for _, claim := range c.claims[cl.topic] { 288 | if cl != claim { 289 | if released { 290 | go claim.Release() 291 | } else { 292 | go claim.Terminate() 293 | } 294 | } 295 | } 296 | } 297 | 298 | // tryClaimPartition attempts to claim a partition and make it available in the consumption 299 | // flow. 
If this is called a second time on a partition we already own, it will return 300 | // false. Returns true only if the partition was never claimed and we succeeded in 301 | // claiming it. 302 | func (c *Consumer) tryClaimPartition(topic string, partID int) bool { 303 | if c.options.ClaimEntireTopic { 304 | if c.isTopicClaimLimitReached(topic) { 305 | return false 306 | } 307 | } else { 308 | if c.isClaimLimitReached() { 309 | return false 310 | } 311 | } 312 | 313 | // See if partition is presently claimed by anybody, if so, do nothing. This is an 314 | // optimization but overall the whole system is racy and that race is handled elsewhere. 315 | // This gives us no protection. 316 | currentClaim := c.marshal.GetPartitionClaim(topic, partID) 317 | if currentClaim.Claimed() { 318 | return false 319 | } 320 | 321 | // Attempt to claim. This handles asynchronously and might ultimately fail because 322 | // someone beat us to the claim or we failed to produce to Kafka or something. This can 323 | // block for a while. 324 | newClaim := newClaim(topic, partID, c.marshal, c, c.messages, c.options) 325 | if newClaim == nil { 326 | return false 327 | } 328 | go newClaim.healthCheckLoop() 329 | 330 | // Critical section. Engage the lock here, we hold it until we exit. This lock can take 331 | // some time to get, so the following code has to be resilient to state changes that might 332 | // happen due to lock dilation. 333 | c.lock.Lock() 334 | defer c.lock.Unlock() 335 | 336 | // If claim says it's terminated, do nothing and exit. This can happen if something 337 | // in the claim failed to produce to Kafka. 338 | if newClaim.Terminated() { 339 | return false 340 | } 341 | 342 | // Ugh, we managed to claim a partition in our termination state. Don't worry too hard 343 | // and just release it. 344 | if c.Terminated() { 345 | // This can be a long blocking operation so send it to the background. We ultimately 346 | // don't care if it finishes or not, because the heartbeat will save us if we don't 347 | // submit a release message. This is just an optimization. 348 | go newClaim.Release() 349 | return false 350 | } 351 | 352 | // If we have an old claim (i.e. this is a reclaim) then assert that the old claim has 353 | // been properly terminated. If not, then this could indicate a bug in the Marshal state 354 | // machine. 355 | topicClaims, ok := c.claims[topic] 356 | if ok { 357 | oldClaim, ok := topicClaims[partID] 358 | if ok && oldClaim != nil { 359 | if !oldClaim.Terminated() { 360 | log.Errorf("Internal double-claim for %s:%d.", topic, partID) 361 | log.Errorf("This is a catastrophic error. We're terminating Marshal.") 362 | log.Errorf("No further messages will be available. Please restart.") 363 | go newClaim.Release() 364 | go c.terminateAndCleanup(false, false) 365 | go func() { 366 | c.marshal.PrintState() 367 | c.marshal.Terminate() 368 | }() 369 | return false 370 | } 371 | } 372 | } 373 | 374 | if _, ok := c.claims[topic]; !ok { 375 | c.claims[topic] = make(map[int]*claim) 376 | } 377 | 378 | // Save the claim, this makes it available for message consumption and status. 379 | c.claims[topic][partID] = newClaim 380 | return true 381 | } 382 | 383 | // rndIntn gets a random number. 384 | func (c *Consumer) rndIntn(n int) int { 385 | c.lock.Lock() 386 | defer c.lock.Unlock() 387 | 388 | return c.rand.Intn(n) 389 | } 390 | 391 | // releaseClaims releases all claims this consumer has. This is called when a consumer is paused. 
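tryClaimPartition records successful claims in a nested map keyed first by topic and then by partition ID, creating the inner map on demand before storing into it. A small standalone sketch of that bookkeeping pattern (fakeClaim is a stand-in for the package's unexported claim type):

```go
package main

import "fmt"

// fakeClaim stands in for the package's *claim type in this sketch.
type fakeClaim struct {
	topic  string
	partID int
}

func main() {
	claims := make(map[string]map[int]*fakeClaim)

	store := func(topic string, partID int) {
		// The inner map must exist before we can index into it, mirroring
		// the check performed in tryClaimPartition.
		if _, ok := claims[topic]; !ok {
			claims[topic] = make(map[int]*fakeClaim)
		}
		claims[topic][partID] = &fakeClaim{topic: topic, partID: partID}
	}

	store("some-topic", 3)
	fmt.Println(len(claims["some-topic"])) // 1
}
```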
392 | func (c *Consumer) releaseClaims() { 393 | c.lock.Lock() 394 | defer c.lock.Unlock() 395 | 396 | // Release all claims that this consumer keeps track of and remove them from the claims map. 397 | for topic, partitions := range c.claims { 398 | for partID, claim := range partitions { 399 | if !claim.Terminated() { 400 | log.Warningf("[%s:%d] Consumer still paused, releasing claim", 401 | topic, partID) 402 | claim.Release() 403 | } 404 | } 405 | c.claims[topic] = make(map[int]*claim) 406 | } 407 | } 408 | 409 | // claimPartitions actually attempts to claim partitions. If the current consumer is 410 | // set on aggressive, this will try to claim ALL partitions that are free. Balanced mode 411 | // will claim a single partition. 412 | func (c *Consumer) claimPartitions() { 413 | func() { 414 | c.lock.RLock() 415 | defer c.lock.RUnlock() 416 | if len(c.partitions) > 1 { 417 | log.Errorf("attempted to claim partitions for more than a single topic") 418 | go c.Terminate(false) 419 | } 420 | }() 421 | 422 | topic := c.defaultTopic() 423 | partitions := c.defaultTopicPartitions() 424 | if partitions <= 0 { 425 | return 426 | } 427 | 428 | // Don't bother trying to make claims if we are at our claim limit. 429 | // This is just an optimization, because we aren't holding the lock here 430 | // this check is repeated inside tryClaimPartition. 431 | if c.isClaimLimitReached() { 432 | return 433 | } 434 | 435 | offset := c.rndIntn(partitions) 436 | for i := 0; i < partitions; i++ { 437 | partID := (i + offset) % partitions 438 | 439 | // Get the most recent claim for this partition 440 | lastClaim := c.marshal.GetLastPartitionClaim(topic, partID) 441 | if lastClaim.Claimed() { 442 | continue 443 | } 444 | 445 | // If the last claim was by this particular consumer, skip if we just released. 446 | // This is because we might have become unhealthy and dropped it or we might already be 447 | // claiming this partition. 448 | if lastClaim.GroupID == c.marshal.groupID && 449 | lastClaim.ClientID == c.marshal.clientID { 450 | // Check release time, if it's over a heartbeat interval allow us to reclaim it 451 | if time.Now().Unix()-lastClaim.LastRelease < HeartbeatInterval { 452 | log.Infof("[%s:%d] skipping unclaimed partition because we recently released it", 453 | topic, partID) 454 | continue 455 | } else { 456 | log.Infof("[%s:%d] reclaiming because we released it a while ago", 457 | topic, partID) 458 | } 459 | } 460 | 461 | // Unclaimed, so attempt to claim it 462 | if !c.tryClaimPartition(topic, partID) { 463 | continue 464 | } 465 | 466 | // If greedy claims is disabled, finish here 467 | if !c.options.GreedyClaims { 468 | break 469 | } 470 | } 471 | } 472 | 473 | // isTopicClaimLimitReached indicates whether we can claim any partition of this topic 474 | // or not given the topics we've already claimed and MaximumClaims 475 | func (c *Consumer) isTopicClaimLimitReached(topic string) bool { 476 | if c.options.MaximumClaims <= 0 { 477 | return false 478 | } 479 | 480 | c.lock.RLock() 481 | defer c.lock.RUnlock() 482 | 483 | claimed := make(map[string]bool) 484 | for topic, topicClaims := range c.claims { 485 | if claim, ok := topicClaims[0]; ok && !claim.Terminated() { 486 | claimed[topic] = true 487 | } 488 | } 489 | 490 | if !claimed[topic] && len(claimed) >= c.options.MaximumClaims { 491 | return true 492 | } 493 | return false 494 | } 495 | 496 | // claimTopic attempts to claim the entire topic if we're in that mode. 
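claimPartitions above visits every partition exactly once but starts the scan at a random index, so several consumers starting at the same time are unlikely to all fight over partition 0 first. A standalone sketch of that scan order (the partition count here is arbitrary):

```go
package main

import (
	"fmt"
	"math/rand"
)

func main() {
	partitions := 8
	offset := rand.Intn(partitions)

	// Visits each partition ID exactly once, starting at a random one,
	// mirroring the (i + offset) % partitions loop in claimPartitions.
	for i := 0; i < partitions; i++ {
		partID := (i + offset) % partitions
		fmt.Println("would try to claim partition", partID)
	}
}
```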
We use partition 0
497 | as the key, anybody who has that partition has claimed the entire topic. This requires all
498 | consumers to use this mode.
499 | func (c *Consumer) claimTopics() {
500 | // Whenever we're done here, try to send an update
501 | defer c.sendTopicClaimsUpdate()
502 |
503 | // Get a copy of c.partitions so we don't have to hold the lock throughout
504 | // this entire method
505 | topicPartitions := make(map[string]int)
506 | func() {
507 | c.lock.RLock()
508 | defer c.lock.RUnlock()
509 |
510 | for k, v := range c.partitions {
511 | topicPartitions[k] = v
512 | }
513 | }()
514 |
515 | // Now iterate each and try to claim
516 | for topic, partitions := range topicPartitions {
517 | if partitions <= 0 {
518 | continue
519 | }
520 |
521 | // We use partition 0 as our "key". Whoever claims partition 0 is considered the owner of
522 | // the topic. See if partition 0 is claimed or not.
523 | lastClaim := c.marshal.GetLastPartitionClaim(topic, 0)
524 | if lastClaim.Claimed() {
525 | // If it's not claimed by us, skip this topic.
526 | if lastClaim.GroupID != c.marshal.groupID ||
527 | lastClaim.ClientID != c.marshal.clientID {
528 | // in case we had this topic, but now somebody else has claimed it
529 | continue
530 | }
531 | } else {
532 | // Unclaimed, so attempt to claim partition 0. This is how we key topic claims.
533 | log.Infof("[%s] attempting to claim topic (key partition 0)", topic)
534 |
535 | // We need to check whether we're above the maximum number of topics to claim.
536 | // Only the first k topics should be claimed, along with all of their
537 | // partitions. This is enforced by controlling how many key partitions
538 | // (partition 0) we claim.
539 | if c.isTopicClaimLimitReached(topic) {
540 | log.Debugf("[%s] blocked claiming topic due to limit: %d",
541 | topic, c.options.MaximumClaims)
542 | continue
543 | }
544 |
545 | if !c.tryClaimPartition(topic, 0) {
546 | continue
547 | }
548 | log.Infof("[%s] claimed topic (key partition 0) successfully", topic)
549 |
550 | // Optimistically send update to try to reduce latency between us claiming a
551 | // topic and notifying a listener
552 | c.sendTopicClaimsUpdate()
553 | }
554 |
555 | // We either just claimed or we already own the 0th partition. Let's iterate
556 | // through all partitions and attempt to claim any that we don't own yet.
557 | for partID := 1; partID < partitions; partID++ {
558 | if !c.marshal.Claimed(topic, partID) {
559 | log.Infof("[%s:%d] claiming partition (topic claim mode)", topic, partID)
560 | c.tryClaimPartition(topic, partID)
561 | }
562 | }
563 | }
564 | }
565 |
566 | // sendTopicClaimsUpdate can be called by various codepaths that have learned that there is
567 | // an update to send down to the users.
568 | func (c *Consumer) sendTopicClaimsUpdate() {
569 | select {
570 | case c.topicClaimsUpdated <- struct{}{}:
571 | // Just sends a marker on the channel.
572 | default:
573 | }
574 | }
575 |
576 | // sendTopicClaimsLoop analyzes the current topic claims and sends an update if
577 | // and only if there is a change in claim state. I.e., a new topic is claimed
578 | // or a topic is released.
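sendTopicClaimsUpdate relies on a one-slot marker channel and a select with a default case, so any number of callers can request an update without blocking, and redundant requests collapse into a single pending marker. A standalone sketch of that coalescing-notification pattern:

```go
package main

import "fmt"

func main() {
	// A one-slot channel acts as a "dirty" flag: many notifications coalesce
	// into at most one pending marker, and notifiers never block.
	updated := make(chan struct{}, 1)

	notify := func() {
		select {
		case updated <- struct{}{}:
		default: // a marker is already pending; drop this one
		}
	}

	notify()
	notify() // coalesced with the first

	for i := 0; i < 2; i++ {
		select {
		case <-updated:
			fmt.Println("drained a pending update")
		default:
			fmt.Println("nothing pending: the second notify was coalesced")
		}
	}
}
```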
579 | func (c *Consumer) sendTopicClaimsLoop() { 580 | defer close(c.topicClaimsChan) 581 | 582 | lastClaims := make(map[string]bool) 583 | keepRunning := true 584 | 585 | for keepRunning { 586 | select { 587 | case <-c.topicClaimsUpdated: 588 | // Continue on and send an update 589 | case <-c.stopChan: 590 | // Send one more and exit 591 | keepRunning = false 592 | } 593 | 594 | // Get consistent claims and send them 595 | claims, err := c.GetCurrentTopicClaims() 596 | if err != nil { 597 | log.Errorf("Failed to send topic claims update: %s", err) 598 | continue 599 | } 600 | 601 | // See if anything has changed 602 | anyUpdates := false 603 | for topic, claimed := range lastClaims { 604 | stillClaimed, ok := claims[topic] 605 | if !ok { 606 | // Existed before but does not exist now 607 | anyUpdates = true 608 | } else if claimed != stillClaimed { 609 | // Status changed in some way 610 | anyUpdates = true 611 | } 612 | } 613 | for topic := range claims { 614 | if _, ok := lastClaims[topic]; !ok { 615 | // New topic claim 616 | anyUpdates = true 617 | } 618 | } 619 | lastClaims = claims 620 | 621 | // If no updates, continue 622 | if !anyUpdates { 623 | continue 624 | } 625 | 626 | // Drain out any unconsumed update 627 | select { 628 | case <-c.topicClaimsChan: 629 | default: 630 | } 631 | 632 | // This should never block since we're the only writer 633 | c.topicClaimsChan <- claims 634 | } 635 | } 636 | 637 | // updatePartitionCounts pulls the latest partition counts per topic from the Marshaler. 638 | func (c *Consumer) updatePartitionCounts() { 639 | // Write lock as we're updating c.partitions below, potentially 640 | c.lock.Lock() 641 | defer c.lock.Unlock() 642 | 643 | for _, topic := range c.marshal.Topics() { 644 | // Only update partitions for topics we already know about 645 | if _, ok := c.partitions[topic]; ok { 646 | c.partitions[topic] = c.marshal.Partitions(topic) 647 | } 648 | } 649 | } 650 | 651 | // manageClaims is our internal state machine that handles partitions and claiming new 652 | // ones (or releasing ones). 653 | func (c *Consumer) manageClaims() { 654 | for !c.Terminated() { 655 | c.updatePartitionCounts() 656 | 657 | // If we learn that our consumer group is paused, release all claims. 658 | if c.marshal.cluster.IsGroupPaused(c.marshal.GroupID()) { 659 | c.releaseClaims() 660 | } else { 661 | // Attempt to claim more partitions, this always runs and will keep running until all 662 | // partitions in the topic are claimed (by somebody). 663 | if c.options.ClaimEntireTopic { 664 | c.claimTopics() 665 | } else { 666 | c.claimPartitions() 667 | } 668 | } 669 | // Now sleep a bit so we don't pound things 670 | // TODO: Raise this later, we shouldn't attempt to claim this fast, this is just for 671 | // development. 672 | time.Sleep(time.Duration(c.rndIntn(3000)) * time.Millisecond) 673 | } 674 | } 675 | 676 | // Terminated returns whether or not this consumer has been terminated. 677 | func (c *Consumer) Terminated() bool { 678 | return atomic.LoadInt32(c.alive) == 0 679 | } 680 | 681 | // terminateAndCleanup instructs the consumer to commit its offsets, 682 | // possibly release its partitions, and possibly remove its reference from 683 | // the associated marshaler. This will allow other consumers to begin consuming. 
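The loop above only pushes a value to listeners when the set of claimed topics actually changes, which it detects with a two-pass map comparison: topics that vanished or flipped state, then topics that are newly present. A standalone sketch of that comparison on its own:

```go
package main

import "fmt"

// claimsChanged reports whether two topic-claim maps differ, using the same
// two-pass comparison as sendTopicClaimsLoop: first topics that disappeared
// or changed state, then topics that appeared.
func claimsChanged(last, current map[string]bool) bool {
	for topic, claimed := range last {
		now, ok := current[topic]
		if !ok || claimed != now {
			return true
		}
	}
	for topic := range current {
		if _, ok := last[topic]; !ok {
			return true
		}
	}
	return false
}

func main() {
	last := map[string]bool{"topic-a": true}
	current := map[string]bool{"topic-a": true, "topic-b": true}
	fmt.Println(claimsChanged(last, current)) // true
}
```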
684 | func (c *Consumer) terminateAndCleanup(release bool, remove bool) bool {
685 | if !atomic.CompareAndSwapInt32(c.alive, 1, 0) {
686 | return false
687 | }
688 |
689 | // Purposefully done outside of the lock below so we can send the message
690 | // that stopping is happening ASAP, and doneChan is deferred so it will close
691 | // at some point in the future
692 | close(c.stopChan)
693 | defer close(c.doneChan)
694 |
695 | latestTopicClaims := make(map[string]bool)
696 | releasedTopics := make(map[string]bool)
697 |
698 | c.lock.Lock()
699 | defer c.lock.Unlock()
700 |
701 | for topic, topicClaims := range c.claims {
702 | for partID, claim := range topicClaims {
703 | if claim != nil {
704 | if release {
705 | claim.Release()
706 | if partID == 0 {
707 | releasedTopics[topic] = true
708 | }
709 | } else {
710 | claim.Terminate()
711 | }
712 | }
713 | }
714 | }
715 |
716 | close(c.messages)
717 |
718 | for topic := range c.claims {
719 | if !releasedTopics[topic] {
720 | latestTopicClaims[topic] = true
721 | }
722 | }
723 |
724 | // Optionally remove consumer from its marshal. Doing so is recommended
725 | // if the marshal doesn't explicitly remove the consumer.
726 | if remove {
727 | c.marshal.removeConsumer(c)
728 | }
729 |
730 | // Update the claims one last time
731 | c.sendTopicClaimsUpdate()
732 | return true
733 | }
734 |
735 | // Terminate instructs the consumer to clean up and allow other consumers to begin consuming.
736 | // (If you do not call this method before exiting, things will still work, but more slowly.)
737 | func (c *Consumer) Terminate(release bool) bool {
738 | return c.terminateAndCleanup(release, true)
739 | }
740 |
741 | // GetCurrentTopicClaims returns the topics that are currently claimed by this
742 | // consumer. It is only relevant when ClaimEntireTopic is set.
743 | func (c *Consumer) GetCurrentTopicClaims() (map[string]bool, error) {
744 | c.lock.RLock()
745 | defer c.lock.RUnlock()
746 |
747 | if !c.options.ClaimEntireTopic {
748 | return nil, errors.New(
749 | "GetCurrentTopicClaims requires options.ClaimEntireTopic be set")
750 | }
751 |
752 | claimedTopics := make(map[string]bool)
753 | if c.Terminated() {
754 | return claimedTopics, nil
755 | }
756 |
757 | // Iterate each topic we know about and see if we have partition 0 claimed
758 | // for that topic; if so, consider it valid
759 | for topic := range c.partitions {
760 | cl := c.marshal.GetPartitionClaim(topic, 0)
761 | if cl.ClientID == c.marshal.ClientID() &&
762 | cl.GroupID == c.marshal.GroupID() {
763 | // We own this topic
764 | claimedTopics[topic] = true
765 | }
766 | }
767 | return claimedTopics, nil
768 | }
769 |
770 | // TopicClaims returns a read-only channel that receives updates for topic claims.
771 | // It's only relevant when ClaimEntireTopic is set.
772 | func (c *Consumer) TopicClaims() <-chan map[string]bool {
773 | if !c.options.ClaimEntireTopic {
774 | err := fmt.Errorf(
775 | "TopicClaims is only relevant when ClaimEntireTopic is set")
776 | log.Error(err.Error())
777 | }
778 |
779 | return c.topicClaimsChan
780 | }
781 |
782 | // GetCurrentLag returns the number of messages that this consumer is lagging by. Note that
783 | // this value can be unstable in the beginning of a run, as we might not have claimed all of
784 | // the partitions we will end up claiming, or we might have overclaimed and need to back off.
785 | // Ideally this will settle towards 0. If it continues to rise, that implies there isn't
786 | // enough consumer capacity.
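In ClaimEntireTopic mode an application would typically watch the TopicClaims channel above to learn when it gains or loses responsibility for a topic. A hedged usage sketch (the import path, Consumer type, and log destination are assumed from the rest of this repository; the handling is a placeholder):

```go
package example

import (
	"log"

	"github.com/zorkian/marshal/marshal"
)

// watchTopicClaims logs ownership updates delivered on the TopicClaims
// channel. The channel is closed when the consumer terminates, which ends
// the range loop.
func watchTopicClaims(consumer *marshal.Consumer) {
	for claims := range consumer.TopicClaims() {
		for topic, owned := range claims {
			if owned {
				log.Printf("now responsible for topic %s", topic)
			}
		}
	}
}
```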
787 | func (c *Consumer) GetCurrentLag() int64 {
788 | c.lock.RLock()
789 | defer c.lock.RUnlock()
790 |
791 | var lag int64
792 | for _, topicClaims := range c.claims {
793 | for _, cl := range topicClaims {
794 | if !cl.Terminated() {
795 | lag += cl.GetCurrentLag()
796 | }
797 | }
798 | }
799 | return lag
800 | }
801 |
802 | // GetCurrentLoad returns a number representing the "load" of this consumer. Think of this
803 | // like a load average in Unix systems: the number is related to how much work the
804 | // system is doing, but by itself it doesn't tell you much.
805 | func (c *Consumer) GetCurrentLoad() int {
806 | return c.getNumActiveClaims()
807 | }
808 |
809 | // getNumActiveClaims returns the number of claims actively owned by this Consumer.
810 | func (c *Consumer) getNumActiveClaims() (ct int) {
811 | c.lock.RLock()
812 | defer c.lock.RUnlock()
813 |
814 | for _, topicClaims := range c.claims {
815 | for _, cl := range topicClaims {
816 | if !cl.Terminated() {
817 | ct++
818 | }
819 | }
820 | }
821 | return
822 | }
823 |
824 | // isClaimLimitReached returns whether this Consumer has reached its maximum number of claims.
825 | func (c *Consumer) isClaimLimitReached() bool {
826 | // if we're claiming topics, then this is not applicable. It's handled inside claimTopics
827 | return !c.options.ClaimEntireTopic && c.options.MaximumClaims > 0 &&
828 | c.getNumActiveClaims() >= c.options.MaximumClaims
829 | }
830 |
831 | // ConsumeChannel returns a read-only channel. Messages that are retrieved from Kafka will be
832 | // made available in this channel.
833 | func (c *Consumer) ConsumeChannel() <-chan *Message {
834 | return c.messages
835 | }
836 |
837 | // consumeOne returns a single message. This is mostly used within the test suite to
838 | // make testing easier as it simulates the message handling behavior.
839 | func (c *Consumer) consumeOne() *Message {
840 | msg := <-c.messages
841 | c.Commit(msg)
842 | return msg
843 | }
844 |
845 | // Commit is called when you've finished processing a message. This operation marks
846 | // the offset as committed internally and is suitable for at-least-once processing
847 | // because we do not immediately write the offsets to storage. We will flush the
848 | // offsets periodically (based on the heartbeat interval).
849 | func (c *Consumer) Commit(msg *Message) error {
850 | cl, ok := func() (*claim, bool) {
851 | c.lock.RLock()
852 | defer c.lock.RUnlock()
853 |
854 | cl, ok := c.claims[msg.Topic][int(msg.Partition)]
855 | return cl, ok
856 | }()
857 | if !ok {
858 | return fmt.Errorf("Message not committed (claim for topic %s, partition %d expired).",
859 | msg.Topic, msg.Partition)
860 | }
861 | return cl.Commit(msg.Offset)
862 | }
863 |
864 | // Flush will cause us to update all of the committed offsets. This operation can be
865 | // performed to periodically sync offsets without waiting on the internal flushing mechanism.
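The Commit contract described above gives at-least-once behavior only if a message is committed after it has been fully processed. A hedged usage sketch of that consume-then-commit loop (process is a placeholder supplied by the application; the consumer is assumed to come from NewConsumer):

```go
package example

import (
	"log"

	"github.com/zorkian/marshal/marshal"
)

// consumeLoop drains the consumer's channel and commits each message only
// after it has been processed, which is what preserves at-least-once
// semantics. The channel closes when the consumer terminates.
func consumeLoop(consumer *marshal.Consumer, process func(*marshal.Message) error) {
	for msg := range consumer.ConsumeChannel() {
		if err := process(msg); err != nil {
			log.Printf("processing failed, not committing offset: %s", err)
			continue
		}
		if err := consumer.Commit(msg); err != nil {
			log.Printf("commit failed: %s", err)
		}
	}
}
```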
866 | func (c *Consumer) Flush() error { 867 | c.lock.RLock() 868 | defer c.lock.RUnlock() 869 | 870 | claims := make([]*claim, 0) 871 | for topic := range c.claims { 872 | for partID := range c.claims[topic] { 873 | claims = append(claims, c.claims[topic][partID]) 874 | } 875 | } 876 | 877 | // Do flushing concurrently because they involve sending messages to Kafka 878 | // which can be slow if done serially 879 | waiter := &sync.WaitGroup{} 880 | waiter.Add(len(claims)) 881 | errChan := make(chan error, len(claims)) 882 | 883 | for _, cl := range claims { 884 | cl := cl 885 | go func() { 886 | defer waiter.Done() 887 | if err := cl.Flush(); err != nil { 888 | errChan <- err 889 | } 890 | }() 891 | } 892 | 893 | // Wait for all flushes to finish 894 | waiter.Wait() 895 | close(errChan) 896 | 897 | // Channel will be empty unless there was an error 898 | anyErrors := false 899 | for err := range errChan { 900 | anyErrors = true 901 | log.Errorf("Flush error: %s", err) 902 | } 903 | if anyErrors { 904 | return errors.New("One or more errors encountered flushing offsets.") 905 | } 906 | return nil 907 | } 908 | 909 | // CommitByToken is called when you've finished processing a message. In the at-least-once 910 | // consumption case, this will allow the "last processed offset" to move forward so that 911 | // we can never see this message again. This particular method is used when you've only 912 | // got a CommitToken to commit from. 913 | func (c *Consumer) CommitByToken(token CommitToken) error { 914 | cl, ok := func() (*claim, bool) { 915 | c.lock.RLock() 916 | defer c.lock.RUnlock() 917 | 918 | cl, ok := c.claims[token.topic][token.partID] 919 | return cl, ok 920 | }() 921 | if !ok { 922 | return fmt.Errorf("Message not committed (claim for topic %s, partition %d expired).", 923 | token.topic, token.partID) 924 | } 925 | return cl.Commit(token.offset) 926 | } 927 | 928 | // PrintState outputs the status of the consumer. 929 | func (c *Consumer) PrintState() { 930 | c.lock.RLock() 931 | defer c.lock.RUnlock() 932 | 933 | log.Infof(" CONSUMER: %d messages in queue", len(c.messages)) 934 | for _, topic := range c.topics { 935 | log.Infof(" TOPIC: %s", topic) 936 | for _, claim := range c.claims[topic] { 937 | claim.PrintState() 938 | } 939 | } 940 | } 941 | --------------------------------------------------------------------------------
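Flush, just above, fans the per-claim flushes out to one goroutine each and gathers failures on an error channel sized to the number of workers, so no worker ever blocks while reporting. A standalone sketch of that fan-out-and-collect pattern with placeholder work:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	jobs := []int{1, 2, 3, 4}

	var wg sync.WaitGroup
	wg.Add(len(jobs))

	// Buffered to len(jobs) so a failing worker never blocks on reporting,
	// mirroring how Flush sizes its error channel to the number of claims.
	errChan := make(chan error, len(jobs))

	for _, j := range jobs {
		j := j // capture the loop variable for the goroutine
		go func() {
			defer wg.Done()
			if j%2 == 0 {
				errChan <- fmt.Errorf("job %d failed", j)
			}
		}()
	}

	wg.Wait()
	close(errChan)

	for err := range errChan {
		fmt.Println("collected:", err)
	}
}
```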