├── todo.md ├── Makefile ├── .gitignore ├── logging.go ├── conflict_delegate.go ├── tag.sh ├── merge_delegate.go ├── test └── setup_subnet.sh ├── alive_delegate.go ├── broadcast_test.go ├── ping_delegate.go ├── logging_test.go ├── awareness_test.go ├── security_test.go ├── delegate.go ├── awareness.go ├── event_delegate.go ├── integ_test.go ├── transport.go ├── transport_test.go ├── mock_transport.go ├── broadcast.go ├── keyring_test.go ├── queue.go ├── suspicion.go ├── keyring.go ├── queue_test.go ├── suspicion_test.go ├── security.go ├── README.md ├── util_test.go ├── util.go ├── net_transport.go ├── config.go ├── LICENSE ├── memberlist.go ├── net_test.go └── net.go /todo.md: -------------------------------------------------------------------------------- 1 | # TODO 2 | * Dynamic RTT discovery 3 | * Compute 99th percentile for ping/ack 4 | * Better lower bound for ping/ack, faster failure detection 5 | * Dynamic MTU discovery 6 | * Prevent lost updates, increases efficiency 7 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: subnet 2 | go test ./... 3 | 4 | integ: subnet 5 | INTEG_TESTS=yes go test ./... 
6 | 7 | subnet: 8 | ./test/setup_subnet.sh 9 | 10 | cov: 11 | gocov test github.com/hashicorp/memberlist | gocov-html > /tmp/coverage.html 12 | open /tmp/coverage.html 13 | 14 | .PHONY: test cov integ 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | .vagrant/ 25 | 26 | -------------------------------------------------------------------------------- /logging.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "fmt" 5 | "net" 6 | ) 7 | 8 | func LogAddress(addr net.Addr) string { 9 | if addr == nil { 10 | return "from=" 11 | } 12 | 13 | return fmt.Sprintf("from=%s", addr.String()) 14 | } 15 | 16 | func LogConn(conn net.Conn) string { 17 | if conn == nil { 18 | return LogAddress(nil) 19 | } 20 | 21 | return LogAddress(conn.RemoteAddr()) 22 | } 23 | -------------------------------------------------------------------------------- /conflict_delegate.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | // ConflictDelegate is used to inform a client that 4 | // a node has attempted to join which would result in a 5 | // name conflict. This happens if two clients are configured 6 | // with the same name but different addresses.
7 | type ConflictDelegate interface { 8 | // NotifyConflict is invoked when a name conflict is detected 9 | NotifyConflict(existing, other *Node) 10 | } 11 | -------------------------------------------------------------------------------- /tag.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # The version must be supplied from the environment. Do not include the 5 | # leading "v". 6 | if [ -z $VERSION ]; then 7 | echo "Please specify a version." 8 | exit 1 9 | fi 10 | 11 | # Generate the tag. 12 | echo "==> Tagging version $VERSION..." 13 | git commit --allow-empty -a --gpg-sign=348FFC4C -m "Release v$VERSION" 14 | git tag -a -m "Version $VERSION" -s -u 348FFC4C "v${VERSION}" master 15 | 16 | exit 0 17 | -------------------------------------------------------------------------------- /merge_delegate.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | // MergeDelegate is used to involve a client in 4 | // a potential cluster merge operation. Namely, when 5 | // a node does a TCP push/pull (as part of a join), 6 | // the delegate is involved and allowed to cancel the join 7 | // based on custom logic. The merge delegate is NOT invoked 8 | // as part of the push-pull anti-entropy. 9 | type MergeDelegate interface { 10 | // NotifyMerge is invoked when a merge could take place. 11 | // Provides a list of the nodes known by the peer. If 12 | // the return value is non-nil, the merge is canceled. 13 | NotifyMerge(peers []*Node) error 14 | } 15 | -------------------------------------------------------------------------------- /test/setup_subnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script makes sure that 127.0.0.x is routable. On Darwin, there 4 | # is a bug that it isn't routable and this causes errors. 
5 | # 6 | 7 | # Check if loopback is setup 8 | ping -c 1 -W 10 127.0.0.2 > /dev/null 2>&1 9 | if [ $? -eq 0 ] 10 | then 11 | exit 12 | fi 13 | 14 | # If we're not on OS X, then error 15 | case $OSTYPE in 16 | darwin*) 17 | ;; 18 | *) 19 | echo "Can't setup interfaces on non-Mac. Error!" 20 | exit 1 21 | ;; 22 | esac 23 | 24 | # Setup loopback 25 | for ((i=2;i<256;i++)) 26 | do 27 | sudo ifconfig lo0 alias 127.0.0.$i up 28 | done 29 | -------------------------------------------------------------------------------- /alive_delegate.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | // AliveDelegate is used to involve a client in processing 4 | // a node "alive" message. When a node joins, either through 5 | // a UDP gossip or TCP push/pull, we update the state of 6 | // that node via an alive message. This can be used to filter 7 | // a node out and prevent it from being considered a peer 8 | // using application specific logic. 9 | type AliveDelegate interface { 10 | // NotifyAlive is invoked when an "alive" message for a node is 11 | // processed. It is given the node the message is about. If the 12 | // return value is non-nil, the node is not considered a peer.
13 | NotifyAlive(peer *Node) error 14 | } 15 | -------------------------------------------------------------------------------- /broadcast_test.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestMemberlistBroadcast_Invalidates(t *testing.T) { 9 | m1 := &memberlistBroadcast{"test", nil, nil} 10 | m2 := &memberlistBroadcast{"foo", nil, nil} 11 | 12 | if m1.Invalidates(m2) || m2.Invalidates(m1) { 13 | t.Fatalf("unexpected invalidation") 14 | } 15 | 16 | if !m1.Invalidates(m1) { 17 | t.Fatalf("expected invalidation") 18 | } 19 | } 20 | 21 | func TestMemberlistBroadcast_Message(t *testing.T) { 22 | m1 := &memberlistBroadcast{"test", []byte("test"), nil} 23 | msg := m1.Message() 24 | if !reflect.DeepEqual(msg, []byte("test")) { 25 | t.Fatalf("messages do not match") 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /ping_delegate.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import "time" 4 | 5 | // PingDelegate is used to notify an observer how long it took for a ping message to 6 | // complete a round trip. It can also be used for writing arbitrary byte slices 7 | // into ack messages. Note that in order to be meaningful for RTT estimates, this 8 | // delegate does not apply to indirect pings, nor fallback pings sent over TCP. 
9 | type PingDelegate interface { 10 | // AckPayload is invoked when an ack is being sent; the returned bytes will be appended to the ack 11 | AckPayload() []byte 12 | // NotifyPingComplete is invoked when an ack for a ping is received 13 | NotifyPingComplete(other *Node, rtt time.Duration, payload []byte) 14 | } 15 | -------------------------------------------------------------------------------- /logging_test.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "fmt" 5 | "net" 6 | "testing" 7 | ) 8 | 9 | func TestLogging_Address(t *testing.T) { 10 | s := LogAddress(nil) 11 | if s != "from=" { 12 | t.Fatalf("bad: %s", s) 13 | } 14 | 15 | addr, err := net.ResolveIPAddr("ip4", "127.0.0.1") 16 | if err != nil { 17 | t.Fatalf("err: %v", err) 18 | } 19 | 20 | s = LogAddress(addr) 21 | if s != "from=127.0.0.1" { 22 | t.Fatalf("bad: %s", s) 23 | } 24 | } 25 | 26 | func TestLogging_Conn(t *testing.T) { 27 | s := LogConn(nil) 28 | if s != "from=" { 29 | t.Fatalf("bad: %s", s) 30 | } 31 | 32 | ln, err := net.Listen("tcp", ":0") 33 | if err != nil { 34 | t.Fatalf("err: %v", err) 35 | } 36 | 37 | conn, err := net.Dial("tcp", ln.Addr().String()) 38 | if err != nil { 39 | t.Fatalf("err: %v", err) 40 | } 41 | defer conn.Close() 42 | 43 | s = LogConn(conn) 44 | if s != fmt.Sprintf("from=%s", conn.RemoteAddr().String()) { 45 | t.Fatalf("bad: %s", s) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /awareness_test.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestAwareness(t *testing.T) { 9 | cases := []struct { 10 | delta int 11 | score int 12 | timeout time.Duration 13 | }{ 14 | {0, 0, 1 * time.Second}, 15 | {-1, 0, 1 * time.Second}, 16 | {-10, 0, 1 * time.Second}, 17 | {1, 1, 2 * time.Second}, 18 | {-1, 0, 1 * time.Second}, 19 | {10, 7, 8 *
time.Second}, 20 | {-1, 6, 7 * time.Second}, 21 | {-1, 5, 6 * time.Second}, 22 | {-1, 4, 5 * time.Second}, 23 | {-1, 3, 4 * time.Second}, 24 | {-1, 2, 3 * time.Second}, 25 | {-1, 1, 2 * time.Second}, 26 | {-1, 0, 1 * time.Second}, 27 | {-1, 0, 1 * time.Second}, 28 | } 29 | 30 | a := newAwareness(8) 31 | for i, c := range cases { 32 | a.ApplyDelta(c.delta) 33 | if a.GetHealthScore() != c.score { 34 | t.Errorf("case %d: score mismatch %d != %d", i, a.score, c.score) 35 | } 36 | if timeout := a.ScaleTimeout(1 * time.Second); timeout != c.timeout { 37 | t.Errorf("case %d: scaled timeout mismatch %9.6f != %9.6f", 38 | i, timeout.Seconds(), c.timeout.Seconds()) 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /security_test.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "bytes" 5 | "reflect" 6 | "testing" 7 | ) 8 | 9 | func TestPKCS7(t *testing.T) { 10 | for i := 0; i <= 255; i++ { 11 | // Make a buffer of size i 12 | buf := []byte{} 13 | for j := 0; j < i; j++ { 14 | buf = append(buf, byte(i)) 15 | } 16 | 17 | // Copy to bytes buffer 18 | inp := bytes.NewBuffer(nil) 19 | inp.Write(buf) 20 | 21 | // Pad this out 22 | pkcs7encode(inp, 0, 16) 23 | 24 | // Unpad 25 | dec := pkcs7decode(inp.Bytes(), 16) 26 | 27 | // Ensure equivalence 28 | if !reflect.DeepEqual(buf, dec) { 29 | t.Fatalf("mismatch: %v %v", buf, dec) 30 | } 31 | } 32 | 33 | } 34 | 35 | func TestEncryptDecrypt_V0(t *testing.T) { 36 | encryptDecryptVersioned(0, t) 37 | } 38 | 39 | func TestEncryptDecrypt_V1(t *testing.T) { 40 | encryptDecryptVersioned(1, t) 41 | } 42 | 43 | func encryptDecryptVersioned(vsn encryptionVersion, t *testing.T) { 44 | k1 := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} 45 | plaintext := []byte("this is a plain text message") 46 | extra := []byte("random data") 47 | 48 | var buf bytes.Buffer 49 | err := encryptPayload(vsn, k1,
plaintext, extra, &buf) 50 | if err != nil { 51 | t.Fatalf("err: %v", err) 52 | } 53 | 54 | expLen := encryptedLength(vsn, len(plaintext)) 55 | if buf.Len() != expLen { 56 | t.Fatalf("output length is unexpected %d %d %d", len(plaintext), buf.Len(), expLen) 57 | } 58 | 59 | msg, err := decryptPayload([][]byte{k1}, buf.Bytes(), extra) 60 | if err != nil { 61 | t.Fatalf("err: %v", err) 62 | } 63 | 64 | cmp := bytes.Compare(msg, plaintext) 65 | if cmp != 0 { 66 | t.Errorf("len %d %v", len(msg), msg) 67 | t.Errorf("len %d %v", len(plaintext), plaintext) 68 | t.Fatalf("encrypt/decrypt failed! %d '%s' '%s'", cmp, msg, plaintext) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /delegate.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | // Delegate is the interface that clients must implement if they want to hook 4 | // into the gossip layer of Memberlist. All the methods must be thread-safe, 5 | // as they can and generally will be called concurrently. 6 | type Delegate interface { 7 | // NodeMeta is used to retrieve meta-data about the current node 8 | // when broadcasting an alive message. Its length is limited to 9 | // the given byte size. This metadata is available in the Node structure. 10 | NodeMeta(limit int) []byte 11 | 12 | // NotifyMsg is called when a user-data message is received. 13 | // Care should be taken that this method does not block, since doing 14 | // so would block the entire UDP packet receive loop. Additionally, the byte 15 | // slice may be modified after the call returns, so it should be copied if needed. 16 | NotifyMsg([]byte) 17 | 18 | // GetBroadcasts is called when user data messages can be broadcast. 19 | // It can return a list of buffers to send. Each buffer should assume an 20 | // overhead as provided with a limit on the total byte size allowed.
21 | // The total byte size of the resulting data to send must not exceed 22 | // the limit. Care should be taken that this method does not block, 23 | // since doing so would block the entire UDP packet receive loop. 24 | GetBroadcasts(overhead, limit int) [][]byte 25 | 26 | // LocalState is used for a TCP Push/Pull. This is sent to 27 | // the remote side in addition to the membership information. Any 28 | // data can be sent here. See MergeRemoteState as well. The `join` 29 | // boolean indicates this is for a join instead of a push/pull. 30 | LocalState(join bool) []byte 31 | 32 | // MergeRemoteState is invoked after a TCP Push/Pull. This is the 33 | // state received from the remote side and is the result of the 34 | // remote side's LocalState call. The `join` 35 | // boolean indicates this is for a join instead of a push/pull. 36 | MergeRemoteState(buf []byte, join bool) 37 | } 38 | -------------------------------------------------------------------------------- /awareness.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | 7 | "github.com/armon/go-metrics" 8 | ) 9 | 10 | // awareness manages a simple metric for tracking the estimated health of the 11 | // local node. Health is primarily the node's ability to respond in the soft 12 | // real-time manner required for correct health checking of other nodes in the 13 | // cluster. 14 | type awareness struct { 15 | sync.RWMutex 16 | 17 | // max is the upper threshold for the timeout scale (the score will be 18 | // constrained to be from 0 <= score < max). 19 | max int 20 | 21 | // score is the current awareness score. Lower values are healthier and 22 | // zero is the minimum value. 23 | score int 24 | } 25 | 26 | // newAwareness returns a new awareness object.
27 | func newAwareness(max int) *awareness { 28 | return &awareness{ 29 | max: max, 30 | score: 0, 31 | } 32 | } 33 | 34 | // ApplyDelta takes the given delta and applies it to the score in a thread-safe 35 | // manner. It also enforces a floor of zero and a ceiling of max-1, so deltas may not 36 | // change the overall score if it's railed at one of the extremes. 37 | func (a *awareness) ApplyDelta(delta int) { 38 | a.Lock() 39 | initial := a.score 40 | a.score += delta 41 | if a.score < 0 { 42 | a.score = 0 43 | } else if a.score > (a.max - 1) { 44 | a.score = (a.max - 1) 45 | } 46 | final := a.score 47 | a.Unlock() 48 | 49 | if initial != final { 50 | metrics.SetGauge([]string{"memberlist", "health", "score"}, float32(final)) 51 | } 52 | } 53 | 54 | // GetHealthScore returns the raw health score. 55 | func (a *awareness) GetHealthScore() int { 56 | a.RLock() 57 | score := a.score 58 | a.RUnlock() 59 | return score 60 | } 61 | 62 | // ScaleTimeout takes the given duration and scales it based on the current 63 | // score. Less healthiness will lead to longer timeouts. 64 | func (a *awareness) ScaleTimeout(timeout time.Duration) time.Duration { 65 | a.RLock() 66 | score := a.score 67 | a.RUnlock() 68 | return timeout * (time.Duration(score) + 1) 69 | } 70 | -------------------------------------------------------------------------------- /event_delegate.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | // EventDelegate is a simpler delegate that is used only to receive 4 | // notifications about members joining and leaving. The methods in this 5 | // delegate may be called by multiple goroutines, but never concurrently. 6 | // This allows you to reason about ordering. 7 | type EventDelegate interface { 8 | // NotifyJoin is invoked when a node is detected to have joined. 9 | // The Node argument must not be modified.
10 | NotifyJoin(*Node) 11 | 12 | // NotifyLeave is invoked when a node is detected to have left. 13 | // The Node argument must not be modified. 14 | NotifyLeave(*Node) 15 | 16 | // NotifyUpdate is invoked when a node is detected to have 17 | // updated, usually involving the meta data. The Node argument 18 | // must not be modified. 19 | NotifyUpdate(*Node) 20 | } 21 | 22 | // ChannelEventDelegate is used to enable an application to receive 23 | // events about joins and leaves over a channel instead of a direct 24 | // function call. 25 | // 26 | // Care must be taken that events are processed in a timely manner from 27 | // the channel, since this delegate will block until an event can be sent. 28 | type ChannelEventDelegate struct { 29 | Ch chan<- NodeEvent 30 | } 31 | 32 | // NodeEventType are the types of events that can be sent from the 33 | // ChannelEventDelegate. 34 | type NodeEventType int 35 | 36 | const ( 37 | NodeJoin NodeEventType = iota 38 | NodeLeave 39 | NodeUpdate 40 | ) 41 | 42 | // NodeEvent is a single event related to node activity in the memberlist. 43 | // The Node member of this struct must not be directly modified. It is passed 44 | // as a pointer to avoid unnecessary copies. If you wish to modify the node, 45 | // make a copy first. 
46 | type NodeEvent struct { 47 | Event NodeEventType 48 | Node *Node 49 | } 50 | 51 | func (c *ChannelEventDelegate) NotifyJoin(n *Node) { 52 | c.Ch <- NodeEvent{NodeJoin, n} 53 | } 54 | 55 | func (c *ChannelEventDelegate) NotifyLeave(n *Node) { 56 | c.Ch <- NodeEvent{NodeLeave, n} 57 | } 58 | 59 | func (c *ChannelEventDelegate) NotifyUpdate(n *Node) { 60 | c.Ch <- NodeEvent{NodeUpdate, n} 61 | } 62 | -------------------------------------------------------------------------------- /integ_test.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "testing" 8 | "time" 9 | ) 10 | 11 | // CheckInteg will skip a test if integration testing is not enabled. 12 | func CheckInteg(t *testing.T) { 13 | if !IsInteg() { 14 | t.SkipNow() 15 | } 16 | } 17 | 18 | // IsInteg returns a boolean telling you if we're in integ testing mode. 19 | func IsInteg() bool { 20 | return os.Getenv("INTEG_TESTS") != "" 21 | } 22 | 23 | // Tests the memberlist by creating a cluster of nodes 24 | // and checking that we get strong convergence of changes.
25 | func TestMemberlist_Integ(t *testing.T) { 26 | CheckInteg(t) 27 | 28 | num := 16 29 | var members []*Memberlist 30 | 31 | secret := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} 32 | eventCh := make(chan NodeEvent, num) 33 | 34 | addr := "127.0.0.1" 35 | for i := 0; i < num; i++ { 36 | c := DefaultLANConfig() 37 | c.Name = fmt.Sprintf("%s:%d", addr, 12345+i) 38 | c.BindAddr = addr 39 | c.BindPort = 12345 + i 40 | c.ProbeInterval = 20 * time.Millisecond 41 | c.ProbeTimeout = 100 * time.Millisecond 42 | c.GossipInterval = 20 * time.Millisecond 43 | c.PushPullInterval = 200 * time.Millisecond 44 | c.SecretKey = secret 45 | 46 | if i == 0 { 47 | c.Events = &ChannelEventDelegate{eventCh} 48 | } 49 | 50 | m, err := Create(c) 51 | if err != nil { 52 | t.Fatalf("unexpected err: %s", err) 53 | } 54 | members = append(members, m) 55 | defer m.Shutdown() 56 | 57 | if i > 0 { 58 | last := members[i-1] 59 | num, err := m.Join([]string{last.config.Name}) 60 | if num == 0 || err != nil { 61 | t.Fatalf("unexpected err: %s", err) 62 | } 63 | } 64 | } 65 | 66 | // Wait and print debug info 67 | breakTimer := time.After(250 * time.Millisecond) 68 | WAIT: 69 | for { 70 | select { 71 | case e := <-eventCh: 72 | if e.Event == NodeJoin { 73 | log.Printf("[DEBUG] Node join: %v (%d)", *e.Node, members[0].NumMembers()) 74 | } else { 75 | log.Printf("[DEBUG] Node leave: %v (%d)", *e.Node, members[0].NumMembers()) 76 | } 77 | case <-breakTimer: 78 | break WAIT 79 | } 80 | } 81 | 82 | for idx, m := range members { 83 | got := m.NumMembers() 84 | if got != num { 85 | t.Errorf("bad num members at idx %d. Expected %d. 
Got %d.", 86 | idx, num, got) 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /transport.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "net" 5 | "time" 6 | ) 7 | 8 | // Packet is used to provide some metadata about incoming packets from peers 9 | // over a packet connection, as well as the packet payload. 10 | type Packet struct { 11 | // Buf has the raw contents of the packet. 12 | Buf []byte 13 | 14 | // From has the address of the peer. This is an actual net.Addr so we 15 | // can expose some concrete details about incoming packets. 16 | From net.Addr 17 | 18 | // Timestamp is the time when the packet was received. This should be 19 | // taken as close as possible to the actual receipt time to help make 20 | // accurate RTT measurements during probes. 21 | Timestamp time.Time 22 | } 23 | 24 | // Transport is used to abstract over communicating with other peers. The packet 25 | // interface is assumed to be best-effort and the stream interface is assumed to 26 | // be reliable. 27 | type Transport interface { 28 | // FinalAdvertiseAddr is given the user's configured values (which 29 | // might be empty) and returns the desired IP and port to advertise to 30 | // the rest of the cluster. 31 | FinalAdvertiseAddr(ip string, port int) (net.IP, int, error) 32 | 33 | // WriteTo is a packet-oriented interface that fires off the given 34 | // payload to the given address in a connectionless fashion. This should 35 | // return a time stamp that's as close as possible to when the packet 36 | // was transmitted to help make accurate RTT measurements during probes. 37 | // 38 | // This is similar to net.PacketConn, though we didn't want to expose 39 | // that full set of required methods to keep assumptions about the 40 | // underlying plumbing to a minimum.
We also treat the address here as a 41 | // string, similar to Dial, so it's network neutral, so this usually is 42 | // in the form of "host:port". 43 | WriteTo(b []byte, addr string) (time.Time, error) 44 | 45 | // PacketCh returns a channel that can be read to receive incoming 46 | // packets from other peers. How this is set up for listening is left as 47 | // an exercise for the concrete transport implementations. 48 | PacketCh() <-chan *Packet 49 | 50 | // DialTimeout is used to create a connection that allows us to perform 51 | // two-way communication with a peer. This is generally more expensive 52 | // than packet connections so is used for more infrequent operations 53 | // such as anti-entropy or fallback probes if the packet-oriented probe 54 | // failed. 55 | DialTimeout(addr string, timeout time.Duration) (net.Conn, error) 56 | 57 | // StreamCh returns a channel that can be read to handle incoming stream 58 | // connections from other peers. How this is set up for listening is 59 | // left as an exercise for the concrete transport implementations. 60 | StreamCh() <-chan net.Conn 61 | 62 | // Shutdown is called when memberlist is shutting down; this gives the 63 | // transport a chance to clean up any listeners. 
64 | Shutdown() error 65 | } 66 | -------------------------------------------------------------------------------- /transport_test.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | func TestTransport_Join(t *testing.T) { 10 | net := &MockNetwork{} 11 | 12 | t1 := net.NewTransport() 13 | 14 | c1 := DefaultLANConfig() 15 | c1.Name = "node1" 16 | c1.Transport = t1 17 | m1, err := Create(c1) 18 | if err != nil { 19 | t.Fatalf("err: %v", err) 20 | } 21 | m1.setAlive() 22 | m1.schedule() 23 | defer m1.Shutdown() 24 | 25 | c2 := DefaultLANConfig() 26 | c2.Name = "node2" 27 | c2.Transport = net.NewTransport() 28 | m2, err := Create(c2) 29 | if err != nil { 30 | t.Fatalf("err: %v", err) 31 | } 32 | m2.setAlive() 33 | m2.schedule() 34 | defer m2.Shutdown() 35 | 36 | num, err := m2.Join([]string{t1.addr.String()}) 37 | if num != 1 { 38 | t.Fatalf("bad: %d", num) 39 | } 40 | if err != nil { 41 | t.Fatalf("err: %v", err) 42 | } 43 | 44 | if len(m2.Members()) != 2 { 45 | t.Fatalf("bad: %v", m2.Members()) 46 | } 47 | if m2.estNumNodes() != 2 { 48 | t.Fatalf("bad: %v", m2.Members()) 49 | } 50 | 51 | } 52 | 53 | func TestTransport_Send(t *testing.T) { 54 | net := &MockNetwork{} 55 | 56 | t1 := net.NewTransport() 57 | d1 := &MockDelegate{} 58 | 59 | c1 := DefaultLANConfig() 60 | c1.Name = "node1" 61 | c1.Transport = t1 62 | c1.Delegate = d1 63 | m1, err := Create(c1) 64 | if err != nil { 65 | t.Fatalf("err: %v", err) 66 | } 67 | m1.setAlive() 68 | m1.schedule() 69 | defer m1.Shutdown() 70 | 71 | c2 := DefaultLANConfig() 72 | c2.Name = "node2" 73 | c2.Transport = net.NewTransport() 74 | m2, err := Create(c2) 75 | if err != nil { 76 | t.Fatalf("err: %v", err) 77 | } 78 | m2.setAlive() 79 | m2.schedule() 80 | defer m2.Shutdown() 81 | 82 | num, err := m2.Join([]string{t1.addr.String()}) 83 | if num != 1 { 84 | t.Fatalf("bad: %d", num) 85 | } 86 | if err != nil { 
87 | t.Fatalf("err: %v", err) 88 | } 89 | 90 | if err := m2.SendTo(t1.addr, []byte("SendTo")); err != nil { 91 | t.Fatalf("err: %v", err) 92 | } 93 | 94 | var n1 *Node 95 | for _, n := range m2.Members() { 96 | if n.Name == c1.Name { 97 | n1 = n 98 | break 99 | } 100 | } 101 | if n1 == nil { 102 | t.Fatalf("bad") 103 | } 104 | 105 | if err := m2.SendToUDP(n1, []byte("SendToUDP")); err != nil { 106 | t.Fatalf("err: %v", err) 107 | } 108 | if err := m2.SendToTCP(n1, []byte("SendToTCP")); err != nil { 109 | t.Fatalf("err: %v", err) 110 | } 111 | if err := m2.SendBestEffort(n1, []byte("SendBestEffort")); err != nil { 112 | t.Fatalf("err: %v", err) 113 | } 114 | if err := m2.SendReliable(n1, []byte("SendReliable")); err != nil { 115 | t.Fatalf("err: %v", err) 116 | } 117 | time.Sleep(100 * time.Millisecond) 118 | 119 | received := bytes.Join(d1.msgs, []byte("|")) 120 | expected := []byte("SendTo|SendToUDP|SendToTCP|SendBestEffort|SendReliable") 121 | if !bytes.Equal(received, expected) { 122 | t.Fatalf("bad: %s", received) 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /mock_transport.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "fmt" 5 | "net" 6 | "strconv" 7 | "time" 8 | ) 9 | 10 | // MockNetwork is used as a factory that produces MockTransport instances which 11 | // are uniquely addressed and wired up to talk to each other. 12 | type MockNetwork struct { 13 | transports map[string]*MockTransport 14 | port int 15 | } 16 | 17 | // NewTransport returns a new MockTransport with a unique address, wired up to 18 | // talk to the other transports in the MockNetwork. 
19 | func (n *MockNetwork) NewTransport() *MockTransport { 20 | n.port += 1 21 | addr := fmt.Sprintf("127.0.0.1:%d", n.port) 22 | transport := &MockTransport{ 23 | net: n, 24 | addr: &MockAddress{addr}, 25 | packetCh: make(chan *Packet), 26 | streamCh: make(chan net.Conn), 27 | } 28 | 29 | if n.transports == nil { 30 | n.transports = make(map[string]*MockTransport) 31 | } 32 | n.transports[addr] = transport 33 | return transport 34 | } 35 | 36 | // MockAddress is a wrapper which adds the net.Addr interface to our mock 37 | // address scheme. 38 | type MockAddress struct { 39 | addr string 40 | } 41 | 42 | // See net.Addr. 43 | func (a *MockAddress) Network() string { 44 | return "mock" 45 | } 46 | 47 | // See net.Addr. 48 | func (a *MockAddress) String() string { 49 | return a.addr 50 | } 51 | 52 | // MockTransport directly plumbs messages to other transports in its MockNetwork. 53 | type MockTransport struct { 54 | net *MockNetwork 55 | addr *MockAddress 56 | packetCh chan *Packet 57 | streamCh chan net.Conn 58 | } 59 | 60 | // See Transport. 61 | func (t *MockTransport) FinalAdvertiseAddr(string, int) (net.IP, int, error) { 62 | host, portStr, err := net.SplitHostPort(t.addr.String()) 63 | if err != nil { 64 | return nil, 0, err 65 | } 66 | 67 | ip := net.ParseIP(host) 68 | if ip == nil { 69 | return nil, 0, fmt.Errorf("Failed to parse IP %q", host) 70 | } 71 | 72 | port, err := strconv.ParseInt(portStr, 10, 16) 73 | if err != nil { 74 | return nil, 0, err 75 | } 76 | 77 | return ip, int(port), nil 78 | } 79 | 80 | // See Transport. 81 | func (t *MockTransport) WriteTo(b []byte, addr string) (time.Time, error) { 82 | dest, ok := t.net.transports[addr] 83 | if !ok { 84 | return time.Time{}, fmt.Errorf("No route to %q", addr) 85 | } 86 | 87 | now := time.Now() 88 | dest.packetCh <- &Packet{ 89 | Buf: b, 90 | From: t.addr, 91 | Timestamp: now, 92 | } 93 | return now, nil 94 | } 95 | 96 | // See Transport.
97 | func (t *MockTransport) PacketCh() <-chan *Packet { 98 | return t.packetCh 99 | } 100 | 101 | // See Transport. 102 | func (t *MockTransport) DialTimeout(addr string, timeout time.Duration) (net.Conn, error) { 103 | dest, ok := t.net.transports[addr] 104 | if !ok { 105 | return nil, fmt.Errorf("No route to %q", addr) 106 | } 107 | 108 | p1, p2 := net.Pipe() 109 | dest.streamCh <- p1 110 | return p2, nil 111 | } 112 | 113 | // See Transport. 114 | func (t *MockTransport) StreamCh() <-chan net.Conn { 115 | return t.streamCh 116 | } 117 | 118 | // See Transport. 119 | func (t *MockTransport) Shutdown() error { 120 | return nil 121 | } 122 | -------------------------------------------------------------------------------- /broadcast.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | /* 4 | The broadcast mechanism works by maintaining a sorted list of messages to be 5 | sent out. When a message is to be broadcast, the retransmit count 6 | is set to zero and appended to the queue. The retransmit count serves 7 | as the "priority", ensuring that newer messages get sent first. Once 8 | a message hits the retransmit limit, it is removed from the queue. 9 | 10 | Additionally, older entries can be invalidated by new messages that 11 | are contradictory. 
For example, if we send "{suspect M1 inc: 1}, 12 | then a following {alive M1 inc: 2} will invalidate that message 13 | */ 14 | 15 | type memberlistBroadcast struct { 16 | node string 17 | msg []byte 18 | notify chan struct{} 19 | } 20 | 21 | func (b *memberlistBroadcast) Invalidates(other Broadcast) bool { 22 | // Check if that broadcast is a memberlist type 23 | mb, ok := other.(*memberlistBroadcast) 24 | if !ok { 25 | return false 26 | } 27 | 28 | // Invalidates any message about the same node 29 | return b.node == mb.node 30 | } 31 | 32 | func (b *memberlistBroadcast) Message() []byte { 33 | return b.msg 34 | } 35 | 36 | func (b *memberlistBroadcast) Finished() { 37 | select { 38 | case b.notify <- struct{}{}: 39 | default: 40 | } 41 | } 42 | 43 | // encodeAndBroadcast encodes a message and enqueues it for broadcast. Fails 44 | // silently if there is an encoding error. 45 | func (m *Memberlist) encodeAndBroadcast(node string, msgType messageType, msg interface{}) { 46 | m.encodeBroadcastNotify(node, msgType, msg, nil) 47 | } 48 | 49 | // encodeBroadcastNotify encodes a message and enqueues it for broadcast 50 | // and notifies the given channel when transmission is finished. Fails 51 | // silently if there is an encoding error. 52 | func (m *Memberlist) encodeBroadcastNotify(node string, msgType messageType, msg interface{}, notify chan struct{}) { 53 | buf, err := encode(msgType, msg) 54 | if err != nil { 55 | m.logger.Printf("[ERR] memberlist: Failed to encode message for broadcast: %s", err) 56 | } else { 57 | m.queueBroadcast(node, buf.Bytes(), notify) 58 | } 59 | } 60 | 61 | // queueBroadcast is used to start dissemination of a message. It will be 62 | // sent up to a configured number of times. 
The message could potentially 63 | // be invalidated by a future message about the same node 64 | func (m *Memberlist) queueBroadcast(node string, msg []byte, notify chan struct{}) { 65 | b := &memberlistBroadcast{node, msg, notify} 66 | m.broadcasts.QueueBroadcast(b) 67 | } 68 | 69 | // getBroadcasts is used to return a slice of broadcasts to send up to 70 | // a maximum byte size, while imposing a per-broadcast overhead. This is used 71 | // to fill a UDP packet with piggybacked data 72 | func (m *Memberlist) getBroadcasts(overhead, limit int) [][]byte { 73 | // Get memberlist messages first 74 | toSend := m.broadcasts.GetBroadcasts(overhead, limit) 75 | 76 | // Check if the user has anything to broadcast 77 | d := m.config.Delegate 78 | if d != nil { 79 | // Determine the bytes used already 80 | bytesUsed := 0 81 | for _, msg := range toSend { 82 | bytesUsed += len(msg) + overhead 83 | } 84 | 85 | // Check space remaining for user messages 86 | avail := limit - bytesUsed 87 | if avail > overhead+userMsgOverhead { 88 | userMsgs := d.GetBroadcasts(overhead+userMsgOverhead, avail) 89 | 90 | // Frame each user message 91 | for _, msg := range userMsgs { 92 | buf := make([]byte, 1, len(msg)+1) 93 | buf[0] = byte(userMsg) 94 | buf = append(buf, msg...) 
95 | toSend = append(toSend, buf) 96 | } 97 | } 98 | } 99 | return toSend 100 | } 101 | -------------------------------------------------------------------------------- /keyring_test.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | ) 7 | 8 | var TestKeys [][]byte = [][]byte{ 9 | []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, 10 | []byte{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 11 | []byte{8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, 12 | } 13 | 14 | func TestKeyring_EmptyRing(t *testing.T) { 15 | // Keyrings can be created with no encryption keys (disabled encryption) 16 | keyring, err := NewKeyring(nil, nil) 17 | if err != nil { 18 | t.Fatalf("err: %s", err) 19 | } 20 | 21 | keys := keyring.GetKeys() 22 | if len(keys) != 0 { 23 | t.Fatalf("Expected 0 keys but have %d", len(keys)) 24 | } 25 | } 26 | 27 | func TestKeyring_PrimaryOnly(t *testing.T) { 28 | // Keyrings can be created using only a primary key 29 | keyring, err := NewKeyring(nil, TestKeys[0]) 30 | if err != nil { 31 | t.Fatalf("err: %s", err) 32 | } 33 | 34 | keys := keyring.GetKeys() 35 | if len(keys) != 1 { 36 | t.Fatalf("Expected 1 key but have %d", len(keys)) 37 | } 38 | } 39 | 40 | func TestKeyring_GetPrimaryKey(t *testing.T) { 41 | keyring, err := NewKeyring(TestKeys, TestKeys[1]) 42 | if err != nil { 43 | t.Fatalf("err: %s", err) 44 | } 45 | 46 | // GetPrimaryKey returns correct key 47 | primaryKey := keyring.GetPrimaryKey() 48 | if !bytes.Equal(primaryKey, TestKeys[1]) { 49 | t.Fatalf("Unexpected primary key: %v", primaryKey) 50 | } 51 | } 52 | 53 | func TestKeyring_AddRemoveUse(t *testing.T) { 54 | keyring, err := NewKeyring(nil, TestKeys[1]) 55 | if err != nil { 56 | t.Fatalf("err :%s", err) 57 | } 58 | 59 | // Use non-existent key throws error 60 | if err := keyring.UseKey(TestKeys[2]); err == nil { 61 | t.Fatalf("Expected key not installed error") 62 
| } 63 | 64 | // Add key to ring 65 | if err := keyring.AddKey(TestKeys[2]); err != nil { 66 | t.Fatalf("err: %s", err) 67 | } 68 | 69 | keys := keyring.GetKeys() 70 | if !bytes.Equal(keys[0], TestKeys[1]) { 71 | t.Fatalf("Unexpected primary key change") 72 | } 73 | 74 | if len(keys) != 2 { 75 | t.Fatalf("Expected 2 keys but have %d", len(keys)) 76 | } 77 | 78 | // Use key that exists should succeed 79 | if err := keyring.UseKey(TestKeys[2]); err != nil { 80 | t.Fatalf("err: %s", err) 81 | } 82 | 83 | primaryKey := keyring.GetPrimaryKey() 84 | if !bytes.Equal(primaryKey, TestKeys[2]) { 85 | t.Fatalf("Unexpected primary key: %v", primaryKey) 86 | } 87 | 88 | // Removing primary key should fail 89 | if err := keyring.RemoveKey(TestKeys[2]); err == nil { 90 | t.Fatalf("Expected primary key removal error") 91 | } 92 | 93 | // Removing non-primary key should succeed 94 | if err := keyring.RemoveKey(TestKeys[1]); err != nil { 95 | t.Fatalf("err: %s", err) 96 | } 97 | 98 | keys = keyring.GetKeys() 99 | if len(keys) != 1 { 100 | t.Fatalf("Expected 1 key but have %d", len(keys)) 101 | } 102 | } 103 | 104 | func TestKeyRing_MultiKeyEncryptDecrypt(t *testing.T) { 105 | plaintext := []byte("this is a plain text message") 106 | extra := []byte("random data") 107 | 108 | keyring, err := NewKeyring(TestKeys, TestKeys[0]) 109 | if err != nil { 110 | t.Fatalf("err: %s", err) 111 | } 112 | 113 | // First encrypt using the primary key and make sure we can decrypt 114 | var buf bytes.Buffer 115 | err = encryptPayload(1, TestKeys[0], plaintext, extra, &buf) 116 | if err != nil { 117 | t.Fatalf("err: %v", err) 118 | } 119 | 120 | msg, err := decryptPayload(keyring.GetKeys(), buf.Bytes(), extra) 121 | if err != nil { 122 | t.Fatalf("err: %v", err) 123 | } 124 | 125 | if !bytes.Equal(msg, plaintext) { 126 | t.Fatalf("bad: %v", msg) 127 | } 128 | 129 | // Now encrypt with a secondary key and try decrypting again. 
130 | buf.Reset() 131 | err = encryptPayload(1, TestKeys[2], plaintext, extra, &buf) 132 | if err != nil { 133 | t.Fatalf("err: %v", err) 134 | } 135 | 136 | msg, err = decryptPayload(keyring.GetKeys(), buf.Bytes(), extra) 137 | if err != nil { 138 | t.Fatalf("err: %v", err) 139 | } 140 | 141 | if !bytes.Equal(msg, plaintext) { 142 | t.Fatalf("bad: %v", msg) 143 | } 144 | 145 | // Remove a key from the ring, and then try decrypting again 146 | if err := keyring.RemoveKey(TestKeys[2]); err != nil { 147 | t.Fatalf("err: %s", err) 148 | } 149 | 150 | msg, err = decryptPayload(keyring.GetKeys(), buf.Bytes(), extra) 151 | if err == nil { 152 | t.Fatalf("Expected no keys to decrypt message") 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /queue.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "sort" 5 | "sync" 6 | ) 7 | 8 | // TransmitLimitedQueue is used to queue messages to broadcast to 9 | // the cluster (via gossip) but limits the number of transmits per 10 | // message. It also prioritizes messages with lower transmit counts 11 | // (hence newer messages). 12 | type TransmitLimitedQueue struct { 13 | // NumNodes returns the number of nodes in the cluster. This is 14 | // used to determine the retransmit count, which is calculated 15 | // based on the log of this. 16 | NumNodes func() int 17 | 18 | // RetransmitMult is the multiplier used to determine the maximum 19 | // number of retransmissions attempted. 20 | RetransmitMult int 21 | 22 | sync.Mutex 23 | bcQueue limitedBroadcasts 24 | } 25 | 26 | type limitedBroadcast struct { 27 | transmits int // Number of transmissions attempted. 28 | b Broadcast 29 | } 30 | type limitedBroadcasts []*limitedBroadcast 31 | 32 | // Broadcast is something that can be broadcasted via gossip to 33 | // the memberlist cluster. 
34 | type Broadcast interface { 35 | // Invalidates checks if enqueuing the current broadcast 36 | // invalidates a previous broadcast 37 | Invalidates(b Broadcast) bool 38 | 39 | // Returns a byte form of the message 40 | Message() []byte 41 | 42 | // Finished is invoked when the message will no longer 43 | // be broadcast, either due to invalidation or to the 44 | // transmit limit being reached 45 | Finished() 46 | } 47 | 48 | // QueueBroadcast is used to enqueue a broadcast 49 | func (q *TransmitLimitedQueue) QueueBroadcast(b Broadcast) { 50 | q.Lock() 51 | defer q.Unlock() 52 | 53 | // Check if this message invalidates another 54 | n := len(q.bcQueue) 55 | for i := 0; i < n; i++ { 56 | if b.Invalidates(q.bcQueue[i].b) { 57 | q.bcQueue[i].b.Finished() 58 | copy(q.bcQueue[i:], q.bcQueue[i+1:]) 59 | q.bcQueue[n-1] = nil 60 | q.bcQueue = q.bcQueue[:n-1] 61 | n-- 62 | } 63 | } 64 | 65 | // Append to the queue 66 | q.bcQueue = append(q.bcQueue, &limitedBroadcast{0, b}) 67 | } 68 | 69 | // GetBroadcasts is used to get a number of broadcasts, up to a byte limit 70 | // and applying a per-message overhead as provided. 
71 | func (q *TransmitLimitedQueue) GetBroadcasts(overhead, limit int) [][]byte { 72 | q.Lock() 73 | defer q.Unlock() 74 | 75 | // Fast path the default case 76 | if len(q.bcQueue) == 0 { 77 | return nil 78 | } 79 | 80 | transmitLimit := retransmitLimit(q.RetransmitMult, q.NumNodes()) 81 | bytesUsed := 0 82 | var toSend [][]byte 83 | 84 | for i := len(q.bcQueue) - 1; i >= 0; i-- { 85 | // Check if this is within our limits 86 | b := q.bcQueue[i] 87 | msg := b.b.Message() 88 | if bytesUsed+overhead+len(msg) > limit { 89 | continue 90 | } 91 | 92 | // Add to slice to send 93 | bytesUsed += overhead + len(msg) 94 | toSend = append(toSend, msg) 95 | 96 | // Check if we should stop transmission 97 | b.transmits++ 98 | if b.transmits >= transmitLimit { 99 | b.b.Finished() 100 | n := len(q.bcQueue) 101 | q.bcQueue[i], q.bcQueue[n-1] = q.bcQueue[n-1], nil 102 | q.bcQueue = q.bcQueue[:n-1] 103 | } 104 | } 105 | 106 | // If we are sending anything, we need to re-sort to deal 107 | // with adjusted transmit counts 108 | if len(toSend) > 0 { 109 | q.bcQueue.Sort() 110 | } 111 | return toSend 112 | } 113 | 114 | // NumQueued returns the number of queued messages 115 | func (q *TransmitLimitedQueue) NumQueued() int { 116 | q.Lock() 117 | defer q.Unlock() 118 | return len(q.bcQueue) 119 | } 120 | 121 | // Reset clears all the queued messages 122 | func (q *TransmitLimitedQueue) Reset() { 123 | q.Lock() 124 | defer q.Unlock() 125 | for _, b := range q.bcQueue { 126 | b.b.Finished() 127 | } 128 | q.bcQueue = nil 129 | } 130 | 131 | // Prune will retain the maxRetain latest messages, and the rest 132 | // will be discarded. 
This can be used to prevent unbounded queue sizes 133 | func (q *TransmitLimitedQueue) Prune(maxRetain int) { 134 | q.Lock() 135 | defer q.Unlock() 136 | 137 | // Do nothing if queue size is less than the limit 138 | n := len(q.bcQueue) 139 | if n < maxRetain { 140 | return 141 | } 142 | 143 | // Invalidate the messages we will be removing 144 | for i := 0; i < n-maxRetain; i++ { 145 | q.bcQueue[i].b.Finished() 146 | } 147 | 148 | // Move the messages, and retain only the last maxRetain 149 | copy(q.bcQueue[0:], q.bcQueue[n-maxRetain:]) 150 | q.bcQueue = q.bcQueue[:maxRetain] 151 | } 152 | 153 | func (b limitedBroadcasts) Len() int { 154 | return len(b) 155 | } 156 | 157 | func (b limitedBroadcasts) Less(i, j int) bool { 158 | return b[i].transmits < b[j].transmits 159 | } 160 | 161 | func (b limitedBroadcasts) Swap(i, j int) { 162 | b[i], b[j] = b[j], b[i] 163 | } 164 | 165 | func (b limitedBroadcasts) Sort() { 166 | sort.Sort(sort.Reverse(b)) 167 | } 168 | -------------------------------------------------------------------------------- /suspicion.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "math" 5 | "sync/atomic" 6 | "time" 7 | ) 8 | 9 | // suspicion manages the suspect timer for a node and provides an interface 10 | // to accelerate the timeout as we get more independent confirmations that 11 | // a node is suspect. 12 | type suspicion struct { 13 | // n is the number of independent confirmations we've seen. This must 14 | // be updated using atomic instructions to prevent contention with the 15 | // timer callback. 16 | n int32 17 | 18 | // k is the number of independent confirmations we'd like to see in 19 | // order to drive the timer to its minimum value. 20 | k int32 21 | 22 | // min is the minimum timer value. 23 | min time.Duration 24 | 25 | // max is the maximum timer value. 26 | max time.Duration 27 | 28 | // start captures the timestamp when we began the timer. 
This is used 29 | // so we can calculate durations to feed the timer during updates in 30 | // a way the achieves the overall time we'd like. 31 | start time.Time 32 | 33 | // timer is the underlying timer that implements the timeout. 34 | timer *time.Timer 35 | 36 | // f is the function to call when the timer expires. We hold on to this 37 | // because there are cases where we call it directly. 38 | timeoutFn func() 39 | 40 | // confirmations is a map of "from" nodes that have confirmed a given 41 | // node is suspect. This prevents double counting. 42 | confirmations map[string]struct{} 43 | } 44 | 45 | // newSuspicion returns a timer started with the max time, and that will drive 46 | // to the min time after seeing k or more confirmations. The from node will be 47 | // excluded from confirmations since we might get our own suspicion message 48 | // gossiped back to us. The minimum time will be used if no confirmations are 49 | // called for (k <= 0). 50 | func newSuspicion(from string, k int, min time.Duration, max time.Duration, fn func(int)) *suspicion { 51 | s := &suspicion{ 52 | k: int32(k), 53 | min: min, 54 | max: max, 55 | confirmations: make(map[string]struct{}), 56 | } 57 | 58 | // Exclude the from node from any confirmations. 59 | s.confirmations[from] = struct{}{} 60 | 61 | // Pass the number of confirmations into the timeout function for 62 | // easy telemetry. 63 | s.timeoutFn = func() { 64 | fn(int(atomic.LoadInt32(&s.n))) 65 | } 66 | 67 | // If there aren't any confirmations to be made then take the min 68 | // time from the start. 69 | timeout := max 70 | if k < 1 { 71 | timeout = min 72 | } 73 | s.timer = time.AfterFunc(timeout, s.timeoutFn) 74 | 75 | // Capture the start time right after starting the timer above so 76 | // we should always err on the side of a little longer timeout if 77 | // there's any preemption that separates this and the step above. 
78 | s.start = time.Now() 79 | return s 80 | } 81 | 82 | // remainingSuspicionTime takes the state variables of the suspicion timer and 83 | // calculates the remaining time to wait before considering a node dead. The 84 | // return value can be negative, so be prepared to fire the timer immediately in 85 | // that case. 86 | func remainingSuspicionTime(n, k int32, elapsed time.Duration, min, max time.Duration) time.Duration { 87 | frac := math.Log(float64(n)+1.0) / math.Log(float64(k)+1.0) 88 | raw := max.Seconds() - frac*(max.Seconds()-min.Seconds()) 89 | timeout := time.Duration(math.Floor(1000.0*raw)) * time.Millisecond 90 | if timeout < min { 91 | timeout = min 92 | } 93 | 94 | // We have to take into account the amount of time that has passed so 95 | // far, so we get the right overall timeout. 96 | return timeout - elapsed 97 | } 98 | 99 | // Confirm registers that a possibly new peer has also determined the given 100 | // node is suspect. This returns true if this was new information, and false 101 | // if it was a duplicate confirmation, or if we've got enough confirmations to 102 | // hit the minimum. 103 | func (s *suspicion) Confirm(from string) bool { 104 | // If we've got enough confirmations then stop accepting them. 105 | if atomic.LoadInt32(&s.n) >= s.k { 106 | return false 107 | } 108 | 109 | // Only allow one confirmation from each possible peer. 110 | if _, ok := s.confirmations[from]; ok { 111 | return false 112 | } 113 | s.confirmations[from] = struct{}{} 114 | 115 | // Compute the new timeout given the current number of confirmations and 116 | // adjust the timer. If the timeout becomes negative *and* we can cleanly 117 | // stop the timer then we will call the timeout function directly from 118 | // here. 
119 | n := atomic.AddInt32(&s.n, 1) 120 | elapsed := time.Now().Sub(s.start) 121 | remaining := remainingSuspicionTime(n, s.k, elapsed, s.min, s.max) 122 | if s.timer.Stop() { 123 | if remaining > 0 { 124 | s.timer.Reset(remaining) 125 | } else { 126 | go s.timeoutFn() 127 | } 128 | } 129 | return true 130 | } 131 | -------------------------------------------------------------------------------- /keyring.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "sync" 7 | ) 8 | 9 | type Keyring struct { 10 | // Keys stores the key data used during encryption and decryption. It is 11 | // ordered in such a way where the first key (index 0) is the primary key, 12 | // which is used for encrypting messages, and is the first key tried during 13 | // message decryption. 14 | keys [][]byte 15 | 16 | // The keyring lock is used while performing IO operations on the keyring. 17 | l sync.Mutex 18 | } 19 | 20 | // Init allocates substructures 21 | func (k *Keyring) init() { 22 | k.keys = make([][]byte, 0) 23 | } 24 | 25 | // NewKeyring constructs a new container for a set of encryption keys. The 26 | // keyring contains all key data used internally by memberlist. 27 | // 28 | // While creating a new keyring, you must do one of: 29 | // - Omit keys and primary key, effectively disabling encryption 30 | // - Pass a set of keys plus the primary key 31 | // - Pass only a primary key 32 | // 33 | // If only a primary key is passed, then it will be automatically added to the 34 | // keyring. If creating a keyring with multiple keys, one key must be designated 35 | // primary by passing it as the primaryKey. If the primaryKey does not exist in 36 | // the list of secondary keys, it will be automatically added at position 0. 37 | // 38 | // A key should be either 16, 24, or 32 bytes to select AES-128, 39 | // AES-192, or AES-256. 
40 | func NewKeyring(keys [][]byte, primaryKey []byte) (*Keyring, error) { 41 | keyring := &Keyring{} 42 | keyring.init() 43 | 44 | if len(keys) > 0 || len(primaryKey) > 0 { 45 | if len(primaryKey) == 0 { 46 | return nil, fmt.Errorf("Empty primary key not allowed") 47 | } 48 | if err := keyring.AddKey(primaryKey); err != nil { 49 | return nil, err 50 | } 51 | for _, key := range keys { 52 | if err := keyring.AddKey(key); err != nil { 53 | return nil, err 54 | } 55 | } 56 | } 57 | 58 | return keyring, nil 59 | } 60 | 61 | // ValidateKey will check to see if the key is valid and returns an error if not. 62 | // 63 | // key should be either 16, 24, or 32 bytes to select AES-128, 64 | // AES-192, or AES-256. 65 | func ValidateKey(key []byte) error { 66 | if l := len(key); l != 16 && l != 24 && l != 32 { 67 | return fmt.Errorf("key size must be 16, 24 or 32 bytes") 68 | } 69 | return nil 70 | } 71 | 72 | // AddKey will install a new key on the ring. Adding a key to the ring will make 73 | // it available for use in decryption. If the key already exists on the ring, 74 | // this function will just return noop. 75 | // 76 | // key should be either 16, 24, or 32 bytes to select AES-128, 77 | // AES-192, or AES-256. 78 | func (k *Keyring) AddKey(key []byte) error { 79 | if err := ValidateKey(key); err != nil { 80 | return err 81 | } 82 | 83 | // No-op if key is already installed 84 | for _, installedKey := range k.keys { 85 | if bytes.Equal(installedKey, key) { 86 | return nil 87 | } 88 | } 89 | 90 | keys := append(k.keys, key) 91 | primaryKey := k.GetPrimaryKey() 92 | if primaryKey == nil { 93 | primaryKey = key 94 | } 95 | k.installKeys(keys, primaryKey) 96 | return nil 97 | } 98 | 99 | // UseKey changes the key used to encrypt messages. This is the only key used to 100 | // encrypt messages, so peers should know this key before this method is called. 
101 | func (k *Keyring) UseKey(key []byte) error { 102 | for _, installedKey := range k.keys { 103 | if bytes.Equal(key, installedKey) { 104 | k.installKeys(k.keys, key) 105 | return nil 106 | } 107 | } 108 | return fmt.Errorf("Requested key is not in the keyring") 109 | } 110 | 111 | // RemoveKey drops a key from the keyring. This will return an error if the key 112 | // requested for removal is currently at position 0 (primary key). 113 | func (k *Keyring) RemoveKey(key []byte) error { 114 | if bytes.Equal(key, k.keys[0]) { 115 | return fmt.Errorf("Removing the primary key is not allowed") 116 | } 117 | for i, installedKey := range k.keys { 118 | if bytes.Equal(key, installedKey) { 119 | keys := append(k.keys[:i], k.keys[i+1:]...) 120 | k.installKeys(keys, k.keys[0]) 121 | } 122 | } 123 | return nil 124 | } 125 | 126 | // installKeys will take out a lock on the keyring, and replace the keys with a 127 | // new set of keys. The key indicated by primaryKey will be installed as the new 128 | // primary key. 129 | func (k *Keyring) installKeys(keys [][]byte, primaryKey []byte) { 130 | k.l.Lock() 131 | defer k.l.Unlock() 132 | 133 | newKeys := [][]byte{primaryKey} 134 | for _, key := range keys { 135 | if !bytes.Equal(key, primaryKey) { 136 | newKeys = append(newKeys, key) 137 | } 138 | } 139 | k.keys = newKeys 140 | } 141 | 142 | // GetKeys returns the current set of keys on the ring. 143 | func (k *Keyring) GetKeys() [][]byte { 144 | k.l.Lock() 145 | defer k.l.Unlock() 146 | 147 | return k.keys 148 | } 149 | 150 | // GetPrimaryKey returns the key on the ring at position 0. This is the key used 151 | // for encrypting messages, and is the first key tried for decrypting messages. 
152 | func (k *Keyring) GetPrimaryKey() (key []byte) { 153 | k.l.Lock() 154 | defer k.l.Unlock() 155 | 156 | if len(k.keys) > 0 { 157 | key = k.keys[0] 158 | } 159 | return 160 | } 161 | -------------------------------------------------------------------------------- /queue_test.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestTransmitLimited_Queue(t *testing.T) { 8 | q := &TransmitLimitedQueue{RetransmitMult: 1, NumNodes: func() int { return 1 }} 9 | q.QueueBroadcast(&memberlistBroadcast{"test", nil, nil}) 10 | q.QueueBroadcast(&memberlistBroadcast{"foo", nil, nil}) 11 | q.QueueBroadcast(&memberlistBroadcast{"bar", nil, nil}) 12 | 13 | if len(q.bcQueue) != 3 { 14 | t.Fatalf("bad len") 15 | } 16 | if q.bcQueue[0].b.(*memberlistBroadcast).node != "test" { 17 | t.Fatalf("missing test") 18 | } 19 | if q.bcQueue[1].b.(*memberlistBroadcast).node != "foo" { 20 | t.Fatalf("missing foo") 21 | } 22 | if q.bcQueue[2].b.(*memberlistBroadcast).node != "bar" { 23 | t.Fatalf("missing bar") 24 | } 25 | 26 | // Should invalidate previous message 27 | q.QueueBroadcast(&memberlistBroadcast{"test", nil, nil}) 28 | 29 | if len(q.bcQueue) != 3 { 30 | t.Fatalf("bad len") 31 | } 32 | if q.bcQueue[0].b.(*memberlistBroadcast).node != "foo" { 33 | t.Fatalf("missing foo") 34 | } 35 | if q.bcQueue[1].b.(*memberlistBroadcast).node != "bar" { 36 | t.Fatalf("missing bar") 37 | } 38 | if q.bcQueue[2].b.(*memberlistBroadcast).node != "test" { 39 | t.Fatalf("missing test") 40 | } 41 | } 42 | 43 | func TestTransmitLimited_GetBroadcasts(t *testing.T) { 44 | q := &TransmitLimitedQueue{RetransmitMult: 3, NumNodes: func() int { return 10 }} 45 | 46 | // 18 bytes per message 47 | q.QueueBroadcast(&memberlistBroadcast{"test", []byte("1. this is a test."), nil}) 48 | q.QueueBroadcast(&memberlistBroadcast{"foo", []byte("2. 
this is a test."), nil}) 49 | q.QueueBroadcast(&memberlistBroadcast{"bar", []byte("3. this is a test."), nil}) 50 | q.QueueBroadcast(&memberlistBroadcast{"baz", []byte("4. this is a test."), nil}) 51 | 52 | // 2 byte overhead per message, should get all 4 messages 53 | all := q.GetBroadcasts(2, 80) 54 | if len(all) != 4 { 55 | t.Fatalf("missing messages: %v", all) 56 | } 57 | 58 | // 3 byte overhead, should only get 3 messages back 59 | partial := q.GetBroadcasts(3, 80) 60 | if len(partial) != 3 { 61 | t.Fatalf("missing messages: %v", partial) 62 | } 63 | } 64 | 65 | func TestTransmitLimited_GetBroadcasts_Limit(t *testing.T) { 66 | q := &TransmitLimitedQueue{RetransmitMult: 1, NumNodes: func() int { return 10 }} 67 | 68 | // 18 bytes per message 69 | q.QueueBroadcast(&memberlistBroadcast{"test", []byte("1. this is a test."), nil}) 70 | q.QueueBroadcast(&memberlistBroadcast{"foo", []byte("2. this is a test."), nil}) 71 | q.QueueBroadcast(&memberlistBroadcast{"bar", []byte("3. this is a test."), nil}) 72 | q.QueueBroadcast(&memberlistBroadcast{"baz", []byte("4. 
this is a test."), nil}) 73 | 74 | // 3 byte overhead, should only get 3 messages back 75 | partial1 := q.GetBroadcasts(3, 80) 76 | if len(partial1) != 3 { 77 | t.Fatalf("missing messages: %v", partial1) 78 | } 79 | 80 | partial2 := q.GetBroadcasts(3, 80) 81 | if len(partial2) != 3 { 82 | t.Fatalf("missing messages: %v", partial2) 83 | } 84 | 85 | // Only two not expired 86 | partial3 := q.GetBroadcasts(3, 80) 87 | if len(partial3) != 2 { 88 | t.Fatalf("missing messages: %v", partial3) 89 | } 90 | 91 | // Should get nothing 92 | partial5 := q.GetBroadcasts(3, 80) 93 | if len(partial5) != 0 { 94 | t.Fatalf("missing messages: %v", partial5) 95 | } 96 | } 97 | 98 | func TestTransmitLimited_Prune(t *testing.T) { 99 | q := &TransmitLimitedQueue{RetransmitMult: 1, NumNodes: func() int { return 10 }} 100 | 101 | ch1 := make(chan struct{}, 1) 102 | ch2 := make(chan struct{}, 1) 103 | 104 | // 18 bytes per message 105 | q.QueueBroadcast(&memberlistBroadcast{"test", []byte("1. this is a test."), ch1}) 106 | q.QueueBroadcast(&memberlistBroadcast{"foo", []byte("2. this is a test."), ch2}) 107 | q.QueueBroadcast(&memberlistBroadcast{"bar", []byte("3. this is a test."), nil}) 108 | q.QueueBroadcast(&memberlistBroadcast{"baz", []byte("4. 
this is a test."), nil}) 109 | 110 | // Keep only 2 111 | q.Prune(2) 112 | 113 | if q.NumQueued() != 2 { 114 | t.Fatalf("bad len") 115 | } 116 | 117 | // Should notify the first two 118 | select { 119 | case <-ch1: 120 | default: 121 | t.Fatalf("expected invalidation") 122 | } 123 | select { 124 | case <-ch2: 125 | default: 126 | t.Fatalf("expected invalidation") 127 | } 128 | 129 | if q.bcQueue[0].b.(*memberlistBroadcast).node != "bar" { 130 | t.Fatalf("missing bar") 131 | } 132 | if q.bcQueue[1].b.(*memberlistBroadcast).node != "baz" { 133 | t.Fatalf("missing baz") 134 | } 135 | } 136 | 137 | func TestLimitedBroadcastSort(t *testing.T) { 138 | bc := limitedBroadcasts([]*limitedBroadcast{ 139 | &limitedBroadcast{ 140 | transmits: 0, 141 | }, 142 | &limitedBroadcast{ 143 | transmits: 10, 144 | }, 145 | &limitedBroadcast{ 146 | transmits: 3, 147 | }, 148 | &limitedBroadcast{ 149 | transmits: 4, 150 | }, 151 | &limitedBroadcast{ 152 | transmits: 7, 153 | }, 154 | }) 155 | bc.Sort() 156 | 157 | if bc[0].transmits != 10 { 158 | t.Fatalf("bad val %v", bc[0]) 159 | } 160 | if bc[1].transmits != 7 { 161 | t.Fatalf("bad val %v", bc[7]) 162 | } 163 | if bc[2].transmits != 4 { 164 | t.Fatalf("bad val %v", bc[2]) 165 | } 166 | if bc[3].transmits != 3 { 167 | t.Fatalf("bad val %v", bc[3]) 168 | } 169 | if bc[4].transmits != 0 { 170 | t.Fatalf("bad val %v", bc[4]) 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /suspicion_test.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestSuspicion_remainingSuspicionTime(t *testing.T) { 9 | cases := []struct { 10 | n int32 11 | k int32 12 | elapsed time.Duration 13 | min time.Duration 14 | max time.Duration 15 | expected time.Duration 16 | }{ 17 | {0, 3, 0, 2 * time.Second, 30 * time.Second, 30 * time.Second}, 18 | {1, 3, 2 * time.Second, 2 * time.Second, 30 * 
time.Second, 14 * time.Second}, 19 | {2, 3, 3 * time.Second, 2 * time.Second, 30 * time.Second, 4810 * time.Millisecond}, 20 | {3, 3, 4 * time.Second, 2 * time.Second, 30 * time.Second, -2 * time.Second}, 21 | {4, 3, 5 * time.Second, 2 * time.Second, 30 * time.Second, -3 * time.Second}, 22 | {5, 3, 10 * time.Second, 2 * time.Second, 30 * time.Second, -8 * time.Second}, 23 | } 24 | for i, c := range cases { 25 | remaining := remainingSuspicionTime(c.n, c.k, c.elapsed, c.min, c.max) 26 | if remaining != c.expected { 27 | t.Errorf("case %d: remaining %9.6f != expected %9.6f", i, remaining.Seconds(), c.expected.Seconds()) 28 | } 29 | } 30 | } 31 | 32 | func TestSuspicion_Timer(t *testing.T) { 33 | const k = 3 34 | const min = 500 * time.Millisecond 35 | const max = 2 * time.Second 36 | 37 | type pair struct { 38 | from string 39 | newInfo bool 40 | } 41 | cases := []struct { 42 | numConfirmations int 43 | from string 44 | confirmations []pair 45 | expected time.Duration 46 | }{ 47 | { 48 | 0, 49 | "me", 50 | []pair{}, 51 | max, 52 | }, 53 | { 54 | 1, 55 | "me", 56 | []pair{ 57 | pair{"me", false}, 58 | pair{"foo", true}, 59 | }, 60 | 1250 * time.Millisecond, 61 | }, 62 | { 63 | 1, 64 | "me", 65 | []pair{ 66 | pair{"me", false}, 67 | pair{"foo", true}, 68 | pair{"foo", false}, 69 | pair{"foo", false}, 70 | }, 71 | 1250 * time.Millisecond, 72 | }, 73 | { 74 | 2, 75 | "me", 76 | []pair{ 77 | pair{"me", false}, 78 | pair{"foo", true}, 79 | pair{"bar", true}, 80 | }, 81 | 810 * time.Millisecond, 82 | }, 83 | { 84 | 3, 85 | "me", 86 | []pair{ 87 | pair{"me", false}, 88 | pair{"foo", true}, 89 | pair{"bar", true}, 90 | pair{"baz", true}, 91 | }, 92 | min, 93 | }, 94 | { 95 | 3, 96 | "me", 97 | []pair{ 98 | pair{"me", false}, 99 | pair{"foo", true}, 100 | pair{"bar", true}, 101 | pair{"baz", true}, 102 | pair{"zoo", false}, 103 | }, 104 | min, 105 | }, 106 | } 107 | for i, c := range cases { 108 | ch := make(chan time.Duration, 1) 109 | start := time.Now() 110 | f := 
func(numConfirmations int) { 111 | if numConfirmations != c.numConfirmations { 112 | t.Errorf("case %d: bad %d != %d", i, numConfirmations, c.numConfirmations) 113 | } 114 | 115 | ch <- time.Now().Sub(start) 116 | } 117 | 118 | // Create the timer and add the requested confirmations. Wait 119 | // the fudge amount to help make sure we calculate the timeout 120 | // overall, and don't accumulate extra time. 121 | s := newSuspicion(c.from, k, min, max, f) 122 | fudge := 25 * time.Millisecond 123 | for _, p := range c.confirmations { 124 | time.Sleep(fudge) 125 | if s.Confirm(p.from) != p.newInfo { 126 | t.Fatalf("case %d: newInfo mismatch for %s", i, p.from) 127 | } 128 | } 129 | 130 | // Wait until right before the timeout and make sure the 131 | // timer hasn't fired. 132 | already := time.Duration(len(c.confirmations)) * fudge 133 | time.Sleep(c.expected - already - fudge) 134 | select { 135 | case d := <-ch: 136 | t.Fatalf("case %d: should not have fired (%9.6f)", i, d.Seconds()) 137 | default: 138 | } 139 | 140 | // Wait through the timeout and a little after and make sure it 141 | // fires. 142 | time.Sleep(2 * fudge) 143 | select { 144 | case <-ch: 145 | default: 146 | t.Fatalf("case %d: should have fired", i) 147 | } 148 | 149 | // Confirm after to make sure it handles a negative remaining 150 | // time correctly and doesn't fire again. 151 | s.Confirm("late") 152 | time.Sleep(c.expected + 2*fudge) 153 | select { 154 | case d := <-ch: 155 | t.Fatalf("case %d: should not have fired (%9.6f)", i, d.Seconds()) 156 | default: 157 | } 158 | } 159 | } 160 | 161 | func TestSuspicion_Timer_ZeroK(t *testing.T) { 162 | ch := make(chan struct{}, 1) 163 | f := func(int) { 164 | ch <- struct{}{} 165 | } 166 | 167 | // This should select the min time since there are no expected 168 | // confirmations to accelerate the timer. 
169 | s := newSuspicion("me", 0, 25*time.Millisecond, 30*time.Second, f) 170 | if s.Confirm("foo") { 171 | t.Fatalf("should not provide new information") 172 | } 173 | 174 | select { 175 | case <-ch: 176 | case <-time.After(50 * time.Millisecond): 177 | t.Fatalf("should have fired") 178 | } 179 | } 180 | 181 | func TestSuspicion_Timer_Immediate(t *testing.T) { 182 | ch := make(chan struct{}, 1) 183 | f := func(int) { 184 | ch <- struct{}{} 185 | } 186 | 187 | // This should underflow the timeout and fire immediately. 188 | s := newSuspicion("me", 1, 100*time.Millisecond, 30*time.Second, f) 189 | time.Sleep(200 * time.Millisecond) 190 | s.Confirm("foo") 191 | 192 | // Wait a little while since the function gets called in a goroutine. 193 | select { 194 | case <-ch: 195 | case <-time.After(25 * time.Millisecond): 196 | t.Fatalf("should have fired") 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /security.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "bytes" 5 | "crypto/aes" 6 | "crypto/cipher" 7 | "crypto/rand" 8 | "fmt" 9 | "io" 10 | ) 11 | 12 | /* 13 | 14 | Encrypted messages are prefixed with an encryptionVersion byte 15 | that is used for us to be able to properly encode/decode. We 16 | currently support the following versions: 17 | 18 | 0 - AES-GCM 128, using PKCS7 padding 19 | 1 - AES-GCM 128, no padding. Padding not needed, caused bloat. 20 | 21 | */ 22 | type encryptionVersion uint8 23 | 24 | const ( 25 | minEncryptionVersion encryptionVersion = 0 26 | maxEncryptionVersion encryptionVersion = 1 27 | ) 28 | 29 | const ( 30 | versionSize = 1 31 | nonceSize = 12 32 | tagSize = 16 33 | maxPadOverhead = 16 34 | blockSize = aes.BlockSize 35 | ) 36 | 37 | // pkcs7encode is used to pad a byte buffer to a specific block size using 38 | // the PKCS7 algorithm. 
"Ignores" some bytes to compensate for IV 39 | func pkcs7encode(buf *bytes.Buffer, ignore, blockSize int) { 40 | n := buf.Len() - ignore 41 | more := blockSize - (n % blockSize) 42 | for i := 0; i < more; i++ { 43 | buf.WriteByte(byte(more)) 44 | } 45 | } 46 | 47 | // pkcs7decode is used to decode a buffer that has been padded 48 | func pkcs7decode(buf []byte, blockSize int) []byte { 49 | if len(buf) == 0 { 50 | panic("Cannot decode a PKCS7 buffer of zero length") 51 | } 52 | n := len(buf) 53 | last := buf[n-1] 54 | n -= int(last) 55 | return buf[:n] 56 | } 57 | 58 | // encryptOverhead returns the maximum possible overhead of encryption by version 59 | func encryptOverhead(vsn encryptionVersion) int { 60 | switch vsn { 61 | case 0: 62 | return 45 // Version: 1, IV: 12, Padding: 16, Tag: 16 63 | case 1: 64 | return 29 // Version: 1, IV: 12, Tag: 16 65 | default: 66 | panic("unsupported version") 67 | } 68 | } 69 | 70 | // encryptedLength is used to compute the buffer size needed 71 | // for a message of given length 72 | func encryptedLength(vsn encryptionVersion, inp int) int { 73 | // If we are on version 1, there is no padding 74 | if vsn >= 1 { 75 | return versionSize + nonceSize + inp + tagSize 76 | } 77 | 78 | // Determine the padding size 79 | padding := blockSize - (inp % blockSize) 80 | 81 | // Sum the extra parts to get total size 82 | return versionSize + nonceSize + inp + padding + tagSize 83 | } 84 | 85 | // encryptPayload is used to encrypt a message with a given key. 86 | // We make use of AES-128 in GCM mode. 
New byte buffer is the version, 87 | // nonce, ciphertext and tag 88 | func encryptPayload(vsn encryptionVersion, key []byte, msg []byte, data []byte, dst *bytes.Buffer) error { 89 | // Get the AES block cipher 90 | aesBlock, err := aes.NewCipher(key) 91 | if err != nil { 92 | return err 93 | } 94 | 95 | // Get the GCM cipher mode 96 | gcm, err := cipher.NewGCM(aesBlock) 97 | if err != nil { 98 | return err 99 | } 100 | 101 | // Grow the buffer to make room for everything 102 | offset := dst.Len() 103 | dst.Grow(encryptedLength(vsn, len(msg))) 104 | 105 | // Write the encryption version 106 | dst.WriteByte(byte(vsn)) 107 | 108 | // Add a random nonce 109 | io.CopyN(dst, rand.Reader, nonceSize) 110 | afterNonce := dst.Len() 111 | 112 | // Ensure we are correctly padded (only version 0) 113 | if vsn == 0 { 114 | io.Copy(dst, bytes.NewReader(msg)) 115 | pkcs7encode(dst, offset+versionSize+nonceSize, aes.BlockSize) 116 | } 117 | 118 | // Encrypt message using GCM 119 | slice := dst.Bytes()[offset:] 120 | nonce := slice[versionSize : versionSize+nonceSize] 121 | 122 | // Message source depends on the encryption version. 123 | // Version 0 uses padding, version 1 does not 124 | var src []byte 125 | if vsn == 0 { 126 | src = slice[versionSize+nonceSize:] 127 | } else { 128 | src = msg 129 | } 130 | out := gcm.Seal(nil, nonce, src, data) 131 | 132 | // Truncate the plaintext, and write the cipher text 133 | dst.Truncate(afterNonce) 134 | dst.Write(out) 135 | return nil 136 | } 137 | 138 | // decryptMessage performs the actual decryption of ciphertext. This is in its 139 | // own function to allow it to be called on all keys easily. 
140 | func decryptMessage(key, msg []byte, data []byte) ([]byte, error) { 141 | // Get the AES block cipher 142 | aesBlock, err := aes.NewCipher(key) 143 | if err != nil { 144 | return nil, err 145 | } 146 | 147 | // Get the GCM cipher mode 148 | gcm, err := cipher.NewGCM(aesBlock) 149 | if err != nil { 150 | return nil, err 151 | } 152 | 153 | // Decrypt the message 154 | nonce := msg[versionSize : versionSize+nonceSize] 155 | ciphertext := msg[versionSize+nonceSize:] 156 | plain, err := gcm.Open(nil, nonce, ciphertext, data) 157 | if err != nil { 158 | return nil, err 159 | } 160 | 161 | // Success! 162 | return plain, nil 163 | } 164 | 165 | // decryptPayload is used to decrypt a message with a given key, 166 | // and verify it's contents. Any padding will be removed, and a 167 | // slice to the plaintext is returned. Decryption is done IN PLACE! 168 | func decryptPayload(keys [][]byte, msg []byte, data []byte) ([]byte, error) { 169 | // Ensure we have at least one byte 170 | if len(msg) == 0 { 171 | return nil, fmt.Errorf("Cannot decrypt empty payload") 172 | } 173 | 174 | // Verify the version 175 | vsn := encryptionVersion(msg[0]) 176 | if vsn > maxEncryptionVersion { 177 | return nil, fmt.Errorf("Unsupported encryption version %d", msg[0]) 178 | } 179 | 180 | // Ensure the length is sane 181 | if len(msg) < encryptedLength(vsn, 0) { 182 | return nil, fmt.Errorf("Payload is too small to decrypt: %d", len(msg)) 183 | } 184 | 185 | for _, key := range keys { 186 | plain, err := decryptMessage(key, msg, data) 187 | if err == nil { 188 | // Remove the PKCS7 padding for vsn 0 189 | if vsn == 0 { 190 | return pkcs7decode(plain, aes.BlockSize), nil 191 | } else { 192 | return plain, nil 193 | } 194 | } 195 | } 196 | 197 | return nil, fmt.Errorf("No installed keys could decrypt the message") 198 | } 199 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # memberlist [![GoDoc](https://godoc.org/github.com/hashicorp/memberlist?status.png)](https://godoc.org/github.com/hashicorp/memberlist) 2 | 3 | memberlist is a [Go](http://www.golang.org) library that manages cluster 4 | membership and member failure detection using a gossip based protocol. 5 | 6 | The use cases for such a library are far-reaching: all distributed systems 7 | require membership, and memberlist is a re-usable solution to managing 8 | cluster membership and node failure detection. 9 | 10 | memberlist is eventually consistent but converges quickly on average. 11 | The speed at which it converges can be heavily tuned via various knobs 12 | on the protocol. Node failures are detected and network partitions are partially 13 | tolerated by attempting to communicate to potentially dead nodes through 14 | multiple routes. 15 | 16 | ## Building 17 | 18 | If you wish to build memberlist you'll need Go version 1.2+ installed. 19 | 20 | Please check your installation with: 21 | 22 | ``` 23 | go version 24 | ``` 25 | 26 | ## Usage 27 | 28 | Memberlist is surprisingly simple to use. An example is shown below: 29 | 30 | ```go 31 | /* Create the initial memberlist from a safe configuration. 32 | Please reference the godoc for other default config types. 33 | http://godoc.org/github.com/hashicorp/memberlist#Config 34 | */ 35 | list, err := memberlist.Create(memberlist.DefaultLocalConfig()) 36 | if err != nil { 37 | panic("Failed to create memberlist: " + err.Error()) 38 | } 39 | 40 | // Join an existing cluster by specifying at least one known member. 
41 | n, err := list.Join([]string{"1.2.3.4"}) 42 | if err != nil { 43 | panic("Failed to join cluster: " + err.Error()) 44 | } 45 | 46 | // Ask for members of the cluster 47 | for _, member := range list.Members() { 48 | fmt.Printf("Member: %s %s\n", member.Name, member.Addr) 49 | } 50 | 51 | // Continue doing whatever you need, memberlist will maintain membership 52 | // information in the background. Delegates can be used for receiving 53 | // events when members join or leave. 54 | ``` 55 | 56 | The most difficult part of memberlist is configuring it since it has many 57 | available knobs in order to tune state propagation delay and convergence times. 58 | Memberlist provides a default configuration that offers a good starting point, 59 | but errs on the side of caution, choosing values that are optimized for 60 | higher convergence at the cost of higher bandwidth usage. 61 | 62 | For complete documentation, see the associated [Godoc](http://godoc.org/github.com/hashicorp/memberlist). 63 | 64 | ## Protocol 65 | 66 | memberlist is based on ["SWIM: Scalable Weakly-consistent Infection-style Process Group Membership Protocol"](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf), 67 | with a few minor adaptations, mostly to increase propagation speed and 68 | convergence rate. 69 | 70 | A high level overview of the memberlist protocol (based on SWIM) is 71 | described below, but for details please read the full 72 | [SWIM paper](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf) 73 | followed by the memberlist source. We welcome any questions related 74 | to the protocol on our issue tracker. 75 | 76 | ### Protocol Description 77 | 78 | memberlist begins by joining an existing cluster or starting a new 79 | cluster. If starting a new cluster, additional nodes are expected to join 80 | it. New nodes in an existing cluster must be given the address of at 81 | least one existing member in order to join the cluster. 
The new member 82 | does a full state sync with the existing member over TCP and begins gossiping its 83 | existence to the cluster. 84 | 85 | Gossip is done over UDP with a configurable but fixed fanout and interval. 86 | This ensures that network usage is constant with regard to the number of nodes, as opposed to 87 | exponential growth that can occur with traditional heartbeat mechanisms. 88 | Complete state exchanges with a random node are done periodically over 89 | TCP, but much less often than gossip messages. This increases the likelihood 90 | that the membership list converges properly since the full state is exchanged 91 | and merged. The interval between full state exchanges is configurable or can 92 | be disabled entirely. 93 | 94 | Failure detection is done by periodic random probing using a configurable interval. 95 | If the node fails to ack within a reasonable time (typically some multiple 96 | of RTT), then an indirect probe as well as a direct TCP probe are attempted. An 97 | indirect probe asks a configurable number of random nodes to probe the same node, 98 | in case there are network issues causing our own node to fail the probe. The direct 99 | TCP probe is used to help identify the common situation where networking is 100 | misconfigured to allow TCP but not UDP. Without the TCP probe, a UDP-isolated node 101 | would think all other nodes were suspect and could cause churn in the cluster when 102 | it attempts a TCP-based state exchange with another node. It is not desirable to 103 | operate with only TCP connectivity because convergence will be much slower, but it 104 | is enabled so that memberlist can detect this situation and alert operators. 105 | 106 | If our probe, the indirect probes, and the direct TCP probe all fail within a 107 | configurable time, then the node is marked "suspicious" and this knowledge is 108 | gossiped to the cluster. A suspicious node is still considered a member of 109 | the cluster.
If the suspect member of the cluster does not dispute the suspicion 110 | within a configurable period of time, the node is finally considered dead, 111 | and this state is then gossiped to the cluster. 112 | 113 | This is a brief and incomplete description of the protocol. For a better idea, 114 | please read the 115 | [SWIM paper](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf) 116 | in its entirety, along with the memberlist source code. 117 | 118 | ### Changes from SWIM 119 | 120 | As mentioned earlier, the memberlist protocol is based on SWIM but includes 121 | minor changes, mostly to increase propagation speed and convergence rates. 122 | 123 | The changes from SWIM are noted here: 124 | 125 | * memberlist does a full state sync over TCP periodically. SWIM only propagates 126 | changes over gossip. While both eventually reach convergence, the full state 127 | sync increases the likelihood that nodes are fully converged more quickly, 128 | at the expense of more bandwidth usage. This feature can be totally disabled 129 | if you wish. 130 | 131 | * memberlist has a dedicated gossip layer separate from the failure detection 132 | protocol. SWIM only piggybacks gossip messages on top of probe/ack messages. 133 | memberlist also piggybacks gossip messages on top of probe/ack messages, but 134 | also will periodically send out dedicated gossip messages on their own. This 135 | feature lets you have a higher gossip rate (for example once per 200ms) 136 | and a slower failure detection rate (such as once per second), resulting 137 | in overall faster convergence rates and data propagation speeds. This feature 138 | can be totally disabled as well, if you wish. 139 | 140 | * memberlist keeps the state of dead nodes around for a set amount of time, 141 | so that when full syncs are requested, the requester also receives information 142 | about dead nodes.
Because SWIM doesn't do full syncs, SWIM deletes dead node 143 | state immediately upon learning that the node is dead. This change again helps 144 | the cluster converge more quickly. 145 | -------------------------------------------------------------------------------- /util_test.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "testing" 7 | "time" 8 | ) 9 | 10 | func Test_hasPort(t *testing.T) { 11 | cases := []struct { 12 | s string 13 | expected bool 14 | }{ 15 | {"", false}, 16 | {":80", true}, 17 | {"127.0.0.1", false}, 18 | {"127.0.0.1:80", true}, 19 | {"::1", false}, 20 | {"2001:db8:a0b:12f0::1", false}, 21 | {"[2001:db8:a0b:12f0::1]", false}, 22 | {"[2001:db8:a0b:12f0::1]:80", true}, 23 | } 24 | for _, c := range cases { 25 | if hasPort(c.s) != c.expected { 26 | t.Fatalf("bad: '%s' hasPort was not %v", c.s, c.expected) 27 | } 28 | } 29 | } 30 | 31 | func TestEncodeDecode(t *testing.T) { 32 | msg := &ping{SeqNo: 100} 33 | buf, err := encode(pingMsg, msg) 34 | if err != nil { 35 | t.Fatalf("unexpected err: %s", err) 36 | } 37 | var out ping 38 | if err := decode(buf.Bytes()[1:], &out); err != nil { 39 | t.Fatalf("unexpected err: %s", err) 40 | } 41 | if msg.SeqNo != out.SeqNo { 42 | t.Fatalf("bad sequence no") 43 | } 44 | } 45 | 46 | func TestRandomOffset(t *testing.T) { 47 | vals := make(map[int]struct{}) 48 | for i := 0; i < 100; i++ { 49 | offset := randomOffset(2 << 30) 50 | if _, ok := vals[offset]; ok { 51 | t.Fatalf("got collision") 52 | } 53 | vals[offset] = struct{}{} 54 | } 55 | } 56 | 57 | func TestRandomOffset_Zero(t *testing.T) { 58 | offset := randomOffset(0) 59 | if offset != 0 { 60 | t.Fatalf("bad offset") 61 | } 62 | } 63 | 64 | func TestSuspicionTimeout(t *testing.T) { 65 | timeouts := map[int]time.Duration{ 66 | 5: 1000 * time.Millisecond, 67 | 10: 1000 * time.Millisecond, 68 | 50: 1698 * time.Millisecond, 69 | 100: 2000 * 
time.Millisecond, 70 | 500: 2698 * time.Millisecond, 71 | 1000: 3000 * time.Millisecond, 72 | } 73 | for n, expected := range timeouts { 74 | timeout := suspicionTimeout(3, n, time.Second) / 3 75 | if timeout != expected { 76 | t.Fatalf("bad: %v, %v", expected, timeout) 77 | } 78 | } 79 | } 80 | 81 | func TestRetransmitLimit(t *testing.T) { 82 | lim := retransmitLimit(3, 0) 83 | if lim != 0 { 84 | t.Fatalf("bad val %v", lim) 85 | } 86 | lim = retransmitLimit(3, 1) 87 | if lim != 3 { 88 | t.Fatalf("bad val %v", lim) 89 | } 90 | lim = retransmitLimit(3, 99) 91 | if lim != 6 { 92 | t.Fatalf("bad val %v", lim) 93 | } 94 | } 95 | 96 | func TestShuffleNodes(t *testing.T) { 97 | orig := []*nodeState{ 98 | &nodeState{ 99 | State: stateDead, 100 | }, 101 | &nodeState{ 102 | State: stateAlive, 103 | }, 104 | &nodeState{ 105 | State: stateAlive, 106 | }, 107 | &nodeState{ 108 | State: stateDead, 109 | }, 110 | &nodeState{ 111 | State: stateAlive, 112 | }, 113 | &nodeState{ 114 | State: stateAlive, 115 | }, 116 | &nodeState{ 117 | State: stateDead, 118 | }, 119 | &nodeState{ 120 | State: stateAlive, 121 | }, 122 | } 123 | nodes := make([]*nodeState, len(orig)) 124 | copy(nodes[:], orig[:]) 125 | 126 | if !reflect.DeepEqual(nodes, orig) { 127 | t.Fatalf("should match") 128 | } 129 | 130 | shuffleNodes(nodes) 131 | 132 | if reflect.DeepEqual(nodes, orig) { 133 | t.Fatalf("should not match") 134 | } 135 | } 136 | 137 | func TestPushPullScale(t *testing.T) { 138 | sec := time.Second 139 | for i := 0; i <= 32; i++ { 140 | if s := pushPullScale(sec, i); s != sec { 141 | t.Fatalf("Bad time scale: %v", s) 142 | } 143 | } 144 | for i := 33; i <= 64; i++ { 145 | if s := pushPullScale(sec, i); s != 2*sec { 146 | t.Fatalf("Bad time scale: %v", s) 147 | } 148 | } 149 | for i := 65; i <= 128; i++ { 150 | if s := pushPullScale(sec, i); s != 3*sec { 151 | t.Fatalf("Bad time scale: %v", s) 152 | } 153 | } 154 | } 155 | 156 | func TestMoveDeadNodes(t *testing.T) { 157 | nodes := []*nodeState{ 
158 | &nodeState{ 159 | State: stateDead, 160 | StateChange: time.Now().Add(-20 * time.Second), 161 | }, 162 | &nodeState{ 163 | State: stateAlive, 164 | StateChange: time.Now().Add(-20 * time.Second), 165 | }, 166 | // This dead node should not be moved, as its state changed 167 | // less than the specified GossipToTheDead time ago 168 | &nodeState{ 169 | State: stateDead, 170 | StateChange: time.Now().Add(-10 * time.Second), 171 | }, 172 | &nodeState{ 173 | State: stateAlive, 174 | StateChange: time.Now().Add(-20 * time.Second), 175 | }, 176 | &nodeState{ 177 | State: stateDead, 178 | StateChange: time.Now().Add(-20 * time.Second), 179 | }, 180 | &nodeState{ 181 | State: stateAlive, 182 | StateChange: time.Now().Add(-20 * time.Second), 183 | }, 184 | } 185 | 186 | idx := moveDeadNodes(nodes, (15 * time.Second)) 187 | if idx != 4 { 188 | t.Fatalf("bad index") 189 | } 190 | for i := 0; i < idx; i++ { 191 | switch i { 192 | case 2: 193 | // Recently dead node remains at index 2, 194 | // since nodes are swapped out to move to end. 
195 | if nodes[i].State != stateDead { 196 | t.Fatalf("Bad state %d", i) 197 | } 198 | default: 199 | if nodes[i].State != stateAlive { 200 | t.Fatalf("Bad state %d", i) 201 | } 202 | } 203 | } 204 | for i := idx; i < len(nodes); i++ { 205 | if nodes[i].State != stateDead { 206 | t.Fatalf("Bad state %d", i) 207 | } 208 | } 209 | } 210 | 211 | func TestKRandomNodes(t *testing.T) { 212 | nodes := []*nodeState{} 213 | for i := 0; i < 90; i++ { 214 | // Half the nodes are in a bad state 215 | state := stateAlive 216 | switch i % 3 { 217 | case 0: 218 | state = stateAlive 219 | case 1: 220 | state = stateSuspect 221 | case 2: 222 | state = stateDead 223 | } 224 | nodes = append(nodes, &nodeState{ 225 | Node: Node{ 226 | Name: fmt.Sprintf("test%d", i), 227 | }, 228 | State: state, 229 | }) 230 | } 231 | 232 | filterFunc := func(n *nodeState) bool { 233 | if n.Name == "test0" || n.State != stateAlive { 234 | return true 235 | } 236 | return false 237 | } 238 | 239 | s1 := kRandomNodes(3, nodes, filterFunc) 240 | s2 := kRandomNodes(3, nodes, filterFunc) 241 | s3 := kRandomNodes(3, nodes, filterFunc) 242 | 243 | if reflect.DeepEqual(s1, s2) { 244 | t.Fatalf("unexpected equal") 245 | } 246 | if reflect.DeepEqual(s1, s3) { 247 | t.Fatalf("unexpected equal") 248 | } 249 | if reflect.DeepEqual(s2, s3) { 250 | t.Fatalf("unexpected equal") 251 | } 252 | 253 | for _, s := range [][]*nodeState{s1, s2, s3} { 254 | if len(s) != 3 { 255 | t.Fatalf("bad len") 256 | } 257 | for _, n := range s { 258 | if n.Name == "test0" { 259 | t.Fatalf("Bad name") 260 | } 261 | if n.State != stateAlive { 262 | t.Fatalf("Bad state") 263 | } 264 | } 265 | } 266 | } 267 | 268 | func TestMakeCompoundMessage(t *testing.T) { 269 | msg := &ping{SeqNo: 100} 270 | buf, err := encode(pingMsg, msg) 271 | if err != nil { 272 | t.Fatalf("unexpected err: %s", err) 273 | } 274 | 275 | msgs := [][]byte{buf.Bytes(), buf.Bytes(), buf.Bytes()} 276 | compound := makeCompoundMessage(msgs) 277 | 278 | if compound.Len() != 
3*buf.Len()+3*compoundOverhead+compoundHeaderOverhead { 279 | t.Fatalf("bad len") 280 | } 281 | } 282 | 283 | func TestDecodeCompoundMessage(t *testing.T) { 284 | msg := &ping{SeqNo: 100} 285 | buf, err := encode(pingMsg, msg) 286 | if err != nil { 287 | t.Fatalf("unexpected err: %s", err) 288 | } 289 | 290 | msgs := [][]byte{buf.Bytes(), buf.Bytes(), buf.Bytes()} 291 | compound := makeCompoundMessage(msgs) 292 | 293 | trunc, parts, err := decodeCompoundMessage(compound.Bytes()[1:]) 294 | if err != nil { 295 | t.Fatalf("unexpected err: %s", err) 296 | } 297 | if trunc != 0 { 298 | t.Fatalf("should not truncate") 299 | } 300 | if len(parts) != 3 { 301 | t.Fatalf("bad parts") 302 | } 303 | for _, p := range parts { 304 | if len(p) != buf.Len() { 305 | t.Fatalf("bad part len") 306 | } 307 | } 308 | } 309 | 310 | func TestDecodeCompoundMessage_Trunc(t *testing.T) { 311 | msg := &ping{SeqNo: 100} 312 | buf, err := encode(pingMsg, msg) 313 | if err != nil { 314 | t.Fatalf("unexpected err: %s", err) 315 | } 316 | 317 | msgs := [][]byte{buf.Bytes(), buf.Bytes(), buf.Bytes()} 318 | compound := makeCompoundMessage(msgs) 319 | 320 | trunc, parts, err := decodeCompoundMessage(compound.Bytes()[1:38]) 321 | if err != nil { 322 | t.Fatalf("unexpected err: %s", err) 323 | } 324 | if trunc != 1 { 325 | t.Fatalf("truncate: %d", trunc) 326 | } 327 | if len(parts) != 2 { 328 | t.Fatalf("bad parts") 329 | } 330 | for _, p := range parts { 331 | if len(p) != buf.Len() { 332 | t.Fatalf("bad part len") 333 | } 334 | } 335 | } 336 | 337 | func TestCompressDecompressPayload(t *testing.T) { 338 | buf, err := compressPayload([]byte("testing")) 339 | if err != nil { 340 | t.Fatalf("unexpected err: %s", err) 341 | } 342 | 343 | decomp, err := decompressPayload(buf.Bytes()[1:]) 344 | if err != nil { 345 | t.Fatalf("unexpected err: %s", err) 346 | } 347 | 348 | if !reflect.DeepEqual(decomp, []byte("testing")) { 349 | t.Fatalf("bad payload: %v", decomp) 350 | } 351 | } 352 | 
-------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "bytes" 5 | "compress/lzw" 6 | "encoding/binary" 7 | "fmt" 8 | "io" 9 | "math" 10 | "math/rand" 11 | "net" 12 | "strconv" 13 | "strings" 14 | "time" 15 | 16 | "github.com/hashicorp/go-msgpack/codec" 17 | "github.com/sean-/seed" 18 | ) 19 | 20 | // pushPullScale is the minimum number of nodes 21 | // before we start scaling the push/pull timing. The scale 22 | // effect is the log2(Nodes) - log2(pushPullScale). This means 23 | // that the 33rd node will cause us to double the interval, 24 | // while the 65th will triple it. 25 | const pushPullScaleThreshold = 32 26 | 27 | const ( 28 | // Constant litWidth 2-8 29 | lzwLitWidth = 8 30 | ) 31 | 32 | func init() { 33 | seed.Init() 34 | } 35 | 36 | // Decode reverses the encode operation on a byte slice input 37 | func decode(buf []byte, out interface{}) error { 38 | r := bytes.NewReader(buf) 39 | hd := codec.MsgpackHandle{} 40 | dec := codec.NewDecoder(r, &hd) 41 | return dec.Decode(out) 42 | } 43 | 44 | // Encode writes an encoded object to a new bytes buffer 45 | func encode(msgType messageType, in interface{}) (*bytes.Buffer, error) { 46 | buf := bytes.NewBuffer(nil) 47 | buf.WriteByte(uint8(msgType)) 48 | hd := codec.MsgpackHandle{} 49 | enc := codec.NewEncoder(buf, &hd) 50 | err := enc.Encode(in) 51 | return buf, err 52 | } 53 | 54 | // Returns a random offset between 0 and n 55 | func randomOffset(n int) int { 56 | if n == 0 { 57 | return 0 58 | } 59 | return int(rand.Uint32() % uint32(n)) 60 | } 61 | 62 | // suspicionTimeout computes the timeout that should be used when 63 | // a node is suspected 64 | func suspicionTimeout(suspicionMult, n int, interval time.Duration) time.Duration { 65 | nodeScale := math.Max(1.0, math.Log10(math.Max(1.0, float64(n)))) 66 | // multiply by 1000 to keep some precision 
because time.Duration is an int64 type 67 | timeout := time.Duration(suspicionMult) * time.Duration(nodeScale*1000) * interval / 1000 68 | return timeout 69 | } 70 | 71 | // retransmitLimit computes the limit of retransmissions 72 | func retransmitLimit(retransmitMult, n int) int { 73 | nodeScale := math.Ceil(math.Log10(float64(n + 1))) 74 | limit := retransmitMult * int(nodeScale) 75 | return limit 76 | } 77 | 78 | // shuffleNodes randomly shuffles the input nodes using the Fisher-Yates shuffle 79 | func shuffleNodes(nodes []*nodeState) { 80 | n := len(nodes) 81 | for i := n - 1; i > 0; i-- { 82 | j := rand.Intn(i + 1) 83 | nodes[i], nodes[j] = nodes[j], nodes[i] 84 | } 85 | } 86 | 87 | // pushPullScale is used to scale the time interval at which push/pull 88 | // syncs take place. It is used to prevent network saturation as the 89 | // cluster size grows 90 | func pushPullScale(interval time.Duration, n int) time.Duration { 91 | // Don't scale until we cross the threshold 92 | if n <= pushPullScaleThreshold { 93 | return interval 94 | } 95 | 96 | multiplier := math.Ceil(math.Log2(float64(n))-math.Log2(pushPullScaleThreshold)) + 1.0 97 | return time.Duration(multiplier) * interval 98 | } 99 | 100 | // moveDeadNodes moves nodes that are dead and beyond the gossip to the dead interval 101 | // to the end of the slice and returns the index of the first moved node.
102 | func moveDeadNodes(nodes []*nodeState, gossipToTheDeadTime time.Duration) int { 103 | numDead := 0 104 | n := len(nodes) 105 | for i := 0; i < n-numDead; i++ { 106 | if nodes[i].State != stateDead { 107 | continue 108 | } 109 | 110 | // Respect the gossip to the dead interval 111 | if time.Since(nodes[i].StateChange) <= gossipToTheDeadTime { 112 | continue 113 | } 114 | 115 | // Move this node to the end 116 | nodes[i], nodes[n-numDead-1] = nodes[n-numDead-1], nodes[i] 117 | numDead++ 118 | i-- 119 | } 120 | return n - numDead 121 | } 122 | 123 | // kRandomNodes is used to select up to k random nodes, excluding any nodes where 124 | // the filter function returns true. It is possible that less than k nodes are 125 | // returned. 126 | func kRandomNodes(k int, nodes []*nodeState, filterFn func(*nodeState) bool) []*nodeState { 127 | n := len(nodes) 128 | kNodes := make([]*nodeState, 0, k) 129 | OUTER: 130 | // Probe up to 3*n times, with large n this is not necessary 131 | // since k << n, but with small n we want search to be 132 | // exhaustive 133 | for i := 0; i < 3*n && len(kNodes) < k; i++ { 134 | // Get random node 135 | idx := randomOffset(n) 136 | node := nodes[idx] 137 | 138 | // Give the filter a shot at it. 
139 | if filterFn != nil && filterFn(node) { 140 | continue OUTER 141 | } 142 | 143 | // Check if we have this node already 144 | for j := 0; j < len(kNodes); j++ { 145 | if node == kNodes[j] { 146 | continue OUTER 147 | } 148 | } 149 | 150 | // Append the node 151 | kNodes = append(kNodes, node) 152 | } 153 | return kNodes 154 | } 155 | 156 | // makeCompoundMessage takes a list of messages and generates 157 | // a single compound message containing all of them 158 | func makeCompoundMessage(msgs [][]byte) *bytes.Buffer { 159 | // Create a local buffer 160 | buf := bytes.NewBuffer(nil) 161 | 162 | // Write out the type 163 | buf.WriteByte(uint8(compoundMsg)) 164 | 165 | // Write out the number of message 166 | buf.WriteByte(uint8(len(msgs))) 167 | 168 | // Add the message lengths 169 | for _, m := range msgs { 170 | binary.Write(buf, binary.BigEndian, uint16(len(m))) 171 | } 172 | 173 | // Append the messages 174 | for _, m := range msgs { 175 | buf.Write(m) 176 | } 177 | 178 | return buf 179 | } 180 | 181 | // decodeCompoundMessage splits a compound message and returns 182 | // the slices of individual messages. 
// decodeCompoundMessage expects buf to start at the message-count byte (the
// compound type byte already removed), followed by one big-endian uint16
// length per message and then the message bodies.
//
// It returns the decoded parts, which alias buf, and the number of trailing
// messages that could not be extracted because buf ended early (trunc). An
// error is returned only when the count byte or the length table is missing.
func decodeCompoundMessage(buf []byte) (trunc int, parts [][]byte, err error) {
	if len(buf) < 1 {
		err = fmt.Errorf("missing compound length byte")
		return
	}
	// Widen to int before any arithmetic: as a uint8, numParts*2 wraps for
	// counts of 128 and above, which defeats the length check below and
	// slices the buffer at the wrong offset.
	numParts := int(buf[0])
	buf = buf[1:]

	// Check we have enough bytes
	if len(buf) < numParts*2 {
		err = fmt.Errorf("truncated len slice")
		return
	}

	// Decode the lengths
	lengths := make([]uint16, numParts)
	for i := 0; i < numParts; i++ {
		lengths[i] = binary.BigEndian.Uint16(buf[i*2 : i*2+2])
	}
	buf = buf[numParts*2:]

	// Split each message
	for idx, msgLen := range lengths {
		if len(buf) < int(msgLen) {
			trunc = numParts - idx
			return
		}

		// Extract the slice, seek past on the buffer
		slice := buf[:msgLen]
		buf = buf[msgLen:]
		parts = append(parts, slice)
	}
	return
}

// hasPort reports whether s includes a port. s is expected to have the form
// "host", "host:port", "ipv6::addr" or "[ipv6::address]:port".
//
// A string starting with '[' has a port only when its last colon directly
// follows ']'. Any other string has a port only when it contains exactly one
// colon.
func hasPort(s string) bool {
	last := strings.LastIndex(s, ":")
	if last == -1 {
		return false
	}
	if s[0] == '[' {
		return s[last-1] == ']'
	}
	return strings.Index(s, ":") == last
}

// compressPayload takes an opaque input buffer, compresses it
// and wraps it in a compress{} message that is encoded.
236 | func compressPayload(inp []byte) (*bytes.Buffer, error) { 237 | var buf bytes.Buffer 238 | compressor := lzw.NewWriter(&buf, lzw.LSB, lzwLitWidth) 239 | 240 | _, err := compressor.Write(inp) 241 | if err != nil { 242 | return nil, err 243 | } 244 | 245 | // Ensure we flush everything out 246 | if err := compressor.Close(); err != nil { 247 | return nil, err 248 | } 249 | 250 | // Create a compressed message 251 | c := compress{ 252 | Algo: lzwAlgo, 253 | Buf: buf.Bytes(), 254 | } 255 | return encode(compressMsg, &c) 256 | } 257 | 258 | // decompressPayload is used to unpack an encoded compress{} 259 | // message and return its payload uncompressed 260 | func decompressPayload(msg []byte) ([]byte, error) { 261 | // Decode the message 262 | var c compress 263 | if err := decode(msg, &c); err != nil { 264 | return nil, err 265 | } 266 | return decompressBuffer(&c) 267 | } 268 | 269 | // decompressBuffer is used to decompress the buffer of 270 | // a single compress message, handling multiple algorithms 271 | func decompressBuffer(c *compress) ([]byte, error) { 272 | // Verify the algorithm 273 | if c.Algo != lzwAlgo { 274 | return nil, fmt.Errorf("Cannot decompress unknown algorithm %d", c.Algo) 275 | } 276 | 277 | // Create a uncompressor 278 | uncomp := lzw.NewReader(bytes.NewReader(c.Buf), lzw.LSB, lzwLitWidth) 279 | defer uncomp.Close() 280 | 281 | // Read all the data 282 | var b bytes.Buffer 283 | _, err := io.Copy(&b, uncomp) 284 | if err != nil { 285 | return nil, err 286 | } 287 | 288 | // Return the uncompressed bytes 289 | return b.Bytes(), nil 290 | } 291 | 292 | // joinHostPort returns the host:port form of an address, for use with a 293 | // transport. 
294 | func joinHostPort(host string, port uint16) string { 295 | return net.JoinHostPort(host, strconv.Itoa(int(port))) 296 | } 297 | -------------------------------------------------------------------------------- /net_transport.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "net" 7 | "sync" 8 | "sync/atomic" 9 | "time" 10 | 11 | "github.com/armon/go-metrics" 12 | sockaddr "github.com/hashicorp/go-sockaddr" 13 | ) 14 | 15 | const ( 16 | // udpPacketBufSize is used to buffer incoming packets during read 17 | // operations. 18 | udpPacketBufSize = 65536 19 | 20 | // udpRecvBufSize is a large buffer size that we attempt to set UDP 21 | // sockets to in order to handle a large volume of messages. 22 | udpRecvBufSize = 2 * 1024 * 1024 23 | ) 24 | 25 | // NetTransportConfig is used to configure a net transport. 26 | type NetTransportConfig struct { 27 | // BindAddrs is a list of addresses to bind to for both TCP and UDP 28 | // communications. 29 | BindAddrs []string 30 | 31 | // BindPort is the port to listen on, for each address above. 32 | BindPort int 33 | 34 | // Logger is a logger for operator messages. 35 | Logger *log.Logger 36 | } 37 | 38 | // NetTransport is a Transport implementation that uses connectionless UDP for 39 | // packet operations, and ad-hoc TCP connections for stream operations. 40 | type NetTransport struct { 41 | config *NetTransportConfig 42 | packetCh chan *Packet 43 | streamCh chan net.Conn 44 | logger *log.Logger 45 | wg sync.WaitGroup 46 | tcpListeners []*net.TCPListener 47 | udpListeners []*net.UDPConn 48 | shutdown int32 49 | } 50 | 51 | // NewNetTransport returns a net transport with the given configuration. On 52 | // success all the network listeners will be created and listening. 
53 | func NewNetTransport(config *NetTransportConfig) (*NetTransport, error) { 54 | // If we reject the empty list outright we can assume that there's at 55 | // least one listener of each type later during operation. 56 | if len(config.BindAddrs) == 0 { 57 | return nil, fmt.Errorf("At least one bind address is required") 58 | } 59 | 60 | // Build out the new transport. 61 | var ok bool 62 | t := NetTransport{ 63 | config: config, 64 | packetCh: make(chan *Packet), 65 | streamCh: make(chan net.Conn), 66 | logger: config.Logger, 67 | } 68 | 69 | // Clean up listeners if there's an error. 70 | defer func() { 71 | if !ok { 72 | t.Shutdown() 73 | } 74 | }() 75 | 76 | // Build all the TCP and UDP listeners. 77 | port := config.BindPort 78 | for _, addr := range config.BindAddrs { 79 | ip := net.ParseIP(addr) 80 | 81 | tcpAddr := &net.TCPAddr{IP: ip, Port: port} 82 | tcpLn, err := net.ListenTCP("tcp", tcpAddr) 83 | if err != nil { 84 | return nil, fmt.Errorf("Failed to start TCP listener on %q port %d: %v", addr, port, err) 85 | } 86 | t.tcpListeners = append(t.tcpListeners, tcpLn) 87 | 88 | // If the config port given was zero, use the first TCP listener 89 | // to pick an available port and then apply that to everything 90 | // else. 91 | if port == 0 { 92 | port = tcpLn.Addr().(*net.TCPAddr).Port 93 | } 94 | 95 | udpAddr := &net.UDPAddr{IP: ip, Port: port} 96 | udpLn, err := net.ListenUDP("udp", udpAddr) 97 | if err != nil { 98 | return nil, fmt.Errorf("Failed to start UDP listener on %q port %d: %v", addr, port, err) 99 | } 100 | if err := setUDPRecvBuf(udpLn); err != nil { 101 | return nil, fmt.Errorf("Failed to resize UDP buffer: %v", err) 102 | } 103 | t.udpListeners = append(t.udpListeners, udpLn) 104 | } 105 | 106 | // Fire them up now that we've been able to create them all. 
107 | for i := 0; i < len(config.BindAddrs); i++ { 108 | t.wg.Add(2) 109 | go t.tcpListen(t.tcpListeners[i]) 110 | go t.udpListen(t.udpListeners[i]) 111 | } 112 | 113 | ok = true 114 | return &t, nil 115 | } 116 | 117 | // GetAutoBindPort returns the bind port that was automatically given by the 118 | // kernel, if a bind port of 0 was given. 119 | func (t *NetTransport) GetAutoBindPort() int { 120 | // We made sure there's at least one TCP listener, and that one's 121 | // port was applied to all the others for the dynamic bind case. 122 | return t.tcpListeners[0].Addr().(*net.TCPAddr).Port 123 | } 124 | 125 | // See Transport. 126 | func (t *NetTransport) FinalAdvertiseAddr(ip string, port int) (net.IP, int, error) { 127 | var advertiseAddr net.IP 128 | var advertisePort int 129 | if ip != "" { 130 | // If they've supplied an address, use that. 131 | advertiseAddr = net.ParseIP(ip) 132 | if advertiseAddr == nil { 133 | return nil, 0, fmt.Errorf("Failed to parse advertise address %q", ip) 134 | } 135 | 136 | // Ensure IPv4 conversion if necessary. 137 | if ip4 := advertiseAddr.To4(); ip4 != nil { 138 | advertiseAddr = ip4 139 | } 140 | advertisePort = port 141 | } else { 142 | if t.config.BindAddrs[0] == "0.0.0.0" { 143 | // Otherwise, if we're not bound to a specific IP, let's 144 | // use a suitable private IP address. 145 | var err error 146 | ip, err = sockaddr.GetPrivateIP() 147 | if err != nil { 148 | return nil, 0, fmt.Errorf("Failed to get interface addresses: %v", err) 149 | } 150 | if ip == "" { 151 | return nil, 0, fmt.Errorf("No private IP address found, and explicit IP not provided") 152 | } 153 | 154 | advertiseAddr = net.ParseIP(ip) 155 | if advertiseAddr == nil { 156 | return nil, 0, fmt.Errorf("Failed to parse advertise address: %q", ip) 157 | } 158 | } else { 159 | // Use the IP that we're bound to, based on the first 160 | // TCP listener, which we already ensure is there. 
161 | advertiseAddr = t.tcpListeners[0].Addr().(*net.TCPAddr).IP 162 | } 163 | 164 | // Use the port we are bound to. 165 | advertisePort = t.GetAutoBindPort() 166 | } 167 | 168 | return advertiseAddr, advertisePort, nil 169 | } 170 | 171 | // See Transport. 172 | func (t *NetTransport) WriteTo(b []byte, addr string) (time.Time, error) { 173 | udpAddr, err := net.ResolveUDPAddr("udp", addr) 174 | if err != nil { 175 | return time.Time{}, err 176 | } 177 | 178 | // We made sure there's at least one UDP listener, so just use the 179 | // packet sending interface on the first one. Take the time after the 180 | // write call comes back, which will underestimate the time a little, 181 | // but help account for any delays before the write occurs. 182 | _, err = t.udpListeners[0].WriteTo(b, udpAddr) 183 | return time.Now(), err 184 | } 185 | 186 | // See Transport. 187 | func (t *NetTransport) PacketCh() <-chan *Packet { 188 | return t.packetCh 189 | } 190 | 191 | // See Transport. 192 | func (t *NetTransport) DialTimeout(addr string, timeout time.Duration) (net.Conn, error) { 193 | dialer := net.Dialer{Timeout: timeout} 194 | return dialer.Dial("tcp", addr) 195 | } 196 | 197 | // See Transport. 198 | func (t *NetTransport) StreamCh() <-chan net.Conn { 199 | return t.streamCh 200 | } 201 | 202 | // See Transport. 203 | func (t *NetTransport) Shutdown() error { 204 | // This will avoid log spam about errors when we shut down. 205 | atomic.StoreInt32(&t.shutdown, 1) 206 | 207 | // Rip through all the connections and shut them down. 208 | for _, conn := range t.tcpListeners { 209 | conn.Close() 210 | } 211 | for _, conn := range t.udpListeners { 212 | conn.Close() 213 | } 214 | 215 | // Block until all the listener threads have died. 216 | t.wg.Wait() 217 | return nil 218 | } 219 | 220 | // tcpListen is a long running goroutine that accepts incoming TCP connections 221 | // and hands them off to the stream channel. 
222 | func (t *NetTransport) tcpListen(tcpLn *net.TCPListener) { 223 | defer t.wg.Done() 224 | for { 225 | conn, err := tcpLn.AcceptTCP() 226 | if err != nil { 227 | if s := atomic.LoadInt32(&t.shutdown); s == 1 { 228 | break 229 | } 230 | 231 | t.logger.Printf("[ERR] memberlist: Error accepting TCP connection: %v", err) 232 | continue 233 | } 234 | 235 | t.streamCh <- conn 236 | } 237 | } 238 | 239 | // udpListen is a long running goroutine that accepts incoming UDP packets and 240 | // hands them off to the packet channel. 241 | func (t *NetTransport) udpListen(udpLn *net.UDPConn) { 242 | defer t.wg.Done() 243 | for { 244 | // Do a blocking read into a fresh buffer. Grab a time stamp as 245 | // close as possible to the I/O. 246 | buf := make([]byte, udpPacketBufSize) 247 | n, addr, err := udpLn.ReadFrom(buf) 248 | ts := time.Now() 249 | if err != nil { 250 | if s := atomic.LoadInt32(&t.shutdown); s == 1 { 251 | break 252 | } 253 | 254 | t.logger.Printf("[ERR] memberlist: Error reading UDP packet: %v", err) 255 | continue 256 | } 257 | 258 | // Check the length - it needs to have at least one byte to be a 259 | // proper message. 260 | if n < 1 { 261 | t.logger.Printf("[ERR] memberlist: UDP packet too short (%d bytes) %s", 262 | len(buf), LogAddress(addr)) 263 | continue 264 | } 265 | 266 | // Ingest the packet. 267 | metrics.IncrCounter([]string{"memberlist", "udp", "received"}, float32(n)) 268 | t.packetCh <- &Packet{ 269 | Buf: buf[:n], 270 | From: addr, 271 | Timestamp: ts, 272 | } 273 | } 274 | } 275 | 276 | // setUDPRecvBuf is used to resize the UDP receive window. The function 277 | // attempts to set the read buffer to `udpRecvBuf` but backs off until 278 | // the read buffer can be set. 
func setUDPRecvBuf(c *net.UDPConn) error {
	size := udpRecvBufSize
	var err error
	// Keep halving the requested size until the socket accepts it.
	for size > 0 {
		if err = c.SetReadBuffer(size); err == nil {
			return nil
		}
		size = size / 2
	}
	// Every size was rejected; report the last failure.
	return err
}

--------------------------------------------------------------------------------
/config.go:
--------------------------------------------------------------------------------
package memberlist

import (
	"io"
	"log"
	"os"
	"time"
)

// Config holds the settings used to configure a memberlist node. The
// DefaultLANConfig, DefaultWANConfig and DefaultLocalConfig functions return
// pre-populated configurations.
type Config struct {
	// The name of this node. This must be unique in the cluster.
	Name string

	// Transport is a hook for providing custom code to communicate with
	// other nodes. If this is left nil, then memberlist will by default
	// make a NetTransport using BindAddr and BindPort from this structure.
	Transport Transport

	// ClusterName is the name of the cluster.
	ClusterName string

	// Configuration related to what address to bind to and ports to
	// listen on. The port is used for both UDP and TCP gossip. It is
	// assumed other nodes are running on this port, but they do not need
	// to.
	BindAddr string
	BindPort int

	// Configuration related to what address to advertise to other
	// cluster members. Used for NAT traversal.
	AdvertiseAddr string
	AdvertisePort int

	// ProtocolVersion is the configured protocol version that we
	// will _speak_. This must be between ProtocolVersionMin and
	// ProtocolVersionMax.
	ProtocolVersion uint8

	// TCPTimeout is the timeout for establishing a stream connection with
	// a remote node for a full state sync, and for stream read and write
	// operations. This is a legacy name for backwards compatibility, but
	// should really be called StreamTimeout now that we have generalized
	// the transport.
	TCPTimeout time.Duration

	// IndirectChecks is the number of nodes that will be asked to perform
	// an indirect probe of a node in the case a direct probe fails. Memberlist
	// waits for an ack from any single indirect node, so increasing this
	// number will increase the likelihood that an indirect probe will succeed
	// at the expense of bandwidth.
	IndirectChecks int

	// RetransmitMult is the multiplier for the number of retransmissions
	// that are attempted for messages broadcasted over gossip. The actual
	// count of retransmissions is calculated using the formula:
	//
	//   Retransmits = RetransmitMult * log(N+1)
	//
	// This allows the retransmits to scale properly with cluster size. The
	// higher the multiplier, the more likely a failed broadcast is to converge
	// at the expense of increased bandwidth.
	RetransmitMult int

	// SuspicionMult is the multiplier for determining the time an
	// inaccessible node is considered suspect before declaring it dead.
	// The actual timeout is calculated using the formula:
	//
	//   SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
	//
	// This allows the timeout to scale properly with expected propagation
	// delay with a larger cluster size. The higher the multiplier, the longer
	// an inaccessible node is considered part of the cluster before declaring
	// it dead, giving that suspect node more time to refute if it is indeed
	// still alive.
	SuspicionMult int

	// SuspicionMaxTimeoutMult is the multiplier applied to the
	// SuspicionTimeout used as an upper bound on detection time. This max
	// timeout is calculated using the formula:
	//
	//   SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout
	//
	// If everything is working properly, confirmations from other nodes will
	// accelerate suspicion timers in a manner which will cause the timeout
	// to reach the base SuspicionTimeout before that elapses, so this value
	// will typically only come into play if a node is experiencing issues
	// communicating with other nodes. It should be set to something fairly
	// large so that a node having problems will have a lot of chances to
	// recover before falsely declaring other nodes as failed, but short
	// enough for a legitimately isolated node to still make progress marking
	// nodes failed in a reasonable amount of time.
	SuspicionMaxTimeoutMult int

	// PushPullInterval is the interval between complete state syncs.
	// Complete state syncs are done with a single node over TCP and are
	// quite expensive relative to standard gossiped messages. Setting this
	// to zero will disable state push/pull syncs completely.
	//
	// Setting this interval lower (more frequent) will increase convergence
	// speeds across larger clusters at the expense of increased bandwidth
	// usage.
	PushPullInterval time.Duration

	// ProbeInterval and ProbeTimeout are used to configure probing
	// behavior for memberlist.
	//
	// ProbeInterval is the interval between random node probes. Setting
	// this lower (more frequent) will cause the memberlist cluster to detect
	// failed nodes more quickly at the expense of increased bandwidth usage.
	//
	// ProbeTimeout is the timeout to wait for an ack from a probed node
	// before assuming it is unhealthy. This should be set to 99-percentile
	// of RTT (round-trip time) on your network.
	ProbeInterval time.Duration
	ProbeTimeout  time.Duration

	// DisableTcpPings will turn off the fallback TCP pings that are attempted
	// if the direct UDP ping fails. These get pipelined along with the
	// indirect UDP pings.
	DisableTcpPings bool

	// AwarenessMaxMultiplier will increase the probe interval if the node
	// becomes aware that it might be degraded and not meeting the soft real
	// time requirements to reliably probe other nodes.
	AwarenessMaxMultiplier int

	// GossipInterval and GossipNodes are used to configure the gossip
	// behavior of memberlist.
	//
	// GossipInterval is the interval between sending messages that need
	// to be gossiped that haven't been able to piggyback on probing messages.
	// If this is set to zero, non-piggyback gossip is disabled. By lowering
	// this value (more frequent) gossip messages are propagated across
	// the cluster more quickly at the expense of increased bandwidth.
	//
	// GossipNodes is the number of random nodes to send gossip messages to
	// per GossipInterval. Increasing this number causes the gossip messages
	// to propagate across the cluster more quickly at the expense of
	// increased bandwidth.
	//
	// GossipToTheDeadTime is the interval after which a node has died that
	// we will still try to gossip to it. This gives it a chance to refute.
	GossipInterval      time.Duration
	GossipNodes         int
	GossipToTheDeadTime time.Duration

	// GossipMessages is the number of times we try to get new messages within
	// each GossipInterval. Each set of messages is sent to GossipNodes number
	// of nodes.
	GossipMessages int

	// GossipVerifyIncoming controls whether to enforce encryption for incoming
	// gossip. It is used for upshifting from unencrypted to encrypted gossip on
	// a running cluster.
	GossipVerifyIncoming bool

	// GossipVerifyOutgoing controls whether to enforce encryption for outgoing
	// gossip. It is used for upshifting from unencrypted to encrypted gossip on
	// a running cluster.
	GossipVerifyOutgoing bool

	// EnableCompression is used to control message compression. This can
	// be used to reduce bandwidth usage at the cost of slightly more CPU
	// utilization. This is only available starting at protocol version 1.
	EnableCompression bool

	// SecretKey is used to initialize the primary encryption key in a keyring.
	// The primary encryption key is the only key used to encrypt messages and
	// the first key used while attempting to decrypt messages. Providing a
	// value for this primary key will enable message-level encryption and
	// verification, and automatically install the key onto the keyring.
	// The value should be either 16, 24, or 32 bytes to select AES-128,
	// AES-192, or AES-256.
	SecretKey []byte

	// The keyring holds all of the encryption keys used internally. It is
	// automatically initialized using the SecretKey value.
	// NOTE(review): this comment used to mention a "SecretKeys" value as
	// well, but Config has no such field; confirm against the keyring
	// setup code.
	Keyring *Keyring

	// Delegate and Events are delegates for receiving and providing
	// data to memberlist via callback mechanisms. For Delegate, see
	// the Delegate interface. For Events, see the EventDelegate interface.
	//
	// The DelegateProtocolMin/Max are used to guarantee protocol-compatibility
	// for any custom messages that the delegate might do (broadcasts,
	// local/remote state, etc.). If you don't set these, then the protocol
	// versions will just be zero, and version compliance won't be done.
	Delegate                Delegate
	DelegateProtocolVersion uint8
	DelegateProtocolMin     uint8
	DelegateProtocolMax     uint8
	Events                  EventDelegate
	Conflict                ConflictDelegate
	Merge                   MergeDelegate
	Ping                    PingDelegate
	Alive                   AliveDelegate

	// DNSConfigPath points to the system's DNS config file, usually located
	// at /etc/resolv.conf. It can be overridden via config for easier testing.
	DNSConfigPath string

	// LogOutput is the writer where logs should be sent. If this is not
	// set, logging will go to stderr by default. You cannot specify both LogOutput
	// and Logger at the same time.
	LogOutput io.Writer

	// Logger is a custom logger which you provide. If Logger is set, it will use
	// this for the internal logger. If Logger is not set, it will fall back to the
	// behavior for using LogOutput. You cannot specify both LogOutput and Logger
	// at the same time.
	Logger *log.Logger

	// Size of Memberlist's internal channel which handles UDP messages. The
	// size of this determines the size of the queue which Memberlist will keep
	// while UDP messages are handled.
	HandoffQueueDepth int

	// Maximum number of bytes that memberlist will put in a packet (this
	// will be for UDP packets by default with a NetTransport). A safe value
	// for this is typically 1400 bytes (which is the default). However,
	// depending on your network's MTU (Maximum Transmission Unit) you may
	// be able to increase this to get more content into each gossip packet.
	// This is a legacy name for backward compatibility but should really be
	// called PacketBufferSize now that we have generalized the transport.
	UDPBufferSize int

	// Prefer TCP-based DNS lookup. Do we use a memberlist custom lookup function
	// first, using deprecated ANY record, or just rely on Go's DNS resolver?
	PreferTCPDNS bool
}

// DefaultLANConfig returns a sane set of configurations for Memberlist.
// It uses the hostname as the node name, and otherwise sets very conservative
// values that are sane for most LAN environments. The default configuration
// errs on the side of caution, choosing values that are optimized
// for higher convergence at the cost of higher bandwidth usage. Regardless,
// these values are a good starting point when getting started with memberlist.
func DefaultLANConfig() *Config {
	// The hostname lookup error is ignored; on failure the node name is
	// simply left empty.
	hostname, _ := os.Hostname()
	return &Config{
		Name:                    hostname,
		ClusterName:             "default",
		BindAddr:                "0.0.0.0",
		BindPort:                7946,
		AdvertiseAddr:           "",
		AdvertisePort:           7946,
		ProtocolVersion:         ProtocolVersion2Compatible,
		TCPTimeout:              10 * time.Second,       // Timeout after 10 seconds
		IndirectChecks:          3,                      // Use 3 nodes for the indirect ping
		RetransmitMult:          4,                      // Retransmit a message 4 * log(N+1) times
		SuspicionMult:           5,                      // Suspect a node for 5 * log(N+1) * Interval
		SuspicionMaxTimeoutMult: 6,                      // For 10k nodes this will give a max timeout of 120 seconds
		PushPullInterval:        30 * time.Second,       // Low frequency
		ProbeTimeout:            500 * time.Millisecond, // Reasonable RTT time for LAN
		ProbeInterval:           1 * time.Second,        // Failure check every second
		DisableTcpPings:         false,                  // TCP pings are safe, even with mixed versions
		AwarenessMaxMultiplier:  8,                      // Probe interval backs off to 8 seconds

		GossipNodes:          3,                      // Gossip to 3 nodes
		GossipMessages:       3,                      // Ask for 3 sets of messages on each pass
		GossipInterval:       200 * time.Millisecond, // Gossip more rapidly
		GossipToTheDeadTime:  30 * time.Second,       // Same as push/pull
		GossipVerifyIncoming: true,
		GossipVerifyOutgoing: true,

		EnableCompression: true, // Enable compression by default

		SecretKey: nil,
		Keyring:   nil,

		DNSConfigPath: "/etc/resolv.conf",
		PreferTCPDNS:  true,

		HandoffQueueDepth: 1024,
		UDPBufferSize:     1400,
	}
}

// DefaultWANConfig works like DefaultLANConfig, however it returns a
// configuration that is optimized for most WAN environments. The default
// configuration is still very conservative and errs on the side of caution.
func DefaultWANConfig() *Config {
	conf := DefaultLANConfig()
	conf.TCPTimeout = 30 * time.Second
	conf.SuspicionMult = 6
	conf.PushPullInterval = 60 * time.Second
	conf.ProbeTimeout = 3 * time.Second
	conf.ProbeInterval = 5 * time.Second
	conf.GossipNodes = 4    // Gossip less frequently, but to an additional node
	conf.GossipMessages = 4 // Ask for 4 sets of messages on each pass
	conf.GossipInterval = 500 * time.Millisecond
	conf.GossipToTheDeadTime = 60 * time.Second
	return conf
}

// DefaultLocalConfig works like DefaultLANConfig, however it returns a
// configuration that is optimized for a local loopback environment. The
// default configuration is still very conservative and errs on the side of
// caution.
func DefaultLocalConfig() *Config {
	// Start from the LAN defaults and override the timing-related values.
	conf := DefaultLANConfig()
	conf.TCPTimeout = time.Second
	conf.IndirectChecks = 1
	conf.RetransmitMult = 2
	conf.SuspicionMult = 3
	conf.PushPullInterval = 15 * time.Second
	conf.ProbeTimeout = 200 * time.Millisecond
	conf.ProbeInterval = time.Second
	conf.GossipMessages = 2 // There are usually fewer nodes in this environment
	conf.GossipInterval = 100 * time.Millisecond
	conf.GossipToTheDeadTime = 15 * time.Second
	return conf
}

// EncryptionEnabled reports whether encryption is enabled, that is, whether
// a keyring is configured and it holds at least one key.
func (c *Config) EncryptionEnabled() bool {
	return c.Keyring != nil && len(c.Keyring.GetKeys()) > 0
}

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Mozilla Public License, version 2.0

1. Definitions

1.1. “Contributor”

means each individual or legal entity that creates, contributes to the
creation of, or owns Covered Software.

1.2. “Contributor Version”

means the combination of the Contributions of others (if any) used by a
Contributor and that particular Contributor’s Contribution.

1.3. “Contribution”

means Covered Software of a particular Contributor.

1.4. “Covered Software”

means Source Code Form to which the initial Contributor has attached the
notice in Exhibit A, the Executable Form of such Source Code Form, and
Modifications of such Source Code Form, in each case including portions
thereof.

1.5. “Incompatible With Secondary Licenses”
means

a. that the initial Contributor has attached the notice described in
Exhibit B to the Covered Software; or

b.
that the Covered Software was made available under the terms of version 33 | 1.1 or earlier of the License, but not also under the terms of a 34 | Secondary License. 35 | 36 | 1.6. “Executable Form” 37 | 38 | means any form of the work other than Source Code Form. 39 | 40 | 1.7. “Larger Work” 41 | 42 | means a work that combines Covered Software with other material, in a separate 43 | file or files, that is not Covered Software. 44 | 45 | 1.8. “License” 46 | 47 | means this document. 48 | 49 | 1.9. “Licensable” 50 | 51 | means having the right to grant, to the maximum extent possible, whether at the 52 | time of the initial grant or subsequently, any and all of the rights conveyed by 53 | this License. 54 | 55 | 1.10. “Modifications” 56 | 57 | means any of the following: 58 | 59 | a. any file in Source Code Form that results from an addition to, deletion 60 | from, or modification of the contents of Covered Software; or 61 | 62 | b. any new file in Source Code Form that contains any Covered Software. 63 | 64 | 1.11. “Patent Claims” of a Contributor 65 | 66 | means any patent claim(s), including without limitation, method, process, 67 | and apparatus claims, in any patent Licensable by such Contributor that 68 | would be infringed, but for the grant of the License, by the making, 69 | using, selling, offering for sale, having made, import, or transfer of 70 | either its Contributions or its Contributor Version. 71 | 72 | 1.12. “Secondary License” 73 | 74 | means either the GNU General Public License, Version 2.0, the GNU Lesser 75 | General Public License, Version 2.1, the GNU Affero General Public 76 | License, Version 3.0, or any later versions of those licenses. 77 | 78 | 1.13. “Source Code Form” 79 | 80 | means the form of the work preferred for making modifications. 81 | 82 | 1.14. “You” (or “Your”) 83 | 84 | means an individual or a legal entity exercising rights under this 85 | License. 
For legal entities, “You” includes any entity that controls, is 86 | controlled by, or is under common control with You. For purposes of this 87 | definition, “control” means (a) the power, direct or indirect, to cause 88 | the direction or management of such entity, whether by contract or 89 | otherwise, or (b) ownership of more than fifty percent (50%) of the 90 | outstanding shares or beneficial ownership of such entity. 91 | 92 | 93 | 2. License Grants and Conditions 94 | 95 | 2.1. Grants 96 | 97 | Each Contributor hereby grants You a world-wide, royalty-free, 98 | non-exclusive license: 99 | 100 | a. under intellectual property rights (other than patent or trademark) 101 | Licensable by such Contributor to use, reproduce, make available, 102 | modify, display, perform, distribute, and otherwise exploit its 103 | Contributions, either on an unmodified basis, with Modifications, or as 104 | part of a Larger Work; and 105 | 106 | b. under Patent Claims of such Contributor to make, use, sell, offer for 107 | sale, have made, import, and otherwise transfer either its Contributions 108 | or its Contributor Version. 109 | 110 | 2.2. Effective Date 111 | 112 | The licenses granted in Section 2.1 with respect to any Contribution become 113 | effective for each Contribution on the date the Contributor first distributes 114 | such Contribution. 115 | 116 | 2.3. Limitations on Grant Scope 117 | 118 | The licenses granted in this Section 2 are the only rights granted under this 119 | License. No additional rights or licenses will be implied from the distribution 120 | or licensing of Covered Software under this License. Notwithstanding Section 121 | 2.1(b) above, no patent license is granted by a Contributor: 122 | 123 | a. for any code that a Contributor has removed from Covered Software; or 124 | 125 | b. 
for infringements caused by: (i) Your and any other third party’s 126 | modifications of Covered Software, or (ii) the combination of its 127 | Contributions with other software (except as part of its Contributor 128 | Version); or 129 | 130 | c. under Patent Claims infringed by Covered Software in the absence of its 131 | Contributions. 132 | 133 | This License does not grant any rights in the trademarks, service marks, or 134 | logos of any Contributor (except as may be necessary to comply with the 135 | notice requirements in Section 3.4). 136 | 137 | 2.4. Subsequent Licenses 138 | 139 | No Contributor makes additional grants as a result of Your choice to 140 | distribute the Covered Software under a subsequent version of this License 141 | (see Section 10.2) or under the terms of a Secondary License (if permitted 142 | under the terms of Section 3.3). 143 | 144 | 2.5. Representation 145 | 146 | Each Contributor represents that the Contributor believes its Contributions 147 | are its original creation(s) or it has sufficient rights to grant the 148 | rights to its Contributions conveyed by this License. 149 | 150 | 2.6. Fair Use 151 | 152 | This License is not intended to limit any rights You have under applicable 153 | copyright doctrines of fair use, fair dealing, or other equivalents. 154 | 155 | 2.7. Conditions 156 | 157 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in 158 | Section 2.1. 159 | 160 | 161 | 3. Responsibilities 162 | 163 | 3.1. Distribution of Source Form 164 | 165 | All distribution of Covered Software in Source Code Form, including any 166 | Modifications that You create or to which You contribute, must be under the 167 | terms of this License. You must inform recipients that the Source Code Form 168 | of the Covered Software is governed by the terms of this License, and how 169 | they can obtain a copy of this License. You may not attempt to alter or 170 | restrict the recipients’ rights in the Source Code Form. 
171 | 172 | 3.2. Distribution of Executable Form 173 | 174 | If You distribute Covered Software in Executable Form then: 175 | 176 | a. such Covered Software must also be made available in Source Code Form, 177 | as described in Section 3.1, and You must inform recipients of the 178 | Executable Form how they can obtain a copy of such Source Code Form by 179 | reasonable means in a timely manner, at a charge no more than the cost 180 | of distribution to the recipient; and 181 | 182 | b. You may distribute such Executable Form under the terms of this License, 183 | or sublicense it under different terms, provided that the license for 184 | the Executable Form does not attempt to limit or alter the recipients’ 185 | rights in the Source Code Form under this License. 186 | 187 | 3.3. Distribution of a Larger Work 188 | 189 | You may create and distribute a Larger Work under terms of Your choice, 190 | provided that You also comply with the requirements of this License for the 191 | Covered Software. If the Larger Work is a combination of Covered Software 192 | with a work governed by one or more Secondary Licenses, and the Covered 193 | Software is not Incompatible With Secondary Licenses, this License permits 194 | You to additionally distribute such Covered Software under the terms of 195 | such Secondary License(s), so that the recipient of the Larger Work may, at 196 | their option, further distribute the Covered Software under the terms of 197 | either this License or such Secondary License(s). 198 | 199 | 3.4. Notices 200 | 201 | You may not remove or alter the substance of any license notices (including 202 | copyright notices, patent notices, disclaimers of warranty, or limitations 203 | of liability) contained within the Source Code Form of the Covered 204 | Software, except that You may alter any license notices to the extent 205 | required to remedy known factual inaccuracies. 206 | 207 | 3.5. 
Application of Additional Terms 208 | 209 | You may choose to offer, and to charge a fee for, warranty, support, 210 | indemnity or liability obligations to one or more recipients of Covered 211 | Software. However, You may do so only on Your own behalf, and not on behalf 212 | of any Contributor. You must make it absolutely clear that any such 213 | warranty, support, indemnity, or liability obligation is offered by You 214 | alone, and You hereby agree to indemnify every Contributor for any 215 | liability incurred by such Contributor as a result of warranty, support, 216 | indemnity or liability terms You offer. You may include additional 217 | disclaimers of warranty and limitations of liability specific to any 218 | jurisdiction. 219 | 220 | 4. Inability to Comply Due to Statute or Regulation 221 | 222 | If it is impossible for You to comply with any of the terms of this License 223 | with respect to some or all of the Covered Software due to statute, judicial 224 | order, or regulation then You must: (a) comply with the terms of this License 225 | to the maximum extent possible; and (b) describe the limitations and the code 226 | they affect. Such description must be placed in a text file included with all 227 | distributions of the Covered Software under this License. Except to the 228 | extent prohibited by statute or regulation, such description must be 229 | sufficiently detailed for a recipient of ordinary skill to be able to 230 | understand it. 231 | 232 | 5. Termination 233 | 234 | 5.1. The rights granted under this License will terminate automatically if You 235 | fail to comply with any of its terms. 
However, if You become compliant, 236 | then the rights granted under this License from a particular Contributor 237 | are reinstated (a) provisionally, unless and until such Contributor 238 | explicitly and finally terminates Your grants, and (b) on an ongoing basis, 239 | if such Contributor fails to notify You of the non-compliance by some 240 | reasonable means prior to 60 days after You have come back into compliance. 241 | Moreover, Your grants from a particular Contributor are reinstated on an 242 | ongoing basis if such Contributor notifies You of the non-compliance by 243 | some reasonable means, this is the first time You have received notice of 244 | non-compliance with this License from such Contributor, and You become 245 | compliant prior to 30 days after Your receipt of the notice. 246 | 247 | 5.2. If You initiate litigation against any entity by asserting a patent 248 | infringement claim (excluding declaratory judgment actions, counter-claims, 249 | and cross-claims) alleging that a Contributor Version directly or 250 | indirectly infringes any patent, then the rights granted to You by any and 251 | all Contributors for the Covered Software under Section 2.1 of this License 252 | shall terminate. 253 | 254 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user 255 | license agreements (excluding distributors and resellers) which have been 256 | validly granted by You or Your distributors under this License prior to 257 | termination shall survive termination. 258 | 259 | 6. Disclaimer of Warranty 260 | 261 | Covered Software is provided under this License on an “as is” basis, without 262 | warranty of any kind, either expressed, implied, or statutory, including, 263 | without limitation, warranties that the Covered Software is free of defects, 264 | merchantable, fit for a particular purpose or non-infringing. The entire 265 | risk as to the quality and performance of the Covered Software is with You. 
266 | Should any Covered Software prove defective in any respect, You (not any 267 | Contributor) assume the cost of any necessary servicing, repair, or 268 | correction. This disclaimer of warranty constitutes an essential part of this 269 | License. No use of any Covered Software is authorized under this License 270 | except under this disclaimer. 271 | 272 | 7. Limitation of Liability 273 | 274 | Under no circumstances and under no legal theory, whether tort (including 275 | negligence), contract, or otherwise, shall any Contributor, or anyone who 276 | distributes Covered Software as permitted above, be liable to You for any 277 | direct, indirect, special, incidental, or consequential damages of any 278 | character including, without limitation, damages for lost profits, loss of 279 | goodwill, work stoppage, computer failure or malfunction, or any and all 280 | other commercial damages or losses, even if such party shall have been 281 | informed of the possibility of such damages. This limitation of liability 282 | shall not apply to liability for death or personal injury resulting from such 283 | party’s negligence to the extent applicable law prohibits such limitation. 284 | Some jurisdictions do not allow the exclusion or limitation of incidental or 285 | consequential damages, so this exclusion and limitation may not apply to You. 286 | 287 | 8. Litigation 288 | 289 | Any litigation relating to this License may be brought only in the courts of 290 | a jurisdiction where the defendant maintains its principal place of business 291 | and such litigation shall be governed by laws of that jurisdiction, without 292 | reference to its conflict-of-law provisions. Nothing in this Section shall 293 | prevent a party’s ability to bring cross-claims or counter-claims. 294 | 295 | 9. Miscellaneous 296 | 297 | This License represents the complete agreement concerning the subject matter 298 | hereof. 
If any provision of this License is held to be unenforceable, such 299 | provision shall be reformed only to the extent necessary to make it 300 | enforceable. Any law or regulation which provides that the language of a 301 | contract shall be construed against the drafter shall not be used to construe 302 | this License against a Contributor. 303 | 304 | 305 | 10. Versions of the License 306 | 307 | 10.1. New Versions 308 | 309 | Mozilla Foundation is the license steward. Except as provided in Section 310 | 10.3, no one other than the license steward has the right to modify or 311 | publish new versions of this License. Each version will be given a 312 | distinguishing version number. 313 | 314 | 10.2. Effect of New Versions 315 | 316 | You may distribute the Covered Software under the terms of the version of 317 | the License under which You originally received the Covered Software, or 318 | under the terms of any subsequent version published by the license 319 | steward. 320 | 321 | 10.3. Modified Versions 322 | 323 | If you create software not governed by this License, and you want to 324 | create a new license for such software, you may create and use a modified 325 | version of this License if you rename the license and remove any 326 | references to the name of the license steward (except to note that such 327 | modified license differs from this License). 328 | 329 | 10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses 330 | If You choose to distribute Source Code Form that is Incompatible With 331 | Secondary Licenses under the terms of this version of the License, the 332 | notice described in Exhibit B of this License must be attached. 333 | 334 | Exhibit A - Source Code Form License Notice 335 | 336 | This Source Code Form is subject to the 337 | terms of the Mozilla Public License, v. 338 | 2.0. If a copy of the MPL was not 339 | distributed with this file, You can 340 | obtain one at 341 | http://mozilla.org/MPL/2.0/. 
342 | 343 | If it is not possible or desirable to put the notice in a particular file, then 344 | You may include the notice in a location (such as a LICENSE file in a relevant 345 | directory) where a recipient would be likely to look for such a notice. 346 | 347 | You may add additional accurate notices of copyright ownership. 348 | 349 | Exhibit B - “Incompatible With Secondary Licenses” Notice 350 | 351 | This Source Code Form is “Incompatible 352 | With Secondary Licenses”, as defined by 353 | the Mozilla Public License, v. 2.0. 354 | 355 | -------------------------------------------------------------------------------- /memberlist.go: -------------------------------------------------------------------------------- 1 | /* 2 | memberlist is a library that manages cluster 3 | membership and member failure detection using a gossip based protocol. 4 | 5 | The use cases for such a library are far-reaching: all distributed systems 6 | require membership, and memberlist is a re-usable solution to managing 7 | cluster membership and node failure detection. 8 | 9 | memberlist is eventually consistent but converges quickly on average. 10 | The speed at which it converges can be heavily tuned via various knobs 11 | on the protocol. Node failures are detected and network partitions are partially 12 | tolerated by attempting to communicate to potentially dead nodes through 13 | multiple routes. 
14 | */ 15 | package memberlist 16 | 17 | import ( 18 | "fmt" 19 | "log" 20 | "net" 21 | "os" 22 | "strconv" 23 | "strings" 24 | "sync" 25 | "time" 26 | 27 | "github.com/hashicorp/go-multierror" 28 | sockaddr "github.com/hashicorp/go-sockaddr" 29 | "github.com/miekg/dns" 30 | ) 31 | 32 | type Memberlist struct { 33 | sequenceNum uint32 // Local sequence number 34 | incarnation uint32 // Local incarnation number 35 | numNodes uint32 // Number of known nodes (estimate) 36 | 37 | config *Config 38 | shutdown bool 39 | shutdownCh chan struct{} 40 | leave bool 41 | leaveBroadcast chan struct{} 42 | 43 | transport Transport 44 | handoff chan msgHandoff 45 | 46 | nodeLock sync.RWMutex 47 | nodes []*nodeState // Known nodes 48 | nodeMap map[string]*nodeState // Maps Node.Name -> NodeState 49 | nodeTimers map[string]*suspicion // Maps Addr.String() -> suspicion timer 50 | awareness *awareness 51 | 52 | tickerLock sync.Mutex 53 | tickers []*time.Ticker 54 | stopTick chan struct{} 55 | probeIndex int 56 | 57 | ackLock sync.Mutex 58 | ackHandlers map[uint32]*ackHandler 59 | 60 | broadcasts *TransmitLimitedQueue 61 | 62 | logger *log.Logger 63 | } 64 | 65 | // newMemberlist validates the config, sets up the transport and starts the 66 | // network listeners. Does not schedule execution of background maintenance. 67 | func newMemberlist(conf *Config) (*Memberlist, error) { 68 | if conf.ProtocolVersion < ProtocolVersionMin { 69 | return nil, fmt.Errorf("Protocol version '%d' too low. Must be in range: [%d, %d]", 70 | conf.ProtocolVersion, ProtocolVersionMin, ProtocolVersionMax) 71 | } else if conf.ProtocolVersion > ProtocolVersionMax { 72 | return nil, fmt.Errorf("Protocol version '%d' too high. 
Must be in range: [%d, %d]", 73 | conf.ProtocolVersion, ProtocolVersionMin, ProtocolVersionMax) 74 | } 75 | 76 | if len(conf.SecretKey) > 0 { 77 | if conf.Keyring == nil { 78 | keyring, err := NewKeyring(nil, conf.SecretKey) 79 | if err != nil { 80 | return nil, err 81 | } 82 | conf.Keyring = keyring 83 | } else { 84 | if err := conf.Keyring.AddKey(conf.SecretKey); err != nil { 85 | return nil, err 86 | } 87 | if err := conf.Keyring.UseKey(conf.SecretKey); err != nil { 88 | return nil, err 89 | } 90 | } 91 | } 92 | 93 | if conf.LogOutput != nil && conf.Logger != nil { 94 | return nil, fmt.Errorf("Cannot specify both LogOutput and Logger. Please choose a single log configuration setting.") 95 | } 96 | 97 | logDest := conf.LogOutput 98 | if logDest == nil { 99 | logDest = os.Stderr 100 | } 101 | 102 | logger := conf.Logger 103 | if logger == nil { 104 | logger = log.New(logDest, "", log.LstdFlags) 105 | } 106 | 107 | // Set up a network transport by default if a custom one wasn't given 108 | // by the config. 
109 | transport := conf.Transport 110 | if transport == nil { 111 | nc := &NetTransportConfig{ 112 | BindAddrs: []string{conf.BindAddr}, 113 | BindPort: conf.BindPort, 114 | Logger: logger, 115 | } 116 | nt, err := NewNetTransport(nc) 117 | if err != nil { 118 | return nil, fmt.Errorf("Could not set up network transport: %v", err) 119 | } 120 | 121 | if conf.BindPort == 0 { 122 | port := nt.GetAutoBindPort() 123 | conf.BindPort = port 124 | logger.Printf("[DEBUG] Using dynamic bind port %d", port) 125 | } 126 | transport = nt 127 | } 128 | 129 | m := &Memberlist{ 130 | config: conf, 131 | shutdownCh: make(chan struct{}), 132 | leaveBroadcast: make(chan struct{}, 1), 133 | transport: transport, 134 | handoff: make(chan msgHandoff, conf.HandoffQueueDepth), 135 | nodeMap: make(map[string]*nodeState), 136 | nodeTimers: make(map[string]*suspicion), 137 | awareness: newAwareness(conf.AwarenessMaxMultiplier), 138 | ackHandlers: make(map[uint32]*ackHandler), 139 | broadcasts: &TransmitLimitedQueue{RetransmitMult: conf.RetransmitMult}, 140 | logger: logger, 141 | } 142 | m.broadcasts.NumNodes = func() int { 143 | return m.estNumNodes() 144 | } 145 | go m.streamListen() 146 | go m.packetListen() 147 | go m.packetHandler() 148 | return m, nil 149 | } 150 | 151 | // Create will create a new Memberlist using the given configuration. 152 | // This will not connect to any other node (see Join) yet, but will start 153 | // all the listeners to allow other nodes to join this memberlist. 154 | // After creating a Memberlist, the configuration given should not be 155 | // modified by the user anymore. 
156 | func Create(conf *Config) (*Memberlist, error) { 157 | m, err := newMemberlist(conf) 158 | if err != nil { 159 | return nil, err 160 | } 161 | if err := m.setAlive(); err != nil { 162 | m.Shutdown() 163 | return nil, err 164 | } 165 | m.schedule() 166 | return m, nil 167 | } 168 | 169 | // Join is used to take an existing Memberlist and attempt to join a cluster 170 | // by contacting all the given hosts and performing a state sync. Initially, 171 | // the Memberlist only contains our own state, so doing this will cause 172 | // remote nodes to become aware of the existence of this node, effectively 173 | // joining the cluster. 174 | // 175 | // This returns the number of hosts successfully contacted and an error if 176 | // none could be reached. If an error is returned, the node did not successfully 177 | // join the cluster. 178 | func (m *Memberlist) Join(existing []string) (int, error) { 179 | numSuccess := 0 180 | var errs error 181 | for _, exist := range existing { 182 | addrs, err := m.resolveAddr(exist) 183 | if err != nil { 184 | err = fmt.Errorf("Failed to resolve %s: %v", exist, err) 185 | errs = multierror.Append(errs, err) 186 | m.logger.Printf("[WARN] memberlist: %v", err) 187 | continue 188 | } 189 | 190 | for _, addr := range addrs { 191 | hp := joinHostPort(addr.ip.String(), addr.port) 192 | if err := m.pushPullNode(hp, true); err != nil { 193 | err = fmt.Errorf("Failed to join %s: %v", addr.ip, err) 194 | errs = multierror.Append(errs, err) 195 | m.logger.Printf("[DEBUG] memberlist: %v", err) 196 | continue 197 | } 198 | numSuccess++ 199 | } 200 | 201 | } 202 | if numSuccess > 0 { 203 | errs = nil 204 | } 205 | return numSuccess, errs 206 | } 207 | 208 | // ipPort holds information about a node we want to try to join. 209 | type ipPort struct { 210 | ip net.IP 211 | port uint16 212 | } 213 | 214 | // tcpLookupIP is a helper to initiate a TCP-based DNS lookup for the given host. 
215 | // The built-in Go resolver will do a UDP lookup first, and will only use TCP if 216 | // the response has the truncate bit set, which isn't common on DNS servers like 217 | // Consul's. By doing the TCP lookup directly, we get the best chance for the 218 | // largest list of hosts to join. Since joins are relatively rare events, it's ok 219 | // to do this rather expensive operation. 220 | func (m *Memberlist) tcpLookupIP(host string, defaultPort uint16) ([]ipPort, error) { 221 | // Don't attempt any TCP lookups against non-fully qualified domain 222 | // names, since those will likely come from the resolv.conf file. 223 | if !strings.Contains(host, ".") { 224 | return nil, nil 225 | } 226 | 227 | // Make sure the domain name is terminated with a dot (we know there's 228 | // at least one character at this point). 229 | dn := host 230 | if dn[len(dn)-1] != '.' { 231 | dn = dn + "." 232 | } 233 | 234 | // See if we can find a server to try. 235 | cc, err := dns.ClientConfigFromFile(m.config.DNSConfigPath) 236 | if err != nil { 237 | return nil, err 238 | } 239 | if len(cc.Servers) > 0 { 240 | // We support host:port in the DNS config, but need to add the 241 | // default port if one is not supplied. 242 | server := cc.Servers[0] 243 | if !hasPort(server) { 244 | server = net.JoinHostPort(server, cc.Port) 245 | } 246 | 247 | // Do the lookup. 248 | c := new(dns.Client) 249 | c.Net = "tcp" 250 | msg := new(dns.Msg) 251 | msg.SetQuestion(dn, dns.TypeANY) 252 | in, _, err := c.Exchange(msg, server) 253 | if err != nil { 254 | return nil, err 255 | } 256 | 257 | // Handle any IPs we get back that we can attempt to join. 
258 | var ips []ipPort 259 | for _, r := range in.Answer { 260 | switch rr := r.(type) { 261 | case (*dns.A): 262 | ips = append(ips, ipPort{rr.A, defaultPort}) 263 | case (*dns.AAAA): 264 | ips = append(ips, ipPort{rr.AAAA, defaultPort}) 265 | case (*dns.CNAME): 266 | m.logger.Printf("[DEBUG] memberlist: Ignoring CNAME RR in TCP-first answer for '%s'", host) 267 | } 268 | } 269 | return ips, nil 270 | } 271 | 272 | return nil, nil 273 | } 274 | 275 | // resolveAddr is used to resolve the address into an address, 276 | // port, and error. If no port is given, use the default 277 | func (m *Memberlist) resolveAddr(hostStr string) ([]ipPort, error) { 278 | // Normalize the incoming string to host:port so we can apply Go's 279 | // parser to it. 280 | port := uint16(0) 281 | if !hasPort(hostStr) { 282 | hostStr += ":" + strconv.Itoa(m.config.BindPort) 283 | } 284 | host, sport, err := net.SplitHostPort(hostStr) 285 | if err != nil { 286 | return nil, err 287 | } 288 | 289 | // This will capture the supplied port, or the default one added above. 290 | lport, err := strconv.ParseUint(sport, 10, 16) 291 | if err != nil { 292 | return nil, err 293 | } 294 | port = uint16(lport) 295 | 296 | // If it looks like an IP address we are done. The SplitHostPort() above 297 | // will make sure the host part is in good shape for parsing, even for 298 | // IPv6 addresses. 299 | if ip := net.ParseIP(host); ip != nil { 300 | return []ipPort{ipPort{ip, port}}, nil 301 | } 302 | 303 | var ips []ipPort 304 | 305 | if m.config.PreferTCPDNS { 306 | // First try TCP so we have the best chance for the largest list of 307 | // hosts to join. If this fails it's not fatal since this isn't a standard 308 | // way to query DNS, and we have a fallback below. 
309 | ips, err = m.tcpLookupIP(host, port) 310 | if err != nil { 311 | m.logger.Printf("[DEBUG] memberlist: TCP-first lookup failed for '%s', falling back to UDP: %s", hostStr, err) 312 | } 313 | if len(ips) > 0 { 314 | return ips, nil 315 | } 316 | } 317 | 318 | // If TCP didn't yield anything then use the normal Go resolver which 319 | // will try UDP, then might possibly try TCP again if the UDP response 320 | // indicates it was truncated. 321 | ans, err := net.LookupIP(host) 322 | if err != nil { 323 | return nil, err 324 | } 325 | ips = make([]ipPort, 0, len(ans)) 326 | for _, ip := range ans { 327 | ips = append(ips, ipPort{ip, port}) 328 | } 329 | return ips, nil 330 | } 331 | 332 | // setAlive is used to mark this node as being alive. This is the same 333 | // as if we received an alive notification our own network channel for 334 | // ourself. 335 | func (m *Memberlist) setAlive() error { 336 | // Get the final advertise address from the transport, which may need 337 | // to see which address we bound to. 338 | addr, port, err := m.transport.FinalAdvertiseAddr( 339 | m.config.AdvertiseAddr, m.config.AdvertisePort) 340 | if err != nil { 341 | return fmt.Errorf("Failed to get final advertise address: %v", err) 342 | } 343 | 344 | // Check if this is a public address without encryption 345 | ipAddr, err := sockaddr.NewIPAddr(addr.String()) 346 | if err != nil { 347 | return fmt.Errorf("Failed to parse interface addresses: %v", err) 348 | } 349 | ifAddrs := []sockaddr.IfAddr{ 350 | sockaddr.IfAddr{ 351 | SockAddr: ipAddr, 352 | }, 353 | } 354 | _, publicIfs, err := sockaddr.IfByRFC("6890", ifAddrs) 355 | if len(publicIfs) > 0 && !m.config.EncryptionEnabled() { 356 | m.logger.Printf("[WARN] memberlist: Binding to public address without encryption!") 357 | } 358 | 359 | // Set any metadata from the delegate. 
360 | var meta []byte 361 | if m.config.Delegate != nil { 362 | meta = m.config.Delegate.NodeMeta(MetaMaxSize) 363 | if len(meta) > MetaMaxSize { 364 | panic("Node meta data provided is longer than the limit") 365 | } 366 | } 367 | 368 | a := alive{ 369 | Incarnation: m.nextIncarnation(), 370 | Node: m.config.Name, 371 | ClusterName: m.ClusterName(), 372 | Addr: addr, 373 | Port: uint16(port), 374 | Meta: meta, 375 | Vsn: []uint8{ 376 | ProtocolVersionMin, ProtocolVersionMax, m.config.ProtocolVersion, 377 | m.config.DelegateProtocolMin, m.config.DelegateProtocolMax, 378 | m.config.DelegateProtocolVersion, 379 | }, 380 | } 381 | m.aliveNode(&a, nil, true) 382 | return nil 383 | } 384 | 385 | // LocalNode is used to return the local Node 386 | func (m *Memberlist) LocalNode() *Node { 387 | m.nodeLock.RLock() 388 | defer m.nodeLock.RUnlock() 389 | state := m.nodeMap[m.config.Name] 390 | return &state.Node 391 | } 392 | 393 | // UpdateNode is used to trigger re-advertising the local node. This is 394 | // primarily used with a Delegate to support dynamic updates to the local 395 | // meta data. This will block until the update message is successfully 396 | // broadcasted to a member of the cluster, if any exist or until a specified 397 | // timeout is reached. 
398 | func (m *Memberlist) UpdateNode(timeout time.Duration) error { 399 | // Get the node meta data 400 | var meta []byte 401 | if m.config.Delegate != nil { 402 | meta = m.config.Delegate.NodeMeta(MetaMaxSize) 403 | if len(meta) > MetaMaxSize { 404 | panic("Node meta data provided is longer than the limit") 405 | } 406 | } 407 | 408 | // Get the existing node 409 | m.nodeLock.RLock() 410 | state := m.nodeMap[m.config.Name] 411 | m.nodeLock.RUnlock() 412 | 413 | // Format a new alive message 414 | a := alive{ 415 | Incarnation: m.nextIncarnation(), 416 | Node: m.config.Name, 417 | ClusterName: m.ClusterName(), 418 | Addr: state.Addr, 419 | Port: state.Port, 420 | Meta: meta, 421 | Vsn: []uint8{ 422 | ProtocolVersionMin, ProtocolVersionMax, m.config.ProtocolVersion, 423 | m.config.DelegateProtocolMin, m.config.DelegateProtocolMax, 424 | m.config.DelegateProtocolVersion, 425 | }, 426 | } 427 | notifyCh := make(chan struct{}) 428 | m.aliveNode(&a, notifyCh, true) 429 | 430 | // Wait for the broadcast or a timeout 431 | if m.anyAlive() { 432 | var timeoutCh <-chan time.Time 433 | if timeout > 0 { 434 | timeoutCh = time.After(timeout) 435 | } 436 | select { 437 | case <-notifyCh: 438 | case <-timeoutCh: 439 | return fmt.Errorf("timeout waiting for update broadcast") 440 | } 441 | } 442 | return nil 443 | } 444 | 445 | // SendTo is deprecated in favor of SendBestEffort, which requires a node to 446 | // target. 447 | func (m *Memberlist) SendTo(to net.Addr, msg []byte) error { 448 | // Encode as a user message 449 | buf := make([]byte, 1, len(msg)+1) 450 | buf[0] = byte(userMsg) 451 | buf = append(buf, msg...) 452 | 453 | // Send the message 454 | return m.rawSendMsgPacket(to.String(), nil, buf) 455 | } 456 | 457 | // SendToUDP is deprecated in favor of SendBestEffort. 458 | func (m *Memberlist) SendToUDP(to *Node, msg []byte) error { 459 | return m.SendBestEffort(to, msg) 460 | } 461 | 462 | // SendToTCP is deprecated in favor of SendReliable. 
463 | func (m *Memberlist) SendToTCP(to *Node, msg []byte) error { 464 | return m.SendReliable(to, msg) 465 | } 466 | 467 | // SendBestEffort uses the unreliable packet-oriented interface of the transport 468 | // to target a user message at the given node (this does not use the gossip 469 | // mechanism). The maximum size of the message depends on the configured 470 | // UDPBufferSize for this memberlist instance. 471 | func (m *Memberlist) SendBestEffort(to *Node, msg []byte) error { 472 | // Encode as a user message 473 | buf := make([]byte, 1, len(msg)+1) 474 | buf[0] = byte(userMsg) 475 | buf = append(buf, msg...) 476 | 477 | // Send the message 478 | return m.rawSendMsgPacket(to.Address(), to, buf) 479 | } 480 | 481 | // SendReliable uses the reliable stream-oriented interface of the transport to 482 | // target a user message at the given node (this does not use the gossip 483 | // mechanism). Delivery is guaranteed if no error is returned, and there is no 484 | // limit on the size of the message. 485 | func (m *Memberlist) SendReliable(to *Node, msg []byte) error { 486 | return m.sendUserMsg(to.Address(), msg) 487 | } 488 | 489 | // Members returns a list of all known live nodes. The node structures 490 | // returned must not be modified. If you wish to modify a Node, make a 491 | // copy first. 492 | func (m *Memberlist) Members() []*Node { 493 | m.nodeLock.RLock() 494 | defer m.nodeLock.RUnlock() 495 | 496 | nodes := make([]*Node, 0, len(m.nodes)) 497 | for _, n := range m.nodes { 498 | if n.State != stateDead { 499 | nodes = append(nodes, &n.Node) 500 | } 501 | } 502 | 503 | return nodes 504 | } 505 | 506 | // NumMembers returns the number of alive nodes currently known. Between 507 | // the time of calling this and calling Members, the number of alive nodes 508 | // may have changed, so this shouldn't be used to determine how many 509 | // members will be returned by Members. 
510 | func (m *Memberlist) NumMembers() (alive int) { 511 | m.nodeLock.RLock() 512 | defer m.nodeLock.RUnlock() 513 | 514 | for _, n := range m.nodes { 515 | if n.State != stateDead { 516 | alive++ 517 | } 518 | } 519 | 520 | return 521 | } 522 | 523 | // Leave will broadcast a leave message but will not shutdown the background 524 | // listeners, meaning the node will continue participating in gossip and state 525 | // updates. 526 | // 527 | // This will block until the leave message is successfully broadcasted to 528 | // a member of the cluster, if any exist or until a specified timeout 529 | // is reached. 530 | // 531 | // This method is safe to call multiple times, but must not be called 532 | // after the cluster is already shut down. 533 | func (m *Memberlist) Leave(timeout time.Duration) error { 534 | m.nodeLock.Lock() 535 | // We can't defer m.nodeLock.Unlock() because m.deadNode will also try to 536 | // acquire a lock so we need to Unlock before that. 537 | 538 | if m.shutdown { 539 | m.nodeLock.Unlock() 540 | panic("leave after shutdown") 541 | } 542 | 543 | if !m.leave { 544 | m.leave = true 545 | 546 | state, ok := m.nodeMap[m.config.Name] 547 | m.nodeLock.Unlock() 548 | if !ok { 549 | m.logger.Printf("[WARN] memberlist: Leave but we're not in the node map.") 550 | return nil 551 | } 552 | 553 | d := dead{ 554 | Incarnation: state.Incarnation, 555 | Node: state.Name, 556 | ClusterName: m.ClusterName(), 557 | } 558 | m.deadNode(&d) 559 | 560 | // Block until the broadcast goes out 561 | if m.anyAlive() { 562 | var timeoutCh <-chan time.Time 563 | if timeout > 0 { 564 | timeoutCh = time.After(timeout) 565 | } 566 | select { 567 | case <-m.leaveBroadcast: 568 | case <-timeoutCh: 569 | return fmt.Errorf("timeout waiting for leave broadcast") 570 | } 571 | } 572 | } else { 573 | m.nodeLock.Unlock() 574 | } 575 | 576 | return nil 577 | } 578 | 579 | // Check for any other alive node. 
580 | func (m *Memberlist) anyAlive() bool { 581 | m.nodeLock.RLock() 582 | defer m.nodeLock.RUnlock() 583 | for _, n := range m.nodes { 584 | if n.State != stateDead && n.Name != m.config.Name { 585 | return true 586 | } 587 | } 588 | return false 589 | } 590 | 591 | // GetHealthScore gives this instance's idea of how well it is meeting the soft 592 | // real-time requirements of the protocol. Lower numbers are better, and zero 593 | // means "totally healthy". 594 | func (m *Memberlist) GetHealthScore() int { 595 | return m.awareness.GetHealthScore() 596 | } 597 | 598 | // ProtocolVersion returns the protocol version currently in use by 599 | // this memberlist. 600 | func (m *Memberlist) ProtocolVersion() uint8 { 601 | // NOTE: This method exists so that in the future we can control 602 | // any locking if necessary, if we change the protocol version at 603 | // runtime, etc. 604 | return m.config.ProtocolVersion 605 | } 606 | 607 | // Shutdown will stop any background maintenance of network activity 608 | // for this memberlist, causing it to appear "dead". A leave message 609 | // will not be broadcasted prior, so the cluster being left will have 610 | // to detect this node's shutdown using probing. If you wish to more 611 | // gracefully exit the cluster, call Leave prior to shutting down. 612 | // 613 | // This method is safe to call multiple times. 614 | func (m *Memberlist) Shutdown() error { 615 | m.nodeLock.Lock() 616 | defer m.nodeLock.Unlock() 617 | 618 | if m.shutdown { 619 | return nil 620 | } 621 | 622 | // Shut down the transport first, which should block until it's 623 | // completely torn down. If we kill the memberlist-side handlers 624 | // those I/O handlers might get stuck. 625 | m.transport.Shutdown() 626 | 627 | // Now tear down everything else. 
628 | m.shutdown = true 629 | close(m.shutdownCh) 630 | m.deschedule() 631 | return nil 632 | } 633 | 634 | // ClusterName returns the ClusterName from the config struct 635 | func (m *Memberlist) ClusterName() string { 636 | return m.config.ClusterName 637 | } 638 | -------------------------------------------------------------------------------- /net_test.go: -------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "fmt" 7 | "io" 8 | "log" 9 | "net" 10 | "reflect" 11 | "strings" 12 | "testing" 13 | "time" 14 | 15 | "github.com/hashicorp/go-msgpack/codec" 16 | ) 17 | 18 | // As a regression we left this test very low-level and network-ey, even after 19 | // we abstracted the transport. We added some basic network-free transport tests 20 | // in transport_test.go to prove that we didn't hard code some network stuff 21 | // outside of NetTransport. 22 | 23 | func TestHandleCompoundPing(t *testing.T) { 24 | m := GetMemberlist(t) 25 | m.config.EnableCompression = false 26 | defer m.Shutdown() 27 | 28 | var udp *net.UDPConn 29 | for port := 60000; port < 61000; port++ { 30 | udpAddr := fmt.Sprintf("127.0.0.1:%d", port) 31 | udpLn, err := net.ListenPacket("udp", udpAddr) 32 | if err == nil { 33 | udp = udpLn.(*net.UDPConn) 34 | break 35 | } 36 | } 37 | 38 | if udp == nil { 39 | t.Fatalf("no udp listener") 40 | } 41 | 42 | // Encode a ping 43 | ping := ping{SeqNo: 42} 44 | buf, err := encode(pingMsg, ping) 45 | if err != nil { 46 | t.Fatalf("unexpected err %s", err) 47 | } 48 | 49 | // Make a compound message 50 | compound := makeCompoundMessage([][]byte{buf.Bytes(), buf.Bytes(), buf.Bytes()}) 51 | 52 | // Send compound version 53 | addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} 54 | udp.WriteTo(compound.Bytes(), addr) 55 | 56 | // Wait for responses 57 | doneCh := make(chan struct{}, 1) 58 | go func() { 59 | select { 60 | case <-doneCh: 
61 | case <-time.After(2 * time.Second): 62 | panic("timeout") 63 | } 64 | }() 65 | 66 | for i := 0; i < 3; i++ { 67 | in := make([]byte, 1500) 68 | n, _, err := udp.ReadFrom(in) 69 | if err != nil { 70 | t.Fatalf("unexpected err %s", err) 71 | } 72 | in = in[0:n] 73 | 74 | msgType := messageType(in[0]) 75 | if msgType != ackRespMsg { 76 | t.Fatalf("bad response %v", in) 77 | } 78 | 79 | var ack ackResp 80 | if err := decode(in[1:], &ack); err != nil { 81 | t.Fatalf("unexpected err %s", err) 82 | } 83 | 84 | if ack.SeqNo != 42 { 85 | t.Fatalf("bad sequence no") 86 | } 87 | } 88 | 89 | doneCh <- struct{}{} 90 | } 91 | 92 | func TestHandlePing(t *testing.T) { 93 | m := GetMemberlist(t) 94 | m.config.EnableCompression = false 95 | defer m.Shutdown() 96 | 97 | var udp *net.UDPConn 98 | for port := 60000; port < 61000; port++ { 99 | udpAddr := fmt.Sprintf("127.0.0.1:%d", port) 100 | udpLn, err := net.ListenPacket("udp", udpAddr) 101 | if err == nil { 102 | udp = udpLn.(*net.UDPConn) 103 | break 104 | } 105 | } 106 | 107 | if udp == nil { 108 | t.Fatalf("no udp listener") 109 | } 110 | 111 | // Encode a ping 112 | ping := ping{SeqNo: 42} 113 | buf, err := encode(pingMsg, ping) 114 | if err != nil { 115 | t.Fatalf("unexpected err %s", err) 116 | } 117 | 118 | // Send 119 | addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} 120 | udp.WriteTo(buf.Bytes(), addr) 121 | 122 | // Wait for response 123 | doneCh := make(chan struct{}, 1) 124 | go func() { 125 | select { 126 | case <-doneCh: 127 | case <-time.After(2 * time.Second): 128 | panic("timeout") 129 | } 130 | }() 131 | 132 | in := make([]byte, 1500) 133 | n, _, err := udp.ReadFrom(in) 134 | if err != nil { 135 | t.Fatalf("unexpected err %s", err) 136 | } 137 | in = in[0:n] 138 | 139 | msgType := messageType(in[0]) 140 | if msgType != ackRespMsg { 141 | t.Fatalf("bad response %v", in) 142 | } 143 | 144 | var ack ackResp 145 | if err := decode(in[1:], &ack); err != nil { 146 | 
t.Fatalf("unexpected err %s", err) 147 | } 148 | 149 | if ack.SeqNo != 42 { 150 | t.Fatalf("bad sequence no") 151 | } 152 | 153 | doneCh <- struct{}{} 154 | } 155 | 156 | func TestHandlePing_WrongNode(t *testing.T) { 157 | m := GetMemberlist(t) 158 | m.config.EnableCompression = false 159 | defer m.Shutdown() 160 | 161 | var udp *net.UDPConn 162 | for port := 60000; port < 61000; port++ { 163 | udpAddr := fmt.Sprintf("127.0.0.1:%d", port) 164 | udpLn, err := net.ListenPacket("udp", udpAddr) 165 | if err == nil { 166 | udp = udpLn.(*net.UDPConn) 167 | break 168 | } 169 | } 170 | 171 | if udp == nil { 172 | t.Fatalf("no udp listener") 173 | } 174 | 175 | // Encode a ping, wrong node! 176 | ping := ping{SeqNo: 42, Node: m.config.Name + "-bad"} 177 | buf, err := encode(pingMsg, ping) 178 | if err != nil { 179 | t.Fatalf("unexpected err %s", err) 180 | } 181 | 182 | // Send 183 | addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} 184 | udp.WriteTo(buf.Bytes(), addr) 185 | 186 | // Wait for response 187 | udp.SetDeadline(time.Now().Add(50 * time.Millisecond)) 188 | in := make([]byte, 1500) 189 | _, _, err = udp.ReadFrom(in) 190 | 191 | // Should get an i/o timeout 192 | if err == nil { 193 | t.Fatalf("expected err %s", err) 194 | } 195 | } 196 | 197 | func TestHandleIndirectPing(t *testing.T) { 198 | m := GetMemberlist(t) 199 | m.config.EnableCompression = false 200 | defer m.Shutdown() 201 | 202 | var udp *net.UDPConn 203 | for port := 60000; port < 61000; port++ { 204 | udpAddr := fmt.Sprintf("127.0.0.1:%d", port) 205 | udpLn, err := net.ListenPacket("udp", udpAddr) 206 | if err == nil { 207 | udp = udpLn.(*net.UDPConn) 208 | break 209 | } 210 | } 211 | 212 | if udp == nil { 213 | t.Fatalf("no udp listener") 214 | } 215 | 216 | // Encode an indirect ping 217 | ind := indirectPingReq{ 218 | SeqNo: 100, 219 | Target: net.ParseIP(m.config.BindAddr), 220 | Port: uint16(m.config.BindPort), 221 | } 222 | buf, err := encode(indirectPingMsg, 
&ind) 223 | if err != nil { 224 | t.Fatalf("unexpected err %s", err) 225 | } 226 | 227 | // Send 228 | addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} 229 | udp.WriteTo(buf.Bytes(), addr) 230 | 231 | // Wait for response 232 | doneCh := make(chan struct{}, 1) 233 | go func() { 234 | select { 235 | case <-doneCh: 236 | case <-time.After(2 * time.Second): 237 | panic("timeout") 238 | } 239 | }() 240 | 241 | in := make([]byte, 1500) 242 | n, _, err := udp.ReadFrom(in) 243 | if err != nil { 244 | t.Fatalf("unexpected err %s", err) 245 | } 246 | in = in[0:n] 247 | 248 | msgType := messageType(in[0]) 249 | if msgType != ackRespMsg { 250 | t.Fatalf("bad response %v", in) 251 | } 252 | 253 | var ack ackResp 254 | if err := decode(in[1:], &ack); err != nil { 255 | t.Fatalf("unexpected err %s", err) 256 | } 257 | 258 | if ack.SeqNo != 100 { 259 | t.Fatalf("bad sequence no") 260 | } 261 | 262 | doneCh <- struct{}{} 263 | } 264 | 265 | func TestTCPPing(t *testing.T) { 266 | var tcp *net.TCPListener 267 | var tcpAddr *net.TCPAddr 268 | for port := 60000; port < 61000; port++ { 269 | tcpAddr = &net.TCPAddr{IP: net.ParseIP("127.0.0.1"), Port: port} 270 | tcpLn, err := net.ListenTCP("tcp", tcpAddr) 271 | if err == nil { 272 | tcp = tcpLn 273 | break 274 | } 275 | } 276 | if tcp == nil { 277 | t.Fatalf("no tcp listener") 278 | } 279 | 280 | // Note that tcp gets closed in the last test, so we avoid a deferred 281 | // Close() call here. 282 | 283 | m := GetMemberlist(t) 284 | defer m.Shutdown() 285 | pingTimeout := m.config.ProbeInterval 286 | pingTimeMax := m.config.ProbeInterval + 10*time.Millisecond 287 | 288 | // Do a normal round trip. 
289 | pingOut := ping{SeqNo: 23, Node: "mongo"} 290 | go func() { 291 | tcp.SetDeadline(time.Now().Add(pingTimeMax)) 292 | conn, err := tcp.AcceptTCP() 293 | if err != nil { 294 | t.Fatalf("failed to connect: %s", err) 295 | } 296 | defer conn.Close() 297 | 298 | msgType, _, dec, err := m.readStream(conn) 299 | if err != nil { 300 | t.Fatalf("failed to read ping: %s", err) 301 | } 302 | 303 | if msgType != pingMsg { 304 | t.Fatalf("expecting ping, got message type (%d)", msgType) 305 | } 306 | 307 | var pingIn ping 308 | if err := dec.Decode(&pingIn); err != nil { 309 | t.Fatalf("failed to decode ping: %s", err) 310 | } 311 | 312 | if pingIn.SeqNo != pingOut.SeqNo { 313 | t.Fatalf("sequence number isn't correct (%d) vs (%d)", pingIn.SeqNo, pingOut.SeqNo) 314 | } 315 | 316 | if pingIn.Node != pingOut.Node { 317 | t.Fatalf("node name isn't correct (%s) vs (%s)", pingIn.Node, pingOut.Node) 318 | } 319 | 320 | ack := ackResp{pingIn.SeqNo, nil} 321 | out, err := encode(ackRespMsg, &ack) 322 | if err != nil { 323 | t.Fatalf("failed to encode ack: %s", err) 324 | } 325 | 326 | err = m.rawSendMsgStream(conn, out.Bytes()) 327 | if err != nil { 328 | t.Fatalf("failed to send ack: %s", err) 329 | } 330 | }() 331 | deadline := time.Now().Add(pingTimeout) 332 | didContact, err := m.sendPingAndWaitForAck(tcpAddr.String(), pingOut, deadline) 333 | if err != nil { 334 | t.Fatalf("error trying to ping: %s", err) 335 | } 336 | if !didContact { 337 | t.Fatalf("expected successful ping") 338 | } 339 | 340 | // Make sure a mis-matched sequence number is caught. 
341 | go func() { 342 | tcp.SetDeadline(time.Now().Add(pingTimeMax)) 343 | conn, err := tcp.AcceptTCP() 344 | if err != nil { 345 | t.Fatalf("failed to connect: %s", err) 346 | } 347 | defer conn.Close() 348 | 349 | _, _, dec, err := m.readStream(conn) 350 | if err != nil { 351 | t.Fatalf("failed to read ping: %s", err) 352 | } 353 | 354 | var pingIn ping 355 | if err := dec.Decode(&pingIn); err != nil { 356 | t.Fatalf("failed to decode ping: %s", err) 357 | } 358 | 359 | ack := ackResp{pingIn.SeqNo + 1, nil} 360 | out, err := encode(ackRespMsg, &ack) 361 | if err != nil { 362 | t.Fatalf("failed to encode ack: %s", err) 363 | } 364 | 365 | err = m.rawSendMsgStream(conn, out.Bytes()) 366 | if err != nil { 367 | t.Fatalf("failed to send ack: %s", err) 368 | } 369 | }() 370 | deadline = time.Now().Add(pingTimeout) 371 | didContact, err = m.sendPingAndWaitForAck(tcpAddr.String(), pingOut, deadline) 372 | if err == nil || !strings.Contains(err.Error(), "Sequence number") { 373 | t.Fatalf("expected an error from mis-matched sequence number") 374 | } 375 | if didContact { 376 | t.Fatalf("expected failed ping") 377 | } 378 | 379 | // Make sure an unexpected message type is handled gracefully. 
380 | go func() { 381 | tcp.SetDeadline(time.Now().Add(pingTimeMax)) 382 | conn, err := tcp.AcceptTCP() 383 | if err != nil { 384 | t.Fatalf("failed to connect: %s", err) 385 | } 386 | defer conn.Close() 387 | 388 | _, _, _, err = m.readStream(conn) 389 | if err != nil { 390 | t.Fatalf("failed to read ping: %s", err) 391 | } 392 | 393 | bogus := indirectPingReq{} 394 | out, err := encode(indirectPingMsg, &bogus) 395 | if err != nil { 396 | t.Fatalf("failed to encode bogus msg: %s", err) 397 | } 398 | 399 | err = m.rawSendMsgStream(conn, out.Bytes()) 400 | if err != nil { 401 | t.Fatalf("failed to send bogus msg: %s", err) 402 | } 403 | }() 404 | deadline = time.Now().Add(pingTimeout) 405 | didContact, err = m.sendPingAndWaitForAck(tcpAddr.String(), pingOut, deadline) 406 | if err == nil || !strings.Contains(err.Error(), "Unexpected msgType") { 407 | t.Fatalf("expected an error from bogus message") 408 | } 409 | if didContact { 410 | t.Fatalf("expected failed ping") 411 | } 412 | 413 | // Make sure failed I/O respects the deadline. In this case we try the 414 | // common case of the receiving node being totally down. 
415 | tcp.Close() 416 | deadline = time.Now().Add(pingTimeout) 417 | startPing := time.Now() 418 | didContact, err = m.sendPingAndWaitForAck(tcpAddr.String(), pingOut, deadline) 419 | pingTime := time.Now().Sub(startPing) 420 | if err != nil { 421 | t.Fatalf("expected no error during ping on closed socket, got: %s", err) 422 | } 423 | if didContact { 424 | t.Fatalf("expected failed ping") 425 | } 426 | if pingTime > pingTimeMax { 427 | t.Fatalf("took too long to fail ping, %9.6f", pingTime.Seconds()) 428 | } 429 | } 430 | 431 | func TestTCPPushPull(t *testing.T) { 432 | m := GetMemberlist(t) 433 | defer m.Shutdown() 434 | m.nodes = append(m.nodes, &nodeState{ 435 | Node: Node{ 436 | Name: "Test 0", 437 | Addr: net.ParseIP(m.config.BindAddr), 438 | Port: uint16(m.config.BindPort), 439 | }, 440 | Incarnation: 0, 441 | State: stateSuspect, 442 | StateChange: time.Now().Add(-1 * time.Second), 443 | }) 444 | 445 | addr := fmt.Sprintf("%s:%d", m.config.BindAddr, m.config.BindPort) 446 | conn, err := net.Dial("tcp", addr) 447 | if err != nil { 448 | t.Fatalf("unexpected err %s", err) 449 | } 450 | defer conn.Close() 451 | 452 | localNodes := make([]pushNodeState, 3) 453 | localNodes[0].Name = "Test 0" 454 | localNodes[0].Addr = net.ParseIP(m.config.BindAddr) 455 | localNodes[0].Port = uint16(m.config.BindPort) 456 | localNodes[0].Incarnation = 1 457 | localNodes[0].State = stateAlive 458 | localNodes[1].Name = "Test 1" 459 | localNodes[1].Addr = net.ParseIP(m.config.BindAddr) 460 | localNodes[1].Port = uint16(m.config.BindPort) 461 | localNodes[1].Incarnation = 1 462 | localNodes[1].State = stateAlive 463 | localNodes[2].Name = "Test 2" 464 | localNodes[2].Addr = net.ParseIP(m.config.BindAddr) 465 | localNodes[2].Port = uint16(m.config.BindPort) 466 | localNodes[2].Incarnation = 1 467 | localNodes[2].State = stateAlive 468 | 469 | // Send our node state 470 | header := pushPullHeader{Nodes: 3, ClusterName: m.config.ClusterName} 471 | hd := codec.MsgpackHandle{} 472 | enc 
:= codec.NewEncoder(conn, &hd) 473 | 474 | // Send the push/pull indicator 475 | conn.Write([]byte{byte(pushPullMsg)}) 476 | 477 | if err := enc.Encode(&header); err != nil { 478 | t.Fatalf("unexpected err %s", err) 479 | } 480 | for i := 0; i < header.Nodes; i++ { 481 | if err := enc.Encode(&localNodes[i]); err != nil { 482 | t.Fatalf("unexpected err %s", err) 483 | } 484 | } 485 | 486 | // Read the message type 487 | var msgType messageType 488 | if err := binary.Read(conn, binary.BigEndian, &msgType); err != nil { 489 | t.Fatalf("unexpected err %s", err) 490 | } 491 | 492 | var bufConn io.Reader = conn 493 | msghd := codec.MsgpackHandle{} 494 | dec := codec.NewDecoder(bufConn, &msghd) 495 | 496 | // Check if we have a compressed message 497 | if msgType == compressMsg { 498 | var c compress 499 | if err := dec.Decode(&c); err != nil { 500 | t.Fatalf("unexpected err %s", err) 501 | } 502 | decomp, err := decompressBuffer(&c) 503 | if err != nil { 504 | t.Fatalf("unexpected err %s", err) 505 | } 506 | 507 | // Reset the message type 508 | msgType = messageType(decomp[0]) 509 | 510 | // Create a new bufConn 511 | bufConn = bytes.NewReader(decomp[1:]) 512 | 513 | // Create a new decoder 514 | dec = codec.NewDecoder(bufConn, &hd) 515 | } 516 | 517 | // Quit if not push/pull 518 | if msgType != pushPullMsg { 519 | t.Fatalf("bad message type") 520 | } 521 | 522 | if err := dec.Decode(&header); err != nil { 523 | t.Fatalf("unexpected err %s", err) 524 | } 525 | 526 | // Allocate space for the transfer 527 | remoteNodes := make([]pushNodeState, header.Nodes) 528 | 529 | // Try to decode all the states 530 | for i := 0; i < header.Nodes; i++ { 531 | if err := dec.Decode(&remoteNodes[i]); err != nil { 532 | t.Fatalf("unexpected err %s", err) 533 | } 534 | } 535 | 536 | if len(remoteNodes) != 1 { 537 | t.Fatalf("bad response") 538 | } 539 | 540 | n := &remoteNodes[0] 541 | if n.Name != "Test 0" { 542 | t.Fatalf("bad name") 543 | } 544 | if bytes.Compare(n.Addr, 
net.ParseIP(m.config.BindAddr)) != 0 { 545 | t.Fatal("bad addr") 546 | } 547 | if n.Incarnation != 0 { 548 | t.Fatal("bad incarnation") 549 | } 550 | if n.State != stateSuspect { 551 | t.Fatal("bad state") 552 | } 553 | } 554 | 555 | func TestSendMsg_Piggyback(t *testing.T) { 556 | m := GetMemberlist(t) 557 | defer m.Shutdown() 558 | 559 | // Add a message to be broadcast 560 | a := alive{ 561 | Incarnation: 10, 562 | Node: "rand", 563 | ClusterName: m.config.ClusterName, 564 | Addr: []byte{127, 0, 0, 255}, 565 | Meta: nil, 566 | } 567 | m.encodeAndBroadcast("rand", aliveMsg, &a) 568 | 569 | var udp *net.UDPConn 570 | for port := 60000; port < 61000; port++ { 571 | udpAddr := fmt.Sprintf("127.0.0.1:%d", port) 572 | udpLn, err := net.ListenPacket("udp", udpAddr) 573 | if err == nil { 574 | udp = udpLn.(*net.UDPConn) 575 | break 576 | } 577 | } 578 | 579 | // Encode a ping 580 | ping := ping{SeqNo: 42} 581 | buf, err := encode(pingMsg, ping) 582 | if err != nil { 583 | t.Fatalf("unexpected err %s", err) 584 | } 585 | 586 | // Send 587 | addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} 588 | udp.WriteTo(buf.Bytes(), addr) 589 | 590 | // Wait for response 591 | doneCh := make(chan struct{}, 1) 592 | go func() { 593 | select { 594 | case <-doneCh: 595 | case <-time.After(2 * time.Second): 596 | panic("timeout") 597 | } 598 | }() 599 | 600 | in := make([]byte, 1500) 601 | n, _, err := udp.ReadFrom(in) 602 | if err != nil { 603 | t.Fatalf("unexpected err %s", err) 604 | } 605 | in = in[0:n] 606 | 607 | msgType := messageType(in[0]) 608 | if msgType != compoundMsg { 609 | t.Fatalf("bad response %v", in) 610 | } 611 | 612 | // get the parts 613 | trunc, parts, err := decodeCompoundMessage(in[1:]) 614 | if trunc != 0 { 615 | t.Fatalf("unexpected truncation") 616 | } 617 | if len(parts) != 2 { 618 | t.Fatalf("unexpected parts %v", parts) 619 | } 620 | if err != nil { 621 | t.Fatalf("unexpected err %s", err) 622 | } 623 | 624 | var ack ackResp 
625 | if err := decode(parts[0][1:], &ack); err != nil { 626 | t.Fatalf("unexpected err %s", err) 627 | } 628 | 629 | if ack.SeqNo != 42 { 630 | t.Fatalf("bad sequence no") 631 | } 632 | 633 | var aliveout alive 634 | if err := decode(parts[1][1:], &aliveout); err != nil { 635 | t.Fatalf("unexpected err %s", err) 636 | } 637 | 638 | if aliveout.Node != "rand" || aliveout.Incarnation != 10 { 639 | t.Fatalf("bad mesg") 640 | } 641 | 642 | doneCh <- struct{}{} 643 | } 644 | 645 | func TestEncryptDecryptState(t *testing.T) { 646 | state := []byte("this is our internal state...") 647 | config := &Config{ 648 | SecretKey: []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, 649 | ProtocolVersion: ProtocolVersionMax, 650 | } 651 | 652 | m, err := Create(config) 653 | if err != nil { 654 | t.Fatalf("err: %s", err) 655 | } 656 | defer m.Shutdown() 657 | 658 | crypt, err := m.encryptLocalState(state) 659 | if err != nil { 660 | t.Fatalf("err: %v", err) 661 | } 662 | 663 | // Create reader, seek past the type byte 664 | buf := bytes.NewReader(crypt) 665 | buf.Seek(1, 0) 666 | 667 | plain, err := m.decryptRemoteState(buf) 668 | if err != nil { 669 | t.Fatalf("err: %v", err) 670 | } 671 | 672 | if !reflect.DeepEqual(state, plain) { 673 | t.Fatalf("Decrypt failed: %v", plain) 674 | } 675 | } 676 | 677 | func TestRawSendUdp_CRC(t *testing.T) { 678 | m := GetMemberlist(t) 679 | m.config.EnableCompression = false 680 | defer m.Shutdown() 681 | 682 | var udp *net.UDPConn 683 | for port := 60000; port < 61000; port++ { 684 | udpAddr := fmt.Sprintf("127.0.0.1:%d", port) 685 | udpLn, err := net.ListenPacket("udp", udpAddr) 686 | if err == nil { 687 | udp = udpLn.(*net.UDPConn) 688 | break 689 | } 690 | } 691 | 692 | if udp == nil { 693 | t.Fatalf("no udp listener") 694 | } 695 | 696 | // Pass a nil node with no nodes registered, should result in no checksum 697 | payload := []byte{3, 3, 3, 3} 698 | m.rawSendMsgPacket(udp.LocalAddr().String(), nil, payload) 699 | 700 | in := 
make([]byte, 1500) 701 | n, _, err := udp.ReadFrom(in) 702 | if err != nil { 703 | t.Fatalf("unexpected err %s", err) 704 | } 705 | in = in[0:n] 706 | 707 | if len(in) != 4 { 708 | t.Fatalf("bad: %v", in) 709 | } 710 | 711 | // Pass a non-nil node with PMax >= 5, should result in a checksum 712 | m.rawSendMsgPacket(udp.LocalAddr().String(), &Node{PMax: 5}, payload) 713 | 714 | in = make([]byte, 1500) 715 | n, _, err = udp.ReadFrom(in) 716 | if err != nil { 717 | t.Fatalf("unexpected err %s", err) 718 | } 719 | in = in[0:n] 720 | 721 | if len(in) != 9 { 722 | t.Fatalf("bad: %v", in) 723 | } 724 | 725 | // Register a node with PMax >= 5 to be looked up, should result in a checksum 726 | m.nodeMap["127.0.0.1"] = &nodeState{ 727 | Node: Node{PMax: 5}, 728 | } 729 | m.rawSendMsgPacket(udp.LocalAddr().String(), nil, payload) 730 | 731 | in = make([]byte, 1500) 732 | n, _, err = udp.ReadFrom(in) 733 | if err != nil { 734 | t.Fatalf("unexpected err %s", err) 735 | } 736 | in = in[0:n] 737 | 738 | if len(in) != 9 { 739 | t.Fatalf("bad: %v", in) 740 | } 741 | } 742 | 743 | func TestIngestPacket_CRC(t *testing.T) { 744 | m := GetMemberlist(t) 745 | m.config.EnableCompression = false 746 | defer m.Shutdown() 747 | 748 | var udp *net.UDPConn 749 | for port := 60000; port < 61000; port++ { 750 | udpAddr := fmt.Sprintf("127.0.0.1:%d", port) 751 | udpLn, err := net.ListenPacket("udp", udpAddr) 752 | if err == nil { 753 | udp = udpLn.(*net.UDPConn) 754 | break 755 | } 756 | } 757 | 758 | if udp == nil { 759 | t.Fatalf("no udp listener") 760 | } 761 | 762 | // Get a message with a checksum 763 | payload := []byte{3, 3, 3, 3} 764 | m.rawSendMsgPacket(udp.LocalAddr().String(), &Node{PMax: 5}, payload) 765 | 766 | in := make([]byte, 1500) 767 | n, _, err := udp.ReadFrom(in) 768 | if err != nil { 769 | t.Fatalf("unexpected err %s", err) 770 | } 771 | in = in[0:n] 772 | 773 | if len(in) != 9 { 774 | t.Fatalf("bad: %v", in) 775 | } 776 | 777 | // Corrupt the checksum 778 | in[1] <<= 1 779 
| 780 | logs := &bytes.Buffer{} 781 | logger := log.New(logs, "", 0) 782 | m.logger = logger 783 | m.ingestPacket(in, udp.LocalAddr(), time.Now()) 784 | 785 | if !strings.Contains(logs.String(), "invalid checksum") { 786 | t.Fatalf("bad: %s", logs.String()) 787 | } 788 | } 789 | 790 | func TestIsSameCluster(t *testing.T) { 791 | m := GetMemberlist(t) 792 | defer m.Shutdown() 793 | 794 | m.config.ClusterName = "default" 795 | clusterName := "default" 796 | 797 | if !m.isSameCluster(clusterName) { 798 | t.Fatalf("Cluster names should match. %v <-> %v", m.config.ClusterName, clusterName) 799 | } 800 | 801 | clusterName = "badCluster" 802 | 803 | if m.isSameCluster(clusterName) { 804 | t.Fatalf("Cluster names should not match. %v <-> %v", m.config.ClusterName, clusterName) 805 | } 806 | } 807 | 808 | // handleSuspect() should discard messages from nodes with different Cluster names 809 | func TestHandleSuspect(t *testing.T) { 810 | addr1 := getBindAddr() 811 | addr2 := getBindAddr() 812 | addr3 := getBindAddr() 813 | ip1 := []byte(addr1) 814 | ip2 := []byte(addr2) 815 | ip3 := []byte(addr3) 816 | 817 | m1 := HostMemberlist(addr1.String(), t, nil) 818 | 819 | // dummy address 820 | addr, _ := net.ResolveIPAddr("udp", "127.0.0.1/32") 821 | 822 | a1 := alive{Node: "node1", ClusterName: m1.config.ClusterName, Addr: ip1, Port: 7946, Incarnation: 1} 823 | a2 := alive{Node: "node2", ClusterName: m1.config.ClusterName, Addr: ip2, Port: 7946, Incarnation: 2} 824 | a3 := alive{Node: "node3", ClusterName: m1.config.ClusterName, Addr: ip3, Port: 7946, Incarnation: 3} 825 | 826 | a1buf, a1err := encode(aliveMsg, a1) 827 | if a1err != nil { 828 | t.Fatal("Unexpected error: %v", a1err) 829 | } 830 | 831 | a2buf, a2err := encode(aliveMsg, a2) 832 | if a2err != nil { 833 | t.Fatal("Unexpected error: %v", a2err) 834 | } 835 | 836 | a3buf, a3err := encode(aliveMsg, a3) 837 | if a3err != nil { 838 | t.Fatal("Unexpected error: %v", a3err) 839 | } 840 | 841 | 
m1.handleAlive(a1buf.Bytes()[1:], addr) 842 | m1.handleAlive(a2buf.Bytes()[1:], addr) 843 | m1.handleAlive(a3buf.Bytes()[1:], addr) 844 | 845 | if len(m1.nodes) != 3 { 846 | t.Fatalf("Should have 3 nodes in memberlist, but have %v", len(m1.nodes)) 847 | } 848 | 849 | // All nodes should be in alive state 850 | for k, v := range m1.nodeMap { 851 | if v.State != stateAlive { 852 | t.Fatalf("Node %v should be in alive state", k) 853 | } 854 | } 855 | 856 | // Suspect message with same cluster name 857 | s1 := suspect{Node: "node2", Incarnation: 2, ClusterName: m1.config.ClusterName} 858 | 859 | s1buf, s1err := encode(suspectMsg, s1) 860 | if s1err != nil { 861 | t.Fatal("Unexpected error: %v", s1err) 862 | } 863 | 864 | // Send the message, shouldn't get discarded; state should change 865 | m1.handleSuspect(s1buf.Bytes()[1:], addr) 866 | 867 | if m1.nodeMap[a2.Node].State != stateSuspect { 868 | t.Fatal("Node 2 should be in suspect state") 869 | } 870 | 871 | // Send a suspect message for node 3, but with different cluster name 872 | s2 := suspect{Node: "node3", Incarnation: 3, ClusterName: "badCluster"} 873 | 874 | s2buf, s2err := encode(suspectMsg, s2) 875 | if s2err != nil { 876 | t.Fatal("Unexpected error: %v", s2err) 877 | } 878 | 879 | m1.handleSuspect(s2buf.Bytes()[1:], addr) 880 | 881 | // Suspect message should be discarded and have not affected 'real' node 3 882 | if m1.nodeMap[a3.Node].State != stateAlive { 883 | t.Fatalf("Node 3 should still be in alive state") 884 | } 885 | } 886 | 887 | func TestHandleAlive(t *testing.T) { 888 | addr1 := getBindAddr() 889 | addr2 := getBindAddr() 890 | addr3 := getBindAddr() 891 | ip1 := []byte(addr1) 892 | ip2 := []byte(addr2) 893 | ip3 := []byte(addr3) 894 | 895 | m1 := HostMemberlist(addr1.String(), t, nil) 896 | 897 | // dummy address 898 | addr, _ := net.ResolveIPAddr("udp", "127.0.0.1/32") 899 | 900 | a1 := alive{Node: "node1", ClusterName: m1.config.ClusterName, Addr: ip1, Port: 7946, Incarnation: 1} 901 | a2 := 
alive{Node: "node2", ClusterName: m1.config.ClusterName, Addr: ip2, Port: 7946, Incarnation: 2} 902 | a3 := alive{Node: "node3", ClusterName: "badCluster", Addr: ip3, Port: 7946, Incarnation: 2} 903 | 904 | // Encode 2 alive messages with the same cluster name 905 | a1buf, err := encode(aliveMsg, a1) 906 | if err != nil { 907 | t.Fatal("Unexpected error: %v", err) 908 | } 909 | 910 | a2buf, err2 := encode(aliveMsg, a2) 911 | if err2 != nil { 912 | t.Fatal("Unexpected error: %v", err2) 913 | } 914 | 915 | // Encode a third message with a different cluster name 916 | a3buf, err3 := encode(aliveMsg, a3) 917 | if err3 != nil { 918 | t.Fatal("Unexpected error: %v", err3) 919 | } 920 | 921 | m1.handleAlive(a1buf.Bytes()[1:], addr) 922 | m1.handleAlive(a2buf.Bytes()[1:], addr) 923 | 924 | if len(m1.nodes) != 2 { 925 | t.Fatalf("Should have 2 nodes in memberlist, but have %v", len(m1.nodes)) 926 | } 927 | 928 | // Send 3rd message, which should get ignored 929 | m1.handleAlive(a3buf.Bytes()[1:], addr) 930 | 931 | if len(m1.nodes) != 2 { 932 | t.Fatalf("Should still have 2 nodes in memberlist, but have %v", len(m1.nodes)) 933 | } 934 | } 935 | 936 | func TestHandleDead(t *testing.T) { 937 | addr1 := getBindAddr() 938 | addr2 := getBindAddr() 939 | addr3 := getBindAddr() 940 | ip1 := []byte(addr1) 941 | ip2 := []byte(addr2) 942 | ip3 := []byte(addr3) 943 | 944 | m1 := HostMemberlist(addr1.String(), t, nil) 945 | 946 | // dummy address 947 | addr, _ := net.ResolveIPAddr("udp", "127.0.0.1/32") 948 | 949 | a1 := alive{Node: "node1", ClusterName: m1.config.ClusterName, Addr: ip1, Port: 7946, Incarnation: 1} 950 | a2 := alive{Node: "node2", ClusterName: m1.config.ClusterName, Addr: ip2, Port: 7946, Incarnation: 2} 951 | a3 := alive{Node: "node3", ClusterName: m1.config.ClusterName, Addr: ip3, Port: 7946, Incarnation: 3} 952 | 953 | a1buf, a1err := encode(aliveMsg, a1) 954 | if a1err != nil { 955 | t.Fatal("Unexpected error: %v", a1err) 956 | } 957 | 958 | a2buf, a2err := 
encode(aliveMsg, a2) 959 | if a2err != nil { 960 | t.Fatal("Unexpected error: %v", a2err) 961 | } 962 | 963 | a3buf, a3err := encode(aliveMsg, a3) 964 | if a3err != nil { 965 | t.Fatal("Unexpected error: %v", a3err) 966 | } 967 | 968 | m1.handleAlive(a1buf.Bytes()[1:], addr) 969 | m1.handleAlive(a2buf.Bytes()[1:], addr) 970 | m1.handleAlive(a3buf.Bytes()[1:], addr) 971 | 972 | if len(m1.nodes) != 3 { 973 | t.Fatalf("Should have 3 nodes in memberlist, but have %v", len(m1.nodes)) 974 | } 975 | 976 | // All nodes should be in alive state 977 | for k, v := range m1.nodeMap { 978 | if v.State != stateAlive { 979 | t.Fatalf("Node %v should be in alive state", k) 980 | } 981 | } 982 | 983 | // Dead message with same cluster name 984 | d1 := dead{Node: "node2", ClusterName: m1.config.ClusterName, Incarnation: 2} 985 | 986 | d1buf, d1err := encode(deadMsg, d1) 987 | if d1err != nil { 988 | t.Fatal("Unexpected error: %v", d1err) 989 | } 990 | 991 | // Send the message, shouldn't get discarded; state should change 992 | m1.handleDead(d1buf.Bytes()[1:], addr) 993 | 994 | if m1.nodeMap[a2.Node].State != stateDead { 995 | t.Fatal("Node 2 should be in dead state") 996 | } 997 | 998 | // Send a dead message for node 3, but with different cluster name 999 | d2 := dead{Node: "node3", ClusterName: "badCluster", Incarnation: 3} 1000 | 1001 | d2buf, d2err := encode(deadMsg, d2) 1002 | if d2err != nil { 1003 | t.Fatal("Unexpected error: %v", d2err) 1004 | } 1005 | 1006 | m1.handleDead(d2buf.Bytes()[1:], addr) 1007 | 1008 | // Dead message should be discarded and have not affected 'real' node 3 1009 | if m1.nodeMap[a3.Node].State != stateAlive { 1010 | t.Fatalf("Node 3 should still be in alive state") 1011 | } 1012 | } 1013 | 1014 | // TODO: This probably should get tested as well, but will require a bit of setup 1015 | func TestReadRemoteState(t *testing.T) { 1016 | } 1017 | -------------------------------------------------------------------------------- /net.go: 
-------------------------------------------------------------------------------- 1 | package memberlist 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "encoding/binary" 7 | "fmt" 8 | "hash/crc32" 9 | "io" 10 | "net" 11 | "time" 12 | 13 | "github.com/armon/go-metrics" 14 | "github.com/hashicorp/go-msgpack/codec" 15 | ) 16 | 17 | // This is the minimum and maximum protocol version that we can 18 | // _understand_. We're allowed to speak at any version within this 19 | // range. This range is inclusive. 20 | const ( 21 | ProtocolVersionMin uint8 = 1 22 | 23 | // Version 3 added support for TCP pings but we kept the default 24 | // protocol version at 2 to ease transition to this new feature. 25 | // A memberlist speaking version 2 of the protocol will attempt 26 | // to TCP ping another memberlist who understands version 3 or 27 | // greater. 28 | // 29 | // Version 4 added support for nacks as part of indirect probes. 30 | // A memberlist speaking version 2 of the protocol will expect 31 | // nacks from another memberlist who understands version 4 or 32 | // greater, and likewise nacks will be sent to memberlists who 33 | // understand version 4 or greater. 34 | ProtocolVersion2Compatible = 2 35 | 36 | ProtocolVersionMax = 5 37 | ) 38 | 39 | // messageType is an integer ID of a type of message that can be received 40 | // on network channels from other members. 41 | type messageType uint8 42 | 43 | // The list of available message types. 
const (
	// Values are assigned sequentially by iota starting at 0. The numeric
	// value is what a receiver reads back from the first byte of a message
	// (see messageType(buf[0]) in handleCommand), so entries must only ever
	// be appended, never reordered or removed.
	pingMsg messageType = iota
	indirectPingMsg
	ackRespMsg
	suspectMsg
	aliveMsg
	deadMsg
	pushPullMsg
	compoundMsg
	userMsg // User message, not handled by us
	compressMsg
	encryptMsg
	nackRespMsg
	hasCrcMsg // Packet starts with a 4-byte big-endian CRC-32 of the remaining payload
)

// compressionType is used to specify the compression algorithm
type compressionType uint8

// Supported compression algorithms.
const (
	lzwAlgo compressionType = iota
)

const (
	MetaMaxSize            = 512 // Maximum size for node meta data
	compoundHeaderOverhead = 2   // Assumed header overhead
	compoundOverhead       = 2   // Assumed overhead per entry in compoundHeader
	userMsgOverhead        = 1   // NOTE(review): presumably the one-byte message type prefix of a user message; confirm at the call sites
	blockingWarning        = 10 * time.Millisecond // Warn if a UDP packet takes this long to process
	maxPushStateBytes      = 10 * 1024 * 1024 // 10 MiB; NOTE(review): presumably caps the size of push/pull state, usage is not visible here
)

// ping request sent directly to node
type ping struct {
	// SeqNo is echoed back in the ackResp so the sender can match the
	// response to this request.
	SeqNo uint32

	// Node is sent so the target can verify they are
	// the intended recipient. This is to protect against an agent
	// restart with a new name. An empty Node skips the check.
	Node string
}

// indirect ping sent to an indirect node
type indirectPingReq struct {
	SeqNo  uint32
	Target []byte // IP address of the node to ping on the requester's behalf
	Port   uint16 // Port of the node to ping
	Node   string
	Nack   bool // true if we'd like a nack back
}

// ack response is sent for a ping
type ackResp struct {
	SeqNo   uint32 // Sequence number copied from the ping being acknowledged
	Payload []byte // Optional payload; the UDP ping handler fills it from the configured Ping delegate
}

// nack response is sent for an indirect ping when the pinger doesn't hear from
// the ping-ee within the configured timeout. This lets the original node know
// that the indirect ping attempt happened but didn't succeed.
104 | type nackResp struct { 105 | SeqNo uint32 106 | } 107 | 108 | // suspect is broadcast when we suspect a node is dead 109 | type suspect struct { 110 | Incarnation uint32 111 | Node string 112 | ClusterName string 113 | From string // Include who is suspecting 114 | } 115 | 116 | // alive is broadcast when we know a node is alive. 117 | // Overloaded for nodes joining 118 | type alive struct { 119 | Incarnation uint32 120 | Node string 121 | ClusterName string 122 | Addr []byte 123 | Port uint16 124 | Meta []byte 125 | 126 | // The versions of the protocol/delegate that are being spoken, order: 127 | // pmin, pmax, pcur, dmin, dmax, dcur 128 | Vsn []uint8 129 | } 130 | 131 | // dead is broadcast when we confirm a node is dead 132 | // Overloaded for nodes leaving 133 | type dead struct { 134 | Incarnation uint32 135 | Node string 136 | ClusterName string 137 | From string // Include who is suspecting 138 | } 139 | 140 | // pushPullHeader is used to inform the 141 | // otherside how many states we are transferring 142 | type pushPullHeader struct { 143 | Nodes int 144 | ClusterName string 145 | UserStateLen int // Encodes the byte lengh of user state 146 | Join bool // Is this a join request or a anti-entropy run 147 | } 148 | 149 | // userMsgHeader is used to encapsulate a userMsg 150 | type userMsgHeader struct { 151 | UserMsgLen int // Encodes the byte lengh of user state 152 | } 153 | 154 | // pushNodeState is used for pushPullReq when we are 155 | // transferring out node states 156 | type pushNodeState struct { 157 | Name string 158 | Addr []byte 159 | Port uint16 160 | Meta []byte 161 | Incarnation uint32 162 | State nodeStateType 163 | Vsn []uint8 // Protocol versions 164 | } 165 | 166 | // compress is used to wrap an underlying payload 167 | // using a specified compression algorithm 168 | type compress struct { 169 | Algo compressionType 170 | Buf []byte 171 | } 172 | 173 | // msgHandoff is used to transfer a message between goroutines 174 | type 
msgHandoff struct { 175 | msgType messageType 176 | buf []byte 177 | from net.Addr 178 | } 179 | 180 | func NewPushPullHeader(Nodes int, ClusterName string, 181 | UserStateLen int, Join bool) *pushPullHeader { 182 | 183 | if len(ClusterName) < 1 { 184 | fmt.Println("[Err] memberlist: No cluster name passed!") 185 | } 186 | 187 | return &pushPullHeader{Nodes, ClusterName, UserStateLen, Join} 188 | } 189 | 190 | // encryptionVersion returns the encryption version to use 191 | func (m *Memberlist) encryptionVersion() encryptionVersion { 192 | switch m.ProtocolVersion() { 193 | case 1: 194 | return 0 195 | default: 196 | return 1 197 | } 198 | } 199 | 200 | // streamListen is a long running goroutine that pulls incoming streams from the 201 | // transport and hands them off for processing. 202 | func (m *Memberlist) streamListen() { 203 | for { 204 | select { 205 | case conn := <-m.transport.StreamCh(): 206 | go m.handleConn(conn) 207 | 208 | case <-m.shutdownCh: 209 | return 210 | } 211 | } 212 | } 213 | 214 | // handleConn handles a single incoming stream connection from the transport. 
215 | func (m *Memberlist) handleConn(conn net.Conn) { 216 | m.logger.Printf("[DEBUG] memberlist: Stream connection %s", LogConn(conn)) 217 | 218 | defer conn.Close() 219 | metrics.IncrCounter([]string{"memberlist", "tcp", "accept"}, 1) 220 | 221 | conn.SetDeadline(time.Now().Add(m.config.TCPTimeout)) 222 | msgType, bufConn, dec, err := m.readStream(conn) 223 | if err != nil { 224 | if err != io.EOF { 225 | m.logger.Printf("[ERR] memberlist: failed to receive: %s %s", err, LogConn(conn)) 226 | } 227 | return 228 | } 229 | 230 | switch msgType { 231 | case userMsg: 232 | if err := m.readUserMsg(bufConn, dec); err != nil { 233 | m.logger.Printf("[ERR] memberlist: Failed to receive user message: %s %s", err, LogConn(conn)) 234 | } 235 | case pushPullMsg: 236 | join, remoteNodes, userState, err := m.readRemoteState(bufConn, dec) 237 | if err != nil { 238 | m.logger.Printf("[ERR] memberlist: Failed to read remote state: %s %s", err, LogConn(conn)) 239 | return 240 | } 241 | 242 | if err := m.sendLocalState(conn, join); err != nil { 243 | m.logger.Printf("[ERR] memberlist: Failed to push local state: %s %s", err, LogConn(conn)) 244 | return 245 | } 246 | 247 | if err := m.mergeRemoteState(join, remoteNodes, userState); err != nil { 248 | m.logger.Printf("[ERR] memberlist: Failed push/pull merge: %s %s", err, LogConn(conn)) 249 | return 250 | } 251 | case pingMsg: 252 | var p ping 253 | if err := dec.Decode(&p); err != nil { 254 | m.logger.Printf("[ERR] memberlist: Failed to decode ping: %s %s", err, LogConn(conn)) 255 | return 256 | } 257 | 258 | if p.Node != "" && p.Node != m.config.Name { 259 | m.logger.Printf("[WARN] memberlist: Got ping for unexpected node %s %s", p.Node, LogConn(conn)) 260 | return 261 | } 262 | 263 | ack := ackResp{p.SeqNo, nil} 264 | out, err := encode(ackRespMsg, &ack) 265 | if err != nil { 266 | m.logger.Printf("[ERR] memberlist: Failed to encode ack: %s", err) 267 | return 268 | } 269 | 270 | err = m.rawSendMsgStream(conn, out.Bytes()) 271 | if 
err != nil {
			m.logger.Printf("[ERR] memberlist: Failed to send ack: %s %s", err, LogConn(conn))
			return
		}
	default:
		m.logger.Printf("[ERR] memberlist: Received invalid msgType (%d) %s", msgType, LogConn(conn))
	}
}

// packetListen is a long running goroutine that pulls packets out of the
// transport and hands them off for processing. It exits when a receive on
// shutdownCh succeeds.
func (m *Memberlist) packetListen() {
	for {
		select {
		case packet := <-m.transport.PacketCh():
			m.ingestPacket(packet.Buf, packet.From, packet.Timestamp)

		case <-m.shutdownCh:
			return
		}
	}
}

// ingestPacket prepares a raw packet for dispatch: it decrypts the payload
// when encryption is enabled, verifies the checksum when the payload starts
// with a hasCrcMsg header, and then hands the remaining bytes to
// handleCommand.
func (m *Memberlist) ingestPacket(buf []byte, from net.Addr, timestamp time.Time) {
	// Check if encryption is enabled
	if m.config.EncryptionEnabled() {
		// Decrypt the payload
		plain, err := decryptPayload(m.config.Keyring.GetKeys(), buf, nil)
		if err != nil {
			if m.config.GossipVerifyIncoming {
				m.logger.Printf("[ERR] memberlist: Decrypt packet failed: %v %s", err, LogAddress(from))
				return
			}

			// Incoming verification is off, so treat the message as plaintext
			plain = buf
		}

		// Continue processing the plaintext buffer
		buf = plain
	}

	// See if there's a checksum included to verify the contents of the
	// message. The header is the hasCrcMsg type byte followed by a 4-byte
	// big-endian CRC32 of everything after it.
	if len(buf) >= 5 && messageType(buf[0]) == hasCrcMsg {
		crc := crc32.ChecksumIEEE(buf[5:])
		expected := binary.BigEndian.Uint32(buf[1:5])
		if crc != expected {
			m.logger.Printf("[WARN] memberlist: Got invalid checksum for UDP packet: %x, %x", crc, expected)
			return
		}
		buf = buf[5:]
	}

	m.handleCommand(buf, from, timestamp)
}

// handleCommand decodes the leading message type byte of buf and dispatches
// the rest of the payload. Compound and compressed messages are unpacked
// (which re-enters handleCommand), ping/ack/nack traffic is handled inline,
// and suspect/alive/dead/user messages are queued on the handoff channel for
// packetHandler. An empty payload is logged and dropped.
func (m *Memberlist) handleCommand(buf []byte, from net.Addr, timestamp time.Time) {
	// A payload can be empty here (for example a checksum header with nothing
	// after it), and indexing buf[0] would then panic.
	if len(buf) < 1 {
		m.logger.Printf("[ERR] memberlist: missing message type byte %s", LogAddress(from))
		return
	}

	// Decode the message type
	msgType := messageType(buf[0])
	buf = buf[1:]

	// Switch on the msgType
	switch msgType {
	case compoundMsg:
		m.handleCompound(buf, from, timestamp)
	case compressMsg:
		m.handleCompressed(buf, from, timestamp)

	case pingMsg:
		m.handlePing(buf, from)
	case indirectPingMsg:
		m.handleIndirectPing(buf, from)
	case ackRespMsg:
		m.handleAck(buf, from, timestamp)
	case nackRespMsg:
		m.handleNack(buf, from)

	case suspectMsg, aliveMsg, deadMsg, userMsg:
		// Non-blocking send: drop the message rather than stall the listener
		// when the handler queue is full.
		select {
		case m.handoff <- msgHandoff{msgType, buf, from}:
		default:
			m.logger.Printf("[WARN] memberlist: handler queue full, dropping message (%d) %s", msgType, LogAddress(from))
		}

	default:
		m.logger.Printf("[ERR] memberlist: msg type (%d) not supported %s", msgType, LogAddress(from))
	}
}

// packetHandler is a long running goroutine that processes messages received
// over the packet interface, but is decoupled from the listener to avoid
// blocking the listener which may cause ping/ack messages to be delayed.
369 | func (m *Memberlist) packetHandler() { 370 | for { 371 | select { 372 | case msg := <-m.handoff: 373 | msgType := msg.msgType 374 | buf := msg.buf 375 | from := msg.from 376 | 377 | switch msgType { 378 | case suspectMsg: 379 | m.handleSuspect(buf, from) 380 | case aliveMsg: 381 | m.handleAlive(buf, from) 382 | case deadMsg: 383 | m.handleDead(buf, from) 384 | case userMsg: 385 | m.handleUser(buf, from) 386 | default: 387 | m.logger.Printf("[ERR] memberlist: Message type (%d) not supported %s (packet handler)", msgType, LogAddress(from)) 388 | } 389 | 390 | case <-m.shutdownCh: 391 | return 392 | } 393 | } 394 | } 395 | 396 | func (m *Memberlist) handleCompound(buf []byte, from net.Addr, timestamp time.Time) { 397 | // Decode the parts 398 | trunc, parts, err := decodeCompoundMessage(buf) 399 | if err != nil { 400 | m.logger.Printf("[ERR] memberlist: Failed to decode compound request: %s %s", err, LogAddress(from)) 401 | return 402 | } 403 | 404 | // Log any truncation 405 | if trunc > 0 { 406 | m.logger.Printf("[WARN] memberlist: Compound request had %d truncated messages %s", trunc, LogAddress(from)) 407 | } 408 | 409 | // Handle each message 410 | for _, part := range parts { 411 | m.handleCommand(part, from, timestamp) 412 | } 413 | } 414 | 415 | func (m *Memberlist) handlePing(buf []byte, from net.Addr) { 416 | var p ping 417 | if err := decode(buf, &p); err != nil { 418 | m.logger.Printf("[ERR] memberlist: Failed to decode ping request: %s %s", err, LogAddress(from)) 419 | return 420 | } 421 | // If node is provided, verify that it is for us 422 | if p.Node != "" && p.Node != m.config.Name { 423 | m.logger.Printf("[WARN] memberlist: Got ping for unexpected node '%s' %s", p.Node, LogAddress(from)) 424 | return 425 | } 426 | var ack ackResp 427 | ack.SeqNo = p.SeqNo 428 | if m.config.Ping != nil { 429 | ack.Payload = m.config.Ping.AckPayload() 430 | } 431 | if err := m.encodeAndSendMsg(from.String(), ackRespMsg, &ack); err != nil { 432 | 
m.logger.Printf("[ERR] memberlist: Failed to send ack: %s %s", err, LogAddress(from)) 433 | } 434 | } 435 | 436 | func (m *Memberlist) handleIndirectPing(buf []byte, from net.Addr) { 437 | var ind indirectPingReq 438 | if err := decode(buf, &ind); err != nil { 439 | m.logger.Printf("[ERR] memberlist: Failed to decode indirect ping request: %s %s", err, LogAddress(from)) 440 | return 441 | } 442 | 443 | // For proto versions < 2, there is no port provided. Mask old 444 | // behavior by using the configured port. 445 | if m.ProtocolVersion() < 2 || ind.Port == 0 { 446 | ind.Port = uint16(m.config.BindPort) 447 | } 448 | 449 | // Send a ping to the correct host. 450 | localSeqNo := m.nextSeqNo() 451 | ping := ping{SeqNo: localSeqNo, Node: ind.Node} 452 | 453 | // Setup a response handler to relay the ack 454 | cancelCh := make(chan struct{}) 455 | respHandler := func(payload []byte, timestamp time.Time) { 456 | // Try to prevent the nack if we've caught it in time. 457 | close(cancelCh) 458 | 459 | // Forward the ack back to the requestor. 460 | ack := ackResp{ind.SeqNo, nil} 461 | if err := m.encodeAndSendMsg(from.String(), ackRespMsg, &ack); err != nil { 462 | m.logger.Printf("[ERR] memberlist: Failed to forward ack: %s %s", err, LogAddress(from)) 463 | } 464 | } 465 | m.setAckHandler(localSeqNo, respHandler, m.config.ProbeTimeout) 466 | 467 | // Send the ping. 468 | addr := joinHostPort(net.IP(ind.Target).String(), ind.Port) 469 | if err := m.encodeAndSendMsg(addr, pingMsg, &ping); err != nil { 470 | m.logger.Printf("[ERR] memberlist: Failed to send ping: %s %s", err, LogAddress(from)) 471 | } 472 | 473 | // Setup a timer to fire off a nack if no ack is seen in time. 
474 | if ind.Nack { 475 | go func() { 476 | select { 477 | case <-cancelCh: 478 | return 479 | case <-time.After(m.config.ProbeTimeout): 480 | nack := nackResp{ind.SeqNo} 481 | if err := m.encodeAndSendMsg(from.String(), nackRespMsg, &nack); err != nil { 482 | m.logger.Printf("[ERR] memberlist: Failed to send nack: %s %s", err, LogAddress(from)) 483 | } 484 | } 485 | }() 486 | } 487 | } 488 | 489 | func (m *Memberlist) handleAck(buf []byte, from net.Addr, timestamp time.Time) { 490 | var ack ackResp 491 | if err := decode(buf, &ack); err != nil { 492 | m.logger.Printf("[ERR] memberlist: Failed to decode ack response: %s %s", err, LogAddress(from)) 493 | return 494 | } 495 | m.invokeAckHandler(ack, timestamp) 496 | } 497 | 498 | func (m *Memberlist) handleNack(buf []byte, from net.Addr) { 499 | var nack nackResp 500 | if err := decode(buf, &nack); err != nil { 501 | m.logger.Printf("[ERR] memberlist: Failed to decode nack response: %s %s", err, LogAddress(from)) 502 | return 503 | } 504 | m.invokeNackHandler(nack) 505 | } 506 | 507 | func (m *Memberlist) handleSuspect(buf []byte, from net.Addr) { 508 | var sus suspect 509 | if err := decode(buf, &sus); err != nil { 510 | m.logger.Printf("[ERR] memberlist: Failed to decode suspect message: %s %s", err, LogAddress(from)) 511 | return 512 | } 513 | 514 | if !m.isSameCluster(sus.ClusterName) { 515 | return 516 | } 517 | 518 | m.suspectNode(&sus) 519 | } 520 | 521 | func (m *Memberlist) handleAlive(buf []byte, from net.Addr) { 522 | var live alive 523 | if err := decode(buf, &live); err != nil { 524 | m.logger.Printf("[ERR] memberlist: Failed to decode alive message: %s %s", err, LogAddress(from)) 525 | return 526 | } 527 | 528 | // For proto versions < 2, there is no port provided. 
Mask old 529 | // behavior by using the configured port 530 | if m.ProtocolVersion() < 2 || live.Port == 0 { 531 | live.Port = uint16(m.config.BindPort) 532 | } 533 | 534 | if !m.isSameCluster(live.ClusterName) { 535 | return 536 | } 537 | 538 | m.aliveNode(&live, nil, false) 539 | } 540 | 541 | func (m *Memberlist) handleDead(buf []byte, from net.Addr) { 542 | var d dead 543 | if err := decode(buf, &d); err != nil { 544 | m.logger.Printf("[ERR] memberlist: Failed to decode dead message: %s %s", err, LogAddress(from)) 545 | return 546 | } 547 | 548 | if !m.isSameCluster(d.ClusterName) { 549 | return 550 | } 551 | 552 | m.deadNode(&d) 553 | } 554 | 555 | // handleUser is used to notify channels of incoming user data 556 | func (m *Memberlist) handleUser(buf []byte, from net.Addr) { 557 | d := m.config.Delegate 558 | if d != nil { 559 | d.NotifyMsg(buf) 560 | } 561 | } 562 | 563 | // handleCompressed is used to unpack a compressed message 564 | func (m *Memberlist) handleCompressed(buf []byte, from net.Addr, timestamp time.Time) { 565 | // Try to decode the payload 566 | payload, err := decompressPayload(buf) 567 | if err != nil { 568 | m.logger.Printf("[ERR] memberlist: Failed to decompress payload: %v %s", err, LogAddress(from)) 569 | return 570 | } 571 | 572 | // Recursively handle the payload 573 | m.handleCommand(payload, from, timestamp) 574 | } 575 | 576 | // encodeAndSendMsg is used to combine the encoding and sending steps 577 | func (m *Memberlist) encodeAndSendMsg(addr string, msgType messageType, msg interface{}) error { 578 | out, err := encode(msgType, msg) 579 | if err != nil { 580 | return err 581 | } 582 | if err := m.sendMsg(addr, out.Bytes()); err != nil { 583 | return err 584 | } 585 | return nil 586 | } 587 | 588 | // sendMsg is used to send a message via packet to another host. It will 589 | // opportunistically create a compoundMsg and piggy back other broadcasts. 
590 | func (m *Memberlist) sendMsg(addr string, msg []byte) error { 591 | // Check if we can piggy back any messages 592 | bytesAvail := m.config.UDPBufferSize - len(msg) - compoundHeaderOverhead 593 | if m.config.EncryptionEnabled() && m.config.GossipVerifyOutgoing { 594 | bytesAvail -= encryptOverhead(m.encryptionVersion()) 595 | } 596 | extra := m.getBroadcasts(compoundOverhead, bytesAvail) 597 | 598 | // Fast path if nothing to piggypack 599 | if len(extra) == 0 { 600 | return m.rawSendMsgPacket(addr, nil, msg) 601 | } 602 | 603 | // Join all the messages 604 | msgs := make([][]byte, 0, 1+len(extra)) 605 | msgs = append(msgs, msg) 606 | msgs = append(msgs, extra...) 607 | 608 | // Create a compound message 609 | compound := makeCompoundMessage(msgs) 610 | 611 | // Send the message 612 | return m.rawSendMsgPacket(addr, nil, compound.Bytes()) 613 | } 614 | 615 | // rawSendMsgPacket is used to send message via packet to another host without 616 | // modification, other than compression or encryption if enabled. 
617 | func (m *Memberlist) rawSendMsgPacket(addr string, node *Node, msg []byte) error { 618 | // Check if we have compression enabled 619 | if m.config.EnableCompression { 620 | buf, err := compressPayload(msg) 621 | if err != nil { 622 | m.logger.Printf("[WARN] memberlist: Failed to compress payload: %v", err) 623 | } else { 624 | // Only use compression if it reduced the size 625 | if buf.Len() < len(msg) { 626 | msg = buf.Bytes() 627 | } 628 | } 629 | } 630 | 631 | // Try to look up the destination node 632 | if node == nil { 633 | toAddr, _, err := net.SplitHostPort(addr) 634 | if err != nil { 635 | m.logger.Printf("[ERR] memberlist: Failed to parse address %q: %v", addr, err) 636 | return err 637 | } 638 | m.nodeLock.RLock() 639 | nodeState, ok := m.nodeMap[toAddr] 640 | m.nodeLock.RUnlock() 641 | if ok { 642 | node = &nodeState.Node 643 | } 644 | } 645 | 646 | // Add a CRC to the end of the payload if the recipient understands 647 | // ProtocolVersion >= 5 648 | if node != nil && node.PMax >= 5 { 649 | crc := crc32.ChecksumIEEE(msg) 650 | header := make([]byte, 5, 5+len(msg)) 651 | header[0] = byte(hasCrcMsg) 652 | binary.BigEndian.PutUint32(header[1:], crc) 653 | msg = append(header, msg...) 
654 | } 655 | 656 | // Check if we have encryption enabled 657 | if m.config.EncryptionEnabled() && m.config.GossipVerifyOutgoing { 658 | // Encrypt the payload 659 | var buf bytes.Buffer 660 | primaryKey := m.config.Keyring.GetPrimaryKey() 661 | err := encryptPayload(m.encryptionVersion(), primaryKey, msg, nil, &buf) 662 | if err != nil { 663 | m.logger.Printf("[ERR] memberlist: Encryption of message failed: %v", err) 664 | return err 665 | } 666 | msg = buf.Bytes() 667 | } 668 | 669 | metrics.IncrCounter([]string{"memberlist", "udp", "sent"}, float32(len(msg))) 670 | _, err := m.transport.WriteTo(msg, addr) 671 | return err 672 | } 673 | 674 | // rawSendMsgStream is used to stream a message to another host without 675 | // modification, other than applying compression and encryption if enabled. 676 | func (m *Memberlist) rawSendMsgStream(conn net.Conn, sendBuf []byte) error { 677 | // Check if compresion is enabled 678 | if m.config.EnableCompression { 679 | compBuf, err := compressPayload(sendBuf) 680 | if err != nil { 681 | m.logger.Printf("[ERROR] memberlist: Failed to compress payload: %v", err) 682 | } else { 683 | sendBuf = compBuf.Bytes() 684 | } 685 | } 686 | 687 | // Check if encryption is enabled 688 | if m.config.EncryptionEnabled() && m.config.GossipVerifyOutgoing { 689 | crypt, err := m.encryptLocalState(sendBuf) 690 | if err != nil { 691 | m.logger.Printf("[ERROR] memberlist: Failed to encrypt local state: %v", err) 692 | return err 693 | } 694 | sendBuf = crypt 695 | } 696 | 697 | // Write out the entire send buffer 698 | metrics.IncrCounter([]string{"memberlist", "tcp", "sent"}, float32(len(sendBuf))) 699 | 700 | if n, err := conn.Write(sendBuf); err != nil { 701 | return err 702 | } else if n != len(sendBuf) { 703 | return fmt.Errorf("only %d of %d bytes written", n, len(sendBuf)) 704 | } 705 | 706 | return nil 707 | } 708 | 709 | // sendUserMsg is used to stream a user message to another host. 
710 | func (m *Memberlist) sendUserMsg(addr string, sendBuf []byte) error { 711 | conn, err := m.transport.DialTimeout(addr, m.config.TCPTimeout) 712 | if err != nil { 713 | return err 714 | } 715 | defer conn.Close() 716 | 717 | bufConn := bytes.NewBuffer(nil) 718 | if err := bufConn.WriteByte(byte(userMsg)); err != nil { 719 | return err 720 | } 721 | 722 | header := userMsgHeader{UserMsgLen: len(sendBuf)} 723 | hd := codec.MsgpackHandle{} 724 | enc := codec.NewEncoder(bufConn, &hd) 725 | if err := enc.Encode(&header); err != nil { 726 | return err 727 | } 728 | if _, err := bufConn.Write(sendBuf); err != nil { 729 | return err 730 | } 731 | return m.rawSendMsgStream(conn, bufConn.Bytes()) 732 | } 733 | 734 | // sendAndReceiveState is used to initiate a push/pull over a stream with a 735 | // remote host. 736 | func (m *Memberlist) sendAndReceiveState(addr string, join bool) ([]pushNodeState, []byte, error) { 737 | // Attempt to connect 738 | conn, err := m.transport.DialTimeout(addr, m.config.TCPTimeout) 739 | if err != nil { 740 | return nil, nil, err 741 | } 742 | defer conn.Close() 743 | m.logger.Printf("[DEBUG] memberlist: Initiating push/pull sync with: %s", conn.RemoteAddr()) 744 | metrics.IncrCounter([]string{"memberlist", "tcp", "connect"}, 1) 745 | 746 | // Send our state 747 | if err := m.sendLocalState(conn, join); err != nil { 748 | return nil, nil, err 749 | } 750 | 751 | conn.SetDeadline(time.Now().Add(m.config.TCPTimeout)) 752 | msgType, bufConn, dec, err := m.readStream(conn) 753 | if err != nil { 754 | return nil, nil, err 755 | } 756 | 757 | // Quit if not push/pull 758 | if msgType != pushPullMsg { 759 | err := fmt.Errorf("received invalid msgType (%d), expected pushPullMsg (%d) %s", msgType, pushPullMsg, LogConn(conn)) 760 | return nil, nil, err 761 | } 762 | 763 | // Read remote state 764 | _, remoteNodes, userState, err := m.readRemoteState(bufConn, dec) 765 | return remoteNodes, userState, err 766 | } 767 | 768 | // sendLocalState is 
invoked to send our local state over a stream connection. 769 | func (m *Memberlist) sendLocalState(conn net.Conn, join bool) error { 770 | // Setup a deadline 771 | conn.SetDeadline(time.Now().Add(m.config.TCPTimeout)) 772 | 773 | // Prepare the local node state 774 | m.nodeLock.RLock() 775 | localNodes := make([]pushNodeState, len(m.nodes)) 776 | for idx, n := range m.nodes { 777 | localNodes[idx].Name = n.Name 778 | localNodes[idx].Addr = n.Addr 779 | localNodes[idx].Port = n.Port 780 | localNodes[idx].Incarnation = n.Incarnation 781 | localNodes[idx].State = n.State 782 | localNodes[idx].Meta = n.Meta 783 | localNodes[idx].Vsn = []uint8{ 784 | n.PMin, n.PMax, n.PCur, 785 | n.DMin, n.DMax, n.DCur, 786 | } 787 | } 788 | m.nodeLock.RUnlock() 789 | 790 | // Get the delegate state 791 | var userData []byte 792 | if m.config.Delegate != nil { 793 | userData = m.config.Delegate.LocalState(join) 794 | } 795 | 796 | // Create a bytes buffer writer 797 | bufConn := bytes.NewBuffer(nil) 798 | 799 | // Send our node state 800 | header := NewPushPullHeader( 801 | len(localNodes), 802 | m.config.ClusterName, 803 | len(userData), 804 | join, 805 | ) 806 | hd := codec.MsgpackHandle{} 807 | enc := codec.NewEncoder(bufConn, &hd) 808 | 809 | // Begin state push 810 | if _, err := bufConn.Write([]byte{byte(pushPullMsg)}); err != nil { 811 | return err 812 | } 813 | 814 | if err := enc.Encode(header); err != nil { 815 | return err 816 | } 817 | for i := 0; i < header.Nodes; i++ { 818 | if err := enc.Encode(&localNodes[i]); err != nil { 819 | return err 820 | } 821 | } 822 | 823 | // Write the user state as well 824 | if userData != nil { 825 | if _, err := bufConn.Write(userData); err != nil { 826 | return err 827 | } 828 | } 829 | 830 | // Get the send buffer 831 | return m.rawSendMsgStream(conn, bufConn.Bytes()) 832 | } 833 | 834 | // encryptLocalState is used to help encrypt local state before sending 835 | func (m *Memberlist) encryptLocalState(sendBuf []byte) ([]byte, error) { 
836 | var buf bytes.Buffer 837 | 838 | // Write the encryptMsg byte 839 | buf.WriteByte(byte(encryptMsg)) 840 | 841 | // Write the size of the message 842 | sizeBuf := make([]byte, 4) 843 | encVsn := m.encryptionVersion() 844 | encLen := encryptedLength(encVsn, len(sendBuf)) 845 | binary.BigEndian.PutUint32(sizeBuf, uint32(encLen)) 846 | buf.Write(sizeBuf) 847 | 848 | // Write the encrypted cipher text to the buffer 849 | key := m.config.Keyring.GetPrimaryKey() 850 | err := encryptPayload(encVsn, key, sendBuf, buf.Bytes()[:5], &buf) 851 | if err != nil { 852 | return nil, err 853 | } 854 | return buf.Bytes(), nil 855 | } 856 | 857 | // decryptRemoteState is used to help decrypt the remote state 858 | func (m *Memberlist) decryptRemoteState(bufConn io.Reader) ([]byte, error) { 859 | // Read in enough to determine message length 860 | cipherText := bytes.NewBuffer(nil) 861 | cipherText.WriteByte(byte(encryptMsg)) 862 | _, err := io.CopyN(cipherText, bufConn, 4) 863 | if err != nil { 864 | return nil, err 865 | } 866 | 867 | // Ensure we aren't asked to download too much. This is to guard against 868 | // an attack vector where a huge amount of state is sent 869 | moreBytes := binary.BigEndian.Uint32(cipherText.Bytes()[1:5]) 870 | if moreBytes > maxPushStateBytes { 871 | return nil, fmt.Errorf("Remote node state is larger than limit (%d)", moreBytes) 872 | } 873 | 874 | // Read in the rest of the payload 875 | _, err = io.CopyN(cipherText, bufConn, int64(moreBytes)) 876 | if err != nil { 877 | return nil, err 878 | } 879 | 880 | // Decrypt the cipherText 881 | dataBytes := cipherText.Bytes()[:5] 882 | cipherBytes := cipherText.Bytes()[5:] 883 | 884 | // Decrypt the payload 885 | keys := m.config.Keyring.GetKeys() 886 | return decryptPayload(keys, cipherBytes, dataBytes) 887 | } 888 | 889 | // readStream is used to read from a stream connection, decrypting and 890 | // decompressing the stream if necessary. 
891 | func (m *Memberlist) readStream(conn net.Conn) (messageType, io.Reader, *codec.Decoder, error) { 892 | // Created a buffered reader 893 | var bufConn io.Reader = bufio.NewReader(conn) 894 | 895 | // Read the message type 896 | buf := [1]byte{0} 897 | if _, err := bufConn.Read(buf[:]); err != nil { 898 | return 0, nil, nil, err 899 | } 900 | msgType := messageType(buf[0]) 901 | 902 | // Check if the message is encrypted 903 | if msgType == encryptMsg { 904 | if !m.config.EncryptionEnabled() { 905 | return 0, nil, nil, 906 | fmt.Errorf("Remote state is encrypted and encryption is not configured") 907 | } 908 | 909 | plain, err := m.decryptRemoteState(bufConn) 910 | if err != nil { 911 | return 0, nil, nil, err 912 | } 913 | 914 | // Reset message type and bufConn 915 | msgType = messageType(plain[0]) 916 | bufConn = bytes.NewReader(plain[1:]) 917 | } else if m.config.EncryptionEnabled() && m.config.GossipVerifyIncoming { 918 | return 0, nil, nil, 919 | fmt.Errorf("Encryption is configured but remote state is not encrypted") 920 | } 921 | 922 | // Get the msgPack decoders 923 | hd := codec.MsgpackHandle{} 924 | dec := codec.NewDecoder(bufConn, &hd) 925 | 926 | // Check if we have a compressed message 927 | if msgType == compressMsg { 928 | var c compress 929 | if err := dec.Decode(&c); err != nil { 930 | return 0, nil, nil, err 931 | } 932 | decomp, err := decompressBuffer(&c) 933 | if err != nil { 934 | return 0, nil, nil, err 935 | } 936 | 937 | // Reset the message type 938 | msgType = messageType(decomp[0]) 939 | 940 | // Create a new bufConn 941 | bufConn = bytes.NewReader(decomp[1:]) 942 | 943 | // Create a new decoder 944 | dec = codec.NewDecoder(bufConn, &hd) 945 | } 946 | 947 | return msgType, bufConn, dec, nil 948 | } 949 | 950 | // readRemoteState is used to read the remote state from a connection 951 | func (m *Memberlist) readRemoteState(bufConn io.Reader, dec *codec.Decoder) (bool, []pushNodeState, []byte, error) { 952 | // Read the push/pull 
header 953 | var header pushPullHeader 954 | if err := dec.Decode(&header); err != nil { 955 | return false, nil, nil, err 956 | } 957 | 958 | if !m.isSameCluster(header.ClusterName) { 959 | return false, nil, nil, fmt.Errorf("Cluster names do not match: %s <-> %s", 960 | header.ClusterName, m.config.ClusterName) 961 | } 962 | 963 | // Allocate space for the transfer 964 | remoteNodes := make([]pushNodeState, header.Nodes) 965 | 966 | // Try to decode all the states 967 | for i := 0; i < header.Nodes; i++ { 968 | if err := dec.Decode(&remoteNodes[i]); err != nil { 969 | return false, nil, nil, err 970 | } 971 | } 972 | 973 | // Read the remote user state into a buffer 974 | var userBuf []byte 975 | if header.UserStateLen > 0 { 976 | userBuf = make([]byte, header.UserStateLen) 977 | bytes, err := io.ReadAtLeast(bufConn, userBuf, header.UserStateLen) 978 | if err == nil && bytes != header.UserStateLen { 979 | err = fmt.Errorf( 980 | "Failed to read full user state (%d / %d)", 981 | bytes, header.UserStateLen) 982 | } 983 | if err != nil { 984 | return false, nil, nil, err 985 | } 986 | } 987 | 988 | // For proto versions < 2, there is no port provided. 
Mask old 989 | // behavior by using the configured port 990 | for idx := range remoteNodes { 991 | if m.ProtocolVersion() < 2 || remoteNodes[idx].Port == 0 { 992 | remoteNodes[idx].Port = uint16(m.config.BindPort) 993 | } 994 | } 995 | 996 | return header.Join, remoteNodes, userBuf, nil 997 | } 998 | 999 | 1000 | 1001 | // mergeRemoteState is used to merge the remote state with our local state 1002 | func (m *Memberlist) mergeRemoteState(join bool, remoteNodes []pushNodeState, userBuf []byte) error { 1003 | if err := m.verifyProtocol(remoteNodes); err != nil { 1004 | return err 1005 | } 1006 | 1007 | // Invoke the merge delegate if any 1008 | if join && m.config.Merge != nil { 1009 | nodes := make([]*Node, len(remoteNodes)) 1010 | for idx, n := range remoteNodes { 1011 | nodes[idx] = &Node{ 1012 | Name: n.Name, 1013 | Addr: n.Addr, 1014 | Port: n.Port, 1015 | Meta: n.Meta, 1016 | PMin: n.Vsn[0], 1017 | PMax: n.Vsn[1], 1018 | PCur: n.Vsn[2], 1019 | DMin: n.Vsn[3], 1020 | DMax: n.Vsn[4], 1021 | DCur: n.Vsn[5], 1022 | } 1023 | } 1024 | if err := m.config.Merge.NotifyMerge(nodes); err != nil { 1025 | return err 1026 | } 1027 | } 1028 | 1029 | // Merge the membership state 1030 | m.mergeState(remoteNodes) 1031 | 1032 | // Invoke the delegate for user state 1033 | if userBuf != nil && m.config.Delegate != nil { 1034 | m.config.Delegate.MergeRemoteState(userBuf, join) 1035 | } 1036 | return nil 1037 | } 1038 | 1039 | // readUserMsg is used to decode a userMsg from a stream. 
1040 | func (m *Memberlist) readUserMsg(bufConn io.Reader, dec *codec.Decoder) error { 1041 | // Read the user message header 1042 | var header userMsgHeader 1043 | if err := dec.Decode(&header); err != nil { 1044 | return err 1045 | } 1046 | 1047 | // Read the user message into a buffer 1048 | var userBuf []byte 1049 | if header.UserMsgLen > 0 { 1050 | userBuf = make([]byte, header.UserMsgLen) 1051 | bytes, err := io.ReadAtLeast(bufConn, userBuf, header.UserMsgLen) 1052 | if err == nil && bytes != header.UserMsgLen { 1053 | err = fmt.Errorf( 1054 | "Failed to read full user message (%d / %d)", 1055 | bytes, header.UserMsgLen) 1056 | } 1057 | if err != nil { 1058 | return err 1059 | } 1060 | 1061 | d := m.config.Delegate 1062 | if d != nil { 1063 | d.NotifyMsg(userBuf) 1064 | } 1065 | } 1066 | 1067 | return nil 1068 | } 1069 | 1070 | // sendPingAndWaitForAck makes a stream connection to the given address, sends 1071 | // a ping, and waits for an ack. All of this is done as a series of blocking 1072 | // operations, given the deadline. The bool return parameter is true if we 1073 | // we able to round trip a ping to the other node. 1074 | func (m *Memberlist) sendPingAndWaitForAck(addr string, ping ping, deadline time.Time) (bool, error) { 1075 | conn, err := m.transport.DialTimeout(addr, m.config.TCPTimeout) 1076 | if err != nil { 1077 | // If the node is actually dead we expect this to fail, so we 1078 | // shouldn't spam the logs with it. After this point, errors 1079 | // with the connection are real, unexpected errors and should 1080 | // get propagated up. 
1081 | return false, nil 1082 | } 1083 | defer conn.Close() 1084 | conn.SetDeadline(deadline) 1085 | 1086 | out, err := encode(pingMsg, &ping) 1087 | if err != nil { 1088 | return false, err 1089 | } 1090 | 1091 | if err = m.rawSendMsgStream(conn, out.Bytes()); err != nil { 1092 | return false, err 1093 | } 1094 | 1095 | msgType, _, dec, err := m.readStream(conn) 1096 | if err != nil { 1097 | return false, err 1098 | } 1099 | 1100 | if msgType != ackRespMsg { 1101 | return false, fmt.Errorf("Unexpected msgType (%d) from ping %s", msgType, LogConn(conn)) 1102 | } 1103 | 1104 | var ack ackResp 1105 | if err = dec.Decode(&ack); err != nil { 1106 | return false, err 1107 | } 1108 | 1109 | if ack.SeqNo != ping.SeqNo { 1110 | return false, fmt.Errorf("Sequence number from ack (%d) doesn't match ping (%d)", ack.SeqNo, ping.SeqNo, LogConn(conn)) 1111 | } 1112 | 1113 | return true, nil 1114 | } 1115 | 1116 | // Compare given cluster name against config cluster name 1117 | func (m *Memberlist) isSameCluster(name string) bool { 1118 | if name != m.config.ClusterName { 1119 | m.logger.Printf("[ERR] memberlist: Cluster names do not match: %s <-> %s", 1120 | name, m.config.ClusterName) 1121 | return false 1122 | } 1123 | 1124 | return true 1125 | } 1126 | --------------------------------------------------------------------------------