├── create_cluster.sh
├── compile_pb.sh
├── dtable
│   ├── doc.go
│   ├── dtable_delegator.go
│   ├── pb_defs
│   │   └── dtable.proto
│   ├── dtable_query.go
│   ├── README.md
│   ├── dtable_util.go
│   ├── dtable_zmq_handler.go
│   ├── dtable_replication.go
│   ├── dtable.pb.go
│   ├── dtable.go
│   └── dtable_remote.go
├── doc.go
├── pb_defs
│   └── chord.proto
├── LICENSE
├── chord_math.go
├── transport_local.go
├── README.md
├── vnode_handlers.go
├── transport_zmq_handler.go
├── chord.pb.go
├── transport_zmq_init.go
├── dendrite.go
├── vnode.go
└── transport_zmq.go

/create_cluster.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | for port in 5001 5002 5003 5004
4 | do
5 |   dendrite-node -host "127.0.0.1:$port" -nodes "127.0.0.1:5000" &
6 | done
7 | 
8 | 
9 | 
--------------------------------------------------------------------------------
/compile_pb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | cd pb_defs
3 | protoc --go_out=../ chord.proto
4 | cd ../dtable/pb_defs
5 | protoc --proto_path "../../pb_defs:." --go_out=../ dtable.proto
6 | sed -i 's/import dendrite \"chord.pb/import dendrite \"github.com\/fastfn\/dendrite/' ../dtable.pb.go
7 | 
8 | 
--------------------------------------------------------------------------------
/dtable/doc.go:
--------------------------------------------------------------------------------
1 | /*
2 | Package dtable implements a highly available, distributed in-memory key/value datastore.
3 | 
4 | DTable is built on top of dendrite for key distribution, high availability, replication
5 | and failover. It exposes a Query interface for Get() and Set() operations.
6 | 
7 | It hooks into dendrite as a TransportHook and uses ZeroMQ for communication between remote nodes.
8 | All messages between the nodes are serialized with protocol buffers.
9 | */
10 | package dtable
11 | 
--------------------------------------------------------------------------------
/doc.go:
--------------------------------------------------------------------------------
1 | /*
2 | Package dendrite implements a distributed hash table (DHT) based on the Chord protocol.
3 | The included sub-package 'dtable' is built on top of dendrite and implements a
4 | distributed in-memory key/value database, with replication and failover support,
5 | and a query interface to Get() or Set() items with different consistency levels.
6 | 
7 | For better key distribution, dendrite allows a configurable number of virtual nodes
8 | per instance (vnodes). The number of replicas in dtable is also configurable.
9 | 
10 | The calling application can bootstrap a new cluster, or join an existing one by connecting to any of its
11 | existing nodes (which must be specified manually). Node discovery is not part of the implementation.
12 | Use consul (consul.io) or something else for that purpose.
13 | 
14 | The Chord protocol defines ring stabilization. In dendrite, the stabilization period is configurable.
15 | 
16 | Node-to-node (network) communication is built on top of ZeroMQ sockets over TCP for speed, clustering
17 | and reliability. Dendrite starts a configurable number of goroutines (default: 10) for load-balanced
18 | serving of remote requests, and scales that number up and down depending on the load (a prefork model).
19 | 
20 | All messages sent through dendrite are encapsulated in a ChordMsg structure, where the first byte indicates the message type
21 | and the actual data follows. The data part is serialized with protocol buffers.
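To make the framing described above concrete, here is a minimal, self-contained sketch of a one-byte-type envelope of the kind ChordMsg describes. It is illustrative only: the frame/unframe helpers and the tag value are made up for this example and are not dendrite's actual Encode/Decode implementation, and a real payload would be a protobuf-serialized message rather than a plain string.

```go
package main

import (
	"errors"
	"fmt"
)

// msgType stands in for dendrite's MsgType: a single-byte message tag (illustrative).
type msgType byte

const msgPing msgType = 0x01 // hypothetical tag value

// frame prepends the one-byte message type to an already-serialized payload.
func frame(t msgType, payload []byte) []byte {
	return append([]byte{byte(t)}, payload...)
}

// unframe splits a raw message back into its type byte and payload.
func unframe(raw []byte) (msgType, []byte, error) {
	if len(raw) == 0 {
		return 0, nil, errors.New("empty message")
	}
	return msgType(raw[0]), raw[1:], nil
}

func main() {
	wire := frame(msgPing, []byte("protobuf-encoded body would go here"))
	t, body, _ := unframe(wire)
	fmt.Printf("type=0x%02x payload=%q\n", t, body)
}
```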
22 | 23 | Dendrite can be extended through two interfaces: 24 | TransportHook 25 | DelegateHook 26 | 27 | TransportHook allows other packages to provide additional message types, decoders and handlers, while DelegateHook 28 | can be used to capture chord events that dendrite emits: 29 | EvPredecessorJoined 30 | EvPredecessorLeft 31 | EvReplicasChanged 32 | */ 33 | package dendrite 34 | -------------------------------------------------------------------------------- /pb_defs/chord.proto: -------------------------------------------------------------------------------- 1 | /* 2 | dendrite protobuf message definitions. 3 | */ 4 | package dendrite; 5 | 6 | 7 | // PBProtoVnode represents Vnode structure. 8 | message PBProtoVnode { 9 | required bytes id = 1; 10 | required string host = 2; 11 | } 12 | 13 | // PBProtoPing is simple structure for pinging remote vnodes. 14 | message PBProtoPing { 15 | required int64 version = 1; 16 | } 17 | 18 | // PBProtoAck is generic response message with boolean 'ok' state. 19 | message PBProtoAck { 20 | required int64 version = 1; 21 | required bool ok = 2; 22 | } 23 | 24 | // PBProtoErr defines error message. 25 | message PBProtoErr { 26 | required string error = 2; 27 | } 28 | 29 | // PBProtoForward is sent to caller if request should be forwarded to another vnode. 30 | message PBProtoForward { 31 | required PBProtoVnode vnode = 1; 32 | } 33 | 34 | // PBProtoLeave (not used) 35 | message PBProtoLeave { 36 | required PBProtoVnode source = 1; 37 | required PBProtoVnode dest = 2; 38 | } 39 | 40 | // PBProtoListVnodes - request the list of vnodes from remote vnode. 41 | message PBProtoListVnodes { 42 | // no params are needed 43 | } 44 | 45 | // PBProtoListVnodesResp is a structure for returning multiple vnodes to a caller. 46 | message PBProtoListVnodesResp { 47 | repeated PBProtoVnode vnodes = 1; 48 | } 49 | 50 | // PBProtoFindSuccessors is a structure to request successors for a key. 51 | message PBProtoFindSuccessors { 52 | required bytes key = 1; 53 | required PBProtoVnode dest = 2; 54 | optional int32 limit = 3; 55 | } 56 | 57 | // PBProtoGetPredecessor - request immediate predecessor from vnode. 58 | message PBProtoGetPredecessor { 59 | required PBProtoVnode dest = 1; 60 | } 61 | 62 | // PBProtoNotify is a message to notify the remote vnode of origin's existence. 63 | message PBProtoNotify { 64 | required PBProtoVnode dest = 1; 65 | required PBProtoVnode vnode = 2; 66 | } 67 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Zeljko Tomic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | 22 | ------- 23 | Some of the code is used from another project: github.com/armon/go-chord 24 | 25 | The MIT License (MIT) 26 | 27 | Copyright (c) 2013 Armon Dadgar 28 | 29 | Permission is hereby granted, free of charge, to any person obtaining a copy of 30 | this software and associated documentation files (the "Software"), to deal in 31 | the Software without restriction, including without limitation the rights to 32 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 33 | the Software, and to permit persons to whom the Software is furnished to do so, 34 | subject to the following conditions: 35 | 36 | The above copyright notice and this permission notice shall be included in all 37 | copies or substantial portions of the Software. 38 | 39 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 40 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 41 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 42 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 43 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 44 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 45 | 46 | -------------------------------------------------------------------------------- /dtable/dtable_delegator.go: -------------------------------------------------------------------------------- 1 | package dtable 2 | 3 | import ( 4 | "github.com/fastfn/dendrite" 5 | "time" 6 | ) 7 | 8 | // delegator() - captures dendrite events as well as internal dtable events 9 | // and synchronizes data operations 10 | func (dt *DTable) delegator() { 11 | for { 12 | select { 13 | case event := <-dt.event_c: 14 | switch event.EvType { 15 | case dendrite.EvPredecessorLeft: 16 | dt.Logf(LogDebug, "delegator() - predecessor left - promoting ourselves %s, status: ", event.Target.String()) 17 | // don't make the call just yet. Need to verify that peer is ready 18 | if err := dt.checkPeer(event.Target); err != nil { 19 | go dt.replayEvent(event) 20 | } else { 21 | dt.promote(event.Target) 22 | dt.Logln(LogDebug, "promote() done on", event.Target.String()) 23 | } 24 | case dendrite.EvPredecessorJoined: 25 | dt.Logf(LogDebug, "delegator() - predecessor joined - demoting keys to new predecessor %s, status: ", event.Target.String()) 26 | // don't make the call just yet. 
Need to verify that peer is ready 27 | if err := dt.checkPeer(event.PrimaryItem); err != nil { 28 | // schedule for replay 29 | go dt.replayEvent(event) 30 | } else { 31 | dt.demote(event.Target, event.PrimaryItem) 32 | dt.Logln(LogDebug, "demoting done on", event.Target.String()) 33 | } 34 | case dendrite.EvReplicasChanged: 35 | dt.Logf(LogDebug, "delegator() - replicas changed on %s, status: ", event.Target.String()) 36 | safe := true 37 | for _, remote := range event.ItemList { 38 | if remote == nil { 39 | continue 40 | } 41 | if err := dt.checkPeer(remote); err != nil { 42 | safe = false 43 | break 44 | } 45 | } 46 | if !safe { 47 | go dt.replayEvent(event) 48 | } else { 49 | dt.changeReplicas(event.Target, event.ItemList) 50 | dt.Logln(LogDebug, "changeReplica() done on", event.Target.String()) 51 | } 52 | } 53 | case event := <-dt.dtable_c: 54 | // internal event received 55 | switch event.evType { 56 | case evPromoteKey: 57 | dt.Logf(LogDebug, "delegator() - promotekey() event - on %s, for key %s", event.vnode.String(), event.item.keyHashString()) 58 | dt.promoteKey(event.vnode, event.item) 59 | } 60 | case <-dt.selfcheck_t.C: 61 | dt.Logln(LogDebug, "delegator() - selfcheck() started") 62 | dt.selfCheck() 63 | dt.Logln(LogDebug, "delegator() - selfcheck() completed") 64 | } 65 | } 66 | 67 | } 68 | 69 | func (dt *DTable) checkPeer(remote *dendrite.Vnode) error { 70 | return dt.remoteStatus(remote) 71 | } 72 | 73 | // replayEvent() is called when remote node does not have dtable initialized 74 | func (dt *DTable) replayEvent(event *dendrite.EventCtx) { 75 | dt.Logln(LogDebug, "- replayEvent scheduled") 76 | time.Sleep(5 * time.Second) 77 | dt.EmitEvent(event) 78 | } 79 | -------------------------------------------------------------------------------- /dtable/pb_defs/dtable.proto: -------------------------------------------------------------------------------- 1 | package dtable; 2 | import "chord.proto"; 3 | 4 | // PBDTableResponse is a generic response structure with error indication. 5 | message PBDTableResponse { 6 | required bool ok = 1; 7 | optional string error = 2; 8 | } 9 | 10 | // PBDTableStatus is a message to request the status of remote vnode. 11 | message PBDTableStatus { 12 | required dendrite.PBProtoVnode dest = 1; 13 | } 14 | 15 | // PBDTableReplicaInfo message represents kvItem's replicaInfo structure. 16 | message PBDTableReplicaInfo { 17 | optional dendrite.PBProtoVnode master = 1; 18 | repeated dendrite.PBProtoVnode vnodes = 2; 19 | repeated dendrite.PBProtoVnode orphanVnodes = 3; 20 | optional int32 state = 4; 21 | optional int32 depth = 5; 22 | } 23 | 24 | // PBDTableItem message represents kvItem's structure. 25 | message PBDTableItem { 26 | optional bytes key = 1; 27 | optional bytes val = 2; 28 | optional int64 timestamp = 3; 29 | optional bool commited = 4; 30 | optional bytes keyHash = 5; 31 | optional PBDTableReplicaInfo replicaInfo = 6; 32 | optional dendrite.PBProtoVnode origin = 7; 33 | optional bool found = 8; 34 | } 35 | 36 | // PBDTableDemotedItem message represents demotedItem's structure. 37 | message PBDTableDemotedItem { 38 | required dendrite.PBProtoVnode dest = 1; 39 | required PBDTableItem item = 2; 40 | optional dendrite.PBProtoVnode origin = 3; 41 | } 42 | 43 | // PBDTableMultiItemResponse is a response message used to send multiple kvItems to the caller. 
44 | message PBDTableMultiItemResponse { 45 | optional dendrite.PBProtoVnode origin = 1; 46 | repeated PBDTableItem items = 2; 47 | } 48 | 49 | // PBDTableGetItem is a request message used to get an item from remote vnode. 50 | message PBDTableGetItem { 51 | required dendrite.PBProtoVnode dest = 1; 52 | required bytes keyHash = 2; 53 | optional dendrite.PBProtoVnode origin = 3; 54 | } 55 | 56 | // PBDTableSetItem is a request message used to set an item to remote vnode. 57 | message PBDTableSetItem { 58 | required dendrite.PBProtoVnode dest = 1; 59 | required PBDTableItem item = 2; 60 | optional dendrite.PBProtoVnode origin = 3; 61 | optional bool demoting = 4; 62 | optional int32 minAcks = 5; 63 | } 64 | 65 | // PBDTableSetMultiItem is a request message used to set multiple items on remote vnode. 66 | message PBDTableSetMultiItem { 67 | required dendrite.PBProtoVnode dest = 1; 68 | optional dendrite.PBProtoVnode origin = 2; 69 | repeated PBDTableItem items = 3; 70 | } 71 | 72 | // PBDTableClearReplica is a request message used to remove replicated item from remote vnode. 73 | message PBDTableClearReplica { 74 | required dendrite.PBProtoVnode dest = 1; 75 | required bytes keyHash = 2; 76 | required bool demoted = 3; 77 | optional dendrite.PBProtoVnode origin = 4; 78 | } 79 | 80 | // PBDTableSetReplicaInfo is a request message used to update metadata for replicated item on remote vnode. 81 | message PBDTableSetReplicaInfo { 82 | required dendrite.PBProtoVnode dest = 1; 83 | required bytes keyHash = 2; 84 | required PBDTableReplicaInfo replicaInfo = 3; 85 | optional dendrite.PBProtoVnode origin = 4; 86 | } 87 | 88 | // PBDTablePromoteKey is a request message used to request a promotion of a key on the remote vnode. 89 | message PBDTablePromoteKey { 90 | required dendrite.PBProtoVnode dest = 1; 91 | required PBDTableItem item = 2; 92 | optional dendrite.PBProtoVnode origin = 3; 93 | } 94 | -------------------------------------------------------------------------------- /chord_math.go: -------------------------------------------------------------------------------- 1 | package dendrite 2 | 3 | import ( 4 | "bytes" 5 | //"log" 6 | "crypto/sha1" 7 | "encoding/hex" 8 | "math/big" 9 | "math/rand" 10 | "time" 11 | ) 12 | 13 | func min(a, b int) int { 14 | if a <= b { 15 | return a 16 | } else { 17 | return b 18 | } 19 | } 20 | 21 | // Min returns lesser of two ints. 22 | func Min(a, b int) int { 23 | return min(a, b) 24 | } 25 | 26 | // randStabilize generates a random stabilization time between conf.StabilizeMin and conf.StabilizeMax. 
27 | func randStabilize(conf *Config) time.Duration { 28 | min := conf.StabilizeMin 29 | max := conf.StabilizeMax 30 | rand.Seed(time.Now().UnixNano()) 31 | r := rand.Float64() 32 | return time.Duration((r * float64(max-min)) + float64(min)) 33 | } 34 | 35 | func between(id1, id2, key []byte, rincl bool) bool { 36 | // Check for ring wrap around 37 | if bytes.Compare(id1, id2) == 1 { 38 | if rincl { 39 | return bytes.Compare(id1, key) == -1 || 40 | bytes.Compare(id2, key) >= 0 41 | } 42 | return bytes.Compare(id1, key) == -1 || 43 | bytes.Compare(id2, key) == 1 44 | } 45 | 46 | // Handle the normal case 47 | if rincl { 48 | return bytes.Compare(id1, key) == -1 && 49 | bytes.Compare(id2, key) >= 0 50 | } 51 | return bytes.Compare(id1, key) == -1 && 52 | bytes.Compare(id2, key) == 1 53 | } 54 | 55 | /* 56 | Between checks if key is between id1 and id2, such that: 57 | 58 | if rincl (right-included flag) is true: 59 | (id1 > key > id2) 60 | if rincl (right-included flag) is false: 61 | (id1 > key >= id2) 62 | */ 63 | func Between(id1, id2, key []byte, rincl bool) bool { 64 | return between(id1, id2, key, rincl) 65 | } 66 | 67 | // nearestVnodeToKey for a given list of sorted vnodes, return the closest(predecessor) one to the given key 68 | func nearestVnodeToKey(vnodes []*localVnode, key []byte) *Vnode { 69 | for i := len(vnodes) - 1; i >= 0; i-- { 70 | if bytes.Compare(vnodes[i].Id, key) == -1 { 71 | return &vnodes[i].Vnode 72 | } 73 | } 74 | // Return the last vnode 75 | return &vnodes[len(vnodes)-1].Vnode 76 | } 77 | 78 | // powerOffset computes the offset by (n + 2^exp) % (2^mod) 79 | func powerOffset(id []byte, exp int, mod int) []byte { 80 | // Copy the existing slice 81 | off := make([]byte, len(id)) 82 | copy(off, id) 83 | 84 | // Convert the ID to a bigint 85 | idInt := big.Int{} 86 | idInt.SetBytes(id) 87 | 88 | // Get the offset 89 | two := big.NewInt(2) 90 | offset := big.Int{} 91 | offset.Exp(two, big.NewInt(int64(exp)), nil) 92 | 93 | // Sum 94 | sum := big.Int{} 95 | sum.Add(&idInt, &offset) 96 | 97 | // Get the ceiling 98 | ceil := big.Int{} 99 | ceil.Exp(two, big.NewInt(int64(mod)), nil) 100 | 101 | // Apply the mod 102 | idInt.Mod(&sum, &ceil) 103 | 104 | // Add together 105 | return idInt.Bytes() 106 | } 107 | 108 | // distance calculates the distance between two keys. 
109 | func distance(a, b []byte) *big.Int { 110 | // Get the ring size 111 | var ring big.Int 112 | ring.Exp(big.NewInt(2), big.NewInt(int64(160)), nil) 113 | // Convert to int 114 | var a_int, b_int, dist big.Int 115 | (&a_int).SetBytes(a) 116 | (&b_int).SetBytes(b) 117 | (&dist).SetInt64(0) 118 | 119 | cmp := bytes.Compare(a, b) 120 | switch cmp { 121 | case 0: 122 | return &dist 123 | case -1: 124 | return (&dist).Sub(&b_int, &a_int) 125 | default: 126 | // loop the ring 127 | (&dist).Sub(&ring, &a_int) 128 | return (&dist).Add(&dist, &b_int) 129 | } 130 | 131 | } 132 | 133 | // HashKey generates SHA1 hash for a given []byte key 134 | func HashKey(key []byte) []byte { 135 | hash := sha1.New() 136 | hash.Write(key) 137 | return hash.Sum(nil) 138 | } 139 | 140 | // KeyFromString decodes hex string to []byte 141 | func KeyFromString(key_str string) []byte { 142 | key, _ := hex.DecodeString(key_str) 143 | return key 144 | } 145 | -------------------------------------------------------------------------------- /dtable/dtable_query.go: -------------------------------------------------------------------------------- 1 | package dtable 2 | 3 | import ( 4 | "fmt" 5 | "github.com/fastfn/dendrite" 6 | "sync" 7 | "time" 8 | ) 9 | 10 | type queryType int 11 | 12 | const ( 13 | qErr queryType = -1 14 | qGet queryType = 0 15 | qSet queryType = 1 16 | ) 17 | 18 | // Query is dtable's native interface for doing data operations. 19 | type Query interface { 20 | Consistency(int) Query 21 | Get([]byte) (*KVItem, error) 22 | Set([]byte, []byte) error // (key, val) 23 | GetLocalKeys() [][]byte 24 | } 25 | 26 | // Query defines a dtable query. 27 | type query struct { 28 | dt *DTable 29 | qType queryType 30 | minAcks int 31 | kvItem *kvItem 32 | err error 33 | } 34 | 35 | // NewQuery returns Query. 36 | func (dt *DTable) NewQuery() Query { 37 | return &query{ 38 | dt: dt, 39 | qType: -1, 40 | minAcks: 1, 41 | } 42 | } 43 | 44 | // Consistency is used prior to Set() to request minimum writes before operation returns success. 45 | // If dtable runs with 2 replicas, user may request 2 writes (primary + 1 replica) and let dtable 46 | // handle final write in the background. If requested value is larger than configured dendrite replicas, 47 | // value is reset to 1. Default is 1. 48 | func (q *query) Consistency(n int) Query { 49 | if n >= 1 && n <= q.dt.ring.Replicas()+1 { 50 | q.minAcks = n 51 | } 52 | return q 53 | } 54 | 55 | // Get returns *KVItem for a key. If key is not found on this node, but node holds key replica, replica is returned. 56 | // If key is not found on this node, and node does not hold replica, request is forwarded to the node responsible 57 | // for this key. *KVItem is nil if key was not found, and error is set if there was an error during request. 58 | func (q *query) Get(key []byte) (*KVItem, error) { 59 | if key == nil || len(key) == 0 { 60 | return nil, fmt.Errorf("key can not be nil or empty") 61 | } 62 | reqItem := new(kvItem) 63 | reqItem.Key = key 64 | reqItem.keyHash = dendrite.HashKey(key) 65 | 66 | item, err := q.dt.get(reqItem) 67 | if err != nil { 68 | return nil, err 69 | } 70 | if item == nil { 71 | return nil, nil 72 | } 73 | 74 | return &item.KVItem, nil 75 | } 76 | 77 | // Set writes to dtable. 
78 | func (q *query) Set(key, val []byte) error { 79 | if key == nil || len(key) == 0 { 80 | return fmt.Errorf("key can not be nil or empty") 81 | } 82 | q.qType = qSet 83 | reqItem := new(kvItem) 84 | reqItem.lock = new(sync.Mutex) 85 | 86 | reqItem.Key = make([]byte, len(key)) 87 | copy(reqItem.Key, key) 88 | 89 | if val != nil { 90 | reqItem.Val = make([]byte, len(val)) 91 | copy(reqItem.Val, val) 92 | } 93 | 94 | reqItem.keyHash = dendrite.HashKey(key) 95 | reqItem.timestamp = time.Now() 96 | reqItem.replicaInfo = new(kvReplicaInfo) 97 | reqItem.replicaInfo.vnodes = make([]*dendrite.Vnode, q.dt.ring.Replicas()) 98 | reqItem.replicaInfo.orphan_vnodes = make([]*dendrite.Vnode, 0) 99 | 100 | wait := make(chan error) 101 | succs, err := q.dt.ring.Lookup(1, reqItem.keyHash) 102 | if err != nil { 103 | return err 104 | } 105 | if len(succs) != 1 || succs[0] == nil { 106 | return fmt.Errorf("successor lookup failed for key, %x", reqItem.keyHash) 107 | } 108 | // see if this node is responsible for this key 109 | _, ok := q.dt.table[succs[0].String()] 110 | if ok { 111 | go q.dt.set(succs[0], reqItem, q.minAcks, wait) 112 | } else { 113 | // pass to remote 114 | reqItem.replicaInfo.master = succs[0] 115 | go q.dt.remoteSet(succs[0], succs[0], reqItem, q.minAcks, false, wait) 116 | } 117 | err = <-wait 118 | return err 119 | } 120 | 121 | // GetLocalKeys returns the list of keys that are stored on this node (across all vnodes). 122 | func (q *query) GetLocalKeys() [][]byte { 123 | rv := make([][]byte, 0) 124 | for _, table := range q.dt.table { 125 | for _, item := range table { 126 | copy_key := make([]byte, len(item.Key)) 127 | copy(copy_key, item.Key) 128 | rv = append(rv, copy_key) 129 | } 130 | } 131 | return rv 132 | } 133 | -------------------------------------------------------------------------------- /transport_local.go: -------------------------------------------------------------------------------- 1 | package dendrite 2 | 3 | import ( 4 | "sync" 5 | ) 6 | 7 | // LocalTransport implements Transport interface, but is used for communicating between local vnodes. 8 | type LocalTransport struct { 9 | host string 10 | remote Transport 11 | lock sync.RWMutex 12 | table map[string]*localHandler 13 | } 14 | 15 | // InitLocalTransport initializes LocalTransport. 16 | func InitLocalTransport(remote Transport) Transport { 17 | lt := &LocalTransport{ 18 | remote: remote, 19 | table: make(map[string]*localHandler), 20 | } 21 | return lt 22 | } 23 | 24 | // RegisterHook does nothing in local transport. Just satisfying interface. 25 | func (lt *LocalTransport) RegisterHook(th TransportHook) { 26 | } 27 | 28 | // Decode does nothing in local transport. Just satisfying interface. 29 | func (lt *LocalTransport) Decode(raw []byte) (*ChordMsg, error) { 30 | return nil, nil 31 | } 32 | 33 | // Encode does nothing in local transport. Just satisfying interface. 34 | func (lt *LocalTransport) Encode(msgtype MsgType, data []byte) []byte { 35 | return nil 36 | } 37 | 38 | // Register registers a VnodeHandler within local and remote transports. 
39 | func (lt *LocalTransport) Register(vnode *Vnode, handler VnodeHandler) { 40 | // Register local instance 41 | lt.lock.Lock() 42 | lt.host = vnode.Host 43 | lt.table[vnode.String()] = &localHandler{vnode, handler} 44 | lt.lock.Unlock() 45 | 46 | // Register with remote transport 47 | lt.remote.Register(vnode, handler) 48 | } 49 | 50 | func (lt *LocalTransport) getVnodeHandler(vnode *Vnode) (VnodeHandler, bool) { 51 | lt.lock.Lock() 52 | defer lt.lock.Unlock() 53 | h, ok := lt.table[vnode.String()] 54 | if ok { 55 | return h.handler, ok 56 | } 57 | return nil, ok 58 | } 59 | 60 | // GetVnodeHandler returns registered local vnode handler, if one is found for given vnode. 61 | func (lt *LocalTransport) GetVnodeHandler(vnode *Vnode) (VnodeHandler, bool) { 62 | return lt.getVnodeHandler(vnode) 63 | } 64 | 65 | // FindSuccessors implements Transport's FindSuccessors() in local transport. 66 | func (lt *LocalTransport) FindSuccessors(vn *Vnode, limit int, key []byte) ([]*Vnode, error) { 67 | // Look for it locally 68 | handler, ok := lt.getVnodeHandler(vn) 69 | // If it exists locally, handle it 70 | if ok { 71 | succs, forward_vn, err := handler.FindSuccessors(key, limit) 72 | if err != nil { 73 | return nil, err 74 | } 75 | if forward_vn != nil { 76 | return lt.FindSuccessors(forward_vn, limit, key) 77 | } 78 | return succs, nil 79 | } 80 | 81 | // Pass onto remote 82 | return lt.remote.FindSuccessors(vn, limit, key) 83 | } 84 | 85 | // ListVnodes implements Transport's ListVnodes() in local transport. 86 | func (lt *LocalTransport) ListVnodes(host string) ([]*Vnode, error) { 87 | // Check if this is a local host 88 | if host == lt.host { 89 | // Generate all the local clients 90 | res := make([]*Vnode, 0, len(lt.table)) 91 | 92 | // Build list 93 | lt.lock.RLock() 94 | for _, v := range lt.table { 95 | res = append(res, v.vn) 96 | } 97 | lt.lock.RUnlock() 98 | 99 | return res, nil 100 | } 101 | 102 | // Pass onto remote 103 | return lt.remote.ListVnodes(host) 104 | } 105 | 106 | // Ping implements Transport's Ping() in local transport. 107 | func (lt *LocalTransport) Ping(vn *Vnode) (bool, error) { 108 | // Look for it locally 109 | _, ok := lt.getVnodeHandler(vn) 110 | if ok { 111 | return true, nil 112 | } 113 | // ping remote 114 | return lt.remote.Ping(vn) 115 | } 116 | 117 | // GetPredecessor implements Transport's GetPredecessor() in local transport. 118 | func (lt *LocalTransport) GetPredecessor(vn *Vnode) (*Vnode, error) { 119 | local_vn, ok := lt.getVnodeHandler(vn) 120 | if ok { 121 | return local_vn.GetPredecessor() 122 | } 123 | return lt.remote.GetPredecessor(vn) 124 | } 125 | 126 | // Notify implements Transport's Notify() in local transport. 127 | func (lt *LocalTransport) Notify(dest, self *Vnode) ([]*Vnode, error) { 128 | // Look for it locally 129 | handler, ok := lt.getVnodeHandler(dest) 130 | 131 | // If it exists locally, handle it 132 | if ok { 133 | return handler.Notify(self) 134 | } 135 | 136 | // Pass onto remote 137 | return lt.remote.Notify(dest, self) 138 | } 139 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dendrite 2 | 3 | Dendrite is a Go package that implements distributed hash table (DHT) based on Chord Protocol. 
4 | Included sub-package 'dtable' is built on top of dendrite and implements 5 | distributed in-memory key/value database, with replication and failover support, 6 | with query interface to Get() or Set() items with different consistency levels. 7 | 8 | For better key distribution, dendrite allows configurable number of virtual nodes 9 | per instance (vnodes). The number of replicas in dtable is also configurable. 10 | 11 | Calling application can bootstrap the cluster, or join existing one by connecting to any of 12 | existing nodes (must be manually specified). Node discovery is not part of the implementation. 13 | Use consul (consul.io) or something else for that purpose. 14 | 15 | Chord protocol defines ring stabilization. In dendrite, stabilization period is configurable. 16 | 17 | Node to node (network) communication is built on top of ZeroMQ sockets over TCP for speed, clustering 18 | and reliability. Dendrite starts configurable number of goroutines (default: 10) for load balanced 19 | serving of remote requests, but scales that number up and down depending on the load (aka prefork model). 20 | 21 | All messages sent through dendrite are encapsulated in ChordMsg structure, where first byte indicates message type, 22 | and actual data follows. Data part is serialized with protocol buffers. 23 | 24 | Dendrite can be extended through two interfaces: 25 | - TransportHook 26 | - DelegateHook 27 | 28 | TransportHook allows other packages to provide additional message types, decoders and handlers, while DelegateHook 29 | can be used to capture chord events that dendrite emits: 30 | - EvPredecessorJoined 31 | - EvPredecessorLeft 32 | - EvReplicasChanged 33 | 34 | 35 | ## Documentation 36 | - http://godoc.org/github.com/fastfn/dendrite 37 | - http://godoc.org/github.com/fastfn/dendrite/dtable 38 | 39 | 40 | ## Usage 41 | ``` 42 | import "github.com/fastfn/dendrite" 43 | import "github.com/fastfn/dendrite/dtable" 44 | ... 45 | // Initialize ZMQTransport with timeout set to 5 seconds 46 | transport, err := dendrite.InitZMQTransport("127.0.0.1:5000", 30*time.Second, nil) 47 | if err != nil { 48 | panic(err) 49 | return 50 | } 51 | config := dendrite.DefaultConfig("127.0.0.1:5000") 52 | ``` 53 | 54 | ### Bootstrap the cluster (first node) 55 | ``` 56 | // Start new cluster 57 | ring, err = dendrite.CreateRing(config, transport) 58 | if err != nil { 59 | panic(err) 60 | } 61 | table = dtable.Init(ring, transport, dtable.LogInfo) 62 | ``` 63 | 64 | ### Joining the cluster 65 | ``` 66 | // We join the cluster by providing the address of one of existing nodes in the cluster. 67 | ring, err = dendrite.JoinRing(config, transport, "192.168.0.50:5000") 68 | if err != nil { 69 | panic(err) 70 | } 71 | table = dtable.Init(ring, transport, dtable.LogInfo) 72 | ``` 73 | ### DTable Query examples 74 | #### Set() 75 | ``` 76 | query := table.NewQuery() 77 | err := query.Set([]byte("testkey"), []byte("testvalue")) 78 | if err != nil { 79 | panic(err) 80 | } 81 | ``` 82 | #### Set() with consistency 83 | Consistency() is used prior to Set() to request minimum writes before operation returns success. 84 | If dtable runs with 2 replicas, user may request 2 writes (primary + 1 replica) and let dtable 85 | handle final write in the background. If requested value is larger than configured dendrite replicas, 86 | value is reset to default. Default is 1. 
87 | ``` 88 | query := table.NewQuery() 89 | err := query.Consistency(2).Set([]byte("testkey"), []byte("testvalue")) 90 | if err != nil { 91 | panic(err) 92 | } 93 | ``` 94 | #### Get() 95 | ``` 96 | query := table.NewQuery() 97 | item, err := query.Get([]byte("testkey")) 98 | if err != nil { 99 | log.Println("Got error in table Get: ", err) 100 | } else if item == nil { 101 | log.Printf("item not found") 102 | } else { 103 | log.Printf("Value is: %s\n", string(item.Val)) 104 | } 105 | ``` 106 | #### GetLocalKeys() 107 | GetLocalKeys() returns the list of all keys stored on local node. 108 | ``` 109 | query := table.NewQuery() 110 | for _, key := range query.GetLocalKeys() { 111 | log.Printf("Key: %s\n", string(key)) 112 | } 113 | ``` 114 | 115 | ## Todo 116 | - dtable: support SetMulti() and GetMulti() on public interface 117 | - dtable: support batches on replication/migration ops 118 | - dendrite: add some kind of security for inter communication between nodes 119 | -------------------------------------------------------------------------------- /dtable/README.md: -------------------------------------------------------------------------------- 1 | # Dendrite 2 | 3 | Dendrite is a Go package that implements distributed hash table (DHT) based on Chord Protocol. 4 | Included sub-package 'dtable' is built on top of dendrite and implements 5 | distributed in-memory key/value database, with replication and failover support, 6 | with query interface to Get() or Set() items with different consistency levels. 7 | 8 | For better key distribution, dendrite allows configurable number of virtual nodes 9 | per instance (vnodes). The number of replicas in dtable is also configurable. 10 | 11 | Calling application can bootstrap the cluster, or join existing one by connecting to any of 12 | existing nodes (must be manually specified). Node discovery is not part of the implementation. 13 | Use consul (consul.io) or something else for that purpose. 14 | 15 | Chord protocol defines ring stabilization. In dendrite, stabilization period is configurable. 16 | 17 | Node to node (network) communication is built on top of ZeroMQ sockets over TCP for speed, clustering 18 | and reliability. Dendrite starts configurable number of goroutines (default: 10) for load balanced 19 | serving of remote requests, but scales that number up and down depending on the load (aka prefork model). 20 | 21 | All messages sent through dendrite are encapsulated in ChordMsg structure, where first byte indicates message type, 22 | and actual data follows. Data part is serialized with protocol buffers. 23 | 24 | Dendrite can be extended through two interfaces: 25 | - TransportHook 26 | - DelegateHook 27 | 28 | TransportHook allows other packages to provide additional message types, decoders and handlers, while DelegateHook 29 | can be used to capture chord events that dendrite emits: 30 | - EvPredecessorJoined 31 | - EvPredecessorLeft 32 | - EvReplicasChanged 33 | 34 | 35 | ## Documentation 36 | - http://godoc.org/github.com/fastfn/dendrite 37 | - http://godoc.org/github.com/fastfn/dendrite/dtable 38 | 39 | 40 | ## Usage 41 | ``` 42 | import "github.com/fastfn/dendrite" 43 | import "github.com/fastfn/dendrite/dtable" 44 | ... 
45 | // Initialize ZMQTransport with timeout set to 5 seconds 46 | transport, err := dendrite.InitZMQTransport("127.0.0.1:5000", 5*time.Second) 47 | if err != nil { 48 | panic(err) 49 | return 50 | } 51 | config := dendrite.DefaultConfig("127.0.0.1:5000") 52 | ``` 53 | 54 | ### Bootstrap the cluster (first node) 55 | ``` 56 | // Start new cluster 57 | ring, err = dendrite.CreateRing(config, transport) 58 | if err != nil { 59 | panic(err) 60 | } 61 | table = dtable.Init(ring, transport, dtable.LogInfo) 62 | ``` 63 | 64 | ### Joining the cluster 65 | ``` 66 | // We join the cluster by providing the address of one of existing nodes in the cluster. 67 | ring, err = dendrite.JoinRing(config, transport, "192.168.0.50:5000") 68 | if err != nil { 69 | panic(err) 70 | } 71 | table = dtable.Init(ring, transport, dtable.LogInfo) 72 | ``` 73 | ### DTable Query examples 74 | #### Set() 75 | ``` 76 | query := table.NewQuery() 77 | err := query.Set([]byte("testkey"), []byte("testvalue")) 78 | if err != nil { 79 | panic(err) 80 | } 81 | ``` 82 | #### Set() with consistency 83 | Consistency() is used prior to Set() to request minimum writes before operation returns success. 84 | If dtable runs with 2 replicas, user may request 2 writes (primary + 1 replica) and let dtable 85 | handle final write in the background. If requested value is larger than configured dendrite replicas, 86 | value is reset to default. Default is 1. 87 | ``` 88 | query := table.NewQuery() 89 | err := query.Consistency(2).Set([]byte("testkey"), []byte("testvalue")) 90 | if err != nil { 91 | panic(err) 92 | } 93 | ``` 94 | #### Get() 95 | ``` 96 | query := table.NewQuery() 97 | item, err := query.Get([]byte("testkey")) 98 | if err != nil { 99 | log.Println("Got error in table Get: ", err) 100 | } else if item == nil { 101 | log.Printf("item not found") 102 | } else { 103 | log.Printf("Value is: %s\n", string(item.Val)) 104 | } 105 | ``` 106 | #### GetLocalKeys() 107 | GetLocalKeys() returns the list of all keys stored on local node. 108 | ``` 109 | query := table.NewQuery() 110 | for _, key := range query.GetLocalKeys() { 111 | log.Printf("Key: %s\n", string(key)) 112 | } 113 | ``` 114 | 115 | ## Todo 116 | - dtable: support SetMulti() and GetMulti() on public interface 117 | - dtable: support batches on replication/migration ops 118 | - dendrite: add some kind of security for inter communication between nodes 119 | -------------------------------------------------------------------------------- /vnode_handlers.go: -------------------------------------------------------------------------------- 1 | package dendrite 2 | 3 | import ( 4 | "bytes" 5 | ) 6 | 7 | /* 8 | VnodeHandler interface defines methods (from Transport interface) that are to be called in vnode context. 9 | Transports use this interface to avoid duplicate implementations. localVnode implements this interface. 10 | */ 11 | type VnodeHandler interface { 12 | FindSuccessors([]byte, int) ([]*Vnode, *Vnode, error) // args: key, limit # returns: succs, forward, error 13 | FindRemoteSuccessors(int) ([]*Vnode, error) 14 | GetPredecessor() (*Vnode, error) 15 | Notify(*Vnode) ([]*Vnode, error) 16 | } 17 | 18 | // localHandler is a handler object connecting a VnodeHandler and Vnode. 19 | type localHandler struct { 20 | vn *Vnode 21 | handler VnodeHandler 22 | } 23 | 24 | // FindSuccessors implements Transport's FindSuccessors() in vnode context. 
25 | func (vn *localVnode) FindSuccessors(key []byte, limit int) ([]*Vnode, *Vnode, error) { 26 | // check if we have direct successor for requested key 27 | succs := make([]*Vnode, 0) 28 | max_vnodes := min(limit, len(vn.successors)) 29 | if bytes.Compare(key, vn.Id) == 0 || between(vn.Id, vn.successors[0].Id, key, true) { 30 | for i := 0; i < max_vnodes; i++ { 31 | if vn.successors[i] == nil { 32 | continue 33 | } 34 | succs = append(succs, &Vnode{ 35 | Id: vn.successors[i].Id, 36 | Host: vn.successors[i].Host, 37 | }) 38 | } 39 | return succs, nil, nil 40 | } 41 | 42 | // if finger table has been initialized - forward request to closest finger 43 | // otherwise forward to my successor 44 | 45 | forward_vn := vn.closest_preceeding_finger(key) 46 | 47 | // if we got ourselves back, that's it - I'm the successor 48 | if bytes.Compare(forward_vn.Id, vn.Id) == 0 { 49 | succs = append(succs, &Vnode{ 50 | Id: vn.Id, 51 | Host: vn.Host, 52 | }) 53 | for i := 1; i < max_vnodes; i++ { 54 | if vn.successors[i-1] == nil { 55 | break 56 | } 57 | succs = append(succs, vn.successors[i-1]) 58 | } 59 | return succs, nil, nil 60 | } 61 | //log.Printf("findsuccessor (%X) forwarding to %X\n", vn.Id, forward_vn.Id) 62 | return nil, forward_vn, nil 63 | } 64 | 65 | // GetPredecessor implements Transport's GetPredecessor() in vnode context. 66 | func (vn *localVnode) GetPredecessor() (*Vnode, error) { 67 | if vn.predecessor == nil { 68 | return nil, nil 69 | } 70 | return vn.predecessor, nil 71 | } 72 | 73 | // Notify is invoked when a Vnode gets notified. 74 | func (vn *localVnode) Notify(maybe_pred *Vnode) ([]*Vnode, error) { 75 | // Check if we should update our predecessor 76 | if vn.predecessor == nil || between(vn.predecessor.Id, vn.Id, maybe_pred.Id, false) { 77 | var real_pred *Vnode 78 | 79 | if vn.predecessor == nil { 80 | if vn.old_predecessor != nil { 81 | // need to check against old predecessor here 82 | real_pred = vn.old_predecessor 83 | } else { 84 | real_pred = vn.predecessor 85 | } 86 | } else { 87 | real_pred = vn.predecessor 88 | } 89 | 90 | // before emiting anything, lets update our remotes 91 | vn.updateRemoteSuccessors() 92 | 93 | if real_pred == nil || between(real_pred.Id, vn.Id, maybe_pred.Id, false) { 94 | ctx := &EventCtx{ 95 | EvType: EvPredecessorJoined, 96 | Target: &vn.Vnode, 97 | PrimaryItem: maybe_pred, 98 | SecondaryItem: vn.old_predecessor, 99 | } 100 | vn.ring.emit(ctx) 101 | } else { 102 | ctx := &EventCtx{ 103 | EvType: EvPredecessorLeft, 104 | Target: &vn.Vnode, 105 | PrimaryItem: maybe_pred, 106 | SecondaryItem: vn.old_predecessor, 107 | } 108 | vn.ring.emit(ctx) 109 | } 110 | 111 | // maybe we're just joining and one of our local vnodes is closer to us than this predecessor 112 | vn.ring.Logf(LogInfo, "vn.Notify() - setting new predecessor for %x: %x\n", vn.Id, maybe_pred.Id) 113 | vn.predecessor = maybe_pred 114 | } 115 | 116 | // Return our successors list 117 | return vn.successors, nil 118 | } 119 | 120 | // FindRemoteSuccessors returns up to 'limit' successor vnodes, 121 | // that are unique and do not reside on same physical node as vnode. 
122 | func (vn *localVnode) FindRemoteSuccessors(limit int) ([]*Vnode, error) { 123 | remote_succs := make([]*Vnode, 0) 124 | for _, succ := range vn.remote_successors { 125 | if succ == nil { 126 | continue 127 | } 128 | remote_succs = append(remote_succs, succ) 129 | } 130 | return remote_succs, nil 131 | } 132 | -------------------------------------------------------------------------------- /transport_zmq_handler.go: -------------------------------------------------------------------------------- 1 | package dendrite 2 | 3 | import ( 4 | "github.com/golang/protobuf/proto" 5 | ) 6 | 7 | func (transport *ZMQTransport) zmq_ping_handler(request *ChordMsg, w chan *ChordMsg) { 8 | pbPongMsg := &PBProtoPing{ 9 | Version: proto.Int64(1), 10 | } 11 | pbPong, _ := proto.Marshal(pbPongMsg) 12 | pong := &ChordMsg{ 13 | Type: PbPing, 14 | Data: pbPong, 15 | } 16 | w <- pong 17 | } 18 | 19 | func (transport *ZMQTransport) zmq_listVnodes_handler(request *ChordMsg, w chan *ChordMsg) { 20 | pblist := new(PBProtoListVnodesResp) 21 | for _, handler := range transport.table { 22 | h, _ := transport.getVnodeHandler(handler.vn) 23 | local_vn := h.(*localVnode) 24 | for _, vnode := range local_vn.ring.vnodes { 25 | pblist.Vnodes = append(pblist.Vnodes, vnode.ToProtobuf()) 26 | } 27 | break 28 | } 29 | pbdata, err := proto.Marshal(pblist) 30 | if err != nil { 31 | errorMsg := transport.newErrorMsg("ZMQ::ListVnodesHandler - failed to marshal response - " + err.Error()) 32 | w <- errorMsg 33 | return 34 | } 35 | w <- &ChordMsg{ 36 | Type: PbListVnodesResp, 37 | Data: pbdata, 38 | } 39 | return 40 | } 41 | 42 | func (transport *ZMQTransport) zmq_find_successors_handler(request *ChordMsg, w chan *ChordMsg) { 43 | pbMsg := request.TransportMsg.(PBProtoFindSuccessors) 44 | key := pbMsg.GetKey() 45 | dest := VnodeFromProtobuf(pbMsg.GetDest()) 46 | 47 | // make sure destination vnode exists locally 48 | local_vn, err := transport.getVnodeHandler(dest) 49 | if err != nil { 50 | errorMsg := transport.newErrorMsg("ZMQ::FindSuccessorsHandler - " + err.Error()) 51 | w <- errorMsg 52 | return 53 | } 54 | succs, forward_vn, err := local_vn.FindSuccessors(key, int(pbMsg.GetLimit())) 55 | if err != nil { 56 | errorMsg := transport.newErrorMsg("ZMQ::FindSuccessorsHandler - " + err.Error()) 57 | w <- errorMsg 58 | return 59 | } 60 | 61 | // if forward_vn is not set, return the list 62 | if forward_vn == nil { 63 | pblist := new(PBProtoListVnodesResp) 64 | for _, s := range succs { 65 | pblist.Vnodes = append(pblist.Vnodes, s.ToProtobuf()) 66 | } 67 | pbdata, err := proto.Marshal(pblist) 68 | if err != nil { 69 | errorMsg := transport.newErrorMsg("ZMQ::FindSuccessorsHandler - failed to marshal response - " + err.Error()) 70 | w <- errorMsg 71 | return 72 | } 73 | w <- &ChordMsg{ 74 | Type: PbListVnodesResp, 75 | Data: pbdata, 76 | } 77 | return 78 | } 79 | // send forward response 80 | pbfwd := &PBProtoForward{Vnode: forward_vn.ToProtobuf()} 81 | pbdata, err := proto.Marshal(pbfwd) 82 | if err != nil { 83 | errorMsg := transport.newErrorMsg("ZMQ::FindSuccessorsHandler - failed to marshal forward response - " + err.Error()) 84 | w <- errorMsg 85 | return 86 | } 87 | w <- &ChordMsg{ 88 | Type: PbForward, 89 | Data: pbdata, 90 | } 91 | } 92 | 93 | func (transport *ZMQTransport) zmq_get_predecessor_handler(request *ChordMsg, w chan *ChordMsg) { 94 | pbMsg := request.TransportMsg.(PBProtoGetPredecessor) 95 | dest := VnodeFromProtobuf(pbMsg.GetDest()) 96 | 97 | // make sure destination vnode exists locally 98 | local_vn, err := 
transport.getVnodeHandler(dest) 99 | if err != nil { 100 | errorMsg := transport.newErrorMsg("ZMQ::GetPredecessorHandler - " + err.Error()) 101 | w <- errorMsg 102 | return 103 | } 104 | 105 | pred, err := local_vn.GetPredecessor() 106 | if err != nil { 107 | errorMsg := transport.newErrorMsg("ZMQ::GetPredecessorHandler - " + err.Error()) 108 | w <- errorMsg 109 | return 110 | } 111 | pbpred := &PBProtoVnode{} 112 | if pred != nil { 113 | pbpred.Id = pred.Id 114 | pbpred.Host = proto.String(pred.Host) 115 | } 116 | pbdata, err := proto.Marshal(pbpred) 117 | if err != nil { 118 | errorMsg := transport.newErrorMsg("ZMQ::GetPredecessorHandler - Failed to marshal response - " + err.Error()) 119 | w <- errorMsg 120 | return 121 | } 122 | 123 | w <- &ChordMsg{ 124 | Type: PbProtoVnode, 125 | Data: pbdata, 126 | } 127 | 128 | } 129 | 130 | // handle Notify() request 131 | func (transport *ZMQTransport) zmq_notify_handler(request *ChordMsg, w chan *ChordMsg) { 132 | pbMsg := request.TransportMsg.(PBProtoNotify) 133 | dest := VnodeFromProtobuf(pbMsg.GetDest()) 134 | 135 | // make sure destination vnode exists locally 136 | local_vn, err := transport.getVnodeHandler(dest) 137 | if err != nil { 138 | errorMsg := transport.newErrorMsg("ZMQ::NotifyHandler - " + err.Error()) 139 | w <- errorMsg 140 | return 141 | } 142 | pred := VnodeFromProtobuf(pbMsg.GetVnode()) 143 | succ_list, err := local_vn.Notify(pred) 144 | if err != nil { 145 | errorMsg := transport.newErrorMsg("ZMQ::NotifyHandler - " + err.Error()) 146 | w <- errorMsg 147 | return 148 | } 149 | pblist := new(PBProtoListVnodesResp) 150 | for _, succ := range succ_list { 151 | if succ == nil { 152 | break 153 | } 154 | pblist.Vnodes = append(pblist.Vnodes, succ.ToProtobuf()) 155 | } 156 | 157 | pbdata, err := proto.Marshal(pblist) 158 | if err != nil { 159 | errorMsg := transport.newErrorMsg("ZMQ::Notify - Failed to marshal response - " + err.Error()) 160 | w <- errorMsg 161 | return 162 | } 163 | w <- &ChordMsg{ 164 | Type: PbListVnodesResp, 165 | Data: pbdata, 166 | } 167 | } 168 | 169 | func (transport *ZMQTransport) zmq_leave_handler(request *ChordMsg, w chan *ChordMsg) { 170 | 171 | } 172 | func (transport *ZMQTransport) zmq_error_handler(request *ChordMsg, w chan *ChordMsg) { 173 | 174 | } 175 | -------------------------------------------------------------------------------- /dtable/dtable_util.go: -------------------------------------------------------------------------------- 1 | package dtable 2 | 3 | import ( 4 | "fmt" 5 | "github.com/fastfn/dendrite" 6 | "github.com/golang/protobuf/proto" 7 | "log" 8 | "time" 9 | ) 10 | 11 | func (item *kvItem) keyHashString() string { 12 | if item.keyHash == nil { 13 | item.keyHash = dendrite.HashKey(item.Key) 14 | } 15 | return fmt.Sprintf("%x", item.keyHash) 16 | } 17 | 18 | func (rinfo *kvReplicaInfo) to_protobuf() *PBDTableReplicaInfo { 19 | pb_master := rinfo.master.ToProtobuf() 20 | pb_vnodes := make([]*dendrite.PBProtoVnode, 0) 21 | for _, rvn := range rinfo.vnodes { 22 | if rvn == nil { 23 | continue 24 | } 25 | pb_vnodes = append(pb_vnodes, rvn.ToProtobuf()) 26 | } 27 | pb_orphanVnodes := make([]*dendrite.PBProtoVnode, 0) 28 | for _, ovn := range rinfo.orphan_vnodes { 29 | pb_orphanVnodes = append(pb_orphanVnodes, ovn.ToProtobuf()) 30 | } 31 | return &PBDTableReplicaInfo{ 32 | Master: pb_master, 33 | Vnodes: pb_vnodes, 34 | OrphanVnodes: pb_orphanVnodes, 35 | State: proto.Int32(int32(rinfo.state)), 36 | Depth: proto.Int32(int32(rinfo.depth)), 37 | } 38 | } 39 | 40 | func 
replicaInfo_from_protobuf(pb *PBDTableReplicaInfo) *kvReplicaInfo { 41 | if pb == nil { 42 | return nil 43 | } 44 | rInfo := new(kvReplicaInfo) 45 | rInfo.master = dendrite.VnodeFromProtobuf(pb.GetMaster()) 46 | rInfo.vnodes = make([]*dendrite.Vnode, 0) 47 | for _, pb_vnode := range pb.GetVnodes() { 48 | rInfo.vnodes = append(rInfo.vnodes, dendrite.VnodeFromProtobuf(pb_vnode)) 49 | } 50 | rInfo.orphan_vnodes = make([]*dendrite.Vnode, 0) 51 | for _, pb_orphanVnode := range pb.GetOrphanVnodes() { 52 | rInfo.orphan_vnodes = append(rInfo.orphan_vnodes, dendrite.VnodeFromProtobuf(pb_orphanVnode)) 53 | } 54 | rInfo.state = replicaState(int(pb.GetState())) 55 | rInfo.depth = int(pb.GetDepth()) 56 | return rInfo 57 | } 58 | 59 | func (item *kvItem) to_protobuf() *PBDTableItem { 60 | rv := &PBDTableItem{ 61 | Key: item.Key, 62 | Val: item.Val, 63 | Timestamp: proto.Int64(item.timestamp.UnixNano()), 64 | KeyHash: item.keyHash, 65 | Commited: proto.Bool(item.commited), 66 | } 67 | if item.replicaInfo != nil { 68 | rv.ReplicaInfo = item.replicaInfo.to_protobuf() 69 | } 70 | return rv 71 | } 72 | 73 | func (item *kvItem) from_protobuf(pb *PBDTableItem) { 74 | item.Key = pb.GetKey() 75 | item.Val = pb.GetVal() 76 | item.timestamp = time.Unix(0, pb.GetTimestamp()) 77 | item.keyHash = pb.GetKeyHash() 78 | item.commited = pb.GetCommited() 79 | item.replicaInfo = replicaInfo_from_protobuf(pb.GetReplicaInfo()) 80 | } 81 | 82 | func (item *kvItem) to_demoted(new_master *dendrite.Vnode) *demotedKvItem { 83 | rv := new(demotedKvItem) 84 | rv.item = item.dup() 85 | rv.new_master = new_master 86 | rv.demoted_ts = time.Now() 87 | return rv 88 | } 89 | 90 | func (item *kvItem) numActiveReplicas() int { 91 | if item.replicaInfo == nil { 92 | return 0 93 | } 94 | rv := 0 95 | for _, r := range item.replicaInfo.vnodes { 96 | if r != nil { 97 | rv++ 98 | } 99 | } 100 | return rv 101 | } 102 | 103 | func (item *kvItem) dup() *kvItem { 104 | new_item := new(kvItem) 105 | new_item.timestamp = item.timestamp 106 | new_item.commited = item.commited 107 | new_item.lock = item.lock 108 | 109 | new_item.Key = make([]byte, len(item.Key)) 110 | copy(new_item.Key, item.Key) 111 | 112 | new_item.Val = make([]byte, len(item.Val)) 113 | copy(new_item.Val, item.Val) 114 | 115 | new_item.keyHash = make([]byte, len(item.keyHash)) 116 | copy(new_item.keyHash, item.keyHash) 117 | 118 | if item.replicaInfo != nil { 119 | new_item.replicaInfo = new(kvReplicaInfo) 120 | new_item.replicaInfo.master = item.replicaInfo.master 121 | new_item.replicaInfo.vnodes = make([]*dendrite.Vnode, len(item.replicaInfo.vnodes)) 122 | new_item.replicaInfo.orphan_vnodes = make([]*dendrite.Vnode, len(item.replicaInfo.orphan_vnodes)) 123 | copy(new_item.replicaInfo.vnodes, item.replicaInfo.vnodes) 124 | copy(new_item.replicaInfo.orphan_vnodes, item.replicaInfo.orphan_vnodes) 125 | new_item.replicaInfo.state = item.replicaInfo.state 126 | new_item.replicaInfo.depth = item.replicaInfo.depth 127 | } else { 128 | new_item.replicaInfo = nil 129 | } 130 | return new_item 131 | } 132 | 133 | type LogLevel int 134 | 135 | const ( 136 | LogNull LogLevel = 0 137 | LogInfo LogLevel = 1 138 | LogDebug LogLevel = 2 139 | ) 140 | 141 | // Logf wraps log.Printf with additional LogLevel. 
142 | func (dt *DTable) Logf(level LogLevel, format string, v ...interface{}) { 143 | if level == LogNull { 144 | return 145 | } 146 | var new_format string 147 | if level == LogInfo { 148 | new_format = "[DTABLE][INFO] " + format 149 | } else if level == LogDebug { 150 | new_format = "[DTABLE][DEBUG] " + format 151 | } 152 | 153 | if dt.confLogLevel == LogDebug { 154 | if dt.ring.Logger != nil { 155 | dt.ring.Logger.Printf(new_format, v...) 156 | } else { 157 | log.Printf(new_format, v...) 158 | } 159 | } else if dt.confLogLevel == LogInfo && level == LogInfo { 160 | if dt.ring.Logger != nil { 161 | dt.ring.Logger.Printf(new_format, v...) 162 | } else { 163 | log.Printf(new_format, v...) 164 | } 165 | } 166 | } 167 | 168 | // Logln wraps log.Println with additional LogLevel. 169 | func (dt *DTable) Logln(level LogLevel, v ...interface{}) { 170 | if level == LogNull { 171 | return 172 | } 173 | 174 | var new_format string 175 | if level == LogInfo { 176 | new_format = "[DTABLE][INFO]" 177 | } else if level == LogDebug { 178 | new_format = "[DTABLE][DEBUG]" 179 | } 180 | if dt.confLogLevel == LogDebug { 181 | v = append([]interface{}{new_format}, v...) 182 | if dt.ring.Logger != nil { 183 | dt.ring.Logger.Println(v...) 184 | } else { 185 | log.Println(v...) 186 | } 187 | 188 | } else if dt.confLogLevel == LogInfo && level == LogInfo { 189 | v = append([]interface{}{new_format}, v...) 190 | if dt.ring.Logger != nil { 191 | dt.ring.Logger.Println(v...) 192 | } else { 193 | log.Println(v...) 194 | } 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /chord.pb.go: -------------------------------------------------------------------------------- 1 | package dendrite 2 | 3 | import proto "github.com/golang/protobuf/proto" 4 | import math "math" 5 | 6 | // Reference imports to suppress errors if they are not otherwise used. 7 | var _ = proto.Marshal 8 | var _ = math.Inf 9 | 10 | // PBProtoVnode represents Vnode structure. 11 | type PBProtoVnode struct { 12 | Id []byte `protobuf:"bytes,1,req,name=id" json:"id,omitempty"` 13 | Host *string `protobuf:"bytes,2,req,name=host" json:"host,omitempty"` 14 | XXX_unrecognized []byte `json:"-"` 15 | } 16 | 17 | func (m *PBProtoVnode) Reset() { *m = PBProtoVnode{} } 18 | func (m *PBProtoVnode) String() string { return proto.CompactTextString(m) } 19 | func (*PBProtoVnode) ProtoMessage() {} 20 | 21 | func (m *PBProtoVnode) GetId() []byte { 22 | if m != nil { 23 | return m.Id 24 | } 25 | return nil 26 | } 27 | 28 | func (m *PBProtoVnode) GetHost() string { 29 | if m != nil && m.Host != nil { 30 | return *m.Host 31 | } 32 | return "" 33 | } 34 | 35 | // PBProtoPing is simple structure for pinging remote vnodes. 36 | type PBProtoPing struct { 37 | Version *int64 `protobuf:"varint,1,req,name=version" json:"version,omitempty"` 38 | XXX_unrecognized []byte `json:"-"` 39 | } 40 | 41 | func (m *PBProtoPing) Reset() { *m = PBProtoPing{} } 42 | func (m *PBProtoPing) String() string { return proto.CompactTextString(m) } 43 | func (*PBProtoPing) ProtoMessage() {} 44 | 45 | func (m *PBProtoPing) GetVersion() int64 { 46 | if m != nil && m.Version != nil { 47 | return *m.Version 48 | } 49 | return 0 50 | } 51 | 52 | // PBProtoAck is generic response message with boolean 'ok' state. 
53 | type PBProtoAck struct { 54 | Version *int64 `protobuf:"varint,1,req,name=version" json:"version,omitempty"` 55 | Ok *bool `protobuf:"varint,2,req,name=ok" json:"ok,omitempty"` 56 | XXX_unrecognized []byte `json:"-"` 57 | } 58 | 59 | func (m *PBProtoAck) Reset() { *m = PBProtoAck{} } 60 | func (m *PBProtoAck) String() string { return proto.CompactTextString(m) } 61 | func (*PBProtoAck) ProtoMessage() {} 62 | 63 | func (m *PBProtoAck) GetVersion() int64 { 64 | if m != nil && m.Version != nil { 65 | return *m.Version 66 | } 67 | return 0 68 | } 69 | 70 | func (m *PBProtoAck) GetOk() bool { 71 | if m != nil && m.Ok != nil { 72 | return *m.Ok 73 | } 74 | return false 75 | } 76 | 77 | // PBProtoErr defines error message. 78 | type PBProtoErr struct { 79 | Error *string `protobuf:"bytes,2,req,name=error" json:"error,omitempty"` 80 | XXX_unrecognized []byte `json:"-"` 81 | } 82 | 83 | func (m *PBProtoErr) Reset() { *m = PBProtoErr{} } 84 | func (m *PBProtoErr) String() string { return proto.CompactTextString(m) } 85 | func (*PBProtoErr) ProtoMessage() {} 86 | 87 | func (m *PBProtoErr) GetError() string { 88 | if m != nil && m.Error != nil { 89 | return *m.Error 90 | } 91 | return "" 92 | } 93 | 94 | // PBProtoForward is sent to caller if request should be forwarded to another vnode. 95 | type PBProtoForward struct { 96 | Vnode *PBProtoVnode `protobuf:"bytes,1,req,name=vnode" json:"vnode,omitempty"` 97 | XXX_unrecognized []byte `json:"-"` 98 | } 99 | 100 | func (m *PBProtoForward) Reset() { *m = PBProtoForward{} } 101 | func (m *PBProtoForward) String() string { return proto.CompactTextString(m) } 102 | func (*PBProtoForward) ProtoMessage() {} 103 | 104 | func (m *PBProtoForward) GetVnode() *PBProtoVnode { 105 | if m != nil { 106 | return m.Vnode 107 | } 108 | return nil 109 | } 110 | 111 | // PBProtoLeave (not used) 112 | type PBProtoLeave struct { 113 | Source *PBProtoVnode `protobuf:"bytes,1,req,name=source" json:"source,omitempty"` 114 | Dest *PBProtoVnode `protobuf:"bytes,2,req,name=dest" json:"dest,omitempty"` 115 | XXX_unrecognized []byte `json:"-"` 116 | } 117 | 118 | func (m *PBProtoLeave) Reset() { *m = PBProtoLeave{} } 119 | func (m *PBProtoLeave) String() string { return proto.CompactTextString(m) } 120 | func (*PBProtoLeave) ProtoMessage() {} 121 | 122 | func (m *PBProtoLeave) GetSource() *PBProtoVnode { 123 | if m != nil { 124 | return m.Source 125 | } 126 | return nil 127 | } 128 | 129 | func (m *PBProtoLeave) GetDest() *PBProtoVnode { 130 | if m != nil { 131 | return m.Dest 132 | } 133 | return nil 134 | } 135 | 136 | // PBProtoListVnodes - request the list of vnodes from remote vnode. 137 | type PBProtoListVnodes struct { 138 | XXX_unrecognized []byte `json:"-"` 139 | } 140 | 141 | func (m *PBProtoListVnodes) Reset() { *m = PBProtoListVnodes{} } 142 | func (m *PBProtoListVnodes) String() string { return proto.CompactTextString(m) } 143 | func (*PBProtoListVnodes) ProtoMessage() {} 144 | 145 | // PBProtoListVnodesResp is a structure for returning multiple vnodes to a caller. 
146 | type PBProtoListVnodesResp struct { 147 | Vnodes []*PBProtoVnode `protobuf:"bytes,1,rep,name=vnodes" json:"vnodes,omitempty"` 148 | XXX_unrecognized []byte `json:"-"` 149 | } 150 | 151 | func (m *PBProtoListVnodesResp) Reset() { *m = PBProtoListVnodesResp{} } 152 | func (m *PBProtoListVnodesResp) String() string { return proto.CompactTextString(m) } 153 | func (*PBProtoListVnodesResp) ProtoMessage() {} 154 | 155 | func (m *PBProtoListVnodesResp) GetVnodes() []*PBProtoVnode { 156 | if m != nil { 157 | return m.Vnodes 158 | } 159 | return nil 160 | } 161 | 162 | // PBProtoFindSuccessors is a structure to request successors for a key. 163 | type PBProtoFindSuccessors struct { 164 | Key []byte `protobuf:"bytes,1,req,name=key" json:"key,omitempty"` 165 | Dest *PBProtoVnode `protobuf:"bytes,2,req,name=dest" json:"dest,omitempty"` 166 | Limit *int32 `protobuf:"varint,3,opt,name=limit" json:"limit,omitempty"` 167 | XXX_unrecognized []byte `json:"-"` 168 | } 169 | 170 | func (m *PBProtoFindSuccessors) Reset() { *m = PBProtoFindSuccessors{} } 171 | func (m *PBProtoFindSuccessors) String() string { return proto.CompactTextString(m) } 172 | func (*PBProtoFindSuccessors) ProtoMessage() {} 173 | 174 | func (m *PBProtoFindSuccessors) GetKey() []byte { 175 | if m != nil { 176 | return m.Key 177 | } 178 | return nil 179 | } 180 | 181 | func (m *PBProtoFindSuccessors) GetDest() *PBProtoVnode { 182 | if m != nil { 183 | return m.Dest 184 | } 185 | return nil 186 | } 187 | 188 | func (m *PBProtoFindSuccessors) GetLimit() int32 { 189 | if m != nil && m.Limit != nil { 190 | return *m.Limit 191 | } 192 | return 0 193 | } 194 | 195 | // PBProtoGetPredecessor - request immediate predecessor from vnode. 196 | type PBProtoGetPredecessor struct { 197 | Dest *PBProtoVnode `protobuf:"bytes,1,req,name=dest" json:"dest,omitempty"` 198 | XXX_unrecognized []byte `json:"-"` 199 | } 200 | 201 | func (m *PBProtoGetPredecessor) Reset() { *m = PBProtoGetPredecessor{} } 202 | func (m *PBProtoGetPredecessor) String() string { return proto.CompactTextString(m) } 203 | func (*PBProtoGetPredecessor) ProtoMessage() {} 204 | 205 | func (m *PBProtoGetPredecessor) GetDest() *PBProtoVnode { 206 | if m != nil { 207 | return m.Dest 208 | } 209 | return nil 210 | } 211 | 212 | // PBProtoNotify is a message to notify the remote vnode of origin's existence. 
213 | type PBProtoNotify struct { 214 | Dest *PBProtoVnode `protobuf:"bytes,1,req,name=dest" json:"dest,omitempty"` 215 | Vnode *PBProtoVnode `protobuf:"bytes,2,req,name=vnode" json:"vnode,omitempty"` 216 | XXX_unrecognized []byte `json:"-"` 217 | } 218 | 219 | func (m *PBProtoNotify) Reset() { *m = PBProtoNotify{} } 220 | func (m *PBProtoNotify) String() string { return proto.CompactTextString(m) } 221 | func (*PBProtoNotify) ProtoMessage() {} 222 | 223 | func (m *PBProtoNotify) GetDest() *PBProtoVnode { 224 | if m != nil { 225 | return m.Dest 226 | } 227 | return nil 228 | } 229 | 230 | func (m *PBProtoNotify) GetVnode() *PBProtoVnode { 231 | if m != nil { 232 | return m.Vnode 233 | } 234 | return nil 235 | } 236 | 237 | func init() { 238 | } 239 | -------------------------------------------------------------------------------- /transport_zmq_init.go: -------------------------------------------------------------------------------- 1 | package dendrite 2 | 3 | import ( 4 | zmq "github.com/pebbe/zmq4" 5 | "log" 6 | "os" 7 | "sync" 8 | "time" 9 | ) 10 | 11 | type controlType int 12 | 13 | const ( 14 | workerShutdownReq controlType = iota 15 | workerShutdownAllowed 16 | workerShutdownDenied 17 | workerShutdownConfirm 18 | workerRegisterReq 19 | workerRegisterAllowed 20 | workerRegisterDenied 21 | workerCtlShutdown 22 | ) 23 | 24 | // ZMQTransport implements Transport interface using ZeroMQ for communication. 25 | type ZMQTransport struct { 26 | lock *sync.Mutex 27 | minHandlers int 28 | maxHandlers int 29 | incrHandlers int 30 | activeRequests int 31 | ring *Ring 32 | table map[string]*localHandler 33 | clientTimeout time.Duration 34 | ClientTimeout time.Duration 35 | control_c chan *workerComm 36 | dealer_sock *zmq.Socket 37 | router_sock *zmq.Socket 38 | zmq_context *zmq.Context 39 | ZMQContext *zmq.Context 40 | workerIdleTimeout time.Duration 41 | hooks []TransportHook 42 | Logger *log.Logger 43 | } 44 | 45 | // RegisterHook registers TransportHook within ZMQTransport. 46 | func (t *ZMQTransport) RegisterHook(h TransportHook) { 47 | t.hooks = append(t.hooks, h) 48 | } 49 | 50 | /* 51 | InitZMQTransport creates ZeroMQ transport. 52 | 53 | It multiplexes incoming connections which are then processed in separate go routines (workers). 54 | Multiplexer spawns go routines as needed, but 10 worker routines are created on startup. 55 | Every request times out after provided timeout duration. ZMQ pattern is: 56 | zmq.ROUTER(incoming) -> proxy -> zmq.DEALER -> [zmq.REP(worker), zmq.REP...] 
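A minimal usage sketch (the address and timeout are illustrative, not defaults):

	transport, err := dendrite.InitZMQTransport("127.0.0.1:5000", 5 * time.Second, nil)
	if err != nil {
		// handle the error
	}
	_ = transport

Passing a nil logger makes the transport fall back to a default stdout logger.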
57 | */ 58 | func InitZMQTransport(hostname string, timeout time.Duration, logger *log.Logger) (Transport, error) { 59 | // use default logger if one is not provided 60 | if logger == nil { 61 | logger = log.New(os.Stdout, "", log.Ldate|log.Ltime|log.Lshortfile) 62 | } 63 | // initialize ZMQ Context 64 | context, err := zmq.NewContext() 65 | if err != nil { 66 | return nil, err 67 | } 68 | 69 | // setup router and bind() to tcp address for clients to connect to 70 | router_sock, err := context.NewSocket(zmq.ROUTER) 71 | if err != nil { 72 | return nil, err 73 | } 74 | err = router_sock.Bind("tcp://" + hostname) 75 | if err != nil { 76 | return nil, err 77 | } 78 | 79 | // setup dealer 80 | dealer_sock, err := context.NewSocket(zmq.DEALER) 81 | if err != nil { 82 | return nil, err 83 | } 84 | err = dealer_sock.Bind("inproc://dendrite-zmqdealer") 85 | if err != nil { 86 | return nil, err 87 | } 88 | poller := zmq.NewPoller() 89 | poller.Add(router_sock, zmq.POLLIN) 90 | poller.Add(dealer_sock, zmq.POLLIN) 91 | 92 | transport := &ZMQTransport{ 93 | lock: new(sync.Mutex), 94 | clientTimeout: timeout, 95 | ClientTimeout: timeout, 96 | minHandlers: 10, 97 | maxHandlers: 1024, 98 | incrHandlers: 10, 99 | activeRequests: 0, 100 | workerIdleTimeout: 10 * time.Second, 101 | table: make(map[string]*localHandler), 102 | control_c: make(chan *workerComm), 103 | dealer_sock: dealer_sock, 104 | router_sock: router_sock, 105 | zmq_context: context, 106 | ZMQContext: context, 107 | hooks: make([]TransportHook, 0), 108 | Logger: logger, 109 | } 110 | 111 | go zmq.Proxy(router_sock, dealer_sock, nil) 112 | // Scheduler goroutine keeps track of running workers 113 | // It spawns new ones if needed, and cancels ones that are idling 114 | go func() { 115 | sched_ticker := time.NewTicker(60 * time.Second) 116 | workers := make(map[*workerComm]bool) 117 | // fire up initial set of workers 118 | for i := 0; i < transport.minHandlers; i++ { 119 | go transport.zmq_worker() 120 | } 121 | for { 122 | select { 123 | case comm := <-transport.control_c: 124 | // worker sent something... 
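// Note: each worker writes to its buffered worker_out channel before handing
// comm over on control_c, so the read just below never blocks the scheduler.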
125 | msg := <-comm.worker_out 126 | switch { 127 | case msg == workerRegisterReq: 128 | if len(workers) == transport.maxHandlers { 129 | comm.worker_in <- workerRegisterDenied 130 | logger.Println("[DENDRITE][INFO]: TransportListener - max number of workers reached") 131 | continue 132 | } 133 | if _, ok := workers[comm]; ok { 134 | // worker already registered 135 | continue 136 | } 137 | comm.worker_in <- workerRegisterAllowed 138 | workers[comm] = true 139 | logger.Println("[DENDRITE][INFO]: TransportListener - registered new worker, total:", len(workers)) 140 | case msg == workerShutdownReq: 141 | //logger.Println("Got shutdown req") 142 | if len(workers) > transport.minHandlers { 143 | comm.worker_in <- workerShutdownAllowed 144 | for _ = range comm.worker_out { 145 | // wait until worker closes the channel 146 | } 147 | delete(workers, comm) 148 | } else { 149 | comm.worker_in <- workerShutdownDenied 150 | } 151 | } 152 | case <-sched_ticker.C: 153 | // check if requests are piling up and start more workers if that's the case 154 | if transport.activeRequests > 3*len(workers) { 155 | for i := 0; i < transport.incrHandlers; i++ { 156 | go transport.zmq_worker() 157 | } 158 | } 159 | } 160 | } 161 | }() 162 | return transport, nil 163 | } 164 | 165 | type workerComm struct { 166 | worker_in chan controlType // worker's input channel for two way communication with scheduler 167 | worker_out chan controlType // worker's output channel for two way communication with scheduler 168 | worker_ctl chan controlType // worker's control channel for communication with scheduler 169 | } 170 | 171 | func (transport *ZMQTransport) zmq_worker() { 172 | // setup REP socket 173 | rep_sock, err := transport.zmq_context.NewSocket(zmq.REP) 174 | if err != nil { 175 | transport.Logger.Println("[DENDRITE][ERROR]: TransportListener worker failed to create REP socket", err) 176 | return 177 | } 178 | err = rep_sock.Connect("inproc://dendrite-zmqdealer") 179 | if err != nil { 180 | transport.Logger.Println("[DENDRITE][ERROR]: TransportListener worker failed to connect to dealer", err) 181 | return 182 | } 183 | 184 | // setup communication channels with scheduler 185 | worker_in := make(chan controlType, 1) 186 | worker_out := make(chan controlType, 1) 187 | worker_ctl := make(chan controlType, 1) 188 | comm := &workerComm{ 189 | worker_in: worker_in, 190 | worker_out: worker_out, 191 | worker_ctl: worker_ctl, 192 | } 193 | // notify scheduler that we're up 194 | worker_out <- workerRegisterReq 195 | transport.control_c <- comm 196 | v := <-worker_in 197 | if v == workerRegisterDenied { 198 | return 199 | } 200 | 201 | // setup socket read channel 202 | rpc_req_c := make(chan *ChordMsg) 203 | rpc_response_c := make(chan *ChordMsg) 204 | poller := zmq.NewPoller() 205 | poller.Add(rep_sock, zmq.POLLIN) 206 | cancel_c := make(chan bool, 1) 207 | // read from socket and emit data, or stop if canceled 208 | go func() { 209 | MAINLOOP: 210 | for { 211 | // poll for 5 seconds, but then see if we should be canceled 212 | sockets, _ := poller.Poll(5 * time.Second) 213 | for _, socket := range sockets { 214 | rawmsg, err := socket.Socket.RecvBytes(0) 215 | if err != nil { 216 | transport.Logger.Println("[DENDRITE][ERROR]: TransportListener error while reading from REP, ", err) 217 | continue 218 | } 219 | // decode raw data 220 | decoded, err := transport.Decode(rawmsg) 221 | if err != nil { 222 | errorMsg := transport.newErrorMsg("Failed to decode request - " + err.Error()) 223 | encoded := 
transport.Encode(errorMsg.Type, errorMsg.Data) 224 | socket.Socket.SendBytes(encoded, 0) 225 | continue 226 | } 227 | // if transportHandler is nil, this can not be a valid request 228 | if decoded.TransportHandler == nil { 229 | errorMsg := transport.newErrorMsg("Invalid request, unknown handler") 230 | encoded := transport.Encode(errorMsg.Type, errorMsg.Data) 231 | socket.Socket.SendBytes(encoded, 0) 232 | continue 233 | } 234 | rpc_req_c <- decoded 235 | // wait for response 236 | response := <-rpc_response_c 237 | encoded := transport.Encode(response.Type, response.Data) 238 | socket.Socket.SendBytes(encoded, 0) 239 | } 240 | // check for cancel request 241 | select { 242 | case <-cancel_c: 243 | break MAINLOOP 244 | default: 245 | break 246 | } 247 | } 248 | }() 249 | 250 | // read from socket and process request -- OR 251 | // shutdown if scheduler wants me to -- OR 252 | // request shutdown from scheduler if idling and exit if allowed 253 | ticker := time.NewTicker(transport.workerIdleTimeout) 254 | for { 255 | select { 256 | case request := <-rpc_req_c: 257 | // handle request 258 | request.TransportHandler(request, rpc_response_c) 259 | // restart idle timer 260 | ticker.Stop() 261 | ticker = time.NewTicker(transport.workerIdleTimeout) 262 | 263 | case controlMsg := <-comm.worker_ctl: 264 | if controlMsg == workerCtlShutdown { 265 | close(comm.worker_out) 266 | cancel_c <- true 267 | close(cancel_c) 268 | transport.Logger.Println("[DENDRITE][INFO]: TransportListener: worker shutdown") 269 | return 270 | } 271 | case <-ticker.C: 272 | // we're idling, lets request shutdown 273 | comm.worker_out <- workerShutdownReq 274 | transport.control_c <- comm 275 | v := <-comm.worker_in 276 | if v == workerShutdownAllowed { 277 | transport.Logger.Println("[DENDRITE][INFO]: TransportListener: worker shutdown due to idle state") 278 | close(comm.worker_out) 279 | cancel_c <- true 280 | close(cancel_c) 281 | return 282 | } 283 | } 284 | } 285 | } 286 | -------------------------------------------------------------------------------- /dtable/dtable_zmq_handler.go: -------------------------------------------------------------------------------- 1 | package dtable 2 | 3 | import ( 4 | "fmt" 5 | "github.com/fastfn/dendrite" 6 | "github.com/golang/protobuf/proto" 7 | "sync" 8 | ) 9 | 10 | func (dt *DTable) zmq_status_handler(request *dendrite.ChordMsg, w chan *dendrite.ChordMsg) { 11 | pbMsg := request.TransportMsg.(PBDTableStatus) 12 | 13 | dest := dendrite.VnodeFromProtobuf(pbMsg.GetDest()) 14 | dest_key_str := fmt.Sprintf("%x", dest.Id) 15 | zmq_transport := dt.transport.(*dendrite.ZMQTransport) 16 | 17 | // make sure destination vnode exists locally 18 | _, ok := dt.table[dest_key_str] 19 | setResp := &PBDTableResponse{} 20 | if !ok { 21 | setResp.Ok = proto.Bool(false) 22 | setResp.Error = proto.String("local dtable vnode not found") 23 | } else { 24 | setResp.Ok = proto.Bool(true) 25 | } 26 | 27 | // encode and send the response 28 | pbdata, err := proto.Marshal(setResp) 29 | if err != nil { 30 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::StatusHandler - failed to marshal response - " + err.Error()) 31 | w <- errorMsg 32 | return 33 | } 34 | w <- &dendrite.ChordMsg{ 35 | Type: PbDtableResponse, 36 | Data: pbdata, 37 | } 38 | return 39 | } 40 | 41 | func (dt *DTable) zmq_get_handler(request *dendrite.ChordMsg, w chan *dendrite.ChordMsg) { 42 | pbMsg := request.TransportMsg.(PBDTableGetItem) 43 | keyHash := pbMsg.GetKeyHash() 44 | dest := dendrite.VnodeFromProtobuf(pbMsg.GetDest()) 45 | 
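// Local vnode tables are keyed by the hex-encoded vnode Id, i.e. the same
// string Vnode.String() produces.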
dest_key_str := fmt.Sprintf("%x", dest.Id) 46 | zmq_transport := dt.transport.(*dendrite.ZMQTransport) 47 | 48 | // make sure destination vnode exists locally 49 | vn_table, ok := dt.table[dest_key_str] 50 | if !ok { 51 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::GetHandler - local vnode table not found") 52 | w <- errorMsg 53 | return 54 | } 55 | key_str := fmt.Sprintf("%x", keyHash) 56 | 57 | var itemResp *PBDTableItem 58 | 59 | if localItem, ok := vn_table[key_str]; ok { 60 | itemResp = localItem.to_protobuf() 61 | itemResp.Found = proto.Bool(true) 62 | } else { 63 | itemResp = &PBDTableItem{ 64 | Found: proto.Bool(false), 65 | } 66 | } 67 | 68 | // encode and send the response 69 | pbdata, err := proto.Marshal(itemResp) 70 | if err != nil { 71 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::GetHandler - failed to marshal response - " + err.Error()) 72 | w <- errorMsg 73 | return 74 | } 75 | w <- &dendrite.ChordMsg{ 76 | Type: PbDtableItem, 77 | Data: pbdata, 78 | } 79 | return 80 | } 81 | 82 | func (dt *DTable) zmq_set_handler(request *dendrite.ChordMsg, w chan *dendrite.ChordMsg) { 83 | pbMsg := request.TransportMsg.(PBDTableSetItem) 84 | reqItem := new(kvItem) 85 | reqItem.lock = new(sync.Mutex) 86 | reqItem.from_protobuf(pbMsg.GetItem()) 87 | demoting := pbMsg.GetDemoting() 88 | minAcks := int(pbMsg.GetMinAcks()) 89 | dest := dendrite.VnodeFromProtobuf(pbMsg.GetDest()) 90 | origin := dendrite.VnodeFromProtobuf(pbMsg.GetOrigin()) 91 | dest_key_str := fmt.Sprintf("%x", dest.Id) 92 | zmq_transport := dt.transport.(*dendrite.ZMQTransport) 93 | 94 | // make sure destination vnode exists locally 95 | _, ok := dt.table[dest_key_str] 96 | if !ok { 97 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::SetHandler - local vnode table not found") 98 | w <- errorMsg 99 | return 100 | } 101 | setResp := &PBDTableResponse{ 102 | Ok: proto.Bool(false), 103 | } 104 | 105 | if demoting { 106 | old_master := reqItem.replicaInfo.master 107 | reqItem.replicaInfo.master = dest 108 | reqItem.lock.Lock() 109 | err := dt.table[dest_key_str].put(reqItem) 110 | if err != nil { 111 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::SetHandler - demote received error on - " + err.Error()) 112 | w <- errorMsg 113 | reqItem.lock.Unlock() 114 | return 115 | } 116 | go dt.processDemoteKey(dest, origin, old_master, reqItem) 117 | setResp.Ok = proto.Bool(true) 118 | reqItem.lock.Unlock() 119 | } else { 120 | wait := make(chan error) 121 | go dt.set(dest, reqItem, minAcks, wait) 122 | err := <-wait 123 | if err != nil { 124 | setResp.Error = proto.String("ZMQ::DTable::SetHandler - error executing transaction - " + err.Error()) 125 | } else { 126 | setResp.Ok = proto.Bool(true) 127 | } 128 | } 129 | 130 | // encode and send the response 131 | pbdata, err := proto.Marshal(setResp) 132 | if err != nil { 133 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::SetHandler - failed to marshal response - " + err.Error()) 134 | w <- errorMsg 135 | return 136 | } 137 | w <- &dendrite.ChordMsg{ 138 | Type: PbDtableResponse, 139 | Data: pbdata, 140 | } 141 | 142 | return 143 | } 144 | 145 | func (dt *DTable) zmq_setReplica_handler(request *dendrite.ChordMsg, w chan *dendrite.ChordMsg) { 146 | pbMsg := request.TransportMsg.(PBDTableSetItem) 147 | reqItem := new(kvItem) 148 | reqItem.lock = new(sync.Mutex) 149 | reqItem.from_protobuf(pbMsg.GetItem()) 150 | dest := dendrite.VnodeFromProtobuf(pbMsg.GetDest()) 151 | dest_key_str := fmt.Sprintf("%x", dest.Id) 152 | zmq_transport := 
dt.transport.(*dendrite.ZMQTransport) 153 | 154 | // make sure destination vnode exists locally 155 | _, ok := dt.table[dest_key_str] 156 | if !ok { 157 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::SetReplicaHandler - local vnode table not found") 158 | w <- errorMsg 159 | return 160 | } 161 | setResp := &PBDTableResponse{ 162 | Ok: proto.Bool(true), 163 | } 164 | dt.setReplica(dest, reqItem) 165 | 166 | // encode and send the response 167 | pbdata, err := proto.Marshal(setResp) 168 | if err != nil { 169 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::SetReplicaHandler - failed to marshal response - " + err.Error()) 170 | w <- errorMsg 171 | return 172 | } 173 | w <- &dendrite.ChordMsg{ 174 | Type: PbDtableResponse, 175 | Data: pbdata, 176 | } 177 | 178 | return 179 | } 180 | 181 | func (dt *DTable) zmq_setReplicaInfo_handler(request *dendrite.ChordMsg, w chan *dendrite.ChordMsg) { 182 | pbMsg := request.TransportMsg.(PBDTableSetReplicaInfo) 183 | rInfo := replicaInfo_from_protobuf(pbMsg.GetReplicaInfo()) 184 | keyHash := pbMsg.GetKeyHash() 185 | dest := dendrite.VnodeFromProtobuf(pbMsg.GetDest()) 186 | dest_key_str := fmt.Sprintf("%x", dest.Id) 187 | zmq_transport := dt.transport.(*dendrite.ZMQTransport) 188 | 189 | // make sure destination vnode exists locally 190 | vn_table, ok := dt.rtable[dest_key_str] 191 | if !ok { 192 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::SetMetaHandler - local vnode table not found") 193 | w <- errorMsg 194 | return 195 | } 196 | key_str := fmt.Sprintf("%x", keyHash) 197 | item, ok := vn_table[key_str] 198 | if !ok { 199 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::SetMetaHandler - key not found") 200 | w <- errorMsg 201 | return 202 | } 203 | item.replicaInfo = rInfo 204 | 205 | // encode and send the response 206 | setResp := &PBDTableResponse{ 207 | Ok: proto.Bool(true), 208 | } 209 | pbdata, err := proto.Marshal(setResp) 210 | if err != nil { 211 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::SetMetaHandler - failed to marshal response - " + err.Error()) 212 | w <- errorMsg 213 | return 214 | } 215 | w <- &dendrite.ChordMsg{ 216 | Type: PbDtableResponse, 217 | Data: pbdata, 218 | } 219 | return 220 | } 221 | 222 | func (dt *DTable) zmq_clearreplica_handler(request *dendrite.ChordMsg, w chan *dendrite.ChordMsg) { 223 | pbMsg := request.TransportMsg.(PBDTableClearReplica) 224 | keyHash := pbMsg.GetKeyHash() 225 | demoted := pbMsg.GetDemoted() 226 | dest := dendrite.VnodeFromProtobuf(pbMsg.GetDest()) 227 | dest_key_str := fmt.Sprintf("%x", dest.Id) 228 | zmq_transport := dt.transport.(*dendrite.ZMQTransport) 229 | 230 | // make sure destination vnode exists locally 231 | r_table, ok := dt.rtable[dest_key_str] 232 | if !ok { 233 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::ClearReplicaHandler - local vnode table not found") 234 | w <- errorMsg 235 | return 236 | } 237 | 238 | key_str := fmt.Sprintf("%x", keyHash) 239 | if demoted { 240 | d_table, _ := dt.demoted_table[dest_key_str] 241 | if _, ok := d_table[key_str]; ok { 242 | delete(d_table, key_str) 243 | } else { 244 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::ClearReplicaHandler - key " + key_str + " not found in demoted table") 245 | w <- errorMsg 246 | return 247 | } 248 | } else { 249 | if _, ok := r_table[key_str]; ok { 250 | delete(r_table, key_str) 251 | } else { 252 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::ClearReplicaHandler - key " + key_str + " not found in replica table on vnode " + dest.String()) 253 | w <- errorMsg 254 
| return 255 | } 256 | } 257 | 258 | // encode and send the response 259 | setResp := &PBDTableResponse{ 260 | Ok: proto.Bool(true), 261 | } 262 | pbdata, err := proto.Marshal(setResp) 263 | if err != nil { 264 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::ClearReplicaHandler - failed to marshal response - " + err.Error()) 265 | w <- errorMsg 266 | return 267 | } 268 | w <- &dendrite.ChordMsg{ 269 | Type: PbDtableResponse, 270 | Data: pbdata, 271 | } 272 | return 273 | } 274 | 275 | func (dt *DTable) zmq_promoteKey_handler(request *dendrite.ChordMsg, w chan *dendrite.ChordMsg) { 276 | pbMsg := request.TransportMsg.(PBDTablePromoteKey) 277 | reqItem := new(kvItem) 278 | reqItem.lock = new(sync.Mutex) 279 | reqItem.from_protobuf(pbMsg.GetItem()) 280 | 281 | // send out the event to delegator 282 | ev := &dtableEvent{ 283 | evType: evPromoteKey, 284 | vnode: dendrite.VnodeFromProtobuf(pbMsg.GetDest()), 285 | item: reqItem, 286 | } 287 | dt.dtable_c <- ev 288 | 289 | zmq_transport := dt.transport.(*dendrite.ZMQTransport) 290 | 291 | setResp := &PBDTableResponse{ 292 | Ok: proto.Bool(true), 293 | } 294 | 295 | // encode and send the response 296 | pbdata, err := proto.Marshal(setResp) 297 | if err != nil { 298 | errorMsg := zmq_transport.NewErrorMsg("ZMQ::DTable::SetReplicaHandler - failed to marshal response - " + err.Error()) 299 | w <- errorMsg 300 | return 301 | } 302 | w <- &dendrite.ChordMsg{ 303 | Type: PbDtableResponse, 304 | Data: pbdata, 305 | } 306 | return 307 | } 308 | -------------------------------------------------------------------------------- /dtable/dtable_replication.go: -------------------------------------------------------------------------------- 1 | package dtable 2 | 3 | import ( 4 | "bytes" 5 | "github.com/fastfn/dendrite" 6 | "time" 7 | ) 8 | 9 | // promoteKey() -- called when remote wants to promote a key to us 10 | func (dt *DTable) promoteKey(vnode *dendrite.Vnode, reqItem *kvItem) { 11 | rtable := dt.rtable[vnode.String()] 12 | vn_table := dt.table[vnode.String()] 13 | // if we're already primary node for this key, just replicate again because one replica could be deleted 14 | if _, ok := vn_table[reqItem.keyHashString()]; ok { 15 | dt.replicateKey(vnode, reqItem, dt.ring.Replicas()) 16 | return 17 | } 18 | delete(rtable, reqItem.keyHashString()) 19 | reqItem.lock.Lock() 20 | vn_table.put(reqItem) 21 | dt.replicateKey(vnode, reqItem, dt.ring.Replicas()) 22 | reqItem.lock.Unlock() 23 | } 24 | 25 | // promote() - called when remote predecessor died or left the ring 26 | // because we're only first REMOTE node from original master 27 | // it doesn't mean that we're the actual successor for all the replicated data with depth 0 28 | // if we are, we promote ourselves 29 | // if not, we must find actual successor for each key, and promote that vnode for each key 30 | func (dt *DTable) promote(vnode *dendrite.Vnode) { 31 | //log.Printf("Node left me: %X for %X now replicating to:\n", localVn.Id, new_pred.Id) 32 | rtable := dt.rtable[vnode.String()] 33 | vn_table := dt.table[vnode.String()] 34 | for key_str, ritem := range rtable { 35 | if ritem.replicaInfo.depth != 0 { 36 | continue 37 | } 38 | // check if we're real successor for this key 39 | succs, err := dt.ring.Lookup(1, ritem.keyHash) 40 | if err != nil { 41 | dt.Logf(LogInfo, "Could not promote key, Lookup() failed: %s\n", err.Error()) 42 | continue 43 | } 44 | if bytes.Compare(succs[0].Id, vnode.Id) == 0 { 45 | // this key should be promoted locally 46 | new_ritem := ritem.dup() 47 | 
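// This vnode held the key as a depth-0 replica; clear that slot, take the key
// into the primary table, and let replicateKey() below rebuild the replica set.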
new_ritem.replicaInfo.vnodes[0] = nil 48 | new_ritem.commited = true 49 | new_ritem.lock.Lock() 50 | vn_table.put(new_ritem) 51 | dt.Logf(LogDebug, "Promoted local key: %s - running replicator now replicas are %+v \n", key_str, new_ritem.replicaInfo.vnodes) 52 | delete(rtable, key_str) 53 | dt.Logf(LogDebug, "Promote calling replicateKey for key %s\n", key_str) 54 | dt.replicateKey(vnode, new_ritem, dt.ring.Replicas()) 55 | new_ritem.lock.Unlock() 56 | dt.Logf(LogDebug, "Promote finishing key %s, replicaVnodes are: %+v\n", key_str, new_ritem.replicaInfo.vnodes) 57 | } else { 58 | // TODO promote remote vnode 59 | dt.Logf(LogDebug, "Promoting remote vnode %s for key %s\n", succs[0].String(), key_str) 60 | delete(rtable, key_str) 61 | dt.remotePromoteKey(vnode, succs[0], ritem) 62 | } 63 | } 64 | } 65 | 66 | /* demote() - promotes new predecessor with keys from primary table 67 | if new predecessor is local: 68 | - move all of my replica keys to new vnode 69 | - replica scheme of remote successors doesn't change here 70 | we just need to update metadata on all replica nodes to reflect this change 71 | if new predecessor is remote: 72 | - for all keys in primary table, that are <= new_pred.Id: 73 | 1. move key to demoted table and wait there for cleanup call from new master 74 | 2. call demoteKey() to commit to new_pred's primary table + let that vnode know where existing replicas are 75 | 3. demoteKey() will callback to cleanup each key from demoted table after it's written new replicas 76 | - handle replica-0 table such that: 77 | 1. for each key, check if master vnode is located on same physical node as new_pred 78 | - if it is, we don't need to do anything because we're still natural remote successor 79 | - if not 80 | 1. call demoteReplica() to let master know existing replica setup and about newRemoteSucc 81 | 2. 
master will reconfigure replicas around and delete unnecessary copies (if any) 82 | */ 83 | func (dt *DTable) demote(vnode, new_pred *dendrite.Vnode) { 84 | // determine if new_pred is on this node 85 | isLocal := false 86 | for _, lvn := range dt.ring.MyVnodes() { 87 | if lvn.Host == new_pred.Host { 88 | isLocal = true 89 | } 90 | } 91 | switch isLocal { 92 | case true: 93 | // move all replica keys to new vnode 94 | vn_rtable := dt.rtable[vnode.String()] 95 | for rkey, ritem := range vn_rtable { 96 | if !ritem.commited { 97 | continue 98 | } 99 | 100 | ritem.replicaInfo.vnodes[ritem.replicaInfo.depth] = new_pred 101 | ritem.lock.Lock() 102 | dt.rtable[new_pred.String()].put(ritem) 103 | delete(vn_rtable, rkey) 104 | 105 | // update metadata on all replicas 106 | new_state := ritem.replicaInfo.state 107 | for idx, replica := range ritem.replicaInfo.vnodes { 108 | // skip ourselves 109 | if idx == ritem.replicaInfo.depth { 110 | continue 111 | } 112 | new_ritem := ritem.dup() 113 | new_ritem.replicaInfo.depth = idx 114 | new_ritem.replicaInfo.state = new_state 115 | 116 | err := dt.remoteSetReplicaInfo(replica, new_ritem) 117 | if err != nil { 118 | dt.Logf(LogInfo, "Error updating replicaMeta on demote() -", err) 119 | new_state = replicaIncomplete 120 | continue 121 | } 122 | } 123 | ritem.lock.Unlock() 124 | } 125 | case false: 126 | // loop over primary table to find keys that should belong to new predecessor 127 | vn_table := dt.table[vnode.String()] 128 | for key_str, item := range vn_table { 129 | if !item.commited { 130 | continue 131 | } 132 | if dendrite.Between(vnode.Id, new_pred.Id, item.keyHash, true) { 133 | //log.Printf("Analyzed key for demoting %s and pushing to %s\n", key_str, new_pred.String()) 134 | // copy the key to demoted table and remove it from primary one 135 | dt.demoted_table[vnode.String()][item.keyHashString()] = item.to_demoted(new_pred) 136 | delete(vn_table, key_str) 137 | done_c := make(chan error) 138 | go dt.remoteSet(vnode, new_pred, item, dt.ring.Replicas(), true, done_c) 139 | err := <-done_c 140 | if err != nil { 141 | dt.Logln(LogInfo, "Error demoting key to new predecessor -", err) 142 | continue 143 | } 144 | } 145 | } 146 | } 147 | 148 | } 149 | 150 | // changeReplicas() -- callend when replica set changes 151 | // 152 | func (dt *DTable) changeReplicas(vnode *dendrite.Vnode, new_replicas []*dendrite.Vnode) { 153 | for _, item := range dt.table[vnode.String()] { 154 | if !item.commited { 155 | continue 156 | } 157 | item.lock.Lock() 158 | dt.replicateKey(vnode, item, dt.ring.Replicas()) 159 | item.lock.Unlock() 160 | } 161 | } 162 | 163 | func (dt *DTable) replicateKey(vnode *dendrite.Vnode, reqItem *kvItem, limit int) { 164 | handler, _ := dt.transport.GetVnodeHandler(vnode) 165 | if handler == nil { 166 | return 167 | } 168 | // find remote successors to write replicas to 169 | remote_succs, err := handler.FindRemoteSuccessors(limit) 170 | if err != nil { 171 | return 172 | } 173 | 174 | // first, lets remove existing replicas 175 | for idx, existing := range reqItem.replicaInfo.vnodes { 176 | if existing == nil { 177 | continue 178 | } 179 | if err := dt.remoteClearReplica(existing, reqItem, false); err != nil { 180 | // lets add this replica to orphans 181 | reqItem.replicaInfo.orphan_vnodes = append(reqItem.replicaInfo.orphan_vnodes, existing) 182 | reqItem.replicaInfo.vnodes[idx] = nil 183 | continue 184 | } else { 185 | reqItem.replicaInfo.vnodes[idx] = nil 186 | } 187 | } 188 | 189 | // we set replicaState to stable if enough remote 
successors are found 190 | // otherwise, replicaState is still stable, but partial 191 | // if any of replica writes fail later on, we'll set the state to Incomplete 192 | var new_replica_state replicaState 193 | if len(remote_succs) >= dt.ring.Replicas() { 194 | new_replica_state = replicaStable 195 | } else { 196 | new_replica_state = replicaPartial 197 | } 198 | 199 | // now lets write replicas 200 | new_replicas := make([]*dendrite.Vnode, 0) 201 | 202 | for _, succ := range remote_succs { 203 | if succ == nil { 204 | continue 205 | } 206 | dt.Logf(LogDebug, "replicating to: %x\n", succ.Id) 207 | new_ritem := reqItem.dup() 208 | new_ritem.replicaInfo.state = replicaIncomplete 209 | new_ritem.commited = false 210 | 211 | err := dt.remoteWriteReplica(vnode, succ, new_ritem) 212 | if err != nil { 213 | dt.Logf(LogInfo, "Error writing replica to %s for key %s due to error: %s\n", succ.String(), new_ritem.keyHashString(), err.Error()) 214 | new_replica_state = replicaIncomplete 215 | continue 216 | } 217 | new_replicas = append(new_replicas, succ) 218 | } 219 | 220 | // update metadata on original item 221 | reqItem.replicaInfo.vnodes = make([]*dendrite.Vnode, limit) 222 | for idx, new_replica := range new_replicas { 223 | reqItem.replicaInfo.vnodes[idx] = new_replica 224 | } 225 | reqItem.replicaInfo.state = new_replica_state 226 | 227 | // update metadata on successful replicas 228 | for idx, replica := range reqItem.replicaInfo.vnodes { 229 | if replica == nil { 230 | break 231 | } 232 | new_ritem := reqItem.dup() 233 | new_ritem.replicaInfo.depth = idx 234 | new_ritem.replicaInfo.state = new_replica_state 235 | 236 | err := dt.remoteSetReplicaInfo(replica, new_ritem) 237 | if err != nil { 238 | // this should not happen. It means another replica node failed in the meantime 239 | // need to trigger orphan cleaner, which will restart this process 240 | reqItem.replicaInfo.state = replicaIncomplete 241 | reqItem.replicaInfo.vnodes[idx] = nil 242 | reqItem.replicaInfo.orphan_vnodes = append(reqItem.replicaInfo.orphan_vnodes, replica) 243 | continue 244 | } 245 | } 246 | } 247 | 248 | func (dt *DTable) selfCheck() { 249 | //check for orphaned keys 250 | for _, vn_table := range dt.table { 251 | ITEM_LOOP: 252 | for _, item := range vn_table { 253 | if len(item.replicaInfo.orphan_vnodes) == 0 { 254 | continue ITEM_LOOP 255 | } 256 | item.lock.Lock() 257 | new_orphans := make([]*dendrite.Vnode, 0) 258 | replicate_again := false 259 | for _, orphan_vnode := range item.replicaInfo.orphan_vnodes { 260 | if orphan_vnode == nil { 261 | continue 262 | } 263 | // maybe it was fixed already by another process (eg, replicas changed and key was re-replicated) 264 | fixed := false 265 | for _, replica := range item.replicaInfo.vnodes { 266 | if replica == nil { 267 | continue 268 | } 269 | if bytes.Compare(orphan_vnode.Id, replica.Id) == 0 { 270 | fixed = true 271 | } 272 | } 273 | if !fixed { 274 | if err := dt.remoteClearReplica(orphan_vnode, item, false); err != nil { 275 | // attempt to clear orphan'ed item failed 276 | new_orphans = append(new_orphans, orphan_vnode) 277 | } else { 278 | // success, lets re-replicate this key 279 | replicate_again = true 280 | } 281 | } 282 | } 283 | item.replicaInfo.orphan_vnodes = new_orphans 284 | if replicate_again { 285 | dt.replicateKey(item.replicaInfo.master, item, dt.ring.Replicas()) 286 | } 287 | item.lock.Unlock() 288 | } 289 | } 290 | 291 | //check for demoted keys 292 | for _, demoted_table := range dt.demoted_table { 293 | for _, demoted_item := range 
demoted_table { 294 | if demoted_item.demoted_ts.Add(time.Minute * 3).Before(time.Now()) { 295 | // new master did not process this item to the end when we demoted the key 296 | // lets see if we can lookup the key 297 | val, err := dt.NewQuery().Get([]byte(demoted_item.item.keyHashString())) 298 | if err != nil { 299 | dt.Logf(LogInfo, "selfCheck() tried to check demoted key: %s, but Get() failed: %s\n", demoted_item.item.keyHashString(), err.Error()) 300 | continue 301 | } 302 | if val == nil { 303 | dt.Logf(LogInfo, "selfCheck() found old demoted key: %s. Restoring it now...", demoted_item.item.keyHashString()) 304 | err = dt.NewQuery().Set(demoted_item.item.Key, demoted_item.item.Val) 305 | if err != nil { 306 | dt.Logf(LogInfo, "selfCheck() failed while restoring demoted key %s. Err: %s\n", demoted_item.item.keyHashString(), err.Error()) 307 | continue 308 | } 309 | delete(demoted_table, demoted_item.item.keyHashString()) 310 | dt.Logf(LogInfo, "selfCheck() restored demoted key: %s\n", demoted_item.item.keyHashString()) 311 | } 312 | } 313 | } 314 | } 315 | } 316 | -------------------------------------------------------------------------------- /dendrite.go: -------------------------------------------------------------------------------- 1 | package dendrite 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "log" 7 | "sort" 8 | "time" 9 | ) 10 | 11 | // MsgType represents message type for ChordMsg encoding. 12 | type MsgType byte 13 | 14 | // ChordMsg is lowest entity to be transmited through dendrite. 15 | type ChordMsg struct { 16 | Type MsgType 17 | Data []byte 18 | TransportMsg interface{} // unmarshalled data, depending on transport 19 | TransportHandler func(*ChordMsg, chan *ChordMsg) // request pointer, response channel 20 | } 21 | 22 | type ErrHookUnknownType string 23 | 24 | func (e ErrHookUnknownType) Error() string { 25 | return fmt.Sprintf("%s", string(e)) 26 | } 27 | 28 | // TransportHook provides interface to build additional message types, decoders and handlers through 3rd party 29 | // packages that can register their hooks and leverage existing transport architecture and capabilities. 30 | type TransportHook interface { 31 | Decode([]byte) (*ChordMsg, error) // decodes bytes to ChordMsg 32 | } 33 | 34 | // DelegateHook provides interface to capture dendrite events in 3rd party packages. 35 | type DelegateHook interface { 36 | EmitEvent(*EventCtx) 37 | } 38 | 39 | // Transport interface defines methods for communication between vnodes. 40 | type Transport interface { 41 | // ListVnodes returns list of local vnodes from remote host. 42 | ListVnodes(string) ([]*Vnode, error) 43 | 44 | // Ping sends ping message to a vnode. 45 | Ping(*Vnode) (bool, error) 46 | 47 | // GetPredecessor is a request to get vnode's predecessor. 48 | GetPredecessor(*Vnode) (*Vnode, error) 49 | 50 | // Notify our successor of ourselves. 51 | Notify(dest, self *Vnode) ([]*Vnode, error) 52 | 53 | // FindSuccessors sends request to a vnode, requesting the list of successors for given key. 54 | FindSuccessors(*Vnode, int, []byte) ([]*Vnode, error) 55 | 56 | // GetVnodeHandler returns VnodeHandler interface if requested vnode is local 57 | GetVnodeHandler(*Vnode) (VnodeHandler, bool) 58 | 59 | // Register registers local vnode handlers 60 | Register(*Vnode, VnodeHandler) 61 | 62 | // Encode encodes dendrite msg into two frame byte stream. First frame is a single byte representing 63 | // message type, and another frame is protobuf data. 
64 | Encode(MsgType, []byte) []byte 65 | 66 | // RegisterHook registers a TransportHook within the transport. 67 | RegisterHook(TransportHook) 68 | 69 | TransportHook 70 | } 71 | 72 | // Config is a main ring configuration struct. 73 | type Config struct { 74 | Hostname string 75 | NumVnodes int // num of vnodes to create 76 | StabilizeMin time.Duration 77 | StabilizeMax time.Duration 78 | NumSuccessors int // number of successor to keep in self log 79 | Replicas int // number of replicas to keep by default 80 | LogLevel LogLevel // logLevel, 0 = null, 1 = info, 2 = debug 81 | Logger *log.Logger 82 | } 83 | 84 | // DefaultConfig returns *Config with default values. 85 | func DefaultConfig(hostname string) *Config { 86 | return &Config{ 87 | Hostname: hostname, 88 | // NumVnodes should be set around logN 89 | // N is approximate number of real nodes in cluster 90 | // this way we get O(logN) lookup speed 91 | NumVnodes: 3, 92 | StabilizeMin: 1 * time.Second, 93 | StabilizeMax: 3 * time.Second, 94 | NumSuccessors: 8, // number of known successors to keep track with 95 | Replicas: 2, 96 | LogLevel: LogInfo, 97 | } 98 | } 99 | 100 | type LogLevel int 101 | 102 | const ( 103 | LogNull LogLevel = 0 104 | LogInfo LogLevel = 1 105 | LogDebug LogLevel = 2 106 | ) 107 | 108 | // Logf wraps log.Printf 109 | func (r *Ring) Logf(level LogLevel, format string, v ...interface{}) { 110 | var new_format string 111 | if level == LogInfo { 112 | new_format = "[DENDRITE][INFO] " + format 113 | } else if level == LogDebug { 114 | new_format = "[DENDRITE][DEBUG] " + format 115 | } 116 | 117 | if r.config.LogLevel == LogDebug { 118 | if r.config.Logger != nil { 119 | r.config.Logger.Printf(new_format, v...) 120 | } else { 121 | log.Printf(new_format, v...) 122 | } 123 | } else if r.config.LogLevel == LogInfo && level == LogInfo { 124 | if r.config.Logger != nil { 125 | r.config.Logger.Printf(new_format, v...) 126 | } else { 127 | log.Printf(new_format, v...) 128 | } 129 | } 130 | } 131 | 132 | // Logln wraps log.Println 133 | func (r *Ring) Logln(level LogLevel, v ...interface{}) { 134 | var new_format string 135 | if level == LogInfo { 136 | new_format = "[DENDRITE][INFO]" 137 | } else if level == LogDebug { 138 | new_format = "[DENDRITE][DEBUG]" 139 | } 140 | if r.config.LogLevel == LogDebug { 141 | v = append([]interface{}{new_format}, v...) 142 | if r.config.Logger != nil { 143 | r.config.Logger.Println(v...) 144 | } else { 145 | log.Println(v...) 146 | } 147 | } else if r.config.LogLevel == LogInfo && level == LogInfo { 148 | v = append([]interface{}{new_format}, v...) 149 | if r.config.Logger != nil { 150 | r.config.Logger.Println(v...) 151 | } else { 152 | log.Println(v...) 153 | } 154 | } 155 | } 156 | 157 | // Ring is the main chord ring object. 158 | type Ring struct { 159 | config *Config 160 | transport Transport 161 | vnodes []*localVnode // list of local vnodes 162 | shutdown chan bool 163 | Stabilizations int 164 | delegateHooks []DelegateHook 165 | Logger *log.Logger 166 | } 167 | 168 | // Less implements sort.Interface Less() - used to sort ring.vnodes. 169 | func (r *Ring) Less(i, j int) bool { 170 | return bytes.Compare(r.vnodes[i].Id, r.vnodes[j].Id) == -1 171 | } 172 | 173 | // Swap implements sort.Interface Swap() - used to sort ring.vnodes. 174 | func (r *Ring) Swap(i, j int) { 175 | r.vnodes[i], r.vnodes[j] = r.vnodes[j], r.vnodes[i] 176 | } 177 | 178 | // Len implements sort.Interface Len() - used to sort ring.vnodes. 
179 | func (r *Ring) Len() int { 180 | return len(r.vnodes) 181 | } 182 | 183 | // Replicas returns ring.config.Replicas. 184 | func (r *Ring) Replicas() int { 185 | return r.config.Replicas 186 | } 187 | 188 | // MaxStabilize returns ring.config.StabilizeMax duration. 189 | func (r *Ring) MaxStabilize() time.Duration { 190 | return r.config.StabilizeMax 191 | } 192 | 193 | // Lookup. For given key hash, it finds N successors in the ring. 194 | func (r *Ring) Lookup(n int, keyHash []byte) ([]*Vnode, error) { 195 | // Ensure that n is sane 196 | if n > r.config.NumSuccessors { 197 | return nil, fmt.Errorf("Cannot ask for more successors than NumSuccessors!") 198 | } 199 | 200 | // Find the nearest local vnode 201 | nearest := nearestVnodeToKey(r.vnodes, keyHash) 202 | 203 | // Use the nearest node for the lookup 204 | successors, err := r.transport.FindSuccessors(nearest, n, keyHash) 205 | if err != nil { 206 | return nil, err 207 | } 208 | 209 | // Trim the nil successors 210 | for successors[len(successors)-1] == nil { 211 | successors = successors[:len(successors)-1] 212 | } 213 | return successors, nil 214 | } 215 | 216 | // setLocalSuccessors initializes the vnodes with their local successors. 217 | // Vnodes need to be sorted before this method is called. 218 | func (r *Ring) setLocalSuccessors() { 219 | numV := len(r.vnodes) 220 | if numV == 1 { 221 | for _, vnode := range r.vnodes { 222 | vnode.successors[0] = &vnode.Vnode 223 | } 224 | return 225 | } 226 | // we use numV-1 in order to avoid setting ourselves as last successor 227 | numSuc := min(r.config.NumSuccessors, numV-1) 228 | for idx, vnode := range r.vnodes { 229 | for i := 0; i < numSuc; i++ { 230 | vnode.successors[i] = &r.vnodes[(idx+i+1)%numV].Vnode 231 | } 232 | } 233 | 234 | } 235 | 236 | // init initializes the ring. 237 | func (r *Ring) init(config *Config, transport Transport) { 238 | r.config = config 239 | r.Logger = config.Logger 240 | r.transport = InitLocalTransport(transport) 241 | r.vnodes = make([]*localVnode, config.NumVnodes) 242 | r.shutdown = make(chan bool) 243 | r.delegateHooks = make([]DelegateHook, 0) 244 | // initialize vnodes 245 | for i := 0; i < config.NumVnodes; i++ { 246 | vn := &localVnode{} 247 | r.vnodes[i] = vn 248 | vn.ring = r 249 | vn.init(i) 250 | } 251 | sort.Sort(r) 252 | /* 253 | go func() { 254 | for { 255 | for _, vnode := range r.vnodes { 256 | var pred []byte 257 | if vnode.predecessor == nil { 258 | pred = nil 259 | } else { 260 | pred = vnode.predecessor.Id 261 | } 262 | 263 | fmt.Printf("Vnode: %X -> pred: %X -> succ: ", vnode.Id, pred) 264 | for idx, suc := range vnode.successors { 265 | if suc == nil { 266 | break 267 | } 268 | fmt.Printf("succ-%d-%X, ", idx, suc.Id) 269 | } 270 | fmt.Printf("\n") 271 | } 272 | time.Sleep(15 * time.Second) 273 | } 274 | }() 275 | */ 276 | } 277 | 278 | // schedule schedules ring's vnodes stabilize() for execution. 279 | func (r *Ring) schedule() { 280 | for i := 0; i < len(r.vnodes); i++ { 281 | r.vnodes[i].schedule() 282 | } 283 | } 284 | 285 | // MyVnodes returns slice of local Vnodes 286 | func (r *Ring) MyVnodes() []*Vnode { 287 | rv := make([]*Vnode, len(r.vnodes)) 288 | for idx, local_vn := range r.vnodes { 289 | rv[idx] = &local_vn.Vnode 290 | } 291 | return rv 292 | } 293 | 294 | // CreateRing bootstraps the ring with given config and local transport. 
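// A minimal bootstrap sketch (the address is illustrative):
//
//	config := dendrite.DefaultConfig("127.0.0.1:5000")
//	transport, _ := dendrite.InitZMQTransport(config.Hostname, 5 * time.Second, nil)
//	ring, err := dendrite.CreateRing(config, transport)
//
// JoinRing below takes the same arguments plus the address of an existing node.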
295 | func CreateRing(config *Config, transport Transport) (*Ring, error) { 296 | // initialize the ring and sort vnodes 297 | r := &Ring{} 298 | r.init(config, transport) 299 | 300 | // for each vnode, setup local successors 301 | r.setLocalSuccessors() 302 | 303 | // schedule vnode stabilizers 304 | r.schedule() 305 | 306 | return r, nil 307 | } 308 | 309 | // JoinRing joins existing dendrite network. 310 | func JoinRing(config *Config, transport Transport, existing string) (*Ring, error) { 311 | hosts, err := transport.ListVnodes(existing) 312 | if err != nil { 313 | return nil, err 314 | } 315 | if hosts == nil || len(hosts) == 0 { 316 | return nil, fmt.Errorf("Remote host has no vnodes registered yet") 317 | } 318 | 319 | // initialize the ring and sort vnodes 320 | r := &Ring{} 321 | r.init(config, transport) 322 | 323 | // for each vnode, get the new list of live successors from remote 324 | for _, vn := range r.vnodes { 325 | resolved := false 326 | var last_error error 327 | // go through each host until we get successor list from one of them 328 | L: 329 | for _, remote_host := range hosts { 330 | suc_pos := 0 331 | succs, err := transport.FindSuccessors(remote_host, config.NumSuccessors, vn.Id) 332 | if err != nil { 333 | last_error = err 334 | continue L 335 | } 336 | if succs == nil || len(succs) == 0 { 337 | //return nil, fmt.Errorf("Failed to find successors for vnode, got empty list") 338 | last_error = fmt.Errorf("Failed to find successors for vnode, got empty list") 339 | continue L 340 | } 341 | SL: 342 | for _, s := range succs { 343 | // if we're rejoining before failure is detected.. s could be us 344 | if bytes.Compare(vn.Id, s.Id) == 0 { 345 | continue SL 346 | } 347 | if s == nil { 348 | break SL 349 | } 350 | vn.successors[suc_pos] = s 351 | suc_pos += 1 352 | } 353 | resolved = true 354 | break L 355 | } 356 | if !resolved { 357 | return nil, fmt.Errorf("Exhausted all remote vnodes while trying to get the list of successors. Last error: %s", last_error.Error()) 358 | } 359 | 360 | } 361 | r.transport.Ping(&Vnode{Host: existing}) 362 | 363 | // We can now initiate stabilization protocol 364 | for _, vn := range r.vnodes { 365 | vn.stabilize() 366 | } 367 | return r, nil 368 | } 369 | 370 | // RegisterDelegateHook registers DelegateHook for emitting ring events. 371 | func (r *Ring) RegisterDelegateHook(dh DelegateHook) { 372 | r.delegateHooks = append(r.delegateHooks, dh) 373 | } 374 | 375 | type RingEventType int 376 | 377 | var ( 378 | EvPredecessorJoined RingEventType = 1 379 | EvPredecessorLeft RingEventType = 2 380 | EvReplicasChanged RingEventType = 3 381 | ) 382 | 383 | // EventCtx is a generic struct representing an event. Instance of EventCtx is emitted to DelegateHooks. 384 | type EventCtx struct { 385 | EvType RingEventType 386 | Target *Vnode 387 | PrimaryItem *Vnode 388 | SecondaryItem *Vnode 389 | ItemList []*Vnode 390 | ResponseCh chan interface{} 391 | } 392 | 393 | // emit emits EventCtx to all registered DelegateHooks. 
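// A DelegateHook that consumes these events could look like the following
// sketch (written from a caller's perspective; the type name is illustrative):
//
//	type eventLogger struct{}
//
//	func (h *eventLogger) EmitEvent(ctx *dendrite.EventCtx) {
//		if ctx.EvType == dendrite.EvReplicasChanged {
//			log.Println("replica set changed for vnode", ctx.Target.String())
//		}
//	}
//
//	ring.RegisterDelegateHook(&eventLogger{})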
394 | func (r *Ring) emit(ctx *EventCtx) { 395 | for _, dh := range r.delegateHooks { 396 | go dh.EmitEvent(ctx) 397 | } 398 | } 399 | -------------------------------------------------------------------------------- /vnode.go: -------------------------------------------------------------------------------- 1 | package dendrite 2 | 3 | import ( 4 | "bytes" 5 | "crypto/sha1" 6 | "encoding/binary" 7 | "fmt" 8 | "github.com/golang/protobuf/proto" 9 | "log" 10 | "sync" 11 | "time" 12 | ) 13 | 14 | // Vnode is basic virtual node structure. 15 | type Vnode struct { 16 | Id []byte 17 | Host string // ip:port 18 | } 19 | 20 | // String returns string representation (hex encoded) of vnode's Id. 21 | func (vn *Vnode) String() string { 22 | return fmt.Sprintf("%x", vn.Id) 23 | } 24 | 25 | // ToProtobuf is a helper method which returns PBProtoVnode message from a *Vnode. 26 | func (vn *Vnode) ToProtobuf() *PBProtoVnode { 27 | return &PBProtoVnode{ 28 | Host: proto.String(vn.Host), 29 | Id: vn.Id, 30 | } 31 | } 32 | 33 | // VnodeFromProtobuf is helper method that creates *Vnode from PBProtoVnode message. 34 | func VnodeFromProtobuf(pb *PBProtoVnode) *Vnode { 35 | return &Vnode{ 36 | Id: pb.GetId(), 37 | Host: pb.GetHost(), 38 | } 39 | } 40 | 41 | // localVnode inherits Vnode and adds additional fields. 42 | type localVnode struct { 43 | Vnode 44 | ring *Ring 45 | successors []*Vnode // "backlog" of known successors 46 | remote_successors []*Vnode 47 | finger []*Vnode 48 | last_finger int 49 | predecessor *Vnode 50 | old_predecessor *Vnode 51 | stabilized time.Time 52 | timer *time.Timer 53 | delegateMux sync.Mutex 54 | } 55 | 56 | // init initializes a localVnode. 57 | func (vn *localVnode) init(idx int) { 58 | // combine hostname with idx to generate hash 59 | hash := sha1.New() 60 | hash.Write([]byte(vn.ring.config.Hostname)) 61 | binary.Write(hash, binary.BigEndian, uint16(idx)) 62 | vn.Id = hash.Sum(nil) 63 | vn.Host = vn.ring.config.Hostname 64 | vn.successors = make([]*Vnode, vn.ring.config.NumSuccessors) 65 | vn.remote_successors = make([]*Vnode, vn.ring.config.Replicas) 66 | vn.finger = make([]*Vnode, 160) // keyspace size is 160 with SHA1 67 | vn.ring.transport.Register(&vn.Vnode, vn) 68 | } 69 | 70 | // schedule schedules vnode's stabilize(). 71 | func (vn *localVnode) schedule() { 72 | // Setup our stabilize timer 73 | vn.timer = time.AfterFunc(randStabilize(vn.ring.config), vn.stabilize) 74 | } 75 | 76 | // stabilize is part of Chord Protocol. It is used to position a vnode inside of the ring and handle changes. 
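// Each pass checks for a new successor, notifies the current successor,
// refreshes the finger table and verifies the predecessor, then reschedules
// itself via the deferred schedule() call.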
77 | func (vn *localVnode) stabilize() { 78 | defer vn.schedule() 79 | 80 | start := time.Now() 81 | if err := vn.checkNewSuccessor(); err != nil { 82 | vn.ring.Logln(LogDebug, "stabilize() - error checking successor:", err) 83 | } 84 | //log.Printf("CheckSucc returned for %X - %X\n", vn.Id, vn.successors[0].Id) 85 | 86 | // Notify the successor 87 | if err := vn.notifySuccessor(); err != nil { 88 | vn.ring.Logln(LogDebug, "stabilize() - error notifying successor:", err) 89 | } 90 | //log.Printf("NotifySucc returned for %X\n", vn.Id) 91 | 92 | if err := vn.fixFingerTable(); err != nil { 93 | vn.ring.Logln(LogDebug, "stabilize() - error fixing finger table, last:", time.Since(start), vn.last_finger, err) 94 | } 95 | 96 | if err := vn.checkPredecessor(); err != nil { 97 | vn.ring.Logf(LogInfo, "stabilize() predecessor failed for %X - %s\n", vn.Id, err) 98 | } 99 | //log.Println("[stabilize] completed in", time.Since(start)) 100 | } 101 | 102 | // closest_preceeding_finger finds closest preceeding Vnode for given id, by using finger table and local successor list. 103 | func (vn *localVnode) closest_preceeding_finger(id []byte) *Vnode { 104 | var finger_node, successor_node *Vnode 105 | 106 | // loop through finger table, keysize(i) down to 1 107 | for i := vn.last_finger; i >= 0; i-- { 108 | if vn.finger[i] == nil { 109 | continue 110 | } 111 | // check if id falls after this finger (finger[i] IN (n, id)) 112 | if between(vn.Id, id, vn.finger[i].Id, false) { 113 | finger_node = vn.finger[i] 114 | break 115 | } 116 | } 117 | 118 | // loop through successors 119 | for i := len(vn.successors) - 1; i >= 1; i-- { 120 | if vn.successors[i] == nil { 121 | continue 122 | } 123 | if between(vn.Id, id, vn.successors[i].Id, false) { 124 | successor_node = vn.successors[i] 125 | break 126 | } 127 | } 128 | 129 | // return the best result 130 | if finger_node == nil { 131 | if successor_node == nil { 132 | return &vn.Vnode 133 | } 134 | return successor_node 135 | } 136 | if successor_node == nil { 137 | return finger_node 138 | } 139 | 140 | finger_dist := distance(vn.Id, finger_node.Id) 141 | successor_dist := distance(vn.Id, successor_node.Id) 142 | if finger_dist.Cmp(successor_dist) <= 0 { 143 | return successor_node 144 | } else { 145 | return finger_node 146 | } 147 | return nil 148 | } 149 | 150 | // checkNewSuccessor checks if there's new successor ahead. 
151 | func (vn *localVnode) checkNewSuccessor() error { 152 | update_remotes := false 153 | for { 154 | if vn.successors[0] == nil { 155 | log.Fatal("Node has no more successors :(") 156 | } 157 | // Ask our successor for it's predecessor 158 | maybe_suc, err := vn.ring.transport.GetPredecessor(vn.successors[0]) 159 | if err != nil { 160 | vn.ring.Logln(LogDebug, "stabilize::checkNewSuccessor() trying next known successor due to error:", err) 161 | copy(vn.successors[0:], vn.successors[1:]) 162 | update_remotes = true 163 | continue 164 | } 165 | 166 | if maybe_suc != nil && between(vn.Id, vn.successors[0].Id, maybe_suc.Id, false) { 167 | alive, _ := vn.ring.transport.Ping(maybe_suc) 168 | if alive { 169 | copy(vn.successors[1:], vn.successors[0:len(vn.successors)-1]) 170 | vn.successors[0] = maybe_suc 171 | update_remotes = true 172 | vn.ring.Logf(LogInfo, "stabilize::checkNewSuccessor() - new successor set: %X -> %X\n", vn.Id, maybe_suc.Id) 173 | } else { 174 | // skip this one, it's not alive 175 | //log.Println("[stabilize] new successor found, but it's not alive") 176 | } 177 | break 178 | } else { 179 | // we're good for now, checkPredcessor should fix this (maybe_suc is nil) 180 | break 181 | } 182 | } 183 | // while we're here, ping other successors to make sure they're alive 184 | vn.fixLiveSuccessors() 185 | 186 | // update remote successors if our list changed 187 | if update_remotes { 188 | vn.updateRemoteSuccessors() 189 | } 190 | return nil 191 | } 192 | 193 | // fixLiveSuccessors pings all locally known successors and returns new list of active ones. 194 | func (vn *localVnode) fixLiveSuccessors() { 195 | live_successors := make([]*Vnode, vn.ring.config.NumSuccessors) 196 | real_idx := 0 197 | for _, succ := range vn.successors { 198 | if succ == nil { 199 | continue 200 | } 201 | if alive, _ := vn.ring.transport.Ping(succ); alive { 202 | live_successors[real_idx] = succ 203 | real_idx++ 204 | } 205 | } 206 | vn.successors = live_successors 207 | } 208 | 209 | // notifySuccessor notifies our successor of us, and updates successor list. 210 | func (vn *localVnode) notifySuccessor() error { 211 | old_successors := make([]*Vnode, len(vn.successors)) 212 | copy(old_successors, vn.successors) 213 | // Notify successor 214 | succ := vn.successors[0] 215 | succ_list, err := vn.ring.transport.Notify(succ, &vn.Vnode) 216 | if err != nil { 217 | return err 218 | } 219 | 220 | // Trim the successors list if too long 221 | max_succ := vn.ring.config.NumSuccessors 222 | if len(succ_list) > max_succ-1 { 223 | succ_list = succ_list[:max_succ-1] 224 | } 225 | 226 | // Update local successors list 227 | for idx, s := range succ_list { 228 | if s == nil { 229 | break 230 | } 231 | // Ensure we don't set ourselves as a successor! 232 | if s == nil || s.String() == vn.String() { 233 | break 234 | } 235 | //fmt.Printf("Updating successor from notifySuccessor(), %X -> %X\n", vn.Id, s.Id) 236 | vn.successors[idx+1] = s 237 | } 238 | // remove inactive successors 239 | vn.fixLiveSuccessors() 240 | 241 | // lets see if our successor list changed 242 | for idx, new_succ := range vn.successors { 243 | if (new_succ == nil && old_successors[idx] != nil) || 244 | (new_succ != nil && old_successors[idx] == nil) { 245 | vn.updateRemoteSuccessors() 246 | break 247 | } 248 | if new_succ == nil && old_successors[idx] == nil { 249 | continue 250 | } 251 | if bytes.Compare(new_succ.Id, old_successors[idx].Id) != 0 { 252 | // changed! 
we should update our remotes now 253 | vn.updateRemoteSuccessors() 254 | break 255 | } 256 | } 257 | return nil 258 | } 259 | 260 | // checkPredecessor checks the health of vnode's predecessor. 261 | func (vn *localVnode) checkPredecessor() error { 262 | // Check predecessor 263 | if vn.predecessor != nil { 264 | ok, err := vn.ring.transport.Ping(vn.predecessor) 265 | if err != nil || !ok { 266 | vn.ring.Logln(LogInfo, "stabilize::checkPredecessor() - detected predecessor failure") 267 | vn.old_predecessor = vn.predecessor 268 | vn.predecessor = nil 269 | return err 270 | } 271 | } 272 | return nil 273 | } 274 | 275 | // fixFingerTable updates finger table. 276 | func (vn *localVnode) fixFingerTable() error { 277 | //log.Printf("Starting fixFingerTable, %X - %X\n", vn.Id, vn.successors[0].Id) 278 | idx := 0 279 | self := &vn.Vnode 280 | for i := 0; i < 160; i++ { 281 | offset := powerOffset(self.Id, i, 160) 282 | //log.Printf("\t\tidx: %d: %X\n", i, offset) 283 | succs, err := vn.ring.transport.FindSuccessors(self, 1, offset) 284 | if err != nil { 285 | vn.last_finger = idx 286 | return err 287 | } 288 | if succs == nil || len(succs) == 0 { 289 | vn.last_finger = idx 290 | return fmt.Errorf("no successors found for key") 291 | } 292 | // see if we already have this node, keeps finger table short 293 | if idx > 0 && bytes.Compare(vn.finger[vn.last_finger].Id, succs[0].Id) == 0 { 294 | continue 295 | } 296 | // don't set ourselves as finger 297 | if bytes.Compare(succs[0].Id, vn.Id) == 0 { 298 | //log.Printf("\t\t\t GOT OURSELVES BACK.. HOW????, skipping\n") 299 | break 300 | } 301 | vn.finger[idx] = succs[0] 302 | vn.last_finger = idx 303 | idx += 1 304 | //log.Printf("\t\t\t set id: %X\n", succs[0].Id) 305 | } 306 | return nil 307 | } 308 | 309 | // updateRemoteSuccessors finds immediate but remote successors. It is used to form replica nodes. 310 | func (vn *localVnode) updateRemoteSuccessors() { 311 | old_remotes := make([]*Vnode, vn.ring.Replicas()) 312 | copy(old_remotes, vn.remote_successors) 313 | 314 | remotes, _ := vn.findRemoteSuccessors(vn.ring.Replicas()) 315 | changed := false 316 | for idx, remote := range remotes { 317 | if remote != nil && old_remotes[idx] != nil { 318 | if bytes.Compare(remote.Id, old_remotes[idx].Id) != 0 { 319 | if alive, _ := vn.ring.transport.Ping(remote); alive { 320 | vn.remote_successors[idx] = remote 321 | changed = true 322 | } 323 | } 324 | } else if remote == nil && old_remotes[idx] != nil { 325 | vn.remote_successors[idx] = remote 326 | changed = true 327 | } else if remote != nil && old_remotes[idx] == nil { 328 | if alive, _ := vn.ring.transport.Ping(remote); alive { 329 | vn.remote_successors[idx] = remote 330 | changed = true 331 | } 332 | } else { 333 | // we're good 334 | } 335 | } 336 | if changed { 337 | vn.ring.Logf(LogDebug, "updateRemoteSuccessors() - updated on: %s: %+v", vn.String(), vn.remote_successors) 338 | ctx := &EventCtx{ 339 | EvType: EvReplicasChanged, 340 | Target: &vn.Vnode, 341 | ItemList: vn.remote_successors, 342 | } 343 | vn.ring.emit(ctx) 344 | } 345 | } 346 | 347 | /* 348 | findRemoteSuccessors returns up to 'limit' successor vnodes that are unique and 349 | do not reside on same physical node as calling vnode. 
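The search first walks the locally known successor list, then keeps forwarding
through the last seen ("pivot") successor until the limit is reached, an error
occurs, or an already seen vnode comes back (loop detected).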
350 | */ 351 | func (vn *localVnode) findRemoteSuccessors(limit int) ([]*Vnode, error) { 352 | remote_succs := make([]*Vnode, limit) 353 | seen_vnodes := make(map[string]bool) 354 | seen_hosts := make(map[string]bool) 355 | seen_vnodes[vn.String()] = true 356 | seen_hosts[vn.Host] = true 357 | var pivot_succ *Vnode 358 | num_appended := 0 359 | next_pos := 0 360 | 361 | for _, succ := range vn.successors { 362 | if num_appended == limit { 363 | return remote_succs, nil 364 | } 365 | if succ == nil { 366 | continue 367 | } 368 | seen_vnodes[succ.String()] = true 369 | pivot_succ = succ 370 | if _, ok := seen_hosts[succ.Host]; ok { 371 | continue 372 | } 373 | if succ.Host == vn.Host { 374 | continue 375 | } 376 | // make sure host is alive 377 | if alive, _ := vn.ring.transport.Ping(succ); alive { 378 | seen_hosts[succ.Host] = true 379 | remote_succs[next_pos] = succ 380 | next_pos++ 381 | num_appended++ 382 | } 383 | } 384 | 385 | // forward through pivot successor until we reach the limit or detect loopback 386 | for { 387 | if num_appended == limit { 388 | return remote_succs, nil 389 | } 390 | if pivot_succ == nil { 391 | return remote_succs, nil 392 | } 393 | next_successors, err := vn.ring.transport.FindSuccessors(pivot_succ, vn.ring.config.NumSuccessors, pivot_succ.Id) 394 | if err != nil { 395 | //vn.ring.Logln(LogDebug, "Pivot successor returned error, returning what we have so far.") 396 | return remote_succs, nil 397 | } 398 | for _, succ := range next_successors { 399 | if num_appended == limit { 400 | return remote_succs, nil 401 | } 402 | if succ == nil { 403 | continue 404 | } 405 | if _, ok := seen_vnodes[succ.String()]; ok { 406 | // loop detected, must return 407 | return remote_succs, nil 408 | } 409 | seen_vnodes[succ.String()] = true 410 | pivot_succ = succ 411 | if _, ok := seen_hosts[succ.Host]; ok { 412 | // we have this host already 413 | continue 414 | } 415 | if alive, _ := vn.ring.transport.Ping(succ); alive { 416 | seen_hosts[succ.Host] = true 417 | remote_succs[next_pos] = succ 418 | next_pos++ 419 | num_appended++ 420 | } 421 | } 422 | } 423 | return remote_succs, nil 424 | } 425 | -------------------------------------------------------------------------------- /dtable/dtable.pb.go: -------------------------------------------------------------------------------- 1 | package dtable 2 | 3 | import proto "github.com/golang/protobuf/proto" 4 | import math "math" 5 | import dendrite "github.com/fastfn/dendrite" 6 | 7 | // Reference imports to suppress errors if they are not otherwise used. 8 | var _ = proto.Marshal 9 | var _ = math.Inf 10 | 11 | // PBDTableResponse is a generic response structure with error indication. 12 | type PBDTableResponse struct { 13 | Ok *bool `protobuf:"varint,1,req,name=ok" json:"ok,omitempty"` 14 | Error *string `protobuf:"bytes,2,opt,name=error" json:"error,omitempty"` 15 | XXX_unrecognized []byte `json:"-"` 16 | } 17 | 18 | func (m *PBDTableResponse) Reset() { *m = PBDTableResponse{} } 19 | func (m *PBDTableResponse) String() string { return proto.CompactTextString(m) } 20 | func (*PBDTableResponse) ProtoMessage() {} 21 | 22 | func (m *PBDTableResponse) GetOk() bool { 23 | if m != nil && m.Ok != nil { 24 | return *m.Ok 25 | } 26 | return false 27 | } 28 | 29 | func (m *PBDTableResponse) GetError() string { 30 | if m != nil && m.Error != nil { 31 | return *m.Error 32 | } 33 | return "" 34 | } 35 | 36 | // PBDTableStatus is a message to request the status of remote vnode. 
37 | type PBDTableStatus struct { 38 | Dest *dendrite.PBProtoVnode `protobuf:"bytes,1,req,name=dest" json:"dest,omitempty"` 39 | XXX_unrecognized []byte `json:"-"` 40 | } 41 | 42 | func (m *PBDTableStatus) Reset() { *m = PBDTableStatus{} } 43 | func (m *PBDTableStatus) String() string { return proto.CompactTextString(m) } 44 | func (*PBDTableStatus) ProtoMessage() {} 45 | 46 | func (m *PBDTableStatus) GetDest() *dendrite.PBProtoVnode { 47 | if m != nil { 48 | return m.Dest 49 | } 50 | return nil 51 | } 52 | 53 | // PBDTableReplicaInfo message represents kvItem's replicaInfo structure. 54 | type PBDTableReplicaInfo struct { 55 | Master *dendrite.PBProtoVnode `protobuf:"bytes,1,opt,name=master" json:"master,omitempty"` 56 | Vnodes []*dendrite.PBProtoVnode `protobuf:"bytes,2,rep,name=vnodes" json:"vnodes,omitempty"` 57 | OrphanVnodes []*dendrite.PBProtoVnode `protobuf:"bytes,3,rep,name=orphanVnodes" json:"orphanVnodes,omitempty"` 58 | State *int32 `protobuf:"varint,4,opt,name=state" json:"state,omitempty"` 59 | Depth *int32 `protobuf:"varint,5,opt,name=depth" json:"depth,omitempty"` 60 | XXX_unrecognized []byte `json:"-"` 61 | } 62 | 63 | func (m *PBDTableReplicaInfo) Reset() { *m = PBDTableReplicaInfo{} } 64 | func (m *PBDTableReplicaInfo) String() string { return proto.CompactTextString(m) } 65 | func (*PBDTableReplicaInfo) ProtoMessage() {} 66 | 67 | func (m *PBDTableReplicaInfo) GetMaster() *dendrite.PBProtoVnode { 68 | if m != nil { 69 | return m.Master 70 | } 71 | return nil 72 | } 73 | 74 | func (m *PBDTableReplicaInfo) GetVnodes() []*dendrite.PBProtoVnode { 75 | if m != nil { 76 | return m.Vnodes 77 | } 78 | return nil 79 | } 80 | 81 | func (m *PBDTableReplicaInfo) GetOrphanVnodes() []*dendrite.PBProtoVnode { 82 | if m != nil { 83 | return m.OrphanVnodes 84 | } 85 | return nil 86 | } 87 | 88 | func (m *PBDTableReplicaInfo) GetState() int32 { 89 | if m != nil && m.State != nil { 90 | return *m.State 91 | } 92 | return 0 93 | } 94 | 95 | func (m *PBDTableReplicaInfo) GetDepth() int32 { 96 | if m != nil && m.Depth != nil { 97 | return *m.Depth 98 | } 99 | return 0 100 | } 101 | 102 | // PBDTableItem message represents kvItem's structure. 
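// Illustrative only, not part of the generated file: optional scalar fields in these
// generated structs are pointers, so callers typically build messages with the proto
// helper functions, e.g. (hypothetical key and value):
//
//	item := &PBDTableItem{
//		Key:       []byte("user:42"),
//		Val:       []byte("some value"),
//		Timestamp: proto.Int64(time.Now().UnixNano()),
//		Commited:  proto.Bool(true),
//	}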
103 | type PBDTableItem struct { 104 | Key []byte `protobuf:"bytes,1,opt,name=key" json:"key,omitempty"` 105 | Val []byte `protobuf:"bytes,2,opt,name=val" json:"val,omitempty"` 106 | Timestamp *int64 `protobuf:"varint,3,opt,name=timestamp" json:"timestamp,omitempty"` 107 | Commited *bool `protobuf:"varint,4,opt,name=commited" json:"commited,omitempty"` 108 | KeyHash []byte `protobuf:"bytes,5,opt,name=keyHash" json:"keyHash,omitempty"` 109 | ReplicaInfo *PBDTableReplicaInfo `protobuf:"bytes,6,opt,name=replicaInfo" json:"replicaInfo,omitempty"` 110 | Origin *dendrite.PBProtoVnode `protobuf:"bytes,7,opt,name=origin" json:"origin,omitempty"` 111 | Found *bool `protobuf:"varint,8,opt,name=found" json:"found,omitempty"` 112 | XXX_unrecognized []byte `json:"-"` 113 | } 114 | 115 | func (m *PBDTableItem) Reset() { *m = PBDTableItem{} } 116 | func (m *PBDTableItem) String() string { return proto.CompactTextString(m) } 117 | func (*PBDTableItem) ProtoMessage() {} 118 | 119 | func (m *PBDTableItem) GetKey() []byte { 120 | if m != nil { 121 | return m.Key 122 | } 123 | return nil 124 | } 125 | 126 | func (m *PBDTableItem) GetVal() []byte { 127 | if m != nil { 128 | return m.Val 129 | } 130 | return nil 131 | } 132 | 133 | func (m *PBDTableItem) GetTimestamp() int64 { 134 | if m != nil && m.Timestamp != nil { 135 | return *m.Timestamp 136 | } 137 | return 0 138 | } 139 | 140 | func (m *PBDTableItem) GetCommited() bool { 141 | if m != nil && m.Commited != nil { 142 | return *m.Commited 143 | } 144 | return false 145 | } 146 | 147 | func (m *PBDTableItem) GetKeyHash() []byte { 148 | if m != nil { 149 | return m.KeyHash 150 | } 151 | return nil 152 | } 153 | 154 | func (m *PBDTableItem) GetReplicaInfo() *PBDTableReplicaInfo { 155 | if m != nil { 156 | return m.ReplicaInfo 157 | } 158 | return nil 159 | } 160 | 161 | func (m *PBDTableItem) GetOrigin() *dendrite.PBProtoVnode { 162 | if m != nil { 163 | return m.Origin 164 | } 165 | return nil 166 | } 167 | 168 | func (m *PBDTableItem) GetFound() bool { 169 | if m != nil && m.Found != nil { 170 | return *m.Found 171 | } 172 | return false 173 | } 174 | 175 | // PBDTableDemotedItem message represents demotedItem's structure. 176 | type PBDTableDemotedItem struct { 177 | Dest *dendrite.PBProtoVnode `protobuf:"bytes,1,req,name=dest" json:"dest,omitempty"` 178 | Item *PBDTableItem `protobuf:"bytes,2,req,name=item" json:"item,omitempty"` 179 | Origin *dendrite.PBProtoVnode `protobuf:"bytes,3,opt,name=origin" json:"origin,omitempty"` 180 | XXX_unrecognized []byte `json:"-"` 181 | } 182 | 183 | func (m *PBDTableDemotedItem) Reset() { *m = PBDTableDemotedItem{} } 184 | func (m *PBDTableDemotedItem) String() string { return proto.CompactTextString(m) } 185 | func (*PBDTableDemotedItem) ProtoMessage() {} 186 | 187 | func (m *PBDTableDemotedItem) GetDest() *dendrite.PBProtoVnode { 188 | if m != nil { 189 | return m.Dest 190 | } 191 | return nil 192 | } 193 | 194 | func (m *PBDTableDemotedItem) GetItem() *PBDTableItem { 195 | if m != nil { 196 | return m.Item 197 | } 198 | return nil 199 | } 200 | 201 | func (m *PBDTableDemotedItem) GetOrigin() *dendrite.PBProtoVnode { 202 | if m != nil { 203 | return m.Origin 204 | } 205 | return nil 206 | } 207 | 208 | // PBDTableMultiItemResponse is a response message used to send multiple kvItems to the caller. 
209 | type PBDTableMultiItemResponse struct { 210 | Origin *dendrite.PBProtoVnode `protobuf:"bytes,1,opt,name=origin" json:"origin,omitempty"` 211 | Items []*PBDTableItem `protobuf:"bytes,2,rep,name=items" json:"items,omitempty"` 212 | XXX_unrecognized []byte `json:"-"` 213 | } 214 | 215 | func (m *PBDTableMultiItemResponse) Reset() { *m = PBDTableMultiItemResponse{} } 216 | func (m *PBDTableMultiItemResponse) String() string { return proto.CompactTextString(m) } 217 | func (*PBDTableMultiItemResponse) ProtoMessage() {} 218 | 219 | func (m *PBDTableMultiItemResponse) GetOrigin() *dendrite.PBProtoVnode { 220 | if m != nil { 221 | return m.Origin 222 | } 223 | return nil 224 | } 225 | 226 | func (m *PBDTableMultiItemResponse) GetItems() []*PBDTableItem { 227 | if m != nil { 228 | return m.Items 229 | } 230 | return nil 231 | } 232 | 233 | // PBDTableGetItem is a request message used to get an item from remote vnode. 234 | type PBDTableGetItem struct { 235 | Dest *dendrite.PBProtoVnode `protobuf:"bytes,1,req,name=dest" json:"dest,omitempty"` 236 | KeyHash []byte `protobuf:"bytes,2,req,name=keyHash" json:"keyHash,omitempty"` 237 | Origin *dendrite.PBProtoVnode `protobuf:"bytes,3,opt,name=origin" json:"origin,omitempty"` 238 | XXX_unrecognized []byte `json:"-"` 239 | } 240 | 241 | func (m *PBDTableGetItem) Reset() { *m = PBDTableGetItem{} } 242 | func (m *PBDTableGetItem) String() string { return proto.CompactTextString(m) } 243 | func (*PBDTableGetItem) ProtoMessage() {} 244 | 245 | func (m *PBDTableGetItem) GetDest() *dendrite.PBProtoVnode { 246 | if m != nil { 247 | return m.Dest 248 | } 249 | return nil 250 | } 251 | 252 | func (m *PBDTableGetItem) GetKeyHash() []byte { 253 | if m != nil { 254 | return m.KeyHash 255 | } 256 | return nil 257 | } 258 | 259 | func (m *PBDTableGetItem) GetOrigin() *dendrite.PBProtoVnode { 260 | if m != nil { 261 | return m.Origin 262 | } 263 | return nil 264 | } 265 | 266 | // PBDTableSetItem is a request message used to set an item to remote vnode. 
267 | type PBDTableSetItem struct { 268 | Dest *dendrite.PBProtoVnode `protobuf:"bytes,1,req,name=dest" json:"dest,omitempty"` 269 | Item *PBDTableItem `protobuf:"bytes,2,req,name=item" json:"item,omitempty"` 270 | Origin *dendrite.PBProtoVnode `protobuf:"bytes,3,opt,name=origin" json:"origin,omitempty"` 271 | Demoting *bool `protobuf:"varint,4,opt,name=demoting" json:"demoting,omitempty"` 272 | MinAcks *int32 `protobuf:"varint,5,opt,name=minAcks" json:"minAcks,omitempty"` 273 | XXX_unrecognized []byte `json:"-"` 274 | } 275 | 276 | func (m *PBDTableSetItem) Reset() { *m = PBDTableSetItem{} } 277 | func (m *PBDTableSetItem) String() string { return proto.CompactTextString(m) } 278 | func (*PBDTableSetItem) ProtoMessage() {} 279 | 280 | func (m *PBDTableSetItem) GetDest() *dendrite.PBProtoVnode { 281 | if m != nil { 282 | return m.Dest 283 | } 284 | return nil 285 | } 286 | 287 | func (m *PBDTableSetItem) GetItem() *PBDTableItem { 288 | if m != nil { 289 | return m.Item 290 | } 291 | return nil 292 | } 293 | 294 | func (m *PBDTableSetItem) GetOrigin() *dendrite.PBProtoVnode { 295 | if m != nil { 296 | return m.Origin 297 | } 298 | return nil 299 | } 300 | 301 | func (m *PBDTableSetItem) GetDemoting() bool { 302 | if m != nil && m.Demoting != nil { 303 | return *m.Demoting 304 | } 305 | return false 306 | } 307 | 308 | func (m *PBDTableSetItem) GetMinAcks() int32 { 309 | if m != nil && m.MinAcks != nil { 310 | return *m.MinAcks 311 | } 312 | return 0 313 | } 314 | 315 | // PBDTableSetMultiItem is a request message used to set multiple items on remote vnode. 316 | type PBDTableSetMultiItem struct { 317 | Dest *dendrite.PBProtoVnode `protobuf:"bytes,1,req,name=dest" json:"dest,omitempty"` 318 | Origin *dendrite.PBProtoVnode `protobuf:"bytes,2,opt,name=origin" json:"origin,omitempty"` 319 | Items []*PBDTableItem `protobuf:"bytes,3,rep,name=items" json:"items,omitempty"` 320 | XXX_unrecognized []byte `json:"-"` 321 | } 322 | 323 | func (m *PBDTableSetMultiItem) Reset() { *m = PBDTableSetMultiItem{} } 324 | func (m *PBDTableSetMultiItem) String() string { return proto.CompactTextString(m) } 325 | func (*PBDTableSetMultiItem) ProtoMessage() {} 326 | 327 | func (m *PBDTableSetMultiItem) GetDest() *dendrite.PBProtoVnode { 328 | if m != nil { 329 | return m.Dest 330 | } 331 | return nil 332 | } 333 | 334 | func (m *PBDTableSetMultiItem) GetOrigin() *dendrite.PBProtoVnode { 335 | if m != nil { 336 | return m.Origin 337 | } 338 | return nil 339 | } 340 | 341 | func (m *PBDTableSetMultiItem) GetItems() []*PBDTableItem { 342 | if m != nil { 343 | return m.Items 344 | } 345 | return nil 346 | } 347 | 348 | // PBDTableClearReplica is a request message used to remove replicated item from remote vnode. 
349 | type PBDTableClearReplica struct { 350 | Dest *dendrite.PBProtoVnode `protobuf:"bytes,1,req,name=dest" json:"dest,omitempty"` 351 | KeyHash []byte `protobuf:"bytes,2,req,name=keyHash" json:"keyHash,omitempty"` 352 | Demoted *bool `protobuf:"varint,3,req,name=demoted" json:"demoted,omitempty"` 353 | Origin *dendrite.PBProtoVnode `protobuf:"bytes,4,opt,name=origin" json:"origin,omitempty"` 354 | XXX_unrecognized []byte `json:"-"` 355 | } 356 | 357 | func (m *PBDTableClearReplica) Reset() { *m = PBDTableClearReplica{} } 358 | func (m *PBDTableClearReplica) String() string { return proto.CompactTextString(m) } 359 | func (*PBDTableClearReplica) ProtoMessage() {} 360 | 361 | func (m *PBDTableClearReplica) GetDest() *dendrite.PBProtoVnode { 362 | if m != nil { 363 | return m.Dest 364 | } 365 | return nil 366 | } 367 | 368 | func (m *PBDTableClearReplica) GetKeyHash() []byte { 369 | if m != nil { 370 | return m.KeyHash 371 | } 372 | return nil 373 | } 374 | 375 | func (m *PBDTableClearReplica) GetDemoted() bool { 376 | if m != nil && m.Demoted != nil { 377 | return *m.Demoted 378 | } 379 | return false 380 | } 381 | 382 | func (m *PBDTableClearReplica) GetOrigin() *dendrite.PBProtoVnode { 383 | if m != nil { 384 | return m.Origin 385 | } 386 | return nil 387 | } 388 | 389 | // PBDTableSetReplicaInfo is a request message used to update metadata for replicated item on remote vnode. 390 | type PBDTableSetReplicaInfo struct { 391 | Dest *dendrite.PBProtoVnode `protobuf:"bytes,1,req,name=dest" json:"dest,omitempty"` 392 | KeyHash []byte `protobuf:"bytes,2,req,name=keyHash" json:"keyHash,omitempty"` 393 | ReplicaInfo *PBDTableReplicaInfo `protobuf:"bytes,3,req,name=replicaInfo" json:"replicaInfo,omitempty"` 394 | Origin *dendrite.PBProtoVnode `protobuf:"bytes,4,opt,name=origin" json:"origin,omitempty"` 395 | XXX_unrecognized []byte `json:"-"` 396 | } 397 | 398 | func (m *PBDTableSetReplicaInfo) Reset() { *m = PBDTableSetReplicaInfo{} } 399 | func (m *PBDTableSetReplicaInfo) String() string { return proto.CompactTextString(m) } 400 | func (*PBDTableSetReplicaInfo) ProtoMessage() {} 401 | 402 | func (m *PBDTableSetReplicaInfo) GetDest() *dendrite.PBProtoVnode { 403 | if m != nil { 404 | return m.Dest 405 | } 406 | return nil 407 | } 408 | 409 | func (m *PBDTableSetReplicaInfo) GetKeyHash() []byte { 410 | if m != nil { 411 | return m.KeyHash 412 | } 413 | return nil 414 | } 415 | 416 | func (m *PBDTableSetReplicaInfo) GetReplicaInfo() *PBDTableReplicaInfo { 417 | if m != nil { 418 | return m.ReplicaInfo 419 | } 420 | return nil 421 | } 422 | 423 | func (m *PBDTableSetReplicaInfo) GetOrigin() *dendrite.PBProtoVnode { 424 | if m != nil { 425 | return m.Origin 426 | } 427 | return nil 428 | } 429 | 430 | // PBDTablePromoteKey is a request message used to request a promotion of a key on the remote vnode. 
431 | type PBDTablePromoteKey struct { 432 | Dest *dendrite.PBProtoVnode `protobuf:"bytes,1,req,name=dest" json:"dest,omitempty"` 433 | Item *PBDTableItem `protobuf:"bytes,2,req,name=item" json:"item,omitempty"` 434 | Origin *dendrite.PBProtoVnode `protobuf:"bytes,3,opt,name=origin" json:"origin,omitempty"` 435 | XXX_unrecognized []byte `json:"-"` 436 | } 437 | 438 | func (m *PBDTablePromoteKey) Reset() { *m = PBDTablePromoteKey{} } 439 | func (m *PBDTablePromoteKey) String() string { return proto.CompactTextString(m) } 440 | func (*PBDTablePromoteKey) ProtoMessage() {} 441 | 442 | func (m *PBDTablePromoteKey) GetDest() *dendrite.PBProtoVnode { 443 | if m != nil { 444 | return m.Dest 445 | } 446 | return nil 447 | } 448 | 449 | func (m *PBDTablePromoteKey) GetItem() *PBDTableItem { 450 | if m != nil { 451 | return m.Item 452 | } 453 | return nil 454 | } 455 | 456 | func (m *PBDTablePromoteKey) GetOrigin() *dendrite.PBProtoVnode { 457 | if m != nil { 458 | return m.Origin 459 | } 460 | return nil 461 | } 462 | 463 | func init() { 464 | } 465 | -------------------------------------------------------------------------------- /dtable/dtable.go: -------------------------------------------------------------------------------- 1 | package dtable 2 | 3 | import ( 4 | "fmt" 5 | "github.com/fastfn/dendrite" 6 | "github.com/golang/protobuf/proto" 7 | "sync" 8 | "time" 9 | ) 10 | 11 | type replicaState int 12 | type dtableEventType int 13 | 14 | // KVItem is basic database item struct. 15 | type KVItem struct { 16 | Key []byte 17 | Val []byte 18 | } 19 | 20 | type kvReplicaInfo struct { 21 | master *dendrite.Vnode 22 | vnodes []*dendrite.Vnode 23 | orphan_vnodes []*dendrite.Vnode 24 | state replicaState 25 | depth int 26 | } 27 | 28 | type kvItem struct { 29 | KVItem 30 | timestamp time.Time 31 | commited bool 32 | keyHash []byte 33 | lock *sync.Mutex 34 | replicaInfo *kvReplicaInfo 35 | } 36 | 37 | type demotedKvItem struct { 38 | item *kvItem 39 | new_master *dendrite.Vnode 40 | demoted_ts time.Time 41 | } 42 | 43 | type dtableEvent struct { 44 | evType dtableEventType 45 | vnode *dendrite.Vnode 46 | item *kvItem 47 | } 48 | type itemMap map[string]*kvItem 49 | type demotedItemMap map[string]*demotedKvItem 50 | 51 | // DTable is main dtable struct. 
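// Layout note (a sketch inferred from Init() and itemMap.put() below, not authoritative):
// all three maps are keyed per local vnode, and each inner map is keyed by the item's
// key hash, so a primary item conceptually lives at
//
//	dt.table[fmt.Sprintf("%x", vnode.Id)][item.keyHashString()]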
52 | type DTable struct {
53 | 	// base structures
54 | 	table map[string]itemMap // primary k/v table
55 | 	rtable map[string]itemMap // rtable is table of replicas
56 | 	demoted_table map[string]demotedItemMap // demoted items
57 | 	ring *dendrite.Ring
58 | 	transport dendrite.Transport
59 | 	confLogLevel LogLevel
60 | 	// communication channels
61 | 	event_c chan *dendrite.EventCtx // dendrite sends events here
62 | 	dtable_c chan *dtableEvent // internal dtable events
63 | 	selfcheck_t *time.Ticker
64 | 	captureKeyHooks []CaptureKeyHook
65 | }
66 | 
67 | type CaptureKeyHook interface {
68 | 	CaptureKeyHandler(key []byte)
69 | }
70 | 
71 | const (
72 | 	PbDtableStatus dendrite.MsgType = 0x20 // status request to see if remote dtable is initialized
73 | 	PbDtableResponse dendrite.MsgType = 0x21 // generic response
74 | 	PbDtableItem dendrite.MsgType = 0x22 // single item response
75 | 	PbDtableMultiItemResponse dendrite.MsgType = 0x23 // response with multiple items
76 | 	PbDtableGetItem dendrite.MsgType = 0x24 // getItem request
77 | 	PbDtableSetItem dendrite.MsgType = 0x25 // setItem request
78 | 	PbDtableSetMultiItem dendrite.MsgType = 0x26 // setMultiItem request
79 | 	PbDtableClearReplica dendrite.MsgType = 0x27 // clearReplica request
80 | 	PbDtableSetReplica dendrite.MsgType = 0x28 // setReplica request
81 | 	PbDtableSetReplicaInfo dendrite.MsgType = 0x29 // setReplicaInfo request
82 | 	PbDtablePromoteKey dendrite.MsgType = 0x30 // promote remote vnode for given key
83 | 
84 | 	replicaStable replicaState = 0 // all replicas committed
85 | 	replicaPartial replicaState = 1 // all available replicas committed, but there are not enough remote nodes
86 | 	replicaIncomplete replicaState = 2 // some of the replicas did not commit
87 | 
88 | 	evPromoteKey dtableEventType = 0
89 | )
90 | 
91 | func (m itemMap) put(item *kvItem) error {
92 | 	if oldItem, ok := m[item.keyHashString()]; ok {
93 | 		if oldItem.timestamp.UnixNano() <= item.timestamp.UnixNano() {
94 | 			// key exists and the stored record is not newer than the incoming one
95 | 			if item.Val == nil {
96 | 				delete(m, item.keyHashString())
97 | 			} else {
98 | 				m[item.keyHashString()] = item
99 | 			}
100 | 		} else {
101 | 			return fmt.Errorf("map.put() refused write for key %s - incoming record is older than the stored one: %d > %d",
102 | 				item.keyHashString(), oldItem.timestamp.UnixNano(), item.timestamp.UnixNano())
103 | 		}
104 | 	} else {
105 | 		if item.Val != nil {
106 | 			m[item.keyHashString()] = item
107 | 		} else {
108 | 			return fmt.Errorf("map.put() - empty value not allowed")
109 | 		}
110 | 	}
111 | 	return nil
112 | }
113 | 
114 | // Init initializes dtable and registers with dendrite as a TransportHook and DelegateHook.
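// A minimal usage sketch (assumptions flagged inline; not taken from the original sources):
//
//	// ring and transport are assumed to come from dendrite's own bootstrap/join helpers,
//	// which are defined outside this file.
//	var ring *dendrite.Ring
//	var transport dendrite.Transport
//	dt := Init(ring, transport, LogInfo) // LogInfo is assumed to be an exported LogLevel value
//	dt.RegisterCaptureKeyHook(myHook)    // myHook is a hypothetical CaptureKeyHook implementation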
115 | func Init(ring *dendrite.Ring, transport dendrite.Transport, level LogLevel) *DTable { 116 | dt := &DTable{ 117 | table: make(map[string]itemMap), 118 | rtable: make(map[string]itemMap), 119 | demoted_table: make(map[string]demotedItemMap), 120 | ring: ring, 121 | transport: transport, 122 | confLogLevel: level, 123 | event_c: make(chan *dendrite.EventCtx), 124 | dtable_c: make(chan *dtableEvent), 125 | selfcheck_t: time.NewTicker(10 * time.Minute), 126 | captureKeyHooks: make([]CaptureKeyHook, 0), 127 | } 128 | // each local vnode needs to be separate key in dtable 129 | for _, vnode := range ring.MyVnodes() { 130 | node_kv := make(map[string]*kvItem) 131 | node_rkv := make(map[string]*kvItem) 132 | node_demoted := make(map[string]*demotedKvItem) 133 | vn_key_str := fmt.Sprintf("%x", vnode.Id) 134 | dt.table[vn_key_str] = node_kv 135 | dt.rtable[vn_key_str] = node_rkv 136 | dt.demoted_table[vn_key_str] = node_demoted 137 | } 138 | transport.RegisterHook(dt) 139 | go dt.delegator() 140 | ring.RegisterDelegateHook(dt) 141 | return dt 142 | } 143 | 144 | // EmitEvent implements dendrite's DelegateHook. 145 | func (dt *DTable) EmitEvent(ctx *dendrite.EventCtx) { 146 | dt.event_c <- ctx 147 | } 148 | 149 | func (dt *DTable) RegisterCaptureKeyHook(hook CaptureKeyHook) { 150 | dt.captureKeyHooks = append(dt.captureKeyHooks, hook) 151 | } 152 | 153 | func (dt *DTable) callHooks(item *kvItem) { 154 | if item.Val != nil { 155 | for _, hook := range dt.captureKeyHooks { 156 | hook.CaptureKeyHandler(item.Key) 157 | } 158 | } 159 | } 160 | 161 | // Decode implements dendrite's TransportHook. 162 | func (dt *DTable) Decode(data []byte) (*dendrite.ChordMsg, error) { 163 | data_len := len(data) 164 | if data_len == 0 { 165 | return nil, fmt.Errorf("data too short: %d", len(data)) 166 | } 167 | 168 | cm := &dendrite.ChordMsg{Type: dendrite.MsgType(data[0])} 169 | 170 | if data_len > 1 { 171 | cm.Data = data[1:] 172 | } 173 | 174 | // parse the data and set the handler 175 | switch cm.Type { 176 | case PbDtableStatus: 177 | var dtableStatusMsg PBDTableStatus 178 | err := proto.Unmarshal(cm.Data, &dtableStatusMsg) 179 | if err != nil { 180 | return nil, fmt.Errorf("error decoding PBDTableStatus message - %s", err) 181 | } 182 | cm.TransportMsg = dtableStatusMsg 183 | cm.TransportHandler = dt.zmq_status_handler 184 | case PbDtableGetItem: 185 | var dtableGetItemMsg PBDTableGetItem 186 | err := proto.Unmarshal(cm.Data, &dtableGetItemMsg) 187 | if err != nil { 188 | return nil, fmt.Errorf("error decoding PBDTableGetItem message - %s", err) 189 | } 190 | cm.TransportMsg = dtableGetItemMsg 191 | cm.TransportHandler = dt.zmq_get_handler 192 | case PbDtableItem: 193 | var dtableItemMsg PBDTableItem 194 | err := proto.Unmarshal(cm.Data, &dtableItemMsg) 195 | if err != nil { 196 | return nil, fmt.Errorf("error decoding PBDTableItem message - %s", err) 197 | } 198 | cm.TransportMsg = dtableItemMsg 199 | case PbDtableSetItem: 200 | var dtableSetItemMsg PBDTableSetItem 201 | err := proto.Unmarshal(cm.Data, &dtableSetItemMsg) 202 | if err != nil { 203 | return nil, fmt.Errorf("error decoding PBDTableSetItem message - %s", err) 204 | } 205 | cm.TransportMsg = dtableSetItemMsg 206 | cm.TransportHandler = dt.zmq_set_handler 207 | case PbDtableSetReplicaInfo: 208 | var dtableSetReplicaInfoMsg PBDTableSetReplicaInfo 209 | err := proto.Unmarshal(cm.Data, &dtableSetReplicaInfoMsg) 210 | if err != nil { 211 | return nil, fmt.Errorf("error decoding PBDTableSetReplicaInfo message - %s", err) 212 | } 213 | cm.TransportMsg = 
dtableSetReplicaInfoMsg 214 | cm.TransportHandler = dt.zmq_setReplicaInfo_handler 215 | case PbDtableClearReplica: 216 | var dtableClearReplicaMsg PBDTableClearReplica 217 | err := proto.Unmarshal(cm.Data, &dtableClearReplicaMsg) 218 | if err != nil { 219 | return nil, fmt.Errorf("error decoding PBDTableClearReplica message - %s", err) 220 | } 221 | cm.TransportMsg = dtableClearReplicaMsg 222 | cm.TransportHandler = dt.zmq_clearreplica_handler 223 | case PbDtableSetReplica: 224 | var dtableSetReplicaMsg PBDTableSetItem 225 | err := proto.Unmarshal(cm.Data, &dtableSetReplicaMsg) 226 | if err != nil { 227 | return nil, fmt.Errorf("error decoding PBDTableSetReplica message - %s", err) 228 | } 229 | cm.TransportMsg = dtableSetReplicaMsg 230 | cm.TransportHandler = dt.zmq_setReplica_handler 231 | case PbDtablePromoteKey: 232 | var dtablePromoteKeyMsg PBDTablePromoteKey 233 | err := proto.Unmarshal(cm.Data, &dtablePromoteKeyMsg) 234 | if err != nil { 235 | return nil, fmt.Errorf("error decoding PBDTablePromoteKey message - %s", err) 236 | } 237 | cm.TransportMsg = dtablePromoteKeyMsg 238 | cm.TransportHandler = dt.zmq_promoteKey_handler 239 | case PbDtableResponse: 240 | var dtableResponseMsg PBDTableResponse 241 | err := proto.Unmarshal(cm.Data, &dtableResponseMsg) 242 | if err != nil { 243 | return nil, fmt.Errorf("error decoding PBDTableResponse message - %s", err) 244 | } 245 | cm.TransportMsg = dtableResponseMsg 246 | default: 247 | // must return unknownType error 248 | //fmt.Printf("GOT UNKNOWN!!!!!!! %x - %x\n", cm.Type, byte(cm.Type)) 249 | var rv dendrite.ErrHookUnknownType = "unknown request type" 250 | return nil, rv 251 | } 252 | return cm, nil 253 | } 254 | 255 | // get returns value for a given key 256 | func (dt *DTable) get(reqItem *kvItem) (*kvItem, error) { 257 | succs, err := dt.ring.Lookup(3, reqItem.keyHash) 258 | if err != nil { 259 | return nil, err 260 | } 261 | // check if successor exists in local dtable 262 | vn_table, ok := dt.table[succs[0].String()] 263 | key_str := reqItem.keyHashString() 264 | if ok { 265 | if item, exists := vn_table[key_str]; exists && item.commited { 266 | return item.dup(), nil 267 | } else { 268 | return nil, nil 269 | } 270 | } else { 271 | // check against replica tables 272 | for _, rtable := range dt.rtable { 273 | if item, exists := rtable[key_str]; exists && item.replicaInfo.state == replicaIncomplete { 274 | return item, nil 275 | } 276 | } 277 | } 278 | 279 | // make remote call to all successors 280 | var last_err error 281 | for _, succ := range succs { 282 | respItem, _, err := dt.remoteGet(succ, reqItem) 283 | if err != nil { 284 | last_err = err 285 | dt.Logln(LogDebug, "ZMQ::remoteGet error - ", err) 286 | continue 287 | } 288 | return respItem, nil 289 | } 290 | return nil, last_err 291 | } 292 | 293 | // handle remote replica requests 294 | func (dt *DTable) setReplica(vnode *dendrite.Vnode, item *kvItem) { 295 | key_str := item.keyHashString() 296 | if item.Val == nil { 297 | //log.Println("SetReplica() - value for key", key_str, "is nil, removing item") 298 | delete(dt.rtable[vnode.String()], key_str) 299 | } else { 300 | //log.Println("SetReplica() - success for key", key_str) 301 | item.commited = true 302 | dt.rtable[vnode.String()][key_str] = item 303 | } 304 | } 305 | 306 | /* set writes to dtable's primary(non-replica table). It is called from both Query api and 307 | by remote clients via zmq. 
308 | 309 | It reports back on done chan when minAcks is reached so that clients can continue without 310 | blocking while replication takes place. 311 | */ 312 | func (dt *DTable) set(vn *dendrite.Vnode, item *kvItem, minAcks int, done chan error) { 313 | // make sure we have local handler before doing any write 314 | handler, _ := dt.transport.GetVnodeHandler(vn) 315 | if handler == nil { 316 | done <- fmt.Errorf("local handler could not be found for vnode %x", vn.Id) 317 | return 318 | } 319 | write_count := 0 320 | vn_table, _ := dt.table[vn.String()] 321 | 322 | item.lock.Lock() 323 | defer item.lock.Unlock() 324 | 325 | item.replicaInfo.master = vn 326 | err := vn_table.put(item) 327 | if err != nil { 328 | done <- err 329 | return 330 | } 331 | 332 | write_count++ 333 | repwrite_count := 0 334 | returned := false 335 | item.replicaInfo.state = replicaIncomplete 336 | 337 | // should we return to client immediately? 338 | if minAcks == write_count { 339 | // cover the case where ring.Replicas() returns 0 340 | if dt.ring.Replicas() == repwrite_count { 341 | item.replicaInfo.state = replicaStable 342 | item.commited = true 343 | done <- nil 344 | dt.callHooks(item) 345 | return 346 | } 347 | item.commited = true 348 | done <- nil 349 | returned = true 350 | } 351 | 352 | // find remote successors to write replicas to 353 | remote_succs, err := handler.FindRemoteSuccessors(dt.ring.Replicas()) 354 | if err != nil { 355 | if !returned { 356 | done <- fmt.Errorf("could not find replica nodes due to error %s", err) 357 | } 358 | dt.Logf(LogDebug, "could not find replica nodes due to error %s\n", err) 359 | dt.rollback(vn, item) 360 | return 361 | } 362 | 363 | // don't write any replica if not enough replica nodes have been found for requested consistency 364 | if minAcks > len(remote_succs)+1 { 365 | done <- fmt.Errorf("insufficient nodes found for requested consistency level (%d)\n", minAcks) 366 | dt.rollback(vn, item) 367 | return 368 | } 369 | 370 | // now lets write replicas 371 | item_replicas := make([]*dendrite.Vnode, 0) 372 | repl_item := item.dup() 373 | repl_item.commited = false 374 | 375 | for _, succ := range remote_succs { 376 | err := dt.remoteWriteReplica(vn, succ, repl_item) 377 | if err != nil { 378 | dt.Logf(LogDebug, "could not write replica due to error: %s\n", err) 379 | continue 380 | } 381 | item_replicas = append(item_replicas, succ) 382 | } 383 | 384 | // check if we have enough written replicas for requested minAcks 385 | if minAcks > len(item_replicas)+1 { 386 | done <- fmt.Errorf("insufficient active nodes found for requested consistency level (%d)\n", minAcks) 387 | dt.rollback(vn, item) 388 | return 389 | } 390 | 391 | // update replication state based on available replicas 392 | var target_state replicaState 393 | if dt.ring.Replicas() <= len(item_replicas) { 394 | target_state = replicaStable 395 | } else { 396 | target_state = replicaPartial 397 | } 398 | 399 | // replicas have been written, lets now update metadata 400 | real_idx := 0 401 | fail_count := 0 402 | repl_item.commited = true 403 | repl_item.replicaInfo.vnodes = item_replicas 404 | repl_item.replicaInfo.state = target_state 405 | repl_item.replicaInfo.master = vn 406 | 407 | for _, replica := range item_replicas { 408 | // update metadata/commit on remote 409 | repl_item.replicaInfo.depth = real_idx 410 | err := dt.remoteSetReplicaInfo(replica, repl_item) 411 | if err != nil { 412 | fail_count++ 413 | if !returned && len(item_replicas)-fail_count < minAcks { 414 | done <- 
fmt.Errorf("insufficient (phase2) active nodes found for requested consistency level (%d)\n", minAcks) 415 | dt.rollback(vn, item) 416 | return 417 | } 418 | continue 419 | } 420 | real_idx++ 421 | repwrite_count++ 422 | 423 | // notify client if enough replicas have been written 424 | if !returned && repwrite_count+1 == minAcks { 425 | done <- nil 426 | returned = true 427 | } 428 | } 429 | item.replicaInfo.state = target_state 430 | item.commited = true 431 | dt.callHooks(item) 432 | 433 | } 434 | 435 | // rollback is called on failed set() 436 | func (dt *DTable) rollback(vn *dendrite.Vnode, item *kvItem) { 437 | if item.replicaInfo != nil { 438 | for _, replica := range item.replicaInfo.vnodes { 439 | if replica != nil { 440 | dt.remoteClearReplica(replica, item, false) 441 | } 442 | } 443 | } 444 | delete(dt.table[vn.String()], item.keyHashString()) 445 | } 446 | 447 | // DumpStr dumps dtable keys per vnode on stdout. Mostly used for debugging. 448 | func (dt *DTable) DumpStr() { 449 | fmt.Println("Dumping DTABLE") 450 | for vn_id, vn_table := range dt.table { 451 | fmt.Printf("\tvnode: %s\n", vn_id) 452 | for key, item := range vn_table { 453 | fmt.Printf("\t\t%s - %s - %v - commited:%v\n", key, item.Val, item.replicaInfo.state, item.commited) 454 | } 455 | rt, _ := dt.rtable[vn_id] 456 | for key, item := range rt { 457 | fmt.Printf("\t\t- r%d - %s - %s - %d - commited:%v\n", item.replicaInfo.depth, key, item.Val, item.replicaInfo.state, item.commited) 458 | } 459 | for key, item := range dt.demoted_table[vn_id] { 460 | fmt.Printf("\t\t- d - %s - %s - %v\n", key, item.new_master.String(), item.demoted_ts) 461 | } 462 | } 463 | } 464 | 465 | // processDemoteKey is called when our successor is demoting key to us. 466 | // We fix replicas for the key and when we're done we make a call to origin 467 | // (old primary for this key) to clear demotedItem there. 468 | func (dt *DTable) processDemoteKey(vnode, origin, old_master *dendrite.Vnode, reqItem *kvItem) { 469 | // find the key in our primary table 470 | key_str := reqItem.keyHashString() 471 | if _, ok := dt.table[vnode.String()][key_str]; ok { 472 | dt.replicateKey(vnode, reqItem, dt.ring.Replicas()) 473 | 474 | // now clear demoted item on origin 475 | err := dt.remoteClearReplica(origin, reqItem, true) 476 | if err != nil { 477 | dt.Logf(LogInfo, "processDemoteKey() - failed while removing demoted key from origin %x for key %s\n", origin.Id, key_str) 478 | } 479 | } else { 480 | dt.Logln(LogInfo, "processDemoteKey failed - key not found:", key_str) 481 | return 482 | } 483 | } 484 | -------------------------------------------------------------------------------- /transport_zmq.go: -------------------------------------------------------------------------------- 1 | package dendrite 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "github.com/golang/protobuf/proto" 7 | zmq "github.com/pebbe/zmq4" 8 | "time" 9 | ) 10 | 11 | const ( 12 | // protocol buffer messages (for definitions, see pb_defs/chord.proto) 13 | PbPing MsgType = iota 14 | PbAck 15 | PbErr 16 | PbForward 17 | PbJoin 18 | PbLeave 19 | PbListVnodes 20 | PbListVnodesResp 21 | PbFindSuccessors 22 | PbGetPredecessor 23 | PbProtoVnode 24 | PbNotify 25 | ) 26 | 27 | // newErrorMsg is a helper to create encoded *ChordMsg (PBProtoErr) with error in it. 
28 | func (transport *ZMQTransport) newErrorMsg(msg string) *ChordMsg { 29 | pbmsg := &PBProtoErr{ 30 | Error: proto.String(msg), 31 | } 32 | pbdata, _ := proto.Marshal(pbmsg) 33 | return &ChordMsg{ 34 | Type: PbErr, 35 | Data: pbdata, 36 | } 37 | 38 | } 39 | 40 | // NewErrorMsg is a helper to create encoded *ChordMsg (PBProtoErr) with error in it. 41 | func (transport *ZMQTransport) NewErrorMsg(msg string) *ChordMsg { 42 | pbmsg := &PBProtoErr{ 43 | Error: proto.String(msg), 44 | } 45 | pbdata, _ := proto.Marshal(pbmsg) 46 | return &ChordMsg{ 47 | Type: PbErr, 48 | Data: pbdata, 49 | } 50 | 51 | } 52 | 53 | // Encode implement's Transport's Encode() in ZMQTransport. 54 | func (transport *ZMQTransport) Encode(mt MsgType, data []byte) []byte { 55 | buf := new(bytes.Buffer) 56 | buf.WriteByte(byte(mt)) 57 | buf.Write(data) 58 | return buf.Bytes() 59 | } 60 | 61 | // Decode implements Transport's Decode() in ZMQTransport. For request messages 62 | // it also sets their respective handler to be called when such request comes in. 63 | // If message type is unknown to this transport, Decode() also checks for registered 64 | // TransportHooks and runs their Decode() implementation. 65 | func (transport *ZMQTransport) Decode(data []byte) (*ChordMsg, error) { 66 | data_len := len(data) 67 | if data_len == 0 { 68 | return nil, fmt.Errorf("data too short: %d", len(data)) 69 | } 70 | 71 | cm := &ChordMsg{Type: MsgType(data[0])} 72 | 73 | if data_len > 1 { 74 | cm.Data = data[1:] 75 | } 76 | 77 | // parse the data and set the handler 78 | switch cm.Type { 79 | case PbPing: 80 | var pingMsg PBProtoPing 81 | err := proto.Unmarshal(cm.Data, &pingMsg) 82 | if err != nil { 83 | return nil, fmt.Errorf("error decoding PBProtoPing message - %s", err) 84 | } 85 | cm.TransportMsg = pingMsg 86 | cm.TransportHandler = transport.zmq_ping_handler 87 | case PbErr: 88 | var errorMsg PBProtoErr 89 | err := proto.Unmarshal(cm.Data, &errorMsg) 90 | if err != nil { 91 | return nil, fmt.Errorf("error decoding PBProtoErr message - %s", err) 92 | } 93 | cm.TransportMsg = errorMsg 94 | cm.TransportHandler = transport.zmq_error_handler 95 | case PbForward: 96 | var forwardMsg PBProtoForward 97 | err := proto.Unmarshal(cm.Data, &forwardMsg) 98 | if err != nil { 99 | return nil, fmt.Errorf("error decoding PBProtoForward message - %s", err) 100 | } 101 | cm.TransportMsg = forwardMsg 102 | case PbLeave: 103 | var leaveMsg PBProtoLeave 104 | err := proto.Unmarshal(cm.Data, &leaveMsg) 105 | if err != nil { 106 | return nil, fmt.Errorf("error decoding PBProtoLeave message - %s", err) 107 | } 108 | cm.TransportMsg = leaveMsg 109 | cm.TransportHandler = transport.zmq_leave_handler 110 | case PbListVnodes: 111 | var listVnodesMsg PBProtoListVnodes 112 | err := proto.Unmarshal(cm.Data, &listVnodesMsg) 113 | if err != nil { 114 | return nil, fmt.Errorf("error decoding PBProtoListVnodes message - %s", err) 115 | } 116 | cm.TransportMsg = listVnodesMsg 117 | cm.TransportHandler = transport.zmq_listVnodes_handler 118 | case PbListVnodesResp: 119 | var listVnodesRespMsg PBProtoListVnodesResp 120 | err := proto.Unmarshal(cm.Data, &listVnodesRespMsg) 121 | if err != nil { 122 | return nil, fmt.Errorf("error decoding PBProtoListVnodesResp message - %s", err) 123 | } 124 | cm.TransportMsg = listVnodesRespMsg 125 | case PbFindSuccessors: 126 | var findSuccMsg PBProtoFindSuccessors 127 | err := proto.Unmarshal(cm.Data, &findSuccMsg) 128 | if err != nil { 129 | return nil, fmt.Errorf("error decoding PBProtoFindSuccessors message - %s", err) 130 | } 
131 | cm.TransportMsg = findSuccMsg 132 | cm.TransportHandler = transport.zmq_find_successors_handler 133 | case PbGetPredecessor: 134 | var getPredMsg PBProtoGetPredecessor 135 | err := proto.Unmarshal(cm.Data, &getPredMsg) 136 | if err != nil { 137 | return nil, fmt.Errorf("error decoding PBProtoGetPredecessor message - %s", err) 138 | } 139 | cm.TransportMsg = getPredMsg 140 | cm.TransportHandler = transport.zmq_get_predecessor_handler 141 | case PbNotify: 142 | var notifyMsg PBProtoNotify 143 | err := proto.Unmarshal(cm.Data, ¬ifyMsg) 144 | if err != nil { 145 | return nil, fmt.Errorf("error decoding PBProtoNotify message - %s", err) 146 | } 147 | cm.TransportMsg = notifyMsg 148 | cm.TransportHandler = transport.zmq_notify_handler 149 | case PbProtoVnode: 150 | var vnodeMsg PBProtoVnode 151 | err := proto.Unmarshal(cm.Data, &vnodeMsg) 152 | if err != nil { 153 | return nil, fmt.Errorf("error decoding PBProtoVnode message - %s", err) 154 | } 155 | cm.TransportMsg = vnodeMsg 156 | default: 157 | // maybe a TransportHook should handle this? 158 | for _, hook := range transport.hooks { 159 | if hook_cm, err := hook.Decode(data); err != nil { 160 | _, ok := err.(ErrHookUnknownType) 161 | if ok { 162 | // this hook knows nothing about this message type, try next one 163 | continue 164 | } 165 | return nil, err 166 | } else { 167 | // hook is handling this! 168 | return hook_cm, nil 169 | } 170 | } 171 | return nil, fmt.Errorf("error decoding message - unknown request type %x", cm.Type) 172 | } 173 | 174 | return cm, nil 175 | } 176 | 177 | // getVnodeHandler returns registered local vnode handler, if one is found for given vnode. 178 | func (transport *ZMQTransport) getVnodeHandler(dest *Vnode) (VnodeHandler, error) { 179 | h, ok := transport.table[dest.String()] 180 | if ok { 181 | return h.handler, nil 182 | } 183 | return nil, fmt.Errorf("local vnode handler not found") 184 | } 185 | 186 | // GetVnodeHandler returns registered local vnode handler, if one is found for given vnode. 187 | func (transport *ZMQTransport) GetVnodeHandler(vnode *Vnode) (VnodeHandler, bool) { 188 | handler, err := transport.getVnodeHandler(vnode) 189 | if err != nil { 190 | return nil, false 191 | } 192 | return handler, true 193 | } 194 | 195 | // Register registers a VnodeHandler within ZMQTransport. 196 | func (transport *ZMQTransport) Register(vnode *Vnode, handler VnodeHandler) { 197 | transport.lock.Lock() 198 | transport.table[vnode.String()] = &localHandler{vn: vnode, handler: handler} 199 | transport.lock.Unlock() 200 | } 201 | 202 | // ListVnodes - client request. Implements Transport's ListVnodes() in ZQMTransport. 
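// Illustrative call (a sketch, not from the original sources): given a reachable node
// address such as "127.0.0.1:5000" (hypothetical), a caller could list its vnodes with
//
//	vnodes, err := transport.ListVnodes("127.0.0.1:5000")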
203 | func (transport *ZMQTransport) ListVnodes(host string) ([]*Vnode, error) { 204 | error_c := make(chan error, 1) 205 | resp_c := make(chan []*Vnode, 1) 206 | 207 | go func() { 208 | req_sock, err := transport.zmq_context.NewSocket(zmq.REQ) 209 | if err != nil { 210 | error_c <- fmt.Errorf("ZMQ:ListVnodes - newsocket error - %s", err) 211 | return 212 | } 213 | req_sock.SetRcvtimeo(2 * time.Second) 214 | req_sock.SetSndtimeo(2 * time.Second) 215 | 216 | defer req_sock.Close() 217 | err = req_sock.Connect("tcp://" + host) 218 | if err != nil { 219 | error_c <- fmt.Errorf("ZMQ:ListVnodes - connect error - %s", err) 220 | return 221 | } 222 | // Build request protobuf 223 | req := new(PBProtoListVnodes) 224 | reqData, _ := proto.Marshal(req) 225 | encoded := transport.Encode(PbListVnodes, reqData) 226 | _, err = req_sock.SendBytes(encoded, 0) 227 | if err != nil { 228 | error_c <- fmt.Errorf("ZMQ::ListVnodes - error while sending request - %s", err) 229 | return 230 | } 231 | 232 | // read response and decode it 233 | resp, err := req_sock.RecvBytes(0) 234 | if err != nil { 235 | error_c <- fmt.Errorf("ZMQ::ListVnodes - error while reading response - %s", err) 236 | return 237 | } 238 | decoded, err := transport.Decode(resp) 239 | if err != nil { 240 | error_c <- fmt.Errorf("ZMQ::ListVnodes - error while decoding response - %s", err) 241 | return 242 | } 243 | 244 | switch decoded.Type { 245 | case PbErr: 246 | pbMsg := decoded.TransportMsg.(PBProtoErr) 247 | error_c <- fmt.Errorf("ZMQ::ListVnodes - got error response - %s", pbMsg.GetError()) 248 | case PbListVnodesResp: 249 | pbMsg := decoded.TransportMsg.(PBProtoListVnodesResp) 250 | vnodes := make([]*Vnode, len(pbMsg.GetVnodes())) 251 | for idx, pbVnode := range pbMsg.GetVnodes() { 252 | vnodes[idx] = VnodeFromProtobuf(pbVnode) 253 | } 254 | resp_c <- vnodes 255 | return 256 | default: 257 | // unexpected response 258 | error_c <- fmt.Errorf("ZMQ::ListVnodes - unexpected response") 259 | return 260 | } 261 | }() 262 | 263 | select { 264 | case <-time.After(transport.clientTimeout): 265 | return nil, fmt.Errorf("ZMQ::ListVnodes - command timed out!") 266 | case err := <-error_c: 267 | return nil, err 268 | case resp_vnodes := <-resp_c: 269 | return resp_vnodes, nil 270 | } 271 | 272 | } 273 | 274 | // FindSuccessors - client request. Implements Transport's FindSuccessors() in ZQMTransport. 
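// Illustrative call (a sketch, not from the original sources): given any known vnode and a
// hashed key (both hypothetical here), a caller could ask for up to 3 successors with
//
//	succs, err := transport.FindSuccessors(someVnode, 3, keyHash)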
275 | func (transport *ZMQTransport) FindSuccessors(remote *Vnode, limit int, key []byte) ([]*Vnode, error) { 276 | error_c := make(chan error, 1) 277 | resp_c := make(chan []*Vnode, 1) 278 | forward_c := make(chan *Vnode, 1) 279 | 280 | go func() { 281 | req_sock, err := transport.zmq_context.NewSocket(zmq.REQ) 282 | if err != nil { 283 | error_c <- fmt.Errorf("ZMQ:FindSuccessors - newsocket error - %s", err) 284 | return 285 | } 286 | req_sock.SetRcvtimeo(2 * time.Second) 287 | req_sock.SetSndtimeo(2 * time.Second) 288 | 289 | defer req_sock.Close() 290 | err = req_sock.Connect("tcp://" + remote.Host) 291 | if err != nil { 292 | error_c <- fmt.Errorf("ZMQ:FindSuccessors - connect error - %s", err) 293 | return 294 | } 295 | // Build request protobuf 296 | req := &PBProtoFindSuccessors{ 297 | Dest: remote.ToProtobuf(), 298 | Key: key, 299 | Limit: proto.Int32(int32(limit)), 300 | } 301 | reqData, _ := proto.Marshal(req) 302 | encoded := transport.Encode(PbFindSuccessors, reqData) 303 | _, err = req_sock.SendBytes(encoded, 0) 304 | if err != nil { 305 | error_c <- fmt.Errorf("ZMQ::FindSuccessors - error while sending request - %s", err) 306 | return 307 | } 308 | 309 | // read response and decode it 310 | resp, err := req_sock.RecvBytes(0) 311 | if err != nil { 312 | error_c <- fmt.Errorf("ZMQ::FindSuccessors - error while reading response - %s %X", err, remote.Id) 313 | return 314 | } 315 | decoded, err := transport.Decode(resp) 316 | if err != nil { 317 | error_c <- fmt.Errorf("ZMQ::FindSuccessors - error while decoding response - %s", err) 318 | return 319 | } 320 | 321 | switch decoded.Type { 322 | case PbErr: 323 | pbMsg := decoded.TransportMsg.(PBProtoErr) 324 | error_c <- fmt.Errorf("ZMQ::FindSuccessors - got error response - %s", pbMsg.GetError()) 325 | return 326 | case PbForward: 327 | pbMsg := decoded.TransportMsg.(PBProtoForward) 328 | forward_c <- VnodeFromProtobuf(pbMsg.GetVnode()) 329 | return 330 | case PbListVnodesResp: 331 | pbMsg := decoded.TransportMsg.(PBProtoListVnodesResp) 332 | vnodes := make([]*Vnode, len(pbMsg.GetVnodes())) 333 | for idx, pbVnode := range pbMsg.GetVnodes() { 334 | vnodes[idx] = VnodeFromProtobuf(pbVnode) 335 | } 336 | resp_c <- vnodes 337 | return 338 | default: 339 | // unexpected response 340 | error_c <- fmt.Errorf("ZMQ::FindSuccessors - unexpected response") 341 | return 342 | } 343 | }() 344 | 345 | select { 346 | case <-time.After(transport.clientTimeout): 347 | return nil, fmt.Errorf("ZMQ::FindSuccessors - command timed out!") 348 | case err := <-error_c: 349 | return nil, err 350 | case new_remote := <-forward_c: 351 | return transport.FindSuccessors(new_remote, limit, key) 352 | case resp_vnodes := <-resp_c: 353 | return resp_vnodes, nil 354 | } 355 | } 356 | 357 | // GetPredecessor - client request. Implements Transport's GetPredecessor() in ZQMTransport. 
358 | func (transport *ZMQTransport) GetPredecessor(remote *Vnode) (*Vnode, error) { 359 | error_c := make(chan error, 1) 360 | resp_c := make(chan *Vnode, 1) 361 | 362 | go func() { 363 | req_sock, err := transport.zmq_context.NewSocket(zmq.REQ) 364 | if err != nil { 365 | error_c <- fmt.Errorf("ZMQ:GetPredecessor - newsocket error - %s", err) 366 | return 367 | } 368 | req_sock.SetRcvtimeo(2 * time.Second) 369 | req_sock.SetSndtimeo(2 * time.Second) 370 | 371 | defer req_sock.Close() 372 | err = req_sock.Connect("tcp://" + remote.Host) 373 | if err != nil { 374 | error_c <- fmt.Errorf("ZMQ:GetPredecessor - connect error - %s", err) 375 | return 376 | } 377 | // Build request protobuf 378 | req := &PBProtoGetPredecessor{ 379 | Dest: remote.ToProtobuf(), 380 | } 381 | reqData, _ := proto.Marshal(req) 382 | encoded := transport.Encode(PbGetPredecessor, reqData) 383 | _, err = req_sock.SendBytes(encoded, 0) 384 | if err != nil { 385 | error_c <- fmt.Errorf("ZMQ:GetPredecessor - error while sending request - %s", err) 386 | return 387 | } 388 | 389 | // read response and decode it 390 | resp, err := req_sock.RecvBytes(0) 391 | if err != nil { 392 | error_c <- fmt.Errorf("ZMQ::GetPredecessor - error while reading response - %s", err) 393 | return 394 | } 395 | decoded, err := transport.Decode(resp) 396 | if err != nil { 397 | error_c <- fmt.Errorf("ZMQ::GetPredecessor - error while decoding response - %s", err) 398 | return 399 | } 400 | 401 | switch decoded.Type { 402 | case PbErr: 403 | pbMsg := decoded.TransportMsg.(PBProtoErr) 404 | error_c <- fmt.Errorf("ZMQ::GetPredecessor - got error response - %s", pbMsg.GetError()) 405 | return 406 | case PbProtoVnode: 407 | pbMsg := decoded.TransportMsg.(PBProtoVnode) 408 | resp_c <- VnodeFromProtobuf(&pbMsg) 409 | return 410 | default: 411 | // unexpected response 412 | error_c <- fmt.Errorf("ZMQ::GetPredecessor - unexpected response") 413 | return 414 | } 415 | }() 416 | 417 | select { 418 | case <-time.After(transport.clientTimeout): 419 | return nil, fmt.Errorf("ZMQ::GetPredecessor - command timed out!") 420 | case err := <-error_c: 421 | return nil, err 422 | case resp_vnode := <-resp_c: 423 | return resp_vnode, nil 424 | } 425 | } 426 | 427 | // Notify - client request. Implements Transport's Notify() in ZQMTransport. 
428 | func (transport *ZMQTransport) Notify(remote, self *Vnode) ([]*Vnode, error) {
429 | 	error_c := make(chan error, 1)
430 | 	resp_c := make(chan []*Vnode, 1)
431 | 
432 | 	go func() {
433 | 		req_sock, err := transport.zmq_context.NewSocket(zmq.REQ)
434 | 		if err != nil {
435 | 			error_c <- fmt.Errorf("ZMQ:Notify - newsocket error - %s", err)
436 | 			return
437 | 		}
438 | 		req_sock.SetRcvtimeo(2 * time.Second)
439 | 		req_sock.SetSndtimeo(2 * time.Second)
440 | 
441 | 		defer req_sock.Close()
442 | 		err = req_sock.Connect("tcp://" + remote.Host)
443 | 		if err != nil {
444 | 			error_c <- fmt.Errorf("ZMQ:Notify - connect error - %s", err)
445 | 			return
446 | 		}
447 | 
448 | 		// Build request protobuf
449 | 		req := &PBProtoNotify{
450 | 			Dest: remote.ToProtobuf(),
451 | 			Vnode: self.ToProtobuf(),
452 | 		}
453 | 		reqData, _ := proto.Marshal(req)
454 | 		encoded := transport.Encode(PbNotify, reqData)
455 | 		_, err = req_sock.SendBytes(encoded, 0)
456 | 		if err != nil {
457 | 			error_c <- fmt.Errorf("ZMQ::Notify - error while sending request - %s", err)
458 | 			return
459 | 		}
460 | 
461 | 		// read response and decode it
462 | 		resp, err := req_sock.RecvBytes(0)
463 | 		if err != nil {
464 | 			error_c <- fmt.Errorf("ZMQ::Notify - error while reading response - %s", err)
465 | 			return
466 | 		}
467 | 		decoded, err := transport.Decode(resp)
468 | 		if err != nil {
469 | 			error_c <- fmt.Errorf("ZMQ::Notify - error while decoding response - %s", err)
470 | 			return
471 | 		}
472 | 
473 | 		switch decoded.Type {
474 | 		case PbErr:
475 | 			pbMsg := decoded.TransportMsg.(PBProtoErr)
476 | 			error_c <- fmt.Errorf("ZMQ::Notify - got error response - %s", pbMsg.GetError())
477 | 			return
478 | 		case PbListVnodesResp:
479 | 			pbMsg := decoded.TransportMsg.(PBProtoListVnodesResp)
480 | 			vnodes := make([]*Vnode, len(pbMsg.GetVnodes()))
481 | 			for idx, pbVnode := range pbMsg.GetVnodes() {
482 | 				vnodes[idx] = VnodeFromProtobuf(pbVnode)
483 | 			}
484 | 			resp_c <- vnodes
485 | 			return
486 | 		default:
487 | 			// unexpected response
488 | 			error_c <- fmt.Errorf("ZMQ::Notify - unexpected response")
489 | 			return
490 | 		}
491 | 	}()
492 | 
493 | 	select {
494 | 	case <-time.After(transport.clientTimeout):
495 | 		return nil, fmt.Errorf("ZMQ::Notify - command timed out!")
496 | 	case err := <-error_c:
497 | 		return nil, err
498 | 	case resp_vnodes := <-resp_c:
499 | 		return resp_vnodes, nil
500 | 	}
501 | }
502 | 
503 | // Ping - client request. Implements Transport's Ping() in ZMQTransport.
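// Illustrative call (a sketch, not from the original sources): the stabilization code in
// vnode.go uses Ping as a liveness probe, e.g.
//
//	alive, _ := transport.Ping(remoteVnode) // remoteVnode is a hypothetical *Vnode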
504 | func (transport *ZMQTransport) Ping(remote_vn *Vnode) (bool, error) { 505 | req_sock, err := transport.zmq_context.NewSocket(zmq.REQ) 506 | if err != nil { 507 | return false, err 508 | } 509 | defer req_sock.Close() 510 | 511 | err = req_sock.Connect("tcp://" + remote_vn.Host) 512 | if err != nil { 513 | return false, err 514 | } 515 | req_sock.SetRcvtimeo(2 * time.Second) 516 | req_sock.SetSndtimeo(2 * time.Second) 517 | 518 | PbPingMsg := &PBProtoPing{ 519 | Version: proto.Int64(1), 520 | } 521 | PbPingData, _ := proto.Marshal(PbPingMsg) 522 | encoded := transport.Encode(PbPing, PbPingData) 523 | _, err = req_sock.SendBytes(encoded, 0) 524 | if err != nil { 525 | return false, err 526 | } 527 | resp, err := req_sock.RecvBytes(0) 528 | if err != nil { 529 | return false, err 530 | } 531 | decoded, err := transport.Decode(resp) 532 | if err != nil { 533 | return false, err 534 | } 535 | pongMsg := new(PBProtoPing) 536 | err = proto.Unmarshal(decoded.Data, pongMsg) 537 | if err != nil { 538 | return false, err 539 | } 540 | return true, nil 541 | } 542 | -------------------------------------------------------------------------------- /dtable/dtable_remote.go: -------------------------------------------------------------------------------- 1 | package dtable 2 | 3 | import ( 4 | "fmt" 5 | "github.com/fastfn/dendrite" 6 | "github.com/golang/protobuf/proto" 7 | zmq "github.com/pebbe/zmq4" 8 | "time" 9 | ) 10 | 11 | // Client Request: Get value for a key from remote host 12 | func (dt *DTable) remoteGet(remote *dendrite.Vnode, reqItem *kvItem) (*kvItem, bool, error) { 13 | error_c := make(chan error, 1) 14 | resp_c := make(chan *kvItem, 1) 15 | notfound_c := make(chan bool, 1) 16 | zmq_transport := dt.transport.(*dendrite.ZMQTransport) 17 | go func() { 18 | 19 | req_sock, err := zmq_transport.ZMQContext.NewSocket(zmq.REQ) 20 | if err != nil { 21 | error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - newsocket error - %s", err) 22 | return 23 | } 24 | req_sock.SetRcvtimeo(2 * time.Second) 25 | req_sock.SetSndtimeo(2 * time.Second) 26 | 27 | defer req_sock.Close() 28 | err = req_sock.Connect("tcp://" + remote.Host) 29 | if err != nil { 30 | error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - connect error - %s", err) 31 | return 32 | } 33 | // Build request protobuf 34 | req := &PBDTableGetItem{ 35 | Dest: remote.ToProtobuf(), 36 | KeyHash: reqItem.keyHash, 37 | } 38 | 39 | reqData, _ := proto.Marshal(req) 40 | encoded := dt.transport.Encode(PbDtableGetItem, reqData) 41 | _, err = req_sock.SendBytes(encoded, 0) 42 | if err != nil { 43 | error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - error while sending request - %s", err) 44 | return 45 | } 46 | 47 | // read response and decode it 48 | resp, err := req_sock.RecvBytes(0) 49 | if err != nil { 50 | error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - error while reading response - %s", err) 51 | return 52 | } 53 | decoded, err := dt.transport.Decode(resp) 54 | if err != nil { 55 | error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - error while decoding response - %s", err) 56 | return 57 | } 58 | 59 | switch decoded.Type { 60 | case dendrite.PbErr: 61 | pbMsg := decoded.TransportMsg.(dendrite.PBProtoErr) 62 | error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - got error response - %s", pbMsg.GetError()) 63 | case PbDtableItem: 64 | pbMsg := decoded.TransportMsg.(PBDTableItem) 65 | if found := pbMsg.GetFound(); !found { 66 | notfound_c <- true 67 | return 68 | } 69 | item := new(kvItem) 70 | copy(item.Key, reqItem.Key) 71 | copy(item.keyHash, reqItem.keyHash) 72 | item.Val = 
pbMsg.GetVal() 73 | resp_c <- item 74 | return 75 | default: 76 | // unexpected response 77 | error_c <- fmt.Errorf("ZMQ:DTable:remoteGet - unexpected response") 78 | return 79 | } 80 | }() 81 | 82 | select { 83 | case <-time.After(zmq_transport.ClientTimeout): 84 | return nil, false, fmt.Errorf("ZMQ:DTable:remoteGet - command timed out!") 85 | case err := <-error_c: 86 | return nil, false, err 87 | case _ = <-notfound_c: 88 | return nil, false, nil 89 | case item := <-resp_c: 90 | return item, true, nil 91 | } 92 | } 93 | 94 | // Client Request: set value for a key to remote host 95 | func (dt *DTable) remoteSet(origin, remote *dendrite.Vnode, reqItem *kvItem, minAcks int, demoting bool, done chan error) { 96 | //fmt.Printf("REMOTESET CALLED from %s to %s for key %s\n", origin.String(), remote.String(), reqItem.keyHashString()) 97 | error_c := make(chan error, 1) 98 | resp_c := make(chan bool, 1) 99 | zmq_transport := dt.transport.(*dendrite.ZMQTransport) 100 | 101 | go func() { 102 | req_sock, err := zmq_transport.ZMQContext.NewSocket(zmq.REQ) 103 | if err != nil { 104 | error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - newsocket error - %s", err) 105 | return 106 | } 107 | req_sock.SetRcvtimeo(5 * time.Second) 108 | req_sock.SetSndtimeo(5 * time.Second) 109 | 110 | defer req_sock.Close() 111 | err = req_sock.Connect("tcp://" + remote.Host) 112 | if err != nil { 113 | error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - connect error - %s", err) 114 | return 115 | } 116 | // Build request protobuf 117 | req := &PBDTableSetItem{ 118 | Origin: origin.ToProtobuf(), 119 | Dest: remote.ToProtobuf(), 120 | Item: reqItem.to_protobuf(), 121 | MinAcks: proto.Int32(int32(minAcks)), 122 | Demoting: proto.Bool(demoting), 123 | } 124 | 125 | reqData, _ := proto.Marshal(req) 126 | encoded := dt.transport.Encode(PbDtableSetItem, reqData) 127 | _, err = req_sock.SendBytes(encoded, 0) 128 | if err != nil { 129 | error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - error while sending request - %s", err) 130 | return 131 | } 132 | 133 | // read response and decode it 134 | resp, err := req_sock.RecvBytes(0) 135 | if err != nil { 136 | error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - error while reading response - %s", err) 137 | return 138 | } 139 | decoded, err := dt.transport.Decode(resp) 140 | if err != nil { 141 | error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - error while decoding response - %s", err) 142 | return 143 | } 144 | 145 | switch decoded.Type { 146 | case dendrite.PbErr: 147 | pbMsg := decoded.TransportMsg.(dendrite.PBProtoErr) 148 | error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - got error response - %s", pbMsg.GetError()) 149 | case PbDtableResponse: 150 | pbMsg := decoded.TransportMsg.(PBDTableResponse) 151 | success := pbMsg.GetOk() 152 | if success { 153 | resp_c <- true 154 | return 155 | } 156 | error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - write error - %s", pbMsg.GetError()) 157 | return 158 | default: 159 | // unexpected response 160 | error_c <- fmt.Errorf("ZMQ:DTable:remoteSet - unexpected response") 161 | return 162 | } 163 | }() 164 | 165 | select { 166 | case <-time.After(zmq_transport.ClientTimeout): 167 | done <- fmt.Errorf("ZMQ:DTable:remoteSet - command timed out!") 168 | case err := <-error_c: 169 | done <- err 170 | case _ = <-resp_c: 171 | done <- nil 172 | } 173 | } 174 | 175 | // Client Request: set replicaInfo for replicated item to remote host 176 | func (dt *DTable) remoteSetReplicaInfo(remote *dendrite.Vnode, reqItem *kvItem) error { 177 | error_c := make(chan error, 1) 178 | resp_c := 
174 | 
175 | // Client Request: set replicaInfo for replicated item to remote host
176 | func (dt *DTable) remoteSetReplicaInfo(remote *dendrite.Vnode, reqItem *kvItem) error {
177 | 	error_c := make(chan error, 1)
178 | 	resp_c := make(chan bool, 1)
179 | 	zmq_transport := dt.transport.(*dendrite.ZMQTransport)
180 | 
181 | 	go func() {
182 | 		req_sock, err := zmq_transport.ZMQContext.NewSocket(zmq.REQ)
183 | 		if err != nil {
184 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteSetReplicaInfo - newsocket error - %s", err)
185 | 			return
186 | 		}
187 | 		req_sock.SetRcvtimeo(5 * time.Second)
188 | 		req_sock.SetSndtimeo(5 * time.Second)
189 | 
190 | 		defer req_sock.Close()
191 | 		err = req_sock.Connect("tcp://" + remote.Host)
192 | 		if err != nil {
193 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteSetReplicaInfo - connect error - %s", err)
194 | 			return
195 | 		}
196 | 		// Build request protobuf
197 | 		req := &PBDTableSetReplicaInfo{
198 | 			Dest: remote.ToProtobuf(),
199 | 			KeyHash: reqItem.keyHash,
200 | 			ReplicaInfo: reqItem.replicaInfo.to_protobuf(),
201 | 		}
202 | 
203 | 		reqData, _ := proto.Marshal(req)
204 | 		encoded := dt.transport.Encode(PbDtableSetReplicaInfo, reqData)
205 | 		_, err = req_sock.SendBytes(encoded, 0)
206 | 		if err != nil {
207 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteSetReplicaInfo - error while sending request - %s", err)
208 | 			return
209 | 		}
210 | 
211 | 		// read response and decode it
212 | 		resp, err := req_sock.RecvBytes(0)
213 | 		if err != nil {
214 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteSetReplicaInfo - error while reading response - %s", err)
215 | 			return
216 | 		}
217 | 		decoded, err := dt.transport.Decode(resp)
218 | 		if err != nil {
219 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteSetReplicaInfo - error while decoding response - %s", err)
220 | 			return
221 | 		}
222 | 
223 | 		switch decoded.Type {
224 | 		case dendrite.PbErr:
225 | 			pbMsg := decoded.TransportMsg.(dendrite.PBProtoErr)
226 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteSetReplicaInfo - got error response - %s", pbMsg.GetError())
227 | 		case PbDtableResponse:
228 | 			pbMsg := decoded.TransportMsg.(PBDTableResponse)
229 | 			success := pbMsg.GetOk()
230 | 			if success {
231 | 				resp_c <- true
232 | 				return
233 | 			}
234 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteSetReplicaInfo - write error - %s", pbMsg.GetError())
235 | 			return
236 | 		default:
237 | 			// unexpected response
238 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteSetReplicaInfo - unexpected response")
239 | 			return
240 | 		}
241 | 	}()
242 | 
243 | 	select {
244 | 	case <-time.After(zmq_transport.ClientTimeout):
245 | 		return fmt.Errorf("ZMQ:DTable:remoteSetReplicaInfo - command timed out!")
246 | 	case err := <-error_c:
247 | 		return err
248 | 	case <-resp_c:
249 | 		return nil
250 | 	}
251 | }
252 | 
253 | // Client Request: remove replica
254 | func (dt *DTable) remoteClearReplica(remote *dendrite.Vnode, reqItem *kvItem, demoted bool) error {
255 | 	error_c := make(chan error, 1)
256 | 	resp_c := make(chan bool, 1)
257 | 	zmq_transport := dt.transport.(*dendrite.ZMQTransport)
258 | 
259 | 	go func() {
260 | 		req_sock, err := zmq_transport.ZMQContext.NewSocket(zmq.REQ)
261 | 		if err != nil {
262 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteClearReplica - newsocket error - %s", err)
263 | 			return
264 | 		}
265 | 		req_sock.SetRcvtimeo(5 * time.Second)
266 | 		req_sock.SetSndtimeo(5 * time.Second)
267 | 
268 | 		defer req_sock.Close()
269 | 		err = req_sock.Connect("tcp://" + remote.Host)
270 | 		if err != nil {
271 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteClearReplica - connect error - %s", err)
272 | 			return
273 | 		}
274 | 		// Build request protobuf
275 | 		req := &PBDTableClearReplica{
276 | 			Dest: remote.ToProtobuf(),
277 | 			KeyHash: reqItem.keyHash,
278 | 			Demoted: proto.Bool(demoted),
279 | 		}
280 | 
281 | 		reqData, _ := proto.Marshal(req)
282 | 		encoded := dt.transport.Encode(PbDtableClearReplica, reqData)
283 | 		_, err = req_sock.SendBytes(encoded, 0)
284 | 		if err != nil {
285 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteClearReplica - error while sending request - %s", err)
286 | 			return
287 | 		}
288 | 
289 | 		// read response and decode it
290 | 		resp, err := req_sock.RecvBytes(0)
291 | 		if err != nil {
292 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteClearReplica - error while reading response - %s", err)
293 | 			return
294 | 		}
295 | 		decoded, err := dt.transport.Decode(resp)
296 | 		if err != nil {
297 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteClearReplica - error while decoding response - %s", err)
298 | 			return
299 | 		}
300 | 
301 | 		switch decoded.Type {
302 | 		case dendrite.PbErr:
303 | 			pbMsg := decoded.TransportMsg.(dendrite.PBProtoErr)
304 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteClearReplica - got error response - %s", pbMsg.GetError())
305 | 		case PbDtableResponse:
306 | 			pbMsg := decoded.TransportMsg.(PBDTableResponse)
307 | 			success := pbMsg.GetOk()
308 | 			if success {
309 | 				resp_c <- true
310 | 				return
311 | 			}
312 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteClearReplica - error - %s", pbMsg.GetError())
313 | 			return
314 | 		default:
315 | 			// unexpected response
316 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteClearReplica - unexpected response")
317 | 			return
318 | 		}
319 | 	}()
320 | 
321 | 	select {
322 | 	case <-time.After(zmq_transport.ClientTimeout):
323 | 		return fmt.Errorf("ZMQ:DTable:remoteClearReplica - command timed out!")
324 | 	case err := <-error_c:
325 | 		return err
326 | 	case <-resp_c:
327 | 		return nil
328 | 	}
329 | }
330 | 
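// NOTE (editor's illustrative sketch, not part of the original file): a replica
// round typically combines the two calls defined nearby - push the item to the
// replica vnode, then record its replica metadata. The helper name is
// hypothetical and it assumes the caller has already populated item.replicaInfo.
func (dt *DTable) exampleReplicateItem(origin, remote *dendrite.Vnode, item *kvItem) error {
	// write the raw item to the remote vnode's replica store
	if err := dt.remoteWriteReplica(origin, remote, item); err != nil {
		return err
	}
	// then publish the replica bookkeeping for that key
	return dt.remoteSetReplicaInfo(remote, item)
}
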
331 | // Client Request: take a rvalue and write replica to another host
332 | func (dt *DTable) remoteWriteReplica(origin, remote *dendrite.Vnode, reqItem *kvItem) error {
333 | 	error_c := make(chan error, 1)
334 | 	resp_c := make(chan bool, 1)
335 | 	zmq_transport := dt.transport.(*dendrite.ZMQTransport)
336 | 
337 | 	go func() {
338 | 		req_sock, err := zmq_transport.ZMQContext.NewSocket(zmq.REQ)
339 | 		if err != nil {
340 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteWriteReplica - newsocket error - %s", err)
341 | 			return
342 | 		}
343 | 		req_sock.SetRcvtimeo(5 * time.Second)
344 | 		req_sock.SetSndtimeo(5 * time.Second)
345 | 
346 | 		defer req_sock.Close()
347 | 		err = req_sock.Connect("tcp://" + remote.Host)
348 | 		if err != nil {
349 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteWriteReplica - connect error - %s", err)
350 | 			return
351 | 		}
352 | 		// Build request protobuf
353 | 		req := &PBDTableSetItem{
354 | 			Origin: origin.ToProtobuf(),
355 | 			Dest: remote.ToProtobuf(),
356 | 			Item: reqItem.to_protobuf(),
357 | 		}
358 | 
359 | 		reqData, _ := proto.Marshal(req)
360 | 		encoded := dt.transport.Encode(PbDtableSetReplica, reqData)
361 | 		_, err = req_sock.SendBytes(encoded, 0)
362 | 		if err != nil {
363 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteWriteReplica - error while sending request - %s", err)
364 | 			return
365 | 		}
366 | 
367 | 		// read response and decode it
368 | 		resp, err := req_sock.RecvBytes(0)
369 | 		if err != nil {
370 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteWriteReplica - error while reading response - %s", err)
371 | 			return
372 | 		}
373 | 		decoded, err := dt.transport.Decode(resp)
374 | 		if err != nil {
375 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteWriteReplica - error while decoding response - %s", err)
376 | 			return
377 | 		}
378 | 
379 | 		switch decoded.Type {
380 | 		case dendrite.PbErr:
381 | 			pbMsg := decoded.TransportMsg.(dendrite.PBProtoErr)
382 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteWriteReplica - got error response - %s", pbMsg.GetError())
383 | 		case PbDtableResponse:
384 | 			pbMsg := decoded.TransportMsg.(PBDTableResponse)
385 | 			success := pbMsg.GetOk()
386 | 			if success {
387 | 				resp_c <- true
388 | 				return
389 | 			}
390 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteWriteReplica - write error - %s", pbMsg.GetError())
391 | 			return
392 | 		default:
393 | 			// unexpected response
394 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteWriteReplica - unexpected response")
395 | 			return
396 | 		}
397 | 	}()
398 | 
399 | 	select {
400 | 	case <-time.After(zmq_transport.ClientTimeout):
401 | 		return fmt.Errorf("ZMQ:DTable:remoteWriteReplica - command timed out!")
402 | 	case err := <-error_c:
403 | 		return err
404 | 	case <-resp_c:
405 | 		return nil
406 | 	}
407 | }
408 | 
409 | // Client Request: get dtable status of remote vnode
410 | func (dt *DTable) remoteStatus(remote *dendrite.Vnode) error {
411 | 	error_c := make(chan error, 1)
412 | 	resp_c := make(chan bool, 1)
413 | 	zmq_transport := dt.transport.(*dendrite.ZMQTransport)
414 | 
415 | 	go func() {
416 | 		req_sock, err := zmq_transport.ZMQContext.NewSocket(zmq.REQ)
417 | 		if err != nil {
418 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteStatus - newsocket error - %s", err)
419 | 			return
420 | 		}
421 | 		req_sock.SetRcvtimeo(5 * time.Second)
422 | 		req_sock.SetSndtimeo(5 * time.Second)
423 | 
424 | 		defer req_sock.Close()
425 | 		err = req_sock.Connect("tcp://" + remote.Host)
426 | 		if err != nil {
427 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteStatus - connect error - %s", err)
428 | 			return
429 | 		}
430 | 		// Build request protobuf
431 | 		req := &PBDTableStatus{
432 | 			Dest: remote.ToProtobuf(),
433 | 		}
434 | 
435 | 		reqData, _ := proto.Marshal(req)
436 | 		encoded := dt.transport.Encode(PbDtableStatus, reqData)
437 | 		_, err = req_sock.SendBytes(encoded, 0)
438 | 		if err != nil {
439 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteStatus - error while sending request - %s", err)
440 | 			return
441 | 		}
442 | 
443 | 		// read response and decode it
444 | 		resp, err := req_sock.RecvBytes(0)
445 | 		if err != nil {
446 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteStatus - error while reading response - %s", err)
447 | 			return
448 | 		}
449 | 		decoded, err := dt.transport.Decode(resp)
450 | 		if err != nil {
451 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteStatus - error while decoding response - %s", err)
452 | 			return
453 | 		}
454 | 
455 | 		switch decoded.Type {
456 | 		case dendrite.PbErr:
457 | 			pbMsg := decoded.TransportMsg.(dendrite.PBProtoErr)
458 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteStatus - got error response - %s", pbMsg.GetError())
459 | 		case PbDtableResponse:
460 | 			pbMsg := decoded.TransportMsg.(PBDTableResponse)
461 | 			success := pbMsg.GetOk()
462 | 			if success {
463 | 				resp_c <- true
464 | 				return
465 | 			}
466 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteStatus - error - %s", pbMsg.GetError())
467 | 			return
468 | 		default:
469 | 			// unexpected response
470 | 			error_c <- fmt.Errorf("ZMQ:DTable:remoteStatus - unexpected response")
471 | 			return
472 | 		}
473 | 	}()
474 | 
475 | 	select {
476 | 	case <-time.After(zmq_transport.ClientTimeout):
477 | 		return fmt.Errorf("ZMQ:DTable:remoteStatus - command timed out!")
478 | 	case err := <-error_c:
479 | 		return err
480 | 	case <-resp_c:
481 | 		return nil
482 | 	}
483 | }
484 | 
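// NOTE (editor's illustrative sketch, not part of the original file): every
// client call in this file repeats the same REQ/REP round trip - open a ZeroMQ
// REQ socket, send one encoded ChordMsg, read one reply and map
// PBProtoErr / PBDTableResponse onto an error. The duplication could be folded
// into a helper along these lines; exampleRoundTrip is a hypothetical name and
// the caller is left to wrap it in the usual ClientTimeout select.
func (dt *DTable) exampleRoundTrip(remote *dendrite.Vnode, encoded []byte) error {
	zmq_transport := dt.transport.(*dendrite.ZMQTransport)
	req_sock, err := zmq_transport.ZMQContext.NewSocket(zmq.REQ)
	if err != nil {
		return fmt.Errorf("ZMQ:DTable:exampleRoundTrip - newsocket error - %s", err)
	}
	defer req_sock.Close()
	req_sock.SetRcvtimeo(5 * time.Second)
	req_sock.SetSndtimeo(5 * time.Second)
	if err = req_sock.Connect("tcp://" + remote.Host); err != nil {
		return fmt.Errorf("ZMQ:DTable:exampleRoundTrip - connect error - %s", err)
	}
	// one framed request, one framed reply
	if _, err = req_sock.SendBytes(encoded, 0); err != nil {
		return fmt.Errorf("ZMQ:DTable:exampleRoundTrip - send error - %s", err)
	}
	resp, err := req_sock.RecvBytes(0)
	if err != nil {
		return fmt.Errorf("ZMQ:DTable:exampleRoundTrip - recv error - %s", err)
	}
	decoded, err := dt.transport.Decode(resp)
	if err != nil {
		return fmt.Errorf("ZMQ:DTable:exampleRoundTrip - decode error - %s", err)
	}
	switch decoded.Type {
	case dendrite.PbErr:
		pbMsg := decoded.TransportMsg.(dendrite.PBProtoErr)
		return fmt.Errorf("ZMQ:DTable:exampleRoundTrip - error response - %s", pbMsg.GetError())
	case PbDtableResponse:
		pbMsg := decoded.TransportMsg.(PBDTableResponse)
		if pbMsg.GetOk() {
			return nil
		}
		return fmt.Errorf("ZMQ:DTable:exampleRoundTrip - %s", pbMsg.GetError())
	default:
		return fmt.Errorf("ZMQ:DTable:exampleRoundTrip - unexpected response")
	}
}

// e.g. err := dt.exampleRoundTrip(remote, dt.transport.Encode(PbDtableStatus, reqData))
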
485 | // Client Request: promote remote vnode for a key
486 | func (dt *DTable) remotePromoteKey(origin, remote *dendrite.Vnode, reqItem *kvItem) error {
487 | 	error_c := make(chan error, 1)
488 | 	resp_c := make(chan bool, 1)
489 | 	zmq_transport := dt.transport.(*dendrite.ZMQTransport)
490 | 
491 | 	go func() {
492 | 		req_sock, err := zmq_transport.ZMQContext.NewSocket(zmq.REQ)
493 | 		if err != nil {
494 | 			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - newsocket error - %s", err)
495 | 			return
496 | 		}
497 | 		req_sock.SetRcvtimeo(5 * time.Second)
498 | 		req_sock.SetSndtimeo(5 * time.Second)
499 | 
500 | 		defer req_sock.Close()
501 | 		err = req_sock.Connect("tcp://" + remote.Host)
502 | 		if err != nil {
503 | 			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - connect error - %s", err)
504 | 			return
505 | 		}
506 | 
507 | 		// Build request protobuf
508 | 		req := &PBDTablePromoteKey{
509 | 			Dest: remote.ToProtobuf(),
510 | 			Origin: origin.ToProtobuf(),
511 | 			Item: reqItem.to_protobuf(),
512 | 		}
513 | 
514 | 		reqData, _ := proto.Marshal(req)
515 | 		encoded := dt.transport.Encode(PbDtablePromoteKey, reqData)
516 | 		_, err = req_sock.SendBytes(encoded, 0)
517 | 		if err != nil {
518 | 			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - error while sending request - %s", err)
519 | 			return
520 | 		}
521 | 
522 | 		// read response and decode it
523 | 		resp, err := req_sock.RecvBytes(0)
524 | 		if err != nil {
525 | 			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - error while reading response - %s", err)
526 | 			return
527 | 		}
528 | 		decoded, err := dt.transport.Decode(resp)
529 | 		if err != nil {
530 | 			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - error while decoding response - %s", err)
531 | 			return
532 | 		}
533 | 
534 | 		switch decoded.Type {
535 | 		case dendrite.PbErr:
536 | 			pbMsg := decoded.TransportMsg.(dendrite.PBProtoErr)
537 | 			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - got error response - %s", pbMsg.GetError())
538 | 		case PbDtableResponse:
539 | 			pbMsg := decoded.TransportMsg.(PBDTableResponse)
540 | 			success := pbMsg.GetOk()
541 | 			if success {
542 | 				resp_c <- true
543 | 				return
544 | 			}
545 | 			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - error - %s", pbMsg.GetError())
546 | 			return
547 | 		default:
548 | 			// unexpected response
549 | 			error_c <- fmt.Errorf("ZMQ:DTable:remotePromoteKey - unexpected response")
550 | 			return
551 | 		}
552 | 	}()
553 | 
554 | 	select {
555 | 	case <-time.After(zmq_transport.ClientTimeout):
556 | 		return fmt.Errorf("ZMQ:DTable:remotePromoteKey - command timed out!")
557 | 	case err := <-error_c:
558 | 		return err
559 | 	case <-resp_c:
560 | 		return nil
561 | 	}
562 | }
563 | 
--------------------------------------------------------------------------------