├── .gitignore
├── 1-etcd的使用场景.md
├── 10-etcd选型对比.md
├── 2-etcd中watch的源码解析.md
├── 3-grpc通过etcd实现服务发现.md
├── 4-centos7中部署etcd.md
├── 5-raft算法理解.md
├── 6-etcd实现raft源码解读.md
├── 7-raft和线性一致性.md
├── 8-etcd中的lease.md
├── 9-etcd中存储的实现.md
├── README.md
├── discovery
    ├── README.md
    ├── grpc_discovery_example
    │   ├── client
    │   │   └── main.go
    │   └── server
    │   │   ├── helloworld
    │   │       ├── client.go
    │   │       ├── helloworld.pb.go
    │   │       └── helloworld.proto
    │   │   ├── main.go
    │   │   └── service.go
    ├── helloworld
    │   ├── helloworld.pb.go
    │   └── helloworld.proto
    ├── instance.go
    ├── register.go
    ├── register_test.go
    ├── resolver.go
    └── resolver_test.go
├── drawio
    └── etcd.drawio
├── etcdMutex
    ├── README.md
    └── main.go
├── go.mod
├── go.sum
├── img
    ├── etcd-balance.png
    ├── etcd-consul.webp
    ├── etcd-leader.png
    ├── etcd-lease.png
    ├── etcd-lock.png
    ├── etcd-mvcc.png
    ├── etcd-notify.png
    ├── etcd-raft-node-pre.png
    ├── etcd-raft-node.png
    ├── etcd-raft-read-follower.png
    ├── etcd-raft-read-leader.png
    ├── etcd-raft-wal.jpg
    ├── etcd-raftExample.jpg
    ├── etcd-register.png
    ├── etcd-register_1.png
    ├── etcd-server-1.png
    ├── etcd-server.png
    ├── etcd-watch-client.png
    ├── etcd-watch.png
    ├── etcd.png
    ├── etcd_1.png
    ├── grpc_balance_1.png
    ├── grpc_balance_2.png
    ├── grpc_balance_3.png
    ├── grpc_balance_4.png
    ├── k8s-etcd.webp
    ├── raft-leader.png
    ├── raft-log_1.png
    ├── raft-net.png
    ├── raft_1.png
    ├── raftexample.jpg
    └── zookeeper.webp
├── main.go
├── other.md
├── queue
    ├── README.md
    └── main.go
├── raftexample
    ├── Procfile
    ├── README.md
    ├── doc.go
    ├── httpapi.go
    ├── kvstore.go
    ├── kvstore_test.go
    ├── listener.go
    ├── main.go
    ├── raft.go
    └── raftexample_test.go
└── sync
    ├── README.md
    └── cache
        ├── cache_debug_show.go
        └── cache_update.go


/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea/
2 | 


--------------------------------------------------------------------------------
/1-etcd的使用场景.md:
--------------------------------------------------------------------------------
  1 | <!-- START doctoc generated TOC please keep comment here to allow auto update -->
  2 | <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
  3 | 
  4 | - [etcd的使用](#etcd%E7%9A%84%E4%BD%BF%E7%94%A8)
  5 |   - [什么是etcd](#%E4%BB%80%E4%B9%88%E6%98%AFetcd)
  6 |   - [etcd的特点](#etcd%E7%9A%84%E7%89%B9%E7%82%B9)
  7 |   - [etcd的应用场景](#etcd%E7%9A%84%E5%BA%94%E7%94%A8%E5%9C%BA%E6%99%AF)
  8 |     - [服务注册与发现](#%E6%9C%8D%E5%8A%A1%E6%B3%A8%E5%86%8C%E4%B8%8E%E5%8F%91%E7%8E%B0)
  9 |     - [消息发布和订阅](#%E6%B6%88%E6%81%AF%E5%8F%91%E5%B8%83%E5%92%8C%E8%AE%A2%E9%98%85)
 10 |     - [负载均衡](#%E8%B4%9F%E8%BD%BD%E5%9D%87%E8%A1%A1)
 11 |     - [分布式通知与协调](#%E5%88%86%E5%B8%83%E5%BC%8F%E9%80%9A%E7%9F%A5%E4%B8%8E%E5%8D%8F%E8%B0%83)
 12 |     - [分布式锁](#%E5%88%86%E5%B8%83%E5%BC%8F%E9%94%81)
 13 |     - [分布式队列](#%E5%88%86%E5%B8%83%E5%BC%8F%E9%98%9F%E5%88%97)
 14 |     - [集群监控与Leader竞选](#%E9%9B%86%E7%BE%A4%E7%9B%91%E6%8E%A7%E4%B8%8Eleader%E7%AB%9E%E9%80%89)
 15 |   - [参考](#%E5%8F%82%E8%80%83)
 16 | 
 17 | <!-- END doctoc generated TOC please keep comment here to allow auto update -->
 18 | 
 19 | ## etcd的使用
 20 | 
 21 | ### 什么是etcd
 22 | 
 23 | etcd是一个分布式、可靠的`key-value`存储系统，用于存储分布式系统中的关键数据；当然，它不仅仅用于存储，还提供配置共享及服务发现；基于Go语言实现。
 24 | 
 25 | ### etcd的特点
 26 | 
 27 | - 完全复制：集群中的每个节点都可以使用完整的存档
 28 | 
 29 | - 高可用性：Etcd可用于避免硬件的单点故障或网络问题
 30 | 
 31 | - 一致性：每次读取都会返回跨多主机的最新写入
 32 | 
 33 | - 简单：包括一个定义良好、面向用户的API（gRPC）
 34 | 
 35 | - 安全：实现了带有可选的客户端证书身份验证的自动化TLS
 36 | 
 37 | - 可靠：使用Raft算法实现了强一致、高可用的服务存储目录
 38 | 
 39 | ### etcd的应用场景
 40 | 
 41 | #### 服务注册与发现
 42 | 
 43 | etcd既可以用于服务注册，也可以用于服务发现。
 44 | 
 45 | 服务注册发现解决的是分布式系统中最常见的问题之一，即在同一个分布式系统中，找到我们需要的目标服务，建立连接，然后完成整个链路的调度。   
 46 | 
 47 | 本质上来说，服务发现就是想要了解集群中是否有进程在监听 udp 或 tcp 端口，并且通过名字就可以查找和连接。要解决服务发现的问题，需要有下面三大支柱，缺一不可。  
 48 | 
 49 | 1、**一个强一致性、高可用的服务存储目录**。基于Raft算法的etcd天生就是这样一个强一致性高可用的服务存储目录。  
 50 | 
 51 | 2、**一种注册服务和监控服务健康状态的机制**。用户可以在etcd中注册服务，并且对注册的服务设置`key TTL`，定时保持服务的心跳以达到监控健康状态的效果。   
 52 | 
 53 | 3、**一种查找和连接服务的机制**。通过在 etcd 指定的主题下注册的服务也能在对应的主题下查找到。为了确保连接，我们可以在每个服务机器上都部署一个Proxy模式的etcd，这样就可以确保能访问etcd集群的服务都能互相连接。    
 54 | 
 55 | <img src="/img/etcd_1.png" alt="etcd" align=center/>
 56 | 
 57 | 一个用户的api请求，可能调用多个微服务资源，这些服务我们可以使用etcd进行服务注册和服务发现，当每个服务启动的时候就注册到etcd中，当我们需要使用的时候，直接在etcd中寻找，调用即可。  
 58 | 
 59 | 当然，每个服务的实例不止一个，比如我们的用户服务，我们可能启动了多个实例，这些实例在服务启动过程中全部注册到了etcd中，但是某个实例可能出现故障重启，这时候etcd在进行转发的时候，就会屏蔽掉故障的实例节点，只向正常运行的实例进行请求转发。   
 60 | 
 61 | <img src="/img/etcd-register.png" alt="etcd" align=center/>
 62 | 
 63 | 来看个服务注册发现的demo
 64 | 
 65 | 这里放一段比较核心的代码，这里摘录了我们线上正在使用的etcd实现grpc服务注册和发现的实现，具体的实现可参考，[etcd实现grpc的服务注册和服务发现](https://github.com/boilingfrog/daily-test/tree/master/etcd/discovery)  
 66 | 
 67 | 对于etcd中的连接，我们每个都维护一个租约，通过KeepAlive自动续保。如果租约过期则所有附加在租约上的key将过期并被删除，即所对应的服务被拿掉。  
 68 | 
 69 | ```go
 70 | package discovery
 71 | 
 72 | import (
 73 | 	"context"
 74 | 	"encoding/json"
 75 | 	"errors"
 76 | 	"net/http"
 77 | 	"strconv"
 78 | 	"strings"
 79 | 	"time"
 80 | 
 81 | 	clientv3 "go.etcd.io/etcd/client/v3"
 82 | 	"go.uber.org/zap"
 83 | )
 84 | 
 85 | // Register for grpc server
 86 | type Register struct {
 87 | 	EtcdAddrs   []string
 88 | 	DialTimeout int
 89 | 
 90 | 	closeCh     chan struct{}
 91 | 	leasesID    clientv3.LeaseID
 92 | 	keepAliveCh <-chan *clientv3.LeaseKeepAliveResponse
 93 | 
 94 | 	srvInfo Server
 95 | 	srvTTL  int64
 96 | 	cli     *clientv3.Client
 97 | 	logger  *zap.Logger
 98 | }
 99 | 
100 | // NewRegister create a register base on etcd
101 | func NewRegister(etcdAddrs []string, logger *zap.Logger) *Register {
102 | 	return &Register{
103 | 		EtcdAddrs:   etcdAddrs,
104 | 		DialTimeout: 3,
105 | 		logger:      logger,
106 | 	}
107 | }
108 | 
109 | // Register connects to etcd, publishes srvInfo under a lease of ttl seconds and starts the
110 | // keep-alive loop. Sending on the returned channel (or calling Stop) unregisters the service.
111 | func (r *Register) Register(srvInfo Server, ttl int64) (chan<- struct{}, error) {
112 | 	var err error
113 | 
114 | 	if strings.Split(srvInfo.Addr, ":")[0] == "" {
115 | 		return nil, errors.New("invalid ip")
116 | 	}
117 | 
118 | 	if r.cli, err = clientv3.New(clientv3.Config{
119 | 		Endpoints:   r.EtcdAddrs,
120 | 		DialTimeout: time.Duration(r.DialTimeout) * time.Second,
121 | 	}); err != nil {
122 | 		return nil, err
123 | 	}
124 | 
125 | 	r.srvInfo = srvInfo
126 | 	r.srvTTL = ttl
127 | 
128 | 	if err = r.register(); err != nil {
129 | 		_ = r.cli.Close() // do not leak the client's connections; the register error is the one to report
130 | 		return nil, err
131 | 	}
132 | 
133 | 	r.closeCh = make(chan struct{})
134 | 	go r.keepAlive()
135 | 	return r.closeCh, nil
136 | }
137 | 
138 | // Stop stop register
139 | func (r *Register) Stop() {
140 | 	r.closeCh <- struct{}{}
141 | }
142 | 
143 | // register 注册节点
144 | func (r *Register) register() error {
145 | 	leaseCtx, cancel := context.WithTimeout(context.Background(), time.Duration(r.DialTimeout)*time.Second)
146 | 	defer cancel()
147 | 
148 | 	leaseResp, err := r.cli.Grant(leaseCtx, r.srvTTL)
149 | 	if err != nil {
150 | 		return err
151 | 	}
152 | 	r.leasesID = leaseResp.ID
153 | 	if r.keepAliveCh, err = r.cli.KeepAlive(context.Background(), leaseResp.ID); err != nil {
154 | 		return err
155 | 	}
156 | 
157 | 	data, err := json.Marshal(r.srvInfo)
158 | 	if err != nil {
159 | 		return err
160 | 	}
161 | 	_, err = r.cli.Put(context.Background(), BuildRegPath(r.srvInfo), string(data), clientv3.WithLease(r.leasesID))
162 | 	return err
163 | }
164 | 
165 | // unregister 删除节点
166 | func (r *Register) unregister() error {
167 | 	_, err := r.cli.Delete(context.Background(), BuildRegPath(r.srvInfo))
168 | 	return err
169 | }
170 | 
171 | // keepAlive consumes lease keep-alive responses until closeCh fires, re-registering whenever the lease is lost.
172 | func (r *Register) keepAlive() {
173 | 	ticker := time.NewTicker(time.Duration(r.srvTTL) * time.Second)
174 | 	defer ticker.Stop() // release the ticker once the loop returns
175 | 	for {
176 | 		select {
177 | 		case <-r.closeCh: // shutdown requested: remove the key, then revoke its lease
178 | 			if err := r.unregister(); err != nil {
179 | 				r.logger.Error("unregister failed", zap.Error(err))
180 | 			}
181 | 			if _, err := r.cli.Revoke(context.Background(), r.leasesID); err != nil {
182 | 				r.logger.Error("revoke failed", zap.Error(err))
183 | 			}
184 | 			return
185 | 		case res := <-r.keepAliveCh:
186 | 			if res != nil {
187 | 				continue // lease renewed, nothing to do
188 | 			}
189 | 			r.keepAliveCh = nil // nil response (closed channel): stop selecting on it so a failed re-register cannot busy-loop
190 | 		case <-ticker.C: // periodic retry point for a previously failed re-register
191 | 		}
192 | 		if r.keepAliveCh == nil {
193 | 			if err := r.register(); err != nil {
194 | 				r.logger.Error("register failed", zap.Error(err))
195 | 			}
196 | 		}
197 | 	}
198 | }
199 | ```
200 | 
201 | #### 消息发布和订阅
202 | 
203 | 在分布式系统中，最适用的一种组件间通信方式就是消息发布与订阅。即构建一个配置共享中心，数据提供者在这个配置中心发布消息，而消息使用者则订阅他们关心的主题，一旦主题有消息发布，就会实时通知订阅者。通过这种方式可以做到分布式系统配置的集中式管理与动态更新  
204 | 
205 | - **应用中用到的一些配置信息放到 etcd 上进行集中管理**。这类场景的使用方式通常是这样：应用在启动的时候主动从etcd获取一次配置信息，同时，在etcd节点上注册一个Watcher并等待，以后每次配置有更新的时候，etcd都会实时通知订阅者，以此达到获取最新配置信息的目的。  
206 | 
207 | - **分布式搜索服务中，索引的元信息和服务器集群机器的节点状态存放在etcd中**，供各个客户端订阅使用。使用etcd的`key TTL`功能可以确保机器状态是实时更新的。  
208 | 
209 | - **分布式日志收集系统**。这个系统的核心工作是收集分布在不同机器的日志。收集器通常是按照应用（或主题）来分配收集任务单元，因此可以在 etcd 上创建一个以应用（主题）命名的目录 P，并将这个应用（主题相关）的所有机器 ip，以子目录的形式存储到目录 P 上，然后设置一个etcd递归的Watcher，递归式的监控应用（主题）目录下所有信息的变动。这样就实现了机器 IP（消息）变动的时候，能够实时通知到收集器调整任务分配。 
210 | 
211 | - **系统中信息需要动态自动获取与人工干预修改信息请求内容的情况**。通常是暴露出接口，例如 JMX 接口，来获取一些运行时的信息。引入 etcd 之后，就不用自己实现一套方案了，只要将这些信息存放到指定的 etcd 目录中即可，etcd 的这些目录就可以通过 HTTP 的接口在外部访问。   
212 | 
213 | <img src="/img/etcd-watch.png" alt="etcd" align=center/>
214 | 
215 | 消息发布与订阅的实际应用  
216 | 
217 | 我们一个性能要求比较高的项目，所需要的配置信息，存放到本地的localCache中，通过etcd的消息发布和订阅实现，实现配置信息在不同节点同步更新。  
218 | 
219 | 来看下如何实现  
220 | 
221 | ```go
222 | func init() {
223 | 	handleMap = make(map[string]func([]byte) error)
224 | }
225 | 
226 | var handleMap map[string]func([]byte) error
227 | 
228 | func RegisterUpdateHandle(key string, f func([]byte) error) {
229 | 	handleMap[key] = f
230 | }
231 | 
232 | type PubClient interface {
233 | 	Pub(ctx context.Context, key string, val string) error
234 | }
235 | 
236 | var Pub PubClient
237 | 
238 | type PubClientImpl struct {
239 | 	client *clientv3.Client
240 | 	logger *zap.Logger
241 | 	prefix string
242 | }
243 | 
244 | // 监听变化，实时更新到本地的map中
245 | func (c *PubClientImpl) Watcher() {
246 | 	ctx, cancel := context.WithCancel(context.Background())
247 | 	rch := c.client.Watch(ctx, c.prefix, clientv3.WithPrefix())
248 | 	defer cancel()
249 | 
250 | 	for wresp := range rch {
251 | 		for _, ev := range wresp.Events {
252 | 			switch ev.Type {
253 | 			case mvccpb.PUT:
254 | 				c.logger.Warn("Cache Update", zap.Any("value", ev.Kv))
255 | 				err := handleCacheUpdate(ev.Kv)
256 | 				if err != nil {
257 | 					c.logger.Error("Cache Update", zap.Error(err))
258 | 				}
259 | 			case mvccpb.DELETE:
260 | 				c.logger.Error("Cache Delete NOT SUPPORT")
261 | 			}
262 | 		}
263 | 	}
264 | }
265 | 
266 | func handleCacheUpdate(val *mvccpb.KeyValue) error {
267 | 	if val == nil {
268 | 		return nil
269 | 	}
270 | 	f := handleMap[string(val.Key)]
271 | 	if f != nil {
272 | 		return f(val.Value)
273 | 	}
274 | 	return nil
275 | }
276 | 
277 | // Pub writes val under key in etcd, bounding the request to 10 seconds.
278 | func (c *PubClientImpl) Pub(ctx context.Context, key string, val string) error {
279 | 	ctx, cancel := context.WithTimeout(ctx, time.Second*10)
280 | 	// Release the timeout's resources as soon as Put returns instead of holding them until the deadline.
281 | 	defer cancel()
282 | 	_, err := c.client.Put(ctx, key, val)
283 | 	return err
284 | }
285 | ```
286 | 
287 | #### 负载均衡
288 | 
289 | 关于负载均衡，通常意义上有两种  
290 | 
291 | - 软负载，顾名思义就是靠软件手段来实现的负载均衡。软负载也通常被称为 4层或 7 层负载！  
292 | 
293 | - 硬负载，就是靠硬件实现的负载均衡，数据包转发功能。常见的就是 F5。  
294 | 
295 | 通过etcd实现的负载均衡就是软负载，在分布式系统中，高并发的场景下，我们通常会构建服务的集群，当某一个机器宕机了，别的机器可以马上顶替上来。   
296 | 
297 | etcd中实现负载均衡，例如我们上文的例子服务注册和发现，对于一个用户服务来讲，后面的用户服务的实例可能是多个，每个都有自己的ip和port，这些服务会在项目启动的时候全部注册到etcd中，所以当使用的时候，每次etcd会轮询出一个健康的服务实例，来处理用户的请求。  
298 | 
299 | <img src="/img/etcd-balance.png" alt="etcd" align=center/>
300 | 
301 | #### 分布式通知与协调
302 | 
303 | 这里说到的分布式通知与协调，与消息发布和订阅有些相似。都用到了etcd中的Watcher机制，通过注册与异步通知机制，实现分布式环境下不同系统之间的通知与协调，从而对数据变更做到实时处理。实现方式通常是这样：不同系统都在etcd上对同一个目录进行注册，同时设置Watcher观测该目录的变化（如果对子目录的变化也有需要，可以设置递归模式），当某个系统更新了etcd的目录，那么设置了Watcher的系统就会收到通知，并作出相应处理。  
304 | 
305 | - **通过etcd进行低耦合的心跳检测**。检测系统和被检测系统通过 etcd 上某个目录关联而非直接关联起来，这样可以大大减少系统的耦合性。  
306 | 
307 | - **通过etcd完成系统调度**。某系统有控制台和推送系统两部分组成，控制台的职责是控制推送系统进行相应的推送工作。管理人员在控制台作的一些操作，实际上是修改了etcd上某些目录节点的状态，而etcd就把这些变化通知给注册了Watcher的推送系统客户端，推送系统再作出相应的推送任务。  
308 | 
309 | - **通过etcd完成工作汇报**。大部分类似的任务分发系统，子任务启动后，到etcd来注册一个临时工作目录，并且定时将自己的进度进行汇报（将进度写入到这个临时目录），这样任务管理者就能够实时知道任务进度。    
310 | 
311 | <img src="/img/etcd-notify.png" alt="etcd" align=center/>
312 | 
313 | #### 分布式锁
314 | 
315 | 因为etcd使用Raft算法保持了数据的强一致性，某次操作存储到集群中的值必然是全局一致的，所以很容易实现分布式锁。锁服务有两种使用方式，一是保持独占，二是控制时序。  
316 | 
317 | 首先，来看一下分布式锁应该具备哪些条件。  
318 | 
319 | - 互斥性：在任意时刻，对于同一个锁，只有一个客户端能持有，从而保证一个共享资源同一时间只能被一个客户端操作；  
320 | 
321 | - 安全性：即不会形成死锁，当一个客户端在持有锁的期间崩溃而没有主动解锁的情况下，其持有的锁也能够被正确释放，并保证后续其它客户端能加锁；  
322 | 
323 | - 可用性：当提供锁服务的节点发生宕机等不可恢复性故障时，“热备” 节点能够接替故障的节点继续提供服务，并保证自身持有的数据与故障节点一致。  
324 | 
325 | - 对称性：对于任意一个锁，其加锁和解锁必须是同一个客户端，即客户端 A 不能把客户端 B 加的锁给解了。
326 | 
327 | etcd的 Watch 机制、Lease 机制、Revision 机制和 Prefix 机制，这些机制赋予了 Etcd 实现分布式锁的能力。  
328 | 
329 | - Lease 机制  
330 | 
331 | 即租约机制（TTL，Time To Live），Etcd 可以为存储的 Key-Value 对设置租约，当租约到期，Key-Value 将失效删除；同时也支持续约，通过客户端可以在租约到期之前续约，以避免 Key-Value 对过期失效。Lease 机制可以保证分布式锁的安全性，为锁对应的 Key 配置租约，即使锁的持有者因故障而不能主动释放锁，锁也会因租约到期而自动释放。  
332 | 
333 | - Revision 机制  
334 | 
335 | 每个 Key 带有一个 Revision 号，每进行一次事务便加一，因此它是全局唯一的，如初始值为 0，进行一次 put(key, value)，Key 的 Revision 变为 1，同样的操作，再进行一次，Revision 变为 2；换成 key1 进行`put(key1, value)`操作，Revision将变为 3；这种机制有一个作用：通过 Revision 的大小就可以知道写操作的顺序。在实现分布式锁时，多个客户端同时抢锁，根据 Revision 号大小依次获得锁，可以避免 “羊群效应” （也称“惊群效应”），实现公平锁。  
336 | 
337 | - Prefix 机制  
338 | 
339 | 即前缀机制，也称目录机制，例如，一个名为 `/mylock` 的锁，两个争抢它的客户端进行写操作，实际写入的Key分别为：`key1="/mylock/UUID1"`,`key2="/mylock/UUID2"`，其中，UUID表示全局唯一的ID，确保两个Key的唯一性。很显然，写操作都会成功，但返回的Revision不一样，那么，如何判断谁获得了锁呢？通过前缀`“/mylock”`查询，返回包含两个Key-Value对的Key-Value列表，同时也包含它们的Revision，通过Revision大小，客户端可以判断自己是否获得锁，如果抢锁失败，则等待锁释放（对应的 Key 被删除或者租约过期），然后再判断自己是否可以获得锁。  
340 | 
341 | - Watch 机制
342 | 
343 | 即监听机制，Watch机制支持监听某个固定的Key，也支持监听一个范围（前缀机制），当被监听的Key或范围发生变化，客户端将收到通知；在实现分布式锁时，如果抢锁失败，可通过Prefix机制返回的Key-Value列表获得Revision比自己小且相差最小的 Key（称为 Pre-Key），对Pre-Key进行监听，因为只有它释放锁，自己才能获得锁，如果监听到Pre-Key的DELETE事件，则说明Pre-Key已经释放，自己已经持有锁。      
344 | 
345 | 来看下etcd中锁是如何实现的  
346 | 
347 | `client/v3/concurrency/mutex.go`  
348 | 
349 | ```go
350 | // Mutex implements the sync Locker interface with etcd
351 | type Mutex struct {
352 | 	s *Session
353 | 
354 | 	pfx   string // 前缀
355 | 	myKey string // key
356 | 	myRev int64 // 自增的Revision
357 | 	hdr   *pb.ResponseHeader
358 | }
359 | 
360 | // Lock 使用可取消的context锁定互斥锁。如果context被取消
361 | // 在尝试获取锁时，互斥锁会尝试清除其过时的锁条目。
362 | func (m *Mutex) Lock(ctx context.Context) error {
363 | 	resp, err := m.tryAcquire(ctx)
364 | 	if err != nil {
365 | 		return err
366 | 	}
367 | 	// if no key on prefix / the minimum rev is key, already hold the lock
368 | 	ownerKey := resp.Responses[1].GetResponseRange().Kvs
369 | 	if len(ownerKey) == 0 || ownerKey[0].CreateRevision == m.myRev {
370 | 		m.hdr = resp.Header
371 | 		return nil
372 | 	}
373 | 	client := m.s.Client()
374 | 
375 | 	// waitDeletes 有效地等待，直到所有键匹配前缀且不大于
376 | 	// 创建的version。
377 | 	_, werr := waitDeletes(ctx, client, m.pfx, m.myRev-1)
378 | 	// release lock key if wait failed
379 | 	if werr != nil {
380 | 		m.Unlock(client.Ctx())
381 | 		return werr
382 | 	}
383 | 
384 | 	// make sure the session is not expired, and the owner key still exists.
385 | 	gresp, werr := client.Get(ctx, m.myKey)
386 | 	if werr != nil {
387 | 		m.Unlock(client.Ctx())
388 | 		return werr
389 | 	}
390 | 
391 | 	if len(gresp.Kvs) == 0 { // is the session key lost?
392 | 		return ErrSessionExpired
393 | 	}
394 | 	m.hdr = gresp.Header
395 | 
396 | 	return nil
397 | }
398 | 
399 | func (m *Mutex) tryAcquire(ctx context.Context) (*v3.TxnResponse, error) {
400 | 	s := m.s
401 | 	client := m.s.Client()
402 | 	// s.Lease()租约
403 | 	m.myKey = fmt.Sprintf("%s%x", m.pfx, s.Lease())
404 | 	// 比较Revision, 这里构建了一个比较表达式
405 | 	// 具体的比较逻辑在下面的client.Txn用到
406 | 	// 如果等于0，写入当前的key，并设置租约，
407 | 	// 否则获取这个key,重用租约中的锁(这里主要目的是在于重入)
408 | 	// 通过第二次获取锁,判断锁是否存在来支持重入
409 | 	// 所以只要租约一致,那么是可以重入的.
410 | 	cmp := v3.Compare(v3.CreateRevision(m.myKey), "=", 0)
411 | 	// 通过 myKey 将自己锁在waiters；最早的waiters将获得锁
412 | 	put := v3.OpPut(m.myKey, "", v3.WithLease(s.Lease()))
413 | 	// 获取已经拿到锁的key的信息
414 | 	get := v3.OpGet(m.myKey)
415 | 	// 仅使用一个 RPC 获取当前持有者以完成无竞争路径
416 | 	getOwner := v3.OpGet(m.pfx, v3.WithFirstCreate()...)
417 | 	// 这里是比较的逻辑，如果等于0，写入当前的key，否则则读取这个key
418 | 	// 大佬的代码写的就是奇妙
419 | 	resp, err := client.Txn(ctx).If(cmp).Then(put, getOwner).Else(get, getOwner).Commit()
420 | 	if err != nil {
421 | 		return nil, err
422 | 	}
423 | 
424 | 	// 根据比较操作的结果写入Revision到m.myRev中
425 | 	m.myRev = resp.Header.Revision
426 | 	if !resp.Succeeded {
427 | 		m.myRev = resp.Responses[0].GetResponseRange().Kvs[0].CreateRevision
428 | 	}
429 | 	return resp, nil
430 | }
431 | 
432 | // 抽象出了一个session对象来持续保持租约不过期
433 | func NewSession(client *v3.Client, opts ...SessionOption) (*Session, error) {
434 | 	...
435 | 	ctx, cancel := context.WithCancel(ops.ctx)
436 | 	// 保证锁，在线程的活动期间，实现锁的的续租
437 | 	keepAlive, err := client.KeepAlive(ctx, id)
438 | 	if err != nil || keepAlive == nil {
439 | 		cancel()
440 | 		return nil, err
441 | 	}
442 | 
443 | 	...
444 | 	return s, nil
445 | }
446 | ```
447 | 
448 | 设计思路： 
449 | 
450 | 1、多个请求前来抢占锁，通过Revision来判断锁的先后顺序；    
451 | 
452 | 2、如果有比当前key的Revision小的Revision存在，说明有key已经获得了锁；  
453 | 
454 | 3、等待直到前面的key被删除，然后自己就获得了锁。   
455 | 
456 | 通过etcd实现的锁，直接包含了锁的续租，如果使用Redis还要自己去实现，相比较使用更简单。  
457 | 
458 | <img src="/img/etcd-lock.png" alt="etcd" align=center/>
459 | 
460 | 来实现一个etcd的锁   
461 | 
462 | ```go
463 | package main
464 | 
465 | import (
466 | 	"context"
467 | 	"fmt"
468 | 	"log"
469 | 	"time"
470 | 
471 | 	clientv3 "go.etcd.io/etcd/client/v3"
472 | 	"go.etcd.io/etcd/client/v3/concurrency"
473 | )
474 | 
475 | func main() {
476 | 	cli, err := clientv3.New(clientv3.Config{
477 | 		Endpoints:   []string{"localhost:2379"},
478 | 		DialTimeout: 5 * time.Second,
479 | 	})
480 | 	if err != nil {
481 | 		log.Fatal(err)
482 | 	}
483 | 	defer cli.Close()
484 | 	ctx := context.Background()
485 | 	// m1来抢锁
486 | 	go func() {
487 | 		s1, err := concurrency.NewSession(cli)
488 | 		if err != nil {
489 | 			log.Fatal(err)
490 | 		}
491 | 		defer s1.Close()
492 | 		m1 := concurrency.NewMutex(s1, "/my-lock/")
493 | 
494 | 		// acquire lock for s1
495 | 		if err := m1.Lock(ctx); err != nil {
496 | 			log.Fatal(err)
497 | 		}
498 | 		fmt.Println("m1---获得了锁")
499 | 
500 | 		time.Sleep(time.Second * 3)
501 | 
502 | 		// 释放锁
503 | 		if err := m1.Unlock(ctx); err != nil {
504 | 			log.Fatal(err)
505 | 		}
506 | 		fmt.Println("m1++释放了锁")
507 | 	}()
508 | 
509 | 	// m2来抢锁
510 | 	go func() {
511 | 		s2, err := concurrency.NewSession(cli)
512 | 		if err != nil {
513 | 			log.Fatal(err)
514 | 		}
515 | 		defer s2.Close()
516 | 		m2 := concurrency.NewMutex(s2, "/my-lock/")
517 | 		if err := m2.Lock(ctx); err != nil {
518 | 			log.Fatal(err)
519 | 		}
520 | 		fmt.Println("m2---获得了锁")
521 | 
522 | 		// mock业务执行的时间
523 | 		time.Sleep(time.Second * 3)
524 | 
525 | 		// 释放锁
526 | 		if err := m2.Unlock(ctx); err != nil {
527 | 			log.Fatal(err)
528 | 		}
529 | 
530 | 		fmt.Println("m2++释放了锁")
531 | 	}()
532 | 
533 | 	time.Sleep(time.Second * 10)
534 | }
535 | 
536 | ```
537 | 
538 | 打印下输出
539 | 
540 | ```
541 | m2---获得了锁
542 | m2++释放了锁
543 | m1---获得了锁
544 | m1++释放了锁
545 | ```
546 | 
547 | #### 分布式队列
548 | 
549 | 即创建一个先进先出的队列保持顺序。另一种比较有意思的实现是在保证队列达到某个条件时再统一按顺序执行。这种方法的实现可以在`/queue`这个目录中另外建立一个`/queue/condition`节点。  
550 | 
551 | - condition 可以表示队列大小。比如一个大的任务需要很多小任务就绪的情况下才能执行，每次有一个小任务就绪，就给这个 condition 数字加 1，直到达到大任务规定的数字，再开始执行队列里的一系列小任务，最终执行大任务。  
552 | 
553 | - condition 可以表示某个任务在不在队列。这个任务可以是所有排序任务的首个执行程序，也可以是拓扑结构中没有依赖的点。通常，必须执行这些任务后才能执行队列中的其他任务。  
554 | 
555 | - condition 还可以表示其它的一类开始执行任务的通知。可以由控制程序指定，当 condition 出现变化时，开始执行队列任务。  
556 | 
557 | 来看下实现  
558 | 
559 | 入队  
560 | 
561 | ```go
562 | func newUniqueKV(kv v3.KV, prefix string, val string) (*RemoteKV, error) {
563 | 	for {
564 | 		newKey := fmt.Sprintf("%s/%v", prefix, time.Now().UnixNano())
565 | 		// 创建对应的key
566 | 		rev, err := putNewKV(kv, newKey, val, v3.NoLease)
567 | 		if err == nil {
568 | 			return &RemoteKV{kv, newKey, rev, val}, nil
569 | 		}
570 | 		// 如果之前已经创建了，就返回
571 | 		if err != ErrKeyExists {
572 | 			return nil, err
573 | 		}
574 | 	}
575 | }
576 | 
577 | // 只有在没有创建的时候才能创建成功
578 | func putNewKV(kv v3.KV, key, val string, leaseID v3.LeaseID) (int64, error) {
579 | 	cmp := v3.Compare(v3.Version(key), "=", 0)
580 | 	req := v3.OpPut(key, val, v3.WithLease(leaseID))
581 | 	// 这里还用到了这种比较的逻辑
582 | 	txnresp, err := kv.Txn(context.TODO()).If(cmp).Then(req).Commit()
583 | 	if err != nil {
584 | 		return 0, err
585 | 	}
586 | 	// 已经存在则返回错误
587 | 	if !txnresp.Succeeded {
588 | 		return 0, ErrKeyExists
589 | 	}
590 | 	return txnresp.Header.Revision, nil
591 | }
592 | ```
593 | 
594 | 出队  
595 | 
596 | ```go
597 | // Dequeue处理的是一个先进先出的队列
598 | // 如果队列为空，Dequeue将会阻塞直到里面有值塞入
599 | func (q *Queue) Dequeue() (string, error) {
600 | 	resp, err := q.client.Get(q.ctx, q.keyPrefix, v3.WithFirstRev()...)
601 | 	if err != nil {
602 | 		return "", err
603 | 	}
604 | 
605 | 	kv, err := claimFirstKey(q.client, resp.Kvs)
606 | 	if err != nil {
607 | 		return "", err
608 | 	} else if kv != nil {
609 | 		return string(kv.Value), nil
610 | 		// more 表示在请求的范围内是否还有更多的键要返回。
611 | 		// 则进行Dequeue重试
612 | 	} else if resp.More {
613 | 		// missed some items, retry to read in more
614 | 		return q.Dequeue()
615 | 	}
616 | 
617 | 	// nothing yet; wait on elements
618 | 	ev, err := WaitPrefixEvents(
619 | 		q.client,
620 | 		q.keyPrefix,
621 | 		resp.Header.Revision,
622 | 		[]mvccpb.Event_EventType{mvccpb.PUT})
623 | 	if err != nil {
624 | 		return "", err
625 | 	}
626 | 
627 | 	ok, err := deleteRevKey(q.client, string(ev.Kv.Key), ev.Kv.ModRevision)
628 | 	if err != nil {
629 | 		return "", err
630 | 	} else if !ok {
631 | 		// 如果删除失败，重试
632 | 		return q.Dequeue()
633 | 	}
634 | 	return string(ev.Kv.Value), err
635 | }
636 | ```
637 | 
638 | 总结  
639 | 
640 | 1、这里的入队是一个先进先出的队列；  
641 | 
642 | 2、出队的实现也很简单，如果队列为空，Dequeue将会阻塞直到里面有值塞入；  
643 | 
644 | 来个demo  
645 | 
646 | ```go
647 | package main
648 | 
649 | import (
650 | 	"fmt"
651 | 	"log"
652 | 	"time"
653 | 
654 | 	clientv3 "go.etcd.io/etcd/client/v3"
655 | 	recipe "go.etcd.io/etcd/client/v3/experimental/recipes"
656 | )
657 | 
658 | // main starts two producers that enqueue 95 items in total and drains them with one consumer.
659 | func main() {
660 | 	cli, err := clientv3.New(clientv3.Config{
661 | 		Endpoints: []string{"localhost:2379"},
662 | 	})
663 | 	if err != nil {
664 | 		log.Fatalf("error New (%v)", err)
665 | 	}
666 | 	defer cli.Close()
667 | 
668 | 	go func() {
669 | 		q := recipe.NewQueue(cli, "testq")
670 | 		for i := 0; i < 5; i++ {
671 | 			if err := q.Enqueue(fmt.Sprintf("%d", i)); err != nil {
672 | 				log.Fatalf("error enqueuing (%v)", err)
673 | 			}
674 | 		}
675 | 	}()
676 | 
677 | 	go func() {
678 | 		q := recipe.NewQueue(cli, "testq")
679 | 		for i := 10; i < 100; i++ {
680 | 			if err := q.Enqueue(fmt.Sprintf("%d", i)); err != nil {
681 | 				log.Fatalf("error enqueuing (%v)", err)
682 | 			}
683 | 		}
684 | 	}()
685 | 
686 | 	q := recipe.NewQueue(cli, "testq")
687 | 	for i := 0; i < 95; i++ { // only 5 + 90 items are enqueued; Dequeue blocks on an empty queue, so 100 would hang
688 | 		s, err := q.Dequeue()
689 | 		if err != nil {
690 | 			log.Fatalf("error dequeueing (%v)", err)
691 | 		}
692 | 		fmt.Println(s)
693 | 	}
694 | 	time.Sleep(time.Second * 3)
695 | }
695 | ```
696 | 
697 | #### 集群监控与Leader竞选
698 | 
699 | 通过etcd来进行监控实现起来非常简单并且实时性强   
700 | 
701 | 1、前面几个场景已经提到Watcher机制，当某个节点消失或有变动时，Watcher会第一时间发现并告知用户。  
702 | 
703 | 2、节点可以设置`TTL key`，比如每隔 30s 发送一次心跳使代表该机器存活的节点继续存在，否则节点消失。  
704 | 
705 | 这样就可以第一时间检测到各节点的健康状态，以完成集群的监控要求  
706 | 
707 | 另外，使用分布式锁，可以完成Leader竞选。这种场景通常是一些长时间CPU计算或者使用IO操作的机器，只需要竞选出的Leader计算或处理一次，就可以把结果复制给其他的Follower。从而避免重复劳动，节省计算资源。  
708 | 
709 | 这个的经典场景是`搜索系统中建立全量索引`。如果每个机器都进行一遍索引的建立，不但耗时而且建立索引的一致性不能保证。各机器通过etcd的CAS机制同时尝试创建同一个节点，创建成功的机器作为Leader，进行索引计算，然后把计算结果分发到其它节点。  
710 | 
711 | <img src="/img/etcd-leader.png" alt="etcd" align=center/>
712 | 
713 | ### 参考
714 | 
715 | 【一文入门ETCD】https://juejin.cn/post/6844904031186321416   
716 | 【etcd：从应用场景到实现原理的全方位解读】https://www.infoq.cn/article/etcd-interpretation-application-scenario-implement-principle   
717 | 【Etcd 架构与实现解析】http://jolestar.com/etcd-architecture/   
718 | 【linux单节点和集群的etcd】https://www.jianshu.com/p/07ca88b6ff67   
719 | 【软负载均衡与硬负载均衡、4层与7层负载均衡】https://cloud.tencent.com/developer/article/1446391   
720 | 【Etcd Lock详解】https://tangxusc.github.io/blog/2019/05/etcd-lock%E8%AF%A6%E8%A7%A3/   
721 | 【etcd基础与使用】https://zhuyasen.com/post/etcd.html   
722 | 【ETCD核心机制解析】https://www.cnblogs.com/FG123/p/13632095.html      
723 | 【etcd watch机制】http://liangjf.top/2019/12/31/110.etcd-watch%E6%9C%BA%E5%88%B6%E5%88%86%E6%9E%90/   
724 | 【ETCD 源码学习--Watch(server)】https://www.codeleading.com/article/15455457381/   
725 | 【etcdV3—watcher服务端源码解析】https://blog.csdn.net/stayfoolish_yj/article/details/104497233    


--------------------------------------------------------------------------------
/10-etcd选型对比.md:
--------------------------------------------------------------------------------
  1 | <!-- START doctoc generated TOC please keep comment here to allow auto update -->
  2 | <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
  3 | 
  4 | - [etcd选型对比](#etcd%E9%80%89%E5%9E%8B%E5%AF%B9%E6%AF%94)
  5 |   - [前言](#%E5%89%8D%E8%A8%80)
  6 |   - [基本架构和原理](#%E5%9F%BA%E6%9C%AC%E6%9E%B6%E6%9E%84%E5%92%8C%E5%8E%9F%E7%90%86)
  7 |     - [etcd](#etcd)
  8 |     - [Consul](#consul)
  9 |     - [ZooKeeper](#zookeeper)
 10 |   - [选型对比](#%E9%80%89%E5%9E%8B%E5%AF%B9%E6%AF%94)
 11 |   - [总结](#%E6%80%BB%E7%BB%93)
 12 |   - [参考](#%E5%8F%82%E8%80%83)
 13 | 
 14 | <!-- END doctoc generated TOC please keep comment here to allow auto update -->
 15 | 
 16 | ## etcd选型对比
 17 | 
 18 | ### 前言
 19 | 
 20 | 对比 Consul, ZooKeeper。选型etcd有哪些好处呢？  
 21 | 
 22 | ### 基本架构和原理
 23 | 
 24 | #### etcd
 25 | 
 26 | ETCD是一个分布式、可靠的key-value存储的分布式系统，用于存储分布式系统中的关键数据；当然，它不仅仅用于存储，还提供配置共享及服务发现；基于Go语言实现 。  
 27 | 
 28 | etcd的特点  
 29 | 
 30 | - 完全复制：集群中的每个节点都可以使用完整的存档  
 31 | 
 32 | - 高可用性：Etcd可用于避免硬件的单点故障或网络问题  
 33 | 
 34 | - 一致性：每次读取都会返回跨多主机的最新写入  
 35 | 
 36 | - 简单：包括一个定义良好、面向用户的API（gRPC）   
 37 | 
 38 | - 安全：实现了带有可选的客户端证书身份验证的自动化TLS  
 39 | 
 40 | - 可靠：使用Raft算法实现了强一致、高可用的服务存储目录  
 41 | 
 42 | etcd 是基于 raft 算法实现的，具体的实现可参见[etcd实现raft源码解读](https://www.cnblogs.com/ricklz/p/15155095.html)
 43 | 
 44 | #### Consul
 45 | 
 46 | 先放一张 consul 的架构图  
 47 | 
 48 | <img src="/img/etcd-consul.webp" alt="etcd" align=center/>
 49 | 
 50 | consul 使用的是 Gossip 协议  
 51 | 
 52 | Gossip 中文名称叫流言协议，它是一种消息传播协议。它的核心思想其实源自我们生活中的八卦、闲聊。我们在日常生活中所看到的劲爆消息其实源于两类，一类是权威机构如国家新闻媒体发布的消息，另一类则是大家通过微信等社交聊天软件相互八卦，一传十，十传百的结果。   
 53 | 
 54 | Gossip 协议的基本工作原理与我们八卦类似，在 Gossip 协议中，如下图所示，各个节点会周期性地选择一定数量节点，然后将消息同步给这些节点。收到消息后的节点同样做出类似的动作，随机的选择节点，继续扩散给其他节点。  
 55 | 
 56 | Gossip 协议的基本工作原理与我们八卦类似，在 Gossip 协议中，如下图所示，各个节点会周期性地选择一定数量节点，然后将消息同步给这些节点。收到消息后的节点同样做出类似的动作，随机的选择节点，继续扩散给其他节点。  
 57 | 
 58 | 最终经过一定次数的扩散、传播，整个集群的各个节点都能感知到此消息，各个节点的数据趋于一致。Gossip 协议被广泛应用在多个知名项目中，比如 Redis Cluster 集群版，Apache Cassandra，AWS Dynamo。  
 59 | 
 60 | Consul 天然支持多数据中心，但是多数据中心内的服务数据并不会跨数据中心同步，各个数据中心的 Server 集群是独立的,Consul 提供了 Prepared Query 功能，它支持根据一定的策略返回多数据中心下的最佳的服务实例地址，使你的服务具备跨数据中心容灾。   
 61 | 
 62 | 这里来看下 Prepared Query 查询的过程：  
 63 | 
 64 | 比如当你的 API 网关收到用户请求查询 A 服务，API 网关服务优先从缓存中查找 A 服务对应的最佳实例。若无缓存则向 Consul 发起一个 Prepared Query 请求查询 A 服务实例，Consul 收到请求后，优先返回本数据中心下的服务实例。如果本数据中心没有或异常则根据数据中心间 RTT 由近到远查询其它数据中心数据，最终网关可将用户请求转发给最佳的数据中心下的实例地址。  
 65 | 
 66 | Consul 支持以下三种模式的读请求：  
 67 | 
 68 | - 默认（default）。默认是此模式，绝大部分场景下它能保证数据的强一致性。但在老的 Leader 出现网络分区被隔离、新的 Leader 被选举出来的一个极小时间窗口内，可能会导致 stale read。这是因为 Consul 为了提高读性能，使用的是基于 Lease 机制来维持 Leader 身份，避免了与其他节点进行交互确认的开销。  
 69 | 
 70 | - 强一致性（consistent）。强一致性读与 etcd 默认线性读模式一样，每次请求需要集群多数节点确认 Leader 身份，因此相比 default 模式读，性能会有所下降。  
 71 | 
 72 | - 弱一致性（stale)。任何节点都可以读，无论它是否 Leader。可能读取到陈旧的数据，类似 etcd 的串行读。这种读模式不要求集群有 Leader，因此当集群不可用时，只要有节点存活，它依然可以响应读请求。  
 73 | 
 74 | #### ZooKeeper
 75 | 
 76 | ZooKeeper 是一个典型的分布式数据一致性解决方案，分布式应用程序可以基于 ZooKeeper 实现诸如数据发布/订阅、负载均衡、命名服务、分布式协调/通知、集群管理、Master 选举、分布式锁和分布式队列等功能。  
 77 | 
 78 | ZooKeeper 的优点：  
 79 | 
 80 | - 顺序一致性： 从同一客户端发起的事务请求，最终将会严格地按照顺序被应用到 ZooKeeper 中去。
 81 | 
 82 | - 原子性： 所有事务请求的处理结果在整个集群中所有机器上的应用情况是一致的，也就是说，要么整个集群中所有的机器都成功应用了某一个事务，要么都没有应用。
 83 | 
 84 | - 单一系统映像 ： 无论客户端连到哪一个 ZooKeeper 服务器上，其看到的服务端数据模型都是一致的。
 85 | 
 86 | - 可靠性： 一旦一次更改请求被应用，更改的结果就会被持久化，直到被下一次更改覆盖。
 87 | 
 88 | 再来看下 ZooKeeper 的架构图，图片摘自[etcd实战课](https://time.geekbang.org/column/article/351898)  
 89 | 
 90 | <img src="/img/zookeeper.webp" alt="etcd" align=center/>  
 91 | 
 92 | ZooKeeper 集群中的所有机器通过一个 Leader 选举过程来选定一台称为 “Leader” 的机器，Leader 既可以为客户端提供写服务又能提供读服务。  
 93 | 
 94 | 除了 Leader 外，Follower 和 Observer 都只能提供读服务。Follower 和 Observer 唯一的区别在于 Observer 机器不参与 Leader 的选举过程，也不参与写操作的“过半写成功”策略，因此 Observer 机器可以在不影响写性能的情况下提升集群的读性能。    
 95 | 
 96 | ZooKeeper 使用的是 Zab 协议   
 97 | 
 98 | ZAB（ZooKeeper Atomic Broadcast 原子广播） 协议是为分布式协调服务 ZooKeeper 专门设计的一种支持崩溃恢复的原子广播协议。 在 ZooKeeper 中，主要依赖 ZAB 协议来实现分布式数据一致性，基于该协议，ZooKeeper 实现了一种主备模式的系统架构来保持集群中各个副本之间的数据一致性。   
 99 | 
100 | Zab 协议可以分为以下阶段：  
101 | 
102 | - Phase 0，Leader 选举（Leader Election)。一个节点只要求获得半数以上投票，就可以当选为准 Leader；  
103 | 
104 | - Phase 1，发现（Discovery）。准 Leader 收集其他节点的数据信息，并将最新的数据复制到自身；  
105 | 
106 | - Phase 2，同步（Synchronization）。准 Leader 将自身最新数据复制给其他落后的节点，并告知其他节点自己正式当选为 Leader；  
107 | 
108 | - Phase 3，广播（Broadcast）。Leader 正式对外服务，处理客户端写请求，对消息进行广播。当收到一个写请求后，它会生成 Proposal 广播给各个 Follower 节点，一半以上 Follower 节点应答之后，Leader 再发送 Commit 命令给各个 Follower，告知它们提交相关提案；  
109 | 
110 | 关于 ZAB 中的两种模式：崩溃恢复和消息广播  
111 | 
112 | **崩溃恢复**
113 | 
114 | 当整个服务框架在启动过程中，或是当 Leader 服务器出现网络中断、崩溃退出与重启等异常情况时，ZAB 协议就会进人恢复模式并选举产生新的Leader服务器。  
115 | 
116 | 当选出 leader ,并且完成了上面 Phase 2 的同步过程，就退出崩溃恢复模式   
117 | 
118 | **消息广播**
119 | 
120 | 当准 Leader 将自身最新数据复制给其他落后的节点，并告知其他节点自己正式当选为 Leader。这时候就可以进入广播模式，当有客户端进行数据写入操作的时候，就可以通过广播模式通知所有的 follower 了。   
121 | 
122 | 当集群中已经有过半的Follower服务器完成了和Leader服务器的状态同步，那么整个服务框架就可以进人消息广播模式了。  
123 | 
124 | ### 选型对比
125 | 
126 | - 1、并发原语：etcd 和 ZooKeeper 并未提供原生的分布式锁、Leader 选举支持，只提供了核心的基本数据读写、并发控制 API，由应用上层去封装，consul 就简单多了，提供了原生的支持，通过简单的命令就能使用；  
127 | 
128 | - 2、服务发现：etcd 和 ZooKeeper 并未提供原生的服务发现支持，Consul 在服务发现方面做了很多解放用户双手的工作，提供了服务发现的框架，帮助你的业务快速接入，并提供了 HTTP 和 DNS 两种获取服务方式；  
129 | 
130 | - 3、健康检查：consul 的健康检查机制，是一种基于 client、Gossip 协议、分布式的健康检查机制，具备低延时、可扩展的特点。业务可通过 Consul 的健康检查机制，实现 HTTP 接口返回码、内存乃至磁盘空间的检测；相比之下，etcd、ZooKeeper 提供的健康检查机制和能力就非常有限了；  
131 | 
132 | etcd 提供了 Lease 机制来实现活性检测。它是一种中心化的健康检查，依赖用户不断地发送心跳续租、更新 TTL  
133 | 
134 | ZooKeeper 使用的是一种名为临时节点的状态来实现健康检查。当 client 与 ZooKeeper 节点连接断掉时，ZooKeeper 就会删除此临时节点的 key-value 数据。它比基于心跳机制更复杂，也给 client 带去了更多的复杂性，所有 client 必须维持与 ZooKeeper server 的活跃连接并保持存活。  
135 | 
136 | - 4、watch 特性：相比于 etcd , Consul 存储引擎是基于Radix Tree实现的，因此它不支持范围查询和监听，只支持前缀查询和监听，而 etcd 都支持, ZooKeeper 的 Watch 特性有更多的局限性，它是个一次性触发器;  
137 | 
138 | - 5、线性读。etcd 和 Consul 都支持线性读，而 ZooKeeper 并不具备。  
139 | 
140 | - 6、权限机制比较。etcd 实现了 RBAC 的权限校验，而 ZooKeeper 和 Consul 实现的是 ACL。  
141 | 
142 | - 7、事务比较。etcd 和 Consul 都提供了简易的事务能力，支持对字段进行比较，而 ZooKeeper 只提供了版本号检查能力，功能较弱。  
143 | 
144 | - 8、多数据中心。在多数据中心支持上，只有 Consul 是天然支持的，虽然它本身不支持数据自动跨数据中心同步，但是它提供的服务发现机制、Prepared Query功能，赋予了业务在一个可用区后端实例故障时，可将请求转发到最近的数据中心实例。而 etcd 和 ZooKeeper 并不支持。  
145 | 
146 | ### 总结
147 | 
148 | 总的看下来，consul 提供了原生的分布式锁、健康检查、服务发现机制支持，让业务可以更省心，同时也对多数据中心进行了支持；  
149 | 
150 | 当然 etcd 和 ZooKeeper 也都有相应的库，也能很好的进行支持，但是这两者不支持多数据中心；  
151 | 
152 | ZooKeeper 在 Java 业务中选型使用的较多，etcd 因为是 go 语言开发的，所以如果本身就是 go 的技术栈，使用这个也是个不错的选择，Consul 在国外应用比较多，中文文档及实践案例相比 etcd 较少；   
153 | 
154 | ### 参考 
155 | 
156 | 【服务发现框架选型: Consul、Zookeeper还是etcd ？】https://www.cnblogs.com/sunsky303/p/11127324.html    
157 | 【23 | 选型：etcd/ZooKeeper/Consul等我们该如何选择？】https://time.geekbang.org/column/article/351898  
158 | 【服务发现比较】https://developer.aliyun.com/article/759139  
159 | 【ZooKeeper讲解】https://juejin.cn/post/6844903677367418893  
160 | 【ETCD对比Consul和zooKeeper如何选型】https://boilingfrog.github.io/2021/09/16/etcd%E5%AF%B9%E6%AF%94consul%E5%92%8CzooKeeper/  
161 | 


--------------------------------------------------------------------------------
/3-grpc通过etcd实现服务发现.md:
--------------------------------------------------------------------------------
  1 | <!-- START doctoc generated TOC please keep comment here to allow auto update -->
  2 | <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
  3 | 
  4 | - [grpc通过etcd实现服务发现](#grpc%E9%80%9A%E8%BF%87etcd%E5%AE%9E%E7%8E%B0%E6%9C%8D%E5%8A%A1%E5%8F%91%E7%8E%B0)
  5 |   - [前言](#%E5%89%8D%E8%A8%80)
  6 |   - [服务注册](#%E6%9C%8D%E5%8A%A1%E6%B3%A8%E5%86%8C)
  7 |   - [服务发现](#%E6%9C%8D%E5%8A%A1%E5%8F%91%E7%8E%B0)
  8 |   - [负载均衡](#%E8%B4%9F%E8%BD%BD%E5%9D%87%E8%A1%A1)
  9 |     - [集中式LB（Proxy Model）](#%E9%9B%86%E4%B8%AD%E5%BC%8Flbproxy-model)
 10 |     - [进程内LB（Balancing-aware Client）](#%E8%BF%9B%E7%A8%8B%E5%86%85lbbalancing-aware-client)
 11 |     - [独立 LB 进程（External Load Balancing Service）](#%E7%8B%AC%E7%AB%8B-lb-%E8%BF%9B%E7%A8%8Bexternal-load-balancing-service)
 12 |   - [参考](#%E5%8F%82%E8%80%83)
 13 | 
 14 | <!-- END doctoc generated TOC please keep comment here to allow auto update -->
 15 | 
 16 | ## grpc通过etcd实现服务发现
 17 | 
 18 | ### 前言
 19 | 
 20 | 项目中使用etcd实现了grpc的服务注册和服务发现，这里来看下如何实现的服务注册和服务发现  
 21 | 
 22 | 先来看下使用的demo，demo中的代码[discovery](https://github.com/boilingfrog/etcd-learning/tree/main/discovery)
 23 | 
 24 | ### 服务注册  
 25 | 
 26 | ```go
 27 | package discovery
 28 | 
 29 | import (
 30 | 	"context"
 31 | 	"encoding/json"
 32 | 	"errors"
 33 | 	"net/http"
 34 | 	"strconv"
 35 | 	"strings"
 36 | 	"time"
 37 | 
 38 | 	clientv3 "go.etcd.io/etcd/client/v3"
 39 | 	"go.uber.org/zap"
 40 | )
 41 | 
 42 | // Register for grpc server
 43 | type Register struct {
 44 | 	EtcdAddrs   []string
 45 | 	DialTimeout int
 46 | 
 47 | 	closeCh     chan struct{}
 48 | 	leasesID    clientv3.LeaseID
 49 | 	keepAliveCh <-chan *clientv3.LeaseKeepAliveResponse
 50 | 
 51 | 	srvInfo Server
 52 | 	srvTTL  int64
 53 | 	cli     *clientv3.Client
 54 | 	logger  *zap.Logger
 55 | }
 56 | 
 57 | // NewRegister create a register base on etcd
 58 | func NewRegister(etcdAddrs []string, logger *zap.Logger) *Register {
 59 | 	return &Register{
 60 | 		EtcdAddrs:   etcdAddrs,
 61 | 		DialTimeout: 3,
 62 | 		logger:      logger,
 63 | 	}
 64 | }
 65 | 
 66 | // Register a service
 67 | func (r *Register) Register(srvInfo Server, ttl int64) (chan<- struct{}, error) {
 68 | 	var err error
 69 | 
 70 | 	if strings.Split(srvInfo.Addr, ":")[0] == "" {
 71 | 		return nil, errors.New("invalid ip")
 72 | 	}
 73 | 
 74 | 	if r.cli, err = clientv3.New(clientv3.Config{
 75 | 		Endpoints:   r.EtcdAddrs,
 76 | 		DialTimeout: time.Duration(r.DialTimeout) * time.Second,
 77 | 	}); err != nil {
 78 | 		return nil, err
 79 | 	}
 80 | 
 81 | 	r.srvInfo = srvInfo
 82 | 	r.srvTTL = ttl
 83 | 
 84 | 	if err = r.register(); err != nil {
 85 | 		return nil, err
 86 | 	}
 87 | 
 88 | 	r.closeCh = make(chan struct{})
 89 | 
 90 | 	go r.keepAlive()
 91 | 
 92 | 	return r.closeCh, nil
 93 | }
 94 | 
 95 | // Stop stop register
 96 | func (r *Register) Stop() {
 97 | 	r.closeCh <- struct{}{}
 98 | }
 99 | 
100 | // register 注册节点
101 | func (r *Register) register() error {
102 | 	leaseCtx, cancel := context.WithTimeout(context.Background(), time.Duration(r.DialTimeout)*time.Second)
103 | 	defer cancel()
104 | 
105 | 	leaseResp, err := r.cli.Grant(leaseCtx, r.srvTTL)
106 | 	if err != nil {
107 | 		return err
108 | 	}
109 | 	r.leasesID = leaseResp.ID
110 | 	if r.keepAliveCh, err = r.cli.KeepAlive(context.Background(), leaseResp.ID); err != nil {
111 | 		return err
112 | 	}
113 | 
114 | 	data, err := json.Marshal(r.srvInfo)
115 | 	if err != nil {
116 | 		return err
117 | 	}
118 | 	_, err = r.cli.Put(context.Background(), BuildRegPath(r.srvInfo), string(data), clientv3.WithLease(r.leasesID))
119 | 	return err
120 | }
121 | 
122 | // unregister 删除节点
123 | func (r *Register) unregister() error {
124 | 	_, err := r.cli.Delete(context.Background(), BuildRegPath(r.srvInfo))
125 | 	return err
126 | }
127 | 
128 | // keepAlive
129 | func (r *Register) keepAlive() {
130 | 	ticker := time.NewTicker(time.Duration(r.srvTTL) * time.Second)
131 | 	for {
132 | 		select {
133 | 		case <-r.closeCh:
134 | 			if err := r.unregister(); err != nil {
135 | 				r.logger.Error("unregister failed", zap.Error(err))
136 | 			}
137 | 			if _, err := r.cli.Revoke(context.Background(), r.leasesID); err != nil {
138 | 				r.logger.Error("revoke failed", zap.Error(err))
139 | 			}
140 | 			return
141 | 		case res := <-r.keepAliveCh:
142 | 			if res == nil {
143 | 				if err := r.register(); err != nil {
144 | 					r.logger.Error("register failed", zap.Error(err))
145 | 				}
146 | 			}
147 | 		case <-ticker.C:
148 | 			if r.keepAliveCh == nil {
149 | 				if err := r.register(); err != nil {
150 | 					r.logger.Error("register failed", zap.Error(err))
151 | 				}
152 | 			}
153 | 		}
154 | 	}
155 | }
156 | 
157 | // UpdateHandler return http handler
158 | func (r *Register) UpdateHandler() http.HandlerFunc {
159 | 	return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
160 | 		wi := req.URL.Query().Get("weight")
161 | 		weight, err := strconv.Atoi(wi)
162 | 		if err != nil {
163 | 			w.WriteHeader(http.StatusBadRequest)
164 | 			w.Write([]byte(err.Error()))
165 | 			return
166 | 		}
167 | 
168 | 		var update = func() error {
169 | 			r.srvInfo.Weight = int64(weight)
170 | 			data, err := json.Marshal(r.srvInfo)
171 | 			if err != nil {
172 | 				return err
173 | 			}
174 | 			_, err = r.cli.Put(context.Background(), BuildRegPath(r.srvInfo), string(data), clientv3.WithLease(r.leasesID))
175 | 			return err
176 | 		}
177 | 
178 | 		if err := update(); err != nil {
179 | 			w.WriteHeader(http.StatusInternalServerError)
180 | 			w.Write([]byte(err.Error()))
181 | 			return
182 | 		}
183 | 		w.Write([]byte("update server weight success"))
184 | 	})
185 | }
186 | 
187 | func (r *Register) GetServerInfo() (Server, error) {
188 | 	resp, err := r.cli.Get(context.Background(), BuildRegPath(r.srvInfo))
189 | 	if err != nil {
190 | 		return r.srvInfo, err
191 | 	}
192 | 	info := Server{}
193 | 	if resp.Count >= 1 {
194 | 		if err := json.Unmarshal(resp.Kvs[0].Value, &info); err != nil {
195 | 			return info, err
196 | 		}
197 | 	}
198 | 	return info, nil
199 | }
200 | ```
201 | 
202 | 来分析下上面的代码实现  
203 | 
204 | 当启动一个grpc服务的时候，我们将其注册到etcd中
205 | 
206 | ```go
207 | 	etcdRegister := discovery.NewRegister(config.Etcd.Addrs, log.Logger)
208 | 	node := discovery.Server{
209 | 		Name: app,
210 | 		Addr: utils.InternalIP() + config.Port.GRPC,
211 | 	}
212 | 
213 | 	if _, err := etcdRegister.Register(node, 10); err != nil {
214 | 		panic(fmt.Sprintf("server register failed: %v", err))
215 | 	}
216 | ```
217 | 
218 | 调用服务注册的时候首先分配了一个租约  
219 | 
220 | ```go
221 | func (l *lessor) Grant(ctx context.Context, ttl int64) (*LeaseGrantResponse, error) {
222 | 	r := &pb.LeaseGrantRequest{TTL: ttl}
223 | 	resp, err := l.remote.LeaseGrant(ctx, r, l.callOpts...)
224 | 	if err == nil {
225 | 		gresp := &LeaseGrantResponse{
226 | 			ResponseHeader: resp.GetHeader(),
227 | 			ID:             LeaseID(resp.ID),
228 | 			TTL:            resp.TTL,
229 | 			Error:          resp.Error,
230 | 		}
231 | 		return gresp, nil
232 | 	}
233 | 	return nil, toErr(ctx, err)
234 | }
235 | ```
236 | 
237 | 然后通过KeepAlive保活   
238 | 
239 | ```go
240 | // KeepAlive尝试保持给定的租约永久alive
241 | func (l *lessor) KeepAlive(ctx context.Context, id LeaseID) (<-chan *LeaseKeepAliveResponse, error) {
242 | 	ch := make(chan *LeaseKeepAliveResponse, LeaseResponseChSize)
243 | 
244 | 	l.mu.Lock()
245 | 	// ensure that recvKeepAliveLoop is still running
246 | 	select {
247 | 	case <-l.donec:
248 | 		err := l.loopErr
249 | 		l.mu.Unlock()
250 | 		close(ch)
251 | 		return ch, ErrKeepAliveHalted{Reason: err}
252 | 	default:
253 | 	}
254 | 	ka, ok := l.keepAlives[id]
255 | 	if !ok {
256 | 		// create fresh keep alive
257 | 		ka = &keepAlive{
258 | 			chs:           []chan<- *LeaseKeepAliveResponse{ch},
259 | 			ctxs:          []context.Context{ctx},
260 | 			deadline:      time.Now().Add(l.firstKeepAliveTimeout),
261 | 			nextKeepAlive: time.Now(),
262 | 			donec:         make(chan struct{}),
263 | 		}
264 | 		l.keepAlives[id] = ka
265 | 	} else {
266 | 		// add channel and context to existing keep alive
267 | 		ka.ctxs = append(ka.ctxs, ctx)
268 | 		ka.chs = append(ka.chs, ch)
269 | 	}
270 | 	l.mu.Unlock()
271 | 
272 | 	go l.keepAliveCtxCloser(ctx, id, ka.donec)
273 | 	// 使用once只在第一次调用
274 | 	l.firstKeepAliveOnce.Do(func() {
275 | 		// 500毫秒一次，不断的发送保持活动请求
276 | 		go l.recvKeepAliveLoop()
277 | 		// 删除等待太久没反馈的租约
278 | 		go l.deadlineLoop()
279 | 	})
280 | 
281 | 	return ch, nil
282 | }
283 | 
284 | // deadlineLoop获取在租约TTL中没有收到响应的任何保持活动的通道
285 | func (l *lessor) deadlineLoop() {
286 | 	for {
287 | 		select {
288 | 		case <-time.After(time.Second):
289 | 			// donec 关闭，当 recvKeepAliveLoop 停止时设置 loopErr
290 | 		case <-l.donec:
291 | 			return
292 | 		}
293 | 		now := time.Now()
294 | 		l.mu.Lock()
295 | 		for id, ka := range l.keepAlives {
296 | 			if ka.deadline.Before(now) {
297 | 				// 等待响应太久；租约可能已过期
298 | 				ka.close()
299 | 				delete(l.keepAlives, id)
300 | 			}
301 | 		}
302 | 		l.mu.Unlock()
303 | 	}
304 | }
305 | 
306 | func (l *lessor) recvKeepAliveLoop() (gerr error) {
307 | 	defer func() {
308 | 		l.mu.Lock()
309 | 		close(l.donec)
310 | 		l.loopErr = gerr
311 | 		for _, ka := range l.keepAlives {
312 | 			ka.close()
313 | 		}
314 | 		l.keepAlives = make(map[LeaseID]*keepAlive)
315 | 		l.mu.Unlock()
316 | 	}()
317 | 
318 | 	for {
319 | 		// resetRecv 打开一个新的lease stream并开始发送保持活动请求。
320 | 		stream, err := l.resetRecv()
321 | 		if err != nil {
322 | 			if canceledByCaller(l.stopCtx, err) {
323 | 				return err
324 | 			}
325 | 		} else {
326 | 			for {
327 | 				// 接收lease stream的返回返回
328 | 				resp, err := stream.Recv()
329 | 				if err != nil {
330 | 					if canceledByCaller(l.stopCtx, err) {
331 | 						return err
332 | 					}
333 | 
334 | 					if toErr(l.stopCtx, err) == rpctypes.ErrNoLeader {
335 | 						l.closeRequireLeader()
336 | 					}
337 | 					break
338 | 				}
339 | 				// 根据LeaseKeepAliveResponse更新租约
340 | 				// 如果租约过期删除所有alive channels
341 | 				l.recvKeepAlive(resp)
342 | 			}
343 | 		}
344 | 
345 | 		select {
346 | 		case <-time.After(retryConnWait):
347 | 			continue
348 | 		case <-l.stopCtx.Done():
349 | 			return l.stopCtx.Err()
350 | 		}
351 | 	}
352 | }
353 | 
354 | // resetRecv 打开一个新的lease stream并开始发送保持活动请求。
355 | func (l *lessor) resetRecv() (pb.Lease_LeaseKeepAliveClient, error) {
356 | 	sctx, cancel := context.WithCancel(l.stopCtx)
357 | 	// 建立服务端和客户端连接的lease stream
358 | 	stream, err := l.remote.LeaseKeepAlive(sctx, l.callOpts...)
359 | 	if err != nil {
360 | 		cancel()
361 | 		return nil, err
362 | 	}
363 | 
364 | 	l.mu.Lock()
365 | 	defer l.mu.Unlock()
366 | 	if l.stream != nil && l.streamCancel != nil {
367 | 		l.streamCancel()
368 | 	}
369 | 
370 | 	l.streamCancel = cancel
371 | 	l.stream = stream
372 | 
373 | 	go l.sendKeepAliveLoop(stream)
374 | 	return stream, nil
375 | }
376 | 
377 | // sendKeepAliveLoop 在给定流的生命周期内发送保持活动请求
378 | func (l *lessor) sendKeepAliveLoop(stream pb.Lease_LeaseKeepAliveClient) {
379 | 	for {
380 | 		var tosend []LeaseID
381 | 
382 | 		now := time.Now()
383 | 		l.mu.Lock()
384 | 		for id, ka := range l.keepAlives {
385 | 			if ka.nextKeepAlive.Before(now) {
386 | 				tosend = append(tosend, id)
387 | 			}
388 | 		}
389 | 		l.mu.Unlock()
390 | 
391 | 		for _, id := range tosend {
392 | 			r := &pb.LeaseKeepAliveRequest{ID: int64(id)}
393 | 			if err := stream.Send(r); err != nil {
394 | 				// TODO do something with this error?
395 | 				return
396 | 			}
397 | 		}
398 | 
399 | 		select {
400 | 		// 每500毫秒执行一次
401 | 		case <-time.After(500 * time.Millisecond):
402 | 		case <-stream.Context().Done():
403 | 			return
404 | 		case <-l.donec:
405 | 			return
406 | 		case <-l.stopCtx.Done():
407 | 			return
408 | 		}
409 | 	}
410 | }
411 | 
412 | // 撤销给定的租约，所有附加到租约的key将过期并被删除  
413 | func (l *lessor) Revoke(ctx context.Context, id LeaseID) (*LeaseRevokeResponse, error) {
414 | 	r := &pb.LeaseRevokeRequest{ID: int64(id)}
415 | 	resp, err := l.remote.LeaseRevoke(ctx, r, l.callOpts...)
416 | 	if err == nil {
417 | 		return (*LeaseRevokeResponse)(resp), nil
418 | 	}
419 | 	return nil, toErr(ctx, err)
420 | }
421 | ```
422 | 
423 | 总结：  
424 | 
425 | 1、每次注册一个服务时都会分配一个租约；  
426 | 
427 | 2、KeepAlive通过从客户端到服务器端的流化的`keep alive`请求和从服务器端到客户端的流化的`keep alive`应答来维持租约；  
428 | 
429 | 3、KeepAlive会每500毫秒进行一次lease stream的发送；  
430 | 
431 | 4、然后接收到KeepAlive发送信息回执，处理更新租约，服务处于活动状态；  
432 | 
433 | 5、如果在租约TTL内没有收到保持活动请求的任何响应，则删除租约；  
434 | 
435 | 6、Revoke撤销一个租约，所有附加到租约的key将过期并被删除。  
436 | 
437 | ### 服务发现  
438 | 
439 | 我们只需实现grpc在resolver中提供的Builder和Resolver接口，就能完成gRPC客户端的服务发现和负载均衡  
440 | 
441 | ```go
442 | // 创建一个resolver用于监视名称解析更新
443 | type Builder interface {
444 | 	Build(target Target, cc ClientConn, opts BuildOption) (Resolver, error)
445 | 	Scheme() string
446 | }
447 | ```
448 | 
449 | - Build方法：为给定目标创建一个新的resolver，当调用grpc.Dial()时执行；  
450 | 
451 | - Scheme方法：返回此resolver支持的方案,可参考[Scheme定义](https://github.com/grpc/grpc/blob/master/doc/naming.md)  
452 | 
453 | ```go
454 | // 监视指定目标的更新，包括地址更新和服务配置更新
455 | type Resolver interface {
456 | 	ResolveNow(ResolveNowOption)
457 | 	Close()
458 | }
459 | ```
460 | 
461 | - ResolveNow方法：被 gRPC 调用，以尝试再次解析目标名称。只用于提示，可忽略该方法;  
462 | 
463 | - Close方法：关闭resolver。  
464 | 
465 | 接下来看下具体的实现  
466 | 
467 | ```go
468 | package discovery
469 | 
470 | import (
471 | 	"context"
472 | 	"time"
473 | 
474 | 	"go.uber.org/zap"
475 | 
476 | 	"go.etcd.io/etcd/api/v3/mvccpb"
477 | 	clientv3 "go.etcd.io/etcd/client/v3"
478 | 	"google.golang.org/grpc/resolver"
479 | )
480 | 
481 | const (
482 | 	schema = "etcd"
483 | )
484 | 
485 | // Resolver for grpc client
486 | type Resolver struct {
487 | 	schema      string
488 | 	EtcdAddrs   []string
489 | 	DialTimeout int
490 | 
491 | 	closeCh      chan struct{}
492 | 	watchCh      clientv3.WatchChan
493 | 	cli          *clientv3.Client
494 | 	keyPrifix    string
495 | 	srvAddrsList []resolver.Address
496 | 
497 | 	cc     resolver.ClientConn
498 | 	logger *zap.Logger
499 | }
500 | 
501 | // NewResolver create a new resolver.Builder base on etcd
502 | func NewResolver(etcdAddrs []string, logger *zap.Logger) *Resolver {
503 | 	return &Resolver{
504 | 		schema:      schema,
505 | 		EtcdAddrs:   etcdAddrs,
506 | 		DialTimeout: 3,
507 | 		logger:      logger,
508 | 	}
509 | }
510 | 
511 | // Scheme returns the scheme supported by this resolver.
512 | func (r *Resolver) Scheme() string {
513 | 	return r.schema
514 | }
515 | 
516 | // Build creates a new resolver.Resolver for the given target
517 | func (r *Resolver) Build(target resolver.Target, cc resolver.ClientConn, opts resolver.BuildOptions) (resolver.Resolver, error) {
518 | 	r.cc = cc
519 | 
520 | 	r.keyPrifix = BuildPrefix(Server{Name: target.Endpoint, Version: target.Authority})
521 | 	if _, err := r.start(); err != nil {
522 | 		return nil, err
523 | 	}
524 | 	return r, nil
525 | }
526 | 
527 | // ResolveNow resolver.Resolver interface
528 | func (r *Resolver) ResolveNow(o resolver.ResolveNowOptions) {}
529 | 
530 | // Close resolver.Resolver interface
531 | func (r *Resolver) Close() {
532 | 	r.closeCh <- struct{}{}
533 | }
534 | 
535 | // start
536 | func (r *Resolver) start() (chan<- struct{}, error) {
537 | 	var err error
538 | 	r.cli, err = clientv3.New(clientv3.Config{
539 | 		Endpoints:   r.EtcdAddrs,
540 | 		DialTimeout: time.Duration(r.DialTimeout) * time.Second,
541 | 	})
542 | 	if err != nil {
543 | 		return nil, err
544 | 	}
545 | 	resolver.Register(r)
546 | 
547 | 	r.closeCh = make(chan struct{})
548 | 
549 | 	if err = r.sync(); err != nil {
550 | 		return nil, err
551 | 	}
552 | 
553 | 	go r.watch()
554 | 
555 | 	return r.closeCh, nil
556 | }
557 | 
558 | // watch update events
559 | func (r *Resolver) watch() {
560 | 	ticker := time.NewTicker(time.Minute)
561 | 	r.watchCh = r.cli.Watch(context.Background(), r.keyPrifix, clientv3.WithPrefix())
562 | 
563 | 	for {
564 | 		select {
565 | 		case <-r.closeCh:
566 | 			return
567 | 		case res, ok := <-r.watchCh:
568 | 			if ok {
569 | 				r.update(res.Events)
570 | 			}
571 | 		case <-ticker.C:
572 | 			if err := r.sync(); err != nil {
573 | 				r.logger.Error("sync failed", zap.Error(err))
574 | 			}
575 | 		}
576 | 	}
577 | }
578 | 
579 | // update
580 | func (r *Resolver) update(events []*clientv3.Event) {
581 | 	for _, ev := range events {
582 | 		var info Server
583 | 		var err error
584 | 
585 | 		switch ev.Type {
586 | 		case mvccpb.PUT:
587 | 			info, err = ParseValue(ev.Kv.Value)
588 | 			if err != nil {
589 | 				continue
590 | 			}
591 | 			addr := resolver.Address{Addr: info.Addr, Metadata: info.Weight}
592 | 			if !Exist(r.srvAddrsList, addr) {
593 | 				r.srvAddrsList = append(r.srvAddrsList, addr)
594 | 				r.cc.UpdateState(resolver.State{Addresses: r.srvAddrsList})
595 | 			}
596 | 		case mvccpb.DELETE:
597 | 			info, err = SplitPath(string(ev.Kv.Key))
598 | 			if err != nil {
599 | 				continue
600 | 			}
601 | 			addr := resolver.Address{Addr: info.Addr}
602 | 			if s, ok := Remove(r.srvAddrsList, addr); ok {
603 | 				r.srvAddrsList = s
604 | 				r.cc.UpdateState(resolver.State{Addresses: r.srvAddrsList})
605 | 			}
606 | 		}
607 | 	}
608 | }
609 | 
610 | // sync 同步获取所有地址信息
611 | func (r *Resolver) sync() error {
612 | 	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
613 | 	defer cancel()
614 | 	res, err := r.cli.Get(ctx, r.keyPrifix, clientv3.WithPrefix())
615 | 	if err != nil {
616 | 		return err
617 | 	}
618 | 	r.srvAddrsList = []resolver.Address{}
619 | 
620 | 	for _, v := range res.Kvs {
621 | 		info, err := ParseValue(v.Value)
622 | 		if err != nil {
623 | 			continue
624 | 		}
625 | 		addr := resolver.Address{Addr: info.Addr, Metadata: info.Weight}
626 | 		r.srvAddrsList = append(r.srvAddrsList, addr)
627 | 	}
628 | 	r.cc.UpdateState(resolver.State{Addresses: r.srvAddrsList})
629 | 	return nil
630 | }
631 | ```
632 | 
633 | 总结：  
634 | 
635 | 1、watch会监听前缀的信息变更，有变更的通知，及时更新srvAddrsList的地址信息；  
636 | 
637 | 2、sync会定时的同步etcd中的可用的服务地址到srvAddrsList中；  
638 | 
639 | 3、使用UpdateState更新ClientConn的Addresses；      
640 | 
641 | 4、然后grpc客户端就能根据配置的具体策略发送请求到grpc的server中。  
642 | 
643 | 这里使用gRPC内置的负载均衡策略`round_robin`，根据负载均衡地址，以轮询的方式进行调用服务，来测试下服务的发现和简单的服务负载  
644 | 
645 | ```go
646 | package discovery
647 | 
648 | import (
649 | 	"context"
650 | 	"fmt"
651 | 	"log"
652 | 	"net"
653 | 	"testing"
654 | 	"time"
655 | 
656 | 	"go.uber.org/zap"
657 | 	"google.golang.org/grpc/balancer/roundrobin"
658 | 	"google.golang.org/grpc/resolver"
659 | 
660 | 	"etcd-learning/discovery/helloworld"
661 | 
662 | 	"google.golang.org/grpc"
663 | )
664 | 
665 | var etcdAddrs = []string{"127.0.0.1:2379"}
666 | 
667 | func TestResolver(t *testing.T) {
668 | 	r := NewResolver(etcdAddrs, zap.NewNop())
669 | 	resolver.Register(r)
670 | 
671 | 	// etcd中注册5个服务
672 | 	go newServer(t, ":1001", "1.0.0", 1)
673 | 	go newServer(t, ":1002", "1.0.0", 1)
674 | 	go newServer(t, ":1003", "1.0.0", 1)
675 | 	go newServer(t, ":1004", "1.0.0", 1)
676 | 	go newServer(t, ":1006", "1.0.0", 10)
677 | 
678 | 	conn, err := grpc.Dial("etcd:///hello", grpc.WithInsecure(), grpc.WithBalancerName(roundrobin.Name))
679 | 	if err != nil {
680 | 		t.Fatalf("failed to dial %v", err)
681 | 	}
682 | 	defer conn.Close()
683 | 
684 | 	c := helloworld.NewGreeterClient(conn)
685 | 
686 | 	// 进行十次数据请求
687 | 	for i := 0; i < 10; i++ {
688 | 		resp, err := c.SayHello(context.Background(), &helloworld.HelloRequest{Name: "abc"})
689 | 		if err != nil {
690 | 			t.Fatalf("say hello failed %v", err)
691 | 		}
692 | 		log.Println(resp.Message)
693 | 		time.Sleep(100 * time.Millisecond)
694 | 	}
695 | 
696 | 	time.Sleep(10 * time.Second)
697 | }
698 | 
699 | type server struct {
700 | 	Port string
701 | }
702 | 
703 | // SayHello implements helloworld.GreeterServer
704 | func (s *server) SayHello(ctx context.Context, in *helloworld.HelloRequest) (*helloworld.HelloReply, error) {
705 | 	return &helloworld.HelloReply{Message: fmt.Sprintf("Hello From %s", s.Port)}, nil
706 | }
707 | 
708 | func newServer(t *testing.T, port string, version string, weight int64) {
709 | 	register := NewRegister(etcdAddrs, zap.NewNop())
710 | 	defer register.Stop()
711 | 
712 | 	listen, err := net.Listen("tcp", port)
713 | 	if err != nil {
714 | 		log.Fatalf("failed to listen %v", err)
715 | 	}
716 | 
717 | 	s := grpc.NewServer()
718 | 	helloworld.RegisterGreeterServer(s, &server{Port: port})
719 | 
720 | 	info := Server{
721 | 		Name:    "hello",
722 | 		Addr:    fmt.Sprintf("127.0.0.1%s", port),
723 | 		Version: version,
724 | 		Weight:  weight,
725 | 	}
726 | 
727 | 	register.Register(info, 10)
728 | 
729 | 	if err := s.Serve(listen); err != nil {
730 | 		log.Fatalf("failed to server %v", err)
731 | 	}
732 | }
733 | ```
734 | 
735 | 这里注册了5个服务，端口号分别是1001、1002、1003、1004和1006，循环调用10次   
736 | 
737 | ```go
738 | === RUN   TestResolver
739 | 2021/07/24 22:44:52 Hello From :1001
740 | 2021/07/24 22:44:52 Hello From :1006
741 | 2021/07/24 22:44:53 Hello From :1001
742 | 2021/07/24 22:44:53 Hello From :1002
743 | 2021/07/24 22:44:53 Hello From :1003
744 | 2021/07/24 22:44:53 Hello From :1004
745 | 2021/07/24 22:44:53 Hello From :1006
746 | 2021/07/24 22:44:53 Hello From :1001
747 | 2021/07/24 22:44:53 Hello From :1002
748 | 2021/07/24 22:44:53 Hello From :1003
749 | ```
750 | 
751 | 发现每次的请求会发送到不同的服务中   
752 | 
753 | 
754 | ### 负载均衡
755 | 
756 | #### 集中式LB（Proxy Model）  
757 | 
758 | <img src="/img/grpc_balance_1.png" alt="grpc" align=center/>
759 | 
760 | 在服务消费者和服务提供者之间有一个独立的LB，通常是专门的硬件设备如 F5，或者基于软件如`LVS`，`HAproxy`等实现。LB上有所有服务的地址映射表，通常由运维配置注册，当服务消费方调用某个目标服务时，它向LB发起请求，由LB以某种策略，比如轮询`（Round-Robin）`做负载均衡后将请求转发到目标服务。LB一般具备健康检查能力，能自动摘除不健康的服务实例。  
761 | 
762 | 该方案主要问题：  
763 | 
764 | 1、单点问题，所有服务调用流量都经过LB，当服务数量和调用量大的时候，LB容易成为瓶颈，且一旦LB发生故障影响整个系统；  
765 | 
766 | 2、服务消费方、提供方之间增加了一级，有一定性能开销。  
767 | 
768 | #### 进程内LB（Balancing-aware Client）   
769 | 
770 | <img src="/img/grpc_balance_2.png" alt="grpc" align=center/>
771 | 
772 | 针对第一个方案的不足，此方案将LB的功能集成到服务消费方进程里，也被称为软负载或者客户端负载方案。服务提供方启动时，首先将服务地址注册到服务注册表，同时定期报心跳到服务注册表以表明服务的存活状态，相当于健康检查，服务消费方要访问某个服务时，它通过内置的LB组件向服务注册表查询，同时缓存并定期刷新目标服务地址列表，然后以某种负载均衡策略选择一个目标服务地址，最后向目标服务发起请求。LB和服务发现能力被分散到每一个服务消费者的进程内部，同时服务消费方和服务提供方之间是直接调用，没有额外开销，性能比较好。  
773 | 
774 | 该方案主要问题：  
775 | 
776 | 1、开发成本，该方案将LB功能集成到服务调用方的进程里头，如果有多种不同的语言栈，就要配合开发多种不同的客户端，有一定的研发和维护成本；  
777 | 
778 | 2、另外生产环境中，后续如果要对客户端库进行升级，势必要求服务调用方修改代码并重新发布，升级较复杂。  
779 | 
780 | #### 独立 LB 进程（External Load Balancing Service）  
781 | 
782 | <img src="/img/grpc_balance_3.png" alt="grpc" align=center/>
783 | 
784 | 该方案是针对第二种方案的不足而提出的一种折中方案，原理和第二种方案基本类似。  
785 | 
786 | 不同之处是将LB和服务发现功能从进程内移出来，变成主机上的一个独立进程。主机上的一个或者多个服务要访问目标服务时，他们都通过同一主机上的独立LB进程做服务发现和负载均衡。该方案也是一种分布式方案，没有单点问题，一个LB进程挂了只影响该主机上的服务调用方，服务调用方和LB之间是同一主机上的进程间调用，性能好，同时该方案还简化了服务调用方，不需要为不同语言开发客户端库，LB的升级不需要服务调用方改代码。  
787 | 
788 | 该方案主要问题：部署较复杂，环节多，出错调试排查问题不方便。  
789 | 
790 | 上面通过etcd实现服务发现，使用的就是第二种 进程内LB（Balancing-aware Client）。   
791 | 
792 | ### 参考  
793 | 
794 | 【Load Balancing in gRPC】https://github.com/grpc/grpc/blob/master/doc/load-balancing.md  
795 | 【文中的代码示例】https://github.com/boilingfrog/etcd-learning/tree/main/discovery    


--------------------------------------------------------------------------------
/4-centos7中部署etcd.md:
--------------------------------------------------------------------------------
  1 | <!-- START doctoc generated TOC please keep comment here to allow auto update -->
  2 | <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
  3 | 
  4 | - [etcd的搭建](#etcd%E7%9A%84%E6%90%AD%E5%BB%BA)
  5 |   - [前言](#%E5%89%8D%E8%A8%80)
  6 |   - [单机](#%E5%8D%95%E6%9C%BA)
  7 |   - [集群](#%E9%9B%86%E7%BE%A4)
  8 |     - [创建etcd配置文件](#%E5%88%9B%E5%BB%BAetcd%E9%85%8D%E7%BD%AE%E6%96%87%E4%BB%B6)
  9 |     - [更新etcd系统默认配置](#%E6%9B%B4%E6%96%B0etcd%E7%B3%BB%E7%BB%9F%E9%BB%98%E8%AE%A4%E9%85%8D%E7%BD%AE)
 10 |     - [启动](#%E5%90%AF%E5%8A%A8)
 11 |     - [配置ETCD为启动服务](#%E9%85%8D%E7%BD%AEetcd%E4%B8%BA%E5%90%AF%E5%8A%A8%E6%9C%8D%E5%8A%A1)
 12 |   - [测试下](#%E6%B5%8B%E8%AF%95%E4%B8%8B)
 13 |   - [参考](#%E5%8F%82%E8%80%83)
 14 | 
 15 | <!-- END doctoc generated TOC please keep comment here to allow auto update -->
 16 | 
 17 | ## etcd的搭建
 18 | 
 19 | ### 前言  
 20 | 
 21 | 这里记录下如何搭建etcd   
 22 | 
 23 | ### 单机
 24 | 
 25 | 在etcd的releases中有安装脚本,[安装脚本](https://github.com/etcd-io/etcd/releases)  
 26 | 
 27 | 这里放一个docker的安装脚本  
 28 | 
 29 | ```shell script
 30 | rm -rf /tmp/etcd-data.tmp && mkdir -p /tmp/etcd-data.tmp && \
 31 |   docker rmi quay.io/coreos/etcd:v3.5.0 || true && \
 32 |   docker run \
 33 |   -p 2379:2379 \
 34 |   -p 2380:2380 \
 35 |   --mount type=bind,source=/tmp/etcd-data.tmp,destination=/etcd-data \
 36 |   --name etcd-gcr-v3.5.0 \
 37 |   quay.io/coreos/etcd:v3.5.0 \
 38 |   /usr/local/bin/etcd \
 39 |   --name s1 \
 40 |   --data-dir /etcd-data \
 41 |   --listen-client-urls http://0.0.0.0:2379 \
 42 |   --advertise-client-urls http://0.0.0.0:2379 \
 43 |   --listen-peer-urls http://0.0.0.0:2380 \
 44 |   --initial-advertise-peer-urls http://0.0.0.0:2380 \
 45 |   --initial-cluster s1=http://0.0.0.0:2380 \
 46 |   --initial-cluster-token tkn \
 47 |   --initial-cluster-state new \
 48 |   --log-level info \
 49 |   --logger zap \
 50 |   --log-outputs stderr
 51 | ```
 52 | 
 53 | ### 集群
 54 | 
 55 | 这里准备了三台`centos7`机器    
 56 | 
 57 | | 主机    | ip             |
 58 | | ------ | ------         | 
 59 | | etcd-1 | 192.168.56.111 |
 60 | | etcd-2 | 192.168.56.112 |
 61 | | etcd-3 | 192.168.56.113 |
 62 | 
 63 | 首先在每台机器中安装etcd,这里写了安装的脚本  
 64 | 
 65 | ```shell script
 66 | $ cat etcd.sh 
 67 | 
 68 | ETCD_VER=v3.5.0
 69 | 
 70 | # choose either URL
 71 | GOOGLE_URL=https://storage.googleapis.com/etcd
 72 | GITHUB_URL=https://github.com/etcd-io/etcd/releases/download
 73 | DOWNLOAD_URL=${GITHUB_URL}
 74 | 
 75 | rm -f /tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz
 76 | rm -rf /opt/etcd && mkdir -p /opt/etcd
 77 | 
 78 | curl -L ${DOWNLOAD_URL}/${ETCD_VER}/etcd-${ETCD_VER}-linux-amd64.tar.gz -o /tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz
 79 | tar xzvf /tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz -C /opt/etcd --strip-components=1
 80 | rm -f /tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz
 81 | ```
 82 | 
 83 | 赋予执行权限
 84 | ```
 85 | $ chmod +x etcd.sh
 86 | ```
 87 | 
 88 | 在每台机器中都执行下    
 89 | 
 90 | ```
 91 | $ ./etcd.sh
 92 |   % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
 93 |                                  Dload  Upload   Total   Spent    Left  Speed
 94 | 100   636  100   636    0     0   1328      0 --:--:-- --:--:-- --:--:--  1330
 95 | 100 18.4M  100 18.4M    0     0   717k      0  0:00:26  0:00:26 --:--:--  775k
 96 | ...
 97 | ```
 98 | 
 99 | #### 创建etcd配置文件   
100 | 
101 | ```
102 | $ mkdir /etc/etcd
103 | $ vi /etc/etcd/conf.yml
104 | ```
105 | 
106 | 节点1  
107 | 
108 | ```
109 | name: etcd-1
110 | data-dir: /opt/etcd/data
111 | listen-client-urls: http://192.168.56.111:2379,http://127.0.0.1:2379
112 | advertise-client-urls: http://192.168.56.111:2379,http://127.0.0.1:2379
113 | listen-peer-urls: http://192.168.56.111:2380
114 | initial-advertise-peer-urls: http://192.168.56.111:2380
115 | initial-cluster: etcd-1=http://192.168.56.111:2380,etcd-2=http://192.168.56.112:2380,etcd-3=http://192.168.56.113:2380
116 | initial-cluster-token: etcd-cluster-token
117 | initial-cluster-state: new
118 | ```
119 | 
120 | 节点2  
121 | 
122 | ```
123 | name: etcd-2
124 | data-dir: /opt/etcd/data
125 | listen-client-urls: http://192.168.56.112:2379,http://127.0.0.1:2379
126 | advertise-client-urls: http://192.168.56.112:2379,http://127.0.0.1:2379
127 | listen-peer-urls: http://192.168.56.112:2380
128 | initial-advertise-peer-urls: http://192.168.56.112:2380
129 | initial-cluster: etcd-1=http://192.168.56.111:2380,etcd-2=http://192.168.56.112:2380,etcd-3=http://192.168.56.113:2380
130 | initial-cluster-token: etcd-cluster-token
131 | initial-cluster-state: new
132 | ```
133 | 
134 | 节点3  
135 | 
136 | ```
137 | name: etcd-3
138 | data-dir: /opt/etcd/data
139 | listen-client-urls: http://192.168.56.113:2379,http://127.0.0.1:2379
140 | advertise-client-urls: http://192.168.56.113:2379,http://127.0.0.1:2379
141 | listen-peer-urls: http://192.168.56.113:2380
142 | initial-advertise-peer-urls: http://192.168.56.113:2380
143 | initial-cluster: etcd-1=http://192.168.56.111:2380,etcd-2=http://192.168.56.112:2380,etcd-3=http://192.168.56.113:2380
144 | initial-cluster-token: etcd-cluster-token
145 | initial-cluster-state: new
146 | ```
147 | 
148 | 配置项说明：
149 | 
150 | - --name：etcd集群中的节点名，这里可以随意，可区分且不重复就行   
151 | 
152 | - --listen-peer-urls：监听的用于节点之间通信的url，可监听多个，集群内部将通过这些url进行数据交互(如选举，数据同步等)  
153 | 
154 | - --initial-advertise-peer-urls：建议用于节点之间通信的url，节点间将以该值进行通信  
155 | 
156 | - --listen-client-urls：监听的用于客户端通信的url，同样可以监听多个  
157 | 
158 | - --advertise-client-urls：建议使用的客户端通信 url，该值用于 etcd 代理或 etcd 成员与 etcd 节点通信  
159 | 
160 | - --initial-cluster-token： etcd-cluster-1，节点的 token 值，设置该值后集群将生成唯一 id，并为每个节点也生成唯一 id，当使用相同配置文件再启动一个集群时，只要该 token 值不一样，etcd 集群就不会相互影响  
161 | 
162 | - --initial-cluster：也就是集群中所有的 initial-advertise-peer-urls 的合集  
163 | 
164 | - --initial-cluster-state：new，新建集群的标志  
165 | 
166 | #### 更新etcd系统默认配置  
167 | 
168 | 当前使用的是etcd v3版本，系统默认的是v2，通过下面命令修改配置。  
169 | 
170 | ```
171 | $ vi /etc/profile
172 | # 在末尾追加  
173 | export ETCDCTL_API=3
174 | # 然后更新
175 | $ source /etc/profile
176 | ```
177 | 
178 | #### 启动
179 | 
180 | ```
181 | $ ./etcd --config-file=/etc/etcd/conf.yml
182 | ```
183 | 
184 | #### 配置ETCD为启动服务
185 | 
186 | 编辑/usr/lib/systemd/system/etcd.service  
187 | 
188 | ```
189 | $ cat /usr/lib/systemd/system/etcd.service
190 | [Unit]
191 | Description=EtcdServer
192 | After=network.target
193 | After=network-online.target
194 | Wants=network-online.target
195 | 
196 | [Service]
197 | Type=notify
198 | WorkingDirectory=/opt/etcd/
199 | # User=etcd
200 | ExecStart=/opt/etcd/etcd --config-file=/etc/etcd/conf.yml
201 | Restart=on-failure
202 | LimitNOFILE=65536
203 | 
204 | [Install]
205 | WantedBy=multi-user.target
206 | ```
207 | 
208 | 更新启动：  
209 | 
210 | ```
211 | $ systemctl daemon-reload
212 | $ systemctl enable etcd
213 | $ systemctl start etcd
214 | $ systemctl restart etcd
215 | 
216 | $ systemctl status etcd.service -l
217 | ```
218 | 
219 | ### 测试下  
220 |  
221 | 复制etcd二进制文件到`/usr/local/bin/`  
222 | 
223 | ```
224 | $ cp /opt/etcd/etcd* /usr/local/bin/
225 | ```
226 | 
227 | 首先设置ETCD_ENDPOINTS  
228 | 
229 | ```
230 | # export ETCDCTL_API=3
231 | # export ETCD_ENDPOINTS=192.168.56.111:2379,192.168.56.112:2379,192.168.56.113:2379
232 | ```
233 | 
234 | 查看状态  
235 | 
236 | ```
237 | $ etcdctl --endpoints=${ETCD_ENDPOINTS} --write-out=table member list
238 | +------------------+---------+--------+----------------------------+--------------------------------------------------+------------+
239 | |        ID        | STATUS  |  NAME  |         PEER ADDRS         |                   CLIENT ADDRS                   | IS LEARNER |
240 | +------------------+---------+--------+----------------------------+--------------------------------------------------+------------+
241 | |  90d224ceb3098d7 | started | etcd-2 | http://192.168.56.112:2380 | http://127.0.0.1:2379,http://192.168.56.112:2379 |      false |
242 | | 3b23fbb7d9c7cd10 | started | etcd-1 | http://192.168.56.111:2380 | http://127.0.0.1:2379,http://192.168.56.111:2379 |      false |
243 | | 7909c74e3f5ffafa | started | etcd-3 | http://192.168.56.113:2380 | http://127.0.0.1:2379,http://192.168.56.113:2379 |      false |
244 | +------------------+---------+--------+----------------------------+--------------------------------------------------+------------+
245 | 
246 | $ etcdctl --endpoints=${ETCD_ENDPOINTS} --write-out=table endpoint health
247 | +---------------------+--------+------------+-------+
248 | |      ENDPOINT       | HEALTH |    TOOK    | ERROR |
249 | +---------------------+--------+------------+-------+
250 | | 192.168.56.111:2379 |   true | 6.558088ms |       |
251 | | 192.168.56.113:2379 |   true | 6.543104ms |       |
252 | | 192.168.56.112:2379 |   true | 7.405801ms |       |
253 | +---------------------+--------+------------+-------+
254 | 
255 | $ etcdctl --endpoints=${ETCD_ENDPOINTS} --write-out=table endpoint status
256 | +---------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+
257 | |      ENDPOINT       |        ID        | VERSION | DB SIZE | IS LEADER | IS LEARNER | RAFT TERM | RAFT INDEX | RAFT APPLIED INDEX | ERRORS |
258 | +---------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+
259 | | 192.168.56.111:2379 | 3b23fbb7d9c7cd10 |   3.5.0 |   20 kB |      true |      false |         2 |         19 |                 19 |        |
260 | | 192.168.56.112:2379 |  90d224ceb3098d7 |   3.5.0 |   20 kB |     false |      false |         2 |         19 |                 19 |        |
261 | | 192.168.56.113:2379 | 7909c74e3f5ffafa |   3.5.0 |   20 kB |     false |      false |         2 |         19 |                 19 |        |
262 | +---------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+
263 | ```
264 | 
265 | 在etcd-1中watch一个key,然后在etcd-2中对key设置一个值  
266 | 
267 | ```
268 | [root@centos7-1 ~]# etcdctl watch test
269 | PUT
270 | test
271 | xiaoming
272 | 
273 | [root@centos7-3 ~]# etcdctl put test xiaoming
274 | OK
275 | ```
276 | 
277 | ### 参考  
278 | 
279 | 【ETCD集群安装配置】https://zhuanlan.zhihu.com/p/46477992    
280 | 【Install】https://etcd.io/docs/v3.5/install/    
281 | 【彻底搞懂 etcd 系列文章（三）：etcd 集群运维部署】https://developer.aliyun.com/article/765312  
282 | 
283 | 
284 | 
285 | 
286 | 
287 | 
288 | 


--------------------------------------------------------------------------------
/5-raft算法理解.md:
--------------------------------------------------------------------------------
  1 | <!-- START doctoc generated TOC please keep comment here to allow auto update -->
  2 | <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
  3 | 
  4 | - [ETCD的Raft一致性算法原理](#etcd%E7%9A%84raft%E4%B8%80%E8%87%B4%E6%80%A7%E7%AE%97%E6%B3%95%E5%8E%9F%E7%90%86)
  5 |   - [前言](#%E5%89%8D%E8%A8%80)
  6 |   - [Raft原理了解](#raft%E5%8E%9F%E7%90%86%E4%BA%86%E8%A7%A3)
  7 |   - [raft选举](#raft%E9%80%89%E4%B8%BE)
  8 |     - [raft中的几种状态](#raft%E4%B8%AD%E7%9A%84%E5%87%A0%E7%A7%8D%E7%8A%B6%E6%80%81)
  9 |     - [任期](#%E4%BB%BB%E6%9C%9F)
 10 |     - [leader选举](#leader%E9%80%89%E4%B8%BE)
 11 |   - [日志复制](#%E6%97%A5%E5%BF%97%E5%A4%8D%E5%88%B6)
 12 |   - [安全性](#%E5%AE%89%E5%85%A8%E6%80%A7)
 13 |     - [leader宕机，新的leader未同步前任committed的数据](#leader%E5%AE%95%E6%9C%BA%E6%96%B0%E7%9A%84leader%E6%9C%AA%E5%90%8C%E6%AD%A5%E5%89%8D%E4%BB%BBcommitted%E7%9A%84%E6%95%B0%E6%8D%AE)
 14 |     - [Leader在将日志复制给Follower节点之前宕机](#leader%E5%9C%A8%E5%B0%86%E6%97%A5%E5%BF%97%E5%A4%8D%E5%88%B6%E7%BB%99follower%E8%8A%82%E7%82%B9%E4%B9%8B%E5%89%8D%E5%AE%95%E6%9C%BA)
 15 |     - [Leader在将日志复制给Follower节点之间宕机](#leader%E5%9C%A8%E5%B0%86%E6%97%A5%E5%BF%97%E5%A4%8D%E5%88%B6%E7%BB%99follower%E8%8A%82%E7%82%B9%E4%B9%8B%E9%97%B4%E5%AE%95%E6%9C%BA)
 16 |     - [Leader在响应客户端之前宕机](#leader%E5%9C%A8%E5%93%8D%E5%BA%94%E5%AE%A2%E6%88%B7%E7%AB%AF%E4%B9%8B%E5%89%8D%E5%AE%95%E6%9C%BA)
 17 |   - [时间和可用性](#%E6%97%B6%E9%97%B4%E5%92%8C%E5%8F%AF%E7%94%A8%E6%80%A7)
 18 |   - [网络分区问题](#%E7%BD%91%E7%BB%9C%E5%88%86%E5%8C%BA%E9%97%AE%E9%A2%98)
 19 |   - [总结](#%E6%80%BB%E7%BB%93)
 20 |   - [参考](#%E5%8F%82%E8%80%83)
 21 | 
 22 | <!-- END doctoc generated TOC please keep comment here to allow auto update -->
 23 | 
 24 | ## ETCD的Raft一致性算法原理
 25 | 
 26 | ### 前言
 27 | 
 28 | 关于Raft的文章很多，本文是参考了很多的文章之后，总结出来的，写的不对之处欢迎赐教。  
 29 | 
 30 | ### Raft原理了解
 31 | 
 32 | Raft 是一种为了管理复制日志的一致性算法。它提供了和 Paxos 算法相同的功能和性能，但是它的算法结构和 Paxos 不同，使得 Raft 算法更加容易理解并且更容易构建实际的系统。 
 33 | 
 34 | Raft是一种分布式一致性算法。它被设计得易于理解, 解决了即使在出现故障时也可以让多个服务器对共享状态达成一致的问题。共享状态通常是通过日志复制支持的数据结构。只要大多数服务器是正常运作的，系统就能全面运行。  
 35 | 
 36 | Raft的工作方式是在集群中选举一个领导者。领导者负责接受客户端请求并管理到其他服务器的日志复制。数据只在一个方向流动:从领导者到其他服务器。  
 37 | 
 38 | Raft将一致性问题分解为三个子问题:
 39 | 
 40 | - 领导者选举: 现有领导者失效时，需要选举新的领导者；  
 41 | 
 42 | - 日志复制: 领导者需要通过复制保持所有服务器的日志与自己的同步；  
 43 | 
 44 | - 安全性: 如果其中一个服务器在特定索引上提交了日志条目，那么其他服务器不能在该索引应用不同的日志条目。   
 45 | 
 46 | ### raft选举
 47 | 
 48 | #### raft中的几种状态  
 49 | 
 50 | 在raft算法中，在任何时刻，每一个服务器节点都处于这三个状态之一：  
 51 | 
 52 | - Follower:追随者，跟随者都是被动的：他们不会发送任何请求，只是简单的响应来自领导者或者候选人的请求；  
 53 | 
 54 | - Candidate:候选人，如果跟随者接收不到消息，那么他就会变成候选人并发起一次选举，获得集群中大多数选票的候选人将成为领导者。  
 55 | 
 56 | - Leader:领导者，系统中只有一个领导人并且其他的节点全部都是跟随者，领导人处理所有的客户端请求（如果一个客户端和跟随者联系，那么跟随者会把请求重定向给领导人）   
 57 | 
 58 | 来看下几个状态的关系  
 59 | 
 60 | <img src="/img/raft_1.png" alt="etcd" align=center/>
 61 | 
 62 | #### 任期  
 63 | 
 64 | Raft将时间划分为任意长度的任期，每个任期都以一次选举开始。如果一名候选人赢得选举，他在剩下的任期时间内仍然是领导者。如果投票出现分歧（选票被瓜分），那么这个任期将没有领导者并随即结束，之后很快会开始新的任期和新一轮选举。  
 65 | 
 66 | 任期号单调递增。每个服务器存储当前任期号，并在每次通信中交换该任期编号。  
 67 | 
 68 | 如果一个服务器的当前任期号小于其他服务器，那么它将把当前任期更新为更大的值。如果候选人或领导者发现其任期已过期，则立即转化为追随者状态。如果服务器接收到带有过期任期号的请求，它将拒绝该请求。  
 69 | 
 70 | #### leader选举  
 71 | 
 72 | 领导者定期向跟随者发送心跳，来维持自己的leader角色。如果跟随者在一定的时间内没有接收到任何的消息，也就是选举超时，那么他就会认为系统中没有可用的领导者,并且发起选举以选出新的领导者。  
 73 | 
 74 | 要开始一次选举过程，跟随者先要增加自己的当前任期号并且转换到候选人状态。然后他会并行的向集群中的其他服务器节点发送请求投票的 RPCs 来给自己投票。  
 75 | 
 76 | 候选人的选举会有下面三种结果：   
 77 | 
 78 | 1、候选人自己赢得了选举；  
 79 | 
 80 | 2、其他服务成为了leader；  
 81 | 
 82 | 3、候选人中没有选出领导者，可能是多个跟随者同时成为候选人，然后选票被瓜分了，以至于没有候选人能获得大多数选票。这种情况下面详细介绍。  
 83 | 
 84 | 对于选举过程，对于选票被瓜分的情况，Raft算法使用随机候选超时时间的方法来确保很少会发生选票瓜分的情况，就算发生也能很快的解决。  
 85 | 
 86 | 来看下这个选举的随机算法：  
 87 | 
 88 | 1、为了阻止选票起初就被瓜分，候选超时时间是从一个固定的区间（例如 150-300 毫秒）随机选择；  
 89 | 
 90 | 2、这个候选超时时间就是follower要等待成为candidate的时间；  
 91 | 
 92 | 3、每一个候选人在开始一次选举的时候会重置一个随机候选的时间，也就是150-300中随机一个值；   
 93 | 
 94 | 4、这个时间结束之后follower变成candidate开始选举，不同时候苏醒竞争leader，这样苏醒早的就有竞争优势；  
 95 | 
 96 | 5、这样大大减少了选票被瓜分的情况，如果选票还是被瓜分，就继续从1开始选举。    
 97 | 
 98 | ### 日志复制
 99 | 
100 | 一旦leader被选举成功，就可以对客户端提供服务了。客户端提交每一条命令都会被按顺序记录到leader的日志中，每一条命令都包含term编号和顺序索引，然后向其他节点并行发送AppendEntries RPC用以复制命令(如果命令丢失会不断重发)，当复制成功也就是大多数节点成功复制后，leader就会提交命令，即执行该命令并且将执行结果返回客户端，raft保证已经提交的命令最终也会被其他节点成功执行。  
101 | 
102 | 来看下是具体的流程：  
103 | 
104 | 1、所有的请求都先经过leader,每个请求首先以日志的形式保存在leader中，然后这时候日志的状态是uncommitted状态；  
105 | 
106 | 2、然后leader将这些更改的请求发送到follower；  
107 | 
108 | 3、leader等待大多数的follower确认提交；  
109 | 
110 | 4、leader在等待大多数的follower确认提交之后，commit这些更改，然后通知客户端更新的结果；  
111 | 
112 | 5、同时leader会不断的尝试通知follower去存储所有更新的信息。  
113 | 
114 | <img src="/img/raft-leader.png" alt="etcd" align=center/>
115 | 
116 | 日志由有序编号（log index）的日志条目组成。每个日志条目包含它被创建时的任期号（term），和用于状态机执行的命令。如果一个日志条目被复制到大多数服务器上，就被认为可以提交（commit）了。  
117 | 
118 | <img src="/img/raft-log_1.png" alt="etcd" align=center/>
119 | 
120 | Raft日志同步保证如下两点：  
121 | 
122 | - 如果不同日志中的两个条目有着相同的索引和任期号，则它们所存储的命令是相同的；  
123 | 
124 | - 如果不同日志中的两个条目有着相同的索引和任期号，则它们之前的所有条目都是完全一样的。  
125 | 
126 | 第一条特性源于Leader在一个term内在给定的一个log index最多创建一条日志条目，同时该条目在日志中的位置也从来不会改变。  
127 | 
128 | 第二条特性：Raft算法在发送日志复制请求时会携带前置日志的term和logIndex值（即 prevLogTerm 和 prevLogIndex），只有在 prevLogTerm 和 prevLogIndex 匹配的情况下才能成功响应请求。如果prevLogTerm和prevLogIndex不匹配，则说明当前节点可能是新加入的、或者之前服从于其它Leader，亦或当前节点之前是Leader节点。为了兑现承诺二，Leader节点需要与该Follower节点向前追溯找到term和logIndex匹配的那条日志，并使用Leader节点的日志强行覆盖该Follower此后的日志数据。  
129 | 
130 | 一般情况下，Leader和Followers的日志保持一致，因此AppendEntries一致性检查通常不会失败。然而，Leader崩溃可能会导致日志不一致：旧的Leader可能没有完全复制完日志中的所有条目。一个Follower可能会丢失掉Leader上的一些条目，也有可能包含一些Leader没有的条目，也有可能两者都会发生。丢失的或者多出来的条目可能会持续多个任期。  
131 | 
132 | Leader通过强制Followers复制它的日志来处理日志的不一致，Followers上的不一致的日志会被Leader的日志覆盖。  
133 | 
134 | Leader为了使Followers的日志同自己的一致，Leader需要找到Followers同它的日志一致的地方，然后覆盖Followers在该位置之后的条目。  
135 | 
136 | Leader会从后往前试，每次AppendEntries失败后尝试前一个日志条目，直到成功找到每个Follower的日志一致位点，然后向后逐条覆盖Followers在该位置之后的条目。  
137 | 
138 | ### 安全性
139 | 
140 | 上面我们讨论的是理想状态下的情况，在实际的生产环境中，我们会遇到各种各样的情况。这里参考了一位大佬文章[理解 Raft 分布式共识算法](https://www.zhenchao.org/2020/01/01/protocol/raft/)     
141 | 
142 | 下面来讨论几种常见的问题  
143 | 
144 | #### leader宕机，新的leader未同步前任committed的数据
145 | 
146 | leader宕机了，然后又选出了新的leader，但是新的leader没有同步前任committed的数据，新leader节点会强行覆盖集群中其它节点与自己冲突的日志数据。  
147 | 
148 | 如何避免：  
149 | 
150 | 这种情况raft会对参加选举的节点进行限制，只有包含已经committed日志的节点才有机会竞选成功  
151 | 
152 | - 1、参选节点最后一条日志的 term 值（lastLogTerm）大于等于投票节点的 lastLogTerm 值；  
153 | 
154 | - 2、如果 lastLogTerm 值相等，则参选节点的 lastLogIndex 大于等于投票节点的 lastLogIndex 值。  
155 | 
156 | #### Leader在将日志复制给Follower节点之前宕机
157 | 
158 | 如果在复制之前宕机，当然这时候消息处于uncommitted状态，新选出的leader一定不包含这些日志信息，所以新的leader会强制覆盖follower中跟他冲突的日志，也就是刚刚宕机的leader，如果变成follower，他未同步的信息会被新的leader覆盖掉。  
159 | 
160 | #### Leader在将日志复制给Follower节点之间宕机
161 | 
162 | 在复制的过程中宕机，会有两种情况：   
163 | 
164 | 
165 | - 1、只有少数的follower被同步到了；
166 | 
167 | - 2、大多数的follower被同步到了；  
168 | 
169 | 情况1：如果只有少数的follower被同步了，如果新的leader不包含这些信息，新的leader会覆盖那些已经同步的节点的信息，如果新的节点包含这些数据,直接走到下面的情况2；    
170 | 
171 | 情况2：Leader在复制的过程中宕机,所以肯定消息是没有commit的，新的leader需要再次尝试将其复制给各个Follower节点，并依据自己的复制状态决定是否提交这些日志。   
172 | 
173 | #### Leader在响应客户端之前宕机
174 | 
175 | 这种情况，我们根据上面的同步机制可以知道，消息肯定是committed状态的，新的leader肯定包含这个信息，但是新任Leader可能还未被通知该日志已经被提交，不过这个信息在之后一定会被新任Leader标记为committed。   
176 | 
177 | 不过对于客户端可能超时拿不到结果，认为本次消息失败了，客户端需要考虑幂等性。   
178 | 
179 | ### 时间和可用性
180 | 
181 | Raft 的要求之一就是安全性不能依赖时间：整个系统不能因为某些事件运行的比预期快一点或者慢一点就产生了错误的结果。但是，可用性（系统可以及时的响应客户端）不可避免的要依赖于时间。  
182 | 
183 | 领导人选举是 Raft 中对时间要求最为关键的方面。Raft 可以选举并维持一个稳定的领导人,只要系统满足下面的时间要求： 
184 | 
185 | > 广播时间（broadcastTime） << 候选超时时间（electionTimeout） << 平均故障间隔时间（MTBF）
186 | 
187 | - broadcastTime: 广播时间指的是从一个服务器并行的发送 RPCs 给集群中的其他服务器并接收响应的平均时间，也就是集群之间的平均延时时间;  
188 | 
189 | - electionTimeout: 追随者设置的候选超时时间；  
190 | 
191 | - MTBF：平均故障间隔时间就是对于一台服务器而言，两次故障之间的平均时间。  
192 | 
193 | 如果一个follower在一个electionTimeout时间内没有接收到leader的RPC，也没有接收到其他candidate的voteRequestRPC，他就会苏醒，变成candidate状态，开始新一轮的投票。所以broadcastTime要小于electionTimeout的时间。   
194 | 
195 | 在Leader宕机与选举出新任Leader之间，整个集群处于无主的状态，我们应该尽可能缩短此类状态的持续时间，而控制的参数就是electionTimeout的最小值，所以electionTimeout需要在保证大于broadcastTime的前提下远小于一个集群中机器的平均故障间隔时间MTBF。  
196 | 
197 | ### 网络分区问题
198 | 
199 | 如果由于网络的隔离，导致原来的Raft集群分裂成多个小的分区，各自分区中会重新开始选举，尝试选出各自的leader  
200 | 
201 | <img src="/img/raft-net.png" alt="etcd" align=center/>
202 | 
203 | 在各自分区之内，各自leader会收到不同的client发送的请求，但是我们集群的节点还是5，所以两个节点的分区必定选不出leader。   
204 | 
205 | - 如果请求发送给了3个节点的分区，因为该分区包含3个节点，满足过半数（5个节点中的3个）的条件，所以提交给该分区的指令对应的日志存在被committed的可能性，此时3个节点均成功复制了日志。  
206 | 
207 | - 如果请求发送给了2个节点的分区，因为该分区只包含2个节点，所以提交给该分区的指令对应的日志因不满足过半数的条件而无法被提交。  
208 | 
209 | 当网络恢复的时候，新的leader必定在3个节点的分区中选取（为什么呢？可参看上文的安全性），然后新的leader覆盖另外2个节点中未同步（冲突）的数据。 
210 | 
211 | ### 总结
212 | 
213 | 1、Raft在对应的任期中每次只有一个leader产生，通过随机候选超时算法，保证了在大多数情况下只有一个leader被选出；  
214 | 
215 | 2、所有的数据都是从leader流向follower中，通过日志的复制确认机制，保证绝大多数的follower都能同步到消息；  
216 | 
217 | 3、当然，raft对于分布式中出现的各种安全性问题也做了兼容；   
218 | 
219 | 4、不过真正实现一个生产级别的Raft算法库，需要考虑的东西还是很多，这里主要分析了几个主要的问题。   
220 | 
221 | ### 参考  
222 | 
223 | 【一文搞懂Raft算法】https://www.cnblogs.com/xybaby/p/10124083.html    
224 | 【寻找一种易于理解的一致性算法（扩展版）】https://github.com/maemual/raft-zh_cn/blob/master/raft-zh_cn.md  
225 | 【raft演示动画】https://raft.github.io/raftscope/index.html    
226 | 【理解 raft 算法】https://sanyuesha.com/2019/04/18/raft/  
227 | 【理解Raft一致性算法—一篇学术论文总结】https://mp.weixin.qq.com/s/RkMeYyUck1WQPjNiGvahKQ  
228 | 【Raft协议原理详解】https://zhuanlan.zhihu.com/p/91288179  
229 | 【Raft算法详解】https://zhuanlan.zhihu.com/p/32052223  
230 | 
231 | 


--------------------------------------------------------------------------------
/7-raft和线性一致性.md:
--------------------------------------------------------------------------------
  1 | <!-- START doctoc generated TOC please keep comment here to allow auto update -->
  2 | <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
  3 | 
  4 | - [线性一致性](#%E7%BA%BF%E6%80%A7%E4%B8%80%E8%87%B4%E6%80%A7)
  5 |   - [CAP](#cap)
  6 |   - [什么是CAP](#%E4%BB%80%E4%B9%88%E6%98%AFcap)
  7 |   - [CAP的权衡](#cap%E7%9A%84%E6%9D%83%E8%A1%A1)
  8 |     - [AP wihtout C](#ap-wihtout-c)
  9 |     - [CA without P](#ca-without-p)
 10 |     - [CP without A](#cp-without-a)
 11 |   - [线性一致性](#%E7%BA%BF%E6%80%A7%E4%B8%80%E8%87%B4%E6%80%A7-1)
 12 |   - [etcd中如何实现线性一致性](#etcd%E4%B8%AD%E5%A6%82%E4%BD%95%E5%AE%9E%E7%8E%B0%E7%BA%BF%E6%80%A7%E4%B8%80%E8%87%B4%E6%80%A7)
 13 |   - [线性一致性写](#%E7%BA%BF%E6%80%A7%E4%B8%80%E8%87%B4%E6%80%A7%E5%86%99)
 14 |   - [线性一致性读](#%E7%BA%BF%E6%80%A7%E4%B8%80%E8%87%B4%E6%80%A7%E8%AF%BB)
 15 |     - [1、客户端的get请求](#1%E5%AE%A2%E6%88%B7%E7%AB%AF%E7%9A%84get%E8%AF%B7%E6%B1%82)
 16 |     - [2、服务端响应读取请求](#2%E6%9C%8D%E5%8A%A1%E7%AB%AF%E5%93%8D%E5%BA%94%E8%AF%BB%E5%8F%96%E8%AF%B7%E6%B1%82)
 17 |     - [3、raft中如何处理一个读的请求](#3raft%E4%B8%AD%E5%A6%82%E4%BD%95%E5%A4%84%E7%90%86%E4%B8%80%E4%B8%AA%E8%AF%BB%E7%9A%84%E8%AF%B7%E6%B1%82)
 18 |       - [如果follower收到只读的消息](#%E5%A6%82%E6%9E%9Cfollower%E6%94%B6%E5%88%B0%E5%8F%AA%E8%AF%BB%E7%9A%84%E6%B6%88%E6%81%AF)
 19 |       - [如果leader收到只读请求](#%E5%A6%82%E6%9E%9Cleader%E6%94%B6%E5%88%B0%E5%8F%AA%E8%AF%BB%E8%AF%B7%E6%B1%82)
 20 |   - [MVCC](#mvcc)
 21 |     - [treeIndex](#treeindex)
 22 |     - [buffer](#buffer)
 23 |     - [boltdb](#boltdb)
 24 |   - [总结](#%E6%80%BB%E7%BB%93)
 25 |   - [参考](#%E5%8F%82%E8%80%83)
 26 | 
 27 | <!-- END doctoc generated TOC please keep comment here to allow auto update -->
 28 | 
 29 | ## 线性一致性
 30 | 
 31 | ### CAP
 32 | 
 33 | ### 什么是CAP
 34 | 
 35 | 在聊什么是线性一致性的时候，我们先来看看什么是CAP  
 36 | 
 37 | CAP理论：一个分布式系统最多只能同时满足一致性（Consistency）、可用性（Availability）和分区容错性（Partition tolerance）这三项中的两项。  
 38 | 
 39 | - 1、一致性（Consistency）
 40 | 
 41 | 一致性表示所有客户端同时看到相同的数据，无论它们连接到哪个节点。 要做到这一点，每当数据写入一个节点时，就必须立即将其转发或复制到系统中的所有其他节点，然后写操作才被视为“成功”。  
 42 | 
 43 | - 2、可用性（Availability） 
 44 | 
 45 | 可用性表示发出数据请求的任何客户端都会得到响应，即使一个或多个节点宕机。 可用性的另一种状态：分布式系统中的所有工作节点都返回任何请求的有效响应，而不会发生异常。   
 46 | 
 47 | - 3、分区容错性（Partition tolerance）  
 48 | 
 49 | 分区是分布式系统中的通信中断 - 两个节点之间的丢失连接或连接临时延迟。 分区容错意味着集群必须持续工作，无论系统中的节点之间有任何数量的通信中断。  
 50 | 
 51 | ### CAP的权衡
 52 | 
 53 | 根据定理，分布式系统只能满足三项中的两项而不可能满足全部三项。  
 54 | 
 55 | #### AP wihtout C
 56 | 
 57 | 允许分区下的高可用，就需要放弃一致性。一旦分区发生，节点之间可能会失去联系，为了高可用，每个节点只能用本地数据提供服务，而这样会导致全局数据的不一致性。  
 58 | 
 59 | #### CA without P  
 60 | 
 61 | 如果不会出现分区，一致性和可用性是可以同时保证的。但是我们现在的系统基本上都是分布式的，也就是我们的服务肯定是被多台机器所提供的，所以分区就难以避免。  
 62 | 
 63 | #### CP without A  
 64 | 
 65 | 如果不要求A（可用），相当于每个请求都需要在Server之间强一致，而P（分区）会导致同步时间无限延长，如此CP也是可以保证的。   
 66 | 
 67 | ### 线性一致性
 68 | 
 69 | 线性一致性又叫做原子一致性，强一致性。线性一致性可以看做只有一个单核处理器，或者可以看做只有一个数据副本，并且所有操作都是原子的。在可线性化的分布式系统中，如果某个节点更新了数据，那么在其他节点都能读取到这个最新的数据。可以看见线性一致性和我们的CAP中的C是一致的。  
 70 | 
 71 | 举个非线性一致性的例子，比如有个秒杀活动，你和你的朋友同时去抢购一样东西，有可能他那里的库存已经没了，但是在你手机上显示还有几件，这个就违反了线性一致性，哪怕过了一会你的手机也显示库存没有，也依然是违反了。  
 72 | 
 73 | ### etcd中如何实现线性一致性
 74 | 
 75 | ### 线性一致性写
 76 | 
 77 | 所有的写操作，都要经过leader节点，一旦leader被选举成功，就可以对客户端提供服务了。客户端提交每一条命令都会被按顺序记录到leader的日志中，每一条命令都包含term编号和顺序索引，然后向其他节点并行发送AppendEntries RPC用以复制命令(如果命令丢失会不断重发)，当复制成功也就是大多数节点成功复制后，leader就会提交命令，即执行该命令并且将执行结果返回客户端，raft保证已经提交的命令最终也会被其他节点成功执行。具体源码参见[日志同步](https://www.cnblogs.com/ricklz/p/15155095.html#%E6%97%A5%E5%BF%97%E5%90%8C%E6%AD%A5)      
 78 | 
 79 | 因为日志是顺序记录的，并且有严格的确认机制，所以可以认为写是满足线性一致性的。   
 80 | 
 81 | 由于在Raft算法中，写操作成功仅仅意味着日志达成了一致（已经落盘），而并不能确保当前状态机也已经apply了日志。状态机apply日志的行为在大多数Raft算法的实现中都是异步的，所以此时读取状态机并不能准确反映数据的状态，很可能会读到过期数据。  
 82 | 
 83 | 如何实现读取的线性一致性，就需要引入ReadIndex了  
 84 | 
 85 | ### 线性一致性读  
 86 | 
 87 | ReadIndex算法：  
 88 | 
 89 | 每次读操作的时候记录此时集群的`committed index`，当状态机的`apply index`大于或等于`committed index`时才读取数据并返回。由于此时状态机已经把读请求发起时的已提交日志进行了apply动作，所以此时状态机的状态就可以反映读请求发起时的状态，符合线性一致性读的要求。  
 90 | 
 91 | Leader执行ReadIndex大致的流程如下：
 92 | 
 93 | - 1、记录当前的commit index，称为ReadIndex；  
 94 | 
 95 | 所有的请求都会交给leader，如果follower收到读请求，会将请求forward给leader
 96 | 
 97 | - 2、向 Follower 发起一次心跳，如果大多数节点回复了，那就能确定现在仍然是Leader；  
 98 | 
 99 | 确认当前leader的状态,避免当前节点状态切换，数据不能及时被同步更新  
100 | 
101 | 比如发生了网络分区：可参见[网络分区问题](https://www.cnblogs.com/ricklz/p/15094389.html#%E7%BD%91%E7%BB%9C%E5%88%86%E5%8C%BA%E9%97%AE%E9%A2%98)  
102 | 
103 | 1、当前的leader被分到了小的分区中，然后大的集群中有数据更新，小的集群是无感知的，如果读的请求被定位到小的集群中，所以读取就可能读取到旧的数据。  
104 | 
105 | 2、小集群中的数据同样是不能被写入信息的，提交给该集群的指令对应的日志因不满足过半数的条件而无法被提交。  
106 | 
107 | 3、所以只有当前节点是集群中有效的leader才可以，也就是能收到大多数节点的回复信息。  
108 | 
109 | - 3、等待状态机的apply index大于或等于committed index时才读取数据；    
110 | 
111 | `apply index`大于或等于`committed index`就能表示当前状态机已经把读请求发起时的已提交日志进行了apply动作，所以此时状态机的状态就可以反映读请求发起时的状态，满足一致性读；  
112 | 
113 | - 4、执行读请求，将结果返回给Client。  
114 | 
115 | 进一步来看下etcd的源码是如何实现的呢  
116 | 
117 | #### 1、客户端的get请求
118 | 
119 | ```go
120 | func (kv *kv) Get(ctx context.Context, key string, opts ...OpOption) (*GetResponse, error) {
121 | 	r, err := kv.Do(ctx, OpGet(key, opts...))
122 | 	return r.get, toErr(ctx, err)
123 | }
124 | 
125 | // OpGet returns "get" operation based on given key and operation options.
126 | func OpGet(key string, opts ...OpOption) Op {
127 | 	// WithPrefix and WithFromKey are not supported together
128 | 	if IsOptsWithPrefix(opts) && IsOptsWithFromKey(opts) {
129 | 		panic("`WithPrefix` and `WithFromKey` cannot be set at the same time, choose one")
130 | 	}
131 | 	ret := Op{t: tRange, key: []byte(key)}
132 | 	ret.applyOpts(opts)
133 | 	return ret
134 | }
135 | 
136 | func (kv *kv) Do(ctx context.Context, op Op) (OpResponse, error) {
137 | 	var err error
138 | 	switch op.t {
139 | 	case tRange:
140 | 		var resp *pb.RangeResponse
141 | 		resp, err = kv.remote.Range(ctx, op.toRangeRequest(), kv.callOpts...)
142 | 		if err == nil {
143 | 			return OpResponse{get: (*GetResponse)(resp)}, nil
144 | 		}
145 | 		...
146 | 	}
147 | 	return OpResponse{}, toErr(ctx, err)
148 | }
149 | 
150 | func (c *kVClient) Range(ctx context.Context, in *RangeRequest, opts ...grpc.CallOption) (*RangeResponse, error) {
151 | 	out := new(RangeResponse)
152 | 	err := c.cc.Invoke(ctx, "/etcdserverpb.KV/Range", in, out, opts...)
153 | 	if err != nil {
154 | 		return nil, err
155 | 	}
156 | 	return out, nil
157 | }
158 | 
159 | service KV {
160 |   // Range gets the keys in the range from the key-value store.
161 |   rpc Range(RangeRequest) returns (RangeResponse) {
162 |       option (google.api.http) = {
163 |         post: "/v3/kv/range"
164 |         body: "*"
165 |     };
166 |   }
167 | }
168 | ```
169 | 
170 | 可以看到get的请求最终通过rpc发送到Range  
171 | 
172 | #### 2、服务端响应读取请求
173 | 
174 | ```go
175 | // etcd/server/etcdserver/v3_server.go
176 | func (s *EtcdServer) Range(ctx context.Context, r *pb.RangeRequest) (*pb.RangeResponse, error) {
177 | 	...
178 | 	// 判断是否需要serializable read  
179 | 	// Serializable为true表示需要serializable read
180 | 	// serializable read 会直接读取当前节点的数据返回给客户端，它并不能保证返回给客户端的数据是最新的  
181 | 	// Serializable为false表示需要linearizable read
182 | 	// Linearizable Read 需要阻塞等待直到读到最新的数据
183 | 	if !r.Serializable {
184 | 		err = s.linearizableReadNotify(ctx)
185 | 		trace.Step("agreement among raft nodes before linearized reading")
186 | 		if err != nil {
187 | 			return nil, err
188 | 		}
189 | 	}
190 | 	...
191 | 	return resp, err
192 | }
193 | 
194 | // etcd/server/etcdserver/v3_server.go
195 | func (s *EtcdServer) linearizableReadNotify(ctx context.Context) error {
196 | 	s.readMu.RLock()
197 | 	nc := s.readNotifier
198 | 	s.readMu.RUnlock()
199 | 
200 | 	// linearizableReadLoop会阻塞监听readwaitc
201 | 	// 这边写入一个空结构体到readwaitc中，linearizableReadLoop就会结束阻塞开始工作  
202 | 	select {
203 | 	case s.readwaitc <- struct{}{}:
204 | 	default:
205 | 	}
206 | 
207 | 	// wait for read state notification
208 | 	select {
209 | 	case <-nc.c:
210 | 		return nc.err
211 | 	case <-ctx.Done():
212 | 		return ctx.Err()
213 | 	case <-s.done:
214 | 		return ErrStopped
215 | 	}
216 | }
217 | 
218 | // start会启动一个linearizableReadLoop
219 | func (s *EtcdServer) Start() {
220 | 	...
221 | 	s.GoAttach(s.linearizableReadLoop)
222 | 	...
223 | }
224 | 
225 | // etcd/server/etcdserver/v3_server.go
226 | func (s *EtcdServer) linearizableReadLoop() {
227 | 	for {
228 | 		requestId := s.reqIDGen.Next()
229 | 		leaderChangedNotifier := s.LeaderChangedNotify()
230 | 		select {
231 | 		case <-leaderChangedNotifier:
232 | 			continue
233 | 		// 在client发起一次Linearizable Read的时候，会向readwaitc写入一个空的结构体作为信号
234 | 		case <-s.readwaitc:
235 | 		case <-s.stopping:
236 | 			return
237 | 		}
238 | 		...
239 | 		// 处理不同的消息
240 | 		// 这里会监听readStateC，等待MsgReadIndex信息的处理结果 
241 | 		// 同时获取当前已提交的日志索引
242 | 		confirmedIndex, err := s.requestCurrentIndex(leaderChangedNotifier, requestId)
243 | 		if isStopped(err) {
244 | 			return
245 | 		}
246 | 		if err != nil {
247 | 			nr.notify(err)
248 | 			continue
249 | 		}
250 | 
251 | 		...
252 | 		// 此处是重点
253 | 		// 等待 apply index >= read index
254 | 		if appliedIndex < confirmedIndex {
255 | 			select {
256 | 			case <-s.applyWait.Wait(confirmedIndex):
257 | 			case <-s.stopping:
258 | 				return
259 | 			}
260 | 		}
261 | 		// 发出可以进行读取状态机的信号
262 | 		nr.notify(nil)
263 | 		...
264 | 	}
265 | }
266 | ```
267 | 
268 | 总结：  
269 | 
270 | 服务端对于读的操作，如果是Linearizable Read，也就是线性一致性的读，最终会通过linearizableReadLoop，监听readwaitc来触发，阻塞直到`apply index >= read index`，最终发送可以读取的信息。  
271 | 
272 | #### 3、raft中如何处理一个读的请求
273 | 
274 | linearizableReadLoop收到readwaitc，最终会调用sendReadIndex  
275 | 
276 | ```go
277 | // etcd/server/etcdserver/v3_server.go
278 | func (s *EtcdServer) sendReadIndex(requestIndex uint64) error {
279 | 	...
280 | 	err := s.r.ReadIndex(cctx, ctxToSend)
281 | 	...
282 | 	return nil
283 | }
284 | 
285 | // etcd/raft/node.go
286 | func (n *node) ReadIndex(ctx context.Context, rctx []byte) error {
287 | 	return n.step(ctx, pb.Message{Type: pb.MsgReadIndex, Entries: []pb.Entry{{Data: rctx}}})
288 | }
289 | ```
290 | 
291 | 通过MsgReadIndex的消息来发送读的请求  
292 | 
293 | - 如果follower收到了客户端的MsgReadIndex类型的消息，因为follower不能直接处理只读请求，需要将消息转发到leader节点进行处理；  
294 | 
295 | - 如果是leader收到了MsgReadIndex；  
296 | 
297 | 1、如果消息来自客户端，直接写入到readStates，start函数会将readStates中最后的一个放到readStateC，通知上游的处理结果；  
298 | 
299 | 2、如果消息来自follower，通过消息MsgReadIndexResp回复follower的响应结果，同时follower也是会将readStates中最后的一个放到readStateC，通知上游的处理结果；  
300 | 
301 | 上面的linearizableReadLoop监听readStateC，当收到请求，获取当前leader已经提交的日志索引，然后等待直到状态机已应用索引 (applied index) 大于等于 Leader 的已提交索引时 (committed Index)，然后去通知读请求，数据已赶上 Leader，就可以去状态机中访问数据了，处理数据返回给客户端  
302 | 
303 | 我们知道ReadIndex算法中，leader节点需要向follower节点发起心跳，确认自己的leader地位，具体的就是通过ReadOnly来实现,下面会一一介绍到  
304 | 
305 | ##### 如果follower收到只读的消息
306 | 
307 | <img src="/img/etcd-raft-read-follower.png" alt="etcd" align=center/>
308 | 
309 | follower会将消息转发到leader  
310 | 
311 | ```go
312 | // etcd/raft/raft.go
313 | func stepFollower(r *raft, m pb.Message) error {
314 | 	switch m.Type {
315 | 		...
316 | 	case pb.MsgReadIndex:
317 | 		if r.lead == None {
318 | 			r.logger.Infof("%x no leader at term %d; dropping index reading msg", r.id, r.Term)
319 | 			return nil
320 | 		}
321 | 		// 目标为leader
322 | 		m.To = r.lead
323 | 		// 转发信息
324 | 		r.send(m)
325 | 	}
326 | 	...
327 | 	return nil
328 | }
329 | ```
330 | 
331 | 再来看下leader是如何处理的
332 | 
333 | ```go
334 | // etcd/raft/raft.go
335 | func stepLeader(r *raft, m pb.Message) error {
336 | 	// These message types do not require any progress for m.From.
337 | 	switch m.Type {
338 | 	case pb.MsgReadIndex:
339 | 		...
340 | 		sendMsgReadIndexResponse(r, m)
341 | 
342 | 		return nil
343 | 	}
344 | 
345 | 	return nil
346 | }
347 | 
348 | // raft结构体中的readOnly作用是批量处理只读请求，只读请求有两种模式，分别是ReadOnlySafe和ReadOnlyLeaseBased
349 | // ReadOnlySafe是ETCD作者推荐的模式，因为这种模式不受节点之间时钟差异和网络分区的影响
350 | // 线性一致性读用的就是ReadOnlySafe
351 | func sendMsgReadIndexResponse(r *raft, m pb.Message) {
352 | 	switch r.readOnly.option {
353 | 	// If more than the local vote is needed, go through a full broadcast.
354 | 	case ReadOnlySafe:
355 | 		// 清空readOnly中指定消息ID及之前的所有记录
356 | 		// 开启leader向follower的确认机制
357 | 		r.readOnly.addRequest(r.raftLog.committed, m)
358 | 		// recvAck通知只读结构raft状态机已收到对附加只读请求上下文的心跳信号的确认。
359 | 		// 也就是记录下只读的请求
360 | 		r.readOnly.recvAck(r.id, m.Entries[0].Data)
361 | 		// leader 节点向其他节点发起广播
362 | 		r.bcastHeartbeatWithCtx(m.Entries[0].Data)
363 | 	case ReadOnlyLeaseBased:
364 | 		if resp := r.responseToReadIndexReq(m, r.raftLog.committed); resp.To != None {
365 | 			r.send(resp)
366 | 		}
367 | 	}
368 | }
369 | ```
370 | 
371 | 这里省略follower对leader节点的心跳回应，直接看leader对心跳回执的信息处理  
372 | 
373 | ```go
374 | func stepLeader(r *raft, m pb.Message) error {
375 | 	// All other message types require a progress for m.From (pr).
376 | 	pr := r.prs.Progress[m.From]
377 | 	if pr == nil {
378 | 		r.logger.Debugf("%x no progress available for %x", r.id, m.From)
379 | 		return nil
380 | 	}
381 | 	switch m.Type {
382 | 	case pb.MsgHeartbeatResp:
383 | 		...
384 | 		if r.readOnly.option != ReadOnlySafe || len(m.Context) == 0 {
385 | 			return nil
386 | 		}
387 | 
388 | 		// 判断leader有没有收到大多数节点的确认
389 | 		// 也就是ReadIndex算法中，leader节点得到follower的确认，证明自己目前还是Leader
390 | 		if r.prs.Voters.VoteResult(r.readOnly.recvAck(m.From, m.Context)) != quorum.VoteWon {
391 | 			return nil
392 | 		}
393 | 
394 | 		// 收到了响应节点超过半数，会清空readOnly中指定消息ID及之前的所有记录
395 | 		rss := r.readOnly.advance(m)
396 | 		// 返回follower的心跳回执
397 | 		for _, rs := range rss {
398 | 			if resp := r.responseToReadIndexReq(rs.req, rs.index); resp.To != None {
399 | 				r.send(resp)
400 | 			}
401 | 		}
402 | 	}
403 | 	return nil
404 | }
405 | 
406 | // responseToReadIndexReq 为 `req` 构造一个响应。如果`req`来自对等方
407 | // 本身，将返回一个空值。
408 | func (r *raft) responseToReadIndexReq(req pb.Message, readIndex uint64) pb.Message {
409 | 	// 通过from来判断该消息是否是follower节点转发到leader中的
410 | 	...
411 | 	// 如果是其他follower节点转发到leader节点的MsgReadIndex消息
412 | 	// leader会回向follower节点返回响应的MsgReadIndexResp消息，follower会响应给client
413 | 	return pb.Message{
414 | 		Type:    pb.MsgReadIndexResp,
415 | 		To:      req.From,
416 | 		Index:   readIndex,
417 | 		Entries: req.Entries,
418 | 	}
419 | }
420 | ```
421 | 
422 | 然后follower收到响应，将MsgReadIndexResp消息中的已提交位置和消息id封装成ReadState实例，添加到readStates  
423 | 
424 | ```go
425 | func stepFollower(r *raft, m pb.Message) error {
426 | 	switch m.Type {
427 | 	case pb.MsgReadIndexResp:
428 | 		if len(m.Entries) != 1 {
429 | 			r.logger.Errorf("%x invalid format of MsgReadIndexResp from %x, entries count: %d", r.id, m.From, len(m.Entries))
430 | 			return nil
431 | 		}
432 | 		// 将MsgReadIndex消息中的已提交位置和消息id封装成ReadState实例，添加到readStates
433 | 		// raft 模块也有一个 for-loop 的 goroutine，来读取该数组，并对MsgReadIndex进行响应
434 | 		r.readStates = append(r.readStates, ReadState{Index: m.Index, RequestCtx: m.Entries[0].Data})
435 | 	}
436 | 	return nil
437 | }
438 | ```
439 | 
440 | raft 模块有一个for-loop的goroutine，来读取该数组，并对MsgReadIndex进行响应，将ReadStates中的最后一项将写入到readStateC中，通知监听readStateC的linearizableReadLoop函数的结果。  
441 | 
442 | ```go
443 | // etcd/server/etcdserver/raft.go
444 | func (r *raftNode) start(rh *raftReadyHandler) {
445 | 	internalTimeout := time.Second
446 | 
447 | 	go func() {
448 | 		defer r.onStop()
449 | 		islead := false
450 | 
451 | 		for {
452 | 			select {
453 | 			case rd := <-r.Ready():
454 | 				...
455 | 				if len(rd.ReadStates) != 0 {
456 | 					select {
457 | 					// ReadStates中最后一项将会被写入到readStateC中
458 | 					// linearizableReadLoop会监听readStateC，获取MsgReadIndex的处理信息
459 | 					case r.readStateC <- rd.ReadStates[len(rd.ReadStates)-1]:
460 | 					case <-time.After(internalTimeout):
461 | 						r.lg.Warn("timed out sending read state", zap.Duration("timeout", internalTimeout))
462 | 					case <-r.stopped:
463 | 						return
464 | 					}
465 | 				}
466 | 				...
467 | 			}
468 | 		}
469 | 	}()
470 | }
471 | ```
472 | 
473 | ##### 如果leader收到只读请求
474 | 
475 | <img src="/img/etcd-raft-read-leader.png" alt="etcd" align=center/>
476 | 
477 | ```go
478 | func stepLeader(r *raft, m pb.Message) error {
479 | 	// All other message types require a progress for m.From (pr).
480 | 	pr := r.prs.Progress[m.From]
481 | 	if pr == nil {
482 | 		r.logger.Debugf("%x no progress available for %x", r.id, m.From)
483 | 		return nil
484 | 	}
485 | 	switch m.Type {
486 | 	case pb.MsgReadIndex:
487 | 		// 表示当前只有一个节点，当前节点就是leader
488 | 		if r.prs.IsSingleton() {
489 | 			if resp := r.responseToReadIndexReq(m, r.raftLog.committed); resp.To != None {
490 | 				r.send(resp)
491 | 			}
492 | 			return nil
493 | 		}
494 | 		...
495 | 		return nil
496 | 	}
497 | 	return nil
498 | }
499 | 
500 | // responseToReadIndexReq 为 `req` 构造一个响应。如果`req`来自对等方
501 | // 本身，将返回一个空值。
502 | func (r *raft) responseToReadIndexReq(req pb.Message, readIndex uint64) pb.Message {
503 | 	// 通过from来判断该消息是否是follower节点转发到leader中的
504 | 
505 | 	// 如果是客户端直接发到leader节点的消息，将MsgReadIndex消息中的已提交位置和消息id封装成ReadState实例，添加到readStates
506 | 	// raft 模块也有一个 for-loop 的 goroutine，来读取该数组，并对MsgReadIndex进行响应
507 | 	if req.From == None || req.From == r.id {
508 | 		r.readStates = append(r.readStates, ReadState{
509 | 			Index:      readIndex,
510 | 			RequestCtx: req.Entries[0].Data,
511 | 		})
512 | 		return pb.Message{}
513 | 	}
514 | 	...
515 | }
516 | ```
517 | 
518 | 如果当前只有一个节点，那么当前的节点也是leader节点，所有的只读请求，将会发送到leader，leader直接对信息进行处理  
519 | 
520 | 如何从状态机中访问数据了，就需要了解下MVCC了  
521 | 
522 | ### MVCC
523 | 
524 | `Multiversion concurrency control`简称MVCC。这个模块是为了解决 etcd v2 不支持保存 key 的历史版本、不支持多 key 事务等问题而产生的。  
525 | 
526 | 它核心由内存树形索引模块 (treeIndex) 和嵌入式的 KV 持久化存储库 boltdb 组成。  
527 | 
528 | 那么 etcd 如何基于 boltdb 保存一个 key 的多个历史版本呢?  
529 | 
530 | 每次修改操作，生成一个新的版本号 (revision)，以版本号为 key， value 为用户 key-value 等信息组成的结构体。boltdb 的 key 是全局递增的版本号 (revision)，value 是用户 key、value 等字段组合成的结构体，然后通过 treeIndex 模块来保存用户 key 和版本号的映射关系。  
531 | 
532 | treeIndex 与 boltdb 关系如下面的读事务流程图所示，从 treeIndex 中获取 key hello 的版本号，再以版本号作为 boltdb 的 key，从 boltdb 中获取其 value 信息。  
533 | 
534 | <img src="/img/etcd-mvcc.png" alt="etcd" align=center/>
535 | 
536 | #### treeIndex
537 | 
538 | treeIndex 模块是基于 Google 开源的内存版 btree 库实现的
539 | 
540 | treeIndex 模块只会保存用户的 key 和相关版本号信息，用户 key 的 value 数据存储在 boltdb 里面，相比 ZooKeeper 和 etcd v2 全内存存储，etcd v3 对内存要求更低。  
541 | 
542 | #### buffer  
543 | 
544 | 在获取到版本号信息后，就可从 boltdb 模块中获取用户的 key-value 数据了。不过有一点你要注意，并不是所有请求都一定要从 boltdb 获取数据。etcd 出于数据一致性、性能等考虑，在访问 boltdb 前，首先会从一个内存读事务 buffer 中，二分查找你要访问 key 是否在 buffer 里面，若命中则直接返回。  
545 | 
546 | #### boltdb
547 | 
548 | boltdb 使用 B+ tree 来组织用户的 key-value 数据，获取 bucket key 对象后，通过 boltdb 的游标 Cursor 可快速在 B+ tree 找到 key hello 对应的 value 数据，返回给 client。  
549 | 
550 | ### 总结
551 | 
552 | etcd中对于写的请求，因为所有的写请求都是通过leader的，leader的确认机制将会保证消息复制到大多数节点中；  
553 | 
554 | 对于只读的请求，同样也是需要全部转发到leader节点中，通过ReadIndex算法，来实现线性一致性读；  
555 | 
556 | raft执行ReadIndex大致的流程如下：
557 | 
558 | - 1、记录当前的commit index，称为ReadIndex；  
559 | 
560 | - 2、向Follower发起一次心跳，如果大多数节点回复了，那就能确定现在仍然是Leader；  
561 | 
562 | - 3、等待状态机的apply index大于或等于committed index时才读取数据；    
563 | 
564 | - 4、执行读请求，将结果返回给Client。 
565 | 
566 | 关于状态机数据的读取，首先从treeIndex获取版本号，然后在buffer中查找是否有对应的值，没有就去boltdb查询对应的值  
567 | 
568 | ### 参考
569 | 
570 | 【CAP定理】https://zh.wikipedia.org/wiki/CAP%E5%AE%9A%E7%90%86  
571 | 【CAP定理】https://www.ibm.com/cn-zh/cloud/learn/cap-theorem  
572 | 【线性一致性：什么是线性一致性？】https://zhuanlan.zhihu.com/p/42239873    
573 | 【什么是数据一致性】https://github.com/javagrowing/JGrowing/blob/master/%E5%88%86%E5%B8%83%E5%BC%8F/%E8%B0%88%E8%B0%88%E6%95%B0%E6%8D%AE%E4%B8%80%E8%87%B4%E6%80%A7.md    
574 | 【etcd 中线性一致性读的具体实现】https://zhengyinyong.com/post/etcd-linearizable-read-implementation/  
575 | 
576 | 
577 | 


--------------------------------------------------------------------------------
/8-etcd中的lease.md:
--------------------------------------------------------------------------------
  1 | <!-- START doctoc generated TOC please keep comment here to allow auto update -->
  2 | <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
  3 | 
  4 | - [etcd中的Lease](#etcd%E4%B8%AD%E7%9A%84lease)
  5 |   - [前言](#%E5%89%8D%E8%A8%80)
  6 |   - [Lease](#lease)
  7 |   - [Lease 整体架构](#lease-%E6%95%B4%E4%BD%93%E6%9E%B6%E6%9E%84)
  8 |   - [key 如何关联 Lease](#key-%E5%A6%82%E4%BD%95%E5%85%B3%E8%81%94-lease)
  9 |   - [Lease的续期](#lease%E7%9A%84%E7%BB%AD%E6%9C%9F)
 10 |   - [过期 Lease 的删除](#%E8%BF%87%E6%9C%9F-lease-%E7%9A%84%E5%88%A0%E9%99%A4)
 11 |   - [checkpoint 机制](#checkpoint-%E6%9C%BA%E5%88%B6)
 12 |   - [总结](#%E6%80%BB%E7%BB%93)
 13 |   - [参考](#%E5%8F%82%E8%80%83)
 14 | 
 15 | <!-- END doctoc generated TOC please keep comment here to allow auto update -->
 16 | 
 17 | ## etcd中的Lease
 18 | 
 19 | ### 前言
 20 | 
 21 | 之前我们了解过[grpc使用etcd做服务发现](https://www.cnblogs.com/ricklz/p/15059497.html)  
 22 | 
 23 | 之前的服务发现我们使用了 Lease，每次注册一个服务分配一个租约，通过 Lease 自动上报模式，实现了一种活性检测机制，保证了故障机器的及时剔除。这次我们来详细地学习 Lease 租约的实现。    
 24 | 
 25 | ### Lease
 26 | 
 27 | ### Lease 整体架构
 28 | 
 29 | 这里放一个来自【etcd实战课程】的一张图片  
 30 | 
 31 | <img src="/img/etcd-lease.png" alt="grpc" align=center/>
 32 | 
 33 | 来看下服务端中Lease中的几个主要函数  
 34 | 
 35 | ```go
 36 | // etcd/server/lease/lessor.go
 37 | // Lessor owns leases. It can grant, revoke, renew and modify leases for lessee.
 38 | type Lessor interface {
 39 |     ...
 40 |     // Grant 表示创建一个 TTL 为你指定秒数的 Lease
 41 |     Grant(id LeaseID, ttl int64) (*Lease, error)
 42 |     // Revoke 撤销具有给定 ID 的租约
 43 |     Revoke(id LeaseID) error
 44 |     
 45 |     // Attach 将给定的 LeaseItem（key）附加到具有给定 LeaseID 的租约上。
 46 |     Attach(id LeaseID, items []LeaseItem) error
 47 |     
 48 |     // Renew 使用给定的 ID 续订租约。它返回更新后的 TTL
 49 |     Renew(id LeaseID) (int64, error)
 50 |     ...
 51 | }
 52 | ```
 53 | 
 54 | 同时对于客户端 Lease 也提供了下面几个API    
 55 | 
 56 | ```go
 57 | // etcd/client/v3/lease.go
 58 | type Lease interface {
 59 | 	// Grant 表示创建一个 TTL 为你指定秒数的 Lease，server 端的 Lessor 会将 Lease 信息持久化存储在 boltdb 中；
 60 | 	Grant(ctx context.Context, ttl int64) (*LeaseGrantResponse, error)
 61 | 
 62 | 	// 表示撤销 Lease 并删除其关联的数据；
 63 | 	Revoke(ctx context.Context, id LeaseID) (*LeaseRevokeResponse, error)
 64 | 
 65 | 	// 表示获取一个 Lease 的有效期、剩余时间；
 66 | 	TimeToLive(ctx context.Context, id LeaseID, opts ...LeaseOption) (*LeaseTimeToLiveResponse, error)
 67 | 
 68 | 	// Leases retrieves all leases.
 69 | 	Leases(ctx context.Context) (*LeaseLeasesResponse, error)
 70 | 
 71 | 	// 表示为 Lease 续期
 72 | 	KeepAlive(ctx context.Context, id LeaseID) (<-chan *LeaseKeepAliveResponse, error)
 73 | 
 74 | 	// KeepAliveOnce 只为 Lease 续期一次
 75 | 	KeepAliveOnce(ctx context.Context, id LeaseID) (*LeaseKeepAliveResponse, error)
 76 | 
 77 | 	// Close releases all resources Lease keeps for efficient communication
 78 | 	// with the etcd server.
 79 | 	Close() error
 80 | }
 81 | ```
 82 | 
 83 | 服务端在启动 Lessor 模块的时候，会启动一个 goroutine `runLoop()`，在其中循环执行 `revokeExpiredLeases()` 和 `checkpointScheduledLeases()` 两个任务。  
 84 | 
 85 | - revokeExpiredLeases: 定时检查是否有过期 Lease，发起撤销过期的 Lease 操作;  
 86 | 
 87 | - checkpointScheduledLeases: 定时触发更新 Lease 的剩余到期时间的操作;  
 88 | 
 89 | ```go
 90 | func newLessor(lg *zap.Logger, b backend.Backend, cfg LessorConfig) *lessor {
 91 | 	...
 92 | 	l := &lessor{
 93 | 		...
 94 | 	}
 95 | 	l.initAndRecover()
 96 | 
 97 | 	go l.runLoop()
 98 | 
 99 | 	return l
100 | }
101 | 
102 | func (le *lessor) runLoop() {
103 | 	defer close(le.doneC)
104 | 
105 | 	for {
106 | 		le.revokeExpiredLeases()
107 | 		le.checkpointScheduledLeases()
108 | 
109 | 		select {
110 | 		case <-time.After(500 * time.Millisecond):
111 | 		case <-le.stopC:
112 | 			return
113 | 		}
114 | 	}
115 | }
116 | 
117 | // revokeExpiredLeases 找到所有过期的租约，并将它们发送到 expiredC 通道中等待被撤销
118 | func (le *lessor) revokeExpiredLeases() {
119 | 	var ls []*Lease
120 | 
121 | 	// rate limit
122 | 	revokeLimit := leaseRevokeRate / 2
123 | 
124 | 	le.mu.RLock()
125 | 	if le.isPrimary() {
126 |         // 在leaseExpiredNotifier中的小顶堆中删除过期的lease
127 | 		ls = le.findExpiredLeases(revokeLimit)
128 | 	}
129 | 	le.mu.RUnlock()
130 | 
131 | 	if len(ls) != 0 {
132 | 		select {
133 | 		case <-le.stopC:
134 | 			return
135 | 		case le.expiredC <- ls:
136 | 		default:
137 | 			// the receiver of expiredC is probably busy handling
138 | 			// other stuff
139 | 			// let's try this next time after 500ms
140 | 		}
141 | 	}
142 | }
143 | 
144 | // checkpointScheduledLeases 查找所有已到期的预定租约检查点，并将它们提交给 checkpointer，以便持久化到共识日志中。
145 | func (le *lessor) checkpointScheduledLeases() {
146 | 	var cps []*pb.LeaseCheckpoint
147 | 
148 | 	// rate limit
149 | 	for i := 0; i < leaseCheckpointRate/2; i++ {
150 | 		le.mu.Lock()
151 | 		if le.isPrimary() {
152 | 			cps = le.findDueScheduledCheckpoints(maxLeaseCheckpointBatchSize)
153 | 		}
154 | 		le.mu.Unlock()
155 | 
156 | 		if len(cps) != 0 {
157 | 			le.cp(context.Background(), &pb.LeaseCheckpointRequest{Checkpoints: cps})
158 | 		}
159 | 		if len(cps) < maxLeaseCheckpointBatchSize {
160 | 			return
161 | 		}
162 | 	}
163 | }
164 | ```
165 | 
166 | 我们可以看到对于`revokeExpiredLeases()` 和 `checkpointScheduledLeases()` 的操作，定时是500毫秒处理一次，直到收到退出的信息。    
167 | 
168 | ### key 如何关联 Lease  
169 | 
170 | 然后我们来分析下一个基于 Lease 特性实现检测一个节点存活的过程   
171 | 
172 | 客户端通过 Grant 创建一个 TTL 时间的 Lease   
173 | 
174 | ```go
175 | // etcd/client/v3/lease.go
176 | func (l *lessor) Grant(ctx context.Context, ttl int64) (*LeaseGrantResponse, error) {
177 | 	r := &pb.LeaseGrantRequest{TTL: ttl}
178 |     // 通过grpc调用服务端的创建函数
179 | 	resp, err := l.remote.LeaseGrant(ctx, r, l.callOpts...)
180 | 	if err == nil {
181 | 		gresp := &LeaseGrantResponse{
182 | 			ResponseHeader: resp.GetHeader(),
183 | 			ID:             LeaseID(resp.ID),
184 | 			TTL:            resp.TTL,
185 | 			Error:          resp.Error,
186 | 		}
187 | 		return gresp, nil
188 | 	}
189 | 	return nil, toErr(ctx, err)
190 | }
191 | ```
192 | 
193 | 客户端创建的时候会通过 LeaseGrant 也就是grpc调用服务端的 Grant 的创建函数  
194 | 
195 | 来看下服务端的 Grant  
196 | 
197 | ```go
198 | // etcd/server/lease/lessor.go
199 | func (le *lessor) Grant(id LeaseID, ttl int64) (*Lease, error) {
200 | 	...
201 | 	// TODO: when lessor is under high load, it should give out lease
202 | 	// with longer TTL to reduce renew load.
203 | 	l := &Lease{
204 | 		ID:      id,
205 | 		ttl:     ttl,
206 | 		itemSet: make(map[LeaseItem]struct{}),
207 | 		revokec: make(chan struct{}),
208 | 	}
209 | 
210 | 	le.mu.Lock()
211 | 	defer le.mu.Unlock()
212 | 
213 | 	// 检查内存leaseMap是否存在这个lease
214 | 	if _, ok := le.leaseMap[id]; ok {
215 | 		return nil, ErrLeaseExists
216 | 	}
217 | 
218 | 	// 这里有个ttl的最小值
219 | 	if l.ttl < le.minLeaseTTL {
220 | 		l.ttl = le.minLeaseTTL
221 | 	}
222 | 
223 | 	if le.isPrimary() {
224 | 		l.refresh(0)
225 | 	} else {
226 | 		l.forever()
227 | 	}
228 | 
229 | 	le.leaseMap[id] = l
230 |     // 将 Lease 数据保存到 boltdb 的 Lease bucket 中
231 | 	l.persistTo(le.b)
232 | 
233 | 	...
234 | 	return l, nil
235 | }
236 | ```
237 | 
238 | 首先 Lessor 的 Grant 接口会把 Lease 保存到内存的 leaseMap 数据结构中，然后将数据保存到 boltdb 的 Lease bucket 中，最后返回给客户端 leaseId   
239 | 
240 | 当然 Grant 只是申请了一个 Lease，将 key 和 Lease 进行关联的操作是在 Attach 中完成的  
241 | 
242 | ```go
243 | // Attach 将给定的 LeaseItem（key）附加到具有给定 LeaseID 的租约上。
244 | func (le *lessor) Attach(id LeaseID, items []LeaseItem) error {
245 | 	le.mu.Lock()
246 | 	defer le.mu.Unlock()
247 | 
248 | 	l := le.leaseMap[id]
249 | 	if l == nil {
250 | 		return ErrLeaseNotFound
251 | 	}
252 | 
253 | 	l.mu.Lock()
254 | 	// 将租约放到itemMap
255 | 	// 一个租约是可以关联多个key的
256 | 	for _, it := range items {
257 | 		l.itemSet[it] = struct{}{}
258 | 		le.itemMap[it] = id
259 | 	}
260 | 	l.mu.Unlock()
261 | 	return nil
262 | }
263 | ```
264 | 
265 | 一个 Lease 关联的 key 集合是保存在内存中的，那么 etcd 重启时，是如何知道每个 Lease 上关联了哪些 key 呢?  
266 | 
267 | 答案是 etcd 的 MVCC 模块在持久化存储 key-value 的时候，保存到 boltdb 的 value 是个结构体（mvccpb.KeyValue）， 它不仅包含你的 key-value 数据，还包含了关联的 LeaseID 等信息。因此当 etcd 重启时，可根据此信息，重建关联各个 Lease 的 key 集合列表。  
268 | 
269 | ```go
270 | func (le *lessor) initAndRecover() {
271 | 	tx := le.b.BatchTx()
272 | 	tx.Lock()
273 | 
274 | 	tx.UnsafeCreateBucket(buckets.Lease)
275 | 	_, vs := tx.UnsafeRange(buckets.Lease, int64ToBytes(0), int64ToBytes(math.MaxInt64), 0)
276 | 	// TODO: copy vs and do decoding outside tx lock if lock contention becomes an issue.
277 | 	for i := range vs {
278 | 		var lpb leasepb.Lease
279 | 		err := lpb.Unmarshal(vs[i])
280 | 		if err != nil {
281 | 			tx.Unlock()
282 | 			panic("failed to unmarshal lease proto item")
283 | 		}
284 | 		ID := LeaseID(lpb.ID)
285 | 		if lpb.TTL < le.minLeaseTTL {
286 | 			lpb.TTL = le.minLeaseTTL
287 | 		}
288 | 		le.leaseMap[ID] = &Lease{
289 | 			ID:  ID,
290 | 			ttl: lpb.TTL,
291 | 			// itemSet will be filled in when recover key-value pairs
292 | 			// set expiry to forever, refresh when promoted
293 | 			itemSet: make(map[LeaseItem]struct{}),
294 | 			expiry:  forever,
295 | 			revokec: make(chan struct{}),
296 | 		}
297 | 	}
298 | 	le.leaseExpiredNotifier.Init()
299 | 	heap.Init(&le.leaseCheckpointHeap)
300 | 	tx.Unlock()
301 | 
302 | 	le.b.ForceCommit()
303 | }
304 | ```
305 | 
306 | ### Lease的续期  
307 | 
308 | 客户端通过定期向 etcd 发送 KeepAlive 请求，为处于健康状态的 Lease 续期   
309 | 
310 | ```go
311 | // etcd/client/v3/lease.go
312 | // KeepAlive尝试保持给定的租约永久alive
313 | func (l *lessor) KeepAlive(ctx context.Context, id LeaseID) (<-chan *LeaseKeepAliveResponse, error) {
314 | 	ch := make(chan *LeaseKeepAliveResponse, LeaseResponseChSize)
315 | 
316 | 	l.mu.Lock()
317 | 	// ensure that recvKeepAliveLoop is still running
318 | 	select {
319 | 	case <-l.donec:
320 | 		err := l.loopErr
321 | 		l.mu.Unlock()
322 | 		close(ch)
323 | 		return ch, ErrKeepAliveHalted{Reason: err}
324 | 	default:
325 | 	}
326 | 	ka, ok := l.keepAlives[id]
327 | 	if !ok {
328 | 		// create fresh keep alive
329 | 		ka = &keepAlive{
330 | 			chs:           []chan<- *LeaseKeepAliveResponse{ch},
331 | 			ctxs:          []context.Context{ctx},
332 | 			deadline:      time.Now().Add(l.firstKeepAliveTimeout),
333 | 			nextKeepAlive: time.Now(),
334 | 			donec:         make(chan struct{}),
335 | 		}
336 | 		l.keepAlives[id] = ka
337 | 	} else {
338 | 		// add channel and context to existing keep alive
339 | 		ka.ctxs = append(ka.ctxs, ctx)
340 | 		ka.chs = append(ka.chs, ch)
341 | 	}
342 | 	l.mu.Unlock()
343 | 
344 | 	go l.keepAliveCtxCloser(ctx, id, ka.donec)
345 | 	// 使用once只在第一次调用
346 | 	l.firstKeepAliveOnce.Do(func() {
347 | 		// 500毫秒一次，不断的发送保持活动请求
348 | 		go l.recvKeepAliveLoop()
349 | 		// 删除等待太久没反馈的租约
350 | 		go l.deadlineLoop()
351 | 	})
352 | 
353 | 	return ch, nil
354 | }
355 | 
356 | // deadlineLoop获取在租约TTL中没有收到响应的任何保持活动的通道
357 | func (l *lessor) deadlineLoop() {
358 | 	for {
359 | 		select {
360 | 		case <-time.After(time.Second):
361 | 			// donec 关闭，当 recvKeepAliveLoop 停止时设置 loopErr
362 | 		case <-l.donec:
363 | 			return
364 | 		}
365 | 		now := time.Now()
366 | 		l.mu.Lock()
367 | 		for id, ka := range l.keepAlives {
368 | 			if ka.deadline.Before(now) {
369 | 				// 等待响应太久；租约可能已过期
370 | 				ka.close()
371 | 				delete(l.keepAlives, id)
372 | 			}
373 | 		}
374 | 		l.mu.Unlock()
375 | 	}
376 | }
377 | 
378 | func (l *lessor) recvKeepAliveLoop() (gerr error) {
379 | 	defer func() {
380 | 		l.mu.Lock()
381 | 		close(l.donec)
382 | 		l.loopErr = gerr
383 | 		for _, ka := range l.keepAlives {
384 | 			ka.close()
385 | 		}
386 | 		l.keepAlives = make(map[LeaseID]*keepAlive)
387 | 		l.mu.Unlock()
388 | 	}()
389 | 
390 | 	for {
391 | 		// resetRecv 打开一个新的lease stream并开始发送保持活动请求。
392 | 		stream, err := l.resetRecv()
393 | 		if err != nil {
394 | 			if canceledByCaller(l.stopCtx, err) {
395 | 				return err
396 | 			}
397 | 		} else {
398 | 			for {
399 | 				// 接收lease stream的返回
400 | 				resp, err := stream.Recv()
401 | 				if err != nil {
402 | 					if canceledByCaller(l.stopCtx, err) {
403 | 						return err
404 | 					}
405 | 
406 | 					if toErr(l.stopCtx, err) == rpctypes.ErrNoLeader {
407 | 						l.closeRequireLeader()
408 | 					}
409 | 					break
410 | 				}
411 | 				// 根据LeaseKeepAliveResponse更新租约
412 | 				// 如果租约过期删除所有alive channels
413 | 				l.recvKeepAlive(resp)
414 | 			}
415 | 		}
416 | 
417 | 		select {
418 | 		case <-time.After(retryConnWait):
419 | 			continue
420 | 		case <-l.stopCtx.Done():
421 | 			return l.stopCtx.Err()
422 | 		}
423 | 	}
424 | }
425 | 
426 | // resetRecv 打开一个新的lease stream并开始发送保持活动请求。
427 | func (l *lessor) resetRecv() (pb.Lease_LeaseKeepAliveClient, error) {
428 | 	sctx, cancel := context.WithCancel(l.stopCtx)
429 | 	// 建立服务端和客户端连接的lease stream
430 | 	stream, err := l.remote.LeaseKeepAlive(sctx, l.callOpts...)
431 | 	if err != nil {
432 | 		cancel()
433 | 		return nil, err
434 | 	}
435 | 
436 | 	l.mu.Lock()
437 | 	defer l.mu.Unlock()
438 | 	if l.stream != nil && l.streamCancel != nil {
439 | 		l.streamCancel()
440 | 	}
441 | 
442 | 	l.streamCancel = cancel
443 | 	l.stream = stream
444 | 
445 | 	go l.sendKeepAliveLoop(stream)
446 | 	return stream, nil
447 | }
448 | 
449 | // sendKeepAliveLoop 在给定流的生命周期内发送保持活动请求
450 | func (l *lessor) sendKeepAliveLoop(stream pb.Lease_LeaseKeepAliveClient) {
451 | 	for {
452 | 		var tosend []LeaseID
453 | 
454 | 		now := time.Now()
455 | 		l.mu.Lock()
456 | 		for id, ka := range l.keepAlives {
457 | 			if ka.nextKeepAlive.Before(now) {
458 | 				tosend = append(tosend, id)
459 | 			}
460 | 		}
461 | 		l.mu.Unlock()
462 | 
463 | 		for _, id := range tosend {
464 | 			r := &pb.LeaseKeepAliveRequest{ID: int64(id)}
465 | 			if err := stream.Send(r); err != nil {
466 | 				// TODO do something with this error?
467 | 				return
468 | 			}
469 | 		}
470 | 
471 | 		select {
472 | 		// 每500毫秒执行一次
473 | 		case <-time.After(500 * time.Millisecond):
474 | 		case <-stream.Context().Done():
475 | 			return
476 | 		case <-l.donec:
477 | 			return
478 | 		case <-l.stopCtx.Done():
479 | 			return
480 | 		}
481 | 	}
482 | }
483 | ```
484 | 
485 | 关于续期的性能优化    
486 | 
487 | 对于 TTL 的选择，TTL 过长会导致节点异常后，无法及时从 etcd 中删除，影响服务可用性，而过短，则要求 client 频繁发送续期请求。  
488 | 
489 | etcd v3 通过复用 lease 和引入 gRPC，提高了续期的效率  
490 | 
491 | 1、etcd v3 版本引入了 lease,上面的代码我们也可以看到，不同 key 若 TTL 相同，可复用同一个 Lease， 显著减少了 Lease 数。    
492 | 
493 | 2、同时 etcd v3 版本引入了 gRPC 。通过 gRPC HTTP/2 实现了多路复用，流式传输，同一连接可支持为多个 Lease 续期，能够大大减少连接数，提高续期的效率。    
494 | 
495 | ### 过期 Lease 的删除
496 | 
497 | 上面的代码我们介绍了 etcd 在启动 lease 的时候会在后台 goroutine 中执行 revokeExpiredLeases()，它会每500毫秒执行一次清除操作。  
498 | 
499 | ```go
500 | func (le *lessor) runLoop() {
501 | 	defer close(le.doneC)
502 | 
503 | 	for {
504 | 		// 函数最终调用expireExists来完成清除操作
505 | 		le.revokeExpiredLeases()
506 | 		le.checkpointScheduledLeases()
507 | 
508 | 		select {
509 | 		case <-time.After(500 * time.Millisecond):
510 | 		case <-le.stopC:
511 | 			return
512 | 		}
513 | 	}
514 | }
515 | 
516 | // expireExists returns true if expiry items exist.
517 | // It pops only when expiry item exists.
518 | // "next" is true, to indicate that it may exist in next attempt.
519 | func (le *lessor) expireExists() (l *Lease, ok bool, next bool) {
520 | 	if le.leaseExpiredNotifier.Len() == 0 {
521 | 		return nil, false, false
522 | 	}
523 | 
524 | 	item := le.leaseExpiredNotifier.Poll()
525 | 	l = le.leaseMap[item.id]
526 | 	if l == nil {
527 | 		// lease has expired or been revoked
528 | 		// no need to revoke (nothing is expiry)
529 | 		le.leaseExpiredNotifier.Unregister() // O(log N)
530 | 		return nil, false, true
531 | 	}
532 | 	now := time.Now()
533 | 	if now.Before(item.time) /* item.time: expiration time */ {
534 | 		// Candidate expirations are caught up, reinsert this item
535 | 		// and no need to revoke (nothing is expiry)
536 | 		return l, false, false
537 | 	}
538 | 
539 | 	// recheck if revoke is complete after retry interval
540 | 	item.time = now.Add(le.expiredLeaseRetryInterval)
541 | 	le.leaseExpiredNotifier.RegisterOrUpdate(item)
542 | 	return l, true, false
543 | }
544 | ```
545 | 
546 | etcd Lease 的高效淘汰方案是基于最小堆实现的，每次新增 Lease、续期的时候，它会插入、更新一个对象到最小堆中，对象含有 LeaseID 和其到期时间 unixnano，对象之间按到期时间升序排序。  
547 | 
548 | etcd Lessor 主循环每隔 500ms 执行一次撤销 Lease 检查（RevokeExpiredLease），每次轮询堆顶的元素，若已过期则加入到待淘汰列表，直到堆顶的 Lease 过期时间大于当前，则结束本轮轮询。  
549 | 
550 | Lessor 模块会将已确认过期的 LeaseID，保存在一个名为 expiredC 的 channel 中，而 etcd server 的主循环会定期从 channel 中获取 LeaseID，发起 revoke 请求，通过 Raft Log 传递给 Follower 节点。  
551 | 
552 | 各个节点收到 revoke Lease 请求后，获取关联到此 Lease 上的 key 列表，从 boltdb 中删除 key，从 Lessor 的 Lease map 内存中删除此 Lease 对象，最后还需要从 boltdb 的 Lease bucket 中删除这个 Lease。  
553 | 
554 | ```go
555 | // revokeExpiredLeases finds all leases past their expiry and sends them to expired channel for
556 | // to be revoked.
557 | func (le *lessor) revokeExpiredLeases() {
558 | 	var ls []*Lease
559 | 
560 | 	// rate limit
561 | 	revokeLimit := leaseRevokeRate / 2
562 | 
563 | 	le.mu.RLock()
564 | 	if le.isPrimary() {
565 | 		ls = le.findExpiredLeases(revokeLimit)
566 | 	}
567 | 	le.mu.RUnlock()
568 | 
569 | 	if len(ls) != 0 {
570 | 		select {
571 | 		case <-le.stopC:
572 | 			return
573 | 			// 已经过期的lease会被放入到expiredC中，然后被上游进行处理
574 | 		case le.expiredC <- ls:
575 | 		default:
576 | 			// the receiver of expiredC is probably busy handling
577 | 			// other stuff
578 | 			// let's try this next time after 500ms
579 | 		}
580 | 	}
581 | }
582 | ```
583 | 
584 | ### checkpoint 机制
585 | 
586 | 对于 lease 的处理都是发生在 leader 节点，如果leader节点挂掉了呢？我们知道会重新发起选举选出新的 leader，那么问题就来了  
587 | 
588 | 当你的集群发生 Leader 切换后，新的 Leader 基于 Lease map 信息，按 Lease 过期时间构建一个最小堆时，etcd 早期版本为了优化性能，并未持久化存储 Lease 剩余 TTL 信息，因此重建的时候就会给所有 Lease 自动续期了。  
589 | 
590 | 然而若较频繁出现 Leader 切换，切换时间小于 Lease 的 TTL，这会导致 Lease 永远无法删除，大量 key 堆积，db 大小超过配额等异常。  
591 | 
592 | 为了解决这个问题，所以引入了 checkpoint 机制  
593 | 
594 | 一方面，etcd 启动的时候，Leader 节点后台会运行此异步任务，定期批量地将 Lease 剩余的 TTL 基于 Raft Log 同步给 Follower 节点，Follower 节点收到 CheckPoint 请求后，更新内存数据结构 LeaseMap 的剩余 TTL 信息。  
595 | 
596 | 另一方面，当 Leader 节点收到 KeepAlive 请求的时候，它也会通过 checkpoint 机制把此 Lease 的剩余 TTL 重置，并同步给 Follower 节点，尽量确保续期后集群各个节点的 Lease 剩余 TTL 一致性。  
597 | 
598 | ### 总结  
599 | 
600 | 对于 TTL 的选择，TTL 过长会导致节点异常后，无法及时从 etcd 中删除，影响服务可用性，而过短，则要求 client 频繁发送续期请求。  
601 | 
602 | etcd v3 通过复用 lease 和引入 gRPC，提高了续期的效率  
603 | 
604 | 1、etcd v3 版本引入了 lease,上面的代码我们也可以看到，不同 key 若 TTL 相同，可复用同一个 Lease， 显著减少了 Lease 数。    
605 | 
606 | 2、同时 etcd v3 版本引入了 gRPC 。通过 gRPC HTTP/2 实现了多路复用，流式传输，同一连接可支持为多个 Lease 续期，能够大大减少连接数，提高续期的效率。    
607 | 
608 | Lease 中过期的删除，使用的结构是最小堆，主循环每隔 500ms 执行一次撤销 Lease 检查（RevokeExpiredLease），每次轮询堆顶的元素，若已过期则加入到待淘汰列表，直到堆顶的 Lease 过期时间大于当前，则结束本轮轮询。  
609 | 
610 | 如果 leader 节点发生频繁切换，且切换间隔小于 Lease 的 TTL，会导致 Lease 永远无法删除，出现大量 key 堆积、db 大小超过配额等异常。为了解决这个问题，etcd 引入了 checkpoint 机制。  
611 | 
612 | ### 参考  
613 | 
614 | 【Load Balancing in gRPC】https://github.com/grpc/grpc/blob/master/doc/load-balancing.md  
615 | 【文中的代码示例】https://github.com/boilingfrog/etcd-learning/tree/main/discovery    
616 | 【06 | 租约：如何检测你的客户端存活？】https://time.geekbang.org/column/article/339337  


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## etcd-learning
 2 | 
 3 | 
 4 | [etcd学习(1)-etcd的使用场景](https://www.cnblogs.com/ricklz/p/15033193.html)
 5 | 
 6 | [etcd学习(2)-etcd中的watch源码阅读](https://www.cnblogs.com/ricklz/p/15037925.html)  
 7 | 
 8 | [etcd学习(3)-grpc使用etcd做服务发现](https://www.cnblogs.com/ricklz/p/15059497.html)  
 9 | 
10 | [etcd学习(4)-centos7中部署etcd](https://www.cnblogs.com/ricklz/p/15074924.html)    
11 | 
12 | [etcd学习(5)-etcd的Raft一致性算法原理](https://www.cnblogs.com/ricklz/p/15094389.html)    
13 | 
14 | [etcd学习(6)-etcd实现raft源码解读](https://www.cnblogs.com/ricklz/p/15155095.html)     
15 | 
16 | [etcd学习(7)-etcd中如何实现线性一致性](https://www.cnblogs.com/ricklz/p/15204381.html)    
17 | 
18 | [etcd学习(8)-etcd中的lease](https://www.cnblogs.com/ricklz/p/15232204.html)     
19 | 
20 | [etcd学习(9)-etcd中的存储实现](https://www.cnblogs.com/ricklz/p/15253404.html)     
21 | 
22 | [etcd学习(10)-etcd对比Consul和zooKeeper如何选型](https://www.cnblogs.com/ricklz/p/15292306.html)     
23 | 
24 | 
25 |  
26 | 
27 |  
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/discovery/README.md:
--------------------------------------------------------------------------------
  1 | <!-- START doctoc generated TOC please keep comment here to allow auto update -->
  2 | <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
  3 | 
  4 | - [grpc通过etcd实现服务发现](#grpc%E9%80%9A%E8%BF%87etcd%E5%AE%9E%E7%8E%B0%E6%9C%8D%E5%8A%A1%E5%8F%91%E7%8E%B0)
  5 |   - [前言](#%E5%89%8D%E8%A8%80)
  6 |   - [服务注册](#%E6%9C%8D%E5%8A%A1%E6%B3%A8%E5%86%8C)
  7 |   - [服务发现](#%E6%9C%8D%E5%8A%A1%E5%8F%91%E7%8E%B0)
  8 |   - [负载均衡](#%E8%B4%9F%E8%BD%BD%E5%9D%87%E8%A1%A1)
  9 |     - [集中式LB（Proxy Model）](#%E9%9B%86%E4%B8%AD%E5%BC%8Flbproxy-model)
 10 |     - [进程内LB（Balancing-aware Client）](#%E8%BF%9B%E7%A8%8B%E5%86%85lbbalancing-aware-client)
 11 |     - [独立 LB 进程（External Load Balancing Service）](#%E7%8B%AC%E7%AB%8B-lb-%E8%BF%9B%E7%A8%8Bexternal-load-balancing-service)
 12 |   - [参考](#%E5%8F%82%E8%80%83)
 13 | 
 14 | <!-- END doctoc generated TOC please keep comment here to allow auto update -->
 15 | 
 16 | ## grpc通过etcd实现服务发现
 17 | 
 18 | ### 前言
 19 | 
 20 | 项目中使用etcd实现了grpc的服务注册和服务发现，这里来看下如何实现的服务注册和服务发现  
 21 | 
 22 | 先来看下使用的demo，demo中的代码[discovery](https://github.com/boilingfrog/etcd-learning/tree/main/discovery)
 23 | 
 24 | ### 服务注册  
 25 | 
 26 | ```go
 27 | package discovery
 28 | 
 29 | import (
 30 | 	"context"
 31 | 	"encoding/json"
 32 | 	"errors"
 33 | 	"net/http"
 34 | 	"strconv"
 35 | 	"strings"
 36 | 	"time"
 37 | 
 38 | 	clientv3 "go.etcd.io/etcd/client/v3"
 39 | 	"go.uber.org/zap"
 40 | )
 41 | 
 42 | // Register for grpc server
 43 | type Register struct {
 44 | 	EtcdAddrs   []string
 45 | 	DialTimeout int
 46 | 
 47 | 	closeCh     chan struct{}
 48 | 	leasesID    clientv3.LeaseID
 49 | 	keepAliveCh <-chan *clientv3.LeaseKeepAliveResponse
 50 | 
 51 | 	srvInfo Server
 52 | 	srvTTL  int64
 53 | 	cli     *clientv3.Client
 54 | 	logger  *zap.Logger
 55 | }
 56 | 
 57 | // NewRegister create a register base on etcd
 58 | func NewRegister(etcdAddrs []string, logger *zap.Logger) *Register {
 59 | 	return &Register{
 60 | 		EtcdAddrs:   etcdAddrs,
 61 | 		DialTimeout: 3,
 62 | 		logger:      logger,
 63 | 	}
 64 | }
 65 | 
 66 | // Register a service
 67 | func (r *Register) Register(srvInfo Server, ttl int64) (chan<- struct{}, error) {
 68 | 	var err error
 69 | 
 70 | 	if strings.Split(srvInfo.Addr, ":")[0] == "" {
 71 | 		return nil, errors.New("invalid ip")
 72 | 	}
 73 | 
 74 | 	if r.cli, err = clientv3.New(clientv3.Config{
 75 | 		Endpoints:   r.EtcdAddrs,
 76 | 		DialTimeout: time.Duration(r.DialTimeout) * time.Second,
 77 | 	}); err != nil {
 78 | 		return nil, err
 79 | 	}
 80 | 
 81 | 	r.srvInfo = srvInfo
 82 | 	r.srvTTL = ttl
 83 | 
 84 | 	if err = r.register(); err != nil {
 85 | 		return nil, err
 86 | 	}
 87 | 
 88 | 	r.closeCh = make(chan struct{})
 89 | 
 90 | 	go r.keepAlive()
 91 | 
 92 | 	return r.closeCh, nil
 93 | }
 94 | 
 95 | // Stop stop register
 96 | func (r *Register) Stop() {
 97 | 	r.closeCh <- struct{}{}
 98 | }
 99 | 
100 | // register 注册节点
101 | func (r *Register) register() error {
102 | 	leaseCtx, cancel := context.WithTimeout(context.Background(), time.Duration(r.DialTimeout)*time.Second)
103 | 	defer cancel()
104 | 
105 | 	leaseResp, err := r.cli.Grant(leaseCtx, r.srvTTL)
106 | 	if err != nil {
107 | 		return err
108 | 	}
109 | 	r.leasesID = leaseResp.ID
110 | 	if r.keepAliveCh, err = r.cli.KeepAlive(context.Background(), leaseResp.ID); err != nil {
111 | 		return err
112 | 	}
113 | 
114 | 	data, err := json.Marshal(r.srvInfo)
115 | 	if err != nil {
116 | 		return err
117 | 	}
118 | 	_, err = r.cli.Put(context.Background(), BuildRegPath(r.srvInfo), string(data), clientv3.WithLease(r.leasesID))
119 | 	return err
120 | }
121 | 
122 | // unregister 删除节点
123 | func (r *Register) unregister() error {
124 | 	_, err := r.cli.Delete(context.Background(), BuildRegPath(r.srvInfo))
125 | 	return err
126 | }
127 | 
128 | // keepAlive
129 | func (r *Register) keepAlive() {
130 | 	ticker := time.NewTicker(time.Duration(r.srvTTL) * time.Second)
131 | 	for {
132 | 		select {
133 | 		case <-r.closeCh:
134 | 			if err := r.unregister(); err != nil {
135 | 				r.logger.Error("unregister failed", zap.Error(err))
136 | 			}
137 | 			if _, err := r.cli.Revoke(context.Background(), r.leasesID); err != nil {
138 | 				r.logger.Error("revoke failed", zap.Error(err))
139 | 			}
140 | 			return
141 | 		case res := <-r.keepAliveCh:
142 | 			if res == nil {
143 | 				if err := r.register(); err != nil {
144 | 					r.logger.Error("register failed", zap.Error(err))
145 | 				}
146 | 			}
147 | 		case <-ticker.C:
148 | 			if r.keepAliveCh == nil {
149 | 				if err := r.register(); err != nil {
150 | 					r.logger.Error("register failed", zap.Error(err))
151 | 				}
152 | 			}
153 | 		}
154 | 	}
155 | }
156 | 
157 | // UpdateHandler return http handler
158 | func (r *Register) UpdateHandler() http.HandlerFunc {
159 | 	return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
160 | 		wi := req.URL.Query().Get("weight")
161 | 		weight, err := strconv.Atoi(wi)
162 | 		if err != nil {
163 | 			w.WriteHeader(http.StatusBadRequest)
164 | 			w.Write([]byte(err.Error()))
165 | 			return
166 | 		}
167 | 
168 | 		var update = func() error {
169 | 			r.srvInfo.Weight = int64(weight)
170 | 			data, err := json.Marshal(r.srvInfo)
171 | 			if err != nil {
172 | 				return err
173 | 			}
174 | 			_, err = r.cli.Put(context.Background(), BuildRegPath(r.srvInfo), string(data), clientv3.WithLease(r.leasesID))
175 | 			return err
176 | 		}
177 | 
178 | 		if err := update(); err != nil {
179 | 			w.WriteHeader(http.StatusInternalServerError)
180 | 			w.Write([]byte(err.Error()))
181 | 			return
182 | 		}
183 | 		w.Write([]byte("update server weight success"))
184 | 	})
185 | }
186 | 
187 | func (r *Register) GetServerInfo() (Server, error) {
188 | 	resp, err := r.cli.Get(context.Background(), BuildRegPath(r.srvInfo))
189 | 	if err != nil {
190 | 		return r.srvInfo, err
191 | 	}
192 | 	info := Server{}
193 | 	if resp.Count >= 1 {
194 | 		if err := json.Unmarshal(resp.Kvs[0].Value, &info); err != nil {
195 | 			return info, err
196 | 		}
197 | 	}
198 | 	return info, nil
199 | }
200 | ```
201 | 
202 | 来分析下上面的代码实现  
203 | 
204 | 当启动一个grpc服务的时候，我们将其注册到etcd中
205 | 
206 | ```go
207 | 	etcdRegister := discovery.NewRegister(config.Etcd.Addrs, log.Logger)
208 | 	node := discovery.Server{
209 | 		Name: app,
210 | 		Addr: utils.InternalIP() + config.Port.GRPC,
211 | 	}
212 | 
213 | 	if _, err := etcdRegister.Register(node, 10); err != nil {
214 | 		panic(fmt.Sprintf("server register failed: %v", err))
215 | 	}
216 | ```
217 | 
218 | 调用服务注册的时候首先分配了一个租约  
219 | 
220 | ```go
221 | func (l *lessor) Grant(ctx context.Context, ttl int64) (*LeaseGrantResponse, error) {
222 | 	r := &pb.LeaseGrantRequest{TTL: ttl}
223 | 	resp, err := l.remote.LeaseGrant(ctx, r, l.callOpts...)
224 | 	if err == nil {
225 | 		gresp := &LeaseGrantResponse{
226 | 			ResponseHeader: resp.GetHeader(),
227 | 			ID:             LeaseID(resp.ID),
228 | 			TTL:            resp.TTL,
229 | 			Error:          resp.Error,
230 | 		}
231 | 		return gresp, nil
232 | 	}
233 | 	return nil, toErr(ctx, err)
234 | }
235 | ```
236 | 
237 | 然后通过KeepAlive保活   
238 | 
239 | ```go
240 | // KeepAlive尝试保持给定的租约永久alive
241 | func (l *lessor) KeepAlive(ctx context.Context, id LeaseID) (<-chan *LeaseKeepAliveResponse, error) {
242 | 	ch := make(chan *LeaseKeepAliveResponse, LeaseResponseChSize)
243 | 
244 | 	l.mu.Lock()
245 | 	// ensure that recvKeepAliveLoop is still running
246 | 	select {
247 | 	case <-l.donec:
248 | 		err := l.loopErr
249 | 		l.mu.Unlock()
250 | 		close(ch)
251 | 		return ch, ErrKeepAliveHalted{Reason: err}
252 | 	default:
253 | 	}
254 | 	ka, ok := l.keepAlives[id]
255 | 	if !ok {
256 | 		// create fresh keep alive
257 | 		ka = &keepAlive{
258 | 			chs:           []chan<- *LeaseKeepAliveResponse{ch},
259 | 			ctxs:          []context.Context{ctx},
260 | 			deadline:      time.Now().Add(l.firstKeepAliveTimeout),
261 | 			nextKeepAlive: time.Now(),
262 | 			donec:         make(chan struct{}),
263 | 		}
264 | 		l.keepAlives[id] = ka
265 | 	} else {
266 | 		// add channel and context to existing keep alive
267 | 		ka.ctxs = append(ka.ctxs, ctx)
268 | 		ka.chs = append(ka.chs, ch)
269 | 	}
270 | 	l.mu.Unlock()
271 | 
272 | 	go l.keepAliveCtxCloser(ctx, id, ka.donec)
273 | 	// 使用once只在第一次调用
274 | 	l.firstKeepAliveOnce.Do(func() {
275 | 		// 500毫秒一次，不断的发送保持活动请求
276 | 		go l.recvKeepAliveLoop()
277 | 		// 删除等待太久没反馈的租约
278 | 		go l.deadlineLoop()
279 | 	})
280 | 
281 | 	return ch, nil
282 | }
283 | 
284 | // deadlineLoop获取在租约TTL中没有收到响应的任何保持活动的通道
285 | func (l *lessor) deadlineLoop() {
286 | 	for {
287 | 		select {
288 | 		case <-time.After(time.Second):
289 | 			// donec 关闭，当 recvKeepAliveLoop 停止时设置 loopErr
290 | 		case <-l.donec:
291 | 			return
292 | 		}
293 | 		now := time.Now()
294 | 		l.mu.Lock()
295 | 		for id, ka := range l.keepAlives {
296 | 			if ka.deadline.Before(now) {
297 | 				// 等待响应太久；租约可能已过期
298 | 				ka.close()
299 | 				delete(l.keepAlives, id)
300 | 			}
301 | 		}
302 | 		l.mu.Unlock()
303 | 	}
304 | }
305 | 
306 | func (l *lessor) recvKeepAliveLoop() (gerr error) {
307 | 	defer func() {
308 | 		l.mu.Lock()
309 | 		close(l.donec)
310 | 		l.loopErr = gerr
311 | 		for _, ka := range l.keepAlives {
312 | 			ka.close()
313 | 		}
314 | 		l.keepAlives = make(map[LeaseID]*keepAlive)
315 | 		l.mu.Unlock()
316 | 	}()
317 | 
318 | 	for {
319 | 		// resetRecv 打开一个新的lease stream并开始发送保持活动请求。
320 | 		stream, err := l.resetRecv()
321 | 		if err != nil {
322 | 			if canceledByCaller(l.stopCtx, err) {
323 | 				return err
324 | 			}
325 | 		} else {
326 | 			for {
327 | 				// 接收lease stream的返回
328 | 				resp, err := stream.Recv()
329 | 				if err != nil {
330 | 					if canceledByCaller(l.stopCtx, err) {
331 | 						return err
332 | 					}
333 | 
334 | 					if toErr(l.stopCtx, err) == rpctypes.ErrNoLeader {
335 | 						l.closeRequireLeader()
336 | 					}
337 | 					break
338 | 				}
339 | 				// 根据LeaseKeepAliveResponse更新租约
340 | 				// 如果租约过期删除所有alive channels
341 | 				l.recvKeepAlive(resp)
342 | 			}
343 | 		}
344 | 
345 | 		select {
346 | 		case <-time.After(retryConnWait):
347 | 			continue
348 | 		case <-l.stopCtx.Done():
349 | 			return l.stopCtx.Err()
350 | 		}
351 | 	}
352 | }
353 | 
354 | // resetRecv 打开一个新的lease stream并开始发送保持活动请求。
355 | func (l *lessor) resetRecv() (pb.Lease_LeaseKeepAliveClient, error) {
356 | 	sctx, cancel := context.WithCancel(l.stopCtx)
357 | 	// 建立服务端和客户端连接的lease stream
358 | 	stream, err := l.remote.LeaseKeepAlive(sctx, l.callOpts...)
359 | 	if err != nil {
360 | 		cancel()
361 | 		return nil, err
362 | 	}
363 | 
364 | 	l.mu.Lock()
365 | 	defer l.mu.Unlock()
366 | 	if l.stream != nil && l.streamCancel != nil {
367 | 		l.streamCancel()
368 | 	}
369 | 
370 | 	l.streamCancel = cancel
371 | 	l.stream = stream
372 | 
373 | 	go l.sendKeepAliveLoop(stream)
374 | 	return stream, nil
375 | }
376 | 
377 | // sendKeepAliveLoop 在给定流的生命周期内发送保持活动请求
378 | func (l *lessor) sendKeepAliveLoop(stream pb.Lease_LeaseKeepAliveClient) {
379 | 	for {
380 | 		var tosend []LeaseID
381 | 
382 | 		now := time.Now()
383 | 		l.mu.Lock()
384 | 		for id, ka := range l.keepAlives {
385 | 			if ka.nextKeepAlive.Before(now) {
386 | 				tosend = append(tosend, id)
387 | 			}
388 | 		}
389 | 		l.mu.Unlock()
390 | 
391 | 		for _, id := range tosend {
392 | 			r := &pb.LeaseKeepAliveRequest{ID: int64(id)}
393 | 			if err := stream.Send(r); err != nil {
394 | 				// TODO do something with this error?
395 | 				return
396 | 			}
397 | 		}
398 | 
399 | 		select {
400 | 		// 每500毫秒执行一次
401 | 		case <-time.After(500 * time.Millisecond):
402 | 		case <-stream.Context().Done():
403 | 			return
404 | 		case <-l.donec:
405 | 			return
406 | 		case <-l.stopCtx.Done():
407 | 			return
408 | 		}
409 | 	}
410 | }
411 | 
412 | // 撤销给定的租约，所有附加到租约的key将过期并被删除  
413 | func (l *lessor) Revoke(ctx context.Context, id LeaseID) (*LeaseRevokeResponse, error) {
414 | 	r := &pb.LeaseRevokeRequest{ID: int64(id)}
415 | 	resp, err := l.remote.LeaseRevoke(ctx, r, l.callOpts...)
416 | 	if err == nil {
417 | 		return (*LeaseRevokeResponse)(resp), nil
418 | 	}
419 | 	return nil, toErr(ctx, err)
420 | }
421 | ```
422 | 
423 | 总结：  
424 | 
425 | 1、每次注册一个服务时分配一个租约；  
426 | 
427 | 2、KeepAlive通过从客户端到服务器端的流化的`keep alive`请求和从服务器端到客户端的流化的`keep alive`应答来维持租约；  
428 | 
429 | 3、KeepAlive会每500毫秒进行一次lease stream的发送；  
430 | 
431 | 4、然后接收到KeepAlive发送信息回执，处理更新租约，服务处于活动状态；  
432 | 
433 | 5、如果在租约TTL内没有收到任何保持活动请求的响应，则删除租约；  
434 | 
435 | 6、Revoke撤销一个租约，所有附加到租约的key将过期并被删除。  
436 | 
437 | ### 服务发现  
438 | 
439 | 我们只需实现grpc在resolver中提供的Builder和Resolver接口，就能完成gRPC客户端的服务发现和负载均衡  
440 | 
441 | ```go
442 | // 创建一个resolver用于监视名称解析更新
443 | type Builder interface {
444 | 	Build(target Target, cc ClientConn, opts BuildOption) (Resolver, error)
445 | 	Scheme() string
446 | }
447 | ```
448 | 
449 | - Build方法：为给定目标创建一个新的resolver，当调用grpc.Dial()时执行；  
450 | 
451 | - Scheme方法：返回此resolver支持的方案,可参考[Scheme定义](https://github.com/grpc/grpc/blob/master/doc/naming.md)  
452 | 
453 | ```go
454 | // 监视指定目标的更新，包括地址更新和服务配置更新
455 | type Resolver interface {
456 | 	ResolveNow(ResolveNowOption)
457 | 	Close()
458 | }
459 | ```
460 | 
461 | - ResolveNow方法：被 gRPC 调用，以尝试再次解析目标名称。只用于提示，可忽略该方法;  
462 | 
463 | - Close方法：关闭resolver。  
464 | 
465 | 接下来看下具体的实现  
466 | 
467 | ```go
468 | package discovery
469 | 
470 | import (
471 | 	"context"
472 | 	"time"
473 | 
474 | 	"go.uber.org/zap"
475 | 
476 | 	"go.etcd.io/etcd/api/v3/mvccpb"
477 | 	clientv3 "go.etcd.io/etcd/client/v3"
478 | 	"google.golang.org/grpc/resolver"
479 | )
480 | 
481 | const (
482 | 	schema = "etcd"
483 | )
484 | 
485 | // Resolver for grpc client
486 | type Resolver struct {
487 | 	schema      string
488 | 	EtcdAddrs   []string
489 | 	DialTimeout int
490 | 
491 | 	closeCh      chan struct{}
492 | 	watchCh      clientv3.WatchChan
493 | 	cli          *clientv3.Client
494 | 	keyPrifix    string
495 | 	srvAddrsList []resolver.Address
496 | 
497 | 	cc     resolver.ClientConn
498 | 	logger *zap.Logger
499 | }
500 | 
501 | // NewResolver create a new resolver.Builder base on etcd
502 | func NewResolver(etcdAddrs []string, logger *zap.Logger) *Resolver {
503 | 	return &Resolver{
504 | 		schema:      schema,
505 | 		EtcdAddrs:   etcdAddrs,
506 | 		DialTimeout: 3,
507 | 		logger:      logger,
508 | 	}
509 | }
510 | 
511 | // Scheme returns the scheme supported by this resolver.
512 | func (r *Resolver) Scheme() string {
513 | 	return r.schema
514 | }
515 | 
516 | // Build creates a new resolver.Resolver for the given target
517 | func (r *Resolver) Build(target resolver.Target, cc resolver.ClientConn, opts resolver.BuildOptions) (resolver.Resolver, error) {
518 | 	r.cc = cc
519 | 
520 | 	r.keyPrifix = BuildPrefix(Server{Name: target.Endpoint, Version: target.Authority})
521 | 	if _, err := r.start(); err != nil {
522 | 		return nil, err
523 | 	}
524 | 	return r, nil
525 | }
526 | 
527 | // ResolveNow resolver.Resolver interface
528 | func (r *Resolver) ResolveNow(o resolver.ResolveNowOptions) {}
529 | 
530 | // Close resolver.Resolver interface
531 | func (r *Resolver) Close() {
532 | 	r.closeCh <- struct{}{}
533 | }
534 | 
535 | // start
536 | func (r *Resolver) start() (chan<- struct{}, error) {
537 | 	var err error
538 | 	r.cli, err = clientv3.New(clientv3.Config{
539 | 		Endpoints:   r.EtcdAddrs,
540 | 		DialTimeout: time.Duration(r.DialTimeout) * time.Second,
541 | 	})
542 | 	if err != nil {
543 | 		return nil, err
544 | 	}
545 | 	resolver.Register(r)
546 | 
547 | 	r.closeCh = make(chan struct{})
548 | 
549 | 	if err = r.sync(); err != nil {
550 | 		return nil, err
551 | 	}
552 | 
553 | 	go r.watch()
554 | 
555 | 	return r.closeCh, nil
556 | }
557 | 
558 | // watch update events
559 | func (r *Resolver) watch() {
560 | 	ticker := time.NewTicker(time.Minute)
561 | 	r.watchCh = r.cli.Watch(context.Background(), r.keyPrifix, clientv3.WithPrefix())
562 | 
563 | 	for {
564 | 		select {
565 | 		case <-r.closeCh:
566 | 			return
567 | 		case res, ok := <-r.watchCh:
568 | 			if ok {
569 | 				r.update(res.Events)
570 | 			}
571 | 		case <-ticker.C:
572 | 			if err := r.sync(); err != nil {
573 | 				r.logger.Error("sync failed", zap.Error(err))
574 | 			}
575 | 		}
576 | 	}
577 | }
578 | 
579 | // update
580 | func (r *Resolver) update(events []*clientv3.Event) {
581 | 	for _, ev := range events {
582 | 		var info Server
583 | 		var err error
584 | 
585 | 		switch ev.Type {
586 | 		case mvccpb.PUT:
587 | 			info, err = ParseValue(ev.Kv.Value)
588 | 			if err != nil {
589 | 				continue
590 | 			}
591 | 			addr := resolver.Address{Addr: info.Addr, Metadata: info.Weight}
592 | 			if !Exist(r.srvAddrsList, addr) {
593 | 				r.srvAddrsList = append(r.srvAddrsList, addr)
594 | 				r.cc.UpdateState(resolver.State{Addresses: r.srvAddrsList})
595 | 			}
596 | 		case mvccpb.DELETE:
597 | 			info, err = SplitPath(string(ev.Kv.Key))
598 | 			if err != nil {
599 | 				continue
600 | 			}
601 | 			addr := resolver.Address{Addr: info.Addr}
602 | 			if s, ok := Remove(r.srvAddrsList, addr); ok {
603 | 				r.srvAddrsList = s
604 | 				r.cc.UpdateState(resolver.State{Addresses: r.srvAddrsList})
605 | 			}
606 | 		}
607 | 	}
608 | }
609 | 
610 | // sync 同步获取所有地址信息
611 | func (r *Resolver) sync() error {
612 | 	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
613 | 	defer cancel()
614 | 	res, err := r.cli.Get(ctx, r.keyPrifix, clientv3.WithPrefix())
615 | 	if err != nil {
616 | 		return err
617 | 	}
618 | 	r.srvAddrsList = []resolver.Address{}
619 | 
620 | 	for _, v := range res.Kvs {
621 | 		info, err := ParseValue(v.Value)
622 | 		if err != nil {
623 | 			continue
624 | 		}
625 | 		addr := resolver.Address{Addr: info.Addr, Metadata: info.Weight}
626 | 		r.srvAddrsList = append(r.srvAddrsList, addr)
627 | 	}
628 | 	r.cc.UpdateState(resolver.State{Addresses: r.srvAddrsList})
629 | 	return nil
630 | }
631 | ```
632 | 
633 | 总结：  
634 | 
635 | 1、watch会监听前缀的信息变更，有变更的通知，及时更新srvAddrsList的地址信息；  
636 | 
637 | 2、sync会定时的同步etcd中的可用的服务地址到srvAddrsList中；  
638 | 
639 | 3、使用UpdateState更新ClientConn的Addresses；      
640 | 
641 | 4、然后grpc客户端就能根据配置的具体策略发送请求到grpc的server中。  
642 | 
643 | 这里使用gRPC内置的负载均衡策略`round_robin`，根据负载均衡地址，以轮询的方式进行调用服务，来测试下服务的发现和简单的服务负载  
644 | 
645 | ```go
646 | package discovery
647 | 
648 | import (
649 | 	"context"
650 | 	"fmt"
651 | 	"log"
652 | 	"net"
653 | 	"testing"
654 | 	"time"
655 | 
656 | 	"go.uber.org/zap"
657 | 	"google.golang.org/grpc/balancer/roundrobin"
658 | 	"google.golang.org/grpc/resolver"
659 | 
660 | 	"etcd-learning/discovery/helloworld"
661 | 
662 | 	"google.golang.org/grpc"
663 | )
664 | 
665 | var etcdAddrs = []string{"127.0.0.1:2379"}
666 | 
667 | func TestResolver(t *testing.T) {
668 | 	r := NewResolver(etcdAddrs, zap.NewNop())
669 | 	resolver.Register(r)
670 | 
671 | 	// etcd中注册5个服务
672 | 	go newServer(t, ":1001", "1.0.0", 1)
673 | 	go newServer(t, ":1002", "1.0.0", 1)
674 | 	go newServer(t, ":1003", "1.0.0", 1)
675 | 	go newServer(t, ":1004", "1.0.0", 1)
676 | 	go newServer(t, ":1006", "1.0.0", 10)
677 | 
678 | 	conn, err := grpc.Dial("etcd:///hello", grpc.WithInsecure(), grpc.WithBalancerName(roundrobin.Name))
679 | 	if err != nil {
680 | 		t.Fatalf("failed to dial %v", err)
681 | 	}
682 | 	defer conn.Close()
683 | 
684 | 	c := helloworld.NewGreeterClient(conn)
685 | 
686 | 	// 进行十次数据请求
687 | 	for i := 0; i < 10; i++ {
688 | 		resp, err := c.SayHello(context.Background(), &helloworld.HelloRequest{Name: "abc"})
689 | 		if err != nil {
690 | 			t.Fatalf("say hello failed %v", err)
691 | 		}
692 | 		log.Println(resp.Message)
693 | 		time.Sleep(100 * time.Millisecond)
694 | 	}
695 | 
696 | 	time.Sleep(10 * time.Second)
697 | }
698 | 
699 | type server struct {
700 | 	Port string
701 | }
702 | 
703 | // SayHello implements helloworld.GreeterServer
704 | func (s *server) SayHello(ctx context.Context, in *helloworld.HelloRequest) (*helloworld.HelloReply, error) {
705 | 	return &helloworld.HelloReply{Message: fmt.Sprintf("Hello From %s", s.Port)}, nil
706 | }
707 | 
708 | func newServer(t *testing.T, port string, version string, weight int64) {
709 | 	register := NewRegister(etcdAddrs, zap.NewNop())
710 | 	defer register.Stop()
711 | 
712 | 	listen, err := net.Listen("tcp", port)
713 | 	if err != nil {
714 | 		log.Fatalf("failed to listen %v", err)
715 | 	}
716 | 
717 | 	s := grpc.NewServer()
718 | 	helloworld.RegisterGreeterServer(s, &server{Port: port})
719 | 
720 | 	info := Server{
721 | 		Name:    "hello",
722 | 		Addr:    fmt.Sprintf("127.0.0.1%s", port),
723 | 		Version: version,
724 | 		Weight:  weight,
725 | 	}
726 | 
727 | 	register.Register(info, 10)
728 | 
729 | 	if err := s.Serve(listen); err != nil {
730 | 		log.Fatalf("failed to server %v", err)
731 | 	}
732 | }
733 | ```
734 | 
735 | 这里注册了5个服务，端口号分别是1001、1002、1003、1004和1006，循环调用10次   
736 | 
737 | ```go
738 | === RUN   TestResolver
739 | 2021/07/24 22:44:52 Hello From :1001
740 | 2021/07/24 22:44:52 Hello From :1006
741 | 2021/07/24 22:44:53 Hello From :1001
742 | 2021/07/24 22:44:53 Hello From :1002
743 | 2021/07/24 22:44:53 Hello From :1003
744 | 2021/07/24 22:44:53 Hello From :1004
745 | 2021/07/24 22:44:53 Hello From :1006
746 | 2021/07/24 22:44:53 Hello From :1001
747 | 2021/07/24 22:44:53 Hello From :1002
748 | 2021/07/24 22:44:53 Hello From :1003
749 | ```
750 | 
751 | 发现每次的请求会发送到不同的服务中   
752 | 
753 | 
754 | ### 负载均衡
755 | 
756 | #### 集中式LB（Proxy Model）  
757 | 
758 | <img src="/img/grpc_balance_1.png" alt="grpc" align=center/>
759 | 
760 | 在服务消费者和服务提供者之间有一个独立的LB，通常是专门的硬件设备如 F5，或者基于软件如`LVS`，`HAproxy`等实现。LB上有所有服务的地址映射表，通常由运维配置注册，当服务消费方调用某个目标服务时，它向LB发起请求，由LB以某种策略，比如轮询`（Round-Robin）`做负载均衡后将请求转发到目标服务。LB一般具备健康检查能力，能自动摘除不健康的服务实例。  
761 | 
762 | 该方案主要问题：  
763 | 
764 | 1、单点问题，所有服务调用流量都经过LB，当服务数量和调用量大的时候，LB容易成为瓶颈，且一旦LB发生故障影响整个系统；  
765 | 
766 | 2、服务消费方、提供方之间增加了一级，有一定性能开销。  
767 | 
768 | #### 进程内LB（Balancing-aware Client）   
769 | 
770 | <img src="/img/grpc_balance_2.png" alt="grpc" align=center/>
771 | 
772 | 针对第一个方案的不足，此方案将LB的功能集成到服务消费方进程里，也被称为软负载或者客户端负载方案。服务提供方启动时，首先将服务地址注册到服务注册表，同时定期报心跳到服务注册表以表明服务的存活状态，相当于健康检查，服务消费方要访问某个服务时，它通过内置的LB组件向服务注册表查询，同时缓存并定期刷新目标服务地址列表，然后以某种负载均衡策略选择一个目标服务地址，最后向目标服务发起请求。LB和服务发现能力被分散到每一个服务消费者的进程内部，同时服务消费方和服务提供方之间是直接调用，没有额外开销，性能比较好。  
773 | 
774 | 该方案主要问题：  
775 | 
776 | 1、开发成本，该方案将LB功能集成到服务调用方的进程里头，如果有多种不同的语言栈，就要配合开发多种不同的客户端，有一定的研发和维护成本；  
777 | 
778 | 2、另外生产环境中，后续如果要对客户库进行升级，势必要求服务调用方修改代码并重新发布，升级较复杂。  
779 | 
780 | #### 独立 LB 进程（External Load Balancing Service）  
781 | 
782 | <img src="/img/grpc_balance_3.png" alt="grpc" align=center/>
783 | 
784 | 该方案是针对第二种方案的不足而提出的一种折中方案，原理和第二种方案基本类似。  
785 | 
786 | 不同之处是将LB和服务发现功能从进程内移出来，变成主机上的一个独立进程。主机上的一个或者多个服务要访问目标服务时，他们都通过同一主机上的独立LB进程做服务发现和负载均衡。该方案也是一种分布式方案没有单点问题，一个LB进程挂了只影响该主机上的服务调用方，服务调用方和LB之间是进程内调用性能好，同时该方案还简化了服务调用方，不需要为不同语言开发客户库，LB的升级不需要服务调用方改代码。  
787 | 
788 | 该方案主要问题：部署较复杂，环节多，出错调试排查问题不方便。  
789 | 
790 | 上面通过etcd实现服务发现，使用的就是第二种：进程内LB（Balancing-aware Client）。   
791 | 
792 | ### 参考  
793 | 
794 | 【Load Balancing in gRPC】https://github.com/grpc/grpc/blob/master/doc/load-balancing.md  
795 | 【文中的代码示例】https://github.com/boilingfrog/etcd-learning/tree/main/discovery    


--------------------------------------------------------------------------------
/discovery/grpc_discovery_example/client/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"etcd-learning/discovery/grpc_discovery_example/server/helloworld"
 5 | 	"fmt"
 6 | 
 7 | 	"golang.org/x/net/context"
 8 | )
 9 | 
10 | func main() {
11 | 	helloClient, err := helloworld.NewClient()
12 | 	if err != nil {
13 | 		panic(err)
14 | 	}
15 | 	res, err := helloClient.SayHello(context.Background(), &helloworld.HelloRequest{
16 | 		Name: "xiaoming",
17 | 	})
18 | 	if err != nil {
19 | 		fmt.Println(err)
20 | 	}
21 | 	fmt.Println(res)
22 | }
23 | 


--------------------------------------------------------------------------------
/discovery/grpc_discovery_example/server/helloworld/client.go:
--------------------------------------------------------------------------------
 1 | package helloworld
 2 | 
 3 | import (
 4 | 	"etcd-learning/discovery"
 5 | 
 6 | 	"go.uber.org/zap"
 7 | 
 8 | 	grpc "google.golang.org/grpc"
 9 | 	"google.golang.org/grpc/balancer/roundrobin"
10 | 	"google.golang.org/grpc/resolver"
11 | )
12 | 
13 | const App = "base-hello"
14 | 
15 | func NewClient(opts ...grpc.DialOption) (GreeterClient, error) {
16 | 	options := []grpc.DialOption{
17 | 		grpc.WithInsecure(),
18 | 		grpc.WithBalancerName(roundrobin.Name),
19 | 	}
20 | 
21 | 	addrs := []string{"127.0.0.1:2379"}
22 | 	r := discovery.NewResolver(addrs, zap.NewNop())
23 | 	resolver.Register(r)
24 | 
25 | 	conn, err := grpc.Dial("etcd:///"+App, options...)
26 | 	if err != nil {
27 | 		return nil, err
28 | 	}
29 | 	return NewGreeterClient(conn), nil
30 | }
31 | 


--------------------------------------------------------------------------------
/discovery/grpc_discovery_example/server/helloworld/helloworld.pb.go:
--------------------------------------------------------------------------------
  1 | // Code generated by protoc-gen-go. DO NOT EDIT.
  2 | // source: helloworld.proto
  3 | 
  4 | /*
  5 | Package helloworld is a generated protocol buffer package.
  6 | 
  7 | It is generated from these files:
  8 | 	helloworld.proto
  9 | 
 10 | It has these top-level messages:
 11 | 	HelloRequest
 12 | 	HelloReply
 13 | */
 14 | package helloworld
 15 | 
 16 | import (
 17 | 	fmt "fmt"
 18 | 
 19 | 	proto "github.com/golang/protobuf/proto"
 20 | 
 21 | 	math "math"
 22 | 
 23 | 	context "golang.org/x/net/context"
 24 | 
 25 | 	grpc "google.golang.org/grpc"
 26 | )
 27 | 
 28 | // Reference imports to suppress errors if they are not otherwise used.
 29 | var _ = proto.Marshal
 30 | var _ = fmt.Errorf
 31 | var _ = math.Inf
 32 | 
 33 | // This is a compile-time assertion to ensure that this generated file
 34 | // is compatible with the proto package it is being compiled against.
 35 | // A compilation error at this line likely means your copy of the
 36 | // proto package needs to be updated.
 37 | const _ = proto.ProtoPackageIsVersion2 // please upgrade the proto package
 38 | 
 39 | // The request message containing the user's name.
 40 | type HelloRequest struct {
 41 | 	Name string `protobuf:"bytes,1,opt,name=name" json:"name,omitempty"`
 42 | }
 43 | 
 44 | func (m *HelloRequest) Reset()                    { *m = HelloRequest{} }
 45 | func (m *HelloRequest) String() string            { return proto.CompactTextString(m) }
 46 | func (*HelloRequest) ProtoMessage()               {}
 47 | func (*HelloRequest) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{0} }
 48 | 
 49 | func (m *HelloRequest) GetName() string {
 50 | 	if m != nil {
 51 | 		return m.Name
 52 | 	}
 53 | 	return ""
 54 | }
 55 | 
 56 | // The response message containing the greetings
 57 | type HelloReply struct {
 58 | 	Message string `protobuf:"bytes,1,opt,name=message" json:"message,omitempty"`
 59 | }
 60 | 
 61 | func (m *HelloReply) Reset()                    { *m = HelloReply{} }
 62 | func (m *HelloReply) String() string            { return proto.CompactTextString(m) }
 63 | func (*HelloReply) ProtoMessage()               {}
 64 | func (*HelloReply) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{1} }
 65 | 
 66 | func (m *HelloReply) GetMessage() string {
 67 | 	if m != nil {
 68 | 		return m.Message
 69 | 	}
 70 | 	return ""
 71 | }
 72 | 
 73 | func init() {
 74 | 	proto.RegisterType((*HelloRequest)(nil), "helloworld.HelloRequest")
 75 | 	proto.RegisterType((*HelloReply)(nil), "helloworld.HelloReply")
 76 | }
 77 | 
 78 | // Reference imports to suppress errors if they are not otherwise used.
 79 | var _ context.Context
 80 | var _ grpc.ClientConn
 81 | 
 82 | // This is a compile-time assertion to ensure that this generated file
 83 | // is compatible with the grpc package it is being compiled against.
 84 | const _ = grpc.SupportPackageIsVersion4
 85 | 
 86 | // Client API for Greeter service
 87 | 
 88 | type GreeterClient interface {
 89 | 	//   Sends a greeting
 90 | 	SayHello(ctx context.Context, in *HelloRequest, opts ...grpc.CallOption) (*HelloReply, error)
 91 | }
 92 | 
 93 | type greeterClient struct {
 94 | 	cc *grpc.ClientConn
 95 | }
 96 | 
 97 | func NewGreeterClient(cc *grpc.ClientConn) GreeterClient {
 98 | 	return &greeterClient{cc}
 99 | }
100 | 
101 | func (c *greeterClient) SayHello(ctx context.Context, in *HelloRequest, opts ...grpc.CallOption) (*HelloReply, error) {
102 | 	out := new(HelloReply)
103 | 	err := grpc.Invoke(ctx, "/helloworld.Greeter/SayHello", in, out, c.cc, opts...)
104 | 	if err != nil {
105 | 		return nil, err
106 | 	}
107 | 	return out, nil
108 | }
109 | 
110 | // Server API for Greeter service
111 | 
112 | type GreeterServer interface {
113 | 	//   Sends a greeting
114 | 	SayHello(context.Context, *HelloRequest) (*HelloReply, error)
115 | }
116 | 
117 | func RegisterGreeterServer(s *grpc.Server, srv GreeterServer) {
118 | 	s.RegisterService(&_Greeter_serviceDesc, srv)
119 | }
120 | 
121 | func _Greeter_SayHello_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
122 | 	in := new(HelloRequest)
123 | 	if err := dec(in); err != nil {
124 | 		return nil, err
125 | 	}
126 | 	if interceptor == nil {
127 | 		return srv.(GreeterServer).SayHello(ctx, in)
128 | 	}
129 | 	info := &grpc.UnaryServerInfo{
130 | 		Server:     srv,
131 | 		FullMethod: "/helloworld.Greeter/SayHello",
132 | 	}
133 | 	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
134 | 		return srv.(GreeterServer).SayHello(ctx, req.(*HelloRequest))
135 | 	}
136 | 	return interceptor(ctx, in, info, handler)
137 | }
138 | 
139 | var _Greeter_serviceDesc = grpc.ServiceDesc{
140 | 	ServiceName: "helloworld.Greeter",
141 | 	HandlerType: (*GreeterServer)(nil),
142 | 	Methods: []grpc.MethodDesc{
143 | 		{
144 | 			MethodName: "SayHello",
145 | 			Handler:    _Greeter_SayHello_Handler,
146 | 		},
147 | 	},
148 | 	Streams:  []grpc.StreamDesc{},
149 | 	Metadata: "helloworld.proto",
150 | }
151 | 
152 | func init() { proto.RegisterFile("helloworld.proto", fileDescriptor0) }
153 | 
154 | var fileDescriptor0 = []byte{
155 | 	// 186 bytes of a gzipped FileDescriptorProto
156 | 	0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x12, 0xc8, 0x48, 0xcd, 0xc9,
157 | 	0xc9, 0x2f, 0xcf, 0x2f, 0xca, 0x49, 0xd1, 0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0xe2, 0x42, 0x88,
158 | 	0x28, 0x29, 0x71, 0xf1, 0x78, 0x80, 0x78, 0x41, 0xa9, 0x85, 0xa5, 0xa9, 0xc5, 0x25, 0x42, 0x42,
159 | 	0x5c, 0x2c, 0x79, 0x89, 0xb9, 0xa9, 0x12, 0x8c, 0x0a, 0x8c, 0x1a, 0x9c, 0x41, 0x60, 0xb6, 0x92,
160 | 	0x1a, 0x17, 0x17, 0x54, 0x4d, 0x41, 0x4e, 0xa5, 0x90, 0x04, 0x17, 0x7b, 0x6e, 0x6a, 0x71, 0x71,
161 | 	0x62, 0x3a, 0x4c, 0x11, 0x8c, 0x6b, 0xe4, 0xc9, 0xc5, 0xee, 0x5e, 0x94, 0x9a, 0x5a, 0x92, 0x5a,
162 | 	0x24, 0x64, 0xc7, 0xc5, 0x11, 0x9c, 0x58, 0x09, 0xd6, 0x25, 0x24, 0xa1, 0x87, 0xe4, 0x02, 0x64,
163 | 	0xcb, 0xa4, 0xc4, 0xb0, 0xc8, 0x14, 0xe4, 0x54, 0x2a, 0x31, 0x38, 0x19, 0x72, 0x89, 0x25, 0xe7,
164 | 	0xe7, 0xea, 0xe5, 0x66, 0xa6, 0xa4, 0x26, 0xea, 0x65, 0x15, 0xe9, 0x95, 0xa4, 0x16, 0x97, 0xe8,
165 | 	0xa5, 0x17, 0x15, 0x24, 0x3b, 0xf1, 0x83, 0xd5, 0x85, 0x83, 0xb4, 0x04, 0x80, 0x7c, 0x13, 0xc0,
166 | 	0xb8, 0x88, 0x89, 0xd9, 0xc3, 0x27, 0x3c, 0x89, 0x0d, 0xec, 0x39, 0x63, 0x40, 0x00, 0x00, 0x00,
167 | 	0xff, 0xff, 0x99, 0x17, 0x1f, 0xe3, 0xf0, 0x00, 0x00, 0x00,
168 | }
169 | 


--------------------------------------------------------------------------------
/discovery/grpc_discovery_example/server/helloworld/helloworld.proto:
--------------------------------------------------------------------------------
// Schema for the helloworld example: one Greeter service with a single
// unary SayHello RPC and its request/reply messages.
syntax = "proto3";

// Code-generation options for the Java and Objective-C plugins.
option java_multiple_files = true;
option java_package = "com.midea.jr.test.grpc";
option java_outer_classname = "HelloWorldProto";
option objc_class_prefix = "HLW";

package helloworld;

// The greeting service definition.
service Greeter {
    // Sends a greeting: takes a HelloRequest and returns a HelloReply.
    rpc SayHello (HelloRequest) returns (HelloReply) {
    }
}

// The request message containing the user's name.
message HelloRequest {
    // Name to greet.
    string name = 1;
}

// The response message containing the greetings
message HelloReply {
    // Greeting text produced by the server.
    string message = 1;
}
26 | 


--------------------------------------------------------------------------------
/discovery/grpc_discovery_example/server/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"etcd-learning/discovery"
 5 | 	"etcd-learning/discovery/helloworld"
 6 | 	"fmt"
 7 | 	"net"
 8 | 	"os"
 9 | 	"os/signal"
10 | 	"syscall"
11 | 	"time"
12 | 
13 | 	"google.golang.org/grpc"
14 | 
15 | 	"go.uber.org/zap"
16 | )
17 | 
18 | const (
19 | 	app         = "base-hello"
20 | 	grpcAddress = "127.0.0.1:8083"
21 | )
22 | 
23 | func main() {
24 | 	addrs := []string{"127.0.0.1:2379"}
25 | 	etcdRegister := discovery.NewRegister(addrs, zap.NewNop())
26 | 	node := discovery.Server{
27 | 		Name: app,
28 | 		Addr: grpcAddress,
29 | 	}
30 | 
31 | 	server, err := Start()
32 | 	if err != nil {
33 | 		panic(fmt.Sprintf("start server failed : %v", err))
34 | 	}
35 | 
36 | 	if _, err := etcdRegister.Register(node, 10); err != nil {
37 | 		panic(fmt.Sprintf("server register failed: %v", err))
38 | 	}
39 | 
40 | 	fmt.Println("service started listen on", grpcAddress)
41 | 	c := make(chan os.Signal, 1)
42 | 	signal.Notify(c, syscall.SIGHUP, syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT)
43 | 	for {
44 | 		s := <-c
45 | 		switch s {
46 | 		case syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGINT:
47 | 			server.Stop()
48 | 			etcdRegister.Stop()
49 | 			time.Sleep(time.Second)
50 | 			return
51 | 		case syscall.SIGHUP:
52 | 		default:
53 | 			return
54 | 		}
55 | 	}
56 | }
57 | 
58 | func Start() (*grpc.Server, error) {
59 | 	s := grpc.NewServer()
60 | 
61 | 	helloworld.RegisterGreeterServer(s, &server{})
62 | 	lis, err := net.Listen("tcp", grpcAddress)
63 | 	if err != nil {
64 | 		return nil, err
65 | 	}
66 | 
67 | 	go func() {
68 | 		if err := s.Serve(lis); err != nil {
69 | 			panic(err)
70 | 		}
71 | 	}()
72 | 
73 | 	return s, nil
74 | }
75 | 


--------------------------------------------------------------------------------
/discovery/grpc_discovery_example/server/service.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"etcd-learning/discovery/helloworld"
 6 | )
 7 | 
// Service is an empty placeholder type; it declares no fields here.
type Service struct {
}

// server is the receiver type for the SayHello handler in this package.
type server struct {
	service *Service // not read by SayHello
}
14 | 
15 | func (s server) SayHello(ctx context.Context, re *helloworld.HelloRequest) (*helloworld.HelloReply, error) {
16 | 
17 | 	return &helloworld.HelloReply{
18 | 		Message: "hello " + re.Name,
19 | 	}, nil
20 | }
21 | 


--------------------------------------------------------------------------------
/discovery/helloworld/helloworld.pb.go:
--------------------------------------------------------------------------------
  1 | // Code generated by protoc-gen-go. DO NOT EDIT.
  2 | // source: helloworld.proto
  3 | 
  4 | /*
  5 | Package helloworld is a generated protocol buffer package.
  6 | 
  7 | It is generated from these files:
  8 | 	helloworld.proto
  9 | 
 10 | It has these top-level messages:
 11 | 	HelloRequest
 12 | 	HelloReply
 13 | */
 14 | package helloworld
 15 | 
 16 | import (
 17 | 	fmt "fmt"
 18 | 
 19 | 	proto "github.com/golang/protobuf/proto"
 20 | 
 21 | 	math "math"
 22 | 
 23 | 	context "golang.org/x/net/context"
 24 | 
 25 | 	grpc "google.golang.org/grpc"
 26 | )
 27 | 
 28 | // Reference imports to suppress errors if they are not otherwise used.
 29 | var _ = proto.Marshal
 30 | var _ = fmt.Errorf
 31 | var _ = math.Inf
 32 | 
 33 | // This is a compile-time assertion to ensure that this generated file
 34 | // is compatible with the proto package it is being compiled against.
 35 | // A compilation error at this line likely means your copy of the
 36 | // proto package needs to be updated.
 37 | const _ = proto.ProtoPackageIsVersion2 // please upgrade the proto package
 38 | 
 39 | // The request message containing the user's name.
 40 | type HelloRequest struct {
 41 | 	Name string `protobuf:"bytes,1,opt,name=name" json:"name,omitempty"`
 42 | }
 43 | 
 44 | func (m *HelloRequest) Reset()                    { *m = HelloRequest{} }
 45 | func (m *HelloRequest) String() string            { return proto.CompactTextString(m) }
 46 | func (*HelloRequest) ProtoMessage()               {}
 47 | func (*HelloRequest) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{0} }
 48 | 
 49 | func (m *HelloRequest) GetName() string {
 50 | 	if m != nil {
 51 | 		return m.Name
 52 | 	}
 53 | 	return ""
 54 | }
 55 | 
 56 | // The response message containing the greetings
 57 | type HelloReply struct {
 58 | 	Message string `protobuf:"bytes,1,opt,name=message" json:"message,omitempty"`
 59 | }
 60 | 
 61 | func (m *HelloReply) Reset()                    { *m = HelloReply{} }
 62 | func (m *HelloReply) String() string            { return proto.CompactTextString(m) }
 63 | func (*HelloReply) ProtoMessage()               {}
 64 | func (*HelloReply) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{1} }
 65 | 
 66 | func (m *HelloReply) GetMessage() string {
 67 | 	if m != nil {
 68 | 		return m.Message
 69 | 	}
 70 | 	return ""
 71 | }
 72 | 
 73 | func init() {
 74 | 	proto.RegisterType((*HelloRequest)(nil), "helloworld.HelloRequest")
 75 | 	proto.RegisterType((*HelloReply)(nil), "helloworld.HelloReply")
 76 | }
 77 | 
 78 | // Reference imports to suppress errors if they are not otherwise used.
 79 | var _ context.Context
 80 | var _ grpc.ClientConn
 81 | 
 82 | // This is a compile-time assertion to ensure that this generated file
 83 | // is compatible with the grpc package it is being compiled against.
 84 | const _ = grpc.SupportPackageIsVersion4
 85 | 
 86 | // Client API for Greeter service
 87 | 
 88 | type GreeterClient interface {
 89 | 	//   Sends a greeting
 90 | 	SayHello(ctx context.Context, in *HelloRequest, opts ...grpc.CallOption) (*HelloReply, error)
 91 | }
 92 | 
 93 | type greeterClient struct {
 94 | 	cc *grpc.ClientConn
 95 | }
 96 | 
 97 | func NewGreeterClient(cc *grpc.ClientConn) GreeterClient {
 98 | 	return &greeterClient{cc}
 99 | }
100 | 
101 | func (c *greeterClient) SayHello(ctx context.Context, in *HelloRequest, opts ...grpc.CallOption) (*HelloReply, error) {
102 | 	out := new(HelloReply)
103 | 	err := grpc.Invoke(ctx, "/helloworld.Greeter/SayHello", in, out, c.cc, opts...)
104 | 	if err != nil {
105 | 		return nil, err
106 | 	}
107 | 	return out, nil
108 | }
109 | 
110 | // Server API for Greeter service
111 | 
112 | type GreeterServer interface {
113 | 	//   Sends a greeting
114 | 	SayHello(context.Context, *HelloRequest) (*HelloReply, error)
115 | }
116 | 
117 | func RegisterGreeterServer(s *grpc.Server, srv GreeterServer) {
118 | 	s.RegisterService(&_Greeter_serviceDesc, srv)
119 | }
120 | 
121 | func _Greeter_SayHello_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
122 | 	in := new(HelloRequest)
123 | 	if err := dec(in); err != nil {
124 | 		return nil, err
125 | 	}
126 | 	if interceptor == nil {
127 | 		return srv.(GreeterServer).SayHello(ctx, in)
128 | 	}
129 | 	info := &grpc.UnaryServerInfo{
130 | 		Server:     srv,
131 | 		FullMethod: "/helloworld.Greeter/SayHello",
132 | 	}
133 | 	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
134 | 		return srv.(GreeterServer).SayHello(ctx, req.(*HelloRequest))
135 | 	}
136 | 	return interceptor(ctx, in, info, handler)
137 | }
138 | 
139 | var _Greeter_serviceDesc = grpc.ServiceDesc{
140 | 	ServiceName: "helloworld.Greeter",
141 | 	HandlerType: (*GreeterServer)(nil),
142 | 	Methods: []grpc.MethodDesc{
143 | 		{
144 | 			MethodName: "SayHello",
145 | 			Handler:    _Greeter_SayHello_Handler,
146 | 		},
147 | 	},
148 | 	Streams:  []grpc.StreamDesc{},
149 | 	Metadata: "helloworld.proto",
150 | }
151 | 
152 | func init() { proto.RegisterFile("helloworld.proto", fileDescriptor0) }
153 | 
154 | var fileDescriptor0 = []byte{
155 | 	// 186 bytes of a gzipped FileDescriptorProto
156 | 	0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x12, 0xc8, 0x48, 0xcd, 0xc9,
157 | 	0xc9, 0x2f, 0xcf, 0x2f, 0xca, 0x49, 0xd1, 0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0xe2, 0x42, 0x88,
158 | 	0x28, 0x29, 0x71, 0xf1, 0x78, 0x80, 0x78, 0x41, 0xa9, 0x85, 0xa5, 0xa9, 0xc5, 0x25, 0x42, 0x42,
159 | 	0x5c, 0x2c, 0x79, 0x89, 0xb9, 0xa9, 0x12, 0x8c, 0x0a, 0x8c, 0x1a, 0x9c, 0x41, 0x60, 0xb6, 0x92,
160 | 	0x1a, 0x17, 0x17, 0x54, 0x4d, 0x41, 0x4e, 0xa5, 0x90, 0x04, 0x17, 0x7b, 0x6e, 0x6a, 0x71, 0x71,
161 | 	0x62, 0x3a, 0x4c, 0x11, 0x8c, 0x6b, 0xe4, 0xc9, 0xc5, 0xee, 0x5e, 0x94, 0x9a, 0x5a, 0x92, 0x5a,
162 | 	0x24, 0x64, 0xc7, 0xc5, 0x11, 0x9c, 0x58, 0x09, 0xd6, 0x25, 0x24, 0xa1, 0x87, 0xe4, 0x02, 0x64,
163 | 	0xcb, 0xa4, 0xc4, 0xb0, 0xc8, 0x14, 0xe4, 0x54, 0x2a, 0x31, 0x38, 0x19, 0x72, 0x89, 0x25, 0xe7,
164 | 	0xe7, 0xea, 0xe5, 0x66, 0xa6, 0xa4, 0x26, 0xea, 0x65, 0x15, 0xe9, 0x95, 0xa4, 0x16, 0x97, 0xe8,
165 | 	0xa5, 0x17, 0x15, 0x24, 0x3b, 0xf1, 0x83, 0xd5, 0x85, 0x83, 0xb4, 0x04, 0x80, 0x7c, 0x13, 0xc0,
166 | 	0xb8, 0x88, 0x89, 0xd9, 0xc3, 0x27, 0x3c, 0x89, 0x0d, 0xec, 0x39, 0x63, 0x40, 0x00, 0x00, 0x00,
167 | 	0xff, 0xff, 0x99, 0x17, 0x1f, 0xe3, 0xf0, 0x00, 0x00, 0x00,
168 | }
169 | 


--------------------------------------------------------------------------------
/discovery/helloworld/helloworld.proto:
--------------------------------------------------------------------------------
// Schema for the helloworld example: one Greeter service with a single
// unary SayHello RPC and its request/reply messages.
syntax = "proto3";

// Code-generation options for the Java and Objective-C plugins.
option java_multiple_files = true;
option java_package = "com.midea.jr.test.grpc";
option java_outer_classname = "HelloWorldProto";
option objc_class_prefix = "HLW";

package helloworld;

// The greeting service definition.
service Greeter {
    // Sends a greeting: takes a HelloRequest and returns a HelloReply.
    rpc SayHello (HelloRequest) returns (HelloReply) {
    }
}

// The request message containing the user's name.
message HelloRequest {
    // Name to greet.
    string name = 1;
}

// The response message containing the greetings
message HelloReply {
    // Greeting text produced by the server.
    string message = 1;
}
26 | 


--------------------------------------------------------------------------------
/discovery/instance.go:
--------------------------------------------------------------------------------
 1 | package discovery
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 	"errors"
 6 | 	"fmt"
 7 | 	"strings"
 8 | 
 9 | 	"google.golang.org/grpc/resolver"
10 | )
11 | 
// Server is the JSON-serializable description of one service instance.
// Name and Version form the key prefix (see BuildPrefix); Addr is appended
// to that prefix to form the registration path (see BuildRegPath).
type Server struct {
	Name    string `json:"name"`    // service name
	Addr    string `json:"addr"`    // service address
	Version string `json:"version"` // service version
	Weight  int64  `json:"weight"`  // service weight
}
18 | 
19 | func BuildPrefix(info Server) string {
20 | 	if info.Version == "" {
21 | 		return fmt.Sprintf("/%s/", info.Name)
22 | 	}
23 | 	return fmt.Sprintf("/%s/%s/", info.Name, info.Version)
24 | }
25 | 
26 | func BuildRegPath(info Server) string {
27 | 	return fmt.Sprintf("%s%s", BuildPrefix(info), info.Addr)
28 | }
29 | 
30 | func ParseValue(value []byte) (Server, error) {
31 | 	info := Server{}
32 | 	if err := json.Unmarshal(value, &info); err != nil {
33 | 		return info, err
34 | 	}
35 | 	return info, nil
36 | }
37 | 
38 | func SplitPath(path string) (Server, error) {
39 | 	info := Server{}
40 | 	strs := strings.Split(path, "/")
41 | 	if len(strs) == 0 {
42 | 		return info, errors.New("invalid path")
43 | 	}
44 | 	info.Addr = strs[len(strs)-1]
45 | 	return info, nil
46 | }
47 | 
48 | // Exist helper function
49 | func Exist(l []resolver.Address, addr resolver.Address) bool {
50 | 	for i := range l {
51 | 		if l[i].Addr == addr.Addr {
52 | 			return true
53 | 		}
54 | 	}
55 | 	return false
56 | }
57 | 
58 | // Remove helper function
59 | func Remove(s []resolver.Address, addr resolver.Address) ([]resolver.Address, bool) {
60 | 	for i := range s {
61 | 		if s[i].Addr == addr.Addr {
62 | 			s[i] = s[len(s)-1]
63 | 			return s[:len(s)-1], true
64 | 		}
65 | 	}
66 | 	return nil, false
67 | }
68 | 
69 | func BuildResolverUrl(app string) string {
70 | 	return schema + ":///" + app
71 | }
72 | 


--------------------------------------------------------------------------------
/discovery/register.go:
--------------------------------------------------------------------------------
  1 | package discovery
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"encoding/json"
  6 | 	"errors"
  7 | 	"fmt"
  8 | 	"net/http"
  9 | 	"strconv"
 10 | 	"strings"
 11 | 	"time"
 12 | 
 13 | 	clientv3 "go.etcd.io/etcd/client/v3"
 14 | 	"go.uber.org/zap"
 15 | )
 16 | 
// Register for grpc server. It writes a Server record to etcd under a lease
// and keeps that lease alive from a background goroutine.
type Register struct {
	EtcdAddrs   []string // etcd endpoints passed to the client
	DialTimeout int      // timeout in seconds for dialing etcd and granting the lease

	closeCh     chan struct{}                           // receives a value from Stop to end keepAlive
	leasesID    clientv3.LeaseID                        // lease attached to the registration key
	keepAliveCh <-chan *clientv3.LeaseKeepAliveResponse // keep-alive responses for leasesID

	srvInfo Server           // service record being registered
	srvTTL  int64            // lease TTL in seconds
	cli     *clientv3.Client // etcd client, created in Register
	logger  *zap.Logger
}
 31 | 
 32 | // NewRegister create a register base on etcd
 33 | func NewRegister(etcdAddrs []string, logger *zap.Logger) *Register {
 34 | 	return &Register{
 35 | 		EtcdAddrs:   etcdAddrs,
 36 | 		DialTimeout: 3,
 37 | 		logger:      logger,
 38 | 	}
 39 | }
 40 | 
 41 | // Register a service
 42 | func (r *Register) Register(srvInfo Server, ttl int64) (chan<- struct{}, error) {
 43 | 	var err error
 44 | 
 45 | 	if strings.Split(srvInfo.Addr, ":")[0] == "" {
 46 | 		return nil, errors.New("invalid ip")
 47 | 	}
 48 | 
 49 | 	if r.cli, err = clientv3.New(clientv3.Config{
 50 | 		Endpoints:   r.EtcdAddrs,
 51 | 		DialTimeout: time.Duration(r.DialTimeout) * time.Second,
 52 | 	}); err != nil {
 53 | 		return nil, err
 54 | 	}
 55 | 
 56 | 	r.srvInfo = srvInfo
 57 | 	r.srvTTL = ttl
 58 | 
 59 | 	if err = r.register(); err != nil {
 60 | 		return nil, err
 61 | 	}
 62 | 
 63 | 	r.closeCh = make(chan struct{})
 64 | 
 65 | 	go r.keepAlive()
 66 | 
 67 | 	return r.closeCh, nil
 68 | }
 69 | 
 70 | // Stop stop register
 71 | func (r *Register) Stop() {
 72 | 	r.closeCh <- struct{}{}
 73 | }
 74 | 
 75 | // register 注册节点
 76 | func (r *Register) register() error {
 77 | 	leaseCtx, cancel := context.WithTimeout(context.Background(), time.Duration(r.DialTimeout)*time.Second)
 78 | 	defer cancel()
 79 | 
 80 | 	leaseResp, err := r.cli.Grant(leaseCtx, r.srvTTL)
 81 | 	if err != nil {
 82 | 		return err
 83 | 	}
 84 | 	r.leasesID = leaseResp.ID
 85 | 	if r.keepAliveCh, err = r.cli.KeepAlive(context.Background(), leaseResp.ID); err != nil {
 86 | 		return err
 87 | 	}
 88 | 
 89 | 	data, err := json.Marshal(r.srvInfo)
 90 | 	if err != nil {
 91 | 		return err
 92 | 	}
 93 | 	_, err = r.cli.Put(context.Background(), BuildRegPath(r.srvInfo), string(data), clientv3.WithLease(r.leasesID))
 94 | 	return err
 95 | }
 96 | 
 97 | // unregister 删除节点
 98 | func (r *Register) unregister() error {
 99 | 	_, err := r.cli.Delete(context.Background(), BuildRegPath(r.srvInfo))
100 | 	return err
101 | }
102 | 
103 | // keepAlive
104 | func (r *Register) keepAlive() {
105 | 	ticker := time.NewTicker(time.Duration(r.srvTTL) * time.Second)
106 | 	for {
107 | 		select {
108 | 		case <-r.closeCh:
109 | 			fmt.Println("+++++++")
110 | 			if err := r.unregister(); err != nil {
111 | 				r.logger.Error("unregister failed", zap.Error(err))
112 | 			}
113 | 			if _, err := r.cli.Revoke(context.Background(), r.leasesID); err != nil {
114 | 				r.logger.Error("revoke failed", zap.Error(err))
115 | 			}
116 | 			return
117 | 		case res := <-r.keepAliveCh:
118 | 			if res == nil {
119 | 				if err := r.register(); err != nil {
120 | 					r.logger.Error("register failed", zap.Error(err))
121 | 				}
122 | 			}
123 | 		case <-ticker.C:
124 | 			if r.keepAliveCh == nil {
125 | 				if err := r.register(); err != nil {
126 | 					r.logger.Error("register failed", zap.Error(err))
127 | 				}
128 | 			}
129 | 		}
130 | 	}
131 | }
132 | 
133 | // UpdateHandler return http handler
134 | func (r *Register) UpdateHandler() http.HandlerFunc {
135 | 	return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
136 | 		wi := req.URL.Query().Get("weight")
137 | 		weight, err := strconv.Atoi(wi)
138 | 		if err != nil {
139 | 			w.WriteHeader(http.StatusBadRequest)
140 | 			w.Write([]byte(err.Error()))
141 | 			return
142 | 		}
143 | 
144 | 		var update = func() error {
145 | 			r.srvInfo.Weight = int64(weight)
146 | 			data, err := json.Marshal(r.srvInfo)
147 | 			if err != nil {
148 | 				return err
149 | 			}
150 | 			_, err = r.cli.Put(context.Background(), BuildRegPath(r.srvInfo), string(data), clientv3.WithLease(r.leasesID))
151 | 			return err
152 | 		}
153 | 
154 | 		if err := update(); err != nil {
155 | 			w.WriteHeader(http.StatusInternalServerError)
156 | 			w.Write([]byte(err.Error()))
157 | 			return
158 | 		}
159 | 		w.Write([]byte("update server weight success"))
160 | 	})
161 | }
162 | 
163 | func (r *Register) GetServerInfo() (Server, error) {
164 | 	resp, err := r.cli.Get(context.Background(), BuildRegPath(r.srvInfo))
165 | 	if err != nil {
166 | 		return r.srvInfo, err
167 | 	}
168 | 	info := Server{}
169 | 	if resp.Count >= 1 {
170 | 		if err := json.Unmarshal(resp.Kvs[0].Value, &info); err != nil {
171 | 			return info, err
172 | 		}
173 | 	}
174 | 	return info, nil
175 | }
176 | 


--------------------------------------------------------------------------------
/discovery/register_test.go:
--------------------------------------------------------------------------------
 1 | package discovery
 2 | 
 3 | import (
 4 | 	"log"
 5 | 	"net/http"
 6 | 	"net/http/httptest"
 7 | 	"testing"
 8 | 	"time"
 9 | 
10 | 	"go.uber.org/zap"
11 | )
12 | 
// TestRegister registers a service, changes its weight through UpdateHandler
// and checks that the record read back carries the new weight. It connects to
// the etcd endpoint listed in addrs (127.0.0.1:2379).
func TestRegister(t *testing.T) {
	info := Server{
		Name:    "user",
		Addr:    "localhost:8083",
		Version: "1.0.0",
		Weight:  2,
	}

	addrs := []string{"127.0.0.1:2379"}
	r := NewRegister(addrs, zap.NewNop())

	// Register with a lease TTL of 2 seconds.
	_, err := r.Register(info, 2)
	if err != nil {
		t.Fatalf("register to etcd failed %v", err)
	}

	infoRes, err := r.GetServerInfo()
	if err != nil {
		t.Fatalf("get info failed %v", err)
	}
	log.Println(infoRes)
	time.Sleep(2 * time.Second)

	// Drive the handler directly through a recorder; no HTTP server is started.
	req, err := http.NewRequest("GET", "/weight?weight=3", nil)
	if err != nil {
		t.Fatalf("init request failed: %v", err)
	}
	rr := httptest.NewRecorder()
	r.UpdateHandler().ServeHTTP(rr, req)

	if status := rr.Code; status != http.StatusOK {
		t.Errorf("returned wrong status code: got %v want %v", status, http.StatusOK)
	}

	// The record read back must now carry the updated weight.
	infoRes, err = r.GetServerInfo()
	if err != nil {
		t.Fatalf("get info failed %v", err)
	}
	log.Println(infoRes)
	if infoRes.Weight != 3 {
		t.Fatal("update weight error")
	}

	time.Sleep(5 * time.Second)

	//r.Stop()
}
60 | 


--------------------------------------------------------------------------------
/discovery/resolver.go:
--------------------------------------------------------------------------------
  1 | package discovery
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"time"
  6 | 
  7 | 	"go.uber.org/zap"
  8 | 
  9 | 	"go.etcd.io/etcd/api/v3/mvccpb"
 10 | 	clientv3 "go.etcd.io/etcd/client/v3"
 11 | 	"google.golang.org/grpc/resolver"
 12 | )
 13 | 
const (
	// schema is the URL scheme of this resolver; NewResolver stores it as
	// the value returned by Scheme and BuildResolverUrl uses it to build
	// dial targets.
	schema = "etcd"
)
 17 | 
// Resolver for grpc client. The same value acts as the builder (Scheme,
// Build) and as the resolver returned by Build (ResolveNow, Close).
type Resolver struct {
	schema      string   // value returned by Scheme
	EtcdAddrs   []string // etcd endpoints passed to the client
	DialTimeout int      // etcd dial timeout in seconds

	closeCh      chan struct{}      // receives a value from Close to end the watch loop
	watchCh      clientv3.WatchChan // etcd watch on keyPrifix
	cli          *clientv3.Client   // etcd client, created in start
	keyPrifix    string             // etcd key prefix derived from the dial target in Build
	srvAddrsList []resolver.Address // address list last pushed to cc

	cc     resolver.ClientConn // connection that receives address updates
	logger *zap.Logger
}
 33 | 
 34 | // NewResolver create a new resolver.Builder base on etcd
 35 | func NewResolver(etcdAddrs []string, logger *zap.Logger) *Resolver {
 36 | 	return &Resolver{
 37 | 		schema:      schema,
 38 | 		EtcdAddrs:   etcdAddrs,
 39 | 		DialTimeout: 3,
 40 | 		logger:      logger,
 41 | 	}
 42 | }
 43 | 
// Scheme returns the scheme supported by this resolver, i.e. the schema value
// stored on the Resolver by NewResolver.
func (r *Resolver) Scheme() string {
	return r.schema
}
 48 | 
 49 | // Build creates a new resolver.Resolver for the given target
 50 | func (r *Resolver) Build(target resolver.Target, cc resolver.ClientConn, opts resolver.BuildOptions) (resolver.Resolver, error) {
 51 | 	r.cc = cc
 52 | 
 53 | 	r.keyPrifix = BuildPrefix(Server{Name: target.Endpoint, Version: target.Authority})
 54 | 	if _, err := r.start(); err != nil {
 55 | 		return nil, err
 56 | 	}
 57 | 	return r, nil
 58 | }
 59 | 
// ResolveNow implements resolver.Resolver. It does nothing; address updates
// are pushed from the watch loop instead.
func (r *Resolver) ResolveNow(o resolver.ResolveNowOptions) {}
 62 | 
// Close implements resolver.Resolver. It sends on closeCh to stop the watch
// loop; the channel is unbuffered, so the send blocks until the loop
// receives it.
func (r *Resolver) Close() {
	r.closeCh <- struct{}{}
}
 67 | 
 68 | // start
 69 | func (r *Resolver) start() (chan<- struct{}, error) {
 70 | 	var err error
 71 | 	r.cli, err = clientv3.New(clientv3.Config{
 72 | 		Endpoints:   r.EtcdAddrs,
 73 | 		DialTimeout: time.Duration(r.DialTimeout) * time.Second,
 74 | 	})
 75 | 	if err != nil {
 76 | 		return nil, err
 77 | 	}
 78 | 	resolver.Register(r)
 79 | 
 80 | 	r.closeCh = make(chan struct{})
 81 | 
 82 | 	if err = r.sync(); err != nil {
 83 | 		return nil, err
 84 | 	}
 85 | 
 86 | 	go r.watch()
 87 | 
 88 | 	return r.closeCh, nil
 89 | }
 90 | 
 91 | // watch update events
 92 | func (r *Resolver) watch() {
 93 | 	ticker := time.NewTicker(time.Minute)
 94 | 	r.watchCh = r.cli.Watch(context.Background(), r.keyPrifix, clientv3.WithPrefix())
 95 | 
 96 | 	for {
 97 | 		select {
 98 | 		case <-r.closeCh:
 99 | 			return
100 | 		case res, ok := <-r.watchCh:
101 | 			if ok {
102 | 				r.update(res.Events)
103 | 			}
104 | 		case <-ticker.C:
105 | 			if err := r.sync(); err != nil {
106 | 				r.logger.Error("sync failed", zap.Error(err))
107 | 			}
108 | 		}
109 | 	}
110 | }
111 | 
112 | // update
113 | func (r *Resolver) update(events []*clientv3.Event) {
114 | 	for _, ev := range events {
115 | 		var info Server
116 | 		var err error
117 | 
118 | 		switch ev.Type {
119 | 		case mvccpb.PUT:
120 | 			info, err = ParseValue(ev.Kv.Value)
121 | 			if err != nil {
122 | 				continue
123 | 			}
124 | 			addr := resolver.Address{Addr: info.Addr, Metadata: info.Weight}
125 | 			if !Exist(r.srvAddrsList, addr) {
126 | 				r.srvAddrsList = append(r.srvAddrsList, addr)
127 | 				r.cc.UpdateState(resolver.State{Addresses: r.srvAddrsList})
128 | 			}
129 | 		case mvccpb.DELETE:
130 | 			info, err = SplitPath(string(ev.Kv.Key))
131 | 			if err != nil {
132 | 				continue
133 | 			}
134 | 			addr := resolver.Address{Addr: info.Addr}
135 | 			if s, ok := Remove(r.srvAddrsList, addr); ok {
136 | 				r.srvAddrsList = s
137 | 				r.cc.UpdateState(resolver.State{Addresses: r.srvAddrsList})
138 | 			}
139 | 		}
140 | 	}
141 | }
142 | 
143 | // sync 同步获取所有地址信息
144 | func (r *Resolver) sync() error {
145 | 	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
146 | 	defer cancel()
147 | 	res, err := r.cli.Get(ctx, r.keyPrifix, clientv3.WithPrefix())
148 | 	if err != nil {
149 | 		return err
150 | 	}
151 | 	r.srvAddrsList = []resolver.Address{}
152 | 
153 | 	for _, v := range res.Kvs {
154 | 		info, err := ParseValue(v.Value)
155 | 		if err != nil {
156 | 			continue
157 | 		}
158 | 		addr := resolver.Address{Addr: info.Addr, Metadata: info.Weight}
159 | 		r.srvAddrsList = append(r.srvAddrsList, addr)
160 | 	}
161 | 	r.cc.UpdateState(resolver.State{Addresses: r.srvAddrsList})
162 | 	return nil
163 | }
164 | 


--------------------------------------------------------------------------------
/discovery/resolver_test.go:
--------------------------------------------------------------------------------
 1 | package discovery
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"fmt"
 6 | 	"log"
 7 | 	"net"
 8 | 	"testing"
 9 | 	"time"
10 | 
11 | 	"go.uber.org/zap"
12 | 	"google.golang.org/grpc/balancer/roundrobin"
13 | 	"google.golang.org/grpc/resolver"
14 | 
15 | 	"etcd-learning/discovery/helloworld"
16 | 
17 | 	"google.golang.org/grpc"
18 | )
19 | 
// etcdAddrs lists the etcd endpoints used by the resolver test and by the
// helper servers it starts.
var etcdAddrs = []string{"127.0.0.1:2379"}
21 | 
// TestResolver starts five greeter servers that register themselves in etcd,
// dials them through the etcd resolver with round-robin balancing and issues
// ten SayHello calls. It uses the etcd endpoint listed in etcdAddrs.
func TestResolver(t *testing.T) {
	r := NewResolver(etcdAddrs, zap.NewNop())
	resolver.Register(r)

	// Register five services in etcd.
	go newServer(t, ":1001", "1.0.0", 1)
	go newServer(t, ":1002", "1.0.0", 1)
	go newServer(t, ":1003", "1.0.0", 1)
	go newServer(t, ":1004", "1.0.0", 1)
	go newServer(t, ":1006", "1.0.0", 10)

	conn, err := grpc.Dial("etcd:///hello", grpc.WithInsecure(), grpc.WithBalancerName(roundrobin.Name))
	if err != nil {
		t.Fatalf("failed to dial %v", err)
	}
	defer conn.Close()

	c := helloworld.NewGreeterClient(conn)

	// Issue ten requests.
	for i := 0; i < 10; i++ {
		resp, err := c.SayHello(context.Background(), &helloworld.HelloRequest{Name: "abc"})
		if err != nil {
			t.Fatalf("say hello failed %v", err)
		}
		log.Println(resp.Message)
		time.Sleep(100 * time.Millisecond)
	}

	time.Sleep(10 * time.Second)
}
53 | 
// server is the greeter implementation used by the test servers.
type server struct {
	Port string // listen port, echoed back in the SayHello reply
}
57 | 
58 | // SayHello implements helloworld.GreeterServer
59 | func (s *server) SayHello(ctx context.Context, in *helloworld.HelloRequest) (*helloworld.HelloReply, error) {
60 | 	return &helloworld.HelloReply{Message: fmt.Sprintf("Hello From %s", s.Port)}, nil
61 | }
62 | 
63 | func newServer(t *testing.T, port string, version string, weight int64) {
64 | 	register := NewRegister(etcdAddrs, zap.NewNop())
65 | 	defer register.Stop()
66 | 
67 | 	listen, err := net.Listen("tcp", port)
68 | 	if err != nil {
69 | 		log.Fatalf("failed to listen %v", err)
70 | 	}
71 | 
72 | 	s := grpc.NewServer()
73 | 	helloworld.RegisterGreeterServer(s, &server{Port: port})
74 | 
75 | 	info := Server{
76 | 		Name:    "hello",
77 | 		Addr:    fmt.Sprintf("127.0.0.1%s", port),
78 | 		Version: version,
79 | 		Weight:  weight,
80 | 	}
81 | 
82 | 	register.Register(info, 10)
83 | 
84 | 	if err := s.Serve(listen); err != nil {
85 | 		log.Fatalf("failed to server %v", err)
86 | 	}
87 | }
88 | 


--------------------------------------------------------------------------------
/etcdMutex/README.md:
--------------------------------------------------------------------------------
1 | ## etcd
2 | 
3 | 通过etcd实现锁     


--------------------------------------------------------------------------------
/etcdMutex/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"fmt"
 6 | 	"log"
 7 | 	"time"
 8 | 
 9 | 	clientv3 "go.etcd.io/etcd/client/v3"
10 | 	"go.etcd.io/etcd/client/v3/concurrency"
11 | )
12 | 
13 | func main() {
14 | 	cli, err := clientv3.New(clientv3.Config{
15 | 		Endpoints:   []string{"localhost:2379"},
16 | 		DialTimeout: 5 * time.Second,
17 | 	})
18 | 	if err != nil {
19 | 		log.Fatal(err)
20 | 	}
21 | 	defer cli.Close()
22 | 	ctx := context.Background()
23 | 	// m1来抢锁
24 | 	go func() {
25 | 		s1, err := concurrency.NewSession(cli)
26 | 		if err != nil {
27 | 			log.Fatal(err)
28 | 		}
29 | 		defer s1.Close()
30 | 		m1 := concurrency.NewMutex(s1, "/my-lock/")
31 | 
32 | 		// acquire lock for s1
33 | 		if err := m1.Lock(ctx); err != nil {
34 | 			log.Fatal(err)
35 | 		}
36 | 		fmt.Println("m1---获得了锁")
37 | 
38 | 		time.Sleep(time.Second * 3)
39 | 
40 | 		// 释放锁
41 | 		if err := m1.Unlock(ctx); err != nil {
42 | 			log.Fatal(err)
43 | 		}
44 | 		fmt.Println("m1++释放了锁")
45 | 	}()
46 | 
47 | 	// m2来抢锁
48 | 	go func() {
49 | 		s2, err := concurrency.NewSession(cli)
50 | 		if err != nil {
51 | 			log.Fatal(err)
52 | 		}
53 | 		defer s2.Close()
54 | 		m2 := concurrency.NewMutex(s2, "/my-lock/")
55 | 		if err := m2.Lock(ctx); err != nil {
56 | 			log.Fatal(err)
57 | 		}
58 | 		fmt.Println("m2---获得了锁")
59 | 
60 | 		// mock业务执行的时间
61 | 		time.Sleep(time.Second * 3)
62 | 
63 | 		// 释放锁
64 | 		if err := m2.Unlock(ctx); err != nil {
65 | 			log.Fatal(err)
66 | 		}
67 | 
68 | 		fmt.Println("m2++释放了锁")
69 | 	}()
70 | 
71 | 	time.Sleep(time.Second * 10)
72 | }
73 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module etcd-learning
 2 | 
 3 | go 1.16
 4 | 
 5 | require (
 6 | 	github.com/golang/protobuf v1.5.2
 7 | 	go.etcd.io/etcd/api/v3 v3.5.0
 8 | 	go.etcd.io/etcd/client/pkg/v3 v3.5.0
 9 | 	go.etcd.io/etcd/client/v3 v3.5.0
10 | 	go.etcd.io/etcd/raft/v3 v3.5.0
11 | 	go.etcd.io/etcd/server/v3 v3.5.0
12 | 	go.uber.org/zap v1.18.1
13 | 	golang.org/x/net v0.0.0-20210716203947-853a461950ff
14 | 	google.golang.org/grpc v1.39.0
15 | 	gopkg.in/mgo.v2 v2.0.0-20190816093944-a6b53ec6cb22
16 | )
17 | 


--------------------------------------------------------------------------------
/img/etcd-balance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-balance.png


--------------------------------------------------------------------------------
/img/etcd-consul.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-consul.webp


--------------------------------------------------------------------------------
/img/etcd-leader.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-leader.png


--------------------------------------------------------------------------------
/img/etcd-lease.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-lease.png


--------------------------------------------------------------------------------
/img/etcd-lock.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-lock.png


--------------------------------------------------------------------------------
/img/etcd-mvcc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-mvcc.png


--------------------------------------------------------------------------------
/img/etcd-notify.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-notify.png


--------------------------------------------------------------------------------
/img/etcd-raft-node-pre.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-raft-node-pre.png


--------------------------------------------------------------------------------
/img/etcd-raft-node.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-raft-node.png


--------------------------------------------------------------------------------
/img/etcd-raft-read-follower.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-raft-read-follower.png


--------------------------------------------------------------------------------
/img/etcd-raft-read-leader.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-raft-read-leader.png


--------------------------------------------------------------------------------
/img/etcd-raft-wal.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-raft-wal.jpg


--------------------------------------------------------------------------------
/img/etcd-raftExample.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-raftExample.jpg


--------------------------------------------------------------------------------
/img/etcd-register.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-register.png


--------------------------------------------------------------------------------
/img/etcd-register_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-register_1.png


--------------------------------------------------------------------------------
/img/etcd-server-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-server-1.png


--------------------------------------------------------------------------------
/img/etcd-server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-server.png


--------------------------------------------------------------------------------
/img/etcd-watch-client.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-watch-client.png


--------------------------------------------------------------------------------
/img/etcd-watch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd-watch.png


--------------------------------------------------------------------------------
/img/etcd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd.png


--------------------------------------------------------------------------------
/img/etcd_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/etcd_1.png


--------------------------------------------------------------------------------
/img/grpc_balance_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/grpc_balance_1.png


--------------------------------------------------------------------------------
/img/grpc_balance_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/grpc_balance_2.png


--------------------------------------------------------------------------------
/img/grpc_balance_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/grpc_balance_3.png


--------------------------------------------------------------------------------
/img/grpc_balance_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/grpc_balance_4.png


--------------------------------------------------------------------------------
/img/k8s-etcd.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/k8s-etcd.webp


--------------------------------------------------------------------------------
/img/raft-leader.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/raft-leader.png


--------------------------------------------------------------------------------
/img/raft-log_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/raft-log_1.png


--------------------------------------------------------------------------------
/img/raft-net.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/raft-net.png


--------------------------------------------------------------------------------
/img/raft_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/raft_1.png


--------------------------------------------------------------------------------
/img/raftexample.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/raftexample.jpg


--------------------------------------------------------------------------------
/img/zookeeper.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/boilingfrog/etcd-learning/8d8ccb19b7503d5ba1924a80267732efc5705f7e/img/zookeeper.webp


--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"sync"
 6 | 	"time"
 7 | )
 8 | 
// test bundles a map with the two locks that main takes around every write
// to it.
type test struct {
	mu    sync.Mutex // guards fields below it
	revMu sync.RWMutex
	mapp  map[string]string
}
14 | 
15 | func main() {
16 | 	test := test{
17 | 		mapp: map[string]string{
18 | 			"test1": "1",
19 | 			"test2": "2",
20 | 			"test3": "3",
21 | 			"test4": "4",
22 | 		},
23 | 	}
24 | 
25 | 	go func() {
26 | 		test.mu.Lock()
27 | 		test.revMu.RLock()
28 | 		test.mapp["xxx1"] = "xxxx1"
29 | 		fmt.Println(test.mapp)
30 | 
31 | 		test.revMu.RUnlock()
32 | 
33 | 		test.mu.Unlock()
34 | 	}()
35 | 
36 | 	go func() {
37 | 		test.mu.Lock()
38 | 		test.revMu.RLock()
39 | 		test.mapp["xxx"] = "xxxx"
40 | 		fmt.Println("++++", test.mapp)
41 | 
42 | 		test.revMu.RUnlock()
43 | 
44 | 		test.mu.Unlock()
45 | 	}()
46 | 
47 | 	time.Sleep(time.Second * 1)
48 | }
49 | 


--------------------------------------------------------------------------------
/other.md:
--------------------------------------------------------------------------------
  1 | <!-- START doctoc generated TOC please keep comment here to allow auto update -->
  2 | <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
  3 | 
  4 | - [etcd 中的点](#etcd-%E4%B8%AD%E7%9A%84%E7%82%B9)
  5 |   - [k8s 如何和 etcd 交互](#k8s-%E5%A6%82%E4%BD%95%E5%92%8C-etcd-%E4%BA%A4%E4%BA%92)
  6 |     - [Resource Version 与 etcd 版本号](#resource-version-%E4%B8%8E-etcd-%E7%89%88%E6%9C%AC%E5%8F%B7)
  7 |   - [如何支撑 k8s 中的上万节点](#%E5%A6%82%E4%BD%95%E6%94%AF%E6%92%91-k8s-%E4%B8%AD%E7%9A%84%E4%B8%8A%E4%B8%87%E8%8A%82%E7%82%B9)
  8 |     - [如何减少 expensive request](#%E5%A6%82%E4%BD%95%E5%87%8F%E5%B0%91-expensive-request)
  9 |   - [etcd 中分布式锁对比 redis 的安全性](#etcd-%E4%B8%AD%E5%88%86%E5%B8%83%E5%BC%8F%E9%94%81%E5%AF%B9%E6%AF%94-redis-%E7%9A%84%E5%AE%89%E5%85%A8%E6%80%A7)
 10 |     - [Redis 中分布式锁的缺点](#redis-%E4%B8%AD%E5%88%86%E5%B8%83%E5%BC%8F%E9%94%81%E7%9A%84%E7%BC%BA%E7%82%B9)
 11 |     - [使用 etcd 作为分布式锁的优点](#%E4%BD%BF%E7%94%A8-etcd-%E4%BD%9C%E4%B8%BA%E5%88%86%E5%B8%83%E5%BC%8F%E9%94%81%E7%9A%84%E4%BC%98%E7%82%B9)
 12 |   - [参考](#%E5%8F%82%E8%80%83)
 13 | 
 14 | <!-- END doctoc generated TOC please keep comment here to allow auto update -->
 15 | 
 16 | ## etcd 中的点
 17 | 
 18 | ### k8s 如何和 etcd 交互
 19 | 
 20 | 先来看下 k8s 中的架构  
 21 | 
 22 | <img src="/img/k8s-etcd.webp" alt="etcd" align=center/>  
 23 | 
 24 | etcd 主要是进行 Kubernetes 的元数据存储  
 25 | 
 26 | 可以看到主要是 kube-apiserver 和 etcd 进行交互的   
 27 | 
 28 | kube-apiserver: 负责对外提供集群各类资源的增删改查及 Watch 接口，它是 Kubernetes 集群中各组件数据交互和通信的枢纽。kube-apiserver 在设计上可水平扩展，高可用 Kubernetes 集群中一般多副本部署。当收到一个创建 Pod 写请求时，它的基本流程是对请求进行认证、限速、授权、准入机制等检查后，写入到 etcd 即可。  
 29 | 
 30 | 我们知道 k8s 中有 namespace 和标签，那么对于这种查询，是如何保持效率呢？  
 31 | 
 32 | - 按照资源名称查询  
 33 | 
 34 | 按具体资源名称查询。它本质就是个 key-value 查询，只需要写入 etcd 的 key 名称与资源 key 一致即可。  
 35 | 
 36 | - 按 namespace 查询  
 37 | 
 38 | 按照 namespace 查询，因为我们知道 etcd 支持范围查询，若 key 名称前缀包含 namespace、资源类型，查询的时候指定 namespace 和资源类型的组合的最小开始区间、最大结束区间即可。  
 39 | 
 40 | - 按照标签查询  
 41 | 
 42 | Kubernetes 中的查询方案是由 kube-apiserver 通过范围遍历 etcd 获取原始数据，然后基于用户指定标签，来筛选符合条件的资源返回给 client。  
 43 | 
 44 | 当前缺点就是大量标签查询可能会导致 etcd 大流量等异常情况发生  
 45 | 
 46 | 这里来看下 Kubernetes 集群中的 coredns 一系列资源在 etcd 中的存储格式  
 47 | 
 48 | ```
 49 | /registry/clusterrolebindings/system:coredns
 50 | /registry/clusterroles/system:coredns
 51 | /registry/configmaps/kube-system/coredns
 52 | /registry/deployments/kube-system/coredns
 53 | /registry/events/kube-system/coredns-7fcc6d65dc-6njlg.1662c287aabf742b
 54 | /registry/events/kube-system/coredns-7fcc6d65dc-6njlg.1662c288232143ae
 55 | /registry/pods/kube-system/coredns-7fcc6d65dc-jvj26
 56 | /registry/pods/kube-system/coredns-7fcc6d65dc-mgvtb
 57 | /registry/pods/kube-system/coredns-7fcc6d65dc-whzq9
 58 | /registry/replicasets/kube-system/coredns-7fcc6d65dc
 59 | /registry/secrets/kube-system/coredns-token-hpqbt
 60 | /registry/serviceaccounts/kube-system/coredns
 61 | ```
 62 | 
 63 | Kubernetes 资源在 etcd 中的存储格式由 prefix + "/" + 资源类型 + "/" + namespace + "/" + 具体资源名组成，基于 etcd 提供的范围查询能力，非常简单地支持了按具体资源名称查询和 namespace 查询。  
 64 | 
 65 | #### Resource Version 与 etcd 版本号
 66 | 
 67 | Kubernetes 集群中，它提供了什么概念来实现增量监听逻辑呢？  
 68 | 
 69 | Resource Version  
 70 | 
 71 | Resource Version 是 Kubernetes API 中非常重要的一个概念，顾名思义，它是一个 Kubernetes 资源的内部版本字符串，client 可通过它来判断资源是否发生了变化。同时，你可以在 Get、List、Watch 接口中，通过指定 Resource Version 值来满足你对数据一致性、高性能等诉求。  
 72 | 
 73 | 下面从 Get 和 Watch 接口中的 Resource Version 参数值为例，来看下和 etcd 的关系  
 74 | 
 75 | 在 Get 请求查询案例中，ResourceVersion 主要有以下这三种取值： 
 76 | 
 77 | - 指定 ResourceVersion 默认空字符串  
 78 | 
 79 | kube-apiserver 收到一个此类型的读请求后，它会向 etcd 发出共识读 / 线性读请求获取 etcd 集群最新的数据。  
 80 | 
 81 | - 指定 ResourceVersion="0"  
 82 | 
 83 | kube-apiserver 收到此类请求时，它可能会返回任意资源版本号的数据，但是优先返回较新版本。一般情况下它直接从 kube-apiserver 缓存中获取数据返回给 client，有可能读到过期的数据，适用于对数据一致性要求不高的场景。  
 84 | 
 85 | - 设置 ResourceVersion 为一个非 0 的字符串  
 86 | 
 87 | kube-apiserver 收到此类请求时，它会保证 Cache 中的最新 ResourceVersion 大于等于你传入的 ResourceVersion，然后从 Cache 中查找你请求的资源对象 key，返回数据给 client。基本原理是 kube-apiserver 为各个核心资源（如 Pod）维护了一个 Cache，通过 etcd 的 Watch 机制来实时更新 Cache。当你的 Get 请求中携带了非 0 的 ResourceVersion，它会等待缓存中最新 ResourceVersion 大于等于你 Get 请求中的 ResourceVersion，若满足条件则从 Cache 中查询数据，返回给 client。若不满足条件，它最多等待 3 秒，若超过 3 秒，Cache 中的最新 ResourceVersion 还小于 Get 请求中的 ResourceVersion，就会返回 ResourceVersionTooLarge 错误给 client。  
 88 | 
 89 | 再看下 watch 请求查询案例中，ResourceVersion 主要有以下这三种取值：   
 90 | 
 91 | - 未指定 ResourceVersion（默认为空字符串）  
 92 | 
 93 | 一方面为了帮助 client 建立初始状态，它会将当前已存在的资源通过 Add 事件返回给 client。另一方面，它会从 etcd 当前版本号开始监听，后续新增写请求导致数据变化时可及时推送给 client。  
 94 | 
 95 | - 指定 ResourceVersion="0"  
 96 | 
 97 | 它同样会帮助 client 建立初始状态，但是它会从任意版本号开始监听，这种场景可能导致集群返回陈旧的数据。  
 98 | 
 99 | - 指定 ResourceVersion 为一个非 0 的字符串
100 | 
101 | 从精确的版本号开始监听数据，它只会返回大于等于精确版本号的变更事件。  
102 | 
103 | ### 如何支撑 k8s 中的上万节点
104 | 
105 | 大规模 Kubernetes 集群的外在表现是节点数成千上万，资源对象数量高达几十万。本质是更频繁地查询、写入更大的资源对象。  
106 | 
107 | 当然大量的节点，对于写入和读取都会产生性能的影响  
108 | 
109 | #### 如何减少 expensive request
110 | 
111 | - 分页  
112 | 
113 | - 资源按 namespace 拆分 
114 | 
115 | - Informer 机制  
116 | 
117 | Informer 机制的 Reflector 封装了 Watch、List 操作，结合本地 Cache、Indexer，实现了控制器加载完初始状态数据后，接下来的其他操作都只需要从本地缓存读取，极大降低了 kube-apiserver 和 etcd 的压力。  
118 | 
119 | - Watch bookmark 机制  
120 | 
121 | Watch bookmark 机制通过新增一个 bookmark 类型的事件来实现的。kube-apiserver 会通过定时器将各类型资源最新的 Resource Version 推送给 kubelet 等 client，在 client 与 kube-apiserver 网络异常重连等场景，大大降低了 client 重建 Watch 的开销，减少了 relist expensive request。  
122 | 
123 | - 更高效的 Watch 恢复机制  
124 | 
125 | ### etcd 中分布式锁对比 redis 的安全性
126 | 
127 | **分布式锁的几个核心要素**  
128 | 
129 | - 第一要素是互斥性、安全性。在同一时间内，不允许多个 client 同时获得锁。  
130 | 
131 | - 第二个要素就是活性  
132 | 
133 | 在实现分布式锁的过程中要考虑到 client 可能会出现 crash 或者网络分区，你需要原子申请分布式锁及设置锁的自动过期时间，通过过期、超时等机制自动释放锁，避免出现死锁，导致业务中断。  
134 | 
135 | - 第三个要素是，高性能、高可用。加锁、释放锁的过程性能开销要尽量低，同时要保证高可用，确保业务不会出现中断。  
136 | 
137 | #### Redis 中分布式锁的缺点
138 | 
139 | 比如主备切换  
140 | 
141 | 比如主节点收到了请求，但是还没有同步到其他节点，然后主节点挂掉了，之后其他节点中有一个节点被选出来变成了主节点，但是刚刚的信息没有同步到，所以客户端的请求过来又会产生信息写入，造成互斥锁被获取到了两次。互斥性和安全性都被破坏掉了。  
142 | 
143 | 主备切换、脑裂是 Redis 分布式锁的两个典型不安全的因素，本质原因是 Redis 为了满足高性能，采用了主备异步复制协议，同时也与负责主备切换的 Redis Sentinel 服务是否合理部署有关。  
144 | 
145 | 对于这种情况，redis 中提供了 RedLock 算法，什么是 RedLock 算法呢？  
146 | 
147 | 它是基于多个独立的 Redis Master 节点的一种实现（一般为 5）。client 依次向各个节点申请锁，若能从多数个节点中申请锁成功并满足一些条件限制，那么 client 就能获取锁成功。  
148 | 
149 | 它通过独立的 N 个 Master 节点，避免了使用主备异步复制协议的缺陷，只要多数 Redis 节点正常就能正常工作，显著提升了分布式锁的安全性、可用性。  
150 | 
151 | #### 使用 etcd 作为分布式锁的优点  
152 | 
153 | **事务与锁的安全性** 
154 | 
155 | 相比 Redis 基于主备异步复制导致锁的安全性问题，etcd 是基于 Raft 共识算法实现的，一个写请求需要经过集群多数节点确认。因此一旦分布式锁申请返回给 client 成功后，它一定是持久化到了集群多数节点上，不会出现 Redis 主备异步复制可能导致丢数据的问题，具备更高的安全性。  
156 | 
157 | **Lease 与锁的活性**
158 | 
159 | Lease 活动检测机制，client 需定期向 etcd 服务发送"特殊心跳"汇报健康状态，若你未正常发送心跳，并超过和 etcd 服务约定的最大存活时间后，就会被 etcd 服务移除此 Lease 和其关联的数据。  
160 | 
161 | **Watch 与锁的可用性**
162 | 
163 | Watch 提供了高效的数据监听能力。当其他 client 收到 Watch Delete 事件后，就可快速判断自己是否有资格获得锁，极大减少了锁的不可用时间。  
164 | 
165 | **etcd 自带的 concurrency 包**
166 | 
167 | etcd 社区提供了一个名为 concurrency 的包，帮助你更简单、正确地使用分布式锁、分布式选举。  
168 | 
169 | 核心流程  
170 | 
171 | - 首先通过 concurrency.NewSession 方法创建 Session，本质是创建了一个 TTL 为 10 的 Lease。  
172 | 
173 | - 其次得到 session 对象后，通过 concurrency.NewMutex 创建了一个 mutex 对象，包含 Lease、key prefix 等信息。  
174 | 
175 | - 然后通过 mutex 对象的 Lock 方法尝试获取锁。  
176 | 
177 | - 最后使用结束，可通过 mutex 对象的 Unlock 方法释放锁。  
178 | 
179 | 多个 client 通过 concurrency 包创建 prefix 相同、名称不同的 key，哪个 key 的 revision 最小，最终就由创建它的 client 获得锁。  
180 | 
181 | 那未获得锁的 client 是如何等待的呢?  
182 | 
183 | 答案是通过 Watch 机制各自监听 prefix 相同，revision 比自己小的 key，因为只有 revision 比自己小的 key 释放锁，我才能有机会，获得锁。  
184 | 
185 | ### 参考  
186 | 
187 | 【etcd实战课】https://time.geekbang.org/column/article/350285    


--------------------------------------------------------------------------------
/queue/README.md:
--------------------------------------------------------------------------------
1 | ## etcd
2 | 
3 | 通过etcd实现队列     


--------------------------------------------------------------------------------
/queue/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 	"time"
 7 | 
 8 | 	clientv3 "go.etcd.io/etcd/client/v3"
 9 | 	recipe "go.etcd.io/etcd/client/v3/experimental/recipes"
10 | )
11 | 
12 | func main() {
13 | 	cli, err := clientv3.New(clientv3.Config{
14 | 		Endpoints: []string{"localhost:2379"},
15 | 	})
16 | 	if err != nil {
17 | 		log.Fatalf("error New (%v)", err)
18 | 	}
19 | 
20 | 	go func() {
21 | 		q := recipe.NewQueue(cli, "testq")
22 | 		for i := 0; i < 5; i++ {
23 | 			if err := q.Enqueue(fmt.Sprintf("%d", i)); err != nil {
24 | 				log.Fatalf("error enqueuing (%v)", err)
25 | 			}
26 | 		}
27 | 	}()
28 | 
29 | 	go func() {
30 | 		q := recipe.NewQueue(cli, "testq")
31 | 		for i := 10; i < 100; i++ {
32 | 			if err := q.Enqueue(fmt.Sprintf("%d", i)); err != nil {
33 | 				log.Fatalf("error enqueuing (%v)", err)
34 | 			}
35 | 		}
36 | 	}()
37 | 
38 | 	q := recipe.NewQueue(cli, "testq")
39 | 	for i := 0; i < 100; i++ {
40 | 		s, err := q.Dequeue()
41 | 		if err != nil {
42 | 			log.Fatalf("error dequeueing (%v)", err)
43 | 		}
44 | 		fmt.Println(s)
45 | 	}
46 | 
47 | 	time.Sleep(time.Second * 3)
48 | }
49 | 


--------------------------------------------------------------------------------
/raftexample/Procfile:
--------------------------------------------------------------------------------
1 | # Use goreman to run `go get github.com/mattn/goreman`
2 | raftexample1: ./raftexample --id 1 --cluster http://127.0.0.1:12379,http://127.0.0.1:22379,http://127.0.0.1:32379 --port 12380
3 | raftexample2: ./raftexample --id 2 --cluster http://127.0.0.1:12379,http://127.0.0.1:22379,http://127.0.0.1:32379 --port 22380
4 | raftexample3: ./raftexample --id 3 --cluster http://127.0.0.1:12379,http://127.0.0.1:22379,http://127.0.0.1:32379 --port 32380
5 | 


--------------------------------------------------------------------------------
/raftexample/README.md:
--------------------------------------------------------------------------------
  1 | # raftexample
  2 | 
  3 | raftexample is an example usage of etcd's [raft library](../../raft). It provides a simple REST API for a key-value store cluster backed by the [Raft][raft] consensus algorithm.
  4 | 
  5 | [raft]: http://raftconsensus.github.io/
  6 | 
  7 | ## Getting Started
  8 | 
  9 | ### Building raftexample
 10 | 
 11 | Clone `etcd` to `<directory>/src/go.etcd.io/etcd`
 12 | 
 13 | ```sh
 14 | export GOPATH=<directory>
 15 | cd <directory>/src/go.etcd.io/etcd/contrib/raftexample
 16 | go build -o raftexample
 17 | ```
 18 | 
 19 | ### Running single node raftexample
 20 | 
 21 | First start a single-member cluster of raftexample:
 22 | 
 23 | ```sh
 24 | raftexample --id 1 --cluster http://127.0.0.1:12379 --port 12380
 25 | ```
 26 | 
 27 | Each raftexample process maintains a single raft instance and a key-value server.
 28 | The process's list of comma separated peers (--cluster), its raft ID index into the peer list (--id), and http key-value server port (--port) are passed through the command line.
 29 | 
 30 | Next, store a value ("hello") to a key ("my-key"):
 31 | 
 32 | ```
 33 | curl -L http://127.0.0.1:12380/my-key -XPUT -d hello
 34 | ```
 35 | 
 36 | Finally, retrieve the stored key:
 37 | 
 38 | ```
 39 | curl -L http://127.0.0.1:12380/my-key
 40 | ```
 41 | 
 42 | ### Running a local cluster
 43 | 
 44 | First install [goreman](https://github.com/mattn/goreman), which manages Procfile-based applications.
 45 | 
 46 | The [Procfile script](Procfile) will set up a local example cluster. Start it with:
 47 | 
 48 | ```sh
 49 | goreman start
 50 | ```
 51 | 
 52 | This will bring up three raftexample instances.
 53 | 
 54 | Now it's possible to write a key-value pair to any member of the cluster and likewise retrieve it from any member.
 55 | 
 56 | ### Fault Tolerance
 57 | 
 58 | To test cluster recovery, first start a cluster and write a value "foo":
 59 | ```sh
 60 | goreman start
 61 | curl -L http://127.0.0.1:12380/my-key -XPUT -d foo
 62 | ```
 63 | 
 64 | Next, remove a node and replace the value with "bar" to check cluster availability:
 65 | 
 66 | ```sh
 67 | goreman run stop raftexample2
 68 | curl -L http://127.0.0.1:12380/my-key -XPUT -d bar
 69 | curl -L http://127.0.0.1:32380/my-key
 70 | ```
 71 | 
 72 | Finally, bring the node back up and verify it recovers with the updated value "bar":
 73 | ```sh
 74 | goreman run start raftexample2
 75 | curl -L http://127.0.0.1:22380/my-key
 76 | ```
 77 | 
 78 | ### Dynamic cluster reconfiguration
 79 | 
 80 | Nodes can be added to or removed from a running cluster using requests to the REST API.
 81 | 
 82 | For example, suppose we have a 3-node cluster that was started with the commands:
 83 | ```sh
 84 | raftexample --id 1 --cluster http://127.0.0.1:12379,http://127.0.0.1:22379,http://127.0.0.1:32379 --port 12380
 85 | raftexample --id 2 --cluster http://127.0.0.1:12379,http://127.0.0.1:22379,http://127.0.0.1:32379 --port 22380
 86 | raftexample --id 3 --cluster http://127.0.0.1:12379,http://127.0.0.1:22379,http://127.0.0.1:32379 --port 32380
 87 | ```
 88 | 
 89 | A fourth node with ID 4 can be added by issuing a POST:
 90 | ```sh
 91 | curl -L http://127.0.0.1:12380/4 -XPOST -d http://127.0.0.1:42379
 92 | ```
 93 | 
 94 | Then the new node can be started as the others were, using the --join option:
 95 | ```sh
 96 | raftexample --id 4 --cluster http://127.0.0.1:12379,http://127.0.0.1:22379,http://127.0.0.1:32379,http://127.0.0.1:42379 --port 42380 --join
 97 | ```
 98 | 
 99 | The new node should join the cluster and be able to service key/value requests.
100 | 
101 | We can remove a node using a DELETE request:
102 | ```sh
103 | curl -L http://127.0.0.1:12380/3 -XDELETE
104 | ```
105 | 
106 | Node 3 should shut itself down once the cluster has processed this request.
107 | 
108 | ## Design
109 | 
110 | The raftexample consists of three components: a raft-backed key-value store, a REST API server, and a raft consensus server based on etcd's raft implementation.
111 | 
112 | The raft-backed key-value store is a key-value map that holds all committed key-values.
113 | The store bridges communication between the raft server and the REST server.
114 | Key-value updates are issued through the store to the raft server.
115 | The store updates its map once raft reports the updates are committed.
116 | 
117 | The REST server exposes the current raft consensus by accessing the raft-backed key-value store.
118 | A GET command looks up a key in the store and returns the value, if any.
119 | A key-value PUT command issues an update proposal to the store.
120 | 
121 | The raft server participates in consensus with its cluster peers.
122 | When the REST server submits a proposal, the raft server transmits the proposal to its peers.
123 | When raft reaches a consensus, the server publishes all committed updates over a commit channel.
124 | For raftexample, this commit channel is consumed by the key-value store.
125 | 
126 | 


--------------------------------------------------------------------------------
/raftexample/doc.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2016 The etcd Authors
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // raftexample is a simple KV store using the raft and rafthttp libraries.
16 | package main
17 | 


--------------------------------------------------------------------------------
/raftexample/httpapi.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015 The etcd Authors
  2 | //
  3 | // Licensed under the Apache License, Version 2.0 (the "License");
  4 | // you may not use this file except in compliance with the License.
  5 | // You may obtain a copy of the License at
  6 | //
  7 | //     http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | // Unless required by applicable law or agreed to in writing, software
 10 | // distributed under the License is distributed on an "AS IS" BASIS,
 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | // See the License for the specific language governing permissions and
 13 | // limitations under the License.
 14 | 
 15 | package main
 16 | 
 17 | import (
 18 | 	"io/ioutil"
 19 | 	"log"
 20 | 	"net/http"
 21 | 	"strconv"
 22 | 
 23 | 	"go.etcd.io/etcd/raft/v3/raftpb"
 24 | )
 25 | 
// httpKVAPI is the handler for a http based key-value store backed by raft.
// Its ServeHTTP method maps PUT/GET to key-value operations on store and
// POST/DELETE to cluster configuration changes sent on confChangeC.
type httpKVAPI struct {
	// store is looked up on GET and receives proposals on PUT.
	store       *kvstore
	// confChangeC receives the ConfChange built from POST (add node) and
	// DELETE (remove node) requests.
	confChangeC chan<- raftpb.ConfChange
}
 31 | 
 32 | func (h *httpKVAPI) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 33 | 	key := r.RequestURI
 34 | 	defer r.Body.Close()
 35 | 	switch {
 36 | 	case r.Method == "PUT":
 37 | 		v, err := ioutil.ReadAll(r.Body)
 38 | 		if err != nil {
 39 | 			log.Printf("Failed to read on PUT (%v)\n", err)
 40 | 			http.Error(w, "Failed on PUT", http.StatusBadRequest)
 41 | 			return
 42 | 		}
 43 | 
 44 | 		h.store.Propose(key, string(v))
 45 | 
 46 | 		// Optimistic-- no waiting for ack from raft. Value is not yet
 47 | 		// committed so a subsequent GET on the key may return old value
 48 | 		w.WriteHeader(http.StatusNoContent)
 49 | 	case r.Method == "GET":
 50 | 		if v, ok := h.store.Lookup(key); ok {
 51 | 			w.Write([]byte(v))
 52 | 		} else {
 53 | 			http.Error(w, "Failed to GET", http.StatusNotFound)
 54 | 		}
 55 | 	case r.Method == "POST":
 56 | 		url, err := ioutil.ReadAll(r.Body)
 57 | 		if err != nil {
 58 | 			log.Printf("Failed to read on POST (%v)\n", err)
 59 | 			http.Error(w, "Failed on POST", http.StatusBadRequest)
 60 | 			return
 61 | 		}
 62 | 
 63 | 		nodeId, err := strconv.ParseUint(key[1:], 0, 64)
 64 | 		if err != nil {
 65 | 			log.Printf("Failed to convert ID for conf change (%v)\n", err)
 66 | 			http.Error(w, "Failed on POST", http.StatusBadRequest)
 67 | 			return
 68 | 		}
 69 | 
 70 | 		cc := raftpb.ConfChange{
 71 | 			Type:    raftpb.ConfChangeAddNode,
 72 | 			NodeID:  nodeId,
 73 | 			Context: url,
 74 | 		}
 75 | 		h.confChangeC <- cc
 76 | 
 77 | 		// As above, optimistic that raft will apply the conf change
 78 | 		w.WriteHeader(http.StatusNoContent)
 79 | 	case r.Method == "DELETE":
 80 | 		nodeId, err := strconv.ParseUint(key[1:], 0, 64)
 81 | 		if err != nil {
 82 | 			log.Printf("Failed to convert ID for conf change (%v)\n", err)
 83 | 			http.Error(w, "Failed on DELETE", http.StatusBadRequest)
 84 | 			return
 85 | 		}
 86 | 
 87 | 		cc := raftpb.ConfChange{
 88 | 			Type:   raftpb.ConfChangeRemoveNode,
 89 | 			NodeID: nodeId,
 90 | 		}
 91 | 		h.confChangeC <- cc
 92 | 
 93 | 		// As above, optimistic that raft will apply the conf change
 94 | 		w.WriteHeader(http.StatusNoContent)
 95 | 	default:
 96 | 		w.Header().Set("Allow", "PUT")
 97 | 		w.Header().Add("Allow", "GET")
 98 | 		w.Header().Add("Allow", "POST")
 99 | 		w.Header().Add("Allow", "DELETE")
100 | 		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
101 | 	}
102 | }
103 | 
104 | // serveHttpKVAPI starts a key-value server with a GET/PUT API and listens.
105 | func serveHttpKVAPI(kv *kvstore, port int, confChangeC chan<- raftpb.ConfChange, errorC <-chan error) {
106 | 	srv := http.Server{
107 | 		Addr: ":" + strconv.Itoa(port),
108 | 		Handler: &httpKVAPI{
109 | 			store:       kv,
110 | 			confChangeC: confChangeC,
111 | 		},
112 | 	}
113 | 	go func() {
114 | 		if err := srv.ListenAndServe(); err != nil {
115 | 			log.Fatal(err)
116 | 		}
117 | 	}()
118 | 
119 | 	// exit when raft goes down
120 | 	if err, ok := <-errorC; ok {
121 | 		log.Fatal(err)
122 | 	}
123 | }
124 | 


--------------------------------------------------------------------------------
/raftexample/kvstore.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015 The etcd Authors
  2 | //
  3 | // Licensed under the Apache License, Version 2.0 (the "License");
  4 | // you may not use this file except in compliance with the License.
  5 | // You may obtain a copy of the License at
  6 | //
  7 | //     http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | // Unless required by applicable law or agreed to in writing, software
 10 | // distributed under the License is distributed on an "AS IS" BASIS,
 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | // See the License for the specific language governing permissions and
 13 | // limitations under the License.
 14 | 
 15 | package main
 16 | 
 17 | import (
 18 | 	"bytes"
 19 | 	"encoding/gob"
 20 | 	"encoding/json"
 21 | 	"log"
 22 | 	"sync"
 23 | 
 24 | 	"go.etcd.io/etcd/raft/v3/raftpb"
 25 | 	"go.etcd.io/etcd/server/v3/etcdserver/api/snap"
 26 | )
 27 | 
// kvstore is a key-value store backed by raft. Updates are sent out on
// proposeC and only become visible in kvStore once readCommits receives them
// back on the commit channel.
type kvstore struct {
	proposeC    chan<- string // channel for proposing updates
	// mu guards kvStore.
	mu          sync.RWMutex
	kvStore     map[string]string // current committed key-value pairs
	// snapshotter is used by loadSnapshot to read the stored snapshot.
	snapshotter *snap.Snapshotter
}
 35 | 
// kv is a single key-value update. Propose gob-encodes it before sending it on
// the proposal channel, and readCommits decodes it again from committed data.
type kv struct {
	Key string
	Val string
}
 40 | 
 41 | func newKVStore(snapshotter *snap.Snapshotter, proposeC chan<- string, commitC <-chan *commit, errorC <-chan error) *kvstore {
 42 | 	s := &kvstore{proposeC: proposeC, kvStore: make(map[string]string), snapshotter: snapshotter}
 43 | 	snapshot, err := s.loadSnapshot()
 44 | 	if err != nil {
 45 | 		log.Panic(err)
 46 | 	}
 47 | 	if snapshot != nil {
 48 | 		log.Printf("loading snapshot at term %d and index %d", snapshot.Metadata.Term, snapshot.Metadata.Index)
 49 | 		if err := s.recoverFromSnapshot(snapshot.Data); err != nil {
 50 | 			log.Panic(err)
 51 | 		}
 52 | 	}
 53 | 	// read commits from raft into kvStore map until error
 54 | 	go s.readCommits(commitC, errorC)
 55 | 	return s
 56 | }
 57 | 
 58 | func (s *kvstore) Lookup(key string) (string, bool) {
 59 | 	s.mu.RLock()
 60 | 	defer s.mu.RUnlock()
 61 | 	v, ok := s.kvStore[key]
 62 | 	return v, ok
 63 | }
 64 | 
 65 | func (s *kvstore) Propose(k string, v string) {
 66 | 	var buf bytes.Buffer
 67 | 	if err := gob.NewEncoder(&buf).Encode(kv{k, v}); err != nil {
 68 | 		log.Fatal(err)
 69 | 	}
 70 | 	s.proposeC <- buf.String()
 71 | }
 72 | 
 73 | func (s *kvstore) readCommits(commitC <-chan *commit, errorC <-chan error) {
 74 | 	for commit := range commitC {
 75 | 		if commit == nil {
 76 | 			// signaled to load snapshot
 77 | 			snapshot, err := s.loadSnapshot()
 78 | 			if err != nil {
 79 | 				log.Panic(err)
 80 | 			}
 81 | 			if snapshot != nil {
 82 | 				log.Printf("loading snapshot at term %d and index %d", snapshot.Metadata.Term, snapshot.Metadata.Index)
 83 | 				if err := s.recoverFromSnapshot(snapshot.Data); err != nil {
 84 | 					log.Panic(err)
 85 | 				}
 86 | 			}
 87 | 			continue
 88 | 		}
 89 | 
 90 | 		for _, data := range commit.data {
 91 | 			var dataKv kv
 92 | 			dec := gob.NewDecoder(bytes.NewBufferString(data))
 93 | 			if err := dec.Decode(&dataKv); err != nil {
 94 | 				log.Fatalf("raftexample: could not decode message (%v)", err)
 95 | 			}
 96 | 			s.mu.Lock()
 97 | 			s.kvStore[dataKv.Key] = dataKv.Val
 98 | 			s.mu.Unlock()
 99 | 		}
100 | 		close(commit.applyDoneC)
101 | 	}
102 | 	if err, ok := <-errorC; ok {
103 | 		log.Fatal(err)
104 | 	}
105 | }
106 | 
107 | func (s *kvstore) getSnapshot() ([]byte, error) {
108 | 	s.mu.RLock()
109 | 	defer s.mu.RUnlock()
110 | 	return json.Marshal(s.kvStore)
111 | }
112 | 
113 | func (s *kvstore) loadSnapshot() (*raftpb.Snapshot, error) {
114 | 	snapshot, err := s.snapshotter.Load()
115 | 	if err == snap.ErrNoSnapshot {
116 | 		return nil, nil
117 | 	}
118 | 	if err != nil {
119 | 		return nil, err
120 | 	}
121 | 	return snapshot, nil
122 | }
123 | 
124 | func (s *kvstore) recoverFromSnapshot(snapshot []byte) error {
125 | 	var store map[string]string
126 | 	if err := json.Unmarshal(snapshot, &store); err != nil {
127 | 		return err
128 | 	}
129 | 	s.mu.Lock()
130 | 	defer s.mu.Unlock()
131 | 	s.kvStore = store
132 | 	return nil
133 | }
134 | 


--------------------------------------------------------------------------------
/raftexample/kvstore_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2016 The etcd Authors
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | package main
16 | 
17 | import (
18 | 	"reflect"
19 | 	"testing"
20 | )
21 | 
22 | func Test_kvstore_snapshot(t *testing.T) {
23 | 	tm := map[string]string{"foo": "bar"}
24 | 	s := &kvstore{kvStore: tm}
25 | 
26 | 	v, _ := s.Lookup("foo")
27 | 	if v != "bar" {
28 | 		t.Fatalf("foo has unexpected value, got %s", v)
29 | 	}
30 | 
31 | 	data, err := s.getSnapshot()
32 | 	if err != nil {
33 | 		t.Fatal(err)
34 | 	}
35 | 	s.kvStore = nil
36 | 
37 | 	if err := s.recoverFromSnapshot(data); err != nil {
38 | 		t.Fatal(err)
39 | 	}
40 | 	v, _ = s.Lookup("foo")
41 | 	if v != "bar" {
42 | 		t.Fatalf("foo has unexpected value, got %s", v)
43 | 	}
44 | 	if !reflect.DeepEqual(s.kvStore, tm) {
45 | 		t.Fatalf("store expected %+v, got %+v", tm, s.kvStore)
46 | 	}
47 | }
48 | 


--------------------------------------------------------------------------------
/raftexample/listener.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015 The etcd Authors
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | package main
16 | 
17 | import (
18 | 	"errors"
19 | 	"net"
20 | 	"time"
21 | )
22 | 
// stoppableListener sets TCP keep-alive timeouts on accepted
// connections and waits on stopc message
type stoppableListener struct {
	*net.TCPListener
	stopc <-chan struct{}
}

// newStoppableListener listens on the TCP address addr and wraps the listener
// so that Accept returns an error once stopc is closed or receives a value.
// It returns an error if listening fails or if the listener is not a
// *net.TCPListener.
func newStoppableListener(addr string, stopc <-chan struct{}) (*stoppableListener, error) {
	ln, err := net.Listen("tcp", addr)
	if err != nil {
		return nil, err
	}
	tcpLn, ok := ln.(*net.TCPListener)
	if !ok {
		// Checked assertion: report an error instead of panicking.
		ln.Close()
		return nil, errors.New("listener is not a TCP listener")
	}
	return &stoppableListener{tcpLn, stopc}, nil
}

// Accept waits for the next connection, an accept error, or a stop signal,
// whichever comes first. Accepted connections get TCP keep-alive enabled with
// a 3 minute period. After a stop signal it returns a "server stopped" error.
//
// The accepted connection is handed over on an unbuffered channel. If the
// caller has already returned because of the stop signal, the accepting
// goroutine closes the connection itself instead of leaving it open and
// unreachable in a channel nobody reads.
func (ln stoppableListener) Accept() (c net.Conn, err error) {
	connc := make(chan *net.TCPConn)
	// Buffered so the goroutine can always report an error and exit, even
	// when the caller has already returned.
	errc := make(chan error, 1)
	go func() {
		tc, err := ln.AcceptTCP()
		if err != nil {
			errc <- err
			return
		}
		select {
		case connc <- tc:
		case <-ln.stopc:
			// Nobody will take ownership of this connection.
			tc.Close()
		}
	}()
	select {
	case <-ln.stopc:
		return nil, errors.New("server stopped")
	case err := <-errc:
		return nil, err
	case tc := <-connc:
		// Keep-alive tuning is best-effort; the connection is still
		// usable if it fails.
		tc.SetKeepAlive(true)
		tc.SetKeepAlivePeriod(3 * time.Minute)
		return tc, nil
	}
}
60 | 


--------------------------------------------------------------------------------
/raftexample/main.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015 The etcd Authors
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | package main
16 | 
17 | import (
18 | 	"flag"
19 | 	"strings"
20 | 
21 | 	"go.etcd.io/etcd/raft/v3/raftpb"
22 | )
23 | 
24 | func main() {
25 | 	cluster := flag.String("cluster", "http://127.0.0.1:9021", "comma separated cluster peers")
26 | 	id := flag.Int("id", 1, "node ID")
27 | 	kvport := flag.Int("port", 9121, "key-value server port")
28 | 	join := flag.Bool("join", false, "join an existing cluster")
29 | 	flag.Parse()
30 | 
31 | 	proposeC := make(chan string)
32 | 	defer close(proposeC)
33 | 	confChangeC := make(chan raftpb.ConfChange)
34 | 	defer close(confChangeC)
35 | 
36 | 	// raft provides a commit stream for the proposals from the http api
37 | 	var kvs *kvstore
38 | 	getSnapshot := func() ([]byte, error) { return kvs.getSnapshot() }
39 | 	commitC, errorC, snapshotterReady := newRaftNode(*id, strings.Split(*cluster, ","), *join, getSnapshot, proposeC, confChangeC)
40 | 
41 | 	kvs = newKVStore(<-snapshotterReady, proposeC, commitC, errorC)
42 | 
43 | 	// the key-value http handler will propose updates to raft
44 | 	serveHttpKVAPI(kvs, *kvport, confChangeC, errorC)
45 | }
46 | 


--------------------------------------------------------------------------------
/raftexample/raft.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015 The etcd Authors
  2 | //
  3 | // Licensed under the Apache License, Version 2.0 (the "License");
  4 | // you may not use this file except in compliance with the License.
  5 | // You may obtain a copy of the License at
  6 | //
  7 | //     http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | // Unless required by applicable law or agreed to in writing, software
 10 | // distributed under the License is distributed on an "AS IS" BASIS,
 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | // See the License for the specific language governing permissions and
 13 | // limitations under the License.
 14 | 
 15 | package main
 16 | 
 17 | import (
 18 | 	"context"
 19 | 	"fmt"
 20 | 	"log"
 21 | 	"net/http"
 22 | 	"net/url"
 23 | 	"os"
 24 | 	"strconv"
 25 | 	"time"
 26 | 
 27 | 	"go.etcd.io/etcd/client/pkg/v3/fileutil"
 28 | 	"go.etcd.io/etcd/client/pkg/v3/types"
 29 | 	"go.etcd.io/etcd/raft/v3"
 30 | 	"go.etcd.io/etcd/raft/v3/raftpb"
 31 | 	"go.etcd.io/etcd/server/v3/etcdserver/api/rafthttp"
 32 | 	"go.etcd.io/etcd/server/v3/etcdserver/api/snap"
 33 | 	stats "go.etcd.io/etcd/server/v3/etcdserver/api/v2stats"
 34 | 	"go.etcd.io/etcd/server/v3/wal"
 35 | 	"go.etcd.io/etcd/server/v3/wal/walpb"
 36 | 
 37 | 	"go.uber.org/zap"
 38 | )
 39 | 
// commit is a batch of committed entry payloads sent on the commit channel.
type commit struct {
	// data holds the payloads of the committed normal entries, in order.
	data       []string
	// applyDoneC is closed by the receiver once every payload in data has
	// been applied.
	applyDoneC chan<- struct{}
}
 44 | 
// raftNode is a key-value stream backed by raft. It is created by
// newRaftNode, which fills in the channels and configuration; the remaining
// fields are populated later, after WAL replay.
type raftNode struct {
	proposeC    <-chan string            // proposed messages (k,v)
	confChangeC <-chan raftpb.ConfChange // proposed cluster config changes
	commitC     chan<- *commit           // entries committed to log (k,v)
	errorC      chan<- error             // errors from raft session

	id          int      // client ID for raft session
	peers       []string // raft peer URLs
	join        bool     // node is joining an existing cluster
	waldir      string   // path to WAL directory
	snapdir     string   // path to snapshot directory
	// getSnapshot is the callback supplied to newRaftNode that returns the
	// serialized application state.
	getSnapshot func() ([]byte, error)

	// confState is the ConfState returned by the most recent
	// node.ApplyConfChange call in publishEntries.
	confState     raftpb.ConfState
	snapshotIndex uint64
	// appliedIndex is the index of the last entry processed by
	// publishEntries.
	appliedIndex  uint64

	// raft backing for the commit/error channel
	node        raft.Node
	raftStorage *raft.MemoryStorage
	wal         *wal.WAL

	snapshotter      *snap.Snapshotter
	snapshotterReady chan *snap.Snapshotter // signals when snapshotter is ready

	// snapCount is initialized to defaultSnapshotCount by newRaftNode.
	snapCount uint64
	// transport has peers added and removed as conf changes are applied in
	// publishEntries.
	transport *rafthttp.Transport
	stopc     chan struct{} // signals proposal channel closed
	httpstopc chan struct{} // signals http server to shutdown
	httpdonec chan struct{} // signals http server shutdown complete

	logger *zap.Logger
}
 79 | 
// defaultSnapshotCount is the value newRaftNode assigns to raftNode.snapCount.
// NOTE(review): presumably the number of applied entries between snapshots;
// confirm against the code that reads snapCount.
var defaultSnapshotCount uint64 = 10000
 81 | 
 82 | // 主要完成了raftNode的初始化
 83 | // 使用上层模块传入的配置信息来创建raftNode实例，同时创建commitC 通道和errorC通道返回给上层模块使用
 84 | // 上层的应用通过这几个channel就能和raftNode进行交互
 85 | func newRaftNode(id int, peers []string, join bool, getSnapshot func() ([]byte, error), proposeC <-chan string,
 86 | 	confChangeC <-chan raftpb.ConfChange) (<-chan *commit, <-chan error, <-chan *snap.Snapshotter) {
 87 | 	// channel，主要传输Entry记录
 88 | 	// raftNode会将etcd-raft模块返回的待应用Entry记
 89 | 	// 录（封装在 Ready实例中〉写入commitC通道，另一方面，kvstore会从commitC通
 90 | 	// 道中读取这些待应用的 Entry 记录井保存其中的键值对信息。
 91 | 	commitC := make(chan *commit)
 92 | 	errorC := make(chan error)
 93 | 
 94 | 	rc := &raftNode{
 95 | 		proposeC:    proposeC,
 96 | 		confChangeC: confChangeC,
 97 | 		commitC:     commitC,
 98 | 		errorC:      errorC,
 99 | 		id:          id,
100 | 		peers:       peers,
101 | 		join:        join,
102 | 		// 初始化存放 WAL 日志和 Snapshot 文件的的目录
103 | 		waldir:      fmt.Sprintf("raftexample-%d", id),
104 | 		snapdir:     fmt.Sprintf("raftexample-%d-snap", id),
105 | 		getSnapshot: getSnapshot,
106 | 		snapCount:   defaultSnapshotCount,
107 | 		stopc:       make(chan struct{}),
108 | 		httpstopc:   make(chan struct{}),
109 | 		httpdonec:   make(chan struct{}),
110 | 
111 | 		logger: zap.NewExample(),
112 | 
113 | 		snapshotterReady: make(chan *snap.Snapshotter, 1),
114 | 		// rest of structure populated after WAL replay
115 | 	}
116 | 	// 启动一个goroutine,完成剩余的初始化工作
117 | 	go rc.startRaft()
118 | 	return commitC, errorC, rc.snapshotterReady
119 | }
120 | 
121 | func (rc *raftNode) saveSnap(snap raftpb.Snapshot) error {
122 | 	walSnap := walpb.Snapshot{
123 | 		Index:     snap.Metadata.Index,
124 | 		Term:      snap.Metadata.Term,
125 | 		ConfState: &snap.Metadata.ConfState,
126 | 	}
127 | 	// save the snapshot file before writing the snapshot to the wal.
128 | 	// This makes it possible for the snapshot file to become orphaned, but prevents
129 | 	// a WAL snapshot entry from having no corresponding snapshot file.
130 | 	if err := rc.snapshotter.SaveSnap(snap); err != nil {
131 | 		return err
132 | 	}
133 | 	if err := rc.wal.SaveSnapshot(walSnap); err != nil {
134 | 		return err
135 | 	}
136 | 	return rc.wal.ReleaseLockTo(snap.Metadata.Index)
137 | }
138 | 
139 | func (rc *raftNode) entriesToApply(ents []raftpb.Entry) (nents []raftpb.Entry) {
140 | 	if len(ents) == 0 {
141 | 		return ents
142 | 	}
143 | 	firstIdx := ents[0].Index
144 | 	if firstIdx > rc.appliedIndex+1 {
145 | 		log.Fatalf("first index of committed entry[%d] should <= progress.appliedIndex[%d]+1", firstIdx, rc.appliedIndex)
146 | 	}
147 | 	if rc.appliedIndex-firstIdx+1 < uint64(len(ents)) {
148 | 		nents = ents[rc.appliedIndex-firstIdx+1:]
149 | 	}
150 | 	return nents
151 | }
152 | 
// publishEntries writes committed log entries to commit channel and returns
// whether all entries could be published.
//
// Normal entries with a non-empty payload are collected and sent to commitC
// as one *commit. Configuration-change entries are applied to the raft node
// and mirrored onto the transport's peer set. On success the function records
// the index of the last entry in rc.appliedIndex and returns the channel that
// was placed in the published commit (nil when no payload was published)
// together with true. It returns (nil, false) when a ConfChangeRemoveNode
// targets this node or when stopc fires before the commit is handed over.
func (rc *raftNode) publishEntries(ents []raftpb.Entry) (<-chan struct{}, bool) {
	if len(ents) == 0 {
		return nil, true
	}

	// Payloads of the normal entries, in log order.
	data := make([]string, 0, len(ents))
	for i := range ents {
		switch ents[i].Type {
		case raftpb.EntryNormal:
			if len(ents[i].Data) == 0 {
				// ignore empty messages
				// (this break leaves the switch, not the surrounding for loop)
				break
			}
			s := string(ents[i].Data)
			data = append(data, s)
		case raftpb.EntryConfChange:
			var cc raftpb.ConfChange
			// NOTE(review): the result of Unmarshal is not checked here.
			cc.Unmarshal(ents[i].Data)
			rc.confState = *rc.node.ApplyConfChange(cc)
			switch cc.Type {
			case raftpb.ConfChangeAddNode:
				// The change's Context is used as the new peer's URL; without
				// one no peer is added to the transport.
				if len(cc.Context) > 0 {
					rc.transport.AddPeer(types.ID(cc.NodeID), []string{string(cc.Context)})
				}
			case raftpb.ConfChangeRemoveNode:
				if cc.NodeID == uint64(rc.id) {
					log.Println("I've been removed from the cluster! Shutting down.")
					return nil, false
				}
				rc.transport.RemovePeer(types.ID(cc.NodeID))
			}
		}
	}

	var applyDoneC chan struct{}

	if len(data) > 0 {
		// Buffered with capacity 1; maybeTriggerSnapshot waits on this channel
		// before taking a snapshot.
		applyDoneC = make(chan struct{}, 1)
		select {
		case rc.commitC <- &commit{data, applyDoneC}:
		case <-rc.stopc:
			return nil, false
		}
	}

	// after commit, update appliedIndex
	rc.appliedIndex = ents[len(ents)-1].Index

	return applyDoneC, true
}
205 | 
206 | func (rc *raftNode) loadSnapshot() *raftpb.Snapshot {
207 | 	if wal.Exist(rc.waldir) {
208 | 		walSnaps, err := wal.ValidSnapshotEntries(rc.logger, rc.waldir)
209 | 		if err != nil {
210 | 			log.Fatalf("raftexample: error listing snapshots (%v)", err)
211 | 		}
212 | 		snapshot, err := rc.snapshotter.LoadNewestAvailable(walSnaps)
213 | 		if err != nil && err != snap.ErrNoSnapshot {
214 | 			log.Fatalf("raftexample: error loading snapshot (%v)", err)
215 | 		}
216 | 		return snapshot
217 | 	}
218 | 	return &raftpb.Snapshot{}
219 | }
220 | 
221 | // openWAL returns a WAL ready for reading.
222 | func (rc *raftNode) openWAL(snapshot *raftpb.Snapshot) *wal.WAL {
223 | 	if !wal.Exist(rc.waldir) {
224 | 		if err := os.Mkdir(rc.waldir, 0750); err != nil {
225 | 			log.Fatalf("raftexample: cannot create dir for wal (%v)", err)
226 | 		}
227 | 
228 | 		w, err := wal.Create(zap.NewExample(), rc.waldir, nil)
229 | 		if err != nil {
230 | 			log.Fatalf("raftexample: create wal error (%v)", err)
231 | 		}
232 | 		w.Close()
233 | 	}
234 | 
235 | 	walsnap := walpb.Snapshot{}
236 | 	if snapshot != nil {
237 | 		walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
238 | 	}
239 | 	log.Printf("loading WAL at term %d and index %d", walsnap.Term, walsnap.Index)
240 | 	w, err := wal.Open(zap.NewExample(), rc.waldir, walsnap)
241 | 	if err != nil {
242 | 		log.Fatalf("raftexample: error loading wal (%v)", err)
243 | 	}
244 | 
245 | 	return w
246 | }
247 | 
// replayWAL replays WAL entries into the raft instance.
//
// It loads the starting snapshot, opens the WAL at that snapshot's position,
// reads everything recorded after it and rebuilds rc.raftStorage from the
// snapshot, the hard state and the entries. The opened WAL is returned.
func (rc *raftNode) replayWAL() *wal.WAL {
	// Load the snapshot to start from (see loadSnapshot).
	log.Printf("replaying WAL of member %d", rc.id)
	snapshot := rc.loadSnapshot()
	// Open the WAL at the position described by the snapshot's metadata.
	w := rc.openWAL(snapshot)
	// Read all WAL records after the snapshot, obtaining the hard state and
	// the log entries; the first return value is not used.
	_, st, ents, err := w.ReadAll()
	if err != nil {
		log.Fatalf("raftexample: failed to read WAL (%v)", err)
	}
	// Create the in-memory storage handed to raft.
	rc.raftStorage = raft.NewMemoryStorage()
	if snapshot != nil {
		rc.raftStorage.ApplySnapshot(*snapshot)
	}
	// Load the hard state read from the WAL into the storage.
	rc.raftStorage.SetHardState(st)

	// Load the entries read from the WAL into the storage.
	// NOTE(review): the results of ApplySnapshot, SetHardState and Append are
	// not checked.
	rc.raftStorage.Append(ents)

	return w
}
273 | 
// writeError shuts the node down after a transport error: it stops the HTTP
// side, closes commitC, delivers err on errorC, closes errorC and finally
// stops the raft node.
func (rc *raftNode) writeError(err error) {
	rc.stopHTTP()
	close(rc.commitC)
	// NOTE(review): errorC's buffering is not visible here; if it is
	// unbuffered this send blocks until a receiver takes the error.
	rc.errorC <- err
	close(rc.errorC)
	rc.node.Stop()
}
281 | 
// startRaft finishes the initialization of the node: it prepares the
// snapshot directory and snapshotter, replays the WAL, starts (or restarts)
// the raft node, brings up the rafthttp transport with all known peers, and
// launches the serveRaft and serveChannels goroutines.
func (rc *raftNode) startRaft() {
	if !fileutil.Exist(rc.snapdir) {
		if err := os.Mkdir(rc.snapdir, 0750); err != nil {
			log.Fatalf("raftexample: cannot create dir for snapshot (%v)", err)
		}
	}
	rc.snapshotter = snap.New(zap.NewExample(), rc.snapdir)
	// Remember whether a WAL existed before replaying: openWAL (called by
	// replayWAL) creates one when it is missing, so this has to be checked
	// first.
	oldwal := wal.Exist(rc.waldir)

	// replayWAL first loads the snapshot, whose metadata carries the term and
	// index of the last entry it covers, then uses that term and index to
	// decide where to start reading the WAL, and finally reads the records.
	rc.wal = rc.replayWAL()

	// signal replay has finished
	rc.snapshotterReady <- rc.snapshotter

	// One raft.Peer per configured peer URL, with IDs 1..len(rc.peers).
	rpeers := make([]raft.Peer, len(rc.peers))
	for i := range rpeers {
		rpeers[i] = raft.Peer{ID: uint64(i + 1)}
	}
	// Build the raft configuration.
	c := &raft.Config{
		ID: uint64(rc.id),
		// Election timeout, expressed in ticks.
		ElectionTick: 10,
		// Heartbeat interval, expressed in ticks.
		HeartbeatTick:             1,
		Storage:                   rc.raftStorage,
		MaxSizePerMsg:             1024 * 1024,
		MaxInflightMsgs:           256,
		MaxUncommittedEntriesSize: 1 << 30,
	}
	// Initialize the underlying raft module. A node that already had a WAL,
	// or that joins an existing cluster, is restarted; otherwise this is a
	// first start and the initial peer list is supplied.
	if oldwal || rc.join {
		rc.node = raft.RestartNode(c)
	} else {
		// First start.
		rc.node = raft.StartNode(c, rpeers)
	}
	// Create the transport that carries raft messages between nodes. rc is
	// passed as its Raft field.
	rc.transport = &rafthttp.Transport{
		Logger:      rc.logger,
		ID:          types.ID(rc.id),
		ClusterID:   0x1000,
		Raft:        rc,
		ServerStats: stats.NewServerStats("", ""),
		LeaderStats: stats.NewLeaderStats(zap.NewExample(), strconv.Itoa(rc.id)),
		ErrorC:      make(chan error),
	}
	// Start the transport.
	rc.transport.Start()
	// Register every other node of the cluster as a peer.
	for i := range rc.peers {
		if i+1 != rc.id {
			rc.transport.AddPeer(types.ID(i+1), []string{rc.peers[i]})
		}
	}
	// Serve incoming raft HTTP traffic from the other nodes in the background.
	go rc.serveRaft()
	// Run the loop that connects the application's channels with the raft
	// node in the background.
	go rc.serveChannels()
}
347 | 
// stop closes http, closes all channels, and stops raft.
// Unlike writeError it sends nothing on errorC before closing it, so a
// receiver on errorC observes the zero value (a nil error).
func (rc *raftNode) stop() {
	rc.stopHTTP()
	close(rc.commitC)
	close(rc.errorC)
	rc.node.Stop()
}
355 | 
// stopHTTP stops the rafthttp transport, closes httpstopc, and then waits on
// httpdonec, which serveRaft closes once its HTTP server has returned.
func (rc *raftNode) stopHTTP() {
	rc.transport.Stop()
	close(rc.httpstopc)
	<-rc.httpdonec
}
361 | 
362 | func (rc *raftNode) publishSnapshot(snapshotToSave raftpb.Snapshot) {
363 | 	if raft.IsEmptySnap(snapshotToSave) {
364 | 		return
365 | 	}
366 | 
367 | 	log.Printf("publishing snapshot at index %d", rc.snapshotIndex)
368 | 	defer log.Printf("finished publishing snapshot at index %d", rc.snapshotIndex)
369 | 
370 | 	if snapshotToSave.Metadata.Index <= rc.appliedIndex {
371 | 		log.Fatalf("snapshot index [%d] should > progress.appliedIndex [%d]", snapshotToSave.Metadata.Index, rc.appliedIndex)
372 | 	}
373 | 	rc.commitC <- nil // trigger kvstore to load snapshot
374 | 
375 | 	rc.confState = snapshotToSave.Metadata.ConfState
376 | 	rc.snapshotIndex = snapshotToSave.Metadata.Index
377 | 	rc.appliedIndex = snapshotToSave.Metadata.Index
378 | }
379 | 
// snapshotCatchUpEntriesN is the number of entries behind the applied index
// that maybeTriggerSnapshot keeps in raftStorage when it compacts the log.
// It is a variable rather than a constant so that tests can override it.
var snapshotCatchUpEntriesN uint64 = 10000
381 | 
// maybeTriggerSnapshot creates a new snapshot and compacts the in-memory log
// once more than rc.snapCount entries have been applied since the last
// snapshot; otherwise it does nothing.
//
// applyDoneC, when non-nil, is waited on first so the snapshot is only taken
// after the published entries were applied; the function returns without
// snapshotting if stopc fires while waiting. Failures while obtaining,
// creating, saving or compacting panic.
func (rc *raftNode) maybeTriggerSnapshot(applyDoneC <-chan struct{}) {
	if rc.appliedIndex-rc.snapshotIndex <= rc.snapCount {
		return
	}

	// wait until all committed entries are applied (or server is closed)
	if applyDoneC != nil {
		select {
		case <-applyDoneC:
		case <-rc.stopc:
			return
		}
	}

	log.Printf("start snapshot [applied index: %d | last snapshot index: %d]", rc.appliedIndex, rc.snapshotIndex)
	// Ask the application for its serialized state.
	data, err := rc.getSnapshot()
	if err != nil {
		log.Panic(err)
	}
	snap, err := rc.raftStorage.CreateSnapshot(rc.appliedIndex, &rc.confState, data)
	if err != nil {
		panic(err)
	}
	if err := rc.saveSnap(snap); err != nil {
		panic(err)
	}

	// Keep snapshotCatchUpEntriesN entries before the applied index in the
	// log; with fewer applied entries than that, compact at index 1.
	compactIndex := uint64(1)
	if rc.appliedIndex > snapshotCatchUpEntriesN {
		compactIndex = rc.appliedIndex - snapshotCatchUpEntriesN
	}
	if err := rc.raftStorage.Compact(compactIndex); err != nil {
		panic(err)
	}

	log.Printf("compacted log at index %d", compactIndex)
	rc.snapshotIndex = rc.appliedIndex
}
420 | 
// serveChannels is the node's main loop. It starts a goroutine that forwards
// the application's proposals (proposeC) and configuration changes
// (confChangeC) to the raft node, and then processes ticker ticks, Ready
// batches from raft, transport errors and the stop signal until the node
// shuts down.
func (rc *raftNode) serveChannels() {
	// Initialize the bookkeeping from the snapshot currently held by the
	// storage (its data is not used here, only its metadata).
	snap, err := rc.raftStorage.Snapshot()
	if err != nil {
		panic(err)
	}
	rc.confState = snap.Metadata.ConfState
	rc.snapshotIndex = snap.Metadata.Index
	rc.appliedIndex = snap.Metadata.Index

	defer rc.wal.Close()

	// A ticker firing every 100ms: each firing advances raft's logical clock
	// by one tick, so 100ms is the smallest time unit raft sees here.
	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()

	// A separate goroutine passes whatever arrives on proposeC and
	// confChangeC to the raft node.
	go func() {
		confChangeCount := uint64(0)

		// The loop ends as soon as either channel has been closed.
		for rc.proposeC != nil && rc.confChangeC != nil {
			select {
			case prop, ok := <-rc.proposeC:
				if !ok {
					// proposeC was closed: set it to nil to end the loop.
					rc.proposeC = nil
				} else {
					// Hand the proposal to raft; the returned error is not
					// checked.
					rc.node.Propose(context.TODO(), []byte(prop))
				}
				// A configuration change sent by the application on confChangeC.
			case cc, ok := <-rc.confChangeC:
				if !ok {
					// confChangeC was closed: set it to nil to end the loop.
					rc.confChangeC = nil
				} else {
					// Each change gets the next counter value as its ID.
					confChangeCount++
					cc.ID = confChangeCount
					rc.node.ProposeConfChange(context.TODO(), cc)
				}
			}
		}
		// Closing stopc makes the main loop below call rc.stop().
		close(rc.stopc)
	}()

	// Handle what the raft node hands back to the application, plus ticks,
	// transport errors and shutdown.
	for {
		select {
		case <-ticker.C:
			// The ticker fired: advance raft's logical clock.
			rc.node.Tick()

		// Ready is the main channel through which the raft node hands work
		// to the application; each Ready value bundles several kinds of
		// information that are processed in order below.
		case rd := <-rc.node.Ready():
			// First record the hard state and the entries to persist in the
			// WAL, so that they can be recovered by replaying the WAL on the
			// next start even if the process crashes afterwards.
			// NOTE(review): the result of Save is not checked.
			rc.wal.Save(rd.HardState, rd.Entries)
			// The Ready carries a new snapshot.
			if !raft.IsEmptySnap(rd.Snapshot) {
				// Write the snapshot file and WAL record (the returned error
				// is discarded here).
				rc.saveSnap(rd.Snapshot)
				// Install the snapshot in raftStorage.
				rc.raftStorage.ApplySnapshot(rd.Snapshot)
				// Tell the application to load the new snapshot.
				rc.publishSnapshot(rd.Snapshot)
			}
			// Append the entries to raftStorage.
			rc.raftStorage.Append(rd.Entries)
			// Send the outgoing messages to their target nodes.
			rc.transport.Send(rd.Messages)
			// Publish committed, not yet applied entries to the application's
			// state machine.
			applyDoneC, ok := rc.publishEntries(rc.entriesToApply(rd.CommittedEntries))
			if !ok {
				rc.stop()
				return
			}

			// The WAL and the entries held in raftStorage keep growing while
			// the node runs, so once more than rc.snapCount entries have been
			// applied since the last snapshot a new snapshot is created and
			// raftStorage is compacted (see maybeTriggerSnapshot).
			rc.maybeTriggerSnapshot(applyDoneC)
			// This Ready has been fully handled: tell the raft node it may
			// produce the next one.
			rc.node.Advance()

		case err := <-rc.transport.ErrorC:
			rc.writeError(err)
			return

		case <-rc.stopc:
			rc.stop()
			return
		}
	}
}
522 | 
523 | func (rc *raftNode) serveRaft() {
524 | 	// 获取当前节点的 URL 地址
525 | 	url, err := url.Parse(rc.peers[rc.id-1])
526 | 	if err != nil {
527 | 		log.Fatalf("raftexample: Failed parsing URL (%v)", err)
528 | 	}
529 | 
530 | 	// 创建 stoppableListener 实例，stoppableListener 继承了 net.TCPListener
531 | 	// 接口，它会与 http.Server 配合实现对当前节点的 URL 地址进行监听
532 | 	ln, err := newStoppableListener(url.Host, rc.httpstopc)
533 | 	if err != nil {
534 | 		log.Fatalf("raftexample: Failed to listen rafthttp (%v)", err)
535 | 	}
536 | 
537 | 	// 创建 http.Server 实例，它会通过上面的 stoppableListener 实例监听当前的 URL 地址
538 | 	// stoppableListener.Accept() 方法监听到新的连接到来时，会创建对应的 net.Conn 实例，
539 | 	// http.Server 会为每个连接创建单独的 goroutine 处理，每个请求都会由 http.Server.Handler
540 | 	// 处理。这里的 Handler 是由 rafthttp.Transporter 创建的，后面详细介绍 rafthttp.Transporter
541 | 	// 的具体实现。另外需要读者了解的是 http.Server.Serve()方法会一直阻塞，直到 http.Server关闭
542 | 	err = (&http.Server{Handler: rc.transport.Handler()}).Serve(ln)
543 | 	select {
544 | 	case <-rc.httpstopc:
545 | 	default:
546 | 		log.Fatalf("raftexample: Failed to serve rafthttp (%v)", err)
547 | 	}
548 | 	close(rc.httpdonec)
549 | }
550 | 
// The four methods below are the callbacks the rafthttp transport invokes on
// the value it was given as its Raft field (rc, see startRaft).

// Process feeds a message received from a peer into the raft node.
func (rc *raftNode) Process(ctx context.Context, m raftpb.Message) error {
	return rc.node.Step(ctx, m)
}

// IsIDRemoved always reports false; removed members are not tracked here.
func (rc *raftNode) IsIDRemoved(id uint64) bool  { return false }

// ReportUnreachable forwards the report about peer id to the raft node.
func (rc *raftNode) ReportUnreachable(id uint64) { rc.node.ReportUnreachable(id) }

// ReportSnapshot forwards the snapshot status for peer id to the raft node.
func (rc *raftNode) ReportSnapshot(id uint64, status raft.SnapshotStatus) {
	rc.node.ReportSnapshot(id, status)
}
559 | 


--------------------------------------------------------------------------------
/raftexample/raftexample_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015 The etcd Authors
  2 | //
  3 | // Licensed under the Apache License, Version 2.0 (the "License");
  4 | // you may not use this file except in compliance with the License.
  5 | // You may obtain a copy of the License at
  6 | //
  7 | //     http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | // Unless required by applicable law or agreed to in writing, software
 10 | // distributed under the License is distributed on an "AS IS" BASIS,
 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | // See the License for the specific language governing permissions and
 13 | // limitations under the License.
 14 | 
 15 | package main
 16 | 
 17 | import (
 18 | 	"bytes"
 19 | 	"fmt"
 20 | 	"io/ioutil"
 21 | 	"net/http"
 22 | 	"net/http/httptest"
 23 | 	"os"
 24 | 	"testing"
 25 | 	"time"
 26 | 
 27 | 	"go.etcd.io/etcd/raft/v3/raftpb"
 28 | )
 29 | 
 30 | func getSnapshotFn() (func() ([]byte, error), <-chan struct{}) {
 31 | 	snapshotTriggeredC := make(chan struct{})
 32 | 	return func() ([]byte, error) {
 33 | 		snapshotTriggeredC <- struct{}{}
 34 | 		return nil, nil
 35 | 	}, snapshotTriggeredC
 36 | }
 37 | 
// cluster bundles the channels of every raft node started by newCluster.
// All slices are indexed by node position, i.e. node id minus one.
type cluster struct {
	// peers holds the URL of every node.
	peers              []string
	// commitC and errorC are the channels returned by newRaftNode for each node.
	commitC            []<-chan *commit
	errorC             []<-chan error
	// proposeC and confChangeC are the input channels handed to each node.
	proposeC           []chan string
	confChangeC        []chan raftpb.ConfChange
	// snapshotTriggeredC receives a value each time a node's snapshot
	// function (see getSnapshotFn) is called.
	snapshotTriggeredC []<-chan struct{}
}
 46 | 
 47 | // newCluster creates a cluster of n nodes
 48 | func newCluster(n int) *cluster {
 49 | 	peers := make([]string, n)
 50 | 	for i := range peers {
 51 | 		peers[i] = fmt.Sprintf("http://127.0.0.1:%d", 10000+i)
 52 | 	}
 53 | 
 54 | 	clus := &cluster{
 55 | 		peers:              peers,
 56 | 		commitC:            make([]<-chan *commit, len(peers)),
 57 | 		errorC:             make([]<-chan error, len(peers)),
 58 | 		proposeC:           make([]chan string, len(peers)),
 59 | 		confChangeC:        make([]chan raftpb.ConfChange, len(peers)),
 60 | 		snapshotTriggeredC: make([]<-chan struct{}, len(peers)),
 61 | 	}
 62 | 
 63 | 	for i := range clus.peers {
 64 | 		os.RemoveAll(fmt.Sprintf("raftexample-%d", i+1))
 65 | 		os.RemoveAll(fmt.Sprintf("raftexample-%d-snap", i+1))
 66 | 		clus.proposeC[i] = make(chan string, 1)
 67 | 		clus.confChangeC[i] = make(chan raftpb.ConfChange, 1)
 68 | 		fn, snapshotTriggeredC := getSnapshotFn()
 69 | 		clus.snapshotTriggeredC[i] = snapshotTriggeredC
 70 | 		clus.commitC[i], clus.errorC[i], _ = newRaftNode(i+1, clus.peers, false, fn, clus.proposeC[i], clus.confChangeC[i])
 71 | 	}
 72 | 
 73 | 	return clus
 74 | }
 75 | 
 76 | // Close closes all cluster nodes and returns an error if any failed.
 77 | func (clus *cluster) Close() (err error) {
 78 | 	for i := range clus.peers {
 79 | 		go func(i int) {
 80 | 			for range clus.commitC[i] {
 81 | 				// drain pending commits
 82 | 			}
 83 | 		}(i)
 84 | 		close(clus.proposeC[i])
 85 | 		// wait for channel to close
 86 | 		if erri := <-clus.errorC[i]; erri != nil {
 87 | 			err = erri
 88 | 		}
 89 | 		// clean intermediates
 90 | 		os.RemoveAll(fmt.Sprintf("raftexample-%d", i+1))
 91 | 		os.RemoveAll(fmt.Sprintf("raftexample-%d-snap", i+1))
 92 | 	}
 93 | 	return err
 94 | }
 95 | 
// closeNoErrors closes the cluster and fails the test immediately if Close
// reports an error. Progress is written to the test log.
func (clus *cluster) closeNoErrors(t *testing.T) {
	t.Log("closing cluster...")
	if err := clus.Close(); err != nil {
		t.Fatal(err)
	}
	t.Log("closing cluster [done]")
}
103 | 
// TestProposeOnCommit starts three nodes and feeds commits back into the proposal
// channel. The intent is to ensure blocking on a proposal won't block raft progress.
func TestProposeOnCommit(t *testing.T) {
	clus := newCluster(3)
	defer clus.closeNoErrors(t)

	// Each node's feedback goroutine signals donec after 100 iterations.
	donec := make(chan struct{})
	for i := range clus.peers {
		// feedback for "n" committed entries, then update donec
		go func(pC chan<- string, cC <-chan *commit, eC <-chan error) {
			for n := 0; n < 100; n++ {
				c, ok := <-cC
				if !ok {
					// A send on a nil channel never proceeds, so only the
					// eC case below can fire from now on.
					pC = nil
				}
				// NOTE(review): when cC is closed, c is nil and evaluating
				// c.data[0] for the send case below would dereference a nil
				// pointer — confirm whether this path can be reached.
				select {
				case pC <- c.data[0]:
					continue
				case err := <-eC:
					t.Errorf("eC message (%v)", err)
				}
			}
			donec <- struct{}{}
			for range cC {
				// acknowledge the commits from other nodes so
				// raft continues to make progress
			}
		}(clus.proposeC[i], clus.commitC[i], clus.errorC[i])

		// one message feedback per node
		go func(i int) { clus.proposeC[i] <- "foo" }(i)
	}

	// Wait until every node has finished its feedback loop.
	for range clus.peers {
		<-donec
	}
}
141 | 
// TestCloseProposerBeforeReplay tests closing the producer before raft starts.
// The single-node cluster is created and immediately closed again; the test
// passes when the shutdown completes without an error.
func TestCloseProposerBeforeReplay(t *testing.T) {
	clus := newCluster(1)
	// close before replay so raft never starts
	defer clus.closeNoErrors(t)
}
148 | 
// TestCloseProposerInflight tests closing the producer while
// committed messages are being published to the client.
// Two proposals are sent in the background; the test waits only for the
// first commit ("foo") and then lets the deferred close shut the cluster
// down while the second proposal may still be in flight.
func TestCloseProposerInflight(t *testing.T) {
	clus := newCluster(1)
	defer clus.closeNoErrors(t)

	// some inflight ops
	go func() {
		clus.proposeC[0] <- "foo"
		clus.proposeC[0] <- "bar"
	}()

	// wait for one message
	if c, ok := <-clus.commitC[0]; !ok || c.data[0] != "foo" {
		t.Fatalf("Commit failed")
	}
}
166 | 
167 | func TestPutAndGetKeyValue(t *testing.T) {
168 | 	clusters := []string{"http://127.0.0.1:9021"}
169 | 
170 | 	proposeC := make(chan string)
171 | 	defer close(proposeC)
172 | 
173 | 	confChangeC := make(chan raftpb.ConfChange)
174 | 	defer close(confChangeC)
175 | 
176 | 	var kvs *kvstore
177 | 	getSnapshot := func() ([]byte, error) { return kvs.getSnapshot() }
178 | 	commitC, errorC, snapshotterReady := newRaftNode(1, clusters, false, getSnapshot, proposeC, confChangeC)
179 | 
180 | 	kvs = newKVStore(<-snapshotterReady, proposeC, commitC, errorC)
181 | 
182 | 	srv := httptest.NewServer(&httpKVAPI{
183 | 		store:       kvs,
184 | 		confChangeC: confChangeC,
185 | 	})
186 | 	defer srv.Close()
187 | 
188 | 	// wait server started
189 | 	<-time.After(time.Second * 3)
190 | 
191 | 	wantKey, wantValue := "test-key", "test-value"
192 | 	url := fmt.Sprintf("%s/%s", srv.URL, wantKey)
193 | 	body := bytes.NewBufferString(wantValue)
194 | 	cli := srv.Client()
195 | 
196 | 	req, err := http.NewRequest("PUT", url, body)
197 | 	if err != nil {
198 | 		t.Fatal(err)
199 | 	}
200 | 	req.Header.Set("Content-Type", "text/html; charset=utf-8")
201 | 	_, err = cli.Do(req)
202 | 	if err != nil {
203 | 		t.Fatal(err)
204 | 	}
205 | 
206 | 	// wait for a moment for processing message, otherwise get would be failed.
207 | 	<-time.After(time.Second)
208 | 
209 | 	resp, err := cli.Get(url)
210 | 	if err != nil {
211 | 		t.Fatal(err)
212 | 	}
213 | 
214 | 	data, err := ioutil.ReadAll(resp.Body)
215 | 	if err != nil {
216 | 		t.Fatal(err)
217 | 	}
218 | 	defer resp.Body.Close()
219 | 
220 | 	if gotValue := string(data); wantValue != gotValue {
221 | 		t.Fatalf("expect %s, got %s", wantValue, gotValue)
222 | 	}
223 | }
224 | 
// TestAddNewNode tests adding new node to the existing cluster.
// A ConfChangeAddNode for node 4 is proposed through node 1, node 4 is
// started in join mode, and a value proposed through node 4 must show up on
// node 1's commit channel.
func TestAddNewNode(t *testing.T) {
	clus := newCluster(3)
	defer clus.closeNoErrors(t)

	// Remove node 4's data directories before and after the test.
	os.RemoveAll("raftexample-4")
	os.RemoveAll("raftexample-4-snap")
	defer func() {
		os.RemoveAll("raftexample-4")
		os.RemoveAll("raftexample-4-snap")
	}()

	// The new node's URL travels in the change's Context.
	newNodeURL := "http://127.0.0.1:10004"
	clus.confChangeC[0] <- raftpb.ConfChange{
		Type:    raftpb.ConfChangeAddNode,
		NodeID:  4,
		Context: []byte(newNodeURL),
	}

	proposeC := make(chan string)
	defer close(proposeC)

	confChangeC := make(chan raftpb.ConfChange)
	defer close(confChangeC)

	// Start node 4 with join=true and no snapshot function; its returned
	// channels are not used by this test.
	newRaftNode(4, append(clus.peers, newNodeURL), true, nil, proposeC, confChangeC)

	go func() {
		proposeC <- "foo"
	}()

	if c, ok := <-clus.commitC[0]; !ok || c.data[0] != "foo" {
		t.Fatalf("Commit failed")
	}
}
260 | 
// TestSnapshot checks that a node does not take a snapshot before the
// application has signalled, through the commit's applyDoneC, that the
// published entries were applied.
func TestSnapshot(t *testing.T) {
	// Lower the snapshot thresholds for the duration of the test and restore
	// them afterwards.
	prevDefaultSnapshotCount := defaultSnapshotCount
	prevSnapshotCatchUpEntriesN := snapshotCatchUpEntriesN
	defaultSnapshotCount = 4
	snapshotCatchUpEntriesN = 4
	defer func() {
		defaultSnapshotCount = prevDefaultSnapshotCount
		snapshotCatchUpEntriesN = prevSnapshotCatchUpEntriesN
	}()

	clus := newCluster(3)
	defer clus.closeNoErrors(t)

	go func() {
		clus.proposeC[0] <- "foo"
	}()

	c := <-clus.commitC[0]

	// The snapshot function must not have been called yet.
	select {
	case <-clus.snapshotTriggeredC[0]:
		t.Fatalf("snapshot triggered before applying done")
	default:
	}
	// Signal that applying is done, then wait for the snapshot function to
	// be called.
	close(c.applyDoneC)
	<-clus.snapshotTriggeredC[0]
}
288 | 


--------------------------------------------------------------------------------
/sync/README.md:
--------------------------------------------------------------------------------
1 | ## etcd
2 | 
3 | 通过etcd实现消息订阅和消息发布       


--------------------------------------------------------------------------------
/sync/cache/cache_debug_show.go:
--------------------------------------------------------------------------------
 1 | package cache
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 	"net/http"
 6 | 
 7 | 	"gopkg.in/mgo.v2/bson"
 8 | )
 9 | 
// ShowCache is the view of a cache that HandleDebugConfCache exposes over
// HTTP. The handler calls Show when the request's "key" parameter is a valid
// ObjectId hex string and ShowAll otherwise; the returned value is
// JSON-encoded into the response.
type ShowCache interface {
	Show(id bson.ObjectId) interface{}
	ShowAll() interface{}
}
14 | 
15 | func HandleDebugConfCache(cache ShowCache) http.Handler {
16 | 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
17 | 
18 | 		var data []byte
19 | 		var err error
20 | 		key := r.URL.Query().Get("key")
21 | 		var bsonKey bson.ObjectId
22 | 		if bson.IsObjectIdHex(key) {
23 | 			bsonKey = bson.ObjectIdHex(key)
24 | 			data, err = json.Marshal(cache.Show(bsonKey))
25 | 		} else {
26 | 			data, err = json.Marshal(cache.ShowAll())
27 | 		}
28 | 		if err != nil {
29 | 			w.Write([]byte(err.Error()))
30 | 			return
31 | 		} else {
32 | 			w.Write(data)
33 | 			return
34 | 		}
35 | 
36 | 	})
37 | }
38 | 
// ShowBookChannelCache is the view of a string-keyed cache that
// HandleDebugBookChannelCache exposes over HTTP. The handler calls Show when
// the request carries a non-empty "key" parameter and ShowAll otherwise; the
// returned value is JSON-encoded into the response.
type ShowBookChannelCache interface {
	Show(id string) interface{}
	ShowAll() interface{}
}
43 | 
44 | func HandleDebugBookChannelCache(cache ShowBookChannelCache) http.Handler {
45 | 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
46 | 
47 | 		var data []byte
48 | 		var err error
49 | 		key := r.URL.Query().Get("key")
50 | 		if key != "" {
51 | 			data, err = json.Marshal(cache.Show(key))
52 | 		} else {
53 | 			data, err = json.Marshal(cache.ShowAll())
54 | 		}
55 | 		if err != nil {
56 | 			w.Write([]byte(err.Error()))
57 | 			return
58 | 		} else {
59 | 			w.Write(data)
60 | 			return
61 | 		}
62 | 
63 | 	})
64 | }
65 | 


--------------------------------------------------------------------------------
/sync/cache/cache_update.go:
--------------------------------------------------------------------------------
  1 | package cache
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"fmt"
  6 | 	"time"
  7 | 
  8 | 	"go.uber.org/zap"
  9 | 
 10 | 	"go.etcd.io/etcd/api/v3/mvccpb"
 11 | 	clientv3 "go.etcd.io/etcd/client/v3"
 12 | )
 13 | 
// init allocates handleMap so that RegisterUpdateHandle can store handlers
// in it.
func init() {
	handleMap = make(map[string]func([]byte) error)
}
 17 | 
// handleMap maps an etcd key to the handler that handleCacheUpdate calls
// with the new value when a PUT event for exactly that key is received.
// NOTE(review): the map is accessed without any locking.
var handleMap map[string]func([]byte) error
 19 | 
// RegisterUpdateHandle registers f as the update handler for key, replacing
// any handler registered for the same key before.
//
// handleCacheUpdate looks the handler up by the exact key of a PUT event
// seen by Watcher and calls it with the new value. Events for keys without a
// handler are silently ignored, so a handler must be registered for every
// key that is published to clients.
//
// NOTE(review): handleMap is written here without locking while the Watcher
// goroutine reads it; presumably all handlers are registered before
// NewPubClient is called — confirm.
func RegisterUpdateHandle(key string, f func([]byte) error) {
	handleMap[key] = f
}
 27 | 
// PubClient publishes a value under a key. It is implemented by
// PubClientImpl, which writes to etcd, and by FakePubClient, which only
// prints the pair.
type PubClient interface {
	Pub(ctx context.Context, key string, val string) error
}
 31 | 
// Pub is the package-level publisher. It is nil until NewPubClient or
// NewTestPubClient has set it.
var Pub PubClient
 33 | 
// PubClientImpl is the etcd-backed PubClient. Besides publishing it watches
// every key under prefix (see Watcher).
type PubClientImpl struct {
	client *clientv3.Client // etcd client used for Put and Watch
	logger *zap.Logger      // receives cache update and error messages
	prefix string           // key prefix that Watcher subscribes to
}
 39 | 
 40 | func (c *PubClientImpl) Watcher() {
 41 | 	ctx, cancel := context.WithCancel(context.Background())
 42 | 	rch := c.client.Watch(ctx, c.prefix, clientv3.WithPrefix())
 43 | 	defer cancel()
 44 | 
 45 | 	for wresp := range rch {
 46 | 		for _, ev := range wresp.Events {
 47 | 			switch ev.Type {
 48 | 			case mvccpb.PUT:
 49 | 				c.logger.Warn("Cache Update", zap.Any("value", ev.Kv))
 50 | 				err := handleCacheUpdate(ev.Kv)
 51 | 				if err != nil {
 52 | 					c.logger.Error("Cache Update", zap.Error(err))
 53 | 				}
 54 | 			case mvccpb.DELETE:
 55 | 				c.logger.Error("Cache Delete NOT SUPPORT")
 56 | 			}
 57 | 		}
 58 | 	}
 59 | }
 60 | 
 61 | func handleCacheUpdate(val *mvccpb.KeyValue) error {
 62 | 	if val == nil {
 63 | 		return nil
 64 | 	}
 65 | 	f := handleMap[string(val.Key)]
 66 | 	if f != nil {
 67 | 		return f(val.Value)
 68 | 	}
 69 | 	return nil
 70 | }
 71 | 
 72 | func (c *PubClientImpl) Pub(ctx context.Context, key string, val string) error {
 73 | 	ctx, _ = context.WithTimeout(ctx, time.Second*10)
 74 | 	_, err := c.client.Put(ctx, key, val)
 75 | 	if err != nil {
 76 | 		return err
 77 | 	}
 78 | 	return nil
 79 | }
 80 | 
 81 | func NewPubClient(config clientv3.Config, prefix string, logger *zap.Logger) (err error) {
 82 | 
 83 | 	logger.Info("Pub Etcd Connection", zap.Any("config", config), zap.String("prefix", prefix))
 84 | 	client, err := clientv3.New(config)
 85 | 	if err != nil {
 86 | 		return err
 87 | 	}
 88 | 
 89 | 	pci := &PubClientImpl{
 90 | 		client: client,
 91 | 		logger: logger,
 92 | 		prefix: prefix,
 93 | 	}
 94 | 	go func() {
 95 | 		pci.Watcher()
 96 | 	}()
 97 | 
 98 | 	Pub = pci
 99 | 	return nil
100 | }
101 | 
// FakePubClient is a PubClient that does not talk to etcd; NewTestPubClient
// installs it as the package-level Pub.
type FakePubClient struct {
}

// Pub prints the key and value to standard output and always returns nil.
// ctx is not used.
func (c *FakePubClient) Pub(ctx context.Context, key string, val string) error {

	fmt.Printf("Pub Key %s, Val %s \n", key, val)
	return nil
}
110 | 
// NewTestPubClient replaces the package-level Pub with a FakePubClient, so
// that publishing only prints the key/value pair.
func NewTestPubClient() {
	Pub = &FakePubClient{}
}
114 | 


--------------------------------------------------------------------------------